Nature开源 | 学习RNA和scRNA分析，提供全部代码

小杜的生信笔记

「一边学习，一边总结，一边分享！」

由于微信改版，一直有同学反映。存在长时间接收不到公众号的推文。那么请跟随以下步骤，将**「小杜的生信筆記」设置为「星标」，不错过每一条推文教程。**

本期教程，我们提供的原文的译文，若有需求请回复关键词：20240530

本期教程

「小杜的生信笔记」，自2021年11月开始做的知识分享，主要内容是「R语言绘图教程」、「转录组上游分析」、**「转录组下游分析」等内容。凡是**在社群同学，可免费获得自2021年11月份至今全部教程，教程配备事例数据和相关代码，我们会持续更新中。

往期教程部分内容

RNA-seq上游分析

文章中使用kallisto软件进行mapped，与我们平时使用的软件有所差异。

「1. kallisto index」

#Use annotated transcripts file
kallisto index -i CryptoDB-46_CparvumIowaII_AnnotatedTranscripts.index CryptoDB-46_CparvumIowaII_AnnotatedTranscripts.fasta

「2. FastQC质控」

fastqc 13-MALE-KWJT_S13_mergedLanes_R1_001.fastq.gz -t 24 -o /venice/striepenlab/Katelyn_Walzer_data/Jayesh_manuscript/Male_Sort_CryptoDB46/fastqc

定量

kallisto quant -i CryptoDB-46_CparvumIowaII_AnnotatedTranscripts.index -o male_CryptoDB_46_rep_1 -t 24 -b 60 --single -l 400 -s 10013-MALE-KWJT_S13_mergedLanes_R1_001.fastq.gz
kallisto quant -i CryptoDB-46_CparvumIowaII_AnnotatedTranscripts.index -o male_CryptoDB_46_rep_2 -t 24 -b 60 --single -l 400 -s 10014-MALE-KWJT_S14_mergedLanes_R1_001.fastq.gz

RNA-seq下游分析

load packages ----
library(tidyverse) 
library(tximport) 
library(hrbrthemes) 
library(RColorBrewer) 
library(reshape2)
library(genefilter) 
library(edgeR) 
library(matrixStats)
library(DT) 
library(gt) 
library(plotly) 
library(skimr)
library(limma)

setwd("/Volumes/KATIE/2023_1_3_Sorted_samples_bulk_Crypto46")
getwd()

ggplot(M_vs_F_invivo_top_hits, aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneSymbol))) +
  geom_point(size=4) +
  geom_point(mapping = NULL, AP2M_HAP2_subset_female_invivo, size = 4, colour = "#0095FF", inherit.aes = TRUE) +
  ylim(-0.5,12) +
  xlim(-15,15) +
  geom_hline(yintercept = -log10(0.01), linetype="dashed", colour="grey", size=0.5) +
  geom_hline(yintercept = -log10(0.05), linetype="dashed", colour="grey", size=1) +
  geom_vline(xintercept = 1, linetype="longdash", colour="grey", size=1) +
  geom_vline(xintercept = -1, linetype="longdash", colour="grey", size=1) +
  theme_bw() +
  #theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), panel.border = element_blank(), axis.line = element_line(colour = "black")) +
  theme(axis.text=element_text(size = 14),
        axis.title = element_text(size=16),
        plot.title=element_text(face = "bold", size = 20),
        plot.subtitle = element_text(size=16)) +
  labs(title="Male versus female in vivo")

ggplot(WT_30hrs_hits, aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneSymbol))) + geom_point(size=4) +
  geom_point(mapping = NULL, Asexual_genes_WT, size = 4, colour = "#4CBB17", inherit.aes = TRUE) +
  geom_point(mapping = NULL, Male_specific_genes_WT, size = 4, colour = "#0095FF", inherit.aes = TRUE) +
  ylim(-0.5,5) +
  xlim(-2.75,2.75) +
  geom_hline(yintercept = -log10(0.01), linetype="dashed", colour="grey", size=0.5) +
  geom_hline(yintercept = -log10(0.05), linetype="dashed", colour="grey", size=1) +
  geom_vline(xintercept = 1, linetype="longdash", colour="grey", size=1) +
  geom_vline(xintercept = -1, linetype="longdash", colour="grey", size=1) +
  theme_bw() +
  theme(plot.margin = margin(0.5, 15, 0.5, 0.5, "cm")) +
  theme(axis.text=element_text(size = 14),
        axis.title = element_text(size=16),
        plot.title=element_text(face = "bold", size = 20),
        plot.subtitle = element_text(size=16)) +
  labs(title="Effects of Shield treatment on wild type parasites")

scRNA-seq上游分析

「1. 将gff转为gtf」

gffread -E CryptoDB-46_CparvumIowaII.gff -T -o CryptoDB-46_CparvumIowaII.gtf

「2. 制作参考基因组」

# cellranger mkref --genome=output_genome --fasta=input.fa --genes=input.gtf
cellranger mkref --genome=Cryptosporidium_parvum_iowa_ii_EuPathDB46_Feb2020_ref --fasta=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Dec_2019/Cryptosporidium_parvum_ii_EuPathDB46/CryptoDB-46_CparvumIowaII_Genome.fasta --genes=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Dec_2019/Cryptosporidium_parvum_ii_EuPathDB46/CryptoDB-46_CparvumIowaII.gtf

「3. 转录组比对」

cellranger mkfastq --id=10x_fastq_files \
--run=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Feb_2020/2020_2_27_10x_BCL_files \
--csv=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Feb_2020/cellranger-KAW-bcl-simple-2020-2-27.csv \
--localcores=18

使用 CryptoDB 参考对 Crypto 样本进行细胞游离计数

「4. 进行单细胞基因表达定量」

cellranger count --id=crypto24_CryptoDB_only_1 \
--transcriptome=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Dec_2019/Cryptosporidium_parvum_iowa_ii_EuPathDB46_Feb2020_ref/ \
--fastqs=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Feb_2020/10x_fastq_files/outs/fastq_path/HV2NVBGXC/ \
--sample=crypto24hrs \
--expect-cells=4000 \
--localcores=18

cellranger count --id=crypto36_CryptoDB_only_1 \
--transcriptome=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Dec_2019/Cryptosporidium_parvum_iowa_ii_EuPathDB46_Feb2020_ref/ \
--fastqs=/data/striepenlab/Katelyn_Walzer_data/Single_Cell_Transcriptomics/Striepen_10x_Feb_2020/10x_fastq_files/outs/fastq_path/HV2NVBGXC/ \
--sample=crypto36hrs \
--expect-cells=6000 \
--localcores=18

DimPlot(seurat.integrated_0.4, group.by = "cluster_labels", cols = c("lightgreen", "seagreen1", "seagreen3",
                                                                     "chartreuse3", "yellowgreen", "chartreuse1", "green1", "green3",
                                                                     "mediumseagreen"), pt.size = 2) +
                                                            FontSize(x.title = 20, y.title = 20) + NoLegend() + xlim(-10, 10)+ylim(-10, 10) + theme(axis.text=element_text(size = 16)) + ggtitle(NULL) +
                                                            coord_fixed(ratio = 1)

#With legend
DimPlot(seurat.integrated_0.4, group.by = "cluster_labels", cols = c("lightgreen", "seagreen1", "seagreen3",
                                                                     "chartreuse3", "yellowgreen", "chartreuse1", "green1", "green3",
                                                                     "mediumseagreen"), pt.size = 2) +
                                                            FontSize(x.title = 20, y.title = 20) + xlim(-10, 10)+ylim(-10, 10) + theme(axis.text=element_text(size = 16), legend.key.size = unit(0.6, 'cm'), legend.text=element_text(size=16)) + ggtitle(NULL) +
                                                            coord_fixed(ratio = 1)

show.velocity.on.embedding.cor(emb = Embeddings(object = new.object.vel, reduction = "umap"), vel = Tool(object = new.object.vel, 
                                                                                                         slot = "RunVelocity"), n = 200, scale = "sqrt", cell.colors = ac(x = cell.colors), 
                               cex = 1, arrow.scale = 1.2, show.grid.flow = TRUE, min.grid.cell.mass = 0.5, grid.n = 40, arrow.lwd = 1, 
                               do.par = FALSE, cell.border.alpha = 0, frame = FALSE)

lot_cells(cds, color_cells_by = "pseudotime", cell_size = 2, trajectory_graph_color = "red", alpha = 1, trajectory_graph_segment_size = 1.2, label_branch_points = FALSE, label_roots = FALSE, label_leaves = FALSE) + NoLegend() + FontSize(x.title = 20, y.title = 20) + xlim(-10, 10)+ylim(-10, 10) + theme(axis.text=element_text(size = 16)) +
  viridis::scale_color_viridis(begin = 0.05, end = 1, option = "D") +
  theme(axis.line.y = element_line(size=0.5, color="black")) +
  theme(axis.text.x=element_text(colour="black")) +
  theme(axis.text.y=element_text(colour="black")) +
  coord_fixed(ratio = 1) 

#With legend
plot_cells(cds, color_cells_by = "pseudotime", cell_size = 2, trajectory_graph_color = "red", alpha = 1, trajectory_graph_segment_size = 1.2, label_branch_points = FALSE, label_roots = FALSE, label_leaves = FALSE) + FontSize(x.title = 20, y.title = 20) + xlim(-10, 10)+ylim(-10, 10) + theme(axis.text=element_text(size = 16), legend.key.size = unit(1.5, 'cm'), legend.text=element_text(size=16), legend.title = element_blank()) +
  viridis::scale_color_viridis(begin = 0.05, end = 1, option = "D") +
  theme(axis.line.y = element_line(size=0.5, color="black")) +
  theme(axis.text.x=element_text(colour="black")) +
  theme(axis.text.y=element_text(colour="black")) +
  coord_fixed(ratio = 1) 


pseudotime_start <- cds@principal_graph_aux@listData[["UMAP"]][["pseudotime"]]
seurat.integrated_0.4.updated <- AddMetaData(seurat.integrated_0.4.updated, metadata = pseudotime_start, col.name = "Pseudotime")

data_to_write_out <- as.data.frame(as.matrix([email]seurat.integrated_0.4.updated@meta.data[/email]))
fwrite(x = data_to_write_out, row.names = TRUE, file = "2023_1_19_Asexual_24hrs_36hrs_with_pseudotime.csv")

#Rename columns because ggplot does not like dashes
colnames(rhoptry_pseudotime_results_transposed) <- c("cgd4_1790", "cgd8_540", "cgd8_2530", "cgd4_2420", "cgd1_1870", "cgd2_370", 
                                                     "cgd3_2010",  "cgd1_1380", "cgd1_950", "cgd2_2900", "cgd3_1330", "cgd3_1710", 
                                                     "cgd3_1730", "cgd3_1770", "cgd3_1780", "cgd3_2750", "cgd3_910", "cgd5_20", 
                                                     "cgd5_2760", "cgd6_1000", "cgd6_3630", "cgd6_3930", "cgd6_3940", "cgd8_520", "Pseudotime")

#Now plot, Extended Data Figure 3a
ggplot(rhoptry_pseudotime_results_transposed, aes(Pseudotime)) + 
  stat_smooth(aes(y=cgd4_1790), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd8_540), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd8_2530), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd4_2420), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd1_1870), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd2_370), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_2010), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd1_1380), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd1_950), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd2_2900), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_1330), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_1710), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_1730), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_1770), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_1780), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_2750), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd3_910), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd5_20), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd5_2760), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd6_1000), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd6_3630), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd6_3930), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd6_3940), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(aes(y=cgd8_520), fill=TRUE, colour="lightgray", se=FALSE, show.legend = TRUE) +
  stat_smooth(data = mean_gene_groups_pseudotime, aes(y=Rhoptry), level = 0.95, colour="black", fill = "red1", show.legend = TRUE) +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), panel.border = element_blank(), axis.line = element_line(colour = "black"), plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm")) + 
  scale_x_continuous(expand=c(0,0)) +
  ylab("Scaled Gene Expression") +
  theme(legend.title = element_blank()) +
  theme(axis.text.x=element_text(colour="black")) +
  theme(axis.text.y=element_text(colour="black")) +
  theme(axis.text=element_text(size = 28),
        axis.title = element_text(size=32),
        plot.title=element_text(face = "bold", size = 20),
        legend.text = element_text(size=16))

文中提供了详细代码，可以自己对照原文进行查看。

本期教程，我们提供的原文的译文，若有需求请回复关键词：20240530

若我们的教程对你有所帮助，请点赞+收藏+转发，这是对我们最大的支持。

往期部分文章

「1. 最全WGCNA教程（替换数据即可出全部结果与图形）」

「2. 精美图形绘制教程」

精美图形绘制教程

「3. 转录组分析教程」

「4. 转录组下游分析」

「小杜的生信筆記」 ，主要发表或收录生物信息学教程，以及基于R分析和可视化（包括数据分析，图形绘制等）；分享感兴趣的文献和学习资料!!

微信扫一扫分享文章

Nature开源 | 学习RNA和scRNA分析，提供全部代码

本期教程

往期教程部分内容

RNA-seq上游分析

RNA-seq下游分析

scRNA-seq上游分析

往期部分文章

相关帖子

绘制带P值的差异柱状图

RNA-seq（3）

生信技能总汇，总有一个技能可以帮助你！| 2024年更新教程

FPKM和TPM的概念及计算脚本

RNA-seq 差异分析的点点滴滴（3）

小杜的生信笔记 LV4