[模式植物构建orgDb数据库 | 以org.Slycompersicum.eg.db为例 | 二

R语言 R语言 318 人阅读 | 0 人回复 | 2024-08-02

原文链接 :模式植物构建orgDb数据库

前面的步骤请看,模式植物构建orgDb数据库 | 以org.Slycompersicum.eg.db为例 | 一

  1. 提取KEGG信息 在这里,我们提取的是 <span class="ne-text">KO号列</span>
#把Emapper中的query列、KEGG_ko列提出出来
gene2ko <- emapper %>% 
  dplyr::select(GID = query, 
                Ko = KEGG_ko) %>% 
  na.omit()
#将gene2ko的Ko列中的“Ko:”删除,不然后面找pathway会报错
gene2ko$Ko <- gsub("ko:","",gene2ko$Ko)

head(gene2ko)
> head(gene2ko)
                 GID            Ko
1 Solyc00g500001.1.1             -
2 Solyc00g500002.1.1             -
3 Solyc00g500003.1.1             -
4 Solyc00g500019.1.1             -
5 Solyc00g500020.1.1        K02634
6 Solyc00g500021.1.1 K02707,K02713
  1. 下载KO的json文件,下载地址https://www.genome.jp/kegg-bin/get_htext?ko00001

下载后,放在 <span class="ne-text">路劲文件夹</span>中。

  1. 运行一下代码,本地创建 <span class="ne-text">kegg_info.RData</span>文件
update_kegg <- function(json = "ko00001.json") {
  pathway2name <- tibble(Pathway = character(), Name = character())
  ko2pathway <- tibble(Ko = character(), Pathway = character())
  kegg <- fromJSON(json)
  for (a in seq_along(kegg[["children"]][["children"]])) {
    A <- kegg[["children"]][["name"]][[a]]
    for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
      B <- kegg[["children"]][["children"]][[a]][["name"]][[b]] 
      for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
        pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
        pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
        pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
        pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
        kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
        kos <- str_match(kos_info, "K[0-9]*")[,1]
        ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
      }
    }
    }
  save(pathway2name, ko2pathway, file = "kegg_info.RData")
  }

##'@本地创建kegg_info.RData文件
update_kegg()

##'@载kegg_info.RData
load(file = "kegg_info.RData")
  1. 根据gene2ko中的ko信息将ko2pathway中的pathway列提取出来
gene2pathway <- gene2ko  %>% 
  left_join(ko2pathway,by = "Ko") %>% 
  dplyr::select(GID, Pathway) %>% 
  na.omit()

注意:在此步骤,可能会出现警告提示,大家忽略即可,不要以为是报错信息。

  1. 在网站https://www.ncbi.nlm.nih.gov/taxonomy上查询物种信息

** 进入网站:**

搜索物种

点进进去

获得相关信息

  1. 填写对应信息
tax_id="4081"
genus="Solanum"
species="lycompersicum"
  1. 去重
gene2go <- unique(gene2go)
gene2go <- gene2go[!duplicated(gene2go),]
gene2ko <- gene2ko[!duplicated(gene2ko),]
gene2pathway <- gene2pathway[!duplicated(gene2pathway),]
  1. 构建OrgDb数据库
makeOrgPackage(gene_info=gene_info,
               go=gene2go,
               ko=gene2ko,
               pathway=gene2pathway,
               version="4.0",  #版本
               maintainer = "du<****@qq.com>",  #修改为你的名字和邮箱
               author = "du<****@qq.com>",  #修改为你的名字和邮箱
               outputDir = ".",  #输出文件位置
               tax_id=tax_id,
               genus=genus,
               species=species,
               goTable="go")

三、GO和KEGG富集分析

3.1 加载自建OrgDb数据库

install.packages("org.Slycompersicum.eg.db/", repos = NULL, type = "sources")

3.2 KEGG富集分析

  1. 读取DEG_list
DEG_list <- read.csv("DEG_list.csv",header = F)

gene_list <- DEG_list$V1
  1. KEGG富集分析
pathway2gene <- AnnotationDbi::select(org.Slycompersicum.eg.db, 
                                      keys = keys(org.Slycompersicum.eg.db), 
                                      columns = c("Pathway","Ko")) %>%
  na.omit() %>%
  dplyr::select(Pathway, GID)
  1. 导入 Pathway 与名称对应关系
load("kegg_info.RData")
kegg <- enricher(gene_list, 
                 TERM2GENE = pathway2gene, 
                 TERM2NAME = pathway2name, 
                 pvalueCutoff = 1, 
                 qvalueCutoff = 1,
                 pAdjustMethod = "BH",
                 minGSSize = 200)
  1. 查看KEGG富集结果
kegg_results <- as.data.frame(kegg)
#
head(kegg_results)
##'@导出结果
write.csv(kegg_results,"./KEGG_output.csv")

  1. 绘制柱状图
pdf("./功能富集/01.kegg.柱状图.pdf",height = 4, width = 6)
barplot(kegg, showCategory= 10,
        drop=F,
        color="pvalue", 
        font.size=10) 
dev.off()

6. 气泡图

pdf("./功能富集/02.kegg.气泡图.pdf",height = 4, width = 6)
dotplot(kegg)
dev.off()

enrichplot::cnetplot(kegg,circular = F, colorEdge = T)

3.3 GO富集分析

enrich.go <- enrichGO(gene=gene_list,
                      OrgDb = org.Slycompersicum.eg.db,
                      # pvalueCutoff = 0.5,  ##'@结合自己的需求进行调整
                      # qvalueCutoff = 0.5,  ##'@结合自己的需求进行调整
                      pAdjustMethod = "BH",
                      pvalueCutoff  = 0.05,
                      ont = "ALL",  ## GO的种类,BP,CC, MF, ALL
                      keyType = "GID")
  1. 查看GO富集结果
GO_results <- as.data.frame(enrich.go)
#
head(GO_results)
##'@导出结果
write.csv(kegg_results,"./GO_output.csv")
  1. 绘制GO富集柱状图
pdf(file = "./功能富集/03.GO_bar.pdf", width = 8, height = 6)
barplot(enrich.go, 
        drop = TRUE, 
        showCategory = 10,   ## 显示前10个GO term
        split="ONTOLOGY") +
  scale_y_discrete(labels=function(x) str_wrap(x, width=80))+
  facet_grid(ONTOLOGY~., scale='free')
dev.off()

4. GO 气泡图

pdf(file = "./功能富集/04.GO_dotplot.pdf", width = 6, height = 6)
dotplot(enrich.go, showCategory = 10)
dev.off()
  1. GO网络图
MF_ego <- enrichGO(
  gene  = gene_list,
  keyType = "GID",
  OrgDb   = org.Slycompersicum.eg.db,
  ont     = "MF", ##'@GO的种类,BP,CC,MF
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.5##'@结合自己的需求进行调整
)

MF富集网络图

pdf("./功能富集/05.MF_网络图.pdf", width = 8, height = 8)
plotGOgraph(MF_ego)
dev.off()

MF_ego <- pairwise_termsim(MF_ego)
pdf("./功能富集/06.网络图.pdf",width = 8, height = 8)
emapplot(MF_ego, cex.params = list(category_label = 0.8, line = 0.5)) + 
  scale_fill_continuous(low = "#e06663", high = "#327eba", name = "p.adjust",
                        guide = guide_colorbar(reverse = TRUE, order=2), trans='log10')
dev.off()

参考资源:

  1. https://www.jianshu.com/p/693d66681710
  2. https://www.jianshu.com/p/bb4281e6604e
  3. https://www.jianshu.com/p/9c9e97167377
  4. https://bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.html#supported-organisms

若我们的教程对你有所帮助,请 <span class="ne-text">点赞+收藏+转发</span>,这是对我们最大的支持。

往期部分文章

1. 最全WGCNA教程(替换数据即可出全部结果与图形)


2. 精美图形绘制教程

3. 转录组分析教程

4. 转录组下游分析

小杜的生信筆記** ,主要发表或收录生物信息学教程,以及基于R分析和可视化(包括数据分析,图形绘制等);分享感兴趣的文献和学习资料!!**

微信扫一扫分享文章

+10
无需登陆也可“点赞”支持作者
分享到:
评论

使用道具 举报

热门推荐