其他
GSEA 图是如何画出来的? (源码解析)
梦幻与现实的撕裂感才是生活的感觉吗
1引言
前面我们讲到 Y叔的 clusterProfiler 包做 GSEA
富集分析及相应的可视化,有时候需要个性化绘图时却无能为力, 拿不到绘图数据就做不了这样的事情。GSEA 绘图推送过之前的文章如下:
我们去看看 gseaplot2 函数的源码,看看图是如何一步一步绘制出来的。
源码地址:
https://github.com/YuLab-SMU/enrichplot/blob/master/R/gseaplot.R
2常规富集分析
先做个 GSEA 的富集分析:
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggplot2)
library(cowplot)
# load test data
data(geneList, package="DOSE")
# check
head(geneList)
# 4312 8318 10874 55143 55388 991
# 4.572613 4.514594 4.418218 4.144075 3.876258 3.677857
# GO enrich
ego3 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
绘图:
# gseaplot2
gseaplot2(ego3, geneSetID = 1,
title = ego3$Description[1])
图主要包括三个部分:
富集曲线 富集的基因排序线段 所有基因的排序及对应的差异倍数排序
3绘图解析
定义提取数据的函数
# export gseaScores function
gseaScores <- getFromNamespace("gseaScores", "DOSE")
# define function
gsInfo <- function(object, geneSetID) {
geneList <- object@geneList
if (is.numeric(geneSetID))
geneSetID <- object@result[geneSetID, "ID"]
geneSet <- object@geneSets[[geneSetID]]
exponent <- object@params[["exponent"]]
df <- gseaScores(geneList, geneSet, exponent, fortify=TRUE)
df$ymin <- 0
df$ymax <- 0
pos <- df$position == 1
h <- diff(range(df$runningScore))/20
df$ymin[pos] <- -h
df$ymax[pos] <- h
df$geneList <- geneList
df$Description <- object@result[geneSetID, "Description"]
return(df)
}
提取数据:
# extract data
gsdata <- gsInfo(ego3, geneSetID = 1)
# check
head(gsdata,3)
# x runningScore position ymin ymax geneList Description
# 1 1 -8.090615e-05 0 0 0 4.572613 mitotic sister chromatid segregation
# 2 2 -1.618123e-04 0 0 0 4.514594 mitotic sister chromatid segregation
# 3 3 -2.427184e-04 0 0 0 4.418218 mitotic sister chromatid segregation
绘制一个空画板
# plot panel
p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) +
# theme_classic(14) +
theme_bw(14) +
theme(panel.grid.major = element_line(colour = "grey92"),
panel.grid.minor = element_line(colour = "grey92"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()) +
scale_x_continuous(expand=c(0,0))
p
添加曲线
# add line plot
ES_geom = "line"
if (ES_geom == "line") {
es_layer <- geom_line(aes_(y = ~runningScore, color= ~Description),
size=1)
} else {
es_layer <- geom_point(aes_(y = ~runningScore, color= ~Description),
size=1, data = subset(gsdata, position == 1))
}
# combine
p.res <- p + es_layer +
theme(legend.position = c(.8, .8), legend.title = element_blank(),
legend.background = element_rect(fill = "transparent"))
p.res <- p.res + ylab("Running Enrichment Score") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.line.x=element_blank(),
plot.margin=margin(t=.2, r = .2, b=0, l=.2, unit="cm"))
p.res
添加基因排序线段
# add segment plot
i <- 0
for (term in unique(gsdata$Description)) {
idx <- which(gsdata$ymin != 0 & gsdata$Description == term)
gsdata[idx, "ymin"] <- i
gsdata[idx, "ymax"] <- i + 1
i <- i + 1
}
# plot
p2 <- ggplot(gsdata, aes_(x = ~x)) +
geom_linerange(aes_(ymin=~ymin, ymax=~ymax),color= 'black') +
xlab(NULL) + ylab(NULL) +
# theme_classic(14) +
theme_bw(14) +
theme(legend.position = "none",
plot.margin = margin(t=-.1, b=0,unit="cm"),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.line.x = element_blank(),
panel.grid = element_blank()) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0))
p2
添加热图和拼图
# add heatmap
geneSetID = 1
if(length(geneSetID) == 1) {
v <- seq(1, sum(gsdata$position), length.out=9)
inv <- findInterval(rev(cumsum(gsdata$position)), v)
if (min(inv) == 0) inv <- inv + 1
col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
ymin <- min(p2$data$ymin)
yy <- max(p2$data$ymax - p2$data$ymin) * .3
xmin <- which(!duplicated(inv))
xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
d <- data.frame(ymin = ymin, ymax = yy,
xmin = xmin,
xmax = xmax,
col = col[unique(inv)])
p2 <- p2 + geom_rect(
aes_(xmin=~xmin,
xmax=~xmax,
ymin=~ymin,
ymax=~ymax,
fill=~I(col)),
data=d,
alpha=1,
inherit.aes=FALSE) +
theme(panel.grid = element_blank())
}
p2
看看 d 的数据内容,热图数据:
# check d
d
# ymin ymax xmin xmax col
# 1 0 0.3 1 707 #A50F15
# 2 0 0.3 707 3004 #DE2D26
# 3 0 0.3 3004 5194 #FB6A4A
# 4 0 0.3 5194 7981 #FCAE91
# 5 0 0.3 7981 9461 #FEE5D9
# 6 0 0.3 9461 10972 #EFF3FF
# 7 0 0.3 10972 12304 #BDD7E7
# 8 0 0.3 12304 12409 #6BAED6
# 9 0 0.3 12409 12493 #3182BD
# 10 0 0.3 12493 12496 #08519C
添加所有基因排序
# add gene rank plot
df2 <- p$data
df2$y <- p$data$geneList[df2$x]
p.pos <- p + geom_segment(data=df2, aes_(x=~x, xend=~x, y=~y, yend=0),
color="grey")
p.pos <- p.pos + ylab("Ranked List Metric") +
xlab("Rank in Ordered Dataset") +
theme(plot.margin=margin(t = -.1, r = .2, b=.2, l=.2, unit="cm"))
p.pos
最终拼图
# last combine all plot
plotlist <- list(p.res, p2, p.pos)
n <- length(plotlist)
plotlist[[n]] <- plotlist[[n]] +
theme(axis.line.x = element_line(),
axis.ticks.x=element_line(),
axis.text.x = element_text())
# combine
aplot::plot_list(gglist = plotlist, ncol=1, heights=c(0.6,0.15,0.25))
主题背景我稍微修改了一下,可以看到最终结果和 gseaplot2 基本差不多了。
4结尾
大家可以好好看看每句代码是什么意思,就能明白数据怎么绘图的了。此外,针对个性化绘图只需要提取数据稍微修改一下即可。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
(微信交流群需收取20元入群费用(防止骗子和便于管理)
)。
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀scRNAtoolVis 绘制单细胞 Marker 基因均值表达热图
◀genesorteR 快速准确鉴定亚群 Marker 基因
◀...