GSEA 图是如何画出来的? (源码解析)

Original JunJunLab 老俊俊的生信笔记 2023-06-15

收录于合集 #R绘图 116个

梦幻与现实的撕裂感才是生活的感觉吗

1引言

前面我们讲到 Y叔的 clusterProfiler 包做 GSEA 富集分析及相应的可视化,有时候需要个性化绘图时却无能为力, 拿不到绘图数据就做不了这样的事情。GSEA 绘图推送过之前的文章如下:

跟着 NC 学画图: GSEA 富集结果可视化
提取 GSEA 富集结果绘制唯美 GSEA 图

我们去看看 gseaplot2 函数的源码,看看图是如何一步一步绘制出来的。

源码地址:

https://github.com/YuLab-SMU/enrichplot/blob/master/R/gseaplot.R

2常规富集分析

先做个 GSEA 的富集分析:

library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggplot2)
library(cowplot)

# load test data
data(geneList, package="DOSE")

# check
head(geneList)
# 4312     8318    10874    55143    55388      991
# 4.572613 4.514594 4.418218 4.144075 3.876258 3.677857

# GO enrich
ego3 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "BP",
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)

绘图:

# gseaplot2
gseaplot2(ego3, geneSetID = 1,
          title = ego3$Description[1])

图主要包括三个部分:

富集曲线
富集的基因排序线段
所有基因的排序及对应的差异倍数排序

3绘图解析

定义提取数据的函数

# export gseaScores function
gseaScores <- getFromNamespace("gseaScores", "DOSE")

# define function
gsInfo <- function(object, geneSetID) {
  geneList <- object@geneList

  if (is.numeric(geneSetID))
    geneSetID <- object@result[geneSetID, "ID"]

  geneSet <- object@geneSets[[geneSetID]]
  exponent <- object@params[["exponent"]]
  df <- gseaScores(geneList, geneSet, exponent, fortify=TRUE)
  df$ymin <- 0
  df$ymax <- 0
  pos <- df$position == 1
  h <- diff(range(df$runningScore))/20
  df$ymin[pos] <- -h
  df$ymax[pos] <- h
  df$geneList <- geneList

  df$Description <- object@result[geneSetID, "Description"]
  return(df)
}

提取数据:

# extract data
gsdata <- gsInfo(ego3, geneSetID = 1)

# check
head(gsdata,3)
#   x  runningScore position ymin ymax geneList                          Description
# 1 1 -8.090615e-05        0    0    0 4.572613 mitotic sister chromatid segregation
# 2 2 -1.618123e-04        0    0    0 4.514594 mitotic sister chromatid segregation
# 3 3 -2.427184e-04        0    0    0 4.418218 mitotic sister chromatid segregation

绘制一个空画板

# plot panel
p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) +
  # theme_classic(14) +
  theme_bw(14) +
  theme(panel.grid.major = element_line(colour = "grey92"),
        panel.grid.minor = element_line(colour = "grey92"),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  scale_x_continuous(expand=c(0,0))

p

添加曲线

# add line plot
ES_geom = "line"

if (ES_geom == "line") {
  es_layer <- geom_line(aes_(y = ~runningScore, color= ~Description),
                        size=1)
} else {
  es_layer <- geom_point(aes_(y = ~runningScore, color= ~Description),
                         size=1, data = subset(gsdata, position == 1))
}

# combine
p.res <- p + es_layer +
  theme(legend.position = c(.8, .8), legend.title = element_blank(),
        legend.background = element_rect(fill = "transparent"))

p.res <- p.res + ylab("Running Enrichment Score") +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.line.x=element_blank(),
        plot.margin=margin(t=.2, r = .2, b=0, l=.2, unit="cm"))

p.res

添加基因排序线段

# add segment plot
i <- 0
for (term in unique(gsdata$Description)) {
  idx <- which(gsdata$ymin != 0 & gsdata$Description == term)
  gsdata[idx, "ymin"] <- i
  gsdata[idx, "ymax"] <- i + 1
  i <- i + 1
  }

# plot
p2 <- ggplot(gsdata, aes_(x = ~x)) +
  geom_linerange(aes_(ymin=~ymin, ymax=~ymax),color= 'black') +
  xlab(NULL) + ylab(NULL) +
  # theme_classic(14) +
  theme_bw(14) +
  theme(legend.position = "none",
        plot.margin = margin(t=-.1, b=0,unit="cm"),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        axis.line.x = element_blank(),
        panel.grid = element_blank()) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0))

p2

添加热图和拼图

# add heatmap
geneSetID = 1

if(length(geneSetID) == 1) {
  v <- seq(1, sum(gsdata$position), length.out=9)
  inv <- findInterval(rev(cumsum(gsdata$position)), v)
  if (min(inv) == 0) inv <- inv + 1

  col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))

  ymin <- min(p2$data$ymin)
  yy <- max(p2$data$ymax - p2$data$ymin) * .3
  xmin <- which(!duplicated(inv))
  xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
  d <- data.frame(ymin = ymin, ymax = yy,
                  xmin = xmin,
                  xmax = xmax,
                  col = col[unique(inv)])
  p2 <- p2 + geom_rect(
    aes_(xmin=~xmin,
         xmax=~xmax,
         ymin=~ymin,
         ymax=~ymax,
         fill=~I(col)),
    data=d,
    alpha=1,
    inherit.aes=FALSE) +
    theme(panel.grid = element_blank())
}

p2

看看 d 的数据内容,热图数据:

# check d
d
#    ymin ymax  xmin  xmax     col
# 1     0  0.3     1   707 #A50F15
# 2     0  0.3   707  3004 #DE2D26
# 3     0  0.3  3004  5194 #FB6A4A
# 4     0  0.3  5194  7981 #FCAE91
# 5     0  0.3  7981  9461 #FEE5D9
# 6     0  0.3  9461 10972 #EFF3FF
# 7     0  0.3 10972 12304 #BDD7E7
# 8     0  0.3 12304 12409 #6BAED6
# 9     0  0.3 12409 12493 #3182BD
# 10    0  0.3 12493 12496 #08519C

添加所有基因排序

# add gene rank plot
df2 <- p$data
df2$y <- p$data$geneList[df2$x]
p.pos <- p + geom_segment(data=df2, aes_(x=~x, xend=~x, y=~y, yend=0),
                          color="grey")

p.pos <- p.pos + ylab("Ranked List Metric") +
  xlab("Rank in Ordered Dataset") +
  theme(plot.margin=margin(t = -.1, r = .2, b=.2, l=.2, unit="cm"))

p.pos

最终拼图

# last combine all plot
plotlist <- list(p.res, p2, p.pos)

n <- length(plotlist)

plotlist[[n]] <- plotlist[[n]] +
  theme(axis.line.x = element_line(),
        axis.ticks.x=element_line(),
        axis.text.x = element_text())

# combine
aplot::plot_list(gglist = plotlist, ncol=1, heights=c(0.6,0.15,0.25))