查看原文
其他

比原文献的还好看!绘制添加基因标签的火山图

莫北 基迪奥生物 2023-12-20

之前在《案例复现(一):表达数据的下载与差异分析》一文中已经介绍了如何从GEO数据库下载数据并进行基于limma的差异分析。完成差异分析后,文章作者首先使用火山图对差异分析结果进行可视化,如下图(A),共得到上调基因2496个,下调基因1130个。



那么,如何使用我们自己得到的差异分析数据,复现出原文的火山图呢?甚至绘制出更新颖更好看的火山图?下面继续为大家演示。


1. 读取数据


#读取差异分析表格;
exp <- read.csv("patient-control_diff_new.csv")
#预览数据;
head(exp)


2. 数据过滤


#去掉没有symbol号的探针记录;
expr1 <- exp[exp$GENE_SYMBOL!='',]
#查看筛选后的数据量;
dim(expr1)
#[1] 29833 6
#一个基因对应多个探针,继续过滤;
#查看基因个数;
length(unique(expr1$GENE_SYMBOL))
#[1] 21754
library(dplyr)
#根据平均表达量对表格进行降序排列;
expr2 <- arrange(expr1,desc(AveExpr))
#根据Gene symbol列移除包含重复的行;
expr3<- expr2 %>% distinct(GENE_SYMBOL, .keep_all = TRUE)
#查看表格的行列数;
dim(expr3)
#[1] 21754 6
#预览数据;
head(expr3)


3. 绘制火山图


#载入绘图相关的R包;
library(ggplot2)
library(ggrepel)
#转成tibble便于后续使用,去掉不需要的列;
dt <- as_tibble(expr3[c(6,4,3)])
#对p值取对数;
dt$log10PValue <- -log10(dt$P.Value)
#生成显著上下调数据的分组标签;
dt$group <- case_when(dt$logFC > 1 & dt$P.Value < 0.05 ~ "Up",
dt$logFC < -1 & dt$P.Value < 0.05 ~ "Down",
abs(dt$logFC) <= 1 ~ "None",
dt$P.Value >= 0.05 ~ "None")
head(dt)


#重新设置阈值(logFC=1.5),生成显著上下调数据的分组标签;
dt$group2 <- case_when(dt$logFC > 1.5 & dt$adj.P.Val < 0.05 ~ "Up",
dt$logFC < -1.5 & dt$adj.P.Val < 0.05 ~ "Down",
abs(dt$logFC) <= 1.5 ~ "None",
dt$adj.P.Val >= 0.05 ~ "None")
#提取上下调基因;
Up <- filter(dt,group2=="Up")
up_genes <- Up$GENE_SYMBOL
Down <- filter(dt,group2=="Down")
down_genes <- Down$GENE_SYMBOL
#确定上下调基因数量;
paste0("The number of up gene is ",length(up_genes))
#[1] "The number of up gene is 2483"
paste0("The number of down gene is ",length(down_genes))
#[1] "The number of down gene is 1165"

注意,可能由于上游分析过程中过滤条件、差异分析参数等存在差异,我这里的分析结果与文献原文的结果有些许差异,原文的上调基因数为2496,下调基因数为1130。


#获取表达差异最显著的10个基因;
top10sig <- filter(dt,group!="None") %>% distinct(GENE_SYMBOL,.keep_all = T) %>% top_n(10,abs(logFC))
top10sig


#将差异表达Top10的基因表格拆分成up和down两部分;
up <- filter(top10sig,group=="Up")
up
down <- filter(top10sig,group=="Down")
down


#新增一列,将Top10的差异基因标记为2,其他的标记为1;
dt$size <- case_when(!(dt$GENE_SYMBOL %in% top10sig$GENE_SYMBOL)~ 1,
dt$GENE_SYMBOL %in% top10sig$GENE_SYMBOL ~ 2)
head(dt)


#提取非Top10的基因表格;
df <- filter(dt,size==1)
#指定绘图顺序,将group列转成因子型;
df$group <- factor(df$group,
levels = c("Up","Down","None"),
ordered = T)
#开始绘图,建立映射;
p0 <-ggplot(data=df,aes(logFC,log10PValue,color=group))
#添加散点;
p1 <- p0+geom_point(size=1)
p1


#自定义半透明颜色(红绿);
mycolor <- c("#FF9999","#99CC00","gray80")
p2 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.9))
p2


#继续添加Top10基因对应的点;
p3 <- p2+geom_point(data=up,aes(logFC,log10PValue),
color="#FF9999",size=2,alpha=0.9)+
geom_point(data=down,aes(logFC,log10PValue),
color="#7cae00",size=2,alpha=0.9)
p3


#expansion函数设置坐标轴范围两端空白区域的大小;mult为“倍数”模式,add为“加性”模式;
p4<-p3+labs(y="-log10 P Value",x="log2FC")+
scale_y_continuous(expand=expansion(add = c(0.5, 0)),
limits = c(0, 12),
breaks = c(0,3,6,9,12),
label = c("0","3","6","9","12"))+
scale_x_continuous(limits = c(-12, 12),
breaks = c(-8,-4,0,4,8),
label = c("-8","-4","0","4","8"))
p4


#添加箭头;
set.seed(007)
p5 <- p4+geom_text_repel(data=top10sig,aes(logFC,log10PValue,label=GENE_SYMBOL),
force=20,color="grey20",
size=2,
point.padding = 0.5,hjust = 0.5,
arrow = arrow(length = unit(0.01, "npc"),
type = "open", ends = "last"),
segment.color="grey20",
segment.size=0.2,
segment.alpha=0.8,
nudge_x=0,
nudge_y=0)
p5


#自定义图表主题,对图表做精细调整;
top.mar=0.2
right.mar=0.2
bottom.mar=0.2
left.mar=0.2
#隐藏纵轴,并对字体样式、坐标轴的粗细、颜色、刻度长度进行限定;
mytheme<-theme_classic()+
theme(text=element_text(family = "sans",colour ="gray30",size = 10),
axis.line = element_line(size = 0.6,colour = "gray30"),
axis.ticks = element_line(size = 0.6,colour = "gray30"),
axis.ticks.length = unit(1.5,units = "mm"),
plot.margin=unit(x=c(top.mar,right.mar,bottom.mar,left.mar),
units="inches"))
#应用自定义主题;
p5+mytheme


#添加辅助线;
p6 <- p4+geom_hline(yintercept = c(-log10(0.05)),
size = 0.5,
color = "orange",
lty = "dashed")+
geom_vline(xintercept = c(-1,1),
size = 0.5,
color = "orange",
lty = "dashed")
p6


#添加其他样式的标签;
#为了方便自定义左右区域的标签,这里使用up、down两个独立的子表格;
p7 <- p6+geom_label_repel(
data = up,aes(logFC,log10PValue,label=GENE_SYMBOL),
nudge_x = 3,
nudge_y = -0.8,
color = "white",
alpha = 0.9,
point.padding = 0.5,
size = 2.5,
fill = "#96C93D",
segment.size = 0.5,
segment.color = "grey50",
direction = "y",
hjust = 0.5) +
geom_label_repel(
data = down,aes(logFC,log10PValue,label=GENE_SYMBOL),
nudge_x = -3,
nudge_y = 0.2,
color = "white",
alpha = 0.9,
point.padding = 0.5,
size = 2.5,
fill = "#9881F5",
segment.size = 0.5,
segment.color = "grey50",
direction = "y",
hjust = 0.5)
#应用自定义主题;
p8 <- p7+mytheme
p8


好啦,本次的绘图方法分享就到这里啦!


*未经许可,不得以任何方式复制或抄袭本篇文章之部分或全部内容。版权所有,侵权必究。


基迪奥生物|专业定制测序服务
联系方式:020-39341079;service@genedenovo.com扫码关注


继续滑动看下一个

您可能也对以下帖子感兴趣

文章有问题?点此查看未经处理的缓存