七步走纯R代码通过数据挖掘复现一篇实验文章(第七步WGCNA)
最近给新的学徒布置了去年学徒的数据挖掘任务,让我意外的是一个影像学专业大三学生表现最好,两天就表现优异的完成了作业,而且项目代码组织习惯也非常棒,任务拆解为7个步骤,条理清晰,逻辑流畅,值得分享!
如果你也想向优秀的小伙伴看齐,欢迎加入生信技能树超级VIP计划(生信技能树超级VIP入场券发放(人民币一万起)),如果你时间有限,也可以考虑参与我们的全国巡讲:全国巡讲第14、15站-兰州和贵阳(生信入门课全新改版)(早鸟9折优惠最后两天)
下面看学徒表演
本期我们的任务是全代码复现一篇paper,标题为 :Co-expression networks revealed potential core lncRNAs in the triple-negative breast cancer. PMID:27380926 (大家需要自行阅读文献才明白jimmy交给学徒的任务!)文章里面是自己测了8个TNBC病人的转录组然后分析,但是我们有TCGA数据库,所以可以复现,这就是为什么标题是七步走纯R代码通过数据挖掘复现一篇实验文章!
因为排版限制,前面的1-6步内容在本期推文的头条!
step7.WGCNA
WGCNA分析完全是照搬jimmy老师三年前的教程,要提醒大家的是要用别人的教程一定要看清楚别人数据格式,行名,列名,然后做成绝对一致的格式,这样会很大程度的减少报错!
rm(list = ls())
load(file = "../02_data/TCGA_TNBC_pair_annotation_result.Rdata")
# step1_构建WGCNA所需表达矩阵
--------------------------------------------------------------------------------
exprSet_lncRNA[1:3,1:3]
# TCGA-BH-A0B3-01A TCGA-BH-A0B3-11B TCGA-BH-A18V-01A
# IGF2-AS 4.201151 4.945464 3.167927
# AL136982.1 2.628749 5.484643 4.256068
# SHANK2-AS3 2.628749 2.628749 3.552230
exprSet_mRNA[1:3,1:3]
# TCGA-BH-A0B3-01A TCGA-BH-A0B3-11B TCGA-BH-A18V-01A
# TNMD 4.443206 8.357049 5.006992
# DBNDD1 11.148636 8.672442 12.093123
# TFPI 8.780552 12.019202 10.205594
exprSet_lncRNA = as.data.frame(t(exprSet_lncRNA))
exprSet_mRNA = as.data.frame(t(exprSet_mRNA))
exprSet_lncRNA[1:3,1:3]
# IGF2-AS AL136982.1 SHANK2-AS3
# TCGA-BH-A0B3-01A 4.201151 2.628749 2.628749
# TCGA-BH-A0B3-11B 4.945464 5.484643 2.628749
# TCGA-BH-A18V-01A 3.167927 4.256068 3.552230
exprSet_mRNA[1:3,1:3]
# TNMD DBNDD1 TFPI
# TCGA-BH-A0B3-01A 4.443206 11.148636 8.780552
# TCGA-BH-A0B3-11B 8.357049 8.672442 12.019202
# TCGA-BH-A18V-01A 5.006992 12.093123 10.205594
sample = rownames(exprSet_mRNA)
sample = rownames(exprSet_lncRNA)
tmp = merge(exprSet_mRNA,exprSet_lncRNA, by = "sample")
tmp[1:3,1:3]
# sample TNMD DBNDD1
# 1 TCGA-BH-A0B3-01A 4.443206 11.148636
# 2 TCGA-BH-A0B3-11B 8.357049 8.672442
# 3 TCGA-BH-A18V-01A 5.006992 12.093123
rownames(tmp) = tmp$sample
tmp = tmp[,-1]
tmp[1:3,1:3]
# TNMD DBNDD1 TFPI
# TCGA-BH-A0B3-01A 4.443206 11.148636 8.780552
# TCGA-BH-A0B3-11B 8.357049 8.672442 12.019202
# TCGA-BH-A18V-01A 5.006992 12.093123 10.205594
-----------------------------------------------------------------------------------
group_list=factor(ifelse(as.numeric(substr(rownames(exprSet_mRNA),14,15)) < 10,'tumor','normal'))
table(group_list)
# group_list
# normal tumor
# 8 8
datTraits <- data.frame(row.names=rownames(exprSet_mRNA),
subtype=group_list)
library(WGCNA)
rt = tmp
datExpr = rt
nSamples = nrow(datExpr)
step2_找到合适的beta值
---------------------------------------------------------------------------------------
powers1=c(seq(1,10,by=1),seq(12,30,by=2))
RpowerTable=pickSoftThreshold(datExpr, powerVector=powers1)[[2]]
cex1=0.7
pdf(file="softThresholding.pdf")
par(mfrow=c(1,2))
plot(RpowerTable[,1], -sign(RpowerTable[,3])*RpowerTable[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n")
text(RpowerTable[,1], -sign(RpowerTable[,3])*RpowerTable[,2], labels=powers1,cex=cex1,col="red")
this line corresponds to using an R^2 cut-off of h
abline(h=0.85,col="red")
plot(RpowerTable[,1], RpowerTable[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n")
text(RpowerTable[,1], RpowerTable[,5], labels=powers1, cex=cex1,col="red")
beta1=14
step3_对基因聚类成模块并且可视化
---------------------------------------------------------------------------------------
net = blockwiseModules(
datExpr,
power = 14,
maxBlockSize = 6000,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "AS-green-FPKM-TOM",
verbose = 3
)
# Calculating module eigengenes block-wise from all genes
colors)
#
# 0 1 2 3
# 78 1433 1300 500
mergedColors = labels2colors(net$colors)
table(mergedColors)
# mergedColors
# blue brown grey turquoise
# 1300 500 78 1433
将目标基因聚成几个模块
dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# step4_将样本信息添加进去,分析样本性状与模块的关系
# --------------------------------------------------------------------------
design=model.matrix(~0+ datTraits$subtype)
design = as.data.frame(design)
colnames(design)=levels(datTraits$subtype)
moduleColors <- labels2colors(net$colors)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, design , use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(design),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
# step5_感兴趣性状的模块的具体基因分析
# ---------------------------------------------------------------------------------
## 首先计算模块与基因的相关性矩阵
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="")
## 再计算性状与基因的相关性矩阵
tumor = as.data.frame(design[,2])
names(tumor) = "tumor"
geneTraitSignificance = as.data.frame(cor(datExpr, tumor, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(tumor), sep="")
names(GSPvalue) = paste("p.GS.", names(tumor), sep="")
## 最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析
module = "blue"
module
## [1] "blue"
column = match(module, modNames);
moduleGenes = moduleColors==module;
# step6_网络的可视化
# -----------------------------------------------------------------------------------
## 首先针对所有基因画热图
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]];
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 14);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
# TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
## 最后画模块和性状的关系
# Recalculate module eigengenes
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
## 这里把是否属于 tumor 表型这个变量用0,1进行数值化。
tumor = as.data.frame(design[,2])
names(tumor) = "tumor"
# Add the weight to existing module eigengenes
MET = orderMEs(cbind(MEs, tumor))
# Plot the relationships among the eigengenes and the trait
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2),
marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)
# Plot the dendrogram
## 模块与性状的聚类图
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
## 性状与模块热图
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
# step7_提取指定模块的基因名
# --------------------------------------------------------------------------------------
# Select module
module = "blue"
# Select module probes
probes = colnames(datExpr) ## 我们例子里面的probe就是基因名
inModule = (moduleColors==module);
modProbes = probes[inModule];
head(modProbes)
## [1] "SLC7A2" "TAC1" "DLX6" "DBF4" "TKTL1" "E2F2"
# Step9: 模块的导出
# --------------------------------------------------------------------------------------
# Recalculate topological overlap
TOM = TOMsimilarityFromExpr(datExpr, power = 14);
module = "brown";
# Select module probes
probes = colnames(datExpr) ## 我们例子里面的probe就是基因名
inModule = (moduleColors==module);
modProbes = probes[inModule];
## 也是提取指定模块的基因名
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
## 模块对应的基因关系矩阵
cyt = exportNetworkToCytoscape(
modTOM,
edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
nodeAttr = moduleColors[inModule]
)
最后分享这篇教程的心得:遇到问题勤找Google~
另外,本文是rmarkdown格式,如果需要文章pdf以及rmarkdown源代码和网页报告,需要后台回复:优秀学徒,即可获取!
全国巡讲约你
第1-11站北上广深杭,西安,郑州, 吉林,武汉,成都,港珠澳(全部结束)
一年一度的生信技能树单细胞线下培训班(已结束)
全国巡讲第12站-北京(下一站杭州)(已结束)