其他
GEO数据挖掘-第四期-肝细胞癌(HCC)
GEO数据挖掘系列文-第四期-肝细胞癌
疾病背景
肝细胞癌(HCC),起源于肝细胞,是全球第七大常见癌症。
每年全球约诊断HCC病例50万例,约占所有癌症病例的5.4%,每年因肝细胞癌而死亡得人数约70万人。
绝大多数HCC发生在亚洲和撒哈拉以南非洲,美国和其他发展中国家的HCC发病率正在上升。
GEO数据框:GSE54238
◎ 10个正常样本 (NL)
◎ 10个慢性肝炎 (IL)
◎ 10个肝硬化 (CL)
◎ 13个早期肝细胞癌 (eHCC)
◎ 13个晚期肝细胞癌 (aHCC)
■ ■ ■
这一期还是WGCNA。。。。。。
rm( list = ls() )
load( './gset.Rdata' )
load('./exprSet_for_WGCNA.Rdata')
options( stringsAsFactors = F )
◎ gset包含原始数据
◎ exprSet是已经处理并注释好的数据集
第一步 准备样本表型数据和数据集
library( "GEOquery" )
## 取表达矩阵和样本信息表
{
gset = gset[[1]]
pdata = pData( gset )
}
colnames(pdata)
hclust_table = pdata[,38:42]
colnames(hclust_table) = c("age", "condition", "gender", "hbeag", "hbsag")
hclust_table[hclust_table$gender=="M", 3] = 1
## 这样就得到了一个样本特性的数据框
## age condition gender hbeag hbsag
## GSM1310758 44 normal liver 1 Negative Negative
## GSM1310759 41 normal liver 1 Negative Negative
## GSM1310760 49 normal liver 1 Negative Negative
exprSet[1:5,1:5]
## GSM1310758 GSM1310759 GSM1310760 GSM1310761 GSM1310762
## FAM138C 6.944570 6.248560 7.526373 8.255203 8.717276
## FLJ39609 6.594915 6.269244 6.801226 6.705848 6.784622
## LOC100128842 9.240880 9.793829 9.232292 8.533079 8.549601
## LOC148413 10.696339 10.126397 10.274773 9.017882 8.310329
## LOC100128003 7.156712 8.209122 7.413724 6.254033 6.980212
group_list = as.character( pdata[, 8] )
table( group_list )
## advanced HCC chronic inflammatory liver cirrhotic livers
## 13 10 10
## early HCC normal liver
## 13 10
colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' )
exprSet[1:5,1:5]
## normal liver_1 normal liver_2 normal liver_3 normal liver_4
## FAM138C 6.944570 6.248560 7.526373 8.255203
## FLJ39609 6.594915 6.269244 6.801226 6.705848
## LOC100128842 9.240880 9.793829 9.232292 8.533079
## LOC148413 10.696339 10.126397 10.274773 9.017882
## LOC100128003 7.156712 8.209122 7.413724 6.254033
# 计算数据集的每个基因的平均绝对离差,取排在前面最大的5000个基因。
datExpr = t(exprSet[order(apply(exprSet,1,mad), decreasing = T)[1:5000],])
# 上面这步实际就是把一个基因在个样本间表达变化最大的基因跳出来,继续下面的分析
dim( datExpr )
# [1] 56 5000
## 接下来就是对这56个样本的5000个基因做分析
第二步 绘制进化树
library("WGCNA")
{
# 样本信息不同特征值赋予不同的颜色
status_colors <- numbers2colors(as.numeric(factor(hclust_table$condition)),
colors = c("red","white","pink"),signed = FALSE)
age_colors <- numbers2colors(as.numeric(factor(hclust_table$age)),
colors = c("red","white","pink"),signed = FALSE)
sex_colors[,1] <- rep( 'white', 56 )
hbeag_colors <- numbers2colors(as.numeric(factor(hclust_table$hbeag)),
colors = c("red","white"),signed = FALSE)
hbsag_colors <- numbers2colors(as.numeric(factor(hclust_table$hbsag)),
colors = c("red","white"),signed = FALSE)
par(mar = c(1,4,3,1),cex=0.8)
plotDendroAndColors(datExpr_tree, cbind(status_colors, age_colors, sex_colors, hbeag_colors, hbsag_colors),
groupLabels = colnames(hclust_table),
cex.dendroLabels = 0.8,
marAll = c(1, 4, 3, 1),
cex.rowText = 0.01,
main = "Sample dendrogram and trait heatmap")
}
第三步 确定软阀值
powers = c( c(1:10), seq(12, 20, by = 2) )
## [1] 1 2 3 4 5 6 7 8 9 10 12 14 16 18 20
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 0)
str(sft)
par(mfrow = c(1,2))
cex1 = 0.9
## 两种方法挑选软阀值
# Scale independence
plot(sft$fitIndices$Power,
-sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",
type="n",
main = paste("Scale independence"))
text(sft$fitIndices$Power,
-sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,
labels=powers,
cex=cex1,
col="red")
abline(h=0.85,col="RED")
# Mean connectivity
plot(sft$fitIndices$Power,
sft$fitIndices$mean.k.,
xlab="Soft Threshold (power)",
ylab="Mean Connectivity",
type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices$Power,
sft$fitIndices$mean.k.,
labels=powers,
cex=cex1,
col="red")
best_beta=sft$powerEstimate
第四步 构建共表达矩阵
## blockwiseModules自动网络构建和模块检测,单个基因水平上的
net = blockwiseModules(datExpr, power = best_beta,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "HCCTOM",
verbose = 3)
table(net$colors)
# 这里的color是标签,下一步转换为对应的颜色
moduleColors <- labels2colors(net$colors)
# 计算模块特征基因,为第一主成分
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
# 计算根据模块特征基因计算模块相异度
MEDiss = 1-cor(MEs0)
# 计算模块特征基因间的距离,并聚类
METree = hclust(as.dist(MEDiss), method = "average");
plot(METree,
main = "Clustering of module eigengenes",
xlab = "",
sub = "")
# 在聚类图中画出剪切线
MEDissThres = 0.25
abline(h=MEDissThres, col = "red")
# 合并特征基因相近的模块,特征基因水平上的
merge_modules = mergeCloseModules(datExpr, moduleColors, cutHeight = MEDissThres, verbose = 3)
mergedColors = merge_modules$colors
mergedMEs = merge_modules$newMEs
plotDendroAndColors(net$dendrograms[[1]], cbind(moduleColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
第五步 绘制TOM图
# 每个块中基因的层次聚类树状图
geneTree = net$dendrograms[[1]]
dissTOM = 1-TOMsimilarityFromExpr(datExpr)
set.seed(10)
select = sample(ncol(datExpr), size = 400)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
sizeGrWindow(9,9)
plotDiss = selectTOM^7
diag(plotDiss) = NA
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
第六步 模块与性状的关系
design=model.matrix(~0+ group_list)
colnames(design)=levels(factor(group_list))
# 根据相关性重新排序
MEs = orderMEs(MEs0)
# 计算两个相关性值
moduleTraitCor = cor(MEs, design , use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
sizeGrWindow(10,6)
textMatrix = paste(signif(moduleTraitCor, 2), "
(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# 可视化
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(hclust_table[,1:5]),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
sizeGrWindow(5,7.5);
par(cex = 0.9)
# 特征基因网络图
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8,
xLabelsAngle= 90)
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
## 聚类图
plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
## 性状与模块热图
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
第七步 对感兴趣模块的进一步分析
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
## 算出每个模块跟特征基因的皮尔森相关系数矩阵
## MEs是每个模块在每个样本里面的值
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
geneTraitSignificance = as.data.frame(cor(datExpr,use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
module = "turquoise"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for condition",
main = paste("Module membership vs. gene significance
"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
module = "blue"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for condition",
main = paste("Module membership vs. gene significance
"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
我们提供GEO线上视频课程,以及课后答疑服务。
如有意向,请扫描下面的二维码与我联系。
(长按图片扫描二维码可添加微信)
点击阅读原文直达我们的网易云课堂购买链接。
请务必思考再三后再购买!