有PIK3CA基因突变的肿瘤病人的转录水平变化
这期要对既有肿瘤样本又有正常样本,并且还检出了PIK3CA突变的病人进行差异分析
首先就是下载突变信息
UCSC Xena数据库下载
UCSC Xena网址:https://xena.ucsc.edu/public-hubs/
挑选出有PIK3CA突变的样本
载入下载好的突变信息文件
mutype_file <- read.table('TCGA-BRCA.mutect2_snv.tsv.gz',header = T,sep = '\t',quote = '')
## 取出含有'PIK3CA'信息的行
PIK3CA <- mutype_file[mutype_file$gene=='PIK3CA',]
PIK3CA_sample <- unique(sort(PIK3CA$Sample_ID))
save(PIK3CA_sample,file = 'all_sample.Rdata')
到这里,我们就从986个样本中挑出了327个有PIK3CA突变的样本
从Xena下载到的表达矩阵不是可以直接用的,我们要先把它处理一下
library(data.table)
a=fread('TCGA-BRCA.htseq_counts.tsv',sep = ' ',header = T)
a=as.data.frame(a)
a[1:4,1:4]
rownames(a)=a[,1]
a=a[,-1]
genes=rownames(a)
a[1:4,1:4]
## 在数据的介绍页面上我们已经得知了数据的计算方法现在我们只要把它还原回去就可以了
a=2^a-1
a[1:4,1:4]
## 接下来我们就要取出表达数据,但是我们只要这327个tnbc样本中成对的样本,## 即,同一个病人既有正常样本,又有肿瘤样本的表达信息
load('all_sample.Rdata')
PIK3CA_p=substring(PIK3CA_sample,1,12)
all_p=substring(colnames(a),1,12)
paired_p=names(table(all_p)[table(all_p)==2])
## 这样挑选过后,符合要求的样本就只有35个了
need_p=intersect(PIK3CA_p,paired_p)
exprSet=a[,all_p %in% need_p]
tmp=apply(exprSet,1,function(x){
sum(x==0) < 10
})
exprSet=exprSet[tmp,]
save(exprSet,file = 'PIK3CA_paired_exprSet.Rdata')
差异分析
## 接下来做差异分析
Rdata_dir='.'
load( file =
file.path(Rdata_dir,'PIK3CA_paired_exprSet.Rdata')
)
dim(exprSet)
## 根据TCGA样本的命名可以区分正常组织和肿瘤样本的测序结果
group_list=factor(ifelse(as.numeric(substr(colnames(exprSet),14,15)) < 10,'tumor','normal'))
table(group_list)
group_list
## 有两个病人没有normal对照,诡异的是它们居然测了两次tumor组织,可能 是复发或者其它生物学现象,我们这次的分析需要删除他们的数据
exprSet <- exprSet[,-c(5,6,55,56)]
group_list=factor(ifelse(as.numeric(substr(colnames(exprSet),14,15)) < 10,'brca','no'))
table(group_list)
## 对表达差异结果数据取整
exprSet=ceiling(exprSet)
这个时候exprSet结果就可以进行差异分析了
## 方法一:DESeq2
if(T){
library(DESeq2)
(colData <- data.frame(row.names=colnames(exprSet),
group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = exprSet,
colData = colData,
design = ~ group_list)
tmp_f=file.path(Rdata_dir,'TCGA-PIK3CA-miRNA-DESeq2-dds.Rdata')
if(!file.exists(tmp_f)){
dds <- DESeq(dds)
save(dds,file = tmp_f)
}
load(file = tmp_f)
res <- results(dds,
contrast=c("group_list","tumor","normal"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG =as.data.frame(resOrdered)
DESeq2_DEG = na.omit(DEG)
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
}
## 方法二:edgeR
if(T){
library(edgeR)
d <- DGEList(counts=exprSet,group=factor(group_list))
keep <- rowSums(cpm(d)>1) >= 2
table(keep)
d <- d[keep, , keep.lib.sizes=FALSE]
d$samples$lib.size <- colSums(d$counts)
d <- calcNormFactors(d)
d$samples
dge=d
design <- model.matrix(~0+factor(group_list))
rownames(design)<-colnames(dge)
colnames(design)<-levels(factor(group_list))
dge=d
dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
# https://www.biostars.org/p/110861/
lrt <- glmLRT(fit, contrast=c(-1,1))
nrDEG=topTags(lrt, n=nrow(dge))
nrDEG=as.data.frame(nrDEG)
head(nrDEG)
edgeR_DEG =nrDEG
nrDEG=edgeR_DEG[,c(1,5)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
}
## 方法三:limma
if(T){
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design
dge <- DGEList(counts=exprSet)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log=TRUE, prior.count=3)
v <- voom(dge,design,plot=TRUE, normalize="quantile")
fit <- lmFit(v, design)
group_list
cont.matrix=makeContrasts(contrasts=c('tumor-normal'),levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
tempOutput = topTable(fit2, coef='tumor-normal', n=Inf)
DEG_limma_voom = na.omit(tempOutput)
head(DEG_limma_voom)
nrDEG=DEG_limma_voom[,c(1,4)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
}
## 比较一下这三个差异分析的结果
nrDEG1=DEG_limma_voom[,c(1,4)]
colnames(nrDEG1)=c('log2FoldChange','pvalue')
nrDEG2=edgeR_DEG[,c(1,5)]
colnames(nrDEG2)=c('log2FoldChange','pvalue')
nrDEG3=DESeq2_DEG[,c(2,6)]
colnames(nrDEG3)=c('log2FoldChange','pvalue')
mi=unique(c(rownames(nrDEG1),rownames(nrDEG1),rownames(nrDEG1)))
lf=data.frame(limma=nrDEG1[mi,1],
edgeR=nrDEG2[mi,1],
DESeq2=nrDEG3[mi,1])
cor(na.omit(lf))
limma edgeR DESeq2
limma 1.0000000 0.9215024 0.9534569
edgeR 0.9215024 1.0000000 0.9643088
DESeq2 0.9534569 0.9643088 1.0000000
注:热图里面的基因应该是以symbol表示,而不是上图的ensembl ID
注释
library( "clusterProfiler" )
library( "org.Hs.eg.db" )
nrDEG = nrDEG1
nrDEG = nrDEG2
nrDEG <- DEG_limma_voom
colnames(DESeq2_DEG)[c(2,5)]=c('logFC','P.Value')
nrDEG <- DESeq2_DEG
colnames(edgeR_DEG)[c(1,4)]=c('logFC','P.Value')
nrDEG <- edgeR_DEG
logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )
logFC_cutoff
logFC_cutoff = 1.2
nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.05 & abs(nrDEG$logFC) > logFC_cutoff,
ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) )
table(nrDEG$change)
keytypes(org.Hs.eg.db)
library("stringr")
rownames( nrDEG ) <- str_sub(rownames( nrDEG ), start = 1, end = 15)
nrDEG$ENSEMBL <- rownames( nrDEG )
df <- bitr( rownames( nrDEG ), fromType = "ENSEMBL", toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )
head( df )
{
nrDEG$SYMBOL = rownames( nrDEG )
nrDEG = merge( nrDEG, df, by='ENSEMBL' )
}
head( nrDEG )
{
gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ]
gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
gene_diff = c( gene_up, gene_down )
gene_all = as.character(nrDEG[ ,'ENTREZID'] )
}
{
geneList = nrDEG$logFC
names( geneList ) = nrDEG$ENTREZID
geneList = sort( geneList, decreasing = T )
}
{
## KEGG pathway analysis
kk.up <- enrichKEGG( gene = gene_up ,
organism = 'hsa' ,
universe = gene_all ,
pvalueCutoff = 0.8 ,
qvalueCutoff = 0.8 )
kk.down <- enrichKEGG( gene = gene_down ,
organism = 'hsa' ,
universe = gene_all ,
pvalueCutoff = 0.05 )
}
head( kk.up )[ ,1:6 ]
head( kk.down )[ ,1:6 ]
library( "ggplot2" )
{
kegg_down_dt <- as.data.frame( kk.down )
kegg_up_dt <- as.data.frame( kk.up )
down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]
down_kegg$group = -1
up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]
up_kegg$group = 1
dat = rbind( up_kegg, down_kegg )
dat$pvalue = -log10( dat$pvalue )
dat$pvalue = dat$pvalue * dat$group
dat = dat[ order( dat$pvalue, decreasing = F ), ]
g_kegg <- ggplot( dat,
aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) +
geom_bar( stat = "identity" ) +
scale_fill_gradient( low = "blue", high = "red", guide = FALSE ) +
scale_x_discrete( name = "Pathway names" ) +
scale_y_continuous( name = "log10P-value" ) +
coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) +
ggtitle( "Pathway Enrichment" )
print( g_kegg )
ggsave( g_kegg, filename = 'kegg_up_down_edgeR.png' )
}
可以肯定上下调基因集经过超几何分布的统计学检验后有着 共同的KEGG结果,需要有更高级的展现方式!
KEGG通路几乎没有差异,说明找到的差异基因很一致
现在只能说明我们使用的3种R包找差异比较一致,但事实上这个是早有定论,我们只是在数据上面验证了一下,这个时候感兴趣的应该是有PIK3CA基因突变的那些病人和并没有突变的那些病人的差异分析的结果的区别,留给读者去实现吧!
■ ■ ■
生信基础知识大全系列:生信基础知识100讲
史上最强的生信自学环境准备课来啦!! 7次改版,11节课程,14K的讲稿,30个夜晚打磨,100页PPT的课程。
如果需要组装自己的服务器;代办生物信息学服务器
如果需要帮忙下载海外数据(GEO/TCGA/GTEx等等),点我?
如果需要线下辅导及培训,看招学徒
如果需要个人电脑:个人计算机推荐
如果需要置办生物信息学书籍,看:生信人必备书单
如果需要实习岗位:实习职位发布
如果需要售后:点我
如果需要入门资料大全:点我
点击下面的阅读原文直达我的github