有PIK3CA基因突变的肿瘤病人的转录水平变化

Original 生信技能树生信菜鸟团 2022-06-06

这期要对既有肿瘤样本又有正常样本，并且还检出了PIK3CA突变的病人进行差异分析

首先就是下载突变信息

UCSC Xena数据库下载

UCSC Xena网址：https://xena.ucsc.edu/public-hubs/

挑选出有PIK3CA突变的样本

载入下载好的突变信息文件

mutype_file <- read.table('TCGA-BRCA.mutect2_snv.tsv.gz',header = T,sep = '\t',quote = '')

## 取出含有'PIK3CA'信息的行

PIK3CA <- mutype_file[mutype_file$gene=='PIK3CA',]

PIK3CA_sample <- unique(sort(PIK3CA$Sample_ID))

save(PIK3CA_sample,file = 'all_sample.Rdata')

到这里，我们就从986个样本中挑出了327个有PIK3CA突变的样本

从Xena下载到的表达矩阵不是可以直接用的，我们要先把它处理一下

library(data.table)

a=fread('TCGA-BRCA.htseq_counts.tsv',sep = ' ',header = T)

a=as.data.frame(a)

a[1:4,1:4]

rownames(a)=a[,1]

a=a[,-1]

genes=rownames(a)

a[1:4,1:4]

## 在数据的介绍页面上我们已经得知了数据的计算方法现在我们只要把它还原回去就可以了

a=2^a-1

a[1:4,1:4]

## 接下来我们就要取出表达数据，但是我们只要这327个tnbc样本中成对的样本，## 即，同一个病人既有正常样本，又有肿瘤样本的表达信息

load('all_sample.Rdata')

PIK3CA_p=substring(PIK3CA_sample,1,12)

all_p=substring(colnames(a),1,12)

paired_p=names(table(all_p)[table(all_p)==2])

## 这样挑选过后，符合要求的样本就只有35个了

need_p=intersect(PIK3CA_p,paired_p)

exprSet=a[,all_p %in% need_p]

tmp=apply(exprSet,1,function(x){

sum(x==0) < 10

})

exprSet=exprSet[tmp,]

save(exprSet,file = 'PIK3CA_paired_exprSet.Rdata')

差异分析

## 接下来做差异分析

Rdata_dir='.'

load( file =

file.path(Rdata_dir,'PIK3CA_paired_exprSet.Rdata')

)

dim(exprSet)

## 根据TCGA样本的命名可以区分正常组织和肿瘤样本的测序结果

group_list=factor(ifelse(as.numeric(substr(colnames(exprSet),14,15)) < 10,'tumor','normal'))

table(group_list)

group_list

## 有两个病人没有normal对照，诡异的是它们居然测了两次tumor组织，可能是复发或者其它生物学现象，我们这次的分析需要删除他们的数据

exprSet <- exprSet[,-c(5,6,55,56)]

group_list=factor(ifelse(as.numeric(substr(colnames(exprSet),14,15)) < 10,'brca','no'))

table(group_list)

## 对表达差异结果数据取整

exprSet=ceiling(exprSet)

这个时候exprSet结果就可以进行差异分析了

## 方法一：DESeq2

if(T){

library(DESeq2)

(colData <- data.frame(row.names=colnames(exprSet),

group_list=group_list) )

dds <- DESeqDataSetFromMatrix(countData = exprSet,

colData = colData,

design = ~ group_list)

tmp_f=file.path(Rdata_dir,'TCGA-PIK3CA-miRNA-DESeq2-dds.Rdata')

if(!file.exists(tmp_f)){

dds <- DESeq(dds)

save(dds,file = tmp_f)

}

load(file = tmp_f)

res <- results(dds,

contrast=c("group_list","tumor","normal"))

resOrdered <- res[order(res$padj),]

head(resOrdered)

DEG =as.data.frame(resOrdered)

DESeq2_DEG = na.omit(DEG)

nrDEG=DESeq2_DEG[,c(2,6)]

colnames(nrDEG)=c('log2FoldChange','pvalue')

}

## 方法二：edgeR

if(T){

library(edgeR)

d <- DGEList(counts=exprSet,group=factor(group_list))

keep <- rowSums(cpm(d)>1) >= 2

table(keep)

d <- d[keep, , keep.lib.sizes=FALSE]

d$samples$lib.size <- colSums(d$counts)

d <- calcNormFactors(d)

d$samples

dge=d

design <- model.matrix(~0+factor(group_list))

rownames(design)<-colnames(dge)

colnames(design)<-levels(factor(group_list))

dge=d

dge <- estimateGLMCommonDisp(dge,design)

dge <- estimateGLMTrendedDisp(dge, design)

dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)

# https://www.biostars.org/p/110861/

lrt <- glmLRT(fit, contrast=c(-1,1))

nrDEG=topTags(lrt, n=nrow(dge))

nrDEG=as.data.frame(nrDEG)

head(nrDEG)

edgeR_DEG =nrDEG

nrDEG=edgeR_DEG[,c(1,5)]

colnames(nrDEG)=c('log2FoldChange','pvalue')

}

## 方法三：limma

if(T){

suppressMessages(library(limma))

design <- model.matrix(~0+factor(group_list))

colnames(design)=levels(factor(group_list))

rownames(design)=colnames(exprSet)

design

dge <- DGEList(counts=exprSet)

dge <- calcNormFactors(dge)

logCPM <- cpm(dge, log=TRUE, prior.count=3)

v <- voom(dge,design,plot=TRUE, normalize="quantile")

fit <- lmFit(v, design)

group_list

cont.matrix=makeContrasts(contrasts=c('tumor-normal'),levels = design)

fit2=contrasts.fit(fit,cont.matrix)

fit2=eBayes(fit2)

tempOutput = topTable(fit2, coef='tumor-normal', n=Inf)

DEG_limma_voom = na.omit(tempOutput)

head(DEG_limma_voom)

nrDEG=DEG_limma_voom[,c(1,4)]

colnames(nrDEG)=c('log2FoldChange','pvalue')

}

## 比较一下这三个差异分析的结果

nrDEG1=DEG_limma_voom[,c(1,4)]

colnames(nrDEG1)=c('log2FoldChange','pvalue')

nrDEG2=edgeR_DEG[,c(1,5)]

colnames(nrDEG2)=c('log2FoldChange','pvalue')

nrDEG3=DESeq2_DEG[,c(2,6)]

colnames(nrDEG3)=c('log2FoldChange','pvalue')

mi=unique(c(rownames(nrDEG1),rownames(nrDEG1),rownames(nrDEG1)))

lf=data.frame(limma=nrDEG1[mi,1],

edgeR=nrDEG2[mi,1],

DESeq2=nrDEG3[mi,1])

cor(na.omit(lf))

limma edgeR DESeq2

limma 1.0000000 0.9215024 0.9534569

edgeR 0.9215024 1.0000000 0.9643088

DESeq2 0.9534569 0.9643088 1.0000000

注：热图里面的基因应该是以symbol表示，而不是上图的ensembl ID

注释

library( "clusterProfiler" )

library( "org.Hs.eg.db" )

nrDEG = nrDEG1

nrDEG = nrDEG2

nrDEG <- DEG_limma_voom

colnames(DESeq2_DEG)[c(2,5)]=c('logFC','P.Value')

nrDEG <- DESeq2_DEG

colnames(edgeR_DEG)[c(1,4)]=c('logFC','P.Value')

nrDEG <- edgeR_DEG

logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )

logFC_cutoff

logFC_cutoff = 1.2

nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.05 & abs(nrDEG$logFC) > logFC_cutoff,

ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) )

table(nrDEG$change)

keytypes(org.Hs.eg.db)

library("stringr")

rownames( nrDEG ) <- str_sub(rownames( nrDEG ), start = 1, end = 15)

nrDEG$ENSEMBL <- rownames( nrDEG )

df <- bitr( rownames( nrDEG ), fromType = "ENSEMBL", toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )

head( df )

{

nrDEG$SYMBOL = rownames( nrDEG )

nrDEG = merge( nrDEG, df, by='ENSEMBL' )

}

head( nrDEG )

{

gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ]

gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]

gene_diff = c( gene_up, gene_down )

gene_all = as.character(nrDEG[ ,'ENTREZID'] )

}

{

geneList = nrDEG$logFC

names( geneList ) = nrDEG$ENTREZID

geneList = sort( geneList, decreasing = T )

}

{

## KEGG pathway analysis

kk.up <- enrichKEGG( gene = gene_up ,

organism = 'hsa' ,

universe = gene_all ,

pvalueCutoff = 0.8 ,

qvalueCutoff = 0.8 )

kk.down <- enrichKEGG( gene = gene_down ,

organism = 'hsa' ,

universe = gene_all ,

pvalueCutoff = 0.05 )

}

head( kk.up )[ ,1:6 ]

head( kk.down )[ ,1:6 ]

library( "ggplot2" )

{

kegg_down_dt <- as.data.frame( kk.down )

kegg_up_dt <- as.data.frame( kk.up )

down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]

down_kegg$group = -1

up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]

up_kegg$group = 1

dat = rbind( up_kegg, down_kegg )

dat$pvalue = -log10( dat$pvalue )

dat$pvalue = dat$pvalue * dat$group

dat = dat[ order( dat$pvalue, decreasing = F ), ]

g_kegg <- ggplot( dat,

aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) +

geom_bar( stat = "identity" ) +

scale_fill_gradient( low = "blue", high = "red", guide = FALSE ) +

scale_x_discrete( name = "Pathway names" ) +

scale_y_continuous( name = "log10P-value" ) +

coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) +

ggtitle( "Pathway Enrichment" )

print( g_kegg )

ggsave( g_kegg, filename = 'kegg_up_down_edgeR.png' )

}

可以肯定上下调基因集经过超几何分布的统计学检验后有着共同的KEGG结果，需要有更高级的展现方式！

KEGG通路几乎没有差异，说明找到的差异基因很一致

现在只能说明我们使用的3种R包找差异比较一致，但事实上这个是早有定论，我们只是在数据上面验证了一下，这个时候感兴趣的应该是有PIK3CA基因突变的那些病人和并没有突变的那些病人的差异分析的结果的区别，留给读者去实现吧！

■ ■ ■

生信基础知识大全系列：生信基础知识100讲

史上最强的生信自学环境准备课来啦！！ 7次改版，11节课程，14K的讲稿，30个夜晚打磨，100页PPT的课程。

如果需要组装自己的服务器；代办生物信息学服务器

如果需要帮忙下载海外数据(GEO/TCGA/GTEx等等)，点我？

如果需要线下辅导及培训，看招学徒

如果需要个人电脑：个人计算机推荐

如果需要置办生物信息学书籍，看：生信人必备书单

如果需要实习岗位：实习职位发布

如果需要售后：点我

如果需要入门资料大全：点我

点击下面的阅读原文直达我的github

李尚福、魏凤和双双被拿下，与美国一份报告是否有关？

这样的洞庭湖决堤，实在让人同情不起来

有的人走了，却永远活着

圈内疯传某谣言

不要放过这些人渣

有PIK3CA基因突变的肿瘤病人的转录水平变化

您可能也对以下帖子感兴趣

李尚福、魏凤和双双被拿下，与美国一份报告是否有关？

这样的洞庭湖决堤，实在让人同情不起来

有的人走了，却永远活着

圈内疯传某谣言

不要放过这些人渣

生成图片，分享到微信朋友圈

有PIK3CA基因突变的肿瘤病人的转录水平变化

您可能也对以下帖子感兴趣