其他
周一学徒数据挖掘专场:NEAT1在组织和TCGA所有癌症中的表达
大家好,欢迎来到周一学徒数据挖掘专场,前面精彩示例如下:
批量COX回归生存分析图,指定挑选lncRNA基因,森林图,ROC曲线打包给你
Pan-cancer analysis of long non-coding RNA NEAT1 in various cancers
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6146416/
作者分析了GTEx和TCGA的数据,来研究lncRNA-NEAT1.
全文就两幅图,下面的代码也是实现了这两幅图。
首先是GTEx的数据
地址:https://gtexportal.org/home/datasets
下载下面的两个数据就可以了。
# step0 Load data ---------------------------------------------------------
rm(list = objects( all = TRUE ))
if (!is.null( dev.list() )) dev.off()
clearhistory()
# step1 normal tissue -----------------------------------------------------
Rdata_file <- './data/GTEx.PAN.NEAT1.Rdata'
if (!file.exists( Rdata_file )) {
destfile <- './raw_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz'
library(R.utils)
gunzip(destfile, remove = F)
library(data.table)
raw_data <- fread( './raw_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct',
sep = ' ', header = T)
raw_data <- as.data.frame( raw_data )
raw_data <- raw_data[raw_data[,2] == 'NEAT1', ]
raw_data[1, 1:10]
rownames( raw_data ) <- 'NEAT1'
raw_data <- raw_data[, -1]
raw_data <- raw_data[, -1]
raw_data[1, 1:10]
dim( raw_data )
save( raw_data, file = Rdata_file )
}else{
load( Rdata_file )
}
Rdata_file <- './data/GTEx.Pheno.Rdata'
if (!file.exists( Rdata_file )) {
destfile <- './raw_data/GTEx_v7_Annotations_SampleAttributesDS.txt'
phenoData <- read.table( destfile,
header = T,
sep = ' ',
quote = '' )
phenoData[1:5, 1:5]
dim(phenoData)
rownames( phenoData ) <- phenoData[ , 1]
phenoData <- phenoData[colnames(raw_data), ]
dim(phenoData)
phenoData[1,]
colnames(phenoData)
save( phenoData, file = Rdata_file )
}else{
load( Rdata_file )
}
pheno_num <- c()
invisible(
lapply( 1:ncol( phenoData ),
function(col_num){
## Assume that the classification project is between 2 and 4
if ( 1 < dim( table( phenoData[,col_num] ) ) &
dim( table( phenoData[, col_num] ) ) < 50 ) {
pheno_num <<- append( pheno_num, col_num,
after = length( pheno_num ) )
}
}
)
)
for (i in pheno_num) {
print( colnames( phenoData )[i] )
print( table( phenoData[, i] ) )
}
table(phenoData[, "SMATSSCR"])
Severe_ID <- rownames(phenoData)[phenoData[, "SMATSSCR"] == 3]
table(phenoData[Severe_ID, "SMTS"])
Moderate_ID <- rownames(phenoData)[phenoData[, "SMATSSCR"] == 2]
table(phenoData[Moderate_ID, "SMTS"])
Mild_ID <- rownames(phenoData)[phenoData[, "SMATSSCR"] == 1]
table(phenoData[Mild_ID, "SMTS"])
raw_data <- as.data.frame(t(log10(raw_data + 1)))
raw_data$SMTSD <- phenoData$SMTSD
raw_data <- raw_data[!(rownames(raw_data) %in% Severe_ID), ]
library(ggpubr)
ggbarplot(raw_data, x = "SMTSD", y = "NEAT1",
fill = "grey",
color = "black",
add = c("mean_range", "point"),
##"jitter", "boxplot", "point",
##"mean_se", "mean_sd", "mean_ci", "mean_range",
##"median_iqr", "median_mad", "median_range"
error.plot = "upper_errorbar",
desc_stat = "mean_sd",
x.text.angle = 75,
title = "NEAT1 expression in different normal human tissues",
ylab = "Relative expression of NEAT1",
xlab = "",
width = 0.6
##orientation = "horiz",
) + ylim(0, 7.5) +
theme(axis.title = element_text(size = 11, color = "black", face = "bold"),
axis.text = element_text(size = 9, color = "black"),
title = element_text(size = 15, color = "black"),
plot.title = element_text(hjust = 0.5))
然后是TCGA的数据
不需要分别下载的,xena已经将数据整合在一起了,下载一下临床数据和表达矩阵就可以了。
https://xenabrowser.net/datapages/?host=https%3A%2F%2Fpancanatlas.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443
# step2 Tumor -------------------------------------------------------------
Rdata_file <- './data/TCGA.PAN.NEAT1.Rdata'
if (!file.exists( Rdata_file )) {
destfile <- './raw_data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz'
library(R.utils)
gunzip(destfile, remove = F)
library(data.table)
raw_data <- fread( './raw_data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena',
sep = ' ', header = T)
raw_data <- as.data.frame( raw_data )
raw_data <- raw_data[raw_data[,1] == 'NEAT1', ]
raw_data[1, 1:10]
rownames( raw_data ) <- 'NEAT1'
raw_data <- raw_data[, -1]
raw_data[1, 1:10]
dim( raw_data )
save( raw_data, file = Rdata_file )
}else{
load( Rdata_file )
}
Rdata_file <- './data/TCGA.Pheno.Rdata'
if (!file.exists( Rdata_file )) {
destfile <- './raw_data/TCGA_phenotype_denseDataOnlyDownload.tsv.gz'
phenoData <- read.table( destfile,
header = T,
sep = ' ',
quote = '' )
phenoData[1:5, 1:4]
dim(phenoData)
rownames( phenoData ) <- phenoData[ , 1]
phenoData <- phenoData[colnames(raw_data), ]
dim(phenoData)
phenoData[1:10,]
colnames(phenoData)
save( phenoData, file = Rdata_file )
}else{
load( Rdata_file )
}
for (i in 1:4) {
print( colnames( phenoData )[i] )
print( table( phenoData[, i] ) )
}
raw_data <- as.data.frame(t(raw_data ))
raw_data$type <- ifelse(phenoData$sample_type_id < 10, "tumor", "normal")
table(raw_data$type)
raw_data$disease <- phenoData$X_primary_disease
raw_data[1:6, 1:3]
raw_data <- raw_data[!is.na(raw_data$type), ]
sub.disease <- split(raw_data, raw_data$disease)
length(sub.disease)
## 分开作图
for (i in 1:length(sub.disease)) {
if (length(table(sub.disease[[i]]$type)) == 2) {
p.v <- compare_means(NEAT1~type, data = sub.disease[[i]])$p
if (p.v < 0.05) {
diease <- sub.disease[[i]]$disease[1]
title <- paste( 'The expression of NEAT1 in ', diease, sep = '' )
p <- ggboxplot(sub.disease[[i]], x = "type", y = "NEAT1",
title = title,
ylab = "Relative expression of NEAT1",
xlab = "",
color = "type",
palette = "jco")
p + stat_compare_means(label.y = 18) +
theme(plot.title = element_text(hjust = 0.5))
ggsave(filename = paste('./fig/tmp', i, '.png', sep = ''))
}
}
}
## 合并作图
new_data <- data.frame(NEAT1 = 0, type = '', disease = '')
new_data = new_data[-1,]
for (i in 1:length(sub.disease)) {
if (length(table(sub.disease[[i]]$type)) == 2) {
p.v <- compare_means(NEAT1~type, data = sub.disease[[i]])$p
if (p.v < 0.05) {
new_data <- rbind(sub.disease[[i]], new_data)
}
}
}
library(ggpubr)
ggboxplot(new_data, x = "type", y = "NEAT1",
title = "The expression of NEAT1 in different tumors leveraging RNA-seq data from TCGA",
ylab = "Relative expression of NEAT1",
xlab = "",
facet.by = "disease",
color = "type",
palette = "jco") +
stat_compare_means(label.y = 6) +
ylim(5, 18) +
theme(plot.title = element_text(hjust = 0.5))
如果你完全看不懂上面的代码,那么你可能需要下面的课程:
■ ■ ■
生信技能树(爆款入门培训课)巡讲第一站-重庆 (已结束)
生物信息学全国巡讲之粤港澳大湾区专场 (已结束)
生信技能树(爆款入门培训课)巡讲第二站-济南 (已结束)