其他
NC 文章单细胞部分图复现
回归
1引言
尝试复现一篇 NC 文章的部分结果。
文章标题:
Transcriptional profiling of mESC-derived tendon and fibrocartilage cell fate switch
复现图:
图中的 a,b,c,d,e
:
2数据下载
GSE 号: GSE154525
3文章方法解读
上游:
预处理:
下游分析:
关键点提取
过滤基因在分位数 0.02-0.98 间的细胞。 过滤线粒体百分比大于 7.5 的细胞。 SCTransform 方法归一化和标准化。 数据整合以 embryonic tail 作为参考锚点。 使用 CCA 整合数据。 保留 30 个主成分,拿 3000 个高变基因,cluster 分类分辨率 0.2。 UMAP 降维。
4批量读取数据并进行过滤
library(Seurat)
library(ggplot2)
library(patchwork)
library(tidyverse)
library(ggsci)
library(reshape2)
library(clustree)
library(data.table)
# batch read data
file <- list.files(pattern = '.csv')[1:2]
sample <- c('culture','tail')
scRNAlist <- list()
# x = 1
for(x in 1:length(file)){
count <- read.csv(file[x],header = T,row.names = 1)
# create Seurat object
scRNAlist[[sample[x]]] <- CreateSeuratObject(count,
# min.cells = 3,
# min.features = 300
)
# add group
scRNAlist[[sample[x]]]$group <- sample[x]
# mitocondral percent
scRNAlist[[sample[x]]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[sample[x]]],
pattern = "^mt-")
# calculate quantile 2-98
qt <- quantile(scRNAlist[[sample[x]]]$nFeature_RNA,probs = c(0.02,0.98))
# filter cells gene in quantile 2-98 and mitocondral below 7.5
scRNAlist[[sample[x]]] <- subset(scRNAlist[[sample[x]]],
subset = nFeature_RNA >= qt[1] & nFeature_RNA <= qt[2] & percent.mt <= 7.5)
# SCTransform normalize
scRNAlist[[sample[x]]] <- SCTransform(scRNAlist[[sample[x]]],
method = "glmGamPoi",verbose = FALSE) %>%
RunPCA(npcs = 30, verbose = FALSE)
}
查看数据:
# check
scRNAlist
# $culture
# An object of class Seurat
# 50537 features across 7508 samples within 2 assays
# Active assay: SCT (22781 features, 3000 variable features)
# 1 other assay present: RNA
# 1 dimensional reduction calculated: pca
#
# $tail
# An object of class Seurat
# 49891 features across 1968 samples within 2 assays
# Active assay: SCT (22135 features, 3000 variable features)
# 1 other assay present: RNA
# 1 dimensional reduction calculated: pca
5查看质控结果
# vlnplot
lapply(1:length(scRNAlist), function(x){
VlnPlot(scRNAlist[[x]], features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3)
}) -> vlnlist
# plot
wrap_plots(vlnlist,nrow = 2)
6数据整合
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = scRNAlist, nfeatures = 3000)
# Perform integration
scRNAlist <- PrepSCTIntegration(object.list = scRNAlist, anchor.features = features)
# |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=05s
# find Anchors !tail as refrence
combined.anchors <- FindIntegrationAnchors(object.list = scRNAlist,
reference = c(2),
normalization.method = "SCT",
reduction = 'cca',
anchor.features = features)
# intergrate
combined.sct <- IntegrateData(anchorset = combined.anchors, normalization.method = "SCT")
7降维可视化
# Perform an integrated analysis
combined.sct <- RunPCA(combined.sct, verbose = FALSE)
combined.sct <- FindNeighbors(combined.sct, reduction = "pca", dims = 1:30)
不同分辨率下可视化聚类树:
# check different resolution and clusters
combined.sct <- FindClusters(combined.sct, resolution = c(seq(.1,1,0.1)))
# plot tree
clustree(combined.sct@meta.data, prefix = "integrated_snn_res.")
文章有 14 个 cluster, 但是图里可以看到分辨率设置为 0.3 才有 14 个,所以我这里设置 0.3:
# resolution = 0.3
combined.sct <- FindClusters(combined.sct, resolution = 0.3)
# check groups
table(combined.sct@meta.data$group)
# culture tail
# 7508 1968
降维可视化:
# run umap
combined.sct <- RunUMAP(combined.sct, reduction = "pca", dims = 1:30, verbose = FALSE)
# DimPlot by group
p2 <- DimPlot(combined.sct, reduction = "umap",group.by = 'group') +
scale_color_manual(values = c('#009999','#CC9933')) +
theme(aspect.ratio = 1,
legend.position = c(0.05,0.1)) +
labs(title = '')
# DimPlot cluster
p1 <- DimPlot(combined.sct, reduction = "umap",label = T,
label.size = 3,label.box = T) + NoLegend() +
# scale_color_npg() +
# scale_fill_npg(alpha = 0.5) +
theme(aspect.ratio = 1)
# combine
p1 + p2
8Featureplot
默认可视化:
# gene plot
gene <- c('Scx','Dcn','Bgn','Col5a2','Col1a1','Col3a1','Fbn2','Postn','Tnc')
FeaturePlot(combined.sct,features = gene,
cols = c('grey90','#CC0033'),ncol = 3,
min.cutoff = 'q2',max.cutoff = 'q98')
提取数据重新绘图:
##################################
# self replot
pc <- Embeddings(combined.sct,reduction = 'umap') %>% data.frame()
# add gene name
exp <- FetchData(combined.sct,vars = gene)
# megrge
mer <- cbind(pc,exp)
# wide to long
dflong <- melt(mer,id.vars = colnames(mer)[1:2],
value.name = 'expressiton',
variable.name = 'gene')
# check
head(dflong,3)
# UMAP_1 UMAP_2 gene expressiton
# 1 -3.011463 3.503852 Scx -0.26745991
# 2 -7.580204 3.401761 Scx -0.02333779
# 3 -3.620352 -3.481013 Scx -3.29670049
# filter quantile in 0.02~0.98
quantile(dflong$expressiton,probs = c(0.02,0.98))
# 2% 98%
# -3.322756 7.487187
# filter expression
dflong <- dflong %>% filter(expressiton >= -3.322756 & expressiton <= 7.487187)
绘图:
# plot
ggplot(dflong,aes(x = UMAP_1,y = UMAP_2)) +
geom_point(aes(color = expressiton),size = 0.1,alpha = 1) +
scale_color_gradient(low = 'grey90',high = '#CC0033',
name = 'percentile \n expression',
breaks = c(-3.322756,7.487187),
limits = c(-3.322756,7.487187),
labels = c('< 2%','> 98%')
) +
theme_classic(base_size = 16) +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
aspect.ratio = 1,
legend.position = 'right',
strip.text.x = element_text(size = 16,face = 'italic'),
strip.background = element_rect(fill = NA,color = NA)) +
facet_wrap(~gene) +
labs(x = '',y = '') +
guides(color = guide_colorbar(title.position = 'left',
title.theme = element_text(angle = 90,hjust = 0.5)))
9小提琴图
# vlionplot
vlngene <- c('Pax1','Col2a1','Pax9','Col9a1','Sox9','Col9a3','Sox5','Wwp2','Sox6','Matn4')
VlnPlot(combined.sct,features = vlngene,
ncol = 2,idents = c(0,2,4),
slot = 'counts',assay = 'RNA',
log = T,pt.size = 0,cols = c('#99CCFF','#006699','#66CC99'))
SCT 后的基因数量会少一些,因为前者会有基因的过滤指标:
rnacount <- combined.sct@assays$RNA@counts %>% data.frame()
dim(rnacount)
# 1] 27756 9476
# SCTransform filter min.cells < 5
sctcount <- combined.sct@assays$SCT@counts %>% data.frame()
dim(sctcount)
# [1] 24559 9476
10同一 cluster 不同样本的基因差异分析
##################################################
# differential analysis
# add test for group
combined.sct$diffGroup <- paste(combined.sct$group,combined.sct$seurat_clusters,
sep = '|')
# make it to active ident
Idents(combined.sct) <- "diffGroup"
# check
table(combined.sct$diffGroup)
# culture|0 culture|1 culture|10 culture|11 culture|12 culture|13 culture|2 culture|3
# 1453 1467 62 31 46 4 1490 1302
# culture|4 culture|5 culture|6 culture|7 culture|8 culture|9 tail|0 tail|1
# 655 546 374 26 50 2 398 324
# tail|10 tail|11 tail|13 tail|2 tail|3 tail|4 tail|5 tail|6
# 67 68 33 203 44 192 88 159
# tail|7 tail|8 tail|9
# 152 103 137
# diff test
test <- FindMarkers(combined.sct,ident.1 = 'tail|0',ident.2 = 'culture|0',
test.use = 'wilcox',logfc.threshold = 0.25)
# add gene name
test$gene_name <- rownames(test)
# check
head(test,3)
# p_val avg_log2FC pct.1 pct.2 p_val_adj gene_name
# Vwce 1.893026e-157 5.596902 0.055 0.847 5.679077e-154 Vwce
# Sphkap 9.220839e-156 -8.948104 0.045 0.667 2.766252e-152 Sphkap
# Kcna1 8.684915e-155 -8.669776 0.053 0.587 2.605475e-151 Kcna1
11热图可视化
# heatmap genes
htgene <- c('Postn','Col1a2','Dcn','Col1a1','Col3a1','Igfbp5','Col5a2',
'Mgp','Ogn','Thbs2','Rflnb','Mfap2','Fbn2','Fstl1','Mfap4',
'Bgn','Fn1','Igfbp7','Igf1','Col8a1',
'Hist1h1b','Mki67','Top2a','Pclaf','Prc1','Ube2c','Cenpf','Hist1h1e',
'Hist1h2ae','Smc2','Cdk1','Nusap1','Cenpe','Kif11','Birc5','Smc4',
'Hmmr','Tpx2','Aurkb','Arl6ip1','Col2a1','Col9a1','Matn4','Col9a3',
'Wwp2','Col11a1','Sox9','Col9a2','Acan','Pax1','Plod2','Dlk1','Pcp4',
'Fmod','Fibin','Itm2a','Stk26','Id1','H19','Ndufa4l2')
# get subgroup data
htdata <- subset(combined.sct,idents = c('tail|0','culture|0',
'tail|2','culture|2',
'tail|4','culture|4'))
# order
Idents(htdata) <- factor(Idents(htdata),levels = c('culture|0','tail|0',
'culture|2','tail|2',
'culture|4','tail|4'))
# heatmap
DoHeatmap(htdata,features = htgene,
draw.lines = T,
assay = 'RNA',
slot = 'counts') +
scale_fill_gradient(low = 'white',high = '#990033')
#####################################
# save
# saveRDS(combined.sct,file = 'combined.sct.Rds')
# combined.sct <- readRDS('combined.sct.Rds')
12结尾
感觉文章里的图有几个都是用的原始 counts 进行可视化的,并没有使用 scaled-data 或者 normalized-data。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
(微信交流群满200人后需收取20元入群费用)。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀Molecular Cell 文章 ribosome pausing 结果复现 (终)
◀Molecular Cell 文章 ribosome pausing 结果复现 (四)
◀...