NC 文章单细胞部分图复现

Original JunJunLab 老俊俊的生信笔记 2022-08-27

收录于合集 #单细胞笔记 21个

回归

1引言

尝试复现一篇 NC 文章的部分结果。

文章标题:

Transcriptional profiling of mESC-derived tendon and fibrocartilage cell fate switch

复现图:

图中的 a,b,c,d,e:

2数据下载

GSE 号: GSE154525

3文章方法解读

上游:

预处理:

下游分析:

关键点提取

过滤基因在分位数 0.02-0.98 间的细胞。
过滤线粒体百分比大于 7.5 的细胞。
SCTransform 方法归一化和标准化。
数据整合以 embryonic tail 作为参考锚点。
使用 CCA 整合数据。
保留 30 个主成分,拿 3000 个高变基因,cluster 分类分辨率 0.2。
UMAP 降维。

4批量读取数据并进行过滤

library(Seurat)
library(ggplot2)
library(patchwork)
library(tidyverse)
library(ggsci)
library(reshape2)
library(clustree)
library(data.table)

# batch read data
file <- list.files(pattern = '.csv')[1:2]
sample <- c('culture','tail')
scRNAlist <- list()

# x = 1
for(x in 1:length(file)){
  count <- read.csv(file[x],header = T,row.names = 1)

  # create Seurat object
  scRNAlist[[sample[x]]] <- CreateSeuratObject(count,
                                               # min.cells = 3,
                                               # min.features = 300
                                               )

  # add group
  scRNAlist[[sample[x]]]$group <- sample[x]

  # mitocondral percent
  scRNAlist[[sample[x]]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[sample[x]]],
                                                                 pattern = "^mt-")

  # calculate quantile 2-98
  qt <- quantile(scRNAlist[[sample[x]]]$nFeature_RNA,probs = c(0.02,0.98))

  # filter cells gene in quantile 2-98 and mitocondral below 7.5
  scRNAlist[[sample[x]]] <- subset(scRNAlist[[sample[x]]],
                                   subset = nFeature_RNA >= qt[1] & nFeature_RNA <= qt[2] & percent.mt <= 7.5)

  # SCTransform normalize
  scRNAlist[[sample[x]]] <- SCTransform(scRNAlist[[sample[x]]],
                                        method = "glmGamPoi",verbose = FALSE) %>%
    RunPCA(npcs = 30, verbose = FALSE)
}

查看数据:

# check
scRNAlist

# $culture
# An object of class Seurat
# 50537 features across 7508 samples within 2 assays
# Active assay: SCT (22781 features, 3000 variable features)
# 1 other assay present: RNA
# 1 dimensional reduction calculated: pca
#
# $tail
# An object of class Seurat
# 49891 features across 1968 samples within 2 assays
# Active assay: SCT (22135 features, 3000 variable features)
# 1 other assay present: RNA
# 1 dimensional reduction calculated: pca

5查看质控结果

# vlnplot
lapply(1:length(scRNAlist), function(x){
  VlnPlot(scRNAlist[[x]], features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
          ncol = 3)
}) -> vlnlist

# plot
wrap_plots(vlnlist,nrow = 2)

6数据整合

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = scRNAlist, nfeatures = 3000)

# Perform integration
scRNAlist <- PrepSCTIntegration(object.list = scRNAlist, anchor.features = features)
# |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=05s

# find Anchors !tail as refrence
combined.anchors <- FindIntegrationAnchors(object.list = scRNAlist,
                                           reference = c(2),
                                           normalization.method = "SCT",
                                           reduction = 'cca',
                                           anchor.features = features)

# intergrate
combined.sct <- IntegrateData(anchorset = combined.anchors, normalization.method = "SCT")

7降维可视化

# Perform an integrated analysis
combined.sct <- RunPCA(combined.sct, verbose = FALSE)
combined.sct <- FindNeighbors(combined.sct, reduction = "pca", dims = 1:30)

不同分辨率下可视化聚类树:

# check different resolution and clusters
combined.sct <- FindClusters(combined.sct, resolution = c(seq(.1,1,0.1)))

# plot tree
clustree(combined.sct@meta.data, prefix = "integrated_snn_res.")

文章有 14 个 cluster, 但是图里可以看到分辨率设置为 0.3 才有 14 个,所以我这里设置 0.3:

# resolution = 0.3
combined.sct <- FindClusters(combined.sct, resolution = 0.3)

# check groups
table(combined.sct@meta.data$group)

# culture    tail
#    7508    1968

降维可视化:

# run umap
combined.sct <- RunUMAP(combined.sct, reduction = "pca", dims = 1:30, verbose = FALSE)

# DimPlot by group
p2 <- DimPlot(combined.sct, reduction = "umap",group.by = 'group') +
  scale_color_manual(values = c('#009999','#CC9933')) +
  theme(aspect.ratio = 1,
        legend.position = c(0.05,0.1)) +
  labs(title = '')

# DimPlot cluster
p1 <- DimPlot(combined.sct, reduction = "umap",label = T,
        label.size = 3,label.box = T) + NoLegend() +
  # scale_color_npg() +
  # scale_fill_npg(alpha = 0.5) +
  theme(aspect.ratio = 1)

# combine
p1 + p2

8Featureplot

默认可视化:

# gene plot
gene <- c('Scx','Dcn','Bgn','Col5a2','Col1a1','Col3a1','Fbn2','Postn','Tnc')

FeaturePlot(combined.sct,features = gene,
            cols = c('grey90','#CC0033'),ncol = 3,
            min.cutoff = 'q2',max.cutoff = 'q98')

提取数据重新绘图:

##################################
# self replot
pc <- Embeddings(combined.sct,reduction = 'umap') %>% data.frame()
# add gene name
exp <- FetchData(combined.sct,vars = gene)
# megrge
mer <- cbind(pc,exp)
# wide to long
dflong <- melt(mer,id.vars = colnames(mer)[1:2],
               value.name = 'expressiton',
               variable.name = 'gene')

# check
head(dflong,3)

#      UMAP_1    UMAP_2 gene expressiton
# 1 -3.011463  3.503852  Scx -0.26745991
# 2 -7.580204  3.401761  Scx -0.02333779
# 3 -3.620352 -3.481013  Scx -3.29670049

# filter quantile in 0.02~0.98
quantile(dflong$expressiton,probs = c(0.02,0.98))

#        2%       98%
# -3.322756  7.487187

# filter expression
dflong <- dflong %>% filter(expressiton >= -3.322756 & expressiton <= 7.487187)

绘图:

# plot
ggplot(dflong,aes(x = UMAP_1,y = UMAP_2)) +
  geom_point(aes(color = expressiton),size = 0.1,alpha = 1) +
  scale_color_gradient(low = 'grey90',high = '#CC0033',
                       name = 'percentile \n expression',
                       breaks = c(-3.322756,7.487187),
                       limits = c(-3.322756,7.487187),
                       labels = c('< 2%','> 98%')
                       ) +
  theme_classic(base_size = 16) +
  theme(axis.line = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        aspect.ratio = 1,
        legend.position = 'right',
        strip.text.x = element_text(size = 16,face = 'italic'),
        strip.background = element_rect(fill = NA,color = NA)) +
  facet_wrap(~gene) +
  labs(x = '',y = '') +
  guides(color = guide_colorbar(title.position = 'left',
                                title.theme = element_text(angle = 90,hjust = 0.5)))

9小提琴图

# vlionplot
vlngene <- c('Pax1','Col2a1','Pax9','Col9a1','Sox9','Col9a3','Sox5','Wwp2','Sox6','Matn4')

VlnPlot(combined.sct,features = vlngene,
        ncol = 2,idents = c(0,2,4),
        slot = 'counts',assay = 'RNA',
        log = T,pt.size = 0,cols = c('#99CCFF','#006699','#66CC99'))

SCT 后的基因数量会少一些,因为前者会有基因的过滤指标:

rnacount <- combined.sct@assays$RNA@counts %>% data.frame()
dim(rnacount)
# 1] 27756  9476

# SCTransform filter min.cells < 5
sctcount <- combined.sct@assays$SCT@counts %>% data.frame()
dim(sctcount)
# [1] 24559  9476

10同一 cluster 不同样本的基因差异分析

##################################################
# differential analysis

# add test for group
combined.sct$diffGroup <- paste(combined.sct$group,combined.sct$seurat_clusters,
                                sep = '|')

# make it to active ident
Idents(combined.sct) <- "diffGroup"

# check
table(combined.sct$diffGroup)

# culture|0  culture|1 culture|10 culture|11 culture|12 culture|13  culture|2  culture|3
# 1453       1467         62         31         46          4       1490       1302
# culture|4  culture|5  culture|6  culture|7  culture|8  culture|9     tail|0     tail|1
# 655        546        374         26         50          2        398        324
# tail|10    tail|11    tail|13     tail|2     tail|3     tail|4     tail|5     tail|6
# 67         68         33        203         44        192         88        159
# tail|7     tail|8     tail|9
# 152        103        137

# diff test
test <- FindMarkers(combined.sct,ident.1 = 'tail|0',ident.2 = 'culture|0',
            test.use = 'wilcox',logfc.threshold = 0.25)

# add gene name
test$gene_name <- rownames(test)

# check
head(test,3)

#                p_val avg_log2FC pct.1 pct.2     p_val_adj gene_name
# Vwce   1.893026e-157   5.596902 0.055 0.847 5.679077e-154      Vwce
# Sphkap 9.220839e-156  -8.948104 0.045 0.667 2.766252e-152    Sphkap
# Kcna1  8.684915e-155  -8.669776 0.053 0.587 2.605475e-151     Kcna1

11热图可视化

# heatmap genes
htgene <- c('Postn','Col1a2','Dcn','Col1a1','Col3a1','Igfbp5','Col5a2',
            'Mgp','Ogn','Thbs2','Rflnb','Mfap2','Fbn2','Fstl1','Mfap4',
            'Bgn','Fn1','Igfbp7','Igf1','Col8a1',
            'Hist1h1b','Mki67','Top2a','Pclaf','Prc1','Ube2c','Cenpf','Hist1h1e',
            'Hist1h2ae','Smc2','Cdk1','Nusap1','Cenpe','Kif11','Birc5','Smc4',
            'Hmmr','Tpx2','Aurkb','Arl6ip1','Col2a1','Col9a1','Matn4','Col9a3',
            'Wwp2','Col11a1','Sox9','Col9a2','Acan','Pax1','Plod2','Dlk1','Pcp4',
            'Fmod','Fibin','Itm2a','Stk26','Id1','H19','Ndufa4l2')

# get subgroup data
htdata <- subset(combined.sct,idents = c('tail|0','culture|0',
                                         'tail|2','culture|2',
                                         'tail|4','culture|4'))

# order
Idents(htdata) <- factor(Idents(htdata),levels = c('culture|0','tail|0',
                                                   'culture|2','tail|2',
                                                   'culture|4','tail|4'))

# heatmap
DoHeatmap(htdata,features = htgene,
          draw.lines = T,
          assay = 'RNA',
          slot = 'counts') +
  scale_fill_gradient(low = 'white',high = '#990033')

#####################################
# save
# saveRDS(combined.sct,file = 'combined.sct.Rds')
# combined.sct <- readRDS('combined.sct.Rds')