跟着 Nature medicine 学单细胞数据分析

Original JunJunLab 老俊俊的生信笔记 2022-08-27

收录于合集 #单细胞笔记 21个

这世界,难逃避命运

1引言

前面我们直接使用作者提供的计算好的数据绘图的,这次我们尝试从 loom 文件提取矩阵和 metadata 信息自己从头分析看看。

上期:

跟着 Nature medicine 学画图-tSNE

文章:

复现图:

2文章方法部分

>>>

3前期准备

读取数据

library(hdf5r)
library(SeuratDisk)
library(scater)
library(tidyverse)
library(Seurat)

library(ggplot2)
library(cowplot)
library(reshape2)
library(ggsci)
library(patchwork)

# load loom
allcell <- Connect(filename = "./Thienpont_Tumors_52k_v4_R_fixed.loom", mode = "r")
allcell

# Class: loom
# Filename: F:\文章复现数据\11.LungTumor-scRNASEQ\all_Rds_data\Thienpont_Tumors_52k_v4_R_fixed.loom
# Access type: H5F_ACC_RDONLY
# Attributes: title, MetaData, version
# Listing:
#       name    obj_type  dataset.dims dataset.type_class
#  col_attrs   H5I_GROUP          <NA>               <NA>
# col_graphs   H5I_GROUP          <NA>               <NA>
#     layers   H5I_GROUP          <NA>               <NA>
#     matrix H5I_DATASET 52698 x 33694          H5T_FLOAT
#  row_attrs   H5I_GROUP          <NA>               <NA>
# row_graphs   H5I_GROUP          <NA>               <NA>

提取矩阵

注意:

loom 文件提取的 矩阵没有行名和列名。
矩阵的 列是基因, 行是细胞。

# get matirx
expr <- allcell[["matrix"]][,] %>% data.frame()

# add gene
colnames(expr) <- allcell[["row_attrs/Gene"]][]

# add cell
rownames(expr) <- allcell[["col_attrs/CellID"]][]

# transpose
texpr <- as(t(expr),'dgCMatrix')

# check
head(texpr[1:5,1:5])

# 5 x 5 sparse Matrix of class "dgCMatrix"
#              AAACATACCTGAGT_1 AAACCGTGCTGGTA_1 AAACTTGATTGCGA_1 AAAGACGACAACCA_1
# RP11-34P13.3                .                .                .                .
# FAM138A                     .                .                .                .
# OR4F5                       .                .                .                .
# RP11-34P13.7                .                .                .                .
# RP11-34P13.8                .                .                .                .
#              AAAGAGACATCGTG_1
# RP11-34P13.3                .
# FAM138A                     .
# OR4F5                       .
# RP11-34P13.7                .
# RP11-34P13.8                .

准备 metadata 信息

# make metadata
metadata <- data.frame(
  CellID = allcell[["col_attrs/CellID"]][],
  ClusterName = allcell[["col_attrs/ClusterName"]][],
  CellFromTumor = allcell[["col_attrs/CellFromTumor"]][],
  PatientNumber = allcell[["col_attrs/PatientNumber"]][],
  ClusterID = allcell[["col_attrs/ClusterID"]][])

# add rownames
rownames(metadata) <- metadata$CellID

4创建 seurat 对象及下游分析

# make seurat object
lung.obj <- CreateSeuratObject(counts = texpr,
                               meta.data = metadata,
                               project = 'lung')
lung.obj
# An object of class Seurat
# 33694 features across 52698 samples within 1 assay
# Active assay: RNA (33694 features, 0 variable features)

# 计算每个细胞的线粒体含量
lung.obj[["percent.mt"]] <- PercentageFeatureSet(lung.obj, pattern = "^MT-")

# Visualize QC metrics as a violin plot
VlnPlot(lung.obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
        pt.size = 0,ncol = 3)

可以看到作者上传的数据是已经过滤好的数据了,下面过滤的步骤就不用做了:

# filter
# lung.obj <- subset(lung.obj,
#                    subset = nCount_RNA >= 201 & nFeature_RNA >= 100 & nFeature_RNA <= 6000 & percent.mt <= 10)

下游常规流程:

# normalize
lung.obj <- NormalizeData(lung.obj,
                          normalization.method = "LogNormalize",
                          scale.factor = 10000)

lung.obj <- FindVariableFeatures(lung.obj, selection.method = "vst",
                                 mean.cutoff = c(0.125,3),
                                 dispersion.cutoff = c(0.5,Inf),
                                 nfeatures = 2192)

# 对所有基因进行标准化
all.genes <- rownames(lung.obj)
lung.obj <- ScaleData(lung.obj, vars.to.regress = c("percent.mt","nCount_RNA"))

lung.obj <- RunPCA(lung.obj, features = VariableFeatures(object = lung.obj))

# check PC
ElbowPlot(lung.obj,ndims = 50)

文章里选的是前 8 个主成分,是不是太少了?我这里选了前 30 个主成分。

lung.obj <- FindNeighbors(lung.obj, dims = 1:30)
lung.obj <- FindClusters(lung.obj, resolution = 0.5)

# plot
# lung.obj <- RunUMAP(lung.obj, dims = 1:30)
lung.obj <- RunTSNE(lung.obj, dims = 1:30)

# plot
DimPlot(lung.obj,reduction = 'tsne',label = T)

出来的亚群数量也没有文章里多?文章里好像有 64 个亚群,但文章用了 paran 做了一些处理,也许可以按照这个试试?

featureplot

作者总共坚定了 8 类细胞,然后绘制了 8 个亚群的 marker 基因进行可视化:

# default plot
FeaturePlot(lung.obj,features = c('CLDN18','CLDN5','CAPS','COL1A1',
                                  'CD79A','LYZ','CD3D','EPCAM'),
            ncol = 4,
            reduction = 'tsne')

重新绘图:

# get gene expression
GeneExp <- FetchData(lung.obj,vars = c('CLDN18','CLDN5','CAPS','COL1A1',
                                       'CD79A','LYZ','CD3D','EPCAM'))

# get tsne pc
pc <- Embeddings(lung.obj,reduction = 'tsne') %>% data.frame()

# cbind
gbid <- cbind(pc,GeneExp)

# wide to long
gbidlong <- melt(gbid,id.vars = c('tSNE_1','tSNE_2'),
                 value.name = 'exp',
                 variable.name = 'gene')

# plot
ggplot(gbidlong,aes(x = tSNE_1,y = tSNE_2,color = exp)) +
  geom_point(size = 0.2,
             show.legend = F) +
  scale_color_gradient(low = 'grey90',high = '#CC0033',name = '') +
  theme_bw(base_size = 14) +
  theme(panel.grid = element_blank(),
        axis.ticks = element_blank(),
        aspect.ratio = 1,
        strip.background = element_rect(colour = NA,fill = NA),
        axis.text = element_blank()) +
  facet_wrap(~gene,ncol = 4)

细胞注释

我是直接根据上面 marker 基因的图来对 cluster 进行命名的,其它未知的 cluster 直接写成 unknow,可以是非常的主观了。

########################################

lung.obj$clusterName <- case_when(
  lung.obj$seurat_clusters %in% c('14','20') ~ 'Alveolar',
  lung.obj$seurat_clusters %in% c('10') ~ 'Endothelial',
  lung.obj$seurat_clusters %in% c('22') ~ 'Epithelial',
  lung.obj$seurat_clusters %in% c('11') ~ 'Fibroblast',
  lung.obj$seurat_clusters %in% c('6','8') ~ 'B cell',
  lung.obj$seurat_clusters %in% c('2','3','16','17') ~ 'Myeloid',
  lung.obj$seurat_clusters %in% c('0','1','7','9','15','18','12') ~ 'T cell',
  lung.obj$seurat_clusters %in% c('4','5','13') ~ 'Alveolar',
  lung.obj$seurat_clusters %in% c('14','20') ~ 'Cancer',
  TRUE ~ 'Unknow'
)

table(lung.obj$clusterName)

# Alveolar      B cell Endothelial  Epithelial  Fibroblast     Myeloid      T cell
# 8710        4880        1578         219        1459        9736       24943
# Unknow
# 1173

# set ident to clusterName
Idents(lung.obj) <- 'clusterName'

# plot
DimPlot(lung.obj,reduction = 'tsne',label = T) +
  NoLegend()

寻找 marker 基因

########################################
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
all.markers <- FindAllMarkers(lung.obj, only.pos = TRUE,
                               min.pct = 0.15,
                               logfc.threshold = 1)
# 查看每个亚群的前两个
marker <- all.markers %>%
  group_by(cluster) %>%
  slice_max(n = 10, order_by = avg_log2FC)

DoHeatmap(lung.obj,features = marker$gene) +
  scale_fill_gradient2(low = 'blue',mid = 'white',high = 'red')

对 tSNE 图分类上色

########################################

colnames(metadata)
# [1] "CellID"        "ClusterName"   "CellFromTumor" "PatientNumber" "ClusterID"

# add log10 UMI
lung.obj$logumi <- log10(lung.obj@meta.data$nCount_RNA)

# plot
p1 <- DimPlot(lung.obj, reduction = "tsne",group.by = 'CellFromTumor') +
  scale_color_manual(values = c('0' = '#66CC99','1' = '#336699'),
                     name = '',
                     label = c('0' = 'Non-malignant','1' = 'Tumour')) +
  theme_bw(base_size = 14) +
  ggtitle('Sample origin') +
  theme(panel.grid = element_blank(),
        legend.position = c(0.2,0.1),
        legend.background = element_blank(),
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)

######################
p2 <- DimPlot(lung.obj, reduction = "tsne",group.by = 'PatientNumber') +
  scale_color_manual(values = c('1' = '#339933','2' = '#FF6600','3' = '#CCFF99',
                                '4' = '#FFCC33','5' = '#336699'),
                     name = '') +
  theme_bw(base_size = 14) +
  ggtitle('Patient') +
  theme(panel.grid = element_blank(),
        legend.position = c(0.2,0.1),
        legend.background = element_blank(),
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1) +
  guides(color = guide_legend(override.aes = list(size = 3),nrow = 2))

######################
p3 <- DimPlot(lung.obj, reduction = "tsne",group.by = 'clusterName',
        label = T,label.size = 5) +
  theme_bw(base_size = 14) +
  ggtitle('Cell type') +
  theme(panel.grid = element_blank(),
        legend.position = c(0.1,0.1),
        legend.background = element_blank(),
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1) +
  NoLegend() +
  scale_color_npg()

######################
p4 <- FeaturePlot(lung.obj,features = 'logumi',reduction = 'tsne') +
  theme_bw(base_size = 14) +
  scale_color_gradient(low = 'grey90',high = 'blue') +
  ggtitle('Transcript count') +
  theme(panel.grid = element_blank(),
        legend.position = c(0.1,0.1),
        legend.background = element_blank(),
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1) +
  NoLegend()

# combine
plot_grid(plotlist = list(p1,p2,p3,p4),nrow = 1,align = 'hv')

不同病人的细胞比例

########################################
df_meta <- lung.obj@meta.data

colnames(df_meta)
# [1] "orig.ident"      "nCount_RNA"      "nFeature_RNA"    "CellID"
# [5] "ClusterName"     "CellFromTumor"   "PatientNumber"   "ClusterID"
# [9] "percent.mt"      "RNA_snn_res.0.5" "seurat_clusters" "logumi"
# [13] "clusterName"

# summarise cell type
sum_cell <- df_meta %>% group_by(PatientNumber,clusterName) %>%
  summarise(n = n())

ggplot(sum_cell,aes(x = factor(PatientNumber),y = n,fill = clusterName)) +
  geom_col(position = position_fill(),width = 0.75) +
  theme_classic(base_size = 14) +
  theme(panel.grid = element_blank(),
        legend.position = 'right',
        legend.background = element_blank(),
        plot.title = element_text(hjust = 0.5),
        # panel.background = element_rect(fill = 'grey90'),
        aspect.ratio = 1) +
  scale_fill_npg(name = '') +
  coord_cartesian(expand = 0) +
  xlab('Patient number') + ylab('Cell Percent')

# save
saveRDS(lung.obj,file = 'lung.obj.RDS')