其他
跟着 Nature medicine 学单细胞数据分析
这世界,难逃避命运
1引言
前面我们直接使用作者提供的计算好的数据绘图的,这次我们尝试从 loom 文件提取矩阵
和 metadata
信息自己从头分析看看。
上期:
文章:
复现图:
2文章方法部分
>>>
3前期准备
读取数据
library(hdf5r)
library(SeuratDisk)
library(scater)
library(tidyverse)
library(Seurat)
library(ggplot2)
library(cowplot)
library(reshape2)
library(ggsci)
library(patchwork)
# load loom
allcell <- Connect(filename = "./Thienpont_Tumors_52k_v4_R_fixed.loom", mode = "r")
allcell
# Class: loom
# Filename: F:\文章复现数据\11.LungTumor-scRNASEQ\all_Rds_data\Thienpont_Tumors_52k_v4_R_fixed.loom
# Access type: H5F_ACC_RDONLY
# Attributes: title, MetaData, version
# Listing:
# name obj_type dataset.dims dataset.type_class
# col_attrs H5I_GROUP <NA> <NA>
# col_graphs H5I_GROUP <NA> <NA>
# layers H5I_GROUP <NA> <NA>
# matrix H5I_DATASET 52698 x 33694 H5T_FLOAT
# row_attrs H5I_GROUP <NA> <NA>
# row_graphs H5I_GROUP <NA> <NA>
提取矩阵
注意:
loom 文件提取的 矩阵没有行名和列名。 矩阵的 列是基因, 行是细胞。
# get matirx
expr <- allcell[["matrix"]][,] %>% data.frame()
# add gene
colnames(expr) <- allcell[["row_attrs/Gene"]][]
# add cell
rownames(expr) <- allcell[["col_attrs/CellID"]][]
# transpose
texpr <- as(t(expr),'dgCMatrix')
# check
head(texpr[1:5,1:5])
# 5 x 5 sparse Matrix of class "dgCMatrix"
# AAACATACCTGAGT_1 AAACCGTGCTGGTA_1 AAACTTGATTGCGA_1 AAAGACGACAACCA_1
# RP11-34P13.3 . . . .
# FAM138A . . . .
# OR4F5 . . . .
# RP11-34P13.7 . . . .
# RP11-34P13.8 . . . .
# AAAGAGACATCGTG_1
# RP11-34P13.3 .
# FAM138A .
# OR4F5 .
# RP11-34P13.7 .
# RP11-34P13.8 .
准备 metadata 信息
# make metadata
metadata <- data.frame(
CellID = allcell[["col_attrs/CellID"]][],
ClusterName = allcell[["col_attrs/ClusterName"]][],
CellFromTumor = allcell[["col_attrs/CellFromTumor"]][],
PatientNumber = allcell[["col_attrs/PatientNumber"]][],
ClusterID = allcell[["col_attrs/ClusterID"]][])
# add rownames
rownames(metadata) <- metadata$CellID
4创建 seurat 对象及下游分析
# make seurat object
lung.obj <- CreateSeuratObject(counts = texpr,
meta.data = metadata,
project = 'lung')
lung.obj
# An object of class Seurat
# 33694 features across 52698 samples within 1 assay
# Active assay: RNA (33694 features, 0 variable features)
# 计算每个细胞的线粒体含量
lung.obj[["percent.mt"]] <- PercentageFeatureSet(lung.obj, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(lung.obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
pt.size = 0,ncol = 3)
可以看到作者上传的数据是已经过滤好的数据了,下面过滤的步骤就不用做了:
# filter
# lung.obj <- subset(lung.obj,
# subset = nCount_RNA >= 201 & nFeature_RNA >= 100 & nFeature_RNA <= 6000 & percent.mt <= 10)
下游常规流程:
# normalize
lung.obj <- NormalizeData(lung.obj,
normalization.method = "LogNormalize",
scale.factor = 10000)
lung.obj <- FindVariableFeatures(lung.obj, selection.method = "vst",
mean.cutoff = c(0.125,3),
dispersion.cutoff = c(0.5,Inf),
nfeatures = 2192)
# 对所有基因进行标准化
all.genes <- rownames(lung.obj)
lung.obj <- ScaleData(lung.obj, vars.to.regress = c("percent.mt","nCount_RNA"))
lung.obj <- RunPCA(lung.obj, features = VariableFeatures(object = lung.obj))
# check PC
ElbowPlot(lung.obj,ndims = 50)
文章里选的是前 8 个主成分,是不是太少了?我这里选了前 30 个主成分。
lung.obj <- FindNeighbors(lung.obj, dims = 1:30)
lung.obj <- FindClusters(lung.obj, resolution = 0.5)
# plot
# lung.obj <- RunUMAP(lung.obj, dims = 1:30)
lung.obj <- RunTSNE(lung.obj, dims = 1:30)
# plot
DimPlot(lung.obj,reduction = 'tsne',label = T)
出来的亚群数量也没有文章里多?文章里好像有 64 个亚群,但文章用了 paran 做了一些处理,也许可以按照这个试试?
featureplot
作者总共坚定了 8 类细胞,然后绘制了 8 个亚群的 marker 基因进行可视化:
# default plot
FeaturePlot(lung.obj,features = c('CLDN18','CLDN5','CAPS','COL1A1',
'CD79A','LYZ','CD3D','EPCAM'),
ncol = 4,
reduction = 'tsne')
重新绘图:
# get gene expression
GeneExp <- FetchData(lung.obj,vars = c('CLDN18','CLDN5','CAPS','COL1A1',
'CD79A','LYZ','CD3D','EPCAM'))
# get tsne pc
pc <- Embeddings(lung.obj,reduction = 'tsne') %>% data.frame()
# cbind
gbid <- cbind(pc,GeneExp)
# wide to long
gbidlong <- melt(gbid,id.vars = c('tSNE_1','tSNE_2'),
value.name = 'exp',
variable.name = 'gene')
# plot
ggplot(gbidlong,aes(x = tSNE_1,y = tSNE_2,color = exp)) +
geom_point(size = 0.2,
show.legend = F) +
scale_color_gradient(low = 'grey90',high = '#CC0033',name = '') +
theme_bw(base_size = 14) +
theme(panel.grid = element_blank(),
axis.ticks = element_blank(),
aspect.ratio = 1,
strip.background = element_rect(colour = NA,fill = NA),
axis.text = element_blank()) +
facet_wrap(~gene,ncol = 4)
细胞注释
我是直接根据上面 marker 基因的图来对 cluster 进行命名的,其它未知的 cluster 直接写成 unknow,可以是非常的主观了。
########################################
lung.obj$clusterName <- case_when(
lung.obj$seurat_clusters %in% c('14','20') ~ 'Alveolar',
lung.obj$seurat_clusters %in% c('10') ~ 'Endothelial',
lung.obj$seurat_clusters %in% c('22') ~ 'Epithelial',
lung.obj$seurat_clusters %in% c('11') ~ 'Fibroblast',
lung.obj$seurat_clusters %in% c('6','8') ~ 'B cell',
lung.obj$seurat_clusters %in% c('2','3','16','17') ~ 'Myeloid',
lung.obj$seurat_clusters %in% c('0','1','7','9','15','18','12') ~ 'T cell',
lung.obj$seurat_clusters %in% c('4','5','13') ~ 'Alveolar',
lung.obj$seurat_clusters %in% c('14','20') ~ 'Cancer',
TRUE ~ 'Unknow'
)
table(lung.obj$clusterName)
# Alveolar B cell Endothelial Epithelial Fibroblast Myeloid T cell
# 8710 4880 1578 219 1459 9736 24943
# Unknow
# 1173
# set ident to clusterName
Idents(lung.obj) <- 'clusterName'
# plot
DimPlot(lung.obj,reduction = 'tsne',label = T) +
NoLegend()
寻找 marker 基因
########################################
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
all.markers <- FindAllMarkers(lung.obj, only.pos = TRUE,
min.pct = 0.15,
logfc.threshold = 1)
# 查看每个亚群的前两个
marker <- all.markers %>%
group_by(cluster) %>%
slice_max(n = 10, order_by = avg_log2FC)
DoHeatmap(lung.obj,features = marker$gene) +
scale_fill_gradient2(low = 'blue',mid = 'white',high = 'red')
对 tSNE 图分类上色
########################################
colnames(metadata)
# [1] "CellID" "ClusterName" "CellFromTumor" "PatientNumber" "ClusterID"
# add log10 UMI
lung.obj$logumi <- log10(lung.obj@meta.data$nCount_RNA)
# plot
p1 <- DimPlot(lung.obj, reduction = "tsne",group.by = 'CellFromTumor') +
scale_color_manual(values = c('0' = '#66CC99','1' = '#336699'),
name = '',
label = c('0' = 'Non-malignant','1' = 'Tumour')) +
theme_bw(base_size = 14) +
ggtitle('Sample origin') +
theme(panel.grid = element_blank(),
legend.position = c(0.2,0.1),
legend.background = element_blank(),
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)
######################
p2 <- DimPlot(lung.obj, reduction = "tsne",group.by = 'PatientNumber') +
scale_color_manual(values = c('1' = '#339933','2' = '#FF6600','3' = '#CCFF99',
'4' = '#FFCC33','5' = '#336699'),
name = '') +
theme_bw(base_size = 14) +
ggtitle('Patient') +
theme(panel.grid = element_blank(),
legend.position = c(0.2,0.1),
legend.background = element_blank(),
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1) +
guides(color = guide_legend(override.aes = list(size = 3),nrow = 2))
######################
p3 <- DimPlot(lung.obj, reduction = "tsne",group.by = 'clusterName',
label = T,label.size = 5) +
theme_bw(base_size = 14) +
ggtitle('Cell type') +
theme(panel.grid = element_blank(),
legend.position = c(0.1,0.1),
legend.background = element_blank(),
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1) +
NoLegend() +
scale_color_npg()
######################
p4 <- FeaturePlot(lung.obj,features = 'logumi',reduction = 'tsne') +
theme_bw(base_size = 14) +
scale_color_gradient(low = 'grey90',high = 'blue') +
ggtitle('Transcript count') +
theme(panel.grid = element_blank(),
legend.position = c(0.1,0.1),
legend.background = element_blank(),
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1) +
NoLegend()
# combine
plot_grid(plotlist = list(p1,p2,p3,p4),nrow = 1,align = 'hv')
不同病人的细胞比例
########################################
df_meta <- lung.obj@meta.data
colnames(df_meta)
# [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "CellID"
# [5] "ClusterName" "CellFromTumor" "PatientNumber" "ClusterID"
# [9] "percent.mt" "RNA_snn_res.0.5" "seurat_clusters" "logumi"
# [13] "clusterName"
# summarise cell type
sum_cell <- df_meta %>% group_by(PatientNumber,clusterName) %>%
summarise(n = n())
ggplot(sum_cell,aes(x = factor(PatientNumber),y = n,fill = clusterName)) +
geom_col(position = position_fill(),width = 0.75) +
theme_classic(base_size = 14) +
theme(panel.grid = element_blank(),
legend.position = 'right',
legend.background = element_blank(),
plot.title = element_text(hjust = 0.5),
# panel.background = element_rect(fill = 'grey90'),
aspect.ratio = 1) +
scale_fill_npg(name = '') +
coord_cartesian(expand = 0) +
xlab('Patient number') + ylab('Cell Percent')
# save
saveRDS(lung.obj,file = 'lung.obj.RDS')
5结尾
感兴趣的大家可以自己去看看文章,欢迎交流讨论。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
(微信交流群需收取20元入群费用(防止骗子和便于管理)
)。
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀Seurat 官网单细胞教程四 (SCTransform 使用及 SCTransform V2 版本)
◀Seurat 官网单细胞教程三 (RPCA 快速整合数据)
◀...