使用scHCL探索单细胞转录组细胞类型
分享是一种态度
作者 | 周运来
男,
一个长大了才会遇到的帅哥,
稳健,潇洒,大方,靠谱。
一段生信缘,一棵技能树,
一枚大型测序工厂的螺丝钉,
一个随机森林中提灯觅食的津门旅客。
文章信息
文中鉴定了60种人体组织样品涵盖八大系统的人类细胞图谱分析涉及人体100余种细胞大类和800余种细胞亚类。建立了70多万个单细胞的转录组数据库,基于该数据库,团队开发了scHCL单细胞比对系统用于人体细胞类型的识别,并搭建了人类细胞蓝图网站HCL(http://bis.zju.edu.cn/HCL/)(国家基因库镜像HCL(https://db.cngb.org/HCL/))。
这么大规模的单细胞测序,细胞类型的鉴定肯定是一道亮丽的技术风景。同时,人类的细胞图谱的绘制对以后人类细胞类型的鉴定也定有他独特的借鉴意义。那么,文章中是如何鉴定的呢?
文章中有更为详细的描述,实现细节在R包https://github.com/ggjlab/scHCL中以及在线http://bis.zju.edu.cn/HCL/blast.html中。在线版的界面如下:
界面清晰明了。我们用scHCL来尝试做一下细胞类型的鉴定。
1library(scHCL)
2library(Seurat)
3library(tidyverse)
4
5# hcl_lung is an example expression matrix from HCL project.
6> data(hcl_lung)
7> dim(hcl_lung)
8[1] 2884 80
9# 2884 genes expression value of 80 cells
10
11# scHCL has two parameters , single cell expression matrix(scdata) and
12# the number of most similar cell types
13> hcl_result <- scHCL(scdata = hcl_lung, numbers_plot = 3)
14
15 head(hcl_result$scHCL_probility)
16 Cell Cell type Score
171 E18_2_C06 Stratified epithelial cell (Adult-Trachea2) 0.07423052
182 E18_2_C06 Epithelial cell_MSMB high (Adult-Trachea2) 0.06744750
193 E18_2_C06 Basal cell_KRT6A high (Adult-Trachea2) 0.07889934
204 E18_2_C07 Zona fasciculata cell (Fetal-Adrenal-Gland2) 0.05358975
215 E18_2_C07 Intra-adrenal chromoblast (Fetal-Adrenal-Gland4) 0.05475451
226 E18_2_C07 Unknown (Fetal-Muscle1) 0.05533774
每个细胞亚群有三种可能,当然我们需要深入的探索一下,于是推出了shiny程序:
1# open shiny for visualize result for scHCL
2scHCL_vis(hcl_result)
那如何使用我们自己的数据呢?
我们先看一看,R包自带的参考数据集的信息。
1 rm(list=ls())
2library(scHCL)
3load("C:\\Users\\Administrator\\Desktop\\scHCL-master\\data\\ref.expr.rda")
4ref.expr[1:4,1:4]
5 M2 Macrophage(Adult-Adipose1) Stromal cell(Adult-Adipose1) Adipocyte_SPP1 high(Adult-Adipose1) Mast cell(Adult-Adipose1)
6CD163 409.0000 162.00000 285 315
7F13A1 237.0000 75.00000 52 190
8STAB1 172.3333 56.33333 66 164
9VSIG4 188.3333 52.33333 93 113
10 dim(ref.expr)
11[1] 6075 1841
12colnames(ref.expr)[1:4]
13[1] "M2 Macrophage(Adult-Adipose1)" "Stromal cell(Adult-Adipose1)" "Adipocyte_SPP1 high(Adult-Adipose1)" "Mast cell(Adult-Adipose1)"
然后我们读入自己的数据:
1met <- 'F:\\scOrange\\example\\GSM3587965_AML556-D15.dem.txt\\AML556-D15.dem.txt'
2met <-readr::read_tsv(met)
3
4Parsed with column specification:
5cols(
6 .default = col_double(),
7 Gene = col_character()
8)
9See spec(...) for full column specifications.
10|=============================================================================================================================================| 100% 64 MB
11met[1:4,1:4]
12# A tibble: 4 x 4
13 Gene `AML556-D15_AAAAAGGTGCTN` `AML556-D15_AAAAAGGTTTCT` `AML556-D15_AAAACCTCGGTN`
14 <chr> <dbl> <dbl> <dbl>
151 A1BG 0 0 0
162 A1BG-AS1 0 0 0
173 A1CF 0 0 0
184 A2M 0 0 0
19met <- as.data.frame(met)
20rownames(met) <- met$Gene
21met <- met[,-1]
22met[1:4,1:4]
23 AML556-D15_AAAAAGGTGCTN AML556-D15_AAAAAGGTTTCT AML556-D15_AAAACCTCGGTN AML556-D15_AAACACTAAGGN
24A1BG 0 0 0 0
25A1BG-AS1 0 0 0 0
26A1CF 0 0 0 0
27A2M 0 0 0 0
28>
运行scHCL预测。
也可以先运行聚类求平均表达谱,用平均表达谱来做。这里由于我不知道分群多少合适,就直接用表达谱做了(这样比较慢)。
1que.seu <-CreateSeuratObject(met)
2que.seu_suerat <- scHCL(scdata = que.seu@assays$RNA, numbers_plot = 3)
我们来看看预测的结果:
1que.seu_suerat$cors_matrix[1:4,1:4]
2 AML556-D15_AAAAAGGTGCTN AML556-D15_AAAAAGGTTTCT AML556-D15_AAAACCTCGGTN AML556-D15_AAACACTAAGGN
3M2 Macrophage (Adult-Adipose1) 0.08364598 0.04890905 0.04529126 0.10508757
4Stromal cell (Adult-Adipose1) 0.05262400 0.03501448 0.02367288 0.06850328
5Adipocyte_SPP1 high (Adult-Adipose1) 0.06046088 0.04372272 0.04175932 0.09665644
6Mast cell (Adult-Adipose1) 0.07187894 0.06195473 0.04336573 0.10477821
7> que.seu_suerat$top_cors
8[1] 3
9> head(que.seu_suerat$scHCL_probility,5)
10 Cell Cell type Score
111 AML556-D15_AAAAAGGTGCTN T cells2 (Placenta_VentoTormo) 0.2451411
122 AML556-D15_AAAAAGGTGCTN Blood NK CD16+ (Placenta_VentoTormo) 0.2206799
133 AML556-D15_AAAAAGGTGCTN T cells3 (Placenta_VentoTormo) 0.2256222
144 AML556-D15_AAAAAGGTTTCT T cells2 (Placenta_VentoTormo) 0.2001931
155 AML556-D15_AAAAAGGTTTCT T cells3 (Placenta_VentoTormo) 0.1936185
每个细胞numbers_plot =3 ,所以每个细胞展示3种celltype。
1scHCL_vis(que.seu_suerat)
在单细胞通量那么高的情况下,其实人们没时间去鉴定每一个细胞的类型,而这里还给每个细胞好几种类型。其实可先分群做成一个平均表达谱,再mapping到bulk的表达谱上,以鉴定亚群的细胞类型。这里作为探索,我们取Score值最值,在某个Score范围内注释到的celltype最多的。我们先退一步根据每个细胞的类型推测分群的celltype。
1 que.seu_suerat <- scHCL(scdata = que.seu@assays$RNA, numbers_plot = 1)
2> que.seu_suerat$cors_matrix[1:4,1:4]
3 AML556-D15_AAAAAGGTGCTN AML556-D15_AAAAAGGTTTCT AML556-D15_AAAACCTCGGTN AML556-D15_AAACACTAAGGN
4M2 Macrophage (Adult-Adipose1) 0.08364598 0.04890905 0.04529126 0.10508757
5Stromal cell (Adult-Adipose1) 0.05262400 0.03501448 0.02367288 0.06850328
6Adipocyte_SPP1 high (Adult-Adipose1) 0.06046088 0.04372272 0.04175932 0.09665644
7Mast cell (Adult-Adipose1) 0.07187894 0.06195473 0.04336573 0.10477821
8> que.seu_suerat$top_cors
9[1] 1
10> head(que.seu_suerat$scHCL_probility,5)
11 Cell Cell type Score
121 AML556-D15_AAAAAGGTGCTN T cells2 (Placenta_VentoTormo) 0.2451411
132 AML556-D15_AAAAAGGTTTCT T cells2 (Placenta_VentoTormo) 0.2001931
143 AML556-D15_AAAACCTCGGTN T cells2 (Placenta_VentoTormo) 0.2413753
154 AML556-D15_AAACACTAAGGN T cells3 (Placenta_VentoTormo) 0.2812856
165 AML556-D15_AAACAGCTACAA T cells3 (Placenta_VentoTormo) 0.2290333
17>
执行seurat标准流程:
1rownames(que.seu_suerat$scHCL_probility) <- que.seu_suerat$scHCL_probility$Cell
2que.seu %>% FindVariableFeatures() %>%
3 ScaleData() %>%
4 RunPCA() %>%
5 RunUMAP(dim=1:30) %>%
6 FindNeighbors() %>%
7 FindClusters(resolution =c(0.2,0.4,0.6,0.8))->que.seu # 制定多种分群结果。
8
9que.seu@meta.data$Cell <- rownames(que.seu@meta.data)
10merge(que.seu@meta.data,que.seu_suerat$scHCL_probility,by= "Cell") -> que.seu@meta.data
11rownames(que.seu@meta.data) <- que.seu@meta.data$Cell
1head(que.seu@meta.data)
2 Cell orig.ident nCount_RNA nFeature_RNA RNA_snn_res.0.2
3AML556-D15_AAAAAGGTGCTN AML556-D15_AAAAAGGTGCTN AML556-D15 1251 609 0
4AML556-D15_AAAAAGGTTTCT AML556-D15_AAAAAGGTTTCT AML556-D15 1253 529 0
5AML556-D15_AAAACCTCGGTN AML556-D15_AAAACCTCGGTN AML556-D15 1610 671 0
6AML556-D15_AAACACTAAGGN AML556-D15_AAACACTAAGGN AML556-D15 1229 597 0
7AML556-D15_AAACAGCTACAA AML556-D15_AAACAGCTACAA AML556-D15 1062 547 0
8AML556-D15_AAACCTCCGGGA AML556-D15_AAACCTCCGGGA AML556-D15 1325 708 0
9 RNA_snn_res.0.4 RNA_snn_res.0.6 RNA_snn_res.0.8 seurat_clusters
10AML556-D15_AAAAAGGTGCTN 0 1 1 1
11AML556-D15_AAAAAGGTTTCT 0 0 0 0
12AML556-D15_AAAACCTCGGTN 0 0 0 0
13AML556-D15_AAACACTAAGGN 0 0 0 0
14AML556-D15_AAACAGCTACAA 0 1 1 1
15AML556-D15_AAACCTCCGGGA 1 2 2 2
16 Cell type Score
17AML556-D15_AAAAAGGTGCTN T cells2 (Placenta_VentoTormo) 0.2451411
18AML556-D15_AAAAAGGTTTCT T cells2 (Placenta_VentoTormo) 0.2001931
19AML556-D15_AAAACCTCGGTN T cells2 (Placenta_VentoTormo) 0.2413753
20AML556-D15_AAACACTAAGGN T cells3 (Placenta_VentoTormo) 0.2812856
21AML556-D15_AAACAGCTACAA T cells3 (Placenta_VentoTormo) 0.2290333
22AML556-D15_AAACCTCCGGGA T cells2 (Placenta_VentoTormo) 0.2517737
差异分析:
1DT::datatable(FindAllMarkers(que.seu))
2Calculating cluster 0
3 |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=23s
4Calculating cluster 1
5 |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=26s
6Calculating cluster 2
7 |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=19s
8Calculating cluster 3
9 |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=28s
10Calculating cluster 4
11 |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01m 07s
12Calculating cluster 5
13 |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=24s
这可以和HCL官网的数据做个比较:
做个我们的桑基图看一看我们的注释结果吧。
1library(sankeywheel)
2sankeywheel(
3 from = que.seu@meta.data$seurat_clusters, to = que.seu@meta.data$`Cell type`,
4 weight = que.seu@meta.data$Score,#type = "sankey",
5 title = "sk",
6 subtitle = "scHCL Project",
7 seriesName = "", width = "100%", height = "600px"
8)
还是掰直了吧:
我们看到0,1,2到细胞类型那都比较均匀的混合在一起了,再看看他们最可能的注释结果:
1que.seu@meta.data %>%
2 group_by(seurat_clusters,`Cell type`) %>%
3 count(`Cell type`) %>%
4 arrange(seurat_clusters,desc(n)) %>%
5 group_by(seurat_clusters)%>%
6 top_n(1)#summarise()
7
8Selecting by n
9# A tibble: 6 x 3
10# Groups: seurat_clusters [6]
11 seurat_clusters `Cell type` n
12 <fct> <fct> <int>
131 0 T cells2 (Placenta_VentoTormo) 252
142 1 T cells2 (Placenta_VentoTormo) 239
153 2 T cells2 (Placenta_VentoTormo) 131
164 3 B cell (Plasmocyte) (Adult-Thyroid2) 38
175 4 Immature myeloid progenitor cell (Haematopoietic-Stem-Cell_Velten) 59
186 5 dNKp (Placenta_VentoTormo) 25
这是seurat_clusters = RNA_snn_res.0.8 时的分群以及注释结果,果然0,1,2都注释到了同一种细胞类型,这是真的吗?所以我们希望知道这三个群的关系是怎样的呢?
1library(clustree)
2que.seu@meta.data %>%
3 select(starts_with("RNA_snn_res")) %>%
4 mutate(RNA_snn_res.0.0=0)%>%
5 clustree( prefix = "RNA_snn_res.0")
他们真的就是来自 RNA_snn_res.0.2时的同一个群啊,这是不是启发我们:其实分为3-4个群也是可以呢?
当 RNA_snn_res.0.2时:
1que.seu@meta.data %>%
2 group_by(RNA_snn_res.0.2,`Cell type`) %>%
3 count(`Cell type`) %>%
4 arrange(RNA_snn_res.0.2,desc(n)) %>%
5 group_by(RNA_snn_res.0.2)%>%
6 top_n(1)#summarise()
7
8Selecting by n
9# A tibble: 3 x 3
10# Groups: RNA_snn_res.0.2 [3]
11 RNA_snn_res.0.2 `Cell type` n
12 <fct> <fct> <int>
131 0 T cells2 (Placenta_VentoTormo) 621
142 1 Immature myeloid progenitor cell (Haematopoietic-Stem-Cell_Velten) 61
153 2 B cell (Plasmocyte) (Adult-Thyroid2) 38
1que.seu@meta.data %>% mutate(CellType = ifelse(RNA_snn_res.0.2 == 0,"T cells2 (Placenta_VentoTormo)",
2 ifelse(RNA_snn_res.0.2==1,"Immature myeloid progenitor cell (Haematopoietic-Stem-Cell_Velten)",
3 "B cell (Plasmocyte) (Adult-Thyroid2)")))->que.seu@meta.data
4rownames(que.seu@meta.data ) <-que.seu@meta.data$Cell
5head(que.seu@meta.data)
6
7DimPlot(que.seu,group.by ="CellType")
这里是探索细胞类型,而不是百分百的鉴定它,但是这至少给了我们一个可能的选择。我相信细胞类型的注释会随着单细胞图谱的发表而得到改善,基于表达谱mapping的方法是个应用的方向。其实之前我们用seurst就讲过它可以把未知类型的细胞mapping到reference上,只是一直以来缺少可以mapping的reference。
1ref.seu <-CreateSeuratObject(ref.expr)
2ref.seu %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA() -> ref.seu
3
4head(ref.seu@meta.data)
5?FindTransferAnchors
6
7anchors <- FindTransferAnchors(reference = ref.seu, query = que.seu,
8 dims = 1:30,k.anchor = 3 ,k.filter = 50,
9 features = intersect(VariableFeatures(que.seu),VariableFeatures(ref.seu))[1:500])# features = VariableFeatures(rque.seu)[1:500]
10
11?TransferData
12ref.seu %>% FindNeighbors() -> ref.seu
13ref.seu <-FindClusters(
14 ref.seu,
15 resolution = c(0.2)
16)
17
18head(ref.seu@meta.data)
19table(Idents(ref.seu))
20predictions <- TransferData(anchorset = anchors, refdata = ref.seu@meta.data$seurat_clusters,
21 dims = 1:10,
22 k.weight = 5,)
23head(predictions)
24
25query <- AddMetaData(que.seu, metadata = predictions)
26head(query@meta.data)
27query$prediction.match <- query$predicted.id == query$celltype
28
29query %>% RunUMAP(dims = 1:30)%>% RunTSNE(dims = 1:30) -> query
30
1head(query@meta.data)
2 Cell orig.ident nCount_RNA nFeature_RNA RNA_snn_res.0.2
3AML556-D15_AAAAAGGTGCTN AML556-D15_AAAAAGGTGCTN AML556-D15 1251 609 0
4AML556-D15_AAAAAGGTTTCT AML556-D15_AAAAAGGTTTCT AML556-D15 1253 529 0
5AML556-D15_AAAACCTCGGTN AML556-D15_AAAACCTCGGTN AML556-D15 1610 671 0
6AML556-D15_AAACACTAAGGN AML556-D15_AAACACTAAGGN AML556-D15 1229 597 0
7AML556-D15_AAACAGCTACAA AML556-D15_AAACAGCTACAA AML556-D15 1062 547 0
8AML556-D15_AAACCTCCGGGA AML556-D15_AAACCTCCGGGA AML556-D15 1325 708 0
9 RNA_snn_res.0.4 RNA_snn_res.0.6 RNA_snn_res.0.8 seurat_clusters
10AML556-D15_AAAAAGGTGCTN 0 1 1 1
11AML556-D15_AAAAAGGTTTCT 0 0 0 0
12AML556-D15_AAAACCTCGGTN 0 0 0 0
13AML556-D15_AAACACTAAGGN 0 0 0 0
14AML556-D15_AAACAGCTACAA 0 1 1 1
15AML556-D15_AAACCTCCGGGA 1 2 2 2
16 Cell type Score CellTpye
17AML556-D15_AAAAAGGTGCTN T cells2 (Placenta_VentoTormo) 0.2451411 T cells2 (Placenta_VentoTormo)
18AML556-D15_AAAAAGGTTTCT T cells2 (Placenta_VentoTormo) 0.2001931 T cells2 (Placenta_VentoTormo)
19AML556-D15_AAAACCTCGGTN T cells2 (Placenta_VentoTormo) 0.2413753 T cells2 (Placenta_VentoTormo)
20AML556-D15_AAACACTAAGGN T cells3 (Placenta_VentoTormo) 0.2812856 T cells2 (Placenta_VentoTormo)
21AML556-D15_AAACAGCTACAA T cells3 (Placenta_VentoTormo) 0.2290333 T cells2 (Placenta_VentoTormo)
22AML556-D15_AAACCTCCGGGA T cells2 (Placenta_VentoTormo) 0.2517737 T cells2 (Placenta_VentoTormo)
23 CellType predicted.id prediction.score.1
24AML556-D15_AAAAAGGTGCTN T cells2 (Placenta_VentoTormo) 2 0.0000000
25AML556-D15_AAAAAGGTTTCT T cells2 (Placenta_VentoTormo) 2 0.1732816
26AML556-D15_AAAACCTCGGTN T cells2 (Placenta_VentoTormo) 2 0.0000000
27AML556-D15_AAACACTAAGGN T cells2 (Placenta_VentoTormo) 2 0.1732816
28AML556-D15_AAACAGCTACAA T cells2 (Placenta_VentoTormo) 2 0.0000000
29AML556-D15_AAACCTCCGGGA T cells2 (Placenta_VentoTormo) 2 0.0000000
30 prediction.score.0 prediction.score.3 prediction.score.5 prediction.score.2
31AML556-D15_AAAAAGGTGCTN 0 0 0 1.0000000
32AML556-D15_AAAAAGGTTTCT 0 0 0 0.8267184
33AML556-D15_AAAACCTCGGTN 0 0 0 1.0000000
34AML556-D15_AAACACTAAGGN 0 0 0 0.8267184
35AML556-D15_AAACAGCTACAA 0 0 0 1.0000000
36AML556-D15_AAACCTCCGGGA 0 0 0 1.0000000
37 prediction.score.6 prediction.score.4 prediction.score.max
38AML556-D15_AAAAAGGTGCTN 0 0 1.0000000
39AML556-D15_AAAAAGGTTTCT 0 0 0.8267184
40AML556-D15_AAAACCTCGGTN 0 0 1.0000000
41AML556-D15_AAACACTAAGGN 0 0 0.8267184
42AML556-D15_AAACAGCTACAA 0 0 1.0000000
43AML556-D15_AAACCTCCGGGA 0 0 1.0000000
44#
1#DimPlot(query,group.by ="predicted.id")
往期精彩
如果你对单细胞转录组研究感兴趣,但又不知道如何入门,也许你可以关注一下下面的课程
生信爆款入门-全球听(买一得五)(第4期) 你的生物信息入门课
数据挖掘第2期(两天变三周,实力加量)医学生/医生首选技能提高课
生信技能树的2019年终总结 你的生物信息成长宝藏
单细胞天地欢迎你