其他
根据表达矩阵进行分群-2
课程笔记
粉丝:有单细胞线上课程吗?
小编:什么
好了,戏演完了,下面郑重介绍下我们的单细胞线上课程:(详情戳下方链接)
这个课程笔记栏目记录了学员们学习单细胞转录组课程的学习笔记
希望大家能有所收获!
作者 | 单细胞天地小编 刘小泽
课程链接在:http://jm.grazy.cn/index/mulitcourse/detail.html?cid=55
这次会介绍如何针对表达矩阵进行分群,不一定需要包装好的函数。对应视频第三单元5-6讲
3 使用Seurat进行tSNE
上面我们使用了RPKM矩阵,下面的Seurat将会使用原始表达矩阵。当然也是推荐使用原始矩阵进行分析的
3.1 下载原始表达矩阵
链接在:https://raw.githubusercontent.com/IStevant/XX-XY-mouse-gonad-scRNA-seq/master/data/female_count.Robj
1load(file="../female_count.Robj")
2load('../female_rpkm.Rdata')
3# 直接对细胞和基因过滤
4female_count <- female_count[rownames(female_count) %in% rownames(females),!colnames(female_count) %in% grep("rep",colnames(female_count), value=TRUE)]
5
6> female_count[1:3,1:3]
7 E10.5_XX_20140505_C01_150331_1 E10.5_XX_20140505_C02_150331_1
8eGFP 19582 526
9Gnai3 2218 122
10Pbsn 0 0
11 E10.5_XX_20140505_C03_150331_1
12eGFP 4786
13Gnai3 4
14Pbsn 0
15
16save(female_count,file = '../female_count.Rdata')
3.2 对细胞操作=》细胞发育时期的获取
1load('../female_count.Rdata')
2female_stages <- sapply(strsplit(colnames(female_count), "_"), `[`, 1)
3names(female_stages) <- colnames(female_count)
4> table(female_stages)
5female_stages
6E10.5 E11.5 E12.5 E13.5 E16.5 P6
7 68 100 103 99 85 108
3.3 使用Seurat V3
构建对象
1sce_female <- CreateSeuratObject(counts = female_count,
2 project = "sce_female",
3 min.cells = 1, min.features = 0)
4> sce_female
5An object of class Seurat
616765 features across 563 samples within 1 assay
7Active assay: RNA (16765 features)
添加样本注释信息
1sce_female <- AddMetaData(object = sce_female,
2 metadata = apply(female_count, 2, sum),
3 col.name = 'nUMI_raw')
4sce_female <- AddMetaData(object = sce_female,
5 metadata = female_stages,
6 col.name = 'female_stages')
数据归一化
1sce_female <- NormalizeData(sce_female)
2sce_female[["RNA"]]@data[1:3,1:3]
找差异基因HVGs
1sce_female <- FindVariableFeatures(sce_female,
2 selection.method = "vst",
3 nfeatures = 2000)
4# HVGs可视化
5VariableFeaturePlot(sce_female)
1seurat3_HVGs <- VariableFeatures(sce_female)
2# 检查与之前得到的HVGs重合度
3load('females_hvg_matrix.Rdata')
4load('seurat3_HVGs.Rdata')
5length(intersect(rownames(females_data),seurat3_HVGs))
6# 结果和之前822个HVGs有434个重合
数据标准化
1# 默认只对FindVariableFeatures得到的HVGs进行操作
2sce_female <- ScaleData(object = sce_female,
3 vars.to.regress = c('nUMI_raw'),
4 model.use = 'linear',
5 use.umi = FALSE)
PCA降维
1sce_female <- RunPCA(sce_female,
2 features = VariableFeatures(object = sce_female))
降维后聚类
1# 这里可以多选一些PCs
2sce_female <- FindNeighbors(sce_female, dims = 1:20)
3sce_female <- FindClusters(sce_female, resolution = 0.3)
进行tSNE
1ElbowPlot(sce_female)
2sce_female_tsne <- RunTSNE(sce_female, dims = 1:9)
tSNE结果可视化
1# 6个发育时间
2DimPlot(object = sce_female_tsne, reduction = "tsne",
3 group.by = 'female_stages')
4# 4个cluster
5DimPlot(sce_female_tsne, reduction = "tsne")
比较两次的聚类结果
1cluster1 <- read.csv('female_clustering.csv')
2cluster2 <- as.data.frame(Idents(sce_female_tsne))
3# 把它们放在一起比较,前提条件是它们的行名相同
4> identical(cluster1[,1],rownames(cluster2))
5[1] TRUE
6
7> table(cluster1[,2],cluster2[,1])
8
9 0 1 2 3
10 C1 224 3 13 0
11 C2 6 0 84 0
12 C3 12 177 0 1
13 C4 0 0 0 43
这也说明了,不同方法虽然选择的HVGs数量不同,也不完全一样,聚类的参数也不同,但最后真正的生物学意义是不会去掉的。只能说,最后选多少群是根据分析的人根据自己的理解去解释,只要参数变化,就会有各种不同的结果
4 使用更简单的函数去分群
1rm(list = ls())
2options(warn=-1)
3options(stringsAsFactors = F)
4load('../female_rpkm.Rdata')
5
6# 根据分群获得颜色
7cluster <- read.csv('female_clustering.csv')
8color <- rainbow(4)[as.factor(cluster[,2])]
9> table(color)
10color
11#00FFFFFF #8000FFFF #80FF00FF #FF0000FF
12 190 43 90 240
13
14# 取前1000个sd最大的基因作为HVGs
15choosed_count <- females
16# 表达矩阵过滤
17choosed_count <- choosed_count[apply(choosed_count, 1, sd)>0,]
18choosed_count <- choosed_count[names(head(sort(apply(choosed_count, 1, sd),decreasing = T),1000)),]
进行PCA分析
1pca_out <- prcomp(t(choosed_count),scale. = T)
2
3> pca_out$x[1:3,1:3]
4 PC1 PC2 PC3
5E10.5_XX_20140505_C01_150331_1 13.21660 -4.1600782 1.5287334
6E10.5_XX_20140505_C02_150331_1 13.73109 -0.2848806 -0.8443587
7E10.5_XX_20140505_C03_150331_1 10.89558 -0.2720221 -3.3839651
8
9library(ggfortify)
10autoplot(pca_out, col=color) +theme_classic()+ggtitle('PCA plot')
进行tSNE
1library(Rtsne)
2# 依旧选前9个
3tsne_out <- Rtsne(pca_out$x[,1:9], perplexity = 10,
4 pca = F, max_iter = 2000,
5 verbose = T)
6tsnes_cord <- tsne_out$Y
7colnames(tsnes_cord) <- c('tSNE1','tSNE2')
8ggplot(tsnes_cord, aes(x=tSNE1, y = tSNE2)) + geom_point(col=color) + theme_classic()+ggtitle('tSNE plot')
除了之前的HCPC和seurat分群,还可以利用DBSCAN、kmeans分群
1# 这个运行会非常慢!
2if(T){
3 library(Rtsne)
4 N_tsne <- 50
5 tsne_out <- list(length = N_tsne)
6 KL <- vector(length = N_tsne)
7 set.seed(1234)
8 for(k in 1:N_tsne)
9 {
10 tsne_out[[k]]<-Rtsne(t(log2(females+1)),initial_dims=30,verbose=FALSE,check_duplicates=FALSE,
11 perplexity=27, dims=2,max_iter=5000)
12 KL[k]<-tail(tsne_out[[k]]$itercosts,1)
13 print(paste0("FINISHED ",k," TSNE ITERATION"))
14 }
15 names(KL) <- c(1:N_tsne)
16 opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y
17}
18# DBSCAN结果
19library(dbscan)
20plot(opt_tsne, col=dbscan(opt_tsne,eps=3.1)$cluster,
21 pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
1# kmeans结果
2plot(opt_tsne, col=kmeans(opt_tsne,centers = 4)$clust,
3 pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
比较它们的差异
1# 其中kmeans是4群
2> table(kmeans(opt_tsne,centers = 4)$clust,dbscan(opt_tsne,eps=3.5)$cluster)
3
4 0 1 2 3 4
5 1 2 0 0 206 0
6 2 1 106 0 0 0
7 3 0 93 10 0 0
8 4 1 138 0 1 5
如果你对单细胞转录组研究感兴趣,但又不知道如何入门,也许你可以关注一下下面的课程