查看原文
其他

细胞的数量由誰决定?

Biomamba Biomamba 生信基地 2023-06-15
最近在用Cellranger构建10x Genomic的Seurat标准输入文件时发现有“ --expect-cells”这一参数,本人对于这一参数表示不理解,Seurat中细胞的数量难道不是由上游建库时测序中捕获到的细胞数决定的?居然有参数指定的操作,故有下文,一探究竟。



分别选取--expect-cells=5000及--expect-cells=10000两种参数进行构建,将得到的barcodes.tsv.gz、features.tsv.gz、matrix.mtx.gz三个文件用于下游的Seurat读取,为了防止Seurat的过滤参数影响我们判断Cellranger的过滤参数,我们将Seurat的过滤忽略,得到如下结果:

> library(Seurat)> > pbmc.data <- Read10X(data.dir = "5000/") > pbmc5000 <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k") > length(pbmc5000@active.ident)[1] 14334> > pbmc.data <- Read10X(data.dir = "10000/") > pbmc10000 <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k") > length(pbmc10000@active.ident)[1] 14317

显然,这一结果告诉我们--expect-cells并不是字面上那样能够指定下游产生的细胞数量,细胞的真实数量依然由上游的建库和测序而决定。不过,这一参数的改变还是对细胞数量产生了细微的影响,至于对下游的Seurat标准流程会产生多大的影响,大家可以跑通试一下,示例代码给大家放在下面。

install.packages("Seurat")if (!requireNamespace("BiocManager", quietly = TRUE))+ install.packages("BiocManager")BiocManager::install("multtest")BiocManager::install("multtest")library(multtest)library(Seurat)library(dplyr)pbmc.data <- Read10X(data.dir = "路径") #自动读取10X的数据,是一些tsv与mtx文件pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)#min.cells表示基因至少在几个细胞中表达,min.genes代表每个细胞中至少表达多少基因,细胞的平均reads数量> 5万一般比较好,否则需要补测pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^mt-") #通过线粒体的序列数来对数据进行计算head(pbmc@meta.data, 5) #QC的数据存在meta.data里,可以用这个来查看前5行VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) #用小提琴图来展示QC的结果,展示了每个barcode中基因的数目、UMI数目以及线粒体基因含量的分布情况plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")CombinePlots(plots = list(plot1, plot2)) #将这些指标画出来,判断趋势及占比pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) #质控,选子集,RNA数量在200与2500之间的pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000) #用LogNormalize法对数据进行标准化(乘10000再取对数)数据存在pbmc[["RNA"]]@data.里pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)# 筛选差异基因(输出2000个)top10 <- head(VariableFeatures(pbmc), 10) #输出差异最大的十个基因plot1 <- VariableFeaturePlot(pbmc)plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)CombinePlots(plots = list(plot1, plot2)) #输出差异基因散点图(有无标签)all.genes <- rownames(pbmc)pbmc <- ScaleData(pbmc, features = all.genes) #将数据进行标准化,为后续的PCA分析做准备,数据存在pbmc[["RNA"]]@scale.datapbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt") #剔除不想要的变量(如线粒体的比例)pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) #PCA降维分析print(pbmc[["pca"]], dims = 1:5, nfeatures = 5) #打印PCA部分结果VizDimLoadings(pbmc, dims = 1:2, reduction = "pca") DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) #三种PCA展示方式DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE) #展示15种PCA#筛选合适的维度:pbmc <- JackStraw(pbmc, num.replicate = 100) #重复计算次数pbmc <- ScoreJackStraw(pbmc, dims = 1:20) #计算维度这一步花的时间比较久JackStrawPlot(pbmc, dims = 1:15) #画出1到15个维度ElbowPlot(pbmc) #利用ElbowPlot来评价PC,PCA切记为了结果好看而降低PC数pbmc <- FindNeighbors(pbmc, dims = 1:10)pbmc <- FindClusters(pbmc, resolution = 0.5) #细胞分群,resolution分辨率在细胞数在3000附近时一般设为0.4-1.2,resolution越大得到的类群越多head(Idents(pbmc), 5) #看前五个分类群ID#非线性降维方法:UMAP、tSNEpbmc <- RunUMAP(pbmc, dims = 1:10) #运行UMAP算法pbmc <- RunTSNE(pbmc, dims = 1:10) #运行TSNE算法,TSNE算法运行时间较UMAP更久DimPlot(pbmc, reduction = "umap") #画出UMAP结果DimPlot(pbmc, reduction = "umap", label = TRUE)#显示分类群的标号saveRDS(pbmc, file = "C:/Users/53513/Desktop/pbmc_tutorial.rds")#把结果存出来cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25) #寻找分类群1中所有的的标记基因,head(cluster1.markers, n = 5) #查看前五个标记基因cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25) #寻找分类群5相较于分类群0、3的标记基因head(cluster5.markers, n = 5) #查看前五个标记基因pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) #寻找每一个分类群与其他所有细胞之间的标记基因,并只显示positive结果pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC) #展示部分cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) #roc算法寻找标记基因VlnPlot(pbmc, features = c("基因名1", "基因名2"))#画出某一基因在各个分类群中的表达情况 VlnPlot(pbmc, features = c("基因名1", "基因名2"), slot = "counts", log = TRUE) #加logFeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) #以UMAP的图展示标记基因top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)DoHeatmap(pbmc, features = top10$gene) + NoLegend() #展示前20个标记基因的热图#通过标记基因及文献,可以人工确定各分类群的细胞类型,则可以如下手动添加细胞群名称:new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")names(new.cluster.ids) <- levels(pbmc)pbmc <- RenameIdents(pbmc, new.cluster.ids)DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()saveRDS(pbmc, file = "../output/pbmc3k_final.rds")#结果文件输出

您可能也对以下帖子感兴趣

文章有问题?点此查看未经处理的缓存