其他
细胞的数量由誰决定?
> library(Seurat)
>
> pbmc.data <- Read10X(data.dir = "5000/")
> pbmc5000 <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k")
> length(pbmc5000@active.ident)
[1] 14334
>
> pbmc.data <- Read10X(data.dir = "10000/")
> pbmc10000 <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k")
> length(pbmc10000@active.ident)
[1] 14317
install.packages("Seurat")
if (!requireNamespace("BiocManager", quietly = TRUE))
+ install.packages("BiocManager")
BiocManager::install("multtest")
BiocManager::install("multtest")
library(multtest)
library(Seurat)
library(dplyr)
pbmc.data <- Read10X(data.dir = "路径")
#自动读取10X的数据,是一些tsv与mtx文件
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
#min.cells表示基因至少在几个细胞中表达,min.genes代表每个细胞中至少表达多少基因,细胞的平均reads数量> 5万一般比较好,否则需要补测
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^mt-") #通过线粒体的序列数来对数据进行计算
head(pbmc@meta.data, 5) #QC的数据存在meta.data里,可以用这个来查看前5行
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
#用小提琴图来展示QC的结果,展示了每个barcode中基因的数目、UMI数目以及线粒体基因含量的分布情况
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2)) #将这些指标画出来,判断趋势及占比
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
#质控,选子集,RNA数量在200与2500之间的
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#用LogNormalize法对数据进行标准化(乘10000再取对数)数据存在pbmc[["RNA"]]@data.里
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# 筛选差异基因(输出2000个)
top10 <- head(VariableFeatures(pbmc), 10) #输出差异最大的十个基因
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
CombinePlots(plots = list(plot1, plot2)) #输出差异基因散点图(有无标签)
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes) #将数据进行标准化,为后续的PCA分析做准备,数据存在pbmc[["RNA"]]@scale.data
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt") #剔除不想要的变量(如线粒体的比例)
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) #PCA降维分析
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5) #打印PCA部分结果
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) #三种PCA展示方式
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE) #展示15种PCA
#筛选合适的维度:
pbmc <- JackStraw(pbmc, num.replicate = 100) #重复计算次数
pbmc <- ScoreJackStraw(pbmc, dims = 1:20) #计算维度这一步花的时间比较久
JackStrawPlot(pbmc, dims = 1:15) #画出1到15个维度
ElbowPlot(pbmc) #利用ElbowPlot来评价PC,PCA切记为了结果好看而降低PC数
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
#细胞分群,resolution分辨率在细胞数在3000附近时一般设为0.4-1.2,resolution越大得到的类群越多
head(Idents(pbmc), 5) #看前五个分类群ID
#非线性降维方法:UMAP、tSNE
pbmc <- RunUMAP(pbmc, dims = 1:10) #运行UMAP算法
pbmc <- RunTSNE(pbmc, dims = 1:10) #运行TSNE算法,TSNE算法运行时间较UMAP更久
DimPlot(pbmc, reduction = "umap") #画出UMAP结果
DimPlot(pbmc, reduction = "umap", label = TRUE)
#显示分类群的标号
saveRDS(pbmc, file = "C:/Users/53513/Desktop/pbmc_tutorial.rds")
#把结果存出来
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
#寻找分类群1中所有的的标记基因,
head(cluster1.markers, n = 5)
#查看前五个标记基因
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
#寻找分类群5相较于分类群0、3的标记基因
head(cluster5.markers, n = 5)
#查看前五个标记基因
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#寻找每一个分类群与其他所有细胞之间的标记基因,并只显示positive结果
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
#展示部分
cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
#roc算法寻找标记基因
VlnPlot(pbmc, features = c("基因名1", "基因名2"))
#画出某一基因在各个分类群中的表达情况
VlnPlot(pbmc, features = c("基因名1", "基因名2"), slot = "counts", log = TRUE)
#加log
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
#以UMAP的图展示标记基因
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
#展示前20个标记基因的热图
#通过标记基因及文献,可以人工确定各分类群的细胞类型,则可以如下手动添加细胞群名称:
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono",
"NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
saveRDS(pbmc, file = "../output/pbmc3k_final.rds")
#结果文件输出