Seurat 官网单细胞教程一 (数据整合)

Original JunJunLab 老俊俊的生信笔记 2022-08-27

收录于合集 #单细胞笔记 21个

不确定的事

1引言

单细胞测序经常会测量多个样本,比如 重复样本, 不同时间点的样本, 不同条件下处理样本, 不同平台的数据集整合 等等情况,我们需要根据具体情况具体分析,比如我们的整合目的是什么,整合后是否需要去除批次效应等等问题。

Seurat 官网提供了一个两个条件下(对照和处理)的数据整合教程,接下来跟着一起学习一下。

2数据准备

加载内置数据集:

library(Seurat)
library(SeuratData)
library(patchwork)

# install dataset
InstallData("ifnb")

# load dataset
ifnb <- UpdateSeuratObject(LoadData("ifnb"))
ifnb
# An object of class Seurat
# 14053 features across 13999 samples within 1 assay
# Active assay: RNA (14053 features, 0 variable features)

# check group
table(ifnb$stim)
# CTRL STIM
# 6548 7451

可以看到这是有 CTRL 和 STIM 的数据,我们按 stim 分割出两个数据集:

# split the dataset into a list of two seurat objects (stim and CTRL)
ifnb.list <- SplitObject(ifnb, split.by = "stim")
ifnb.list

# $CTRL
# An object of class Seurat
# 14053 features across 6548 samples within 1 assay
# Active assay: RNA (14053 features, 0 variable features)
#
# $STIM
# An object of class Seurat
# 14053 features across 7451 samples within 1 assay
# Active assay: RNA (14053 features, 0 variable features)

这个格式的 list 我们批量读取也可以产生类似的结构。

3批量归一化和找高变基因

# normalize and identify variable features for each dataset independently
ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

ifnb.list
# $CTRL
# An object of class Seurat
# 14053 features across 6548 samples within 1 assay
# Active assay: RNA (14053 features, 2000 variable features)
#
# $STIM
# An object of class Seurat
# 14053 features across 7451 samples within 1 assay
# Active assay: RNA (14053 features, 2000 variable features)

4整合

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = ifnb.list)

immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list,
                                         anchor.features = features)
# Scaling features for provided objects
#   |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s
# ...

# this command creates an 'integrated' data assay
immune.combined <- IntegrateData(anchorset = immune.anchors)

5下游常规分析

首先我们需要将 Assay 切换到 integrated:

# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
DefaultAssay(immune.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
immune.combined <- ScaleData(immune.combined, verbose = FALSE)
immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
immune.combined <- FindClusters(immune.combined, resolution = 0.6)

6可视化

聚类图:

# Visualization
p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim")
p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2

按 stim 分组:

DimPlot(immune.combined, reduction = "umap", split.by = "stim")

展示不同分组的 marker 基因:

# 切换 Assay
DefaultAssay(immune.combined) <- "RNA"

# 亚群命名
immune.combined <- RenameIdents(immune.combined, `0` = "CD14 Mono", `1` = "CD4 Naive T", `2` = "CD4 Memory T",
                                `3` = "CD16 Mono", `4` = "B", `5` = "CD8 T", `6` = "NK", `7` = "T activated", `8` = "DC", `9` = "B Activated",
                                `10` = "Mk", `11` = "pDC", `12` = "Eryth", `13` = "Mono/Mk Doublets", `14` = "HSPC")

Idents(immune.combined) <- factor(Idents(immune.combined),
                                  levels = c("HSPC", "Mono/Mk Doublets",
                                             "pDC", "Eryth", "Mk", "DC",
                                             "CD14 Mono", "CD16 Mono",
                                             "B Activated", "B", "CD8 T",
                                             "NK", "T activated",
                                             "CD4 Naive T", "CD4 Memory T"))

# marker 基因
markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
                     "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
                     "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ", "PRSS57")

# 点图
DotPlot(immune.combined,
        features = markers.to.plot,
        cols = c("blue", "red"),
        dot.scale = 8,
        split.by = "stim") +
  RotatedAxis()

7不同条件的亚群的差异分析

################################### diff analysis

# add new group
immune.combined$celltype.stim <- paste(Idents(immune.combined),
                                       immune.combined$stim, sep = "_")

# check
head(immune.combined@meta.data$celltype.stim,3)
# [1] "CD14 Mono_CTRL" "CD14 Mono_CTRL" "CD14 Mono_CTRL"

# add cell type
immune.combined$celltype <- Idents(immune.combined)

# set Idents to new group
Idents(immune.combined) <- "celltype.stim"

# diff test
b.interferon.response <- FindMarkers(immune.combined,
                                     ident.1 = "B_STIM", ident.2 = "B_CTRL",
                                     verbose = FALSE)
head(b.interferon.response, n = 3)

#               p_val avg_log2FC pct.1 pct.2     p_val_adj
# ISG15 8.657899e-156   4.596502 0.998 0.240 1.216695e-151
# IFIT3 3.536522e-151   4.500500 0.964 0.052 4.969874e-147
# IFI6  1.204612e-149   4.233582 0.966 0.080 1.692841e-145

可视化不同条件的差异基因:

FeaturePlot(immune.combined,
            features = c("CD3D", "GNLY", "IFI6"),
            split.by = "stim", max.cutoff = 3,
            cols = c("grey", "red"))

小提琴图:

plots <- VlnPlot(immune.combined,
                 features = c("LYZ", "ISG15", "CXCL10"),
                 split.by = "stim", group.by = "celltype",
                 pt.size = 0,
                 combine = FALSE)

wrap_plots(plots = plots, ncol = 1)

8改进的 SCTransform 方法

后面 seurat 团体在 2019 年基于的改进的regularized negative binomial regression(SCTransform) 方法,将 NormalizeData,FindVariableFeatures和ScaleData 包装于一体,使用此方法进行整合前的预处理操作,多了 PrepSCTIntegration 步骤:

示例代码:

ifnb.list <- SplitObject(ifnb, split.by = "stim")

# SCTransform
ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform)

# get common features
features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000)

# Integration.1
ifnb.list <- PrepSCTIntegration(object.list = ifnb.list,
                                anchor.features = features)

# Integration.2
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list,
                                         normalization.method = "SCT",
                                         anchor.features = features)

# Integration.3
immune.combined.sct <- IntegrateData(anchorset = immune.anchors,
                                     normalization.method = "SCT")

接下来是常规步骤:

# common steps
immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE)
immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30)

p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim")
p2 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "seurat_annotations", label = TRUE,
              repel = TRUE)

p1 + p2