CUT&Tag 数据处理与分析教程五：Peak calling

工具人~ 狮山生信 2022-08-15

CUT&Tag 数据处理与分析教程二：质控（不需要修剪 reads！不需要修剪 reads！不需要修剪reads！）和数据比对

CUT&Tag 数据处理与分析教程三：BAM 文件统计（CUT&Tag 不建议去重不去重不去重）

CUT&Tag 数据处理与分析教程四：Spike-in 对 CUT&Tag 数据的校正对数据的校正

本部分我们将介绍 CUT&Tag 数据的 Peak calling，大部分内容与 ChIP-seq 数据部分一致。

VI. Peak calling

SEACR

CUT&RUN 稀疏富集分析，SEACR 包的设计是从非常低的背景 (即没有 reads 覆盖的区域) 的染色质谱数据进行 call peak 和富集，这是典型的 CUT&Tag 染色质谱分析实验。SEACR 需要来自双端序列的 bedGraph 文件作为输入，并将 peaks 定义为碱基覆盖的相邻块，它们与 IgG 对照数据集中描述的背景信号块不重叠。SEACR 既能有效地从转录因子结合位点 call 窄峰，也能有效地 call 一些组蛋白修饰特征的宽峰。该方法的描述发表在 2019-Peak calling by Sparse Enrichment Analysis for CUT&RUN chromatin profiling，并且用户使用手册在 github SEACR: Sparse Enrichment Analysis for CUT&RUN 上。由于我们已经用大肠杆菌 reads 计数规标准化了片段计数，所以我们将 SEACR 的规范化选项设置为 non。否则，建议使用 norm。

##== linux 命令==##
seacr="/fh/fast/gottardo_r/yezheng_working/Software/SEACR/SEACR_1.3.sh"
histControl=$2
mkdir -p $projPath/peakCalling/SEACR
bash $seacr $projPath/alignment/bedgraph/${histName}_bowtie2.fragments.normalized.bedgraph \
     $projPath/alignment/bedgraph/${histControl}_bowtie2.fragments.normalized.bedgraph \
     non stringent $projPath/peakCalling/SEACR/${histName}_seacr_control.peaks
bash $seacr $projPath/alignment/bedgraph/${histName}_bowtie2.fragments.normalized.bedgraph 0.01 non stringent $projPath/peakCalling/SEACR/${histName}_seacr_top0.01.peaks

Called Peak 的数目

##=== R command ===## 
peakN = c()
peakWidth = c()
peakType = c("control", "top0.01")
for(hist in sampleList){
  histInfo = strsplit(hist, "_")[[1]]
  if(histInfo[1] != "IgG"){
    for(type in peakType){
      peakInfo = read.table(paste0(projPath, "/peakCalling/SEACR/", hist, "_seacr_", type, ".peaks.stringent.bed"), header = FALSE, fill = TRUE)  %>% mutate(width = abs(V3-V2))
      peakN = data.frame(peakN = nrow(peakInfo), peakType = type, Histone = histInfo[1], Replicate = histInfo[2]) %>% rbind(peakN, .)
      peakWidth = data.frame(width = peakInfo$width, peakType = type, Histone = histInfo[1], Replicate = histInfo[2])  %>% rbind(peakWidth, .)
    }
  }
}
peakN %>% select(Histone, Replicate, peakType, peakN)

生物重复 Peak 的可重现性

对重复数据集的 Peak calling 与定义可重现峰值进行比较。选取 Peak 前 1% (按每块信号总量排序) 作为高可信位点。

##=== R 命令 ===## 
histL = c("K27me3", "K4me3")
repL = paste0("rep", 1:2)
peakType = c("control", "top0.01")
peakOverlap = c()
for(type in peakType){
  for(hist in histL){
    overlap.gr = GRanges()
    for(rep in repL){
      peakInfo = read.table(paste0(projPath, "/peakCalling/SEACR/", hist, "_", rep, "_seacr_", type, ".peaks.stringent.bed"), header = FALSE, fill = TRUE)
      peakInfo.gr = GRanges(peakInfo$V1, IRanges(start = peakInfo$V2, end = peakInfo$V3), strand = "*")
      if(length(overlap.gr) >0){
        overlap.gr = overlap.gr[findOverlaps(overlap.gr, peakInfo.gr)@from]
      }else{
        overlap.gr = peakInfo.gr
        
      }
    }
    peakOverlap = data.frame(peakReprod = length(overlap.gr), Histone = hist, peakType = type) %>% rbind(peakOverlap, .)
  }
}
peakReprod = left_join(peakN, peakOverlap, by = c("Histone", "peakType")) %>% mutate(peakReprodRate = peakReprod/peakN * 100)
peakReprod %>% select(Histone, Replicate, peakType, peakN, peakReprodNum = peakReprod, peakReprodRate)

可重现性计算公式为：
peaks overlapping rep1 and rep2/ peaks of rep1 or rep2 * 100
因此，它对每次重复中 Called Peaks 总数非常敏感。

Peak 区域中的片段比例：FRagment proportion in Peaks regions (FRiPs)

我们计算峰值 (FRiPs) 中 reads 的比率，作为信号噪声的度量，并将其与 IgG 对照数据集中的 FRiPs 进行对比以作说明。虽然 CUT&Tag 的测序深度通常只有 1-500 万 reads，但是该方法的低背景导致了高 FRiP 分数。

##=== R 命令 ===## 
library(chromVAR)
bamDir = paste0(projPath, "/alignment/bam")
inPeakData = c()
## overlap with bam file to get count
for(hist in histL){
  for(rep in repL){
    peakRes = read.table(paste0(projPath, "/peakCalling/SEACR/", hist, "_", rep, "_seacr_control.peaks.stringent.bed"), header = FALSE, fill = TRUE)
    peak.gr = GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
    bamFile = paste0(bamDir, "/", hist, "_", rep, "_bowtie2.mapped.bam")
    fragment_counts <- getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
    inPeakN = counts(fragment_counts)[,1] %>% sum
    inPeakData = rbind(inPeakData, data.frame(inPeakN = inPeakN, Histone = hist, Replicate = rep))
  }
}
frip = left_join(inPeakData, alignResult, by = c("Histone", "Replicate")) %>% 
    mutate(frip = inPeakN/MappedFragNum_hg38 * 100)
    
frip %>% select(Histone, Replicate, SequencingDepth, MappedFragNum_hg38, AlignmentRate_hg38,
    FragInPeakNum = inPeakN, FRiPs = frip)

Peak 数目、宽度、可再现性、FRiPs 可视化

fig7A = peakN %>% ggplot(aes(x = Histone, y = peakN, fill = Histone)) +
    geom_boxplot() +
    geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
    facet_grid(~peakType) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 18) +
    ylab("Number of Peaks") +
    xlab("")
fig7B = peakWidth %>% ggplot(aes(x = Histone, y = width, fill = Histone)) +
    geom_violin() +
    facet_grid(Replicate~peakType) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    scale_y_continuous(trans = "log", breaks = c(400, 3000, 22000)) +
    theme_bw(base_size = 18) +
    ylab("Width of Peaks") +
    xlab("")
fig7C = peakReprod %>% ggplot(aes(x = Histone, y = peakReprodRate, fill = Histone, label = round(peakReprodRate, 2))) +
    geom_bar(stat = "identity") +
    geom_text(vjust = 0.1) +
    facet_grid(Replicate~peakType) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 18) +
    ylab("% of Peaks Reproduced") +
    xlab("")
fig7D = frip %>% ggplot(aes(x = Histone, y = frip, fill = Histone, label = round(frip, 2))) +
    geom_boxplot() +
    geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 18) +
    ylab("% of Fragments in Peaks") +
    xlab("")
ggarrange(fig7A, fig7B, fig7C, fig7D, ncol = 2, nrow=2, common.legend = TRUE, legend="bottom")

IX. 额外可替代的分析

ChIPseqSpikeInFree for normalizing data without spike-in DNA（可选）

2019-ChIPseqSpikeInFree: a ChIP-seq normalization approach to reveal global changes in histone modifications without spike-in 是一种新的 ChIP-seq 标准化方法，可以有效地确定样品在各种条件和处理中的换算系数，它不依赖 外源性 spikein 染色质或 Peak 检测来揭示组蛋白修饰占位的整体变化。安装细节可以在 github stjude/ChIPseqSpikeInFree 上找到。

ChIPseqSpikeInFree 输出结果说明：https://github.com/stjude/ChIPseqSpikeInFree#interpretation-of-scaling-factor-table
How to use ChIPseqSpikeInFree scaling factor: https://github.com/stjude/ChIPseqSpikeInFree#how-to-use-chipseqspikein-scaling-factor

其他 Peak calling 方法

MACS2：2008-Model-based Analysis of ChIP-Seq (MACS)。详细安装教程：MACS2 安装指南

##== linux 命令 ==##
histName="K27me3"
controlName="IgG"
mkdir -p $projPath/peakCalling
macs2 callpeak -t ${projPath}/alignment/bam/${histName}_rep1_bowtie2.mapped.bam \
      -c ${projPath}/alignment/bam/${controlName}_rep1_bowtie2.mapped.bam \
      -g hs -f BAMPE -n macs2_peak_q0.1 --outdir $projPath/peakCalling/MACS2 -q 0.1 --keep-dup all 2>${projPath}/peakCalling/MACS2/macs2Peak_summary.txt

dPeak: 2013-dPeak: High Resolution Identification of Transcription Factor Binding Sites from PET and SET ChIP-Seq Data
MOSAiCS: 2011-A Statistical Framework for the Analysis of ChIP-Seq Data

古树普洱茶山头直发，买茶别再花冤枉钱！

19岁中国留学生投票被抓，“假装”公民身份！且已无法撤回.........

中国留学生在美国非法投票，后果很严重

19岁中国留学生非法投票美国大选，被控2罪！或被判15年监禁

恶魔医生刘翔峰判了，湘雅二院改好了吗？

CUT&Tag 数据处理与分析教程五：Peak calling

其他 Peak calling 方法

您可能也对以下帖子感兴趣

古树普洱茶山头直发，买茶别再花冤枉钱！

19岁中国留学生投票被抓，“假装”公民身份！且已无法撤回.........

中国留学生在美国非法投票，后果很严重

19岁中国留学生非法投票美国大选，被控2罪！或被判15年监禁

恶魔医生刘翔峰判了，湘雅二院改好了吗？

生成图片，分享到微信朋友圈

CUT&Tag 数据处理与分析教程 五：Peak calling

其他 Peak calling 方法

您可能也对以下帖子感兴趣

CUT&Tag 数据处理与分析教程五：Peak calling