其他
Molecular Cell 文章 ribosome pausing 结果复现 (三)
身在社会里,难逃社会的束缚
1引言
这次做的部分是文章关于 Ribo-seq 数据的质控部分,包括 reads 长度分布
,3nt 周期性
等。
下面是文章里的图:
2质控
sample = ["SRR12594201","SRR12594202","SRR12594203","SRR12594204","SRR12594205","SRR12594206",
"SRR12594207","SRR12594208","SRR12594209","SRR12594210","SRR12594211","SRR12594212",
"SRR12594213","SRR12594214","SRR12594215","SRR12594216","SRR12594217","SRR12594218"]
outname = ["Normal-rep1","mOSM500-2h-rep1","mOSM600-2h-rep1","mOSM700-15m-rep1","mOSM700-2h-rep1",
"mOSM700-15m-15mrecover-rep1","mOSM700-15m-2hrecover-rep1","mOSM700-2h-15mrecover-rep1","mOSM700-2h-2hrecover-rep1",
"Normal-rep2","mOSM500-2h-rep2","mOSM600-2h-rep2","mOSM700-15m-rep2","mOSM700-2h-rep2",
"mOSM700-15m-15mrecover-rep2","mOSM700-15m-2hrecover-rep2","mOSM700-2h-15mrecover-rep2","mOSM700-2h-2hrecover-rep2"]
for i in range(1,18)
RiboQcAnalysis("3.map-data/"*sample[i]*".uniq.txt","4.QC-data/"*outname[i]*".qc.txt")
end
3绘图
批量读取数据
只选取几个样本 replicate1 来测试:
library(tidyverse)
library(ggplot2)
library(cowplot)
library(patchwork)
library(ggsci)
# load data
file <- c("Normal-rep1","mOSM500-2h-rep1","mOSM600-2h-rep1","mOSM700-2h-rep1")
map_df(1:length(file),function(x){
tmp <- read.table(paste('4.QC-data/',file[x],'.qc.txt',sep = ''))
colnames(tmp) <- c('length','framest','relst','framesp','relsp','feature','counts')
# add sample
tmp$sample <- file[x]
return(tmp)
}) -> dfqc
长度分布
#################################################
# length
#################################################
len <- dfqc %>% group_by(sample,length) %>% summarise(num = sum(counts))
# order
len$sample <- factor(len$sample,levels = c("Normal-rep1","mOSM500-2h-rep1",
"mOSM600-2h-rep1","mOSM700-2h-rep1"))
# plot
ggplot(len,
aes(x = factor(length),y = num/1000)) +
geom_col(aes(fill = sample),
position = position_dodge2(),width = 0.6) +
# scale_fill_brewer(palette = 'GnBu',name = '') +
scale_fill_npg(name = '') +
theme_bw(base_size = 14) +
theme(aspect.ratio = 0.6,
strip.background = element_rect(colour = NA,fill = 'grey90'),
legend.position = 'top',
legend.background = element_blank(),
panel.grid = element_blank()) +
xlab('Read length') + ylab('Reads numbers(k)') +
facet_wrap(~sample,scales = 'free_y',ncol = 2)
可以看到主要是在 29nt 左右的。
frame 分布
#################################################
# frame
#################################################
frame <- dfqc %>% group_by(sample,length,framest) %>% summarise(num = sum(counts))
# order
frame$sample <- factor(frame$sample,levels = c("Normal-rep1","mOSM500-2h-rep1",
"mOSM600-2h-rep1","mOSM700-2h-rep1"))
# plot
ggplot(frame,
aes(x = factor(length),y = num/1000)) +
geom_col(aes(fill = factor(framest)),position = position_dodge2()) +
scale_fill_brewer(palette = 'Greens',name = '',direction = -1,
labels = c('frame 0','frame 1','frame 2')) +
theme_bw(base_size = 14) +
theme(aspect.ratio = 0.6,
strip.background = element_rect(colour = NA,fill = 'grey90'),
legend.position = 'top',
legend.background = element_blank(),
panel.grid = element_blank()) +
xlab('Read length') + ylab('Reads numbers(k)') +
facet_wrap(~sample,scales = 'free',ncol = 2)
frame0 占比较多。
reads 特征分布
#################################################
# features
#################################################
featuredf <- dfqc %>% group_by(sample,feature) %>% summarise(num = sum(counts))
# order
featuredf$sample <- factor(featuredf$sample,levels = c("Normal-rep1","mOSM500-2h-rep1",
"mOSM600-2h-rep1","mOSM700-2h-rep1"))
ggplot(featuredf,
aes(x = rev(factor(feature)),y = num/1000)) +
geom_col(aes(fill = factor(feature)),
width = 0.6,
position = position_dodge2()) +
scale_x_discrete(labels = c('CDS',"5'UTR","3'UTR")) +
scale_fill_brewer(palette = 'Blues',name = '',direction = 1,
labels = c("3'UTR","5'UTR",'CDS')) +
theme_bw(base_size = 16) +
theme(aspect.ratio = 1.3,
strip.background = element_rect(colour = NA,fill = 'grey90'),
legend.position = 'top',
legend.background = element_blank(),
panel.grid = element_blank()) +
xlab('Read distribution') + ylab('Reads numbers(k)') +
facet_wrap(~sample,scales = 'free',ncol = 4)
主要分布于 CDS 区域。
4文章图复现
distance from start codon
#################################################
# relative to start codon
#################################################
relst <- dfqc %>% group_by(sample,relst,framest) %>%
summarise(allCounts = sum(counts))
# total reads
totalReads <- dfqc %>% group_by(sample) %>%
summarise(total = sum(counts))
# A tibble: 4 x 2
# sample total
# <chr> <int>
# 1 mOSM500-2h-rep1 3016994
# 2 mOSM600-2h-rep1 1917018
# 3 mOSM700-2h-rep1 575837
# 4 Normal-rep1 5258174
# normalize to total reads
relst$normaledReads <- case_when(
relst$sample == "Normal-rep1" ~ (relst$allCounts/5258174)*10^6,
relst$sample == "mOSM500-2h-rep1" ~ (relst$allCounts/3016994)*10^6,
relst$sample == "mOSM600-2h-rep1" ~ (relst$allCounts/1917018)*10^6,
relst$sample == "mOSM700-2h-rep1" ~ (relst$allCounts/575837)*10^6)
# filter region
dfst <- relst %>% filter(relst >= -50 & relst <= 100)
# order
dfst$sample <- factor(dfst$sample,levels = c("Normal-rep1","mOSM500-2h-rep1",
"mOSM600-2h-rep1","mOSM700-2h-rep1"))
# plot
pst <- ggplot(dfst,
aes(x = relst + 12,y = normaledReads/1000)) +
geom_col(aes(fill = factor(framest)),width = 0.8,
position = position_dodge2()) +
scale_fill_brewer(palette = 'Blues',name = '',direction = -1,
labels = c('frame 0','frame 1','frame 2')) +
theme_bw(base_size = 14) +
theme(aspect.ratio = 1,
strip.background = element_rect(colour = NA,fill = 'grey90'),
legend.position = 'top',
legend.background = element_blank(),
panel.grid = element_blank()) +
xlab('Relative to start codon') +
ylab('Normalizee Reads counts(k)') +
facet_wrap(~sample,ncol = 4)
distance from start codon
#################################################
# relative to stop codon
#################################################
relsp <- dfqc %>% group_by(sample,relsp,framesp) %>%
summarise(allCounts = sum(counts))
# total reads
totalReads <- dfqc %>% group_by(sample) %>%
summarise(total = sum(counts))
# A tibble: 4 x 2
# sample total
# <chr> <int>
# 1 mOSM500-2h-rep1 3016994
# 2 mOSM600-2h-rep1 1917018
# 3 mOSM700-2h-rep1 575837
# 4 Normal-rep1 5258174
# normalize to total reads
relsp$normaledReads <- case_when(
relsp$sample == "Normal-rep1" ~ (relsp$allCounts/5258174)*10^6,
relsp$sample == "mOSM500-2h-rep1" ~ (relsp$allCounts/3016994)*10^6,
relsp$sample == "mOSM600-2h-rep1" ~ (relsp$allCounts/1917018)*10^6,
relsp$sample == "mOSM700-2h-rep1" ~ (relsp$allCounts/575837)*10^6)
# filter region
dfsp <- relsp %>% filter(relsp >= -100 & relsp <= 50)
# order
dfsp$sample <- factor(dfsp$sample,levels = c("Normal-rep1","mOSM500-2h-rep1",
"mOSM600-2h-rep1","mOSM700-2h-rep1"))
# plot
psp <- ggplot(dfsp,
aes(x = relsp + 12,y = normaledReads/1000)) +
geom_col(aes(fill = factor(framesp)),width = 0.8,
show.legend = F,
position = position_dodge2()) +
scale_fill_brewer(palette = 'Blues',name = '',direction = -1,
labels = c('frame 0','frame 1','frame 2')) +
theme_bw(base_size = 14) +
theme(aspect.ratio = 1,
strip.background = element_rect(colour = NA,fill = 'grey90'),
legend.position = 'top',
legend.background = element_blank(),
panel.grid = element_blank()) +
xlab('Relative to stop codon') +
ylab('Normalizee Reads counts(k)') +
facet_wrap(~sample,ncol = 4)
# combine
pst / psp
拼图
可以看到在 mOSM 浓度逐渐增加的时候,在 start codon 处发生了了明显的停滞现象。
5结尾
后面的图我直接对每个 reads 的长度做了 12nt 的 shift,这是不太严谨的作法,应该去计算不太长度 reads 的 p-site 的位置相应的做 shift 操作。好在大部分 reads 基本实在 12nt 的位置的。
可以注意一下。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
(微信交流群满200人后需收取20元入群费用), 数据代码已上传至QQ群,欢迎加入下载。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀Molecular Cell 文章 ribosome pausing 结果复现 (二) (PCR 去重)
◀Molecular Cell 文章 ribosome pausing 结果复现 (一)
◀...