使用 python 进行 Ribo-seq 质控分析一

Original JunJunLab 老俊俊的生信笔记 2022-08-15

收录于合集 #Ribo-seq 39个

随缘

1前言

小时候放牛,躺在山里的草地上仰望天空 是自由 现在偶尔躺在城市的人造草坪上仰望天空 是喘息 双眼一闭 一二十年时光倒流 炎热的烈日,吃草的黄牛,山间里的鸟鸣 彷佛依然还在 但我已不再少年

2引言

前面关于 Ribo-seq 的质控都是在 R 里面读入 bam 文件,这样 R 的效率太慢,如果有多个数据,消耗的时间将会更慢。

这里尝试使用 python 进行分析输出结果文件,然后读入 R 里面进行可视化,将会大大提高效率。

3分析

这里使用的是 bowtie 比对到 转录组序列上 的默认输出文件,不是 sam/bam,作为 python 分析的输入文件:

SRR1596101.1 HWI-ST640:681:C289NACXX:1:1101:1230:2221/1 +       Rpn2_ENSMUSG00000027642_ENSMUST00000116380_264_2157_2756        813     ATTGAGGACCTTGTTGCTCGGCTGGATGA   ?@@B?DDBHHFHHGGGHGDEHGBFI?<E3   0
SRR1596101.14 HWI-ST640:681:C289NACXX:1:1101:1938:2176/1        +       Parp1_ENSMUSG00000026496_ENSMUST00000027777_105_3147_3873       553     AGCTGGNTATGATTGACCGCTGGTACCA    ?<?DBD#222<<C<<@E>GFEEFFFIEE    0       6:G>N
SRR1596101.63 HWI-ST640:681:C289NACXX:1:1101:4867:2161/1        +       Psmc1_ENSMUSG00000021178_ENSMUST00000021595_55_1375_1591        316     TTAGAANAAAAGCAAGAGGAGGAAAGATC   @@?DDD#2=:ACBFB@EEHCFGDFCCCGG   0       6:G>N
SRR1596101.20 HWI-ST640:681:C289NACXX:1:1101:2399:2248/1        +       Polr1b_ENSMUSG00000027395_ENSMUST00000103205_92_3497_3998       3143    AGGGACAAAGTCACCAACCAGCCCCTCGG   CCCFFFFFHHHHHJJJJJJJJJJJJJJJJ   0
SRR1596101.77 HWI-ST640:681:C289NACXX:1:1101:5685:2171/1        +       Cobl_ENSMUSG00000020173_ENSMUST00000046755_235_4246_5615        3532    CTGCTCNTGGCAGAGGCCCGTGACTCTGGG  CCCFFF#2AFHHHJJJIJIJHIGIIIJIIJ  0       6:T>N

定义分析函数:

# Ribo-seq mapping file QC function
def summaryFramelength(inputFile,outputFile):
    # from collections import Counter
    frame_dict = {}
    # open file
    with open(inputFile,'r') as input:
            for line in input:
                fileds = line.split()
                start_codon_pos = int(fileds[3].split('_')[3])
                # Bowtie uses zero-based offset
                stop_codon_pos = int(fileds[3].split('_')[4]) + 1
                # Bowtie uses zero-based offset
                align_pos = int(fileds[4])
                length = len(fileds[5])
                # relative distance
                rel2st = align_pos - start_codon_pos
                rel2sp = align_pos - stop_codon_pos
                # assign frame
                yushu = abs(align_pos - start_codon_pos)%3
                # key
                key = '|'.join([str(length),str(yushu),str(rel2st),str(rel2sp)])
                # init dict and count
                frame_dict.setdefault(key,0)
                frame_dict[key] += 1
    # output
    output = open(outputFile,'w')
    # output
    for key,val in frame_dict.items():
        elem = key.split('|')
        output.write('\t'.join([elem[0],elem[1],elem[2],elem[3],str(val)]) + '\n')

    output.close()

分析数据:

summaryFramelength('ribo-EB_rmtRNA.map','ribo-EB.qc.txt')
summaryFramelength('ribo-ESC_rmtRNA.map','ribo-ESC.qc.txt')

4导入 R 可视化

读取文件:

library(tidyverse)
library(data.table)
library(ggsci)
library(ggplot2)
library(scales)

file <- list.files('./',pattern = '.qc.txt')

# bacth read
map_df(file,function(x){
  tmp = fread(paste('./',x,sep = ''))
  colnames(tmp) <- c('readlength','frame','rel2st','rel2sp','numbers')
  tmp$sample <- sapply(strsplit(x,split = '\\.'),'[',1)
  # filter length 25-35 nt
  tmp <- tmp %>% filter(readlength >= 25 & readlength <= 35)
  return(tmp)
}) -> frame_len

# check
head(frame_len,3)
#    readlength frame rel2st rel2sp numbers  sample
# 1:         30     0   4500  -1381       4 ribo-EB
# 2:         27     1   1558   -318       1 ribo-EB
# 3:         28     0    252  -2155       8 ribo-EB

read length

###########################################
# read length
readlen <- frame_len %>%
  dplyr::group_by(sample,readlength) %>%
  dplyr::summarise(count = n())

# plot
ggplot(readlen,aes(x = factor(readlength),y = count)) +
  geom_col() +
  theme_bw(base_size = 16) +
  xlab('') +
  theme(aspect.ratio = 0.6) +
  facet_wrap(~sample,scales = 'free',ncol = 4)

All reads frame

###########################################
# summarise all frame type
all_frame <- frame_len %>% group_by(sample,frame) %>%
  dplyr::summarise(count = sum(numbers)) %>%
  dplyr::mutate(percent = count/sum(count))

# plot
ggplot(all_frame,aes(x = factor(frame),y = percent)) +
  geom_col(aes(fill = factor(frame)),width = 0.6) +
  theme_classic(base_size = 16) +
  scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
                    labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
                    name = '') +
  theme(strip.background = element_rect(colour = NA,fill = 'grey')) +
  facet_wrap(~sample,scales = 'free',ncol = 2) +
  xlab('')

Different read length frame

###########################################
# summarise all frame type
all_frame_bylen <- frame_len %>%
  dplyr::group_by(sample,readlength,frame) %>%
  dplyr::summarise(count = sum(numbers))

# plot
ggplot(all_frame_bylen,aes(x = factor(readlength),y = count)) +
  geom_col(aes(fill = factor(frame)),
           position = position_dodge2()) +
  theme_classic(base_size = 16) +
  scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
                    labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
                    name = '') +
  theme(strip.background = element_rect(colour = NA,fill = 'grey'),
        aspect.ratio = 0.6) +
  facet_wrap(~sample,scales = 'free',ncol = 2) +
  xlab('')

distance to start codon

# distance to start codon

tost <- frame_len %>%
  dplyr::group_by(sample,readlength,rel2st,frame) %>%
  dplyr::summarise(count = sum(numbers)) %>%
  dplyr::filter(rel2st >= -40 & rel2st <= 20)

# plot
ggplot(tost %>% filter(readlength %in% c(28,29,30)),
       aes(x = rel2st,y = count)) +
  geom_col(aes(fill = factor(frame)),
           position = position_dodge2()) +
  theme_classic(base_size = 16) +
  scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
                    labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
                    name = '') +
  scale_x_continuous(breaks = c(-40,-12,0,20)) +
  xlab('Distance from start codon (nt)') +
  theme(strip.background = element_rect(colour = NA,fill = 'grey'),
        aspect.ratio = 0.6) +
  facet_grid(sample~readlength,scales = 'free')

distance to stop codon

# distance to stop codon

tosp <- frame_len %>%
  dplyr::group_by(sample,readlength,rel2sp,frame) %>%
  dplyr::summarise(count = sum(numbers)) %>%
  dplyr::filter(rel2sp >= -40 & rel2sp <= 20)

# plot
ggplot(tosp %>% filter(readlength %in% c(28,29,30)),
       aes(x = rel2sp,y = count)) +
  geom_col(aes(fill = factor(frame)),
           position = position_dodge2()) +
  theme_classic(base_size = 16) +
  scale_fill_manual(values = c('#ED0000FF','#0099B4FF','#42B540FF'),
                    labels = c('0'='frame 0','1'='frame 1','2'='frame 2'),
                    name = '') +
  scale_x_continuous(breaks = c(-40,-12,0,20)) +
  xlab('Distance from stop codon (nt)') +
  theme(strip.background = element_rect(colour = NA,fill = 'grey'),
        aspect.ratio = 0.6) +
  facet_grid(sample~readlength,scales = 'free')