Molecular Cell 文章 ribosome pausing 结果复现 (终)

Original JunJunLab 老俊俊的生信笔记 2022-08-17

收录于合集 #Ribo-seq 39个

两难

1引言

复现文章中每个密码子 occupancy (codon occupancy)的情况,可以看到 ATG 在高浓度处理后相对于对照组发生了 显著上升:

计算思路:

提取基因的 CDS 序列。
计算每个基因密码子上的 ribosome density。
codon occupancy =

2提取 CDS 序列

定义函数:julia语言编写

这里有一些筛选条件,把没有 5'UTR 区域和 CDS 区域长度不是 3 的倍数的基因过滤掉:

using FASTX
using BioSequences

function GetCDSseq(genomeFile,geneInfo,outputFile)
    ################################################################
    # 1.load genome file
    ################################################################
    # save chromsome length
    chromDict = Dict{String,String}()

    open(FASTA.Reader, genomeFile) do reader
        record = FASTA.Record()
        while !eof(reader)
            read!(reader, record)
            ## Do something.
            chromDict[FASTA.identifier(record)] = FASTA.sequence(record)
        end
    end

    ################################################################
    # 2.extarct sequence
    ################################################################
    longestCDS_Dict = Dict{String,String}()

    open(geneInfo,"r") do geneinfo
        for line in eachline(geneinfo)
            # split tags
            _,gene_name,gene_id,_,chr,strand,cdsRg,exon,utr5,cds,utr3 = split(line)

            # exon length
            utr5,cds,utr3 = parse(Int64,utr5),parse(Int64,cds),parse(Int64,utr3)

            # chromosome sequence [2 for only know ATGC,4 for including N base]
            format_seq = LongSequence{DNAAlphabet{4}}(chromDict[chr])

            # exclude no 5UTR genes and can't be divided exactly by 3 genes
            if utr5 != 0 && cds%3 == 0
                # exonLength = utr5 + cds +utr3

                # cds start and end position
                cdsStart,cdsEnd = (utr5 + 1),(utr5 + cds - 3)

                # loop for every exon regions
                if strand == "+" # + strand gene
                    region = split(cdsRg,",")
                else # - strand gene
                    region = reverse(split(cdsRg,","))
                end

                # key
                key = join([gene_name,gene_id,cdsStart,cdsEnd,cds],"|")

                # fill denisty
                for rg in region
                    posSt,posEnd = parse(Int64,split(rg,":")[1]),parse(Int64,split(rg,":")[2])

                    # get sequence
                    if strand == "+" # + strand gene
                        seq = format_seq[posSt:posEnd]
                    else # - strand gene
                        # reverse and complement sequence
                        seq = reverse_complement(format_seq[posSt:posEnd])
                    end

                    # save in dict
                    if !haskey(longestCDS_Dict,key)
                        longestCDS_Dict[key] = seq
                    else
                        longestCDS_Dict[key] *= string(seq)
                    end
                end
            else
                continue
            end
        end
    end

    ################################################################
    # 3.output data
    ################################################################
    outFa = open(outputFile,"w")

    for (key,val) in longestCDS_Dict
        write(outFa,join([">$key",val],"\n")*"\n")
    end
    close(outFa)
end

提取序列:

GetCDSseq("Mus_musculus.GRCm38.dna.primary_assembly.fa","longestCDSinfo.txt","longestCDS.fa")

3计算每个 codon 的 ribosome occupancy

cdslength=900,expression=50,exclude=90 这几个参数可以对基因的 CDS 的长度,表达量和去除前面多少碱基来进行一定范围的过滤。此外代码里 还对 64 种密码子进行了对应的氨基酸注释, 这样可以方便的知道每个 codon 对应的氨基酸名称。

定义函数:julia语言编写

using FASTX
using Formatting

function CodonsOcupancyOnORF(geneInfo,cdsFa,inputFile,outputFile;cdslength=600,expression=75,exclude=90)
    ################################################################
    # 1.extract gene feature info
    ################################################################
    featureLenDict = Dict{String,Tuple}()

    # open gene info file
    open(geneInfo,"r") do input
        for line in eachline(input)
            fileds = split(line)

            # split tags
            _,_,gene_id,_,_,_,_,_,utr5,cds,utr3 = split(line)
            # exon length
            utr5,cds,utr3 = parse(Int64,utr5),parse(Int64,cds),parse(Int64,utr3)
            # cds start and end position
            cdsStart,cdsEnd = (utr5 + 1),(utr5 + cds - 2)

            # save in dict
            featureLenDict[gene_id] = (cdsStart,cdsEnd,cds)
        end
    end

    ################################################################
    # 2.filter gene CDS length and expression higher thean threshold
    ################################################################
    gene_infoDict = Dict{String,Float64}()
    filtedGeneDict = Dict{String,Float64}()

    # open gene info file
    open(inputFile,"r") do input
        for line in eachline(input)
            fileds = split(line)

            # split tags
            gene_id = fileds[1]
            transpos = parse(Int64,fileds[2])
            rpm = parse(Float64,fileds[3])

            # filter CDS > 600 nt gene
            if haskey(featureLenDict,gene_id)
                cdsStart,cdsEnd,cdsLength = featureLenDict[gene_id]
                if cdsLength >= cdslength && cdsLength%3 == 0
                    # not include first 30 codons to exclude high densitys around start codon
                    key = join([gene_id,cdsLength - exclude],":") # key
                    if !haskey(gene_infoDict,key)
                        gene_infoDict[key] = 0
                    else
                        if cdsStart + exclude <= transpos <= cdsEnd
                            gene_infoDict[key] += rpm
                        else
                            continue
                        end
                    end
                else
                    continue
                end
            end
        end
    end

    # filter CDS expression >= 75
    for (key,val) in gene_infoDict
        if val >= expression
            meanNorm = val / parse(Int64,split(key,":")[2])
            filtedGeneDict[key] = meanNorm
        else
            continue
        end
    end

    ################################################################
    # 3.load trans density file and filter CDS density
    ################################################################
    riboDict = Dict{String,Float64}()
    geneidDict = Dict{String,Float64}()

    # open gene info file
    open(inputFile,"r") do input
        for line in eachline(input)
            fileds = split(line)

            # split tags
            gene_id = fileds[1]
            transpos = parse(Int64,fileds[2])
            rpm = parse(Float64,fileds[3])
            cdsStart,cdsEnd,cdsLength = featureLenDict[gene_id]

            # key
            id = join([gene_id,cdsLength - exclude],":")
            if haskey(filtedGeneDict,id)
                # filter CDS region
                if cdsStart <= transpos <= cdsEnd
                    # save
                    # relRPM = rpm / filtedGeneDict[id]
                    riboDict[join([gene_id,transpos - cdsStart + 1],"|")] = rpm
                else
                    continue
                end
            end

            # get how many gene_id this sample
            if haskey(geneidDict,gene_id)
                continue
            else
                geneidDict[gene_id] = 0.0
            end
        end
    end

    ################################################################
    # 4.prepare 64 codons Dict
    ################################################################
    baseATGC = ["A","T","G","C"]

    codonDistDict = Dict{String,Float64}()
    codonCountDict = Dict{String,Int64}()

    for i in baseATGC
        for j in baseATGC
            for z in baseATGC
                codonDistDict["$i$j$z"] = 0.0
                codonCountDict["$i$j$z"] = 0
            end
        end
    end

    ################################################################
    # 5.prepare anmino acids info
    ################################################################
    # add codon with amino acids annotation
    AnminoAcidsDict = Dict{String,String}()
    for codon in keys(codonCountDict)
        if codon ∈ ["TTT","TTC"]
            newKey = join([codon,"Phe"],"|")
        elseif codon ∈ ["TTA","TTG"]
            newKey = join([codon,"Leu"],"|")
        elseif codon ∈ ["TCT","TCC","TCA","TCG"]
            newKey = join([codon,"Ser"],"|")
        elseif codon ∈ ["TAT","TAC"]
            newKey = join([codon,"Tyr"],"|")
        elseif codon ∈ ["TAA","TAG","TGA"]
            newKey = join([codon,"Stop"],"|")
        elseif codon ∈ ["TGT","TGC"]
            newKey = join([codon,"Cys"],"|")
        elseif codon ∈ ["TGG"]
            newKey = join([codon,"Trp"],"|")
        elseif codon ∈ ["CTT","CTC","CTA","CTG"]
            newKey = join([codon,"Leu"],"|")
        elseif codon ∈ ["CCT","CCC","CCA","CCG"]
            newKey = join([codon,"Pro"],"|")
        elseif codon ∈ ["CAT","CAC"]
            newKey = join([codon,"His"],"|")
        elseif codon ∈ ["CAA","CAG"]
            newKey = join([codon,"Gln"],"|")
        elseif codon ∈ ["CGT","CGC","CGA","CGG"]
            newKey = join([codon,"Arg"],"|")
        elseif codon ∈ ["ATT","ATC","ATA"]
            newKey = join([codon,"Lle"],"|")
        elseif codon ∈ ["ATG"]
            newKey = join([codon,"Met"],"|")
        elseif codon ∈ ["ACT","ACC","ACA","ACG"]
            newKey = join([codon,"Thr"],"|")
        elseif codon ∈ ["AAT","AAC"]
            newKey = join([codon,"Asn"],"|")
        elseif codon ∈ ["AAA","AAG"]
            newKey = join([codon,"Lys"],"|")
        elseif codon ∈ ["AGT","AGC"]
            newKey = join([codon,"Ser"],"|")
        elseif codon ∈ ["AGA","AGG"]
            newKey = join([codon,"Arg"],"|")
        elseif codon ∈ ["GTT","GTC","GTA","GTG"]
            newKey = join([codon,"Val"],"|")
        elseif codon ∈ ["GCT","GCC","GCA","GCG"]
            newKey = join([codon,"Ala"],"|")
        elseif codon ∈ ["GAT","GAC"]
            newKey = join([codon,"Asp"],"|")
        elseif codon ∈ ["GAA","GAG"]
            newKey = join([codon,"Glu"],"|")
        elseif codon ∈ ["GGT","GGC","GGA","GGG"]
            newKey = join([codon,"Gly"],"|")
        else
            println("No this codon!")
        end
        AnminoAcidsDict[codon] = newKey
    end
    ################################################################
    # 6.calculate ribo occupancy on codons
    ################################################################
    open(FASTA.Reader, cdsFa) do reader
        record = FASTA.Record()
        while !eof(reader)
            read!(reader, record)
            ## Do something.
            name = FASTA.identifier(record)
            seq = FASTA.sequence(record)

            # split tags
            _,gene_id,cdsStart,cdsEnd,cdsLength = split(name,"|")

            # whtether this gene in samples
            if haskey(geneidDict,gene_id)

                # codon list
                codonlist = [seq[i:i+2] for i in 1:3:parse(Int64,cdsLength)]

                # codon numbers
                codonLength = length(codonlist)

                for (index,codon) in enumerate(codonlist)
                    codon = string(codon)

                    if haskey(codonDistDict,codon)

                        # get codon density
                        sumOcp = 0.0
                        for pos in index:index+2
                            occupancy = get(riboDict,join([gene_id,pos],"|"),0.0)
                            sumOcp += occupancy
                        end

                        # get mean codon density
                        # meanOcp = sumOcp / 3

                        # save in codonDistDict
                        codonDistDict[codon] += sumOcp
                        # count numbers
                        codonCountDict[codon] += 1
                    else
                        continue
                    end
                end
            else
                continue
            end
        end
    end

    ################################################################
    # 7.output data
    ################################################################
    outputData = open(outputFile,"w")

    totalDensity = sum(values(codonDistDict))
    totalCodons = sum(values(codonCountDict))
    # normalize number of counts and Densitys| (Density/totalDensity)/(codonCounts/totalCodons)
    for (codon,Density) in codonDistDict

        # get counts
        codonCounts = codonCountDict[codon]
        if codonCounts == 0
            normDensity = 0
        else
            normDensity = (Density*totalCodons / totalDensity*codonCounts)*10^-10
        end

        # output
        write(outputData,join([AnminoAcidsDict[codon],normDensity],"\t")*"\n")
    end
    close(outputData)
end

计算:

mkdir("8.codonOcupancy-data/")

# sample name
sample = ["Normal-rep1","mOSM500-2h-rep1","mOSM600-2h-rep1","mOSM700-2h-rep1","mOSM700-15m-rep1",
            "mOSM700-15m-15mrecover-rep1","mOSM700-15m-2hrecover-rep1","mOSM700-2h-15mrecover-rep1","mOSM700-2h-2hrecover-rep1",
            "Normal-rep2","mOSM500-2h-rep2","mOSM600-2h-rep2","mOSM700-15m-rep2","mOSM700-2h-rep2",
            "mOSM700-15m-15mrecover-rep2","mOSM700-15m-2hrecover-rep2","mOSM700-2h-15mrecover-rep2","mOSM700-2h-2hrecover-rep2"]

for i in range(1,18)
    CodonsOcupancyOnORF("longestCDSinfo.txt","longestCDS.fa",
                        "5.density-data/"*sample[i]*".trans.txt","8.codonOcupancy-data/"*sample[i]*".codonOcupancy.txt",
                        cdslength=900,expression=50,exclude=90)
end

4绘图

文章中对于三种终止密码子 (TGA/TAG/TAA) 并不进行分析,所以筛选掉它们就行了:

数据批量读取并预处理:

library(ggplot2)
library(tidyverse)
library(cowplot)
library(ggrepel)

file <- c("Normal-rep1","mOSM500-2h-rep1","mOSM600-2h-rep1","mOSM700-2h-rep1",
          "mOSM700-15m-15mrecover-rep1","mOSM700-2h-15mrecover-rep1","mOSM700-2h-2hrecover-rep1",
          "Normal-rep2","mOSM500-2h-rep2","mOSM600-2h-rep2","mOSM700-2h-rep2",
          "mOSM700-15m-15mrecover-rep2","mOSM700-2h-15mrecover-rep2","mOSM700-2h-2hrecover-rep2")

grou <- rep(c('Control','500 mOSM','600 mOSM','700 mOSM',
              '700 mOSM(15m)-15m-Rec','700 mOSM(2h)-15m-Rec','700 mOSM(2h)-2h-Rec'),2)

# x = 1
map_df(1:length(file),function(x){
  tmp <- read.table(paste('8.codonOcupancy-data/',file[x],'.codonOcupancy.txt',sep = ''))
  # add colname
  colnames(tmp) <- c('codon','occupancy')
  # remove stop codon
  tmp <- tmp %>% filter(!(codon %in% c("TGA|Stop","TAG|Stop","TAA|Stop")))
  # add group
  tmp$group <- grou[x]
  # add sample
  tmp$sample <- file[x]
  return(tmp)
}) -> df

# mean in replicate
df_mean <- df %>% group_by(group,codon) %>%
  summarise(meanOcup = mean(occupancy))

# long to wide
wide_df <- df_mean %>% spread(key = 'group', value = 'meanOcup')
wide_df$codonName <- sapply(strsplit(wide_df$codon,split = '\\|'),'[',1)

myp <- wide_df %>%
  filter(codonName %in% c('ATG','ATA','ATC','ATT','CTG','TTG'))

myp$col <- ifelse(myp$codonName == 'ATG','Met','others')

绘图:

# plot
p5 <- ggplot(wide_df,aes(x = Control,y = `500 mOSM`)) +
  geom_point(size = 4,color = 'grey80') +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        aspect.ratio = 1) +
  geom_point(data = myp,
             show.legend = F,
             aes(x = Control,y = `500 mOSM`,color = col),
             size = 4) +
  scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
  geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')

# plot
p6 <- ggplot(wide_df,aes(x = Control,y = `600 mOSM`)) +
  geom_point(size = 4,color = 'grey80') +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        aspect.ratio = 1) +
  geom_point(data = myp,
             show.legend = F,
             aes(x = Control,y = `600 mOSM`,color = col),
             size = 4) +
  scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
  geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')

# plot
p7 <- ggplot(wide_df,aes(x = Control,y = `700 mOSM`)) +
  geom_point(size = 4,color = 'grey80') +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        aspect.ratio = 1) +
  geom_point(data = myp,
             show.legend = F,
             aes(x = Control,y = `700 mOSM`,color = col),
             size = 4) +
  geom_text_repel(data = myp,
            aes(x = Control,y = `700 mOSM`,color = col,label = codonName),
            force = 20,nudge_y = -2.5,nudge_x = 2.5,
            show.legend = F) +
  scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
  geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')


# plot
p15m15mrec <- ggplot(wide_df,aes(x = Control,y = `700 mOSM(15m)-15m-Rec`)) +
  geom_point(size = 4,color = 'grey80') +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        aspect.ratio = 1) +
  geom_point(data = myp,
             show.legend = F,
             aes(x = Control,y = `700 mOSM(15m)-15m-Rec`,color = col),
             size = 4) +
  geom_text_repel(data = myp,
                  aes(x = Control,y = `700 mOSM(15m)-15m-Rec`,color = col,label = codonName),
                  force = 20,nudge_y = -2.5,nudge_x = 2.5,
                  show.legend = F) +
  scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
  geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')

# plot
p2h15m <- ggplot(wide_df,aes(x = Control,y = `700 mOSM(2h)-15m-Rec`)) +
  geom_point(size = 4,color = 'grey80') +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        aspect.ratio = 1) +
  geom_point(data = myp,
             show.legend = F,
             aes(x = Control,y = `700 mOSM(2h)-15m-Rec`,color = col),
             size = 4) +
  geom_text_repel(data = myp,
                  aes(x = Control,y = `700 mOSM(2h)-15m-Rec`,color = col,label = codonName),
                  force = 20,nudge_y = -2.5,nudge_x = 2.5,
                  show.legend = F) +
  scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
  geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')

# plot
p2h2hrec <- ggplot(wide_df,aes(x = Control,y = `700 mOSM(2h)-2h-Rec`)) +
  geom_point(size = 4,color = 'grey80') +
  theme_bw(base_size = 16) +
  theme(panel.grid = element_blank(),
        aspect.ratio = 1) +
  geom_point(data = myp,
             show.legend = F,
             aes(x = Control,y = `700 mOSM(2h)-2h-Rec`,color = col),
             size = 4) +
  geom_text_repel(data = myp,
                  aes(x = Control,y = `700 mOSM(2h)-2h-Rec`,color = col,label = codonName),
                  force = 20,nudge_y = -2.5,nudge_x = 2.5,
                  show.legend = F) +
  scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
  geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')

# combine
plot_grid(plotlist = list(p5,p6,p7,p15m15mrec,p2h15m,p2h2hrec),align = 'hv',ncol = 3)