Molecular Cell 文章 ribosome pausing 结果复现 (终)
两难
1引言
复现文章中每个密码子 occupancy (codon occupancy)
的情况,可以看到 ATG 在高浓度处理后相对于对照组发生了 显著上升:
计算思路:
提取基因的 CDS 序列。 计算每个基因密码子上的 ribosome density。 codon occupancy =
2提取 CDS 序列
定义函数:julia语言编写
这里有一些筛选条件,把没有 5'UTR 区域
和 CDS 区域长度不是 3 的倍数
的基因过滤掉:
using FASTX
using BioSequences
function GetCDSseq(genomeFile,geneInfo,outputFile)
################################################################
# 1.load genome file
################################################################
# save chromsome length
chromDict = Dict{String,String}()
open(FASTA.Reader, genomeFile) do reader
record = FASTA.Record()
while !eof(reader)
read!(reader, record)
## Do something.
chromDict[FASTA.identifier(record)] = FASTA.sequence(record)
end
end
################################################################
# 2.extarct sequence
################################################################
longestCDS_Dict = Dict{String,String}()
open(geneInfo,"r") do geneinfo
for line in eachline(geneinfo)
# split tags
_,gene_name,gene_id,_,chr,strand,cdsRg,exon,utr5,cds,utr3 = split(line)
# exon length
utr5,cds,utr3 = parse(Int64,utr5),parse(Int64,cds),parse(Int64,utr3)
# chromosome sequence [2 for only know ATGC,4 for including N base]
format_seq = LongSequence{DNAAlphabet{4}}(chromDict[chr])
# exclude no 5UTR genes and can't be divided exactly by 3 genes
if utr5 != 0 && cds%3 == 0
# exonLength = utr5 + cds +utr3
# cds start and end position
cdsStart,cdsEnd = (utr5 + 1),(utr5 + cds - 3)
# loop for every exon regions
if strand == "+" # + strand gene
region = split(cdsRg,",")
else # - strand gene
region = reverse(split(cdsRg,","))
end
# key
key = join([gene_name,gene_id,cdsStart,cdsEnd,cds],"|")
# fill denisty
for rg in region
posSt,posEnd = parse(Int64,split(rg,":")[1]),parse(Int64,split(rg,":")[2])
# get sequence
if strand == "+" # + strand gene
seq = format_seq[posSt:posEnd]
else # - strand gene
# reverse and complement sequence
seq = reverse_complement(format_seq[posSt:posEnd])
end
# save in dict
if !haskey(longestCDS_Dict,key)
longestCDS_Dict[key] = seq
else
longestCDS_Dict[key] *= string(seq)
end
end
else
continue
end
end
end
################################################################
# 3.output data
################################################################
outFa = open(outputFile,"w")
for (key,val) in longestCDS_Dict
write(outFa,join([">$key",val],"\n")*"\n")
end
close(outFa)
end
提取序列:
GetCDSseq("Mus_musculus.GRCm38.dna.primary_assembly.fa","longestCDSinfo.txt","longestCDS.fa")
3计算每个 codon 的 ribosome occupancy
cdslength=900,expression=50,exclude=90 这几个参数可以对基因的 CDS 的长度
,表达量
和去除前面多少碱基
来进行一定范围的过滤。此外代码里 还对 64 种密码子进行了对应的氨基酸注释, 这样可以方便的知道每个 codon 对应的氨基酸名称。
定义函数:julia语言编写
using FASTX
using Formatting
function CodonsOcupancyOnORF(geneInfo,cdsFa,inputFile,outputFile;cdslength=600,expression=75,exclude=90)
################################################################
# 1.extract gene feature info
################################################################
featureLenDict = Dict{String,Tuple}()
# open gene info file
open(geneInfo,"r") do input
for line in eachline(input)
fileds = split(line)
# split tags
_,_,gene_id,_,_,_,_,_,utr5,cds,utr3 = split(line)
# exon length
utr5,cds,utr3 = parse(Int64,utr5),parse(Int64,cds),parse(Int64,utr3)
# cds start and end position
cdsStart,cdsEnd = (utr5 + 1),(utr5 + cds - 2)
# save in dict
featureLenDict[gene_id] = (cdsStart,cdsEnd,cds)
end
end
################################################################
# 2.filter gene CDS length and expression higher thean threshold
################################################################
gene_infoDict = Dict{String,Float64}()
filtedGeneDict = Dict{String,Float64}()
# open gene info file
open(inputFile,"r") do input
for line in eachline(input)
fileds = split(line)
# split tags
gene_id = fileds[1]
transpos = parse(Int64,fileds[2])
rpm = parse(Float64,fileds[3])
# filter CDS > 600 nt gene
if haskey(featureLenDict,gene_id)
cdsStart,cdsEnd,cdsLength = featureLenDict[gene_id]
if cdsLength >= cdslength && cdsLength%3 == 0
# not include first 30 codons to exclude high densitys around start codon
key = join([gene_id,cdsLength - exclude],":") # key
if !haskey(gene_infoDict,key)
gene_infoDict[key] = 0
else
if cdsStart + exclude <= transpos <= cdsEnd
gene_infoDict[key] += rpm
else
continue
end
end
else
continue
end
end
end
end
# filter CDS expression >= 75
for (key,val) in gene_infoDict
if val >= expression
meanNorm = val / parse(Int64,split(key,":")[2])
filtedGeneDict[key] = meanNorm
else
continue
end
end
################################################################
# 3.load trans density file and filter CDS density
################################################################
riboDict = Dict{String,Float64}()
geneidDict = Dict{String,Float64}()
# open gene info file
open(inputFile,"r") do input
for line in eachline(input)
fileds = split(line)
# split tags
gene_id = fileds[1]
transpos = parse(Int64,fileds[2])
rpm = parse(Float64,fileds[3])
cdsStart,cdsEnd,cdsLength = featureLenDict[gene_id]
# key
id = join([gene_id,cdsLength - exclude],":")
if haskey(filtedGeneDict,id)
# filter CDS region
if cdsStart <= transpos <= cdsEnd
# save
# relRPM = rpm / filtedGeneDict[id]
riboDict[join([gene_id,transpos - cdsStart + 1],"|")] = rpm
else
continue
end
end
# get how many gene_id this sample
if haskey(geneidDict,gene_id)
continue
else
geneidDict[gene_id] = 0.0
end
end
end
################################################################
# 4.prepare 64 codons Dict
################################################################
baseATGC = ["A","T","G","C"]
codonDistDict = Dict{String,Float64}()
codonCountDict = Dict{String,Int64}()
for i in baseATGC
for j in baseATGC
for z in baseATGC
codonDistDict["$i$j$z"] = 0.0
codonCountDict["$i$j$z"] = 0
end
end
end
################################################################
# 5.prepare anmino acids info
################################################################
# add codon with amino acids annotation
AnminoAcidsDict = Dict{String,String}()
for codon in keys(codonCountDict)
if codon ∈ ["TTT","TTC"]
newKey = join([codon,"Phe"],"|")
elseif codon ∈ ["TTA","TTG"]
newKey = join([codon,"Leu"],"|")
elseif codon ∈ ["TCT","TCC","TCA","TCG"]
newKey = join([codon,"Ser"],"|")
elseif codon ∈ ["TAT","TAC"]
newKey = join([codon,"Tyr"],"|")
elseif codon ∈ ["TAA","TAG","TGA"]
newKey = join([codon,"Stop"],"|")
elseif codon ∈ ["TGT","TGC"]
newKey = join([codon,"Cys"],"|")
elseif codon ∈ ["TGG"]
newKey = join([codon,"Trp"],"|")
elseif codon ∈ ["CTT","CTC","CTA","CTG"]
newKey = join([codon,"Leu"],"|")
elseif codon ∈ ["CCT","CCC","CCA","CCG"]
newKey = join([codon,"Pro"],"|")
elseif codon ∈ ["CAT","CAC"]
newKey = join([codon,"His"],"|")
elseif codon ∈ ["CAA","CAG"]
newKey = join([codon,"Gln"],"|")
elseif codon ∈ ["CGT","CGC","CGA","CGG"]
newKey = join([codon,"Arg"],"|")
elseif codon ∈ ["ATT","ATC","ATA"]
newKey = join([codon,"Lle"],"|")
elseif codon ∈ ["ATG"]
newKey = join([codon,"Met"],"|")
elseif codon ∈ ["ACT","ACC","ACA","ACG"]
newKey = join([codon,"Thr"],"|")
elseif codon ∈ ["AAT","AAC"]
newKey = join([codon,"Asn"],"|")
elseif codon ∈ ["AAA","AAG"]
newKey = join([codon,"Lys"],"|")
elseif codon ∈ ["AGT","AGC"]
newKey = join([codon,"Ser"],"|")
elseif codon ∈ ["AGA","AGG"]
newKey = join([codon,"Arg"],"|")
elseif codon ∈ ["GTT","GTC","GTA","GTG"]
newKey = join([codon,"Val"],"|")
elseif codon ∈ ["GCT","GCC","GCA","GCG"]
newKey = join([codon,"Ala"],"|")
elseif codon ∈ ["GAT","GAC"]
newKey = join([codon,"Asp"],"|")
elseif codon ∈ ["GAA","GAG"]
newKey = join([codon,"Glu"],"|")
elseif codon ∈ ["GGT","GGC","GGA","GGG"]
newKey = join([codon,"Gly"],"|")
else
println("No this codon!")
end
AnminoAcidsDict[codon] = newKey
end
################################################################
# 6.calculate ribo occupancy on codons
################################################################
open(FASTA.Reader, cdsFa) do reader
record = FASTA.Record()
while !eof(reader)
read!(reader, record)
## Do something.
name = FASTA.identifier(record)
seq = FASTA.sequence(record)
# split tags
_,gene_id,cdsStart,cdsEnd,cdsLength = split(name,"|")
# whtether this gene in samples
if haskey(geneidDict,gene_id)
# codon list
codonlist = [seq[i:i+2] for i in 1:3:parse(Int64,cdsLength)]
# codon numbers
codonLength = length(codonlist)
for (index,codon) in enumerate(codonlist)
codon = string(codon)
if haskey(codonDistDict,codon)
# get codon density
sumOcp = 0.0
for pos in index:index+2
occupancy = get(riboDict,join([gene_id,pos],"|"),0.0)
sumOcp += occupancy
end
# get mean codon density
# meanOcp = sumOcp / 3
# save in codonDistDict
codonDistDict[codon] += sumOcp
# count numbers
codonCountDict[codon] += 1
else
continue
end
end
else
continue
end
end
end
################################################################
# 7.output data
################################################################
outputData = open(outputFile,"w")
totalDensity = sum(values(codonDistDict))
totalCodons = sum(values(codonCountDict))
# normalize number of counts and Densitys| (Density/totalDensity)/(codonCounts/totalCodons)
for (codon,Density) in codonDistDict
# get counts
codonCounts = codonCountDict[codon]
if codonCounts == 0
normDensity = 0
else
normDensity = (Density*totalCodons / totalDensity*codonCounts)*10^-10
end
# output
write(outputData,join([AnminoAcidsDict[codon],normDensity],"\t")*"\n")
end
close(outputData)
end
计算:
mkdir("8.codonOcupancy-data/")
# sample name
sample = ["Normal-rep1","mOSM500-2h-rep1","mOSM600-2h-rep1","mOSM700-2h-rep1","mOSM700-15m-rep1",
"mOSM700-15m-15mrecover-rep1","mOSM700-15m-2hrecover-rep1","mOSM700-2h-15mrecover-rep1","mOSM700-2h-2hrecover-rep1",
"Normal-rep2","mOSM500-2h-rep2","mOSM600-2h-rep2","mOSM700-15m-rep2","mOSM700-2h-rep2",
"mOSM700-15m-15mrecover-rep2","mOSM700-15m-2hrecover-rep2","mOSM700-2h-15mrecover-rep2","mOSM700-2h-2hrecover-rep2"]
for i in range(1,18)
CodonsOcupancyOnORF("longestCDSinfo.txt","longestCDS.fa",
"5.density-data/"*sample[i]*".trans.txt","8.codonOcupancy-data/"*sample[i]*".codonOcupancy.txt",
cdslength=900,expression=50,exclude=90)
end
4绘图
文章中对于 三种终止密码子 (TGA/TAG/TAA) 并不进行分析,所以筛选掉它们就行了:
数据批量读取并预处理:
library(ggplot2)
library(tidyverse)
library(cowplot)
library(ggrepel)
file <- c("Normal-rep1","mOSM500-2h-rep1","mOSM600-2h-rep1","mOSM700-2h-rep1",
"mOSM700-15m-15mrecover-rep1","mOSM700-2h-15mrecover-rep1","mOSM700-2h-2hrecover-rep1",
"Normal-rep2","mOSM500-2h-rep2","mOSM600-2h-rep2","mOSM700-2h-rep2",
"mOSM700-15m-15mrecover-rep2","mOSM700-2h-15mrecover-rep2","mOSM700-2h-2hrecover-rep2")
grou <- rep(c('Control','500 mOSM','600 mOSM','700 mOSM',
'700 mOSM(15m)-15m-Rec','700 mOSM(2h)-15m-Rec','700 mOSM(2h)-2h-Rec'),2)
# x = 1
map_df(1:length(file),function(x){
tmp <- read.table(paste('8.codonOcupancy-data/',file[x],'.codonOcupancy.txt',sep = ''))
# add colname
colnames(tmp) <- c('codon','occupancy')
# remove stop codon
tmp <- tmp %>% filter(!(codon %in% c("TGA|Stop","TAG|Stop","TAA|Stop")))
# add group
tmp$group <- grou[x]
# add sample
tmp$sample <- file[x]
return(tmp)
}) -> df
# mean in replicate
df_mean <- df %>% group_by(group,codon) %>%
summarise(meanOcup = mean(occupancy))
# long to wide
wide_df <- df_mean %>% spread(key = 'group', value = 'meanOcup')
wide_df$codonName <- sapply(strsplit(wide_df$codon,split = '\\|'),'[',1)
myp <- wide_df %>%
filter(codonName %in% c('ATG','ATA','ATC','ATT','CTG','TTG'))
myp$col <- ifelse(myp$codonName == 'ATG','Met','others')
绘图:
# plot
p5 <- ggplot(wide_df,aes(x = Control,y = `500 mOSM`)) +
geom_point(size = 4,color = 'grey80') +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
aspect.ratio = 1) +
geom_point(data = myp,
show.legend = F,
aes(x = Control,y = `500 mOSM`,color = col),
size = 4) +
scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')
# plot
p6 <- ggplot(wide_df,aes(x = Control,y = `600 mOSM`)) +
geom_point(size = 4,color = 'grey80') +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
aspect.ratio = 1) +
geom_point(data = myp,
show.legend = F,
aes(x = Control,y = `600 mOSM`,color = col),
size = 4) +
scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')
# plot
p7 <- ggplot(wide_df,aes(x = Control,y = `700 mOSM`)) +
geom_point(size = 4,color = 'grey80') +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
aspect.ratio = 1) +
geom_point(data = myp,
show.legend = F,
aes(x = Control,y = `700 mOSM`,color = col),
size = 4) +
geom_text_repel(data = myp,
aes(x = Control,y = `700 mOSM`,color = col,label = codonName),
force = 20,nudge_y = -2.5,nudge_x = 2.5,
show.legend = F) +
scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')
# plot
p15m15mrec <- ggplot(wide_df,aes(x = Control,y = `700 mOSM(15m)-15m-Rec`)) +
geom_point(size = 4,color = 'grey80') +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
aspect.ratio = 1) +
geom_point(data = myp,
show.legend = F,
aes(x = Control,y = `700 mOSM(15m)-15m-Rec`,color = col),
size = 4) +
geom_text_repel(data = myp,
aes(x = Control,y = `700 mOSM(15m)-15m-Rec`,color = col,label = codonName),
force = 20,nudge_y = -2.5,nudge_x = 2.5,
show.legend = F) +
scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')
# plot
p2h15m <- ggplot(wide_df,aes(x = Control,y = `700 mOSM(2h)-15m-Rec`)) +
geom_point(size = 4,color = 'grey80') +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
aspect.ratio = 1) +
geom_point(data = myp,
show.legend = F,
aes(x = Control,y = `700 mOSM(2h)-15m-Rec`,color = col),
size = 4) +
geom_text_repel(data = myp,
aes(x = Control,y = `700 mOSM(2h)-15m-Rec`,color = col,label = codonName),
force = 20,nudge_y = -2.5,nudge_x = 2.5,
show.legend = F) +
scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')
# plot
p2h2hrec <- ggplot(wide_df,aes(x = Control,y = `700 mOSM(2h)-2h-Rec`)) +
geom_point(size = 4,color = 'grey80') +
theme_bw(base_size = 16) +
theme(panel.grid = element_blank(),
aspect.ratio = 1) +
geom_point(data = myp,
show.legend = F,
aes(x = Control,y = `700 mOSM(2h)-2h-Rec`,color = col),
size = 4) +
geom_text_repel(data = myp,
aes(x = Control,y = `700 mOSM(2h)-2h-Rec`,color = col,label = codonName),
force = 20,nudge_y = -2.5,nudge_x = 2.5,
show.legend = F) +
scale_color_manual(values = c('Met' = 'red','others' = '#FF9933')) +
geom_abline(slope = 1,intercept = 0,lty = 'dashed',size = 1,color = '#336699')
# combine
plot_grid(plotlist = list(p5,p6,p7,p15m15mrec,p2h15m,p2h2hrec),align = 'hv',ncol = 3)
这里我多画了三个样本,可以看到 随着浓度的提高, ATG 密码子相比于对照组发生了明显的上升, 随着时间的恢复, ATG 水平也相应的逐渐降低了。
5结尾
复现的结果相比于文章里的,还是有很多区别的,比如文章里的图的点更离散一些
,此外还有一些其它几个密码子也有一定水平的上升,但是复现出来的却没这么明显或者是不变的
。我感觉其中影响的因素很多,包括上游的处理,下游的计算等等过程都会有一定的影响。
每次的复现都会收获很多新的经验和知识,虽然复现过程是很艰难的,但是坚持下去总归是好的。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
(微信交流群满200人后需收取20元入群费用), 数据代码已上传至QQ群,欢迎加入下载。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀Molecular Cell 文章 ribosome pausing 结果复现 (四)
◀Molecular Cell 文章 ribosome pausing 结果复现 (三)
◀Molecular Cell 文章 ribosome pausing 结果复现 (二) (PCR 去重)
◀Molecular Cell 文章 ribosome pausing 结果复现 (一)
◀...