s06 - Consensus and Profile
Problem
A matrix is a rectangular table of values divided into rows and columns. An m×n matrix has m rows and n columns. Given a matrix A, we write Ai,j to indicate the value found at the intersection of row i and column j.
Say that we have a collection of DNA strings, all having the same length n. Their profile matrix is a 4×n matrix P in which P1,j represents the number of times that ‘A’ occurs in the jth position of one of the strings, P2,j represents the number of times that C occurs in the jth position, and so on (see below).
A consensus string c is a string of length n formed from our collection by taking the most common symbol at each position; the jth symbol of c therefore corresponds to the symbol having the maximum value in the j-th column of the profile matrix. Of course, there may be more than one most common symbol, leading to multiple possible consensus strings.
A T C C A G C T
G G G C A A C T
A T G G A T C T
DNA Strings A A G C A A C C
T T G G A A C T
A T G C C A T T
A T G G C A C T
A 5 1 0 0 5 5 0 0
Profile C 0 0 1 4 2 0 6 1
G 1 1 6 3 0 1 0 0
T 1 5 0 0 0 1 1 6
Consensus A T G C A A C T
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
Sample Dataset
>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT
Sample Output
ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6
Solution
这道题给出alignment,要求生成一个碱基的频数表,并给出consensus序列。
C version
在Computing GC content这一道题中就引入了FASTA格式,但那道题太简单,根本不需要存储序列,而这道题不同,我们需要考虑到序列的解析存储问题。这道题虽然说FASTA的描述行和序列行在不同序列中都是等长的,但现实中的FASTA文件并非如此,我们需要考虑到这一点,并实现一个通用的解析函数。这里再次使用了链表,最终读出来的序列,也是以链表形式返回,当然我们很容易写个wrapper function,返回SEQ结构体的指针构成的向量。
首先定义了readFASTA.h
:
typedef struct SEQ {
char * desc;
char * seq;
int width;
struct SEQ * next;
} SEQ;
typedef struct CHLL { // CHaracter Link-List
char ch;
struct CHLL * next;
} CHLL;
SEQ * readFASTA(char *filename);
SEQ * readFASTA(char *filename) {
FILE *INFILE;
INFILE = fopen(filename, "r");
SEQ *head = malloc(sizeof(SEQ));
SEQ *curr;
curr = head;
char ch;
int desc_line = 0, i;
while( (ch = fgetc(INFILE)) != EOF) {
SEQ *SEQ_NODE = malloc(sizeof(SEQ));
CHLL *CHLL_head = malloc(sizeof(CHLL));
CHLL *CHLL_curr;
CHLL_curr = CHLL_head; // description line
if ( ch == '>') {
desc_line = 1;
}
if (desc_line == 1) {
i = 0;
while( (ch = fgetc(INFILE) ) != '\n') {
CHLL *CHLL_NODE = malloc(sizeof(CHLL));
i++;
CHLL_NODE->ch = ch;
CHLL_curr->next = CHLL_NODE;
CHLL_curr = CHLL_NODE;
}
}
int desc_width = i;
CHLL_curr = CHLL_head->next;
char* desc = malloc(sizeof(char) * desc_width);
for (i=0; i < desc_width; i++) {
desc[i] = CHLL_curr->ch;
CHLL_curr = CHLL_curr->next;
}
SEQ_NODE->desc = desc; // sequence lines
// re-initial the CHLL link list to store sequence characters
CHLL_curr = CHLL_head;
desc_line = 0;
i = 0;
while( (ch=fgetc(INFILE)) != '>' && ch != EOF) {
if (ch == '\n') {
continue;
}
CHLL *CHLL_NODE = malloc(sizeof(CHLL));
CHLL_NODE->ch = ch;
CHLL_curr->next = CHLL_NODE;
CHLL_curr = CHLL_NODE;
i++;
}
int seq_width = i;
char * seq = malloc(sizeof(char)*seq_width);
CHLL_curr = CHLL_head->next;
for (i=0; i < seq_width; i++) {
seq[i] = CHLL_curr->ch;
CHLL_curr = CHLL_curr->next;
}
SEQ_NODE->seq = seq;
SEQ_NODE->width = seq_width;
curr->next = SEQ_NODE;
curr = SEQ_NODE;
desc_line = 1;
}
return head;
}
有了FASTA的解析函数之后,问题就容易了,无非是读文件,核苷酸计数,最近打印结果。
#include<stdio.h>
#include<stdlib.h>
#include "include/readFASTA.h"
int main() {
SEQ * head, *curr;
head = readFASTA("DATA/rosalind_cons.txt");
curr = head;
int width=0;
while(curr = curr->next) {
if (width < curr->width) {
width = curr->width;
}
}
int NT_type = 4; // ACGT
int cons[NT_type][width];
int r, c;
for (r=0; r < NT_type; r++) {
for (c=0; c < width; c++) {
cons[r][c] = 0;
}
}
curr=head;
int i;
while(curr = curr->next) {
for (i=0; i < curr->width; i++) {
switch((curr->seq)[i]) {
case 'A':
cons[0][i]++;
break;
case 'C':
cons[1][i]++;
break;
case 'G':
cons[2][i]++;
break;
case 'T':
cons[3][i]++;
break;
}
}
}
char NT[4] = "ACGT";
int max_r_idx, max_r=0;
for (c=0; c < width; c++) {
for (r=0; r < NT_type; r++) {
if (max_r < cons[r][c]) {
max_r = cons[r][c];
max_r_idx = r;
}
}
printf("%c", NT[max_r_idx]);
max_r = 0;
}
printf("\n");
for (r=0; r < NT_type; r++) {
printf("%c: ", NT[r]);
for (c=0; c < width; c++) {
printf("%d ", cons[r][c]);
}
printf("\n");
}
}
Python version
这里我读序列就是看到>就证明上一条序列读完了,可以更新计数了。
而这个更新,主要是zip函数稍微难懂一点,str是序列,而list是列表,str如果是char一样就是1不然是0,所以 [1 if x == char else 0 for x in str] 这一句根据str生成了一个0,1的列表,这个列表当然也和list等长,用zip对两个列表同时进行迭代,并加和,这样就更新了计数。
#!/usr/bin/env python3
def update_cnt(list, char, str):
res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])] return(res)def print_cnt(list, label):
print(label, end = ": ")
for x in list:
print(x, end=" ")
print()
f = open("DATA/rosalind_cons.txt", 'r')
fas = f.readlines()
seq = ''
j = 0
for i in range(len(fas)):
if fas[i][0] == '>' and j == 0:
j += 1
next
elif fas[i][0] == '>':
if j == 1:
A = [0 for x in seq]
C = A
G = A
T = A
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)
j += 1
seq = ''
else:
seq += fas[i].strip()
## last seq
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)
consensus = ''
for i in range(len(A)):
m = max(A[i], C[i], G[i], T[i])
if A[i] == m:
consensus += 'A'
elif C[i] == m:
consensus += 'C'
elif G[i] == m:
consensus += 'G'
elif T[i] == m:
consensus += 'T'
print(consensus)
print_cnt(A, 'A')
print_cnt(C, 'C')
print_cnt(G, 'G')
print_cnt(T, 'T')