s06 - Consensus and Profile | 自由微信

s06 - Consensus and Profile

Original Y叔 biobabble 2020-02-05

Problem

A matrix is a rectangular table of values divided into rows and columns. An m×n matrix has m rows and n columns. Given a matrix A, we write Ai,j to indicate the value found at the intersection of row i and column j.

Say that we have a collection of DNA strings, all having the same length n. Their profile matrix is a 4×n matrix P in which P1,j represents the number of times that ‘A’ occurs in the jth position of one of the strings, P2,j represents the number of times that C occurs in the jth position, and so on (see below).

A consensus string c is a string of length n formed from our collection by taking the most common symbol at each position; the jth symbol of c therefore corresponds to the symbol having the maximum value in the j-th column of the profile matrix. Of course, there may be more than one most common symbol, leading to multiple possible consensus strings.

            A T C C A G C T
            G G G C A A C T
            A T G G A T C T
DNA Strings A A G C A A C C
            T T G G A A C T
            A T G C C A T T
            A T G G C A C T

            A   5 1 0 0 5 5 0 0
Profile     C   0 0 1 4 2 0 6 1
            G   1 1 6 3 0 1 0 0
            T   1 5 0 0 0 1 1 6

Consensus    A T G C A A C T

Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)

Sample Dataset

>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT

Sample Output

ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6

Solution

这道题给出alignment，要求生成一个碱基的频数表，并给出consensus序列。

C version

在Computing GC content这一道题中就引入了FASTA格式，但那道题太简单，根本不需要存储序列，而这道题不同，我们需要考虑到序列的解析存储问题。这道题虽然说FASTA的描述行和序列行在不同序列中都是等长的，但现实中的FASTA文件并非如此，我们需要考虑到这一点，并实现一个通用的解析函数。这里再次使用了链表，最终读出来的序列，也是以链表形式返回，当然我们很容易写个wrapper function，返回SEQ结构体的指针构成的向量。

首先定义了readFASTA.h：

typedef struct SEQ {
  char * desc;  
  char * seq;  
  int width;  
  struct SEQ * next;
} SEQ;

typedef struct CHLL {  // CHaracter Link-List
  char ch;
  struct CHLL * next;
} CHLL;

SEQ * readFASTA(char *filename);

SEQ * readFASTA(char *filename) {
  FILE *INFILE;
  INFILE = fopen(filename, "r");

  SEQ *head = malloc(sizeof(SEQ));
  SEQ *curr;
  curr = head;
  char ch;  
  int desc_line = 0, i;  
  while( (ch = fgetc(INFILE)) != EOF) {
    SEQ *SEQ_NODE = malloc(sizeof(SEQ));

    CHLL *CHLL_head = malloc(sizeof(CHLL));
    CHLL *CHLL_curr;
    CHLL_curr = CHLL_head;    // description line
    if ( ch == '>') {
      desc_line = 1;
    }
    if (desc_line == 1) {
      i = 0;
      while( (ch = fgetc(INFILE) ) != '\n') {
          CHLL *CHLL_NODE = malloc(sizeof(CHLL));
          i++;
          CHLL_NODE->ch = ch;
          CHLL_curr->next = CHLL_NODE;
          CHLL_curr = CHLL_NODE;
      }
    }    
    int desc_width = i;
    CHLL_curr = CHLL_head->next;
    char* desc = malloc(sizeof(char) * desc_width);    
    for (i=0; i < desc_width; i++) {
      desc[i] = CHLL_curr->ch;
      CHLL_curr = CHLL_curr->next;
    }

    SEQ_NODE->desc = desc;    // sequence lines
    // re-initial the CHLL link list to store sequence characters
    CHLL_curr = CHLL_head;
    desc_line = 0;
    i = 0;    
    while( (ch=fgetc(INFILE)) != '>' && ch != EOF) {
      if (ch == '\n') {
          continue;
      }
      CHLL *CHLL_NODE = malloc(sizeof(CHLL));
      CHLL_NODE->ch = ch;
      CHLL_curr->next = CHLL_NODE;
      CHLL_curr = CHLL_NODE;
      i++;
    }
    int seq_width = i;    
    char * seq = malloc(sizeof(char)*seq_width);
    CHLL_curr = CHLL_head->next;    
    for (i=0; i < seq_width; i++) {
      seq[i] = CHLL_curr->ch;
      CHLL_curr = CHLL_curr->next;
    }

    SEQ_NODE->seq = seq;
    SEQ_NODE->width = seq_width;

    curr->next = SEQ_NODE;
    curr = SEQ_NODE;

    desc_line = 1;
  }  
  return head;
}

有了FASTA的解析函数之后，问题就容易了，无非是读文件，核苷酸计数，最近打印结果。

#include<stdio.h>
#include<stdlib.h>
#include "include/readFASTA.h"

int main() {
  SEQ * head, *curr;
  head = readFASTA("DATA/rosalind_cons.txt");
  curr = head;
  int width=0;
  while(curr = curr->next) {
    if (width < curr->width) {
      width = curr->width;
    }
  }

  int NT_type = 4; // ACGT
  int cons[NT_type][width];
  int r, c;
  for (r=0; r < NT_type; r++) {
    for (c=0; c < width; c++) {
      cons[r][c] = 0;
    }
  }

  curr=head;
  int i;
  while(curr = curr->next) {
    for (i=0; i < curr->width; i++) {
      switch((curr->seq)[i]) {
      case 'A':
          cons[0][i]++;
          break;
      case 'C':
          cons[1][i]++;
          break;
      case 'G':
          cons[2][i]++;
          break;
      case 'T':
          cons[3][i]++;
          break;
      }
    }
  }

  char NT[4] = "ACGT";
  int max_r_idx, max_r=0;
  for (c=0; c < width; c++) {
    for (r=0; r < NT_type; r++) {
      if (max_r < cons[r][c]) {
          max_r = cons[r][c];
          max_r_idx = r;
      }
    }
    printf("%c", NT[max_r_idx]);
    max_r = 0;
  }
  printf("\n");
  for (r=0; r < NT_type; r++) {
    printf("%c: ", NT[r]);
    for (c=0; c < width; c++) {
      printf("%d ", cons[r][c]);
    }
    printf("\n");
  }
}

Python version

这里我读序列就是看到>就证明上一条序列读完了，可以更新计数了。
而这个更新，主要是zip函数稍微难懂一点，str是序列，而list是列表，str如果是char一样就是1不然是0，所以 [1 if x == char else 0 for x in str] 这一句根据str生成了一个0，1的列表，这个列表当然也和list等长，用zip对两个列表同时进行迭代，并加和，这样就更新了计数。

#!/usr/bin/env python3

def update_cnt(list, char, str):
    res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]    return(res)def print_cnt(list, label):
    print(label, end = ": ")
    for x in list:
        print(x, end=" ")
    print()

f = open("DATA/rosalind_cons.txt", 'r')
fas = f.readlines()

seq = ''
j = 0
for i in range(len(fas)):
    if fas[i][0] == '>' and j == 0:
        j += 1
        next    
    elif fas[i][0] == '>':
        if j == 1:
            A = [0 for x in seq]
            C = A
            G = A
            T = A
        A = update_cnt(A, 'A', seq)
        C = update_cnt(C, 'C', seq)
        G = update_cnt(G, 'G', seq)
        T = update_cnt(T, 'T', seq)
        j += 1
        seq = ''
    else:
        seq += fas[i].strip()

## last seq
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)

consensus = ''
for i in range(len(A)):
    m = max(A[i], C[i], G[i], T[i])
    if A[i] == m:
        consensus += 'A'
    elif C[i] == m:
        consensus += 'C'
    elif G[i] == m:
        consensus += 'G'
    elif T[i] == m:
        consensus += 'T'

print(consensus)

print_cnt(A, 'A')
print_cnt(C, 'C')
print_cnt(G, 'G')
print_cnt(T, 'T')

常德悲剧：让谴责无差别杀戮之声更加响亮一点

2024【公共营养师】培训报名通道已开启，不限学历，23岁及以上可报！还能领2000补贴

【惊】"以为要写遗书"! 飞温哥华航班遇炸弹气旋华人崩溃大哭连空姐都吐了; 客机颠簸盘旋3小时

魏加宁：日本之所以能走出“大衰退”，靠的是不断改革，而不是所谓“积极的财政政策”

国内突然宣布，将大规模调整经济布局！又一朝阳行业悄然崛起，新一轮机会出现了！