Kodomo

Пользователь

   1 #include <stdio.h>
   2 #include <stdlib.h>
   3 
   4 #define FASTA_PARSER_CODE_OK 0
   5 #define FASTA_PARSER_EOF     1
   6 #define FASTA_PARSER_BAD_FORMAT -1
   7 #define FASTA_PARSER_UNEXPECTED_ERROR -128
   8 
   9 
  10 #define ALLONG_SIZE 2048
  11 
  12 enum parsing_state
  13 {
  14   BEGIN,
  15   PARSE_HEADER,
  16   PARSE_SEQUENCE,
  17   ERROR
  18 };
  19 
  20 
  21 const char *DNA_SYMBOLS="atgcATGC";
  22 const char *RNA_SYMBOLS="augcAUGC";
  23 
  24 int fasta_parser(FILE *file, char **sequence_name_p, char **sequence_p, const char *residues);
  25 int append_symb(char **str_p, size_t *str_len, size_t *mem_allocated, int symb);
  26 
  27 int fasta_parser(FILE *file, char **sequence_name_p, char **sequence_p, const char *residues)
  28 {
  29     int symb;
  30     enum parsing_state state=BEGIN;
  31 
  32     char     *seq_name=NULL;
  33     size_t   seq_name_allocated=0;
  34     size_t   seq_name_len=0;
  35 
  36     char     *seq=NULL;
  37     size_t   seq_allocated=0;
  38     size_t   seq_len=0;
  39 
  40 
  41 
  42     while(1)
  43     {
  44         switch(state)
  45         {
  46             case BEGIN:
  47                 symb=fgetc(file);
  48                 if( symb==' ' || symb=='\n' ||  symb == '\t' || symb == '\r' || symb == '\v')
  49                 {
  50                     continue;
  51                 }
  52                 if(symb == EOF)
  53                 {
  54                     return FASTA_PARSER_EOF;
  55                 }
  56                 if(symb == '>')
  57                 {
  58                     state= PARSE_HEADER;
  59                     continue; 
  60                 }
  61                 state = ERROR;                
  62             break;
  63             case PARSE_HEADER:
  64                 symb=fgetc(file);
  65                 if(symb == '\n')
  66                 {
  67                     state = PARSE_SEQUENCE;
  68                     continue;
  69                 }
  70                 if(symb == EOF)
  71                 {
  72                     return FASTA_PARSER_BAD_FORMAT;
  73                 }
  74                 if(append_symb(&seq_name,&seq_name_len, &seq_name_allocated) == -1)
  75                 {
  76                     free(seq_name);
  77                     return FASTA_PARSER_MEM_ERROR;
  78                 }
  79             break;
  80             case PARSE_SEQUENCE:
  81                 symb=fgetc(file);
  82                 if(symb == '>')
  83                 {
  84                     ungetc('>', file);
  85                     return FASTA_PARSER_CODE_OK;
  86                 }
  87                 if(symb == EOF)
  88                 {
  89                     return FASTA_PARSER_CODE_OK;
  90                 }
  91                 if( symb==' '    ||
  92                     symb=='\n'   ||
  93                     symb == '\t' ||
  94                     symb == '\r' ||
  95                     symb == '\v' ||
  96                     (symb >= '0' && symb <= '9') ||
  97                     symb == ':'  /* || //uncomment it if you need to skip gaps
  98                     symb == '-' */
  99                   )
 100                 {
 101                     continue;
 102                 }
 103                 
 104                 {
 105                     int i;
 106                     for(i=0; residues[i]!='\0';i++)
 107                     {
 108                         if(symb == residues[i])
 109                         {
 110                             if(append_symb(&seq,&seq_len, &seq_allocated) == -1)
 111                             {
 112                                 free(seq);
 113                                 free(seq_name);
 114                                 return FASTA_PARSER_MEM_ERROR;
 115                             }
 116                             break;
 117                         }
 118                     }
 119                     if(residues[i] != '\0')
 120                     {
 121                         continue;
 122                     }
 123                 }
 124                 state = ERROR;
 125             break;
 126             case ERROR:
 127                 fprintf(stderr,"Sorry unexpected symbol '%c'\n",symb);
 128                 return FASTA_PARSER_BAD_FORMAT;
 129             break;
 130             default:
 131                 return FASTA_PARSER_UNEXPECTED_ERROR;
 132             break;
 133         }
 134         symb=fgetc(file);
 135 
 136     }
 137     return FASTA_PARSER_CODE_OK;
 138 }
 139 
 140 int append_symb(char **str_p, size_t *str_len, size_t *mem_allocated, int symb)
 141 {
 142     char *str=*str_p;
 143     
 144     if(*str_len == *mem_allocated)
 145     {
 146         char *tmp;
 147         tmp=(char *)realloc(str,*mem_allocated+ALLONG_SIZE);
 148         if(tmp == NULL)
 149         {
 150             return -1;
 151         }
 152         str=tmp;
 153         *mem_allocated+=ALLONG_SIZE;
 154     }
 155     
 156     str[*str_len] = symb;
 157 
 158     if(*str_len == *mem_allocated)
 159     {
 160         char *tmp;
 161         tmp=(char *)realloc(str,mem_allocated+ALLONG_SIZE);
 162         if(tmp == NULL)
 163         {
 164             return -1;
 165         }
 166         str=tmp;
 167         *mem_allocated+=ALLONG_SIZE;
 168    }
 169 
 170     str[*str_len] = '\0';
 171     *str_len++;
 172 
 173     return 0;    
 174 }