#include <stdio.h>
#include <stdlib.h>
3
/* Return codes of fasta_parser(). Negative values are parenthesized so the
 * macros expand safely inside larger expressions. */
#define FASTA_PARSER_CODE_OK 0                /* a record was read */
#define FASTA_PARSER_EOF 1                    /* only blanks were left before end of input */
#define FASTA_PARSER_BAD_FORMAT (-1)          /* unexpected character, or header cut off by end of input */
#define FASTA_PARSER_MEM_ERROR (-2)           /* a buffer could not be grown */
#define FASTA_PARSER_UNEXPECTED_ERROR (-128)  /* the parser reached an unknown state */


/* Growth step, in bytes, of the buffers extended by append_symb(). */
#define ALLONG_SIZE 2048
11
/* States of the state machine that drives fasta_parser(). */
enum parsing_state
{
    BEGIN = 0,          /* looking for the '>' that opens a record */
    PARSE_HEADER = 1,   /* collecting the header line */
    PARSE_SEQUENCE = 2, /* collecting sequence characters */
    ERROR = 3           /* an unexpected character was read */
};
19
20
/* Character sets in lower and upper case; presumably meant to be passed as
 * the `residues` argument of fasta_parser() (no caller is visible here). */
const char *DNA_SYMBOLS = "atgcATGC";
const char *RNA_SYMBOLS = "augcAUGC";

/* Forward declarations of the functions defined below. */
int fasta_parser(FILE *file,
                 char **sequence_name_p,
                 char **sequence_p,
                 const char *residues);
int append_symb(char **str_p,
                size_t *str_len,
                size_t *mem_allocated,
                int symb);
26
27 int fasta_parser(FILE *file, char **sequence_name_p, char **sequence_p, const char *residues)
28 {
29 int symb;
30 enum parsing_state state=BEGIN;
31
32 char *seq_name=NULL;
33 size_t seq_name_allocated=0;
34 size_t seq_name_len=0;
35
36 char *seq=NULL;
37 size_t seq_allocated=0;
38 size_t seq_len=0;
39
40
41
42 while(1)
43 {
44 switch(state)
45 {
46 case BEGIN:
47 symb=fgetc(file);
48 if( symb==' ' || symb=='\n' || symb == '\t' || symb == '\r' || symb == '\v')
49 {
50 continue;
51 }
52 if(symb == EOF)
53 {
54 return FASTA_PARSER_EOF;
55 }
56 if(symb == '>')
57 {
58 state= PARSE_HEADER;
59 continue;
60 }
61 state = ERROR;
62 break;
63 case PARSE_HEADER:
64 symb=fgetc(file);
65 if(symb == '\n')
66 {
67 state = PARSE_SEQUENCE;
68 continue;
69 }
70 if(symb == EOF)
71 {
72 return FASTA_PARSER_BAD_FORMAT;
73 }
74 if(append_symb(&seq_name,&seq_name_len, &seq_name_allocated) == -1)
75 {
76 free(seq_name);
77 return FASTA_PARSER_MEM_ERROR;
78 }
79 break;
80 case PARSE_SEQUENCE:
81 symb=fgetc(file);
82 if(symb == '>')
83 {
84 ungetc('>', file);
85 return FASTA_PARSER_CODE_OK;
86 }
87 if(symb == EOF)
88 {
89 return FASTA_PARSER_CODE_OK;
90 }
91 if( symb==' ' ||
92 symb=='\n' ||
93 symb == '\t' ||
94 symb == '\r' ||
95 symb == '\v' ||
96 (symb >= '0' && symb <= '9') ||
97 symb == ':'
98
99 )
100 {
101 continue;
102 }
103
104 {
105 int i;
106 for(i=0; residues[i]!='\0';i++)
107 {
108 if(symb == residues[i])
109 {
110 if(append_symb(&seq,&seq_len, &seq_allocated) == -1)
111 {
112 free(seq);
113 free(seq_name);
114 return FASTA_PARSER_MEM_ERROR;
115 }
116 break;
117 }
118 }
119 if(residues[i] != '\0')
120 {
121 continue;
122 }
123 }
124 state = ERROR;
125 break;
126 case ERROR:
127 fprintf(stderr,"Sorry unexpected symbol '%c'\n",symb);
128 return FASTA_PARSER_BAD_FORMAT;
129 break;
130 default:
131 return FASTA_PARSER_UNEXPECTED_ERROR;
132 break;
133 }
134 symb=fgetc(file);
135
136 }
137 return FASTA_PARSER_CODE_OK;
138 }
139
140 int append_symb(char **str_p, size_t *str_len, size_t *mem_allocated, int symb)
141 {
142 char *str=*str_p;
143
144 if(*str_len == *mem_allocated)
145 {
146 char *tmp;
147 tmp=(char *)realloc(str,*mem_allocated+ALLONG_SIZE);
148 if(tmp == NULL)
149 {
150 return -1;
151 }
152 str=tmp;
153 *mem_allocated+=ALLONG_SIZE;
154 }
155
156 str[*str_len] = symb;
157
158 if(*str_len == *mem_allocated)
159 {
160 char *tmp;
161 tmp=(char *)realloc(str,mem_allocated+ALLONG_SIZE);
162 if(tmp == NULL)
163 {
164 return -1;
165 }
166 str=tmp;
167 *mem_allocated+=ALLONG_SIZE;
168 }
169
170 str[*str_len] = '\0';
171 *str_len++;
172
173 return 0;
174 }