Browse code

add package to the repository

msa


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358

Sonali Arora authored on 10/04/2015 00:12:33
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,627 @@
1
+/*****************************************************************
2
+ * SQUID - a library of functions for biological sequence analysis
3
+ * Copyright (C) 1992-2002 Washington University School of Medicine
4
+ * 
5
+ *     This source code is freely distributed under the terms of the
6
+ *     GNU General Public License. See the files COPYRIGHT and LICENSE
7
+ *     for details.
8
+ *****************************************************************/
9
+
10
+/* stockholm.c
11
+ * SRE, Fri May 28 15:46:41 1999
12
+ * 
13
+ * Reading/writing of Stockholm format multiple sequence alignments.
14
+ * 
15
+ * example of API:
16
+ * 
17
+ * MSA     *msa;
18
+ * FILE    *fp;        -- opened for write with fopen()
19
+ * MSAFILE *afp;       -- opened for read with MSAFileOpen()
20
+ *      
21
+ * while ((msa = ReadStockholm(afp)) != NULL)
22
+ *   {
23
+ *      WriteStockholm(fp, msa);
24
+ *      MSAFree(msa);
25
+ *   }
26
+ * 
27
+ * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
28
+ */
29
+#include <stdio.h>
30
+#include <string.h>
31
+#include "squid.h"
32
+#include "msa.h"
33
+
34
+static int  parse_gf(MSA *msa, char *buf);
35
+static int  parse_gs(MSA *msa, char *buf);
36
+static int  parse_gc(MSA *msa, char *buf);
37
+static int  parse_gr(MSA *msa, char *buf);
38
+static int  parse_comment(MSA *msa, char *buf);
39
+static int  parse_sequence(MSA *msa, char *buf);
40
+static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
41
+
42
+#ifdef TESTDRIVE_STOCKHOLM
43
+/*****************************************************************
44
+ * stockholm.c test driver: 
45
+ * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm 
46
+ * 
47
+ */
48
+int
49
+main(int argc, char **argv)
50
+{
51
+  MSAFILE *afp;
52
+  MSA     *msa;
53
+  char    *file;
54
+  
55
+  file = argv[1];
56
+
57
+  if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
58
+    Die("Couldn't open %s\n", file);
59
+
60
+  while ((msa = ReadStockholm(afp)) != NULL)
61
+    {
62
+      WriteStockholm(stdout, msa);
63
+      MSAFree(msa); 
64
+    }
65
+  
66
+  MSAFileClose(afp);
67
+  throw(ClustalOmegaException, "0");
68
+}
69
+/******************************************************************/
70
+#endif /* testdriver */
71
+
72
+
73
+/* Function: ReadStockholm()
74
+ * Date:     SRE, Fri May 21 17:33:10 1999 [St. Louis]
75
+ *
76
+ * Purpose:  Parse the next alignment from an open Stockholm
77
+ *           format alignment file. Return the alignment, or
78
+ *           NULL if there are no more alignments in the file.
79
+ *
80
+ * Args:     afp  - open alignment file
81
+ *
82
+ * Returns:  MSA *   - an alignment object. 
83
+ *                     caller responsible for an MSAFree() 
84
+ *           NULL if no more alignments
85
+ *
86
+ * Diagnostics:
87
+ *           Will Die() here with a (potentially) useful message
88
+ *           if a parsing error occurs 
89
+ */
90
+MSA *
91
+ReadStockholm(MSAFILE *afp)
92
+{
93
+  MSA   *msa;
94
+  char  *s;
95
+  int    status;
96
+
97
+  if (feof(afp->f)) return NULL;
98
+
99
+  /* Initialize allocation of the MSA.
100
+   */
101
+  msa = MSAAlloc(10, 0);
102
+
103
+  /* Check the magic Stockholm header line.
104
+   * We have to skip blank lines here, else we perceive
105
+   * trailing blank lines in a file as a format error when
106
+   * reading in multi-record mode.
107
+   */
108
+  do {
109
+    if ((s = MSAFileGetLine(afp)) == NULL) {
110
+      MSAFree(msa);
111
+      return NULL;
112
+    }
113
+  } while (IsBlankline(s));
114
+
115
+  if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
116
+    Die("\
117
+File %s doesn't appear to be in Stockholm format.\n\
118
+Assuming there isn't some other problem with your file (it is an\n\
119
+alignment file, right?), please either:\n\
120
+  a) use the Babelfish format autotranslator option (-B, usually);\n\
121
+  b) specify the file's format with the --informat option; or\n\
122
+  a) reformat the alignment to Stockholm format.\n", 
123
+	afp->fname);
124
+
125
+  /* Read the alignment file one line at a time.
126
+   */
127
+  while ((s = MSAFileGetLine(afp)) != NULL) 
128
+    {
129
+      while (*s == ' ' || *s == '\t') s++;  /* skip leading whitespace */
130
+
131
+      if (*s == '#') {
132
+	if      (strncmp(s, "#=GF", 4) == 0)   status = parse_gf(msa, s);
133
+	else if (strncmp(s, "#=GS", 4) == 0)   status = parse_gs(msa, s);
134
+	else if (strncmp(s, "#=GC", 4) == 0)   status = parse_gc(msa, s);
135
+	else if (strncmp(s, "#=GR", 4) == 0)   status = parse_gr(msa, s);
136
+	else                                   status = parse_comment(msa, s);
137
+      } 
138
+      else if (strncmp(s, "//",   2) == 0)   break;
139
+      else if (*s == '\n')                   continue;
140
+      else                                   status = parse_sequence(msa, s);
141
+
142
+      if (status == 0)  
143
+	Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
144
+	    afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
145
+    }
146
+
147
+  if (s == NULL && msa->nseq != 0)
148
+    Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
149
+
150
+  if (s == NULL && msa->nseq == 0) {
151
+    				/* probably just some junk at end of file */
152
+      MSAFree(msa); 
153
+      return NULL; 
154
+    }
155
+  
156
+  MSAVerifyParse(msa);
157
+  return msa;
158
+}
159
+
160
+
161
+/* Function: WriteStockholm()
162
+ * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
163
+ *
164
+ * Purpose:  Write an alignment in standard multi-block 
165
+ *           Stockholm format to an open file. A wrapper
166
+ *           for actually_write_stockholm().
167
+ *
168
+ * Args:     fp  - file that's open for writing
169
+ *           msa - alignment to write    
170
+ *
171
+ * Returns:  (void)
172
+ */
173
+void
174
+WriteStockholm(FILE *fp, MSA *msa)
175
+{
176
+  actually_write_stockholm(fp, msa, 50); /* 50 char per block */
177
+}
178
+
179
+/* Function: WriteStockholmOneBlock()
180
+ * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
181
+ *
182
+ * Purpose:  Write an alignment in Pfam's single-block
183
+ *           Stockholm format to an open file. A wrapper
184
+ *           for actually_write_stockholm().
185
+ *
186
+ * Args:     fp  - file that's open for writing
187
+ *           msa - alignment to write    
188
+ *
189
+ * Returns:  (void)
190
+ */
191
+void
192
+WriteStockholmOneBlock(FILE *fp, MSA *msa)
193
+{
194
+  actually_write_stockholm(fp, msa, msa->alen); /* one big block */
195
+}
196
+
197
+
198
+/* Function: actually_write_stockholm()
199
+ * Date:     SRE, Fri May 21 17:39:22 1999 [St. Louis]
200
+ *
201
+ * Purpose:  Write an alignment in Stockholm format to 
202
+ *           an open file. This is the function that actually
203
+ *           does the work. The API's WriteStockholm()
204
+ *           and WriteStockholmOneBlock() are wrappers.
205
+ *
206
+ * Args:     fp    - file that's open for writing
207
+ *           msa   - alignment to write        
208
+ *           cpl   - characters to write per line in alignment block
209
+ *
210
+ * Returns:  (void)
211
+ */
212
+static void
213
+actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
214
+{
215
+  int  i, j;
216
+  int  len = 0;
217
+  int  namewidth;
218
+  int  typewidth = 0;		/* markup tags are up to 5 chars long */
219
+  int  markupwidth = 0;		/* #=GR, #=GC are four char wide + 1 space */
220
+  char *buf;
221
+  int  currpos;
222
+  char *s, *tok;
223
+  
224
+  /* Figure out how much space we need for name + markup
225
+   * to keep the alignment in register. Required by Stockholm
226
+   * spec, even though our Stockholm parser doesn't care (Erik's does).
227
+   */
228
+  namewidth = 0;
229
+  for (i = 0; i < msa->nseq; i++)
230
+    if ((len = strlen(msa->sqname[i])) > namewidth) 
231
+      namewidth = len;
232
+
233
+  /* Figure out how much space we need for markup tags
234
+   *   markupwidth = always 4 if we're doing markup:  strlen("#=GR")
235
+   *   typewidth   = longest markup tag
236
+   */
237
+  if (msa->ss      != NULL) { markupwidth = 4; typewidth = 2; }
238
+  if (msa->sa      != NULL) { markupwidth = 4; typewidth = 2; }
239
+  for (i = 0; i < msa->ngr; i++)
240
+    if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
241
+
242
+  if (msa->rf      != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
243
+  if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
244
+  if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
245
+  for (i = 0; i < msa->ngc; i++)
246
+    if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
247
+  
248
+  buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61)); 
249
+
250
+  /* Magic Stockholm header
251
+   */
252
+  fprintf(fp, "# STOCKHOLM 1.0\n");
253
+
254
+  /* Free text comments
255
+   */
256
+  for (i = 0;  i < msa->ncomment; i++)
257
+    fprintf(fp, "# %s\n", msa->comment[i]);
258
+  if (msa->ncomment > 0) fprintf(fp, "\n");
259
+
260
+  /* GF section: per-file annotation
261
+   */
262
+  if (msa->name  != NULL)       fprintf(fp, "#=GF ID    %s\n", msa->name);
263
+  if (msa->acc   != NULL)       fprintf(fp, "#=GF AC    %s\n", msa->acc);
264
+  if (msa->desc  != NULL)       fprintf(fp, "#=GF DE    %s\n", msa->desc);
265
+  if (msa->au    != NULL)       fprintf(fp, "#=GF AU    %s\n", msa->au);
266
+  
267
+  /* Thresholds are hacky. Pfam has two. Rfam has one.
268
+   */
269
+  if      (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
270
+    fprintf(fp, "#=GF GA    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
271
+  else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
272
+    fprintf(fp, "#=GF GA    %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
273
+  if      (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
274
+    fprintf(fp, "#=GF NC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
275
+  else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
276
+    fprintf(fp, "#=GF NC    %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
277
+  if      (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
278
+    fprintf(fp, "#=GF TC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
279
+  else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
280
+    fprintf(fp, "#=GF TC    %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
281
+
282
+  for (i = 0; i < msa->ngf; i++)
283
+    fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]); 
284
+  fprintf(fp, "\n");
285
+
286
+
287
+  /* GS section: per-sequence annotation
288
+   */
289
+  if (msa->flags & MSA_SET_WGT) 
290
+    {
291
+      for (i = 0; i < msa->nseq; i++) 
292
+	fprintf(fp, "#=GS %-*.*s WT    %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
293
+      fprintf(fp, "\n");
294
+    }
295
+  if (msa->sqacc != NULL) 
296
+    {
297
+      for (i = 0; i < msa->nseq; i++) 
298
+	if (msa->sqacc[i] != NULL)
299
+	  fprintf(fp, "#=GS %-*.*s AC    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
300
+      fprintf(fp, "\n");
301
+    }
302
+  if (msa->sqdesc != NULL) 
303
+    {
304
+      for (i = 0; i < msa->nseq; i++) 
305
+	if (msa->sqdesc[i] != NULL)
306
+	  fprintf(fp, "#=GS %*.*s DE    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
307
+      fprintf(fp, "\n");
308
+    }
309
+  for (i = 0; i < msa->ngs; i++)
310
+    {
311
+      /* Multiannotated GS tags are possible; for example, 
312
+       *     #=GS foo DR PDB; 1xxx;
313
+       *     #=GS foo DR PDB; 2yyy;
314
+       * These are stored, for example, as:
315
+       *     msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
316
+       * and must be decomposed.
317
+       */
318
+      for (j = 0; j < msa->nseq; j++)
319
+	if (msa->gs[i][j] != NULL)
320
+	  {
321
+	    s = msa->gs[i][j];
322
+	    while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
323
+	      fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
324
+		      msa->sqname[j], msa->gs_tag[i], tok);
325
+	  }
326
+      fprintf(fp, "\n");
327
+    }
328
+
329
+  /* Alignment section:
330
+   * contains aligned sequence, #=GR annotation, and #=GC annotation
331
+   */
332
+  for (currpos = 0; currpos < msa->alen; currpos += cpl)
333
+    {
334
+      if (currpos > 0) fprintf(fp, "\n");
335
+      for (i = 0; i < msa->nseq; i++)
336
+	{
337
+	  strncpy(buf, msa->aseq[i] + currpos, cpl);
338
+	  buf[cpl] = '\0';	      
339
+	  fprintf(fp, "%-*.*s  %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth, 
340
+		  msa->sqname[i], buf);
341
+
342
+	  if (msa->ss != NULL && msa->ss[i] != NULL) {
343
+	    strncpy(buf, msa->ss[i] + currpos, cpl);
344
+	    buf[cpl] = '\0';	 
345
+	    fprintf(fp, "#=GR %-*.*s SS     %s\n", namewidth, namewidth, msa->sqname[i], buf);
346
+	  }
347
+	  if (msa->sa != NULL && msa->sa[i] != NULL) {
348
+	    strncpy(buf, msa->sa[i] + currpos, cpl);
349
+	    buf[cpl] = '\0';
350
+	    fprintf(fp, "#=GR %-*.*s SA     %s\n", namewidth, namewidth, msa->sqname[i], buf);
351
+	  }
352
+	  for (j = 0; j < msa->ngr; j++)
353
+	    if (msa->gr[j][i] != NULL) {
354
+	      strncpy(buf, msa->gr[j][i] + currpos, cpl);
355
+	      buf[cpl] = '\0';
356
+	      fprintf(fp, "#=GR %-*.*s %5s  %s\n", 
357
+		      namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
358
+	    }
359
+	}
360
+      if (msa->ss_cons != NULL) {
361
+	strncpy(buf, msa->ss_cons + currpos, cpl);
362
+	buf[cpl] = '\0';
363
+	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
364
+      }
365
+
366
+      if (msa->sa_cons != NULL) {
367
+	strncpy(buf, msa->sa_cons + currpos, cpl);
368
+	buf[cpl] = '\0';
369
+	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
370
+      }
371
+
372
+      if (msa->rf != NULL) {
373
+	strncpy(buf, msa->rf + currpos, cpl);
374
+	buf[cpl] = '\0';
375
+	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
376
+      }
377
+      for (j = 0; j < msa->ngc; j++) {
378
+	strncpy(buf, msa->gc[j] + currpos, cpl);
379
+	buf[cpl] = '\0';
380
+	fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, 
381
+		msa->gc_tag[j], buf);
382
+      }
383
+    }
384
+  fprintf(fp, "//\n");
385
+  free(buf);
386
+}
387
+
388
+
389
+
390
+
391
+
392
+/* Format of a GF line:
393
+ *    #=GF <featurename> <text>
394
+ */
395
+static int
396
+parse_gf(MSA *msa, char *buf)
397
+{
398
+  char *gf;
399
+  char *featurename;
400
+  char *text;
401
+  char *s;
402
+
403
+  s = buf;
404
+  if ((gf          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
405
+  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
406
+  if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
407
+  while (*text && (*text == ' ' || *text == '\t')) text++;
408
+
409
+  if      (strcmp(featurename, "ID") == 0) 
410
+    msa->name                 = sre_strdup(text, -1);
411
+  else if (strcmp(featurename, "AC") == 0) 
412
+    msa->acc                  = sre_strdup(text, -1);
413
+  else if (strcmp(featurename, "DE") == 0) 
414
+    msa->desc                 = sre_strdup(text, -1);
415
+  else if (strcmp(featurename, "AU") == 0) 
416
+    msa->au                   = sre_strdup(text, -1);
417
+  else if (strcmp(featurename, "GA") == 0) 
418
+    {				/* Pfam has GA1, GA2. Rfam just has GA1. */
419
+      s = text;
420
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
421
+      msa->cutoff[MSA_CUTOFF_GA1]        = atof(text);
422
+      msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
423
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
424
+	msa->cutoff[MSA_CUTOFF_GA2]        = atof(text);
425
+	msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
426
+      }
427
+    }
428
+  else if (strcmp(featurename, "NC") == 0) 
429
+    {
430
+      s = text;
431
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
432
+      msa->cutoff[MSA_CUTOFF_NC1]        = atof(text);
433
+      msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
434
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
435
+	msa->cutoff[MSA_CUTOFF_NC2]        = atof(text);
436
+	msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
437
+      }
438
+    }
439
+  else if (strcmp(featurename, "TC") == 0) 
440
+    {
441
+      s = text;
442
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
443
+      msa->cutoff[MSA_CUTOFF_TC1]        = atof(text);
444
+      msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
445
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
446
+	msa->cutoff[MSA_CUTOFF_TC2]        = atof(text);
447
+	msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
448
+      }
449
+    }
450
+  else 
451
+    MSAAddGF(msa, featurename, text);
452
+
453
+  return 1;
454
+}
455
+
456
+
457
+/* Format of a GS line:
458
+ *    #=GS <seqname> <featurename> <text>
459
+ */
460
+static int
461
+parse_gs(MSA *msa, char *buf)
462
+{
463
+  char *gs;
464
+  char *seqname;
465
+  char *featurename;
466
+  char *text; 
467
+  int   seqidx;
468
+  char *s;
469
+
470
+  s = buf;
471
+  if ((gs          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
472
+  if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
473
+  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
474
+  if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
475
+  while (*text && (*text == ' ' || *text == '\t')) text++;
476
+  
477
+  /* GS usually follows another GS; guess lastidx+1
478
+   */
479
+  seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
480
+  msa->lastidx = seqidx;
481
+
482
+  if (strcmp(featurename, "WT") == 0)
483
+    {
484
+      msa->wgt[seqidx]          = atof(text);
485
+      msa->flags |= MSA_SET_WGT;
486
+    }
487
+
488
+  else if (strcmp(featurename, "AC") == 0)
489
+    MSASetSeqAccession(msa, seqidx, text);
490
+
491
+  else if (strcmp(featurename, "DE") == 0)
492
+    MSASetSeqDescription(msa, seqidx, text);
493
+
494
+  else				
495
+    MSAAddGS(msa, featurename, seqidx, text);
496
+
497
+  return 1;
498
+}
499
+
500
+/* Format of a GC line:
501
+ *    #=GC <featurename> <text>
502
+ */
503
+static int 
504
+parse_gc(MSA *msa, char *buf)
505
+{
506
+  char *gc;
507
+  char *featurename;
508
+  char *text; 
509
+  char *s;
510
+  int   len;
511
+
512
+  s = buf;
513
+  if ((gc          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
514
+  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
515
+  if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
516
+  
517
+  if (strcmp(featurename, "SS_cons") == 0)
518
+    sre_strcat(&(msa->ss_cons), -1, text, len);
519
+  else if (strcmp(featurename, "SA_cons") == 0)
520
+    sre_strcat(&(msa->sa_cons), -1, text, len);
521
+  else if (strcmp(featurename, "RF") == 0)
522
+    sre_strcat(&(msa->rf), -1, text, len);
523
+  else
524
+    MSAAppendGC(msa, featurename, text);
525
+
526
+  return 1;
527
+}
528
+
529
+/* Format of a GR line:
530
+ *    #=GR <seqname> <featurename> <text>
531
+ */
532
+static int
533
+parse_gr(MSA *msa, char *buf)
534
+{
535
+  char *gr;
536
+  char *seqname;
537
+  char *featurename;
538
+  char *text;
539
+  int   seqidx;
540
+  int   len;
541
+  int   j;
542
+  char *s;
543
+
544
+  s = buf;
545
+  if ((gr          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
546
+  if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
547
+  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
548
+  if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
549
+
550
+  /* GR usually follows sequence it refers to; guess msa->lastidx */
551
+  seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
552
+  msa->lastidx = seqidx;
553
+
554
+  if (strcmp(featurename, "SS") == 0) 
555
+    {
556
+      if (msa->ss == NULL)
557
+	{
558
+	  msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
559
+	  msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
560
+	  for (j = 0; j < msa->nseqalloc; j++)
561
+	    {
562
+	      msa->ss[j]    = NULL;
563
+	      msa->sslen[j] = 0;
564
+	    }
565
+	}
566
+      msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
567
+    }
568
+  else if (strcmp(featurename, "SA") == 0)
569
+    {
570
+      if (msa->sa == NULL)
571
+	{
572
+	  msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
573
+	  msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
574
+	  for (j = 0; j < msa->nseqalloc; j++) 
575
+	    {
576
+	      msa->sa[j]    = NULL;
577
+	      msa->salen[j] = 0;
578
+	    }
579
+	}
580
+      msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
581
+    }
582
+  else 
583
+    MSAAppendGR(msa, featurename, seqidx, text);
584
+
585
+  return 1;
586
+}
587
+
588
+
589
+/* comments are simply stored verbatim, not parsed
590
+ */
591
+static int
592
+parse_comment(MSA *msa, char *buf)
593
+{
594
+  char *s;
595
+  char *comment;
596
+
597
+  s = buf + 1;			               /* skip leading '#' */
598
+  if (*s == '\n') { *s = '\0'; comment = s; }  /* deal with blank comment */
599
+  else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
600
+  
601
+  MSAAddComment(msa, comment);
602
+  return 1;
603
+}
604
+
605
+static int
606
+parse_sequence(MSA *msa, char *buf)
607
+{
608
+  char *s;
609
+  char *seqname;
610
+  char *text;
611
+  int   seqidx;
612
+  int   len;
613
+
614
+  s = buf;
615
+  if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
616
+  if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; 
617
+  
618
+  /* seq usually follows another seq; guess msa->lastidx +1 */
619
+  seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
620
+  msa->lastidx = seqidx;
621
+
622
+  msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);
623
+  return 1;
624
+}
625
+
626
+
627
+