Browse code

Multiple fixes; final commit before release; version number bumped to 0.99.6

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102514 bc3139a8-67e5-0310-9ffc-ced21a209358

Ulrich Bodenhofer authored on 15/04/2015 13:20:09
Showing 1 changed files
... ...
@@ -17,6 +17,7 @@
17 17
 #include "ssi.h"
18 18
 #ifdef CLUSTALO
19 19
 #include <limits.h>
20
+#include <inttypes.h>
20 21
 #endif
21 22
 
22 23
 static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */
... ...
@@ -747,9 +748,9 @@ SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh,
747 748
 	      (unsigned long) L);
748 749
     } else {
749 750
 #ifdef CLUSTALO
750
-        fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", 
751
+        fprintf(g->ptmp, "%s\t%d\t%" PRIu64 "\t%" PRIu64 "\t%lu\n",
751 752
                 key, fh, (unsigned long long)r_off->off.i64, 
752
-                d_off == NULL? 0 : (unsigned long long) d_off->off.i64, 
753
+                d_off == NULL ? 0 : (unsigned long long) d_off->off.i64,
753 754
 	      (unsigned long) L);
754 755
 #else
755 756
         fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", 
... ...
@@ -1403,7 +1404,7 @@ activate_external_sort(SSIINDEX *g)
1403 1404
 	      (unsigned long) g->pkeys[i].d_off.off.i32, 
1404 1405
 	      (unsigned long) g->pkeys[i].len);
1405 1406
     } else {
1406
-      fprintf(g->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n", 
1407
+      fprintf(g->ptmp, "%s\t%u\t%" PRIu64 "\t%" PRIu64 "\t%lu\n",
1407 1408
 	      g->pkeys[i].key, g->pkeys[i].fnum,
1408 1409
 	      (unsigned long long) g->pkeys[i].r_off.off.i64, 
1409 1410
 	      (unsigned long long) g->pkeys[i].d_off.off.i64, 
Browse code

add package to the repository

msa


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358

Sonali Arora authored on 10/04/2015 00:12:33
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,1537 @@
1
+/*****************************************************************
2
+ * SQUID - a library of functions for biological sequence analysis
3
+ * Copyright (C) 1992-2002 Washington University School of Medicine
4
+ * 
5
+ *     This source code is freely distributed under the terms of the
6
+ *     GNU General Public License. See the files COPYRIGHT and LICENSE
7
+ *     for details.
8
+ *****************************************************************/
9
+
10
+#include <stdio.h>
11
+#include <stdlib.h>
12
+#include <string.h>
13
+#include <sys/stat.h>
14
+#include <sys/types.h>
15
+#include <unistd.h>
16
+#include "squid.h"
17
+#include "ssi.h"
18
+#ifdef CLUSTALO
19
+#include <limits.h>
20
+#endif
21
+
22
+static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0 magic number: ASCII "ssi1" + 0x80808080 (high bit set on every byte) */
23
+static sqd_uint32 v20swap  = 0xb1e9f3f3; /* v20magic with its four bytes in reverse order; load_indexfile() accepts either value */
24
+
25
+static int read_i16(FILE *fp, sqd_uint16 *ret_result);
26
+static int read_i32(FILE *fp, sqd_uint32 *ret_result);
27
+static int read_i64(FILE *fp, sqd_uint64 *ret_result);
28
+static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset);
29
+static int write_i16(FILE *fp, sqd_uint16 n);
30
+static int write_i32(FILE *fp, sqd_uint32 n);
31
+static int write_i64(FILE *fp, sqd_uint64 n);
32
+static int write_offset(FILE *fp, SSIOFFSET *offset);
33
+static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, 
34
+			 sqd_uint32 recsize, sqd_uint32 maxidx);
35
+static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len,
36
+			      sqd_uint32 n);
37
+static void clear_ssifile(SSIFILE *sfp);
38
+static sqd_uint64 current_index_size(SSIINDEX *g);
39
+static int        activate_external_sort(SSIINDEX *g);
40
+static int        load_indexfile(SSIFILE *sfp);
41
+static int        parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey);
42
+static int        parse_skey_info(char *buf, struct ssiskey_s *skey);
43
+
44
+/* Function: SSIOpen()
45
+ * Date:     SRE, Sun Dec 31 12:40:03 2000 [St. Louis]
46
+ *
47
+ * Purpose:  Opens the SSI index file {filename} and returns
48
+ *           a SSIFILE * stream thru {ret_sfp}.
49
+ *           The caller must eventually close this stream using
50
+ *           SSIClose(). More than one index file can be open
51
+ *           at once.
52
+ *
53
+ * Args:     filename - full path to a SSI index file
54
+ *
55
+ * Returns:  Returns 0 on success, nonzero on failure.
56
+ */
57
+int
58
+SSIOpen(char *filename, SSIFILE **ret_sfp)
59
+{
60
+  SSIFILE  *sfp = NULL;
61
+  int       status;
62
+  if ((sfp = malloc(sizeof(SSIFILE))) == NULL)   return SSI_ERR_MALLOC;
63
+  if ((sfp->fp = fopen(filename, "rb")) == NULL) {
64
+    free(sfp);
65
+    return SSI_ERR_NOFILE;    
66
+  }
67
+  status = load_indexfile(sfp);
68
+  *ret_sfp = sfp;
69
+  return status;
70
+}
71
+/* load_indexfile(): given a SSIFILE structure with an open and positioned 
72
+ *    stream (fp) -- but no other data loaded -- read the next SSIFILE
73
+ *    in from disk. We use this routine without its SSIOpen() wrapper
74
+ *    as part of the external mergesort when creating large indices.
75
+ */
76
+static int
77
+load_indexfile(SSIFILE *sfp)
78
+{
79
+  sqd_uint32   magic;
80
+  sqd_uint16   i;		/* counter over files */
81
+  int          status;		/* overall return status if an error is thrown */
82
+
83
+  status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */
84
+
85
+  sfp->filename   = NULL;
86
+  sfp->fileformat = NULL;
87
+  sfp->fileflags  = NULL;
88
+  sfp->bpl        = NULL;
89
+  sfp->rpl        = NULL;
90
+  sfp->nfiles     = 0;          
91
+  if (! read_i32(sfp->fp, &magic))               {status = SSI_ERR_BADMAGIC;  goto FAILURE; }
92
+  if (magic != v20magic && magic != v20swap)     {status = SSI_ERR_BADMAGIC;  goto FAILURE; }
93
+  if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE; 
94
+
95
+  /* If we have 64-bit offsets, make sure we can deal with them.
96
+   */
97
+#ifndef HAS_64BIT_FILE_OFFSETS  
98
+  if ((sfp->flags & SSI_USE64_INDEX) ||
99
+      (sfp->flags & SSI_USE64))
100
+    { status = SSI_ERR_NO64BIT; goto FAILURE; }
101
+#endif
102
+
103
+  sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32;
104
+  sfp->smode = (sfp->flags & SSI_USE64) ?       SSI_OFFSET_I64 : SSI_OFFSET_I32;
105
+
106
+  if (! read_i16(sfp->fp, &(sfp->nfiles)))     goto FAILURE;
107
+  if (! read_i32(sfp->fp, &(sfp->nprimary)))   goto FAILURE;
108
+  if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE;
109
+  if (! read_i32(sfp->fp, &(sfp->flen)))       goto FAILURE;
110
+  if (! read_i32(sfp->fp, &(sfp->plen)))       goto FAILURE;
111
+  if (! read_i32(sfp->fp, &(sfp->slen)))       goto FAILURE;
112
+  if (! read_i32(sfp->fp, &(sfp->frecsize)))   goto FAILURE;
113
+  if (! read_i32(sfp->fp, &(sfp->precsize)))   goto FAILURE;
114
+  if (! read_i32(sfp->fp, &(sfp->srecsize)))   goto FAILURE;
115
+  
116
+  if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE;
117
+  if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE;
118
+  if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE;
119
+
120
+  /* Read the file information and keep it.
121
+   * We expect the number of files to be small, so reading it
122
+   * once should be advantageous overall. If SSI ever had to
123
+   * deal with large numbers of files, you'd probably want to
124
+   * read file information on demand.
125
+   */
126
+  if (sfp->nfiles == 0)                                                   goto FAILURE;
127
+  if ((sfp->filename=malloc(sizeof(char *)    *sfp->nfiles)) == NULL)   {status = SSI_ERR_MALLOC; goto FAILURE; }
128
+  for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL; 
129
+  if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
130
+  if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
131
+  if ((sfp->bpl     =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL)   {status = SSI_ERR_MALLOC; goto FAILURE; }
132
+  if ((sfp->rpl     =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL)   {status = SSI_ERR_MALLOC; goto FAILURE; }
133
+
134
+  for (i = 0; i < sfp->nfiles; i++) 
135
+    {
136
+      /* We have to explicitly position, because header and file 
137
+       * records may expand in the future; frecsize and foffset 
138
+       * give us forwards compatibility. 
139
+       */ 
140
+      if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0)  goto FAILURE;
141
+      if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL)        {status = SSI_ERR_MALLOC; goto FAILURE; }
142
+      if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE;
143
+      if (! read_i32(sfp->fp, &(sfp->fileformat[i])))                             goto FAILURE;
144
+      if (! read_i32(sfp->fp, &(sfp->fileflags[i])))                              goto FAILURE;
145
+      if (! read_i32(sfp->fp, &(sfp->bpl[i])))                                    goto FAILURE;
146
+      if (! read_i32(sfp->fp, &(sfp->rpl[i])))                                    goto FAILURE;
147
+    }
148
+  
149
+  /* Success. Return 0.
150
+   */
151
+  return 0;			
152
+
153
+ FAILURE:
154
+  /* Failure: free the damaged structure, return status code.
155
+   */
156
+  SSIClose(sfp);
157
+  return status;
158
+}
159
+
160
+
161
+
162
+/* Function: SSIGetOffsetByName()
163
+ * Date:     SRE, Sun Dec 31 13:55:31 2000 [St. Louis]
164
+ *
165
+ * Purpose:  Looks up the string {key} in the open index {sfp}.
166
+ *           {key} can be either a primary or secondary key. If {key}
167
+ *           is found, {*ret_fh} contains a unique handle on
168
+ *           the file that contains {key} (suitable for an SSIFileInfo()
169
+ *           call, or for comparison to the handle of the last file
170
+ *           that was opened for retrieval), and {offset} is filled 
171
+ *           in with the offset in that file.
172
+ *           
173
+ * Args:     sfp         - open index file
174
+ *           key         - string to search for
175
+ *           ret_fh      - RETURN: handle on file that key is in
176
+ *           ret_offset  - RETURN: offset of the start of that key's record
177
+ *
178
+ * Returns:  0 on success.
179
+ *           non-zero on error.
180
+ */
181
+int
182
+SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh,
183
+		   SSIOFFSET *ret_offset)
184
+{
185
+  int         status;
186
+  sqd_uint16  fnum;
187
+
188
+  /* Look in the primary keys.
189
+   */
190
+  status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize,
191
+			 sfp->nprimary);
192
+  if (status == 0) {		
193
+    /* We found it as a primary key; get our data & return.
194
+     */
195
+    if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA;
196
+    *ret_fh = (int) fnum;
197
+    if (! read_offset(sfp->fp, sfp->smode, ret_offset))  return SSI_ERR_NODATA;
198
+
199
+    return 0;	/* success! (we don't need the other key data) */
200
+  } else if (status == SSI_ERR_NO_SUCH_KEY) {
201
+    /* Not in the primary keys? OK, try the secondary keys.
202
+     */
203
+    if (sfp->nsecondary > 0) {
204
+      char *pkey;
205
+      status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize,
206
+			     sfp->nsecondary);
207
+      if (status != 0) return status;
208
+      if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC;
209
+      if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA;
210
+
211
+      status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset);
212
+      free(pkey);
213
+    }
214
+    return status;
215
+
216
+  } else return status;		
217
+  /*NOTREACHED*/
218
+}
219
+
220
+/* Function: SSIGetOffsetByNumber()
221
+ * Date:     SRE, Mon Jan  1 19:42:42 2001 [St. Louis]
222
+ *
223
+ * Purpose:  Looks up primary key #{n} in the open index {sfp}.
224
+ *           {n} ranges from 0..nprimary-1. When key #{n} 
225
+ *           is found, {*ret_fh} contains a unique 
226
+ *           handle on the file that contains {key} (suitable
227
+ *           for an SSIFileInfo() call, or for comparison to 
228
+ *           the handle of the last file that was opened for retrieval),
229
+ *           and {offset} is filled in with the offset in that file.
230
+ *           
231
+ * Args:     sfp        - open index file
232
+ *           n          - primary key number to retrieve.
233
+ *           ret_fh     - RETURN: handle on file that key is in
234
+ *           ret_offset - RETURN: offset of the start of that key's record
235
+ *
236
+ * Returns:  0 on success.
237
+ *           non-zero on error.
238
+ */
239
+int
240
+SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset)
241
+{
242
+  sqd_uint16 fnum;
243
+  char      *pkey;
244
+
245
+  if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY;
246
+  if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0) 
247
+    return SSI_ERR_SEEK_FAILED;
248
+
249
+  if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC;
250
+  if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA;
251
+  if (! read_i16(sfp->fp, &fnum))                      return SSI_ERR_NODATA;
252
+  if (! read_offset(sfp->fp, sfp->smode, ret_offset))  return SSI_ERR_NODATA;  
253
+  *ret_fh = fnum;
254
+  free(pkey);
255
+  return 0;
256
+}
257
+
258
+/* Function: SSIGetSubseqOffset()
259
+ * Date:     SRE, Mon Jan  1 19:49:31 2001 [St. Louis]
260
+ *
261
+ * Purpose:  Implements SSI_FAST_SUBSEQ.
262
+ * 
263
+ *           Looks up a primary or secondary {key} in the open
264
+ *           index {sfp}. Asks for the nearest offset to a
265
+ *           subsequence starting at position {requested_start}
266
+ *           in the sequence (numbering the sequence 1..L). 
267
+ *           If {key} is found, on return, {ret_fh}
268
+ *           contains a unique handle on the file that contains 
269
+ *           {key} (suitable for an SSIFileInfo() call, or for 
270
+ *           comparison to the handle of the last file that was 
271
+ *           opened for retrieval); {record_offset} contains the
272
+ *           disk offset to the start of the record; {data_offset}
273
+ *           contains the disk offset either exactly at the requested
274
+ *           residue, or at the start of the line containing the
275
+ *           requested residue; {ret_actual_start} contains the 
276
+ *           coordinate (1..L) of the first valid residue at or
277
+ *           after {data_offset}. {ret_actual_start} is <= 
278
+ *           {requested_start}. 
279
+ *
280
+ * Args:     sfp             - open index file
281
+ *           key             - primary or secondary key to find
282
+ *           requested_start - residue we'd like to start at (1..L)
283
+ *           ret_fh          - RETURN: handle for file the key is in
284
+ *           record_offset   - RETURN: offset of entire record
285
+ *           data_offset     - RETURN: offset of subseq (see above)
286
+ *           ret_actual_start- RETURN: coord (1..L) of residue at data_offset
287
+ *
288
+ * Returns:  0 on success, non-zero on failure.
289
+ */
290
+int
291
+SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start,
292
+		    int *ret_fh, SSIOFFSET *record_offset,
293
+		    SSIOFFSET *data_offset, int *ret_actual_start)
294
+{
295
+  int        status;
296
+  sqd_uint32 len;
297
+  int        r, b, i, l;	/* tmp variables for "clarity", to match docs */
298
+  
299
+  /* Look up the key. Rely on the fact that SSIGetOffsetByName()
300
+   * leaves the index file positioned at the rest of the data for this key.
301
+   */
302
+  status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset);
303
+  if (status != 0) return status;
304
+
305
+  /* Check that we're allowed to do subseq lookup on that file.
306
+   */
307
+  if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ))
308
+    return SSI_ERR_NO_SUBSEQS;
309
+
310
+  /* Read the data we need for subseq lookup
311
+   */
312
+  if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA;
313
+  if (! read_i32(sfp->fp, &len))                         return SSI_ERR_NODATA;
314
+
315
+  /* Set up tmp variables for clarity of equations below,
316
+   * and to make them match documentation (ssi-format.tex).
317
+   */
318
+  r = sfp->rpl[*ret_fh];    /* residues per line */
319
+  b = sfp->bpl[*ret_fh];    /* bytes per line    */
320
+  i = requested_start;	    /* start position 1..L */
321
+  l = (i-1)/r;		    /* data line # (0..) that the residue is on */
322
+  if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS;
323
+  if (i < 0 || i > len) return SSI_ERR_RANGE;
324
+  
325
+  /* When b = r+1, there's nothing but sequence on each data line (and the \0),
326
+   * and we can find each residue precisely.
327
+   */
328
+  if (b == r+1) {
329
+    if (sfp->smode == SSI_OFFSET_I32) {
330
+      data_offset->mode    = SSI_OFFSET_I32;
331
+      data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r;
332
+    } else if (sfp->smode == SSI_OFFSET_I64) {
333
+      data_offset->mode    = SSI_OFFSET_I64;
334
+      data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r;
335
+    } 
336
+    *ret_actual_start = requested_start;
337
+  } else { 
338
+    /* else, there's other stuff on seq lines, so the best
339
+     * we can do easily is to position at start of relevant line.
340
+     */
341
+    if (sfp->smode == SSI_OFFSET_I32) {
342
+      data_offset->mode    = SSI_OFFSET_I32;
343
+      data_offset->off.i32 = data_offset->off.i32 + l*b;
344
+    } else if (sfp->smode == SSI_OFFSET_I64) {
345
+      data_offset->mode    = SSI_OFFSET_I64;
346
+      data_offset->off.i64 = data_offset->off.i64 + l*b;
347
+    } 
348
+    /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */
349
+    *ret_actual_start = 1 + l*r;
350
+  }
351
+  return 0;
352
+}
353
+
354
+/* Function: SSISetFilePosition()
355
+ * Date:     SRE, Tue Jan  2 09:13:46 2001 [St. Louis]
356
+ *
357
+ * Purpose:  Uses {offset} to sets the file position for {fp}, usually an
358
+ *           open sequence file, relative to the start of the file.
359
+ *           Hides the details of system-dependent shenanigans necessary for
360
+ *           file positioning in large (>2 GB) files. 
361
+ *           
362
+ *           Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit
363
+ *           offsets and <2 GB files.
364
+ *           
365
+ *           Warning: if all else fails, in desperation, it will try to
366
+ *           use fsetpos(). This requires making assumptions about fpos_t
367
+ *           that may be unwarranted... assumptions that ANSI C prohibits
368
+ *           me from making... though I believe the ./configure
369
+ *           script robustly tests whether I can play with fpos_t like this.
370
+ *
371
+ * Args:     fp      - file to position.
372
+ *           offset  - SSI offset relative to file start.
373
+ *                 
374
+ * Returns:  0 on success, nonzero on error.
375
+ */
376
+int
377
+SSISetFilePosition(FILE *fp, SSIOFFSET *offset)
378
+{
379
+  if (offset->mode == SSI_OFFSET_I32) {
380
+    if (fseek(fp, offset->off.i32, SEEK_SET) != 0)       return SSI_ERR_SEEK_FAILED;
381
+  }
382
+#ifndef HAS_64BIT_FILE_OFFSETS
383
+  else return SSI_ERR_NO64BIT;
384
+#elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8
385
+  else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0)   return SSI_ERR_SEEK_FAILED;
386
+#elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8
387
+  else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
388
+#elif defined HAVE_FSEEK64
389
+  else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0)  return SSI_ERR_SEEK_FAILED;
390
+#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
391
+  else if (fsetpos(fp, &(offset->off.i64)) != 0)         return SSI_ERR_SEEK_FAILED;
392
+#endif
393
+  return 0;
394
+}
395
+
396
+
397
+/* Function: SSIFileInfo()
398
+ * Date:     SRE, Tue Jan  2 10:31:01 2001 [St. Louis]
399
+ *
400
+ * Purpose:  Given a file number {fh} in an open index file
401
+ *           {sfp}, retrieve file name {ret_filename} and
402
+ *           the file format {ret_format}. 
403
+ *           
404
+ *           {ret_filename} is a pointer to a string maintained
405
+ *           internally by {sfp}. It should not be free'd; 
406
+ *           SSIClose(sfp) takes care of it.
407
+ *
408
+ * Args:     sfp          - open index file
409
+ *           fh           - handle on file to look up
410
+ *           ret_filename - RETURN: name of file n
411
+ *           ret_format   - RETURN: format of file n
412
+ *
413
+ * Returns:  0 on success, nonzero on failure.
414
+ */
415
+int
416
+SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format)
417
+{
418
+  if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG;
419
+  *ret_filename = sfp->filename[fh];
420
+  *ret_format   = sfp->fileformat[fh];
421
+  return 0;
422
+}
423
+
424
+/* Function: SSIClose()
425
+ * Date:     SRE, Sun Dec 31 14:56:37 2000 [St. Louis]
426
+ *
427
+ * Purpose:  Close an open {SSIFILE *}.
428
+ *
429
+ * Args:     sfp - index file to close.
430
+ *
431
+ * Returns:  (void)
432
+ */
433
+void
434
+SSIClose(SSIFILE *sfp) 
435
+{
436
+  if (sfp != NULL) {
437
+    clear_ssifile(sfp);
438
+    if (sfp->fp       != NULL) fclose(sfp->fp);
439
+    free(sfp);
440
+  }
441
+}  
442
+/* clear_ssifile(): free the innards of SSIFILE, without 
443
+ * destroying the structure or closing the stream.
444
+ */
445
+static void
446
+clear_ssifile(SSIFILE *sfp)
447
+{
448
+  int i;
449
+
450
+  if (sfp->filename != NULL) {
451
+    for (i = 0; i < sfp->nfiles; i++) 
452
+      if (sfp->filename[i] != NULL) free(sfp->filename[i]);
453
+    free(sfp->filename);
454
+  }
455
+  if (sfp->fileformat   != NULL) free(sfp->fileformat);
456
+  if (sfp->fileflags    != NULL) free(sfp->fileflags);
457
+  if (sfp->bpl          != NULL) free(sfp->bpl);
458
+  if (sfp->rpl          != NULL) free(sfp->rpl);
459
+}
460
+  
461
+
462
+/* Function: SSIRecommendMode()
463
+ * Date:     SRE, Fri Feb 16 08:23:47 2001 [St. Louis]
464
+ *
465
+ * Purpose:  Examines the file and determines whether it should be
466
+ *           indexed with large file support or not; returns 
467
+ *           SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large
468
+ *           files, or -1 on failure.
469
+ *
470
+ * Args:     file - name of file to check for size
471
+ *
472
+ * Returns:  -1 on failure (including case where file is too big)
473
+ *           SSI_OFFSET_I32 for most files (<= 2^31-1 bytes)
474
+ *           SSI_OFFSET_I64 for large files (> 2^31-1 bytes)
475
+ */
476
+int
477
+SSIRecommendMode(char *file)
478
+{
479
+#if HAVE_STAT64
480
+  struct stat64 s1;
481
+  if (stat64(file, &s1) == 0) {
482
+    if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32;
483
+    else                           return SSI_OFFSET_I64;
484
+  }
485
+#else 
486
+  struct stat s2;
487
+  if (stat(file, &s2) == 0) {
488
+    if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32;
489
+    else                           return SSI_OFFSET_I64;
490
+  }
491
+#endif
492
+  return -1;
493
+}
494
+ 
495
+
496
+/* Function: SSICreateIndex()
497
+ * Date:     SRE, Tue Jan  2 11:23:25 2001 [St. Louis]
498
+ *
499
+ * Purpose:  Creates and initializes a SSI index structure. 
500
+ *           Sequence file offset type is specified by {mode}.
501
+ *
502
+ * Args:     mode    - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode.
503
+ *
504
+ * Returns:  ptr to new index structure, or NULL on failure.
505
+ *           Caller is responsible for free'ing the returned
506
+ *           structure with SSIFreeIndex().
507
+ */
508
+SSIINDEX *
509
+SSICreateIndex(int mode)
510
+{
511
+  SSIINDEX *g;
512
+
513
+  g = NULL;
514
+  if ((g = malloc(sizeof(SSIINDEX))) == NULL)                               goto FAILURE;
515
+  g->smode    = mode;
516
+  g->imode    = SSI_OFFSET_I32;	/* index always starts as 32-bit; may get upgraded later */
517
+  g->external = FALSE;
518
+  g->max_ram  = SSI_MAXRAM;
519
+
520
+#ifndef HAS_64BIT_FILE_OFFSETS
521
+  if (mode == SSI_OFFSET_I64) 
522
+    Die("\
523
+Can't create a 64-bit SSI index on this system, sorry;\n\
524
+I don't have 64-bit file offset functions available.\n");
525
+#endif
526
+
527
+  g->filenames  = NULL;
528
+  g->fileformat = NULL;
529
+  g->bpl        = NULL;
530
+  g->rpl        = NULL;
531
+  g->flen       = 0;
532
+  g->nfiles     = 0;
533
+
534
+  g->pkeys         = NULL;
535
+  g->plen          = 0;
536
+  g->nprimary      = 0;
537
+  g->ptmpfile      = "tmp.ssi.1"; /* hardcoded, for now. */
538
+  g->ptmp          = NULL;
539
+  
540
+  g->skeys         = NULL;
541
+  g->slen          = 0;
542
+  g->nsecondary    = 0;
543
+  g->stmpfile      = "tmp.ssi.2"; /* hardcoded, for now. */
544
+  g->stmp          = NULL;
545
+
546
+  /* All mallocs must go after NULL initializations, because of the cleanup strategy;
547
+   * we'll try to free anything non-NULL if a malloc fails.
548
+   */
549
+  if ((g->filenames = malloc(sizeof(char *)     * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
550
+  if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; 
551
+  if ((g->bpl       = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; 
552
+  if ((g->rpl       = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; 
553
+  
554
+  if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL)  goto FAILURE;
555
+  if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL)  goto FAILURE;
556
+
557
+  return g;
558
+
559
+ FAILURE:
560
+  SSIFreeIndex(g);		/* free the damaged structure */
561
+  return NULL;
562
+}
563
+
564
+/* Function: SSIGetFilePosition()
565
+ * Date:     SRE, Tue Jan  2 09:59:26 2001 [St. Louis]
566
+ *
567
+ * Purpose:  Fills {ret_offset} with the current disk
568
+ *           offset of {fp}, relative to the start of the file. 
569
+ *           {mode} is set to either SSI_OFFSET_I32 or 
570
+ *           SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps
571
+ *           a call to ftell(); otherwise deals with system-dependent
572
+ *           details of 64-bit file offsets.
573
+ *
574
+ * Args:     fp         - open stream
575
+ *           mode       - SSI_OFFSET_I32 or SSI_OFFSET_I64
576
+ *           ret_offset - RETURN: file position       
577
+ *
578
+ * Returns:  0 on success. nonzero on error.
579
+ */
580
+int 
581
+SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset)
582
+{
583
+  if (mode == SSI_OFFSET_I32) 
584
+    {
585
+      ret_offset->mode    = SSI_OFFSET_I32;
586
+      ret_offset->off.i32 = ftell(fp);
587
+      if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED;
588
+    }
589
+  else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */
590
+  else {
591
+    ret_offset->mode    = SSI_OFFSET_I64;
592
+#ifndef HAS_64BIT_FILE_OFFSETS
593
+    return SSI_ERR_NO64BIT;
594
+#elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8
595
+    if ((ret_offset->off.i64 = ftello(fp)) == -1)   return SSI_ERR_TELL_FAILED;
596
+#elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8
597
+    if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED;
598
+#elif defined HAVE_FTELL64
599
+    if ((ret_offset->off.i64 = ftell64(fp)) == -1)  return SSI_ERR_TELL_FAILED;
600
+#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
601
+    if (fgetpos(fp, &(ret_offset->off.i64)) != 0)   return SSI_ERR_TELL_FAILED;
602
+#endif
603
+  }
604
+  return 0;
605
+}
606
+
607
+/* Function: SSIAddFileToIndex()
608
+ * Date:     SRE, Tue Jan  2 12:54:36 2001 [St. Louis]
609
+ *
610
+ * Purpose:  Adds the sequence file {filename}, which is known to 
611
+ *           be in format {fmt}, to the index {g}. Creates and returns
612
+ *           a unique filehandle {fh} for then associating primary keys
613
+ *           with this file using SSIAddPrimaryKeyToIndex().
614
+ *
615
+ * Args:     g         - active index
616
+ *           filename  - file to add 
617
+ *           fmt       - format code for this file (e.g. SQFILE_FASTA)
618
+ *           ret_fh    - RETURN: unique handle for this file
619
+ *
620
+ * Returns:  0 on success; nonzero on error.
621
+ */
622
+int
623
+SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh)
624
+{
625
+  int n;
626
+  
627
+  if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES;
628
+
629
+  n = strlen(filename);
630
+  if ((n+1) > g->flen) g->flen = n+1;
631
+
632
+  g->filenames[g->nfiles]  = FileTail(filename, FALSE);
633
+  g->fileformat[g->nfiles] = fmt;
634
+  g->bpl[g->nfiles]        = 0;
635
+  g->rpl[g->nfiles]        = 0;
636
+  *ret_fh                  = g->nfiles;   /* handle is simply = file number */
637
+  g->nfiles++;
638
+
639
+  if (g->nfiles % SSI_FILE_BLOCK == 0) {
640
+    g->filenames = realloc(g->filenames,  sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK));
641
+    if (g->filenames == NULL) return SSI_ERR_MALLOC;
642
+    g->fileformat= realloc(g->fileformat, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK));
643
+    if (g->fileformat == NULL) return SSI_ERR_MALLOC;
644
+    g->bpl       = realloc(g->bpl,        sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK));
645
+    if (g->bpl == NULL) return SSI_ERR_MALLOC;
646
+    g->rpl       = realloc(g->rpl,        sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK));
647
+    if (g->rpl == NULL) return SSI_ERR_MALLOC;
648
+  }
649
+  return 0;
650
+}
651
+
652
+
653
+/* Function: SSISetFileForSubseq()
654
+ * Date:     SRE, Tue Jan  9 10:02:05 2001 [St. Louis]
655
+ *
656
+ * Purpose:  Set SSI_FAST_SUBSEQ for the file indicated by
657
+ *           filehandle {fh} in the index {g}, setting
658
+ *           parameters {bpl} and {rpl} to the values given.
659
+ *           {bpl} is the number of bytes per sequence data line.
660
+ *           {rpl} is the number of residues per sequence data line. 
661
+ *           Caller must be sure that {bpl} and {rpl} do not change
662
+ *           on any line of any sequence record in the file
663
+ *           (except for the last data line of each record). If
664
+ *           this is not the case in this file, SSI_FAST_SUBSEQ
665
+ *           will not work, and this routine should not be
666
+ *           called.
667
+ *
668
+ * Args:     g    - the active index
669
+ *           fh   - handle for file to set SSI_FAST_SUBSEQ on
670
+ *           bpl  - bytes per data line
671
+ *           rpl  - residues per data line
672
+ *
673
+ * Returns:  0 on success; 1 on error.
674
+ */
675
+int
676
+SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl)
677
+{
678
+  if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG;
679
+  if (bpl <= 0 || rpl <= 0)      return SSI_ERR_BADARG;
680
+  g->bpl[fh] = bpl;
681
+  g->rpl[fh] = rpl;
682
+  return 0;
683
+}
684
+
685
+
686
+/* Function: SSIAddPrimaryKeyToIndex()
687
+ * Date:     SRE, Tue Jan  2 11:50:54 2001 [St. Louis]
688
+ *
689
+ * Purpose:  Put primary key {key} in the index {g}, while telling
690
+ *           the index this primary key is in the file associated
691
+ *           with filehandle {fh} (returned by a previous call
692
+ *           to SSIAddFileToIndex()), and its record starts at 
693
+ *           position {r_off} in the file.
694
+ *           
695
+ *           {d_off} and {L} are optional; they may be left unset
696
+ *           by passing NULL and 0, respectively. (If one is
697
+ *           provided, both must be provided.) If they are provided,
698
+ *           {d_off} gives the position of the first line of sequence
699
+ *           data in the record, and {L} gives the length of
700
+ *           the sequence in residues. They are used when 
701
+ *           SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ
702
+ *           is not set for the file, {d_off} and {L} will be
703
+ *           ignored by the index reading API even if they are stored
704
+ *           by the index writing API, so it doesn't hurt for the 
705
+ *           indexing program to provide them; typically they
706
+ *           won't know whether it's safe to set SSI_FAST_SUBSEQ
707
+ *           for the whole file until the whole file has been
708
+ *           read and every key has already been added to the index.
709
+ *           
710
+ * Args:     g      - active index
711
+ *           key    - primary key to add
712
+ *           fh     - handle on file that this key's in 
713
+ *           r_off  - offset to start of record
714
+ *           d_off  - offset to start of sequence data
715
+ *           L      - length of sequence, or 0
716
+ *
717
+ * Returns:  0 on success, nonzero on error.
718
+ */
719
+int
720
+SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh,
721
+			SSIOFFSET *r_off, SSIOFFSET *d_off, int L)
722
+{
723
+  int n;			/* a string length */
724
+  
725
+  if (fh >= SSI_MAXFILES)         return SSI_ERR_TOOMANY_FILES;
726
+  if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS;
727
+  if (L > 0 && d_off == NULL) abort(); /* need both. */
728
+
729
+  /* Before adding the key: check how big our index is.
730
+   * If it's getting too large, switch to external mode.
731
+   */
732
+  if (!g->external && current_index_size(g) >= g->max_ram) 
733
+    if (activate_external_sort(g) != 0)  return SSI_ERR_NOFILE;
734
+
735
+  /* Update maximum pkey length, if needed.
736
+   */
737
+  n = strlen(key);
738
+  if ((n+1) > g->plen) g->plen = n+1;
739
+
740
+  /* External mode? Simply append to disk...
741
+   */
742
+  if (g->external) {
743
+    if (g->smode == SSI_OFFSET_I32) {
744
+      fprintf(g->ptmp, "%s\t%d\t%lu\t%lu\t%lu\n", 
745
+	      key, fh, (unsigned long) r_off->off.i32, 
746
+	      (unsigned long) (d_off == NULL? 0 : d_off->off.i32),
747
+	      (unsigned long) L);
748
+    } else {
749
+#ifdef CLUSTALO
750
+        fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", 
751
+                key, fh, (unsigned long long)r_off->off.i64, 
752
+                d_off == NULL? 0 : (unsigned long long) d_off->off.i64, 
753
+	      (unsigned long) L);
754
+#else
755
+        fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", 
756
+	      key, fh, r_off->off.i64, 
757
+	      d_off == NULL? 0 : d_off->off.i64, 
758
+	      (unsigned long) L);
759
+#endif
760
+    }
761
+    g->nprimary++;
762
+    return 0;
763
+  }
764
+
765
+  /* Else: internal mode, keep keys in memory...
766
+   */
767
+  if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC;
768
+  g->pkeys[g->nprimary].fnum  = (sqd_uint16) fh;
769
+  g->pkeys[g->nprimary].r_off = *r_off;
770
+  if (d_off != NULL && L > 0) {
771
+    g->pkeys[g->nprimary].d_off = *d_off;
772
+    g->pkeys[g->nprimary].len   = L;
773
+  } else {
774
+	/* yeah, this looks stupid, but look: we have to give a valid
775
+           looking, non-NULL d_off of some sort, or writes will fail. 
776
+           It's going to be unused anyway. */
777
+    g->pkeys[g->nprimary].d_off = *r_off;
778
+    g->pkeys[g->nprimary].len   = 0;
779
+  }
780
+  g->nprimary++;
781
+
782
+  if (g->nprimary % SSI_KEY_BLOCK == 0) {
783
+    g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK));
784
+    if (g->pkeys == NULL) return SSI_ERR_MALLOC;
785
+  }
786
+  return 0;
787
+}
788
+
789
+
790
+/* Function: SSIAddSecondaryKeyToIndex()
791
+ * Date:     SRE, Tue Jan  2 12:44:40 2001 [St. Louis]
792
+ *
793
+ * Purpose:  Puts secondary key {key} in the index {g}, associating
794
+ *           it with primary key {pkey} that was previously
795
+ *           registered by SSIAddPrimaryKeyToIndex().
796
+ *
797
+ * Args:     g    - active index 
798
+ *           key  - secondary key to add             
799
+ *           pkey - primary key to associate this key with
800
+ *
801
+ * Returns:  0 on success, nonzero on failure.
802
+ */
803
+int
804
+SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey)
805
+{
806
+  int n;			/* a string length */
807
+  
808
+  if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS;
809
+
810
+  /* Before adding the key: check how big our index is.
811
+   * If it's getting too large, switch to external mode.
812
+   */
813
+  if (!g->external && current_index_size(g) >= g->max_ram) 
814
+    if (activate_external_sort(g) != 0)  return SSI_ERR_NOFILE;
815
+
816
+  /* Update maximum secondary key length, if necessary.
817
+   */
818
+  n = strlen(key);
819
+  if ((n+1) > g->slen) g->slen = n+1;
820
+
821
+  /* if external mode: write info to disk.
822
+   */
823
+  if (g->external) {
824
+    fprintf(g->stmp, "%s\t%s\n", key, pkey);
825
+    g->nsecondary++;
826
+    return 0;
827
+  }
828
+
829
+  /* else, internal mode... store info in memory.
830
+   */
831
+  if ((g->skeys[g->nsecondary].key  = sre_strdup(key, n))   == NULL) return SSI_ERR_MALLOC;
832
+  if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC;
833
+  g->nsecondary++;
834
+
835
+  if (g->nsecondary % SSI_KEY_BLOCK == 0) {
836
+    g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK));
837
+    if (g->skeys == NULL) return SSI_ERR_MALLOC;
838
+  }
839
+  return 0;
840
+}
841
+
842
+
843
+
844
+
845
+/* Function: SSIWriteIndex()
846
+ * Date:     SRE, Tue Jan  2 13:55:56 2001 [St. Louis]
847
+ *
848
+ * Purpose:  Writes complete index {g} in SSI format to a 
849
+ *           binary file {file}. Does all           
850
+ *           the overhead of sorting the primary and secondary keys, 
851
+ *           and maintaining the association of secondary keys
852
+ *           with primary keys during and after the sort.
853
+ *
854
+ * Args:     file  - file to write to
855
+ *           g     - index to sort & write out.      
856
+ *
857
+ * Returns:  0 on success, nonzero on error.
858
+ */
859
+/* needed for qsort() */
860
+static int 
861
+pkeysort(const void *k1, const void *k2)
862
+{
863
+  struct ssipkey_s *key1;
864
+  struct ssipkey_s *key2;