git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102514 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -17,6 +17,7 @@ |
17 | 17 |
#include "ssi.h" |
18 | 18 |
#ifdef CLUSTALO |
19 | 19 |
#include <limits.h> |
20 |
+#include <inttypes.h> |
|
20 | 21 |
#endif |
21 | 22 |
|
22 | 23 |
static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */ |
... | ... |
@@ -747,9 +748,9 @@ SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh, |
747 | 748 |
(unsigned long) L); |
748 | 749 |
} else { |
749 | 750 |
#ifdef CLUSTALO |
750 |
- fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", |
|
751 |
+ fprintf(g->ptmp, "%s\t%d\t%" PRIu64 "\t%" PRIu64 "\t%lu\n", |
|
751 | 752 |
key, fh, (unsigned long long)r_off->off.i64, |
752 |
- d_off == NULL? 0 : (unsigned long long) d_off->off.i64, |
|
753 |
+ d_off == NULL ? 0 : (unsigned long long) d_off->off.i64, |
|
753 | 754 |
(unsigned long) L); |
754 | 755 |
#else |
755 | 756 |
fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", |
... | ... |
@@ -1403,7 +1404,7 @@ activate_external_sort(SSIINDEX *g) |
1403 | 1404 |
(unsigned long) g->pkeys[i].d_off.off.i32, |
1404 | 1405 |
(unsigned long) g->pkeys[i].len); |
1405 | 1406 |
} else { |
1406 |
- fprintf(g->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n", |
|
1407 |
+ fprintf(g->ptmp, "%s\t%u\t%" PRIu64 "\t%" PRIu64 "\t%lu\n", |
|
1407 | 1408 |
g->pkeys[i].key, g->pkeys[i].fnum, |
1408 | 1409 |
(unsigned long long) g->pkeys[i].r_off.off.i64, |
1409 | 1410 |
(unsigned long long) g->pkeys[i].d_off.off.i64, |
msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1537 @@ |
1 |
+/***************************************************************** |
|
2 |
+ * SQUID - a library of functions for biological sequence analysis |
|
3 |
+ * Copyright (C) 1992-2002 Washington University School of Medicine |
|
4 |
+ * |
|
5 |
+ * This source code is freely distributed under the terms of the |
|
6 |
+ * GNU General Public License. See the files COPYRIGHT and LICENSE |
|
7 |
+ * for details. |
|
8 |
+ *****************************************************************/ |
|
9 |
+ |
|
10 |
+#include <stdio.h> |
|
11 |
+#include <stdlib.h> |
|
12 |
+#include <string.h> |
|
13 |
+#include <sys/stat.h> |
|
14 |
+#include <sys/types.h> |
|
15 |
+#include <unistd.h> |
|
16 |
+#include "squid.h" |
|
17 |
+#include "ssi.h" |
|
18 |
+#ifdef CLUSTALO |
|
19 |
+#include <limits.h> |
|
20 |
+#endif |
|
21 |
+ |
|
22 |
+static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */ |
|
23 |
+static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */ |
|
24 |
+ |
|
25 |
+static int read_i16(FILE *fp, sqd_uint16 *ret_result); |
|
26 |
+static int read_i32(FILE *fp, sqd_uint32 *ret_result); |
|
27 |
+static int read_i64(FILE *fp, sqd_uint64 *ret_result); |
|
28 |
+static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset); |
|
29 |
+static int write_i16(FILE *fp, sqd_uint16 n); |
|
30 |
+static int write_i32(FILE *fp, sqd_uint32 n); |
|
31 |
+static int write_i64(FILE *fp, sqd_uint64 n); |
|
32 |
+static int write_offset(FILE *fp, SSIOFFSET *offset); |
|
33 |
+static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, |
|
34 |
+ sqd_uint32 recsize, sqd_uint32 maxidx); |
|
35 |
+static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, |
|
36 |
+ sqd_uint32 n); |
|
37 |
+static void clear_ssifile(SSIFILE *sfp); |
|
38 |
+static sqd_uint64 current_index_size(SSIINDEX *g); |
|
39 |
+static int activate_external_sort(SSIINDEX *g); |
|
40 |
+static int load_indexfile(SSIFILE *sfp); |
|
41 |
+static int parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey); |
|
42 |
+static int parse_skey_info(char *buf, struct ssiskey_s *skey); |
|
43 |
+ |
|
44 |
+/* Function: SSIOpen() |
|
45 |
+ * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis] |
|
46 |
+ * |
|
47 |
+ * Purpose: Opens the SSI index file {filename} and returns |
|
48 |
+ * a SSIFILE * stream thru {ret_sfp}. |
|
49 |
+ * The caller must eventually close this stream using |
|
50 |
+ * SSIClose(). More than one index file can be open |
|
51 |
+ * at once. |
|
52 |
+ * |
|
53 |
+ * Args: filename - full path to a SSI index file |
|
54 |
+ * |
|
55 |
+ * Returns: Returns 0 on success, nonzero on failure. |
|
56 |
+ */ |
|
57 |
+int |
|
58 |
+SSIOpen(char *filename, SSIFILE **ret_sfp) |
|
59 |
+{ |
|
60 |
+ SSIFILE *sfp = NULL; |
|
61 |
+ int status; |
|
62 |
+ if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; |
|
63 |
+ if ((sfp->fp = fopen(filename, "rb")) == NULL) { |
|
64 |
+ free(sfp); |
|
65 |
+ return SSI_ERR_NOFILE; |
|
66 |
+ } |
|
67 |
+ status = load_indexfile(sfp); |
|
68 |
+ *ret_sfp = sfp; |
|
69 |
+ return status; |
|
70 |
+} |
|
71 |
+/* load_indexfile(): given a SSIFILE structure with an open and positioned |
|
72 |
+ * stream (fp) -- but no other data loaded -- read the next SSIFILE |
|
73 |
+ * in from disk. We use this routine without its SSIOpen() wrapper |
|
74 |
+ * as part of the external mergesort when creating large indices. |
|
75 |
+ */ |
|
76 |
+static int |
|
77 |
+load_indexfile(SSIFILE *sfp) |
|
78 |
+{ |
|
79 |
+ sqd_uint32 magic; |
|
80 |
+ sqd_uint16 i; /* counter over files */ |
|
81 |
+ int status; /* overall return status if an error is thrown */ |
|
82 |
+ |
|
83 |
+ status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */ |
|
84 |
+ |
|
85 |
+ sfp->filename = NULL; |
|
86 |
+ sfp->fileformat = NULL; |
|
87 |
+ sfp->fileflags = NULL; |
|
88 |
+ sfp->bpl = NULL; |
|
89 |
+ sfp->rpl = NULL; |
|
90 |
+ sfp->nfiles = 0; |
|
91 |
+ if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; } |
|
92 |
+ if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; } |
|
93 |
+ if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE; |
|
94 |
+ |
|
95 |
+ /* If we have 64-bit offsets, make sure we can deal with them. |
|
96 |
+ */ |
|
97 |
+#ifndef HAS_64BIT_FILE_OFFSETS |
|
98 |
+ if ((sfp->flags & SSI_USE64_INDEX) || |
|
99 |
+ (sfp->flags & SSI_USE64)) |
|
100 |
+ { status = SSI_ERR_NO64BIT; goto FAILURE; } |
|
101 |
+#endif |
|
102 |
+ |
|
103 |
+ sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; |
|
104 |
+ sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; |
|
105 |
+ |
|
106 |
+ if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE; |
|
107 |
+ if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE; |
|
108 |
+ if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE; |
|
109 |
+ if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE; |
|
110 |
+ if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE; |
|
111 |
+ if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE; |
|
112 |
+ if (! read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE; |
|
113 |
+ if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE; |
|
114 |
+ if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE; |
|
115 |
+ |
|
116 |
+ if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE; |
|
117 |
+ if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE; |
|
118 |
+ if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE; |
|
119 |
+ |
|
120 |
+ /* Read the file information and keep it. |
|
121 |
+ * We expect the number of files to be small, so reading it |
|
122 |
+ * once should be advantageous overall. If SSI ever had to |
|
123 |
+ * deal with large numbers of files, you'd probably want to |
|
124 |
+ * read file information on demand. |
|
125 |
+ */ |
|
126 |
+ if (sfp->nfiles == 0) goto FAILURE; |
|
127 |
+ if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } |
|
128 |
+ for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL; |
|
129 |
+ if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } |
|
130 |
+ if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } |
|
131 |
+ if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } |
|
132 |
+ if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } |
|
133 |
+ |
|
134 |
+ for (i = 0; i < sfp->nfiles; i++) |
|
135 |
+ { |
|
136 |
+ /* We have to explicitly position, because header and file |
|
137 |
+ * records may expand in the future; frecsize and foffset |
|
138 |
+ * give us forwards compatibility. |
|
139 |
+ */ |
|
140 |
+ if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE; |
|
141 |
+ if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } |
|
142 |
+ if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE; |
|
143 |
+ if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE; |
|
144 |
+ if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE; |
|
145 |
+ if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE; |
|
146 |
+ if (! read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE; |
|
147 |
+ } |
|
148 |
+ |
|
149 |
+ /* Success. Return 0. |
|
150 |
+ */ |
|
151 |
+ return 0; |
|
152 |
+ |
|
153 |
+ FAILURE: |
|
154 |
+ /* Failure: free the damaged structure, return status code. |
|
155 |
+ */ |
|
156 |
+ SSIClose(sfp); |
|
157 |
+ return status; |
|
158 |
+} |
|
159 |
+ |
|
160 |
+ |
|
161 |
+ |
|
162 |
+/* Function: SSIGetOffsetByName() |
|
163 |
+ * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis] |
|
164 |
+ * |
|
165 |
+ * Purpose: Looks up the string {key} in the open index {sfp}. |
|
166 |
+ * {key} can be either a primary or secondary key. If {key} |
|
167 |
+ * is found, {*ret_fh} contains a unique handle on |
|
168 |
+ * the file that contains {key} (suitable for an SSIFileInfo() |
|
169 |
+ * call, or for comparison to the handle of the last file |
|
170 |
+ * that was opened for retrieval), and {offset} is filled |
|
171 |
+ * in with the offset in that file. |
|
172 |
+ * |
|
173 |
+ * Args: sfp - open index file |
|
174 |
+ * key - string to search for |
|
175 |
+ * ret_fh - RETURN: handle on file that key is in |
|
176 |
+ * ret_offset - RETURN: offset of the start of that key's record |
|
177 |
+ * |
|
178 |
+ * Returns: 0 on success. |
|
179 |
+ * non-zero on error. |
|
180 |
+ */ |
|
181 |
+int |
|
182 |
+SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh, |
|
183 |
+ SSIOFFSET *ret_offset) |
|
184 |
+{ |
|
185 |
+ int status; |
|
186 |
+ sqd_uint16 fnum; |
|
187 |
+ |
|
188 |
+ /* Look in the primary keys. |
|
189 |
+ */ |
|
190 |
+ status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize, |
|
191 |
+ sfp->nprimary); |
|
192 |
+ if (status == 0) { |
|
193 |
+ /* We found it as a primary key; get our data & return. |
|
194 |
+ */ |
|
195 |
+ if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; |
|
196 |
+ *ret_fh = (int) fnum; |
|
197 |
+ if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; |
|
198 |
+ |
|
199 |
+ return 0; /* success! (we don't need the other key data) */ |
|
200 |
+ } else if (status == SSI_ERR_NO_SUCH_KEY) { |
|
201 |
+ /* Not in the primary keys? OK, try the secondary keys. |
|
202 |
+ */ |
|
203 |
+ if (sfp->nsecondary > 0) { |
|
204 |
+ char *pkey; |
|
205 |
+ status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize, |
|
206 |
+ sfp->nsecondary); |
|
207 |
+ if (status != 0) return status; |
|
208 |
+ if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; |
|
209 |
+ if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; |
|
210 |
+ |
|
211 |
+ status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset); |
|
212 |
+ free(pkey); |
|
213 |
+ } |
|
214 |
+ return status; |
|
215 |
+ |
|
216 |
+ } else return status; |
|
217 |
+ /*NOTREACHED*/ |
|
218 |
+} |
|
219 |
+ |
|
220 |
+/* Function: SSIGetOffsetByNumber() |
|
221 |
+ * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis] |
|
222 |
+ * |
|
223 |
+ * Purpose: Looks up primary key #{n} in the open index {sfp}. |
|
224 |
+ * {n} ranges from 0..nprimary-1. When key #{n} |
|
225 |
+ * is found, {*ret_fh} contains a unique |
|
226 |
+ * handle on the file that contains {key} (suitable |
|
227 |
+ * for an SSIFileInfo() call, or for comparison to |
|
228 |
+ * the handle of the last file that was opened for retrieval), |
|
229 |
+ * and {offset} is filled in with the offset in that file. |
|
230 |
+ * |
|
231 |
+ * Args: sfp - open index file |
|
232 |
+ * n - primary key number to retrieve. |
|
233 |
+ * ret_fh - RETURN: handle on file that key is in |
|
234 |
+ * ret_offset - RETURN: offset of the start of that key's record |
|
235 |
+ * |
|
236 |
+ * Returns: 0 on success. |
|
237 |
+ * non-zero on error. |
|
238 |
+ */ |
|
239 |
+int |
|
240 |
+SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset) |
|
241 |
+{ |
|
242 |
+ sqd_uint16 fnum; |
|
243 |
+ char *pkey; |
|
244 |
+ |
|
245 |
+ if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY; |
|
246 |
+ if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0) |
|
247 |
+ return SSI_ERR_SEEK_FAILED; |
|
248 |
+ |
|
249 |
+ if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; |
|
250 |
+ if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; |
|
251 |
+ if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; |
|
252 |
+ if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; |
|
253 |
+ *ret_fh = fnum; |
|
254 |
+ free(pkey); |
|
255 |
+ return 0; |
|
256 |
+} |
|
257 |
+ |
|
258 |
+/* Function: SSIGetSubseqOffset() |
|
259 |
+ * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis] |
|
260 |
+ * |
|
261 |
+ * Purpose: Implements SSI_FAST_SUBSEQ. |
|
262 |
+ * |
|
263 |
+ * Looks up a primary or secondary {key} in the open |
|
264 |
+ * index {sfp}. Asks for the nearest offset to a |
|
265 |
+ * subsequence starting at position {requested_start} |
|
266 |
+ * in the sequence (numbering the sequence 1..L). |
|
267 |
+ * If {key} is found, on return, {ret_fh} |
|
268 |
+ * contains a unique handle on the file that contains |
|
269 |
+ * {key} (suitable for an SSIFileInfo() call, or for |
|
270 |
+ * comparison to the handle of the last file that was |
|
271 |
+ * opened for retrieval); {record_offset} contains the |
|
272 |
+ * disk offset to the start of the record; {data_offset} |
|
273 |
+ * contains the disk offset either exactly at the requested |
|
274 |
+ * residue, or at the start of the line containing the |
|
275 |
+ * requested residue; {ret_actual_start} contains the |
|
276 |
+ * coordinate (1..L) of the first valid residue at or |
|
277 |
+ * after {data_offset}. {ret_actual_start} is <= |
|
278 |
+ * {requested_start}. |
|
279 |
+ * |
|
280 |
+ * Args: sfp - open index file |
|
281 |
+ * key - primary or secondary key to find |
|
282 |
+ * requested_start - residue we'd like to start at (1..L) |
|
283 |
+ * ret_fh - RETURN: handle for file the key is in |
|
284 |
+ * record_offset - RETURN: offset of entire record |
|
285 |
+ * data_offset - RETURN: offset of subseq (see above) |
|
286 |
+ * ret_actual_start- RETURN: coord (1..L) of residue at data_offset |
|
287 |
+ * |
|
288 |
+ * Returns: 0 on success, non-zero on failure. |
|
289 |
+ */ |
|
290 |
+int |
|
291 |
+SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start, |
|
292 |
+ int *ret_fh, SSIOFFSET *record_offset, |
|
293 |
+ SSIOFFSET *data_offset, int *ret_actual_start) |
|
294 |
+{ |
|
295 |
+ int status; |
|
296 |
+ sqd_uint32 len; |
|
297 |
+ int r, b, i, l; /* tmp variables for "clarity", to match docs */ |
|
298 |
+ |
|
299 |
+ /* Look up the key. Rely on the fact that SSIGetOffsetByName() |
|
300 |
+ * leaves the index file positioned at the rest of the data for this key. |
|
301 |
+ */ |
|
302 |
+ status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset); |
|
303 |
+ if (status != 0) return status; |
|
304 |
+ |
|
305 |
+ /* Check that we're allowed to do subseq lookup on that file. |
|
306 |
+ */ |
|
307 |
+ if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ)) |
|
308 |
+ return SSI_ERR_NO_SUBSEQS; |
|
309 |
+ |
|
310 |
+ /* Read the data we need for subseq lookup |
|
311 |
+ */ |
|
312 |
+ if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA; |
|
313 |
+ if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA; |
|
314 |
+ |
|
315 |
+ /* Set up tmp variables for clarity of equations below, |
|
316 |
+ * and to make them match documentation (ssi-format.tex). |
|
317 |
+ */ |
|
318 |
+ r = sfp->rpl[*ret_fh]; /* residues per line */ |
|
319 |
+ b = sfp->bpl[*ret_fh]; /* bytes per line */ |
|
320 |
+ i = requested_start; /* start position 1..L */ |
|
321 |
+ l = (i-1)/r; /* data line # (0..) that the residue is on */ |
|
322 |
+ if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS; |
|
323 |
+ if (i < 0 || i > len) return SSI_ERR_RANGE; |
|
324 |
+ |
|
325 |
+ /* When b = r+1, there's nothing but sequence on each data line (and the \0), |
|
326 |
+ * and we can find each residue precisely. |
|
327 |
+ */ |
|
328 |
+ if (b == r+1) { |
|
329 |
+ if (sfp->smode == SSI_OFFSET_I32) { |
|
330 |
+ data_offset->mode = SSI_OFFSET_I32; |
|
331 |
+ data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r; |
|
332 |
+ } else if (sfp->smode == SSI_OFFSET_I64) { |
|
333 |
+ data_offset->mode = SSI_OFFSET_I64; |
|
334 |
+ data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r; |
|
335 |
+ } |
|
336 |
+ *ret_actual_start = requested_start; |
|
337 |
+ } else { |
|
338 |
+ /* else, there's other stuff on seq lines, so the best |
|
339 |
+ * we can do easily is to position at start of relevant line. |
|
340 |
+ */ |
|
341 |
+ if (sfp->smode == SSI_OFFSET_I32) { |
|
342 |
+ data_offset->mode = SSI_OFFSET_I32; |
|
343 |
+ data_offset->off.i32 = data_offset->off.i32 + l*b; |
|
344 |
+ } else if (sfp->smode == SSI_OFFSET_I64) { |
|
345 |
+ data_offset->mode = SSI_OFFSET_I64; |
|
346 |
+ data_offset->off.i64 = data_offset->off.i64 + l*b; |
|
347 |
+ } |
|
348 |
+ /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */ |
|
349 |
+ *ret_actual_start = 1 + l*r; |
|
350 |
+ } |
|
351 |
+ return 0; |
|
352 |
+} |
|
353 |
+ |
|
354 |
+/* Function: SSISetFilePosition() |
|
355 |
+ * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis] |
|
356 |
+ * |
|
357 |
+ * Purpose: Uses {offset} to sets the file position for {fp}, usually an |
|
358 |
+ * open sequence file, relative to the start of the file. |
|
359 |
+ * Hides the details of system-dependent shenanigans necessary for |
|
360 |
+ * file positioning in large (>2 GB) files. |
|
361 |
+ * |
|
362 |
+ * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit |
|
363 |
+ * offsets and <2 GB files. |
|
364 |
+ * |
|
365 |
+ * Warning: if all else fails, in desperation, it will try to |
|
366 |
+ * use fsetpos(). This requires making assumptions about fpos_t |
|
367 |
+ * that may be unwarranted... assumptions that ANSI C prohibits |
|
368 |
+ * me from making... though I believe the ./configure |
|
369 |
+ * script robustly tests whether I can play with fpos_t like this. |
|
370 |
+ * |
|
371 |
+ * Args: fp - file to position. |
|
372 |
+ * offset - SSI offset relative to file start. |
|
373 |
+ * |
|
374 |
+ * Returns: 0 on success, nonzero on error. |
|
375 |
+ */ |
|
376 |
+int |
|
377 |
+SSISetFilePosition(FILE *fp, SSIOFFSET *offset) |
|
378 |
+{ |
|
379 |
+ if (offset->mode == SSI_OFFSET_I32) { |
|
380 |
+ if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; |
|
381 |
+ } |
|
382 |
+#ifndef HAS_64BIT_FILE_OFFSETS |
|
383 |
+ else return SSI_ERR_NO64BIT; |
|
384 |
+#elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8 |
|
385 |
+ else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; |
|
386 |
+#elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8 |
|
387 |
+ else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; |
|
388 |
+#elif defined HAVE_FSEEK64 |
|
389 |
+ else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; |
|
390 |
+#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 |
|
391 |
+ else if (fsetpos(fp, &(offset->off.i64)) != 0) return SSI_ERR_SEEK_FAILED; |
|
392 |
+#endif |
|
393 |
+ return 0; |
|
394 |
+} |
|
395 |
+ |
|
396 |
+ |
|
397 |
+/* Function: SSIFileInfo() |
|
398 |
+ * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis] |
|
399 |
+ * |
|
400 |
+ * Purpose: Given a file number {fh} in an open index file |
|
401 |
+ * {sfp}, retrieve file name {ret_filename} and |
|
402 |
+ * the file format {ret_format}. |
|
403 |
+ * |
|
404 |
+ * {ret_filename} is a pointer to a string maintained |
|
405 |
+ * internally by {sfp}. It should not be free'd; |
|
406 |
+ * SSIClose(sfp) takes care of it. |
|
407 |
+ * |
|
408 |
+ * Args: sfp - open index file |
|
409 |
+ * fh - handle on file to look up |
|
410 |
+ * ret_filename - RETURN: name of file n |
|
411 |
+ * ret_format - RETURN: format of file n |
|
412 |
+ * |
|
413 |
+ * Returns: 0 on success, nonzero on failure. |
|
414 |
+ */ |
|
415 |
+int |
|
416 |
+SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format) |
|
417 |
+{ |
|
418 |
+ if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG; |
|
419 |
+ *ret_filename = sfp->filename[fh]; |
|
420 |
+ *ret_format = sfp->fileformat[fh]; |
|
421 |
+ return 0; |
|
422 |
+} |
|
423 |
+ |
|
424 |
+/* Function: SSIClose() |
|
425 |
+ * Date: SRE, Sun Dec 31 14:56:37 2000 [St. Louis] |
|
426 |
+ * |
|
427 |
+ * Purpose: Close an open {SSIFILE *}. |
|
428 |
+ * |
|
429 |
+ * Args: sfp - index file to close. |
|
430 |
+ * |
|
431 |
+ * Returns: (void) |
|
432 |
+ */ |
|
433 |
+void |
|
434 |
+SSIClose(SSIFILE *sfp) |
|
435 |
+{ |
|
436 |
+ if (sfp != NULL) { |
|
437 |
+ clear_ssifile(sfp); |
|
438 |
+ if (sfp->fp != NULL) fclose(sfp->fp); |
|
439 |
+ free(sfp); |
|
440 |
+ } |
|
441 |
+} |
|
442 |
+/* clear_ssifile(): free the innards of SSIFILE, without |
|
443 |
+ * destroying the structure or closing the stream. |
|
444 |
+ */ |
|
445 |
+static void |
|
446 |
+clear_ssifile(SSIFILE *sfp) |
|
447 |
+{ |
|
448 |
+ int i; |
|
449 |
+ |
|
450 |
+ if (sfp->filename != NULL) { |
|
451 |
+ for (i = 0; i < sfp->nfiles; i++) |
|
452 |
+ if (sfp->filename[i] != NULL) free(sfp->filename[i]); |
|
453 |
+ free(sfp->filename); |
|
454 |
+ } |
|
455 |
+ if (sfp->fileformat != NULL) free(sfp->fileformat); |
|
456 |
+ if (sfp->fileflags != NULL) free(sfp->fileflags); |
|
457 |
+ if (sfp->bpl != NULL) free(sfp->bpl); |
|
458 |
+ if (sfp->rpl != NULL) free(sfp->rpl); |
|
459 |
+} |
|
460 |
+ |
|
461 |
+ |
|
462 |
+/* Function: SSIRecommendMode() |
|
463 |
+ * Date: SRE, Fri Feb 16 08:23:47 2001 [St. Louis] |
|
464 |
+ * |
|
465 |
+ * Purpose: Examines the file and determines whether it should be |
|
466 |
+ * indexed with large file support or not; returns |
|
467 |
+ * SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large |
|
468 |
+ * files, or -1 on failure. |
|
469 |
+ * |
|
470 |
+ * Args: file - name of file to check for size |
|
471 |
+ * |
|
472 |
+ * Returns: -1 on failure (including case where file is too big) |
|
473 |
+ * SSI_OFFSET_I32 for most files (<= 2^31-1 bytes) |
|
474 |
+ * SSI_OFFSET_I64 for large files (> 2^31-1 bytes) |
|
475 |
+ */ |
|
476 |
+int |
|
477 |
+SSIRecommendMode(char *file) |
|
478 |
+{ |
|
479 |
+#if HAVE_STAT64 |
|
480 |
+ struct stat64 s1; |
|
481 |
+ if (stat64(file, &s1) == 0) { |
|
482 |
+ if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32; |
|
483 |
+ else return SSI_OFFSET_I64; |
|
484 |
+ } |
|
485 |
+#else |
|
486 |
+ struct stat s2; |
|
487 |
+ if (stat(file, &s2) == 0) { |
|
488 |
+ if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32; |
|
489 |
+ else return SSI_OFFSET_I64; |
|
490 |
+ } |
|
491 |
+#endif |
|
492 |
+ return -1; |
|
493 |
+} |
|
494 |
+ |
|
495 |
+ |
|
496 |
+/* Function: SSICreateIndex() |
|
497 |
+ * Date: SRE, Tue Jan 2 11:23:25 2001 [St. Louis] |
|
498 |
+ * |
|
499 |
+ * Purpose: Creates and initializes a SSI index structure. |
|
500 |
+ * Sequence file offset type is specified by {mode}. |
|
501 |
+ * |
|
502 |
+ * Args: mode - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode. |
|
503 |
+ * |
|
504 |
+ * Returns: ptr to new index structure, or NULL on failure. |
|
505 |
+ * Caller is responsible for free'ing the returned |
|
506 |
+ * structure with SSIFreeIndex(). |
|
507 |
+ */ |
|
508 |
+SSIINDEX * |
|
509 |
+SSICreateIndex(int mode) |
|
510 |
+{ |
|
511 |
+ SSIINDEX *g; |
|
512 |
+ |
|
513 |
+ g = NULL; |
|
514 |
+ if ((g = malloc(sizeof(SSIINDEX))) == NULL) goto FAILURE; |
|
515 |
+ g->smode = mode; |
|
516 |
+ g->imode = SSI_OFFSET_I32; /* index always starts as 32-bit; may get upgraded later */ |
|
517 |
+ g->external = FALSE; |
|
518 |
+ g->max_ram = SSI_MAXRAM; |
|
519 |
+ |
|
520 |
+#ifndef HAS_64BIT_FILE_OFFSETS |
|
521 |
+ if (mode == SSI_OFFSET_I64) |
|
522 |
+ Die("\ |
|
523 |
+Can't create a 64-bit SSI index on this system, sorry;\n\ |
|
524 |
+I don't have 64-bit file offset functions available.\n"); |
|
525 |
+#endif |
|
526 |
+ |
|
527 |
+ g->filenames = NULL; |
|
528 |
+ g->fileformat = NULL; |
|
529 |
+ g->bpl = NULL; |
|
530 |
+ g->rpl = NULL; |
|
531 |
+ g->flen = 0; |
|
532 |
+ g->nfiles = 0; |
|
533 |
+ |
|
534 |
+ g->pkeys = NULL; |
|
535 |
+ g->plen = 0; |
|
536 |
+ g->nprimary = 0; |
|
537 |
+ g->ptmpfile = "tmp.ssi.1"; /* hardcoded, for now. */ |
|
538 |
+ g->ptmp = NULL; |
|
539 |
+ |
|
540 |
+ g->skeys = NULL; |
|
541 |
+ g->slen = 0; |
|
542 |
+ g->nsecondary = 0; |
|
543 |
+ g->stmpfile = "tmp.ssi.2"; /* hardcoded, for now. */ |
|
544 |
+ g->stmp = NULL; |
|
545 |
+ |
|
546 |
+ /* All mallocs must go after NULL initializations, because of the cleanup strategy; |
|
547 |
+ * we'll try to free anything non-NULL if a malloc fails. |
|
548 |
+ */ |
|
549 |
+ if ((g->filenames = malloc(sizeof(char *) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; |
|
550 |
+ if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; |
|
551 |
+ if ((g->bpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; |
|
552 |
+ if ((g->rpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; |
|
553 |
+ |
|
554 |
+ if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; |
|
555 |
+ if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; |
|
556 |
+ |
|
557 |
+ return g; |
|
558 |
+ |
|
559 |
+ FAILURE: |
|
560 |
+ SSIFreeIndex(g); /* free the damaged structure */ |
|
561 |
+ return NULL; |
|
562 |
+} |
|
563 |
+ |
|
564 |
+/* Function: SSIGetFilePosition() |
|
565 |
+ * Date: SRE, Tue Jan 2 09:59:26 2001 [St. Louis] |
|
566 |
+ * |
|
567 |
+ * Purpose: Fills {ret_offset} with the current disk |
|
568 |
+ * offset of {fp}, relative to the start of the file. |
|
569 |
+ * {mode} is set to either SSI_OFFSET_I32 or |
|
570 |
+ * SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps |
|
571 |
+ * a call to ftell(); otherwise deals with system-dependent |
|
572 |
+ * details of 64-bit file offsets. |
|
573 |
+ * |
|
574 |
+ * Args: fp - open stream |
|
575 |
+ * mode - SSI_OFFSET_I32 or SSI_OFFSET_I64 |
|
576 |
+ * ret_offset - RETURN: file position |
|
577 |
+ * |
|
578 |
+ * Returns: 0 on success. nonzero on error. |
|
579 |
+ */ |
|
580 |
+int |
|
581 |
+SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset) |
|
582 |
+{ |
|
583 |
+ if (mode == SSI_OFFSET_I32) |
|
584 |
+ { |
|
585 |
+ ret_offset->mode = SSI_OFFSET_I32; |
|
586 |
+ ret_offset->off.i32 = ftell(fp); |
|
587 |
+ if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED; |
|
588 |
+ } |
|
589 |
+ else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */ |
|
590 |
+ else { |
|
591 |
+ ret_offset->mode = SSI_OFFSET_I64; |
|
592 |
+#ifndef HAS_64BIT_FILE_OFFSETS |
|
593 |
+ return SSI_ERR_NO64BIT; |
|
594 |
+#elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8 |
|
595 |
+ if ((ret_offset->off.i64 = ftello(fp)) == -1) return SSI_ERR_TELL_FAILED; |
|
596 |
+#elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8 |
|
597 |
+ if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED; |
|
598 |
+#elif defined HAVE_FTELL64 |
|
599 |
+ if ((ret_offset->off.i64 = ftell64(fp)) == -1) return SSI_ERR_TELL_FAILED; |
|
600 |
+#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 |
|
601 |
+ if (fgetpos(fp, &(ret_offset->off.i64)) != 0) return SSI_ERR_TELL_FAILED; |
|
602 |
+#endif |
|
603 |
+ } |
|
604 |
+ return 0; |
|
605 |
+} |
|
606 |
+ |
|
607 |
+/* Function: SSIAddFileToIndex() |
|
608 |
+ * Date: SRE, Tue Jan 2 12:54:36 2001 [St. Louis] |
|
609 |
+ * |
|
610 |
+ * Purpose: Adds the sequence file {filename}, which is known to |
|
611 |
+ * be in format {fmt}, to the index {g}. Creates and returns |
|
612 |
+ * a unique filehandle {fh} for then associating primary keys |
|
613 |
+ * with this file using SSIAddPrimaryKeyToIndex(). |
|
614 |
+ * |
|
615 |
+ * Args: g - active index |
|
616 |
+ * filename - file to add |
|
617 |
+ * fmt - format code for this file (e.g. SQFILE_FASTA) |
|
618 |
+ * ret_fh - RETURN: unique handle for this file |
|
619 |
+ * |
|
620 |
+ * Returns: 0 on success; nonzero on error. |
|
621 |
+ */ |
|
622 |
+int |
|
623 |
+SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh) |
|
624 |
+{ |
|
625 |
+ int n; |
|
626 |
+ |
|
627 |
+ if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; |
|
628 |
+ |
|
629 |
+ n = strlen(filename); |
|
630 |
+ if ((n+1) > g->flen) g->flen = n+1; |
|
631 |
+ |
|
632 |
+ g->filenames[g->nfiles] = FileTail(filename, FALSE); |
|
633 |
+ g->fileformat[g->nfiles] = fmt; |
|
634 |
+ g->bpl[g->nfiles] = 0; |
|
635 |
+ g->rpl[g->nfiles] = 0; |
|
636 |
+ *ret_fh = g->nfiles; /* handle is simply = file number */ |
|
637 |
+ g->nfiles++; |
|
638 |
+ |
|
639 |
+ if (g->nfiles % SSI_FILE_BLOCK == 0) { |
|
640 |
+ g->filenames = realloc(g->filenames, sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK)); |
|
641 |
+ if (g->filenames == NULL) return SSI_ERR_MALLOC; |
|
642 |
+ g->fileformat= realloc(g->fileformat, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); |
|
643 |
+ if (g->fileformat == NULL) return SSI_ERR_MALLOC; |
|
644 |
+ g->bpl = realloc(g->bpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); |
|
645 |
+ if (g->bpl == NULL) return SSI_ERR_MALLOC; |
|
646 |
+ g->rpl = realloc(g->rpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); |
|
647 |
+ if (g->rpl == NULL) return SSI_ERR_MALLOC; |
|
648 |
+ } |
|
649 |
+ return 0; |
|
650 |
+} |
|
651 |
+ |
|
652 |
+ |
|
653 |
+/* Function: SSISetFileForSubseq() |
|
654 |
+ * Date: SRE, Tue Jan 9 10:02:05 2001 [St. Louis] |
|
655 |
+ * |
|
656 |
+ * Purpose: Set SSI_FAST_SUBSEQ for the file indicated by |
|
657 |
+ * filehandle {fh} in the index {g}, setting |
|
658 |
+ * parameters {bpl} and {rpl} to the values given. |
|
659 |
+ * {bpl} is the number of bytes per sequence data line. |
|
660 |
+ * {rpl} is the number of residues per sequence data line. |
|
661 |
+ * Caller must be sure that {bpl} and {rpl} do not change |
|
662 |
+ * on any line of any sequence record in the file |
|
663 |
+ * (except for the last data line of each record). If |
|
664 |
+ * this is not the case in this file, SSI_FAST_SUBSEQ |
|
665 |
+ * will not work, and this routine should not be |
|
666 |
+ * called. |
|
667 |
+ * |
|
668 |
+ * Args: g - the active index |
|
669 |
+ * fh - handle for file to set SSI_FAST_SUBSEQ on |
|
670 |
+ * bpl - bytes per data line |
|
671 |
+ * rpl - residues per data line |
|
672 |
+ * |
|
673 |
+ * Returns: 0 on success; 1 on error. |
|
674 |
+ */ |
|
675 |
+int |
|
676 |
+SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl) |
|
677 |
+{ |
|
678 |
+ if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG; |
|
679 |
+ if (bpl <= 0 || rpl <= 0) return SSI_ERR_BADARG; |
|
680 |
+ g->bpl[fh] = bpl; |
|
681 |
+ g->rpl[fh] = rpl; |
|
682 |
+ return 0; |
|
683 |
+} |
|
684 |
+ |
|
685 |
+ |
|
686 |
+/* Function: SSIAddPrimaryKeyToIndex() |
|
687 |
+ * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis] |
|
688 |
+ * |
|
689 |
+ * Purpose: Put primary key {key} in the index {g}, while telling |
|
690 |
+ * the index this primary key is in the file associated |
|
691 |
+ * with filehandle {fh} (returned by a previous call |
|
692 |
+ * to SSIAddFileToIndex()), and its record starts at |
|
693 |
+ * position {r_off} in the file. |
|
694 |
+ * |
|
695 |
+ * {d_off} and {L} are optional; they may be left unset |
|
696 |
+ * by passing NULL and 0, respectively. (If one is |
|
697 |
+ * provided, both must be provided.) If they are provided, |
|
698 |
+ * {d_off} gives the position of the first line of sequence |
|
699 |
+ * data in the record, and {L} gives the length of |
|
700 |
+ * the sequence in residues. They are used when |
|
701 |
+ * SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ |
|
702 |
+ * is not set for the file, {d_off} and {L} will be |
|
703 |
+ * ignored by the index reading API even if they are stored |
|
704 |
+ * by the index writing API, so it doesn't hurt for the |
|
705 |
+ * indexing program to provide them; typically they |
|
706 |
+ * won't know whether it's safe to set SSI_FAST_SUBSEQ |
|
707 |
+ * for the whole file until the whole file has been |
|
708 |
+ * read and every key has already been added to the index. |
|
709 |
+ * |
|
710 |
+ * Args: g - active index |
|
711 |
+ * key - primary key to add |
|
712 |
+ * fh - handle on file that this key's in |
|
713 |
+ * r_off - offset to start of record |
|
714 |
+ * d_off - offset to start of sequence data |
|
715 |
+ * L - length of sequence, or 0 |
|
716 |
+ * |
|
717 |
+ * Returns: 0 on success, nonzero on error. |
|
718 |
+ */ |
|
719 |
+int |
|
720 |
+SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh, |
|
721 |
+ SSIOFFSET *r_off, SSIOFFSET *d_off, int L) |
|
722 |
+{ |
|
723 |
+ int n; /* a string length */ |
|
724 |
+ |
|
725 |
+ if (fh >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; |
|
726 |
+ if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; |
|
727 |
+ if (L > 0 && d_off == NULL) abort(); /* need both. */ |
|
728 |
+ |
|
729 |
+ /* Before adding the key: check how big our index is. |
|
730 |
+ * If it's getting too large, switch to external mode. |
|
731 |
+ */ |
|
732 |
+ if (!g->external && current_index_size(g) >= g->max_ram) |
|
733 |
+ if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE; |
|
734 |
+ |
|
735 |
+ /* Update maximum pkey length, if needed. |
|
736 |
+ */ |
|
737 |
+ n = strlen(key); |
|
738 |
+ if ((n+1) > g->plen) g->plen = n+1; |
|
739 |
+ |
|
740 |
+ /* External mode? Simply append to disk... |
|
741 |
+ */ |
|
742 |
+ if (g->external) { |
|
743 |
+ if (g->smode == SSI_OFFSET_I32) { |
|
744 |
+ fprintf(g->ptmp, "%s\t%d\t%lu\t%lu\t%lu\n", |
|
745 |
+ key, fh, (unsigned long) r_off->off.i32, |
|
746 |
+ (unsigned long) (d_off == NULL? 0 : d_off->off.i32), |
|
747 |
+ (unsigned long) L); |
|
748 |
+ } else { |
|
749 |
+#ifdef CLUSTALO |
|
750 |
+ fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", |
|
751 |
+ key, fh, (unsigned long long)r_off->off.i64, |
|
752 |
+ d_off == NULL? 0 : (unsigned long long) d_off->off.i64, |
|
753 |
+ (unsigned long) L); |
|
754 |
+#else |
|
755 |
+ fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", |
|
756 |
+ key, fh, r_off->off.i64, |
|
757 |
+ d_off == NULL? 0 : d_off->off.i64, |
|
758 |
+ (unsigned long) L); |
|
759 |
+#endif |
|
760 |
+ } |
|
761 |
+ g->nprimary++; |
|
762 |
+ return 0; |
|
763 |
+ } |
|
764 |
+ |
|
765 |
+ /* Else: internal mode, keep keys in memory... |
|
766 |
+ */ |
|
767 |
+ if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; |
|
768 |
+ g->pkeys[g->nprimary].fnum = (sqd_uint16) fh; |
|
769 |
+ g->pkeys[g->nprimary].r_off = *r_off; |
|
770 |
+ if (d_off != NULL && L > 0) { |
|
771 |
+ g->pkeys[g->nprimary].d_off = *d_off; |
|
772 |
+ g->pkeys[g->nprimary].len = L; |
|
773 |
+ } else { |
|
774 |
+ /* yeah, this looks stupid, but look: we have to give a valid |
|
775 |
+ looking, non-NULL d_off of some sort, or writes will fail. |
|
776 |
+ It's going to be unused anyway. */ |
|
777 |
+ g->pkeys[g->nprimary].d_off = *r_off; |
|
778 |
+ g->pkeys[g->nprimary].len = 0; |
|
779 |
+ } |
|
780 |
+ g->nprimary++; |
|
781 |
+ |
|
782 |
+ if (g->nprimary % SSI_KEY_BLOCK == 0) { |
|
783 |
+ g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK)); |
|
784 |
+ if (g->pkeys == NULL) return SSI_ERR_MALLOC; |
|
785 |
+ } |
|
786 |
+ return 0; |
|
787 |
+} |
|
788 |
+ |
|
789 |
+ |
|
790 |
+/* Function: SSIAddSecondaryKeyToIndex() |
|
791 |
+ * Date: SRE, Tue Jan 2 12:44:40 2001 [St. Louis] |
|
792 |
+ * |
|
793 |
+ * Purpose: Puts secondary key {key} in the index {g}, associating |
|
794 |
+ * it with primary key {pkey} that was previously |
|
795 |
+ * registered by SSIAddPrimaryKeyToIndex(). |
|
796 |
+ * |
|
797 |
+ * Args: g - active index |
|
798 |
+ * key - secondary key to add |
|
799 |
+ * pkey - primary key to associate this key with |
|
800 |
+ * |
|
801 |
+ * Returns: 0 on success, nonzero on failure. |
|
802 |
+ */ |
|
803 |
+int |
|
804 |
+SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey) |
|
805 |
+{ |
|
806 |
+ int n; /* a string length */ |
|
807 |
+ |
|
808 |
+ if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; |
|
809 |
+ |
|
810 |
+ /* Before adding the key: check how big our index is. |
|
811 |
+ * If it's getting too large, switch to external mode. |
|
812 |
+ */ |
|
813 |
+ if (!g->external && current_index_size(g) >= g->max_ram) |
|
814 |
+ if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE; |
|
815 |
+ |
|
816 |
+ /* Update maximum secondary key length, if necessary. |
|
817 |
+ */ |
|
818 |
+ n = strlen(key); |
|
819 |
+ if ((n+1) > g->slen) g->slen = n+1; |
|
820 |
+ |
|
821 |
+ /* if external mode: write info to disk. |
|
822 |
+ */ |
|
823 |
+ if (g->external) { |
|
824 |
+ fprintf(g->stmp, "%s\t%s\n", key, pkey); |
|
825 |
+ g->nsecondary++; |
|
826 |
+ return 0; |
|
827 |
+ } |
|
828 |
+ |
|
829 |
+ /* else, internal mode... store info in memory. |
|
830 |
+ */ |
|
831 |
+ if ((g->skeys[g->nsecondary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; |
|
832 |
+ if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC; |
|
833 |
+ g->nsecondary++; |
|
834 |
+ |
|
835 |
+ if (g->nsecondary % SSI_KEY_BLOCK == 0) { |
|
836 |
+ g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK)); |
|
837 |
+ if (g->skeys == NULL) return SSI_ERR_MALLOC; |
|
838 |
+ } |
|
839 |
+ return 0; |
|
840 |
+} |
|
841 |
+ |
|
842 |
+ |
|
843 |
+ |
|
844 |
+ |
|
845 |
+/* Function: SSIWriteIndex() |
|
846 |
+ * Date: SRE, Tue Jan 2 13:55:56 2001 [St. Louis] |
|
847 |
+ * |
|
848 |
+ * Purpose: Writes complete index {g} in SSI format to a |
|
849 |
+ * binary file {file}. Does all |
|
850 |
+ * the overhead of sorting the primary and secondary keys, |
|
851 |
+ * and maintaining the association of secondary keys |
|
852 |
+ * with primary keys during and after the sort. |
|
853 |
+ * |
|
854 |
+ * Args: file - file to write to |
|
855 |
+ * g - index to sort & write out. |
|
856 |
+ * |
|
857 |
+ * Returns: 0 on success, nonzero on error. |
|
858 |
+ */ |
|
859 |
+/* needed for qsort() */ |
|
860 |
+static int |
|
861 |
+pkeysort(const void *k1, const void *k2) |
|
862 |
+{ |
|
863 |
+ struct ssipkey_s *key1; |
|
864 |
+ struct ssipkey_s *key2; |
|