msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,639 @@ |
1 |
+/***************************************************************** |
|
2 |
+ * SQUID - a library of functions for biological sequence analysis |
|
3 |
+ * Copyright (C) 1992-2002 Washington University School of Medicine |
|
4 |
+ * |
|
5 |
+ * This source code is freely distributed under the terms of the |
|
6 |
+ * GNU General Public License. See the files COPYRIGHT and LICENSE |
|
7 |
+ * for details. |
|
8 |
+ *****************************************************************/ |
|
9 |
+ |
|
10 |
+/* alignio.c |
|
11 |
+ * SRE, Mon Jul 12 11:57:37 1993 |
|
12 |
+ * RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp) |
|
13 |
+ * |
|
14 |
+ * Input/output of sequence alignments. |
|
15 |
+ */ |
|
16 |
+ |
|
17 |
+#include <stdio.h> |
|
18 |
+#include <stdlib.h> |
|
19 |
+#include <string.h> |
|
20 |
+#include <ctype.h> |
|
21 |
+#include "squid.h" |
|
22 |
+#include "sre_random.h" |
|
23 |
+ |
|
24 |
+/* Function: AllocAlignment() |
|
25 |
+ * |
|
26 |
+ * Purpose: Allocate space for an alignment, given the number |
|
27 |
+ * of sequences and the alignment length in columns. |
|
28 |
+ * |
|
29 |
+ * Args: nseq - number of sequences |
|
30 |
+ * alen - width of alignment |
|
31 |
+ * ret_aseq - RETURN: alignment itself |
|
32 |
+ * ainfo - RETURN: other info associated with alignment |
|
33 |
+ * |
|
34 |
+ * Return: (void) |
|
35 |
+ * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo). |
|
36 |
+ * note that ainfo itself is alloc'ed in caller, usually |
|
37 |
+ * just by a "AINFO ainfo" definition. |
|
38 |
+ */ |
|
39 |
+void |
|
40 |
+AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo) |
|
41 |
+{ |
|
42 |
+ char **aseq; |
|
43 |
+ int idx; |
|
44 |
+ |
|
45 |
+ InitAinfo(ainfo); |
|
46 |
+ |
|
47 |
+ aseq = (char **) MallocOrDie (sizeof(char *) * nseq); |
|
48 |
+ for (idx = 0; idx < nseq; idx++) |
|
49 |
+ aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); |
|
50 |
+ |
|
51 |
+ ainfo->alen = alen; |
|
52 |
+ ainfo->nseq = nseq; |
|
53 |
+ |
|
54 |
+ ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq); |
|
55 |
+ FSet(ainfo->wgt, nseq, 1.0); |
|
56 |
+ |
|
57 |
+ ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); |
|
58 |
+ for (idx = 0; idx < nseq; idx++) |
|
59 |
+ ainfo->sqinfo[idx].flags = 0; |
|
60 |
+ |
|
61 |
+ *ret_aseq = aseq; |
|
62 |
+} |
|
63 |
+ |
|
64 |
+ |
|
65 |
+/* Function: InitAinfo() |
|
66 |
+ * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis] |
|
67 |
+ * |
|
68 |
+ * Purpose: Initialize the fields in ainfo structure to |
|
69 |
+ * default (null) values. Does nothing with |
|
70 |
+ * fields that are dependent on nseq or alen. |
|
71 |
+ * |
|
72 |
+ * Args: ainfo - optional info structure for an alignment |
|
73 |
+ * |
|
74 |
+ * Returns: (void). ainfo is modified. |
|
75 |
+ */ |
|
76 |
+void |
|
77 |
+InitAinfo(AINFO *ainfo) |
|
78 |
+{ |
|
79 |
+ ainfo->name = NULL; |
|
80 |
+ ainfo->desc = NULL; |
|
81 |
+ ainfo->cs = NULL; |
|
82 |
+ ainfo->rf = NULL; |
|
83 |
+ ainfo->acc = NULL; |
|
84 |
+ ainfo->au = NULL; |
|
85 |
+ ainfo->flags = 0; |
|
86 |
+ |
|
87 |
+ ainfo->tc1 = ainfo->tc2 = 0.0; |
|
88 |
+ ainfo->nc1 = ainfo->nc2 = 0.0; |
|
89 |
+ ainfo->ga1 = ainfo->ga2 = 0.0; |
|
90 |
+} |
|
91 |
+ |
|
92 |
+ |
|
93 |
+/* Function: FreeAlignment() |
|
94 |
+ * |
|
95 |
+ * Purpose: Free the space allocated to alignment, names, and optional |
|
96 |
+ * information. |
|
97 |
+ * |
|
98 |
+ * Args: aseqs - sequence alignment |
|
99 |
+ * ainfo - associated alignment data. |
|
100 |
+ */ |
|
101 |
+void |
|
102 |
+FreeAlignment(char **aseqs, AINFO *ainfo) |
|
103 |
+{ |
|
104 |
+ int i; |
|
105 |
+ |
|
106 |
+ for (i = 0; i < ainfo->nseq; i++) |
|
107 |
+ { |
|
108 |
+ if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss); |
|
109 |
+ if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa); |
|
110 |
+ } |
|
111 |
+ if (ainfo->cs != NULL) free(ainfo->cs); |
|
112 |
+ if (ainfo->rf != NULL) free(ainfo->rf); |
|
113 |
+ if (ainfo->name != NULL) free(ainfo->name); |
|
114 |
+ if (ainfo->desc != NULL) free(ainfo->desc); |
|
115 |
+ if (ainfo->acc != NULL) free(ainfo->acc); |
|
116 |
+ if (ainfo->au != NULL) free(ainfo->au); |
|
117 |
+ |
|
118 |
+ free(ainfo->sqinfo); |
|
119 |
+ free(ainfo->wgt); |
|
120 |
+ Free2DArray((void **) aseqs, ainfo->nseq); |
|
121 |
+} |
|
122 |
+ |
|
123 |
+ |
|
124 |
+ |
|
125 |
+/* Function: SAMizeAlignment() |
|
126 |
+ * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis] |
|
127 |
+ * |
|
128 |
+ * Purpose: Make a "best effort" attempt to convert an alignment |
|
129 |
+ * to SAM gap format: - in delete col, . in insert col. |
|
130 |
+ * Only works if alignment adheres to SAM's upper/lower |
|
131 |
+ * case convention, which is true for instance of old |
|
132 |
+ * HMMER alignments. |
|
133 |
+ * |
|
134 |
+ * Args: aseq - alignment to convert |
|
135 |
+ * nseq - number of seqs in alignment |
|
136 |
+ * alen - length of alignment |
|
137 |
+ * |
|
138 |
+ * Returns: (void) |
|
139 |
+ */ |
|
140 |
+void |
|
141 |
+SAMizeAlignment(char **aseq, int nseq, int alen) |
|
142 |
+{ |
|
143 |
+ int col; /* counter for aligned columns */ |
|
144 |
+ int i; /* counter for seqs */ |
|
145 |
+ int sawlower, sawupper, sawgap; |
|
146 |
+ char gapchar; |
|
147 |
+ |
|
148 |
+ for (col = 0; col < alen; col++) |
|
149 |
+ { |
|
150 |
+ sawlower = sawupper = sawgap = 0; |
|
151 |
+ /* pass 1: do we see only upper or lower? */ |
|
152 |
+ for (i = 0; i < nseq; i++) |
|
153 |
+ { |
|
154 |
+ if (isgap(aseq[i][col])) { sawgap = 1; continue; } |
|
155 |
+ if (isupper((int) aseq[i][col])) { sawupper = 1; continue; } |
|
156 |
+ if (islower((int) aseq[i][col])) sawlower = 1; |
|
157 |
+ } |
|
158 |
+ /* select gap character for column */ |
|
159 |
+ gapchar = '-'; /* default */ |
|
160 |
+ if (sawlower && ! sawupper) gapchar = '.'; |
|
161 |
+ |
|
162 |
+ /* pass 2: set gap char */ |
|
163 |
+ for (i = 0; i < nseq; i++) |
|
164 |
+ if (isgap(aseq[i][col])) aseq[i][col] = gapchar; |
|
165 |
+ } |
|
166 |
+} |
|
167 |
+ |
|
168 |
+ |
|
169 |
+/* Function: SAMizeAlignmentByGapFrac() |
|
170 |
+ * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis] |
|
171 |
+ * |
|
172 |
+ * Purpose: Convert an alignment to SAM's gap and case |
|
173 |
+ * conventions, using gap fraction in a column |
|
174 |
+ * to choose match versus insert columns. In match columns, |
|
175 |
+ * residues are upper case and gaps are '-'. |
|
176 |
+ * In insert columns, residues are lower case and |
|
177 |
+ * gaps are '.' |
|
178 |
+ * |
|
179 |
+ * Args: aseq - aligned sequences |
|
180 |
+ * nseq - number of sequences |
|
181 |
+ * alen - length of alignment |
|
182 |
+ * maxgap - if more gaps than this fraction, column is insert. |
|
183 |
+ * |
|
184 |
+ * Returns: (void) Characters in aseq may be altered. |
|
185 |
+ */ |
|
186 |
+void |
|
187 |
+SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap) |
|
188 |
+{ |
|
189 |
+ int apos; /* counter over columns */ |
|
190 |
+ int idx; /* counter over sequences */ |
|
191 |
+ int ngap; /* number of gaps seen */ |
|
192 |
+ |
|
193 |
+ for (apos = 0; apos < alen; apos++) |
|
194 |
+ { |
|
195 |
+ /* count gaps */ |
|
196 |
+ ngap = 0; |
|
197 |
+ for (idx = 0; idx < nseq; idx++) |
|
198 |
+ if (isgap(aseq[idx][apos])) ngap++; |
|
199 |
+ |
|
200 |
+ /* convert to SAM conventions */ |
|
201 |
+ if ((float) ngap / (float) nseq > maxgap) |
|
202 |
+ { /* insert column */ |
|
203 |
+ for (idx = 0; idx < nseq; idx++) |
|
204 |
+ if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.'; |
|
205 |
+ else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]); |
|
206 |
+ } |
|
207 |
+ else |
|
208 |
+ { /* match column */ |
|
209 |
+ for (idx = 0; idx < nseq; idx++) |
|
210 |
+ if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-'; |
|
211 |
+ else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]); |
|
212 |
+ } |
|
213 |
+ } |
|
214 |
+} |
|
215 |
+ |
|
216 |
+ |
|
217 |
+ |
|
218 |
+ |
|
219 |
+/* Function: MakeAlignedString() |
|
220 |
+ * |
|
221 |
+ * Purpose: Given a raw string of some type (secondary structure, say), |
|
222 |
+ * align it to a given aseq by putting gaps wherever the |
|
223 |
+ * aseq has gaps. |
|
224 |
+ * |
|
225 |
+ * Args: aseq: template for alignment |
|
226 |
+ * alen: length of aseq |
|
227 |
+ * ss: raw string to align to aseq |
|
228 |
+ * ret_s: RETURN: aligned ss |
|
229 |
+ * |
|
230 |
+ * Return: 1 on success, 0 on failure (and squid_errno is set.) |
|
231 |
+ * ret_ss is malloc'ed here and must be free'd by caller. |
|
232 |
+ */ |
|
233 |
+int |
|
234 |
+MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s) |
|
235 |
+{ |
|
236 |
+ char *new; |
|
237 |
+ int apos, rpos; |
|
238 |
+ |
|
239 |
+ new = (char *) MallocOrDie ((alen+1) * sizeof(char)); |
|
240 |
+ for (apos = rpos = 0; apos < alen; apos++) |
|
241 |
+ if (! isgap(aseq[apos])) |
|
242 |
+ { |
|
243 |
+ new[apos] = ss[rpos]; |
|
244 |
+ rpos++; |
|
245 |
+ } |
|
246 |
+ else |
|
247 |
+ new[apos] = '.'; |
|
248 |
+ new[apos] = '\0'; |
|
249 |
+ |
|
250 |
+ if (rpos != strlen(ss)) |
|
251 |
+ { squid_errno = SQERR_PARAMETER; free(new); return 0; } |
|
252 |
+ *ret_s = new; |
|
253 |
+ return 1; |
|
254 |
+} |
|
255 |
+ |
|
256 |
+ |
|
257 |
+/* Function: MakeDealignedString() |
|
258 |
+ * |
|
259 |
+ * Purpose: Given an aligned string of some type (either sequence or |
|
260 |
+ * secondary structure, for instance), dealign it relative |
|
261 |
+ * to a given aseq. Return a ptr to the new string. |
|
262 |
+ * |
|
263 |
+ * Args: aseq : template alignment |
|
264 |
+ * alen : length of aseq |
|
265 |
+ * ss: : string to make dealigned copy of; same length as aseq |
|
266 |
+ * ret_s : RETURN: dealigned copy of ss |
|
267 |
+ * |
|
268 |
+ * Return: 1 on success, 0 on failure (and squid_errno is set) |
|
269 |
+ * ret_s is alloc'ed here and must be freed by caller |
|
270 |
+ */ |
|
271 |
+int |
|
272 |
+MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s) |
|
273 |
+{ |
|
274 |
+ char *new; |
|
275 |
+ int apos, rpos; |
|
276 |
+ |
|
277 |
+ new = (char *) MallocOrDie ((alen+1) * sizeof(char)); |
|
278 |
+ for (apos = rpos = 0; apos < alen; apos++) |
|
279 |
+ if (! isgap(aseq[apos])) |
|
280 |
+ { |
|
281 |
+ new[rpos] = ss[apos]; |
|
282 |
+ rpos++; |
|
283 |
+ } |
|
284 |
+ new[rpos] = '\0'; |
|
285 |
+ if (alen != strlen(ss)) |
|
286 |
+ { squid_errno = SQERR_PARAMETER; free(new); return 0; } |
|
287 |
+ *ret_s = new; |
|
288 |
+ return 1; |
|
289 |
+} |
|
290 |
+ |
|
291 |
+ |
|
292 |
+/* Function: DealignedLength() |
|
293 |
+ * |
|
294 |
+ * Purpose: Count the number of non-gap symbols in seq. |
|
295 |
+ * (i.e. find the length of the unaligned sequence) |
|
296 |
+ * |
|
297 |
+ * Args: aseq - aligned sequence to count symbols in, \0 terminated |
|
298 |
+ * |
|
299 |
+ * Return: raw length of seq. |
|
300 |
+ */ |
|
301 |
+int |
|
302 |
+DealignedLength(char *aseq) |
|
303 |
+{ |
|
304 |
+ int rlen; |
|
305 |
+ for (rlen = 0; *aseq; aseq++) |
|
306 |
+ if (! isgap(*aseq)) rlen++; |
|
307 |
+ return rlen; |
|
308 |
+} |
|
309 |
+ |
|
310 |
+ |
|
311 |
+/* Function: WritePairwiseAlignment() |
|
312 |
+ * |
|
313 |
+ * Purpose: Write a nice formatted pairwise alignment out, |
|
314 |
+ * with a BLAST-style middle line showing identities |
|
315 |
+ * as themselves (single letter) and conservative |
|
316 |
+ * changes as '+'. |
|
317 |
+ * |
|
318 |
+ * Args: ofp - open fp to write to (stdout, perhaps) |
|
319 |
+ * aseq1, aseq2 - alignments to write (not necessarily |
|
320 |
+ * flushed right with gaps) |
|
321 |
+ * name1, name2 - names of sequences |
|
322 |
+ * spos1, spos2 - starting position in each (raw) sequence |
|
323 |
+ * pam - PAM matrix; positive values define |
|
324 |
+ * conservative changes |
|
325 |
+ * indent - how many extra spaces to print on left |
|
326 |
+ * |
|
327 |
+ * Return: 1 on success, 0 on failure |
|
328 |
+ */ |
|
329 |
+int |
|
330 |
+WritePairwiseAlignment(FILE *ofp, |
|
331 |
+ char *aseq1, char *name1, int spos1, |
|
332 |
+ char *aseq2, char *name2, int spos2, |
|
333 |
+ int **pam, int indent) |
|
334 |
+{ |
|
335 |
+ char sname1[11]; /* shortened name */ |
|
336 |
+ char sname2[11]; |
|
337 |
+ int still_going; /* True if writing another block */ |
|
338 |
+ char buf1[61]; /* buffer for writing seq1; CPL+1*/ |
|
339 |
+ char bufmid[61]; /* buffer for writing consensus */ |
|
340 |
+ char buf2[61]; |
|
341 |
+ char *s1, *s2; /* ptrs into each sequence */ |
|
342 |
+ int count1, count2; /* number of symbols we're writing */ |
|
343 |
+ int rpos1, rpos2; /* position in raw seqs */ |
|
344 |
+ int rawcount1, rawcount2; /* number of nongap symbols written */ |
|
345 |
+ int apos; |
|
346 |
+ |
|
347 |
+ strncpy(sname1, name1, 10); |
|
348 |
+ sname1[10] = '\0'; |
|
349 |
+ strtok(sname1, WHITESPACE); |
|
350 |
+ |
|
351 |
+ strncpy(sname2, name2, 10); |
|
352 |
+ sname2[10] = '\0'; |
|
353 |
+ strtok(sname2, WHITESPACE); |
|
354 |
+ |
|
355 |
+ s1 = aseq1; |
|
356 |
+ s2 = aseq2; |
|
357 |
+ rpos1 = spos1; |
|
358 |
+ rpos2 = spos2; |
|
359 |
+ |
|
360 |
+ still_going = TRUE; |
|
361 |
+ while (still_going) |
|
362 |
+ { |
|
363 |
+ still_going = FALSE; |
|
364 |
+ |
|
365 |
+ /* get next line's worth from both */ |
|
366 |
+ strncpy(buf1, s1, 60); buf1[60] = '\0'; |
|
367 |
+ strncpy(buf2, s2, 60); buf2[60] = '\0'; |
|
368 |
+ count1 = strlen(buf1); |
|
369 |
+ count2 = strlen(buf2); |
|
370 |
+ |
|
371 |
+ /* is there still more to go? */ |
|
372 |
+ if ((count1 == 60 && s1[60] != '\0') || |
|
373 |
+ (count2 == 60 && s2[60] != '\0')) |
|
374 |
+ still_going = TRUE; |
|
375 |
+ |
|
376 |
+ /* shift seq ptrs by a line */ |
|
377 |
+ s1 += count1; |
|
378 |
+ s2 += count2; |
|
379 |
+ |
|
380 |
+ /* assemble the consensus line */ |
|
381 |
+ for (apos = 0; apos < count1 && apos < count2; apos++) |
|
382 |
+ { |
|
383 |
+ if (!isgap(buf1[apos]) && !isgap(buf2[apos])) |
|
384 |
+ { |
|
385 |
+ if (buf1[apos] == buf2[apos]) |
|
386 |
+ bufmid[apos] = buf1[apos]; |
|
387 |
+ else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0) |
|
388 |
+ bufmid[apos] = '+'; |
|
389 |
+ else |
|
390 |
+ bufmid[apos] = ' '; |
|
391 |
+ } |
|
392 |
+ else |
|
393 |
+ bufmid[apos] = ' '; |
|
394 |
+ } |
|
395 |
+ bufmid[apos] = '\0'; |
|
396 |
+ |
|
397 |
+ rawcount1 = 0; |
|
398 |
+ for (apos = 0; apos < count1; apos++) |
|
399 |
+ if (!isgap(buf1[apos])) rawcount1++; |
|
400 |
+ |
|
401 |
+ rawcount2 = 0; |
|
402 |
+ for (apos = 0; apos < count2; apos++) |
|
403 |
+ if (!isgap(buf2[apos])) rawcount2++; |
|
404 |
+ |
|
405 |
+ (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", |
|
406 |
+ sname1, rpos1, buf1, rpos1 + rawcount1 -1); |
|
407 |
+ (void) fprintf(ofp, "%*s %s\n", indent, "", |
|
408 |
+ bufmid); |
|
409 |
+ (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", |
|
410 |
+ sname2, rpos2, buf2, rpos2 + rawcount2 -1); |
|
411 |
+ (void) fprintf(ofp, "\n"); |
|
412 |
+ |
|
413 |
+ rpos1 += rawcount1; |
|
414 |
+ rpos2 += rawcount2; |
|
415 |
+ } |
|
416 |
+ |
|
417 |
+ return 1; |
|
418 |
+} |
|
419 |
+ |
|
420 |
+ |
|
421 |
+/* Function: MingapAlignment() |
|
422 |
+ * |
|
423 |
+ * Purpose: Remove all-gap columns from a multiple sequence alignment |
|
424 |
+ * and its associated data. The alignment is assumed to be |
|
425 |
+ * flushed (all aseqs the same length). |
|
426 |
+ */ |
|
427 |
+int |
|
428 |
+MingapAlignment(char **aseqs, AINFO *ainfo) |
|
429 |
+{ |
|
430 |
+ int apos; /* position in original alignment */ |
|
431 |
+ int mpos; /* position in new alignment */ |
|
432 |
+ int idx; |
|
433 |
+ |
|
434 |
+ /* We overwrite aseqs, using its allocated memory. |
|
435 |
+ */ |
|
436 |
+ for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++) |
|
437 |
+ { |
|
438 |
+ /* check for all-gap in column */ |
|
439 |
+ for (idx = 0; idx < ainfo->nseq; idx++) |
|
440 |
+ if (! isgap(aseqs[idx][apos])) |
|
441 |
+ break; |
|
442 |
+ if (idx == ainfo->nseq) continue; |
|
443 |
+ |
|
444 |
+ /* shift alignment and ainfo */ |
|
445 |
+ if (mpos != apos) |
|
446 |
+ { |
|
447 |
+ for (idx = 0; idx < ainfo->nseq; idx++) |
|
448 |
+ aseqs[idx][mpos] = aseqs[idx][apos]; |
|
449 |
+ |
|
450 |
+ if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos]; |
|
451 |
+ if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos]; |
|
452 |
+ } |
|
453 |
+ mpos++; |
|
454 |
+ } |
|
455 |
+ /* null terminate everything */ |
|
456 |
+ for (idx = 0; idx < ainfo->nseq; idx++) |
|
457 |
+ aseqs[idx][mpos] = '\0'; |
|
458 |
+ ainfo->alen = mpos; /* set new length */ |
|
459 |
+ if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0'; |
|
460 |
+ if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0'; |
|
461 |
+ return 1; |
|
462 |
+} |
|
463 |
+ |
|
464 |
+ |
|
465 |
+ |
|
466 |
+/* Function: RandomAlignment() |
|
467 |
+ * |
|
468 |
+ * Purpose: Create a random alignment from raw sequences. |
|
469 |
+ * |
|
470 |
+ * Ideally, we would like to sample an alignment from the |
|
471 |
+ * space of possible alignments according to its probability, |
|
472 |
+ * given a prior probability distribution for alignments. |
|
473 |
+ * I don't see how to describe such a distribution, let alone |
|
474 |
+ * sample it. |
|
475 |
+ * |
|
476 |
+ * This is a rough approximation that tries to capture some |
|
477 |
+ * desired properties. We assume the alignment is generated |
|
478 |
+ * by a simple HMM composed of match and insert states. |
|
479 |
+ * Given parameters (pop, pex) for the probability of opening |
|
480 |
+ * and extending an insertion, we can find the expected number |
|
481 |
+ * of match states, M, in the underlying model for each sequence. |
|
482 |
+ * We use an average M taken over all the sequences (this is |
|
483 |
+ * an approximation. The expectation of M given all the sequence |
|
484 |
+ * lengths is a nasty-looking summation.) |
|
485 |
+ * |
|
486 |
+ * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) ) |
|
487 |
+ * |
|
488 |
+ * Then, we assign positions in each raw sequence onto the M match |
|
489 |
+ * states and M+1 insert states of this "HMM", by rolling random |
|
490 |
+ * numbers and inserting the (rlen-M) inserted positions randomly |
|
491 |
+ * into the insert slots, taking into account the relative probability |
|
492 |
+ * of open vs. extend. |
|
493 |
+ * |
|
494 |
+ * The resulting alignment has two desired properties: insertions |
|
495 |
+ * tend to follow the HMM-like exponential distribution, and |
|
496 |
+ * the "sparseness" of the alignment is controllable through |
|
497 |
+ * pop and pex. |
|
498 |
+ * |
|
499 |
+ * Args: rseqs - raw sequences to "align", 0..nseq-1 |
|
500 |
+ * sqinfo - array of 0..nseq-1 info structures for the sequences |
|
501 |
+ * nseq - number of sequences |
|
502 |
+ * pop - probability to open insertion (0<pop<1) |
|
503 |
+ * pex - probability to extend insertion (0<pex<1) |
|
504 |
+ * ret_aseqs - RETURN: alignment (flushed) |
|
505 |
+ * ainfo - fill in: alignment info |
|
506 |
+ * |
|
507 |
+ * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause |
|
508 |
+ * of failure. |
|
509 |
+ */ |
|
510 |
+int |
|
511 |
+RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex, |
|
512 |
+ char ***ret_aseqs, AINFO *ainfo) |
|
513 |
+{ |
|
514 |
+ char **aseqs; /* RETURN: alignment */ |
|
515 |
+ int alen; /* length of alignment */ |
|
516 |
+ int *rlen; /* lengths of each raw sequence */ |
|
517 |
+ int M; /* length of "model" */ |
|
518 |
+ int **ins; /* insertion counts, 0..nseq-1 by 0..M */ |
|
519 |
+ int *master_ins; /* max insertion counts, 0..M */ |
|
520 |
+ int apos, rpos, idx; |
|
521 |
+ int statepos; |
|
522 |
+ int count; |
|
523 |
+ int minlen; |
|
524 |
+ |
|
525 |
+ /* calculate expected length of model, M |
|
526 |
+ */ |
|
527 |
+ rlen = (int *) MallocOrDie (sizeof(int) * nseq); |
|
528 |
+ M = 0; |
|
529 |
+ minlen = 9999999; |
|
530 |
+ for (idx = 0; idx < nseq; idx++) |
|
531 |
+ { |
|
532 |
+ rlen[idx] = strlen(rseqs[idx]); |
|
533 |
+ M += rlen[idx]; |
|
534 |
+ minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen; |
|
535 |
+ } |
|
536 |
+ M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex)))); |
|
537 |
+ M /= nseq; |
|
538 |
+ if (M > minlen) M = minlen; |
|
539 |
+ |
|
540 |
+ /* make arrays that count insertions in M+1 possible insert states |
|
541 |
+ */ |
|
542 |
+ ins = (int **) MallocOrDie (sizeof(int *) * nseq); |
|
543 |
+ master_ins = (int *) MallocOrDie (sizeof(int) * (M+1)); |
|
544 |
+ for (idx = 0; idx < nseq; idx++) |
|
545 |
+ { |
|
546 |
+ ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1)); |
|
547 |
+ for (rpos = 0; rpos <= M; rpos++) |
|
548 |
+ ins[idx][rpos] = 0; |
|
549 |
+ } |
|
550 |
+ /* normalize */ |
|
551 |
+ pop = pop / (pop+pex); |
|
552 |
+ pex = 1.0 - pop; |
|
553 |
+ /* make insertions for individual sequences */ |
|
554 |
+ for (idx = 0; idx < nseq; idx++) |
|
555 |
+ { |
|
556 |
+ apos = -1; |
|
557 |
+ for (rpos = 0; rpos < rlen[idx]-M; rpos++) |
|
558 |
+ { |
|
559 |
+ if (sre_random() < pop || apos == -1) /* open insertion */ |
|
560 |
+ apos = CHOOSE(M+1); /* choose 0..M */ |
|
561 |
+ ins[idx][apos]++; |
|
562 |
+ } |
|
563 |
+ } |
|
564 |
+ /* calculate master_ins, max inserts */ |
|
565 |
+ alen = M; |
|
566 |
+ for (apos = 0; apos <= M; apos++) |
|
567 |
+ { |
|
568 |
+ master_ins[apos] = 0; |
|
569 |
+ for (idx = 0; idx < nseq; idx++) |
|
570 |
+ if (ins[idx][apos] > master_ins[apos]) |
|
571 |
+ master_ins[apos] = ins[idx][apos]; |
|
572 |
+ alen += master_ins[apos]; |
|
573 |
+ } |
|
574 |
+ |
|
575 |
+ |
|
576 |
+ /* Now, construct alignment |
|
577 |
+ */ |
|
578 |
+ aseqs = (char **) MallocOrDie (sizeof (char *) * nseq); |
|
579 |
+ for (idx = 0; idx < nseq; idx++) |
|
580 |
+ aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); |
|
581 |
+ for (idx = 0; idx < nseq; idx++) |
|
582 |
+ { |
|
583 |
+ apos = rpos = 0; |
|
584 |
+ |
|
585 |
+ for (statepos = 0; statepos <= M; statepos++) |
|
586 |
+ { |
|
587 |
+ for (count = 0; count < ins[idx][statepos]; count++) |
|
588 |
+ aseqs[idx][apos++] = rseqs[idx][rpos++]; |
|
589 |
+ for (; count < master_ins[statepos]; count++) |
|
590 |
+ aseqs[idx][apos++] = ' '; |
|
591 |
+ |
|
592 |
+ if (statepos != M) |
|
593 |
+ aseqs[idx][apos++] = rseqs[idx][rpos++]; |
|
594 |
+ } |
|
595 |
+ aseqs[idx][alen] = '\0'; |
|
596 |
+ } |
|
597 |
+ ainfo->flags = 0; |
|
598 |
+ ainfo->alen = alen; |
|
599 |
+ ainfo->nseq = nseq; |
|
600 |
+ ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); |
|
601 |
+ for (idx = 0; idx < nseq; idx++) |
|
602 |
+ SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx])); |
|
603 |
+ |
|
604 |
+ free(rlen); |
|
605 |
+ free(master_ins); |
|
606 |
+ Free2DArray((void **) ins, nseq); |
|
607 |
+ *ret_aseqs = aseqs; |
|
608 |
+ return 1; |
|
609 |
+} |
|
610 |
+ |
|
611 |
+/* Function: AlignmentHomogenousGapsym() |
|
612 |
+ * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis] |
|
613 |
+ * |
|
614 |
+ * Purpose: Sometimes we've got to convert alignments to |
|
615 |
+ * a lowest common denominator, and we need |
|
616 |
+ * a single specific gap character -- for example, |
|
617 |
+ * PSI-BLAST blastpgp -B takes a very simplistic |
|
618 |
+ * alignment input format which appears to only |
|
619 |
+ * allow '-' as a gap symbol. |
|
620 |
+ * |
|
621 |
+ * Anything matching the isgap() macro is |
|
622 |
+ * converted. |
|
623 |
+ * |
|
624 |
+ * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1] |
|
625 |
+ * nseq - number of aligned strings |
|
626 |
+ * alen - length of alignment |
|
627 |
+ * gapsym - character to use for gaps. |
|
628 |
+ * |
|
629 |
+ * Returns: void ("never fails") |
|
630 |
+ */ |
|
631 |
+void |
|
632 |
+AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym) |
|
633 |
+{ |
|
634 |
+ int i, apos; |
|
635 |
+ |
|
636 |
+ for (i = 0; i < nseq; i++) |
|
637 |
+ for (apos = 0; apos < alen; apos++) |
|
638 |
+ if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym; |
|
639 |
+} |