msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1435 @@ |
1 |
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|
2 |
+ |
|
3 |
+/********************************************************************* |
|
4 |
+ * Clustal Omega - Multiple sequence alignment |
|
5 |
+ * |
|
6 |
+ * Copyright (C) 2010 University College Dublin |
|
7 |
+ * |
|
8 |
+ * Clustal-Omega is free software; you can redistribute it and/or |
|
9 |
+ * modify it under the terms of the GNU General Public License as |
|
10 |
+ * published by the Free Software Foundation; either version 2 of the |
|
11 |
+ * License, or (at your option) any later version. |
|
12 |
+ * |
|
13 |
+ * This file is part of Clustal-Omega. |
|
14 |
+ * |
|
15 |
+ ********************************************************************/ |
|
16 |
+ |
|
17 |
+/* |
|
18 |
+ * RCS $Id: clustal-omega.c 284 2013-06-12 10:10:11Z fabian $ |
|
19 |
+ */ |
|
20 |
+ |
|
21 |
+#ifdef HAVE_CONFIG_H |
|
22 |
+#include "config.h" |
|
23 |
+#endif |
|
24 |
+ |
|
25 |
+#include <assert.h> |
|
26 |
+ |
|
27 |
+#include "clustal-omega.h" |
|
28 |
+#include "hhalign/general.h" |
|
29 |
+#include "clustal/hhalign_wrapper.h" |
|
30 |
+ |
|
31 |
+/* The following comment block contains the frontpage/mainpage of the doxygen |
|
32 |
+ * documentation. Please add some more info. FIXME add more |
|
33 |
+ */ |
|
34 |
+ |
|
35 |
+/** |
|
36 |
+ * |
|
37 |
+ * @mainpage Clustal-Omega Documentation |
|
38 |
+ * |
|
39 |
+ * @section intro_sec Introduction |
|
40 |
+ * |
|
41 |
+ * For more information see http://www.clustal.org/ |
|
42 |
+ * |
|
43 |
+ * @section api_section API |
|
44 |
+ * |
|
45 |
+ * @subsection example_prog_subsection An Example Program |
|
46 |
+ * |
|
47 |
+ * To use libclustalo you will have to include the clustal-omega.h header and |
|
48 |
+ * link against libclustalo. For linking against libclustalo you will have to |
|
49 |
+ * use a C++ compiler, no matter if your program was written in C or C++. See |
|
50 |
+ * below (\ref pkgconfig_subsubsec)) on how to figure out compiler flags with |
|
51 |
+ * pkg-config. |
|
52 |
+ * |
|
53 |
+ * Assuming Clustal Omega was installed in system-wide default directory (e.g. |
|
54 |
+ * /usr), first compile (don't link yet) your source (for example code see |
|
55 |
+ * section \ref example_src_subsubsec) and then link against libclustalo: |
|
56 |
+ * |
|
57 |
+ * @code |
|
58 |
+ * $ gcc -c -ansi -Wall clustalo-api-test.c |
|
59 |
+ * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o -lclustalo |
|
60 |
+ * @endcode |
|
61 |
+ * |
|
62 |
+ * Voila! Now you have your own alignment program based on Clustal Omega which |
|
63 |
+ * can be run with |
|
64 |
+ * |
|
65 |
+ * @code |
|
66 |
+ * $ ./clustalo-api-test <your-sequence-input> |
|
67 |
+ * @endcode |
|
68 |
+ * |
|
69 |
+ * It's best to use the same compiler that you used for compiling libclustal. |
|
70 |
+ * If libclustal was compiled with OpenMP support, you will have to use OpenMP |
|
71 |
+ * flags for you program as well. |
|
72 |
+ * |
|
73 |
+ * |
|
74 |
+ * @subsubsection pkgconfig_subsubsec Using pkg-config / Figuring out compiler flags |
|
75 |
+ * |
|
76 |
+ * Clustal Omega comes with support for <a |
|
77 |
+ * href="http://pkg-config.freedesktop.org">pkg-config</a>, which means you |
|
78 |
+ * can run |
|
79 |
+ * |
|
80 |
+ * @code |
|
81 |
+ * $ pkg-config --cflags --libs clustalo |
|
82 |
+ * @endcode |
|
83 |
+ * |
|
84 |
+ * to figure out cflags and library flags needed to compile and link against |
|
85 |
+ * libclustalo. This is especially handy if Clustal Omega was installed to a |
|
86 |
+ * non-standard directory. |
|
87 |
+ * |
|
88 |
+ * You might have to change PKG_CONFIG_PATH. For example, if you used the prefix $HOME/local/ for |
|
89 |
+ * installation then you will first need to set PKG_CONFIG_PATH: |
|
90 |
+ * |
|
91 |
+ * @code |
|
92 |
+ * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig |
|
93 |
+ * $ pkg-config --cflags --libs clustalo |
|
94 |
+ * @endcode |
|
95 |
+ * |
|
96 |
+ * |
|
97 |
+ * To compile your source use as above but this time using proper flags: |
|
98 |
+ * |
|
99 |
+ * @code |
|
100 |
+ * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig |
|
101 |
+ * $ gcc -c -ansi -Wall $(pkg-config --cflags clustalo) clustalo-api-test.c |
|
102 |
+ * $ g++ -ansi -Wall -o clustalo-api-test $(pkg-config --libs clustalo) clustalo-api-test.o |
|
103 |
+ * @endcode |
|
104 |
+ * |
|
105 |
+ * |
|
106 |
+ * @subsubsection example_src_subsubsec Example Source Code |
|
107 |
+ * |
|
108 |
+ * @include "clustalo-api-test.c" |
|
109 |
+ * |
|
110 |
+ * |
|
111 |
+ */ |
|
112 |
+ |
|
113 |
+ |
|
114 |
+/* the following are temporary flags while the code is still under construction; |
|
115 |
+ had problems internalising hhmake, so as temporary crutch |
|
116 |
+ write alignment to file and get external hmmer/hhmake via system call |
|
117 |
+ to read alignment and convert into HMM |
|
118 |
+ All this will go, once hhmake is properly internalised */ |
|
119 |
+#define INDIRECT_HMM 0 /* temp flag: (1) write aln to file, use system(hmmer/hhmake), (0) internal hhmake */ |
|
120 |
+#define USEHMMER 1 /* temp flag: use system(hmmer) to build HMM */ |
|
121 |
+#define USEHHMAKE (!USEHMMER) /* temp flag: use system(hhmake) to build HMM */ |
|
122 |
+ |
|
123 |
+ |
|
124 |
+/* shuffle order of input sequences */ |
|
125 |
+#define SHUFFLE_INPUT_SEQ_ORDER 0 |
|
126 |
+ |
|
127 |
+/* sort input sequences by length */ |
|
128 |
+#define SORT_INPUT_SEQS 0 |
|
129 |
+ |
|
130 |
+ |
|
131 |
+int iNumberOfThreads; |
|
132 |
+ |
|
133 |
+/* broken, unused and lonely */ |
|
134 |
+static const int ITERATION_SCORE_IMPROVEMENT_THRESHOLD = 0.01; |
|
135 |
+ |
|
136 |
+ |
|
137 |
+/** |
|
138 |
+ * @brief Print Long version information to pre-allocated char. |
|
139 |
+ * |
|
140 |
+ * @note short version |
|
141 |
+ * information is equivalent to PACKAGE_VERSION |
|
142 |
+ * |
|
143 |
+ * @param[out] pcStr |
|
144 |
+ * char pointer to write to preallocated to hold iSize chars. |
|
145 |
+ * @param[in] iSize |
|
146 |
+ * size of pcStr |
|
147 |
+ */ |
|
148 |
+void |
|
149 |
+PrintLongVersion(char *pcStr, int iSize) |
|
150 |
+{ |
|
151 |
+ snprintf(pcStr, iSize, "version %s; code-name '%s'; build date %s", |
|
152 |
+ PACKAGE_VERSION, PACKAGE_CODENAME, __DATE__); |
|
153 |
+} |
|
154 |
+/* end of PrintLongVersion() */ |
|
155 |
+ |
|
156 |
+ |
|
157 |
+ |
|
158 |
+/** |
|
159 |
+ * @brief free aln opts members |
|
160 |
+ * |
|
161 |
+ */ |
|
162 |
+void |
|
163 |
+FreeAlnOpts(opts_t *prAlnOpts) { |
|
164 |
+ if (NULL != prAlnOpts->pcGuidetreeInfile) { |
|
165 |
+ CKFREE(prAlnOpts->pcGuidetreeInfile); |
|
166 |
+ } |
|
167 |
+ if (NULL != prAlnOpts->pcGuidetreeOutfile) { |
|
168 |
+ CKFREE(prAlnOpts->pcGuidetreeOutfile); |
|
169 |
+ } |
|
170 |
+ if (NULL != prAlnOpts->pcDistmatOutfile) { |
|
171 |
+ CKFREE(prAlnOpts->pcDistmatOutfile); |
|
172 |
+ } |
|
173 |
+ if (NULL != prAlnOpts->pcDistmatInfile) { |
|
174 |
+ CKFREE(prAlnOpts->pcDistmatInfile); |
|
175 |
+ } |
|
176 |
+} |
|
177 |
+/* end of FreeAlnOpts() */ |
|
178 |
+ |
|
179 |
+ |
|
180 |
+ |
|
181 |
+/** |
|
182 |
+ * @brief Sets members of given user opts struct to default values |
|
183 |
+ * |
|
184 |
+ * @param[out] prOpts |
|
185 |
+ * User opt struct to initialise |
|
186 |
+ * |
|
187 |
+ */ |
|
188 |
+void |
|
189 |
+SetDefaultAlnOpts(opts_t *prOpts) { |
|
190 |
+ prOpts->bAutoOptions = FALSE; |
|
191 |
+ |
|
192 |
+ prOpts->pcDistmatInfile = NULL; |
|
193 |
+ prOpts->pcDistmatOutfile = NULL; |
|
194 |
+ |
|
195 |
+ prOpts->iClustersizes = 100; /* FS, r274 -> */ |
|
196 |
+ prOpts->pcClustfile = NULL; /* FS, r274 -> */ |
|
197 |
+ prOpts->iClusteringType = CLUSTERING_UPGMA; |
|
198 |
+ prOpts->iPairDistType = PAIRDIST_KTUPLE; |
|
199 |
+ prOpts->bUseMbed = TRUE; /* FS, r250 -> */ |
|
200 |
+ prOpts->bUseMbedForIteration = TRUE; /* FS, r250 -> */ |
|
201 |
+ prOpts->pcGuidetreeOutfile = NULL; |
|
202 |
+ prOpts->pcGuidetreeInfile = NULL; |
|
203 |
+ prOpts->bPercID = FALSE; |
|
204 |
+ |
|
205 |
+ prOpts->ppcHMMInput = NULL; |
|
206 |
+ prOpts->iHMMInputFiles = 0; |
|
207 |
+ |
|
208 |
+ prOpts->iNumIterations = 0; |
|
209 |
+ prOpts->bIterationsAuto = FALSE; |
|
210 |
+ prOpts->iMaxGuidetreeIterations = INT_MAX; |
|
211 |
+ prOpts->iMaxHMMIterations = INT_MAX; |
|
212 |
+ |
|
213 |
+ SetDefaultHhalignPara(& prOpts->rHhalignPara); |
|
214 |
+ } |
|
215 |
+/* end of SetDefaultAlnOpts() */ |
|
216 |
+ |
|
217 |
+ |
|
218 |
+ |
|
219 |
+/** |
|
220 |
+ * @brief Check logic of parsed user options. Will exit (call Log(&rLog, LOG_FATAL, )) |
|
221 |
+ * on Fatal logic error |
|
222 |
+ * |
|
223 |
+ * @param[in] prOpts |
|
224 |
+ * Already parsed user options |
|
225 |
+ * |
|
226 |
+ */ |
|
227 |
+void |
|
228 |
+AlnOptsLogicCheck(opts_t *prOpts) |
|
229 |
+{ |
|
230 |
+ /* guide-tree & distmat |
|
231 |
+ * |
|
232 |
+ */ |
|
233 |
+ if (prOpts->pcDistmatInfile && prOpts->pcGuidetreeInfile) { |
|
234 |
+ Log(&rLog, LOG_FATAL, "Read distances *and* guide-tree from file doesn't make sense."); |
|
235 |
+ } |
|
236 |
+ |
|
237 |
+ if (prOpts->pcDistmatOutfile && prOpts->pcGuidetreeInfile) { |
|
238 |
+ Log(&rLog, LOG_FATAL, "Won't be able to save distances to file, because I got a guide-tree as input."); |
|
239 |
+ } |
|
240 |
+ |
|
241 |
+ /* combination of options that don't make sense when not iterating |
|
242 |
+ */ |
|
243 |
+ if (prOpts->iNumIterations==0 && prOpts->bIterationsAuto != TRUE) { |
|
244 |
+ |
|
245 |
+ if (prOpts->pcGuidetreeInfile && prOpts->pcGuidetreeOutfile) { |
|
246 |
+ Log(&rLog, LOG_FATAL, "Got a guide-tree as input and output which doesn't make sense when not iterating."); |
|
247 |
+ } |
|
248 |
+ /* |
|
249 |
+ if (prOpts->pcGuidetreeInfile && prOpts->bUseMbed > 0) { |
|
250 |
+ Log(&rLog, LOG_FATAL, "Got a guide-tree as input and was requested to cluster with mBed, which doesn't make sense when not iterating."); |
|
251 |
+ } |
|
252 |
+ */ |
|
253 |
+ /* |
|
254 |
+ AW: bUseMbedForIteration default since at least R252 |
|
255 |
+ if (prOpts->bUseMbedForIteration > 0) { |
|
256 |
+ Log(&rLog, LOG_FATAL, "No iteration requested, but mbed for iteration was set. Paranoia exit."); |
|
257 |
+ } |
|
258 |
+ */ |
|
259 |
+ } |
|
260 |
+ |
|
261 |
+ if (prOpts->rHhalignPara.iMacRamMB < 512) { |
|
262 |
+ Log(&rLog, LOG_WARN, "Memory for MAC Algorithm quite low, Viterbi Algorithm may be triggered."); |
|
263 |
+ if (prOpts->rHhalignPara.iMacRamMB < 1) { |
|
264 |
+ Log(&rLog, LOG_WARN, "Viterbi Algorithm always turned on, increase MAC-RAM to turn on MAC."); |
|
265 |
+ } |
|
266 |
+ } |
|
267 |
+ |
|
268 |
+ return; |
|
269 |
+} |
|
270 |
+/* end of AlnOptsLogicCheck() */ |
|
271 |
+ |
|
272 |
+ |
|
273 |
+/** |
|
274 |
+ * @brief FIXME doc |
|
275 |
+ */ |
|
276 |
+void |
|
277 |
+PrintAlnOpts(FILE *prFile, opts_t *prOpts) |
|
278 |
+{ |
|
279 |
+ int iAux; |
|
280 |
+ |
|
281 |
+ |
|
282 |
+ /* keep in same order as struct */ |
|
283 |
+ fprintf(prFile, "option: auto-options = %d\n", prOpts->bAutoOptions); |
|
284 |
+ fprintf(prFile, "option: distmat-infile = %s\n", |
|
285 |
+ NULL != prOpts->pcDistmatInfile? prOpts->pcDistmatInfile: "(null)"); |
|
286 |
+ fprintf(prFile, "option: distmat-outfile = %s\n", |
|
287 |
+ NULL != prOpts->pcDistmatOutfile? prOpts->pcDistmatOutfile: "(null)"); |
|
288 |
+ fprintf(prFile, "option: clustering-type = %d\n", prOpts->iClusteringType); |
|
289 |
+ fprintf(prFile, "option: pair-dist-type = %d\n", prOpts->iPairDistType); |
|
290 |
+ fprintf(prFile, "option: use-mbed = %d\n", prOpts->bUseMbed); |
|
291 |
+ fprintf(prFile, "option: use-mbed-for-iteration = %d\n", prOpts->bUseMbedForIteration); |
|
292 |
+ fprintf(prFile, "option: guidetree-outfile = %s\n", |
|
293 |
+ NULL != prOpts->pcGuidetreeOutfile? prOpts->pcGuidetreeOutfile: "(null)"); |
|
294 |
+ fprintf(prFile, "option: guidetree-infile = %s\n", |
|
295 |
+ NULL != prOpts->pcGuidetreeInfile? prOpts->pcGuidetreeInfile: "(null)"); |
|
296 |
+ for (iAux=0; iAux<prOpts->iHMMInputFiles; iAux++) { |
|
297 |
+ fprintf(prFile, "option: hmm-input no %d = %s\n", iAux, prOpts->ppcHMMInput[iAux]); |
|
298 |
+ } |
|
299 |
+ fprintf(prFile, "option: hmm-input-files = %d\n", prOpts->iHMMInputFiles); |
|
300 |
+ fprintf(prFile, "option: num-iterations = %d\n", prOpts->iNumIterations); |
|
301 |
+ fprintf(prFile, "option: iterations-auto = %d\n", prOpts->bIterationsAuto); |
|
302 |
+ fprintf(prFile, "option: max-hmm-iterations = %d\n", prOpts->iMaxHMMIterations); |
|
303 |
+ fprintf(prFile, "option: max-guidetree-iterations = %d\n", prOpts->iMaxGuidetreeIterations); |
|
304 |
+ fprintf(prFile, "option: iMacRamMB = %d\n", prOpts->rHhalignPara.iMacRamMB); |
|
305 |
+ fprintf(prFile, "option: percent-id = %d\n", prOpts->bPercID); |
|
306 |
+ fprintf(prFile, "option: use-kimura = %d\n", prOpts->bUseKimura); |
|
307 |
+ |
|
308 |
+} |
|
309 |
+/* end of PrintAlnOpts() */ |
|
310 |
+ |
|
311 |
+ |
|
312 |
+ |
|
313 |
+/** |
|
314 |
+ * @brief Returns major version of HMMER. Whichever hmmbuild version |
|
315 |
+ * is found first in your PATH will be used |
|
316 |
+ * |
|
317 |
+ * @return -1 on error, major hmmer version otherwise |
|
318 |
+ * |
|
319 |
+ */ |
|
320 |
+int |
|
321 |
+HmmerVersion() |
|
322 |
+{ |
|
323 |
+ char zcHmmerTestCall[] = "hmmbuild -h"; |
|
324 |
+ FILE *fp = NULL; |
|
325 |
+ int iMajorVersion = 0; |
|
326 |
+ char zcLine[16384]; |
|
327 |
+ |
|
328 |
+ if (NULL == (fp = popen(zcHmmerTestCall, "r"))) { |
|
329 |
+ Log(&rLog, LOG_ERROR, "Couldn't exec %s", zcHmmerTestCall); |
|
330 |
+ return -1; |
|
331 |
+ } |
|
332 |
+ while (fgets(zcLine, sizeof(zcLine), fp)) { |
|
333 |
+ char *pcLocate; |
|
334 |
+ if ((pcLocate = strstr(zcLine, "HMMER "))) { |
|
335 |
+ iMajorVersion = atoi(&pcLocate[6]); |
|
336 |
+ break; |
|
337 |
+ } |
|
338 |
+ } |
|
339 |
+ pclose(fp); |
|
340 |
+ |
|
341 |
+ return iMajorVersion; |
|
342 |
+} |
|
343 |
+/* end of HmmerVersion() */ |
|
344 |
+ |
|
345 |
+ |
|
346 |
+ |
|
347 |
+/** |
|
348 |
+ * @brief Create a HHM file from aligned sequences |
|
349 |
+ * |
|
350 |
+ * @warning Should be eliminated in the future |
|
351 |
+ * as building routine should not create intermediate files |
|
352 |
+ * |
|
353 |
+ * @param[in] prMSeq |
|
354 |
+ * Aligned mseq_t |
|
355 |
+ * @param[in] pcHMMOut |
|
356 |
+ * HMM output file name |
|
357 |
+ * |
|
358 |
+ * @return Non-zero on error |
|
359 |
+ * |
|
360 |
+ */ |
|
361 |
+int |
|
362 |
+AlnToHHMFile(mseq_t *prMSeq, char *pcHMMOut) |
|
363 |
+{ |
|
364 |
+ char *tmp_aln = NULL; |
|
365 |
+ int retcode = OK; |
|
366 |
+ |
|
367 |
+ int msa_c = 0; |
|
368 |
+ char **msa_v = NULL; |
|
369 |
+ |
|
370 |
+ assert(NULL!=prMSeq); |
|
371 |
+ assert(NULL!=pcHMMOut); |
|
372 |
+ |
|
373 |
+ if (FALSE == prMSeq->aligned) { |
|
374 |
+ Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM"); |
|
375 |
+ return FAILURE; |
|
376 |
+ } |
|
377 |
+ |
|
378 |
+ /* Convert alignment to a2m, and call hhmake |
|
379 |
+ * |
|
380 |
+ * can't be static templates, or mktemp fails (at least on os x |
|
381 |
+ * (with a bus error)) |
|
382 |
+ * |
|
383 |
+ * gcc says we should use mkstemp to avoid race conditions, |
|
384 |
+ * but that returns a file descriptor, which is of no use to |
|
385 |
+ * us |
|
386 |
+ */ |
|
387 |
+ /* NOTE: the following won't work on windows: missing /tmp/ */ |
|
388 |
+ tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX"); |
|
389 |
+ if (NULL == mktemp(tmp_aln)) { |
|
390 |
+ Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename"); |
|
391 |
+ retcode = FAILURE; |
|
392 |
+ goto cleanup_and_return; |
|
393 |
+ } |
|
394 |
+#define LINE_WRAP 60 |
|
395 |
+ |
|
396 |
+ if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_A2M, LINE_WRAP, FALSE, &msa_c, &msa_v)) { |
|
397 |
+ Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln); |
|
398 |
+ retcode = FAILURE; |
|
399 |
+ goto cleanup_and_return; |
|
400 |
+ } |
|
401 |
+ |
|
402 |
+ if (HHMake_Wrapper(tmp_aln, pcHMMOut)){ |
|
403 |
+ Log(&rLog, LOG_ERROR, "Could not convert alignment %s into HHM", tmp_aln); |
|
404 |
+ retcode = FAILURE; |
|
405 |
+ goto cleanup_and_return; |
|
406 |
+ } |
|
407 |
+ |
|
408 |
+ |
|
409 |
+ cleanup_and_return: |
|
410 |
+ |
|
411 |
+ if (NULL != tmp_aln) { |
|
412 |
+ if (FileExists(tmp_aln)) { |
|
413 |
+ if (remove(tmp_aln)) { |
|
414 |
+ Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln); |
|
415 |
+ } |
|
416 |
+ } |
|
417 |
+ CKFREE(tmp_aln); |
|
418 |
+ } |
|
419 |
+ |
|
420 |
+ return retcode; |
|
421 |
+ |
|
422 |
+} /* end of AlnToHHMFile() */ |
|
423 |
+ |
|
424 |
+ |
|
425 |
+ |
|
426 |
+/** |
|
427 |
+ * @brief Create a HMM file from aligned sequences |
|
428 |
+ * |
|
429 |
+ * @warning Should be replaced in the future by some internal HMM |
|
430 |
+ * building routine that does not call external programs |
|
431 |
+ * |
|
432 |
+ * @param[in] prMSeq |
|
433 |
+ * Aligned mseq_t |
|
434 |
+ * @param[in] pcHMMOut |
|
435 |
+ * HMM output file name |
|
436 |
+ * |
|
437 |
+ * @return Non-zero on error |
|
438 |
+ * |
|
439 |
+ |
|
440 |
+ */ |
|
441 |
+int |
|
442 |
+AlnToHMMFile(mseq_t *prMSeq, const char *pcHMMOut) |
|
443 |
+{ |
|
444 |
+ char *tmp_aln = NULL; |
|
445 |
+ char *tmp_hmm = NULL; /* only needed for hmmer3 to hmmer2 conversion */ |
|
446 |
+ char cmdbuf[16384]; |
|
447 |
+ int iHmmerVersion = 0; |
|
448 |
+ int retcode = OK; |
|
449 |
+ |
|
450 |
+ int msa_c = 0; |
|
451 |
+ char **msa_v = NULL; |
|
452 |
+ |
|
453 |
+ assert(NULL!=prMSeq); |
|
454 |
+ assert(NULL!=pcHMMOut); |
|
455 |
+ |
|
456 |
+ if (FALSE == prMSeq->aligned) { |
|
457 |
+ Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM"); |
|
458 |
+ return FAILURE; |
|
459 |
+ } |
|
460 |
+ |
|
461 |
+ iHmmerVersion = HmmerVersion(); |
|
462 |
+ if (2 != iHmmerVersion && 3 != iHmmerVersion) { |
|
463 |
+ Log(&rLog, LOG_ERROR, "Could not find suitable HMMER binaries"); |
|
464 |
+ return FAILURE; |
|
465 |
+ } |
|
466 |
+ |
|
467 |
+ /* Convert alignment to stockholm, call hmmbuild and then |
|
468 |
+ * either hmmconvert (hmmer3) or hmmcalibrate (hmmer2) |
|
469 |
+ * |
|
470 |
+ * can't be static templates, or mktemp fails (at least on os x |
|
471 |
+ * (with a bus error)) |
|
472 |
+ * |
|
473 |
+ * gcc says we should use mkstemp to avoid race conditions, |
|
474 |
+ * but that returns a file descriptor, which is of no use to |
|
475 |
+ * us |
|
476 |
+ */ |
|
477 |
+ /* NOTE: the following won't work on windows: missing /tmp/ */ |
|
478 |
+ tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX"); |
|
479 |
+ if (NULL == mktemp(tmp_aln)) { |
|
480 |
+ Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename"); |
|
481 |
+ retcode = FAILURE; |
|
482 |
+ goto cleanup_and_return; |
|
483 |
+ } |
|
484 |
+#define LINE_WRAP 60 |
|
485 |
+ if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_STOCKHOLM, LINE_WRAP, FALSE, &msa_c, &msa_v)) { |
|
486 |
+ Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln); |
|
487 |
+ retcode = FAILURE; |
|
488 |
+ goto cleanup_and_return; |
|
489 |
+ } |
|
490 |
+ |
|
491 |
+ if (2 == iHmmerVersion) { |
|
492 |
+ sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmcalibrate %s >/dev/null", |
|
493 |
+ pcHMMOut, tmp_aln, pcHMMOut); |
|
494 |
+ if (system(cmdbuf)) { |
|
495 |
+ Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf); |
|
496 |
+ retcode = FAILURE; |
|
497 |
+ goto cleanup_and_return; |
|
498 |
+ } |
|
499 |
+ } else if (3 == iHmmerVersion) { |
|
500 |
+ /* NOTE: the following won't work on windows: missing /tmp/ */ |
|
501 |
+ tmp_hmm = CkStrdup("/tmp/clustalo_tmphmm2_XXXXXX"); |
|
502 |
+ if (NULL == mktemp(tmp_hmm)) { |
|
503 |
+ Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename"); |
|
504 |
+ retcode = FAILURE; |
|
505 |
+ goto cleanup_and_return; |
|
506 |
+ } |
|
507 |
+ sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmconvert -2 %s > %s", |
|
508 |
+ tmp_hmm, tmp_aln, tmp_hmm, pcHMMOut); |
|
509 |
+ if (system(cmdbuf)) { |
|
510 |
+ Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf); |
|
511 |
+ retcode = FAILURE; |
|
512 |
+ goto cleanup_and_return; |
|
513 |
+ } |
|
514 |
+ } else { |
|
515 |
+ CKFREE(tmp_aln); |
|
516 |
+ Log(&rLog, LOG_FATAL, "Internal error: Unknown Hmmer version %d", iHmmerVersion); |
|
517 |
+ } |
|
518 |
+ |
|
519 |
+ |
|
520 |
+ cleanup_and_return: |
|
521 |
+ |
|
522 |
+ if (NULL != tmp_aln) { |
|
523 |
+ if (FileExists(tmp_aln)) { |
|
524 |
+ if (remove(tmp_aln)) { |
|
525 |
+ Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln); |
|
526 |
+ } |
|
527 |
+ } |
|
528 |
+ CKFREE(tmp_aln); |
|
529 |
+ } |
|
530 |
+ if (NULL != tmp_hmm) { |
|
531 |
+ if (FileExists(tmp_hmm)) { |
|
532 |
+ if (remove(tmp_hmm)) { |
|
533 |
+ Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_hmm); |
|
534 |
+ } |
|
535 |
+ } |
|
536 |
+ CKFREE(tmp_hmm); |
|
537 |
+ } |
|
538 |
+ |
|
539 |
+ return retcode; |
|
540 |
+} |
|
541 |
+/* end of AlnToHMMFile() */ |
|
542 |
+ |
|
543 |
+ |
|
544 |
+ |
|
545 |
+/** |
|
546 |
+ * @brief Convert a multiple sequence structure into a HMM |
|
547 |
+ * |
|
548 |
+ * @param[out] prHMM |
|
549 |
+ * Pointer to preallocted HMM which will be set here |
|
550 |
+ * @param[in] prMSeq |
|
551 |
+ * Pointer to an alignment |
|
552 |
+ * |
|
553 |
+ * @return 0 on error, non-0 otherwise |
|
554 |
+ * |
|
555 |
+ * @see AlnToHMMFile() |
|
556 |
+ * |
|
557 |
+ */ |
|
558 |
+int |
|
559 |
+AlnToHMM(hmm_light *prHMM, mseq_t *prMSeq) |
|
560 |
+{ |
|
561 |
+ char *pcHMM; /* temp hmm file */ |
|
562 |
+ |
|
563 |
+ Log(&rLog, LOG_INFO, |
|
564 |
+ "Using HMMER version %d to calculate a new HMM.", |
|
565 |
+ HmmerVersion()); |
|
566 |
+ /* FIXME replace all this with internal HMM computation (HHmake) */ |
|
567 |
+ |
|
568 |
+ /** |
|
569 |
+ * @warning the following probably won't work on windows: missing |
|
570 |
+ * /tmp/. Should be ok on Cygwin though |
|
571 |
+ */ |
|
572 |
+ pcHMM = CkStrdup("/tmp/clustalo-hmm-iter_XXXXXX"); |
|
573 |
+ if (NULL == mktemp(pcHMM)) { |
|
574 |
+ Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename"); |
|
575 |
+ CKFREE(pcHMM); |
|
576 |
+ return FAILURE; |
|
577 |
+ } |
|
578 |
+ |
|
579 |
+ /* Create a HMM representing the current alignment |
|
580 |
+ */ |
|
581 |
+#if USEHMMER |
|
582 |
+ if (AlnToHMMFile(prMSeq, pcHMM)) { |
|
583 |
+ Log(&rLog, LOG_ERROR, "AlnToHMMFile() on %s failed.", pcHMM); |
|
584 |
+ CKFREE(pcHMM); |
|
585 |
+ return FAILURE; |
|
586 |
+ } |
|
587 |
+#elif USEHHMAKE |
|
588 |
+ if (AlnToHHMFile(prMSeq, pcHMM)) { |
|
589 |
+ Log(&rLog, LOG_ERROR, "AlnToHHMFile() on %s failed.", pcHMM); |
|
590 |
+ CKFREE(pcHMM); |
|
591 |
+ return FAILURE; |
|
592 |
+ } |
|
593 |
+ /* Log(&rLog, LOG_FATAL, "Method to create HHM (HMM using hhmake) not installed yet"); */ |
|
594 |
+#else |
|
595 |
+ Log(&rLog, LOG_FATAL, "Unknown method to create temporary HMM"); |
|
596 |
+#endif |
|
597 |
+ |
|
598 |
+ /* Read HMM information |
|
599 |
+ */ |
|
600 |
+ if (OK != readHMMWrapper(prHMM, pcHMM)){ |
|
601 |
+ Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMM); |
|
602 |
+ CKFREE(pcHMM); |
|
603 |
+ return FAILURE; |
|
604 |
+ } |
|
605 |
+ |
|
606 |
+ if (remove(pcHMM)) { |
|
607 |
+ Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", pcHMM); |
|
608 |
+ } |
|
609 |
+ CKFREE(pcHMM); |
|
610 |
+ |
|
611 |
+ return OK; |
|
612 |
+} |
|
613 |
+/* end of AlnToHMM() */ |
|
614 |
+ |
|
615 |
+ |
|
616 |
+ |
|
617 |
+/** |
|
618 |
+ * @brief FIXME |
|
619 |
+ * |
|
620 |
+ */ |
|
621 |
+void |
|
622 |
+InitClustalOmega(int iNumThreadsRequested) |
|
623 |
+{ |
|
624 |
+ |
|
625 |
+#ifdef WIN32 |
|
626 |
+ if (iNumThreadsRequested>1) { |
|
627 |
+ Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.", |
|
628 |
+ iNumThreadsRequested, PACKAGE_NAME); |
|
629 |
+ } |
|
630 |
+ iNumberOfThreads = 1; /* need to set this, even if build without support */ |
|
631 |
+#elif HAVE_OPENMP |
|
632 |
+ iNumberOfThreads = iNumThreadsRequested; |
|
633 |
+ omp_set_num_threads(iNumberOfThreads); |
|
634 |
+#else |
|
635 |
+ if (iNumThreadsRequested>1) { |
|
636 |
+ Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.", |
|
637 |
+ iNumThreadsRequested, PACKAGE_NAME); |
|
638 |
+ } |
|
639 |
+ iNumberOfThreads = 1; /* need to set this, even if build without support */ |
|
640 |
+#endif |
|
641 |
+ |
|
642 |
+ Log(&rLog, LOG_INFO, "Using %d threads", |
|
643 |
+ iNumberOfThreads); |
|
644 |
+ |
|
645 |
+} |
|
646 |
+/* end of InitClustalOmega() */ |
|
647 |
+ |
|
648 |
+ |
|
649 |
+ |
|
650 |
+/** |
|
651 |
+ * @brief Defines an alignment order, which adds sequences |
|
652 |
+ * sequentially, i.e. one at a time starting with seq 1 & 2 |
|
653 |
+ * |
|
654 |
+ * @param[out] piOrderLR_p |
|
655 |
+ * order in which nodes/profiles are to be merged/aligned |
|
656 |
+ * @param[in] iNumSeq |
|
657 |
+ * Number of sequences |
|
658 |
+ * |
|
659 |
+ * @see TraverseTree() |
|
660 |
+ * |
|
661 |
+ */ |
|
662 |
+void |
|
663 |
+SequentialAlignmentOrder(int **piOrderLR_p, int iNumSeq) |
|
664 |
+{ |
|
665 |
+ unsigned int uNodes = iNumSeq*2-1; |
|
666 |
+ unsigned int uNodeCounter = 0; |
|
667 |
+ unsigned int uSeqCounter = 0; |
|
668 |
+ |
|
669 |
+ Log(&rLog, LOG_FATAL, "FIXME: Untested..."); |
|
670 |
+ |
|
671 |
+ (*piOrderLR_p) = (int *)CKCALLOC(DIFF_NODE * uNodes, sizeof(int)); |
|
672 |
+ /* loop over merge nodes, which have per definition even indices |
|
673 |
+ * and set up children which have odd indices |
|
674 |
+ */ |
|
675 |
+ uSeqCounter = 0; |
|
676 |
+ for (uNodeCounter=iNumSeq; uNodeCounter<uNodes; uNodeCounter+=1) { |
|
677 |
+ unsigned int uLeftChildNodeIndex = uNodeCounter-1; |
|
678 |
+ unsigned int uRightChildNodeIndex = uNodeCounter-iNumSeq+1; |
|
679 |
+ unsigned int uParentNodeIndex = uNodeCounter+1; |
|
680 |
+ |
|
681 |
+ /* merge node setup */ |
|
682 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = uLeftChildNodeIndex; |
|
683 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE] = uRightChildNodeIndex; |
|
684 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE] = uParentNodeIndex; |
|
685 |
+ /* only setup left child if at first merge node, all other left childs |
|
686 |
+ * should be merge nodes that are already set up. also correct |
|
687 |
+ * left node number here. |
|
688 |
+ */ |
|
689 |
+ if (uNodeCounter==iNumSeq) { |
|
690 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = 0; |
|
691 |
+ |
|
692 |
+ (*piOrderLR_p)[0+LEFT_NODE] = 0; |
|
693 |
+ (*piOrderLR_p)[0+RGHT_NODE] = 0; |
|
694 |
+ (*piOrderLR_p)[0+PRNT_NODE] = uNodeCounter; |
|
695 |
+ uSeqCounter++; |
|
696 |
+ |
|
697 |
+ Log(&rLog, LOG_FORCED_DEBUG, "Set up first leaf with node counter %d: left=%d right=%d parent=%d", |
|
698 |
+ 0, |
|
699 |
+ (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+LEFT_NODE], |
|
700 |
+ (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+RGHT_NODE], |
|
701 |
+ (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+PRNT_NODE]); |
|
702 |
+ } |
|
703 |
+ Log(&rLog, LOG_FORCED_DEBUG, "Set up merge node with node counter %d: left=%d right=%d parent=%d", |
|
704 |
+ uNodeCounter, (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE], |
|
705 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE], |
|
706 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE]); |
|
707 |
+ |
|
708 |
+ /* right child */ |
|
709 |
+ (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE] = uSeqCounter; |
|
710 |
+ (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE] = uSeqCounter; |
|
711 |
+ (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE] = uNodeCounter; |
|
712 |
+ uSeqCounter++; |
|
713 |
+ |
|
714 |
+ Log(&rLog, LOG_FORCED_DEBUG, "Set up leaf with node counter %d: left=%d right=%d parent=%d", |
|
715 |
+ uRightChildNodeIndex, (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE], |
|
716 |
+ (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE], |
|
717 |
+ (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE]); |
|
718 |
+ } |
|
719 |
+} |
|
720 |
+/* end of SequentialAlignmentOrder() */ |
|
721 |
+ |
|
722 |
+ |
|
723 |
+ |
|
724 |
+/** |
|
725 |
+ * @brief Defines the alignment order by calculating a guide tree. In |
|
726 |
+ * a first-step pairwise distances will be calculated (or read from a |
|
727 |
+ * file). In a second step those distances will be clustered and a |
|
728 |
+ * guide-tree created. Steps 1 and 2 will be skipped if a guide-tree |
|
729 |
+ * file was given, in which case the guide-tree will be just read from |
|
730 |
+ * the file. |
|
731 |
+ * |
|
732 |
+ * @param[out] piOrderLR_p |
|
733 |
+ * order in which nodes/profiles are to be merged/aligned |
|
734 |
+ * @param[out] pdSeqWeights_p |
|
735 |
+ * Sequence weights |
|
736 |
+ * @param[out] pdSeqWeights_p |
|
737 |
+ * Sequence weights |
|
738 |
+ * @param[in] prMSeq |
|
739 |
+ * The sequences from which the alignment order is to be calculated |
|
740 |
+ * @param[in] iPairDistType |
|
741 |
+ * Method of pairwise distance comparison |
|
742 |
+ * @param[in] pcDistmatInfile |
|
743 |
+ * If not NULL distances will be read from this file instead of being |
|
744 |
+ * calculated |
|
745 |
+ * @param[in] pcDistmatOutfile |
|
746 |
+ * If not NULL computed pairwise distances will be written to this file |
|
747 |
+ * @param[in] iClusteringType |
|
748 |
+ * Clustering method to be used to cluster the pairwise distances |
|
749 |
+ * @param[in] pcGuidetreeInfile |
|
750 |
+ * If not NULL guidetree will be read from this file. Skips pairwise |
|
751 |
+ * distance and guidetree computation |
|
752 |
+ * @param[in] pcGuidetreeOutfile |
|
753 |
+ * If not NULL computed guidetree will be written to this file |
|
754 |
+ * @param[in] bUseMbed |
|
755 |
+ * If TRUE, fast mBed guidetree computation will be employed |
|
756 |
+ * |
|
757 |
+ * @return Non-zero on error |
|
758 |
+ * |
|
759 |
+ */ |
|
760 |
+int |
|
761 |
+AlignmentOrder(int **piOrderLR_p, double **pdSeqWeights_p, mseq_t *prMSeq, |
|
762 |
+ int iPairDistType, char *pcDistmatInfile, char *pcDistmatOutfile, |
|
763 |
+ int iClusteringType, int iClustersizes, |
|
764 |
+ char *pcGuidetreeInfile, char *pcGuidetreeOutfile, char *pcClusterFile, |
|
765 |
+ bool bUseMbed, bool bPercID) |
|
766 |
+{ |
|
767 |
+ /* pairwise distance matrix (tmat in 1.83) */ |
|
768 |
+ symmatrix_t *distmat = NULL; |
|
769 |
+ /* guide tree */ |
|
770 |
+ tree_t *prTree = NULL; |
|
771 |
+ int i = 0; |
|
772 |
+ |
|
773 |
+ |
|
774 |
+ /* Shortcut for only two sequences: Do not compute k-tuple |
|
775 |
+ * distances. Use the same logic as in TraverseTree() to setup |
|
776 |
+ * piOrderLR_p. Changes there will have to be reflected here as |
|
777 |
+ * well. */ |
|
778 |
+ if (2==prMSeq->nseqs) { |
|
779 |
+ Log(&rLog, LOG_VERBOSE, |
|
780 |
+ "Have only two sequences: No need to compute pairwise score and compute a tree."); |
|
781 |
+ |
|
782 |
+ if (NULL != pcDistmatOutfile){ |
|
783 |
+ Log(&rLog, LOG_WARN, "Have only two sequences: Will not calculate/print distance matrix."); |
|
784 |
+ } |
|
785 |
+ |
|
786 |
+ (*piOrderLR_p) = (int*) CKMALLOC(DIFF_NODE * 3 * sizeof(int)); |
|
787 |
+ (*piOrderLR_p)[DIFF_NODE*0+LEFT_NODE] = 0; |
|
788 |
+ (*piOrderLR_p)[DIFF_NODE*0+RGHT_NODE] = 0; |
|
789 |
+ (*piOrderLR_p)[DIFF_NODE*0+PRNT_NODE] = 0; |
|
790 |
+ |
|
791 |
+ (*piOrderLR_p)[DIFF_NODE*1+LEFT_NODE] = 1; |
|
792 |
+ (*piOrderLR_p)[DIFF_NODE*1+RGHT_NODE] = 1; |
|
793 |
+ (*piOrderLR_p)[DIFF_NODE*1+PRNT_NODE] = 1; |
|
794 |
+ |
|
795 |
+ /* root */ |
|
796 |
+ (*piOrderLR_p)[DIFF_NODE*2+LEFT_NODE] = 0; |
|
797 |
+ (*piOrderLR_p)[DIFF_NODE*2+RGHT_NODE] = 1; |
|
798 |
+ (*piOrderLR_p)[DIFF_NODE*2+PRNT_NODE] = 2; |
|
799 |
+ |
|
800 |
+ /* Same logic as CalcClustalWeights(). Changes there will |
|
801 |
+ have to be reflected here as well. */ |
|
802 |
+#if USE_WEIGHTS |
|
803 |
+ (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double)); |
|
804 |
+ (*pdWeights_p)[0] = 0.5; |
|
805 |
+ (*pdWeights_p)[1] = 0.5; |
|
806 |
+#endif |
|
807 |
+ |
|
808 |
+ return OK; |
|
809 |
+ } |
|
810 |
+ |
|
811 |
+ |
|
812 |
+ /* compute distance & guide tree, alternatively read distances or |
|
813 |
+ * guide tree from file |
|
814 |
+ * |
|
815 |
+ */ |
|
816 |
+ if (NULL != pcGuidetreeInfile) { |
|
817 |
+ Log(&rLog, LOG_INFO, "Reading guide-tree from %s", pcGuidetreeInfile); |
|
818 |
+ if (GuideTreeFromFile(&prTree, prMSeq, pcGuidetreeInfile)) { |
|
819 |
+ Log(&rLog, LOG_ERROR, "Reading of guide tree %s failed.", pcGuidetreeInfile); |
|
820 |
+ return FAILURE; |
|
821 |
+ } |
|
822 |
+ |
|
823 |
+ } else { |
|
824 |
+ |
|
825 |
+ if (bUseMbed) { |
|
826 |
+ if (Mbed(&prTree, prMSeq, iPairDistType, pcGuidetreeOutfile, iClustersizes, pcClusterFile)) { |
|
827 |
+ Log(&rLog, LOG_ERROR, "mbed execution failed."); |
|
828 |
+ return FAILURE; |
|
829 |
+ } |
|
830 |
+ Log(&rLog, LOG_INFO, "Guide-tree computation (mBed) done."); |
|
831 |
+ if (NULL != pcDistmatOutfile) { |
|
832 |
+ Log(&rLog, LOG_INFO, |
|
833 |
+ "Ignoring request to write distance matrix (am in mBed mode)"); |
|
834 |
+ } |
|
835 |
+ } else { |
|
836 |
+ |
|
837 |
+ if (PairDistances(&distmat, prMSeq, iPairDistType, bPercID, |
|
838 |
+ 0, prMSeq->nseqs, 0, prMSeq->nseqs, |
|
839 |
+ pcDistmatInfile, pcDistmatOutfile)) { |
|
840 |
+ Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); |
|
841 |
+ return FAILURE; |
|
842 |
+ } |
|
843 |
+ |
|
844 |
+ /* clustering of distances to get guide tree |
|
845 |
+ */ |
|
846 |
+ if (CLUSTERING_UPGMA == iClusteringType) { |
|
847 |
+ char **labels; |
|
848 |
+ labels = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char*)); |
|
849 |
+ for (i=0; i<prMSeq->nseqs; i++) { |
|
850 |
+ labels[i] = prMSeq->sqinfo[i].name; |
|
851 |
+ } |
|
852 |
+ |
|
853 |
+ GuideTreeUpgma(&prTree, labels, distmat, pcGuidetreeOutfile); |
|
854 |
+ Log(&rLog, LOG_INFO, "Guide-tree computation done."); |
|
855 |
+ |
|
856 |
+ CKFREE(labels); |
|
857 |
+ } else { |
|
858 |
+ Log(&rLog, LOG_FATAL, "INTERNAL ERROR %s", |
|
859 |
+ "clustering method should have been checked before"); |
|
860 |
+ } |
|
861 |
+ } /* did not use mBed */ |
|
862 |
+ } /* had to calculate tree (did not read from file) */ |
|
863 |
+ |
|
864 |
+ |
|
865 |
+#if USE_WEIGHTS |
|
866 |
+ /* derive sequence weights from tree |
|
867 |
+ * |
|
868 |
+ */ |
|
869 |
+ Log(&rLog, LOG_INFO, "Calculating sequence weights"); |
|
870 |
+ CalcClustalWeights(pdSeqWeights_p, prTree); |
|
871 |
+ for (i = 0; i < GetLeafCount(prTree); i++) { |
|
872 |
+ Log(&rLog, LOG_VERBOSE, |
|
873 |
+ "Weight for seq no %d: %s = %f", |
|
874 |
+ i, prMSeq->sqinfo[i].name, (*pdSeqWeights_p)[i]); |
|
875 |
+ } |
|
876 |
+#else |
|
877 |
+ Log(&rLog, LOG_DEBUG, "Not using weights"); |
|
878 |
+#endif |
|
879 |
+ |
|
880 |
+ |
|
881 |
+ /* define traversing order of tree |
|
882 |
+ * |
|
883 |
+ */ |
|
884 |
+ TraverseTree(piOrderLR_p, prTree, prMSeq); |
|
885 |
+ if (rLog.iLogLevelEnabled <= LOG_DEBUG) { |
|
886 |
+ /* FIXME: debug only, FS */ |
|
887 |
+ uint uNodeIndex; |
|
888 |
+ FILE *fp = LogGetFP(&rLog, LOG_INFO); |
|
889 |
+ Log(&rLog, LOG_DEBUG, "left/right order after tree traversal"); |
|
890 |
+ for (uNodeIndex = 0; uNodeIndex < GetNodeCount(prTree); uNodeIndex++) { |
|
891 |
+ fprintf(fp, "%3d:\t%2d/%2d -> %d\n", i, |
|
892 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeIndex+LEFT_NODE], |
|
893 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeIndex+RGHT_NODE], |
|
894 |
+ (*piOrderLR_p)[DIFF_NODE*uNodeIndex+PRNT_NODE]); |
|
895 |
+ } |
|
896 |
+ } |
|
897 |
+ |
|
898 |
+ FreeMuscleTree(prTree); |
|
899 |
+ FreeSymMatrix(&distmat); |
|
900 |
+ |
|
901 |
+#if 0 |
|
902 |
+ Log(&rLog, LOG_FATAL, "DEBUG EXIT before leaving %s", __FUNCTION__); |
|
903 |
+#endif |
|
904 |
+ return OK; |
|
905 |
+} |
|
906 |
+/* end of AlignmentOrder() */ |
|
907 |
+ |
|
908 |
+ |
|
909 |
+ |
|
910 |
+/** |
|
911 |
+ * @brief Set some options automatically based on number of sequences. Might |
|
912 |
+ * overwrite some user-set options. |
|
913 |
+ * |
|
914 |
+ * @param[out] prOpts |
|
915 |
+ * Pointer to alignment options structure |
|
916 |
+ * @param[in] iNumSeq |
|
917 |
+ * Number of sequences to align |
|
918 |
+ */ |
|
919 |
+void |
|
920 |
+SetAutoOptions(opts_t *prOpts, int iNumSeq) { |
|
921 |
+ |
|
922 |
+ Log(&rLog, LOG_INFO, |
|
923 |
+ "Setting options automatically based on input sequence characteristics (might overwrite some of your options)."); |
|
924 |
+ |
|
925 |
+ /* AW: new version of mbed is always good (uses subclusters) */ |
|
926 |
+ if (FALSE == prOpts->bUseMbed) { |
|
927 |
+ Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed."); |
|
928 |
+ prOpts->bUseMbed = TRUE; |
|
929 |
+ } |
|
930 |
+ |
|
931 |
+ if (iNumSeq >= 1000) { |
|
932 |
+ if (0 != prOpts->iNumIterations) { |
|
933 |
+ Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations."); |
|
934 |
+ prOpts->iNumIterations = 0; |
|
935 |
+ } |
|
936 |
+ |
|
937 |
+ } else if (iNumSeq < 1000) { |
|
938 |
+ if (1 != prOpts->iNumIterations) { |
|
939 |
+ Log(&rLog, LOG_INFO, "Auto settings: Setting iteration to 1."); |
|
940 |
+ prOpts->iNumIterations = 1; |
|
941 |
+ } |
|
942 |
+ } |
|
943 |
+} |
|
944 |
+/* end of */ |
|
945 |
+ |
|
946 |
+ |
|
947 |
+ |
|
948 |
+/** |
|
949 |
+ * @brief The main alignment function which wraps everything else. |
|
950 |
+ * |
|
951 |
+ * @param[out] prMSeq |
|
952 |
+ * *the* multiple sequences structure |
|
953 |
+ * @param[in] prMSeqProfile |
|
954 |
+ * optional profile to align against |
|
955 |
+ * @param[in] prOpts |
|
956 |
+ * alignment options to use |
|
957 |
+ * |
|
958 |
+ * @return 0 on success, -1 on failure |
|
959 |
+ * |
|
960 |
+ */ |
|
961 |
+int |
|
962 |
+Align(mseq_t *prMSeq, |
|
963 |
+ mseq_t *prMSeqProfile, |
|
964 |
+ opts_t *prOpts) { |
|
965 |
+ |
|
966 |
+ /* HMM |
|
967 |
+ */ |
|
968 |
+ /* structs with pseudocounts etc; one for each HMM infile, i.e. |
|
969 |
+ * index range: 0..iHMMInputFiles */ |
|
970 |
+ hmm_light *prHMMs = NULL; |
|
971 |
+ |
|
972 |
+ /* MSA order in which nodes/profiles are to be merged/aligned |
|
973 |
+ (order of nodes in guide tree (left/right)*/ |
|
974 |
+ int *piOrderLR = NULL; |
|
975 |
+ |
|
976 |
+ /* weights per sequence */ |
|
977 |
+ double *pdSeqWeights = NULL; |
|
978 |
+ |
|
979 |
+ /* Iteration |
|
980 |
+ */ |
|
981 |
+ int iIterationCounter = 0; |
|
982 |
+ double dAlnScore; |
|
983 |
+ /* last dAlnScore for iteration */ |
|
984 |
+ double dLastAlnScore = -666.666; |
|
985 |
+ |
|
986 |
+ int i, j; /* aux */ |
|
987 |
+ |
|
988 |
+ assert(NULL != prMSeq); |
|
989 |
+ if (NULL != prMSeqProfile) { |
|
990 |
+ assert(TRUE == prMSeqProfile->aligned); |
|
991 |
+ } |
|
992 |
+ |
|
993 |
+ |
|
994 |
+ /* automatic setting of options |
|
995 |
+ * |
|
996 |
+ */ |
|
997 |
+ if (prOpts->bAutoOptions) { |
|
998 |
+ SetAutoOptions(prOpts, prMSeq->nseqs); |
|
999 |
+ } |
|
1000 |
+ |
|
1001 |
+ |
|
1002 |
+#if SHUFFLE_INPUT_SEQ_ORDER |
|
1003 |
+ /* |
|
1004 |
+ * shuffle input: only useful for testing/debugging |
|
1005 |
+ */ |
|
1006 |
+ Log(&rLog, LOG_WARN, "Shuffling input sequences! (Will also change output order)"); |
|
1007 |
+ ShuffleMSeq(prMSeq); |
|
1008 |
+#endif |
|
1009 |
+ |
|
1010 |
+ |
|
1011 |
+#if SORT_INPUT_SEQS |
|
1012 |
+ /* |
|
1013 |
+ * sort input: |
|
1014 |
+ * |
|
1015 |
+ * would ensure we *always* (unless we get into the mbed k-means stage) |
|
1016 |
+ * get the same answer. usually you don't, because most pairwise alignment |
|
1017 |
+ * scores are in theory not symmetric, therefore sequence ordering might |
|
1018 |
+ * have an effect on the guide-tree. Sorting by length should get rid of |
|
1019 |
+ * this (and takes no time even for 100k seqs). Benchmark results on |
|
1020 |
+ * Balibase show almost no difference after sorting. |
|
1021 |
+ */ |
|
1022 |
+ Log(&rLog, LOG_WARN, "Sorting input seq by length! This will also change the output order"); |
|
1023 |
+ SortMSeqByLength(prMSeq, 'd'); |
|
1024 |
+ |
|
1025 |
+#endif |
|
1026 |
+ |
|
1027 |
+ |
|
1028 |
+ /* Read backgrounds HMMs and store in prHMMs |
|
1029 |
+ * |
|
1030 |
+ */ |
|
1031 |
+ if (0 < prOpts->iHMMInputFiles) { |
|
1032 |
+ int iHMMInfileIndex; |
|
1033 |
+ |
|
1034 |
+ /** |
|
1035 |
+ * @warning old structure used to be initialised like this: |
|
1036 |
+ * hmm_light rHMM = {0}; |
|
1037 |
+ */ |
|
1038 |
+ prHMMs = (hmm_light *) CKMALLOC(prOpts->iHMMInputFiles * sizeof(hmm_light)); |
|
1039 |
+ |
|
1040 |
+ for (iHMMInfileIndex=0; iHMMInfileIndex<prOpts->iHMMInputFiles; iHMMInfileIndex++) { |
|
1041 |
+ char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex]; |
|
1042 |
+ if (OK != readHMMWrapper(&prHMMs[iHMMInfileIndex], pcHMMInput)){ |
|
1043 |
+ Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput); |
|
1044 |
+ return -1; |
|
1045 |
+ } |
|
1046 |
+ |
|
1047 |
+#if 0 |
|
1048 |
+ Log(&rLog, LOG_FORCED_DEBUG, "HMM length is %d", prHMMs[iHMMInfileIndex].L); |
|
1049 |
+ Log(&rLog, LOG_FORCED_DEBUG, "n-display is %d", prHMMs[iHMMInfileIndex].n_display); |
|
1050 |
+ for (i = 0; NULL != prHMMs[prOpts->iHMMInputFiles].seq[i]; i++){ |
|
1051 |
+ printf("seq[%d]: %s\n", i, prHMMs[iHMMInfileIndex].seq[i]); |
|
1052 |
+ } |
|
1053 |
+ Log(&rLog, LOG_FORCED_DEBUG, "Neff_HMM is %f", prHMMs[iHMMInfileIndex].Neff_HMM); |
|
1054 |
+#endif |
|
1055 |
+ if (rLog.iLogLevelEnabled <= LOG_DEBUG){ |
|
1056 |
+ Log(&rLog, LOG_DEBUG, "print frequencies"); |
|
1057 |
+ for (i = 0; i < prHMMs[iHMMInfileIndex].L; i++){ |
|
1058 |
+#define PRINT_TAIL 5 |
|
1059 |
+ if ( (PRINT_TAIL+1 == i) && (prHMMs[iHMMInfileIndex].L-PRINT_TAIL != i) ){ |
|
1060 |
+ printf("....\n"); |
|
1061 |
+ } |
|
1062 |
+ if ( (i > PRINT_TAIL) && (i < prHMMs[iHMMInfileIndex].L-PRINT_TAIL) ){ |
|
1063 |
+ continue; |
|
1064 |
+ } |
|
1065 |
+ printf("%3d:", i); |
|
1066 |
+ for (j = 0; j < 20; j++){ |
|
1067 |
+ printf("\t%1.3f", prHMMs[iHMMInfileIndex].f[i][j]); |
|
1068 |
+ } |
|
1069 |
+ printf("\n"); |
|
1070 |
+ } |
|
1071 |
+ } /* debug print block */ |
|
1072 |
+ |
|
1073 |
+ CKFREE(prOpts->ppcHMMInput[iHMMInfileIndex]); |
|
1074 |
+ } /* for each background HMM file */ |
|
1075 |
+ CKFREE(prOpts->ppcHMMInput); |
|
1076 |
+ } /* there were background HMM files */ |
|
1077 |
+ |
|
1078 |
+ |
|
1079 |
+ |
|
1080 |
+ /* If the input ("non-profile") sequences are aligned, then turn |
|
1081 |
+ * the alignment into a HMM and add to the list of background HMMs |
|
1082 |
+ * |
|
1083 |
+ */ |
|
1084 |
+ if (TRUE == prMSeq->aligned) { |
|
1085 |
+ /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */ |
|
1086 |
+ hmm_light rHMMLocal = {0}; |
|
1087 |
+ |
|
1088 |
+ Log(&rLog, LOG_INFO, |
|
1089 |
+ "Input sequences are aligned. Will turn alignment into HMM and add it to the user provided background HMMs."); |
|
1090 |
+ /* certain gap parameters ('~' MSF) cause problems, |
|
1091 |
+ sanitise them; FS, r258 -> r259 */ |
|
1092 |
+ SanitiseUnknown(prMSeq); |
|
1093 |
+ if (OK != |
|
1094 |
+#if INDIRECT_HMM |
|
1095 |
+ AlnToHMM(&rHMMLocal, prMSeq) |
|
1096 |
+#else |
|
1097 |
+ AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs) |
|
1098 |
+#endif |
|
1099 |
+ ) { |
|
1100 |
+ Log(&rLog, LOG_ERROR, "Couldn't convert aligned input sequences to HMM. Will try to continue"); |
|
1101 |
+ } else { |
|
1102 |
+ prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light))); |
|
1103 |
+ memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light)); |
|
1104 |
+ prOpts->iHMMInputFiles++; |
|
1105 |
+ } |
|
1106 |
+ } |
|
1107 |
+ |
|
1108 |
+ |
|
1109 |
+ /* If we have a profile turn it into a HMM and add to |
|
1110 |
+ * the list of background HMMs. |
|
1111 |
+ * |
|
1112 |
+ */ |
|
1113 |
+ if (NULL != prMSeqProfile) { |
|
1114 |
+ /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */ |
|
1115 |
+ hmm_light rHMMLocal = {0}; |
|
1116 |
+ Log(&rLog, LOG_INFO, |
|
1117 |
+ "Turning profile1 into HMM and will use it during progressive alignment."); |
|
1118 |
+ if (OK != |
|
1119 |
+#if INDIRECT_HMM |
|
1120 |
+ AlnToHMM(&rHMMLocal, prMSeqProfile) |
|
1121 |
+#else |
|
1122 |
+ AlnToHMM2(&rHMMLocal, prMSeqProfile->seq, prMSeqProfile->nseqs) |
|
1123 |
+#endif |
|
1124 |
+ ) { |
|
1125 |
+ Log(&rLog, LOG_ERROR, "Couldn't convert profile1 to HMM. Will try to continue"); |
|
1126 |
+ } else { |
|
1127 |
+ prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light))); |
|
1128 |
+ memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light)); |
|
1129 |
+ prOpts->iHMMInputFiles++; |
|
1130 |
+ } |
|
1131 |
+ } |
|
1132 |
+ |
|
1133 |
+ |
|
1134 |
+ /* Now do a first alignment of the input sequences (prMSeq) adding |
|
1135 |
+ * all collected background HMMs |
|
1136 |
+ * |
|
1137 |
+ */ |
|
1138 |
+ /* Determine progressive alignment order |
|
1139 |
+ */ |
|
1140 |
+ if (TRUE == prMSeq->aligned) { |
|
1141 |
+ if ( (SEQTYPE_PROTEIN == prMSeq->seqtype) && (TRUE == prOpts->bUseKimura) ){ |
|
1142 |
+ Log(&rLog, LOG_INFO, "%s %s", |
|
1143 |
+ "Input sequences are aligned.", |
|
1144 |
+ "Will use Kimura distances of aligned sequences."); |
|
1145 |
+ prOpts->iPairDistType = PAIRDIST_SQUIDID_KIMURA; |
|
1146 |
+ } |
|
1147 |
+ else { |
|
1148 |
+ prOpts->iPairDistType = PAIRDIST_SQUIDID; |
|
1149 |
+ } |
|
1150 |
+ } |
|
1151 |
+ |
|
1152 |
+#if 0 |
|
1153 |
+ Log(&rLog, LOG_WARN, "Using a sequential alignment order."); |
|
1154 |
+ SequentialAlignmentOrder(&piOrderLR, prMSeq->nseqs); |
|
1155 |
+#else |
|
1156 |
+ if (OK != AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq, |
|
1157 |
+ prOpts->iPairDistType, |
|
1158 |
+ prOpts->pcDistmatInfile, prOpts->pcDistmatOutfile, |
|
1159 |
+ prOpts->iClusteringType, prOpts->iClustersizes, |
|
1160 |
+ prOpts->pcGuidetreeInfile, prOpts->pcGuidetreeOutfile, prOpts->pcClustfile, |
|
1161 |
+ prOpts->bUseMbed, prOpts->bPercID)) { |
|
1162 |
+ Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue"); |
|
1163 |
+ return -1; |
|
1164 |
+ } |
|
1165 |
+#endif |
|
1166 |
+ |
|
1167 |
+ /* if max-hmm-iter is set < 0 then do not perform alignment |
|
1168 |
+ * there is a problem/feature(?) that the un-aligned sequences are output |
|
1169 |
+ */ |
|
1170 |
+ if (prOpts->iMaxHMMIterations < 0){ |
|
1171 |
+ Log(&rLog, LOG_VERBOSE, |
|
1172 |
+ "iMaxHMMIterations < 0 (%d), will not perform alignment", prOpts->iMaxHMMIterations); |
|
1173 |
+ return 0; |
|
1174 |
+ } |
|
1175 |
+ |
|
1176 |
+ |
|
1177 |
+ /* Progressive alignment of input sequences. Order defined by |
|
1178 |
+ * branching of guide tree (piOrderLR). Use optional |
|
1179 |
+ * background HMM information (prHMMs[0..prOpts->iHMMInputFiles-1]) |
|
1180 |
+ * |
|
1181 |
+ */ |
|
1182 |
+ dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights, |
|
1183 |
+ 2*prMSeq->nseqs -1/* nodes */, |
|
1184 |
+ prHMMs, prOpts->iHMMInputFiles, -1, prOpts->rHhalignPara); |
|
1185 |
+ dLastAlnScore = dAlnScore; |
|
1186 |
+ Log(&rLog, LOG_VERBOSE, |
|
1187 |
+ "Alignment score for first alignment = %f", dAlnScore); |
|
1188 |
+ |
|
1189 |
+ |
|
1190 |
+ |
|
1191 |
+ |
|
1192 |
+ /* ------------------------------------------------------------ |
|
1193 |
+ * |
|
1194 |
+ * prMSeq is aligned now. Now start iterations if requested and save the |
|
1195 |
+ * alignment at the very end. |
|
1196 |
+ * |
|
1197 |
+ * @note We discard the background HMM information at this point, |
|
1198 |
+ * because it was already used. Could consider to make this choice |
|
1199 |
+ * optional. FIXME |
|
1200 |
+ * |
|
1201 |
+ * ------------------------------------------------------------ */ |
|
1202 |
+ |
|
1203 |
+ |
|
1204 |
+ /* iteration after first alignment was computed (if not profile-profile |
|
1205 |
+ * alignment) |
|
1206 |
+ * |
|
1207 |
+ */ |
|
1208 |
+ for (iIterationCounter=0; |
|
1209 |
+ (iIterationCounter < prOpts->iNumIterations || prOpts->bIterationsAuto); |
|
1210 |
+ iIterationCounter++) { |
|
1211 |
+ |
|
1212 |
+ hmm_light rHMMLocal = {0}; |
|
1213 |
+ /* FIXME Keep copy of old alignment in case new one sucks? */ |
|
1214 |
+ |
|
1215 |
+ |
|
1216 |
+ if (iIterationCounter >= prOpts->iMaxHMMIterations |
|
1217 |
+ && |
|
1218 |
+ iIterationCounter >= prOpts->iMaxGuidetreeIterations) { |
|
1219 |
+ Log(&rLog, LOG_VERBOSE, "Reached maximum number of HMM and guide-tree iterations"); |
|
1220 |
+ break; |
|
1221 |
+ } |
|
1222 |
+ |
|
1223 |
+ if (! prOpts->bIterationsAuto) { |
|
1224 |
+ Log(&rLog, LOG_INFO, "Iteration step %d out of %d", |
|
1225 |
+ iIterationCounter+1, prOpts->iNumIterations); |
|
1226 |
+ } else { |
|
1227 |
+ Log(&rLog, LOG_INFO, "Iteration step %d out of <auto>", |
|
1228 |
+ iIterationCounter+1); |
|
1229 |
+ } |
|
1230 |
+#if 0 |
|
1231 |
+ if (rLog.iLogLevelEnabled <= LOG_VERBOSE) { |
|
1232 |
+ char zcIntermediate[1000] = {0}; |
|
1233 |
+ char *pcFormat = "fasta"; |
|
1234 |
+ sprintf(zcIntermediate, "clustalo-aln-iter~%d~", iIterationCounter); |
|
1235 |
+#define LINE_WRAP 60 |
|
1236 |
+ if (WriteAlignment(prMSeq, zcIntermediate, MSAFILE_A2M, LINE_WRAP, &msa_c, &msa_v)) { |
|
1237 |
+ Log(&rLog, LOG_ERROR, "Could not save alignment to %s", zcIntermediate); |
|
1238 |
+ return -1; |
|
1239 |
+ } |
|
1240 |
+ } |
|
1241 |
+#endif |
|
1242 |
+ |
|
1243 |
+ |
|
1244 |
+ /* new guide-tree |
|
1245 |
+ * |
|
1246 |
+ */ |
|
1247 |
+ if (iIterationCounter < prOpts->iMaxGuidetreeIterations) { |
|
1248 |
+ /* determine progressive alignment order |
|
1249 |
+ * |
|
1250 |
+ * few things are different now when calling AlignmentOrder: |
|
1251 |
+ * - we have to ignore prOpts->pcDistmatInfile and pcGuidetreeInfile |
|
1252 |
+ * as they were used before |
|
1253 |
+ * - the corresponding outfiles are still valid though |
|
1254 |
+ */ |
|
1255 |
+ /* Free stuff that has already been allocated by or further |
|
1256 |
+ * downstream of AlignmentOrder() |
|
1257 |
+ */ |
|
1258 |
+ if (NULL != piOrderLR) |
|
1259 |
+ CKFREE(piOrderLR); |
|
1260 |
+ if (NULL != pdSeqWeights) |
|
1261 |
+ CKFREE(pdSeqWeights); |
|
1262 |
+ Log(&rLog, LOG_INFO, "Computing new guide tree (iteration step %d)"); |
|
1263 |
+ if (AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq, |
|
1264 |
+ ((SEQTYPE_PROTEIN == prMSeq->seqtype) && (TRUE == prOpts->bUseKimura)) ? PAIRDIST_SQUIDID_KIMURA : PAIRDIST_SQUIDID, |
|
1265 |
+ NULL, prOpts->pcDistmatOutfile, |
|
1266 |
+ prOpts->iClusteringType, prOpts->iClustersizes, |
|
1267 |
+ NULL, prOpts->pcGuidetreeOutfile, prOpts->pcClustfile, |
|
1268 |
+ prOpts->bUseMbedForIteration, prOpts->bPercID)) { |
|
1269 |
+ Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue"); |
|
1270 |
+ return -1; |
|
1271 |
+ } |
|
1272 |
+ } else { |
|
1273 |
+ Log(&rLog, LOG_INFO, "Skipping guide-tree iteration at iteration step %d (reached maximum)", |
|
1274 |
+ iIterationCounter); |
|
1275 |
+ } |
|
1276 |
+ |
|
1277 |
+ |
|
1278 |
+ /* new local hmm iteration |
|
1279 |
+ * |
|
1280 |
+ */ |
|
1281 |
+ /* non-residue/gap characters will crash AlnToHMM2(), |
|
1282 |
+ therefore sanitise unknown characters, FS, r259 -> r260 */ |
|
1283 |
+ SanitiseUnknown(prMSeq); |
|
1284 |
+ if (iIterationCounter < prOpts->iMaxHMMIterations) { |
|
1285 |
+ Log(&rLog, LOG_INFO, "Computing HMM from alignment"); |
|
1286 |
+ |
|
1287 |
+ if (OK != |
|
1288 |
+#if INDIRECT_HMM |
|
1289 |
+ AlnToHMM(&rHMMLocal, prMSeq) |
|
1290 |
+#else |
|
1291 |
+ AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs) |
|
1292 |
+#endif |
|
1293 |
+ ) { |
|
1294 |
+ Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will stop iterating now..."); |
|
1295 |
+ break; |
|
1296 |
+ } |
|
1297 |
+ } else { |
|
1298 |
+ Log(&rLog, LOG_INFO, "Skipping HMM iteration at iteration step %d (reached maximum)", |
|
1299 |
+ iIterationCounter); |
|
1300 |
+ } |
|
1301 |
+ |
|
1302 |
+ |
|
1303 |
+ /* align the sequences (again) |
|
1304 |
+ */ |
|
1305 |
+ dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights, |
|
1306 |
+ 2*prMSeq->nseqs -1/* nodes */, &rHMMLocal, 1, -1, |
|
1307 |
+ prOpts->rHhalignPara); |
|
1308 |
+ Log(&rLog, LOG_VERBOSE, |
|
1309 |
+ "Alignment score for alignmnent in hmm-iteration no %d = %f (last score = %f)", |
|
1310 |
+ iIterationCounter+1, dAlnScore, dLastAlnScore); |
|
1311 |
+ |
|
1312 |
+ |
|
1313 |
+ FreeHMMstruct(&rHMMLocal); |
|
1314 |
+ |
|
1315 |
+#if 0 |
|
1316 |
+ /* FIXME: need a better score for automatic iteration */ |
|
1317 |
+ if (prOpts->bIterationsAuto) { |
|
1318 |
+ /* automatic iteration: break if score improvement was not |
|
1319 |
+ * big enough |
|
1320 |
+ */ |
|
1321 |
+ double dScoreImprovement = (dAlnScore-dLastAlnScore)/dLastAlnScore; |
|
1322 |
+ if (dScoreImprovement < ITERATION_SCORE_IMPROVEMENT_THRESHOLD) { |
|
1323 |
+ Log(&rLog, LOG_INFO, |
|
1324 |
+ "Stopping after %d guide-tree iterations. No further alignment score improvement achieved.", |
|
1325 |
+ iIterationCounter+1); |
|
1326 |
+ /* use previous alignment */ |
|
1327 |
+ FreeMSeq(&prMSeq); |
|
1328 |
+ Log(&rLog, LOG_FORCED_DEBUG, "FIXME: %s", "CopyMSeq breaks things in this context"); |
|
1329 |
+ CopyMSeq(&prMSeq, prMSeqCopy); |
|
1330 |
+ /* FIXME: prOpts->pcDistmatOutfile and pcGuidetreeOutfile |
|
1331 |
+ * might have been updated, but then discarded here? |
|
1332 |
+ */ |
|
1333 |
+ break; |
|
1334 |
+ } else { |
|
1335 |
+ Log(&rLog, LOG_INFO, |
|
1336 |
+ "Got a %d%% better score in iteration step %d", |
|
1337 |
+ (int)dScoreImprovement*100, iIterationCounter+1); |
|
1338 |
+ FreeMSeq(&prMSeqCopy); |
|
1339 |
+ } |
|
1340 |
+ } |
|
1341 |
+ dLastAlnScore = dAlnScore; |
|
1342 |
+#endif |
|
1343 |
+ |
|
1344 |
+ } |
|
1345 |
+ /* end of iterations */ |
|
1346 |
+ |
|
1347 |
+ |
|
1348 |
+ |
|
1349 |
+ /* Last step: if a profile was also provided then align now-aligned mseq |
|
1350 |
+ * with this profile |
|
1351 |
+ * |
|
1352 |
+ * Don't use the backgrounds HMMs anymore and don't iterate. |
|
1353 |
+ * (which was done before). |
|
1354 |
+ * |
|
1355 |
+ */ |
|
1356 |
+ if (NULL != prMSeqProfile) { |
|
1357 |
+ if (AlignProfiles(prMSeq, prMSeqProfile, prOpts->rHhalignPara)) { |
|
1358 |
+ Log(&rLog, LOG_ERROR, "An error occured during the profile/profile alignment"); |
|
1359 |
+ return -1; |
|
1360 |
+ } |
|
1361 |
+ } |
|
1362 |
+ |
|
1363 |
+ |
|
1364 |
+ if (NULL != piOrderLR) { |
|
1365 |
+ CKFREE(piOrderLR); |
|
1366 |
+ } |
|
1367 |
+ if (NULL != pdSeqWeights) { |
|
1368 |
+ CKFREE(pdSeqWeights); |
|
1369 |
+ } |
|
1370 |
+ if (0 < prOpts->iHMMInputFiles) { |
|
1371 |
+ for (i=0; i<prOpts->iHMMInputFiles; i++) { |
|
1372 |
+ FreeHMMstruct(&prHMMs[i]); |
|
1373 |
+ } |
|
1374 |
+ CKFREE(prHMMs); |
|
1375 |
+ } |
|
1376 |
+ |
|
1377 |
+ return 0; |
|
1378 |
+} |
|
1379 |
+/* end of Align() */ |
|
1380 |
+ |
|
1381 |
+ |
|
1382 |
+ |
|
1383 |
+ |
|
1384 |
+/** |
|
1385 |
+ * @brief Align two profiles, ie two sets of prealigned sequences. Already |
|
1386 |
+ * aligned columns won't be changed. |
|
1387 |
+ * |
|
1388 |
+ * @param[out] prMSeqProfile1 |
|
1389 |
+ * First profile/aligned set of sequences. Merged alignment will be found in |
|
1390 |
+ * here. |
|
1391 |
+ * @param[in] prMSeqProfile2 |
|
1392 |
+ * First profile/aligned set of sequences |
|
1393 |
+ * @param[in] rHhalignPara |
|
1394 |
+ * FIXME |
|
1395 |
+ * |
|
1396 |
+ * @return 0 on success, -1 on failure |
|
1397 |
+ * |
|