git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqbias@53976 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -13,7 +13,7 @@ YAML_CPP_OBJ = aliascontent.o conversion.o emitter.o emitterstate.o \ |
13 | 13 |
stream.o tag.o |
14 | 14 |
|
15 | 15 |
SEQBIAS_OBJ = sequencing_bias.o kmers.o miscmath.o common.o \ |
16 |
- table.o superfasthash.o logger.o seqbias.o \ |
|
16 |
+ table.o logger.o seqbias.o \ |
|
17 | 17 |
|
18 | 18 |
GNULIB_OBJ = asprintf.o printf-args.o printf-parse.o vasnprintf.o vasprintf.o |
19 | 19 |
|
... | ... |
@@ -5,6 +5,8 @@ |
5 | 5 |
#include <algorithm> |
6 | 6 |
#include <cstdlib> |
7 | 7 |
#include "samtools/faidx.h" |
8 |
+#include "samtools/sam.h" |
|
9 |
+#include "samtools/bam.h" |
|
8 | 10 |
|
9 | 11 |
|
10 | 12 |
typedef long pos; /* genomic position */ |
... | ... |
@@ -24,6 +26,12 @@ void rev( T* xs, int n ) { |
24 | 26 |
while( i < j ) std::swap(xs[i++],xs[j--]); |
25 | 27 |
} |
26 | 28 |
|
29 |
+ |
|
30 |
+/* defined in bam_aux.c */ |
|
31 |
+extern "C" { |
|
32 |
+void bam_init_header_hash(bam_header_t *header); |
|
33 |
+} |
|
34 |
+ |
|
27 | 35 |
/* Change the behavior of the faidx_fetch_seq function to be more useful. If |
28 | 36 |
* coordinates are outside the actual sequence, write N's, rather than adjusting |
29 | 37 |
* the start,end. */ |
... | ... |
@@ -3,7 +3,6 @@ |
3 | 3 |
#include "common.hpp" |
4 | 4 |
#include "miscmath.hpp" |
5 | 5 |
#include "logger.h" |
6 |
-#include "asprintf.h" |
|
7 | 6 |
|
8 | 7 |
#include <string> |
9 | 8 |
#include <cstring> |
... | ... |
@@ -24,8 +23,6 @@ kmer_matrix::kmer_matrix( const YAML::Node& node ) |
24 | 23 |
for( i = 0; i < n*m; i++ ) { |
25 | 24 |
node_A[i] >> A[i]; |
26 | 25 |
} |
27 |
- |
|
28 |
- stored_row = new double[ m ]; |
|
29 | 26 |
} |
30 | 27 |
|
31 | 28 |
kmer_matrix::kmer_matrix( size_t n, size_t k ) |
... | ... |
@@ -34,7 +31,6 @@ kmer_matrix::kmer_matrix( size_t n, size_t k ) |
34 | 31 |
m = 1<<(2*k); /* == 4^k */ |
35 | 32 |
|
36 | 33 |
A = new double[ n * m ]; |
37 |
- stored_row = new double[ m ]; |
|
38 | 34 |
} |
39 | 35 |
|
40 | 36 |
kmer_matrix::kmer_matrix( const kmer_matrix& M ) |
... | ... |
@@ -45,10 +41,6 @@ kmer_matrix::kmer_matrix( const kmer_matrix& M ) |
45 | 41 |
m = M.m; |
46 | 42 |
A = new double[ n * m ]; |
47 | 43 |
memcpy( (void*)A, (void*)M.A, n * m * sizeof(double) ); |
48 |
- |
|
49 |
- |
|
50 |
- stored_row = new double[ m ]; |
|
51 |
- memcpy( (void*)stored_row, (void*)M.stored_row, m * sizeof(double) ); |
|
52 | 44 |
} |
53 | 45 |
|
54 | 46 |
size_t kmer_matrix::getn() const { return n; } |
... | ... |
@@ -66,13 +58,9 @@ void kmer_matrix::operator=( const kmer_matrix& M ) |
66 | 58 |
|
67 | 59 |
delete[] A; |
68 | 60 |
A = new double[ n * m ]; |
69 |
- |
|
70 |
- delete[] stored_row; |
|
71 |
- stored_row = new double[ m ]; |
|
72 | 61 |
} |
73 | 62 |
|
74 | 63 |
memcpy( (void*)A, (void*)M.A, n * m * sizeof(double) ); |
75 |
- memcpy( (void*)stored_row, (void*)M.stored_row, m * sizeof(double) ); |
|
76 | 64 |
} |
77 | 65 |
|
78 | 66 |
|
... | ... |
@@ -81,11 +69,11 @@ void kmer_matrix::to_yaml( YAML::Emitter& out ) const |
81 | 69 |
out << YAML::BeginMap; |
82 | 70 |
|
83 | 71 |
out << YAML::Key << "k"; |
84 |
- out << YAML::Value << (unsigned int)k; |
|
72 |
+ out << YAML::Value << k; |
|
85 | 73 |
out << YAML::Key << "n"; |
86 |
- out << YAML::Value << (unsigned int)n; |
|
74 |
+ out << YAML::Value << n; |
|
87 | 75 |
out << YAML::Key << "m"; |
88 |
- out << YAML::Value << (unsigned int)m; |
|
76 |
+ out << YAML::Value << m; |
|
89 | 77 |
out << YAML::Key << "A"; |
90 | 78 |
out << YAML::Flow; |
91 | 79 |
out << YAML::Value; |
... | ... |
@@ -103,7 +91,6 @@ void kmer_matrix::to_yaml( YAML::Emitter& out ) const |
103 | 91 |
kmer_matrix::~kmer_matrix() |
104 | 92 |
{ |
105 | 93 |
delete[] A; |
106 |
- delete[] stored_row; |
|
107 | 94 |
} |
108 | 95 |
|
109 | 96 |
|
... | ... |
@@ -119,7 +106,7 @@ void kmer_matrix::setall( double x ) |
119 | 106 |
for( i = 0; i < n * m; i++ ) A[i] = x; |
120 | 107 |
} |
121 | 108 |
|
122 |
-void kmer_matrix::setrow( size_t i, double x ) |
|
109 |
+void kmer_matrix::setrowall( size_t i, double x ) |
|
123 | 110 |
{ |
124 | 111 |
size_t j; |
125 | 112 |
for( j = 0; j < m; j++ ) { |
... | ... |
@@ -127,6 +114,16 @@ void kmer_matrix::setrow( size_t i, double x ) |
127 | 114 |
} |
128 | 115 |
} |
129 | 116 |
|
117 |
+void kmer_matrix::getrow( size_t i, double* xs ) |
|
118 |
+{ |
|
119 |
+ memcpy( (void*)xs, (void*)(A + i * m), m * sizeof(double) ); |
|
120 |
+} |
|
121 |
+ |
|
122 |
+void kmer_matrix::setrow( size_t i, double* xs ) |
|
123 |
+{ |
|
124 |
+ memcpy( (void*)(A + i * m ), (void*)xs, m * sizeof(double) ); |
|
125 |
+} |
|
126 |
+ |
|
130 | 127 |
|
131 | 128 |
void kmer_matrix::dist_normalize() |
132 | 129 |
{ |
... | ... |
@@ -155,21 +152,30 @@ void kmer_matrix::dist_conditionalize( int effective_k ) |
155 | 152 |
} |
156 | 153 |
|
157 | 154 |
|
158 |
-void kmer_matrix::dist_conditionalize_row( size_t i, int effective_k ) |
|
155 |
+void kmer_matrix::dist_conditionalize_row( size_t i, size_t j, int effective_k ) |
|
159 | 156 |
{ |
160 | 157 |
if( effective_k <= 0 ) effective_k = m; |
161 |
- |
|
162 |
- kmer L; |
|
163 |
- kmer L_max = 1 << (2*(effective_k-1)); |
|
158 |
+ kmer L, R; |
|
159 |
+ kmer L_max = 1 << (2*j); |
|
160 |
+ kmer R_max = 1 << (2*(effective_k-j-1)); |
|
164 | 161 |
kmer K; |
165 |
- kmer nt; |
|
166 | 162 |
double z; |
163 |
+ kmer nt; |
|
167 | 164 |
|
168 | 165 |
for( L = 0; L < L_max; L++ ) { |
169 |
- K = L<<2; |
|
170 |
- z = 0.0; |
|
171 |
- for( nt = 0; nt < 4; nt++ ) z += A[ i * m + (nt | K) ]; |
|
172 |
- for( nt = 0; nt < 4; nt++ ) A[ i * m + (nt | K) ] /= z; |
|
166 |
+ for( R = 0; R < R_max; R++ ) { |
|
167 |
+ z = 0.0; |
|
168 |
+ |
|
169 |
+ for( nt = 0; nt < 4; nt++ ) { |
|
170 |
+ K = (L<<(2*(effective_k-j))) | (nt<<(2*(effective_k-j-1))) | R; |
|
171 |
+ z += A[ i*m + K ]; |
|
172 |
+ } |
|
173 |
+ |
|
174 |
+ for( nt = 0; nt < 4; nt++ ) { |
|
175 |
+ K = (L<<(2*(effective_k-j))) | (nt<<(2*(effective_k-j-1))) | R; |
|
176 |
+ A[ i*m + K ] /= z; |
|
177 |
+ } |
|
178 |
+ } |
|
173 | 179 |
} |
174 | 180 |
} |
175 | 181 |
|
... | ... |
@@ -186,24 +192,11 @@ void kmer_matrix::log_transform_row( size_t i, int effective_k ) |
186 | 192 |
} |
187 | 193 |
|
188 | 194 |
|
189 |
-void kmer_matrix::store_row( size_t i ) |
|
190 |
-{ |
|
191 |
- memcpy( (void*)stored_row, (void*)(A + i * m), m * sizeof(double) ); |
|
192 |
- stored_row_index = i; |
|
193 |
-} |
|
194 |
- |
|
195 |
- |
|
196 |
-void kmer_matrix::restore_stored_row() |
|
197 |
-{ |
|
198 |
- memcpy( (void*)(A + stored_row_index * m), (void*)stored_row, m * sizeof(double) ); |
|
199 |
-} |
|
200 |
- |
|
201 |
- |
|
202 | 195 |
const size_t sequence::kmer_max_k = 4*sizeof(kmer); |
203 | 196 |
|
204 | 197 |
|
205 |
-sequence::sequence( const char* s, int c ) |
|
206 |
- : c(c), xs(NULL), n(0) |
|
198 |
+sequence::sequence( const char* s, int c, double w ) |
|
199 |
+ : c(c), w(w), xs(NULL), n(0) |
|
207 | 200 |
{ |
208 | 201 |
n = strlen(s); |
209 | 202 |
if( n > 0 ) { |
... | ... |
@@ -223,6 +216,7 @@ sequence::sequence( const sequence& s ) |
223 | 216 |
: xs(NULL), n(0) |
224 | 217 |
{ |
225 | 218 |
c = s.c; |
219 |
+ w = s.w; |
|
226 | 220 |
n = s.n; |
227 | 221 |
if( n > 0 ) { |
228 | 222 |
xs = new kmer[ n/kmer_max_k + 1 ]; |
... | ... |
@@ -291,6 +285,10 @@ motif::motif( const YAML::Node& node ) |
291 | 285 |
} |
292 | 286 |
|
293 | 287 |
P = new kmer_matrix( node["P"] ); |
288 |
+ |
|
289 |
+ R = new bool[n*n]; |
|
290 |
+ memset( R, 0, n*n*sizeof(bool) ); |
|
291 |
+ compute_reachability(); |
|
294 | 292 |
} |
295 | 293 |
|
296 | 294 |
motif::motif( size_t n, size_t k, int c ) |
... | ... |
@@ -301,6 +299,9 @@ motif::motif( size_t n, size_t k, int c ) |
301 | 299 |
|
302 | 300 |
parents = new bool[n*n]; |
303 | 301 |
memset( parents, 0, n*n*sizeof(bool) ); |
302 |
+ |
|
303 |
+ R = new bool[n*n]; |
|
304 |
+ memset( R, 0, n*n*sizeof(bool) ); |
|
304 | 305 |
} |
305 | 306 |
|
306 | 307 |
|
... | ... |
@@ -313,6 +314,9 @@ motif::motif( const motif& M ) |
313 | 314 |
|
314 | 315 |
parents = new bool[n*n]; |
315 | 316 |
memcpy( parents, M.parents, n*n*sizeof(bool) ); |
317 |
+ |
|
318 |
+ R = new bool[n*n]; |
|
319 |
+ memset( R, 0, n*n*sizeof(bool) ); |
|
316 | 320 |
} |
317 | 321 |
|
318 | 322 |
|
... | ... |
@@ -320,6 +324,7 @@ motif::motif( const motif& M ) |
320 | 324 |
motif::~motif() |
321 | 325 |
{ |
322 | 326 |
delete[] parents; |
327 |
+ delete[] R; |
|
323 | 328 |
delete P; |
324 | 329 |
} |
325 | 330 |
|
... | ... |
@@ -329,10 +334,10 @@ void motif::to_yaml( YAML::Emitter& out ) const |
329 | 334 |
out << YAML::BeginMap; |
330 | 335 |
|
331 | 336 |
out << YAML::Key << "n"; |
332 |
- out << YAML::Value << (unsigned int)n; |
|
337 |
+ out << YAML::Value << n; |
|
333 | 338 |
|
334 | 339 |
out << YAML::Key << "k"; |
335 |
- out << YAML::Value << (unsigned int)k; |
|
340 |
+ out << YAML::Value << k; |
|
336 | 341 |
|
337 | 342 |
out << YAML::Key << "c"; |
338 | 343 |
out << YAML::Value << c; |
... | ... |
@@ -385,7 +390,7 @@ size_t motif::num_parents( size_t i ) const |
385 | 390 |
{ |
386 | 391 |
size_t j; |
387 | 392 |
size_t M = 0; |
388 |
- for( j = 0; j <= i; j++ ) { |
|
393 |
+ for( j = 0; j < n; j++ ) { |
|
389 | 394 |
if( parents[i*n+j] ) M++; |
390 | 395 |
} |
391 | 396 |
|
... | ... |
@@ -402,19 +407,29 @@ void motif::set_edge( size_t i, size_t j, bool x ) |
402 | 407 |
parents[j*n+i] = x; |
403 | 408 |
} |
404 | 409 |
|
405 |
- |
|
406 |
-void motif::store_row( size_t i ) |
|
410 |
+bool motif::reachable( size_t i, size_t j ) |
|
407 | 411 |
{ |
408 |
- P->store_row( i ); |
|
412 |
+ return R[j*n+i]; |
|
409 | 413 |
} |
410 | 414 |
|
411 |
- |
|
412 |
-void motif::restore_stored_row() |
|
415 |
+void motif::compute_reachability() |
|
413 | 416 |
{ |
414 |
- P->restore_stored_row(); |
|
417 |
+ /* find the transitive closure of the parents matrix via flyod-warshall */ |
|
418 |
+ |
|
419 |
+ memcpy( R, parents, n*n*sizeof(bool) ); |
|
420 |
+ |
|
421 |
+ size_t k, i, j; |
|
422 |
+ for( k = 0; k < n; k++ ) { |
|
423 |
+ for( i = 0; i < n; i++ ) { |
|
424 |
+ for( j = 0; j < n; j++ ) { |
|
425 |
+ R[j*n+i] = R[j*n+i] || (R[k*n+i] && R[j*n+k]); |
|
426 |
+ } |
|
427 |
+ } |
|
428 |
+ } |
|
415 | 429 |
} |
416 | 430 |
|
417 | 431 |
|
432 |
+ |
|
418 | 433 |
char* motif::print_model_graph( int offset ) |
419 | 434 |
{ |
420 | 435 |
std::string graph_str; |
... | ... |
@@ -439,7 +454,8 @@ char* motif::print_model_graph( int offset ) |
439 | 454 |
for( j = 0; j < (int)n; j++ ) { |
440 | 455 |
if( !parents[j*n+j] ) continue; |
441 | 456 |
|
442 |
- for( i = 0; i < j; i++ ) { |
|
457 |
+ for( i = 0; i < (int)n; i++ ) { |
|
458 |
+ if( i == j ) continue; |
|
443 | 459 |
if( parents[j*n+i] ) { |
444 | 460 |
r = asprintf( &tmp, "n%d -> n%d;\n", i, j ); |
445 | 461 |
graph_str += tmp; |
... | ... |
@@ -455,50 +471,13 @@ char* motif::print_model_graph( int offset ) |
455 | 471 |
} |
456 | 472 |
|
457 | 473 |
|
458 |
-void motif::add_all_edges( const std::deque<sequence*>* data ) |
|
459 |
-{ |
|
460 |
- size_t i, j; |
|
461 |
- size_t n_parents; |
|
462 |
- size_t m; |
|
463 |
- kmer K; |
|
464 |
- std::deque<sequence*>::const_iterator seq; |
|
465 |
- |
|
466 |
- for( j = 0; j < n; j++ ) { |
|
467 |
- for( i = k-1 <= j ? j-(k-1) : 0; i <= j; i++ ) { |
|
468 |
- set_edge( i, j, true ); |
|
469 |
- } |
|
470 |
- P->setrow( j, 0.0 ); |
|
471 |
- n_parents = num_parents(j); |
|
472 |
- m = 1 << (2*n_parents); |
|
473 |
- |
|
474 |
- for( K = 0; K < m; K++ ) { |
|
475 |
- (*P)( j, K ) = pseudocount; |
|
476 |
- } |
|
477 |
- |
|
478 |
- for( seq = data->begin(); seq != data->end(); seq++ ) { |
|
479 |
- if( (*seq)->c == c && (*seq)->get( parents + j*n, n, K ) ) { |
|
480 |
- (*P)( j, K )++; |
|
481 |
- } |
|
482 |
- } |
|
483 |
- |
|
484 |
- P->dist_normalize_row( j ); |
|
485 |
- P->dist_conditionalize_row( j, n_parents ); |
|
486 |
- P->log_transform_row( j, n_parents ); |
|
487 |
- } |
|
488 |
-} |
|
489 |
- |
|
490 |
- |
|
491 | 474 |
/* make an edge i --> j |
492 | 475 |
* That is, condition j on i. */ |
493 | 476 |
void motif::add_edge( size_t i, size_t j, const std::deque<sequence*>* data ) |
494 | 477 |
{ |
495 |
- if( i > j ) { |
|
496 |
- failf( "Invalid motif edge (%zu, %zu)\n", i, j ); |
|
497 |
- } |
|
498 |
- |
|
499 | 478 |
set_edge( i, j, true ); |
500 | 479 |
|
501 |
- P->setrow( j, 0.0 ); |
|
480 |
+ P->setrowall( j, 0.0 ); |
|
502 | 481 |
size_t n_parents = num_parents(j); |
503 | 482 |
size_t m = 1 << (2*n_parents); |
504 | 483 |
kmer K; |
... | ... |
@@ -510,26 +489,30 @@ void motif::add_edge( size_t i, size_t j, const std::deque<sequence*>* data ) |
510 | 489 |
std::deque<sequence*>::const_iterator seq; |
511 | 490 |
for( seq = data->begin(); seq != data->end(); seq++ ) { |
512 | 491 |
if( (*seq)->c == c && (*seq)->get( parents + j*n, n, K ) ) { |
513 |
- (*P)( j, K )++; |
|
492 |
+ (*P)( j, K ) += (*seq)->w; |
|
514 | 493 |
} |
515 | 494 |
} |
516 | 495 |
|
496 |
+ size_t n_pred_parents = 0; |
|
497 |
+ size_t u; |
|
498 |
+ for( u = 0; u < j; u++ ) { |
|
499 |
+ if( has_edge( u, j ) ) n_pred_parents++; |
|
500 |
+ } |
|
501 |
+ |
|
517 | 502 |
P->dist_normalize_row( j ); |
518 |
- P->dist_conditionalize_row( j, n_parents ); |
|
503 |
+ P->dist_conditionalize_row( j, n_pred_parents, n_parents ); |
|
519 | 504 |
P->log_transform_row( j, n_parents ); |
505 |
+ |
|
506 |
+ compute_reachability(); |
|
520 | 507 |
} |
521 | 508 |
|
522 | 509 |
|
523 | 510 |
|
524 | 511 |
void motif::remove_edge( size_t i, size_t j, const std::deque<sequence*>* data ) |
525 | 512 |
{ |
526 |
- if( i > j ) { |
|
527 |
- failf( "Invalid motif edge (%zu, %zu)\n", i, j ); |
|
528 |
- } |
|
529 |
- |
|
530 | 513 |
set_edge( i, j, false ); |
531 | 514 |
|
532 |
- P->setrow( j, 0.0 ); |
|
515 |
+ P->setrowall( j, 0.0 ); |
|
533 | 516 |
size_t n_parents = num_parents(j); |
534 | 517 |
size_t m = 1 << (2*n_parents); |
535 | 518 |
kmer K; |
... | ... |
@@ -545,38 +528,22 @@ void motif::remove_edge( size_t i, size_t j, const std::deque<sequence*>* data ) |
545 | 528 |
} |
546 | 529 |
} |
547 | 530 |
|
531 |
+ size_t n_pred_parents = 0; |
|
532 |
+ size_t u; |
|
533 |
+ for( u = 0; u < j; u++ ) { |
|
534 |
+ if( has_edge( u, j ) ) n_pred_parents++; |
|
535 |
+ } |
|
536 |
+ |
|
548 | 537 |
P->dist_normalize_row( j ); |
549 |
- P->dist_conditionalize_row( j, n_parents ); |
|
538 |
+ P->dist_conditionalize_row( j, n_pred_parents, n_parents ); |
|
550 | 539 |
P->log_transform_row( j, n_parents ); |
551 |
-} |
|
552 | 540 |
|
541 |
+ compute_reachability(); |
|
542 |
+} |
|
553 | 543 |
|
554 | 544 |
|
555 | 545 |
|
556 | 546 |
|
557 |
-double motif_log_likelihood( const motif& M0, const motif& M1, |
|
558 |
- const std::deque<sequence*>* training_seqs ) |
|
559 |
-{ |
|
560 |
- double L, L0, L1; |
|
561 |
- |
|
562 |
- L = 0.0; |
|
563 |
- |
|
564 |
- std::deque<sequence*>::const_iterator i; |
|
565 |
- for( i = training_seqs->begin(); i != training_seqs->end(); i++ ) { |
|
566 |
- L0 = M0.eval( **i ); |
|
567 |
- L1 = M1.eval( **i ); |
|
568 |
- if( (*i)->c == 0 ) { |
|
569 |
- L += L0 - logaddexp( L0, L1 ); |
|
570 |
- } |
|
571 |
- else if( (*i)->c == 1 ) { |
|
572 |
- L += L1 - logaddexp( L0, L1 ); |
|
573 |
- } |
|
574 |
- } |
|
575 |
- |
|
576 |
- return L; |
|
577 |
-} |
|
578 |
- |
|
579 |
- |
|
580 | 547 |
|
581 | 548 |
/* Let x_i be the sequence of training example i, c be the class, and M be the |
582 | 549 |
* model. Then given l0 and l1, where, |
... | ... |
@@ -590,15 +557,19 @@ double motif_log_likelihood( const motif& M0, const motif& M1, |
590 | 557 |
* = sum_i log( P( x_i | c_i, M ) / ( P( x_i | c = 0, M ) + P( x_i | c = 1, M ) ) |
591 | 558 |
* |
592 | 559 |
*/ |
593 |
-double conditional_likelihood( size_t n, const double* l0, const double* l1, const int* c ) |
|
560 |
+double conditional_likelihood( size_t n, const double* l0, const double* l1, const int* c, |
|
561 |
+ double prior ) |
|
594 | 562 |
{ |
595 | 563 |
double l; |
596 | 564 |
size_t i; |
597 | 565 |
|
566 |
+ double p = log(prior); |
|
567 |
+ double q = log(1-prior); |
|
568 |
+ |
|
598 | 569 |
l = 0.0; |
599 | 570 |
for( i = 0; i < n; i++ ) { |
600 |
- l += (c[i] == 0 ? l0[i] : l1[i]) |
|
601 |
- - logaddexp( l0[i], l1[i] ); |
|
571 |
+ l += (c[i] == 0 ? (q+l0[i]) : (p+l1[i])) |
|
572 |
+ - logaddexp( q+l0[i], p+l1[i] ); |
|
602 | 573 |
} |
603 | 574 |
|
604 | 575 |
return l; |
... | ... |
@@ -615,7 +586,7 @@ void motif::update_likelihood_column( double* L, size_t n, size_t m, size_t j, |
615 | 586 |
|
616 | 587 |
for( i = 0; i < n; i++ ) { |
617 | 588 |
if( (*training_seqs)[i]->get( parents + j * m, m, K ) ) { |
618 |
- L[ i * m + j ] = (*P)( j, K ); |
|
589 |
+ L[ i * m + j ] = (*training_seqs)[i]->w * (*P)( j, K ); |
|
619 | 590 |
} |
620 | 591 |
} |
621 | 592 |
} |
... | ... |
@@ -628,24 +599,24 @@ void motif::update_likelihood_column( double* L, size_t n, size_t m, size_t j, |
628 | 599 |
/* Akaike Information Criterion */ |
629 | 600 |
double aic( double L, double n_obs, double n_params, double c = 1.0 ) |
630 | 601 |
{ |
631 |
- return L - c*n_params*n_obs / (n_obs - n_params - 1.0); |
|
602 |
+ return L - c*n_params; |
|
632 | 603 |
} |
633 | 604 |
|
634 |
-/* Bayesian (Schwarz) Information Criterion */ |
|
635 |
-double bic( double L, double n_obs, double n_params, double c = 1.0 ) |
|
605 |
+/* Akaike Information Criterion with second order small-sample bias correction. |
|
606 |
+ * (From Hurvich and Tsai (1989)) */ |
|
607 |
+double aicc( double L, double n_obs, double n_params, double c = 1.0 ) |
|
636 | 608 |
{ |
637 |
- return 2.0*L - c*n_params*log(n_obs); |
|
609 |
+ return L - c*n_params |
|
610 |
+ - (n_params + 1) * (n_params + 2) / (n_obs - n_params - 2); |
|
638 | 611 |
} |
639 | 612 |
|
640 |
- |
|
641 |
-/* Hannan-Quinn Information Criterion */ |
|
642 |
-double hqic( double L, double n_obs, double n_params, double c = 1.0 ) |
|
613 |
+/* Bayesian (Schwarz) Information Criterion */ |
|
614 |
+double bic( double L, double n_obs, double n_params, double c = 1.0 ) |
|
643 | 615 |
{ |
644 |
- return L - c*n_params*log(log(n_obs)); |
|
616 |
+ return 2.0*L - c*n_params*log(n_obs); |
|
645 | 617 |
} |
646 | 618 |
|
647 | 619 |
|
648 |
- |
|
649 | 620 |
void train_motifs( motif& M0, motif& M1, |
650 | 621 |
const std::deque<sequence*>* training_seqs, |
651 | 622 |
size_t max_dep_dist, double complexity_penalty ) |
... | ... |
@@ -658,11 +629,11 @@ void train_motifs( motif& M0, motif& M1, |
658 | 629 |
failf( "Motif models of mismatching size. (%zu != %zu)\n", M0.n, M1.n ); |
659 | 630 |
} |
660 | 631 |
|
661 |
- double (*compute_ic)( double, double, double, double ) = aic; |
|
632 |
+ double (*compute_ic)( double, double, double, double ) = aicc; |
|
662 | 633 |
|
663 | 634 |
|
664 |
- size_t i, j; |
|
665 |
- size_t i_start; |
|
635 |
+ int i, j; |
|
636 |
+ int i_start, i_end; |
|
666 | 637 |
|
667 | 638 |
const size_t n = training_seqs->size(); |
668 | 639 |
const size_t m = M0.n; |
... | ... |
@@ -680,23 +651,51 @@ void train_motifs( motif& M0, motif& M1, |
680 | 651 |
double* l1 = new double[ n ]; memset( (void*)l1, 0, n * sizeof(double) ); |
681 | 652 |
|
682 | 653 |
|
654 |
+ /* vectors for saving and restoring parameters, to avoid recomputing |
|
655 |
+ * frequencies */ |
|
656 |
+ double* M0_row_j = new double[ M0.P->getm() ]; |
|
657 |
+ double* M1_row_j = new double[ M0.P->getm() ]; |
|
658 |
+ double* M0_row_i = new double[ M0.P->getm() ]; |
|
659 |
+ double* M1_row_i = new double[ M0.P->getm() ]; |
|
660 |
+ |
|
661 |
+ |
|
683 | 662 |
/* 0-1 vector giving labeling each sequence as foreground or background */ |
684 | 663 |
int* cs = new int[ n ]; |
685 | 664 |
|
686 |
- for( i = 0; i < n; i++ ) { |
|
665 |
+ for( i = 0; i < (int)n; i++ ) { |
|
687 | 666 |
cs[i] = (*training_seqs)[i]->c; |
688 | 667 |
} |
689 | 668 |
|
669 |
+ /* set prior probability of an example being foreground */ |
|
670 |
+ double ws[2]; |
|
671 |
+ for( i = 0; i < (int)n; i++ ) { |
|
672 |
+ if( cs[i] <= 1 ) ws[cs[i]] += (*training_seqs)[i]->w; |
|
673 |
+ } |
|
674 |
+ double prior = ws[1] / (ws[0] + ws[1]); |
|
675 |
+ |
|
690 | 676 |
|
691 | 677 |
/* backup likelihood matrix columns, to restore state after trying a new edge */ |
692 |
- double* b0 = new double[ n ]; |
|
693 |
- double* b1 = new double[ n ]; |
|
678 |
+ double* b0_j = new double[ n ]; |
|
679 |
+ double* b1_j = new double[ n ]; |
|
680 |
+ double* b0_i = new double[ n ]; |
|
681 |
+ double* b1_i = new double[ n ]; |
|
694 | 682 |
|
695 | 683 |
|
696 | 684 |
/* keeping track of the optimal edge */ |
697 |
- double ic, ic_curr, ic_best; |
|
698 |
- size_t j_best, i_best; |
|
685 |
+ double ic, ic_curr; |
|
686 |
+ |
|
687 |
+ double ic_forw_best; |
|
688 |
+ int j_forw_best, i_forw_best; |
|
699 | 689 |
|
690 |
+ double ic_back_best; |
|
691 |
+ int j_back_best, i_back_best; |
|
692 |
+ |
|
693 |
+ double ic_rev_best; |
|
694 |
+ int j_rev_best, i_rev_best; |
|
695 |
+ |
|
696 |
+ |
|
697 |
+ /* for cycle detection */ |
|
698 |
+ bool has_ij_path; |
|
700 | 699 |
|
701 | 700 |
/* parameters to compute information criterion */ |
702 | 701 |
double n_obs = training_seqs->size(); |
... | ... |
@@ -706,51 +705,81 @@ void train_motifs( motif& M0, motif& M1, |
706 | 705 |
/* log conditional likelihood */ |
707 | 706 |
double l; |
708 | 707 |
|
709 |
- |
|
710 | 708 |
/* baseline ic */ |
711 |
- l = conditional_likelihood( n, l0, l1, cs ); |
|
709 |
+ l = conditional_likelihood( n, l0, l1, cs, prior ); |
|
712 | 710 |
ic_curr = compute_ic( l, n_obs, n_params, complexity_penalty ); |
713 | 711 |
|
714 | 712 |
|
713 |
+ /* for pretty output */ |
|
714 |
+ size_t col; |
|
715 |
+ const char* col_base = "\n%34s"; |
|
716 |
+ const size_t col_max = 30; |
|
717 |
+ |
|
718 |
+ |
|
715 | 719 |
size_t round_num = 0; |
716 | 720 |
|
717 | 721 |
while( true ) { |
718 | 722 |
round_num++; |
719 | 723 |
|
720 | 724 |
log_printf( LOG_MSG, "round %4d (ic = %0.4e) ", round_num, ic_curr ); |
725 |
+ col = 0; |
|
726 |
+ |
|
727 |
+ ic_forw_best = ic_back_best = ic_rev_best = -HUGE_VAL; |
|
728 |
+ i_forw_best = i_back_best = i_rev_best = 0; |
|
729 |
+ j_forw_best = j_back_best = j_rev_best = 0; |
|
721 | 730 |
|
722 |
- ic_best = -HUGE_VAL; |
|
723 |
- j_best = i_best = 0; |
|
724 | 731 |
|
725 |
- for( j = 0; j < M0.n; j++ ) { |
|
732 |
+ /* phase 1: try all possible edge additions */ |
|
733 |
+ for( j = M0.n-1; j >= 0; j-- ) { |
|
726 | 734 |
|
727 | 735 |
if( M0.has_edge( j, j ) ) { |
728 |
- if( max_dep_dist == 0 || j <= max_dep_dist ) i_start = 0; |
|
729 |
- else i_start = i - max_dep_dist; |
|
736 |
+ if( max_dep_dist == 0 || j <= (int)max_dep_dist ) i_start = 0; |
|
737 |
+ else i_start = j - max_dep_dist; |
|
738 |
+ |
|
739 |
+ if( max_dep_dist == 0 ) i_end = M0.n - 1; |
|
740 |
+ else i_end = std::min( M0.n-1, j + max_dep_dist ); |
|
730 | 741 |
} |
731 |
- else i_start = j; |
|
742 |
+ else i_start = i_end = j; |
|
743 |
+ |
|
732 | 744 |
|
733 | 745 |
|
734 |
- for( i = i_start; i <= j; i++ ) { |
|
746 |
+ for( i = i_start; i <= i_end; i++ ) { |
|
735 | 747 |
|
748 |
+ /* skip existing edges */ |
|
736 | 749 |
if( M0.has_edge( i, j ) ) { |
737 | 750 |
continue; |
738 | 751 |
} |
739 | 752 |
|
753 |
+ /* skip edges that would introduce cycles */ |
|
754 |
+ if( M0.reachable( j, i ) ) { |
|
755 |
+ continue; |
|
756 |
+ } |
|
757 |
+ |
|
758 |
+ /* skip edges that would exceed the parent limit */ |
|
740 | 759 |
if( M0.num_parents(j) >= M0.k ) { |
741 | 760 |
continue; |
742 | 761 |
} |
743 | 762 |
|
744 |
- log_puts( LOG_MSG, "." ); |
|
763 |
+ /* skip edges that are equivalent to one already tried */ |
|
764 |
+ if( i > j && M0.num_parents(j) == 1 && M0.num_parents(i) == 1 ) { |
|
765 |
+ continue; |
|
766 |
+ } |
|
745 | 767 |
|
746 |
- /* keep track of the old parameters to avoid retraining */ |
|
747 |
- M0.store_row(j); |
|
748 |
- M1.store_row(j); |
|
768 |
+ |
|
769 |
+ log_puts( LOG_MSG, "+" ); |
|
770 |
+ if( ++col > col_max ) { |
|
771 |
+ col = 0; |
|
772 |
+ log_printf( LOG_MSG, col_base, "" ); |
|
773 |
+ } |
|
749 | 774 |
|
750 | 775 |
|
776 |
+ /* keep track of the old parameters to avoid retraining */ |
|
777 |
+ M0.P->getrow( j, M0_row_j ); |
|
778 |
+ M1.P->getrow( j, M1_row_j ); |
|
779 |
+ |
|
751 | 780 |
/* keep track of the old likelihoods to avoid reevaluating */ |
752 |
- colcpy( b0, L0, j, n, m ); |
|
753 |
- colcpy( b1, L1, j, n, m ); |
|
781 |
+ colcpy( b0_j, L0, j, n, m ); |
|
782 |
+ colcpy( b1_j, L1, j, n, m ); |
|
754 | 783 |
|
755 | 784 |
/* add edge */ |
756 | 785 |
M0.add_edge( i, j, training_seqs ); |
... | ... |
@@ -762,254 +791,313 @@ void train_motifs( motif& M0, motif& M1, |
762 | 791 |
|
763 | 792 |
|
764 | 793 |
/* update training example likelihoods */ |
765 |
- vecsub( l0, b0, n ); |
|
794 |
+ vecsub( l0, b0_j, n ); |
|
766 | 795 |
vecaddcol( l0, L0, n, m, j ); |
767 | 796 |
|
768 |
- vecsub( l1, b1, n ); |
|
797 |
+ vecsub( l1, b1_j, n ); |
|
769 | 798 |
vecaddcol( l1, L1, n, m, j ); |
770 | 799 |
|
771 | 800 |
|
772 |
- l = conditional_likelihood( n, l0, l1, cs ); |
|
801 |
+ l = conditional_likelihood( n, l0, l1, cs, prior ); |
|
773 | 802 |
n_params = M0.num_params() + M1.num_params(); |
774 | 803 |
ic = compute_ic( l, n_obs, n_params, complexity_penalty ); |
775 | 804 |
|
776 |
- if( ic > ic_best ) { |
|
777 |
- ic_best = ic; |
|
778 |
- i_best = i; |
|
779 |
- j_best = j; |
|
805 |
+ |
|
806 |
+ if( ic >= ic_forw_best ) { |
|
807 |
+ ic_forw_best = ic; |
|
808 |
+ i_forw_best = i; |
|
809 |
+ j_forw_best = j; |
|
780 | 810 |
} |
781 | 811 |
|
812 |
+ /* TODO: delete me */ |
|
813 |
+ //log_printf( LOG_MSG, "\n(%zu,%zu) : %0.4e\n", i, j, ic ); |
|
782 | 814 |
|
783 | 815 |
/* remove edge */ |
784 | 816 |
M0.set_edge( i, j, false ); |
785 | 817 |
M1.set_edge( i, j, false ); |
786 | 818 |
|
787 | 819 |
/* restore previous parameters */ |
788 |
- M0.restore_stored_row(); |
|
789 |
- M1.restore_stored_row(); |
|
820 |
+ M0.P->setrow( j, M0_row_j ); |
|
821 |
+ M1.P->setrow( j, M1_row_j ); |
|
790 | 822 |
|
791 | 823 |
/* restore previous likelihoods */ |
792 | 824 |
vecsubcol( l0, L0, n, m, j ); |
793 |
- vecadd( l0, b0, n ); |
|
825 |
+ vecadd( l0, b0_j, n ); |
|
794 | 826 |
|
795 | 827 |
vecsubcol( l1, L1, n, m, j ); |
796 |
- vecadd( l1, b1, n ); |
|
828 |
+ vecadd( l1, b1_j, n ); |
|
797 | 829 |
|
798 |
- matsetcol( L0, b0, n, m, j ); |
|
799 |
- matsetcol( L1, b1, n, m, j ); |
|
830 |
+ matsetcol( L0, b0_j, n, m, j ); |
|
831 |
+ matsetcol( L1, b1_j, n, m, j ); |
|
800 | 832 |
} |
801 | 833 |
} |
802 | 834 |
|
803 |
- log_puts( LOG_MSG, "\n" ); |
|
804 |
- |
|
805 |
- if( ic_best <= ic_curr || ic_best == -HUGE_VAL ) break; |
|
806 |
- |
|
807 |
- ic_curr = ic_best; |
|
808 |
- |
|
809 |
- M0.add_edge( i_best, j_best, training_seqs ); |
|
810 |
- M1.add_edge( i_best, j_best, training_seqs ); |
|
811 |
- |
|
812 |
- vecsubcol( l0, L0, n, m, j_best ); |
|
813 |
- vecsubcol( l1, L1, n, m, j_best ); |
|
814 |
- |
|
815 |
- M0.update_likelihood_column( L0, n, m, j_best, training_seqs ); |
|
816 |
- M1.update_likelihood_column( L1, n, m, j_best, training_seqs ); |
|
817 |
- |
|
818 |
- vecaddcol( l0, L0, n, m, j_best ); |
|
819 |
- vecaddcol( l1, L1, n, m, j_best ); |
|
820 |
- } |
|
835 |
+ /* phase 2: try all possible edge removals */ |
|
836 |
+ for( j = 0; j < (int)M0.n; j++ ) { |
|
837 |
+ for( i = 0; i < (int)M0.n; i++ ) { |
|
821 | 838 |
|
839 |
+ if( !M0.has_edge( i, j ) ) continue; |
|
840 |
+ if( i == j && M0.num_parents(j) > 1 ) continue; |
|
822 | 841 |
|
842 |
+ log_puts( LOG_MSG, "-" ); |
|
843 |
+ if( ++col > col_max ) { |
|
844 |
+ col = 0; |
|
845 |
+ log_printf( LOG_MSG, col_base, "" ); |
|
846 |
+ } |
|
823 | 847 |
|
824 |
- delete[] cs; |
|
825 |
- delete[] L0; |
|
826 |
- delete[] L1; |
|
827 |
- delete[] l0; |
|
828 |
- delete[] l1; |
|
829 |
- delete[] b0; |
|
830 |
- delete[] b1; |
|
831 |
- |
|
832 |
- log_unindent(); |
|
833 |
-} |
|
834 |
- |
|
835 |
- |
|
836 |
- |
|
837 |
- |
|
838 |
-void train_motifs_backwards( motif& M0, motif& M1, |
|
839 |
- const std::deque<sequence*>* training_seqs, |
|
840 |
- size_t max_dep_dist, double complexity_penalty ) |
|
841 |
-{ |
|
842 |
- log_puts( LOG_MSG, "training motifs (backwards) ...\n" ); |
|
843 |
- log_indent(); |
|
844 |
- |
|
845 |
- |
|
846 |
- if( M0.n != M1.n ) { |
|
847 |
- failf( "Motif models of mismatching size. (%zu != %zu)\n", M0.n, M1.n ); |
|
848 |
- } |
|
849 |
- |
|
850 |
- double (*compute_ic)( double, double, double, double ) = aic; |
|
851 |
- |
|
852 |
- size_t i, j; |
|
853 |
- size_t i_last; |
|
854 |
- |
|
855 |
- const size_t n = training_seqs->size(); |
|
856 |
- const size_t m = M0.n; |
|
857 |
- |
|
858 |
- /* likelihood matrices: |
|
859 |
- * Lk_ij gives the likelihood of training example i on node j in model k. |
|
860 |
- */ |
|
861 |
- double* L0 = new double[ n * m ]; memset( (void*)L0, 0, n * m * sizeof(double) ); |
|
862 |
- double* L1 = new double[ n * m ]; memset( (void*)L1, 0, n * m * sizeof(double) ); |
|
863 |
- |
|
864 |
- /* likelihood vectors: |
|
865 |
- * summed columns of L0, L1, maintained to minimize redundant computation. |
|
866 |
- * */ |
|
867 |
- double* l0 = new double[ n ]; memset( (void*)l0, 0, n * sizeof(double) ); |
|
868 |
- double* l1 = new double[ n ]; memset( (void*)l1, 0, n * sizeof(double) ); |
|
848 |
+ /* keep track of the old parameters to avoid retraining */ |
|
849 |
+ M0.P->getrow( j, M0_row_j ); |
|
850 |
+ M1.P->getrow( j, M1_row_j ); |
|
869 | 851 |
|
852 |
+ /* keep track of the old likelihoods to avoid reevaluating */ |
|
853 |
+ colcpy( b0_j, L0, j, n, m ); |
|
854 |
+ colcpy( b1_j, L1, j, n, m ); |
|
870 | 855 |
|
871 |
- /* 0-1 vectors giving labeling each sequence as foreground or background */ |
|
872 |
- int* cs = new int[ training_seqs->size() ]; |
|
856 |
+ /* remove edge */ |
|
857 |
+ M0.remove_edge( i, j, training_seqs ); |
|
858 |
+ M1.remove_edge( i, j, training_seqs ); |
|
873 | 859 |
|
874 |
- for( i = 0; i < training_seqs->size(); i++ ) { |
|
875 |
- cs[i] = (*training_seqs)[i]->c; |
|
876 |
- } |
|
860 |
+ /* evaluate likelihoods for that column */ |
|
861 |
+ M0.update_likelihood_column( L0, n, m, j, training_seqs ); |
|
862 |
+ M1.update_likelihood_column( L1, n, m, j, training_seqs ); |
|
877 | 863 |
|
864 |
+ /* update training example likelihoods */ |
|
865 |
+ vecsub( l0, b0_j, n ); |
|
866 |
+ vecaddcol( l0, L0, n, m, j ); |
|
878 | 867 |
|
879 |
- /* backup likelihood matrix columns, to restore state after trying a new edge */ |
|
880 |
- double* b0 = new double[ n ]; |
|
881 |
- double* b1 = new double[ n ]; |
|
882 |
- |
|
868 |
+ vecsub( l1, b1_j, n ); |
|
869 |
+ vecaddcol( l1, L1, n, m, j ); |
|
883 | 870 |
|
871 |
+ /* evaluate likelihood / ic */ |
|
872 |
+ l = conditional_likelihood( n, l0, l1, cs, prior ); |
|
873 |
+ n_params = M0.num_params() + M1.num_params(); |
|
874 |
+ ic = compute_ic( l, n_obs, n_params, complexity_penalty ); |
|
884 | 875 |
|
885 |
- /* keeping track of the optimal edge */ |
|
886 |
- double ic, ic_curr, ic_best; |
|
887 |
- size_t j_best, i_best; |
|
876 |
+ if( ic > ic_back_best ) { |
|
877 |
+ ic_back_best = ic; |
|
878 |
+ i_back_best = i; |
|
879 |
+ j_back_best = j; |
|
880 |
+ } |
|
888 | 881 |
|
882 |
+ /* TODO: delete me */ |
|
883 |
+ //log_printf( LOG_MSG, "\n(%zu,%zu) : %0.4e\n", i, j, ic ); |
|
889 | 884 |
|
890 |
- /* start with all edges */ |
|
891 |
- M0.add_all_edges( training_seqs ); |
|
892 |
- M1.add_all_edges( training_seqs ); |
|
885 |
+ /* replace edge */ |
|
886 |
+ M0.set_edge( i, j, true ); |
|
887 |
+ M1.set_edge( i, j, true ); |
|
893 | 888 |
|
889 |
+ /* restore previous parameters */ |
|
890 |
+ M0.P->setrow( j, M0_row_j ); |
|
891 |
+ M1.P->setrow( j, M1_row_j ); |
|
894 | 892 |
|
895 |
- /* parameters to compute information criterion */ |
|
896 |
- double n_obs = training_seqs->size(); |
|
897 |
- double n_params = M0.num_params() + M1.num_params(); |
|
893 |
+ /* restore previous likelihoods */ |
|
894 |
+ vecsubcol( l0, L0, n, m, j ); |
|
895 |
+ vecadd( l0, b0_j, n ); |
|
898 | 896 |
|
897 |
+ vecsubcol( l1, L1, n, m, j ); |
|
898 |
+ vecadd( l1, b1_j, n ); |
|
899 | 899 |
|
900 |
- /* log conditional likelihood */ |
|
901 |
- double l; |
|
900 |
+ matsetcol( L0, b0_j, n, m, j ); |
|
901 |
+ matsetcol( L1, b1_j, n, m, j ); |
|
902 |
+ } |
|
903 |
+ } |
|
902 | 904 |
|
903 |
- /* baseline ic */ |
|
904 |
- l = conditional_likelihood( n, l0, l1, cs ); |
|
905 |
- ic_curr = compute_ic( l, n_obs, n_params, complexity_penalty ); |
|
905 |
+ /* phase 3: try all possible edge reversals */ |
|
906 |
+ for( j = 0; j < (int)M0.n; j++ ) { |
|
907 |
+ for( i = 0; i < (int)M0.n; i++ ) { |
|
906 | 908 |
|
907 |
- size_t round = 0; |
|
909 |
+ /* skip nonsense reversals */ |
|
910 |
+ if( i == j ) continue; |
|
908 | 911 |
|
909 |
- while( true ) { |
|
910 |
- round++; |
|
912 |
+ /* skip reversals that add parameters */ |
|
913 |
+ if( !(M0.has_edge( i, j ) && M0.has_edge( i, i ) && M0.has_edge( j, j )) ) { |
|
914 |
+ continue; |
|
915 |
+ } |
|
911 | 916 |
|
912 |
- log_printf( LOG_MSG, "round %4d (ic = %0.4e) ", round, ic_curr ); |
|
917 |
+ /* skip reversals that would introduce cycles: |
|
918 |
+ * this is determined by cuting the (i,j) edge, then recomputing |
|
919 |
+ * reachability to see if an i, j path remains */ |
|
920 |
+ M0.set_edge( i, j, false ); |
|
921 |
+ M0.compute_reachability(); |
|
913 | 922 |
|
914 |
- ic_best = -HUGE_VAL; |
|
915 |
- j_best = i_best = 0; |
|
923 |
+ has_ij_path = M0.reachable( i, j ); |
|
916 | 924 |
|
917 |
- for( j = 0; j < M0.n; j++ ) { |
|
925 |
+ M0.set_edge( i, j, true ); |
|
926 |
+ M0.compute_reachability(); |
|
918 | 927 |
|
919 |
- /* do not remove the position unless it has no edges */ |
|
920 |
- i_last = M0.num_parents(j) > 1 ? j-1 : j; |
|
928 |
+ if( has_ij_path ) continue; |
|
921 | 929 |
|
922 |
- for( i = 0; i <= i_last; i++ ) { |
|
923 | 930 |
|
924 |
- if( !M0.has_edge( i, j ) ) continue; |
|
925 | 931 |
|
926 | 932 |
log_puts( LOG_MSG, "." ); |
933 |
+ if( ++col > col_max ) { |
|
934 |
+ col = 0; |
|
935 |
+ log_printf( LOG_MSG, col_base, "" ); |
|
936 |
+ } |
|
927 | 937 |
|
928 | 938 |
/* keep track of the old parameters to avoid retraining */ |
929 |
- M0.store_row(j); |
|
930 |
- M1.store_row(j); |
|
939 |
+ M0.P->getrow( j, M0_row_j ); |
|
940 |
+ M1.P->getrow( j, M1_row_j ); |
|
941 |
+ M0.P->getrow( i, M0_row_i ); |
|
942 |
+ M1.P->getrow( i, M1_row_i ); |
|
931 | 943 |
|
932 | 944 |
/* keep track of the old likelihoods to avoid reevaluating */ |
933 |
- colcpy( b0, L0, j, n, m ); |
|
934 |
- colcpy( b1, L1, j, n, m ); |
|
945 |
+ colcpy( b0_j, L0, j, n, m ); |
|
946 |
+ colcpy( b1_j, L1, j, n, m ); |
|
947 |
+ colcpy( b0_i, L0, i, n, m ); |
|
948 |
+ colcpy( b1_i, L1, i, n, m ); |
|
935 | 949 |
|
936 |
- /* remove edge */ |
|
950 |
+ /* reverse edge */ |
|
937 | 951 |
M0.remove_edge( i, j, training_seqs ); |
938 | 952 |
M1.remove_edge( i, j, training_seqs ); |
953 |
+ M0.add_edge( j, i, training_seqs ); |
|
954 |
+ M1.add_edge( j, i, training_seqs ); |
|
939 | 955 |
|
940 |
- /* evaluate likelihoods for that column */ |
|
941 |
- M0.update_likelihood_column( L0, j, n, m, training_seqs ); |
|
942 |
- M1.update_likelihood_column( L1, j, n, m, training_seqs ); |
|
956 |
+ /* evaluate likelihoods for those columnn */ |
|
957 |
+ M0.update_likelihood_column( L0, n, m, j, training_seqs ); |
|
958 |
+ M1.update_likelihood_column( L1, n, m, j, training_seqs ); |
|
959 |
+ M0.update_likelihood_column( L0, n, m, i, training_seqs ); |
|
960 |
+ M1.update_likelihood_column( L1, n, m, i, training_seqs ); |
|
943 | 961 |
|
944 | 962 |
/* update training example likelihoods */ |
945 |
- vecsub( l0, b0, n ); |
|
963 |
+ vecsub( l0, b0_j, n ); |
|
946 | 964 |
vecaddcol( l0, L0, n, m, j ); |
947 |
- |
|
948 |
- vecsub( l1, b1, n ); |
|
965 |
+ vecsub( l1, b1_j, n ); |
|
949 | 966 |
vecaddcol( l1, L1, n, m, j ); |
950 | 967 |
|
968 |
+ vecsub( l0, b0_i, n ); |
|
969 |
+ vecaddcol( l0, L0, n, m, i ); |
|
970 |
+ vecsub( l1, b1_i, n ); |
|
971 |
+ vecaddcol( l1, L1, n, m, i ); |
|
972 |
+ |
|
973 |
+ |
|
951 | 974 |
/* evaluate likelihood / ic */ |
952 |
- l = conditional_likelihood( n, l0, l1, cs ); |
|
975 |
+ l = conditional_likelihood( n, l0, l1, cs, prior ); |
|
953 | 976 |
n_params = M0.num_params() + M1.num_params(); |
954 | 977 |
ic = compute_ic( l, n_obs, n_params, complexity_penalty ); |
955 | 978 |
|
956 |
- if( ic > ic_best ) { |
|
957 |
- ic_best = ic; |
|
958 |
- i_best = i; |
|
959 |
- j_best = j; |
|
979 |
+ if( ic > ic_rev_best ) { |
|
980 |
+ ic_rev_best = ic; |
|
981 |
+ i_rev_best = i; |
|
982 |
+ j_rev_best = j; |
|
960 | 983 |
} |
961 | 984 |
|
985 |
+ /* TODO: delete me */ |
|
986 |
+ //log_printf( LOG_MSG, "\n(%zu,%zu) : %0.4e\n", i, j, ic ); |
|
987 |
+ |
|
962 | 988 |
/* replace edge */ |
989 |
+ M0.set_edge( j, i, false ); |
|
990 |
+ M1.set_edge( j, i, false ); |
|
963 | 991 |
M0.set_edge( i, j, true ); |
964 | 992 |
M1.set_edge( i, j, true ); |
965 | 993 |
|
966 | 994 |
/* restore previous parameters */ |
967 |
- M0.restore_stored_row(); |
|
968 |
- M1.restore_stored_row(); |
|
995 |
+ M0.P->setrow( j, M0_row_j ); |
|
996 |
+ M1.P->setrow( j, M1_row_j ); |
|
997 |
+ M0.P->setrow( i, M0_row_i ); |
|
998 |
+ M1.P->setrow( i, M1_row_i ); |
|
969 | 999 |
|
970 | 1000 |
/* restore previous likelihoods */ |
971 | 1001 |
vecsubcol( l0, L0, n, m, j ); |
972 |
- vecadd( l0, b0, n ); |
|
973 |
- |
|
1002 |
+ vecadd( l0, b0_j, n ); |
|
974 | 1003 |
vecsubcol( l1, L1, n, m, j ); |
975 |
- vecadd( l1, b1, n ); |
|
1004 |
+ vecadd( l1, b1_j, n ); |
|
1005 |
+ |
|
1006 |
+ vecsubcol( l0, L0, n, m, i ); |
|
1007 |
+ vecadd( l0, b0_i, n ); |
|
1008 |
+ vecsubcol( l1, L1, n, m, i ); |
|
1009 |
+ vecadd( l1, b1_i, n ); |
|
976 | 1010 |
|
977 |
- matsetcol( L0, b0, n, m, j ); |
|
978 |
- matsetcol( L1, b1, n, m, j ); |
|
1011 |
+ matsetcol( L0, b0_j, n, m, j ); |
|
1012 |
+ matsetcol( L1, b1_j, n, m, j ); |
|
1013 |
+ matsetcol( L0, b0_i, n, m, i ); |
|
1014 |
+ matsetcol( L1, b1_i, n, m, i ); |
|
979 | 1015 |
} |
980 | 1016 |
} |
981 | 1017 |
|
1018 |
+ |
|
1019 |
+ |
|
982 | 1020 |
log_puts( LOG_MSG, "\n" ); |
983 | 1021 |
|
1022 |
+ if( std::max( std::max( ic_forw_best, ic_back_best ), ic_rev_best ) <= ic_curr ) break; |
|
1023 |
+ |
|
1024 |
+ if( ic_forw_best > ic_back_best && ic_forw_best > ic_rev_best ) { |
|
1025 |
+ log_printf( LOG_MSG, " [+] %zu->%zu\n", i_forw_best, j_forw_best ); |
|
1026 |
+ |
|
1027 |
+ ic_curr = ic_forw_best; |
|
984 | 1028 |
|
985 |
- if( ic_best <= ic_curr || ic_best == -HUGE_VAL ) break; |
|
1029 |
+ M0.add_edge( i_forw_best, j_forw_best, training_seqs ); |
|
1030 |
+ M1.add_edge( i_forw_best, j_forw_best, training_seqs ); |
|
986 | 1031 |
|
987 |
- ic_curr = ic_best; |
|
1032 |
+ vecsubcol( l0, L0, n, m, j_forw_best ); |
|
1033 |
+ vecsubcol( l1, L1, n, m, j_forw_best ); |
|
1034 |
+ |
|
1035 |
+ M0.update_likelihood_column( L0, n, m, j_forw_best, training_seqs ); |
|
1036 |
+ M1.update_likelihood_column( L1, n, m, j_forw_best, training_seqs ); |
|
1037 |
+ |
|
1038 |
+ vecaddcol( l0, L0, n, m, j_forw_best ); |
|
1039 |
+ vecaddcol( l1, L1, n, m, j_forw_best ); |
|
1040 |
+ } |
|
1041 |
+ else if( ic_back_best > ic_rev_best ) { |
|
1042 |
+ log_printf( LOG_MSG, " [-] %zu->%zu\n", i_back_best, j_back_best ); |
|
1043 |
+ ic_curr = ic_back_best; |
|
988 | 1044 |
|
989 |
- M0.remove_edge( i_best, j_best, training_seqs ); |
|
990 |
- M1.remove_edge( i_best, j_best, training_seqs ); |
|
1045 |
+ M0.remove_edge( i_back_best, j_back_best, training_seqs ); |
|
1046 |
+ M1.remove_edge( i_back_best, j_back_best, training_seqs ); |
|
991 | 1047 |
|
992 |
- vecsubcol( l0, L0, n, m, j_best ); |
|
993 |
- vecsubcol( l1, L1, n, m, j_best ); |
|
1048 |
+ vecsubcol( l0, L0, n, m, j_back_best ); |
|
1049 |
+ vecsubcol( l1, L1, n, m, j_back_best ); |
|
994 | 1050 |
|
995 |
- M0.update_likelihood_column( L0, n, m, j_best, training_seqs ); |
|
996 |
- M1.update_likelihood_column( L1, n, m, j_best, training_seqs ); |
|
1051 |
+ M0.update_likelihood_column( L0, n, m, j_back_best, training_seqs ); |
|
1052 |
+ M1.update_likelihood_column( L1, n, m, j_back_best, training_seqs ); |
|
997 | 1053 |
|
998 |
- vecaddcol( l0, L0, n, m, j_best ); |
|
999 |
- vecaddcol( l1, L1, n, m, j_best ); |
|
1054 |
+ vecaddcol( l0, L0, n, m, j_back_best ); |
|
1055 |
+ vecaddcol( l1, L1, n, m, j_back_best ); |
|
1056 |
+ } |
|
1057 |
+ else { |
|
1058 |
+ log_printf( LOG_MSG, " [.] %zu->%zu\n", i_rev_best, j_rev_best ); |
|
1059 |
+ ic_curr = ic_rev_best; |
|
1060 |
+ |
|
1061 |
+ M0.remove_edge( i_rev_best, j_rev_best, training_seqs ); |
|
1062 |
+ M1.remove_edge( i_rev_best, j_rev_best, training_seqs ); |
|
1063 |
+ M0.add_edge( j_rev_best, i_rev_best, training_seqs ); |
|
1064 |
+ M1.add_edge( j_rev_best, i_rev_best, training_seqs ); |
|
1065 |
+ |
|
1066 |
+ vecsubcol( l0, L0, n, m, j_rev_best ); |
|
1067 |
+ vecsubcol( l1, L1, n, m, j_rev_best ); |
|
1068 |
+ vecsubcol( l0, L0, n, m, i_rev_best ); |
|
1069 |
+ vecsubcol( l1, L1, n, m, i_rev_best ); |
|
1070 |
+ |
|
1071 |
+ M0.update_likelihood_column( L0, n, m, j_rev_best, training_seqs ); |
|
1072 |
+ M1.update_likelihood_column( L1, n, m, j_rev_best, training_seqs ); |
|
1073 |
+ M0.update_likelihood_column( L0, n, m, i_rev_best, training_seqs ); |
|
1074 |
+ M1.update_likelihood_column( L1, n, m, i_rev_best, training_seqs ); |
|
1075 |
+ |
|
1076 |
+ vecaddcol( l0, L0, n, m, j_rev_best ); |
|
1077 |
+ vecaddcol( l1, L1, n, m, j_rev_best ); |
|
1078 |
+ vecaddcol( l0, L0, n, m, i_rev_best ); |
|
1079 |
+ vecaddcol( l1, L1, n, m, i_rev_best ); |
|
1080 |
+ } |
|
1000 | 1081 |
} |
1001 | 1082 |
|
1002 |
- |
|
1083 |
+ |
|
1003 | 1084 |
|
1004 | 1085 |
delete[] cs; |
1005 | 1086 |
delete[] L0; |
1006 | 1087 |
delete[] L1; |
1007 | 1088 |
delete[] l0; |
1008 | 1089 |
delete[] l1; |
1009 |
- delete[] b0; |
|
1010 |
- delete[] b1; |
|
1011 |
- |
|
1090 |
+ delete[] M0_row_j; |
|
1091 |
+ delete[] M1_row_j; |
|
1092 |
+ delete[] M0_row_i; |
|
1093 |
+ delete[] M1_row_i; |
|
1094 |
+ delete[] b0_j; |
|
1095 |
+ delete[] b1_j; |
|
1096 |
+ delete[] b0_i; |
|
1097 |
+ delete[] b1_i; |
|
1098 |
+ |
|
1012 | 1099 |
log_unindent(); |
1013 | 1100 |
} |
1014 | 1101 |
|
1015 | 1102 |
|
1103 |
+ |
... | ... |
@@ -32,8 +32,11 @@ class kmer_matrix |
32 | 32 |
|
33 | 33 |
void to_yaml( YAML::Emitter& out ) const; |
34 | 34 |
|
35 |
- void setall( double x ); |
|
36 |
- void setrow( size_t i, double x ); |
|
35 |
+ void setall( double x ); |
|
36 |
+ void setrowall( size_t i, double x ); |
|
37 |
+ |
|
38 |
+ void getrow( size_t i, double* x ); |
|
39 |
+ void setrow( size_t i, double* x ); |
|
37 | 40 |
|
38 | 41 |
size_t getn() const; |
39 | 42 |
size_t getm() const; |
... | ... |
@@ -47,24 +50,15 @@ class kmer_matrix |
47 | 50 |
void dist_marginalize( size_t i, size_t j ); |
48 | 51 |
|
49 | 52 |
void dist_conditionalize( int effective_k = -1 ); |
50 |
- void dist_conditionalize_row( size_t i, int effective_k = -1 ); |
|
53 |
+ void dist_conditionalize_row( size_t i, size_t j, int effective_k = -1 ); |
|
51 | 54 |
void log_transform_row( size_t i, int effective_k = -1 ); |
52 | 55 |
|
53 |
- /* allows one row to be stored then reset, which is used when searching |
|
54 |
- * for the optimal edge to add when training the model */ |
|
55 |
- void store_row( size_t i ); |
|
56 |
- void restore_stored_row(); |
|
57 |
- |
|
58 |
- |
|
59 | 56 |
private: |
60 | 57 |
|
61 |
- size_t k; |
|
62 |
- |
|
63 |
- size_t n, m; |
|
58 |
+ size_t k; // size of k-mer |
|
59 |
+ size_t n; // number of positions |
|
60 |
+ size_t m; // 4^k |
|
64 | 61 |
double* A; |
65 |
- |
|
66 |
- double* stored_row; |
|
67 |
- size_t stored_row_index; |
|
68 | 62 |
}; |
69 | 63 |
|
70 | 64 |
|
... | ... |
@@ -76,7 +70,7 @@ class kmer_matrix |
76 | 70 |
class sequence |
77 | 71 |
{ |
78 | 72 |
public: |
79 |
- sequence( const char* s, int c = 0 ); |
|
73 |
+ sequence( const char* s, int c = 0, double w = 1.0 ); |
|
80 | 74 |
sequence( const sequence& ); |
81 | 75 |
void operator=( const sequence& ); |
82 | 76 |
~sequence(); |
... | ... |
@@ -85,6 +79,7 @@ class sequence |
85 | 79 |
bool get( const bool* indexes, size_t maxn, kmer& K, size_t offset = 0 ) const; |
86 | 80 |
|
87 | 81 |
int c; /* class (i.e. foreground (0) or foreground (1)) */ |
82 |
+ double w; |
|
88 | 83 |
|
89 | 84 |
private: |
90 | 85 |
kmer* xs; |
... | ... |
@@ -112,7 +107,6 @@ class motif |
112 | 107 |
|
113 | 108 |
void add_edge( size_t i, size_t j, const std::deque<sequence*>* data ); |
114 | 109 |
void remove_edge( size_t i, size_t j, const std::deque<sequence*>* data ); |
115 |
- void add_all_edges( const std::deque<sequence*>* data ); |
|
116 | 110 |
|
117 | 111 |
double eval( const sequence&, size_t offset = 0 ) const; |
118 | 112 |
double eval_node( size_t i, const std::deque<sequence*>* data, |
... | ... |
@@ -120,8 +114,6 @@ class motif |
120 | 114 |
|
121 | 115 |
size_t num_params() const; |
122 | 116 |
|
123 |
- void store_row( size_t i ); |
|
124 |
- void restore_stored_row(); |
|
125 | 117 |
char* print_model_graph( int offset = 0 ); |
126 | 118 |
|
127 | 119 |
int c; /* which subset of the training data to consider */ |
... | ... |
@@ -132,6 +124,9 @@ class motif |
132 | 124 |
bool has_edge( size_t i, size_t j ); |
133 | 125 |
void set_edge( size_t i, size_t j, bool ); |
134 | 126 |
|
127 |
+ bool reachable( size_t i, size_t j ); |
|
128 |
+ void compute_reachability(); |
|
129 |
+ |
|
135 | 130 |
|
136 | 131 |
void update_likelihood_column( double* L, size_t n, size_t m, size_t j, |
137 | 132 |
const std::deque<sequence*>* training_seqs ); |
... | ... |
@@ -141,6 +136,7 @@ class motif |
141 | 136 |
kmer_matrix* P; |
142 | 137 |
|
143 | 138 |
bool* parents; |
139 |
+ bool* R; // reachability |
|
144 | 140 |
|
145 | 141 |
static const double pseudocount; |
146 | 142 |
|
... | ... |
@@ -149,19 +145,12 @@ class motif |
149 | 145 |
const std::deque<sequence*>* training_seqs, |
150 | 146 |
size_t max_dep_dist, double complexity_penalty ); |
151 | 147 |
|
152 |
- friend void train_motifs_backwards( motif& M0, motif& M1, |
|
153 |
- const std::deque<sequence*>* training_seqs, |
|
154 |
- size_t max_dep_dist, double complexity_penalty ); |
|
155 | 148 |
}; |
156 | 149 |
|
157 | 150 |
void train_motifs( motif& M0, motif& M1, |
158 | 151 |
const std::deque<sequence*>* training_seqs, |
159 | 152 |
size_t max_dep_dist = 0, double complexity_penalty = 1.0 ); |
160 | 153 |
|
161 |
-void train_motifs_backwards( motif& M0, motif& M1, |
|
162 |
- const std::deque<sequence*>* training_seqs, |
|
163 |
- size_t max_dep_dist = 0, double complexity_penalty = 1.0 ); |
|
164 |
- |
|
165 | 154 |
|
166 | 155 |
|
167 | 156 |
|
... | ... |
@@ -41,10 +41,52 @@ static double rand_gauss( double sigma ) |
41 | 41 |
} |
42 | 42 |
|
43 | 43 |
|
44 |
+static double rand_trunc_gauss( double sigma, double a, double b ) |
|
45 |
+{ |
|
46 |
+ double x; |
|
47 |
+ do { |
|
48 |
+ x = rand_gauss( sigma ); |
|
49 |
+ } while( x < a || x > b ); |
|
50 |
+ |
|
51 |
+ return x; |
|
52 |
+} |
|
53 |
+ |
|
54 |
+ |
|
55 |
+double gauss_pdf (const double x, const double sigma) |
|
56 |
+{ |
|
57 |
+ double u = x / fabs (sigma); |
|
58 |
+ double p = (1 / (sqrt (2 * M_PI) * fabs (sigma))) * exp (-u * u / 2); |
|
59 |
+ return p; |
|
60 |
+} |
|
61 |
+ |
|
62 |
+ |
|
44 | 63 |
/* pseudocount used when sampling foreground and background nucleotide * frequencies */ |
45 | 64 |
const double sequencing_bias::pseudocount = 1; |
46 | 65 |
|
47 | 66 |
|
67 |
+int read_pos_tid_compare( const void* p1, const void* p2 ) |
|
68 |
+{ |
|
69 |
+ return ((read_pos*)p1)->tid - ((read_pos*)p2)->tid; |
|
70 |
+} |
|
71 |
+ |
|
72 |
+int read_pos_count_compare( const void* p1, const void* p2 ) |
|
73 |
+{ |
|
74 |
+ return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count; |
|
75 |
+} |
|
76 |
+ |
|
77 |
+ |
|
78 |
+int read_pos_tid_count_compare( const void* p1, const void* p2 ) |
|
79 |
+{ |
|
80 |
+ int c = (int)(((read_pos*)p1)->tid - ((read_pos*)p2)->tid); |
|
81 |
+ |
|
82 |
+ if( c == 0 ) { |
|
83 |
+ return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count; |
|
84 |
+ } |
|
85 |
+ else return c; |
|
86 |
+} |
|
87 |
+ |
|
88 |
+ |
|
89 |
+ |
|
48 | 90 |
sequencing_bias::sequencing_bias() |
49 | 91 |
: ref_f(NULL) |
50 | 92 |
, ref_fn(NULL) |
... | ... |
@@ -78,19 +120,35 @@ sequencing_bias::sequencing_bias( const char* ref_fn, |
78 | 120 |
} |
79 | 121 |
|
80 | 122 |
|
81 |
- |
|
82 | 123 |
sequencing_bias::sequencing_bias( const char* ref_fn, |
83 | 124 |
const char* reads_fn, |
84 |
- size_t n, pos L, pos R, |
|
85 |
- bool train_backwards, |
|
86 |
- double complexity_penalty ) |
|
125 |
+ size_t max_reads, pos L, pos R, |
|
126 |
+ double complexity_penalty, |
|
127 |
+ double offset_std ) |
|
87 | 128 |
: ref_f(NULL) |
88 | 129 |
, ref_fn(NULL) |
89 | 130 |
, M0(NULL), M1(NULL) |
90 | 131 |
{ |
91 |
- build( ref_fn, reads_fn, n, L, R, train_backwards, complexity_penalty ); |
|
132 |
+ build( ref_fn, reads_fn, max_reads, L, R, |
|
133 |
+ complexity_penalty, offset_std ); |
|
92 | 134 |
} |
93 | 135 |
|
136 |
+ |
|
137 |
+ |
|
138 |
+sequencing_bias::sequencing_bias( const char* ref_fn, |
|
139 |
+ table* T, size_t max_reads, |
|
140 |
+ pos L, pos R, |
|
141 |
+ double complexity_penalty, |
|
142 |
+ double offset_std ) |
|
143 |
+ : ref_f(NULL) |
|
144 |
+ , ref_fn(NULL) |
|
145 |
+ , M0(NULL), M1(NULL) |
|
146 |
+{ |
|
147 |
+ build( ref_fn, T, max_reads, L, R, complexity_penalty, offset_std ); |
|
148 |
+} |
|
149 |
+ |
|
150 |
+ |
|
151 |
+ |
|
94 | 152 |
sequencing_bias* sequencing_bias::copy() const |
95 | 153 |
{ |
96 | 154 |
sequencing_bias* sb = new sequencing_bias(); |
... | ... |
@@ -167,40 +225,86 @@ void sequencing_bias::clear() |
167 | 225 |
} |
168 | 226 |
|
169 | 227 |
|
228 |
+ |
|
170 | 229 |
void sequencing_bias::build( const char* ref_fn, |
171 | 230 |
const char* reads_fn, |
172 |
- size_t n, pos L, pos R, |
|
173 |
- bool train_backwards, |
|
174 |
- double complexity_penalty ) |
|
231 |
+ size_t max_reads, pos L, pos R, |
|
232 |
+ double complexity_penalty, |
|
233 |
+ double offset_std ) |
|
175 | 234 |
{ |
176 |
- log_puts( LOG_MSG, "Determining sequencing bias...\n" ); |
|
235 |
+ log_printf( LOG_MSG, "hashing reads ... \n" ); |
|
177 | 236 |
log_indent(); |
178 | 237 |
|
179 |
- clock_t t0, t1; |
|
180 |
- t0 = clock(); |
|
238 |
+ samfile_t* reads_f = samopen( reads_fn, "rb", NULL ); |
|
239 |
+ if( reads_f == NULL ) { |
|
240 |
+ failf( "Can't open bam file '%s'.", reads_fn ); |
|
241 |
+ } |
|
242 |
+ |
|
243 |
+ bam_index_t* reads_index = bam_index_load( reads_fn ); |
|
244 |
+ if( reads_index == NULL ) { |
|
245 |
+ failf( "Can't open bam index '%s.bai'.", reads_fn ); |
|
246 |
+ } |
|
247 |
+ |
|
248 |
+ bam_init_header_hash( reads_f->header ); |
|
249 |
+ |
|
250 |
+ bam1_t* read = bam_init1(); |
|
251 |
+ |
|
252 |
+ table T; |
|
253 |
+ table_create( &T, reads_f->header->n_targets ); |
|
254 |
+ T.seq_names = reads_f->header->target_name; |
|
255 |
+ |
|
256 |
+ size_t k = 0; |
|
257 |
+ |
|
258 |
+ while( samread( reads_f, read ) > 0 ) { |
|
259 |
+ k++; |
|
260 |
+ if( k % 1000000 == 0 ) { |
|
261 |
+ log_printf( LOG_MSG, "%zu reads\n", k ); |
|
262 |
+ } |
|
263 |
+ table_inc( &T, read ); |
|
264 |
+ } |
|
265 |
+ log_printf( LOG_MSG, "%zu reads\n", k ); |
|
266 |
+ |
|
267 |
+ bam_destroy1(read); |
|
268 |
+ |
|
269 |
+ log_puts( LOG_MSG, "done.\n" ); |
|
270 |
+ log_unindent(); |
|
271 |
+ |
|
272 |
+ build( ref_fn, &T, max_reads, L, R, complexity_penalty, offset_std ); |
|
273 |
+ |
|
274 |
+ table_destroy( &T ); |
|
181 | 275 |
|
276 |
+ bam_index_destroy(reads_index); |
|
277 |
+ samclose( reads_f ); |
|
278 |