
merged a number of improvements

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqbias@53976 bc3139a8-67e5-0310-9ffc-ced21a209358

Daniel Jones authored on 23/03/2011 19:03:43
Showing 11 changed files

... ...
@@ -1,5 +1,5 @@
 Package: seqbias
-Version: 0.99.6
+Version: 0.99.7
 Date: 25-12-2010
 Title: Estimation of per-position bias in high-throughput sequencing data
 Description: This package implements a model of per-position sequencing bias in
... ...
@@ -13,7 +13,7 @@ YAML_CPP_OBJ = aliascontent.o conversion.o emitter.o emitterstate.o \
                stream.o tag.o
 
 SEQBIAS_OBJ = sequencing_bias.o kmers.o miscmath.o common.o \
-			  table.o superfasthash.o logger.o seqbias.o \
+			  table.o logger.o seqbias.o \
 
 GNULIB_OBJ = asprintf.o printf-args.o printf-parse.o vasnprintf.o vasprintf.o
 
... ...
@@ -5,6 +5,8 @@
 #include <algorithm>
 #include <cstdlib>
 #include "samtools/faidx.h"
+#include "samtools/sam.h"
+#include "samtools/bam.h"
 
 
 typedef long          pos;     /* genomic position */
... ...
@@ -24,6 +26,12 @@ void rev( T* xs, int n ) {
     while( i < j ) std::swap(xs[i++],xs[j--]);
 }
 
+
+/* defined in bam_aux.c */
+extern "C" {
+void bam_init_header_hash(bam_header_t *header);
+}
+
 /* Change the behavior of the faidx_fetch_seq function to be more useful. If
  * coordinates are outside the actual sequence, write N's, rather than adjusting
  * the start,end. */
... ...
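
Note: the comment at the end of the hunk above describes padding out-of-range coordinates with N's rather than clipping. A minimal sketch of that idea (my illustration; the wrapper name fetch_padded and its exact handling are assumptions, not the package's actual function):

#include <algorithm>
#include <cstring>
#include <cstdlib>
#include "samtools/faidx.h"

/* Hypothetical sketch: fetch [start,end] from seq_name, writing 'N' for
 * positions that fall outside the sequence instead of shrinking the range. */
char* fetch_padded( const faidx_t* fai, const char* seq_name, int start, int end )
{
    int len = end - start + 1;
    char* out = new char[ len + 1 ];
    memset( out, 'N', len );
    out[len] = '\0';

    int got = 0;
    char* s = faidx_fetch_seq( fai, (char*)seq_name,
                               std::max( start, 0 ), end, &got );
    if( s != NULL ) {
        /* faidx_fetch_seq clips to the actual sequence, so copy whatever
         * it returned into the correct offset of the padded buffer */
        int offset = start < 0 ? -start : 0;
        memcpy( out + offset, s, std::min( got, len - offset ) );
        free( s );
    }
    return out;
}
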
@@ -3,7 +3,6 @@
 #include "common.hpp"
 #include "miscmath.hpp"
 #include "logger.h"
-#include "asprintf.h"
 
 #include <string>
 #include <cstring>
... ...
@@ -24,8 +23,6 @@ kmer_matrix::kmer_matrix( const YAML::Node& node )
     for( i = 0; i < n*m; i++ ) {
         node_A[i] >> A[i];
     }
-
-    stored_row = new double[ m ];
 }
 
 kmer_matrix::kmer_matrix( size_t n, size_t k )
... ...
@@ -34,7 +31,6 @@ kmer_matrix::kmer_matrix( size_t n, size_t k )
     m = 1<<(2*k); /* == 4^k */
 
     A = new double[ n * m ];
-    stored_row = new double[ m ];
 }
 
 kmer_matrix::kmer_matrix( const kmer_matrix& M )
... ...
@@ -45,10 +41,6 @@ kmer_matrix::kmer_matrix( const kmer_matrix& M )
     m = M.m;
     A = new double[ n * m ];
     memcpy( (void*)A, (void*)M.A, n * m * sizeof(double) );
-
-
-    stored_row = new double[ m ];
-    memcpy( (void*)stored_row, (void*)M.stored_row, m * sizeof(double) );
 }
 
 size_t kmer_matrix::getn() const { return n; }
... ...
@@ -66,13 +58,9 @@ void kmer_matrix::operator=( const kmer_matrix& M )
 
         delete[] A;
         A = new double[ n * m ];
-
-        delete[] stored_row;
-        stored_row = new double[ m ];
     }
 
     memcpy( (void*)A, (void*)M.A, n * m * sizeof(double) );
-    memcpy( (void*)stored_row, (void*)M.stored_row, m * sizeof(double) );
 }
 
 
... ...
@@ -81,11 +69,11 @@ void kmer_matrix::to_yaml( YAML::Emitter& out ) const
     out << YAML::BeginMap;
 
     out << YAML::Key   << "k";
-    out << YAML::Value << (unsigned int)k;
+    out << YAML::Value << k;
     out << YAML::Key   << "n";
-    out << YAML::Value << (unsigned int)n;
+    out << YAML::Value << n;
     out << YAML::Key   << "m";
-    out << YAML::Value << (unsigned int)m;
+    out << YAML::Value << m;
     out << YAML::Key   << "A";
     out << YAML::Flow;
     out << YAML::Value;
... ...
@@ -103,7 +91,6 @@ void kmer_matrix::to_yaml( YAML::Emitter& out ) const
 kmer_matrix::~kmer_matrix()
 {
     delete[] A;
-    delete[] stored_row;
 }
 
 
... ...
@@ -119,7 +106,7 @@ void kmer_matrix::setall( double x )
     for( i = 0; i < n * m; i++ ) A[i] = x;
 }
 
-void kmer_matrix::setrow( size_t i, double x )
+void kmer_matrix::setrowall( size_t i, double x )
 {
     size_t j;
     for( j = 0; j < m; j++ ) {
... ...
@@ -127,6 +114,16 @@ void kmer_matrix::setrow( size_t i, double x )
     }
 }
 
+void kmer_matrix::getrow( size_t i, double* xs )
+{
+    memcpy( (void*)xs, (void*)(A + i * m), m * sizeof(double) );
+}
+
+void kmer_matrix::setrow( size_t i, double* xs )
+{
+    memcpy( (void*)(A + i * m ), (void*)xs, m * sizeof(double) );
+}
+
 
 void kmer_matrix::dist_normalize()
 {
... ...
@@ -155,21 +152,30 @@ void kmer_matrix::dist_conditionalize( int effective_k )
 }
 
 
-void kmer_matrix::dist_conditionalize_row( size_t i, int effective_k  )
+void kmer_matrix::dist_conditionalize_row( size_t i, size_t j, int effective_k  )
 {
     if( effective_k <= 0 ) effective_k = m;
-
-    kmer L;
-    kmer L_max = 1 << (2*(effective_k-1));
+    kmer L, R;
+    kmer L_max = 1 << (2*j);
+    kmer R_max = 1 << (2*(effective_k-j-1));
     kmer K;
-    kmer nt;
     double z;
+    kmer nt;
 
     for( L = 0; L < L_max; L++ ) {
-        K = L<<2;
-        z = 0.0;
-        for( nt = 0; nt < 4; nt++ ) z += A[ i * m + (nt | K) ];
-        for( nt = 0; nt < 4; nt++ ) A[ i * m + (nt | K) ] /= z;
+        for( R = 0; R < R_max; R++ ) {
+            z = 0.0;
+
+            for( nt = 0; nt < 4; nt++ ) {
+                K = (L<<(2*(effective_k-j))) | (nt<<(2*(effective_k-j-1))) | R;
+                z += A[ i*m + K ];
+            }
+
+            for( nt = 0; nt < 4; nt++ ) {
+                K = (L<<(2*(effective_k-j))) | (nt<<(2*(effective_k-j-1))) | R;
+                A[ i*m + K ] /= z;
+            }
+        }
     }
 }
 
... ...
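
Note: the new dist_conditionalize_row normalizes the distribution over the nucleotide at position j jointly for every left context L and right context R, where K packs the k-mer two bits per nucleotide with L in the high bits, nt in the middle, and R in the low bits. A small self-contained check of that arithmetic (my toy; the A=0, C=1, G=2, T=3 encoding is an assumption):

#include <cstdio>

typedef unsigned int kmer; /* two bits per nucleotide; assume A=0, C=1, G=2, T=3 */

int main()
{
    /* pack the 3-mer "ACG" with j = 1 left-context position:
     * L = "A" (high bits), nt = 'C' (middle two bits), R = "G" (low bits) */
    const int effective_k = 3, j = 1;
    kmer L = 0, nt = 1, R = 2;
    kmer K = (L << (2*(effective_k-j))) | (nt << (2*(effective_k-j-1))) | R;
    printf( "K = %u\n", K ); /* prints 6, i.e. binary 000110 */
    return 0;
}
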
@@ -186,24 +192,11 @@ void kmer_matrix::log_transform_row( size_t i, int effective_k )
 }
 
 
-void kmer_matrix::store_row( size_t i )
-{
-    memcpy( (void*)stored_row, (void*)(A + i * m), m * sizeof(double) );
-    stored_row_index = i;
-}
-
-
-void kmer_matrix::restore_stored_row()
-{
-    memcpy( (void*)(A + stored_row_index * m), (void*)stored_row, m * sizeof(double) );
-}
-
-
 const size_t sequence::kmer_max_k = 4*sizeof(kmer);
 
 
-sequence::sequence( const char* s, int c )
-    : c(c), xs(NULL), n(0)
+sequence::sequence( const char* s, int c, double w )
+    : c(c), w(w), xs(NULL), n(0)
 {
     n = strlen(s);
     if( n > 0 ) {
... ...
@@ -223,6 +216,7 @@ sequence::sequence( const sequence& s )
     : xs(NULL), n(0)
 {
     c = s.c;
+    w = s.w;
     n = s.n;
     if( n > 0 ) {
         xs = new kmer[ n/kmer_max_k + 1 ];
... ...
@@ -291,6 +285,10 @@ motif::motif( const YAML::Node& node )
     }
 
     P = new kmer_matrix( node["P"] );
+
+    R = new bool[n*n];
+    memset( R, 0, n*n*sizeof(bool) );
+    compute_reachability();
 }
 
 motif::motif( size_t n, size_t k, int c )
... ...
@@ -301,6 +299,9 @@ motif::motif( size_t n, size_t k, int c )
 
     parents = new bool[n*n];
     memset( parents, 0, n*n*sizeof(bool) );
+
+    R = new bool[n*n];
+    memset( R, 0, n*n*sizeof(bool) );
 }
 
 
... ...
@@ -313,6 +314,9 @@ motif::motif( const motif& M )
 
     parents = new bool[n*n];
     memcpy( parents, M.parents, n*n*sizeof(bool) );
+
+    R = new bool[n*n];
+    memset( R, 0, n*n*sizeof(bool) );
 }
 
 
... ...
@@ -320,6 +324,7 @@ motif::motif( const motif& M )
 motif::~motif()
 {
     delete[] parents;
+    delete[] R;
     delete P;
 }
 
... ...
@@ -329,10 +334,10 @@ void motif::to_yaml( YAML::Emitter& out ) const
     out << YAML::BeginMap;
 
     out << YAML::Key   << "n";
-    out << YAML::Value << (unsigned int)n;
+    out << YAML::Value << n;
 
     out << YAML::Key   << "k";
-    out << YAML::Value << (unsigned int)k;
+    out << YAML::Value << k;
 
     out << YAML::Key   << "c";
     out << YAML::Value << c;
... ...
@@ -385,7 +390,7 @@ size_t motif::num_parents( size_t i ) const
 {
     size_t j;
     size_t M = 0;
-    for( j = 0; j <= i; j++ ) {
+    for( j = 0; j < n; j++ ) {
         if( parents[i*n+j] ) M++;
     }
 
... ...
@@ -402,19 +407,29 @@ void motif::set_edge( size_t i, size_t j, bool x )
     parents[j*n+i] = x;
 }
 
-
-void motif::store_row( size_t i )
+bool motif::reachable( size_t i, size_t j )
 {
-    P->store_row( i );
+    return R[j*n+i];
 }
 
-
-void motif::restore_stored_row()
+void motif::compute_reachability()
 {
-    P->restore_stored_row();
+    /* find the transitive closure of the parents matrix via Floyd-Warshall */
+
+    memcpy( R, parents, n*n*sizeof(bool) );
+
+    size_t k, i, j;
+    for( k = 0; k < n; k++ ) {
+        for( i = 0; i < n; i++ ) {
+            for( j = 0; j < n; j++ ) {
+                R[j*n+i] = R[j*n+i] || (R[k*n+i] && R[j*n+k]);
+            }
+        }
+    }
 }
 
 
+
 char* motif::print_model_graph( int offset )
 {
     std::string graph_str;
... ...
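
Note: R is indexed like parents, so R[j*n+i] answers "is there a directed path from i to j?". A self-contained illustration of the same Floyd-Warshall-style closure on a 3-node chain (my toy example, not package code):

#include <cstring>
#include <cstdio>

int main()
{
    /* adjacency for a 3-node chain 0 -> 1 -> 2, stored parents-style:
     * parents[j*n+i] is true when there is an edge i -> j */
    const size_t n = 3;
    bool parents[n*n] = { false };
    parents[1*n+0] = true; /* 0 -> 1 */
    parents[2*n+1] = true; /* 1 -> 2 */

    bool R[n*n];
    memcpy( R, parents, sizeof(R) );

    /* transitive closure: afterwards R[j*n+i] is true iff some
     * directed path leads from i to j */
    for( size_t k = 0; k < n; k++ )
        for( size_t i = 0; i < n; i++ )
            for( size_t j = 0; j < n; j++ )
                R[j*n+i] = R[j*n+i] || (R[k*n+i] && R[j*n+k]);

    printf( "0 reaches 2: %s\n", R[2*n+0] ? "yes" : "no" ); /* yes */
    return 0;
}
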
@@ -439,7 +454,8 @@ char* motif::print_model_graph( int offset )
     for( j = 0; j < (int)n; j++ ) {
         if( !parents[j*n+j] ) continue;
 
-        for( i = 0; i < j; i++ ) {
+        for( i = 0; i < (int)n; i++ ) {
+            if( i == j ) continue;
             if( parents[j*n+i] ) {
                 r = asprintf( &tmp, "n%d -> n%d;\n", i, j );
                 graph_str += tmp;
... ...
@@ -455,50 +471,13 @@ char* motif::print_model_graph( int offset )
 }
 
 
-void motif::add_all_edges( const std::deque<sequence*>* data )
-{
-    size_t i, j;
-    size_t n_parents;
-    size_t m;
-    kmer K;
-    std::deque<sequence*>::const_iterator seq;
-
-    for( j = 0; j < n; j++ ) {
-        for( i = k-1 <= j ? j-(k-1) : 0; i <= j; i++ ) {
-            set_edge( i, j, true );
-        }
-        P->setrow( j, 0.0 );
-        n_parents = num_parents(j);
-        m = 1 << (2*n_parents);
-
-        for( K = 0; K < m; K++ ) {
-            (*P)( j, K ) = pseudocount;
-        }
-
-        for( seq = data->begin(); seq != data->end(); seq++ ) {
-            if( (*seq)->c == c && (*seq)->get( parents + j*n, n, K ) ) {
-                (*P)( j, K )++;
-            }
-        }
-
-        P->dist_normalize_row( j );
-        P->dist_conditionalize_row( j, n_parents );
-        P->log_transform_row( j, n_parents );
-    }
-}
-
-
 /* make an edge i --> j
  * That is, condition j on i. */
 void motif::add_edge( size_t i, size_t j, const std::deque<sequence*>* data )
 {
-    if( i > j ) {
-        failf( "Invalid motif edge (%zu, %zu)\n", i, j );
-    }
-
     set_edge( i, j, true );
 
-    P->setrow( j, 0.0 );
+    P->setrowall( j, 0.0 );
     size_t n_parents = num_parents(j);
     size_t m = 1 << (2*n_parents);
     kmer K;
... ...
@@ -510,26 +489,30 @@ void motif::add_edge( size_t i, size_t j, const std::deque<sequence*>* data )
     std::deque<sequence*>::const_iterator seq;
     for( seq = data->begin(); seq != data->end(); seq++ ) {
         if( (*seq)->c == c && (*seq)->get( parents + j*n, n, K ) ) {
-            (*P)( j, K )++;
+            (*P)( j, K ) += (*seq)->w;
         }
     }
 
+    size_t n_pred_parents = 0;
+    size_t u;
+    for( u = 0; u < j; u++ ) {
+        if( has_edge( u, j ) ) n_pred_parents++;
+    }
+
     P->dist_normalize_row( j );
-    P->dist_conditionalize_row( j, n_parents );
+    P->dist_conditionalize_row( j, n_pred_parents, n_parents );
     P->log_transform_row( j, n_parents );
+
+    compute_reachability();
 }
 
 
 
 void motif::remove_edge( size_t i, size_t j, const std::deque<sequence*>* data )
 {
-    if( i > j ) {
-        failf( "Invalid motif edge (%zu, %zu)\n", i, j );
-    }
-
     set_edge( i, j, false );
 
-    P->setrow( j, 0.0 );
+    P->setrowall( j, 0.0 );
     size_t n_parents = num_parents(j);
     size_t m = 1 << (2*n_parents);
     kmer K;
... ...
@@ -545,38 +528,22 @@ void motif::remove_edge( size_t i, size_t j, const std::deque<sequence*>* data )
         }
     }
 
+    size_t n_pred_parents = 0;
+    size_t u;
+    for( u = 0; u < j; u++ ) {
+        if( has_edge( u, j ) ) n_pred_parents++;
+    }
+
     P->dist_normalize_row( j );
-    P->dist_conditionalize_row( j, n_parents );
+    P->dist_conditionalize_row( j, n_pred_parents, n_parents );
     P->log_transform_row( j, n_parents );
-}
 
+    compute_reachability();
+}
 
 
 
 
-double motif_log_likelihood( const motif& M0, const motif& M1,
-                             const std::deque<sequence*>* training_seqs )
-{
-    double L, L0, L1;
-
-    L = 0.0;
-
-    std::deque<sequence*>::const_iterator i;
-    for( i = training_seqs->begin(); i != training_seqs->end(); i++ ) {
-        L0 = M0.eval( **i );
-        L1 = M1.eval( **i );
-        if( (*i)->c == 0 ) {
-            L += L0 - logaddexp( L0, L1 );
-        }
-        else if( (*i)->c == 1 ) {
-            L += L1 - logaddexp( L0, L1 );
-        }
-    }
-
-    return L;
-}
-
-
 
 /* Let x_i be the sequence of training example i, c be the class, and M be the
  * model. Then given l0 and l1, where,
... ...
@@ -590,15 +557,19 @@ double motif_log_likelihood( const motif& M0, const motif& M1,
 *                      = sum_i log( P( x_i | c_i, M ) / ( P( x_i | c = 0, M ) + P( x_i | c = 1, M ) )
 *
 */
-double conditional_likelihood( size_t n, const double* l0, const double* l1, const int* c )
+double conditional_likelihood( size_t n, const double* l0, const double* l1, const int* c,
+                               double prior )
 {
     double l;
     size_t i;
 
+    double p = log(prior);
+    double q = log(1-prior);
+
     l = 0.0;
     for( i = 0; i < n; i++ ) {
-        l += (c[i] == 0 ? l0[i] : l1[i])
-                    - logaddexp( l0[i], l1[i] );
+        l += (c[i] == 0 ? (q+l0[i]) : (p+l1[i]))
+                    - logaddexp( q+l0[i], p+l1[i] );
     }
 
     return l;
... ...
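
Note: with the new prior argument, each term of the sum is a class posterior rather than an equal-weight mixture. In math form (my restatement of the code above, with \pi = P(c = 1) = prior):

\ell \;=\; \sum_{i=1}^{n} \Big[ \log\!\big( P(c_i)\, P(x_i \mid c_i) \big)
      \;-\; \log\!\big( (1-\pi)\, P(x_i \mid c{=}0) + \pi\, P(x_i \mid c{=}1) \big) \Big]

Here q + l0[i] and p + l1[i] are the log numerator terms for the two classes, and logaddexp computes the log of the denominator without underflow.
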
@@ -615,7 +586,7 @@ void motif::update_likelihood_column( double* L, size_t n, size_t m, size_t j,
 
     for( i = 0; i < n; i++ )  {
         if( (*training_seqs)[i]->get( parents + j * m, m, K ) ) {
-            L[ i * m + j ] = (*P)( j, K );
+            L[ i * m + j ] = (*training_seqs)[i]->w * (*P)( j, K );
         }
     }
 }
... ...
@@ -628,24 +599,24 @@ void motif::update_likelihood_column( double* L, size_t n, size_t m, size_t j,
 
 /* Akaike Information Criterion */
 double aic( double L, double n_obs, double n_params, double c = 1.0 )
-    return L - c*n_params*n_obs / (n_obs - n_params - 1.0);
+    return L - c*n_params;
 }
 
-/* Bayesian (Schwarz) Information Criterion */
-double bic( double L, double n_obs, double n_params, double c = 1.0 )
+/* Akaike Information Criterion with second order small-sample bias correction.
+ * (From Hurvich and Tsai (1989)) */
+double aicc( double L, double n_obs, double n_params, double c = 1.0 )
 {
-    return 2.0*L - c*n_params*log(n_obs);
+    return L - c*n_params
+             - (n_params + 1) * (n_params + 2) / (n_obs - n_params - 2);
 }
 
-
-/* Hannan-Quinn Information Criterion */
-double hqic( double L, double n_obs, double n_params, double c = 1.0 )
+/* Bayesian (Schwarz) Information Criterion */
+double bic( double L, double n_obs, double n_params, double c = 1.0 )
 {
-    return L - c*n_params*log(log(n_obs));
+    return 2.0*L - c*n_params*log(n_obs);
 }
 
 
-
 void train_motifs( motif& M0, motif& M1,
                    const std::deque<sequence*>* training_seqs,
                    size_t max_dep_dist, double complexity_penalty )
... ...
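
Note: for reference (my restatement, not text from the commit), Hurvich and Tsai's small-sample correction is usually quoted in the minimize form, with k parameters and n observations:

\mathrm{AICc} \;=\; -2\ln L \;+\; 2k \;+\; \frac{2k(k+1)}{n-k-1}

The aicc() above works in a maximize-form convention (log likelihood minus a penalty) and uses a (k+1)(k+2)/(n-k-2) variant of the correction term; both blow up the penalty as n_params approaches n_obs, which is the point of the correction.
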
@@ -658,11 +629,11 @@ void train_motifs( motif& M0, motif& M1,
         failf( "Motif models of mismatching size. (%zu != %zu)\n", M0.n, M1.n );
     }
 
-    double (*compute_ic)( double, double, double, double ) = aic;
+    double (*compute_ic)( double, double, double, double ) = aicc;
 
 
-    size_t i, j;
-    size_t i_start;
+    int i, j;
+    int i_start, i_end;
 
     const size_t n = training_seqs->size();
     const size_t m = M0.n;
... ...
@@ -680,23 +651,51 @@ void train_motifs( motif& M0, motif& M1,
     double* l1 = new double[ n ]; memset( (void*)l1, 0, n * sizeof(double) );
 
 
+    /* vectors for saving and restoring parameters, to avoid recomputing
+     * frequencies */
+    double* M0_row_j = new double[ M0.P->getm() ];
+    double* M1_row_j = new double[ M0.P->getm() ];
+    double* M0_row_i = new double[ M0.P->getm() ];
+    double* M1_row_i = new double[ M0.P->getm() ];
+
+
     /* 0-1 vector labeling each sequence as foreground or background */
     int* cs = new int[ n ];
 
-    for( i = 0; i < n; i++ ) {
+    for( i = 0; i < (int)n; i++ ) {
         cs[i] = (*training_seqs)[i]->c;
     }
 
+    /* set prior probability of an example being foreground */
+    double ws[2] = { 0.0, 0.0 };  /* zero-initialize before accumulating weights */
+    for( i = 0; i < (int)n; i++ ) {
+        if( cs[i] <= 1 ) ws[cs[i]] += (*training_seqs)[i]->w;
+    }
+    double prior = ws[1] / (ws[0] + ws[1]);
+
 
     /* backup likelihood matrix columns, to restore state after trying a new edge */
-    double* b0 = new double[ n ];
-    double* b1 = new double[ n ];
+    double* b0_j = new double[ n ];
+    double* b1_j = new double[ n ];
+    double* b0_i = new double[ n ];
+    double* b1_i = new double[ n ];
 
 
     /* keeping track of the optimal edge */
-    double ic, ic_curr, ic_best;
-    size_t j_best, i_best;
+    double ic, ic_curr;
+
+    double ic_forw_best;
+    int j_forw_best, i_forw_best;
 
+    double ic_back_best;
+    int j_back_best, i_back_best;
+
+    double ic_rev_best;
+    int j_rev_best, i_rev_best;
+
+
+    /* for cycle detection */
+    bool has_ij_path;
 
     /* parameters to compute information criterion */
     double n_obs    = training_seqs->size();
... ...
@@ -706,51 +705,81 @@ void train_motifs( motif& M0, motif& M1,
     /* log conditional likelihood */
     double l; 
 
-
    /* baseline ic */
-    l = conditional_likelihood( n, l0, l1, cs );
+    l = conditional_likelihood( n, l0, l1, cs, prior );
     ic_curr = compute_ic( l, n_obs, n_params, complexity_penalty );
 
 
+    /* for pretty output */
+    size_t col;
+    const char* col_base = "\n%34s";
+    const size_t col_max = 30;
+
+
     size_t round_num = 0;
 
     while( true ) {
         round_num++;
 
         log_printf( LOG_MSG, "round %4d (ic = %0.4e) ", round_num, ic_curr );
+        col = 0;
+
+        ic_forw_best = ic_back_best = ic_rev_best = -HUGE_VAL;
+        i_forw_best = i_back_best = i_rev_best = 0;
+        j_forw_best = j_back_best = j_rev_best = 0;
 
-        ic_best = -HUGE_VAL;
-        j_best = i_best = 0;
 
-        for( j = 0; j < M0.n; j++ ) {
+        /* phase 1: try all possible edge additions */
+        for( j = M0.n-1; j >= 0; j-- ) {
 
             if( M0.has_edge( j, j ) ) {
-                if( max_dep_dist == 0 || j <= max_dep_dist ) i_start = 0;
-                else i_start = i - max_dep_dist;
+                if( max_dep_dist == 0 || j <= (int)max_dep_dist ) i_start = 0;
+                else i_start = j - max_dep_dist;
+
+                if( max_dep_dist == 0 ) i_end = M0.n - 1;
+                else i_end = std::min( M0.n-1, j + max_dep_dist );
             }
-            else i_start = j;
+            else i_start = i_end = j;
+
 
 
-            for( i = i_start; i <= j; i++ ) {
+            for( i = i_start; i <= i_end; i++ ) {
 
+                /* skip existing edges  */
                 if( M0.has_edge( i, j ) ) {
                     continue;
                 }
 
+                /* skip edges that would introduce cycles */
+                if( M0.reachable( j, i ) ) {
+                    continue;
+                }
+
+                /* skip edges that would exceed the parent limit */
                 if( M0.num_parents(j) >= M0.k ) {
                     continue;
                 }
 
-                log_puts( LOG_MSG, "." );
+                /* skip edges that are equivalent to one already tried */
+                if( i > j && M0.num_parents(j) == 1 && M0.num_parents(i) == 1 ) {
+                    continue;
+                }
 
-                /* keep track of the old parameters to avoid retraining */
-                M0.store_row(j);
-                M1.store_row(j);
+
+                log_puts( LOG_MSG, "+" );
+                if( ++col > col_max ) {
+                    col = 0;
+                    log_printf( LOG_MSG, col_base, "" );
+                }
 
 
+                /* keep track of the old parameters to avoid retraining */
+                M0.P->getrow( j, M0_row_j );
+                M1.P->getrow( j, M1_row_j );
+
                 /* keep track of the old likelihoods to avoid reevaluating */
-                colcpy( b0, L0, j, n, m  );
-                colcpy( b1, L1, j, n, m );
+                colcpy( b0_j, L0, j, n, m  );
+                colcpy( b1_j, L1, j, n, m );
 
                 /* add edge */
                 M0.add_edge( i, j, training_seqs );
... ...
@@ -762,254 +791,313 @@ void train_motifs( motif& M0, motif& M1,
 
 
                 /* update training example likelihoods */
-                vecsub( l0, b0, n );
+                vecsub( l0, b0_j, n );
                 vecaddcol( l0, L0, n, m, j );
 
-                vecsub( l1, b1, n );
+                vecsub( l1, b1_j, n );
                 vecaddcol( l1, L1, n, m, j );
 
 
-                l        = conditional_likelihood( n, l0, l1, cs );
+                l        = conditional_likelihood( n, l0, l1, cs, prior );
                 n_params = M0.num_params() + M1.num_params();
                 ic       = compute_ic( l, n_obs, n_params, complexity_penalty );
 
-                if( ic > ic_best ) {
-                    ic_best = ic;
-                    i_best = i;
-                    j_best = j;
+
+                if( ic >= ic_forw_best ) {
+                    ic_forw_best = ic;
+                    i_forw_best = i;
+                    j_forw_best = j;
                 }
 
+                /* TODO: delete me */
+                //log_printf( LOG_MSG, "\n(%zu,%zu) : %0.4e\n", i, j, ic );
 
                 /* remove edge */
                 M0.set_edge( i, j, false );
                 M1.set_edge( i, j, false );
 
                 /* restore previous parameters */
-                M0.restore_stored_row();
-                M1.restore_stored_row();
+                M0.P->setrow( j, M0_row_j );
+                M1.P->setrow( j, M1_row_j );
 
                 /* restore previous likelihoods */
                 vecsubcol( l0, L0, n, m, j );
-                vecadd( l0, b0, n );
+                vecadd( l0, b0_j, n );
 
                 vecsubcol( l1, L1, n, m, j );
-                vecadd( l1, b1, n );
+                vecadd( l1, b1_j, n );
 
-                matsetcol( L0, b0, n, m, j );
-                matsetcol( L1, b1, n, m, j );
+                matsetcol( L0, b0_j, n, m, j );
+                matsetcol( L1, b1_j, n, m, j );
             }
         }
 
-        log_puts( LOG_MSG, "\n" );
-
-        if( ic_best <= ic_curr || ic_best == -HUGE_VAL ) break;
-
-        ic_curr = ic_best;
-
-        M0.add_edge( i_best, j_best, training_seqs );
-        M1.add_edge( i_best, j_best, training_seqs );
-
-        vecsubcol( l0, L0, n, m, j_best );
-        vecsubcol( l1, L1, n, m, j_best );
-
-        M0.update_likelihood_column( L0, n, m, j_best, training_seqs );
-        M1.update_likelihood_column( L1, n, m, j_best, training_seqs );
-
-        vecaddcol( l0, L0, n, m, j_best );
-        vecaddcol( l1, L1, n, m, j_best );
-    }
+        /* phase 2: try all possible edge removals */
+        for( j = 0; j < (int)M0.n; j++ ) {
+            for( i = 0; i < (int)M0.n; i++ ) {
 
+                if( !M0.has_edge( i, j ) ) continue;
+                if( i == j && M0.num_parents(j) > 1 ) continue;
 
+                log_puts( LOG_MSG, "-" );
+                if( ++col > col_max ) {
+                    col = 0;
+                    log_printf( LOG_MSG, col_base, "" );
+                }
 
-    delete[] cs;
-    delete[] L0;
-    delete[] L1;
-    delete[] l0;
-    delete[] l1;
-    delete[] b0;
-    delete[] b1;
-    
-    log_unindent();
-}
-
-
-
-
-void train_motifs_backwards( motif& M0, motif& M1,
-                             const std::deque<sequence*>* training_seqs,
-                             size_t max_dep_dist, double complexity_penalty )
-{
-    log_puts( LOG_MSG, "training motifs (backwards) ...\n" );
-    log_indent();
-
-
-    if( M0.n != M1.n ) {
-        failf( "Motif models of mismatching size. (%zu != %zu)\n", M0.n, M1.n );
-    }
-
-    double (*compute_ic)( double, double, double, double ) = aic;
-
-    size_t i, j;
-    size_t i_last;
-
-    const size_t n = training_seqs->size();
-    const size_t m = M0.n;
-
-    /* likelihood matrices: 
-     * Lk_ij gives the likelihood of training example i on node j in model k.
-     */
-    double* L0 = new double[ n * m ]; memset( (void*)L0, 0, n * m * sizeof(double) );
-    double* L1 = new double[ n * m ]; memset( (void*)L1, 0, n * m * sizeof(double) );
-
-    /* likelihood vectors:
-     * summed columns of L0, L1, maintained to minimize redundant computation.
-     * */
-    double* l0 = new double[ n ]; memset( (void*)l0, 0, n * sizeof(double) );
-    double* l1 = new double[ n ]; memset( (void*)l1, 0, n * sizeof(double) );
+                /* keep track of the old parameters to avoid retraining */
+                M0.P->getrow( j, M0_row_j );
+                M1.P->getrow( j, M1_row_j );
 
+                /* keep track of the old likelihoods to avoid reevaluating */
+                colcpy( b0_j, L0, j, n, m  );
+                colcpy( b1_j, L1, j, n, m );
 
-    /* 0-1 vectors giving labeling each sequence as foreground or background */
-    int* cs = new int[ training_seqs->size() ];
+                /* remove edge */
+                M0.remove_edge( i, j, training_seqs );
+                M1.remove_edge( i, j, training_seqs );
 
-    for( i = 0; i < training_seqs->size(); i++ ) {
-        cs[i] = (*training_seqs)[i]->c;
-    }
+                /* evaluate likelihoods for that column */
+                M0.update_likelihood_column( L0, n, m, j, training_seqs );
+                M1.update_likelihood_column( L1, n, m, j, training_seqs );
 
+                /* update training example likelihoods */
+                vecsub( l0, b0_j, n );
+                vecaddcol( l0, L0, n, m, j );
 
-    /* backup likelihood matrix columns, to restore state after trying a new edge */
-    double* b0 = new double[ n ];
-    double* b1 = new double[ n ];
-    
+                vecsub( l1, b1_j, n );
+                vecaddcol( l1, L1, n, m, j );
 
+                /* evaluate likelihood / ic */
+                l        = conditional_likelihood( n, l0, l1, cs, prior );
+                n_params = M0.num_params() + M1.num_params();
+                ic       = compute_ic( l, n_obs, n_params, complexity_penalty );
 
-    /* keeping track of the optimal edge */
-    double ic, ic_curr, ic_best;
-    size_t j_best, i_best;
+                if( ic > ic_back_best ) {
+                    ic_back_best = ic;
+                    i_back_best = i;
+                    j_back_best = j;
+                }
 
+                /* TODO: delete me */
+                //log_printf( LOG_MSG, "\n(%zu,%zu) : %0.4e\n", i, j, ic );
 
-    /* start with all edges */
-    M0.add_all_edges( training_seqs );
-    M1.add_all_edges( training_seqs );
+                /* replace edge */
+                M0.set_edge( i, j, true );
+                M1.set_edge( i, j, true );
 
+                /* restore previous parameters */
+                M0.P->setrow( j, M0_row_j );
+                M1.P->setrow( j, M1_row_j );
 
-    /* parameters to compute information criterion */
-    double n_obs    = training_seqs->size();
-    double n_params = M0.num_params() + M1.num_params();
+                /* restore previous likelihoods */
+                vecsubcol( l0, L0, n, m, j );
+                vecadd( l0, b0_j, n );
 
+                vecsubcol( l1, L1, n, m, j );
+                vecadd( l1, b1_j, n );
 
-    /* log conditional likelihood */
-    double l; 
+                matsetcol( L0, b0_j, n, m, j );
+                matsetcol( L1, b1_j, n, m, j );
+            }
+        }
 
-    /* baseline ic */
-    l = conditional_likelihood( n, l0, l1, cs );
-    ic_curr = compute_ic( l, n_obs, n_params, complexity_penalty );
+        /* phase 3: try all possible edge reversals */
+        for( j = 0; j < (int)M0.n; j++ ) {
+            for( i = 0; i < (int)M0.n; i++ ) {
 
-    size_t round = 0;
+                /* skip nonsense reversals */
+                if( i == j ) continue;
 
-    while( true ) {
-        round++;
+                /* skip reversals that add parameters */
+                if( !(M0.has_edge( i, j ) && M0.has_edge( i, i ) && M0.has_edge( j, j )) ) {
+                    continue;
+                }
 
-        log_printf( LOG_MSG, "round %4d (ic = %0.4e) ", round, ic_curr );
+                /* skip reversals that would introduce cycles:
+                 * this is determined by cutting the (i,j) edge, then recomputing
+                 * reachability to see if an i, j path remains */
+                M0.set_edge( i, j, false );
+                M0.compute_reachability();
 
-        ic_best = -HUGE_VAL;
-        j_best = i_best = 0;
+                has_ij_path = M0.reachable( i, j );
 
-        for( j = 0; j < M0.n; j++ ) {
+                M0.set_edge( i, j, true );
+                M0.compute_reachability();
 
-            /* do not remove the position unless it has no edges */
-            i_last = M0.num_parents(j) > 1 ? j-1 : j;
+                if( has_ij_path ) continue;
 
-            for( i = 0; i <= i_last; i++ ) {
 
-                if( !M0.has_edge( i, j ) ) continue;
 
                 log_puts( LOG_MSG, "." );
+                if( ++col > col_max ) {
+                    col = 0;
+                    log_printf( LOG_MSG, col_base, "" );
+                }
 
                 /* keep track of the old parameters to avoid retraining */
-                M0.store_row(j);
-                M1.store_row(j);
+                M0.P->getrow( j, M0_row_j );
+                M1.P->getrow( j, M1_row_j );
+                M0.P->getrow( i, M0_row_i );
+                M1.P->getrow( i, M1_row_i );
 
                 /* keep track of the old likelihoods to avoid reevaluating */
-                colcpy( b0, L0, j, n, m  );
-                colcpy( b1, L1, j, n, m );
+                colcpy( b0_j, L0, j, n, m  );
+                colcpy( b1_j, L1, j, n, m );
+                colcpy( b0_i, L0, i, n, m  );
+                colcpy( b1_i, L1, i, n, m );
 
-                /* remove edge */
+                /* reverse edge */
                 M0.remove_edge( i, j, training_seqs );
                 M1.remove_edge( i, j, training_seqs );
+                M0.add_edge( j, i, training_seqs );
+                M1.add_edge( j, i, training_seqs );
 
-                /* evaluate likelihoods for that column */
-                M0.update_likelihood_column( L0, j, n, m, training_seqs );
-                M1.update_likelihood_column( L1, j, n, m, training_seqs );
+                /* evaluate likelihoods for those columns */
+                M0.update_likelihood_column( L0, n, m, j, training_seqs );
+                M1.update_likelihood_column( L1, n, m, j, training_seqs );
+                M0.update_likelihood_column( L0, n, m, i, training_seqs );
+                M1.update_likelihood_column( L1, n, m, i, training_seqs );
 
                 /* update training example likelihoods */
-                vecsub( l0, b0, n );
+                vecsub( l0, b0_j, n );
                 vecaddcol( l0, L0, n, m, j );
-
-                vecsub( l1, b1, n );
+                vecsub( l1, b1_j, n );
                 vecaddcol( l1, L1, n, m, j );
 
+                vecsub( l0, b0_i, n );
+                vecaddcol( l0, L0, n, m, i );
+                vecsub( l1, b1_i, n );
+                vecaddcol( l1, L1, n, m, i );
+
+
                 /* evaluate likelihood / ic */
-                l        = conditional_likelihood( n, l0, l1, cs );
+                l        = conditional_likelihood( n, l0, l1, cs, prior );
                 n_params = M0.num_params() + M1.num_params();
                 ic       = compute_ic( l, n_obs, n_params, complexity_penalty );
 
-                if( ic > ic_best ) {
-                    ic_best = ic;
-                    i_best = i;
-                    j_best = j;
+                if( ic > ic_rev_best ) {
+                    ic_rev_best = ic;
+                    i_rev_best = i;
+                    j_rev_best = j;
                 }
 
+                /* TODO: delete me */
+                //log_printf( LOG_MSG, "\n(%zu,%zu) : %0.4e\n", i, j, ic );
+
                 /* replace edge */
+                M0.set_edge( j, i, false );
+                M1.set_edge( j, i, false );
                 M0.set_edge( i, j, true );
                 M1.set_edge( i, j, true );
 
                 /* restore previous parameters */
-                M0.restore_stored_row();
-                M1.restore_stored_row();
+                M0.P->setrow( j, M0_row_j );
+                M1.P->setrow( j, M1_row_j );
+                M0.P->setrow( i, M0_row_i );
+                M1.P->setrow( i, M1_row_i );
 
                 /* restore previous likelihoods */
                 vecsubcol( l0, L0, n, m, j );
-                vecadd( l0, b0, n );
-
+                vecadd( l0, b0_j, n );
                 vecsubcol( l1, L1, n, m, j );
-                vecadd( l1, b1, n );
+                vecadd( l1, b1_j, n );
+
+                vecsubcol( l0, L0, n, m, i );
+                vecadd( l0, b0_i, n );
+                vecsubcol( l1, L1, n, m, i );
+                vecadd( l1, b1_i, n );
 
-                matsetcol( L0, b0, n, m, j );
-                matsetcol( L1, b1, n, m, j );
+                matsetcol( L0, b0_j, n, m, j );
+                matsetcol( L1, b1_j, n, m, j );
+                matsetcol( L0, b0_i, n, m, i );
+                matsetcol( L1, b1_i, n, m, i );
             }
         }
 
+
+
         log_puts( LOG_MSG, "\n" );
 
+        if( std::max( std::max( ic_forw_best, ic_back_best ), ic_rev_best ) <= ic_curr ) break;
+
+        if( ic_forw_best > ic_back_best && ic_forw_best > ic_rev_best ) {
+            log_printf( LOG_MSG, " [+] %zu->%zu\n", i_forw_best, j_forw_best );
+
+            ic_curr = ic_forw_best;
 
-        if( ic_best <= ic_curr || ic_best == -HUGE_VAL ) break;
+            M0.add_edge( i_forw_best, j_forw_best, training_seqs );
+            M1.add_edge( i_forw_best, j_forw_best, training_seqs );
 
-        ic_curr = ic_best;
+            vecsubcol( l0, L0, n, m, j_forw_best );
+            vecsubcol( l1, L1, n, m, j_forw_best );
+
+            M0.update_likelihood_column( L0, n, m, j_forw_best, training_seqs );
+            M1.update_likelihood_column( L1, n, m, j_forw_best, training_seqs );
+
+            vecaddcol( l0, L0, n, m, j_forw_best );
+            vecaddcol( l1, L1, n, m, j_forw_best );
+        }
+        else if( ic_back_best > ic_rev_best ) {
+            log_printf( LOG_MSG, " [-] %zu->%zu\n", i_back_best, j_back_best );
+            ic_curr = ic_back_best;
 
-        M0.remove_edge( i_best, j_best, training_seqs );
-        M1.remove_edge( i_best, j_best, training_seqs );
+            M0.remove_edge( i_back_best, j_back_best, training_seqs );
+            M1.remove_edge( i_back_best, j_back_best, training_seqs );
 
-        vecsubcol( l0, L0, n, m, j_best );
-        vecsubcol( l1, L1, n, m, j_best );
+            vecsubcol( l0, L0, n, m, j_back_best );
+            vecsubcol( l1, L1, n, m, j_back_best );
 
-        M0.update_likelihood_column( L0, n, m, j_best, training_seqs );
-        M1.update_likelihood_column( L1, n, m, j_best, training_seqs );
+            M0.update_likelihood_column( L0, n, m, j_back_best, training_seqs );
+            M1.update_likelihood_column( L1, n, m, j_back_best, training_seqs );
 
-        vecaddcol( l0, L0, n, m, j_best );
-        vecaddcol( l1, L1, n, m, j_best );
+            vecaddcol( l0, L0, n, m, j_back_best );
+            vecaddcol( l1, L1, n, m, j_back_best );
+        }
+        else {
+            log_printf( LOG_MSG, " [.] %zu->%zu\n", i_rev_best, j_rev_best );
+            ic_curr = ic_rev_best;
+
+            M0.remove_edge( i_rev_best, j_rev_best, training_seqs );
+            M1.remove_edge( i_rev_best, j_rev_best, training_seqs );
+            M0.add_edge( j_rev_best, i_rev_best, training_seqs );
+            M1.add_edge( j_rev_best, i_rev_best, training_seqs );
+
+            vecsubcol( l0, L0, n, m, j_rev_best );
+            vecsubcol( l1, L1, n, m, j_rev_best );
+            vecsubcol( l0, L0, n, m, i_rev_best );
+            vecsubcol( l1, L1, n, m, i_rev_best );
+
+            M0.update_likelihood_column( L0, n, m, j_rev_best, training_seqs );
+            M1.update_likelihood_column( L1, n, m, j_rev_best, training_seqs );
+            M0.update_likelihood_column( L0, n, m, i_rev_best, training_seqs );
+            M1.update_likelihood_column( L1, n, m, i_rev_best, training_seqs );
+
+            vecaddcol( l0, L0, n, m, j_rev_best );
+            vecaddcol( l1, L1, n, m, j_rev_best );
+            vecaddcol( l0, L0, n, m, i_rev_best );
+            vecaddcol( l1, L1, n, m, i_rev_best );
+        }
     }
 
-    
+
 
     delete[] cs;
     delete[] L0;
     delete[] L1;
     delete[] l0;
     delete[] l1;
-    delete[] b0;
-    delete[] b1;
-
+    delete[] M0_row_j;
+    delete[] M1_row_j;
+    delete[] M0_row_i;
+    delete[] M1_row_i;
+    delete[] b0_j;
+    delete[] b1_j;
+    delete[] b0_i;
+    delete[] b1_i;
+    
     log_unindent();
 }
 
 
+
... ...
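
Note: train_motifs is now greedy hill-climbing over DAG structures with three move types per round (add, remove, reverse an edge), committing whichever single move most improves the information criterion and stopping when none does. The propose/score/revert pattern used in each phase, in a stripped-down self-contained form (my toy, not the package's code):

#include <cstdio>
#include <vector>
#include <cmath>

/* Toy objective over a bit vector: count of set bits, penalized away from 3. */
static double score( const std::vector<bool>& s )
{
    int k = 0;
    for( size_t i = 0; i < s.size(); i++ ) if( s[i] ) k++;
    return (double)k - 0.5 * fabs( (double)k - 3.0 );
}

int main()
{
    std::vector<bool> s( 8, false );
    double cur = score( s );

    while( true ) {
        double best = -HUGE_VAL;
        size_t best_i = 0;

        for( size_t i = 0; i < s.size(); i++ ) {
            s[i] = !s[i];              /* propose a move */
            double sc = score( s );    /* score it       */
            s[i] = !s[i];              /* revert         */
            if( sc > best ) { best = sc; best_i = i; }
        }

        if( best <= cur ) break;       /* no move improves: stop  */
        s[best_i] = !s[best_i];        /* commit the best move    */
        cur = best;
    }

    printf( "final score: %g\n", cur );
    return 0;
}
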
@@ -32,8 +32,11 @@ class kmer_matrix
 
         void to_yaml( YAML::Emitter& out ) const;
 
-        void   setall( double x );
-        void   setrow( size_t i, double x );
+        void setall( double x );
+        void setrowall( size_t i, double x );
+
+        void getrow( size_t i, double* x );
+        void setrow( size_t i, double* x );
 
         size_t getn() const;
         size_t getm() const;
... ...
@@ -47,24 +50,15 @@ class kmer_matrix
         void dist_marginalize( size_t i, size_t j );
 
         void dist_conditionalize( int effective_k = -1 );
-        void dist_conditionalize_row( size_t i, int effective_k = -1 );
+        void dist_conditionalize_row( size_t i, size_t j, int effective_k = -1 );
         void log_transform_row( size_t i, int effective_k = -1 );
 
-        /* allows one row to be stored then reset, which is used when searching
-         * for the optimal edge to add when training the model */
-        void store_row( size_t i );
-        void restore_stored_row();
-
-
     private:
 
-        size_t k;
-        
-        size_t n, m;
+        size_t k; // size of k-mer
+        size_t n; // number of positions
+        size_t m; // 4^k
         double* A;
-
-        double* stored_row;
-        size_t stored_row_index;
 };
 
 
... ...
@@ -76,7 +70,7 @@ class kmer_matrix
 class sequence
 {
     public:
-        sequence( const char* s, int c = 0 );
+        sequence( const char* s, int c = 0, double w = 1.0 );
         sequence( const sequence& );
         void operator=( const sequence& );
         ~sequence();
... ...
@@ -85,6 +79,7 @@ class sequence
         bool get( const bool* indexes, size_t maxn, kmer& K, size_t offset = 0 ) const;
 
         int c; /* class (i.e. background (0) or foreground (1)) */
+        double w;
 
     private:
         kmer* xs;
... ...
@@ -112,7 +107,6 @@ class motif
 
         void add_edge( size_t i, size_t j, const std::deque<sequence*>* data );
         void remove_edge( size_t i, size_t j, const std::deque<sequence*>* data );
-        void add_all_edges( const std::deque<sequence*>* data );
 
         double eval( const sequence&, size_t offset = 0 ) const;
         double eval_node( size_t i, const std::deque<sequence*>* data,
... ...
@@ -120,8 +114,6 @@ class motif
 
         size_t num_params() const;
 
-        void store_row( size_t i );
-        void restore_stored_row();
         char* print_model_graph( int offset = 0 );
 
         int c; /* which subset of the training data to consider */
... ...
@@ -132,6 +124,9 @@ class motif
         bool has_edge( size_t i, size_t j );
         void set_edge( size_t i, size_t j, bool );
 
+        bool reachable( size_t i, size_t j );
+        void compute_reachability();
+
 
         void update_likelihood_column( double* L, size_t n, size_t m, size_t j,
                                        const std::deque<sequence*>* training_seqs );
... ...
@@ -141,6 +136,7 @@ class motif
         kmer_matrix* P;
 
         bool* parents;
+        bool* R; // reachability
 
         static const double pseudocount;
 
... ...
@@ -149,19 +145,12 @@ class motif
                                   const std::deque<sequence*>* training_seqs,
                                   size_t max_dep_dist, double complexity_penalty );
 
-        friend void train_motifs_backwards( motif& M0, motif& M1,
-                                            const std::deque<sequence*>* training_seqs,
-                                            size_t max_dep_dist, double complexity_penalty );
 };
 
 void train_motifs( motif& M0, motif& M1,
                    const std::deque<sequence*>* training_seqs,
                    size_t max_dep_dist = 0, double complexity_penalty = 1.0 );
 
-void train_motifs_backwards( motif& M0, motif& M1,
-                             const std::deque<sequence*>* training_seqs,
-                             size_t max_dep_dist = 0, double complexity_penalty = 1.0 );
-
 
 
 
... ...
@@ -41,10 +41,52 @@ static double rand_gauss( double sigma )
 }
 
 
+static double rand_trunc_gauss( double sigma, double a, double b )
+{
+    double x;
+    do {
+        x = rand_gauss( sigma );
+    } while( x < a || x > b );
+
+    return x;
+}
+
+
+double gauss_pdf (const double x, const double sigma)
+{
+  double u = x / fabs (sigma);
+  double p = (1 / (sqrt (2 * M_PI) * fabs (sigma))) * exp (-u * u / 2);
+  return p;
+}
+
+
 /* pseudocount used when sampling foreground and background nucleotide frequencies */
 const double sequencing_bias::pseudocount = 1;
 
 
+int read_pos_tid_compare( const void* p1, const void* p2 )
+{
+    return ((read_pos*)p1)->tid - ((read_pos*)p2)->tid;
+}
+
+int read_pos_count_compare( const void* p1, const void* p2 )
+{
+    return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count;
+}
+
+
+int read_pos_tid_count_compare( const void* p1, const void* p2 )
+{
+    int c = (int)(((read_pos*)p1)->tid - ((read_pos*)p2)->tid);
+
+    if( c == 0 ) {
+        return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count;
+    }
+    else return c;
+}
+
+
+
+
 sequencing_bias::sequencing_bias()
     : ref_f(NULL)
     , ref_fn(NULL)
... ...
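
Note: rand_trunc_gauss above is plain rejection sampling, so its expected number of draws (my note, with \Phi the standard normal CDF) is

\mathrm{E}[\text{draws}] \;=\; \frac{1}{\Phi(b/\sigma) - \Phi(a/\sigma)}

which is cheap when [a, b] covers most of the mass and slow for narrow or far-out intervals.

The read_pos comparators are written for C's qsort. A minimal usage sketch (the two-field read_pos layout here is assumed from the fields the comparators touch; the real struct presumably has more):

#include <cstdlib>
#include <cstdio>

/* assumed minimal layout for illustration only */
typedef struct { int tid; unsigned int count; } read_pos;

static int read_pos_tid_count_compare( const void* p1, const void* p2 )
{
    int c = (int)(((read_pos*)p1)->tid - ((read_pos*)p2)->tid);
    if( c == 0 ) {
        /* descending by count within a tid */
        return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count;
    }
    else return c;
}

int main()
{
    read_pos xs[] = { {1, 5}, {0, 2}, {1, 9}, {0, 7} };
    qsort( xs, 4, sizeof(read_pos), read_pos_tid_count_compare );

    /* ascending by tid, then descending by count:
     * (0,7) (0,2) (1,9) (1,5) */
    for( int i = 0; i < 4; i++ )
        printf( "(%d,%u)\n", xs[i].tid, xs[i].count );
    return 0;
}
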
@@ -78,19 +120,35 @@ sequencing_bias::sequencing_bias( const char* ref_fn,
 }
 
 
-
 sequencing_bias::sequencing_bias( const char* ref_fn,
                                   const char* reads_fn,
-                                  size_t n, pos L, pos R,
-                                  bool   train_backwards,
-                                  double complexity_penalty )
+                                  size_t max_reads, pos L, pos R,
+                                  double complexity_penalty,
+                                  double offset_std )
     : ref_f(NULL)
     , ref_fn(NULL)
     , M0(NULL), M1(NULL)
 {
-    build( ref_fn, reads_fn, n, L, R, train_backwards, complexity_penalty );
+    build( ref_fn, reads_fn, max_reads, L, R,
+           complexity_penalty, offset_std );
 }
 
+
+
+sequencing_bias::sequencing_bias( const char* ref_fn,
+                                  table* T, size_t max_reads,
+                                  pos L, pos R,
+                                  double complexity_penalty,
+                                  double offset_std )
+    : ref_f(NULL)
+    , ref_fn(NULL)
+    , M0(NULL), M1(NULL)
+{
+    build( ref_fn, T, max_reads, L, R, complexity_penalty, offset_std );
+}
+
+
+
 sequencing_bias* sequencing_bias::copy() const
 {
     sequencing_bias* sb = new sequencing_bias();
... ...
@@ -167,40 +225,86 @@ void sequencing_bias::clear()
 }
 
 
+
 void sequencing_bias::build( const char* ref_fn,
                              const char* reads_fn,
-                             size_t n, pos L, pos R,
-                             bool   train_backwards,
-                             double complexity_penalty )
+                             size_t max_reads, pos L, pos R,
+                             double complexity_penalty,
+                             double offset_std )
 {
-    log_puts( LOG_MSG, "Determining sequencing bias...\n" );
+    log_printf( LOG_MSG, "hashing reads ... \n" );
     log_indent();
 
-    clock_t t0, t1;
-    t0 = clock();
+    samfile_t* reads_f = samopen( reads_fn, "rb", NULL );
+    if( reads_f == NULL ) {
+        failf( "Can't open bam file '%s'.", reads_fn );
+    }
+
+    bam_index_t* reads_index = bam_index_load( reads_fn );
+    if( reads_index == NULL ) {
+        failf( "Can't open bam index '%s.bai'.", reads_fn );
+    }
+
+    bam_init_header_hash( reads_f->header );
+
+    bam1_t* read = bam_init1();
+
+    table T;
+    table_create( &T, reads_f->header->n_targets );
+    T.seq_names = reads_f->header->target_name;
+
+    size_t k = 0;
+
+    while( samread( reads_f, read ) > 0 ) {
+        k++;
+        if( k % 1000000 == 0 ) {
+            log_printf( LOG_MSG, "%zu reads\n", k );
+        }
+        table_inc( &T, read );
+    }
+    log_printf( LOG_MSG, "%zu reads\n", k );
+
+    bam_destroy1(read);
+
+    log_puts( LOG_MSG, "done.\n" );
+    log_unindent();
+
+    build( ref_fn, &T, max_reads, L, R, complexity_penalty, offset_std );
+
+    table_destroy( &T );
 
+    bam_index_destroy(reads_index);
+    samclose( reads_f );
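
Note: a stripped-down, self-contained version of the read-counting loop above, against the same legacy samtools C API the package links to (my sketch; error handling reduced to early returns):

#include <cstdio>
#include "samtools/sam.h"

int main( int argc, char** argv )
{
    if( argc < 2 ) return 1;

    samfile_t* f = samopen( argv[1], "rb", NULL );
    if( f == NULL ) return 1;

    bam1_t* read = bam_init1();
    size_t k = 0;

    /* samread returns the number of bytes read, non-positive at EOF/error */
    while( samread( f, read ) > 0 ) k++;

    printf( "%zu reads\n", k );

    bam_destroy1( read );
    samclose( f );
    return 0;
}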