Browse code

some small api improvements and cross-platform fixes

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqbias@59091 bc3139a8-67e5-0310-9ffc-ced21a209358

Daniel Jones authored on 11/10/2011 21:38:58
Showing 12 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: seqbias
2
-Version: 1.1.1
2
+Version: 1.1.3
3 3
 Date: 25-12-2010
4 4
 Title: Estimation of per-position bias in high-throughput sequencing data
5 5
 Description: This package implements a model of per-position sequencing bias in
... ...
@@ -109,21 +109,27 @@ setClass( "seqbias",
109 109
 
110 110
 
111 111
 
112
-"count.reads" <- function( reads_fn, I, binary = TRUE )
112
+"count.reads" <- function( reads_fn, I, sb = NULL, binary = FALSE, sum.counts = FALSE )
113 113
 {
114 114
     require(GenomicRanges)
115 115
     stopifnot( is( I, "GRanges" ) )
116 116
 
117
+    if (!is.null(sb) & class(sb) != "seqbias") {
118
+        stop( "The 'sb' parameter of count.reads must be a seqbias class." )
119
+    }
120
+
117 121
     bam_ptr <- .Call( "seqbias_open_bam", path.expand(reads_fn),
118 122
                       PACKAGE = "seqbias" )
119 123
 
120 124
     counts <- tapply( I,
121 125
                       INDEX = 1:length(I),
122 126
                       FUN   = function(x) .Call( "seqbias_count_reads",
127
+                                                 sb,
123 128
                                                  bam_ptr,
124 129
                                                  as.character(seqnames(x)),
125 130
                                                  start(x), end(x),
126 131
                                                  as.character(strand(x)),
132
+                                                 sum.counts,
127 133
                       PACKAGE = "seqbias" ) )
128 134
 
129 135
     if( binary ) lapply( counts, FUN = function(c) as.integer( c > 0 ) )
... ...
@@ -126,7 +126,7 @@ seqs[neg_idx] <- reverseComplement( seqs[neg_idx] )
126 126
 Finally, we count the number of reads mapping to each position in our sampled
127 127
 intervals.
128 128
 <<>>=
129
-counts <- count.reads( reads_fn, I )
129
+counts <- count.reads( reads_fn, I, binary = T )
130 130
 @
131 131
 
132 132
 Unless the \texttt{binary} argument is FALSE, this function returns a 0-1 vector,
... ...
@@ -192,7 +192,7 @@ more reads are available a more accurate model can be trained at the expense of
192 192
 the training procedure taking up to several minutes.
193 193
 
194 194
 <<results=hide>>=
195
-sb <- seqbias.fit( ref_fn, reads_fn, L = 5, R = 20 )
195
+sb <- seqbias.fit( ref_fn, reads_fn, L = 5, R = 15 )
196 196
 @
197 197
 
198 198
 The \texttt{L} and \texttt{R} arguments control the maximum number of positions
... ...
@@ -3,21 +3,25 @@
3 3
 \title{Counting reads across intervals}
4 4
 \description{Counts the number of reads starting at each position across given
5 5
 genomic intervals}
6
-\usage{count.reads(reads_fn, I, binary=TRUE)}
6
+\usage{count.reads(reads_fn, I, sb=NULL, binary=FALSE, sum.counts=TRUE)}
7 7
 \arguments{
8 8
   \item{reads_fn}{filename of aligned reads in BAM format}
9 9
   \item{I}{a GRanges object giving valid genomic intervals}
10
+  \item{sb}{a seqbias object}
10 11
   \item{binary}{if \code{TRUE}, return a 0-1 vector, otherwise return a vector
11 12
   counting the number of reads mapped to each position}
13
+  \item{sum.counts}{if \code{TRUE} return the total read count for each interval}
12 14
 }
13 15
 \details{
14 16
   Given an indexed BAM file, this function counts the number of reads starting
15
-  at each position of each provided interval. These counts can then be
16
-  normalized by dividing by the predicted bias obtained from 'bias.predict'.
17
-
18
-  By default, a 0-1 vector is returned, where positions at which no reads are
19
-  mapped are 0, and those with one or more are 1. If \code{binary} is
20
-  \code{FALSE}, the number of reads mapping to each position is returned.
17
+  at each position of each provided interval. If a seqbias object is provided
18
+  through the \code{sb} attribute, counts are corrected for sequence bias.
19
+  The total read count for each interval is returned if \code{sum.counts} is
20
+  \code{TRUE}.
21
+  
22
+  If \code{binary} is \code{TRUE} a 0-1 vector is returned instead, where
23
+  positions at which no reads are mapped are 0, and those with one or more are
24
+  1.
21 25
 }
22 26
 \value{
23 27
   A list of numeric vectors is returned, one for each interval provided. Each
... ...
@@ -36,7 +36,6 @@ kmer_matrix::kmer_matrix(const YAML::Node& node)
36 36
     A = new double[size1 * size2];
37 37
 
38 38
     const YAML::Node& node_A = node["A"];
39
-
40 39
     size_t i;
41 40
     for (i = 0; i < size1 * size2; ++i) {
42 41
         node_A[i] >> A[i];
... ...
@@ -186,7 +186,8 @@ class motif_trainer
186 186
             compute_reachability();
187 187
 
188 188
             logger::print(col_base);
189
-            snprintf(msg, sizeof(msg), "round %4zu (ic = %0.4e)", round_num, ic);
189
+            snprintf(msg, sizeof(msg), "round %4lu (ic = %0.4e)",
190
+                                       (unsigned long)round_num, ic);
190 191
             logger::print(msg);
191 192
             logger::print(col_base);
192 193
             col = 0;
... ...
@@ -442,7 +443,6 @@ class motif_trainer
442 443
             else i_start = i_end = j;
443 444
 
444 445
             for (i = i_start; i <= i_end; ++i) {
445
-
446 446
                 /* skip existing edges */
447 447
                 if (M.has_edge(i, j)) continue;
448 448
 
... ...
@@ -800,7 +800,7 @@ motif::motif(const YAML::Node& node)
800 800
     /* m */
801 801
     unsigned int m_;
802 802
     node["m"] >> m_;
803
-    m = (size_t) m;
803
+    m = (size_t) m_;
804 804
 
805 805
 
806 806
     /* parents */
... ...
@@ -809,8 +809,10 @@ motif::motif(const YAML::Node& node)
809 809
     const YAML::Node& parents_node = node["parents"];
810 810
 
811 811
     size_t i;
812
-    for (i = 0; i < m; ++i) {
813
-        parents_node[i] >> parents[i];
812
+    int b;
813
+    for (i = 0; i < m * m; ++i) {
814
+        parents_node[i] >> b;
815
+        parents[i] = (bool) b;
814 816
     }
815 817
 
816 818
 
... ...
@@ -840,7 +842,7 @@ void motif::to_yaml(YAML::Emitter& out) const
840 842
     out << YAML::Flow << YAML::BeginSeq;
841 843
 
842 844
     size_t i;
843
-    for (i = 0; i < m * m; ++i) out << parents[i];
845
+    for (i = 0; i < m * m; ++i) out << (parents[i] ? 1 : 0);
844 846
 
845 847
     out << YAML::EndSeq;
846 848
 
... ...
@@ -860,6 +862,45 @@ void motif::to_yaml(YAML::Emitter& out) const
860 862
 }
861 863
 
862 864
 
865
+string motif::model_graph(int offset) const
866
+{
867
+    string graph_str;
868
+    char strbuf[512];
869
+
870
+    graph_str += "digraph {\n";
871
+    graph_str += "splines=\"true\";\n";
872
+    graph_str += "node [shape=\"box\"];\n";
873
+
874
+    /* print nodes */
875
+    size_t i, j;
876
+    for (j = 0; j < m; j++) {
877
+        snprintf(strbuf, sizeof(strbuf), 
878
+                 "n%d [label=\"%d\",pos=\"%d,0\",style=\"%s\"];\n",
879
+                 (int) j, (int) j - offset, (int) j * 100,
880
+                 parents[j * m + j] ? "solid" : "dotted");
881
+
882
+        graph_str += strbuf;
883
+    }
884
+
885
+    /* print edges */
886
+    for (j = 0; j < m; ++j) {
887
+        if (!parents[j * m + j]) continue;
888
+
889
+        for (i = 0; i < m; ++i) {
890
+            if (i == j) continue;
891
+            if (parents[j * m + i]) {
892
+                snprintf(strbuf, sizeof(strbuf),
893
+                         "n%lu -> n%lu;\n", (unsigned long) i, (unsigned long) j);
894
+                graph_str += strbuf;
895
+            }
896
+        }
897
+    }
898
+
899
+    graph_str += "}\n";
900
+    return graph_str;
901
+}
902
+
903
+
863 904
 double motif::eval(const twobitseq& seq, size_t offset)
864 905
 {
865 906
     double ll0 = 0.0;
... ...
@@ -45,6 +45,10 @@ class motif
45 45
         /** emit yaml serialization */
46 46
         void to_yaml(YAML::Emitter& out) const;
47 47
 
48
+        /** print the model graph in GraphViz (dot) format,
49
+         *  adjusting position labels by the given offset. */
50
+        std::string model_graph(int offset) const;
51
+
48 52
     private:
49 53
         motif();
50 54
 
... ...
@@ -153,8 +153,13 @@ SEXP seqbias_predict( SEXP seqbias,
153 153
     pos_t c_start, c_end;
154 154
     strand_t c_strand;
155 155
 
156
+
156 157
     coerce_genomic_coords( seqname, start, end, strand,
157 158
                            &c_seqname, &c_start, &c_end, &c_strand );
159
+
160
+    /* we expect 1-based coordinates, but work with 0-based */
161
+    c_start -= 1;
162
+    c_end   -= 1;
158 163
                            
159 164
     if( c_strand != 0 && c_strand != 1 ) {
160 165
         warning( "strand should be '+' or '-'" );
... ...
@@ -217,11 +222,13 @@ SEXP seqbias_open_bam( SEXP reads_fn )
217 222
 }
218 223
 
219 224
 
220
-SEXP seqbias_count_reads( SEXP bam_ptr,
225
+SEXP seqbias_count_reads( SEXP seqbias,
226
+                          SEXP bam_ptr,
221 227
                           SEXP seqname,
222 228
                           SEXP start,
223 229
                           SEXP end,
224
-                          SEXP strand )
230
+                          SEXP strand,
231
+                          SEXP sum_counts )
225 232
 {
226 233
     if( TYPEOF(bam_ptr) != EXTPTRSXP ) error( "argument is not a indexed bam pointer" );
227 234
     indexed_bam_f* c_bam_ptr = (indexed_bam_f*)EXTPTR_PTR( bam_ptr );
... ...
@@ -232,13 +239,52 @@ SEXP seqbias_count_reads( SEXP bam_ptr,
232 239
 
233 240
     coerce_genomic_coords( seqname, start, end, strand,
234 241
                            &c_seqname, &c_start, &c_end, &c_strand );
242
+
243
+
244
+    /* we expect 1-based coordinates, but use 0-based */
245
+    c_start -= 1;
246
+    c_end   -= 1;
235 247
                            
236 248
 
249
+    double* bs[2] = {NULL, NULL};
250
+
251
+    if( !isNull(seqbias) ) {
252
+
253
+        SEXP idx;
254
+        PROTECT( idx = allocVector( STRSXP, 1 ) );
255
+        SET_STRING_ELT(idx, 0, mkChar("ptr"));
256
+
257
+        sequencing_bias* c_seqbias = NULL;
258
+        if( TYPEOF(GET_SLOT(seqbias, idx)) != EXTPTRSXP ||
259
+            !(c_seqbias = (sequencing_bias*)EXTPTR_PTR(GET_SLOT(seqbias, idx))) ) {
260
+            error( "first argument is not a proper seqbias class." );
261
+        }
262
+        
263
+        if (c_strand == strand_na || c_strand == strand_pos ) {
264
+            bs[0] = c_seqbias->get_bias(c_seqname, c_start, c_end, strand_pos);
265
+        }
266
+
267
+        if (c_strand == strand_na || c_strand == strand_neg ) {
268
+            bs[1] = c_seqbias->get_bias(c_seqname, c_start, c_end, strand_neg);
269
+            std::reverse(bs[1], bs[1] + (c_end - c_start + 1));
270
+        }
271
+
272
+        UNPROTECT(1);
273
+    }
274
+
275
+    bool c_sum_counts = asLogical(sum_counts) == TRUE;
276
+
237 277
     /* init vector */
238 278
     SEXP v;
239
-    PROTECT( v = allocVector( REALSXP, c_end - c_start + 1 ) );
240
-    pos_t i;
241
-    for( i = 0; i < c_end - c_start + 1; i++ ) REAL(v)[i] = 0;
279
+    if (c_sum_counts) {
280
+        PROTECT( v = allocVector( REALSXP, 1 ) );
281
+        REAL(v)[0] = 0;
282
+    }
283
+    else {
284
+        PROTECT( v = allocVector( REALSXP, c_end - c_start + 1 ) );
285
+        pos_t i;
286
+        for( i = 0; i < c_end - c_start + 1; i++ ) REAL(v)[i] = 0;
287
+    }
242 288
 
243 289
 
244 290
     const size_t region_len = 1024;
... ...
@@ -261,26 +307,41 @@ SEXP seqbias_count_reads( SEXP bam_ptr,
261 307
     bam_iter_t it = bam_iter_query( c_bam_ptr->idx, bam_ref_id, bam_start, bam_end );
262 308
     bam1_t* b = bam_init1();
263 309
     pos_t x;
310
+    strand_t s;
311
+
264 312
 
265 313
     while( bam_iter_read( c_bam_ptr->f->x.bam, it, b ) > 0 ) {
266
-        if( bam1_strand(b) !=  c_strand ) continue;
314
+        s = (strand_t) bam1_strand(b);
315
+        if( c_strand != strand_na && s !=  c_strand ) continue;
316
+
267 317
         x = bam1_strand(b) == 1 ? bam_calend( &b->core,  bam1_cigar(b) ) - 1 : b->core.pos;
268
-        if( c_start <= x && x <= c_end ) REAL(v)[x - c_start]++;
318
+        if (x < c_start || x > c_end) continue;
319
+
320
+        if (c_sum_counts) {
321
+            REAL(v)[0] += bs[s] == NULL ? 1.0 : (1.0 / bs[s][x - c_start]);
322
+        }
323
+        else {
324
+            REAL(v)[x - c_start] += bs[s] == NULL ? 1.0 : (1.0 / bs[s][x - c_start]);
325
+        }
269 326
     }
270 327
 
271
-    if( c_strand == strand_neg ) {
328
+    if( c_strand == strand_neg && !c_sum_counts ) {
272 329
         std::reverse(REAL(v), REAL(v) + (c_end - c_start + 1));
273 330
     }
274 331
 
275 332
     bam_iter_destroy( it );
276 333
     bam_destroy1(b);
277 334
 
335
+    delete [] bs[0];
336
+    delete [] bs[1];
337
+
278 338
     UNPROTECT(1);
279 339
     return v;
280 340
 }
281 341
 
282 342
 
283 343
 
344
+
284 345
 void dealloc_kmer_matrix( SEXP M )
285 346
 {
286 347
     if( TYPEOF(M) != EXTPTRSXP ) error( "argument is not a kmer_matrix pointer" );
... ...
@@ -433,7 +494,7 @@ void R_init_seqbias( DllInfo* info )
433 494
         { "seqbias_load",        (DL_FUNC) &seqbias_load, 2 },
434 495
         { "seqbias_save",        (DL_FUNC) &seqbias_save, 2 },
435 496
         { "seqbias_open_bam",       (DL_FUNC) &seqbias_open_bam, 1 },
436
-        { "seqbias_count_reads",    (DL_FUNC) &seqbias_count_reads, 5 },
497
+        { "seqbias_count_reads",    (DL_FUNC) &seqbias_count_reads, 7 },
437 498
         { "seqbias_alloc_kmer_matrix",          (DL_FUNC) &seqbias_alloc_kmer_matrix, 2 },
438 499
         { "seqbias_tally_kmers",                (DL_FUNC) &seqbias_tally_kmers, 4 },
439 500
         { "sebqias_dataframe_from_kmer_matrix", (DL_FUNC) &seqbias_dataframe_from_kmer_matrix, 2 },
... ...
@@ -11,6 +11,7 @@
11 11
 #include "miscmath.h"
12 12
 #include "samtools_extra.h"
13 13
 #include "samtools/faidx.h"
14
+#include <climits>
14 15
 #include <cmath>
15 16
 #include <cctype>
16 17
 #include <ctime>
... ...
@@ -55,17 +56,6 @@ static double rand_gauss(double sigma)
55 56
 }
56 57
 
57 58
 
58
-static double rand_trunc_gauss(double sigma, double a, double b)
59
-{
60
-    double x;
61
-    do {
62
-        x = rand_gauss(sigma);
63
-    } while (x < a || x > b);
64
-
65
-    return x;
66
-}
67
-
68
-
69 59
 double gauss_pdf (const double x, const double sigma)
70 60
 {
71 61
   double u = x / fabs (sigma);
... ...
@@ -86,25 +76,30 @@ static int read_pos_count_compare(const void* p1, const void* p2)
86 76
 }
87 77
 
88 78
 
89
-/*
90
-static int read_pos_tid_count_compare(const void* p1, const void* p2)
91
-{
92
-    int c = (int)(((read_pos*)p1)->tid - ((read_pos*)p2)->tid);
93
-
94
-    if (c == 0) {
95
-        return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count;
96
-    }
97
-    else return c;
98
-}
99
-*/
100
-
101
-
102 79
 
103 80
 sequencing_bias::sequencing_bias()
104 81
     : ref_f(NULL)
105 82
     , M(NULL)
106 83
 {}
107 84
 
85
+
86
+sequencing_bias::sequencing_bias(const char* model_fn)
87
+    : ref_f(NULL)
88
+    , M(NULL)
89
+{
90
+    std::ifstream fin;
91
+    fin.open(model_fn);
92
+
93
+    YAML::Parser parser(fin);
94
+    YAML::Node doc;
95
+    parser.GetNextDocument(doc);
96
+
97
+    doc["L"] >> L;
98
+    doc["R"] >> R;
99
+
100
+    M = new motif(doc["motif"]);
101
+}
102
+
108 103
 sequencing_bias::sequencing_bias(const char* ref_fn,
109 104
                                  const char* model_fn)
110 105
     : ref_f(NULL)
... ...
@@ -122,12 +117,15 @@ sequencing_bias::sequencing_bias(const char* ref_fn,
122 117
 
123 118
     M = new motif(doc["motif"]);
124 119
 
125
-    this->ref_fn = ref_fn;
126 120
 
127
-    ref_f = fai_load(ref_fn);
128
-    if (ref_f == NULL) {
129
-        logger::abort("Can't open indexed FASTA file %s\n", ref_fn);
121
+    if (ref_fn != NULL) {
122
+        this->ref_fn = ref_fn;
123
+        ref_f = fai_load(ref_fn);
124
+        if (ref_f == NULL) {
125
+            logger::abort("Can't open indexed FASTA file %s\n", ref_fn);
126
+        }
130 127
     }
128
+    else ref_f = NULL;
131 129
 }
132 130
 
133 131
 
... ...
@@ -233,7 +231,6 @@ void sequencing_bias::build(const char* ref_fn,
233 231
 
234 232
     while (samread(reads_f, read) > 0) {
235 233
         if (read->core.n_cigar != 1) continue;
236
-        if (read->core.flag & BAM_FREAD2) continue;
237 234
         k++;
238 235
         if (k % 1000000 == 0) {
239 236
             logger::info("hashed %zu reads.", k);
... ...
@@ -265,7 +262,7 @@ void sequencing_bias::build(const char* ref_fn,
265 262
     this->L = L;
266 263
     this->R = R;
267 264
 
268
-    unsigned int i;
265
+    unsigned int i = 0;
269 266
 
270 267
     read_pos* S_tmp;
271 268
     read_pos* S;
... ...
@@ -273,34 +270,12 @@ void sequencing_bias::build(const char* ref_fn,
273 270
     const size_t max_dump = 10000000;
274 271
     pos_table_dump(T, &S_tmp, &N, max_dump);
275 272
 
276
-    /* sort by count */
277
-    qsort(S_tmp, N, sizeof(read_pos), read_pos_count_compare);
278
-
279
-    /* consider only reads with at least one duplicate */
280
-    for (i = 0; i < N; ++i) {
281
-        if (S_tmp[i].count <= 1) break;
282
-    }
283
-
284
-    /* (unless there are very few of these reads */
285
-    if (i > 10000) {
286
-        max_reads = std::min<size_t>(max_reads, i);
287
-        logger::info("%zu reads with duplicates.", i);
288
-    }
289
-    else {
290
-        i = N;
291
-    }
292
-
293
-
294
-    /* ignore the top 1%, as they tend to be vastly higher than anything else,
295
-     * and thus can throw things off when training with small numbers of reads
296
-     * */
297
-    S = S_tmp + i/100;
298
-    max_reads = min<size_t>(max_reads, 99*i/100); 
273
+    S = S_tmp;
299 274
 
300 275
     /* sort by tid (so we can load one chromosome at a time) */
276
+    random_shuffle(S, S + N);
301 277
     qsort(S, std::min<size_t>(max_reads, N), sizeof(read_pos), read_pos_tid_compare);
302 278
 
303
-
304 279
     /* sample foreground and background kmer frequencies */
305 280
     ref_f = fai_load(ref_fn);
306 281
     if (ref_f == NULL) {
... ...
@@ -337,7 +312,7 @@ void sequencing_bias::build(const char* ref_fn,
337 312
             seqname = T->seq_names[ S[i].tid ];
338 313
             if (seq) free(seq); 
339 314
 
340
-            seq = faidx_fetch_seq(ref_f, seqname, 0, 1000000000, &seqlen);
315
+            seq = faidx_fetch_seq(ref_f, seqname, 0, INT_MAX, &seqlen);
341 316
 
342 317
             logger::info("read sequence %s.", seqname);
343 318
 
... ...
@@ -374,7 +349,7 @@ void sequencing_bias::build(const char* ref_fn,
374 349
         /* adjust the current read position randomly, and sample */
375 350
         for (bg_sample_num = 0; bg_sample_num < bg_samples;) {
376 351
 
377
-            bg_pos = S[i].pos + (pos_t)round_away(rand_trunc_gauss(12, -200, 200));
352
+            bg_pos = S[i].pos + (pos_t)round_away(rand_gauss(500));
378 353
 
379 354
             if (S[i].strand == strand_neg) {
380 355
                 if (bg_pos < R || bg_pos >= seqlen - L) continue;
... ...
@@ -464,17 +439,201 @@ double* sequencing_bias::get_bias(const char* seqname,
464 439
         bs[i] = M->eval(seq, i);
465 440
     }
466 441
 
467
-
468 442
     free(seqstr);
469 443
     return bs;
470 444
 }
471 445
 
472 446
 
473
-string sequencing_bias::print_model_graph()
447
+string sequencing_bias::model_graph() const
448
+{
449
+    return M->model_graph((int) L);
450
+}
451
+
452
+
453
+kmer_matrix tabulate_bias(double* kl,
454
+                          pos_t L, pos_t R, int k,
455
+                          const char* ref_fn,
456
+                          const char* reads_fn,
457
+                          const char* model_fn)
474 458
 {
475
-    // TODO
476
-    //return M->print_model_graph(L);
477
-    return string();
459
+    /* This function procedes very much like the sequencing_bias::build
460
+     * function, but does not finally train a motif, but rather tabulates
461
+     * k-mer frequencies. */
462
+    size_t max_reads = 250000;
463
+
464
+    kmer_matrix dest((size_t) (L + 1 + R), (size_t) k);
465
+    dest.set_all(0.0);
466
+
467
+    faidx_t* ref_f = fai_load(ref_fn);
468
+    if (ref_f == NULL) {
469
+        logger::abort("Can't open fasta file '%s'.", ref_fn);
470
+    }
471
+
472
+    samfile_t* reads_f = samopen(reads_fn, "rb", NULL);
473
+    if (reads_f == NULL) {
474
+        logger::abort("Can't open bam file '%s'.", reads_fn);
475
+    }
476
+
477
+    bam_index_t* reads_index = bam_index_load(reads_fn);
478
+    if (reads_index == NULL) {
479
+        logger::abort("Can't open bam index '%s.bai'.", reads_fn);
480
+    }
481
+
482
+    sequencing_bias* sb = NULL;
483
+    if (model_fn != NULL) {
484
+        sb = new sequencing_bias(ref_fn, model_fn);
485
+    }
486
+
487
+
488
+    bam_init_header_hash(reads_f->header);
489
+    bam1_t* read = bam_init1();
490
+
491
+    pos_table T;
492
+    pos_table_create(&T, reads_f->header->n_targets);
493
+    T.seq_names = reads_f->header->target_name;
494
+
495
+    size_t hashed_count = 0;
496
+    while (samread(reads_f, read) > 0) {
497
+        if (read->core.n_cigar != 1) continue;
498
+        if (++hashed_count % 1000000 == 0) {
499
+            logger::info("hashed %zu reads.", hashed_count);
500
+        }
501
+        pos_table_inc(&T, read);
502
+    }
503
+    logger::info("hashed %zu reads.", hashed_count);
504
+
505
+    read_pos* S_tmp;
506
+    read_pos* S;
507
+    size_t N;
508
+    const size_t max_dump = 10000000;
509
+    pos_table_dump(&T, &S_tmp, &N, max_dump);
510
+
511
+    /* sort by count */
512
+    qsort(S_tmp, N, sizeof(read_pos), read_pos_count_compare);
513
+
514
+    /* consider only reads with at least one duplicate */
515
+    size_t i;
516
+    for (i = 0; i < N; ++i) {
517
+        if (S_tmp[i].count <= 1) break;
518
+    }
519
+
520
+    /* (unless there are very few of these reads */
521
+    if (i > 10000) {
522
+        max_reads = std::min<size_t>(max_reads, i);
523
+        logger::info("%zu reads with duplicates.", i);
524
+    }
525
+    else {
526
+        i = N;
527
+    }
528
+
529
+    /* ignore the top 1%, as they tend to be vastly higher than anything else,
530
+     * and thus can throw things off when training with small numbers of reads
531
+     * */
532
+    S = S_tmp + i/100;
533
+    max_reads = min<size_t>(max_reads, 99*i/100); 
534
+
535
+    /* sort by tid (so we can load one chromosome at a time) */
536
+    qsort(S, std::min<size_t>(max_reads, N), sizeof(read_pos), read_pos_tid_compare);
537
+
538
+    twobitseq tbs;
539
+
540
+    char* local_seq;
541
+    local_seq = new char[ (k - 1) + L + 1 + R + 1 ];
542
+    local_seq[(k - 1) + L + 1 + R] = '\0';
543
+
544
+    double         w;
545
+    char*          seq       = NULL;
546
+    int            seqlen    = 0;
547
+    int            curr_tid  = -1;
548
+    char*          seqname   = NULL;
549
+
550
+    for (i = 0; i < N && i < max_reads; i++) {
551
+        if (S[i].tid != curr_tid) {
552
+            seqname = T.seq_names[S[i].tid];
553
+            free(seq);
554
+            seq = faidx_fetch_seq(ref_f, seqname, 0, INT_MAX, &seqlen);
555
+            logger::info("read sequence %s.", seqname);
556
+            curr_tid = S[i].tid;
557
+
558
+            if (seq == NULL) {
559
+                logger::warn("warning: reference sequence not found, skipping.");
560
+            }
561
+        }
562
+
563
+        if (seq == NULL) continue;
564
+
565
+        if (S[i].strand == strand_neg) {
566
+            if (S[i].pos < R || S[i].pos >= seqlen - L - (k-1)) continue;
567
+            memcpy(local_seq, seq + S[i].pos - R, ((k-1)+L+1+R)*sizeof(char));
568
+            seqrc(local_seq, L+1+R);
569
+        }
570
+        else {
571
+            if (S[i].pos < L + (k-1) || S[i].pos >= seqlen - R) continue;
572
+            memcpy(local_seq, seq + (S[i].pos - L - (k-1)), ((k-1)+L+1+R)*sizeof(char));
573
+        }
574
+
575
+        if (sb) {
576
+            // TODO: get_bias
577
+        }
578
+        else w = 1.0;
579
+
580
+        tbs = local_seq;
581
+        kmer K;
582
+        for (pos_t pos = (k-1); pos < (k-1) + L + 1 + R; ++pos) {
583
+            K = tbs.get_kmer(k, pos);
584
+            dest(pos - (k-1), K) += w;
585
+        }
586
+    }
587
+
588
+    /* compute KL divergence */
589
+
590
+    /* estimate a background distribution by averaging across the entire window
591
+     * */
592
+    int four_to_k = 1 << (2 * k);
593
+    double* bg = new double[four_to_k];
594
+    memset(bg, 0, four_to_k * sizeof(double));
595
+    for (pos_t pos = 0; pos < L + 1 + R; ++pos) {
596
+        for (kmer K = 0; K < (kmer) four_to_k; ++K) {
597
+            bg[K] += dest(pos, K);
598
+        }
599
+    }
600
+
601
+    kmer_matrix norm_dest(dest);
602
+    norm_dest.make_distribution();
603
+
604
+    double z = 0.0;
605
+    for (kmer K = 0; K < (kmer) four_to_k; ++K) z += bg[K];
606
+    for (kmer K = 0; K < (kmer) four_to_k; ++K) bg[K] /= z;
607
+
608
+
609
+    /* Compute the (symetric) kullback-leibler divegnnce */
610
+    memset(kl, 0, (L + 1 + R) * sizeof(double));
611
+    for (pos_t pos = 0; pos < L + 1 + R; ++pos) {
612
+        kl[pos] = 0.0;
613
+        for (kmer K = 0; K < (kmer) four_to_k; ++K) {
614
+            if (norm_dest(pos, K) > 0.0) {
615
+                kl[pos] += norm_dest(pos, K) * (log2(norm_dest(pos, K)) - log2(bg[K]));
616
+            }
617
+
618
+            if (bg[K] > 0.0) {
619
+                kl[pos] += bg[K] * (log2(bg[K]) - log2(norm_dest(pos, K)));
620
+            }
621
+        }
622
+    }
623
+
624
+    delete [] bg;
625
+
626
+
627
+    free(seq);
628
+    free(local_seq);
629
+    free(S_tmp);
630
+    bam_destroy1(read);
631
+    pos_table_destroy(&T);
632
+    delete sb;
633
+    bam_index_destroy(reads_index);
634
+    samclose(reads_f);
635
+
636
+    return dest;
478 637
 }
479 638
 
480 639
 
... ...
@@ -21,7 +21,13 @@
21 21
 class sequencing_bias
22 22
 {
23 23
     public:
24
-        /** Load a model that has bee previously trained. */
24
+        /** Load a model that has been previously trained, without a reference
25
+         * sequence. (It can not be actually used this way.)
26
+         */
27
+        sequencing_bias(const char* model_fn);
28
+
29
+
30
+        /** Load a model that has been previously trained. */
25 31
         sequencing_bias(const char* ref_fn,
26 32
                         const char* model_fn);
27 33
 
... ...
@@ -74,7 +80,7 @@ class sequencing_bias
74 80
         void to_yaml(YAML::Emitter&) const;
75 81
 
76 82
         /** Return a string of the model graph in dot format. */
77
-        std::string print_model_graph();
83
+        std::string model_graph() const;
78 84
 
79 85
 
80 86
     private:
... ...
@@ -107,4 +113,13 @@ class sequencing_bias
107 113
 };
108 114
 
109 115
 
116
+/** Tabulate the bias in a given dataset, optionally correcting for bias using
117
+ *  the given model. */
118
+kmer_matrix tabulate_bias(double* kl,
119
+                          pos_t L, pos_t R, int k,
120
+                          const char* ref_fn,
121
+                          const char* reads_fn,
122
+                          const char* model_fn = NULL);
123
+
124
+
110 125
 #endif
... ...
@@ -8,10 +8,8 @@
8 8
 #include "twobitseq.hpp"
9 9
 #include <algorithm>
10 10
 #include <cstring>
11
+#include <cstdlib>
11 12
 
12
-/* Don't expose this function. No one should have to know how we are mapping
13
- * letters to numbers.
14
- */
15 13
 kmer nuc_to_num(char c)
16 14
 {
17 15
     switch( c ) {
... ...
@@ -26,36 +24,48 @@ kmer nuc_to_num(char c)
26 24
     }
27 25
 }
28 26
 
29
-void num_to_nuc(char* dest, int n, int k)
27
+
28
+
29
+void num_to_nuc(char* dest, kmer K, int k)
30 30
 {
31 31
     int i;
32 32
     /* read backwards, then reverse */
33
-    for( i = 0; i < k; i++ ) {
34
-        switch( n & 0x3 ) {
33
+    for (i = 0; i < k; ++i) {
34
+        switch (K & 0x3) {
35 35
             case 0: dest[i] = 'a'; break;
36 36
             case 1: dest[i] = 'c'; break;
37 37
             case 2: dest[i] = 'g'; break;
38 38
             case 3: dest[i] = 't'; break;
39 39
         }
40
-        n >>= 2;
40
+
41
+        K >>= 2;
41 42
     }
42
-    dest[i] = '\0';
43 43
 
44
+    dest[i] = '\0';
44 45
     std::reverse(dest, dest + i);
45 46
 }
46 47
 
47 48
 
48 49
 const size_t twobitseq::max_kmer = 4 * sizeof(kmer);
49 50
 
51
+twobitseq::twobitseq()
52
+    : xs(NULL)
53
+    , n(0)
54
+{
55
+
56
+}
57
+
50 58
 
51 59
 twobitseq::twobitseq(const char* seq)
52 60
     : xs(NULL)
53 61
     , n(0)
54 62
 {
63
+    if (seq == NULL) return;
55 64
     n = strlen(seq);
56 65
     if (n == 0) return;
57 66
 
58
-    xs = new kmer[n / max_kmer + 1];
67
+    xs = reinterpret_cast<kmer*>(
68
+            malloc_or_die((n / max_kmer + 1) * sizeof(kmer)));
59 69
     memset(xs, 0, (n / max_kmer + 1) * sizeof(kmer));
60 70
 
61 71
     size_t i;
... ...
@@ -77,7 +87,8 @@ twobitseq::twobitseq(const twobitseq& other)
77 87
     n = other.n;
78 88
     if (n == 0) return;
79 89
 
80
-    xs = new kmer[n / max_kmer + 1];
90
+    xs = reinterpret_cast<kmer*>(
91
+            malloc_or_die((n / max_kmer + 1) * sizeof(kmer)));
81 92
     memcpy(xs, other.xs, (n / max_kmer + 1) * sizeof(kmer));
82 93
 }
83 94
 
... ...
@@ -85,20 +96,60 @@ twobitseq::twobitseq(const twobitseq& other)
85 96
 
86 97
 twobitseq::~twobitseq()
87 98
 {
88
-    delete [] xs;
99
+    free(xs);
89 100
 }
90 101
 
91 102
 
92 103
 void twobitseq::operator = (const twobitseq& other)
93 104
 {
94 105
     n = other.n;
95
-
96
-    delete [] xs;
97
-    xs = new kmer[n / max_kmer + 1];
106
+    xs = reinterpret_cast<kmer*>(
107
+            realloc_or_die(xs, (n / max_kmer + 1) * sizeof(kmer)));
98 108
     memcpy(xs, other.xs, (n / max_kmer + 1) * sizeof(kmer));
99 109
 }
100 110
 
101 111
 
112
+void twobitseq::operator = (const char* seq)
113
+{
114
+    if (seq == NULL) {
115
+        n = 0;
116
+        free(xs);
117
+        xs = NULL;
118
+        return;
119
+    }
120
+
121
+    n = strlen(seq);
122
+    xs = reinterpret_cast<kmer*>(
123
+            realloc_or_die(xs, (n / max_kmer + 1) * sizeof(kmer)));
124
+    memset(xs, 0, (n / max_kmer + 1) * sizeof(kmer));
125
+
126
+    size_t i;
127
+    size_t block, offset;
128
+    for (i = 0; i < n; ++i) {
129
+        block  = i / max_kmer;
130
+        offset = i % max_kmer;
131
+
132
+        xs[block] = xs[block] | (nuc_to_num(seq[i]) << (2*offset));
133
+    }
134
+}
135
+
136
+
137
+kmer twobitseq::get_kmer(int k, pos_t pos)
138
+{
139
+    size_t block, offset;
140
+    size_t i;
141
+    kmer K = 0;
142
+
143
+    for (i = 0; i < (size_t) k; ++i) {
144
+        block  = (i + (size_t) (pos - (k - 1))) / max_kmer;
145
+        offset = (i + (size_t) (pos - (k - 1))) % max_kmer;
146
+        K = (K << 2) | ((xs[block] >> (2 * offset)) & 0x3);
147
+    }
148
+
149
+    return K;
150
+}
151
+
152
+
102 153
 int twobitseq::make_kmer(kmer& K, size_t pos, bool* mask, size_t mask_len) const
103 154
 {
104 155
     int k = 0;
... ...
@@ -21,8 +21,8 @@
21 21
 class twobitseq
22 22
 {
23 23
     public:
24
-        /** create an empty sequence (uninitialized, filled with noise) */
25
-        twobitseq(size_t n);
24
+        /** create an empty sequence. */
25
+        twobitseq();
26 26
 
27 27
         /** create a sequence from a DNA/RNA sequence string */
28 28
         twobitseq(const char* seq);
... ...
@@ -33,6 +33,11 @@ class twobitseq
33 33
         ~twobitseq();
34 34
 
35 35
         void operator = (const twobitseq&);
36
+        void operator = (const char* seq);
37
+
38
+        /* get the kmer ending at position i */
39
+        kmer get_kmer(int k, pos_t i);
40
+
36 41
 
37 42
         /** extract a kmer.
38 43
          *
... ...
@@ -51,12 +56,13 @@ class twobitseq
51 56
         static const size_t max_kmer;
52 57
 };
53 58
 
59
+
54 60
 /** Convert a nucleotide charactor to a number, using the same scheme as
55 61
  * twobitseq */
56 62
 kmer nuc_to_num(char c);
57 63
 
58 64
 /** Convert a number n encoding a kmer into a string of nucleotides */
59
-void num_to_nuc(char* dest, int n, int k);
65
+void num_to_nuc(char* dest, kmer K, int k);
60 66
 
61 67
 #endif
62 68