some small api improvements and cross-platform fixes

Daniel Jones authored on 11/10/2011 21:38:58
Showing 12 changed files

 ... ... @@ -1,5 +1,5 @@ 1 1  Package: seqbias 2 -Version: 1.1.1 2 +Version: 1.1.3 3 3  Date: 25-12-2010 4 4  Title: Estimation of per-position bias in high-throughput sequencing data 5 5  Description: This package implements a model of per-position sequencing bias in
 ... ... @@ -109,21 +109,27 @@ setClass( "seqbias", 109 109   110 110   111 111   112 -"count.reads" <- function( reads_fn, I, binary = TRUE ) 112 +"count.reads" <- function( reads_fn, I, sb = NULL, binary = FALSE, sum.counts = FALSE ) 113 113  { 114 114  require(GenomicRanges) 115 115  stopifnot( is( I, "GRanges" ) ) 116 116   117 + if (!is.null(sb) & class(sb) != "seqbias") { 118 + stop( "The 'sb' parameter of count.reads must be a seqbias class." ) 119 + } 120 + 117 121  bam_ptr <- .Call( "seqbias_open_bam", path.expand(reads_fn), 118 122  PACKAGE = "seqbias" ) 119 123   120 124  counts <- tapply( I, 121 125  INDEX = 1:length(I), 122 126  FUN = function(x) .Call( "seqbias_count_reads", 127 + sb, 123 128  bam_ptr, 124 129  as.character(seqnames(x)), 125 130  start(x), end(x), 126 131  as.character(strand(x)), 132 + sum.counts, 127 133  PACKAGE = "seqbias" ) ) 128 134   129 135  if( binary ) lapply( counts, FUN = function(c) as.integer( c > 0 ) )
 ... ... @@ -126,7 +126,7 @@ seqs[neg_idx] <- reverseComplement( seqs[neg_idx] ) 126 126  Finally, we count the number of reads mapping to each position in our sampled 127 127  intervals. 128 128  <<>>= 129 -counts <- count.reads( reads_fn, I ) 129 +counts <- count.reads( reads_fn, I, binary = T ) 130 130  @ 131 131   132 132  Unless the \texttt{binary} argument is FALSE, this function returns a 0-1 vector, ... ... @@ -192,7 +192,7 @@ more reads are available a more accurate model can be trained at the expense of 192 192  the training procedure taking up to several minutes. 193 193   194 194  <>= 195 -sb <- seqbias.fit( ref_fn, reads_fn, L = 5, R = 20 ) 195 +sb <- seqbias.fit( ref_fn, reads_fn, L = 5, R = 15 ) 196 196  @ 197 197   198 198  The \texttt{L} and \texttt{R} arguments control the maximum number of positions
 ... ... @@ -3,21 +3,25 @@ 3 3  \title{Counting reads across intervals} 4 4  \description{Counts the number of reads starting at each position across given 5 5  genomic intervals} 6 -\usage{count.reads(reads_fn, I, binary=TRUE)} 6 +\usage{count.reads(reads_fn, I, sb=NULL, binary=FALSE, sum.counts=TRUE)} 7 7  \arguments{ 8 8  \item{reads_fn}{filename of aligned reads in BAM format} 9 9  \item{I}{a GRanges object giving valid genomic intervals} 10 + \item{sb}{a seqbias object} 10 11  \item{binary}{if \code{TRUE}, return a 0-1 vector, otherwise return a vector 11 12  counting the number of reads mapped to each position} 13 + \item{sum.counts}{if \code{TRUE} return the total read count for each interval} 12 14  } 13 15  \details{ 14 16  Given an indexed BAM file, this function counts the number of reads starting 15 - at each position of each provided interval. These counts can then be 16 - normalized by dividing by the predicted bias obtained from 'bias.predict'. 17 - 18 - By default, a 0-1 vector is returned, where positions at which no reads are 19 - mapped are 0, and those with one or more are 1. If \code{binary} is 20 - \code{FALSE}, the number of reads mapping to each position is returned. 17 + at each position of each provided interval. If a seqbias object is provided 18 + through the \code{sb} attribute, counts are corrected for sequence bias. 19 + The total read count for each interval is returned if \code{sum.counts} is 20 + \code{TRUE}. 21 +  22 + If \code{binary} is \code{TRUE} a 0-1 vector is returned instead, where 23 + positions at which no reads are mapped are 0, and those with one or more are 24 + 1. 21 25  } 22 26  \value{ 23 27  A list of numeric vectors is returned, one for each interval provided. Each
 ... ... @@ -36,7 +36,6 @@ kmer_matrix::kmer_matrix(const YAML::Node& node) 36 36  A = new double[size1 * size2]; 37 37   38 38  const YAML::Node& node_A = node["A"]; 39 - 40 39  size_t i; 41 40  for (i = 0; i < size1 * size2; ++i) { 42 41  node_A[i] >> A[i];
 ... ... @@ -186,7 +186,8 @@ class motif_trainer 186 186  compute_reachability(); 187 187   188 188  logger::print(col_base); 189 - snprintf(msg, sizeof(msg), "round %4zu (ic = %0.4e)", round_num, ic); 189 + snprintf(msg, sizeof(msg), "round %4lu (ic = %0.4e)", 190 + (unsigned long)round_num, ic); 190 191  logger::print(msg); 191 192  logger::print(col_base); 192 193  col = 0; ... ... @@ -442,7 +443,6 @@ class motif_trainer 442 443  else i_start = i_end = j; 443 444   444 445  for (i = i_start; i <= i_end; ++i) { 445 - 446 446  /* skip existing edges */ 447 447  if (M.has_edge(i, j)) continue; 448 448   ... ... @@ -800,7 +800,7 @@ motif::motif(const YAML::Node& node) 800 800  /* m */ 801 801  unsigned int m_; 802 802  node["m"] >> m_; 803 - m = (size_t) m; 803 + m = (size_t) m_; 804 804   805 805   806 806  /* parents */ ... ... @@ -809,8 +809,10 @@ motif::motif(const YAML::Node& node) 809 809  const YAML::Node& parents_node = node["parents"]; 810 810   811 811  size_t i; 812 - for (i = 0; i < m; ++i) { 813 - parents_node[i] >> parents[i]; 812 + int b; 813 + for (i = 0; i < m * m; ++i) { 814 + parents_node[i] >> b; 815 + parents[i] = (bool) b; 814 816  } 815 817   816 818   ... ... @@ -840,7 +842,7 @@ void motif::to_yaml(YAML::Emitter& out) const 840 842  out << YAML::Flow << YAML::BeginSeq; 841 843   842 844  size_t i; 843 - for (i = 0; i < m * m; ++i) out << parents[i]; 845 + for (i = 0; i < m * m; ++i) out << (parents[i] ? 1 : 0); 844 846   845 847  out << YAML::EndSeq; 846 848   ... ... @@ -860,6 +862,45 @@ void motif::to_yaml(YAML::Emitter& out) const 860 862  } 861 863   862 864   865 +string motif::model_graph(int offset) const 866 +{ 867 + string graph_str; 868 + char strbuf[512]; 869 + 870 + graph_str += "digraph {\n"; 871 + graph_str += "splines=\"true\";\n"; 872 + graph_str += "node [shape=\"box\"];\n"; 873 + 874 + /* print nodes */ 875 + size_t i, j; 876 + for (j = 0; j < m; j++) { 877 + snprintf(strbuf, sizeof(strbuf),  878 + "n%d [label=\"%d\",pos=\"%d,0\",style=\"%s\"];\n", 879 + (int) j, (int) j - offset, (int) j * 100, 880 + parents[j * m + j] ? "solid" : "dotted"); 881 + 882 + graph_str += strbuf; 883 + } 884 + 885 + /* print edges */ 886 + for (j = 0; j < m; ++j) { 887 + if (!parents[j * m + j]) continue; 888 + 889 + for (i = 0; i < m; ++i) { 890 + if (i == j) continue; 891 + if (parents[j * m + i]) { 892 + snprintf(strbuf, sizeof(strbuf), 893 + "n%lu -> n%lu;\n", (unsigned long) i, (unsigned long) j); 894 + graph_str += strbuf; 895 + } 896 + } 897 + } 898 + 899 + graph_str += "}\n"; 900 + return graph_str; 901 +} 902 + 903 + 863 904  double motif::eval(const twobitseq& seq, size_t offset) 864 905  { 865 906  double ll0 = 0.0;
 ... ... @@ -45,6 +45,10 @@ class motif 45 45  /** emit yaml serialization */ 46 46  void to_yaml(YAML::Emitter& out) const; 47 47   48 + /** print the model graph in GraphViz (dot) format, 49 + * adjusting position labels by the given offset. */ 50 + std::string model_graph(int offset) const; 51 + 48 52  private: 49 53  motif(); 50 54 
 ... ... @@ -153,8 +153,13 @@ SEXP seqbias_predict( SEXP seqbias, 153 153  pos_t c_start, c_end; 154 154  strand_t c_strand; 155 155   156 + 156 157  coerce_genomic_coords( seqname, start, end, strand, 157 158  &c_seqname, &c_start, &c_end, &c_strand ); 159 + 160 + /* we expect 1-based coordinates, but work with 0-based */ 161 + c_start -= 1; 162 + c_end -= 1; 158 163   159 164  if( c_strand != 0 && c_strand != 1 ) { 160 165  warning( "strand should be '+' or '-'" ); ... ... @@ -217,11 +222,13 @@ SEXP seqbias_open_bam( SEXP reads_fn ) 217 222  } 218 223   219 224   220 -SEXP seqbias_count_reads( SEXP bam_ptr, 225 +SEXP seqbias_count_reads( SEXP seqbias, 226 + SEXP bam_ptr, 221 227  SEXP seqname, 222 228  SEXP start, 223 229  SEXP end, 224 - SEXP strand ) 230 + SEXP strand, 231 + SEXP sum_counts ) 225 232  { 226 233  if( TYPEOF(bam_ptr) != EXTPTRSXP ) error( "argument is not a indexed bam pointer" ); 227 234  indexed_bam_f* c_bam_ptr = (indexed_bam_f*)EXTPTR_PTR( bam_ptr ); ... ... @@ -232,13 +239,52 @@ SEXP seqbias_count_reads( SEXP bam_ptr, 232 239   233 240  coerce_genomic_coords( seqname, start, end, strand, 234 241  &c_seqname, &c_start, &c_end, &c_strand ); 242 + 243 + 244 + /* we expect 1-based coordinates, but use 0-based */ 245 + c_start -= 1; 246 + c_end -= 1; 235 247   236 248   249 + double* bs[2] = {NULL, NULL}; 250 + 251 + if( !isNull(seqbias) ) { 252 + 253 + SEXP idx; 254 + PROTECT( idx = allocVector( STRSXP, 1 ) ); 255 + SET_STRING_ELT(idx, 0, mkChar("ptr")); 256 + 257 + sequencing_bias* c_seqbias = NULL; 258 + if( TYPEOF(GET_SLOT(seqbias, idx)) != EXTPTRSXP || 259 + !(c_seqbias = (sequencing_bias*)EXTPTR_PTR(GET_SLOT(seqbias, idx))) ) { 260 + error( "first argument is not a proper seqbias class." ); 261 + } 262 +  263 + if (c_strand == strand_na || c_strand == strand_pos ) { 264 + bs[0] = c_seqbias->get_bias(c_seqname, c_start, c_end, strand_pos); 265 + } 266 + 267 + if (c_strand == strand_na || c_strand == strand_neg ) { 268 + bs[1] = c_seqbias->get_bias(c_seqname, c_start, c_end, strand_neg); 269 + std::reverse(bs[1], bs[1] + (c_end - c_start + 1)); 270 + } 271 + 272 + UNPROTECT(1); 273 + } 274 + 275 + bool c_sum_counts = asLogical(sum_counts) == TRUE; 276 + 237 277  /* init vector */ 238 278  SEXP v; 239 - PROTECT( v = allocVector( REALSXP, c_end - c_start + 1 ) ); 240 - pos_t i; 241 - for( i = 0; i < c_end - c_start + 1; i++ ) REAL(v)[i] = 0; 279 + if (c_sum_counts) { 280 + PROTECT( v = allocVector( REALSXP, 1 ) ); 281 + REAL(v)[0] = 0; 282 + } 283 + else { 284 + PROTECT( v = allocVector( REALSXP, c_end - c_start + 1 ) ); 285 + pos_t i; 286 + for( i = 0; i < c_end - c_start + 1; i++ ) REAL(v)[i] = 0; 287 + } 242 288   243 289   244 290  const size_t region_len = 1024; ... ... @@ -261,26 +307,41 @@ SEXP seqbias_count_reads( SEXP bam_ptr, 261 307  bam_iter_t it = bam_iter_query( c_bam_ptr->idx, bam_ref_id, bam_start, bam_end ); 262 308  bam1_t* b = bam_init1(); 263 309  pos_t x; 310 + strand_t s; 311 + 264 312   265 313  while( bam_iter_read( c_bam_ptr->f->x.bam, it, b ) > 0 ) { 266 - if( bam1_strand(b) != c_strand ) continue; 314 + s = (strand_t) bam1_strand(b); 315 + if( c_strand != strand_na && s != c_strand ) continue; 316 + 267 317  x = bam1_strand(b) == 1 ? bam_calend( &b->core, bam1_cigar(b) ) - 1 : b->core.pos; 268 - if( c_start <= x && x <= c_end ) REAL(v)[x - c_start]++; 318 + if (x < c_start || x > c_end) continue; 319 + 320 + if (c_sum_counts) { 321 + REAL(v)[0] += bs[s] == NULL ? 1.0 : (1.0 / bs[s][x - c_start]); 322 + } 323 + else { 324 + REAL(v)[x - c_start] += bs[s] == NULL ? 1.0 : (1.0 / bs[s][x - c_start]); 325 + } 269 326  } 270 327   271 - if( c_strand == strand_neg ) { 328 + if( c_strand == strand_neg && !c_sum_counts ) { 272 329  std::reverse(REAL(v), REAL(v) + (c_end - c_start + 1)); 273 330  } 274 331   275 332  bam_iter_destroy( it ); 276 333  bam_destroy1(b); 277 334   335 + delete [] bs[0]; 336 + delete [] bs[1]; 337 + 278 338  UNPROTECT(1); 279 339  return v; 280 340  } 281 341   282 342   283 343   344 + 284 345  void dealloc_kmer_matrix( SEXP M ) 285 346  { 286 347  if( TYPEOF(M) != EXTPTRSXP ) error( "argument is not a kmer_matrix pointer" ); ... ... @@ -433,7 +494,7 @@ void R_init_seqbias( DllInfo* info ) 433 494  { "seqbias_load", (DL_FUNC) &seqbias_load, 2 }, 434 495  { "seqbias_save", (DL_FUNC) &seqbias_save, 2 }, 435 496  { "seqbias_open_bam", (DL_FUNC) &seqbias_open_bam, 1 }, 436 - { "seqbias_count_reads", (DL_FUNC) &seqbias_count_reads, 5 }, 497 + { "seqbias_count_reads", (DL_FUNC) &seqbias_count_reads, 7 }, 437 498  { "seqbias_alloc_kmer_matrix", (DL_FUNC) &seqbias_alloc_kmer_matrix, 2 }, 438 499  { "seqbias_tally_kmers", (DL_FUNC) &seqbias_tally_kmers, 4 }, 439 500  { "sebqias_dataframe_from_kmer_matrix", (DL_FUNC) &seqbias_dataframe_from_kmer_matrix, 2 },
 ... ... @@ -11,6 +11,7 @@ 11 11  #include "miscmath.h" 12 12  #include "samtools_extra.h" 13 13  #include "samtools/faidx.h" 14 +#include  14 15  #include  15 16  #include  16 17  #include  ... ... @@ -55,17 +56,6 @@ static double rand_gauss(double sigma) 55 56  } 56 57   57 58   58 -static double rand_trunc_gauss(double sigma, double a, double b) 59 -{ 60 - double x; 61 - do { 62 - x = rand_gauss(sigma); 63 - } while (x < a || x > b); 64 - 65 - return x; 66 -} 67 - 68 - 69 59  double gauss_pdf (const double x, const double sigma) 70 60  { 71 61  double u = x / fabs (sigma); ... ... @@ -86,25 +76,30 @@ static int read_pos_count_compare(const void* p1, const void* p2) 86 76  } 87 77   88 78   89 -/* 90 -static int read_pos_tid_count_compare(const void* p1, const void* p2) 91 -{ 92 - int c = (int)(((read_pos*)p1)->tid - ((read_pos*)p2)->tid); 93 - 94 - if (c == 0) { 95 - return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count; 96 - } 97 - else return c; 98 -} 99 -*/ 100 - 101 - 102 79   103 80  sequencing_bias::sequencing_bias() 104 81  : ref_f(NULL) 105 82  , M(NULL) 106 83  {} 107 84   85 + 86 +sequencing_bias::sequencing_bias(const char* model_fn) 87 + : ref_f(NULL) 88 + , M(NULL) 89 +{ 90 + std::ifstream fin; 91 + fin.open(model_fn); 92 + 93 + YAML::Parser parser(fin); 94 + YAML::Node doc; 95 + parser.GetNextDocument(doc); 96 + 97 + doc["L"] >> L; 98 + doc["R"] >> R; 99 + 100 + M = new motif(doc["motif"]); 101 +} 102 + 108 103  sequencing_bias::sequencing_bias(const char* ref_fn, 109 104  const char* model_fn) 110 105  : ref_f(NULL) ... ... @@ -122,12 +117,15 @@ sequencing_bias::sequencing_bias(const char* ref_fn, 122 117   123 118  M = new motif(doc["motif"]); 124 119   125 - this->ref_fn = ref_fn; 126 120   127 - ref_f = fai_load(ref_fn); 128 - if (ref_f == NULL) { 129 - logger::abort("Can't open indexed FASTA file %s\n", ref_fn); 121 + if (ref_fn != NULL) { 122 + this->ref_fn = ref_fn; 123 + ref_f = fai_load(ref_fn); 124 + if (ref_f == NULL) { 125 + logger::abort("Can't open indexed FASTA file %s\n", ref_fn); 126 + } 130 127  } 128 + else ref_f = NULL; 131 129  } 132 130   133 131   ... ... @@ -233,7 +231,6 @@ void sequencing_bias::build(const char* ref_fn, 233 231   234 232  while (samread(reads_f, read) > 0) { 235 233  if (read->core.n_cigar != 1) continue; 236 - if (read->core.flag & BAM_FREAD2) continue; 237 234  k++; 238 235  if (k % 1000000 == 0) { 239 236  logger::info("hashed %zu reads.", k); ... ... @@ -265,7 +262,7 @@ void sequencing_bias::build(const char* ref_fn, 265 262  this->L = L; 266 263  this->R = R; 267 264   268 - unsigned int i; 265 + unsigned int i = 0; 269 266   270 267  read_pos* S_tmp; 271 268  read_pos* S; ... ... @@ -273,34 +270,12 @@ void sequencing_bias::build(const char* ref_fn, 273 270  const size_t max_dump = 10000000; 274 271  pos_table_dump(T, &S_tmp, &N, max_dump); 275 272   276 - /* sort by count */ 277 - qsort(S_tmp, N, sizeof(read_pos), read_pos_count_compare); 278 - 279 - /* consider only reads with at least one duplicate */ 280 - for (i = 0; i < N; ++i) { 281 - if (S_tmp[i].count <= 1) break; 282 - } 283 - 284 - /* (unless there are very few of these reads */ 285 - if (i > 10000) { 286 - max_reads = std::min(max_reads, i); 287 - logger::info("%zu reads with duplicates.", i); 288 - } 289 - else { 290 - i = N; 291 - } 292 - 293 - 294 - /* ignore the top 1%, as they tend to be vastly higher than anything else, 295 - * and thus can throw things off when training with small numbers of reads 296 - * */ 297 - S = S_tmp + i/100; 298 - max_reads = min(max_reads, 99*i/100);  273 + S = S_tmp; 299 274   300 275  /* sort by tid (so we can load one chromosome at a time) */ 276 + random_shuffle(S, S + N); 301 277  qsort(S, std::min(max_reads, N), sizeof(read_pos), read_pos_tid_compare); 302 278   303 - 304 279  /* sample foreground and background kmer frequencies */ 305 280  ref_f = fai_load(ref_fn); 306 281  if (ref_f == NULL) { ... ... @@ -337,7 +312,7 @@ void sequencing_bias::build(const char* ref_fn, 337 312  seqname = T->seq_names[ S[i].tid ]; 338 313  if (seq) free(seq);  339 314   340 - seq = faidx_fetch_seq(ref_f, seqname, 0, 1000000000, &seqlen); 315 + seq = faidx_fetch_seq(ref_f, seqname, 0, INT_MAX, &seqlen); 341 316   342 317  logger::info("read sequence %s.", seqname); 343 318   ... ... @@ -374,7 +349,7 @@ void sequencing_bias::build(const char* ref_fn, 374 349  /* adjust the current read position randomly, and sample */ 375 350  for (bg_sample_num = 0; bg_sample_num < bg_samples;) { 376 351   377 - bg_pos = S[i].pos + (pos_t)round_away(rand_trunc_gauss(12, -200, 200)); 352 + bg_pos = S[i].pos + (pos_t)round_away(rand_gauss(500)); 378 353   379 354  if (S[i].strand == strand_neg) { 380 355  if (bg_pos < R || bg_pos >= seqlen - L) continue; ... ... @@ -464,17 +439,201 @@ double* sequencing_bias::get_bias(const char* seqname, 464 439  bs[i] = M->eval(seq, i); 465 440  } 466 441   467 - 468 442  free(seqstr); 469 443  return bs; 470 444  } 471 445   472 446   473 -string sequencing_bias::print_model_graph() 447 +string sequencing_bias::model_graph() const 448 +{ 449 + return M->model_graph((int) L); 450 +} 451 + 452 + 453 +kmer_matrix tabulate_bias(double* kl, 454 + pos_t L, pos_t R, int k, 455 + const char* ref_fn, 456 + const char* reads_fn, 457 + const char* model_fn) 474 458  { 475 - // TODO 476 - //return M->print_model_graph(L); 477 - return string(); 459 + /* This function procedes very much like the sequencing_bias::build 460 + * function, but does not finally train a motif, but rather tabulates 461 + * k-mer frequencies. */ 462 + size_t max_reads = 250000; 463 + 464 + kmer_matrix dest((size_t) (L + 1 + R), (size_t) k); 465 + dest.set_all(0.0); 466 + 467 + faidx_t* ref_f = fai_load(ref_fn); 468 + if (ref_f == NULL) { 469 + logger::abort("Can't open fasta file '%s'.", ref_fn); 470 + } 471 + 472 + samfile_t* reads_f = samopen(reads_fn, "rb", NULL); 473 + if (reads_f == NULL) { 474 + logger::abort("Can't open bam file '%s'.", reads_fn); 475 + } 476 + 477 + bam_index_t* reads_index = bam_index_load(reads_fn); 478 + if (reads_index == NULL) { 479 + logger::abort("Can't open bam index '%s.bai'.", reads_fn); 480 + } 481 + 482 + sequencing_bias* sb = NULL; 483 + if (model_fn != NULL) { 484 + sb = new sequencing_bias(ref_fn, model_fn); 485 + } 486 + 487 + 488 + bam_init_header_hash(reads_f->header); 489 + bam1_t* read = bam_init1(); 490 + 491 + pos_table T; 492 + pos_table_create(&T, reads_f->header->n_targets); 493 + T.seq_names = reads_f->header->target_name; 494 + 495 + size_t hashed_count = 0; 496 + while (samread(reads_f, read) > 0) { 497 + if (read->core.n_cigar != 1) continue; 498 + if (++hashed_count % 1000000 == 0) { 499 + logger::info("hashed %zu reads.", hashed_count); 500 + } 501 + pos_table_inc(&T, read); 502 + } 503 + logger::info("hashed %zu reads.", hashed_count); 504 + 505 + read_pos* S_tmp; 506 + read_pos* S; 507 + size_t N; 508 + const size_t max_dump = 10000000; 509 + pos_table_dump(&T, &S_tmp, &N, max_dump); 510 + 511 + /* sort by count */ 512 + qsort(S_tmp, N, sizeof(read_pos), read_pos_count_compare); 513 + 514 + /* consider only reads with at least one duplicate */ 515 + size_t i; 516 + for (i = 0; i < N; ++i) { 517 + if (S_tmp[i].count <= 1) break; 518 + } 519 + 520 + /* (unless there are very few of these reads */ 521 + if (i > 10000) { 522 + max_reads = std::min(max_reads, i); 523 + logger::info("%zu reads with duplicates.", i); 524 + } 525 + else { 526 + i = N; 527 + } 528 + 529 + /* ignore the top 1%, as they tend to be vastly higher than anything else, 530 + * and thus can throw things off when training with small numbers of reads 531 + * */ 532 + S = S_tmp + i/100; 533 + max_reads = min(max_reads, 99*i/100);  534 + 535 + /* sort by tid (so we can load one chromosome at a time) */ 536 + qsort(S, std::min(max_reads, N), sizeof(read_pos), read_pos_tid_compare); 537 + 538 + twobitseq tbs; 539 + 540 + char* local_seq; 541 + local_seq = new char[ (k - 1) + L + 1 + R + 1 ]; 542 + local_seq[(k - 1) + L + 1 + R] = '\0'; 543 + 544 + double w; 545 + char* seq = NULL; 546 + int seqlen = 0; 547 + int curr_tid = -1; 548 + char* seqname = NULL; 549 + 550 + for (i = 0; i < N && i < max_reads; i++) { 551 + if (S[i].tid != curr_tid) { 552 + seqname = T.seq_names[S[i].tid]; 553 + free(seq); 554 + seq = faidx_fetch_seq(ref_f, seqname, 0, INT_MAX, &seqlen); 555 + logger::info("read sequence %s.", seqname); 556 + curr_tid = S[i].tid; 557 + 558 + if (seq == NULL) { 559 + logger::warn("warning: reference sequence not found, skipping."); 560 + } 561 + } 562 + 563 + if (seq == NULL) continue; 564 + 565 + if (S[i].strand == strand_neg) { 566 + if (S[i].pos < R || S[i].pos >= seqlen - L - (k-1)) continue; 567 + memcpy(local_seq, seq + S[i].pos - R, ((k-1)+L+1+R)*sizeof(char)); 568 + seqrc(local_seq, L+1+R); 569 + } 570 + else { 571 + if (S[i].pos < L + (k-1) || S[i].pos >= seqlen - R) continue; 572 + memcpy(local_seq, seq + (S[i].pos - L - (k-1)), ((k-1)+L+1+R)*sizeof(char)); 573 + } 574 + 575 + if (sb) { 576 + // TODO: get_bias 577 + } 578 + else w = 1.0; 579 + 580 + tbs = local_seq; 581 + kmer K; 582 + for (pos_t pos = (k-1); pos < (k-1) + L + 1 + R; ++pos) { 583 + K = tbs.get_kmer(k, pos); 584 + dest(pos - (k-1), K) += w; 585 + } 586 + } 587 + 588 + /* compute KL divergence */ 589 + 590 + /* estimate a background distribution by averaging across the entire window 591 + * */ 592 + int four_to_k = 1 << (2 * k); 593 + double* bg = new double[four_to_k]; 594 + memset(bg, 0, four_to_k * sizeof(double)); 595 + for (pos_t pos = 0; pos < L + 1 + R; ++pos) { 596 + for (kmer K = 0; K < (kmer) four_to_k; ++K) { 597 + bg[K] += dest(pos, K); 598 + } 599 + } 600 + 601 + kmer_matrix norm_dest(dest); 602 + norm_dest.make_distribution(); 603 + 604 + double z = 0.0; 605 + for (kmer K = 0; K < (kmer) four_to_k; ++K) z += bg[K]; 606 + for (kmer K = 0; K < (kmer) four_to_k; ++K) bg[K] /= z; 607 + 608 + 609 + /* Compute the (symetric) kullback-leibler divegnnce */ 610 + memset(kl, 0, (L + 1 + R) * sizeof(double)); 611 + for (pos_t pos = 0; pos < L + 1 + R; ++pos) { 612 + kl[pos] = 0.0; 613 + for (kmer K = 0; K < (kmer) four_to_k; ++K) { 614 + if (norm_dest(pos, K) > 0.0) { 615 + kl[pos] += norm_dest(pos, K) * (log2(norm_dest(pos, K)) - log2(bg[K])); 616 + } 617 + 618 + if (bg[K] > 0.0) { 619 + kl[pos] += bg[K] * (log2(bg[K]) - log2(norm_dest(pos, K))); 620 + } 621 + } 622 + } 623 + 624 + delete [] bg; 625 + 626 + 627 + free(seq); 628 + free(local_seq); 629 + free(S_tmp); 630 + bam_destroy1(read); 631 + pos_table_destroy(&T); 632 + delete sb; 633 + bam_index_destroy(reads_index); 634 + samclose(reads_f); 635 + 636 + return dest; 478 637  } 479 638   480 639 
 ... ... @@ -21,7 +21,13 @@ 21 21  class sequencing_bias 22 22  { 23 23  public: 24 - /** Load a model that has bee previously trained. */ 24 + /** Load a model that has been previously trained, without a reference 25 + * sequence. (It can not be actually used this way.) 26 + */ 27 + sequencing_bias(const char* model_fn); 28 + 29 + 30 + /** Load a model that has been previously trained. */ 25 31  sequencing_bias(const char* ref_fn, 26 32  const char* model_fn); 27 33   ... ... @@ -74,7 +80,7 @@ class sequencing_bias 74 80  void to_yaml(YAML::Emitter&) const; 75 81   76 82  /** Return a string of the model graph in dot format. */ 77 - std::string print_model_graph(); 83 + std::string model_graph() const; 78 84   79 85   80 86  private: ... ... @@ -107,4 +113,13 @@ class sequencing_bias 107 113  }; 108 114   109 115   116 +/** Tabulate the bias in a given dataset, optionally correcting for bias using 117 + * the given model. */ 118 +kmer_matrix tabulate_bias(double* kl, 119 + pos_t L, pos_t R, int k, 120 + const char* ref_fn, 121 + const char* reads_fn, 122 + const char* model_fn = NULL); 123 + 124 + 110 125  #endif
 ... ... @@ -8,10 +8,8 @@ 8 8  #include "twobitseq.hpp" 9 9  #include  10 10  #include  11 +#include  11 12   12 -/* Don't expose this function. No one should have to know how we are mapping 13 - * letters to numbers. 14 - */ 15 13  kmer nuc_to_num(char c) 16 14  { 17 15  switch( c ) { ... ... @@ -26,36 +24,48 @@ kmer nuc_to_num(char c) 26 24  } 27 25  } 28 26   29 -void num_to_nuc(char* dest, int n, int k) 27 + 28 + 29 +void num_to_nuc(char* dest, kmer K, int k) 30 30  { 31 31  int i; 32 32  /* read backwards, then reverse */ 33 - for( i = 0; i < k; i++ ) { 34 - switch( n & 0x3 ) { 33 + for (i = 0; i < k; ++i) { 34 + switch (K & 0x3) { 35 35  case 0: dest[i] = 'a'; break; 36 36  case 1: dest[i] = 'c'; break; 37 37  case 2: dest[i] = 'g'; break; 38 38  case 3: dest[i] = 't'; break; 39 39  } 40 - n >>= 2; 40 + 41 + K >>= 2; 41 42  } 42 - dest[i] = '\0'; 43 43   44 + dest[i] = '\0'; 44 45  std::reverse(dest, dest + i); 45 46  } 46 47   47 48   48 49  const size_t twobitseq::max_kmer = 4 * sizeof(kmer); 49 50   51 +twobitseq::twobitseq() 52 + : xs(NULL) 53 + , n(0) 54 +{ 55 + 56 +} 57 + 50 58   51 59  twobitseq::twobitseq(const char* seq) 52 60  : xs(NULL) 53 61  , n(0) 54 62  { 63 + if (seq == NULL) return; 55 64  n = strlen(seq); 56 65  if (n == 0) return; 57 66   58 - xs = new kmer[n / max_kmer + 1]; 67 + xs = reinterpret_cast( 68 + malloc_or_die((n / max_kmer + 1) * sizeof(kmer))); 59 69  memset(xs, 0, (n / max_kmer + 1) * sizeof(kmer)); 60 70   61 71  size_t i; ... ... @@ -77,7 +87,8 @@ twobitseq::twobitseq(const twobitseq& other) 77 87  n = other.n; 78 88  if (n == 0) return; 79 89   80 - xs = new kmer[n / max_kmer + 1]; 90 + xs = reinterpret_cast( 91 + malloc_or_die((n / max_kmer + 1) * sizeof(kmer))); 81 92  memcpy(xs, other.xs, (n / max_kmer + 1) * sizeof(kmer)); 82 93  } 83 94   ... ... @@ -85,20 +96,60 @@ twobitseq::twobitseq(const twobitseq& other) 85 96   86 97  twobitseq::~twobitseq() 87 98  { 88 - delete [] xs; 99 + free(xs); 89 100  } 90 101   91 102   92 103  void twobitseq::operator = (const twobitseq& other) 93 104  { 94 105  n = other.n; 95 - 96 - delete [] xs; 97 - xs = new kmer[n / max_kmer + 1]; 106 + xs = reinterpret_cast( 107 + realloc_or_die(xs, (n / max_kmer + 1) * sizeof(kmer))); 98 108  memcpy(xs, other.xs, (n / max_kmer + 1) * sizeof(kmer)); 99 109  } 100 110   101 111   112 +void twobitseq::operator = (const char* seq) 113 +{ 114 + if (seq == NULL) { 115 + n = 0; 116 + free(xs); 117 + xs = NULL; 118 + return; 119 + } 120 + 121 + n = strlen(seq); 122 + xs = reinterpret_cast( 123 + realloc_or_die(xs, (n / max_kmer + 1) * sizeof(kmer))); 124 + memset(xs, 0, (n / max_kmer + 1) * sizeof(kmer)); 125 + 126 + size_t i; 127 + size_t block, offset; 128 + for (i = 0; i < n; ++i) { 129 + block = i / max_kmer; 130 + offset = i % max_kmer; 131 + 132 + xs[block] = xs[block] | (nuc_to_num(seq[i]) << (2*offset)); 133 + } 134 +} 135 + 136 + 137 +kmer twobitseq::get_kmer(int k, pos_t pos) 138 +{ 139 + size_t block, offset; 140 + size_t i; 141 + kmer K = 0; 142 + 143 + for (i = 0; i < (size_t) k; ++i) { 144 + block = (i + (size_t) (pos - (k - 1))) / max_kmer; 145 + offset = (i + (size_t) (pos - (k - 1))) % max_kmer; 146 + K = (K << 2) | ((xs[block] >> (2 * offset)) & 0x3); 147 + } 148 + 149 + return K; 150 +} 151 + 152 + 102 153  int twobitseq::make_kmer(kmer& K, size_t pos, bool* mask, size_t mask_len) const 103 154  { 104 155  int k = 0;
 ... ... @@ -21,8 +21,8 @@ 21 21  class twobitseq 22 22  { 23 23  public: 24 - /** create an empty sequence (uninitialized, filled with noise) */ 25 - twobitseq(size_t n); 24 + /** create an empty sequence. */ 25 + twobitseq(); 26 26   27 27  /** create a sequence from a DNA/RNA sequence string */ 28 28  twobitseq(const char* seq); ... ... @@ -33,6 +33,11 @@ class twobitseq 33 33  ~twobitseq(); 34 34   35 35  void operator = (const twobitseq&); 36 + void operator = (const char* seq); 37 + 38 + /* get the kmer ending at position i */ 39 + kmer get_kmer(int k, pos_t i); 40 + 36 41   37 42  /** extract a kmer. 38 43  * ... ... @@ -51,12 +56,13 @@ class twobitseq 51 56  static const size_t max_kmer; 52 57  }; 53 58   59 + 54 60  /** Convert a nucleotide charactor to a number, using the same scheme as 55 61  * twobitseq */ 56 62  kmer nuc_to_num(char c); 57 63   58 64  /** Convert a number n encoding a kmer into a string of nucleotides */ 59 -void num_to_nuc(char* dest, int n, int k); 65 +void num_to_nuc(char* dest, kmer K, int k); 60 66   61 67  #endif 62 68