git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqbias@59091 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -109,21 +109,27 @@ setClass( "seqbias", |
109 | 109 |
|
110 | 110 |
|
111 | 111 |
|
112 |
-"count.reads" <- function( reads_fn, I, binary = TRUE ) |
|
112 |
+"count.reads" <- function( reads_fn, I, sb = NULL, binary = FALSE, sum.counts = FALSE ) |
|
113 | 113 |
{ |
114 | 114 |
require(GenomicRanges) |
115 | 115 |
stopifnot( is( I, "GRanges" ) ) |
116 | 116 |
|
117 |
+ if (!is.null(sb) & class(sb) != "seqbias") { |
|
118 |
+ stop( "The 'sb' parameter of count.reads must be a seqbias class." ) |
|
119 |
+ } |
|
120 |
+ |
|
117 | 121 |
bam_ptr <- .Call( "seqbias_open_bam", path.expand(reads_fn), |
118 | 122 |
PACKAGE = "seqbias" ) |
119 | 123 |
|
120 | 124 |
counts <- tapply( I, |
121 | 125 |
INDEX = 1:length(I), |
122 | 126 |
FUN = function(x) .Call( "seqbias_count_reads", |
127 |
+ sb, |
|
123 | 128 |
bam_ptr, |
124 | 129 |
as.character(seqnames(x)), |
125 | 130 |
start(x), end(x), |
126 | 131 |
as.character(strand(x)), |
132 |
+ sum.counts, |
|
127 | 133 |
PACKAGE = "seqbias" ) ) |
128 | 134 |
|
129 | 135 |
if( binary ) lapply( counts, FUN = function(c) as.integer( c > 0 ) ) |
... | ... |
@@ -126,7 +126,7 @@ seqs[neg_idx] <- reverseComplement( seqs[neg_idx] ) |
126 | 126 |
Finally, we count the number of reads mapping to each position in our sampled |
127 | 127 |
intervals. |
128 | 128 |
<<>>= |
129 |
-counts <- count.reads( reads_fn, I ) |
|
129 |
+counts <- count.reads( reads_fn, I, binary = T ) |
|
130 | 130 |
@ |
131 | 131 |
|
132 | 132 |
Unless the \texttt{binary} argument is FALSE, this function returns a 0-1 vector, |
... | ... |
@@ -192,7 +192,7 @@ more reads are available a more accurate model can be trained at the expense of |
192 | 192 |
the training procedure taking up to several minutes. |
193 | 193 |
|
194 | 194 |
<<results=hide>>= |
195 |
-sb <- seqbias.fit( ref_fn, reads_fn, L = 5, R = 20 ) |
|
195 |
+sb <- seqbias.fit( ref_fn, reads_fn, L = 5, R = 15 ) |
|
196 | 196 |
@ |
197 | 197 |
|
198 | 198 |
The \texttt{L} and \texttt{R} arguments control the maximum number of positions |
... | ... |
@@ -3,21 +3,25 @@ |
3 | 3 |
\title{Counting reads across intervals} |
4 | 4 |
\description{Counts the number of reads starting at each position across given |
5 | 5 |
genomic intervals} |
6 |
-\usage{count.reads(reads_fn, I, binary=TRUE)} |
|
6 |
+\usage{count.reads(reads_fn, I, sb=NULL, binary=FALSE, sum.counts=TRUE)} |
|
7 | 7 |
\arguments{ |
8 | 8 |
\item{reads_fn}{filename of aligned reads in BAM format} |
9 | 9 |
\item{I}{a GRanges object giving valid genomic intervals} |
10 |
+ \item{sb}{a seqbias object} |
|
10 | 11 |
\item{binary}{if \code{TRUE}, return a 0-1 vector, otherwise return a vector |
11 | 12 |
counting the number of reads mapped to each position} |
13 |
+ \item{sum.counts}{if \code{TRUE} return the total read count for each interval} |
|
12 | 14 |
} |
13 | 15 |
\details{ |
14 | 16 |
Given an indexed BAM file, this function counts the number of reads starting |
15 |
- at each position of each provided interval. These counts can then be |
|
16 |
- normalized by dividing by the predicted bias obtained from 'bias.predict'. |
|
17 |
- |
|
18 |
- By default, a 0-1 vector is returned, where positions at which no reads are |
|
19 |
- mapped are 0, and those with one or more are 1. If \code{binary} is |
|
20 |
- \code{FALSE}, the number of reads mapping to each position is returned. |
|
17 |
+ at each position of each provided interval. If a seqbias object is provided |
|
18 |
+ through the \code{sb} attribute, counts are corrected for sequence bias. |
|
19 |
+ The total read count for each interval is returned if \code{sum.counts} is |
|
20 |
+ \code{TRUE}. |
|
21 |
+ |
|
22 |
+ If \code{binary} is \code{TRUE} a 0-1 vector is returned instead, where |
|
23 |
+ positions at which no reads are mapped are 0, and those with one or more are |
|
24 |
+ 1. |
|
21 | 25 |
} |
22 | 26 |
\value{ |
23 | 27 |
A list of numeric vectors is returned, one for each interval provided. Each |
... | ... |
@@ -186,7 +186,8 @@ class motif_trainer |
186 | 186 |
compute_reachability(); |
187 | 187 |
|
188 | 188 |
logger::print(col_base); |
189 |
- snprintf(msg, sizeof(msg), "round %4zu (ic = %0.4e)", round_num, ic); |
|
189 |
+ snprintf(msg, sizeof(msg), "round %4lu (ic = %0.4e)", |
|
190 |
+ (unsigned long)round_num, ic); |
|
190 | 191 |
logger::print(msg); |
191 | 192 |
logger::print(col_base); |
192 | 193 |
col = 0; |
... | ... |
@@ -442,7 +443,6 @@ class motif_trainer |
442 | 443 |
else i_start = i_end = j; |
443 | 444 |
|
444 | 445 |
for (i = i_start; i <= i_end; ++i) { |
445 |
- |
|
446 | 446 |
/* skip existing edges */ |
447 | 447 |
if (M.has_edge(i, j)) continue; |
448 | 448 |
|
... | ... |
@@ -800,7 +800,7 @@ motif::motif(const YAML::Node& node) |
800 | 800 |
/* m */ |
801 | 801 |
unsigned int m_; |
802 | 802 |
node["m"] >> m_; |
803 |
- m = (size_t) m; |
|
803 |
+ m = (size_t) m_; |
|
804 | 804 |
|
805 | 805 |
|
806 | 806 |
/* parents */ |
... | ... |
@@ -809,8 +809,10 @@ motif::motif(const YAML::Node& node) |
809 | 809 |
const YAML::Node& parents_node = node["parents"]; |
810 | 810 |
|
811 | 811 |
size_t i; |
812 |
- for (i = 0; i < m; ++i) { |
|
813 |
- parents_node[i] >> parents[i]; |
|
812 |
+ int b; |
|
813 |
+ for (i = 0; i < m * m; ++i) { |
|
814 |
+ parents_node[i] >> b; |
|
815 |
+ parents[i] = (bool) b; |
|
814 | 816 |
} |
815 | 817 |
|
816 | 818 |
|
... | ... |
@@ -840,7 +842,7 @@ void motif::to_yaml(YAML::Emitter& out) const |
840 | 842 |
out << YAML::Flow << YAML::BeginSeq; |
841 | 843 |
|
842 | 844 |
size_t i; |
843 |
- for (i = 0; i < m * m; ++i) out << parents[i]; |
|
845 |
+ for (i = 0; i < m * m; ++i) out << (parents[i] ? 1 : 0); |
|
844 | 846 |
|
845 | 847 |
out << YAML::EndSeq; |
846 | 848 |
|
... | ... |
@@ -860,6 +862,45 @@ void motif::to_yaml(YAML::Emitter& out) const |
860 | 862 |
} |
861 | 863 |
|
862 | 864 |
|
865 |
+string motif::model_graph(int offset) const |
|
866 |
+{ |
|
867 |
+ string graph_str; |
|
868 |
+ char strbuf[512]; |
|
869 |
+ |
|
870 |
+ graph_str += "digraph {\n"; |
|
871 |
+ graph_str += "splines=\"true\";\n"; |
|
872 |
+ graph_str += "node [shape=\"box\"];\n"; |
|
873 |
+ |
|
874 |
+ /* print nodes */ |
|
875 |
+ size_t i, j; |
|
876 |
+ for (j = 0; j < m; j++) { |
|
877 |
+ snprintf(strbuf, sizeof(strbuf), |
|
878 |
+ "n%d [label=\"%d\",pos=\"%d,0\",style=\"%s\"];\n", |
|
879 |
+ (int) j, (int) j - offset, (int) j * 100, |
|
880 |
+ parents[j * m + j] ? "solid" : "dotted"); |
|
881 |
+ |
|
882 |
+ graph_str += strbuf; |
|
883 |
+ } |
|
884 |
+ |
|
885 |
+ /* print edges */ |
|
886 |
+ for (j = 0; j < m; ++j) { |
|
887 |
+ if (!parents[j * m + j]) continue; |
|
888 |
+ |
|
889 |
+ for (i = 0; i < m; ++i) { |
|
890 |
+ if (i == j) continue; |
|
891 |
+ if (parents[j * m + i]) { |
|
892 |
+ snprintf(strbuf, sizeof(strbuf), |
|
893 |
+ "n%lu -> n%lu;\n", (unsigned long) i, (unsigned long) j); |
|
894 |
+ graph_str += strbuf; |
|
895 |
+ } |
|
896 |
+ } |
|
897 |
+ } |
|
898 |
+ |
|
899 |
+ graph_str += "}\n"; |
|
900 |
+ return graph_str; |
|
901 |
+} |
|
902 |
+ |
|
903 |
+ |
|
863 | 904 |
double motif::eval(const twobitseq& seq, size_t offset) |
864 | 905 |
{ |
865 | 906 |
double ll0 = 0.0; |
... | ... |
@@ -45,6 +45,10 @@ class motif |
45 | 45 |
/** emit yaml serialization */ |
46 | 46 |
void to_yaml(YAML::Emitter& out) const; |
47 | 47 |
|
48 |
+ /** print the model graph in GraphViz (dot) format, |
|
49 |
+ * adjusting position labels by the given offset. */ |
|
50 |
+ std::string model_graph(int offset) const; |
|
51 |
+ |
|
48 | 52 |
private: |
49 | 53 |
motif(); |
50 | 54 |
|
... | ... |
@@ -153,8 +153,13 @@ SEXP seqbias_predict( SEXP seqbias, |
153 | 153 |
pos_t c_start, c_end; |
154 | 154 |
strand_t c_strand; |
155 | 155 |
|
156 |
+ |
|
156 | 157 |
coerce_genomic_coords( seqname, start, end, strand, |
157 | 158 |
&c_seqname, &c_start, &c_end, &c_strand ); |
159 |
+ |
|
160 |
+ /* we expect 1-based coordinates, but work with 0-based */ |
|
161 |
+ c_start -= 1; |
|
162 |
+ c_end -= 1; |
|
158 | 163 |
|
159 | 164 |
if( c_strand != 0 && c_strand != 1 ) { |
160 | 165 |
warning( "strand should be '+' or '-'" ); |
... | ... |
@@ -217,11 +222,13 @@ SEXP seqbias_open_bam( SEXP reads_fn ) |
217 | 222 |
} |
218 | 223 |
|
219 | 224 |
|
220 |
-SEXP seqbias_count_reads( SEXP bam_ptr, |
|
225 |
+SEXP seqbias_count_reads( SEXP seqbias, |
|
226 |
+ SEXP bam_ptr, |
|
221 | 227 |
SEXP seqname, |
222 | 228 |
SEXP start, |
223 | 229 |
SEXP end, |
224 |
- SEXP strand ) |
|
230 |
+ SEXP strand, |
|
231 |
+ SEXP sum_counts ) |
|
225 | 232 |
{ |
226 | 233 |
if( TYPEOF(bam_ptr) != EXTPTRSXP ) error( "argument is not a indexed bam pointer" ); |
227 | 234 |
indexed_bam_f* c_bam_ptr = (indexed_bam_f*)EXTPTR_PTR( bam_ptr ); |
... | ... |
@@ -232,13 +239,52 @@ SEXP seqbias_count_reads( SEXP bam_ptr, |
232 | 239 |
|
233 | 240 |
coerce_genomic_coords( seqname, start, end, strand, |
234 | 241 |
&c_seqname, &c_start, &c_end, &c_strand ); |
242 |
+ |
|
243 |
+ |
|
244 |
+ /* we expect 1-based coordinates, but use 0-based */ |
|
245 |
+ c_start -= 1; |
|
246 |
+ c_end -= 1; |
|
235 | 247 |
|
236 | 248 |
|
249 |
+ double* bs[2] = {NULL, NULL}; |
|
250 |
+ |
|
251 |
+ if( !isNull(seqbias) ) { |
|
252 |
+ |
|
253 |
+ SEXP idx; |
|
254 |
+ PROTECT( idx = allocVector( STRSXP, 1 ) ); |
|
255 |
+ SET_STRING_ELT(idx, 0, mkChar("ptr")); |
|
256 |
+ |
|
257 |
+ sequencing_bias* c_seqbias = NULL; |
|
258 |
+ if( TYPEOF(GET_SLOT(seqbias, idx)) != EXTPTRSXP || |
|
259 |
+ !(c_seqbias = (sequencing_bias*)EXTPTR_PTR(GET_SLOT(seqbias, idx))) ) { |
|
260 |
+ error( "first argument is not a proper seqbias class." ); |
|
261 |
+ } |
|
262 |
+ |
|
263 |
+ if (c_strand == strand_na || c_strand == strand_pos ) { |
|
264 |
+ bs[0] = c_seqbias->get_bias(c_seqname, c_start, c_end, strand_pos); |
|
265 |
+ } |
|
266 |
+ |
|
267 |
+ if (c_strand == strand_na || c_strand == strand_neg ) { |
|
268 |
+ bs[1] = c_seqbias->get_bias(c_seqname, c_start, c_end, strand_neg); |
|
269 |
+ std::reverse(bs[1], bs[1] + (c_end - c_start + 1)); |
|
270 |
+ } |
|
271 |
+ |
|
272 |
+ UNPROTECT(1); |
|
273 |
+ } |
|
274 |
+ |
|
275 |
+ bool c_sum_counts = asLogical(sum_counts) == TRUE; |
|
276 |
+ |
|
237 | 277 |
/* init vector */ |
238 | 278 |
SEXP v; |
239 |
- PROTECT( v = allocVector( REALSXP, c_end - c_start + 1 ) ); |
|
240 |
- pos_t i; |
|
241 |
- for( i = 0; i < c_end - c_start + 1; i++ ) REAL(v)[i] = 0; |
|
279 |
+ if (c_sum_counts) { |
|
280 |
+ PROTECT( v = allocVector( REALSXP, 1 ) ); |
|
281 |
+ REAL(v)[0] = 0; |
|
282 |
+ } |
|
283 |
+ else { |
|
284 |
+ PROTECT( v = allocVector( REALSXP, c_end - c_start + 1 ) ); |
|
285 |
+ pos_t i; |
|
286 |
+ for( i = 0; i < c_end - c_start + 1; i++ ) REAL(v)[i] = 0; |
|
287 |
+ } |
|
242 | 288 |
|
243 | 289 |
|
244 | 290 |
const size_t region_len = 1024; |
... | ... |
@@ -261,26 +307,41 @@ SEXP seqbias_count_reads( SEXP bam_ptr, |
261 | 307 |
bam_iter_t it = bam_iter_query( c_bam_ptr->idx, bam_ref_id, bam_start, bam_end ); |
262 | 308 |
bam1_t* b = bam_init1(); |
263 | 309 |
pos_t x; |
310 |
+ strand_t s; |
|
311 |
+ |
|
264 | 312 |
|
265 | 313 |
while( bam_iter_read( c_bam_ptr->f->x.bam, it, b ) > 0 ) { |
266 |
- if( bam1_strand(b) != c_strand ) continue; |
|
314 |
+ s = (strand_t) bam1_strand(b); |
|
315 |
+ if( c_strand != strand_na && s != c_strand ) continue; |
|
316 |
+ |
|
267 | 317 |
x = bam1_strand(b) == 1 ? bam_calend( &b->core, bam1_cigar(b) ) - 1 : b->core.pos; |
268 |
- if( c_start <= x && x <= c_end ) REAL(v)[x - c_start]++; |
|
318 |
+ if (x < c_start || x > c_end) continue; |
|
319 |
+ |
|
320 |
+ if (c_sum_counts) { |
|
321 |
+ REAL(v)[0] += bs[s] == NULL ? 1.0 : (1.0 / bs[s][x - c_start]); |
|
322 |
+ } |
|
323 |
+ else { |
|
324 |
+ REAL(v)[x - c_start] += bs[s] == NULL ? 1.0 : (1.0 / bs[s][x - c_start]); |
|
325 |
+ } |
|
269 | 326 |
} |
270 | 327 |
|
271 |
- if( c_strand == strand_neg ) { |
|
328 |
+ if( c_strand == strand_neg && !c_sum_counts ) { |
|
272 | 329 |
std::reverse(REAL(v), REAL(v) + (c_end - c_start + 1)); |
273 | 330 |
} |
274 | 331 |
|
275 | 332 |
bam_iter_destroy( it ); |
276 | 333 |
bam_destroy1(b); |
277 | 334 |
|
335 |
+ delete [] bs[0]; |
|
336 |
+ delete [] bs[1]; |
|
337 |
+ |
|
278 | 338 |
UNPROTECT(1); |
279 | 339 |
return v; |
280 | 340 |
} |
281 | 341 |
|
282 | 342 |
|
283 | 343 |
|
344 |
+ |
|
284 | 345 |
void dealloc_kmer_matrix( SEXP M ) |
285 | 346 |
{ |
286 | 347 |
if( TYPEOF(M) != EXTPTRSXP ) error( "argument is not a kmer_matrix pointer" ); |
... | ... |
@@ -433,7 +494,7 @@ void R_init_seqbias( DllInfo* info ) |
433 | 494 |
{ "seqbias_load", (DL_FUNC) &seqbias_load, 2 }, |
434 | 495 |
{ "seqbias_save", (DL_FUNC) &seqbias_save, 2 }, |
435 | 496 |
{ "seqbias_open_bam", (DL_FUNC) &seqbias_open_bam, 1 }, |
436 |
- { "seqbias_count_reads", (DL_FUNC) &seqbias_count_reads, 5 }, |
|
497 |
+ { "seqbias_count_reads", (DL_FUNC) &seqbias_count_reads, 7 }, |
|
437 | 498 |
{ "seqbias_alloc_kmer_matrix", (DL_FUNC) &seqbias_alloc_kmer_matrix, 2 }, |
438 | 499 |
{ "seqbias_tally_kmers", (DL_FUNC) &seqbias_tally_kmers, 4 }, |
439 | 500 |
{ "sebqias_dataframe_from_kmer_matrix", (DL_FUNC) &seqbias_dataframe_from_kmer_matrix, 2 }, |
... | ... |
@@ -11,6 +11,7 @@ |
11 | 11 |
#include "miscmath.h" |
12 | 12 |
#include "samtools_extra.h" |
13 | 13 |
#include "samtools/faidx.h" |
14 |
+#include <climits> |
|
14 | 15 |
#include <cmath> |
15 | 16 |
#include <cctype> |
16 | 17 |
#include <ctime> |
... | ... |
@@ -55,17 +56,6 @@ static double rand_gauss(double sigma) |
55 | 56 |
} |
56 | 57 |
|
57 | 58 |
|
58 |
-static double rand_trunc_gauss(double sigma, double a, double b) |
|
59 |
-{ |
|
60 |
- double x; |
|
61 |
- do { |
|
62 |
- x = rand_gauss(sigma); |
|
63 |
- } while (x < a || x > b); |
|
64 |
- |
|
65 |
- return x; |
|
66 |
-} |
|
67 |
- |
|
68 |
- |
|
69 | 59 |
double gauss_pdf (const double x, const double sigma) |
70 | 60 |
{ |
71 | 61 |
double u = x / fabs (sigma); |
... | ... |
@@ -86,25 +76,30 @@ static int read_pos_count_compare(const void* p1, const void* p2) |
86 | 76 |
} |
87 | 77 |
|
88 | 78 |
|
89 |
-/* |
|
90 |
-static int read_pos_tid_count_compare(const void* p1, const void* p2) |
|
91 |
-{ |
|
92 |
- int c = (int)(((read_pos*)p1)->tid - ((read_pos*)p2)->tid); |
|
93 |
- |
|
94 |
- if (c == 0) { |
|
95 |
- return (int)((read_pos*)p2)->count - (int)((read_pos*)p1)->count; |
|
96 |
- } |
|
97 |
- else return c; |
|
98 |
-} |
|
99 |
-*/ |
|
100 |
- |
|
101 |
- |
|
102 | 79 |
|
103 | 80 |
sequencing_bias::sequencing_bias() |
104 | 81 |
: ref_f(NULL) |
105 | 82 |
, M(NULL) |
106 | 83 |
{} |
107 | 84 |
|
85 |
+ |
|
86 |
+sequencing_bias::sequencing_bias(const char* model_fn) |
|
87 |
+ : ref_f(NULL) |
|
88 |
+ , M(NULL) |
|
89 |
+{ |
|
90 |
+ std::ifstream fin; |
|
91 |
+ fin.open(model_fn); |
|
92 |
+ |
|
93 |
+ YAML::Parser parser(fin); |
|
94 |
+ YAML::Node doc; |
|
95 |
+ parser.GetNextDocument(doc); |
|
96 |
+ |
|
97 |
+ doc["L"] >> L; |
|
98 |
+ doc["R"] >> R; |
|
99 |
+ |
|
100 |
+ M = new motif(doc["motif"]); |
|
101 |
+} |
|
102 |
+ |
|
108 | 103 |
sequencing_bias::sequencing_bias(const char* ref_fn, |
109 | 104 |
const char* model_fn) |
110 | 105 |
: ref_f(NULL) |
... | ... |
@@ -122,12 +117,15 @@ sequencing_bias::sequencing_bias(const char* ref_fn, |
122 | 117 |
|
123 | 118 |
M = new motif(doc["motif"]); |
124 | 119 |
|
125 |
- this->ref_fn = ref_fn; |
|
126 | 120 |
|
127 |
- ref_f = fai_load(ref_fn); |
|
128 |
- if (ref_f == NULL) { |
|
129 |
- logger::abort("Can't open indexed FASTA file %s\n", ref_fn); |
|
121 |
+ if (ref_fn != NULL) { |
|
122 |
+ this->ref_fn = ref_fn; |
|
123 |
+ ref_f = fai_load(ref_fn); |
|
124 |
+ if (ref_f == NULL) { |
|
125 |
+ logger::abort("Can't open indexed FASTA file %s\n", ref_fn); |
|
126 |
+ } |
|
130 | 127 |
} |
128 |
+ else ref_f = NULL; |
|
131 | 129 |
} |
132 | 130 |
|
133 | 131 |
|
... | ... |
@@ -233,7 +231,6 @@ void sequencing_bias::build(const char* ref_fn, |
233 | 231 |
|
234 | 232 |
while (samread(reads_f, read) > 0) { |
235 | 233 |
if (read->core.n_cigar != 1) continue; |
236 |
- if (read->core.flag & BAM_FREAD2) continue; |
|
237 | 234 |
k++; |
238 | 235 |
if (k % 1000000 == 0) { |
239 | 236 |
logger::info("hashed %zu reads.", k); |
... | ... |
@@ -265,7 +262,7 @@ void sequencing_bias::build(const char* ref_fn, |
265 | 262 |
this->L = L; |
266 | 263 |
this->R = R; |
267 | 264 |
|
268 |
- unsigned int i; |
|
265 |
+ unsigned int i = 0; |
|
269 | 266 |
|
270 | 267 |
read_pos* S_tmp; |
271 | 268 |
read_pos* S; |
... | ... |
@@ -273,34 +270,12 @@ void sequencing_bias::build(const char* ref_fn, |
273 | 270 |
const size_t max_dump = 10000000; |
274 | 271 |
pos_table_dump(T, &S_tmp, &N, max_dump); |
275 | 272 |
|
276 |
- /* sort by count */ |
|
277 |
- qsort(S_tmp, N, sizeof(read_pos), read_pos_count_compare); |
|
278 |
- |
|
279 |
- /* consider only reads with at least one duplicate */ |
|
280 |
- for (i = 0; i < N; ++i) { |
|
281 |
- if (S_tmp[i].count <= 1) break; |
|
282 |
- } |
|
283 |
- |
|
284 |
- /* (unless there are very few of these reads */ |
|
285 |
- if (i > 10000) { |
|
286 |
- max_reads = std::min<size_t>(max_reads, i); |
|
287 |
- logger::info("%zu reads with duplicates.", i); |
|
288 |
- } |
|
289 |
- else { |
|
290 |
- i = N; |
|
291 |
- } |
|
292 |
- |
|
293 |
- |
|
294 |
- /* ignore the top 1%, as they tend to be vastly higher than anything else, |
|
295 |
- * and thus can throw things off when training with small numbers of reads |
|
296 |
- * */ |
|
297 |
- S = S_tmp + i/100; |
|
298 |
- max_reads = min<size_t>(max_reads, 99*i/100); |
|
273 |
+ S = S_tmp; |
|
299 | 274 |
|
300 | 275 |
/* sort by tid (so we can load one chromosome at a time) */ |
276 |
+ random_shuffle(S, S + N); |
|
301 | 277 |
qsort(S, std::min<size_t>(max_reads, N), sizeof(read_pos), read_pos_tid_compare); |
302 | 278 |
|
303 |
- |
|
304 | 279 |
/* sample foreground and background kmer frequencies */ |
305 | 280 |
ref_f = fai_load(ref_fn); |
306 | 281 |
if (ref_f == NULL) { |
... | ... |
@@ -337,7 +312,7 @@ void sequencing_bias::build(const char* ref_fn, |
337 | 312 |
seqname = T->seq_names[ S[i].tid ]; |
338 | 313 |
if (seq) free(seq); |
339 | 314 |
|
340 |
- seq = faidx_fetch_seq(ref_f, seqname, 0, 1000000000, &seqlen); |
|
315 |
+ seq = faidx_fetch_seq(ref_f, seqname, 0, INT_MAX, &seqlen); |
|
341 | 316 |
|
342 | 317 |
logger::info("read sequence %s.", seqname); |
343 | 318 |
|
... | ... |
@@ -374,7 +349,7 @@ void sequencing_bias::build(const char* ref_fn, |
374 | 349 |
/* adjust the current read position randomly, and sample */ |
375 | 350 |
for (bg_sample_num = 0; bg_sample_num < bg_samples;) { |
376 | 351 |
|
377 |
- bg_pos = S[i].pos + (pos_t)round_away(rand_trunc_gauss(12, -200, 200)); |
|
352 |
+ bg_pos = S[i].pos + (pos_t)round_away(rand_gauss(500)); |
|
378 | 353 |
|
379 | 354 |
if (S[i].strand == strand_neg) { |
380 | 355 |
if (bg_pos < R || bg_pos >= seqlen - L) continue; |
... | ... |
@@ -464,17 +439,201 @@ double* sequencing_bias::get_bias(const char* seqname, |
464 | 439 |
bs[i] = M->eval(seq, i); |
465 | 440 |
} |
466 | 441 |
|
467 |
- |
|
468 | 442 |
free(seqstr); |
469 | 443 |
return bs; |
470 | 444 |
} |
471 | 445 |
|
472 | 446 |
|
473 |
-string sequencing_bias::print_model_graph() |
|
447 |
+string sequencing_bias::model_graph() const |
|
448 |
+{ |
|
449 |
+ return M->model_graph((int) L); |
|
450 |
+} |
|
451 |
+ |
|
452 |
+ |
|
453 |
+kmer_matrix tabulate_bias(double* kl, |
|
454 |
+ pos_t L, pos_t R, int k, |
|
455 |
+ const char* ref_fn, |
|
456 |
+ const char* reads_fn, |
|
457 |
+ const char* model_fn) |
|
474 | 458 |
{ |
475 |
- // TODO |
|
476 |
- //return M->print_model_graph(L); |
|
477 |
- return string(); |
|
459 |
+ /* This function procedes very much like the sequencing_bias::build |
|
460 |
+ * function, but does not finally train a motif, but rather tabulates |
|
461 |
+ * k-mer frequencies. */ |
|
462 |
+ size_t max_reads = 250000; |
|
463 |
+ |
|
464 |
+ kmer_matrix dest((size_t) (L + 1 + R), (size_t) k); |
|
465 |
+ dest.set_all(0.0); |
|
466 |
+ |
|
467 |
+ faidx_t* ref_f = fai_load(ref_fn); |
|
468 |
+ if (ref_f == NULL) { |
|
469 |
+ logger::abort("Can't open fasta file '%s'.", ref_fn); |
|
470 |
+ } |
|
471 |
+ |
|
472 |
+ samfile_t* reads_f = samopen(reads_fn, "rb", NULL); |
|
473 |
+ if (reads_f == NULL) { |
|
474 |
+ logger::abort("Can't open bam file '%s'.", reads_fn); |
|
475 |
+ } |
|
476 |
+ |
|
477 |
+ bam_index_t* reads_index = bam_index_load(reads_fn); |
|
478 |
+ if (reads_index == NULL) { |
|
479 |
+ logger::abort("Can't open bam index '%s.bai'.", reads_fn); |
|
480 |
+ } |
|
481 |
+ |
|
482 |
+ sequencing_bias* sb = NULL; |
|
483 |
+ if (model_fn != NULL) { |
|
484 |
+ sb = new sequencing_bias(ref_fn, model_fn); |
|
485 |
+ } |
|
486 |
+ |
|
487 |
+ |
|
488 |
+ bam_init_header_hash(reads_f->header); |
|
489 |
+ bam1_t* read = bam_init1(); |
|
490 |
+ |
|
491 |
+ pos_table T; |
|
492 |
+ pos_table_create(&T, reads_f->header->n_targets); |
|
493 |
+ T.seq_names = reads_f->header->target_name; |
|
494 |
+ |
|
495 |
+ size_t hashed_count = 0; |
|
496 |
+ while (samread(reads_f, read) > 0) { |
|
497 |
+ if (read->core.n_cigar != 1) continue; |
|
498 |
+ if (++hashed_count % 1000000 == 0) { |
|
499 |
+ logger::info("hashed %zu reads.", hashed_count); |
|
500 |
+ } |
|
501 |
+ pos_table_inc(&T, read); |
|
502 |
+ } |
|
503 |
+ logger::info("hashed %zu reads.", hashed_count); |
|
504 |
+ |
|
505 |
+ read_pos* S_tmp; |
|
506 |
+ read_pos* S; |
|
507 |
+ size_t N; |
|
508 |
+ const size_t max_dump = 10000000; |
|
509 |
+ pos_table_dump(&T, &S_tmp, &N, max_dump); |
|
510 |
+ |
|
511 |
+ /* sort by count */ |
|
512 |
+ qsort(S_tmp, N, sizeof(read_pos), read_pos_count_compare); |
|
513 |
+ |
|
514 |
+ /* consider only reads with at least one duplicate */ |
|
515 |
+ size_t i; |
|
516 |
+ for (i = 0; i < N; ++i) { |
|
517 |
+ if (S_tmp[i].count <= 1) break; |
|
518 |
+ } |
|
519 |
+ |
|
520 |
+ /* (unless there are very few of these reads */ |
|
521 |
+ if (i > 10000) { |
|
522 |
+ max_reads = std::min<size_t>(max_reads, i); |
|
523 |
+ logger::info("%zu reads with duplicates.", i); |
|
524 |
+ } |
|
525 |
+ else { |
|
526 |
+ i = N; |
|
527 |
+ } |
|
528 |
+ |
|
529 |
+ /* ignore the top 1%, as they tend to be vastly higher than anything else, |
|
530 |
+ * and thus can throw things off when training with small numbers of reads |
|
531 |
+ * */ |
|
532 |
+ S = S_tmp + i/100; |
|
533 |
+ max_reads = min<size_t>(max_reads, 99*i/100); |
|
534 |
+ |
|
535 |
+ /* sort by tid (so we can load one chromosome at a time) */ |
|
536 |
+ qsort(S, std::min<size_t>(max_reads, N), sizeof(read_pos), read_pos_tid_compare); |
|
537 |
+ |
|
538 |
+ twobitseq tbs; |
|
539 |
+ |
|
540 |
+ char* local_seq; |
|
541 |
+ local_seq = new char[ (k - 1) + L + 1 + R + 1 ]; |
|
542 |
+ local_seq[(k - 1) + L + 1 + R] = '\0'; |
|
543 |
+ |
|
544 |
+ double w; |
|
545 |
+ char* seq = NULL; |
|
546 |
+ int seqlen = 0; |
|
547 |
+ int curr_tid = -1; |
|
548 |
+ char* seqname = NULL; |
|
549 |
+ |
|
550 |
+ for (i = 0; i < N && i < max_reads; i++) { |
|
551 |
+ if (S[i].tid != curr_tid) { |
|
552 |
+ seqname = T.seq_names[S[i].tid]; |
|
553 |
+ free(seq); |
|
554 |
+ seq = faidx_fetch_seq(ref_f, seqname, 0, INT_MAX, &seqlen); |
|
555 |
+ logger::info("read sequence %s.", seqname); |
|
556 |
+ curr_tid = S[i].tid; |
|
557 |
+ |
|
558 |
+ if (seq == NULL) { |
|
559 |
+ logger::warn("warning: reference sequence not found, skipping."); |
|
560 |
+ } |
|
561 |
+ } |
|
562 |
+ |
|
563 |
+ if (seq == NULL) continue; |
|
564 |
+ |
|
565 |
+ if (S[i].strand == strand_neg) { |
|
566 |
+ if (S[i].pos < R || S[i].pos >= seqlen - L - (k-1)) continue; |
|
567 |
+ memcpy(local_seq, seq + S[i].pos - R, ((k-1)+L+1+R)*sizeof(char)); |
|
568 |
+ seqrc(local_seq, L+1+R); |
|
569 |
+ } |
|
570 |
+ else { |
|
571 |
+ if (S[i].pos < L + (k-1) || S[i].pos >= seqlen - R) continue; |
|
572 |
+ memcpy(local_seq, seq + (S[i].pos - L - (k-1)), ((k-1)+L+1+R)*sizeof(char)); |
|
573 |
+ } |
|
574 |
+ |
|
575 |
+ if (sb) { |
|
576 |
+ // TODO: get_bias |
|
577 |
+ } |
|
578 |
+ else w = 1.0; |
|
579 |
+ |
|
580 |
+ tbs = local_seq; |
|
581 |
+ kmer K; |
|
582 |
+ for (pos_t pos = (k-1); pos < (k-1) + L + 1 + R; ++pos) { |
|
583 |
+ K = tbs.get_kmer(k, pos); |
|
584 |
+ dest(pos - (k-1), K) += w; |
|
585 |
+ } |
|
586 |
+ } |
|
587 |
+ |
|
588 |
+ /* compute KL divergence */ |
|
589 |
+ |
|
590 |
+ /* estimate a background distribution by averaging across the entire window |
|
591 |
+ * */ |
|
592 |
+ int four_to_k = 1 << (2 * k); |
|
593 |
+ double* bg = new double[four_to_k]; |
|
594 |
+ memset(bg, 0, four_to_k * sizeof(double)); |
|
595 |
+ for (pos_t pos = 0; pos < L + 1 + R; ++pos) { |
|
596 |
+ for (kmer K = 0; K < (kmer) four_to_k; ++K) { |
|
597 |
+ bg[K] += dest(pos, K); |
|
598 |
+ } |
|
599 |
+ } |
|
600 |
+ |
|
601 |
+ kmer_matrix norm_dest(dest); |
|
602 |
+ norm_dest.make_distribution(); |
|
603 |
+ |
|
604 |
+ double z = 0.0; |
|
605 |
+ for (kmer K = 0; K < (kmer) four_to_k; ++K) z += bg[K]; |
|
606 |
+ for (kmer K = 0; K < (kmer) four_to_k; ++K) bg[K] /= z; |
|
607 |
+ |
|
608 |
+ |
|
609 |
+ /* Compute the (symetric) kullback-leibler divegnnce */ |
|
610 |
+ memset(kl, 0, (L + 1 + R) * sizeof(double)); |
|
611 |
+ for (pos_t pos = 0; pos < L + 1 + R; ++pos) { |
|
612 |
+ kl[pos] = 0.0; |
|
613 |
+ for (kmer K = 0; K < (kmer) four_to_k; ++K) { |
|
614 |
+ if (norm_dest(pos, K) > 0.0) { |
|
615 |
+ kl[pos] += norm_dest(pos, K) * (log2(norm_dest(pos, K)) - log2(bg[K])); |
|
616 |
+ } |
|
617 |
+ |
|
618 |
+ if (bg[K] > 0.0) { |
|
619 |
+ kl[pos] += bg[K] * (log2(bg[K]) - log2(norm_dest(pos, K))); |
|
620 |
+ } |
|
621 |
+ } |
|
622 |
+ } |
|
623 |
+ |
|
624 |
+ delete [] bg; |
|
625 |
+ |
|
626 |
+ |
|
627 |
+ free(seq); |
|
628 |
+ free(local_seq); |
|
629 |
+ free(S_tmp); |
|
630 |
+ bam_destroy1(read); |
|
631 |
+ pos_table_destroy(&T); |
|
632 |
+ delete sb; |
|
633 |
+ bam_index_destroy(reads_index); |
|
634 |
+ samclose(reads_f); |
|
635 |
+ |
|
636 |
+ return dest; |
|
478 | 637 |
} |
479 | 638 |
|
480 | 639 |
|
... | ... |
@@ -21,7 +21,13 @@ |
21 | 21 |
class sequencing_bias |
22 | 22 |
{ |
23 | 23 |
public: |
24 |
- /** Load a model that has bee previously trained. */ |
|
24 |
+ /** Load a model that has been previously trained, without a reference |
|
25 |
+ * sequence. (It can not be actually used this way.) |
|
26 |
+ */ |
|
27 |
+ sequencing_bias(const char* model_fn); |
|
28 |
+ |
|
29 |
+ |
|
30 |
+ /** Load a model that has been previously trained. */ |
|
25 | 31 |
sequencing_bias(const char* ref_fn, |
26 | 32 |
const char* model_fn); |
27 | 33 |
|
... | ... |
@@ -74,7 +80,7 @@ class sequencing_bias |
74 | 80 |
void to_yaml(YAML::Emitter&) const; |
75 | 81 |
|
76 | 82 |
/** Return a string of the model graph in dot format. */ |
77 |
- std::string print_model_graph(); |
|
83 |
+ std::string model_graph() const; |
|
78 | 84 |
|
79 | 85 |
|
80 | 86 |
private: |
... | ... |
@@ -107,4 +113,13 @@ class sequencing_bias |
107 | 113 |
}; |
108 | 114 |
|
109 | 115 |
|
116 |
+/** Tabulate the bias in a given dataset, optionally correcting for bias using |
|
117 |
+ * the given model. */ |
|
118 |
+kmer_matrix tabulate_bias(double* kl, |
|
119 |
+ pos_t L, pos_t R, int k, |
|
120 |
+ const char* ref_fn, |
|
121 |
+ const char* reads_fn, |
|
122 |
+ const char* model_fn = NULL); |
|
123 |
+ |
|
124 |
+ |
|
110 | 125 |
#endif |
... | ... |
@@ -8,10 +8,8 @@ |
8 | 8 |
#include "twobitseq.hpp" |
9 | 9 |
#include <algorithm> |
10 | 10 |
#include <cstring> |
11 |
+#include <cstdlib> |
|
11 | 12 |
|
12 |
-/* Don't expose this function. No one should have to know how we are mapping |
|
13 |
- * letters to numbers. |
|
14 |
- */ |
|
15 | 13 |
kmer nuc_to_num(char c) |
16 | 14 |
{ |
17 | 15 |
switch( c ) { |
... | ... |
@@ -26,36 +24,48 @@ kmer nuc_to_num(char c) |
26 | 24 |
} |
27 | 25 |
} |
28 | 26 |
|
29 |
-void num_to_nuc(char* dest, int n, int k) |
|
27 |
+ |
|
28 |
+ |
|
29 |
+void num_to_nuc(char* dest, kmer K, int k) |
|
30 | 30 |
{ |
31 | 31 |
int i; |
32 | 32 |
/* read backwards, then reverse */ |
33 |
- for( i = 0; i < k; i++ ) { |
|
34 |
- switch( n & 0x3 ) { |
|
33 |
+ for (i = 0; i < k; ++i) { |
|
34 |
+ switch (K & 0x3) { |
|
35 | 35 |
case 0: dest[i] = 'a'; break; |
36 | 36 |
case 1: dest[i] = 'c'; break; |
37 | 37 |
case 2: dest[i] = 'g'; break; |
38 | 38 |
case 3: dest[i] = 't'; break; |
39 | 39 |
} |
40 |
- n >>= 2; |
|
40 |
+ |
|
41 |
+ K >>= 2; |
|
41 | 42 |
} |
42 |
- dest[i] = '\0'; |
|
43 | 43 |
|
44 |
+ dest[i] = '\0'; |
|
44 | 45 |
std::reverse(dest, dest + i); |
45 | 46 |
} |
46 | 47 |
|
47 | 48 |
|
48 | 49 |
const size_t twobitseq::max_kmer = 4 * sizeof(kmer); |
49 | 50 |
|
51 |
+twobitseq::twobitseq() |
|
52 |
+ : xs(NULL) |
|
53 |
+ , n(0) |
|
54 |
+{ |
|
55 |
+ |
|
56 |
+} |
|
57 |
+ |
|
50 | 58 |
|
51 | 59 |
twobitseq::twobitseq(const char* seq) |
52 | 60 |
: xs(NULL) |
53 | 61 |
, n(0) |
54 | 62 |
{ |
63 |
+ if (seq == NULL) return; |
|
55 | 64 |
n = strlen(seq); |
56 | 65 |
if (n == 0) return; |
57 | 66 |
|
58 |
- xs = new kmer[n / max_kmer + 1]; |
|
67 |
+ xs = reinterpret_cast<kmer*>( |
|
68 |
+ malloc_or_die((n / max_kmer + 1) * sizeof(kmer))); |
|
59 | 69 |
memset(xs, 0, (n / max_kmer + 1) * sizeof(kmer)); |
60 | 70 |
|
61 | 71 |
size_t i; |
... | ... |
@@ -77,7 +87,8 @@ twobitseq::twobitseq(const twobitseq& other) |
77 | 87 |
n = other.n; |
78 | 88 |
if (n == 0) return; |
79 | 89 |
|
80 |
- xs = new kmer[n / max_kmer + 1]; |
|
90 |
+ xs = reinterpret_cast<kmer*>( |
|
91 |
+ malloc_or_die((n / max_kmer + 1) * sizeof(kmer))); |
|
81 | 92 |
memcpy(xs, other.xs, (n / max_kmer + 1) * sizeof(kmer)); |
82 | 93 |
} |
83 | 94 |
|
... | ... |
@@ -85,20 +96,60 @@ twobitseq::twobitseq(const twobitseq& other) |
85 | 96 |
|
86 | 97 |
twobitseq::~twobitseq() |
87 | 98 |
{ |
88 |
- delete [] xs; |
|
99 |
+ free(xs); |
|
89 | 100 |
} |
90 | 101 |
|
91 | 102 |
|
92 | 103 |
void twobitseq::operator = (const twobitseq& other) |
93 | 104 |
{ |
94 | 105 |
n = other.n; |
95 |
- |
|
96 |
- delete [] xs; |
|
97 |
- xs = new kmer[n / max_kmer + 1]; |
|
106 |
+ xs = reinterpret_cast<kmer*>( |
|
107 |
+ realloc_or_die(xs, (n / max_kmer + 1) * sizeof(kmer))); |
|
98 | 108 |
memcpy(xs, other.xs, (n / max_kmer + 1) * sizeof(kmer)); |
99 | 109 |
} |
100 | 110 |
|
101 | 111 |
|
112 |
+void twobitseq::operator = (const char* seq) |
|
113 |
+{ |
|
114 |
+ if (seq == NULL) { |
|
115 |
+ n = 0; |
|
116 |
+ free(xs); |
|
117 |
+ xs = NULL; |
|
118 |
+ return; |
|
119 |
+ } |
|
120 |
+ |
|
121 |
+ n = strlen(seq); |
|
122 |
+ xs = reinterpret_cast<kmer*>( |
|
123 |
+ realloc_or_die(xs, (n / max_kmer + 1) * sizeof(kmer))); |
|
124 |
+ memset(xs, 0, (n / max_kmer + 1) * sizeof(kmer)); |
|
125 |
+ |
|
126 |
+ size_t i; |
|
127 |
+ size_t block, offset; |
|
128 |
+ for (i = 0; i < n; ++i) { |
|
129 |
+ block = i / max_kmer; |
|
130 |
+ offset = i % max_kmer; |
|
131 |
+ |
|
132 |
+ xs[block] = xs[block] | (nuc_to_num(seq[i]) << (2*offset)); |
|
133 |
+ } |
|
134 |
+} |
|
135 |
+ |
|
136 |
+ |
|
137 |
+kmer twobitseq::get_kmer(int k, pos_t pos) |
|
138 |
+{ |
|
139 |
+ size_t block, offset; |
|
140 |
+ size_t i; |
|
141 |
+ kmer K = 0; |
|
142 |
+ |
|
143 |
+ for (i = 0; i < (size_t) k; ++i) { |
|
144 |
+ block = (i + (size_t) (pos - (k - 1))) / max_kmer; |
|
145 |
+ offset = (i + (size_t) (pos - (k - 1))) % max_kmer; |
|
146 |
+ K = (K << 2) | ((xs[block] >> (2 * offset)) & 0x3); |
|
147 |
+ } |
|
148 |
+ |
|
149 |
+ return K; |
|
150 |
+} |
|
151 |
+ |
|
152 |
+ |
|
102 | 153 |
int twobitseq::make_kmer(kmer& K, size_t pos, bool* mask, size_t mask_len) const |
103 | 154 |
{ |
104 | 155 |
int k = 0; |
... | ... |
@@ -21,8 +21,8 @@ |
21 | 21 |
class twobitseq |
22 | 22 |
{ |
23 | 23 |
public: |
24 |
- /** create an empty sequence (uninitialized, filled with noise) */ |
|
25 |
- twobitseq(size_t n); |
|
24 |
+ /** create an empty sequence. */ |
|
25 |
+ twobitseq(); |
|
26 | 26 |
|
27 | 27 |
/** create a sequence from a DNA/RNA sequence string */ |
28 | 28 |
twobitseq(const char* seq); |
... | ... |
@@ -33,6 +33,11 @@ class twobitseq |
33 | 33 |
~twobitseq(); |
34 | 34 |
|
35 | 35 |
void operator = (const twobitseq&); |
36 |
+ void operator = (const char* seq); |
|
37 |
+ |
|
38 |
+ /* get the kmer ending at position i */ |
|
39 |
+ kmer get_kmer(int k, pos_t i); |
|
40 |
+ |
|
36 | 41 |
|
37 | 42 |
/** extract a kmer. |
38 | 43 |
* |
... | ... |
@@ -51,12 +56,13 @@ class twobitseq |
51 | 56 |
static const size_t max_kmer; |
52 | 57 |
}; |
53 | 58 |
|
59 |
+ |
|
54 | 60 |
/** Convert a nucleotide charactor to a number, using the same scheme as |
55 | 61 |
* twobitseq */ |
56 | 62 |
kmer nuc_to_num(char c); |
57 | 63 |
|
58 | 64 |
/** Convert a number n encoding a kmer into a string of nucleotides */ |
59 |
-void num_to_nuc(char* dest, int n, int k); |
|
65 |
+void num_to_nuc(char* dest, kmer K, int k); |
|
60 | 66 |
|
61 | 67 |
#endif |
62 | 68 |
|