git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SICtools@109238 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1521 @@ |
1 |
+/* |
|
2 |
+ Author: petr.danecek@sanger |
|
3 |
+ gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread |
|
4 |
+ |
|
5 |
+ Assumptions, approximations and other issues: |
|
6 |
+ - GC-depth graph does not split reads, the starting position determines which bin is incremented. |
|
7 |
+ There are small overlaps between bins (max readlen-1). However, the bins are big (20k). |
|
8 |
+ - coverage distribution ignores softclips and deletions |
|
9 |
+ - some stats require sorted BAMs |
|
10 |
+ - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths. |
|
11 |
+ - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq) |
|
12 |
+ - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar) |
|
13 |
+ counts, no splicing is done, no indels or soft clips are considered, even small overlap is |
|
14 |
+ good enough to include the read in the stats. |
|
15 |
+ |
|
16 |
+*/ |
|
17 |
+ |
|
18 |
+#define BAMCHECK_VERSION "2012-09-04" |
|
19 |
+ |
|
20 |
+#define _ISOC99_SOURCE |
|
21 |
+#include <stdio.h> |
|
22 |
+#include <stdlib.h> |
|
23 |
+#include <stdarg.h> |
|
24 |
+#include <string.h> |
|
25 |
+#include <math.h> |
|
26 |
+#include <ctype.h> |
|
27 |
+#include <getopt.h> |
|
28 |
+#include <errno.h> |
|
29 |
+#include <assert.h> |
|
30 |
+#include "faidx.h" |
|
31 |
+#include "khash.h" |
|
32 |
+#include "sam.h" |
|
33 |
+#include "sam_header.h" |
|
34 |
+#include "razf.h" |
|
35 |
+ |
|
36 |
+#define BWA_MIN_RDLEN 35 |
|
37 |
+#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) |
|
38 |
+#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) |
|
39 |
+#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) |
|
40 |
+#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) |
|
41 |
+#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) |
|
42 |
+#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) |
|
43 |
+#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) |
|
44 |
+ |
|
45 |
+typedef struct |
|
46 |
+{ |
|
47 |
+ int32_t line_len, line_blen; |
|
48 |
+ int64_t len; |
|
49 |
+ uint64_t offset; |
|
50 |
+} |
|
51 |
+faidx1_t; |
|
52 |
+KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) |
|
53 |
+KHASH_MAP_INIT_STR(kh_bam_tid, int) |
|
54 |
+KHASH_MAP_INIT_STR(kh_rg, const char *) |
|
55 |
+struct __faidx_t { |
|
56 |
+ RAZF *rz; |
|
57 |
+ int n, m; |
|
58 |
+ char **name; |
|
59 |
+ khash_t(kh_faidx) *hash; |
|
60 |
+}; |
|
61 |
+ |
|
62 |
+typedef struct |
|
63 |
+{ |
|
64 |
+ float gc; |
|
65 |
+ uint32_t depth; |
|
66 |
+} |
|
67 |
+gc_depth_t; |
|
68 |
+ |
|
69 |
+// For coverage distribution, a simple pileup |
|
70 |
+typedef struct |
|
71 |
+{ |
|
72 |
+ int64_t pos; |
|
73 |
+ int size, start; |
|
74 |
+ int *buffer; |
|
75 |
+} |
|
76 |
+round_buffer_t; |
|
77 |
+ |
|
78 |
+typedef struct { uint32_t from, to; } pos_t; |
|
79 |
+typedef struct |
|
80 |
+{ |
|
81 |
+ int npos,mpos,cpos; |
|
82 |
+ pos_t *pos; |
|
83 |
+} |
|
84 |
+regions_t; |
|
85 |
+ |
|
86 |
+typedef struct |
|
87 |
+{ |
|
88 |
+ // Parameters |
|
89 |
+ int trim_qual; // bwa trim quality |
|
90 |
+ |
|
91 |
+ // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd), |
|
92 |
+ // insert size histogram holder |
|
93 |
+ int nquals; // The number of quality bins |
|
94 |
+ int nbases; // The maximum sequence length the allocated array can hold |
|
95 |
+ int nisize; // The maximum insert size that the allocated array can hold |
|
96 |
+ int ngc; // The size of gc_1st and gc_2nd |
|
97 |
+ int nindels; // The maximum indel length for indel distribution |
|
98 |
+ |
|
99 |
+ // Arrays for the histogram data |
|
100 |
+ uint64_t *quals_1st, *quals_2nd; |
|
101 |
+ uint64_t *gc_1st, *gc_2nd; |
|
102 |
+ uint64_t *isize_inward, *isize_outward, *isize_other; |
|
103 |
+ uint64_t *acgt_cycles; |
|
104 |
+ uint64_t *read_lengths; |
|
105 |
+ uint64_t *insertions, *deletions; |
|
106 |
+ uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; |
|
107 |
+ |
|
108 |
+ // The extremes encountered |
|
109 |
+ int max_len; // Maximum read length |
|
110 |
+ int max_qual; // Maximum quality |
|
111 |
+ float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part |
|
112 |
+ int is_sorted; |
|
113 |
+ |
|
114 |
+ // Summary numbers |
|
115 |
+ uint64_t total_len; |
|
116 |
+ uint64_t total_len_dup; |
|
117 |
+ uint64_t nreads_1st; |
|
118 |
+ uint64_t nreads_2nd; |
|
119 |
+ uint64_t nreads_filtered; |
|
120 |
+ uint64_t nreads_dup; |
|
121 |
+ uint64_t nreads_unmapped; |
|
122 |
+ uint64_t nreads_unpaired; |
|
123 |
+ uint64_t nreads_paired; |
|
124 |
+ uint64_t nreads_anomalous; |
|
125 |
+ uint64_t nreads_mq0; |
|
126 |
+ uint64_t nbases_mapped; |
|
127 |
+ uint64_t nbases_mapped_cigar; |
|
128 |
+ uint64_t nbases_trimmed; // bwa trimmed bases |
|
129 |
+ uint64_t nmismatches; |
|
130 |
+ uint64_t nreads_QCfailed, nreads_secondary; |
|
131 |
+ |
|
132 |
+ // GC-depth related data |
|
133 |
+ uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin |
|
134 |
+ gc_depth_t *gcd; // The GC-depth bins holder |
|
135 |
+ int gcd_bin_size; // The size of GC-depth bin |
|
136 |
+ uint32_t gcd_ref_size; // The approximate size of the genome |
|
137 |
+ int32_t tid, gcd_pos; // Position of the current bin |
|
138 |
+ int32_t pos; // Position of the last read |
|
139 |
+ |
|
140 |
+ // Coverage distribution related data |
|
141 |
+ int ncov; // The number of coverage bins |
|
142 |
+ uint64_t *cov; // The coverage frequencies |
|
143 |
+ int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins |
|
144 |
+ round_buffer_t cov_rbuf; // Pileup round buffer |
|
145 |
+ |
|
146 |
+ // Mismatches by read cycle |
|
147 |
+ uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against |
|
148 |
+ int mrseq_buf; // The size of the buffer |
|
149 |
+ int32_t rseq_pos; // The coordinate of the first base in the buffer |
|
150 |
+ int32_t nrseq_buf; // The used part of the buffer |
|
151 |
+ uint64_t *mpc_buf; // Mismatches per cycle |
|
152 |
+ |
|
153 |
+ // Filters |
|
154 |
+ int filter_readlen; |
|
155 |
+ |
|
156 |
+ // Target regions |
|
157 |
+ int nregions, reg_from,reg_to; |
|
158 |
+ regions_t *regions; |
|
159 |
+ |
|
160 |
+ // Auxiliary data |
|
161 |
+ int flag_require, flag_filter; |
|
162 |
+ double sum_qual; // For calculating average quality value |
|
163 |
+ samfile_t *sam; |
|
164 |
+ khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated |
|
165 |
+ faidx_t *fai; // Reference sequence for GC-depth graph |
|
166 |
+ int argc; // Command line arguments to be printed on the output |
|
167 |
+ char **argv; |
|
168 |
+} |
|
169 |
+stats_t; |
|
170 |
+ |
|
171 |
+void error(const char *format, ...); |
|
172 |
+void bam_init_header_hash(bam_header_t *header); |
|
173 |
+int is_in_regions(bam1_t *bam_line, stats_t *stats); |
|
174 |
+ |
|
175 |
+ |
|
176 |
+// Coverage distribution methods |
|
177 |
+inline int coverage_idx(int min, int max, int n, int step, int depth) |
|
178 |
+{ |
|
179 |
+ if ( depth < min ) |
|
180 |
+ return 0; |
|
181 |
+ |
|
182 |
+ if ( depth > max ) |
|
183 |
+ return n-1; |
|
184 |
+ |
|
185 |
+ return 1 + (depth - min) / step; |
|
186 |
+} |
|
187 |
+ |
|
188 |
+inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) |
|
189 |
+{ |
|
190 |
+ return (offset + (pos-refpos) % size) % size; |
|
191 |
+} |
|
192 |
+ |
|
193 |
+void round_buffer_flush(stats_t *stats, int64_t pos) |
|
194 |
+{ |
|
195 |
+ int ibuf,idp; |
|
196 |
+ |
|
197 |
+ if ( pos==stats->cov_rbuf.pos ) |
|
198 |
+ return; |
|
199 |
+ |
|
200 |
+ int64_t new_pos = pos; |
|
201 |
+ if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) |
|
202 |
+ { |
|
203 |
+ // Flush the whole buffer, but in sequential order, |
|
204 |
+ pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1; |
|
205 |
+ } |
|
206 |
+ |
|
207 |
+ if ( pos < stats->cov_rbuf.pos ) |
|
208 |
+ error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); |
|
209 |
+ |
|
210 |
+ int ifrom = stats->cov_rbuf.start; |
|
211 |
+ int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); |
|
212 |
+ if ( ifrom>ito ) |
|
213 |
+ { |
|
214 |
+ for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++) |
|
215 |
+ { |
|
216 |
+ if ( !stats->cov_rbuf.buffer[ibuf] ) |
|
217 |
+ continue; |
|
218 |
+ idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); |
|
219 |
+ stats->cov[idp]++; |
|
220 |
+ stats->cov_rbuf.buffer[ibuf] = 0; |
|
221 |
+ } |
|
222 |
+ ifrom = 0; |
|
223 |
+ } |
|
224 |
+ for (ibuf=ifrom; ibuf<=ito; ibuf++) |
|
225 |
+ { |
|
226 |
+ if ( !stats->cov_rbuf.buffer[ibuf] ) |
|
227 |
+ continue; |
|
228 |
+ idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); |
|
229 |
+ stats->cov[idp]++; |
|
230 |
+ stats->cov_rbuf.buffer[ibuf] = 0; |
|
231 |
+ } |
|
232 |
+ stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); |
|
233 |
+ stats->cov_rbuf.pos = new_pos; |
|
234 |
+} |
|
235 |
+ |
|
236 |
+void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) |
|
237 |
+{ |
|
238 |
+ if ( to-from >= rbuf->size ) |
|
239 |
+ error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); |
|
240 |
+ if ( from < rbuf->pos ) |
|
241 |
+ error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); |
|
242 |
+ |
|
243 |
+ int ifrom,ito,ibuf; |
|
244 |
+ ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); |
|
245 |
+ ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); |
|
246 |
+ if ( ifrom>ito ) |
|
247 |
+ { |
|
248 |
+ for (ibuf=ifrom; ibuf<rbuf->size; ibuf++) |
|
249 |
+ rbuf->buffer[ibuf]++; |
|
250 |
+ ifrom = 0; |
|
251 |
+ } |
|
252 |
+ for (ibuf=ifrom; ibuf<=ito; ibuf++) |
|
253 |
+ rbuf->buffer[ibuf]++; |
|
254 |
+} |
|
255 |
+ |
|
256 |
+// Calculate the number of bases in the read trimmed by BWA |
|
257 |
+int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) |
|
258 |
+{ |
|
259 |
+ if ( len<BWA_MIN_RDLEN ) return 0; |
|
260 |
+ |
|
261 |
+ // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN, |
|
262 |
+ // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c). |
|
263 |
+ int max_trimmed = len - BWA_MIN_RDLEN + 1; |
|
264 |
+ int l, sum=0, max_sum=0, max_l=0; |
|
265 |
+ |
|
266 |
+ for (l=0; l<max_trimmed; l++) |
|
267 |
+ { |
|
268 |
+ sum += trim_qual - quals[ reverse ? l : len-1-l ]; |
|
269 |
+ if ( sum<0 ) break; |
|
270 |
+ if ( sum>max_sum ) |
|
271 |
+ { |
|
272 |
+ max_sum = sum; |
|
273 |
+ // This is the correct way, but bwa clips from some reason one base less |
|
274 |
+ // max_l = l+1; |
|
275 |
+ max_l = l; |
|
276 |
+ } |
|
277 |
+ } |
|
278 |
+ return max_l; |
|
279 |
+} |
|
280 |
+ |
|
281 |
+ |
|
282 |
+void count_indels(stats_t *stats,bam1_t *bam_line) |
|
283 |
+{ |
|
284 |
+ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; |
|
285 |
+ int is_1st = IS_READ1(bam_line) ? 1 : 0; |
|
286 |
+ int icig; |
|
287 |
+ int icycle = 0; |
|
288 |
+ int read_len = bam_line->core.l_qseq; |
|
289 |
+ for (icig=0; icig<bam_line->core.n_cigar; icig++) |
|
290 |
+ { |
|
291 |
+ // Conversion from uint32_t to MIDNSHP |
|
292 |
+ // 0123456 |
|
293 |
+ // MIDNSHP |
|
294 |
+ int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; |
|
295 |
+ int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; |
|
296 |
+ |
|
297 |
+ if ( cig==1 ) |
|
298 |
+ { |
|
299 |
+ int idx = is_fwd ? icycle : read_len-icycle-ncig; |
|
300 |
+ if ( idx<0 ) |
|
301 |
+ error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); |
|
302 |
+ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); |
|
303 |
+ if ( is_1st ) |
|
304 |
+ stats->ins_cycles_1st[idx]++; |
|
305 |
+ else |
|
306 |
+ stats->ins_cycles_2nd[idx]++; |
|
307 |
+ icycle += ncig; |
|
308 |
+ if ( ncig<=stats->nindels ) |
|
309 |
+ stats->insertions[ncig-1]++; |
|
310 |
+ continue; |
|
311 |
+ } |
|
312 |
+ if ( cig==2 ) |
|
313 |
+ { |
|
314 |
+ int idx = is_fwd ? icycle-1 : read_len-icycle-1; |
|
315 |
+ if ( idx<0 ) continue; // discard meaningless deletions |
|
316 |
+ if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); |
|
317 |
+ if ( is_1st ) |
|
318 |
+ stats->del_cycles_1st[idx]++; |
|
319 |
+ else |
|
320 |
+ stats->del_cycles_2nd[idx]++; |
|
321 |
+ if ( ncig<=stats->nindels ) |
|
322 |
+ stats->deletions[ncig-1]++; |
|
323 |
+ continue; |
|
324 |
+ } |
|
325 |
+ if ( cig!=3 && cig!=5 ) |
|
326 |
+ icycle += ncig; |
|
327 |
+ } |
|
328 |
+} |
|
329 |
+ |
|
330 |
+void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) |
|
331 |
+{ |
|
332 |
+ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; |
|
333 |
+ int icig,iread=0,icycle=0; |
|
334 |
+ int iref = bam_line->core.pos - stats->rseq_pos; |
|
335 |
+ int read_len = bam_line->core.l_qseq; |
|
336 |
+ uint8_t *read = bam1_seq(bam_line); |
|
337 |
+ uint8_t *quals = bam1_qual(bam_line); |
|
338 |
+ uint64_t *mpc_buf = stats->mpc_buf; |
|
339 |
+ for (icig=0; icig<bam_line->core.n_cigar; icig++) |
|
340 |
+ { |
|
341 |
+ // Conversion from uint32_t to MIDNSHP |
|
342 |
+ // 0123456 |
|
343 |
+ // MIDNSHP |
|
344 |
+ int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; |
|
345 |
+ int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; |
|
346 |
+ if ( cig==1 ) |
|
347 |
+ { |
|
348 |
+ iread += ncig; |
|
349 |
+ icycle += ncig; |
|
350 |
+ continue; |
|
351 |
+ } |
|
352 |
+ if ( cig==2 ) |
|
353 |
+ { |
|
354 |
+ iref += ncig; |
|
355 |
+ continue; |
|
356 |
+ } |
|
357 |
+ if ( cig==4 ) |
|
358 |
+ { |
|
359 |
+ icycle += ncig; |
|
360 |
+ // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence |
|
361 |
+ // iref += ncig; |
|
362 |
+ iread += ncig; |
|
363 |
+ continue; |
|
364 |
+ } |
|
365 |
+ if ( cig==5 ) |
|
366 |
+ { |
|
367 |
+ icycle += ncig; |
|
368 |
+ continue; |
|
369 |
+ } |
|
370 |
+ // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large |
|
371 |
+ // chunk of refseq in memory. Not very frequent and not noticable in the stats. |
|
372 |
+ if ( cig==3 || cig==5 ) continue; |
|
373 |
+ if ( cig!=0 ) |
|
374 |
+ error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); |
|
375 |
+ |
|
376 |
+ if ( ncig+iref > stats->nrseq_buf ) |
|
377 |
+ error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1); |
|
378 |
+ |
|
379 |
+ int im; |
|
380 |
+ for (im=0; im<ncig; im++) |
|
381 |
+ { |
|
382 |
+ uint8_t cread = bam1_seqi(read,iread); |
|
383 |
+ uint8_t cref = stats->rseq_buf[iref]; |
|
384 |
+ |
|
385 |
+ // ---------------15 |
|
386 |
+ // =ACMGRSVTWYHKDBN |
|
387 |
+ if ( cread==15 ) |
|
388 |
+ { |
|
389 |
+ int idx = is_fwd ? icycle : read_len-icycle-1; |
|
390 |
+ if ( idx>stats->max_len ) |
|
391 |
+ error("mpc: %d>%d\n",idx,stats->max_len); |
|
392 |
+ idx = idx*stats->nquals; |
|
393 |
+ if ( idx>=stats->nquals*stats->nbases ) |
|
394 |
+ error("FIXME: mpc_buf overflow\n"); |
|
395 |
+ mpc_buf[idx]++; |
|
396 |
+ } |
|
397 |
+ else if ( cref && cread && cref!=cread ) |
|
398 |
+ { |
|
399 |
+ uint8_t qual = quals[iread] + 1; |
|
400 |
+ if ( qual>=stats->nquals ) |
|
401 |
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); |
|
402 |
+ |
|
403 |
+ int idx = is_fwd ? icycle : read_len-icycle-1; |
|
404 |
+ if ( idx>stats->max_len ) |
|
405 |
+ error("mpc: %d>%d\n",idx,stats->max_len); |
|
406 |
+ |
|
407 |
+ idx = idx*stats->nquals + qual; |
|
408 |
+ if ( idx>=stats->nquals*stats->nbases ) |
|
409 |
+ error("FIXME: mpc_buf overflow\n"); |
|
410 |
+ mpc_buf[idx]++; |
|
411 |
+ } |
|
412 |
+ |
|
413 |
+ iref++; |
|
414 |
+ iread++; |
|
415 |
+ icycle++; |
|
416 |
+ } |
|
417 |
+ } |
|
418 |
+} |
|
419 |
+ |
|
420 |
+void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) |
|
421 |
+{ |
|
422 |
+ khash_t(kh_faidx) *h; |
|
423 |
+ khiter_t iter; |
|
424 |
+ faidx1_t val; |
|
425 |
+ char *chr, c; |
|
426 |
+ faidx_t *fai = stats->fai; |
|
427 |
+ |
|
428 |
+ h = fai->hash; |
|
429 |
+ chr = stats->sam->header->target_name[tid]; |
|
430 |
+ |
|
431 |
+ // ID of the sequence name |
|
432 |
+ iter = kh_get(kh_faidx, h, chr); |
|
433 |
+ if (iter == kh_end(h)) |
|
434 |
+ error("No such reference sequence [%s]?\n", chr); |
|
435 |
+ val = kh_value(h, iter); |
|
436 |
+ |
|
437 |
+ // Check the boundaries |
|
438 |
+ if (pos >= val.len) |
|
439 |
+ error("Was the bam file mapped with the reference sequence supplied?" |
|
440 |
+ " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len); |
|
441 |
+ int size = stats->mrseq_buf; |
|
442 |
+ // The buffer extends beyond the chromosome end. Later the rest will be filled with N's. |
|
443 |
+ if (size+pos > val.len) size = val.len-pos; |
|
444 |
+ |
|
445 |
+ // Position the razf reader |
|
446 |
+ razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET); |
|
447 |
+ |
|
448 |
+ uint8_t *ptr = stats->rseq_buf; |
|
449 |
+ int nread = 0; |
|
450 |
+ while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err ) |
|
451 |
+ { |
|
452 |
+ if ( !isgraph(c) ) |
|
453 |
+ continue; |
|
454 |
+ |
|
455 |
+ // Conversion between uint8_t coding and ACGT |
|
456 |
+ // -12-4---8------- |
|
457 |
+ // =ACMGRSVTWYHKDBN |
|
458 |
+ if ( c=='A' || c=='a' ) |
|
459 |
+ *ptr = 1; |
|
460 |
+ else if ( c=='C' || c=='c' ) |
|
461 |
+ *ptr = 2; |
|
462 |
+ else if ( c=='G' || c=='g' ) |
|
463 |
+ *ptr = 4; |
|
464 |
+ else if ( c=='T' || c=='t' ) |
|
465 |
+ *ptr = 8; |
|
466 |
+ else |
|
467 |
+ *ptr = 0; |
|
468 |
+ ptr++; |
|
469 |
+ nread++; |
|
470 |
+ } |
|
471 |
+ if ( nread < stats->mrseq_buf ) |
|
472 |
+ { |
|
473 |
+ memset(ptr,0, stats->mrseq_buf - nread); |
|
474 |
+ nread = stats->mrseq_buf; |
|
475 |
+ } |
|
476 |
+ stats->nrseq_buf = nread; |
|
477 |
+ stats->rseq_pos = pos; |
|
478 |
+ stats->tid = tid; |
|
479 |
+} |
|
480 |
+ |
|
481 |
+float fai_gc_content(stats_t *stats, int pos, int len) |
|
482 |
+{ |
|
483 |
+ uint32_t gc,count,c; |
|
484 |
+ int i = pos - stats->rseq_pos, ito = i + len; |
|
485 |
+ assert( i>=0 && ito<=stats->nrseq_buf ); |
|
486 |
+ |
|
487 |
+ // Count GC content |
|
488 |
+ gc = count = 0; |
|
489 |
+ for (; i<ito; i++) |
|
490 |
+ { |
|
491 |
+ c = stats->rseq_buf[i]; |
|
492 |
+ if ( c==2 || c==4 ) |
|
493 |
+ { |
|
494 |
+ gc++; |
|
495 |
+ count++; |
|
496 |
+ } |
|
497 |
+ else if ( c==1 || c==8 ) |
|
498 |
+ count++; |
|
499 |
+ } |
|
500 |
+ return count ? (float)gc/count : 0; |
|
501 |
+} |
|
502 |
+ |
|
503 |
+void realloc_rseq_buffer(stats_t *stats) |
|
504 |
+{ |
|
505 |
+ int n = stats->nbases*10; |
|
506 |
+ if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size; |
|
507 |
+ if ( stats->mrseq_buf<n ) |
|
508 |
+ { |
|
509 |
+ stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); |
|
510 |
+ stats->mrseq_buf = n; |
|
511 |
+ } |
|
512 |
+} |
|
513 |
+ |
|
514 |
+void realloc_gcd_buffer(stats_t *stats, int seq_len) |
|
515 |
+{ |
|
516 |
+ if ( seq_len >= stats->gcd_bin_size ) |
|
517 |
+ error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len); |
|
518 |
+ |
|
519 |
+ int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len); |
|
520 |
+ if ( n <= stats->igcd ) |
|
521 |
+ error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n"); |
|
522 |
+ |
|
523 |
+ if ( n > stats->ngcd ) |
|
524 |
+ { |
|
525 |
+ stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t)); |
|
526 |
+ if ( !stats->gcd ) |
|
527 |
+ error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n); |
|
528 |
+ memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t)); |
|
529 |
+ stats->ngcd = n; |
|
530 |
+ } |
|
531 |
+ |
|
532 |
+ realloc_rseq_buffer(stats); |
|
533 |
+} |
|
534 |
+ |
|
535 |
+void realloc_buffers(stats_t *stats, int seq_len) |
|
536 |
+{ |
|
537 |
+ int n = 2*(1 + seq_len - stats->nbases) + stats->nbases; |
|
538 |
+ |
|
539 |
+ stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t)); |
|
540 |
+ if ( !stats->quals_1st ) |
|
541 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); |
|
542 |
+ memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); |
|
543 |
+ |
|
544 |
+ stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t)); |
|
545 |
+ if ( !stats->quals_2nd ) |
|
546 |
+ error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); |
|
547 |
+ memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); |
|
548 |
+ |
|
549 |
+ if ( stats->mpc_buf ) |
|
550 |
+ { |
|
551 |
+ stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t)); |
|
552 |
+ if ( !stats->mpc_buf ) |
|
553 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); |
|
554 |
+ memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); |
|
555 |
+ } |
|
556 |
+ |
|
557 |
+ stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t)); |
|
558 |
+ if ( !stats->acgt_cycles ) |
|
559 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t)); |
|
560 |
+ memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t)); |
|
561 |
+ |
|
562 |
+ stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); |
|
563 |
+ if ( !stats->read_lengths ) |
|
564 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); |
|
565 |
+ memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
566 |
+ |
|
567 |
+ stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t)); |
|
568 |
+ if ( !stats->insertions ) |
|
569 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); |
|
570 |
+ memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
571 |
+ |
|
572 |
+ stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t)); |
|
573 |
+ if ( !stats->deletions ) |
|
574 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); |
|
575 |
+ memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
576 |
+ |
|
577 |
+ stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t)); |
|
578 |
+ if ( !stats->ins_cycles_1st ) |
|
579 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); |
|
580 |
+ memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
581 |
+ |
|
582 |
+ stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t)); |
|
583 |
+ if ( !stats->ins_cycles_2nd ) |
|
584 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); |
|
585 |
+ memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
586 |
+ |
|
587 |
+ stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t)); |
|
588 |
+ if ( !stats->del_cycles_1st ) |
|
589 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); |
|
590 |
+ memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
591 |
+ |
|
592 |
+ stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t)); |
|
593 |
+ if ( !stats->del_cycles_2nd ) |
|
594 |
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); |
|
595 |
+ memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); |
|
596 |
+ |
|
597 |
+ stats->nbases = n; |
|
598 |
+ |
|
599 |
+ // Realloc the coverage distribution buffer |
|
600 |
+ int *rbuffer = calloc(sizeof(int),seq_len*5); |
|
601 |
+ n = stats->cov_rbuf.size-stats->cov_rbuf.start; |
|
602 |
+ memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); |
|
603 |
+ if ( stats->cov_rbuf.start>1 ) |
|
604 |
+ memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start); |
|
605 |
+ stats->cov_rbuf.start = 0; |
|
606 |
+ free(stats->cov_rbuf.buffer); |
|
607 |
+ stats->cov_rbuf.buffer = rbuffer; |
|
608 |
+ stats->cov_rbuf.size = seq_len*5; |
|
609 |
+ |
|
610 |
+ realloc_rseq_buffer(stats); |
|
611 |
+} |
|
612 |
+ |
|
613 |
+void collect_stats(bam1_t *bam_line, stats_t *stats) |
|
614 |
+{ |
|
615 |
+ if ( stats->rg_hash ) |
|
616 |
+ { |
|
617 |
+ const uint8_t *rg = bam_aux_get(bam_line, "RG"); |
|
618 |
+ if ( !rg ) return; |
|
619 |
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); |
|
620 |
+ if ( k == kh_end(stats->rg_hash) ) return; |
|
621 |
+ } |
|
622 |
+ if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) |
|
623 |
+ { |
|
624 |
+ stats->nreads_filtered++; |
|
625 |
+ return; |
|
626 |
+ } |
|
627 |
+ if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) |
|
628 |
+ { |
|
629 |
+ stats->nreads_filtered++; |
|
630 |
+ return; |
|
631 |
+ } |
|
632 |
+ if ( !is_in_regions(bam_line,stats) ) |
|
633 |
+ return; |
|
634 |
+ if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) |
|
635 |
+ return; |
|
636 |
+ |
|
637 |
+ if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; |
|
638 |
+ if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++; |
|
639 |
+ |
|
640 |
+ int seq_len = bam_line->core.l_qseq; |
|
641 |
+ if ( !seq_len ) return; |
|
642 |
+ |
|
643 |
+ if ( seq_len >= stats->nbases ) |
|
644 |
+ realloc_buffers(stats,seq_len); |
|
645 |
+ if ( stats->max_len<seq_len ) |
|
646 |
+ stats->max_len = seq_len; |
|
647 |
+ |
|
648 |
+ stats->read_lengths[seq_len]++; |
|
649 |
+ |
|
650 |
+ // Count GC and ACGT per cycle |
|
651 |
+ uint8_t base, *seq = bam1_seq(bam_line); |
|
652 |
+ int gc_count = 0; |
|
653 |
+ int i; |
|
654 |
+ int reverse = IS_REVERSE(bam_line); |
|
655 |
+ for (i=0; i<seq_len; i++) |
|
656 |
+ { |
|
657 |
+ // Conversion from uint8_t coding to ACGT |
|
658 |
+ // -12-4---8------- |
|
659 |
+ // =ACMGRSVTWYHKDBN |
|
660 |
+ // 01 2 3 |
|
661 |
+ base = bam1_seqi(seq,i); |
|
662 |
+ base /= 2; |
|
663 |
+ if ( base==1 || base==2 ) gc_count++; |
|
664 |
+ else if ( base>2 ) base=3; |
|
665 |
+ if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) |
|
666 |
+ error("FIXME: acgt_cycles\n"); |
|
667 |
+ stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++; |
|
668 |
+ } |
|
669 |
+ int gc_idx_min = gc_count*(stats->ngc-1)/seq_len; |
|
670 |
+ int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len; |
|
671 |
+ if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1; |
|
672 |
+ |
|
673 |
+ // Determine which array (1st or 2nd read) will these stats go to, |
|
674 |
+ // trim low quality bases from end the same way BWA does, |
|
675 |
+ // fill GC histogram |
|
676 |
+ uint64_t *quals; |
|
677 |
+ uint8_t *bam_quals = bam1_qual(bam_line); |
|
678 |
+ if ( bam_line->core.flag&BAM_FREAD2 ) |
|
679 |
+ { |
|
680 |
+ quals = stats->quals_2nd; |
|
681 |
+ stats->nreads_2nd++; |
|
682 |
+ for (i=gc_idx_min; i<gc_idx_max; i++) |
|
683 |
+ stats->gc_2nd[i]++; |
|
684 |
+ } |
|
685 |
+ else |
|
686 |
+ { |
|
687 |
+ quals = stats->quals_1st; |
|
688 |
+ stats->nreads_1st++; |
|
689 |
+ for (i=gc_idx_min; i<gc_idx_max; i++) |
|
690 |
+ stats->gc_1st[i]++; |
|
691 |
+ } |
|
692 |
+ if ( stats->trim_qual>0 ) |
|
693 |
+ stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse); |
|
694 |
+ |
|
695 |
+ // Quality histogram and average quality |
|
696 |
+ for (i=0; i<seq_len; i++) |
|
697 |
+ { |
|
698 |
+ uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i]; |
|
699 |
+ if ( qual>=stats->nquals ) |
|
700 |
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); |
|
701 |
+ if ( qual>stats->max_qual ) |
|
702 |
+ stats->max_qual = qual; |
|
703 |
+ |
|
704 |
+ quals[ i*stats->nquals+qual ]++; |
|
705 |
+ stats->sum_qual += qual; |
|
706 |
+ } |
|
707 |
+ |
|
708 |
+ // Look at the flags and increment appropriate counters (mapped, paired, etc) |
|
709 |
+ if ( IS_UNMAPPED(bam_line) ) |
|
710 |
+ stats->nreads_unmapped++; |
|
711 |
+ else |
|
712 |
+ { |
|
713 |
+ if ( !bam_line->core.qual ) |
|
714 |
+ stats->nreads_mq0++; |
|
715 |
+ |
|
716 |
+ count_indels(stats,bam_line); |
|
717 |
+ |
|
718 |
+ if ( !IS_PAIRED(bam_line) ) |
|
719 |
+ stats->nreads_unpaired++; |
|
720 |
+ else |
|
721 |
+ { |
|
722 |
+ stats->nreads_paired++; |
|
723 |
+ |
|
724 |
+ if ( bam_line->core.tid!=bam_line->core.mtid ) |
|
725 |
+ stats->nreads_anomalous++; |
|
726 |
+ |
|
727 |
+ // The insert size is tricky, because for long inserts the libraries are |
|
728 |
+ // prepared differently and the pairs point in other direction. BWA does |
|
729 |
+ // not set the paired flag for them. Similar thing is true also for 454 |
|
730 |
+ // reads. Mates mapped to different chromosomes have isize==0. |
|
731 |
+ int32_t isize = bam_line->core.isize; |
|
732 |
+ if ( isize<0 ) isize = -isize; |
|
733 |
+ if ( isize >= stats->nisize ) |
|
734 |
+ isize = stats->nisize-1; |
|
735 |
+ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) |
|
736 |
+ { |
|
737 |
+ int pos_fst = bam_line->core.mpos - bam_line->core.pos; |
|
738 |
+ int is_fst = IS_READ1(bam_line) ? 1 : -1; |
|
739 |
+ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; |
|
740 |
+ int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1; |
|
741 |
+ |
|
742 |
+ if ( is_fwd*is_mfwd>0 ) |
|
743 |
+ stats->isize_other[isize]++; |
|
744 |
+ else if ( is_fst*pos_fst>0 ) |
|
745 |
+ { |
|
746 |
+ if ( is_fst*is_fwd>0 ) |
|
747 |
+ stats->isize_inward[isize]++; |
|
748 |
+ else |
|
749 |
+ stats->isize_outward[isize]++; |
|
750 |
+ } |
|
751 |
+ else if ( is_fst*pos_fst<0 ) |
|
752 |
+ { |
|
753 |
+ if ( is_fst*is_fwd>0 ) |
|
754 |
+ stats->isize_outward[isize]++; |
|
755 |
+ else |
|
756 |
+ stats->isize_inward[isize]++; |
|
757 |
+ } |
|
758 |
+ } |
|
759 |
+ } |
|
760 |
+ |
|
761 |
+ // Number of mismatches |
|
762 |
+ uint8_t *nm = bam_aux_get(bam_line,"NM"); |
|
763 |
+ if (nm) |
|
764 |
+ stats->nmismatches += bam_aux2i(nm); |
|
765 |
+ |
|
766 |
+ // Number of mapped bases from cigar |
|
767 |
+ // Conversion from uint32_t to MIDNSHP |
|
768 |
+ // 012-4-- |
|
769 |
+ // MIDNSHP |
|
770 |
+ if ( bam_line->core.n_cigar == 0) |
|
771 |
+ error("FIXME: mapped read with no cigar?\n"); |
|
772 |
+ int readlen=seq_len; |
|
773 |
+ if ( stats->regions ) |
|
774 |
+ { |
|
775 |
+ // Count only on-target bases |
|
776 |
+ int iref = bam_line->core.pos + 1; |
|
777 |
+ for (i=0; i<bam_line->core.n_cigar; i++) |
|
778 |
+ { |
|
779 |
+ int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK; |
|
780 |
+ int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; |
|
781 |
+ if ( cig==2 ) readlen += ncig; |
|
782 |
+ else if ( cig==0 ) |
|
783 |
+ { |
|
784 |
+ if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref; |
|
785 |
+ else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to; |
|
786 |
+ if ( ncig<0 ) ncig = 0; |
|
787 |
+ stats->nbases_mapped_cigar += ncig; |
|
788 |
+ iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; |
|
789 |
+ } |
|
790 |
+ else if ( cig==1 ) |
|
791 |
+ { |
|
792 |
+ iref += ncig; |
|
793 |
+ if ( iref>=stats->reg_from && iref<=stats->reg_to ) |
|
794 |
+ stats->nbases_mapped_cigar += ncig; |
|
795 |
+ } |
|
796 |
+ } |
|
797 |
+ } |
|
798 |
+ else |
|
799 |
+ { |
|
800 |
+ // Count the whole read |
|
801 |
+ for (i=0; i<bam_line->core.n_cigar; i++) |
|
802 |
+ { |
|
803 |
+ if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) |
|
804 |
+ stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; |
|
805 |
+ if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) |
|
806 |
+ readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; |
|
807 |
+ } |
|
808 |
+ } |
|
809 |
+ stats->nbases_mapped += seq_len; |
|
810 |
+ |
|
811 |
+ if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos ) |
|
812 |
+ stats->is_sorted = 0; |
|
813 |
+ stats->pos = bam_line->core.pos; |
|
814 |
+ |
|
815 |
+ if ( stats->is_sorted ) |
|
816 |
+ { |
|
817 |
+ if ( stats->tid==-1 || stats->tid!=bam_line->core.tid ) |
|
818 |
+ round_buffer_flush(stats,-1); |
|
819 |
+ |
|
820 |
+ // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins |
|
821 |
+ // are not splitted which results in up to seq_len-1 overlaps. The default bin size is |
|
822 |
+ // 20kbp, so the effect is negligible. |
|
823 |
+ if ( stats->fai ) |
|
824 |
+ { |
|
825 |
+ int inc_ref = 0, inc_gcd = 0; |
|
826 |
+ // First pass or new chromosome |
|
827 |
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; } |
|
828 |
+ // Read goes beyond the end of the rseq buffer |
|
829 |
+ else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; } |
|
830 |
+ // Read overlaps the next gcd bin |
|
831 |
+ else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen ) |
|
832 |
+ { |
|
833 |
+ inc_gcd = 1; |
|
834 |
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1; |
|
835 |
+ } |
|
836 |
+ if ( inc_gcd ) |
|
837 |
+ { |
|
838 |
+ stats->igcd++; |
|
839 |
+ if ( stats->igcd >= stats->ngcd ) |
|
840 |
+ realloc_gcd_buffer(stats, readlen); |
|
841 |
+ if ( inc_ref ) |
|
842 |
+ read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); |
|
843 |
+ stats->gcd_pos = bam_line->core.pos; |
|
844 |
+ stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size); |
|
845 |
+ } |
|
846 |
+ |
|
847 |
+ count_mismatches_per_cycle(stats,bam_line); |
|
848 |
+ } |
|
849 |
+ // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin |
|
850 |
+ else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) |
|
851 |
+ { |
|
852 |
+ // First pass or a new chromosome |
|
853 |
+ stats->tid = bam_line->core.tid; |
|
854 |
+ stats->gcd_pos = bam_line->core.pos; |
|
855 |
+ stats->igcd++; |
|
856 |
+ if ( stats->igcd >= stats->ngcd ) |
|
857 |
+ realloc_gcd_buffer(stats, readlen); |
|
858 |
+ } |
|
859 |
+ stats->gcd[ stats->igcd ].depth++; |
|
860 |
+ // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK) |
|
861 |
+ if ( !stats->fai ) |
|
862 |
+ stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len; |
|
863 |
+ |
|
864 |
+ // Coverage distribution graph |
|
865 |
+ round_buffer_flush(stats,bam_line->core.pos); |
|
866 |
+ round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1); |
|
867 |
+ } |
|
868 |
+ } |
|
869 |
+ |
|
870 |
+ stats->total_len += seq_len; |
|
871 |
+ if ( IS_DUP(bam_line) ) |
|
872 |
+ { |
|
873 |
+ stats->total_len_dup += seq_len; |
|
874 |
+ stats->nreads_dup++; |
|
875 |
+ } |
|
876 |
+} |
|
877 |
+ |
|
878 |
+// Sort by GC and depth |
|
879 |
+#define GCD_t(x) ((gc_depth_t *)x) |
|
880 |
+static int gcd_cmp(const void *a, const void *b) |
|
881 |
+{ |
|
882 |
+ if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1; |
|
883 |
+ if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1; |
|
884 |
+ if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1; |
|
885 |
+ if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1; |
|
886 |
+ return 0; |
|
887 |
+} |
|
888 |
+#undef GCD_t |
|
889 |
+ |
|
890 |
+float gcd_percentile(gc_depth_t *gcd, int N, int p) |
|
891 |
+{ |
|
892 |
+ float n,d; |
|
893 |
+ int k; |
|
894 |
+ |
|
895 |
+ n = p*(N+1)/100; |
|
896 |
+ k = n; |
|
897 |
+ if ( k<=0 ) |
|
898 |
+ return gcd[0].depth; |
|
899 |
+ if ( k>=N ) |
|
900 |
+ return gcd[N-1].depth; |
|
901 |
+ |
|
902 |
+ d = n - k; |
|
903 |
+ return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth); |
|
904 |
+} |
|
905 |
+ |
|
906 |
+void output_stats(stats_t *stats) |
|
907 |
+{ |
|
908 |
+ // Calculate average insert size and standard deviation (from the main bulk data only) |
|
909 |
+ int isize, ibulk=0; |
|
910 |
+ uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0; |
|
911 |
+ for (isize=0; isize<stats->nisize; isize++) |
|
912 |
+ { |
|
913 |
+ // Each pair was counted twice |
|
914 |
+ stats->isize_inward[isize] *= 0.5; |
|
915 |
+ stats->isize_outward[isize] *= 0.5; |
|
916 |
+ stats->isize_other[isize] *= 0.5; |
|
917 |
+ |
|
918 |
+ nisize_inward += stats->isize_inward[isize]; |
|
919 |
+ nisize_outward += stats->isize_outward[isize]; |
|
920 |
+ nisize_other += stats->isize_other[isize]; |
|
921 |
+ nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; |
|
922 |
+ } |
|
923 |
+ |
|
924 |
+ double bulk=0, avg_isize=0, sd_isize=0; |
|
925 |
+ for (isize=0; isize<stats->nisize; isize++) |
|
926 |
+ { |
|
927 |
+ bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; |
|
928 |
+ avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]); |
|
929 |
+ |
|
930 |
+ if ( bulk/nisize > stats->isize_main_bulk ) |
|
931 |
+ { |
|
932 |
+ ibulk = isize+1; |
|
933 |
+ nisize = bulk; |
|
934 |
+ break; |
|
935 |
+ } |
|
936 |
+ } |
|
937 |
+ avg_isize /= nisize ? nisize : 1; |
|
938 |
+ for (isize=1; isize<ibulk; isize++) |
|
939 |
+ sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize; |
|
940 |
+ sd_isize = sqrt(sd_isize); |
|
941 |
+ |
|
942 |
+ |
|
943 |
+ printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION); |
|
944 |
+ printf("# The command line was: %s",stats->argv[0]); |
|
945 |
+ int i; |
|
946 |
+ for (i=1; i<stats->argc; i++) |
|
947 |
+ printf(" %s",stats->argv[i]); |
|
948 |
+ printf("\n"); |
|
949 |
+ printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); |
|
950 |
+ printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); |
|
951 |
+ printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); |
|
952 |
+ printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); |
|
953 |
+ printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); |
|
954 |
+ printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); |
|
955 |
+ printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); |
|
956 |
+ printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); |
|
957 |
+ printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired)); |
|
958 |
+ printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped); |
|
959 |
+ printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired); |
|
960 |
+ printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); |
|
961 |
+ printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); |
|
962 |
+ printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); |
|
963 |
+ printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); |
|
964 |
+ printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); |
|
965 |
+ printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); |
|
966 |
+ printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); |
|
967 |
+ printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); |
|
968 |
+ printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed); |
|
969 |
+ printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); |
|
970 |
+ printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches); |
|
971 |
+ printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); |
|
972 |
+ float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; |
|
973 |
+ printf("SN\taverage length:\t%.0f\n", avg_read_length); |
|
974 |
+ printf("SN\tmaximum length:\t%d\n", stats->max_len); |
|
975 |
+ printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); |
|
976 |
+ printf("SN\tinsert size average:\t%.1f\n", avg_isize); |
|
977 |
+ printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); |
|
978 |
+ printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); |
|
979 |
+ printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); |
|
980 |
+ printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); |
|
981 |
+ printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); |
|
982 |
+ |
|
983 |
+ int ibase,iqual; |
|
984 |
+ if ( stats->max_len<stats->nbases ) stats->max_len++; |
|
985 |
+ if ( stats->max_qual+1<stats->nquals ) stats->max_qual++; |
|
986 |
+ printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n"); |
|
987 |
+ printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); |
|
988 |
+ for (ibase=0; ibase<stats->max_len; ibase++) |
|
989 |
+ { |
|
990 |
+ printf("FFQ\t%d",ibase+1); |
|
991 |
+ for (iqual=0; iqual<=stats->max_qual; iqual++) |
|
992 |
+ { |
|
993 |
+ printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]); |
|
994 |
+ } |
|
995 |
+ printf("\n"); |
|
996 |
+ } |
|
997 |
+ printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n"); |
|
998 |
+ printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); |
|
999 |
+ for (ibase=0; ibase<stats->max_len; ibase++) |
|
1000 |
+ { |
|
1001 |
+ printf("LFQ\t%d",ibase+1); |
|
1002 |
+ for (iqual=0; iqual<=stats->max_qual; iqual++) |
|
1003 |
+ { |
|
1004 |
+ printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]); |
|
1005 |
+ } |
|
1006 |
+ printf("\n"); |
|
1007 |
+ } |
|
1008 |
+ if ( stats->mpc_buf ) |
|
1009 |
+ { |
|
1010 |
+ printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n"); |
|
1011 |
+ printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n"); |
|
1012 |
+ printf("# is the number of N's and the rest is the number of mismatches\n"); |
|
1013 |
+ for (ibase=0; ibase<stats->max_len; ibase++) |
|
1014 |
+ { |
|
1015 |
+ printf("MPC\t%d",ibase+1); |
|
1016 |
+ for (iqual=0; iqual<=stats->max_qual; iqual++) |
|
1017 |
+ { |
|
1018 |
+ printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]); |
|
1019 |
+ } |
|
1020 |
+ printf("\n"); |
|
1021 |
+ } |
|
1022 |
+ } |
|
1023 |
+ printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n"); |
|
1024 |
+ int ibase_prev = 0; |
|
1025 |
+ for (ibase=0; ibase<stats->ngc; ibase++) |
|
1026 |
+ { |
|
1027 |
+ if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; |
|
1028 |
+ printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]); |
|
1029 |
+ ibase_prev = ibase; |
|
1030 |
+ } |
|
1031 |
+ printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); |
|
1032 |
+ ibase_prev = 0; |
|
1033 |
+ for (ibase=0; ibase<stats->ngc; ibase++) |
|
1034 |
+ { |
|
1035 |
+ if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; |
|
1036 |
+ printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]); |
|
1037 |
+ ibase_prev = ibase; |
|
1038 |
+ } |
|
1039 |
+ printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); |
|
1040 |
+ for (ibase=0; ibase<stats->max_len; ibase++) |
|
1041 |
+ { |
|
1042 |
+ uint64_t *ptr = &(stats->acgt_cycles[ibase*4]); |
|
1043 |
+ uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3]; |
|
1044 |
+ if ( ! sum ) continue; |
|
1045 |
+ printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); |
|
1046 |
+ } |
|
1047 |
+ printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); |
|
1048 |
+ for (isize=0; isize<ibulk; isize++) |
|
1049 |
+ printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), |
|
1050 |
+ (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); |
|
1051 |
+ |
|
1052 |
+ printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); |
|
1053 |
+ int ilen; |
|
1054 |
+ for (ilen=0; ilen<stats->max_len; ilen++) |
|
1055 |
+ { |
|
1056 |
+ if ( stats->read_lengths[ilen]>0 ) |
|
1057 |
+ printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]); |
|
1058 |
+ } |
|
1059 |
+ |
|
1060 |
+ printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); |
|
1061 |
+ for (ilen=0; ilen<stats->nindels; ilen++) |
|
1062 |
+ { |
|
1063 |
+ if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) |
|
1064 |
+ printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]); |
|
1065 |
+ } |
|
1066 |
+ |
|
1067 |
+ printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n"); |
|
1068 |
+ for (ilen=0; ilen<=stats->nbases; ilen++) |
|
1069 |
+ { |
|
1070 |
+ // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions |
|
1071 |
+ // the index of the cycle of the first inserted base (also 1-based) |
|
1072 |
+ if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 ) |
|
1073 |
+ printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); |
|
1074 |
+ } |
|
1075 |
+ |
|
1076 |
+ printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); |
|
1077 |
+ if ( stats->cov[0] ) |
|
1078 |
+ printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]); |
|
1079 |
+ int icov; |
|
1080 |
+ for (icov=1; icov<stats->ncov-1; icov++) |
|
1081 |
+ if ( stats->cov[icov] ) |
|
1082 |
+ printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]); |
|
1083 |
+ if ( stats->cov[stats->ncov-1] ) |
|
1084 |
+ printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]); |
|
1085 |
+ |
|
1086 |
+ // Calculate average GC content, then sort by GC and depth |
|
1087 |
+ printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); |
|
1088 |
+ uint32_t igcd; |
|
1089 |
+ for (igcd=0; igcd<stats->igcd; igcd++) |
|
1090 |
+ { |
|
1091 |
+ if ( stats->fai ) |
|
1092 |
+ stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc); |
|
1093 |
+ else |
|
1094 |
+ if ( stats->gcd[igcd].depth ) |
|
1095 |
+ stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); |
|
1096 |
+ } |
|
1097 |
+ qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); |
|
1098 |
+ igcd = 0; |
|
1099 |
+ while ( igcd < stats->igcd ) |
|
1100 |
+ { |
|
1101 |
+ // Calculate percentiles (10,25,50,75,90th) for the current GC content and print |
|
1102 |
+ uint32_t nbins=0, itmp=igcd; |
|
1103 |
+ float gc = stats->gcd[igcd].gc; |
|
1104 |
+ while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) |
|
1105 |
+ { |
|
1106 |
+ nbins++; |
|
1107 |
+ itmp++; |
|
1108 |
+ } |
|
1109 |
+ printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), |
|
1110 |
+ gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size, |
|
1111 |
+ gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, |
|
1112 |
+ gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, |
|
1113 |
+ gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, |
|
1114 |
+ gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size |
|
1115 |
+ ); |
|
1116 |
+ igcd += nbins; |
|
1117 |
+ } |
|
1118 |
+} |
|
1119 |
+ |
|
1120 |
+size_t mygetline(char **line, size_t *n, FILE *fp) |
|
1121 |
+{ |
|
1122 |
+ if (line == NULL || n == NULL || fp == NULL) |
|
1123 |
+ { |
|
1124 |
+ errno = EINVAL; |
|
1125 |
+ return -1; |
|
1126 |
+ } |
|
1127 |
+ if (*n==0 || !*line) |
|
1128 |
+ { |
|
1129 |
+ *line = NULL; |
|
1130 |
+ *n = 0; |
|
1131 |
+ } |
|
1132 |
+ |
|
1133 |
+ size_t nread=0; |
|
1134 |
+ int c; |
|
1135 |
+ while ((c=getc(fp))!= EOF && c!='\n') |
|
1136 |
+ { |
|
1137 |
+ if ( ++nread>=*n ) |
|
1138 |
+ { |
|
1139 |
+ *n += 255; |
|
1140 |
+ *line = realloc(*line, sizeof(char)*(*n)); |
|
1141 |
+ } |
|
1142 |
+ (*line)[nread-1] = c; |
|
1143 |
+ } |
|
1144 |
+ if ( nread>=*n ) |
|
1145 |
+ { |
|
1146 |
+ *n += 255; |
|
1147 |
+ *line = realloc(*line, sizeof(char)*(*n)); |
|
1148 |
+ } |
|
1149 |
+ (*line)[nread] = 0; |
|
1150 |
+ return nread>0 ? nread : -1; |
|
1151 |
+ |
|
1152 |
+} |
|
1153 |
+ |
|
1154 |
+void init_regions(stats_t *stats, char *file) |
|
1155 |
+{ |
|
1156 |
+ khiter_t iter; |
|
1157 |
+ khash_t(kh_bam_tid) *header_hash; |
|
1158 |
+ |
|
1159 |
+ bam_init_header_hash(stats->sam->header); |
|
1160 |
+ header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; |
|
1161 |
+ |
|
1162 |
+ FILE *fp = fopen(file,"r"); |
|
1163 |
+ if ( !fp ) error("%s: %s\n",file,strerror(errno)); |
|
1164 |
+ |
|
1165 |
+ char *line = NULL; |
|
1166 |
+ size_t len = 0; |
|
1167 |
+ ssize_t nread; |
|
1168 |
+ int warned = 0; |
|
1169 |
+ int prev_tid=-1, prev_pos=-1; |
|
1170 |
+ while ((nread = mygetline(&line, &len, fp)) != -1) |
|
1171 |
+ { |
|
1172 |
+ if ( line[0] == '#' ) continue; |
|
1173 |
+ |
|
1174 |
+ int i = 0; |
|
1175 |
+ while ( i<nread && !isspace(line[i]) ) i++; |
|
1176 |
+ if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line); |
|
1177 |
+ line[i] = 0; |
|
1178 |
+ |
|
1179 |
+ iter = kh_get(kh_bam_tid, header_hash, line); |
|
1180 |
+ int tid = kh_val(header_hash, iter); |
|
1181 |
+ if ( iter == kh_end(header_hash) ) |
|
1182 |
+ { |
|
1183 |
+ if ( !warned ) |
|
1184 |
+ fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line); |
|
1185 |
+ warned = 1; |
|
1186 |
+ continue; |
|
1187 |
+ } |
|
1188 |
+ |
|
1189 |
+ if ( tid >= stats->nregions ) |
|
1190 |
+ { |
|
1191 |
+ stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); |
|
1192 |
+ int j; |
|
1193 |
+ for (j=stats->nregions; j<stats->nregions+100; j++) |
|
1194 |
+ { |
|
1195 |
+ stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; |
|
1196 |
+ stats->regions[j].pos = NULL; |
|
1197 |
+ } |
|
1198 |
+ stats->nregions += 100; |
|
1199 |
+ } |
|
1200 |
+ int npos = stats->regions[tid].npos; |
|
1201 |
+ if ( npos >= stats->regions[tid].mpos ) |
|
1202 |
+ { |
|
1203 |
+ stats->regions[tid].mpos += 1000; |
|
1204 |
+ stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); |
|
1205 |
+ } |
|
1206 |
+ |
|
1207 |
+ if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n"); |
|
1208 |
+ if ( prev_tid==-1 || prev_tid!=tid ) |
|
1209 |
+ { |
|
1210 |
+ prev_tid = tid; |
|
1211 |
+ prev_pos = stats->regions[tid].pos[npos].from; |
|
1212 |
+ } |
|
1213 |
+ if ( prev_pos>stats->regions[tid].pos[npos].from ) |
|
1214 |
+ error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos); |
|
1215 |
+ stats->regions[tid].npos++; |
|
1216 |
+ } |
|
1217 |
+ if (line) free(line); |
|
1218 |
+ if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n"); |
|
1219 |
+ fclose(fp); |
|
1220 |
+} |
|
1221 |
+ |
|
1222 |
+void destroy_regions(stats_t *stats) |
|
1223 |
+{ |
|
1224 |
+ int i; |
|
1225 |
+ for (i=0; i<stats->nregions; i++) |
|
1226 |
+ { |
|
1227 |
+ if ( !stats->regions[i].mpos ) continue; |
|
1228 |
+ free(stats->regions[i].pos); |
|
1229 |
+ } |
|
1230 |
+ if ( stats->regions ) free(stats->regions); |
|
1231 |
+} |
|
1232 |
+ |
|
1233 |
+static int fetch_read(const bam1_t *bam_line, void *data) |
|
1234 |
+{ |
|
1235 |
+ collect_stats((bam1_t*)bam_line,(stats_t*)data); |
|
1236 |
+ return 1; |
|
1237 |
+} |
|
1238 |
+ |
|
1239 |
+void reset_regions(stats_t *stats) |
|
1240 |
+{ |
|
1241 |
+ int i; |
|
1242 |
+ for (i=0; i<stats->nregions; i++) |
|
1243 |
+ stats->regions[i].cpos = 0; |
|
1244 |
+} |
|
1245 |
+ |
|
1246 |
+int is_in_regions(bam1_t *bam_line, stats_t *stats) |
|
1247 |
+{ |
|
1248 |
+ if ( !stats->regions ) return 1; |
|
1249 |
+ |
|
1250 |
+ if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0; |
|
1251 |
+ if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); |
|
1252 |
+ |
|
1253 |
+ regions_t *reg = &stats->regions[bam_line->core.tid]; |
|
1254 |
+ if ( reg->cpos==reg->npos ) return 0; // done for this chr |
|
1255 |
+ |
|
1256 |
+ // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, |
|
1257 |
+ // even small overlap is enough to include the read in the stats. |
|
1258 |
+ int i = reg->cpos; |
|
1259 |
+ while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++; |
|
1260 |
+ if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } |
|
1261 |
+ if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0; |
|
1262 |
+ reg->cpos = i; |
|
1263 |
+ stats->reg_from = reg->pos[i].from; |
|
1264 |
+ stats->reg_to = reg->pos[i].to; |
|
1265 |
+ |
|
1266 |
+ return 1; |
|
1267 |
+} |
|
1268 |
+ |
|
1269 |
+void init_group_id(stats_t *stats, char *id) |
|
1270 |
+{ |
|
1271 |
+ if ( !stats->sam->header->dict ) |
|
1272 |
+ stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); |
|
1273 |
+ void *iter = stats->sam->header->dict; |
|
1274 |
+ const char *key, *val; |
|
1275 |
+ int n = 0; |
|
1276 |
+ stats->rg_hash = kh_init(kh_rg); |
|
1277 |
+ while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) |
|
1278 |
+ { |
|
1279 |
+ if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) |
|
1280 |
+ { |
|
1281 |
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, key); |
|
1282 |
+ if ( k != kh_end(stats->rg_hash) ) |
|
1283 |
+ fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); |
|
1284 |
+ int ret; |
|
1285 |
+ k = kh_put(kh_rg, stats->rg_hash, key, &ret); |
|
1286 |
+ kh_value(stats->rg_hash, k) = val; |
|
1287 |
+ n++; |
|
1288 |
+ } |
|
1289 |
+ } |
|
1290 |
+ if ( !n ) |
|
1291 |
+ error("The sample or read group \"%s\" not present.\n", id); |
|
1292 |
+} |
|
1293 |
+ |
|
1294 |
+ |
|
1295 |
+void error(const char *format, ...) |
|
1296 |
+{ |
|
1297 |
+ if ( !format ) |
|
1298 |
+ { |
|
1299 |
+ printf("Version: %s\n", BAMCHECK_VERSION); |
|
1300 |
+ printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n"); |
|
1301 |
+ printf("Usage: bamcheck [OPTIONS] file.bam\n"); |
|
1302 |
+ printf(" bamcheck [OPTIONS] file.bam chr:from-to\n"); |
|
1303 |
+ printf("Options:\n"); |
|
1304 |
+ printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n"); |
|
1305 |
+ printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); |
|
1306 |
+ printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n"); |
|
1307 |
+ printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n"); |
|
1308 |
+ printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n"); |
|
1309 |
+ printf(" -h, --help This help message\n"); |
|
1310 |
+ printf(" -i, --insert-size <int> Maximum insert size [8000]\n"); |
|
1311 |
+ printf(" -I, --id <string> Include only listed read group or sample name\n"); |
|
1312 |
+ printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n"); |
|
1313 |
+ printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n"); |
|
1314 |
+ printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n"); |
|
1315 |
+ printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n"); |
|
1316 |
+ printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); |
|
1317 |
+ printf(" -s, --sam Input is SAM\n"); |
|
1318 |
+ printf("\n"); |
|
1319 |
+ } |
|
1320 |
+ else |
|
1321 |
+ { |
|
1322 |
+ va_list ap; |
|
1323 |
+ va_start(ap, format); |
|
1324 |
+ vfprintf(stderr, format, ap); |
|
1325 |
+ va_end(ap); |
|
1326 |
+ } |
|
1327 |
+ exit(-1); |
|
1328 |
+} |
|
1329 |
+ |
|
1330 |
+int main(int argc, char *argv[]) |
|
1331 |
+{ |
|
1332 |
+ char *targets = NULL; |
|
1333 |
+ char *bam_fname = NULL; |
|
1334 |
+ char *group_id = NULL; |
|
1335 |
+ samfile_t *sam = NULL; |
|
1336 |
+ char in_mode[5]; |
|
1337 |
+ |
|
1338 |
+ stats_t *stats = calloc(1,sizeof(stats_t)); |
|
1339 |
+ stats->ngc = 200; |
|
1340 |
+ stats->nquals = 256; |
|
1341 |
+ stats->nbases = 300; |
|
1342 |
+ stats->nisize = 8000; |
|
1343 |
+ stats->max_len = 30; |
|
1344 |
+ stats->max_qual = 40; |
|
1345 |
+ stats->isize_main_bulk = 0.99; // There are always outliers at the far end |
|
1346 |
+ stats->gcd_bin_size = 20e3; |
|
1347 |
+ stats->gcd_ref_size = 4.2e9; |
|
1348 |
+ stats->rseq_pos = -1; |
|
1349 |
+ stats->tid = stats->gcd_pos = -1; |
|
1350 |
+ stats->igcd = 0; |
|
1351 |
+ stats->is_sorted = 1; |
|
1352 |
+ stats->cov_min = 1; |
|
1353 |
+ stats->cov_max = 1000; |
|
1354 |
+ stats->cov_step = 1; |
|
1355 |
+ stats->argc = argc; |
|
1356 |
+ stats->argv = argv; |
|
1357 |
+ stats->filter_readlen = -1; |
|
1358 |
+ stats->nindels = stats->nbases; |
|
1359 |
+ |
|
1360 |
+ strcpy(in_mode, "rb"); |
|
1361 |
+ |
|
1362 |
+ static struct option loptions[] = |
|
1363 |
+ { |
|
1364 |
+ {"help",0,0,'h'}, |
|
1365 |
+ {"remove-dups",0,0,'d'}, |
|
1366 |
+ {"sam",0,0,'s'}, |
|
1367 |
+ {"ref-seq",1,0,'r'}, |
|
1368 |
+ {"coverage",1,0,'c'}, |
|
1369 |
+ {"read-length",1,0,'l'}, |
|
1370 |
+ {"insert-size",1,0,'i'}, |
|
1371 |
+ {"most-inserts",1,0,'m'}, |
|
1372 |
+ {"trim-quality",1,0,'q'}, |
|
1373 |
+ {"target-regions",0,0,'t'}, |
|
1374 |
+ {"required-flag",1,0,'f'}, |
|
1375 |
+ {"filtering-flag",0,0,'F'}, |
|
1376 |
+ {"id",1,0,'I'}, |
|
1377 |
+ {"GC-depth",1,0,1}, |
|
1378 |
+ {0,0,0,0} |
|
1379 |
+ }; |
|
1380 |
+ int opt; |
|
1381 |
+ while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 ) |
|
1382 |
+ { |
|
1383 |
+ switch (opt) |
|
1384 |
+ { |
|