Browse code

Add DAPAR, gcatest, iCheck, Imetagene, lfa, MEAL, metagenomeFeatures, pathVar, Prostar, SICtools, SISPA, SWATH2stats

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SICtools@109238 bc3139a8-67e5-0310-9ffc-ced21a209358

James Hester authored on 06/10/2015 17:15:15
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,1521 @@
1
+/* 
2
+    Author: petr.danecek@sanger
3
+    gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
4
+
5
+    Assumptions, approximations and other issues:
6
+        - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7
+            There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8
+        - coverage distribution ignores softclips and deletions
9
+        - some stats require sorted BAMs
10
+        - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11
+        - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12
+        - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13
+            counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14
+            good enough to include the read in the stats.
15
+
16
+*/
17
+
18
+#define BAMCHECK_VERSION "2012-09-04"
19
+
20
+#define _ISOC99_SOURCE
21
+#include <stdio.h>
22
+#include <stdlib.h>
23
+#include <stdarg.h>
24
+#include <string.h>
25
+#include <math.h>
26
+#include <ctype.h>
27
+#include <getopt.h>
28
+#include <errno.h>
29
+#include <assert.h>
30
+#include "faidx.h"
31
+#include "khash.h"
32
+#include "sam.h"
33
+#include "sam_header.h"
34
+#include "razf.h"
35
+
36
+#define BWA_MIN_RDLEN 35
37
+#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
38
+#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
39
+#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
40
+#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
41
+#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
42
+#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
43
+#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
44
+
45
+typedef struct 
46
+{
47
+    int32_t line_len, line_blen;
48
+    int64_t len;
49
+    uint64_t offset;
50
+} 
51
+faidx1_t;
52
+KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
53
+KHASH_MAP_INIT_STR(kh_bam_tid, int)
54
+KHASH_MAP_INIT_STR(kh_rg, const char *)
55
+struct __faidx_t {
56
+    RAZF *rz;
57
+    int n, m;
58
+    char **name;
59
+    khash_t(kh_faidx) *hash;
60
+};
61
+
62
+typedef struct
63
+{
64
+    float gc;
65
+    uint32_t depth;
66
+}
67
+gc_depth_t;
68
+
69
+// For coverage distribution, a simple pileup
70
+typedef struct 
71
+{
72
+    int64_t pos;
73
+    int size, start;
74
+    int *buffer;
75
+} 
76
+round_buffer_t;
77
+
78
+typedef struct { uint32_t from, to; } pos_t;
79
+typedef struct
80
+{
81
+    int npos,mpos,cpos;
82
+    pos_t *pos;
83
+}
84
+regions_t;
85
+
86
+typedef struct
87
+{
88
+    // Parameters
89
+    int trim_qual;      // bwa trim quality
90
+
91
+    // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
92
+    //  insert size histogram holder
93
+    int nquals;         // The number of quality bins 
94
+    int nbases;         // The maximum sequence length the allocated array can hold
95
+    int nisize;         // The maximum insert size that the allocated array can hold
96
+    int ngc;            // The size of gc_1st and gc_2nd
97
+    int nindels;        // The maximum indel length for indel distribution
98
+
99
+    // Arrays for the histogram data
100
+    uint64_t *quals_1st, *quals_2nd;
101
+    uint64_t *gc_1st, *gc_2nd;
102
+    uint64_t *isize_inward, *isize_outward, *isize_other;
103
+    uint64_t *acgt_cycles;
104
+    uint64_t *read_lengths;
105
+    uint64_t *insertions, *deletions;
106
+    uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
107
+
108
+    // The extremes encountered
109
+    int max_len;            // Maximum read length
110
+    int max_qual;           // Maximum quality
111
+    float isize_main_bulk;  // There are always some unrealistically big insert sizes, report only the main part
112
+    int is_sorted;
113
+
114
+    // Summary numbers
115
+    uint64_t total_len;
116
+    uint64_t total_len_dup;
117
+    uint64_t nreads_1st;
118
+    uint64_t nreads_2nd;
119
+    uint64_t nreads_filtered;
120
+    uint64_t nreads_dup;
121
+    uint64_t nreads_unmapped;
122
+    uint64_t nreads_unpaired;
123
+    uint64_t nreads_paired;
124
+    uint64_t nreads_anomalous;
125
+    uint64_t nreads_mq0;
126
+    uint64_t nbases_mapped;
127
+    uint64_t nbases_mapped_cigar;
128
+    uint64_t nbases_trimmed;  // bwa trimmed bases
129
+    uint64_t nmismatches;
130
+    uint64_t nreads_QCfailed, nreads_secondary;
131
+
132
+    // GC-depth related data
133
+    uint32_t ngcd, igcd;        // The maximum number of GC depth bins and index of the current bin
134
+    gc_depth_t *gcd;            // The GC-depth bins holder
135
+    int gcd_bin_size;           // The size of GC-depth bin
136
+    uint32_t gcd_ref_size;      // The approximate size of the genome
137
+    int32_t tid, gcd_pos;       // Position of the current bin
138
+    int32_t pos;                // Position of the last read
139
+
140
+    // Coverage distribution related data
141
+    int ncov;                       // The number of coverage bins
142
+    uint64_t *cov;                  // The coverage frequencies
143
+    int cov_min,cov_max,cov_step;   // Minimum, maximum coverage and size of the coverage bins
144
+    round_buffer_t cov_rbuf;        // Pileup round buffer
145
+
146
+    // Mismatches by read cycle 
147
+    uint8_t *rseq_buf;              // A buffer for reference sequence to check the mismatches against
148
+    int mrseq_buf;                  // The size of the buffer
149
+    int32_t rseq_pos;               // The coordinate of the first base in the buffer
150
+    int32_t nrseq_buf;              // The used part of the buffer
151
+    uint64_t *mpc_buf;              // Mismatches per cycle
152
+
153
+    // Filters
154
+    int filter_readlen;
155
+
156
+    // Target regions
157
+    int nregions, reg_from,reg_to;
158
+    regions_t *regions;
159
+
160
+    // Auxiliary data
161
+    int flag_require, flag_filter;
162
+    double sum_qual;                // For calculating average quality value 
163
+    samfile_t *sam;             
164
+    khash_t(kh_rg) *rg_hash;        // Read groups to include, the array is null-terminated
165
+    faidx_t *fai;                   // Reference sequence for GC-depth graph
166
+    int argc;                       // Command line arguments to be printed on the output
167
+    char **argv;
168
+}
169
+stats_t;
170
+
171
+void error(const char *format, ...);
172
+void bam_init_header_hash(bam_header_t *header);
173
+int is_in_regions(bam1_t *bam_line, stats_t *stats);
174
+
175
+
176
+// Coverage distribution methods
177
+inline int coverage_idx(int min, int max, int n, int step, int depth)
178
+{
179
+    if ( depth < min )
180
+        return 0;
181
+
182
+    if ( depth > max )
183
+        return n-1;
184
+
185
+    return 1 + (depth - min) / step;
186
+}
187
+
188
+inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
189
+{
190
+    return (offset + (pos-refpos) % size) % size;
191
+}
192
+
193
+void round_buffer_flush(stats_t *stats, int64_t pos)
194
+{
195
+    int ibuf,idp;
196
+
197
+    if ( pos==stats->cov_rbuf.pos ) 
198
+        return;
199
+
200
+    int64_t new_pos = pos;
201
+    if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
202
+    {
203
+        // Flush the whole buffer, but in sequential order, 
204
+        pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
205
+    }
206
+
207
+    if ( pos < stats->cov_rbuf.pos ) 
208
+        error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
209
+
210
+    int ifrom = stats->cov_rbuf.start;
211
+    int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
212
+    if ( ifrom>ito )
213
+    {
214
+        for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
215
+        {
216
+            if ( !stats->cov_rbuf.buffer[ibuf] )
217
+                continue;
218
+            idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
219
+            stats->cov[idp]++;
220
+            stats->cov_rbuf.buffer[ibuf] = 0;
221
+        }
222
+        ifrom = 0;
223
+    }
224
+    for (ibuf=ifrom; ibuf<=ito; ibuf++)
225
+    {
226
+        if ( !stats->cov_rbuf.buffer[ibuf] )
227
+            continue;
228
+        idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
229
+        stats->cov[idp]++;
230
+        stats->cov_rbuf.buffer[ibuf] = 0;
231
+    }
232
+    stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
233
+    stats->cov_rbuf.pos   = new_pos;
234
+}
235
+
236
+void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
237
+{
238
+    if ( to-from >= rbuf->size )
239
+        error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
240
+    if ( from < rbuf->pos )
241
+        error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
242
+
243
+    int ifrom,ito,ibuf;
244
+    ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
245
+    ito   = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
246
+    if ( ifrom>ito )
247
+    {
248
+        for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
249
+            rbuf->buffer[ibuf]++;
250
+        ifrom = 0;
251
+    }
252
+    for (ibuf=ifrom; ibuf<=ito; ibuf++)
253
+        rbuf->buffer[ibuf]++;
254
+}
255
+
256
+// Calculate the number of bases in the read trimmed by BWA
257
+int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) 
258
+{
259
+    if ( len<BWA_MIN_RDLEN ) return 0;
260
+
261
+    // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
262
+    //  the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
263
+    int max_trimmed = len - BWA_MIN_RDLEN + 1;
264
+    int l, sum=0, max_sum=0, max_l=0;
265
+
266
+    for (l=0; l<max_trimmed; l++)
267
+    {
268
+        sum += trim_qual - quals[ reverse ? l : len-1-l ];
269
+        if ( sum<0 ) break;
270
+        if ( sum>max_sum )
271
+        {
272
+            max_sum = sum;
273
+            // This is the correct way, but bwa clips from some reason one base less
274
+            // max_l   = l+1;
275
+            max_l   = l;
276
+        }
277
+    }
278
+    return max_l;
279
+}
280
+
281
+
282
+void count_indels(stats_t *stats,bam1_t *bam_line) 
283
+{
284
+    int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
285
+    int is_1st = IS_READ1(bam_line) ? 1 : 0;
286
+    int icig;
287
+    int icycle = 0;
288
+    int read_len = bam_line->core.l_qseq;
289
+    for (icig=0; icig<bam_line->core.n_cigar; icig++) 
290
+    {
291
+        // Conversion from uint32_t to MIDNSHP
292
+        //  0123456
293
+        //  MIDNSHP
294
+        int cig  = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
295
+        int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
296
+
297
+        if ( cig==1 )
298
+        {
299
+            int idx = is_fwd ? icycle : read_len-icycle-ncig;
300
+            if ( idx<0 ) 
301
+                error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
302
+            if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
303
+            if ( is_1st ) 
304
+                stats->ins_cycles_1st[idx]++;
305
+            else
306
+                stats->ins_cycles_2nd[idx]++;
307
+            icycle += ncig;
308
+            if ( ncig<=stats->nindels )
309
+                stats->insertions[ncig-1]++;
310
+            continue;
311
+        }
312
+        if ( cig==2 )
313
+        {
314
+            int idx = is_fwd ? icycle-1 : read_len-icycle-1;
315
+            if ( idx<0 ) continue;  // discard meaningless deletions
316
+            if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
317
+            if ( is_1st )
318
+                stats->del_cycles_1st[idx]++;
319
+            else
320
+                stats->del_cycles_2nd[idx]++;
321
+            if ( ncig<=stats->nindels )
322
+                stats->deletions[ncig-1]++;
323
+            continue;
324
+        }
325
+        if ( cig!=3 && cig!=5 )
326
+            icycle += ncig;
327
+    }
328
+}
329
+
330
+void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) 
331
+{
332
+    int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
333
+    int icig,iread=0,icycle=0;
334
+    int iref = bam_line->core.pos - stats->rseq_pos;
335
+    int read_len   = bam_line->core.l_qseq;
336
+    uint8_t *read  = bam1_seq(bam_line);
337
+    uint8_t *quals = bam1_qual(bam_line);
338
+    uint64_t *mpc_buf = stats->mpc_buf;
339
+    for (icig=0; icig<bam_line->core.n_cigar; icig++) 
340
+    {
341
+        // Conversion from uint32_t to MIDNSHP
342
+        //  0123456
343
+        //  MIDNSHP
344
+        int cig  = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
345
+        int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
346
+        if ( cig==1 )
347
+        {
348
+            iread  += ncig;
349
+            icycle += ncig;
350
+            continue;
351
+        }
352
+        if ( cig==2 )
353
+        {
354
+            iref += ncig;
355
+            continue;
356
+        }
357
+        if ( cig==4 )
358
+        {
359
+            icycle += ncig;
360
+            // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
361
+            //   iref += ncig;
362
+            iread  += ncig;
363
+            continue;
364
+        }
365
+        if ( cig==5 )
366
+        {
367
+            icycle += ncig;
368
+            continue;
369
+        }
370
+        // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
371
+        //  chunk of refseq in memory. Not very frequent and not noticable in the stats.
372
+        if ( cig==3 || cig==5 ) continue;
373
+        if ( cig!=0 )
374
+            error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
375
+       
376
+        if ( ncig+iref > stats->nrseq_buf )
377
+            error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
378
+
379
+        int im;
380
+        for (im=0; im<ncig; im++)
381
+        {
382
+            uint8_t cread = bam1_seqi(read,iread);
383
+            uint8_t cref  = stats->rseq_buf[iref];
384
+
385
+            // ---------------15
386
+            // =ACMGRSVTWYHKDBN
387
+            if ( cread==15 )
388
+            {
389
+                int idx = is_fwd ? icycle : read_len-icycle-1;
390
+                if ( idx>stats->max_len )
391
+                    error("mpc: %d>%d\n",idx,stats->max_len);
392
+                idx = idx*stats->nquals;
393
+                if ( idx>=stats->nquals*stats->nbases )
394
+                    error("FIXME: mpc_buf overflow\n");
395
+                mpc_buf[idx]++;
396
+            }
397
+            else if ( cref && cread && cref!=cread )
398
+            {
399
+                uint8_t qual = quals[iread] + 1;
400
+                if ( qual>=stats->nquals )
401
+                    error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
402
+
403
+                int idx = is_fwd ? icycle : read_len-icycle-1;
404
+                if ( idx>stats->max_len )
405
+                    error("mpc: %d>%d\n",idx,stats->max_len);
406
+
407
+                idx = idx*stats->nquals + qual;
408
+                if ( idx>=stats->nquals*stats->nbases )
409
+                    error("FIXME: mpc_buf overflow\n");
410
+                mpc_buf[idx]++;
411
+            }
412
+
413
+            iref++;
414
+            iread++;
415
+            icycle++;
416
+        }
417
+    }
418
+}
419
+
420
+void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
421
+{
422
+    khash_t(kh_faidx) *h;
423
+    khiter_t iter;
424
+    faidx1_t val;
425
+    char *chr, c;
426
+    faidx_t *fai = stats->fai;
427
+
428
+    h = fai->hash;
429
+    chr = stats->sam->header->target_name[tid];
430
+
431
+    // ID of the sequence name
432
+    iter = kh_get(kh_faidx, h, chr);
433
+    if (iter == kh_end(h)) 
434
+        error("No such reference sequence [%s]?\n", chr);
435
+    val = kh_value(h, iter);
436
+
437
+    // Check the boundaries
438
+    if (pos >= val.len)
439
+        error("Was the bam file mapped with the reference sequence supplied?"
440
+              " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
441
+    int size = stats->mrseq_buf;
442
+    // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
443
+    if (size+pos > val.len) size = val.len-pos;
444
+
445
+    // Position the razf reader
446
+    razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
447
+
448
+    uint8_t *ptr = stats->rseq_buf;
449
+    int nread = 0;
450
+    while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
451
+    {
452
+        if ( !isgraph(c) )
453
+            continue;
454
+
455
+        // Conversion between uint8_t coding and ACGT
456
+        //      -12-4---8-------
457
+        //      =ACMGRSVTWYHKDBN
458
+        if ( c=='A' || c=='a' )
459
+            *ptr = 1;
460
+        else if ( c=='C' || c=='c' )
461
+            *ptr = 2;
462
+        else if ( c=='G' || c=='g' )
463
+            *ptr = 4;
464
+        else if ( c=='T' || c=='t' )
465
+            *ptr = 8;
466
+        else
467
+            *ptr = 0;
468
+        ptr++;
469
+        nread++;
470
+    }
471
+    if ( nread < stats->mrseq_buf )
472
+    {
473
+        memset(ptr,0, stats->mrseq_buf - nread);
474
+        nread = stats->mrseq_buf;
475
+    }
476
+    stats->nrseq_buf = nread;
477
+    stats->rseq_pos  = pos;
478
+    stats->tid       = tid;
479
+}
480
+
481
+float fai_gc_content(stats_t *stats, int pos, int len)
482
+{
483
+    uint32_t gc,count,c;
484
+    int i = pos - stats->rseq_pos, ito = i + len;
485
+    assert( i>=0 && ito<=stats->nrseq_buf );
486
+
487
+    // Count GC content
488
+    gc = count = 0;
489
+    for (; i<ito; i++)
490
+    {
491
+        c = stats->rseq_buf[i];
492
+        if ( c==2 || c==4 )
493
+        {
494
+            gc++;
495
+            count++;
496
+        }
497
+        else if ( c==1 || c==8 )
498
+            count++;
499
+    }
500
+    return count ? (float)gc/count : 0;
501
+}
502
+
503
+void realloc_rseq_buffer(stats_t *stats)
504
+{
505
+    int n = stats->nbases*10;
506
+    if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
507
+    if ( stats->mrseq_buf<n )
508
+    {
509
+        stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
510
+        stats->mrseq_buf = n;
511
+    }
512
+}
513
+
514
+void realloc_gcd_buffer(stats_t *stats, int seq_len)
515
+{
516
+    if ( seq_len >= stats->gcd_bin_size )
517
+        error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
518
+
519
+    int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
520
+    if ( n <= stats->igcd )
521
+        error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
522
+        
523
+    if ( n > stats->ngcd )
524
+    {
525
+        stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
526
+        if ( !stats->gcd )
527
+            error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
528
+        memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
529
+        stats->ngcd = n;
530
+    }
531
+
532
+    realloc_rseq_buffer(stats);
533
+}
534
+
535
+void realloc_buffers(stats_t *stats, int seq_len)
536
+{
537
+    int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
538
+
539
+    stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
540
+    if ( !stats->quals_1st )
541
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
542
+    memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
543
+
544
+    stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
545
+    if ( !stats->quals_2nd )
546
+        error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
547
+    memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
548
+
549
+    if ( stats->mpc_buf )
550
+    {
551
+        stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
552
+        if ( !stats->mpc_buf )
553
+            error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
554
+        memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
555
+    }
556
+
557
+    stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
558
+    if ( !stats->acgt_cycles )
559
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
560
+    memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
561
+
562
+    stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
563
+    if ( !stats->read_lengths )
564
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
565
+    memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
566
+
567
+    stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
568
+    if ( !stats->insertions )
569
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
570
+    memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
571
+
572
+    stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
573
+    if ( !stats->deletions )
574
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
575
+    memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
576
+
577
+    stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
578
+    if ( !stats->ins_cycles_1st )
579
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
580
+    memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
581
+
582
+    stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
583
+    if ( !stats->ins_cycles_2nd )
584
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
585
+    memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
586
+
587
+    stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
588
+    if ( !stats->del_cycles_1st )
589
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
590
+    memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
591
+
592
+    stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
593
+    if ( !stats->del_cycles_2nd )
594
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
595
+    memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
596
+
597
+    stats->nbases = n;
598
+
599
+    // Realloc the coverage distribution buffer
600
+    int *rbuffer = calloc(sizeof(int),seq_len*5);
601
+    n = stats->cov_rbuf.size-stats->cov_rbuf.start;
602
+    memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
603
+    if ( stats->cov_rbuf.start>1 )
604
+        memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
605
+    stats->cov_rbuf.start = 0;
606
+    free(stats->cov_rbuf.buffer);
607
+    stats->cov_rbuf.buffer = rbuffer;
608
+    stats->cov_rbuf.size = seq_len*5;
609
+
610
+    realloc_rseq_buffer(stats);
611
+}
612
+
613
+void collect_stats(bam1_t *bam_line, stats_t *stats)
614
+{
615
+    if ( stats->rg_hash )
616
+    {
617
+        const uint8_t *rg = bam_aux_get(bam_line, "RG");
618
+        if ( !rg ) return; 
619
+        khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
620
+        if ( k == kh_end(stats->rg_hash) ) return;
621
+    }
622
+    if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
623
+    {
624
+        stats->nreads_filtered++;
625
+        return;
626
+    }
627
+    if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
628
+    {
629
+        stats->nreads_filtered++;
630
+        return;
631
+    }
632
+    if ( !is_in_regions(bam_line,stats) )
633
+        return;
634
+    if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) 
635
+        return;
636
+
637
+    if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
638
+    if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
639
+
640
+    int seq_len = bam_line->core.l_qseq;
641
+    if ( !seq_len ) return;
642
+
643
+    if ( seq_len >= stats->nbases )
644
+        realloc_buffers(stats,seq_len);
645
+    if ( stats->max_len<seq_len )
646
+        stats->max_len = seq_len;
647
+
648
+    stats->read_lengths[seq_len]++;
649
+
650
+    // Count GC and ACGT per cycle
651
+    uint8_t base, *seq  = bam1_seq(bam_line);
652
+    int gc_count  = 0;
653
+    int i;
654
+    int reverse = IS_REVERSE(bam_line);
655
+    for (i=0; i<seq_len; i++)
656
+    {
657
+        // Conversion from uint8_t coding to ACGT
658
+        //      -12-4---8-------
659
+        //      =ACMGRSVTWYHKDBN
660
+        //       01 2   3
661
+        base = bam1_seqi(seq,i);
662
+        base /= 2;
663
+        if ( base==1 || base==2 ) gc_count++;
664
+        else if ( base>2 ) base=3;
665
+        if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) 
666
+            error("FIXME: acgt_cycles\n");
667
+        stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
668
+    }
669
+    int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
670
+    int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
671
+    if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
672
+
673
+    // Determine which array (1st or 2nd read) will these stats go to,
674
+    //  trim low quality bases from end the same way BWA does, 
675
+    //  fill GC histogram
676
+    uint64_t *quals;
677
+    uint8_t *bam_quals = bam1_qual(bam_line);
678
+    if ( bam_line->core.flag&BAM_FREAD2 )
679
+    {
680
+        quals  = stats->quals_2nd;
681
+        stats->nreads_2nd++;
682
+        for (i=gc_idx_min; i<gc_idx_max; i++)
683
+            stats->gc_2nd[i]++;
684
+    }
685
+    else
686
+    {
687
+        quals = stats->quals_1st;
688
+        stats->nreads_1st++;
689
+        for (i=gc_idx_min; i<gc_idx_max; i++)
690
+            stats->gc_1st[i]++;
691
+    }
692
+    if ( stats->trim_qual>0 ) 
693
+        stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
694
+
695
+    // Quality histogram and average quality
696
+    for (i=0; i<seq_len; i++)
697
+    {
698
+        uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
699
+        if ( qual>=stats->nquals )
700
+            error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
701
+        if ( qual>stats->max_qual )
702
+            stats->max_qual = qual;
703
+
704
+        quals[ i*stats->nquals+qual ]++;
705
+        stats->sum_qual += qual;
706
+    }
707
+
708
+    // Look at the flags and increment appropriate counters (mapped, paired, etc)
709
+    if ( IS_UNMAPPED(bam_line) )
710
+        stats->nreads_unmapped++;
711
+    else
712
+    {
713
+        if ( !bam_line->core.qual )
714
+            stats->nreads_mq0++;
715
+
716
+        count_indels(stats,bam_line);
717
+
718
+        if ( !IS_PAIRED(bam_line) )
719
+            stats->nreads_unpaired++;
720
+        else 
721
+        {
722
+            stats->nreads_paired++;
723
+
724
+            if ( bam_line->core.tid!=bam_line->core.mtid )
725
+                stats->nreads_anomalous++;
726
+
727
+            // The insert size is tricky, because for long inserts the libraries are
728
+            // prepared differently and the pairs point in other direction. BWA does
729
+            // not set the paired flag for them.  Similar thing is true also for 454
730
+            // reads. Mates mapped to different chromosomes have isize==0.
731
+            int32_t isize = bam_line->core.isize;
732
+            if ( isize<0 ) isize = -isize;
733
+            if ( isize >= stats->nisize )
734
+                isize = stats->nisize-1;
735
+            if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
736
+            {
737
+                int pos_fst = bam_line->core.mpos - bam_line->core.pos;
738
+                int is_fst  = IS_READ1(bam_line) ? 1 : -1;
739
+                int is_fwd  = IS_REVERSE(bam_line) ? -1 : 1;
740
+                int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
741
+
742
+                if ( is_fwd*is_mfwd>0 )
743
+                    stats->isize_other[isize]++;
744
+                else if ( is_fst*pos_fst>0 )
745
+                {
746
+                    if ( is_fst*is_fwd>0 )
747
+                        stats->isize_inward[isize]++;
748
+                    else
749
+                        stats->isize_outward[isize]++;
750
+                }
751
+                else if ( is_fst*pos_fst<0 )
752
+                {
753
+                    if ( is_fst*is_fwd>0 )
754
+                        stats->isize_outward[isize]++;
755
+                    else
756
+                        stats->isize_inward[isize]++;
757
+                }
758
+            }
759
+        }
760
+
761
+        // Number of mismatches 
762
+        uint8_t *nm = bam_aux_get(bam_line,"NM");
763
+        if (nm) 
764
+            stats->nmismatches += bam_aux2i(nm);
765
+
766
+        // Number of mapped bases from cigar 
767
+        // Conversion from uint32_t to MIDNSHP
768
+        //  012-4--
769
+        //  MIDNSHP
770
+        if ( bam_line->core.n_cigar == 0) 
771
+            error("FIXME: mapped read with no cigar?\n");
772
+        int readlen=seq_len;
773
+        if ( stats->regions )
774
+        {
775
+            // Count only on-target bases
776
+            int iref = bam_line->core.pos + 1;
777
+            for (i=0; i<bam_line->core.n_cigar; i++)
778
+            {
779
+                int cig  = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
780
+                int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
781
+                if ( cig==2 ) readlen += ncig;
782
+                else if ( cig==0 ) 
783
+                {
784
+                    if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
785
+                    else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
786
+                    if ( ncig<0 ) ncig = 0;
787
+                    stats->nbases_mapped_cigar += ncig;
788
+                    iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
789
+                }
790
+                else if ( cig==1 )
791
+                {
792
+                    iref += ncig;
793
+                    if ( iref>=stats->reg_from && iref<=stats->reg_to )
794
+                        stats->nbases_mapped_cigar += ncig;
795
+                }
796
+            }
797
+        }
798
+        else
799
+        {
800
+            // Count the whole read
801
+            for (i=0; i<bam_line->core.n_cigar; i++) 
802
+            {
803
+                if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
804
+                    stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
805
+                if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
806
+                    readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
807
+            }
808
+        }
809
+        stats->nbases_mapped += seq_len;
810
+
811
+        if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
812
+            stats->is_sorted = 0;
813
+        stats->pos = bam_line->core.pos;
814
+
815
+        if ( stats->is_sorted )
816
+        {
817
+            if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
818
+                round_buffer_flush(stats,-1);
819
+
820
+            // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
821
+            //  are not splitted which results in up to seq_len-1 overlaps. The default bin size is
822
+            //  20kbp, so the effect is negligible.
823
+            if ( stats->fai )
824
+            {
825
+                int inc_ref = 0, inc_gcd = 0;
826
+                // First pass or new chromosome
827
+                if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
828
+                // Read goes beyond the end of the rseq buffer
829
+                else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
830
+                // Read overlaps the next gcd bin
831
+                else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen ) 
832
+                { 
833
+                    inc_gcd = 1;
834
+                    if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
835
+                }
836
+                if ( inc_gcd )
837
+                {
838
+                    stats->igcd++;
839
+                    if ( stats->igcd >= stats->ngcd )
840
+                        realloc_gcd_buffer(stats, readlen);
841
+                    if ( inc_ref )
842
+                        read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
843
+                    stats->gcd_pos = bam_line->core.pos;
844
+                    stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
845
+                }
846
+
847
+                count_mismatches_per_cycle(stats,bam_line);
848
+            }
849
+            // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
850
+            else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
851
+            {
852
+                // First pass or a new chromosome
853
+                stats->tid     = bam_line->core.tid;
854
+                stats->gcd_pos = bam_line->core.pos;
855
+                stats->igcd++;
856
+                if ( stats->igcd >= stats->ngcd )
857
+                    realloc_gcd_buffer(stats, readlen);
858
+            }
859
+            stats->gcd[ stats->igcd ].depth++;
860
+            // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
861
+            if ( !stats->fai )
862
+                stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
863
+
864
+            // Coverage distribution graph
865
+            round_buffer_flush(stats,bam_line->core.pos);
866
+            round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
867
+        }
868
+    }
869
+
870
+    stats->total_len += seq_len;
871
+    if ( IS_DUP(bam_line) )
872
+    {
873
+        stats->total_len_dup += seq_len;
874
+        stats->nreads_dup++;
875
+    }
876
+}
877
+
878
+// Sort by GC and depth
879
+#define GCD_t(x) ((gc_depth_t *)x)
880
+static int gcd_cmp(const void *a, const void *b)
881
+{
882
+    if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
883
+    if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
884
+    if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
885
+    if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
886
+    return 0;
887
+}
888
+#undef GCD_t
889
+
890
+float gcd_percentile(gc_depth_t *gcd, int N, int p)
891
+{
892
+    float n,d;
893
+    int k;
894
+
895
+    n = p*(N+1)/100;
896
+    k = n;
897
+    if ( k<=0 ) 
898
+        return gcd[0].depth;
899
+    if ( k>=N ) 
900
+        return gcd[N-1].depth;
901
+
902
+    d = n - k;
903
+    return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
904
+}
905
+
906
+void output_stats(stats_t *stats)
907
+{
908
+    // Calculate average insert size and standard deviation (from the main bulk data only)
909
+    int isize, ibulk=0;
910
+    uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
911
+    for (isize=0; isize<stats->nisize; isize++)
912
+    {
913
+        // Each pair was counted twice
914
+        stats->isize_inward[isize] *= 0.5;
915
+        stats->isize_outward[isize] *= 0.5;
916
+        stats->isize_other[isize] *= 0.5;
917
+
918
+        nisize_inward += stats->isize_inward[isize];
919
+        nisize_outward += stats->isize_outward[isize];
920
+        nisize_other += stats->isize_other[isize];
921
+        nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
922
+    }
923
+
924
+    double bulk=0, avg_isize=0, sd_isize=0;
925
+    for (isize=0; isize<stats->nisize; isize++)
926
+    {
927
+        bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
928
+        avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
929
+
930
+        if ( bulk/nisize > stats->isize_main_bulk )
931
+        {
932
+            ibulk  = isize+1;
933
+            nisize = bulk;
934
+            break;
935
+        }
936
+    }
937
+    avg_isize /= nisize ? nisize : 1;
938
+    for (isize=1; isize<ibulk; isize++)
939
+        sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
940
+    sd_isize = sqrt(sd_isize);
941
+
942
+
943
+    printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
944
+    printf("# The command line was:  %s",stats->argv[0]);
945
+    int i;
946
+    for (i=1; i<stats->argc; i++)
947
+        printf(" %s",stats->argv[i]);
948
+    printf("\n");
949
+    printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
950
+    printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
951
+    printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
952
+    printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
953
+    printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
954
+    printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
955
+    printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
956
+    printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
957
+    printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
958
+    printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
959
+    printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
960
+    printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
961
+    printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
962
+    printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
963
+    printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
964
+    printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
965
+    printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
966
+    printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
967
+    printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
968
+    printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
969
+    printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
970
+    printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
971
+    printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
972
+    float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
973
+    printf("SN\taverage length:\t%.0f\n", avg_read_length);
974
+    printf("SN\tmaximum length:\t%d\n", stats->max_len);
975
+    printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
976
+    printf("SN\tinsert size average:\t%.1f\n", avg_isize);
977
+    printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
978
+    printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
979
+    printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
980
+    printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
981
+    printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
982
+
983
+    int ibase,iqual;
984
+    if ( stats->max_len<stats->nbases ) stats->max_len++;
985
+    if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
986
+    printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
987
+    printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
988
+    for (ibase=0; ibase<stats->max_len; ibase++)
989
+    {
990
+        printf("FFQ\t%d",ibase+1);
991
+        for (iqual=0; iqual<=stats->max_qual; iqual++)
992
+        {
993
+            printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
994
+        }
995
+        printf("\n");
996
+    }
997
+    printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
998
+    printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
999
+    for (ibase=0; ibase<stats->max_len; ibase++)
1000
+    {
1001
+        printf("LFQ\t%d",ibase+1);
1002
+        for (iqual=0; iqual<=stats->max_qual; iqual++)
1003
+        {
1004
+            printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
1005
+        }
1006
+        printf("\n");
1007
+    }
1008
+    if ( stats->mpc_buf )
1009
+    {
1010
+        printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
1011
+        printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
1012
+        printf("# is the number of N's and the rest is the number of mismatches\n");
1013
+        for (ibase=0; ibase<stats->max_len; ibase++)
1014
+        {
1015
+            printf("MPC\t%d",ibase+1);
1016
+            for (iqual=0; iqual<=stats->max_qual; iqual++)
1017
+            {
1018
+                printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
1019
+            }
1020
+            printf("\n");
1021
+        }
1022
+    }
1023
+    printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
1024
+    int ibase_prev = 0;
1025
+    for (ibase=0; ibase<stats->ngc; ibase++)
1026
+    {
1027
+        if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
1028
+        printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
1029
+        ibase_prev = ibase;
1030
+    }
1031
+    printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
1032
+    ibase_prev = 0;
1033
+    for (ibase=0; ibase<stats->ngc; ibase++)
1034
+    {
1035
+        if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
1036
+        printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
1037
+        ibase_prev = ibase;
1038
+    }
1039
+    printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
1040
+    for (ibase=0; ibase<stats->max_len; ibase++)
1041
+    {
1042
+        uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
1043
+        uint64_t  sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
1044
+        if ( ! sum ) continue;
1045
+        printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
1046
+    }
1047
+    printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
1048
+    for (isize=0; isize<ibulk; isize++)
1049
+        printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
1050
+            (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
1051
+
1052
+    printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
1053
+    int ilen;
1054
+    for (ilen=0; ilen<stats->max_len; ilen++)
1055
+    {
1056
+        if ( stats->read_lengths[ilen]>0 )
1057
+            printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
1058
+    }
1059
+
1060
+    printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
1061
+    for (ilen=0; ilen<stats->nindels; ilen++)
1062
+    {
1063
+        if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
1064
+            printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1065
+    }
1066
+
1067
+    printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1068
+    for (ilen=0; ilen<=stats->nbases; ilen++)
1069
+    {
1070
+        // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
1071
+        //  the index of the cycle of the first inserted base (also 1-based)
1072
+        if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1073
+            printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1074
+    }
1075
+
1076
+    printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1077
+    if  ( stats->cov[0] )
1078
+        printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1079
+    int icov;
1080
+    for (icov=1; icov<stats->ncov-1; icov++)
1081
+        if ( stats->cov[icov] )
1082
+            printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1083
+    if ( stats->cov[stats->ncov-1] )
1084
+        printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1085
+
1086
+    // Calculate average GC content, then sort by GC and depth
1087
+    printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1088
+    uint32_t igcd;
1089
+    for (igcd=0; igcd<stats->igcd; igcd++)
1090
+    {
1091
+        if ( stats->fai )
1092
+            stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1093
+        else
1094
+            if ( stats->gcd[igcd].depth ) 
1095
+                stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1096
+    }
1097
+    qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1098
+    igcd = 0;
1099
+    while ( igcd < stats->igcd )
1100
+    {
1101
+        // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1102
+        uint32_t nbins=0, itmp=igcd;
1103
+        float gc = stats->gcd[igcd].gc;
1104
+        while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1105
+        {
1106
+            nbins++;
1107
+            itmp++;
1108
+        }
1109
+        printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1110
+                gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1111
+                gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, 
1112
+                gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, 
1113
+                gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, 
1114
+                gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size 
1115
+              );
1116
+        igcd += nbins;
1117
+    }
1118
+}
1119
+
1120
+size_t mygetline(char **line, size_t *n, FILE *fp)
1121
+{
1122
+    if (line == NULL || n == NULL || fp == NULL)
1123
+    {
1124
+        errno = EINVAL;
1125
+        return -1;
1126
+    }
1127
+    if (*n==0 || !*line)
1128
+    {
1129
+        *line = NULL;
1130
+        *n = 0;
1131
+    }
1132
+
1133
+    size_t nread=0;
1134
+    int c;
1135
+    while ((c=getc(fp))!= EOF && c!='\n')
1136
+    {
1137
+        if ( ++nread>=*n )
1138
+        {
1139
+            *n += 255;
1140
+            *line = realloc(*line, sizeof(char)*(*n));
1141
+        }
1142
+        (*line)[nread-1] = c;
1143
+    }
1144
+    if ( nread>=*n )
1145
+    {
1146
+        *n += 255;
1147
+        *line = realloc(*line, sizeof(char)*(*n));
1148
+    }
1149
+    (*line)[nread] = 0;
1150
+    return nread>0 ? nread : -1;
1151
+
1152
+}
1153
+
1154
+void init_regions(stats_t *stats, char *file)
1155
+{
1156
+    khiter_t iter;
1157
+    khash_t(kh_bam_tid) *header_hash;
1158
+
1159
+    bam_init_header_hash(stats->sam->header);
1160
+    header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1161
+
1162
+    FILE *fp = fopen(file,"r");
1163
+    if ( !fp ) error("%s: %s\n",file,strerror(errno));
1164
+
1165
+    char *line = NULL;
1166
+    size_t len = 0;
1167
+    ssize_t nread;
1168
+    int warned = 0;
1169
+    int prev_tid=-1, prev_pos=-1;
1170
+    while ((nread = mygetline(&line, &len, fp)) != -1) 
1171
+    {
1172
+        if ( line[0] == '#' ) continue;
1173
+
1174
+        int i = 0;
1175
+        while ( i<nread && !isspace(line[i]) ) i++;
1176
+        if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1177
+        line[i] = 0;
1178
+
1179
+        iter = kh_get(kh_bam_tid, header_hash, line);
1180
+        int tid = kh_val(header_hash, iter);
1181
+        if ( iter == kh_end(header_hash) )
1182
+        {
1183
+            if ( !warned )
1184
+                fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1185
+            warned = 1;
1186
+            continue;
1187
+        }
1188
+
1189
+        if ( tid >= stats->nregions )
1190
+        {
1191
+            stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1192
+            int j;
1193
+            for (j=stats->nregions; j<stats->nregions+100; j++)
1194
+            {
1195
+                stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1196
+                stats->regions[j].pos = NULL;
1197
+            }
1198
+            stats->nregions += 100;
1199
+        }
1200
+        int npos = stats->regions[tid].npos;
1201
+        if ( npos >= stats->regions[tid].mpos )
1202
+        {
1203
+            stats->regions[tid].mpos += 1000;
1204
+            stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1205
+        }
1206
+
1207
+        if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1208
+        if ( prev_tid==-1 || prev_tid!=tid )
1209
+        {
1210
+            prev_tid = tid;
1211
+            prev_pos = stats->regions[tid].pos[npos].from;
1212
+        }
1213
+        if ( prev_pos>stats->regions[tid].pos[npos].from )
1214
+            error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1215
+        stats->regions[tid].npos++;
1216
+    }
1217
+    if (line) free(line);
1218
+    if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1219
+    fclose(fp);
1220
+}
1221
+
1222
+void destroy_regions(stats_t *stats)
1223
+{
1224
+    int i;
1225
+    for (i=0; i<stats->nregions; i++)
1226
+    {
1227
+        if ( !stats->regions[i].mpos ) continue;
1228
+        free(stats->regions[i].pos);
1229
+    }
1230
+    if ( stats->regions ) free(stats->regions);
1231
+}
1232
+
1233
+static int fetch_read(const bam1_t *bam_line, void *data)
1234
+{
1235
+    collect_stats((bam1_t*)bam_line,(stats_t*)data);
1236
+    return 1;
1237
+}
1238
+
1239
+void reset_regions(stats_t *stats)
1240
+{
1241
+    int i;
1242
+    for (i=0; i<stats->nregions; i++)
1243
+        stats->regions[i].cpos = 0;
1244
+}
1245
+
1246
+int is_in_regions(bam1_t *bam_line, stats_t *stats)
1247
+{
1248
+    if ( !stats->regions ) return 1;
1249
+
1250
+    if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1251
+    if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1252
+
1253
+    regions_t *reg = &stats->regions[bam_line->core.tid];
1254
+    if ( reg->cpos==reg->npos ) return 0;       // done for this chr
1255
+
1256
+    // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, 
1257
+    //  even small overlap is enough to include the read in the stats.
1258
+    int i = reg->cpos;
1259
+    while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1260
+    if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1261
+    if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1262
+    reg->cpos = i;
1263
+    stats->reg_from = reg->pos[i].from;
1264
+    stats->reg_to   = reg->pos[i].to;
1265
+
1266
+    return 1;
1267
+}
1268
+
1269
+void init_group_id(stats_t *stats, char *id)
1270
+{
1271
+    if ( !stats->sam->header->dict )
1272
+        stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1273
+    void *iter = stats->sam->header->dict;
1274
+    const char *key, *val;
1275
+    int n = 0;
1276
+    stats->rg_hash = kh_init(kh_rg);
1277
+    while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1278
+    {
1279
+        if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1280
+        {
1281
+            khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1282
+            if ( k != kh_end(stats->rg_hash) ) 
1283
+                fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1284
+            int ret;
1285
+            k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1286
+            kh_value(stats->rg_hash, k) = val;
1287
+            n++;
1288
+        }
1289
+    }
1290
+    if ( !n )
1291
+        error("The sample or read group \"%s\" not present.\n", id);
1292
+}
1293
+
1294
+
1295
+void error(const char *format, ...)
1296
+{
1297
+    if ( !format )
1298
+    {
1299
+        printf("Version: %s\n", BAMCHECK_VERSION);
1300
+        printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1301
+        printf("Usage: bamcheck [OPTIONS] file.bam\n");
1302
+        printf("       bamcheck [OPTIONS] file.bam chr:from-to\n");
1303
+        printf("Options:\n");
1304
+        printf("    -c, --coverage <int>,<int>,<int>    Coverage distribution min,max,step [1,1000,1]\n");
1305
+        printf("    -d, --remove-dups                   Exlude from statistics reads marked as duplicates\n");
1306
+        printf("    -f, --required-flag <int>           Required flag, 0 for unset [0]\n");
1307
+        printf("    -F, --filtering-flag <int>          Filtering flag, 0 for unset [0]\n");
1308
+        printf("        --GC-depth <float,float>        Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
1309
+        printf("    -h, --help                          This help message\n");
1310
+        printf("    -i, --insert-size <int>             Maximum insert size [8000]\n");
1311
+        printf("    -I, --id <string>                   Include only listed read group or sample name\n");
1312
+        printf("    -l, --read-length <int>             Include in the statistics only reads with the given read length []\n");
1313
+        printf("    -m, --most-inserts <float>          Report only the main part of inserts [0.99]\n");
1314
+        printf("    -q, --trim-quality <int>            The BWA trimming parameter [0]\n");
1315
+        printf("    -r, --ref-seq <file>                Reference sequence (required for GC-depth calculation).\n");
1316
+        printf("    -t, --target-regions <file>         Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1317
+        printf("    -s, --sam                           Input is SAM\n");
1318
+        printf("\n");
1319
+    }
1320
+    else
1321
+    {
1322
+        va_list ap;
1323
+        va_start(ap, format);
1324
+        vfprintf(stderr, format, ap);
1325
+        va_end(ap);
1326
+    }
1327
+    exit(-1);
1328
+}
1329
+
1330
+int main(int argc, char *argv[])
1331
+{
1332
+    char *targets = NULL;
1333
+    char *bam_fname = NULL;
1334
+    char *group_id = NULL;
1335
+    samfile_t *sam = NULL;
1336
+    char in_mode[5];
1337
+
1338
+    stats_t *stats = calloc(1,sizeof(stats_t));
1339
+    stats->ngc    = 200;
1340
+    stats->nquals = 256;
1341
+    stats->nbases = 300;
1342
+    stats->nisize = 8000;
1343
+    stats->max_len   = 30;
1344
+    stats->max_qual  = 40;
1345
+    stats->isize_main_bulk = 0.99;   // There are always outliers at the far end
1346
+    stats->gcd_bin_size = 20e3;
1347
+    stats->gcd_ref_size = 4.2e9;
1348
+    stats->rseq_pos     = -1;
1349
+    stats->tid = stats->gcd_pos = -1;
1350
+    stats->igcd = 0;
1351
+    stats->is_sorted = 1;
1352
+    stats->cov_min  = 1;
1353
+    stats->cov_max  = 1000;
1354
+    stats->cov_step = 1;
1355
+    stats->argc = argc;
1356
+    stats->argv = argv;
1357
+    stats->filter_readlen = -1;
1358
+    stats->nindels = stats->nbases;
1359
+
1360
+    strcpy(in_mode, "rb");
1361
+
1362
+    static struct option loptions[] = 
1363
+    {
1364
+        {"help",0,0,'h'},
1365
+        {"remove-dups",0,0,'d'},
1366
+        {"sam",0,0,'s'},
1367
+        {"ref-seq",1,0,'r'},
1368
+        {"coverage",1,0,'c'},
1369
+        {"read-length",1,0,'l'},
1370
+        {"insert-size",1,0,'i'},
1371
+        {"most-inserts",1,0,'m'},
1372
+        {"trim-quality",1,0,'q'},
1373
+        {"target-regions",0,0,'t'},
1374
+        {"required-flag",1,0,'f'},
1375
+        {"filtering-flag",0,0,'F'},
1376
+        {"id",1,0,'I'},
1377
+        {"GC-depth",1,0,1},
1378
+        {0,0,0,0}
1379
+    };
1380
+    int opt;
1381
+    while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
1382
+    {
1383
+        switch (opt)
1384
+        {