Browse code

Decoupling the bam tallying from summarization. Soon, we will return an IIT, which can be summarized in various ways. Currently, the only way is the variant summary.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69806 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 25/09/2012 16:00:17
Showing 9 changed files

... ...
@@ -121,20 +121,22 @@ normArgTRUEorFALSE <- function(x) {
121 121
     if (is.unsorted(cycle_breaks))
122 122
       stop("'cycle_breaks' must be sorted")
123 123
   }
124
-  .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which,
124
+  iit <- .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which,
125
+               normArgSingleInteger(alloclength),
126
+               normArgSingleInteger(minimum_mapq),
127
+               normArgSingleInteger(good_unique_mapq),
128
+               normArgSingleInteger(maximum_nhits),
129
+               normArgTRUEorFALSE(concordant_only),
130
+               normArgTRUEorFALSE(unique_only),
131
+               normArgTRUEorFALSE(primary_only),
132
+               normArgSingleInteger(min_depth),
133
+               normArgSingleInteger(variant_strand),
134
+               normArgTRUEorFALSE(ignore_query_Ns),
135
+               normArgTRUEorFALSE(indels),
136
+               normArgSingleInteger(blocksize),
137
+               normArgTRUEorFALSE(verbosep))
138
+  .Call(R_tally_iit_parse, iit,
125 139
         cycle_breaks,
126 140
         normArgSingleInteger(high_base_quality),
127
-        normArgSingleInteger(alloclength),
128
-        normArgSingleInteger(minimum_mapq),
129
-        normArgSingleInteger(good_unique_mapq),
130
-        normArgSingleInteger(maximum_nhits),
131
-        normArgTRUEorFALSE(concordant_only),
132
-        normArgTRUEorFALSE(unique_only),
133
-        normArgTRUEorFALSE(primary_only),
134
-        normArgSingleInteger(min_depth),
135
-        normArgSingleInteger(variant_strand),
136
-        normArgTRUEorFALSE(ignore_query_Ns),
137
-        normArgTRUEorFALSE(indels),
138
-        normArgSingleInteger(blocksize),
139
-        normArgTRUEorFALSE(verbosep))
141
+        NULL)
140 142
 }
... ...
@@ -2,7 +2,7 @@ SUBDIRS = gmap gstruct
2 2
 
3 3
 .PHONY: all clean $(SUBDIRS)
4 4
 
5
-OBJECTS = bamreader.o bamtally.o R_init_gmapR.o
5
+OBJECTS = bamreader.o bamtally.o iit.o variantsummary.o R_init_gmapR.o
6 6
 
7 7
 R_SRC_DIR = ${CURDIR}
8 8
 PREFIX = ${R_SRC_DIR}/../inst/usr
... ...
@@ -7,8 +7,9 @@
7 7
 static const R_CallMethodDef callMethods[] = {
8 8
 
9 9
   /* bamtally.c */
10
-  CALLMETHOD_DEF(R_Bamtally_iit, 19),
11
-
10
+  CALLMETHOD_DEF(R_Bamtally_iit, 17),
11
+  CALLMETHOD_DEF(R_tally_iit_parse, 4),
12
+  
12 13
   /* bamreader.c */
13 14
   CALLMETHOD_DEF(R_Bamread_new, 1),
14 15
   
... ...
@@ -11,233 +11,11 @@
11 11
 #include <gstruct/bamtally.h>
12 12
 
13 13
 #include "gmapR.h"
14
-
15
-enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF,
16
-       COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL,
17
-       MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS,
18
-       COUNT_MINUS_REF, N_BASE_COLS };
19
-
20
-typedef struct TallyTable {
21
-  SEXP seqnames_R;
22
-  int *pos;
23
-  SEXP ref_R;
24
-  SEXP read_R;
25
-  int *n_cycles;
26
-  int *n_cycles_ref;
27
-  int *count;
28
-  int *count_ref;
29
-  int *count_total;
30
-  int *high_quality;
31
-  int *high_quality_ref;
32
-  int *high_quality_total;
33
-  double *mean_quality;
34
-  double *mean_quality_ref;
35
-  int *count_plus;
36
-  int *count_plus_ref;
37
-  int *count_minus;
38
-  int *count_minus_ref;
39
-  SEXP cycle_bins_R;
40
-} TallyTable;
41
-
42
-static inline int
43
-read_int (unsigned char **bytes) {
44
-  int x = ((int *)*bytes)[0];
45
-  *bytes += sizeof(int);
46
-  return x;
47
-}
48
-
49
-static inline char
50
-read_char (unsigned char **bytes) {
51
-  char x = (*bytes)[0];
52
-  (*bytes)++;
53
-  return x;
54
-}
55
-
56
-static inline const char *
57
-read_string (unsigned char **bytes) {
58
-    const char *string = *bytes;
59
-    (*bytes) += strlen(string) + 1;
60
-    return string;
61
-}
62
-
63
-static int
64
-parse_indel_count(unsigned char *bytes) {
65
-  int count = read_int(&bytes);
66
-  return count;
67
-}
68
-
69
-static int
70
-parse_allele_count(unsigned char *bytes) {
71
-  int n_alleles = 1; /* always have a reference */
72
-  bytes += sizeof(int) * 4 + 1; /* skip total and reference */
73
-  while(bytes[0] != '\0') {
74
-    bytes += sizeof(int) * 2 + 1;
75
-    n_alleles++;
76
-  }
77
-  return n_alleles;
78
-}
79
-
80
-static void
81
-read_total_counts(unsigned char **bytes, int row, int *count_total) {
82
-  int count_total_plus = read_int(bytes);
83
-  int count_total_minus = read_int(bytes);
84
-  count_total[row] = count_total_plus + count_total_minus;
85
-}
86
-
87
-static void
88
-read_cycle_counts(unsigned char **bytes, int row, int *n_cycles,
89
-                  SEXP cycle_bins_R, SEXP cycle_breaks_R)
90
-{
91
-  n_cycles[row] = read_int(bytes);
92
-  if (cycle_breaks_R != R_NilValue) {
93
-    for (int index = 0; index < n_cycles[row]; index++) {
94
-      int cycle = abs(read_int(bytes));
95
-      int count = read_int(bytes);
96
-      int bin = 0;
97
-      while(length(cycle_breaks_R) > bin &&
98
-            cycle > INTEGER(cycle_breaks_R)[bin])
99
-        bin++;
100
-      if (bin > 0 && bin < length(cycle_breaks_R)) {
101
-        SEXP bin_vector = VECTOR_ELT(cycle_bins_R, bin - 1);
102
-        INTEGER(bin_vector)[row] += count;
103
-      }
104
-    }
105
-  } else {
106
-    (*bytes) += n_cycles[row] * 2 * sizeof(int);
107
-  }
108
-}
109
-
110
-static int
111
-parse_indels(unsigned char *bytes, int row, SEXP cycle_breaks_R,
112
-             int high_base_quality, TallyTable tally, bool insertion)
113
-{
114
-  int indel_count = read_int(&bytes);
115
-  for (int indel = 0; indel < indel_count; indel++) {
116
-    tally.count_plus[row] = read_int(&bytes);
117
-    tally.count_minus[row] = read_int(&bytes);
118
-    tally.count[row] = tally.count_plus[row] + tally.count_minus[row];
119
-    SEXP seq_R = mkChar(read_string(&bytes));
120
-    if (insertion)
121
-      SET_STRING_ELT(tally.read_R, row, seq_R);
122
-    else SET_STRING_ELT(tally.ref_R, row, seq_R);
123
-    read_cycle_counts(&bytes, row, tally.n_cycles, tally.cycle_bins_R,
124
-                      cycle_breaks_R);
125
-    /* stuff not relevant for indels */
126
-    tally.mean_quality[row] = NA_REAL;
127
-    tally.high_quality[row] = NA_INTEGER;
128
-  }
129
-  return indel_count;
130
-}
131
-
132
-static void
133
-read_quality_counts(unsigned char **bytes, int row, int *high_quality,
134
-                    double *mean_quality, int high_base_quality)
135
-{
136
-  int n_qualities = read_int(bytes);
137
-  int total_quality = 0;
138
-  int total_quality_weight = 0;
139
-  high_quality[row] = 0;
140
-  for (int index = 0; index < n_qualities; index++) {
141
-    int quality = read_int(bytes);
142
-    int count = read_int(bytes);
143
-    if (quality > high_base_quality) {
144
-      total_quality += quality * count;
145
-      total_quality_weight += count;
146
-      high_quality[row] += count;
147
-    }
148
-  }
149
-  mean_quality[row] = total_quality_weight > 0 ?
150
-    (double)total_quality / total_quality_weight : R_NaN;
151
-}
152
-
153
-static int
154
-read_allele_counts(unsigned char **bytes, int row, SEXP read_R,
155
-                   int *count_plus, int *count_minus, int *count)
156
-{
157
-  int n_alleles = 0;
158
-  char allele;
159
-  while((allele = read_char(bytes)) != '\0') {
160
-    SET_STRING_ELT(read_R, row, mkCharLen(&allele, 1));
161
-    count_plus[row] = read_int(bytes);
162
-    count_minus[row] = read_int(bytes);
163
-    count[row] = count_plus[row] + count_minus[row];
164
-    row++;
165
-    n_alleles++;
166
-  }
167
-  return n_alleles;
168
-}
169
-
170
-static int
171
-parse_alleles(unsigned char *bytes, int row, int ref_row, SEXP cycle_breaks_R,
172
-              int high_base_quality, TallyTable tally)
173
-{
174
-  read_total_counts(&bytes, row, tally.count_total);
175
-  int n_alleles = read_allele_counts(&bytes, row, tally.read_R,
176
-                                     tally.count_plus, tally.count_minus,
177
-                                     tally.count);
178
-  for (int allele = 0; allele < n_alleles; allele++, row++) {
179
-    tally.n_cycles[row] = 0;
180
-    for (int b = 0; b < length(cycle_breaks_R) - 1; b++) {
181
-      INTEGER(VECTOR_ELT(tally.cycle_bins_R, b))[row] = 0;
182
-    }
183
-    tally.high_quality[row] = 0;
184
-    tally.mean_quality[row] = R_NaN;
185
-    if (tally.count[row] > 0) {
186
-      read_cycle_counts(&bytes, row, tally.n_cycles, tally.cycle_bins_R,
187
-                        cycle_breaks_R);
188
-      read_quality_counts(&bytes, row, tally.high_quality, tally.mean_quality,
189
-                          high_base_quality);
190
-    }
191
-  }
192
-  int high_quality_total = 0;
193
-  for (int r = ref_row; r < row; r++) {
194
-    high_quality_total += tally.high_quality[r];
195
-  }
196
-  for (int r = ref_row; r < row; r++) {
197
-    tally.n_cycles_ref[r] = tally.n_cycles[ref_row];
198
-    tally.count_total[r] = tally.count_total[ref_row];
199
-    tally.mean_quality_ref[r] = tally.mean_quality[ref_row];
200
-    tally.high_quality_ref[r] = tally.high_quality[ref_row];
201
-    tally.high_quality_total[r] = high_quality_total;
202
-    tally.count_plus_ref[r] = tally.count_plus[ref_row];
203
-    tally.count_minus_ref[r] = tally.count_minus[ref_row];
204
-    tally.count_ref[r] = tally.count_plus_ref[r] + tally.count_minus_ref[r];
205
-    SET_STRING_ELT(tally.ref_R, r, STRING_ELT(tally.read_R, ref_row));  
206
-  }
207
-  bool have_ref_row = row > ref_row;
208
-  if (have_ref_row) {
209
-    /* clear the 'alt' columns for the 'ref' row with NAs */
210
-    SET_STRING_ELT(tally.read_R, ref_row, NA_STRING);
211
-    tally.n_cycles[ref_row] = NA_INTEGER;
212
-    tally.mean_quality[ref_row] = NA_REAL;
213
-    tally.high_quality[ref_row] = NA_REAL;
214
-    tally.count_plus[ref_row] = NA_INTEGER;
215
-    tally.count_minus[ref_row] = NA_INTEGER;
216
-    tally.count[ref_row] = NA_INTEGER;
217
-  }
218
-  return n_alleles;
219
-}
220
-
221
-
222
-/* FORMAT
223
-
224
-   block header:
225
-   [0][ins][del][mm]
226
-
227
-   mismatches for each position:
228
-   [t+][t-][rn][r+][r-]([an][a+][a-])*[\0]([c#]([cv][cc])*)*([q#]([qv][qc])*)*
229
-
230
-   insertions:
231
-   [i#]([seq][t+][t-][c#]([cv][cc])*)*
232
-
233
-   deletions:
234
-   [d#]([seq][t+][t-][c#]([cv][cc])*)*
235
-*/
14
+#include "iit.h"
236 15
 
237 16
 SEXP
238 17
 R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
239 18
                 SEXP which_R,
240
-                SEXP cycle_breaks_R, SEXP high_base_quality_R,
241 19
                 SEXP alloclength_R,
242 20
                 SEXP minimum_mapq_R, SEXP good_unique_mapq_R,
243 21
                 SEXP maximum_nhits_R,
... ...
@@ -266,7 +44,6 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
266 44
   bool print_indels_p = asLogical(print_indels_p_R);
267 45
   int blocksize = asInteger(blocksize_R);
268 46
   int verbosep = asLogical(verbosep_R);
269
-  int high_base_quality = asInteger(high_base_quality_R);
270 47
   
271 48
   char *genomesubdir, *fileroot, *dbversion, *iitfile;
272 49
   genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion,
... ...
@@ -310,119 +87,5 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
310 87
     error("Could not create tally\n");
311 88
   }
312 89
 
313
-  int n_cycle_bins =
314
-    length(cycle_breaks_R) == 0 ? 0 : length(cycle_breaks_R) - 1;
315
-
316
-  int n_rows = 0;
317
-  /* loop over the IIT, getting the total number of rows
318
-     this is (num alts + 1) for every position. */
319
-  for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) {
320
-    unsigned char *bytes = IIT_data(tally_iit, index);
321
-    int width = IIT_length(tally_iit, index);
322
-    unsigned char *base = bytes + (3 * width + 1) * sizeof(int);
323
-    for (int pos = 0; pos < width; pos++) {
324
-      int insertion_offset = read_int(&bytes);
325
-      int deletion_offset = read_int(&bytes);
326
-      int allele_offset = read_int(&bytes);
327
-      int next_offset = read_int(&bytes);
328
-      bytes -= 4; /* rewind from read-ahead */
329
-      if (deletion_offset - insertion_offset > 0) {
330
-        n_rows += parse_indel_count(base + insertion_offset);
331
-      }
332
-      if (allele_offset - deletion_offset > 0) {
333
-        n_rows += parse_indel_count(base + deletion_offset);
334
-      }
335
-      if (next_offset - allele_offset > 0) {
336
-        n_rows += parse_allele_count(base + allele_offset);
337
-      }
338
-    }
339
-  }
340
-
341
-  SEXP tally_R; /* the result list */
342
-  PROTECT(tally_R =
343
-          allocVector(VECSXP, N_BASE_COLS + n_cycle_bins));
344
-
345
-  TallyTable tally;
346
-  
347
-  SET_VECTOR_ELT(tally_R, SEQNAMES, allocVector(STRSXP, n_rows));
348
-  tally.seqnames_R = VECTOR_ELT(tally_R, SEQNAMES);
349
-  SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows));
350
-  tally.pos = INTEGER(VECTOR_ELT(tally_R, POS));
351
-  SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows));
352
-  tally.ref_R = VECTOR_ELT(tally_R, REF);
353
-  SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows));
354
-  tally.read_R = VECTOR_ELT(tally_R, READ);
355
-  SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows));
356
-  tally.n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES));
357
-  SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows));
358
-  tally.n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF));
359
-  SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows));
360
-  tally.count = INTEGER(VECTOR_ELT(tally_R, COUNT));
361
-  SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows));
362
-  tally.count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF));
363
-  SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows));
364
-  tally.count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL));
365
-  SET_VECTOR_ELT(tally_R, HIGH_QUALITY, allocVector(INTSXP, n_rows));
366
-  tally.high_quality = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY));
367
-  SET_VECTOR_ELT(tally_R, HIGH_QUALITY_REF, allocVector(INTSXP, n_rows));
368
-  tally.high_quality_ref = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_REF));
369
-  SET_VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL, allocVector(INTSXP, n_rows));
370
-  tally.high_quality_total = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL));
371
-  SET_VECTOR_ELT(tally_R, MEAN_QUALITY, allocVector(REALSXP, n_rows));
372
-  tally.mean_quality = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY));
373
-  SET_VECTOR_ELT(tally_R, MEAN_QUALITY_REF, allocVector(REALSXP, n_rows));
374
-  tally.mean_quality_ref = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY_REF));
375
-  SET_VECTOR_ELT(tally_R, COUNT_PLUS, allocVector(INTSXP, n_rows));
376
-  tally.count_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS));
377
-  SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows));
378
-  tally.count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF));
379
-  SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows));
380
-  tally.count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS));
381
-  SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows));
382
-  tally.count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF));
383
-
384
-  PROTECT(tally.cycle_bins_R = allocVector(VECSXP, n_cycle_bins));
385
-  for (int bin = 0; bin < n_cycle_bins; bin++) {
386
-    SEXP cycle_bin_R = allocVector(INTSXP, n_rows);
387
-    SET_VECTOR_ELT(tally_R, bin + N_BASE_COLS, cycle_bin_R);
388
-    SET_VECTOR_ELT(tally.cycle_bins_R, bin, cycle_bin_R);
389
-  }
390
-
391
-  int row = 0;
392
-  for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) {
393
-    unsigned char *bytes = IIT_data(tally_iit, index);
394
-    int width = IIT_length(tally_iit, index);
395
-    unsigned char *base = bytes + (3 * width + 1) * sizeof(int);
396
-    int start = IIT_interval_low(tally_iit, index);
397
-    SEXP divstring_R;
398
-    PROTECT(divstring_R = mkChar(IIT_divstring_from_index(tally_iit, index)));
399
-    for (int position = 0; position < width; position++) {
400
-      int insertion_offset = read_int(&bytes);
401
-      int deletion_offset = read_int(&bytes);
402
-      int allele_offset = read_int(&bytes);
403
-      int next_offset = read_int(&bytes);
404
-      int ref_row = row;
405
-      bytes -= 4; /* rewind from read-ahead */
406
-      if (next_offset - allele_offset > 0)
407
-        row += parse_alleles(base + allele_offset, row, ref_row, cycle_breaks_R,
408
-                             high_base_quality, tally);
409
-      if (deletion_offset - insertion_offset > 0)
410
-        row += parse_indels(base + insertion_offset, row, cycle_breaks_R,
411
-                            high_base_quality, tally, true);
412
-      if (allele_offset - deletion_offset > 0)
413
-        row += parse_indels(base + deletion_offset, row, cycle_breaks_R,
414
-                            high_base_quality, tally, false);
415
-      /* fill in position information */
416
-      for (int r = ref_row; r < row; r++) {
417
-        SET_STRING_ELT(tally.seqnames_R, r, divstring_R);
418
-        tally.pos[r] = start + position;
419
-      }      
420
-    }
421
-    UNPROTECT(1);
422
-  }
423
-
424
-  IIT_free(&tally_iit);
425
-  UNPROTECT(2);
426
-  return tally_R;
90
+  return R_IIT_new(tally_iit);
427 91
 }
428
-
429 92
new file mode 100644
... ...
@@ -0,0 +1,23 @@
1
+/* Functions for reading data from byte streams. Inlined because they
2
+   are tiny and need to be fast. */
3
+
4
+static inline int
5
+read_int (unsigned char **bytes) {
6
+  int x = ((int *)*bytes)[0];
7
+  *bytes += sizeof(int);
8
+  return x;
9
+}
10
+
11
+static inline char
12
+read_char (unsigned char **bytes) {
13
+  char x = (*bytes)[0];
14
+  (*bytes)++;
15
+  return x;
16
+}
17
+
18
+static inline const char *
19
+read_string (unsigned char **bytes) {
20
+  const char *string = *bytes;
21
+  (*bytes) += strlen(string) + 1;
22
+  return string;
23
+}
... ...
@@ -3,13 +3,14 @@
3 3
 
4 4
 #include <Rinternals.h>
5 5
 
6
+/* .Call entry points for the gmapR package */
7
+
6 8
 SEXP
7 9
 R_Bamread_new (SEXP bamfile_R);
8 10
 
9 11
 SEXP
10 12
 R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
11 13
                 SEXP which_R,
12
-                SEXP cycle_breaks_R, SEXP high_quality_cutoff_R,
13 14
                 SEXP alloclength_R,
14 15
                 SEXP minimum_mapq_R, SEXP good_unique_mapq_R,
15 16
                 SEXP maximum_nhits_R,
... ...
@@ -21,4 +22,8 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
21 22
                 SEXP blocksize_R, 
22 23
                 SEXP verbosep_R);
23 24
 
25
+SEXP
26
+R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R,
27
+                  SEXP high_base_quality, SEXP which_R);
28
+
24 29
 #endif
25 30
new file mode 100644
... ...
@@ -0,0 +1,37 @@
1
+#include <gstruct/iit-read.h>
2
+
3
+#include "iit.h"
4
+
5
+static void R_IIT_free(SEXP iit_R) {
6
+  IIT_T iit = R_ExternalPtrAddr(iit_R);
7
+  IIT_free(&iit);
8
+}
9
+
10
+SEXP R_IIT_new(IIT_T iit) {
11
+  SEXP iit_R;
12
+  iit_R = R_MakeExternalPtr((void *) iit, R_NilValue, R_NilValue);
13
+  R_RegisterCFinalizer(iit_R, R_IIT_free);
14
+  return iit_R;
15
+}
16
+
17
+SEXP R_iit_read(SEXP iitfile_R) {
18
+  char *iitfile = (char *) CHAR(asChar(iitfile_R));
19
+  IIT_T iit = IIT_read(iitfile, /*name*/NULL, /*readonlyp*/true,
20
+                       /*divread*/READ_ALL, /*divstring*/NULL,
21
+                       /*add_iit_p*/false, /*labels_read_p*/true);
22
+  return R_IIT_new(iit);
23
+}
24
+
25
+/* 'IIT_output_direct' is not included in the libgstruct binary */
26
+
27
+/*
28
+  #include <gstruct/iit-write.h>
29
+
30
+  SEXP R_iit_write(SEXP tally_iit_R, SEXP iitfile_R) {
31
+  IIT_T tally_iit = (IIT_T) R_ExternalPtrAddr(tally_iit_R);
32
+  char *iitfile = (char *) CHAR(asChar(iitfile_R));
33
+  IIT_output_direct(iitfile, tally_iit, IIT_LATEST_VERSION);
34
+  return iitfile_R;
35
+  }
36
+*/
37
+
0 38
new file mode 100644
... ...
@@ -0,0 +1,9 @@
1
+#ifndef IIT_H
2
+#define IIT_H
3
+
4
+#include <gstruct/iit-read.h>
5
+#include <Rinternals.h>
6
+
7
+SEXP R_IIT_new(IIT_T iit);
8
+
9
+#endif
0 10
new file mode 100644
... ...
@@ -0,0 +1,426 @@
1
+#include <stdlib.h> /* for abs */
2
+#include <string.h> /* for strlen */
3
+
4
+#include "iit.h"
5
+#include "bytestream.h"
6
+
7
+enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF,
8
+       COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL,
9
+       MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS,
10
+       COUNT_MINUS_REF, N_BASE_COLS };
11
+
12
+typedef struct TallyTable {
13
+  SEXP seqnames_R;
14
+  int *pos;
15
+  SEXP ref_R;
16
+  SEXP read_R;
17
+  int *n_cycles;
18
+  int *n_cycles_ref;
19
+  int *count;
20
+  int *count_ref;
21
+  int *count_total;
22
+  int *high_quality;
23
+  int *high_quality_ref;
24
+  int *high_quality_total;
25
+  double *mean_quality;
26
+  double *mean_quality_ref;
27
+  int *count_plus;
28
+  int *count_plus_ref;
29
+  int *count_minus;
30
+  int *count_minus_ref;
31
+  int **cycle_bins;
32
+} TallyTable;
33
+
34
+static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
35
+  SEXP tally_R; /* the result list */
36
+  PROTECT(tally_R = allocVector(VECSXP, N_BASE_COLS + n_cycle_bins));
37
+  
38
+  SET_VECTOR_ELT(tally_R, SEQNAMES, allocVector(STRSXP, n_rows));
39
+  SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows));
40
+  SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows));
41
+  SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows));
42
+  SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows));
43
+  SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows));
44
+  SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows));
45
+  SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows));
46
+  SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows));
47
+  SET_VECTOR_ELT(tally_R, HIGH_QUALITY, allocVector(INTSXP, n_rows));
48
+  SET_VECTOR_ELT(tally_R, HIGH_QUALITY_REF, allocVector(INTSXP, n_rows));
49
+  SET_VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL, allocVector(INTSXP, n_rows));
50
+  SET_VECTOR_ELT(tally_R, MEAN_QUALITY, allocVector(REALSXP, n_rows));
51
+  SET_VECTOR_ELT(tally_R, MEAN_QUALITY_REF, allocVector(REALSXP, n_rows));
52
+  SET_VECTOR_ELT(tally_R, COUNT_PLUS, allocVector(INTSXP, n_rows));
53
+  SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows));
54
+  SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows));
55
+  SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows));
56
+  for (int bin = 0; bin < n_cycle_bins; bin++) {
57
+    SEXP cycle_bin_R = allocVector(INTSXP, n_rows);
58
+    SET_VECTOR_ELT(tally_R, bin + N_BASE_COLS, cycle_bin_R);
59
+  }
60
+
61
+  UNPROTECT(1);
62
+  return tally_R;
63
+}
64
+
65
+static TallyTable *TallyTable_new(SEXP tally_R) {
66
+  TallyTable *tally = (TallyTable *) R_alloc(sizeof(TallyTable), 1);
67
+  int n_cycle_bins = length(tally_R) - N_BASE_COLS;
68
+  
69
+  tally->seqnames_R = VECTOR_ELT(tally_R, SEQNAMES);
70
+  tally->pos = INTEGER(VECTOR_ELT(tally_R, POS));
71
+  tally->ref_R = VECTOR_ELT(tally_R, REF);
72
+  tally->read_R = VECTOR_ELT(tally_R, READ);
73
+  tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES));
74
+  tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF));
75
+  tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT));
76
+  tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF));
77
+  tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL));
78
+  tally->high_quality = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY));
79
+  tally->high_quality_ref = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_REF));
80
+  tally->high_quality_total = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL));
81
+  tally->mean_quality = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY));
82
+  tally->mean_quality_ref = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY_REF));
83
+  tally->count_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS));
84
+  tally->count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF));
85
+  tally->count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS));
86
+  tally->count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF));
87
+  tally->cycle_bins = (int **) R_alloc(sizeof(int*), n_cycle_bins);
88
+  for (int bin = 0; bin < n_cycle_bins; bin++) {
89
+    tally->cycle_bins[bin] = INTEGER(VECTOR_ELT(tally_R, bin + N_BASE_COLS));
90
+  }
91
+
92
+  return tally;
93
+}
94
+
95
+static void
96
+read_total_counts(unsigned char **bytes, int row, int *count_total) {
97
+  int count_total_plus = read_int(bytes);
98
+  int count_total_minus = read_int(bytes);
99
+  count_total[row] = count_total_plus + count_total_minus;
100
+}
101
+
102
+static void
103
+read_cycle_counts(unsigned char **bytes, int row, int *n_cycles,
104
+                  int **cycle_bins, int *cycle_breaks, int n_cycle_bins)
105
+{
106
+  int n_cycle_breaks = n_cycle_bins + 1;
107
+  n_cycles[row] = read_int(bytes);
108
+  if (cycle_breaks != NULL) {
109
+    for (int index = 0; index < n_cycles[row]; index++) {
110
+      int cycle = abs(read_int(bytes));
111
+      int count = read_int(bytes);
112
+      int bin = 0;
113
+      while(n_cycle_breaks > bin &&
114
+            cycle > cycle_breaks[bin])
115
+        bin++;
116
+      if (bin > 0 && bin < n_cycle_breaks) {
117
+        cycle_bins[bin-1][row] += count;
118
+      }
119
+    }
120
+  } else {
121
+    (*bytes) += n_cycles[row] * 2 * sizeof(int);
122
+  }
123
+}
124
+
125
+static int
126
+parse_indels(unsigned char *bytes, int row,
127
+             int *cycle_breaks, int n_cycle_bins,
128
+             int high_base_quality, TallyTable *tally, bool insertion)
129
+{
130
+  int indel_count = read_int(&bytes);
131
+  for (int indel = 0; indel < indel_count; indel++) {
132
+    /* TODO: add ref counts when we have them from Tom. */
133
+    /* TODO: sum the ref counts with the indel counts for total. */
134
+    tally->count_plus[row] = read_int(&bytes);
135
+    tally->count_minus[row] = read_int(&bytes);
136
+    tally->count[row] = tally->count_plus[row] + tally->count_minus[row];
137
+    SEXP seq_R = mkChar(read_string(&bytes));
138
+    if (insertion) {
139
+      SET_STRING_ELT(tally->read_R, row, seq_R);
140
+      SET_STRING_ELT(tally->ref_R, row, R_BlankString);
141
+    }
142
+    else {
143
+      SET_STRING_ELT(tally->read_R, row, R_BlankString);
144
+      SET_STRING_ELT(tally->ref_R, row, seq_R);
145
+    }
146
+    read_cycle_counts(&bytes, row, tally->n_cycles, tally->cycle_bins,
147
+                      cycle_breaks, n_cycle_bins);
148
+    /* quality is per-base and so not relevant for indels */
149
+    tally->mean_quality[row] = NA_REAL;
150
+    tally->mean_quality_ref[row] = NA_REAL;
151
+    tally->high_quality[row] = NA_INTEGER;
152
+    tally->high_quality_ref[row] = NA_INTEGER;
153
+    tally->high_quality_total[row] = NA_INTEGER;
154
+  }
155
+  return indel_count;
156
+}
157
+
158
+static void
159
+read_quality_counts(unsigned char **bytes, int row, int *high_quality,
160
+                    double *mean_quality, int high_base_quality)
161
+{
162
+  int n_qualities = read_int(bytes);
163
+  int total_quality = 0;
164
+  int total_quality_weight = 0;
165
+  high_quality[row] = 0;
166
+  for (int index = 0; index < n_qualities; index++) {
167
+    int quality = read_int(bytes);
168
+    int count = read_int(bytes);
169
+    if (quality > high_base_quality) {
170
+      total_quality += quality * count;
171
+      total_quality_weight += count;
172
+      high_quality[row] += count;
173
+    }
174
+  }
175
+  mean_quality[row] = total_quality_weight > 0 ?
176
+    (double)total_quality / total_quality_weight : R_NaN;
177
+}
178
+
179
+static int
180
+read_allele_counts(unsigned char **bytes, int row, SEXP read_R,
181
+                   int *count_plus, int *count_minus, int *count)
182
+{
183
+  int n_alleles = 0;
184
+  char allele;
185
+  while((allele = read_char(bytes)) != '\0') {
186
+    SET_STRING_ELT(read_R, row, mkCharLen(&allele, 1));
187
+    count_plus[row] = read_int(bytes);
188
+    count_minus[row] = read_int(bytes);
189
+    count[row] = count_plus[row] + count_minus[row];
190
+    row++;
191
+    n_alleles++;
192
+  }
193
+  return n_alleles;
194
+}
195
+
196
+static int
197
+parse_alleles(unsigned char *bytes, int row, int ref_row, int *cycle_breaks,
198
+              int n_cycle_bins, int high_base_quality, TallyTable *tally)
199
+{
200
+  read_total_counts(&bytes, row, tally->count_total);
201
+  int n_alleles = read_allele_counts(&bytes, row, tally->read_R,
202
+                                     tally->count_plus, tally->count_minus,
203
+                                     tally->count);
204
+  for (int allele = 0; allele < n_alleles; allele++, row++) {
205
+    tally->n_cycles[row] = 0;
206
+    for (int b = 0; b < n_cycle_bins; b++) {
207
+      tally->cycle_bins[b][row] = 0;
208
+    }
209
+    tally->high_quality[row] = 0;
210
+    tally->mean_quality[row] = R_NaN;
211
+    if (tally->count[row] > 0) {
212
+      read_cycle_counts(&bytes, row, tally->n_cycles, tally->cycle_bins,
213
+                        cycle_breaks, n_cycle_bins);
214
+      read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality,
215
+                          high_base_quality);
216
+    }
217
+  }
218
+  int high_quality_total = 0;
219
+  for (int r = ref_row; r < row; r++) {
220
+    high_quality_total += tally->high_quality[r];
221
+  }
222
+  for (int r = ref_row; r < row; r++) {
223
+    tally->n_cycles_ref[r] = tally->n_cycles[ref_row];
224
+    tally->count_total[r] = tally->count_total[ref_row];
225
+    tally->mean_quality_ref[r] = tally->mean_quality[ref_row];
226
+    tally->high_quality_ref[r] = tally->high_quality[ref_row];
227
+    tally->high_quality_total[r] = high_quality_total;
228
+    tally->count_plus_ref[r] = tally->count_plus[ref_row];
229
+    tally->count_minus_ref[r] = tally->count_minus[ref_row];
230
+    tally->count_ref[r] = tally->count_plus_ref[r] + tally->count_minus_ref[r];
231
+    SET_STRING_ELT(tally->ref_R, r, STRING_ELT(tally->read_R, ref_row));  
232
+  }
233
+  bool have_ref_row = row > ref_row;
234
+  if (have_ref_row) {
235
+    /* clear the 'alt' columns for the 'ref' row with NAs */
236
+    SET_STRING_ELT(tally->read_R, ref_row, NA_STRING);
237
+    tally->n_cycles[ref_row] = NA_INTEGER;
238
+    tally->mean_quality[ref_row] = NA_REAL;
239
+    tally->high_quality[ref_row] = NA_REAL;
240
+    tally->count_plus[ref_row] = NA_INTEGER;
241
+    tally->count_minus[ref_row] = NA_INTEGER;
242
+    tally->count[ref_row] = NA_INTEGER;
243
+  }
244
+  return n_alleles;
245
+}
246
+    
247
+static int parse_indel_count(unsigned char *bytes) {
248
+  int count = read_int(&bytes);
249
+  return count;
250
+}
251
+
252
+static int parse_allele_count(unsigned char *bytes) {
253
+  int n_alleles = 1; /* always have a reference */
254
+  bytes += sizeof(int) * 4 + 1; /* skip total and reference */
255
+  while(bytes[0] != '\0') {
256
+    bytes += sizeof(int) * 2 + 1;
257
+    n_alleles++;
258
+  }
259
+  return n_alleles;
260
+}
261
+
262
+static int count_rows_for_interval(IIT_T tally_iit, int index) {
263
+  int n_rows;
264
+  unsigned char *bytes = IIT_data(tally_iit, index);
265
+  int width = IIT_length(tally_iit, index);
266
+  unsigned char *base = bytes + (3 * width + 1) * sizeof(int);
267
+  for (int pos = 0; pos < width; pos++) {
268
+    int insertion_offset = read_int(&bytes);
269
+    int deletion_offset = read_int(&bytes);
270
+    int allele_offset = read_int(&bytes);
271
+    int next_offset = read_int(&bytes);
272
+    bytes -= 4; /* rewind from read-ahead */
273
+    if (deletion_offset - insertion_offset > 0) {
274
+      n_rows += parse_indel_count(base + insertion_offset);
275
+    }
276
+    if (allele_offset - deletion_offset > 0) {
277
+      n_rows += parse_indel_count(base + deletion_offset);
278
+    }
279
+    if (next_offset - allele_offset > 0) {
280
+      n_rows += parse_allele_count(base + allele_offset);
281
+    }
282
+  }
283
+  return n_rows;
284
+}
285
+
286
+static int parse_interval(IIT_T tally_iit, int index,
287
+                          TallyTable *tally, int row,
288
+                          int *cycle_breaks, int n_cycle_bins,
289
+                          int high_base_quality)
290
+{
291
+  unsigned char *bytes = IIT_data(tally_iit, index);
292
+  int width = IIT_length(tally_iit, index);
293
+  unsigned char *base = bytes + (3 * width + 1) * sizeof(int);
294
+  int start = IIT_interval_low(tally_iit, index);
295
+  SEXP divstring_R;
296
+  PROTECT(divstring_R = mkChar(IIT_divstring_from_index(tally_iit, index)));
297
+  for (int position = 0; position < width; position++) {
298
+    int insertion_offset = read_int(&bytes);
299
+    int deletion_offset = read_int(&bytes);
300
+    int allele_offset = read_int(&bytes);
301
+    int next_offset = read_int(&bytes);
302
+    int ref_row = row;
303
+    bytes -= 4; /* rewind from read-ahead */
304
+    if (next_offset - allele_offset > 0)
305
+      row += parse_alleles(base + allele_offset, row, ref_row,
306
+                           cycle_breaks, n_cycle_bins,
307
+                           high_base_quality, tally);
308
+    if (deletion_offset - insertion_offset > 0)
309
+      row += parse_indels(base + insertion_offset, row,
310
+                          cycle_breaks, n_cycle_bins,
311
+                          high_base_quality, tally, true);
312
+    if (allele_offset - deletion_offset > 0)
313
+      row += parse_indels(base + deletion_offset, row,
314
+                          cycle_breaks, n_cycle_bins,
315
+                          high_base_quality, tally, false);
316
+    /* fill in position information */
317
+    for (int r = ref_row; r < row; r++) {
318
+      SET_STRING_ELT(tally->seqnames_R, r, divstring_R);
319
+      tally->pos[r] = start + position;
320
+    }      
321
+  }
322
+  UNPROTECT(1);
323
+  return row;
324
+}
325
+
326
+static SEXP parse_all(IIT_T tally_iit, int *cycle_breaks,
327
+                      int n_cycle_bins,
328
+                      int high_base_quality)
329
+{
330
+  int n_rows = 0;
331
+  /* loop over the IIT, getting the total number of rows
332
+     this is (num alts + 1) for every position. */
333
+  for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) {
334
+    n_rows += count_rows_for_interval(tally_iit, index);
335
+  }
336
+  
337
+  SEXP tally_R;
338
+  PROTECT(tally_R = R_TallyTable_new(n_rows, n_cycle_bins));
339
+  TallyTable *tally = TallyTable_new(tally_R);
340
+  
341
+  int row = 0;
342
+  for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) {
343
+    row = parse_interval(tally_iit, index, tally, row,
344
+                         cycle_breaks, n_cycle_bins,
345
+                         high_base_quality);
346
+  }
347
+
348
+  UNPROTECT(1);
349
+  return tally_R;
350
+}
351
+
352
+static SEXP parse_some(IIT_T tally_iit, int *cycle_breaks,
353
+                       int n_cycle_bins,
354
+                       int high_base_quality,
355
+                       SEXP chr_R, int *start, int *end)
356
+{
357
+  int n_rows = 0;
358
+  int *nmatches = (int *) R_alloc(sizeof(int), length(chr_R));
359
+  int **indexes = (int **) R_alloc(sizeof(int*), length(chr_R));
360
+  
361
+  for (int i = 0; i < length(chr_R); i++) {
362
+    indexes[i] = IIT_get(nmatches + i, tally_iit,
363
+                         (char *) CHAR(STRING_ELT(chr_R, i)),
364
+                         start[i], end[i], false);
365
+    for (int j = 0; j < nmatches[i]; j++) {
366
+      n_rows += count_rows_for_interval(tally_iit, indexes[i][j]);
367
+    }
368
+  }
369
+  
370
+  SEXP tally_R;
371
+  PROTECT(tally_R = R_TallyTable_new(n_rows, n_cycle_bins));
372
+  TallyTable *tally = TallyTable_new(tally_R);
373
+
374
+  int row = 0;
375
+  for (int i = 0; i < length(chr_R); i++) {
376
+    for (int j = 0; j < nmatches[i]; j++) {
377
+      row = parse_interval(tally_iit, indexes[i][j],
378
+                           tally, row,
379
+                           cycle_breaks, n_cycle_bins,
380
+                           high_base_quality);
381
+    }
382
+  }
383
+  
384
+  UNPROTECT(1);
385
+  return tally_R;
386
+}
387
+
388
+/* FORMAT
389
+
390
+   block header:
391
+   [0][ins][del][mm]
392
+
393
+   mismatches for each position:
394
+   [t+][t-][rn][r+][r-]([an][a+][a-])*[\0]([c#]([cv][cc])*)*([q#]([qv][qc])*)*
395
+
396
+   insertions:
397
+   [i#]([seq][t+][t-][c#]([cv][cc])*)*
398
+
399
+   deletions:
400
+   [d#]([seq][t+][t-][c#]([cv][cc])*)*
401
+*/
402
+
403
+SEXP R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R,
404
+                       SEXP high_base_quality_R, SEXP which_R)
405
+{
406
+  int high_base_quality = asInteger(high_base_quality_R);
407
+  int *cycle_breaks =
408
+    cycle_breaks_R == R_NilValue ? NULL : INTEGER(cycle_breaks_R);
409
+  int n_cycle_bins =
410
+    length(cycle_breaks_R) == 0 ? 0 : length(cycle_breaks_R) - 1;
411
+  IIT_T tally_iit = (IIT_T) R_ExternalPtrAddr(tally_iit_R);
412
+  SEXP tally_R;
413
+  
414
+  if (which_R == R_NilValue) {
415
+    tally_R = parse_all(tally_iit, cycle_breaks, n_cycle_bins,
416
+                        high_base_quality);
417
+  } else {
418
+    SEXP chr_R = VECTOR_ELT(which_R, 0);
419
+    int *start = INTEGER(VECTOR_ELT(which_R, 1));
420
+    int *end = INTEGER(VECTOR_ELT(which_R, 2));
421
+    tally_R = parse_some(tally_iit, cycle_breaks, n_cycle_bins,
422
+                         high_base_quality, chr_R, start, end);
423
+  }
424
+  
425
+  return tally_R;
426
+}