git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69806 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -121,20 +121,22 @@ normArgTRUEorFALSE <- function(x) { |
121 | 121 |
if (is.unsorted(cycle_breaks)) |
122 | 122 |
stop("'cycle_breaks' must be sorted") |
123 | 123 |
} |
124 |
- .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which, |
|
124 |
+ iit <- .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which, |
|
125 |
+ normArgSingleInteger(alloclength), |
|
126 |
+ normArgSingleInteger(minimum_mapq), |
|
127 |
+ normArgSingleInteger(good_unique_mapq), |
|
128 |
+ normArgSingleInteger(maximum_nhits), |
|
129 |
+ normArgTRUEorFALSE(concordant_only), |
|
130 |
+ normArgTRUEorFALSE(unique_only), |
|
131 |
+ normArgTRUEorFALSE(primary_only), |
|
132 |
+ normArgSingleInteger(min_depth), |
|
133 |
+ normArgSingleInteger(variant_strand), |
|
134 |
+ normArgTRUEorFALSE(ignore_query_Ns), |
|
135 |
+ normArgTRUEorFALSE(indels), |
|
136 |
+ normArgSingleInteger(blocksize), |
|
137 |
+ normArgTRUEorFALSE(verbosep)) |
|
138 |
+ .Call(R_tally_iit_parse, iit, |
|
125 | 139 |
cycle_breaks, |
126 | 140 |
normArgSingleInteger(high_base_quality), |
127 |
- normArgSingleInteger(alloclength), |
|
128 |
- normArgSingleInteger(minimum_mapq), |
|
129 |
- normArgSingleInteger(good_unique_mapq), |
|
130 |
- normArgSingleInteger(maximum_nhits), |
|
131 |
- normArgTRUEorFALSE(concordant_only), |
|
132 |
- normArgTRUEorFALSE(unique_only), |
|
133 |
- normArgTRUEorFALSE(primary_only), |
|
134 |
- normArgSingleInteger(min_depth), |
|
135 |
- normArgSingleInteger(variant_strand), |
|
136 |
- normArgTRUEorFALSE(ignore_query_Ns), |
|
137 |
- normArgTRUEorFALSE(indels), |
|
138 |
- normArgSingleInteger(blocksize), |
|
139 |
- normArgTRUEorFALSE(verbosep)) |
|
141 |
+ NULL) |
|
140 | 142 |
} |
... | ... |
@@ -7,8 +7,9 @@ |
7 | 7 |
static const R_CallMethodDef callMethods[] = { |
8 | 8 |
|
9 | 9 |
/* bamtally.c */ |
10 |
- CALLMETHOD_DEF(R_Bamtally_iit, 19), |
|
11 |
- |
|
10 |
+ CALLMETHOD_DEF(R_Bamtally_iit, 17), |
|
11 |
+ CALLMETHOD_DEF(R_tally_iit_parse, 4), |
|
12 |
+ |
|
12 | 13 |
/* bamreader.c */ |
13 | 14 |
CALLMETHOD_DEF(R_Bamread_new, 1), |
14 | 15 |
|
... | ... |
@@ -11,233 +11,11 @@ |
11 | 11 |
#include <gstruct/bamtally.h> |
12 | 12 |
|
13 | 13 |
#include "gmapR.h" |
14 |
- |
|
15 |
-enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF, |
|
16 |
- COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL, |
|
17 |
- MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS, |
|
18 |
- COUNT_MINUS_REF, N_BASE_COLS }; |
|
19 |
- |
|
20 |
-typedef struct TallyTable { |
|
21 |
- SEXP seqnames_R; |
|
22 |
- int *pos; |
|
23 |
- SEXP ref_R; |
|
24 |
- SEXP read_R; |
|
25 |
- int *n_cycles; |
|
26 |
- int *n_cycles_ref; |
|
27 |
- int *count; |
|
28 |
- int *count_ref; |
|
29 |
- int *count_total; |
|
30 |
- int *high_quality; |
|
31 |
- int *high_quality_ref; |
|
32 |
- int *high_quality_total; |
|
33 |
- double *mean_quality; |
|
34 |
- double *mean_quality_ref; |
|
35 |
- int *count_plus; |
|
36 |
- int *count_plus_ref; |
|
37 |
- int *count_minus; |
|
38 |
- int *count_minus_ref; |
|
39 |
- SEXP cycle_bins_R; |
|
40 |
-} TallyTable; |
|
41 |
- |
|
42 |
-static inline int |
|
43 |
-read_int (unsigned char **bytes) { |
|
44 |
- int x = ((int *)*bytes)[0]; |
|
45 |
- *bytes += sizeof(int); |
|
46 |
- return x; |
|
47 |
-} |
|
48 |
- |
|
49 |
-static inline char |
|
50 |
-read_char (unsigned char **bytes) { |
|
51 |
- char x = (*bytes)[0]; |
|
52 |
- (*bytes)++; |
|
53 |
- return x; |
|
54 |
-} |
|
55 |
- |
|
56 |
-static inline const char * |
|
57 |
-read_string (unsigned char **bytes) { |
|
58 |
- const char *string = *bytes; |
|
59 |
- (*bytes) += strlen(string) + 1; |
|
60 |
- return string; |
|
61 |
-} |
|
62 |
- |
|
63 |
-static int |
|
64 |
-parse_indel_count(unsigned char *bytes) { |
|
65 |
- int count = read_int(&bytes); |
|
66 |
- return count; |
|
67 |
-} |
|
68 |
- |
|
69 |
-static int |
|
70 |
-parse_allele_count(unsigned char *bytes) { |
|
71 |
- int n_alleles = 1; /* always have a reference */ |
|
72 |
- bytes += sizeof(int) * 4 + 1; /* skip total and reference */ |
|
73 |
- while(bytes[0] != '\0') { |
|
74 |
- bytes += sizeof(int) * 2 + 1; |
|
75 |
- n_alleles++; |
|
76 |
- } |
|
77 |
- return n_alleles; |
|
78 |
-} |
|
79 |
- |
|
80 |
-static void |
|
81 |
-read_total_counts(unsigned char **bytes, int row, int *count_total) { |
|
82 |
- int count_total_plus = read_int(bytes); |
|
83 |
- int count_total_minus = read_int(bytes); |
|
84 |
- count_total[row] = count_total_plus + count_total_minus; |
|
85 |
-} |
|
86 |
- |
|
87 |
-static void |
|
88 |
-read_cycle_counts(unsigned char **bytes, int row, int *n_cycles, |
|
89 |
- SEXP cycle_bins_R, SEXP cycle_breaks_R) |
|
90 |
-{ |
|
91 |
- n_cycles[row] = read_int(bytes); |
|
92 |
- if (cycle_breaks_R != R_NilValue) { |
|
93 |
- for (int index = 0; index < n_cycles[row]; index++) { |
|
94 |
- int cycle = abs(read_int(bytes)); |
|
95 |
- int count = read_int(bytes); |
|
96 |
- int bin = 0; |
|
97 |
- while(length(cycle_breaks_R) > bin && |
|
98 |
- cycle > INTEGER(cycle_breaks_R)[bin]) |
|
99 |
- bin++; |
|
100 |
- if (bin > 0 && bin < length(cycle_breaks_R)) { |
|
101 |
- SEXP bin_vector = VECTOR_ELT(cycle_bins_R, bin - 1); |
|
102 |
- INTEGER(bin_vector)[row] += count; |
|
103 |
- } |
|
104 |
- } |
|
105 |
- } else { |
|
106 |
- (*bytes) += n_cycles[row] * 2 * sizeof(int); |
|
107 |
- } |
|
108 |
-} |
|
109 |
- |
|
110 |
-static int |
|
111 |
-parse_indels(unsigned char *bytes, int row, SEXP cycle_breaks_R, |
|
112 |
- int high_base_quality, TallyTable tally, bool insertion) |
|
113 |
-{ |
|
114 |
- int indel_count = read_int(&bytes); |
|
115 |
- for (int indel = 0; indel < indel_count; indel++) { |
|
116 |
- tally.count_plus[row] = read_int(&bytes); |
|
117 |
- tally.count_minus[row] = read_int(&bytes); |
|
118 |
- tally.count[row] = tally.count_plus[row] + tally.count_minus[row]; |
|
119 |
- SEXP seq_R = mkChar(read_string(&bytes)); |
|
120 |
- if (insertion) |
|
121 |
- SET_STRING_ELT(tally.read_R, row, seq_R); |
|
122 |
- else SET_STRING_ELT(tally.ref_R, row, seq_R); |
|
123 |
- read_cycle_counts(&bytes, row, tally.n_cycles, tally.cycle_bins_R, |
|
124 |
- cycle_breaks_R); |
|
125 |
- /* stuff not relevant for indels */ |
|
126 |
- tally.mean_quality[row] = NA_REAL; |
|
127 |
- tally.high_quality[row] = NA_INTEGER; |
|
128 |
- } |
|
129 |
- return indel_count; |
|
130 |
-} |
|
131 |
- |
|
132 |
-static void |
|
133 |
-read_quality_counts(unsigned char **bytes, int row, int *high_quality, |
|
134 |
- double *mean_quality, int high_base_quality) |
|
135 |
-{ |
|
136 |
- int n_qualities = read_int(bytes); |
|
137 |
- int total_quality = 0; |
|
138 |
- int total_quality_weight = 0; |
|
139 |
- high_quality[row] = 0; |
|
140 |
- for (int index = 0; index < n_qualities; index++) { |
|
141 |
- int quality = read_int(bytes); |
|
142 |
- int count = read_int(bytes); |
|
143 |
- if (quality > high_base_quality) { |
|
144 |
- total_quality += quality * count; |
|
145 |
- total_quality_weight += count; |
|
146 |
- high_quality[row] += count; |
|
147 |
- } |
|
148 |
- } |
|
149 |
- mean_quality[row] = total_quality_weight > 0 ? |
|
150 |
- (double)total_quality / total_quality_weight : R_NaN; |
|
151 |
-} |
|
152 |
- |
|
153 |
-static int |
|
154 |
-read_allele_counts(unsigned char **bytes, int row, SEXP read_R, |
|
155 |
- int *count_plus, int *count_minus, int *count) |
|
156 |
-{ |
|
157 |
- int n_alleles = 0; |
|
158 |
- char allele; |
|
159 |
- while((allele = read_char(bytes)) != '\0') { |
|
160 |
- SET_STRING_ELT(read_R, row, mkCharLen(&allele, 1)); |
|
161 |
- count_plus[row] = read_int(bytes); |
|
162 |
- count_minus[row] = read_int(bytes); |
|
163 |
- count[row] = count_plus[row] + count_minus[row]; |
|
164 |
- row++; |
|
165 |
- n_alleles++; |
|
166 |
- } |
|
167 |
- return n_alleles; |
|
168 |
-} |
|
169 |
- |
|
170 |
-static int |
|
171 |
-parse_alleles(unsigned char *bytes, int row, int ref_row, SEXP cycle_breaks_R, |
|
172 |
- int high_base_quality, TallyTable tally) |
|
173 |
-{ |
|
174 |
- read_total_counts(&bytes, row, tally.count_total); |
|
175 |
- int n_alleles = read_allele_counts(&bytes, row, tally.read_R, |
|
176 |
- tally.count_plus, tally.count_minus, |
|
177 |
- tally.count); |
|
178 |
- for (int allele = 0; allele < n_alleles; allele++, row++) { |
|
179 |
- tally.n_cycles[row] = 0; |
|
180 |
- for (int b = 0; b < length(cycle_breaks_R) - 1; b++) { |
|
181 |
- INTEGER(VECTOR_ELT(tally.cycle_bins_R, b))[row] = 0; |
|
182 |
- } |
|
183 |
- tally.high_quality[row] = 0; |
|
184 |
- tally.mean_quality[row] = R_NaN; |
|
185 |
- if (tally.count[row] > 0) { |
|
186 |
- read_cycle_counts(&bytes, row, tally.n_cycles, tally.cycle_bins_R, |
|
187 |
- cycle_breaks_R); |
|
188 |
- read_quality_counts(&bytes, row, tally.high_quality, tally.mean_quality, |
|
189 |
- high_base_quality); |
|
190 |
- } |
|
191 |
- } |
|
192 |
- int high_quality_total = 0; |
|
193 |
- for (int r = ref_row; r < row; r++) { |
|
194 |
- high_quality_total += tally.high_quality[r]; |
|
195 |
- } |
|
196 |
- for (int r = ref_row; r < row; r++) { |
|
197 |
- tally.n_cycles_ref[r] = tally.n_cycles[ref_row]; |
|
198 |
- tally.count_total[r] = tally.count_total[ref_row]; |
|
199 |
- tally.mean_quality_ref[r] = tally.mean_quality[ref_row]; |
|
200 |
- tally.high_quality_ref[r] = tally.high_quality[ref_row]; |
|
201 |
- tally.high_quality_total[r] = high_quality_total; |
|
202 |
- tally.count_plus_ref[r] = tally.count_plus[ref_row]; |
|
203 |
- tally.count_minus_ref[r] = tally.count_minus[ref_row]; |
|
204 |
- tally.count_ref[r] = tally.count_plus_ref[r] + tally.count_minus_ref[r]; |
|
205 |
- SET_STRING_ELT(tally.ref_R, r, STRING_ELT(tally.read_R, ref_row)); |
|
206 |
- } |
|
207 |
- bool have_ref_row = row > ref_row; |
|
208 |
- if (have_ref_row) { |
|
209 |
- /* clear the 'alt' columns for the 'ref' row with NAs */ |
|
210 |
- SET_STRING_ELT(tally.read_R, ref_row, NA_STRING); |
|
211 |
- tally.n_cycles[ref_row] = NA_INTEGER; |
|
212 |
- tally.mean_quality[ref_row] = NA_REAL; |
|
213 |
- tally.high_quality[ref_row] = NA_REAL; |
|
214 |
- tally.count_plus[ref_row] = NA_INTEGER; |
|
215 |
- tally.count_minus[ref_row] = NA_INTEGER; |
|
216 |
- tally.count[ref_row] = NA_INTEGER; |
|
217 |
- } |
|
218 |
- return n_alleles; |
|
219 |
-} |
|
220 |
- |
|
221 |
- |
|
222 |
-/* FORMAT |
|
223 |
- |
|
224 |
- block header: |
|
225 |
- [0][ins][del][mm] |
|
226 |
- |
|
227 |
- mismatches for each position: |
|
228 |
- [t+][t-][rn][r+][r-]([an][a+][a-])*[\0]([c#]([cv][cc])*)*([q#]([qv][qc])*)* |
|
229 |
- |
|
230 |
- insertions: |
|
231 |
- [i#]([seq][t+][t-][c#]([cv][cc])*)* |
|
232 |
- |
|
233 |
- deletions: |
|
234 |
- [d#]([seq][t+][t-][c#]([cv][cc])*)* |
|
235 |
-*/ |
|
14 |
+#include "iit.h" |
|
236 | 15 |
|
237 | 16 |
SEXP |
238 | 17 |
R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
239 | 18 |
SEXP which_R, |
240 |
- SEXP cycle_breaks_R, SEXP high_base_quality_R, |
|
241 | 19 |
SEXP alloclength_R, |
242 | 20 |
SEXP minimum_mapq_R, SEXP good_unique_mapq_R, |
243 | 21 |
SEXP maximum_nhits_R, |
... | ... |
@@ -266,7 +44,6 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
266 | 44 |
bool print_indels_p = asLogical(print_indels_p_R); |
267 | 45 |
int blocksize = asInteger(blocksize_R); |
268 | 46 |
int verbosep = asLogical(verbosep_R); |
269 |
- int high_base_quality = asInteger(high_base_quality_R); |
|
270 | 47 |
|
271 | 48 |
char *genomesubdir, *fileroot, *dbversion, *iitfile; |
272 | 49 |
genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion, |
... | ... |
@@ -310,119 +87,5 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
310 | 87 |
error("Could not create tally\n"); |
311 | 88 |
} |
312 | 89 |
|
313 |
- int n_cycle_bins = |
|
314 |
- length(cycle_breaks_R) == 0 ? 0 : length(cycle_breaks_R) - 1; |
|
315 |
- |
|
316 |
- int n_rows = 0; |
|
317 |
- /* loop over the IIT, getting the total number of rows |
|
318 |
- this is (num alts + 1) for every position. */ |
|
319 |
- for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) { |
|
320 |
- unsigned char *bytes = IIT_data(tally_iit, index); |
|
321 |
- int width = IIT_length(tally_iit, index); |
|
322 |
- unsigned char *base = bytes + (3 * width + 1) * sizeof(int); |
|
323 |
- for (int pos = 0; pos < width; pos++) { |
|
324 |
- int insertion_offset = read_int(&bytes); |
|
325 |
- int deletion_offset = read_int(&bytes); |
|
326 |
- int allele_offset = read_int(&bytes); |
|
327 |
- int next_offset = read_int(&bytes); |
|
328 |
- bytes -= 4; /* rewind from read-ahead */ |
|
329 |
- if (deletion_offset - insertion_offset > 0) { |
|
330 |
- n_rows += parse_indel_count(base + insertion_offset); |
|
331 |
- } |
|
332 |
- if (allele_offset - deletion_offset > 0) { |
|
333 |
- n_rows += parse_indel_count(base + deletion_offset); |
|
334 |
- } |
|
335 |
- if (next_offset - allele_offset > 0) { |
|
336 |
- n_rows += parse_allele_count(base + allele_offset); |
|
337 |
- } |
|
338 |
- } |
|
339 |
- } |
|
340 |
- |
|
341 |
- SEXP tally_R; /* the result list */ |
|
342 |
- PROTECT(tally_R = |
|
343 |
- allocVector(VECSXP, N_BASE_COLS + n_cycle_bins)); |
|
344 |
- |
|
345 |
- TallyTable tally; |
|
346 |
- |
|
347 |
- SET_VECTOR_ELT(tally_R, SEQNAMES, allocVector(STRSXP, n_rows)); |
|
348 |
- tally.seqnames_R = VECTOR_ELT(tally_R, SEQNAMES); |
|
349 |
- SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows)); |
|
350 |
- tally.pos = INTEGER(VECTOR_ELT(tally_R, POS)); |
|
351 |
- SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows)); |
|
352 |
- tally.ref_R = VECTOR_ELT(tally_R, REF); |
|
353 |
- SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows)); |
|
354 |
- tally.read_R = VECTOR_ELT(tally_R, READ); |
|
355 |
- SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows)); |
|
356 |
- tally.n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES)); |
|
357 |
- SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows)); |
|
358 |
- tally.n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF)); |
|
359 |
- SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows)); |
|
360 |
- tally.count = INTEGER(VECTOR_ELT(tally_R, COUNT)); |
|
361 |
- SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows)); |
|
362 |
- tally.count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF)); |
|
363 |
- SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows)); |
|
364 |
- tally.count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL)); |
|
365 |
- SET_VECTOR_ELT(tally_R, HIGH_QUALITY, allocVector(INTSXP, n_rows)); |
|
366 |
- tally.high_quality = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY)); |
|
367 |
- SET_VECTOR_ELT(tally_R, HIGH_QUALITY_REF, allocVector(INTSXP, n_rows)); |
|
368 |
- tally.high_quality_ref = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_REF)); |
|
369 |
- SET_VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL, allocVector(INTSXP, n_rows)); |
|
370 |
- tally.high_quality_total = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL)); |
|
371 |
- SET_VECTOR_ELT(tally_R, MEAN_QUALITY, allocVector(REALSXP, n_rows)); |
|
372 |
- tally.mean_quality = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY)); |
|
373 |
- SET_VECTOR_ELT(tally_R, MEAN_QUALITY_REF, allocVector(REALSXP, n_rows)); |
|
374 |
- tally.mean_quality_ref = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY_REF)); |
|
375 |
- SET_VECTOR_ELT(tally_R, COUNT_PLUS, allocVector(INTSXP, n_rows)); |
|
376 |
- tally.count_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS)); |
|
377 |
- SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows)); |
|
378 |
- tally.count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF)); |
|
379 |
- SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows)); |
|
380 |
- tally.count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS)); |
|
381 |
- SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows)); |
|
382 |
- tally.count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF)); |
|
383 |
- |
|
384 |
- PROTECT(tally.cycle_bins_R = allocVector(VECSXP, n_cycle_bins)); |
|
385 |
- for (int bin = 0; bin < n_cycle_bins; bin++) { |
|
386 |
- SEXP cycle_bin_R = allocVector(INTSXP, n_rows); |
|
387 |
- SET_VECTOR_ELT(tally_R, bin + N_BASE_COLS, cycle_bin_R); |
|
388 |
- SET_VECTOR_ELT(tally.cycle_bins_R, bin, cycle_bin_R); |
|
389 |
- } |
|
390 |
- |
|
391 |
- int row = 0; |
|
392 |
- for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) { |
|
393 |
- unsigned char *bytes = IIT_data(tally_iit, index); |
|
394 |
- int width = IIT_length(tally_iit, index); |
|
395 |
- unsigned char *base = bytes + (3 * width + 1) * sizeof(int); |
|
396 |
- int start = IIT_interval_low(tally_iit, index); |
|
397 |
- SEXP divstring_R; |
|
398 |
- PROTECT(divstring_R = mkChar(IIT_divstring_from_index(tally_iit, index))); |
|
399 |
- for (int position = 0; position < width; position++) { |
|
400 |
- int insertion_offset = read_int(&bytes); |
|
401 |
- int deletion_offset = read_int(&bytes); |
|
402 |
- int allele_offset = read_int(&bytes); |
|
403 |
- int next_offset = read_int(&bytes); |
|
404 |
- int ref_row = row; |
|
405 |
- bytes -= 4; /* rewind from read-ahead */ |
|
406 |
- if (next_offset - allele_offset > 0) |
|
407 |
- row += parse_alleles(base + allele_offset, row, ref_row, cycle_breaks_R, |
|
408 |
- high_base_quality, tally); |
|
409 |
- if (deletion_offset - insertion_offset > 0) |
|
410 |
- row += parse_indels(base + insertion_offset, row, cycle_breaks_R, |
|
411 |
- high_base_quality, tally, true); |
|
412 |
- if (allele_offset - deletion_offset > 0) |
|
413 |
- row += parse_indels(base + deletion_offset, row, cycle_breaks_R, |
|
414 |
- high_base_quality, tally, false); |
|
415 |
- /* fill in position information */ |
|
416 |
- for (int r = ref_row; r < row; r++) { |
|
417 |
- SET_STRING_ELT(tally.seqnames_R, r, divstring_R); |
|
418 |
- tally.pos[r] = start + position; |
|
419 |
- } |
|
420 |
- } |
|
421 |
- UNPROTECT(1); |
|
422 |
- } |
|
423 |
- |
|
424 |
- IIT_free(&tally_iit); |
|
425 |
- UNPROTECT(2); |
|
426 |
- return tally_R; |
|
90 |
+ return R_IIT_new(tally_iit); |
|
427 | 91 |
} |
428 |
- |
429 | 92 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,23 @@ |
1 |
+/* Functions for reading data from byte streams. Inlined because they |
|
2 |
+ are tiny and need to be fast. */ |
|
3 |
+ |
|
4 |
+static inline int |
|
5 |
+read_int (unsigned char **bytes) { |
|
6 |
+ int x = ((int *)*bytes)[0]; |
|
7 |
+ *bytes += sizeof(int); |
|
8 |
+ return x; |
|
9 |
+} |
|
10 |
+ |
|
11 |
+static inline char |
|
12 |
+read_char (unsigned char **bytes) { |
|
13 |
+ char x = (*bytes)[0]; |
|
14 |
+ (*bytes)++; |
|
15 |
+ return x; |
|
16 |
+} |
|
17 |
+ |
|
18 |
+static inline const char * |
|
19 |
+read_string (unsigned char **bytes) { |
|
20 |
+ const char *string = *bytes; |
|
21 |
+ (*bytes) += strlen(string) + 1; |
|
22 |
+ return string; |
|
23 |
+} |
... | ... |
@@ -3,13 +3,14 @@ |
3 | 3 |
|
4 | 4 |
#include <Rinternals.h> |
5 | 5 |
|
6 |
+/* .Call entry points for the gmapR package */ |
|
7 |
+ |
|
6 | 8 |
SEXP |
7 | 9 |
R_Bamread_new (SEXP bamfile_R); |
8 | 10 |
|
9 | 11 |
SEXP |
10 | 12 |
R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
11 | 13 |
SEXP which_R, |
12 |
- SEXP cycle_breaks_R, SEXP high_quality_cutoff_R, |
|
13 | 14 |
SEXP alloclength_R, |
14 | 15 |
SEXP minimum_mapq_R, SEXP good_unique_mapq_R, |
15 | 16 |
SEXP maximum_nhits_R, |
... | ... |
@@ -21,4 +22,8 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
21 | 22 |
SEXP blocksize_R, |
22 | 23 |
SEXP verbosep_R); |
23 | 24 |
|
25 |
+SEXP |
|
26 |
+R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R, |
|
27 |
+ SEXP high_base_quality, SEXP which_R); |
|
28 |
+ |
|
24 | 29 |
#endif |
25 | 30 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,37 @@ |
1 |
+#include <gstruct/iit-read.h> |
|
2 |
+ |
|
3 |
+#include "iit.h" |
|
4 |
+ |
|
5 |
+static void R_IIT_free(SEXP iit_R) { |
|
6 |
+ IIT_T iit = R_ExternalPtrAddr(iit_R); |
|
7 |
+ IIT_free(&iit); |
|
8 |
+} |
|
9 |
+ |
|
10 |
+SEXP R_IIT_new(IIT_T iit) { |
|
11 |
+ SEXP iit_R; |
|
12 |
+ iit_R = R_MakeExternalPtr((void *) iit, R_NilValue, R_NilValue); |
|
13 |
+ R_RegisterCFinalizer(iit_R, R_IIT_free); |
|
14 |
+ return iit_R; |
|
15 |
+} |
|
16 |
+ |
|
17 |
+SEXP R_iit_read(SEXP iitfile_R) { |
|
18 |
+ char *iitfile = (char *) CHAR(asChar(iitfile_R)); |
|
19 |
+ IIT_T iit = IIT_read(iitfile, /*name*/NULL, /*readonlyp*/true, |
|
20 |
+ /*divread*/READ_ALL, /*divstring*/NULL, |
|
21 |
+ /*add_iit_p*/false, /*labels_read_p*/true); |
|
22 |
+ return R_IIT_new(iit); |
|
23 |
+} |
|
24 |
+ |
|
25 |
+/* 'IIT_output_direct' is not included in the libgstruct binary */ |
|
26 |
+ |
|
27 |
+/* |
|
28 |
+ #include <gstruct/iit-write.h> |
|
29 |
+ |
|
30 |
+ SEXP R_iit_write(SEXP tally_iit_R, SEXP iitfile_R) { |
|
31 |
+ IIT_T tally_iit = (IIT_T) R_ExternalPtrAddr(tally_iit_R); |
|
32 |
+ char *iitfile = (char *) CHAR(asChar(iitfile_R)); |
|
33 |
+ IIT_output_direct(iitfile, tally_iit, IIT_LATEST_VERSION); |
|
34 |
+ return iitfile_R; |
|
35 |
+ } |
|
36 |
+*/ |
|
37 |
+ |
0 | 10 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,426 @@ |
1 |
+#include <stdlib.h> /* for abs */ |
|
2 |
+#include <string.h> /* for strlen */ |
|
3 |
+ |
|
4 |
+#include "iit.h" |
|
5 |
+#include "bytestream.h" |
|
6 |
+ |
|
7 |
+enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF, |
|
8 |
+ COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL, |
|
9 |
+ MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS, |
|
10 |
+ COUNT_MINUS_REF, N_BASE_COLS }; |
|
11 |
+ |
|
12 |
+typedef struct TallyTable { |
|
13 |
+ SEXP seqnames_R; |
|
14 |
+ int *pos; |
|
15 |
+ SEXP ref_R; |
|
16 |
+ SEXP read_R; |
|
17 |
+ int *n_cycles; |
|
18 |
+ int *n_cycles_ref; |
|
19 |
+ int *count; |
|
20 |
+ int *count_ref; |
|
21 |
+ int *count_total; |
|
22 |
+ int *high_quality; |
|
23 |
+ int *high_quality_ref; |
|
24 |
+ int *high_quality_total; |
|
25 |
+ double *mean_quality; |
|
26 |
+ double *mean_quality_ref; |
|
27 |
+ int *count_plus; |
|
28 |
+ int *count_plus_ref; |
|
29 |
+ int *count_minus; |
|
30 |
+ int *count_minus_ref; |
|
31 |
+ int **cycle_bins; |
|
32 |
+} TallyTable; |
|
33 |
+ |
|
34 |
+static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) { |
|
35 |
+ SEXP tally_R; /* the result list */ |
|
36 |
+ PROTECT(tally_R = allocVector(VECSXP, N_BASE_COLS + n_cycle_bins)); |
|
37 |
+ |
|
38 |
+ SET_VECTOR_ELT(tally_R, SEQNAMES, allocVector(STRSXP, n_rows)); |
|
39 |
+ SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows)); |
|
40 |
+ SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows)); |
|
41 |
+ SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows)); |
|
42 |
+ SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows)); |
|
43 |
+ SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows)); |
|
44 |
+ SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows)); |
|
45 |
+ SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows)); |
|
46 |
+ SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows)); |
|
47 |
+ SET_VECTOR_ELT(tally_R, HIGH_QUALITY, allocVector(INTSXP, n_rows)); |
|
48 |
+ SET_VECTOR_ELT(tally_R, HIGH_QUALITY_REF, allocVector(INTSXP, n_rows)); |
|
49 |
+ SET_VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL, allocVector(INTSXP, n_rows)); |
|
50 |
+ SET_VECTOR_ELT(tally_R, MEAN_QUALITY, allocVector(REALSXP, n_rows)); |
|
51 |
+ SET_VECTOR_ELT(tally_R, MEAN_QUALITY_REF, allocVector(REALSXP, n_rows)); |
|
52 |
+ SET_VECTOR_ELT(tally_R, COUNT_PLUS, allocVector(INTSXP, n_rows)); |
|
53 |
+ SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows)); |
|
54 |
+ SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows)); |
|
55 |
+ SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows)); |
|
56 |
+ for (int bin = 0; bin < n_cycle_bins; bin++) { |
|
57 |
+ SEXP cycle_bin_R = allocVector(INTSXP, n_rows); |
|
58 |
+ SET_VECTOR_ELT(tally_R, bin + N_BASE_COLS, cycle_bin_R); |
|
59 |
+ } |
|
60 |
+ |
|
61 |
+ UNPROTECT(1); |
|
62 |
+ return tally_R; |
|
63 |
+} |
|
64 |
+ |
|
65 |
+static TallyTable *TallyTable_new(SEXP tally_R) { |
|
66 |
+ TallyTable *tally = (TallyTable *) R_alloc(sizeof(TallyTable), 1); |
|
67 |
+ int n_cycle_bins = length(tally_R) - N_BASE_COLS; |
|
68 |
+ |
|
69 |
+ tally->seqnames_R = VECTOR_ELT(tally_R, SEQNAMES); |
|
70 |
+ tally->pos = INTEGER(VECTOR_ELT(tally_R, POS)); |
|
71 |
+ tally->ref_R = VECTOR_ELT(tally_R, REF); |
|
72 |
+ tally->read_R = VECTOR_ELT(tally_R, READ); |
|
73 |
+ tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES)); |
|
74 |
+ tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF)); |
|
75 |
+ tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT)); |
|
76 |
+ tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF)); |
|
77 |
+ tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL)); |
|
78 |
+ tally->high_quality = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY)); |
|
79 |
+ tally->high_quality_ref = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_REF)); |
|
80 |
+ tally->high_quality_total = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL)); |
|
81 |
+ tally->mean_quality = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY)); |
|
82 |
+ tally->mean_quality_ref = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY_REF)); |
|
83 |
+ tally->count_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS)); |
|
84 |
+ tally->count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF)); |
|
85 |
+ tally->count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS)); |
|
86 |
+ tally->count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF)); |
|
87 |
+ tally->cycle_bins = (int **) R_alloc(sizeof(int*), n_cycle_bins); |
|
88 |
+ for (int bin = 0; bin < n_cycle_bins; bin++) { |
|
89 |
+ tally->cycle_bins[bin] = INTEGER(VECTOR_ELT(tally_R, bin + N_BASE_COLS)); |
|
90 |
+ } |
|
91 |
+ |
|
92 |
+ return tally; |
|
93 |
+} |
|
94 |
+ |
|
95 |
+static void |
|
96 |
+read_total_counts(unsigned char **bytes, int row, int *count_total) { |
|
97 |
+ int count_total_plus = read_int(bytes); |
|
98 |
+ int count_total_minus = read_int(bytes); |
|
99 |
+ count_total[row] = count_total_plus + count_total_minus; |
|
100 |
+} |
|
101 |
+ |
|
102 |
+static void |
|
103 |
+read_cycle_counts(unsigned char **bytes, int row, int *n_cycles, |
|
104 |
+ int **cycle_bins, int *cycle_breaks, int n_cycle_bins) |
|
105 |
+{ |
|
106 |
+ int n_cycle_breaks = n_cycle_bins + 1; |
|
107 |
+ n_cycles[row] = read_int(bytes); |
|
108 |
+ if (cycle_breaks != NULL) { |
|
109 |
+ for (int index = 0; index < n_cycles[row]; index++) { |
|
110 |
+ int cycle = abs(read_int(bytes)); |
|
111 |
+ int count = read_int(bytes); |
|
112 |
+ int bin = 0; |
|
113 |
+ while(n_cycle_breaks > bin && |
|
114 |
+ cycle > cycle_breaks[bin]) |
|
115 |
+ bin++; |
|
116 |
+ if (bin > 0 && bin < n_cycle_breaks) { |
|
117 |
+ cycle_bins[bin-1][row] += count; |
|
118 |
+ } |
|
119 |
+ } |
|
120 |
+ } else { |
|
121 |
+ (*bytes) += n_cycles[row] * 2 * sizeof(int); |
|
122 |
+ } |
|
123 |
+} |
|
124 |
+ |
|
125 |
+static int |
|
126 |
+parse_indels(unsigned char *bytes, int row, |
|
127 |
+ int *cycle_breaks, int n_cycle_bins, |
|
128 |
+ int high_base_quality, TallyTable *tally, bool insertion) |
|
129 |
+{ |
|
130 |
+ int indel_count = read_int(&bytes); |
|
131 |
+ for (int indel = 0; indel < indel_count; indel++) { |
|
132 |
+ /* TODO: add ref counts when we have them from Tom. */ |
|
133 |
+ /* TODO: sum the ref counts with the indel counts for total. */ |
|
134 |
+ tally->count_plus[row] = read_int(&bytes); |
|
135 |
+ tally->count_minus[row] = read_int(&bytes); |
|
136 |
+ tally->count[row] = tally->count_plus[row] + tally->count_minus[row]; |
|
137 |
+ SEXP seq_R = mkChar(read_string(&bytes)); |
|
138 |
+ if (insertion) { |
|
139 |
+ SET_STRING_ELT(tally->read_R, row, seq_R); |
|
140 |
+ SET_STRING_ELT(tally->ref_R, row, R_BlankString); |
|
141 |
+ } |
|
142 |
+ else { |
|
143 |
+ SET_STRING_ELT(tally->read_R, row, R_BlankString); |
|
144 |
+ SET_STRING_ELT(tally->ref_R, row, seq_R); |
|
145 |
+ } |
|
146 |
+ read_cycle_counts(&bytes, row, tally->n_cycles, tally->cycle_bins, |
|
147 |
+ cycle_breaks, n_cycle_bins); |
|
148 |
+ /* quality is per-base and so not relevant for indels */ |
|
149 |
+ tally->mean_quality[row] = NA_REAL; |
|
150 |
+ tally->mean_quality_ref[row] = NA_REAL; |
|
151 |
+ tally->high_quality[row] = NA_INTEGER; |
|
152 |
+ tally->high_quality_ref[row] = NA_INTEGER; |
|
153 |
+ tally->high_quality_total[row] = NA_INTEGER; |
|
154 |
+ } |
|
155 |
+ return indel_count; |
|
156 |
+} |
|
157 |
+ |
|
158 |
+static void |
|
159 |
+read_quality_counts(unsigned char **bytes, int row, int *high_quality, |
|
160 |
+ double *mean_quality, int high_base_quality) |
|
161 |
+{ |
|
162 |
+ int n_qualities = read_int(bytes); |
|
163 |
+ int total_quality = 0; |
|
164 |
+ int total_quality_weight = 0; |
|
165 |
+ high_quality[row] = 0; |
|
166 |
+ for (int index = 0; index < n_qualities; index++) { |
|
167 |
+ int quality = read_int(bytes); |
|
168 |
+ int count = read_int(bytes); |
|
169 |
+ if (quality > high_base_quality) { |
|
170 |
+ total_quality += quality * count; |
|
171 |
+ total_quality_weight += count; |
|
172 |
+ high_quality[row] += count; |
|
173 |
+ } |
|
174 |
+ } |
|
175 |
+ mean_quality[row] = total_quality_weight > 0 ? |
|
176 |
+ (double)total_quality / total_quality_weight : R_NaN; |
|
177 |
+} |
|
178 |
+ |
|
179 |
+static int |
|
180 |
+read_allele_counts(unsigned char **bytes, int row, SEXP read_R, |
|
181 |
+ int *count_plus, int *count_minus, int *count) |
|
182 |
+{ |
|
183 |
+ int n_alleles = 0; |
|
184 |
+ char allele; |
|
185 |
+ while((allele = read_char(bytes)) != '\0') { |
|
186 |
+ SET_STRING_ELT(read_R, row, mkCharLen(&allele, 1)); |
|
187 |
+ count_plus[row] = read_int(bytes); |
|
188 |
+ count_minus[row] = read_int(bytes); |
|
189 |
+ count[row] = count_plus[row] + count_minus[row]; |
|
190 |
+ row++; |
|
191 |
+ n_alleles++; |
|
192 |
+ } |
|
193 |
+ return n_alleles; |
|
194 |
+} |
|
195 |
+ |
|
196 |
+static int |
|
197 |
+parse_alleles(unsigned char *bytes, int row, int ref_row, int *cycle_breaks, |
|
198 |
+ int n_cycle_bins, int high_base_quality, TallyTable *tally) |
|
199 |
+{ |
|
200 |
+ read_total_counts(&bytes, row, tally->count_total); |
|
201 |
+ int n_alleles = read_allele_counts(&bytes, row, tally->read_R, |
|
202 |
+ tally->count_plus, tally->count_minus, |
|
203 |
+ tally->count); |
|
204 |
+ for (int allele = 0; allele < n_alleles; allele++, row++) { |
|
205 |
+ tally->n_cycles[row] = 0; |
|
206 |
+ for (int b = 0; b < n_cycle_bins; b++) { |
|
207 |
+ tally->cycle_bins[b][row] = 0; |
|
208 |
+ } |
|
209 |
+ tally->high_quality[row] = 0; |
|
210 |
+ tally->mean_quality[row] = R_NaN; |
|
211 |
+ if (tally->count[row] > 0) { |
|
212 |
+ read_cycle_counts(&bytes, row, tally->n_cycles, tally->cycle_bins, |
|
213 |
+ cycle_breaks, n_cycle_bins); |
|
214 |
+ read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality, |
|
215 |
+ high_base_quality); |
|
216 |
+ } |
|
217 |
+ } |
|
218 |
+ int high_quality_total = 0; |
|
219 |
+ for (int r = ref_row; r < row; r++) { |
|
220 |
+ high_quality_total += tally->high_quality[r]; |
|
221 |
+ } |
|
222 |
+ for (int r = ref_row; r < row; r++) { |
|
223 |
+ tally->n_cycles_ref[r] = tally->n_cycles[ref_row]; |
|
224 |
+ tally->count_total[r] = tally->count_total[ref_row]; |
|
225 |
+ tally->mean_quality_ref[r] = tally->mean_quality[ref_row]; |
|
226 |
+ tally->high_quality_ref[r] = tally->high_quality[ref_row]; |
|
227 |
+ tally->high_quality_total[r] = high_quality_total; |
|
228 |
+ tally->count_plus_ref[r] = tally->count_plus[ref_row]; |
|
229 |
+ tally->count_minus_ref[r] = tally->count_minus[ref_row]; |
|
230 |
+ tally->count_ref[r] = tally->count_plus_ref[r] + tally->count_minus_ref[r]; |
|
231 |
+ SET_STRING_ELT(tally->ref_R, r, STRING_ELT(tally->read_R, ref_row)); |
|
232 |
+ } |
|
233 |
+ bool have_ref_row = row > ref_row; |
|
234 |
+ if (have_ref_row) { |
|
235 |
+ /* clear the 'alt' columns for the 'ref' row with NAs */ |
|
236 |
+ SET_STRING_ELT(tally->read_R, ref_row, NA_STRING); |
|
237 |
+ tally->n_cycles[ref_row] = NA_INTEGER; |
|
238 |
+ tally->mean_quality[ref_row] = NA_REAL; |
|
239 |
+ tally->high_quality[ref_row] = NA_REAL; |
|
240 |
+ tally->count_plus[ref_row] = NA_INTEGER; |
|
241 |
+ tally->count_minus[ref_row] = NA_INTEGER; |
|
242 |
+ tally->count[ref_row] = NA_INTEGER; |
|
243 |
+ } |
|
244 |
+ return n_alleles; |
|
245 |
+} |
|
246 |
+ |
|
247 |
+static int parse_indel_count(unsigned char *bytes) { |
|
248 |
+ int count = read_int(&bytes); |
|
249 |
+ return count; |
|
250 |
+} |
|
251 |
+ |
|
252 |
+static int parse_allele_count(unsigned char *bytes) { |
|
253 |
+ int n_alleles = 1; /* always have a reference */ |
|
254 |
+ bytes += sizeof(int) * 4 + 1; /* skip total and reference */ |
|
255 |
+ while(bytes[0] != '\0') { |
|
256 |
+ bytes += sizeof(int) * 2 + 1; |
|
257 |
+ n_alleles++; |
|
258 |
+ } |
|
259 |
+ return n_alleles; |
|
260 |
+} |
|
261 |
+ |
|
262 |
+static int count_rows_for_interval(IIT_T tally_iit, int index) { |
|
263 |
+ int n_rows; |
|
264 |
+ unsigned char *bytes = IIT_data(tally_iit, index); |
|
265 |
+ int width = IIT_length(tally_iit, index); |
|
266 |
+ unsigned char *base = bytes + (3 * width + 1) * sizeof(int); |
|
267 |
+ for (int pos = 0; pos < width; pos++) { |
|
268 |
+ int insertion_offset = read_int(&bytes); |
|
269 |
+ int deletion_offset = read_int(&bytes); |
|
270 |
+ int allele_offset = read_int(&bytes); |
|
271 |
+ int next_offset = read_int(&bytes); |
|
272 |
+ bytes -= 4; /* rewind from read-ahead */ |
|
273 |
+ if (deletion_offset - insertion_offset > 0) { |
|
274 |
+ n_rows += parse_indel_count(base + insertion_offset); |
|
275 |
+ } |
|
276 |
+ if (allele_offset - deletion_offset > 0) { |
|
277 |
+ n_rows += parse_indel_count(base + deletion_offset); |
|
278 |
+ } |
|
279 |
+ if (next_offset - allele_offset > 0) { |
|
280 |
+ n_rows += parse_allele_count(base + allele_offset); |
|
281 |
+ } |
|
282 |
+ } |
|
283 |
+ return n_rows; |
|
284 |
+} |
|
285 |
+ |
|
286 |
+static int parse_interval(IIT_T tally_iit, int index, |
|
287 |
+ TallyTable *tally, int row, |
|
288 |
+ int *cycle_breaks, int n_cycle_bins, |
|
289 |
+ int high_base_quality) |
|
290 |
+{ |
|
291 |
+ unsigned char *bytes = IIT_data(tally_iit, index); |
|
292 |
+ int width = IIT_length(tally_iit, index); |
|
293 |
+ unsigned char *base = bytes + (3 * width + 1) * sizeof(int); |
|
294 |
+ int start = IIT_interval_low(tally_iit, index); |
|
295 |
+ SEXP divstring_R; |
|
296 |
+ PROTECT(divstring_R = mkChar(IIT_divstring_from_index(tally_iit, index))); |
|
297 |
+ for (int position = 0; position < width; position++) { |
|
298 |
+ int insertion_offset = read_int(&bytes); |
|
299 |
+ int deletion_offset = read_int(&bytes); |
|
300 |
+ int allele_offset = read_int(&bytes); |
|
301 |
+ int next_offset = read_int(&bytes); |
|
302 |
+ int ref_row = row; |
|
303 |
+ bytes -= 4; /* rewind from read-ahead */ |
|
304 |
+ if (next_offset - allele_offset > 0) |
|
305 |
+ row += parse_alleles(base + allele_offset, row, ref_row, |
|
306 |
+ cycle_breaks, n_cycle_bins, |
|
307 |
+ high_base_quality, tally); |
|
308 |
+ if (deletion_offset - insertion_offset > 0) |
|
309 |
+ row += parse_indels(base + insertion_offset, row, |
|
310 |
+ cycle_breaks, n_cycle_bins, |
|
311 |
+ high_base_quality, tally, true); |
|
312 |
+ if (allele_offset - deletion_offset > 0) |
|
313 |
+ row += parse_indels(base + deletion_offset, row, |
|
314 |
+ cycle_breaks, n_cycle_bins, |
|
315 |
+ high_base_quality, tally, false); |
|
316 |
+ /* fill in position information */ |
|
317 |
+ for (int r = ref_row; r < row; r++) { |
|
318 |
+ SET_STRING_ELT(tally->seqnames_R, r, divstring_R); |
|
319 |
+ tally->pos[r] = start + position; |
|
320 |
+ } |
|
321 |
+ } |
|
322 |
+ UNPROTECT(1); |
|
323 |
+ return row; |
|
324 |
+} |
|
325 |
+ |
|
326 |
+static SEXP parse_all(IIT_T tally_iit, int *cycle_breaks, |
|
327 |
+ int n_cycle_bins, |
|
328 |
+ int high_base_quality) |
|
329 |
+{ |
|
330 |
+ int n_rows = 0; |
|
331 |
+ /* loop over the IIT, getting the total number of rows |
|
332 |
+ this is (num alts + 1) for every position. */ |
|
333 |
+ for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) { |
|
334 |
+ n_rows += count_rows_for_interval(tally_iit, index); |
|
335 |
+ } |
|
336 |
+ |
|
337 |
+ SEXP tally_R; |
|
338 |
+ PROTECT(tally_R = R_TallyTable_new(n_rows, n_cycle_bins)); |
|
339 |
+ TallyTable *tally = TallyTable_new(tally_R); |
|
340 |
+ |
|
341 |
+ int row = 0; |
|
342 |
+ for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) { |
|
343 |
+ row = parse_interval(tally_iit, index, tally, row, |
|
344 |
+ cycle_breaks, n_cycle_bins, |
|
345 |
+ high_base_quality); |
|
346 |
+ } |
|
347 |
+ |
|
348 |
+ UNPROTECT(1); |
|
349 |
+ return tally_R; |
|
350 |
+} |
|
351 |
+ |
|
352 |
+static SEXP parse_some(IIT_T tally_iit, int *cycle_breaks, |
|
353 |
+ int n_cycle_bins, |
|
354 |
+ int high_base_quality, |
|
355 |
+ SEXP chr_R, int *start, int *end) |
|
356 |
+{ |
|
357 |
+ int n_rows = 0; |
|
358 |
+ int *nmatches = (int *) R_alloc(sizeof(int), length(chr_R)); |
|
359 |
+ int **indexes = (int **) R_alloc(sizeof(int*), length(chr_R)); |
|
360 |
+ |
|
361 |
+ for (int i = 0; i < length(chr_R); i++) { |
|
362 |
+ indexes[i] = IIT_get(nmatches + i, tally_iit, |
|
363 |
+ (char *) CHAR(STRING_ELT(chr_R, i)), |
|
364 |
+ start[i], end[i], false); |
|
365 |
+ for (int j = 0; j < nmatches[i]; j++) { |
|
366 |
+ n_rows += count_rows_for_interval(tally_iit, indexes[i][j]); |
|
367 |
+ } |
|
368 |
+ } |
|
369 |
+ |
|
370 |
+ SEXP tally_R; |
|
371 |
+ PROTECT(tally_R = R_TallyTable_new(n_rows, n_cycle_bins)); |
|
372 |
+ TallyTable *tally = TallyTable_new(tally_R); |
|
373 |
+ |
|
374 |
+ int row = 0; |
|
375 |
+ for (int i = 0; i < length(chr_R); i++) { |
|
376 |
+ for (int j = 0; j < nmatches[i]; j++) { |
|
377 |
+ row = parse_interval(tally_iit, indexes[i][j], |
|
378 |
+ tally, row, |
|
379 |
+ cycle_breaks, n_cycle_bins, |
|
380 |
+ high_base_quality); |
|
381 |
+ } |
|
382 |
+ } |
|
383 |
+ |
|
384 |
+ UNPROTECT(1); |
|
385 |
+ return tally_R; |
|
386 |
+} |
|
387 |
+ |
|
388 |
+/* FORMAT |
|
389 |
+ |
|
390 |
+ block header: |
|
391 |
+ [0][ins][del][mm] |
|
392 |
+ |
|
393 |
+ mismatches for each position: |
|
394 |
+ [t+][t-][rn][r+][r-]([an][a+][a-])*[\0]([c#]([cv][cc])*)*([q#]([qv][qc])*)* |
|
395 |
+ |
|
396 |
+ insertions: |
|
397 |
+ [i#]([seq][t+][t-][c#]([cv][cc])*)* |
|
398 |
+ |
|
399 |
+ deletions: |
|
400 |
+ [d#]([seq][t+][t-][c#]([cv][cc])*)* |
|
401 |
+*/ |
|
402 |
+ |
|
403 |
+SEXP R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R, |
|
404 |
+ SEXP high_base_quality_R, SEXP which_R) |
|
405 |
+{ |
|
406 |
+ int high_base_quality = asInteger(high_base_quality_R); |
|
407 |
+ int *cycle_breaks = |
|
408 |
+ cycle_breaks_R == R_NilValue ? NULL : INTEGER(cycle_breaks_R); |
|
409 |
+ int n_cycle_bins = |
|
410 |
+ length(cycle_breaks_R) == 0 ? 0 : length(cycle_breaks_R) - 1; |
|
411 |
+ IIT_T tally_iit = (IIT_T) R_ExternalPtrAddr(tally_iit_R); |
|
412 |
+ SEXP tally_R; |
|
413 |
+ |
|
414 |
+ if (which_R == R_NilValue) { |
|
415 |
+ tally_R = parse_all(tally_iit, cycle_breaks, n_cycle_bins, |
|
416 |
+ high_base_quality); |
|
417 |
+ } else { |
|
418 |
+ SEXP chr_R = VECTOR_ELT(which_R, 0); |
|
419 |
+ int *start = INTEGER(VECTOR_ELT(which_R, 1)); |
|
420 |
+ int *end = INTEGER(VECTOR_ELT(which_R, 2)); |
|
421 |
+ tally_R = parse_some(tally_iit, cycle_breaks, n_cycle_bins, |
|
422 |
+ high_base_quality, chr_R, start, end); |
|
423 |
+ } |
|
424 |
+ |
|
425 |
+ return tally_R; |
|
426 |
+} |