Browse code

re-add counting of unique read positions (convenient when dupes are not marked) and fix an uninitialized value (param.cycle_breaks) detected by valgrind

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@89842 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 02/05/2014 12:51:58
Showing 4 changed files

... ...
@@ -10,7 +10,7 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
10 10
         methods to work with GMAP and GSNAP from within R. In addition,
11 11
         it provides methods to tally alignment results on a
12 12
         per-nucleotide basis using the bam_tally tool.
13
-Version: 1.7.2
13
+Version: 1.7.3
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15 15
 Imports: S4Vectors, IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15),
16 16
         GenomicFeatures, Biostrings, VariantAnnotation (>= 1.11.1),
... ...
@@ -83,6 +83,7 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
83 83
                  NULL, read_length)
84 84
   
85 85
   tally_names <- c("seqnames", "pos", "ref", "alt",
86
+                   "n.read.pos", "n.read.pos.ref",
86 87
                    "raw.count", "raw.count.ref",
87 88
                    "raw.count.total",
88 89
                    "high.quality", "high.quality.ref",
... ...
@@ -243,6 +244,8 @@ normArgTRUEorFALSE <- function(x) {
243 244
 
244 245
 variantSummaryColumnDescriptions <- function(read_pos_breaks) {
245 246
   desc <- c(
247
+    n.read.pos = "Number of unique read positions for the ALT",
248
+    n.read.pos.ref = "Number of unique read positions for the REF",
246 249
     raw.count = "Raw ALT count",
247 250
     raw.count.ref = "Raw REF count",
248 251
     raw.count.total = "Raw total count",
... ...
@@ -53,6 +53,8 @@ variantSummary(x, read_pos_breaks = NULL, high_base_quality = 0L,
53 53
   after quality filtering (except for indels, for which there is no
54 54
   quality filtering). The following \code{elementMetadata}
55 55
   columns are also present:
56
+  \item{n.read.pos}{The number of unique read positions for the alt allele.}
57
+  \item{n.read.pos.ref}{The number of unique read positions for the ref allele.}
56 58
   \item{raw.count}{The number of reads with the alternate allele,
57 59
     \code{NA} for the reference allele row.}
58 60
   \item{raw.count.ref}{The number of reads with the reference allele.}
... ...
@@ -4,7 +4,7 @@
4 4
 #include "iit.h"
5 5
 #include "bytestream.h"
6 6
 
7
-enum { SEQNAMES, POS, REF, READ, COUNT, COUNT_REF,
7
+enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF,
8 8
        COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL,
9 9
        MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS,
10 10
        COUNT_MINUS_REF, READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR,
... ...
@@ -15,6 +15,8 @@ typedef struct TallyTable {
15 15
   int *pos;
16 16
   SEXP ref_R;
17 17
   SEXP read_R;
18
+  int *n_cycles;
19
+  int *n_cycles_ref;
18 20
   int *count;
19 21
   int *count_ref;
20 22
   int *count_total;
... ...
@@ -52,6 +54,8 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
52 54
   SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows));
53 55
   SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows));
54 56
   SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows));
57
+  SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows));
58
+  SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows));
55 59
   SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows));
56 60
   SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows));
57 61
   SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows));
... ...
@@ -88,6 +92,8 @@ static TallyTable *TallyTable_new(SEXP tally_R) {
88 92
   tally->pos = INTEGER(VECTOR_ELT(tally_R, POS));
89 93
   tally->ref_R = VECTOR_ELT(tally_R, REF);
90 94
   tally->read_R = VECTOR_ELT(tally_R, READ);
95
+  tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES));
96
+  tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF));
91 97
   tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT));
92 98
   tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF));
93 99
   tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL));
... ...
@@ -123,7 +129,7 @@ read_total_counts(unsigned char **bytes, int row, int *count_total) {
123 129
 
124 130
 static void
125 131
 read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
126
-                  double *read_pos_mean, double *read_pos_var,
132
+                  int *n_cycles, double *read_pos_mean, double *read_pos_var,
127 133
                   double *mdfne, int **cycle_bins)
128 134
 {
129 135
   int n_cycle_breaks = param.n_cycle_bins + 1;
... ...
@@ -133,9 +139,9 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
133 139
   if (param.mdfne_buf != NULL) {
134 140
     memset(param.mdfne_buf, 0, sizeof(int) * midpoint);
135 141
   }
136
-  int n_cycles = read_int(bytes);
142
+  n_cycles[row] = read_int(bytes);
137 143
   
138
-  for (int index = 0; index < n_cycles; index++) {
144
+  for (int index = 0; index < n_cycles[row]; index++) {
139 145
     int cycle = abs(read_int(bytes));
140 146
     int count = read_int(bytes);
141 147
     int bin = 0;
... ...
@@ -145,7 +151,7 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
145 151
     count_sum += count;
146 152
     weighted_sum += cycle * count;
147 153
     weighted_sum_sq += cycle * cycle * count;
148
-    if (param.cycle_breaks != NULL) {
154
+    if (param.n_cycle_bins > 0) {
149 155
       while(n_cycle_breaks > bin &&
150 156
             cycle > param.cycle_breaks[bin])
151 157
         bin++;
... ...
@@ -211,7 +217,7 @@ parse_indels(unsigned char *bytes, int row,
211 217
       SET_STRING_ELT(tally->read_R, row, R_BlankString);
212 218
       SET_STRING_ELT(tally->ref_R, row, seq_R);
213 219
     }
214
-    read_cycle_counts(&bytes, row, param,
220
+    read_cycle_counts(&bytes, row, param, tally->n_cycles,
215 221
                       tally->read_pos_mean, tally->read_pos_var,
216 222
                       tally->mdfne, tally->cycle_bins);
217 223
     /* quality is per-base and so not relevant for indels */
... ...
@@ -221,6 +227,7 @@ parse_indels(unsigned char *bytes, int row,
221 227
     tally->high_quality_ref[row] = NA_INTEGER;
222 228
     tally->high_quality_total[row] = NA_INTEGER;
223 229
     /* no position from which to tabulate the cycles */
230
+    tally->n_cycles_ref[row] = NA_INTEGER;
224 231
     tally->read_pos_mean_ref[row] = NA_REAL;
225 232
     tally->read_pos_var_ref[row] = NA_REAL;
226 233
     tally->mdfne_ref[row] = NA_REAL;
... ...
@@ -275,13 +282,14 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
275 282
                                      tally->count_plus, tally->count_minus,
276 283
                                      tally->count);
277 284
   for (int allele = 0; allele < n_alleles; allele++, row++) {
285
+    tally->n_cycles[row] = 0;
278 286
     for (int b = 0; b < param.n_cycle_bins; b++) {
279 287
       tally->cycle_bins[b][row] = 0;
280 288
     }
281 289
     tally->high_quality[row] = 0;
282 290
     tally->mean_quality[row] = R_NaN;
283 291
     if (tally->count[row] > 0) {
284
-      read_cycle_counts(&bytes, row, param,
292
+      read_cycle_counts(&bytes, row, param, tally->n_cycles,
285 293
                         tally->read_pos_mean, tally->read_pos_var,
286 294
                         tally->mdfne, tally->cycle_bins);
287 295
       read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality,
... ...
@@ -297,6 +305,7 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
297 305
     high_quality_total += tally->high_quality[r];
298 306
   }
299 307
   for (int r = ref_row; r < row; r++) {
308
+    tally->n_cycles_ref[r] = tally->n_cycles[ref_row];
300 309
     tally->count_total[r] = tally->count_total[ref_row];
301 310
     tally->mean_quality_ref[r] = tally->mean_quality[ref_row];
302 311
     tally->high_quality_ref[r] = tally->high_quality[ref_row];
... ...
@@ -313,6 +322,7 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
313 322
   if (have_ref_row) {
314 323
     /* clear the 'alt' columns for the 'ref' row with NAs */
315 324
     SET_STRING_ELT(tally->read_R, ref_row, NA_STRING);
325
+    tally->n_cycles[ref_row] = NA_INTEGER;
316 326
     tally->mean_quality[ref_row] = NA_REAL;
317 327
     tally->high_quality[ref_row] = NA_REAL;
318 328
     tally->count_plus[ref_row] = NA_INTEGER;