Browse code

drop unique read position count and mean/variance of read positions from statistics output by bam_tally

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@86546 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 18/02/2014 21:24:22
Showing 3 changed files

... ...
@@ -10,7 +10,7 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
10 10
     to work with GMAP and GSNAP from within R. In addition, it provides 
11 11
     methods to tally alignment results on a per-nucleotide basis using 
12 12
     the bam_tally tool.
13
-Version: 1.5.7
13
+Version: 1.5.8
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15 15
 Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15),
16 16
          GenomicFeatures, Biostrings, VariantAnnotation (>= 1.9.4), tools,
... ...
@@ -76,16 +76,14 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
76 76
                  normArgSingleInteger(high_base_quality),
77 77
                  NULL, read_length)
78 78
   
79
-  tally_names <- c("seqnames", "pos", "ref", "alt", "n.read.pos",
80
-                   "n.read.pos.ref", "raw.count", "raw.count.ref",
79
+  tally_names <- c("seqnames", "pos", "ref", "alt",
80
+                   "raw.count", "raw.count.ref",
81 81
                    "raw.count.total",
82 82
                    "high.quality", "high.quality.ref",
83 83
                    "high.quality.total", "mean.quality",
84 84
                    "mean.quality.ref",
85 85
                    "count.pos", "count.pos.ref",
86 86
                    "count.neg", "count.neg.ref",
87
-                   "read.pos.mean", "read.pos.mean.ref",
88
-                   "read.pos.var", "read.pos.var.ref",
89 87
                    "mdfne", "mdfne.ref")
90 88
   break_names <- character()
91 89
   if (length(read_pos_breaks) > 0L) {
... ...
@@ -127,8 +125,6 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
127 125
 
128 126
 checkTallyConsistency <- function(x) {
129 127
   with(mcols(x), {
130
-    stopifnot(all(n.read.pos.ref <= raw.count.ref, na.rm=TRUE))
131
-    stopifnot(all(n.read.pos <= raw.count, na.rm=TRUE))
132 128
     stopifnot(all(raw.count + raw.count.ref <= raw.count.total))
133 129
     stopifnot(all(altDepth(x) <= raw.count, na.rm=TRUE))
134 130
     stopifnot(all(refDepth(x) <= raw.count.ref, na.rm=TRUE))
... ...
@@ -225,8 +221,6 @@ normArgTRUEorFALSE <- function(x) {
225 221
 
226 222
 variantSummaryColumnDescriptions <- function(read_pos_breaks) {
227 223
   desc <- c(
228
-    n.read.pos = "Number of unique read positions for the ALT",
229
-    n.read.pos.ref = "Number of unique read positions for the REF",
230 224
     raw.count = "Raw ALT count",
231 225
     raw.count.ref = "Raw REF count",
232 226
     raw.count.total = "Raw total count",
... ...
@@ -236,10 +230,6 @@ variantSummaryColumnDescriptions <- function(read_pos_breaks) {
236 230
     count.pos.ref = "Raw positive strand REF count",
237 231
     count.neg = "Raw negative strand ALT count",
238 232
     count.neg.ref = "Raw negative strand REF count",
239
-    read.pos.mean = "Average read position for the ALT",
240
-    read.pos.mean.ref = "Average read position for the ALT",
241
-    read.pos.var = "Variance in read position for the ALT",
242
-    read.pos.var.ref = "Variance in read position for the REF",
243 233
     mdfne = "Median distance from nearest end of read for the ALT",
244 234
     mdfne.ref = "Median distance from nearest end of read for the REF")
245 235
   if (length(read_pos_breaks) > 0L) {
... ...
@@ -4,19 +4,16 @@
4 4
 #include "iit.h"
5 5
 #include "bytestream.h"
6 6
 
7
-enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF,
7
+enum { SEQNAMES, POS, REF, READ, COUNT, COUNT_REF,
8 8
        COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL,
9 9
        MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS,
10
-       COUNT_MINUS_REF, READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR,
11
-       READ_POS_VAR_REF, MDFNE, MDFNE_REF, N_BASE_COLS };
10
+       COUNT_MINUS_REF, MDFNE, MDFNE_REF, N_BASE_COLS };
12 11
 
13 12
 typedef struct TallyTable {
14 13
   SEXP seqnames_R;
15 14
   int *pos;
16 15
   SEXP ref_R;
17 16
   SEXP read_R;
18
-  int *n_cycles;
19
-  int *n_cycles_ref;
20 17
   int *count;
21 18
   int *count_ref;
22 19
   int *count_total;
... ...
@@ -29,10 +26,6 @@ typedef struct TallyTable {
29 26
   int *count_plus_ref;
30 27
   int *count_minus;
31 28
   int *count_minus_ref;
32
-  double *read_pos_mean;
33
-  double *read_pos_mean_ref;
34
-  double *read_pos_var;
35
-  double *read_pos_var_ref;
36 29
   double *mdfne;
37 30
   double *mdfne_ref;
38 31
   int **cycle_bins;
... ...
@@ -54,8 +47,6 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
54 47
   SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows));
55 48
   SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows));
56 49
   SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows));
57
-  SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows));
58
-  SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows));
59 50
   SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows));
60 51
   SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows));
61 52
   SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows));
... ...
@@ -68,10 +59,6 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
68 59
   SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows));
69 60
   SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows));
70 61
   SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows));
71
-  SET_VECTOR_ELT(tally_R, READ_POS_MEAN, allocVector(REALSXP, n_rows));
72
-  SET_VECTOR_ELT(tally_R, READ_POS_MEAN_REF, allocVector(REALSXP, n_rows));
73
-  SET_VECTOR_ELT(tally_R, READ_POS_VAR, allocVector(REALSXP, n_rows));
74
-  SET_VECTOR_ELT(tally_R, READ_POS_VAR_REF, allocVector(REALSXP, n_rows));
75 62
   SET_VECTOR_ELT(tally_R, MDFNE, allocVector(REALSXP, n_rows));
76 63
   SET_VECTOR_ELT(tally_R, MDFNE_REF, allocVector(REALSXP, n_rows));
77 64
 
... ...
@@ -92,8 +79,6 @@ static TallyTable *TallyTable_new(SEXP tally_R) {
92 79
   tally->pos = INTEGER(VECTOR_ELT(tally_R, POS));
93 80
   tally->ref_R = VECTOR_ELT(tally_R, REF);
94 81
   tally->read_R = VECTOR_ELT(tally_R, READ);
95
-  tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES));
96
-  tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF));
97 82
   tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT));
98 83
   tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF));
99 84
   tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL));
... ...
@@ -106,10 +91,6 @@ static TallyTable *TallyTable_new(SEXP tally_R) {
106 91
   tally->count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF));
107 92
   tally->count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS));
108 93
   tally->count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF));
109
-  tally->read_pos_mean = REAL(VECTOR_ELT(tally_R, READ_POS_MEAN));
110
-  tally->read_pos_mean_ref = REAL(VECTOR_ELT(tally_R, READ_POS_MEAN_REF));
111
-  tally->read_pos_var = REAL(VECTOR_ELT(tally_R, READ_POS_VAR));
112
-  tally->read_pos_var_ref = REAL(VECTOR_ELT(tally_R, READ_POS_VAR_REF));
113 94
   tally->mdfne = REAL(VECTOR_ELT(tally_R, MDFNE));
114 95
   tally->mdfne_ref = REAL(VECTOR_ELT(tally_R, MDFNE_REF));
115 96
   tally->cycle_bins = (int **) R_alloc(sizeof(int*), n_cycle_bins);
... ...
@@ -129,7 +110,6 @@ read_total_counts(unsigned char **bytes, int row, int *count_total) {
129 110
 
130 111
 static void
131 112
 read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
132
-                  int *n_cycles, double *read_pos_mean, double *read_pos_var,
133 113
                   double *mdfne, int **cycle_bins)
134 114
 {
135 115
   int n_cycle_breaks = param.n_cycle_bins + 1;
... ...
@@ -139,9 +119,9 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
139 119
   if (param.mdfne_buf != NULL) {
140 120
     memset(param.mdfne_buf, 0, sizeof(int) * midpoint);
141 121
   }
142
-  n_cycles[row] = read_int(bytes);
122
+  int n_cycles = read_int(bytes);
143 123
   
144
-  for (int index = 0; index < n_cycles[row]; index++) {
124
+  for (int index = 0; index < n_cycles; index++) {
145 125
     int cycle = abs(read_int(bytes));
146 126
     int count = read_int(bytes);
147 127
     int bin = 0;
... ...
@@ -160,14 +140,6 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
160 140
       }
161 141
     }
162 142
   }
163
-
164
-  read_pos_mean[row] = ((double)weighted_sum) / count_sum;
165
-  if (count_sum > 1) {
166
-    read_pos_var[row] = ((double)weighted_sum_sq) / (count_sum - 1) -
167
-      (count_sum / (count_sum - 1)) * read_pos_mean[row] * read_pos_mean[row];
168
-  } else {
169
-    read_pos_var[row] = NA_REAL;
170
-  }
171 143
   
172 144
   mdfne[row] = NA_REAL;
173 145
   if (param.mdfne_buf != NULL) {
... ...
@@ -217,8 +189,7 @@ parse_indels(unsigned char *bytes, int row,
217 189
       SET_STRING_ELT(tally->read_R, row, R_BlankString);
218 190
       SET_STRING_ELT(tally->ref_R, row, seq_R);
219 191
     }
220
-    read_cycle_counts(&bytes, row, param, tally->n_cycles,
221
-                      tally->read_pos_mean, tally->read_pos_var,
192
+    read_cycle_counts(&bytes, row, param,
222 193
                       tally->mdfne, tally->cycle_bins);
223 194
     /* quality is per-base and so not relevant for indels */
224 195
     tally->mean_quality[row] = NA_REAL;
... ...
@@ -227,9 +198,6 @@ parse_indels(unsigned char *bytes, int row,
227 198
     tally->high_quality_ref[row] = NA_INTEGER;
228 199
     tally->high_quality_total[row] = NA_INTEGER;
229 200
     /* no position from which to tabulate the cycles */
230
-    tally->n_cycles_ref[row] = NA_INTEGER;
231
-    tally->read_pos_mean_ref[row] = NA_REAL;
232
-    tally->read_pos_var_ref[row] = NA_REAL;
233 201
     tally->mdfne_ref[row] = NA_REAL;
234 202
   }
235 203
   return indel_count;
... ...
@@ -282,21 +250,17 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
282 250
                                      tally->count_plus, tally->count_minus,
283 251
                                      tally->count);
284 252
   for (int allele = 0; allele < n_alleles; allele++, row++) {
285
-    tally->n_cycles[row] = 0;
286 253
     for (int b = 0; b < param.n_cycle_bins; b++) {
287 254
       tally->cycle_bins[b][row] = 0;
288 255
     }
289 256
     tally->high_quality[row] = 0;
290 257
     tally->mean_quality[row] = R_NaN;
291 258
     if (tally->count[row] > 0) {
292
-      read_cycle_counts(&bytes, row, param, tally->n_cycles,
293
-                        tally->read_pos_mean, tally->read_pos_var,
259
+      read_cycle_counts(&bytes, row, param,
294 260
                         tally->mdfne, tally->cycle_bins);
295 261
       read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality,
296 262
                           param.high_base_quality);
297 263
     } else {
298
-      tally->read_pos_mean[row] = R_NaN;
299
-      tally->read_pos_var[row] = NA_REAL;
300 264
       tally->mdfne[row] = NA_REAL;
301 265
     }
302 266
   }
... ...
@@ -305,7 +269,6 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
305 269
     high_quality_total += tally->high_quality[r];
306 270
   }
307 271
   for (int r = ref_row; r < row; r++) {
308
-    tally->n_cycles_ref[r] = tally->n_cycles[ref_row];
309 272
     tally->count_total[r] = tally->count_total[ref_row];
310 273
     tally->mean_quality_ref[r] = tally->mean_quality[ref_row];
311 274
     tally->high_quality_ref[r] = tally->high_quality[ref_row];
... ...
@@ -313,8 +276,6 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
313 276
     tally->count_plus_ref[r] = tally->count_plus[ref_row];
314 277
     tally->count_minus_ref[r] = tally->count_minus[ref_row];
315 278
     tally->count_ref[r] = tally->count_plus_ref[r] + tally->count_minus_ref[r];
316
-    tally->read_pos_mean_ref[r] = tally->read_pos_mean[ref_row];
317
-    tally->read_pos_var_ref[r] = tally->read_pos_var[ref_row];
318 279
     tally->mdfne_ref[r] = tally->mdfne[ref_row];
319 280
     SET_STRING_ELT(tally->ref_R, r, STRING_ELT(tally->read_R, ref_row));  
320 281
   }
... ...
@@ -322,14 +283,11 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
322 283
   if (have_ref_row) {
323 284
     /* clear the 'alt' columns for the 'ref' row with NAs */
324 285
     SET_STRING_ELT(tally->read_R, ref_row, NA_STRING);
325
-    tally->n_cycles[ref_row] = NA_INTEGER;
326 286
     tally->mean_quality[ref_row] = NA_REAL;
327 287
     tally->high_quality[ref_row] = NA_REAL;
328 288
     tally->count_plus[ref_row] = NA_INTEGER;
329 289
     tally->count_minus[ref_row] = NA_INTEGER;
330 290
     tally->count[ref_row] = NA_INTEGER;
331
-    tally->read_pos_mean[ref_row] = NA_REAL;
332
-    tally->read_pos_var[ref_row] = NA_REAL;
333 291
     tally->mdfne[ref_row] = NA_REAL;
334 292
   }
335 293
   return n_alleles;