Browse code

bring back the unique read position count, and mean/variance of read positions

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@88415 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 04/04/2014 13:16:32
Showing 3 changed files

... ...
@@ -77,19 +77,24 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
77 77
   if (length(read_length) != 1L) {
78 78
     stop("'read_length' must be a single integer")
79 79
   }
80
+  if (is.na(read_length)) {
81
+    read_length <- guessReadLengthFromBam(bamFile(x))
82
+  }
80 83
   tally <- .Call(R_tally_iit_parse, x@ptr,
81 84
                  read_pos_breaks,
82 85
                  normArgSingleInteger(high_base_quality),
83 86
                  NULL, read_length)
84 87
   
85
-  tally_names <- c("seqnames", "pos", "ref", "alt",
86
-                   "raw.count", "raw.count.ref",
88
+  tally_names <- c("seqnames", "pos", "ref", "alt", "n.read.pos",
89
+                   "n.read.pos.ref", "raw.count", "raw.count.ref",
87 90
                    "raw.count.total",
88 91
                    "high.quality", "high.quality.ref",
89 92
                    "high.quality.total", "mean.quality",
90 93
                    "mean.quality.ref",
91
-                   "count.pos", "count.pos.ref",
92
-                   "count.neg", "count.neg.ref",
94
+                   "count.plus", "count.plus.ref",
95
+                   "count.minus", "count.minus.ref",
96
+                   "read.pos.mean", "read.pos.mean.ref",
97
+                   "read.pos.var", "read.pos.var.ref",
93 98
                    "mdfne", "mdfne.ref")
94 99
   break_names <- character()
95 100
   if (length(read_pos_breaks) > 0L) {
... ...
@@ -155,6 +160,14 @@ normalizeIndelAlleles <- function(x, genome) {
155 160
   x
156 161
 }
157 162
 
163
+guessReadLengthFromBam <- function(x, n=100L) {
164
+  ga <- readGAlignments(BamSampler(x, yieldSize=n))
165
+  readlen <- unique(qwidth(ga))
166
+  if (length(readlen) != 1L)
167
+    NA
168
+  else readlen
169
+}
170
+
158 171
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
159 172
 ### Low-level wrappers
160 173
 ###
... ...
@@ -232,15 +245,21 @@ normArgTRUEorFALSE <- function(x) {
232 245
 
233 246
 variantSummaryColumnDescriptions <- function(read_pos_breaks) {
234 247
   desc <- c(
248
+    n.read.pos = "Number of unique read positions for the ALT",
249
+    n.read.pos.ref = "Number of unique read positions for the REF",
235 250
     raw.count = "Raw ALT count",
236 251
     raw.count.ref = "Raw REF count",
237 252
     raw.count.total = "Raw total count",
238 253
     mean.quality = "Average ALT base quality",
239 254
     mean.quality.ref = "Average REF base quality",
240
-    count.pos = "Raw positive strand ALT count",
241
-    count.pos.ref = "Raw positive strand REF count",
242
-    count.neg = "Raw negative strand ALT count",
243
-    count.neg.ref = "Raw negative strand REF count",
255
+    count.plus = "Raw positive strand ALT count",
256
+    count.plus.ref = "Raw positive strand REF count",
257
+    count.minus = "Raw negative strand ALT count",
258
+    count.minus.ref = "Raw negative strand REF count",
259
+    read.pos.mean = "Average read position for the ALT",
260
+    read.pos.mean.ref = "Average read position for the ALT",
261
+    read.pos.var = "Variance in read position for the ALT",
262
+    read.pos.var.ref = "Variance in read position for the REF",
244 263
     mdfne = "Median distance from nearest end of read for the ALT",
245 264
     mdfne.ref = "Median distance from nearest end of read for the REF")
246 265
   if (length(read_pos_breaks) > 0L) {
... ...
@@ -50,27 +50,28 @@ variantSummary(x, read_pos_breaks = NULL, high_base_quality = 0L,
50 50
   after quality filtering (except for indels, for which there is no
51 51
   quality filtering). The following \code{elementMetadata}
52 52
   columns are also present:
53
-  \item{n.read.pos}{The number of unique cycles at which the alternate allele was
54
-    observed, \code{NA} for the reference allele row.}
55
-  \item{n.read.pos.ref}{The number of unique cycles at which the reference
56
-    allele was observed.}
57 53
   \item{raw.count}{The number of reads with the alternate allele,
58 54
     \code{NA} for the reference allele row.}
59 55
   \item{raw.count.ref}{The number of reads with the reference allele.}
60 56
   \item{raw.count.total}{The total number of reads at that position,
61 57
     including reference and all alternates.}
58
+  \item{mean.quality}{The mean mapping quality for the alt allele.}
62 59
   \item{mean.quality.ref}{The mean mapping quality for the reference
63 60
     allele.}
64
-  \item{count.pos}{The number of positive strand reads for the alternate
61
+  \item{count.plus}{The number of positive strand reads for the alternate
65 62
     allele, \code{NA} for the reference allele row.}
66
-  \item{count.pos.ref}{The number of positive strand reads for the reference
63
+  \item{count.plus.ref}{The number of positive strand reads for the reference
67 64
     allele.}
68
-  \item{count.neg}{The number of negative strand reads for the alternate
65
+  \item{count.minus}{The number of negative strand reads for the alternate
69 66
     allele, \code{NA} for the reference allele row.}
70
-  \item{count.neg.ref}{The number of negative strand reads for the reference
67
+  \item{count.minus.ref}{The number of negative strand reads for the reference
71 68
     allele.}
69
+  \item{read.pos.mean}{Mean read position for the alt allele.}
70
+  \item{read.pos.mean.ref}{Mean read position for the ref allele.}
72 71
   \item{read.pos.var}{Variance in the read positions for the alt allele.}
73 72
   \item{read.pos.var.ref}{Variance in the read positions for the ref allele.}
73
+  \item{mdfne}{Median distance from nearest end for the alt allele.}
74
+  \item{mdfne.ref}{Median distance from nearest end for the ref allele.}
74 75
   
75 76
   An additional column is present for each bin formed by
76 77
   the \code{read_pos_breaks} parameter, with the read count for that bin.
... ...
@@ -4,16 +4,19 @@
4 4
 #include "iit.h"
5 5
 #include "bytestream.h"
6 6
 
7
-enum { SEQNAMES, POS, REF, READ, COUNT, COUNT_REF,
7
+enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF,
8 8
        COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL,
9 9
        MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS,
10
-       COUNT_MINUS_REF, MDFNE, MDFNE_REF, N_BASE_COLS };
10
+       COUNT_MINUS_REF, READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR,
11
+       READ_POS_VAR_REF, MDFNE, MDFNE_REF, N_BASE_COLS };
11 12
 
12 13
 typedef struct TallyTable {
13 14
   SEXP seqnames_R;
14 15
   int *pos;
15 16
   SEXP ref_R;
16 17
   SEXP read_R;
18
+  int *n_cycles;
19
+  int *n_cycles_ref;
17 20
   int *count;
18 21
   int *count_ref;
19 22
   int *count_total;
... ...
@@ -26,6 +29,10 @@ typedef struct TallyTable {
26 29
   int *count_plus_ref;
27 30
   int *count_minus;
28 31
   int *count_minus_ref;
32
+  double *read_pos_mean;
33
+  double *read_pos_mean_ref;
34
+  double *read_pos_var;
35
+  double *read_pos_var_ref;
29 36
   double *mdfne;
30 37
   double *mdfne_ref;
31 38
   int **cycle_bins;
... ...
@@ -47,6 +54,8 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
47 54
   SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows));
48 55
   SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows));
49 56
   SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows));
57
+  SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows));
58
+  SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows));
50 59
   SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows));
51 60
   SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows));
52 61
   SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows));
... ...
@@ -59,6 +68,10 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
59 68
   SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows));
60 69
   SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows));
61 70
   SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows));
71
+  SET_VECTOR_ELT(tally_R, READ_POS_MEAN, allocVector(REALSXP, n_rows));
72
+  SET_VECTOR_ELT(tally_R, READ_POS_MEAN_REF, allocVector(REALSXP, n_rows));
73
+  SET_VECTOR_ELT(tally_R, READ_POS_VAR, allocVector(REALSXP, n_rows));
74
+  SET_VECTOR_ELT(tally_R, READ_POS_VAR_REF, allocVector(REALSXP, n_rows));
62 75
   SET_VECTOR_ELT(tally_R, MDFNE, allocVector(REALSXP, n_rows));
63 76
   SET_VECTOR_ELT(tally_R, MDFNE_REF, allocVector(REALSXP, n_rows));
64 77
 
... ...
@@ -79,6 +92,8 @@ static TallyTable *TallyTable_new(SEXP tally_R) {
79 92
   tally->pos = INTEGER(VECTOR_ELT(tally_R, POS));
80 93
   tally->ref_R = VECTOR_ELT(tally_R, REF);
81 94
   tally->read_R = VECTOR_ELT(tally_R, READ);
95
+  tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES));
96
+  tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF));
82 97
   tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT));
83 98
   tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF));
84 99
   tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL));
... ...
@@ -91,6 +106,10 @@ static TallyTable *TallyTable_new(SEXP tally_R) {
91 106
   tally->count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF));
92 107
   tally->count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS));
93 108
   tally->count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF));
109
+  tally->read_pos_mean = REAL(VECTOR_ELT(tally_R, READ_POS_MEAN));
110
+  tally->read_pos_mean_ref = REAL(VECTOR_ELT(tally_R, READ_POS_MEAN_REF));
111
+  tally->read_pos_var = REAL(VECTOR_ELT(tally_R, READ_POS_VAR));
112
+  tally->read_pos_var_ref = REAL(VECTOR_ELT(tally_R, READ_POS_VAR_REF));
94 113
   tally->mdfne = REAL(VECTOR_ELT(tally_R, MDFNE));
95 114
   tally->mdfne_ref = REAL(VECTOR_ELT(tally_R, MDFNE_REF));
96 115
   tally->cycle_bins = (int **) R_alloc(sizeof(int*), n_cycle_bins);
... ...
@@ -110,18 +129,19 @@ read_total_counts(unsigned char **bytes, int row, int *count_total) {
110 129
 
111 130
 static void
112 131
 read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
132
+                  int *n_cycles, double *read_pos_mean, double *read_pos_var,
113 133
                   double *mdfne, int **cycle_bins)
114 134
 {
115 135
   int n_cycle_breaks = param.n_cycle_bins + 1;
116
-  int count_sum = 0;
136
+  int count_sum = 0, weighted_sum = 0, weighted_sum_sq = 0;
117 137
   int midpoint = param.read_length / 2.0 + 0.5;
118 138
 
119 139
   if (param.mdfne_buf != NULL) {
120 140
     memset(param.mdfne_buf, 0, sizeof(int) * midpoint);
121 141
   }
122
-  int n_cycles = read_int(bytes);
142
+  n_cycles[row] = read_int(bytes);
123 143
   
124
-  for (int index = 0; index < n_cycles; index++) {
144
+  for (int index = 0; index < n_cycles[row]; index++) {
125 145
     int cycle = abs(read_int(bytes));
126 146
     int count = read_int(bytes);
127 147
     int bin = 0;
... ...
@@ -129,6 +149,8 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
129 149
       param.mdfne_buf[midpoint-abs(cycle-midpoint)-1] = count;
130 150
     }
131 151
     count_sum += count;
152
+    weighted_sum += cycle * count;
153
+    weighted_sum_sq += cycle * cycle * count;
132 154
     if (param.cycle_breaks != NULL) {
133 155
       while(n_cycle_breaks > bin &&
134 156
             cycle > param.cycle_breaks[bin])
... ...
@@ -138,6 +160,14 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
138 160
       }
139 161
     }
140 162
   }
163
+
164
+  read_pos_mean[row] = ((double)weighted_sum) / count_sum;
165
+  if (count_sum > 1) {
166
+    read_pos_var[row] = ((double)weighted_sum_sq) / (count_sum - 1) -
167
+      (count_sum / (count_sum - 1)) * read_pos_mean[row] * read_pos_mean[row];
168
+  } else {
169
+    read_pos_var[row] = NA_REAL;
170
+  }
141 171
   
142 172
   mdfne[row] = NA_REAL;
143 173
   if (param.mdfne_buf != NULL) {
... ...
@@ -187,7 +217,8 @@ parse_indels(unsigned char *bytes, int row,
187 217
       SET_STRING_ELT(tally->read_R, row, R_BlankString);
188 218
       SET_STRING_ELT(tally->ref_R, row, seq_R);
189 219
     }
190
-    read_cycle_counts(&bytes, row, param,
220
+    read_cycle_counts(&bytes, row, param, tally->n_cycles,
221
+                      tally->read_pos_mean, tally->read_pos_var,
191 222
                       tally->mdfne, tally->cycle_bins);
192 223
     /* quality is per-base and so not relevant for indels */
193 224
     tally->mean_quality[row] = NA_REAL;
... ...
@@ -196,6 +227,9 @@ parse_indels(unsigned char *bytes, int row,
196 227
     tally->high_quality_ref[row] = NA_INTEGER;
197 228
     tally->high_quality_total[row] = NA_INTEGER;
198 229
     /* no position from which to tabulate the cycles */
230
+    tally->n_cycles_ref[row] = NA_INTEGER;
231
+    tally->read_pos_mean_ref[row] = NA_REAL;
232
+    tally->read_pos_var_ref[row] = NA_REAL;
199 233
     tally->mdfne_ref[row] = NA_REAL;
200 234
   }
201 235
   return indel_count;
... ...
@@ -248,17 +282,21 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
248 282
                                      tally->count_plus, tally->count_minus,
249 283
                                      tally->count);
250 284
   for (int allele = 0; allele < n_alleles; allele++, row++) {
285
+    tally->n_cycles[row] = 0;
251 286
     for (int b = 0; b < param.n_cycle_bins; b++) {
252 287
       tally->cycle_bins[b][row] = 0;
253 288
     }
254 289
     tally->high_quality[row] = 0;
255 290
     tally->mean_quality[row] = R_NaN;
256 291
     if (tally->count[row] > 0) {
257
-      read_cycle_counts(&bytes, row, param,
292
+      read_cycle_counts(&bytes, row, param, tally->n_cycles,
293
+                        tally->read_pos_mean, tally->read_pos_var,
258 294
                         tally->mdfne, tally->cycle_bins);
259 295
       read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality,
260 296
                           param.high_base_quality);
261 297
     } else {
298
+      tally->read_pos_mean[row] = R_NaN;
299
+      tally->read_pos_var[row] = NA_REAL;
262 300
       tally->mdfne[row] = NA_REAL;
263 301
     }
264 302
   }
... ...
@@ -267,6 +305,7 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
267 305
     high_quality_total += tally->high_quality[r];
268 306
   }
269 307
   for (int r = ref_row; r < row; r++) {
308
+    tally->n_cycles_ref[r] = tally->n_cycles[ref_row];
270 309
     tally->count_total[r] = tally->count_total[ref_row];
271 310
     tally->mean_quality_ref[r] = tally->mean_quality[ref_row];
272 311
     tally->high_quality_ref[r] = tally->high_quality[ref_row];
... ...
@@ -274,6 +313,8 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
274 313
     tally->count_plus_ref[r] = tally->count_plus[ref_row];
275 314
     tally->count_minus_ref[r] = tally->count_minus[ref_row];
276 315
     tally->count_ref[r] = tally->count_plus_ref[r] + tally->count_minus_ref[r];
316
+    tally->read_pos_mean_ref[r] = tally->read_pos_mean[ref_row];
317
+    tally->read_pos_var_ref[r] = tally->read_pos_var[ref_row];
277 318
     tally->mdfne_ref[r] = tally->mdfne[ref_row];
278 319
     SET_STRING_ELT(tally->ref_R, r, STRING_ELT(tally->read_R, ref_row));  
279 320
   }
... ...
@@ -281,11 +322,14 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
281 322
   if (have_ref_row) {
282 323
     /* clear the 'alt' columns for the 'ref' row with NAs */
283 324
     SET_STRING_ELT(tally->read_R, ref_row, NA_STRING);
325
+    tally->n_cycles[ref_row] = NA_INTEGER;
284 326
     tally->mean_quality[ref_row] = NA_REAL;
285 327
     tally->high_quality[ref_row] = NA_REAL;
286 328
     tally->count_plus[ref_row] = NA_INTEGER;
287 329
     tally->count_minus[ref_row] = NA_INTEGER;
288 330
     tally->count[ref_row] = NA_INTEGER;
331
+    tally->read_pos_mean[ref_row] = NA_REAL;
332
+    tally->read_pos_var[ref_row] = NA_REAL;
289 333
     tally->mdfne[ref_row] = NA_REAL;
290 334
   }
291 335
   return n_alleles;