git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@88415 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -77,19 +77,24 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L, |
77 | 77 |
if (length(read_length) != 1L) { |
78 | 78 |
stop("'read_length' must be a single integer") |
79 | 79 |
} |
80 |
+ if (is.na(read_length)) { |
|
81 |
+ read_length <- guessReadLengthFromBam(bamFile(x)) |
|
82 |
+ } |
|
80 | 83 |
tally <- .Call(R_tally_iit_parse, x@ptr, |
81 | 84 |
read_pos_breaks, |
82 | 85 |
normArgSingleInteger(high_base_quality), |
83 | 86 |
NULL, read_length) |
84 | 87 |
|
85 |
- tally_names <- c("seqnames", "pos", "ref", "alt", |
|
86 |
- "raw.count", "raw.count.ref", |
|
88 |
+ tally_names <- c("seqnames", "pos", "ref", "alt", "n.read.pos", |
|
89 |
+ "n.read.pos.ref", "raw.count", "raw.count.ref", |
|
87 | 90 |
"raw.count.total", |
88 | 91 |
"high.quality", "high.quality.ref", |
89 | 92 |
"high.quality.total", "mean.quality", |
90 | 93 |
"mean.quality.ref", |
91 |
- "count.pos", "count.pos.ref", |
|
92 |
- "count.neg", "count.neg.ref", |
|
94 |
+ "count.plus", "count.plus.ref", |
|
95 |
+ "count.minus", "count.minus.ref", |
|
96 |
+ "read.pos.mean", "read.pos.mean.ref", |
|
97 |
+ "read.pos.var", "read.pos.var.ref", |
|
93 | 98 |
"mdfne", "mdfne.ref") |
94 | 99 |
break_names <- character() |
95 | 100 |
if (length(read_pos_breaks) > 0L) { |
... | ... |
@@ -155,6 +160,14 @@ normalizeIndelAlleles <- function(x, genome) { |
155 | 160 |
x |
156 | 161 |
} |
157 | 162 |
|
163 |
+guessReadLengthFromBam <- function(x, n=100L) { |
|
164 |
+ ga <- readGAlignments(BamSampler(x, yieldSize=n)) |
|
165 |
+ readlen <- unique(qwidth(ga)) |
|
166 |
+ if (length(readlen) != 1L) |
|
167 |
+ NA |
|
168 |
+ else readlen |
|
169 |
+} |
|
170 |
+ |
|
158 | 171 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
159 | 172 |
### Low-level wrappers |
160 | 173 |
### |
... | ... |
@@ -232,15 +245,21 @@ normArgTRUEorFALSE <- function(x) { |
232 | 245 |
|
233 | 246 |
variantSummaryColumnDescriptions <- function(read_pos_breaks) { |
234 | 247 |
desc <- c( |
248 |
+ n.read.pos = "Number of unique read positions for the ALT", |
|
249 |
+ n.read.pos.ref = "Number of unique read positions for the REF", |
|
235 | 250 |
raw.count = "Raw ALT count", |
236 | 251 |
raw.count.ref = "Raw REF count", |
237 | 252 |
raw.count.total = "Raw total count", |
238 | 253 |
mean.quality = "Average ALT base quality", |
239 | 254 |
mean.quality.ref = "Average REF base quality", |
240 |
- count.pos = "Raw positive strand ALT count", |
|
241 |
- count.pos.ref = "Raw positive strand REF count", |
|
242 |
- count.neg = "Raw negative strand ALT count", |
|
243 |
- count.neg.ref = "Raw negative strand REF count", |
|
255 |
+ count.plus = "Raw positive strand ALT count", |
|
256 |
+ count.plus.ref = "Raw positive strand REF count", |
|
257 |
+ count.minus = "Raw negative strand ALT count", |
|
258 |
+ count.minus.ref = "Raw negative strand REF count", |
|
259 |
+ read.pos.mean = "Average read position for the ALT", |
|
260 |
+ read.pos.mean.ref = "Average read position for the ALT", |
|
261 |
+ read.pos.var = "Variance in read position for the ALT", |
|
262 |
+ read.pos.var.ref = "Variance in read position for the REF", |
|
244 | 263 |
mdfne = "Median distance from nearest end of read for the ALT", |
245 | 264 |
mdfne.ref = "Median distance from nearest end of read for the REF") |
246 | 265 |
if (length(read_pos_breaks) > 0L) { |
... | ... |
@@ -50,27 +50,28 @@ variantSummary(x, read_pos_breaks = NULL, high_base_quality = 0L, |
50 | 50 |
after quality filtering (except for indels, for which there is no |
51 | 51 |
quality filtering). The following \code{elementMetadata} |
52 | 52 |
columns are also present: |
53 |
- \item{n.read.pos}{The number of unique cycles at which the alternate allele was |
|
54 |
- observed, \code{NA} for the reference allele row.} |
|
55 |
- \item{n.read.pos.ref}{The number of unique cycles at which the reference |
|
56 |
- allele was observed.} |
|
57 | 53 |
\item{raw.count}{The number of reads with the alternate allele, |
58 | 54 |
\code{NA} for the reference allele row.} |
59 | 55 |
\item{raw.count.ref}{The number of reads with the reference allele.} |
60 | 56 |
\item{raw.count.total}{The total number of reads at that position, |
61 | 57 |
including reference and all alternates.} |
58 |
+ \item{mean.quality}{The mean mapping quality for the alt allele.} |
|
62 | 59 |
\item{mean.quality.ref}{The mean mapping quality for the reference |
63 | 60 |
allele.} |
64 |
- \item{count.pos}{The number of positive strand reads for the alternate |
|
61 |
+ \item{count.plus}{The number of positive strand reads for the alternate |
|
65 | 62 |
allele, \code{NA} for the reference allele row.} |
66 |
- \item{count.pos.ref}{The number of positive strand reads for the reference |
|
63 |
+ \item{count.plus.ref}{The number of positive strand reads for the reference |
|
67 | 64 |
allele.} |
68 |
- \item{count.neg}{The number of negative strand reads for the alternate |
|
65 |
+ \item{count.minus}{The number of negative strand reads for the alternate |
|
69 | 66 |
allele, \code{NA} for the reference allele row.} |
70 |
- \item{count.neg.ref}{The number of negative strand reads for the reference |
|
67 |
+ \item{count.minus.ref}{The number of negative strand reads for the reference |
|
71 | 68 |
allele.} |
69 |
+ \item{read.pos.mean}{Mean read position for the alt allele.} |
|
70 |
+ \item{read.pos.mean.ref}{Mean read position for the ref allele.} |
|
72 | 71 |
\item{read.pos.var}{Variance in the read positions for the alt allele.} |
73 | 72 |
\item{read.pos.var.ref}{Variance in the read positions for the ref allele.} |
73 |
+ \item{mdfne}{Median distance from nearest end for the alt allele.} |
|
74 |
+ \item{mdfne.ref}{Median distance from nearest end for the ref allele.} |
|
74 | 75 |
|
75 | 76 |
An additional column is present for each bin formed by |
76 | 77 |
the \code{read_pos_breaks} parameter, with the read count for that bin. |
... | ... |
@@ -4,16 +4,19 @@ |
4 | 4 |
#include "iit.h" |
5 | 5 |
#include "bytestream.h" |
6 | 6 |
|
7 |
-enum { SEQNAMES, POS, REF, READ, COUNT, COUNT_REF, |
|
7 |
+enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF, |
|
8 | 8 |
COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL, |
9 | 9 |
MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS, |
10 |
- COUNT_MINUS_REF, MDFNE, MDFNE_REF, N_BASE_COLS }; |
|
10 |
+ COUNT_MINUS_REF, READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR, |
|
11 |
+ READ_POS_VAR_REF, MDFNE, MDFNE_REF, N_BASE_COLS }; |
|
11 | 12 |
|
12 | 13 |
typedef struct TallyTable { |
13 | 14 |
SEXP seqnames_R; |
14 | 15 |
int *pos; |
15 | 16 |
SEXP ref_R; |
16 | 17 |
SEXP read_R; |
18 |
+ int *n_cycles; |
|
19 |
+ int *n_cycles_ref; |
|
17 | 20 |
int *count; |
18 | 21 |
int *count_ref; |
19 | 22 |
int *count_total; |
... | ... |
@@ -26,6 +29,10 @@ typedef struct TallyTable { |
26 | 29 |
int *count_plus_ref; |
27 | 30 |
int *count_minus; |
28 | 31 |
int *count_minus_ref; |
32 |
+ double *read_pos_mean; |
|
33 |
+ double *read_pos_mean_ref; |
|
34 |
+ double *read_pos_var; |
|
35 |
+ double *read_pos_var_ref; |
|
29 | 36 |
double *mdfne; |
30 | 37 |
double *mdfne_ref; |
31 | 38 |
int **cycle_bins; |
... | ... |
@@ -47,6 +54,8 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) { |
47 | 54 |
SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows)); |
48 | 55 |
SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows)); |
49 | 56 |
SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows)); |
57 |
+ SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows)); |
|
58 |
+ SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows)); |
|
50 | 59 |
SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows)); |
51 | 60 |
SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows)); |
52 | 61 |
SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows)); |
... | ... |
@@ -59,6 +68,10 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) { |
59 | 68 |
SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows)); |
60 | 69 |
SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows)); |
61 | 70 |
SET_VECTOR_ELT(tally_R, COUNT_MINUS_REF, allocVector(INTSXP, n_rows)); |
71 |
+ SET_VECTOR_ELT(tally_R, READ_POS_MEAN, allocVector(REALSXP, n_rows)); |
|
72 |
+ SET_VECTOR_ELT(tally_R, READ_POS_MEAN_REF, allocVector(REALSXP, n_rows)); |
|
73 |
+ SET_VECTOR_ELT(tally_R, READ_POS_VAR, allocVector(REALSXP, n_rows)); |
|
74 |
+ SET_VECTOR_ELT(tally_R, READ_POS_VAR_REF, allocVector(REALSXP, n_rows)); |
|
62 | 75 |
SET_VECTOR_ELT(tally_R, MDFNE, allocVector(REALSXP, n_rows)); |
63 | 76 |
SET_VECTOR_ELT(tally_R, MDFNE_REF, allocVector(REALSXP, n_rows)); |
64 | 77 |
|
... | ... |
@@ -79,6 +92,8 @@ static TallyTable *TallyTable_new(SEXP tally_R) { |
79 | 92 |
tally->pos = INTEGER(VECTOR_ELT(tally_R, POS)); |
80 | 93 |
tally->ref_R = VECTOR_ELT(tally_R, REF); |
81 | 94 |
tally->read_R = VECTOR_ELT(tally_R, READ); |
95 |
+ tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES)); |
|
96 |
+ tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF)); |
|
82 | 97 |
tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT)); |
83 | 98 |
tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF)); |
84 | 99 |
tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL)); |
... | ... |
@@ -91,6 +106,10 @@ static TallyTable *TallyTable_new(SEXP tally_R) { |
91 | 106 |
tally->count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF)); |
92 | 107 |
tally->count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS)); |
93 | 108 |
tally->count_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS_REF)); |
109 |
+ tally->read_pos_mean = REAL(VECTOR_ELT(tally_R, READ_POS_MEAN)); |
|
110 |
+ tally->read_pos_mean_ref = REAL(VECTOR_ELT(tally_R, READ_POS_MEAN_REF)); |
|
111 |
+ tally->read_pos_var = REAL(VECTOR_ELT(tally_R, READ_POS_VAR)); |
|
112 |
+ tally->read_pos_var_ref = REAL(VECTOR_ELT(tally_R, READ_POS_VAR_REF)); |
|
94 | 113 |
tally->mdfne = REAL(VECTOR_ELT(tally_R, MDFNE)); |
95 | 114 |
tally->mdfne_ref = REAL(VECTOR_ELT(tally_R, MDFNE_REF)); |
96 | 115 |
tally->cycle_bins = (int **) R_alloc(sizeof(int*), n_cycle_bins); |
... | ... |
@@ -110,18 +129,19 @@ read_total_counts(unsigned char **bytes, int row, int *count_total) { |
110 | 129 |
|
111 | 130 |
static void |
112 | 131 |
read_cycle_counts(unsigned char **bytes, int row, TallyParam param, |
132 |
+ int *n_cycles, double *read_pos_mean, double *read_pos_var, |
|
113 | 133 |
double *mdfne, int **cycle_bins) |
114 | 134 |
{ |
115 | 135 |
int n_cycle_breaks = param.n_cycle_bins + 1; |
116 |
- int count_sum = 0; |
|
136 |
+ int count_sum = 0, weighted_sum = 0, weighted_sum_sq = 0; |
|
117 | 137 |
int midpoint = param.read_length / 2.0 + 0.5; |
118 | 138 |
|
119 | 139 |
if (param.mdfne_buf != NULL) { |
120 | 140 |
memset(param.mdfne_buf, 0, sizeof(int) * midpoint); |
121 | 141 |
} |
122 |
- int n_cycles = read_int(bytes); |
|
142 |
+ n_cycles[row] = read_int(bytes); |
|
123 | 143 |
|
124 |
- for (int index = 0; index < n_cycles; index++) { |
|
144 |
+ for (int index = 0; index < n_cycles[row]; index++) { |
|
125 | 145 |
int cycle = abs(read_int(bytes)); |
126 | 146 |
int count = read_int(bytes); |
127 | 147 |
int bin = 0; |
... | ... |
@@ -129,6 +149,8 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param, |
129 | 149 |
param.mdfne_buf[midpoint-abs(cycle-midpoint)-1] = count; |
130 | 150 |
} |
131 | 151 |
count_sum += count; |
152 |
+ weighted_sum += cycle * count; |
|
153 |
+ weighted_sum_sq += cycle * cycle * count; |
|
132 | 154 |
if (param.cycle_breaks != NULL) { |
133 | 155 |
while(n_cycle_breaks > bin && |
134 | 156 |
cycle > param.cycle_breaks[bin]) |
... | ... |
@@ -138,6 +160,14 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param, |
138 | 160 |
} |
139 | 161 |
} |
140 | 162 |
} |
163 |
+ |
|
164 |
+ read_pos_mean[row] = ((double)weighted_sum) / count_sum; |
|
165 |
+ if (count_sum > 1) { |
|
166 |
+ read_pos_var[row] = ((double)weighted_sum_sq) / (count_sum - 1) - |
|
167 |
+ (count_sum / (count_sum - 1)) * read_pos_mean[row] * read_pos_mean[row]; |
|
168 |
+ } else { |
|
169 |
+ read_pos_var[row] = NA_REAL; |
|
170 |
+ } |
|
141 | 171 |
|
142 | 172 |
mdfne[row] = NA_REAL; |
143 | 173 |
if (param.mdfne_buf != NULL) { |
... | ... |
@@ -187,7 +217,8 @@ parse_indels(unsigned char *bytes, int row, |
187 | 217 |
SET_STRING_ELT(tally->read_R, row, R_BlankString); |
188 | 218 |
SET_STRING_ELT(tally->ref_R, row, seq_R); |
189 | 219 |
} |
190 |
- read_cycle_counts(&bytes, row, param, |
|
220 |
+ read_cycle_counts(&bytes, row, param, tally->n_cycles, |
|
221 |
+ tally->read_pos_mean, tally->read_pos_var, |
|
191 | 222 |
tally->mdfne, tally->cycle_bins); |
192 | 223 |
/* quality is per-base and so not relevant for indels */ |
193 | 224 |
tally->mean_quality[row] = NA_REAL; |
... | ... |
@@ -196,6 +227,9 @@ parse_indels(unsigned char *bytes, int row, |
196 | 227 |
tally->high_quality_ref[row] = NA_INTEGER; |
197 | 228 |
tally->high_quality_total[row] = NA_INTEGER; |
198 | 229 |
/* no position from which to tabulate the cycles */ |
230 |
+ tally->n_cycles_ref[row] = NA_INTEGER; |
|
231 |
+ tally->read_pos_mean_ref[row] = NA_REAL; |
|
232 |
+ tally->read_pos_var_ref[row] = NA_REAL; |
|
199 | 233 |
tally->mdfne_ref[row] = NA_REAL; |
200 | 234 |
} |
201 | 235 |
return indel_count; |
... | ... |
@@ -248,17 +282,21 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
248 | 282 |
tally->count_plus, tally->count_minus, |
249 | 283 |
tally->count); |
250 | 284 |
for (int allele = 0; allele < n_alleles; allele++, row++) { |
285 |
+ tally->n_cycles[row] = 0; |
|
251 | 286 |
for (int b = 0; b < param.n_cycle_bins; b++) { |
252 | 287 |
tally->cycle_bins[b][row] = 0; |
253 | 288 |
} |
254 | 289 |
tally->high_quality[row] = 0; |
255 | 290 |
tally->mean_quality[row] = R_NaN; |
256 | 291 |
if (tally->count[row] > 0) { |
257 |
- read_cycle_counts(&bytes, row, param, |
|
292 |
+ read_cycle_counts(&bytes, row, param, tally->n_cycles, |
|
293 |
+ tally->read_pos_mean, tally->read_pos_var, |
|
258 | 294 |
tally->mdfne, tally->cycle_bins); |
259 | 295 |
read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality, |
260 | 296 |
param.high_base_quality); |
261 | 297 |
} else { |
298 |
+ tally->read_pos_mean[row] = R_NaN; |
|
299 |
+ tally->read_pos_var[row] = NA_REAL; |
|
262 | 300 |
tally->mdfne[row] = NA_REAL; |
263 | 301 |
} |
264 | 302 |
} |
... | ... |
@@ -267,6 +305,7 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
267 | 305 |
high_quality_total += tally->high_quality[r]; |
268 | 306 |
} |
269 | 307 |
for (int r = ref_row; r < row; r++) { |
308 |
+ tally->n_cycles_ref[r] = tally->n_cycles[ref_row]; |
|
270 | 309 |
tally->count_total[r] = tally->count_total[ref_row]; |
271 | 310 |
tally->mean_quality_ref[r] = tally->mean_quality[ref_row]; |
272 | 311 |
tally->high_quality_ref[r] = tally->high_quality[ref_row]; |
... | ... |
@@ -274,6 +313,8 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
274 | 313 |
tally->count_plus_ref[r] = tally->count_plus[ref_row]; |
275 | 314 |
tally->count_minus_ref[r] = tally->count_minus[ref_row]; |
276 | 315 |
tally->count_ref[r] = tally->count_plus_ref[r] + tally->count_minus_ref[r]; |
316 |
+ tally->read_pos_mean_ref[r] = tally->read_pos_mean[ref_row]; |
|
317 |
+ tally->read_pos_var_ref[r] = tally->read_pos_var[ref_row]; |
|
277 | 318 |
tally->mdfne_ref[r] = tally->mdfne[ref_row]; |
278 | 319 |
SET_STRING_ELT(tally->ref_R, r, STRING_ELT(tally->read_R, ref_row)); |
279 | 320 |
} |
... | ... |
@@ -281,11 +322,14 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
281 | 322 |
if (have_ref_row) { |
282 | 323 |
/* clear the 'alt' columns for the 'ref' row with NAs */ |
283 | 324 |
SET_STRING_ELT(tally->read_R, ref_row, NA_STRING); |
325 |
+ tally->n_cycles[ref_row] = NA_INTEGER; |
|
284 | 326 |
tally->mean_quality[ref_row] = NA_REAL; |
285 | 327 |
tally->high_quality[ref_row] = NA_REAL; |
286 | 328 |
tally->count_plus[ref_row] = NA_INTEGER; |
287 | 329 |
tally->count_minus[ref_row] = NA_INTEGER; |
288 | 330 |
tally->count[ref_row] = NA_INTEGER; |
331 |
+ tally->read_pos_mean[ref_row] = NA_REAL; |
|
332 |
+ tally->read_pos_var[ref_row] = NA_REAL; |
|
289 | 333 |
tally->mdfne[ref_row] = NA_REAL; |
290 | 334 |
} |
291 | 335 |
return n_alleles; |