Browse code

drop the unique read position counts; renamed count.pos/count.neg to count.plus/count.minus (way better names)

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@88417 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 04/04/2014 13:36:22
Showing 5 changed files

... ...
@@ -10,11 +10,11 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
10 10
         methods to work with GMAP and GSNAP from within R. In addition,
11 11
         it provides methods to tally alignment results on a
12 12
         per-nucleotide basis using the bam_tally tool.
13
-Version: 1.5.14
13
+Version: 1.5.15
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15 15
 Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15),
16 16
         GenomicFeatures, Biostrings, VariantAnnotation (>= 1.9.4),
17
-        tools, Biobase, BSgenome
17
+        tools, Biobase, BSgenome, GenomicAlignments
18 18
 Suggests: RUnit, BSgenome.Dmelanogaster.UCSC.dm3,
19 19
         BSgenome.Scerevisiae.UCSC.sacCer3, org.Hs.eg.db,
20 20
         TxDb.Hsapiens.UCSC.hg19.knownGene, BSgenome.Hsapiens.UCSC.hg19,
... ...
@@ -1,6 +1,7 @@
1 1
 useDynLib(gmapR, .registration = TRUE)
2 2
 
3
-importFrom(Rsamtools, path, bamPaths, "bamWhich<-", BamFile, BamFileList)
3
+importFrom(Rsamtools, path, bamPaths, "bamWhich<-", BamFile, BamFileList,
4
+           BamSampler)
4 5
 importFrom(tools, file_path_as_absolute, file_ext, file_path_sans_ext,
5 6
            list_files_with_exts)
6 7
 importFrom(Biobase, createPackage)
... ...
@@ -21,6 +22,8 @@ importFrom(VariantAnnotation, readVcf, ScanVcfParam, fixed, VRanges, ref, alt,
21 22
            "ref<-", "alt<-", altDepth, refDepth)
22 23
 importClassesFrom(VariantAnnotation, "VCF", "VRanges")
23 24
 importFrom(BSgenome, getSeq, providerVersion)
25
+importFrom(GenomicAlignments, readGAlignments)
26
+importMethodsFrom(GenomicAlignments, qwidth)
24 27
 
25 28
 ## public API
26 29
 
... ...
@@ -85,8 +85,8 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
85 85
                  normArgSingleInteger(high_base_quality),
86 86
                  NULL, read_length)
87 87
   
88
-  tally_names <- c("seqnames", "pos", "ref", "alt", "n.read.pos",
89
-                   "n.read.pos.ref", "raw.count", "raw.count.ref",
88
+  tally_names <- c("seqnames", "pos", "ref", "alt",
89
+                   "raw.count", "raw.count.ref",
90 90
                    "raw.count.total",
91 91
                    "high.quality", "high.quality.ref",
92 92
                    "high.quality.total", "mean.quality",
... ...
@@ -142,8 +142,8 @@ checkTallyConsistency <- function(x) {
142 142
     stopifnot(all(raw.count + raw.count.ref <= raw.count.total, na.rm=TRUE))
143 143
     stopifnot(all(altDepth(x) <= raw.count, na.rm=TRUE))
144 144
     stopifnot(all(refDepth(x) <= raw.count.ref, na.rm=TRUE))
145
-    stopifnot(all(count.pos + count.neg == raw.count, na.rm=TRUE))
146
-    stopifnot(all(count.pos.ref + count.neg.ref == raw.count.ref))
145
+    stopifnot(all(count.plus + count.minus == raw.count, na.rm=TRUE))
146
+    stopifnot(all(count.plus.ref + count.minus.ref == raw.count.ref))
147 147
   })
148 148
 }
149 149
 
... ...
@@ -245,8 +245,6 @@ normArgTRUEorFALSE <- function(x) {
245 245
 
246 246
 variantSummaryColumnDescriptions <- function(read_pos_breaks) {
247 247
   desc <- c(
248
-    n.read.pos = "Number of unique read positions for the ALT",
249
-    n.read.pos.ref = "Number of unique read positions for the REF",
250 248
     raw.count = "Raw ALT count",
251 249
     raw.count.ref = "Raw REF count",
252 250
     raw.count.total = "Raw total count",
... ...
@@ -55,9 +55,10 @@ variantSummary(x, read_pos_breaks = NULL, high_base_quality = 0L,
55 55
   \item{raw.count.ref}{The number of reads with the reference allele.}
56 56
   \item{raw.count.total}{The total number of reads at that position,
57 57
     including reference and all alternates.}
58
-  \item{mean.quality}{The mean mapping quality for the alt allele.}
59
-  \item{mean.quality.ref}{The mean mapping quality for the reference
60
-    allele.}
58
+  \item{mean.quality}{The mean base quality for the alt allele,
59
+    truncated at \code{high_base_quality}.}
60
+  \item{mean.quality.ref}{The mean base quality for the ref allele,
61
+    truncated at \code{high_base_quality}.}
61 62
   \item{count.plus}{The number of positive strand reads for the alternate
62 63
     allele, \code{NA} for the reference allele row.}
63 64
   \item{count.plus.ref}{The number of positive strand reads for the reference
... ...
@@ -4,7 +4,7 @@
4 4
 #include "iit.h"
5 5
 #include "bytestream.h"
6 6
 
7
-enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF,
7
+enum { SEQNAMES, POS, REF, READ, COUNT, COUNT_REF,
8 8
        COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL,
9 9
        MEAN_QUALITY, MEAN_QUALITY_REF, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS,
10 10
        COUNT_MINUS_REF, READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR,
... ...
@@ -15,8 +15,6 @@ typedef struct TallyTable {
15 15
   int *pos;
16 16
   SEXP ref_R;
17 17
   SEXP read_R;
18
-  int *n_cycles;
19
-  int *n_cycles_ref;
20 18
   int *count;
21 19
   int *count_ref;
22 20
   int *count_total;
... ...
@@ -54,8 +52,6 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins) {
54 52
   SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows));
55 53
   SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows));
56 54
   SET_VECTOR_ELT(tally_R, READ, allocVector(STRSXP, n_rows));
57
-  SET_VECTOR_ELT(tally_R, N_CYCLES, allocVector(INTSXP, n_rows));
58
-  SET_VECTOR_ELT(tally_R, N_CYCLES_REF, allocVector(INTSXP, n_rows));
59 55
   SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows));
60 56
   SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows));
61 57
   SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows));
... ...
@@ -92,8 +88,6 @@ static TallyTable *TallyTable_new(SEXP tally_R) {
92 88
   tally->pos = INTEGER(VECTOR_ELT(tally_R, POS));
93 89
   tally->ref_R = VECTOR_ELT(tally_R, REF);
94 90
   tally->read_R = VECTOR_ELT(tally_R, READ);
95
-  tally->n_cycles = INTEGER(VECTOR_ELT(tally_R, N_CYCLES));
96
-  tally->n_cycles_ref = INTEGER(VECTOR_ELT(tally_R, N_CYCLES_REF));
97 91
   tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT));
98 92
   tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF));
99 93
   tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL));
... ...
@@ -129,7 +123,7 @@ read_total_counts(unsigned char **bytes, int row, int *count_total) {
129 123
 
130 124
 static void
131 125
 read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
132
-                  int *n_cycles, double *read_pos_mean, double *read_pos_var,
126
+                  double *read_pos_mean, double *read_pos_var,
133 127
                   double *mdfne, int **cycle_bins)
134 128
 {
135 129
   int n_cycle_breaks = param.n_cycle_bins + 1;
... ...
@@ -139,9 +133,9 @@ read_cycle_counts(unsigned char **bytes, int row, TallyParam param,
139 133
   if (param.mdfne_buf != NULL) {
140 134
     memset(param.mdfne_buf, 0, sizeof(int) * midpoint);
141 135
   }
142
-  n_cycles[row] = read_int(bytes);
136
+  int n_cycles = read_int(bytes);
143 137
   
144
-  for (int index = 0; index < n_cycles[row]; index++) {
138
+  for (int index = 0; index < n_cycles; index++) {
145 139
     int cycle = abs(read_int(bytes));
146 140
     int count = read_int(bytes);
147 141
     int bin = 0;
... ...
@@ -217,7 +211,7 @@ parse_indels(unsigned char *bytes, int row,
217 211
       SET_STRING_ELT(tally->read_R, row, R_BlankString);
218 212
       SET_STRING_ELT(tally->ref_R, row, seq_R);
219 213
     }
220
-    read_cycle_counts(&bytes, row, param, tally->n_cycles,
214
+    read_cycle_counts(&bytes, row, param,
221 215
                       tally->read_pos_mean, tally->read_pos_var,
222 216
                       tally->mdfne, tally->cycle_bins);
223 217
     /* quality is per-base and so not relevant for indels */
... ...
@@ -227,7 +221,6 @@ parse_indels(unsigned char *bytes, int row,
227 221
     tally->high_quality_ref[row] = NA_INTEGER;
228 222
     tally->high_quality_total[row] = NA_INTEGER;
229 223
     /* no position from which to tabulate the cycles */
230
-    tally->n_cycles_ref[row] = NA_INTEGER;
231 224
     tally->read_pos_mean_ref[row] = NA_REAL;
232 225
     tally->read_pos_var_ref[row] = NA_REAL;
233 226
     tally->mdfne_ref[row] = NA_REAL;
... ...
@@ -282,14 +275,13 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
282 275
                                      tally->count_plus, tally->count_minus,
283 276
                                      tally->count);
284 277
   for (int allele = 0; allele < n_alleles; allele++, row++) {
285
-    tally->n_cycles[row] = 0;
286 278
     for (int b = 0; b < param.n_cycle_bins; b++) {
287 279
       tally->cycle_bins[b][row] = 0;
288 280
     }
289 281
     tally->high_quality[row] = 0;
290 282
     tally->mean_quality[row] = R_NaN;
291 283
     if (tally->count[row] > 0) {
292
-      read_cycle_counts(&bytes, row, param, tally->n_cycles,
284
+      read_cycle_counts(&bytes, row, param,
293 285
                         tally->read_pos_mean, tally->read_pos_var,
294 286
                         tally->mdfne, tally->cycle_bins);
295 287
       read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality,
... ...
@@ -305,7 +297,6 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
305 297
     high_quality_total += tally->high_quality[r];
306 298
   }
307 299
   for (int r = ref_row; r < row; r++) {
308
-    tally->n_cycles_ref[r] = tally->n_cycles[ref_row];
309 300
     tally->count_total[r] = tally->count_total[ref_row];
310 301
     tally->mean_quality_ref[r] = tally->mean_quality[ref_row];
311 302
     tally->high_quality_ref[r] = tally->high_quality[ref_row];
... ...
@@ -322,7 +313,6 @@ parse_alleles(unsigned char *bytes, int row, int ref_row,
322 313
   if (have_ref_row) {
323 314
     /* clear the 'alt' columns for the 'ref' row with NAs */
324 315
     SET_STRING_ELT(tally->read_R, ref_row, NA_STRING);
325
-    tally->n_cycles[ref_row] = NA_INTEGER;
326 316
     tally->mean_quality[ref_row] = NA_REAL;
327 317
     tally->high_quality[ref_row] = NA_REAL;
328 318
     tally->count_plus[ref_row] = NA_INTEGER;