Browse code

Refactor bam_tally, so that bam_tally returns a TallyIIT object, which is then summarized via summarizeVariants; this allows computing tallies once and summarizing them in different ways (like maybe get the coverage). The summarizeVariants function yields a VRanges.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@78427 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 14/07/2013 00:43:43
Showing 9 changed files

... ...
@@ -13,8 +13,8 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
13 13
 Version: 1.3.2
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15 15
 Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15), GenomicRanges,
16
-         GenomicFeatures, Biostrings, VariantAnnotation (>= 1.7.17), tools,
17
-         Biobase, BSgenome
16
+         GenomicFeatures, Biostrings, VariantAnnotation (>= 1.7.34), tools,
17
+         Biobase, BSgenome, methods
18 18
 Suggests: RUnit, BSgenome.Dmelanogaster.UCSC.dm3,
19 19
 	  BSgenome.Scerevisiae.UCSC.sacCer3, VariantAnnotation,
20 20
           org.Hs.eg.db, TxDb.Hsapiens.UCSC.hg19.knownGene,
... ...
@@ -6,6 +6,7 @@ importFrom(tools, file_path_as_absolute, file_ext, file_path_sans_ext,
6 6
            list_files_with_exts)
7 7
 importFrom(Biobase, createPackage)
8 8
 import(IRanges)
9
+import(methods)
9 10
 import(GenomicRanges)
10 11
 importFrom(Biostrings, getSeq, readDNAStringSet, DNAStringSet)
11 12
 importFrom(GenomicRanges, genome, seqinfo)
... ...
@@ -16,15 +17,16 @@ importFrom(GenomicFeatures, transcripts, exons)
16 17
 importClassesFrom(rtracklayer, RTLFile, FastaFile)
17 18
 importFrom(rtracklayer, "referenceSequence<-", import, export, FastaFile)
18 19
 importMethodsFrom(rtracklayer, export)
19
-importFrom(VariantAnnotation, readVcf, ScanVcfParam, fixed)
20
-importClassesFrom(VariantAnnotation, "VCF")
20
+importFrom(VariantAnnotation, readVcf, ScanVcfParam, fixed, VRanges, ref, alt,
21
+           "ref<-", "alt<-")
22
+importClassesFrom(VariantAnnotation, "VCF", "VRanges")
21 23
 importFrom(BSgenome, getSeq, providerVersion)
22 24
 
23 25
 ## public API
24 26
 
25 27
 export(bam_tally, GmapGenome, GmapGenomeDirectory, GsnapParam,
26 28
        directory, BamTallyParam, makeGmapGenomePackage, GsnapOutput,
27
-       GmapSnps, GmapSnpDirectory, TP53Genome, TP53Which)
29
+       GmapSnps, GmapSnpDirectory, TP53Genome, TP53Which, summarizeVariants)
28 30
 
29 31
 exportClasses(GmapGenome, GmapGenomeDirectory, GmapSnpDirectory,
30 32
               GsnapOutput, GmapSnps, BamTallyParam, GsnapParam)
... ...
@@ -8,8 +8,6 @@
8 8
 setClass("BamTallyParam",
9 9
          representation(genome = "GmapGenome",
10 10
                         which = "RangesList",
11
-                        cycle_breaks = "integerORNULL",
12
-                        high_base_quality = "integer",
13 11
                         minimum_mapq = "integer",
14 12
                         concordant_only = "logical",
15 13
                         unique_only = "logical",
... ...
@@ -24,17 +22,18 @@ setClass("BamTallyParam",
24 22
 ### Constructor
25 23
 ###
26 24
 
27
-normArgWhich <- function(x) {
25
+normArgWhich <- function(x, genome) {
28 26
   if (is(x, "GenomicRanges"))
29 27
     x <- split(ranges(x), seqnames(x))
30 28
   else if (!is(x, "RangesList"))
31 29
     stop("'which' must be a GenomicRanges or RangesList")
30
+  si <- seqinfo(genome)
31
+  seqinfo(x, new2old = match(seqlevels(si), seqlevels(x))) <-
32
+    merge(si, seqinfo(x))
32 33
   x
33 34
 }
34 35
 
35 36
 BamTallyParam <- function(genome, which = RangesList(),
36
-                          cycle_breaks = NULL,
37
-                          high_base_quality = 0L,
38 37
                           minimum_mapq = 0L,
39 38
                           concordant_only = FALSE, unique_only = FALSE,
40 39
                           primary_only = FALSE, ignore_duplicates = FALSE,
... ...
@@ -42,12 +41,29 @@ BamTallyParam <- function(genome, which = RangesList(),
42 41
                           ignore_query_Ns = FALSE,
43 42
                           indels = FALSE)
44 43
 {
44
+  if (!isSingleNumber(minimum_mapq) || minimum_mapq < 0)
45
+    stop("minimum_mapq must be a single, non-negative, non-NA number")
46
+  if (!isTRUEorFALSE(concordant_only))
47
+    stop("concordant_only must be TRUE or FALSE")
48
+  if (!isTRUEorFALSE(unique_only))
49
+    stop("unique_only must be TRUE or FALSE")
50
+  if (!isTRUEorFALSE(primary_only))
51
+    stop("primary_only must be TRUE or FALSE")
52
+  if (!isTRUEorFALSE(ignore_duplicates))
53
+    stop("ignore_duplicates must be TRUE or FALSE")
54
+  if (!isSingleNumber(min_depth) || minimum_mapq < 0)
55
+    stop("min_depth must be a single, non-negative, non-NA number")
56
+  if (!variant_strand %in% c(0, 1, 2))
57
+    stop("variant_strand must be one of 0, 1, or 2")
58
+  if (!isTRUEorFALSE(ignore_query_Ns))
59
+    stop("ignore_query_Ns must be TRUE or FALSE")
60
+  if (!isTRUEorFALSE(indels))
61
+    stop("indels must be TRUE or FALSE")
45 62
   args <- names(formals(sys.function()))
46 63
   params <- mget(args, environment())
47 64
   params$genome <- as(genome, "GmapGenome")
48
-  params$which <- normArgWhich(which)
49
-  integer_params <- c("high_base_quality", "minimum_mapq", "min_depth",
50
-                      "variant_strand")
65
+  params$which <- normArgWhich(which, params$genome)
66
+  integer_params <- c("minimum_mapq", "min_depth", "variant_strand")
51 67
   params[integer_params] <- lapply(params[integer_params], as.integer)
52 68
   do.call(new, c("BamTallyParam", params))  
53 69
 }
... ...
@@ -3,6 +3,22 @@
3 3
 ### -------------------------------------------------------------------------
4 4
 ###
5 5
 
6
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7
+### Raw tally result
8
+###
9
+
10
+setClass("TallyIIT", representation(ptr = "externalptr", genome = "GmapGenome"))
11
+
12
+TallyIIT <- function(ptr, genome) {
13
+  new("TallyIIT", ptr = ptr, genome = genome)
14
+}
15
+
16
+setMethod("genome", "TallyIIT", function(x) x@genome)
17
+
18
+setMethod("show", "TallyIIT", function(object) {
19
+  cat("Tally IIT object for genome '", genome(genome(object)), "'\n", sep = "")
20
+})
21
+
6 22
 ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7 23
 ### High-level wrapper
8 24
 ###
... ...
@@ -45,40 +61,71 @@ setMethod("bam_tally", "GmapBamReader",
45 61
             }
46 62
             
47 63
             param_list$genome <- NULL
48
-            tally <- do.call(.bam_tally_C, c(list(x), param_list))
49
-            tally_names <- c("seqnames", "pos", "ref", "alt", "ncycles",
50
-                             "ncycles.ref", "count", "count.ref", "count.total",
51
-                             "high.quality", "high.quality.ref",
52
-                             "high.quality.total", "mean.quality",
53
-                             "mean.quality.ref",
54
-                             "count.pos", "count.pos.ref",
55
-                             "count.neg", "count.neg.ref",
56
-                             "read.pos.mean", "read.pos.mean.ref",
57
-                             "read.pos.var", "read.pos.var.ref")
58
-            cycle_breaks <- param_list$cycle_breaks
59
-            if (!is.null(cycle_breaks)) {
60
-              cycle_breaks <- as.integer(cycle_breaks)
61
-              break_names <- paste("cycleCount", head(cycle_breaks, -1),
62
-                                    tail(cycle_breaks, -1), sep = ".")
63
-              tally_names <- c(tally_names, break_names)
64
-            }
65
-            names(tally) <- tally_names
66
-            if (param_list$variant_strand > 0) {
67
-              variant_rows <- !is.na(tally$alt)
68
-              tally <- lapply(tally, `[`, variant_rows)
69
-            }
70
-            gr <- GRanges(tally$seqnames,
71
-                          IRanges(tally$pos,
72
-                                  width = rep.int(1L, length(tally$pos))),
73
-                          strand = Rle("+", length(tally$pos)),
74
-                          location = paste(tally$seqnames, tally$pos,
75
-                            sep = ":"),
76
-                          DataFrame(tail(tally, -2)),
77
-                          seqlengths = seqlengths(genome))
78
-            seqinfo(gr) <- seqinfo(genome)
79
-            gr
64
+            TallyIIT(do.call(.bam_tally_C, c(list(x), param_list)), genome)
80 65
           })
81 66
 
67
+summarizeVariants <- function(x, read_pos_breaks = NULL, high_base_quality = 0L)
68
+{
69
+  tally <- .Call(R_tally_iit_parse, x@ptr,
70
+                 read_pos_breaks,
71
+                 normArgSingleInteger(high_base_quality),
72
+                 NULL)
73
+
74
+  tally_names <- c("seqnames", "pos", "ref", "alt", "n.read.pos",
75
+                   "n.read.pos.ref", "raw.count", "raw.count.ref",
76
+                   "raw.count.total",
77
+                   "high.quality", "high.quality.ref",
78
+                   "high.quality.total", "mean.quality",
79
+                   "mean.quality.ref",
80
+                   "count.pos", "count.pos.ref",
81
+                   "count.neg", "count.neg.ref",
82
+                   "read.pos.mean", "read.pos.mean.ref",
83
+                   "read.pos.var", "read.pos.var.ref")
84
+  if (!is.null(read_pos_breaks)) {
85
+    read_pos_breaks <- as.integer(read_pos_breaks)
86
+    break_names <- paste("readPosCount", head(read_pos_breaks, -1),
87
+                         tail(read_pos_breaks, -1), sep = ".")
88
+    tally_names <- c(tally_names, break_names)
89
+  }
90
+  names(tally) <- tally_names
91
+
92
+  variant_rows <- !is.na(tally$alt)
93
+  if (!all(variant_rows))
94
+    tally <- lapply(tally, `[`, variant_rows)
95
+  
96
+  meta_names <- setdiff(tally_names,
97
+                        c("seqnames", "pos", "ref", "alt", "high.quality",
98
+                          "high.quality.total"))
99
+  genome <- genome(x)
100
+  indel <- nchar(tally$ref) == 0L | nchar(tally$alt) == 0L
101
+  gr <- with(tally,
102
+             VRanges(seqnames,
103
+                     IRanges(pos,
104
+                             width = ifelse(nchar(alt) == 0, nchar(ref), 1L)),
105
+                     ref, alt,
106
+                     ifelse(indel, raw.count.total, high.quality.total),
107
+                     ifelse(indel, raw.count.ref, high.quality.ref),
108
+                     ifelse(indel, raw.count, high.quality),
109
+                     DataFrame(tally[meta_names]),
110
+                     seqlengths = seqlengths(genome)))
111
+  seqinfo(gr) <- seqinfo(genome)
112
+  gr <- normalizeIndelAlleles(gr, genome)
113
+  gr
114
+}
115
+
116
+normalizeIndelAlleles <- function(x, genome) {
117
+  is.indel <- nchar(ref(x)) == 0L | nchar(alt(x)) == 0L
118
+  if (any(is.indel)) {
119
+    indels <- x[is.indel]
120
+    indels <- shift(indels, -1)
121
+    anchor <- getSeq(genome, indels)
122
+    ref(indels) <- paste0(anchor, ref(indels))
123
+    alt(indels) <- paste0(anchor, alt(indels))
124
+    x[is.indel] <- indels
125
+  }
126
+  x
127
+}
128
+
82 129
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
83 130
 ### Low-level wrappers
84 131
 ###
... ...
@@ -98,7 +145,7 @@ normArgTRUEorFALSE <- function(x) {
98 145
 }
99 146
 
100 147
 .bam_tally_C <- function(bamreader, genome_dir = NULL, db = NULL,
101
-                         which = NULL, cycle_breaks = NULL,
148
+                         which = NULL, read_pos_breaks = NULL,
102 149
                          high_base_quality = 0L, alloclength = 200000L,
103 150
                          minimum_mapq = 0L, good_unique_mapq = 35L,
104 151
                          maximum_nhits = 1000000L,
... ...
@@ -123,32 +170,28 @@ normArgTRUEorFALSE <- function(x) {
123 170
     stop("'genome_dir' must be NULL or a single, non-NA string")
124 171
   if (!is.null(db) && !IRanges:::isSingleString(db))
125 172
     stop("'db' must be NULL or a single, non-NA string")
126
-  if (!is.null(cycle_breaks)) {
127
-    cycle_breaks <- as.integer(cycle_breaks)
128
-    if (any(is.na(cycle_breaks)))
129
-      stop("'cycle_breaks' should not contain missing values")
130
-    if (length(cycle_breaks) < 2)
131
-      stop("'cycle_breaks' needs at least two elements to define a bin")
132
-    if (is.unsorted(cycle_breaks))
133
-      stop("'cycle_breaks' must be sorted")
173
+  if (!is.null(read_pos_breaks)) {
174
+    read_pos_breaks <- as.integer(read_pos_breaks)
175
+    if (any(is.na(read_pos_breaks)))
176
+      stop("'read_pos_breaks' should not contain missing values")
177
+    if (length(read_pos_breaks) < 2)
178
+      stop("'read_pos_breaks' needs at least two elements to define a bin")
179
+    if (is.unsorted(read_pos_breaks))
180
+      stop("'read_pos_breaks' must be sorted")
134 181
   }
135
-  iit <- .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which,
136
-               normArgSingleInteger(alloclength),
137
-               normArgSingleInteger(minimum_mapq),
138
-               normArgSingleInteger(good_unique_mapq),
139
-               normArgSingleInteger(maximum_nhits),
140
-               normArgTRUEorFALSE(concordant_only),
141
-               normArgTRUEorFALSE(unique_only),
142
-               normArgTRUEorFALSE(primary_only),
143
-               normArgTRUEorFALSE(ignore_duplicates),
144
-               normArgSingleInteger(min_depth),
145
-               normArgSingleInteger(variant_strand),
146
-               normArgTRUEorFALSE(ignore_query_Ns),
147
-               normArgTRUEorFALSE(indels),
148
-               normArgSingleInteger(blocksize),
149
-               normArgTRUEorFALSE(verbosep))
150
-  .Call(R_tally_iit_parse, iit,
151
-        cycle_breaks,
152
-        normArgSingleInteger(high_base_quality),
153
-        NULL)
182
+  .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which,
183
+        normArgSingleInteger(alloclength),
184
+        normArgSingleInteger(minimum_mapq),
185
+        normArgSingleInteger(good_unique_mapq),
186
+        normArgSingleInteger(maximum_nhits),
187
+        normArgTRUEorFALSE(concordant_only),
188
+        normArgTRUEorFALSE(unique_only),
189
+        normArgTRUEorFALSE(primary_only),
190
+        normArgTRUEorFALSE(ignore_duplicates),
191
+        normArgSingleInteger(min_depth),
192
+        normArgSingleInteger(variant_strand),
193
+        normArgTRUEorFALSE(ignore_query_Ns),
194
+        normArgTRUEorFALSE(indels),
195
+        normArgSingleInteger(blocksize),
196
+        normArgTRUEorFALSE(verbosep))
154 197
 }
... ...
@@ -22,14 +22,14 @@ test_bam_tally_C <- function() {
22 22
   empty <- RangesList(chr2L = IRanges(1e6, 2e6))
23 23
   gr <- bam_tally(bf, BamTallyParam(genome, which = empty))
24 24
   
25
-  gr <- bam_tally(bf, BamTallyParam(genome, cycle_breaks = c(1, 15, 30, 40)))
25
+  gr <- bam_tally(bf, BamTallyParam(genome, read_pos_breaks = c(1, 15, 30, 40)))
26 26
 
27 27
   genome <- GmapGenome("hg19_IGIS21")
28 28
   which <- RangesList("1" = IRanges(1e6, 2e6))
29 29
   bam <- "~/share/data/R1047_LIB6635_SAM636095_L1_NXG2449.analyzed.bam"
30 30
   bf <- Rsamtools::BamFile(bam)
31 31
   gr <- bam_tally(bf, BamTallyParam(genome, which = which, variant_strand = 1L,
32
-                                    cycle_breaks = c(0L, 10L, 75L),
32
+                                    read_pos_breaks = c(0L, 10L, 75L),
33 33
                                     indels = TRUE))
34 34
   
35 35
 }
... ...
@@ -1,7 +1,19 @@
1
+library(gmapR)
2
+
3
+test_bam_tally <- function() {
4
+  param <- BamTallyParam(TP53Genome(), TP53Which(), indels = TRUE)
5
+  bam <- system.file("extdata/H1993.analyzed.bam", 
6
+                     package="LungCancerLines", mustWork=TRUE)
7
+  tallies <- bam_tally(bam, param)
8
+  variants <- summarizeVariants(tallies, NULL, 0L)
9
+}
10
+
1 11
 test_IITs_not_created <- function() {
2 12
   gmapGenome <- GmapGenome(genome="NoGenome", directory = getwd())
3 13
   btp <- BamTallyParam(genome=gmapGenome,
4 14
                        which = RangesList())
5
-  bam_file <- system.file("extdata/test_data_aln/test_data_aln.concordant_uniq.bam", package="gmapR", mustWork=TRUE)
15
+  bam_file <-
16
+    system.file("extdata/test_data_aln/test_data_aln.concordant_uniq.bam",
17
+                package="gmapR", mustWork=TRUE)
6 18
   checkException(bam_tally(x=bam_file, param=btp))
7 19
 }
... ...
@@ -14,7 +14,7 @@
14 14
 }
15 15
 
16 16
 \usage{
17
-  BamTallyParam(genome, which = RangesList(), cycle_breaks = NULL,
17
+  BamTallyParam(genome, which = RangesList(), read_pos_breaks = NULL,
18 18
                 high_base_quality = 0L,
19 19
                 minimum_mapq = 0L,
20 20
                 concordant_only = FALSE, unique_only = FALSE,
... ...
@@ -29,8 +29,8 @@
29 29
     one that limits the tally to that range or set of ranges. By
30 30
     default, the entire genome is processed.
31 31
   }
32
-  \item{cycle_breaks}{The breaks, like those passed to \code{\link{cut}}
33
-    for aggregating the per-cycle counts. If \code{NULL}, no per-cycle
32
+  \item{read_pos_breaks}{The breaks, like those passed to \code{\link{cut}}
33
+    for aggregating the per-read position counts. If \code{NULL}, no per-cycle
34 34
     counts are returned.}
35 35
   \item{high_base_quality}{The minimum mapping quality for a
36 36
     read to be counted as high quality.}
... ...
@@ -36,9 +36,9 @@
36 36
   \item{ref}{The reference base at that position.}
37 37
   \item{alt}{The base for the alternate allele, \code{NA} for the
38 38
     reference allele row.}
39
-  \item{ncycles}{The number of unique cycles at which the alternate allele was
39
+  \item{n.read.pos}{The number of unique cycles at which the alternate allele was
40 40
     observed, \code{NA} for the reference allele row.}
41
-  \item{ncycles.ref}{The number of unique cycles at which the reference
41
+  \item{n.read.pos.ref}{The number of unique cycles at which the reference
42 42
     allele was observed.}
43 43
   \item{count}{The number of reads with the alternate allele,
44 44
     \code{NA} for the reference allele row.}
... ...
@@ -64,7 +64,7 @@
64 64
     allele.}
65 65
 
66 66
   An additional column is present for each bin formed by
67
-  the \code{cycle_breaks} parameter, with the read count for that bin.
67
+  the \code{read_pos_breaks} parameter, with the read count for that bin.
68 68
 }
69 69
 
70 70
 \author{Michael Lawrence}
... ...
@@ -277,7 +277,7 @@ aligned to the TP53 region of the human genome. We'll use this file to
277 277
 demonstrate the use of \software{bam\_tally} through the
278 278
 \software{gmapR} package. The resulting data structure will contain
279 279
 the needed information such as number of alternative alleles, quality
280
-scores, and nucleotide cycle for each allele.
280
+scores, and read position for each allele.
281 281
 
282 282
 <<run_bamtally, eval=FALSE>>=
283 283
 library(gmapR)
... ...
@@ -288,7 +288,7 @@ bam_file <- system.file("extdata/H1993.analyzed.bam",
288 288
 breaks <- c(0L, 15L, 60L, 75L)
289 289
 bqual <- 56L
290 290
 mapq <- 13L
291
-param <- BamTallyParam(genome, cycle_breaks = breaks,
291
+param <- BamTallyParam(genome, read_pos_breaks = breaks,
292 292
                        high_base_quality = bqual,
293 293
                        minimum_mapq = mapq,
294 294
                        concordant_only = FALSE, unique_only = FALSE,