Browse code

bam_tally annotates columns with Description field for inclusion in VCF header

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@82008 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 22/10/2013 22:49:06
Showing 3 changed files

... ...
@@ -10,13 +10,13 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
10 10
     to work with GMAP and GSNAP from within R. In addition, it provides 
11 11
     methods to tally alignment results on a per-nucleotide basis using 
12 12
     the bam_tally tool.
13
-Version: 1.5.0
13
+Version: 1.5.1
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15
-Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15), GenomicRanges,
16
-         GenomicFeatures, Biostrings, VariantAnnotation (>= 1.7.34), tools,
17
-         Biobase, BSgenome, methods
15
+Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15),
16
+         GenomicFeatures, Biostrings, VariantAnnotation (>= 1.9.4), tools,
17
+         Biobase, BSgenome
18 18
 Suggests: RUnit, BSgenome.Dmelanogaster.UCSC.dm3,
19
-	  BSgenome.Scerevisiae.UCSC.sacCer3, VariantAnnotation,
19
+	  BSgenome.Scerevisiae.UCSC.sacCer3,
20 20
           org.Hs.eg.db, TxDb.Hsapiens.UCSC.hg19.knownGene,
21 21
           BSgenome.Hsapiens.UCSC.hg19, LungCancerLines
22 22
 License: Artistic-2.0
... ...
@@ -1,3 +1,44 @@
1
+CHANGES IN VERSION 1.4.0
2
+-----------------------
3
+
4
+NEW FEATURES
5
+
6
+    o Add desired_read_group to BamTallyParam; will limit tallies to
7
+      that specific read group (useful for multi-amplicon sequencing,
8
+      like Fluidigm)
9
+
10
+    o Add keep_ref_rows argument to variantSummary() for keeping rows
11
+      for positions where no alt is detected (the rows where ref == alt).
12
+
13
+    o gsnap() will now output a GsnapOutputList when there are
14
+      multiple input files
15
+
16
+    o Support 'terminal_threshold' and 'gmap_mode' parameters in
17
+      GsnapParam, and use different defaults for DNA vs. RNA. This
18
+      means a big improvement in alignment quality for DNA.
19
+
20
+    o GmapGenome now accepts a direct path to the genome as its first argument
21
+    
22
+USER-VISIBLE CHANGES
23
+
24
+    o Renamed summarizeVariants to variantSummary
25
+
26
+    o The 'which' in GsnapParam is now a GenomicRanges instead of a RangesList
27
+
28
+    o Refactor bam_tally, so that bam_tally returns a TallyIIT object,
29
+      which is then summarized via summarizeVariants; this allows computing
30
+      tallies once and summarizing them in different ways (like maybe get
31
+      the coverage). The summarizeVariants function yields a VRanges.
32
+
33
+BUG FIXES
34
+
35
+    o fix minimum quality cutoff check to >=, instead of >
36
+
37
+    o fix asBam,GsnapOutput for when unique_only is TRUE
38
+
39
+    o package created by makeGmapGenomePackage now have a GmapGenome with
40
+      the correct name
41
+
1 42
 CHANGES IN VERSION 1.2.0
2 43
 -----------------------
3 44
 
... ...
@@ -82,6 +82,7 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
82 82
                    "count.neg", "count.neg.ref",
83 83
                    "read.pos.mean", "read.pos.mean.ref",
84 84
                    "read.pos.var", "read.pos.var.ref")
85
+  break_names <- character()
85 86
   if (length(read_pos_breaks) > 0L) {
86 87
     read_pos_breaks <- as.integer(read_pos_breaks)
87 88
     break_names <- paste("readPosCount", head(read_pos_breaks, -1),
... ...
@@ -101,6 +102,8 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
101 102
                           "high.quality.ref", "high.quality.total"))
102 103
   genome <- genome(x)
103 104
   indel <- nchar(tally$ref) == 0L | nchar(tally$alt) == 0L
105
+  metacols <- DataFrame(tally[meta_names])
106
+  mcols(metacols) <- variantSummaryColumnDescriptions(read_pos_breaks)
104 107
   gr <- with(tally,
105 108
              VRanges(seqnames,
106 109
                      IRanges(pos,
... ...
@@ -109,8 +112,8 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L,
109 112
                      ifelse(indel, raw.count.total, high.quality.total),
110 113
                      ifelse(indel, raw.count.ref, high.quality.ref),
111 114
                      ifelse(indel, raw.count, high.quality),
112
-                     DataFrame(tally[meta_names]),
113 115
                      seqlengths = seqlengths(genome)))
116
+  mcols(gr) <- metacols
114 117
   seqinfo(gr) <- seqinfo(genome)
115 118
   gr <- normalizeIndelAlleles(gr, genome)
116 119
   gr
... ...
@@ -197,3 +200,33 @@ normArgTRUEorFALSE <- function(x) {
197 200
         normArgSingleInteger(blocksize),
198 201
         normArgTRUEorFALSE(verbosep))
199 202
 }
203
+
204
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
205
+### Column metadata
206
+###
207
+
208
+variantSummaryColumnDescriptions <- function(read_pos_breaks) {
209
+  desc <- c(
210
+    n.read.pos = "Number of unique read positions for the ALT",
211
+    n.read.pos.ref = "Number of unique read positions for the REF",
212
+    raw.count = "Raw ALT count",
213
+    raw.count.ref = "Raw REF count",
214
+    raw.count.total = "Raw total count",
215
+    mean.quality = "Average ALT base quality",
216
+    mean.quality.ref = "Average REF base quality",
217
+    count.pos = "Raw positive strand ALT count",
218
+    count.pos.ref = "Raw positive strand REF count",
219
+    count.neg = "Raw negative strand ALT count",
220
+    count.neg.ref = "Raw negative strand REF count",
221
+    read.pos.mean = "Average read position for the ALT",
222
+    read.pos.mean.ref = "Average read position for the ALT",
223
+    read.pos.var = "Variance in read position for the ALT",
224
+    read.pos.var.ref = "Variance in read position for the REF")
225
+  if (length(read_pos_breaks) > 0L) {
226
+    break_desc <- paste0("Raw ALT count in read position range [",
227
+                         head(read_pos_breaks, -1), ",",
228
+                         tail(read_pos_breaks, -1), ")")
229
+    desc <- c(desc, break_desc)
230
+  }
231
+  DataFrame(Description = desc)
232
+}