R/bam_tally-command.R
b22faf0d
 ### =========================================================================
 ### bam_tally program
 ### -------------------------------------------------------------------------
 ###
 
59c36844
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### Raw tally result
 ###
 
32a9b516
 setClass("TallyIIT", representation(genome = "GmapGenome",
8418591e
                                     bam = "BamFile",
                                     xs = "logical",
685d60e3
                                     read_pos = "logical",
38518ddb
                                     nm = "logical",
32a9b516
                                     codon = "logical"),
          contains = "IIT")
59c36844
 
38518ddb
 TallyIIT <- function(ptr, genome, bam, xs, read_pos, nm, codon) {
8418591e
   new("TallyIIT", ptr = ptr, genome = genome, bam = bam, xs = xs,
38518ddb
       read_pos = read_pos, nm = nm, codon = codon)
59c36844
 }
 
 setMethod("genome", "TallyIIT", function(x) x@genome)
 
76132653
 bamFile <- function(x) x@bam
 
59c36844
 setMethod("show", "TallyIIT", function(object) {
76132653
   cat("Tally IIT object for '", path(bamFile(object)), "' on '",
       genome(genome(object)), "'\n", sep = "")
59c36844
 })
 
fc856d80
 ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
b22faf0d
 ### High-level wrapper
 ###
 
 setGeneric("bam_tally",
16509777
            function(x, param, ...)
b22faf0d
            standardGeneric("bam_tally"),
            signature = "x")
 
 setMethod("bam_tally", "BamFile",
16509777
           function(x, param, ...)
b22faf0d
           {
             x <- GmapBamReader(x)
             callGeneric()
           })
 
 setMethod("bam_tally", "character",
16509777
           function(x, param, ...)
b22faf0d
           {
             x <- BamFile(x)
             callGeneric()
           })
 
 setMethod("bam_tally", "GmapBamReader",
16509777
           function(x, param, ...)
b22faf0d
           {
             param_list <- as.list(param)
             args <- list(...)
             param_list[names(args)] <- args
16509777
             genome <- param_list$genome
46c17e5b
 
             ##verify genome has been created
57c5a705
 
b22faf0d
             param_list$db <- genome(genome)
             param_list$genome_dir <- path(directory(genome))
fc856d80
             if (!.gmapGenomeCreated(genome)) {
46c17e5b
               stop("The GmapGenome object has not yet been created. ",
3fed080f
                    "One solution is to run the GmapGenome constructor ",
                    "with create=TRUE")
46c17e5b
             }
57c5a705
 
b5a258b1
             param_list$genome <- NULL
c78be696
             
76132653
             TallyIIT(do.call(.bam_tally_C, c(list(x), param_list)), genome,
bfeb9ea6
                      as(x, "BamFile"), xs=param_list$xs,
38518ddb
                      read_pos=param_list$read_pos, nm=param_list$nm,
                      codon=!is.null(param_list$exon_iit))
b22faf0d
           })
 
685d60e3
 variantSummary <- function(x, read_pos_breaks = NULL,
                            keep_ref_rows = FALSE, read_length = NA_integer_,
                            high_nm_score = NA_integer_)
59c36844
 {
7c263469
   read_length <- as.integer(read_length)
   if (length(read_length) != 1L) {
     stop("'read_length' must be a single integer")
   }
685d60e3
   high_nm_score <- as.integer(high_nm_score)
   stopifnot(length(high_nm_score) == 1L)
   if (!is.na(high_nm_score) && !x@nm) {
       stop("'high_nm_score is not NA but NM scores were not tallied")
   }
f18ff85e
   if (length(read_pos_breaks) > 0L) {
8418591e
     if (!x@read_pos) {
f18ff85e
       stop("'read_pos_breaks' non-empty, but read positions were not tallied")
8418591e
     }
     read_pos_breaks <- as.integer(read_pos_breaks)
     if (any(is.na(read_pos_breaks)))
         stop("'read_pos_breaks' should not contain missing values")
     if (length(read_pos_breaks) < 2)
         stop("'read_pos_breaks' needs at least two elements to define a bin")
     if (is.unsorted(read_pos_breaks))
         stop("'read_pos_breaks' must be sorted")
   }
 
59c36844
   tally <- .Call(R_tally_iit_parse, x@ptr,
                  read_pos_breaks,
685d60e3
                  NULL, read_length, x@xs, high_nm_score)
7c263469
   
30f3bfda
   tally_names <- c("seqnames", "pos", "ref", "alt",
96cf9938
                    "n.read.pos", "n.read.pos.ref",
5659f096
                    "count", "count.ref", "raw.count.total", "count.total",
ae7cf39f
                    "count.plus", "count.plus.ref",
                    "count.minus", "count.minus.ref",
38518ddb
                    "count.del.plus", "count.del.minus",
ae7cf39f
                    "read.pos.mean", "read.pos.mean.ref",
                    "read.pos.var", "read.pos.var.ref",
685d60e3
                    "mdfne", "mdfne.ref", "codon.strand",
8418591e
                    "count.xs.plus", "count.xs.plus.ref",
685d60e3
                    "count.xs.minus", "count.xs.minus.ref",
                    "count.high.nm", "count.high.nm.ref")
 
084199c0
   break_names <- character()
f3db568b
   if (length(read_pos_breaks) > 0L) {
59c36844
     read_pos_breaks <- as.integer(read_pos_breaks)
     break_names <- paste("readPosCount", head(read_pos_breaks, -1),
                          tail(read_pos_breaks, -1), sep = ".")
     tally_names <- c(tally_names, break_names)
   }
   names(tally) <- tally_names
38518ddb
   tally$codon.strand <- if (x@codon) {
       structure(tally$codon.strand, class="factor", levels=levels(strand()))
   }
   tally <- Filter(Negate(is.null), tally)
59c36844
 
7032aa2e
   if (!keep_ref_rows) {
     variant_rows <- !is.na(tally$alt)
     if (!all(variant_rows))
       tally <- lapply(tally, `[`, variant_rows)
   }
   
685d60e3
   meta_names <- setdiff(names(tally),
                         c("seqnames", "pos", "ref", "alt", "count",
                           "count.ref", "count.total"))
59c36844
   genome <- genome(x)
   indel <- nchar(tally$ref) == 0L | nchar(tally$alt) == 0L
084199c0
   metacols <- DataFrame(tally[meta_names])
38518ddb
   mcols(metacols) <- variantSummaryColumnDescriptions(read_pos_breaks, x@xs,
                                                       high_nm_score, x@codon)
c78be696
 
59c36844
   gr <- with(tally,
              VRanges(seqnames,
                      IRanges(pos,
                              width = ifelse(nchar(alt) == 0, nchar(ref), 1L)),
                      ref, alt,
685d60e3
                      count.total, count.ref, count,
59c36844
                      seqlengths = seqlengths(genome)))
084199c0
   mcols(gr) <- metacols
9138f8cd
   checkTallyConsistency(gr)
ed089f79
   ## important to preserve seqlevel ordering compatible with 'genome'
ab8835c4
   seqinfo(gr) <- merge(seqinfo(genome), seqinfo(bamFile(x)))
   gr <- keepSeqlevels(gr, intersect(seqlevels(gr), seqlevels(bamFile(x))))
59c36844
   gr <- normalizeIndelAlleles(gr, genome)
   gr
 }
 
9138f8cd
 checkTallyConsistency <- function(x) {
   with(mcols(x), {
685d60e3
     stopifnot(all(count.plus + count.minus == altDepth(x), na.rm=TRUE))
     stopifnot(all(count.plus.ref + count.minus.ref == refDepth(x)))
9138f8cd
   })
 }
 
59c36844
 normalizeIndelAlleles <- function(x, genome) {
   is.indel <- nchar(ref(x)) == 0L | nchar(alt(x)) == 0L
   if (any(is.indel)) {
     indels <- x[is.indel]
4d5cc806
     flanks <- flank(indels, 1)
     anchor <- getSeq(genome, flanks)
     ref(x)[is.indel] <- paste0(anchor, ref(indels))
     alt(x)[is.indel] <- paste0(anchor, alt(indels))
63859322
     ranges(x)[is.indel] <- resize(ranges(flanks), nchar(ref(x)[is.indel]))
59c36844
   }
   x
 }
 
ae7cf39f
 guessReadLengthFromBam <- function(x, n=100L) {
6a7e619d
   ga <- readGAlignments(BamFile(x, yieldSize=n))
ae7cf39f
   readlen <- unique(qwidth(ga))
   if (length(readlen) != 1L)
     NA
   else readlen
 }
 
b22faf0d
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### Low-level wrappers
 ###
 
 normArgSingleInteger <- function(x) {
   name <- deparse(substitute(x))
   x <- as.integer(x)
4d5cc806
   if (!isSingleInteger(x))
b22faf0d
     stop("'", name, "' should be a single, non-NA integer")
   x
 }
 normArgTRUEorFALSE <- function(x) {
57c5a705
   name <- deparse(substitute(x))
b22faf0d
   if (!isTRUEorFALSE(x))
     stop("'", name, "' should be TRUE or FALSE")
   x
 }
 
085e7d85
 normArgSingleCharacterOrNULL <- function(x) {
c78be696
   name <- deparse(substitute(x))
085e7d85
   if (!is.null(x) && (!is(x, "character") || length(x) != 1))
c78be696
     stop("'", name, "' should be a single character value")
   x
 }
 
b22faf0d
 .bam_tally_C <- function(bamreader, genome_dir = NULL, db = NULL,
8418591e
                          which = NULL,
4d5cc806
                          high_base_quality = 0L, desired_read_group = NULL,
                          alloclength = 200000L,
b22faf0d
                          minimum_mapq = 0L, good_unique_mapq = 35L,
                          maximum_nhits = 1000000L,
                          concordant_only = FALSE, unique_only = FALSE,
df903e03
                          primary_only = FALSE, ignore_duplicates = FALSE,
b22faf0d
                          min_depth = 0L, variant_strand = 0L,
                          ignore_query_Ns = FALSE,
                          indels = FALSE,
17db5e75
                          blocksize = 1000L, verbosep = FALSE,
f5e63d36
                          include_soft_clips = 0L,
8418591e
                          exon_iit = NULL, xs = FALSE, read_pos = FALSE,
685d60e3
                          min_base_quality = 0L, noncovered = FALSE, nm = FALSE)
b22faf0d
 {
   if (!is(bamreader, "GmapBamReader"))
     stop("'bamreader' must be a GmapBamReader")
f3db568b
   if (length(which) > 0L) {
     which <- list(as.character(seqnames(which)), start(which), end(which))
   } else which <- NULL
4d5cc806
   if (!is.null(genome_dir) && !isSingleString(genome_dir))
b22faf0d
     stop("'genome_dir' must be NULL or a single, non-NA string")
4d5cc806
   if (!is.null(db) && !isSingleString(db))
b22faf0d
     stop("'db' must be NULL or a single, non-NA string")
4d5cc806
   if (!is.null(desired_read_group) && !isSingleString(desired_read_group))
     stop("'desired_read_group' must be NULL or a single, non-NA string")
9f0e3a7f
   .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which,
4d5cc806
         desired_read_group,
59c36844
         normArgSingleInteger(alloclength),
         normArgSingleInteger(minimum_mapq),
         normArgSingleInteger(good_unique_mapq),
         normArgSingleInteger(maximum_nhits),
         normArgTRUEorFALSE(concordant_only),
         normArgTRUEorFALSE(unique_only),
         normArgTRUEorFALSE(primary_only),
         normArgTRUEorFALSE(ignore_duplicates),
         normArgSingleInteger(min_depth),
         normArgSingleInteger(variant_strand),
         normArgTRUEorFALSE(ignore_query_Ns),
         normArgTRUEorFALSE(indels),
         normArgSingleInteger(blocksize),
17db5e75
         normArgTRUEorFALSE(verbosep),
6d5d75f2
         normArgSingleInteger(include_soft_clips),
8418591e
         normArgSingleCharacterOrNULL(exon_iit),
         normArgTRUEorFALSE(xs),
         normArgTRUEorFALSE(read_pos),
         normArgSingleInteger(min_base_quality),
685d60e3
         normArgTRUEorFALSE(noncovered),
         normArgTRUEorFALSE(nm))
b22faf0d
 }
084199c0
 
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### Column metadata
 ###
 
38518ddb
 variantSummaryColumnDescriptions <-
     function(read_pos_breaks, xs, high_nm, codon)
 {
084199c0
   desc <- c(
96cf9938
     n.read.pos = "Number of unique read positions for the ALT",
     n.read.pos.ref = "Number of unique read positions for the REF",
5659f096
     raw.count.total = "Total depth, before quality filtering",  
38518ddb
     count.plus = "Positive strand ALT count",
     count.plus.ref = "Positive strand REF count",
     count.minus = "Negative strand ALT count",
     count.minus.ref = "Negative strand REF count",
     count.del.plus = "Plus strand deletion count",
     count.del.minus = "Count strand deletion count",
ae7cf39f
     read.pos.mean = "Average read position for the ALT",
c78be696
     read.pos.mean.ref = "Average read position for the ALT",
ae7cf39f
     read.pos.var = "Variance in read position for the ALT",
     read.pos.var.ref = "Variance in read position for the REF",
7c263469
     mdfne = "Median distance from nearest end of read for the ALT",
f5e63d36
     mdfne.ref = "Median distance from nearest end of read for the REF",
38518ddb
       if (codon) {
           c(codon.strand = "Strand of transcription for the codon")
       },
       if (xs) {
           c(count.xs.plus = "Plus strand XS counts",
             count.xs.plus.ref = "Plus strand reference XS counts",
             count.xs.minus = "Minus strand XS counts",
             count.xs.minus.ref = "Minus strand reference XS counts")
       },
       count.high.nm = paste("Count of reads with >=", high_nm,
           "NM score for the ALT"),
       count.high.nm.ref = paste("Count of reads with >=", high_nm,
           "NM score for the REF"))
084199c0
   if (length(read_pos_breaks) > 0L) {
     break_desc <- paste0("Raw ALT count in read position range [",
                          head(read_pos_breaks, -1), ",",
                          tail(read_pos_breaks, -1), ")")
     desc <- c(desc, break_desc)
   }
   DataFrame(Description = desc)
 }