##' Return number of reads in each of gsnap's output files ##' ##' With the --split-output argument, gsnap can output multiple ##' files. If these files are BAM files, this function counts the ##' number of reads in each file. ##' @title Return number of reads in each of gsnap's output files ##' @param aligner_dir Directory containing output bam files from gsnap ##' @param parallelize Boolean indicating if counting should be done in parallel ##' @return named list of read counts per gsnap output file ##' @author Cory Barr ##' @export getGsnapFileCounts <- function(aligner_dir, parallelize=TRUE) { if(!file.exists(aligner_dir)) stop("aligner_dir does not exist") ##have to count gsnap output by chr, or can get overflow errors .getNumUniqueReadsInBam <- function(bam_file) { chr_granges <- getSeqlensFromBAM(bam_file) .getUniqueReadsByChr <- function(index) { sb_param <- ScanBamParam(what=c("qname"), which=chr_granges[index]) res <- scanBam(bam_file, param=sb_param)[[1]] unique(res$qname) } apply_func <- lapply if (parallelize) apply_func <- mclapply res <- unlist(apply_func(seq_len(length(chr_granges)), .getUniqueReadsByChr)) length(unique(res)) } bam_files <- dir(aligner_dir, pattern="\\.bam$", full.names=TRUE) nomapping_index <- grep("\\.no_?mapping\\.bam$", bam_files) mult_mapping_index <- grep("_mult[_\\.]", bam_files) uniq_mapping_index <- seq_len(length(bam_files))[-c(mult_mapping_index, nomapping_index)] nomapping_bam <- bam_files[nomapping_index] mult_mapping_bams <- bam_files[mult_mapping_index] uniq_mapping_bams <- bam_files[uniq_mapping_index] named_names <- bam_files[-nomapping_index] names(named_names) <- named_names gsnap_counts <- lapply(named_names, .getNumUniqueReadsInBam) names(gsnap_counts) <- sub("^.*\\.gsnap\\.merged\\.", "", names(gsnap_counts)) names(gsnap_counts) <- sub("\\.bam$", "", names(gsnap_counts)) ##handling the 'nomapping' file differently, since nothing this (so ##nothing is returned if scanBam is given a ScanBamParam which arg nomapper_ids <- unlist(scanBam(nomapping_bam, param=ScanBamParam(what=c("qname")))[[1]], use.names=FALSE) num_nomappers <- sum(!duplicated(nomapper_ids)) gsnap_counts[['nomapping']] <- num_nomappers names(gsnap_counts) <- paste("gsnap", names(gsnap_counts), sep=".") names(gsnap_counts) <- paste(names(gsnap_counts), "_reads", sep="") return(gsnap_counts) }