## High-level interface to gsnap ## We probably want a parameters object. The question is whether we ## really want to maintain accessors, etc for so many parameters. ## Choices: ## - Just have people use a list if they want to group parameters ## - Have an S4 class with every parameter; with accessors ## - Have an S4 class with slots for common parameters, and an extra list ## If we chose the latter, how many arguments would we need? ## --part, so people can use multiple nodes ## --batch, so people can take advantage of their memory ## --max_mismatches, default is reasonable, but people will want to change this ## --suboptimal_levels, same reason for max_mismatches ## --use_snps, for SNP tolerance ## --mode, for methylation and RNA editing support ## --nthreads, so people can use their cores ## --novelsplicing, for RNA-seq ## --splicing, known splice sites, for RNA-seq ## --npaths, to limit multi-mappers ## --quiet-if-excessive, to limit multi-mappers ## --nofails, to ignore non-mappers ## --format, to get BAM ## So that is 13 parameters. We definitely need a class to hold ## those. Accessors for 'format' and 'mode' will collide with ## fundamental methods in R. What to do about that? Probably by ## prefixing gsnap on to all of them. ## parallelize_gsnap would also need to take this object, ## overriding the 'part' parameter. .valid_GsnapParam_part <- function(x) { ## validate the part string (i/n) } .valid_GsnapParam_batch <- function(x) { ## one of 0, 1, 2, 3, 4 } .valid_GsnapParam_max_mismatches <- function(x) { ## non-negative number or NULL } .valid_GsnapParam_suboptimal_levels <- function(x) { ## non-negative number or NULL } .valid_GsnapParam_use_snps <- function(x) { ## a file that exists (not sure) } .valid_GsnapParam_mode <- function(x) { ## one of standard, cmet, atoi } .valid_GsnapParam_nthreads <- function(x) { ## positive number } .valid_GsnapParam_novelsplicing <- function(x) { ## TRUE or FALSE } .valid_GsnapParam_splicing <- function(x) { ## a file that exists (not sure) or NULL } .valid_GsnapParam_npaths <- function(x) { ## positive number } .valid_GsnapParam_quiet_if_excessive <- function(x) { ## TRUE or FALSE } .valid_GsnapParam_nofails <- function(x) { ## TRUE or FALSE } .valid_GsnapParam_format <- function(x) { ## sam or NULL (this is a low-level check!) } .valid_GsnapParam_split_output <- function(x) { ## single string or NULL } .valid_GsnapParam_read_group_id <- function(x) { ## single string (any character constraints?) or NULL } .valid_GsnapParam_read_group_name <- function(x) { ## single string or NULL } .valid_gmap_parameter <- function(name, params) { validator <- get(paste(".valid_GsnapParam_", name, sep = "")) if (!is.null(validator)) validator(params) else NULL } .valid_GsnapParam <- function(object) { x <- as.list(object) # converts to low-level parameter list do.call(c, lapply(names(x), .valid_gmap_parameter, x)) } setClassUnion("integerORNULL", c("integer", "NULL")) setClass("GsnapParam", representation(part = "characterORNULL", # used by parallelized_gsnap batch = "character", # weird "0", "1", ... max_mismatches = "integerORNULL", suboptimal_levels = "integer", use_snps = "characterORNULL", mode = "character", nthreads = "integer", novelsplicing = "logical", splicing = "characterORNULL", npaths = "integer", quiet_if_excessive = "logical", nofails = "logical", format = "character", # bam, sam, gsnap split_output = "logical", extra = "list"), validity = .valid_GsnapParam) GsnapParam <- function(unique_only = FALSE, max_mismatches = NULL, suboptimal_levels = 0L, mode = "standard", use_snps = NULL, npaths = if (unique_only) 1L else 100L, quiet_if_excessive = unique_only, nofails = unique_only, format = "bam", split_output = !unique_only, novelsplicing = FALSE, splicing = NULL, nthreads = 1L, part = NULL, batch = "2", ...) { params <- formals(sys.function()) mc <- as.list(match.call(expand.dots = FALSE))[-1L] params[names(mc)] <- mc params$unique_only <- NULL params$extra <- params$... params$... <- NULL do.call(new, c("GsnapParam", params)) } setAs("GsnapParam", "list", function(from) { to <- lapply(slotNames(from), slot, x = from) to$format <- if (to$format == "gsnap") NULL else "sam" # bam or sam to$split_output <- if (to$split_output) "gsnap" else NULL to }) setMethod("as.list", "GsnapParam", function(x) as(x, "list")) setGeneric("gsnap", function(input_a, input_b = NULL, params, ...) standardGeneric("gsnap")) setMethod("gsnap", c("character", "character", "GsnapParam"), function(input_a, input_b, params, output = input_a, ...) { ### TODO: Vectorize over input_a and input_b, with recycling ### TODO: If split_output is TRUE, we use 'output' as the prefix, ### otherwise 'output' is used in .redirect, with format appended. ### Make sure the 'mkdir -p' the dirname of 'output'. params <- as.list(initialize(params, ...)) do.call(.gsnap, c(.input_a = input_a, .input_b = input_b, params)) ### would return a GsnapOutput }) ## Low-level interface to gnsap, with all the params .gsnap <- function(db, part = NULL, input_buffer_size = 1000L, barcode_length = 0L, pc_linefeeds = FALSE, orientation = c("FR", "RF", "FF"), gunzip = FALSE, batch = c("2", "0", "1", "3", "4"), max_mismatches = NULL, query_unk_mismatch = FALSE, genome_unk_mismatch = TRUE, terminal_penalty = 3L, indel_penalty = 2L, indel_endlength = 4L, max_middle_insertions = 9L, max_middle_deletions = 30L, max_end_insertions = 3L, max_end_deletions = 6L, suboptimal_levels = 0L, masking = c("2", "1", "3", "4"), adapter_string = NULL, trim_mismatch_score = -3L, snpsdir = NULL, use_snps = NULL, cmetdir = NULL, atoidir = NULL, mode = c("standard", "cmet", "atoi"), tallydir = NULL, use_tally = NULL, nthreads = NULL, novelsplicing = FALSE, splicing = NULL, novel_doublesplices = FALSE, localsplicedist = 200000L, local_splice_penalty = 0L, distant_splice_penalty = 3L, distant_splice_endlength = 16L, shortend_splice_endlength = 2L, distant_splice_identity = 0.95, pairmax_dna = 1000L, pairmax_rna = 200000L, pairexpect = 200L, quality_protocol = c("sanger", "illumina"), quality_zero_score = 33L, quality_print_shift = 0L, mapq_unique_score = NULL, npaths = 100L, quiet_if_excessive = FALSE, ordered = FALSE, show_rediff = FALSE, clip_overlap = FALSE, print_snps = FALSE, failsonly = FALSE, nofails = FALSE, fails_as_input = FALSE, format = NULL, split_output = NULL, no_sam_headers = FALSE, sam_headers_batch = NULL, read_group_id = NULL, read_group_name = NULL, .input_a = NULL, .input_b = NULL, .redirect = NULL # e.g., > gsnap.sam ) { formals <- formals(sys.function()) problems <- do.call(c, lapply(names(formals), .valid_gmap_parameter, formals)) if (!is.null(problems)) stop("validation failed:\n ", paste(problems, collapse = "\n ")) ### TODO: if input_a is NULL, return a pipe() .system(commandLine("gsnap")) ### TODO: return the bam file path as character vector ### This means interpreting split_output correctly }