##' Convert a data frame of exon information into input to build a splice site index for Gmap/Gsnap ##' ##' the input data frame needs to have columns named "chrom," ##' "strand," "exonStarts", "exonEnds", and "name." A common way to ##' obtain this is from UCSC via rtracklayer: ##' ##' library(rtracklayer) ##' session <- browserSession() ##' genome(session) <- genome ##' query <- ucscTableQuery(session, "refGene") ##' refGeneTable <- getTable(query) ##' ##' To create a splice-site index for Gmap/Gsnap, take the following ##' output (here called spliceSites) and pass it to buildGmapSpliceSites: ##' ##' spliceSites <- exonsToSpliceSites(refGeneTable) ##' ##' @title Convert a data frame of exon info into input for incorporating known splice sites into Gmap/Gsnap ##' @param exons A data frame containing the fields described in the details section. ##' @return a character vector ##' @author Cory Barr ##' @export exonsToGmapSpliceSites <- function(exons) { if(class(exons) != "data.frame") stop("input is not a data.frame") gene_names <- as.character(exons$name) chroms <- as.character(exons$chrom) strands <- as.character(exons$strand) exon_lcoords <- as.character(exons$exonStarts) exon_rcoords <- as.character(exons$exonEnds) .positions_to_int <- function(x) { as.integer(unlist(strsplit(x, ","))) } .int_to_half_open <- function(x, strand) { if (strand == '+') { paste(x, x + 1L, sep='..') } else if (strand == '-') { paste(x + 1L, x, sep='..') } } exon_lcoords <- lapply(exon_lcoords, .positions_to_int) exon_rcoords <- lapply(exon_rcoords, .positions_to_int) if(any(elementLengths(exon_lcoords) != elementLengths(exon_rcoords))) stop("not the same number of exon starts and stops") single_exon_genes <- which(elementLengths(exon_lcoords) < 2) if(length(single_exon_genes) > 0) { exon_lcoords <- exon_lcoords[-single_exon_genes] exon_rcoords <- exon_rcoords[-single_exon_genes] gene_names <- gene_names[-single_exon_genes] chroms <- chroms[-single_exon_genes] strands <- strands[-single_exon_genes] } rm(single_exon_genes) .positions_to_gsnap_input <- function(positions, gene_name, chr, type, strand) { strand_str <- 'pos' if (strand == '-') strand_str <- 'neg' exon_nums <- seq_len(length(positions)) if (type == 'acceptor') exon_nums <- exon_nums + 1 paste(">", gene_name, ".", strand_str, ".", "exon", exon_nums, " ", chr, ":", .int_to_half_open(positions, strand), " ", type, sep="") } .getGmapInputStrings <- function(i) { lcoords <- unlist(exon_lcoords[i]) rcoords <- unlist(exon_rcoords[i]) strand <- strands[i] gene_name <- gene_names[i] chrom <- chroms[i] num_exons <- length(lcoords) if (strand == '+') { donor_pos <- rcoords[1:(num_exons - 1)] acceptor_pos <- lcoords[2:num_exons] } else if (strand == '-') { donor_pos <- lcoords[num_exons:2] acceptor_pos <- rcoords[(num_exons - 1):1] } else { stop("Strand is not + or -") } ret_val <- list() ret_val$donor_str <- .positions_to_gsnap_input(donor_pos, gene_name, chrom, 'donor', strand) ret_val$acceptor_str <- .positions_to_gsnap_input(acceptor_pos, gene_name, chrom, 'acceptor', strand) return(ret_val) } if(is.loaded("mc_fork", PACKAGE = "multicore")) { apply_func <- mclapply } else { apply_func <- lapply } gmapInputStrings <- apply_func(seq_len(length(exon_lcoords)), .getGmapInputStrings) .pairUpResults <- function(l) { donors <- l$donor_str acceptors <- l$acceptor_str paired <- rep(as.character(NULL), length(donors) * 2) evens <- 2 * seq_len(length(donors)) odds <- evens - 1 paired[odds] <- donors paired[evens] <- acceptors return(paired) } paired_up <- apply_func(gmapInputStrings, .pairUpResults) return(unlist(paired_up)) }