recoup <- function( input, design=NULL, region=c("genebody","tss","tes","utr3","custom"), type=c("chipseq","rnaseq"), signal=c("coverage","rpm"), genome=c("hg18","hg19","hg38","mm9","mm10","rn5","rn6","dm3","dm6", "danrer7","danrer10","pantro4","pantro5","susscr3","susscr11", "ecucab2","tair10"), version="auto", refdb=c("ensembl","ucsc","refseq"), flank=c(2000,2000), onFlankFail=c("drop","trim"), fraction=1, orderBy=list( what=c("none","suma","sumn","maxa","maxn","avga","avgn","hcn"), order=c("descending","ascending"), custom=NULL ), #orderBy=getDefaultListArgs("orderBy"), binParams=list( flankBinSize=0, regionBinSize=0, sumStat=c("mean","median"), interpolation=c("auto","spline","linear","neighborhood"), binType=c("variable","fixed"), forceHeatmapBinning=TRUE, forcedBinSize=c(50,200), chunking=FALSE#, #seed=42 ), #binParams=getDefaultListArgs("binParams"), selector=NULL, #selector=getDefaultListArgs("selector"), preprocessParams=list( fragLen=NA, cleanLevel=c(0,1,2,3), normalize=c("none","linear","downsample","sampleto"), sampleTo=1e+6, spliceAction=c("split","keep","remove"), spliceRemoveQ=0.75, #bedGenome=ifelse(genome %in% c("hg18","hg19","hg38","mm9","mm10","rn5", # "dm3","danrer7","pantro4","susscr3"),genome,NA), bedGenome=NA#, #seed=42 ), #preprocessParams=getDefaultListArgs("preprocessParams"), plotParams=list( plot=TRUE, profile=TRUE, heatmap=TRUE, correlation=TRUE, signalScale=c("natural","log2"), heatmapScale=c("common","each"), heatmapFactor=1, corrScale=c("normalized","each"), sumStat=c("mean","median"), smooth=TRUE, corrSmoothPar=ifelse(is.null(design),0.1,0.5), singleFacet=c("none","wrap","grid"), multiFacet=c("wrap","grid"), singleFacetDirection=c("horizontal","vertical"), conf=TRUE, device=c("x11","png","jpg","tiff","bmp","pdf","ps"), outputDir=".", outputBase=NULL ), #plotParams=getDefaultListArgs("plotParams",design), saveParams=list( ranges=TRUE, coverage=TRUE, profile=TRUE, profilePlot=TRUE, heatmapPlot=TRUE, correlationPlot=TRUE ), #saveParams=getDefaultListArgs("saveParams"), kmParams=list( k=0, # Do not perform kmeans nstart=20, algorithm=c("Hartigan-Wong","Lloyd","Forgy","MacQueen"), iterMax=20, reference=NULL#, #seed=42 ), #kmParams=getDefaultListArgs("kmParams"), strandedParams=list( strand=NULL, ignoreStrand=TRUE ), #strandedParams=getDefaultListArgs("strandedParams"), ggplotParams=list( title=element_text(size=12), axis.title.x=element_text(size=10,face="bold"), axis.title.y=element_text(size=10,face="bold"), axis.text.x=element_text(size=9,face="bold"), axis.text.y=element_text(size=10,face="bold"), strip.text.x=element_text(size=10,face="bold"), strip.text.y=element_text(size=10,face="bold"), legend.position="bottom", panel.spacing=grid::unit(1,"lines") ), #ggplotParams=getDefaultListArgs("ggplotParams"), complexHeatmapParams=list( main=list( cluster_rows=ifelse(length(grep("hc",orderBy$what))>0,TRUE,FALSE), cluster_columns=FALSE, column_title_gp=grid::gpar(fontsize=10,font=2), show_row_names=FALSE, show_column_names=FALSE, heatmap_legend_param=list( color_bar="continuous" ) ), group=list( cluster_rows=ifelse(length(grep("hc",orderBy$what))>0,TRUE,FALSE), cluster_columns=FALSE, column_title_gp=grid::gpar(fontsize=10,font=2), show_row_names=FALSE, show_column_names=FALSE, row_title_gp=grid::gpar(fontsize=8,font=2), gap=unit(5,"mm"), heatmap_legend_param=list( color_bar="continuous" ) ) ), #complexHeatmapParams=getDefaultListArgs("complexHeatmapParams"), bamParams=NULL, onTheFly=FALSE, # Directly from BAM w/o Ranges storing, also N/A with BED, #localDb=file.path(path.expand("~"),".recoup"), localDb=file.path(system.file(package="recoup"),"annotation.sqlite"), rc=NULL ) { # Simple check to throw error if user us using the old databasing system if (any(dir.exists(file.path(dirname(localDb),refdb)))) stop("Possible old recoup database system detected. Please rebuild ", "using the new system or download a pre-built database. See also ", "the vignettes.") ############################################################################ # Begin of simple parameter checking for a new or a restored object if (is.list(input) && !is.null(input$data)) { # Refeeding recoup object message("Detected a previous recoup output as input. Any existing ", "data requiring\nlengthy calculations (short reads, coverages and ", "profile matrices will be\nreused depending on other parameters. ", "If you want a complete recalculation,\neither input a fresh list ", "of arguments or remove any of the above with the\nremoveData ", "function.\n") prevCallParams <- input$callopts refRanges <- input$refRanges input <- input$data } else prevCallParams <- refRanges <- NULL if (!is.list(input) && file.exists(input)) input <- readConfig(input) checkInput(input) if (is.null(names(input))) names(input) <- vapply(input,function(x) return(x$id),character(1)) # Check if there are any mispelled or invalid parameters and throw a warning checkMainArgs(as.list(match.call())) # Check rest of arguments region <- tolower(region[1]) refdb <- tolower(refdb[1]) type <- tolower(type[1]) onFlankFail <- tolower(onFlankFail[1]) signal <- tolower(signal[1]) if (!is.null(design) && !is.data.frame(design)) checkFileArgs("design",design) if (!is.data.frame(genome) && !is.list(genome) && file.exists(genome)) checkFileArgs("genome",genome) else if (is.character(genome)) { if (is.character(localDb) && file.exists(localDb)) { if (!.userOrg(genome,localDb)) checkTextArgs("genome",genome,getSupportedOrganisms(), multiarg=FALSE) } } if (is.character(localDb) && file.exists(localDb)) { if (!.userRefdb(refdb,localDb)) checkTextArgs("refdb",refdb,getSupportedRefDbs(),multiarg=FALSE) } checkTextArgs("type",type,c("chipseq","rnaseq"),multiarg=FALSE) checkTextArgs("onFlankFail",onFlankFail,c("drop","trim"),multiarg=FALSE) checkTextArgs("signal",signal,c("coverage","rpm"),multiarg=FALSE) checkNumArgs("fraction",fraction,"numeric",c(0,1),"botheq") if (any(flank<0)) stop("The minimum flanking allowed is 0 bp") if (any(flank>50000)) stop("The maximum flanking allowed is 50000 bp") # Check the version argument version <- version[1] if (is.character(version)) { version <- tolower(version) checkTextArgs("version",version,c("auto"),multiarg=FALSE) } else checkNumArgs("version",version,"numeric") # If type is rnaseq, only genebody plots are valid if (type=="rnaseq" && region!="genebody") { warning("When type is \"rnaseq\", plots can be created only on ", "genebodies! Switching to genebody regions...",immediate.=TRUE) region <- "genebody" } # If type is rnaseq, read extension is illegal because of potential splicing if (type=="rnaseq" && !is.null(preprocessParams$fragLen) && !is.na(preprocessParams$fragLen)) { warning("When type is \"rnaseq\", read extension/reduction should not ", "happen because of potential splicing! Ignoring...",immediate.=TRUE) preprocessParams$fragLen <- NA } # If type is rnaseq, the only allowed genomes are the ones supported by # recoup for the time being. In the future, a custom RNA-Seq genome may be # constructed from a GFF or like... if (type=="rnaseq" && is.character(localDb) && file.exists(localDb) && !.userOrg(genome,localDb) && !(genome %in% getSupportedOrganisms())) stop("When type is \"rnaseq\", only the supported or user imported ", "genomes are allowed!") # annotation must be a list to be fed to buildCustomAnnotation if (is.list(genome) && !is.data.frame(genome)) { # members are checked by buildCustomAnnotation if required # We only need to check that the gtfFile exists here if (!("gtf" %in% names(genome))) stop("A gtf field must be provided with an existing GTF file ", "when providing a list with custom annotation elements!") if ("gtf" %in% names(genome) && is.character(genome$gtf) && !file.exists(genome$gtf)) stop("An existing GTF file must be provided when providing a ", "list with custom annotation elements!") } # Check what is going on with genome, first check if file, then localDb if (!is.data.frame(genome) && !is.list(genome) && is.character(genome) && !file.exists(genome) && is.character(localDb) && file.exists(localDb)) { if (!.userOrg(genome,localDb)) checkTextArgs("genome",genome,c("hg18","hg19","hg38","mm9","mm10", "rn5","rn6","dm3","dm6","danrer7","danrer10","pantro4", "pantro5","susscr3","susscr11","ecucab2","tair10"), multiarg=FALSE) if (!.userRefdb(refdb,localDb)) checkTextArgs("refdb",refdb,c("ensembl","ucsc","refseq"), multiarg=FALSE) } # and list arguments orderByDefault <- getDefaultListArgs("orderBy") binParamsDefault <- getDefaultListArgs("binParams") selectorDefault <- getDefaultListArgs("selector") preprocessParamsDefault <- getDefaultListArgs("preprocessParams", genome=genome) plotParamsDefault <- getDefaultListArgs("plotParams",design=design) saveParamsDefault <- getDefaultListArgs("saveParams") kmParamsDefault <- getDefaultListArgs("kmParams") strandedParamsDefault <- getDefaultListArgs("strandedParams") orderBy <- setArg(orderByDefault,orderBy) binParams <- setArg(binParamsDefault,binParams) if (!is.null(selector)) selector <- setArg(selectorDefault,selector) preprocessParams <- setArg(preprocessParamsDefault,preprocessParams) plotParams <- setArg(plotParamsDefault,plotParams) saveParams <- setArg(saveParamsDefault,saveParams) kmParams <- setArg(kmParamsDefault,kmParams) strandedParams <- setArg(strandedParamsDefault,strandedParams) orderBy <- validateListArgs("orderBy",orderBy) binParams <- validateListArgs("binParams",binParams) if (!is.null(selector)) selector <- validateListArgs("selector",selector) preprocessParams <- validateListArgs("preprocessParams",preprocessParams) plotParams <- validateListArgs("plotParams",plotParams) saveParams <- validateListArgs("saveParams",saveParams) kmParams <- validateListArgs("kmParams",kmParams) strandedParams <- validateListArgs("strandedParams",strandedParams) if (is.null(plotParams$outputBase)) plotParams$outputBase <- paste(vapply(input,function(x) return(x$id), character(1)),collapse="-") bbb <- vapply(input,function(x) return(tolower(x$format)=="bed"),logical(1)) if (any(bbb) && !(preprocessParams$bedGenome %in% c("hg18","hg19","hg38", "mm9","mm10","rn5","dm3","danrer7","pantro4","susscr3"))) stop("When short read files are in BED format, either the genome ", "parameter should be one\nof the supported organisms, or ", "preprocessParams$bedGenome must be specified as one of them.") # End of simple parameter checking for a new or a restored object ############################################################################ ############################################################################ # Begin of complex parameter storage procedure and parameter recall from a # previous call # Store all parameters to an obect for later reference to a next call, after # checking if something has changed (e.g. using setr)... thisCall <- as.list(match.call())[-1] if (!is.null(prevCallParams)) { callParams <- list( region=if (is.null(thisCall$region)) prevCallParams$region else region, type=if (is.null(thisCall$type)) prevCallParams$type else type, onFlankFail=if (is.null(thisCall$onFlankFail)) prevCallParams$onFlankFail else onFlankFail, signal=if (is.null(thisCall$signal)) prevCallParams$signal else signal, genome=if (is.null(thisCall$genome)) prevCallParams$genome else genome, refdb=if (is.null(thisCall$refdb)) prevCallParams$refdb else refdb, version=if (is.null(thisCall$version)) prevCallParams$version else version, flank=if (is.null(thisCall$flank)) prevCallParams$flank else flank, fraction=if (is.null(thisCall$fraction)) prevCallParams$fraction else fraction, orderBy=if (is.null(thisCall$orderBy)) prevCallParams$orderBy else orderBy, binParams=if (is.null(thisCall$binParams)) prevCallParams$binParams else binParams, selector=selector, # selector is a special case preprocessParams=if (is.null(thisCall$preprocessParams)) prevCallParams$preprocessParams else preprocessParams, plotParams=if (is.null(thisCall$plotParams)) prevCallParams$plotParams else plotParams, saveParams=if (is.null(thisCall$saveParams)) prevCallParams$saveParams else saveParams, kmParams=if (is.null(thisCall$kmParams)) prevCallParams$kmParams else kmParams, strandedParams=if (is.null(thisCall$strandedParams)) prevCallParams$strandedParams else strandedParams, ggplotParams=if (is.null(thisCall$ggplotParams)) prevCallParams$ggplotParams else ggplotParams, complexHeatmapParams=if (is.null(thisCall$complexHeatmapParams)) prevCallParams$complexHeatmapParams else complexHeatmapParams, #bamParams=if (is.null(thisCall$bamParams), # prevCallParams$bamParams,bamParams), ## Unused onTheFly=if (is.null(thisCall$onTheFly)) prevCallParams$onTheFly else onTheFly, localDb=if (is.null(thisCall$localDb)) prevCallParams$localDb else localDb, rc=if (is.null(thisCall$rc)) prevCallParams$rc else rc, customIsBase=NULL # Additional placeholder ) } else { callParams <- list( region=region, type=type, onFlankFail=onFlankFail, signal=signal, genome=genome, version=version, refdb=refdb, flank=flank, fraction=fraction, orderBy=orderBy, binParams=binParams, selector=selector, preprocessParams=preprocessParams, plotParams=plotParams, saveParams=saveParams, kmParams=kmParams, strandedParams=strandedParams, ggplotParams=ggplotParams, complexHeatmapParams=complexHeatmapParams, bamParams=bamParams, onTheFly=onTheFly, localDb=localDb, rc=rc, customIsBase=NULL # Additional placeholder ) } # ...and check if there has been a previous call and decide on big # recalculations... input <- decideChanges(input,callParams,prevCallParams) # Redefine all final arguments for this call region <- callParams$region type <- callParams$type onFlankFail <- callParams$onFlankFail signal <- callParams$signal genome <- callParams$genome version <- callParams$version refdb <- callParams$refdb flank <- callParams$flank fraction <- callParams$fraction orderBy <- callParams$orderBy binParams <- callParams$binParams selector <- callParams$selector preprocessParams <- callParams$preprocessParams plotParams <- callParams$plotParams saveParams <- callParams$saveParams kmParams <- callParams$kmParams strandedParams <- callParams$strandedParams ggplotParams <- callParams$ggplotParams complexHeatmapParams <- callParams$complexHeatmapParams bamParams <- callParams$bamParams onTheFly <- callParams$onTheFly localDb <- callParams$localDb rc <- callParams$rc # End of complex parameter storage procedure and parameter recall from a # previous call ############################################################################ # Continue with actual work # If annotation already here and not removed from the input object because # of change in region, flank or type, no need to reload it and thus, recoup # object become even more mobile and portable as localDb is not needed. if (!is.null(refRanges)) { genomeRanges <- refRanges$genomeRanges helperRanges <- refRanges$helperRanges } else { # Annotation case #1: provided as a BED-like data frame with loci if (is.data.frame(genome)) { if (type=="rnaseq") stop("When type=\"rnaseq\", only the usage of a supported or ", "user-imported organism from GTF file is allowed!") rownames(genome) <- as.character(genome[,4]) genomeRanges <- makeGRangesFromDataFrame( df=genome, keep.extra.columns=TRUE ) helperRanges <- NULL } # Annotation case #2: provided strictly as a BED-like file with loci else if (!is.list(genome) && !is.data.frame(genome) && is.character(genome)) { if (file.exists(genome)) { if (type=="rnaseq") stop("When type=\"rnaseq\", only the usage of a supported ", "or user-imported organism from GTF file is allowed!") genome <- read.delim(genome) rownames(genome) <- as.character(genome[,4]) genomeRanges <- makeGRangesFromDataFrame( df=genome, keep.extra.columns=TRUE ) # Need to assigne Seqinfo... sf <- .chromInfoToSeqInfoDf(.chromInfoFromBAM(input[[1]]$file), asSeqinfo=TRUE) seqlevels(genomeRanges) <- seqlevels(sf) seqinfo(genomeRanges) <- sf helperRanges <- NULL } # Annotation case #3: use local database or automatically on-the-fly else { if (type=="chipseq") { if (region == "utr3") genomeRanges <- tryCatch(loadAnnotation(genome,refdb, type="utr",version=version,summarized=TRUE, db=localDb,rc=rc), error=function(e) { tryCatch({ gtfFile <- genome$gtf metadata <- genome metadata$gtf <- NULL genomeRanges <- importCustomAnnotation( gtfFile,metadata,"utr") },error=function(e) { message("Error ",e) stop("Please provide an existing organism ", "or a list with annotation metadata ", "and GTF file!") },finally="") },finally="") else genomeRanges <- tryCatch(loadAnnotation(genome,refdb, type="gene",version=version,db=localDb,rc=rc), error=function(e) { tryCatch({ gtfFile <- genome$gtf metadata <- genome metadata$gtf <- NULL geneData <- importCustomAnnotation(gtfFile, metadata,"gene") },error=function(e) { message("Error ",e) stop("Please provide an existing organism ", "or a list with annotation metadata ", "and GTF file!") },finally="") },finally="") helperRanges <- NULL } else if (type=="rnaseq") { genomeRanges <- tryCatch(loadAnnotation(genome,refdb, type="exon",version=version,summarized=TRUE,db=localDb, rc=rc), error=function(e) { tryCatch({ gtfFile <- genome$gtf metadata <- genome metadata$gtf <- NULL genomeRanges <- importCustomAnnotation(gtfFile, metadata,"exon") },error=function(e) { message("Error ",e) stop("Please provide an existing organism or ", "a list with annotation metadata and ", "GTF file!") },finally="") },finally="") helperRanges <- tryCatch(loadAnnotation(genome,refdb, type="gene",version=version,db=localDb,rc=rc), error=function(e) { tryCatch({ gtfFile <- genome$gtf metadata <- genome metadata$gtf <- NULL geneData <- importCustomAnnotation(gtfFile, metadata,"gene") },error=function(e) { message("Error ",e) stop("Please provide an existing organism or ", "a list with annotation metadata and GTF ", "file!") },finally="") },finally="") if (!is(genomeRanges,"GRangesList")) genomeRanges <- split(genomeRanges,genomeRanges$gene_id) helperRanges <- helperRanges[names(genomeRanges)] } } } # For saving... refRanges <- list(genomeRanges=genomeRanges,helperRanges=helperRanges) } # After genome read, check if we have a valid custom orderer if (!is.null(orderBy$custom)) { if (length(orderBy$custom) != length(genomeRanges)) { warning("The custom orderer provided with orderBy parameter does ", "not have length equal to the number\nof elements in the ", "interrogated genomic regions and will be ignored!", immediate.=TRUE) orderBy$custom <- NULL } } # Read and check design compatibilities. Check if k-means is requested and # message accordingly. If k-means is requested it will be added to the # design data frame #hasDesign <- FALSE if (!is.null(design)) { if (!is.data.frame(design)) design <- read.delim(design,row.names=1) nfac <- ncol(design) if (length(input)>1 && nfac>2) stop("When more than one files are provided for coverage ", "generation, the maximum number of allowed design factors is 2") if (length(input)>1 && nfac>1 && kmParams$k>0) stop("When more than one files are provided for coverage ", "generation and k-means clustering is also requested, the ", "maximum number of allowed design factors is 1") if (length(input)==1 && nfac>3) stop("The maximum number of allowed design factors is 3") if (length(input)==1 && nfac>2 && kmParams$k>0) stop("The maximum number of allowed design factors when k-means ", "clustering is requested is 2") if (length(input)==1 && nfac>2 && plotParams$singleFacet!="none") stop("The maximum number of allowed design factors whith one ", "sample but wishing a gridded profile layout is 2") if (length(input)==1 && nfac>1 && kmParams$k>0 && plotParams$singleFacet!="none") stop("The maximum number of allowed design factors whith one ", "sample but wishing for k-means clustering and gridded ", "profile layout is 1") # Reduce the genomeRanges according to design or the other way if (nrow(design)>length(genomeRanges)) design <- tryCatch({ design[names(genomeRanges),,drop=FALSE] },error=function(e) { stop("Unexpected error occured! Are you sure that element ", "(row) names in the design file are of the same type as ", "the genome file?") },finally={}) else if (nrow(design)<=length(genomeRanges)) { genomeRanges <- tryCatch({ genomeRanges[rownames(design)] },error=function(e) { stop("Unexpected error occured! Are you sure that element ", "(row) names in the design file are of the same type as ", "the genome file?") },finally={}) if (type=="rnaseq") helperRanges <- tryCatch({ helperRanges[rownames(design)] },error=function(e) { stop("Unexpected error occured! Are you sure that element ", "(row) names in the design file are of the same type ", "as the genome file?") },finally={}) } # ...but maybe the names are incompatible if (length(genomeRanges)==0) stop("No ranges left after using the identifiers provided with ", "the design file. Are you sure the identifiers between the ", "two files are compatible?") if (nrow(design)==0) stop("No design elements left after using the identifiers ", "provided with the genome file. Are you sure the identifiers ", "between the two files are compatible?") } # Apply the rest of the filters if any to reduce later computational burden if (!is.null(selector)) { if (type=="chipseq") genomeRanges <- applySelectors(genomeRanges,selector,rc=rc) if (type=="rnaseq") { helperRanges <- applySelectors(helperRanges,selector) # Filter and align names if we have helperRanges around genomeRanges <- genomeRanges[names(helperRanges)] } } # Now we must follow two paths according to region type, genebody and custom # areas with equal/unequal widths and utr3 or tss, tes and 1-width custom # areas. callParams$customIsBase <- FALSE if (region=="custom" && all(width(genomeRanges)==1)) { if (all(flank==0)) { warning("Flanking cannot be zero bp in both directions when the ", "reference region is only 1bp! Setting to default ", "(2000,2000)...",immediate.=TRUE) flank <- c(2000,2000) callParams$flank <- flank } callParams$customIsBase <- TRUE } if (region %in% c("tss","tes")) { if (all(flank==0)) { warning("Flanking cannot be zero bp in both directions when the ", "reference region is \"tss\" or \"tes\"! Setting to default ", "(2000,2000)...", immediate.=TRUE) flank <- c(2000,2000) callParams$flank <- flank } } # Here we must determine the final ranges to pass to preprocessRanges and # then work with these later on intermRanges <- getMainRanges(genomeRanges,helperRanges=helperRanges,type, region,flank,onFlankFail,rc=rc) # It is very important that all the Ranges have Seqinfo objects attached as # they have to be trimmed for potential extensions (due to flanking regions) # beyond chromosome lengths. The built-in annotations do provide this option # otherwise when a custom genome/areas is/are provided, the genome of # interest should be provided too. Or if not provided, the user will be # responsible for any crash. # TODO: When input ranges are custom, genome should be given to make Seqinfo # Should not be needed anymore with the new onFlankFail option #if (type == "chipseq") # Otherwise, throw error with GRangesList # mainRanges <- trim(intermRanges$mainRanges) #else if (type == "rnaseq") # mainRanges <- intermRanges$mainRanges #bamRanges <- trim(intermRanges$bamRanges) mainRanges <- intermRanges$mainRanges bamRanges <- intermRanges$bamRanges hadOffBound <- intermRanges$offBound # We must also correct the design in case of drop if (!is.null(design) && onFlankFail == "drop" && hadOffBound) design <- design[names(mainRanges),,drop=FALSE] # Say something if off bound detected if (hadOffBound) { if (onFlankFail == "drop") warning("Off bound regions detected after flanking! Offending ", "regions will be dropped from the reference\n and the design if ", "provided...",immediate.=TRUE) else if (onFlankFail == "trim") warning("Off bound regions detected after flanking! Offending ", "reference regions will be trimmed\n and the flanking regions ", "will be merged with the main...",immediate.=TRUE) } # Here we must write code for the reading and normalization of bam files # The preprocessRanges function looks if there is a valid (not null) ranges # field in input if (!onTheFly) input <- preprocessRanges(input,preprocessParams,genome,bamRanges, bamParams=bamParams,rc=rc) # At this point we must apply the fraction parameter if <1. We choose this # point in order not to restrict later usage of the read ranges and since it # does not take much time to load into memory. In addition, this operation # cannot be applied when streaming BAMs. In that case, user must subsample # outside recoup. if (!onTheFly && fraction<1) { newSize <- round(fraction*length(mainRanges)) #set.seed(preprocessParams$seed) refIndex <- sort(sample(length(mainRanges),newSize)) mainRanges <- mainRanges[refIndex] for (i in seq_len(length(input))) { if (!is.null(input[[i]]$ranges)) { newSize <- round(fraction*length(input[[i]]$ranges)) #set.seed(preprocessParams$seed) fracIndex <- sort(sample(length(input[[i]]$ranges),newSize)) input[[i]]$ranges <- input[[i]]$ranges[fracIndex] } if (!is.null(input[[i]]$coverage)) input[[i]]$coverage <- input[[i]]$coverage[refIndex] if (!is.null(input[[i]]$profile)) input[[i]]$profile <- input[[i]]$profile[refIndex,] } } # Remove unwanted seqnames from reference ranges (if not on the fly) and # properly subset ranges if not all chromosomes are provided with BAM if (!onTheFly) { chrs <- unique(unlist(lapply(input,function(x) { if (x$format %in% c("bam","bed")) return(as.character(runValue(seqnames(x$ranges)))) else if (x$format=="bigwig") { if (!requireNamespace("rtracklayer")) stop("R package rtracklayer is required to read and ", "import BigWig files!") return(as.character(seqnames(seqinfo(BigWigFile(x$file))))) } }))) if (type=="chipseq") { #keep <- which(as.character(seqnames(mainRanges)) %in% chrs) #mainRanges <- mainRanges[keep] mainRanges <- .subsetGRangesByChrs(mainRanges,chrs) } else if (type=="rnaseq") { #keeph <- which(as.character(seqnames(helperRanges)) %in% chrs) #helperRanges <- helperRanges[keeph] #mainRanges <- mainRanges[names(helperRanges)] helperRanges <- .subsetGRangesByChrs(helperRanges,chrs) mainRanges <- .subsetGRangesByChrs(mainRanges,chrs) } } if(signal == "coverage") input <- coverageRef(input,mainRanges,strandedParams,rc=rc) else if (signal == "rpm") input <- rpMatrix(input,mainRanges,flank,binParams,strandedParams,rc=rc) # If normalization method is linear, we must adjust the coverages # TODO: Check for onTheFly in the beginning if (preprocessParams$normalize == "linear" && !onTheFly) { message("Calculating normalization factors...") linFac <- calcLinearFactors(input,preprocessParams) for (n in names(input)) { message(" normalizing ",n) if (linFac[n]==1) next if (signal == "coverage") input[[n]]$coverage <- cmclapply(input[[n]]$coverage, function(x,f) { return(x*f) },linFac[n]) else if (signal == "rpm") input[[n]]$profile <- apply(input[[n]]$profile,1,function(x,f) { return(x*f) },linFac[n]) } } # Now we must summarize and create the matrices. If genebody, utr3 or # unequal custom lengths, bin is a must, else we leave to user mustBin <- FALSE if (region=="genebody" || region=="utr3") mustBin <- TRUE if (region=="custom") { w <- width(mainRanges) if (any(w!=w[1])) mustBin <- TRUE } if (mustBin) { if (binParams$regionBinSize==0) { warning("Central region bin size not set for a region that must ", "be binned! Setting to 200...",immediate.=TRUE) binParams$regionBinSize <- 200 } } # Chunking? if (signal == "coverage") { # Otherwise, matrix calculate by rpm if (binParams$chunking) { if (type=="chipseq") binParams$chunks <- split(seq_len(length(genomeRanges)), as.character(seqnames(genomeRanges))) else if (type=="rnaseq") binParams$chunks <- split(seq_len(length(helperRanges)), as.character(seqnames(helperRanges))) } # hadOffBounds must be passed on input at some point fe0 <- FALSE if (onFlankFail == "trim" && hadOffBound) fe0 <- TRUE input <- profileMatrix(input,flank,binParams,rc,.feNoSplit=fe0) } # In some strange glimpses, we are getting very few NaNs in profile matrix # which I was unable to reproduce on a gene by gene basis. If no NaNs are # detected, no action is performed in the input object. Also, these NaNs # seem to appear only on zero count gene profiles, so it's safe to impute # by zero. #input <- imputeProfile(input,method="simple",rc) input <- imputeZero(input) # Perform the k-means clustering if requested and append to design (which # has been checked, if we are allowed to do so) if (kmParams$k>0) design <- kmeansDesign(input,design,kmParams) # Coverages and profiles calculated... Now depending on plot option, we go # further or return the enriched input object for saving if (!plotParams$profile && !plotParams$heatmap && !plotParams$correlation) { recoupObj <- toOutput(input,design,saveParams,callParams=callParams, refRanges=refRanges) return(recoupObj) } else { recoupObj <- toOutput(input,design,list(ranges=TRUE,coverage=TRUE, profile=TRUE),callParams=callParams,refRanges=refRanges) } ## Our plot objects recoupPlots <- list() # We must pass the matrices to plotting function if (plotParams$profile) { message("Constructing genomic coverage profile curve(s)") #theProfile <- recoupProfile(recoupObj,rc=rc) recoupObj <- recoupProfile(recoupObj,rc=rc) theProfile <- getr(recoupObj,"profile") recoupPlots$profilePlot <- theProfile } # Some default binning MUST be applied for the heatmap... Otherwise it could # take insanely long time and space to draw/store if (plotParams$heatmap) { # Inform the user about enforced binning (or not) if (region %in% c("tss","tes") || callParams$customIsBase) { if (binParams$regionBinSize==0 && binParams$forceHeatmapBinning) { message("The resolution of the requested profiles will be ", "lowered to avoid\nincreased computation time and/or ", "storage space for heatmap profiles...") } else if (binParams$regionBinSize==0 && !binParams$forceHeatmapBinning) warning("forceHeatmapBinning is turned off for high ", "resolution plotting. Be prepared for\nlong computational ", "times and big figures!",immediate.=TRUE) } else { if ((binParams$regionBinSize==0 || binParams$flankBinSize==0) && binParams$forceHeatmapBinning) { message("The resolution of the requested profiles will be ", "lowered to avoid\nincreased computation time and/or ", "storage space for heatmap profiles...") } else if ((binParams$regionBinSize==0 || binParams$flankBinSize==0) && !binParams$forceHeatmapBinning) warning("forceHeatmapBinning is turned off for high ", "resolution plotting. Be prepared for\nlong computational ", "times and big figures!",immediate.=TRUE) } if (binParams$forceHeatmapBinning && (binParams$regionBinSize==0 || binParams$flankBinSize==0)) { helpInput <- recoupObj$data if (region %in% c("tss","tes") || callParams$customIsBase) { for (n in names(helpInput)) { message("Calculating ",region," profile for ", helpInput[[n]]$name) helpInput[[n]]$profile <- binCoverageMatrix(helpInput[[n]]$coverage, binSize=binParams$forcedBinSize[2], stat=binParams$sumStat,rc=rc) helpInput[[n]]$profile <- helpInput[[n]]$profile } } else { for (n in names(helpInput)) { message("Calculating ",region," profile for ", helpInput[[n]]$name) message(" center") #center <- binCoverageMatrix(helpInput[[n]]$coverage$center, # binSize=binParams$forcedBinSize[2], # stat=binParams$sumStat,rc=rc) center <- binCoverageMatrix( input[[n]]$coverage,binSize=binParams$forcedBinSize[2], stat=binParams$sumStat, interpolation=binParams$interpolation, flank=flank,where="center",rc=rc ) message(" upstream") #left <- binCoverageMatrix(helpInput[[n]]$coverage$upstream, # binSize=binParams$forcedBinSize[1], # stat=binParams$sumStat,rc=rc) left <- binCoverageMatrix( input[[n]]$coverage,binSize=binParams$forcedBinSize[1], stat=binParams$sumStat, interpolation=binParams$interpolation,flank=flank, where="upstream",rc=rc ) message(" downstream") #right <- binCoverageMatrix( # helpInput[[n]]$coverage$downstream, # binSize=binParams$forcedBinSize[1], # stat=binParams$sumStat,rc=rc) right <- binCoverageMatrix( input[[n]]$coverage,binSize=binParams$forcedBinSize[1], stat=binParams$sumStat, interpolation=binParams$interpolation,flank=flank, where="downstream",rc=rc ) helpInput[[n]]$profile <- cbind(left,center,right) rownames(helpInput[[n]]$profile) <- names(input[[n]]$coverage) helpInput[[n]]$profile <- helpInput[[n]]$profile } } } else helpInput <- recoupObj$data helpObj <- recoupObj helpObj$data <- helpInput message("Constructing genomic coverage heatmap(s)") #theHeatmap <- recoupHeatmap(helpObj,rc=rc) helpObj <- recoupHeatmap(helpObj,rc=rc) theHeatmap <- getr(helpObj,"heatmap") recoupObj <- setr(recoupObj,"heatmap",theHeatmap) recoupPlots$heatmapPlot <- theHeatmap # Derive the main heatmap in case of hierarchical clustering mainh <- 1 if (length(grep("hc",orderBy$what))>0) { nc <- nchar(orderBy$what) mh <- suppressWarnings(as.numeric(substr(orderBy$what,nc,nc))) if (is.na(mh)) warning("Reference profile for hierarchical clustering order ", "not recognized! Using the 1st...",immediate.=TRUE) else if (mh > length(input)) { warning("Reference profile (",mh,") for hierarchical ", "clustering order does not exist (the input has only ", length(input)," sources! Using the last...", immediate.=TRUE) mainh <- length(input) } else mainh <- mh } } # We must pass the matrices to plotting function if (plotParams$correlation) { message("Constructing coverage correlation profile curve(s)") recoupObj <- recoupCorrelation(recoupObj,rc=rc) theCorrelation <- getr(recoupObj,"correlation") recoupPlots$correlationPlot <- theCorrelation } # Overwrite final object so as to return it recoupObj <- toOutput(input,design,saveParams,recoupPlots,callParams, refRanges) # Make any plots asked if (plotParams$plot) { what <- character(0) if (plotParams$profile) what <- c(what,"profile") if (plotParams$heatmap) what <- c(what,"heatmap") if (plotParams$correlation) what <- c(what,"correlation") if (length(what)>0) recoupPlot(recoupObj,what,plotParams$device,plotParams$outputDir, plotParams$outputBase,mainh) } # Return the enriched input object according to save options return(recoupObj) } applySelectors <- function(ranges,selector,rc=NULL) { if (!is.null(selector$id)) { ranges <- ranges[selector$ids] if (length(ranges)==0) stop("No ranges left after using the identifiers provided with ", "the selector field. Are you sure the identifiers between the ", "two files are compatible?") } if (!is.null(selector$biotype) && !is.null(ranges$biotype)) { good <- which(ranges$biotype %in% selector$biotype) ranges <- ranges[good] if (length(ranges)==0) stop("No ranges left after using the biotypes provided with the ", "selector field. Are you sure the identifiers between the two ", "files are compatible?") } if (!is.null(selector$exonType) && !is.null(ranges$exon_type)) { good <- which(ranges$exonType %in% selector$exonType) ranges <- ranges[good] if (length(ranges)==0) stop("No ranges left after using the exon types provided with ", "the selector field. Are you sure the identifiers between the ", "two files are compatible?") } return(ranges) } toOutput <- function(input,design=NULL,saveParams,plotObjs=NULL, callParams=NULL,refRanges=NULL) { if (!saveParams$ranges) input <- removeData(input,"ranges") if (!saveParams$coverage) input <- removeData(input,"coverage") if (!saveParams$profile) input <- removeData(input,"profile") plots <- list(profile=NULL,heatmap=NULL,correlation=NULL) if (!is.null(plotObjs) && saveParams$profilePlot) plots$profile <- plotObjs$profilePlot if (!is.null(plotObjs) && saveParams$heatmapPlot) plots$heatmap <- plotObjs$heatmapPlot if (!is.null(plotObjs) && saveParams$correlationPlot) plots$correlation <- plotObjs$correlationPlot return(list( data=input, design=design, plots=plots, callopts=callParams, refRanges=refRanges )) }