Browse code

Fixed bug with GRangesList in RNA-Seq and drop-trim cases

Panagiotis Moulos authored on 03/12/2020 20:58:39
Showing 1 changed files
... ...
@@ -148,10 +148,12 @@ recoup <- function(
148 148
             "of arguments or remove any of the above with the\nremoveData ",
149 149
             "function.\n")
150 150
         prevCallParams <- input$callopts
151
+        refRanges <- input$refRanges
151 152
         input <- input$data
152 153
     }
153 154
     else
154
-        prevCallParams <- NULL
155
+        prevCallParams <- refRanges <- NULL
156
+        
155 157
     if (!is.list(input) && file.exists(input))
156 158
         input <- readConfig(input)
157 159
     checkInput(input)
... ...
@@ -409,42 +411,91 @@ recoup <- function(
409 411
     ############################################################################
410 412
     
411 413
     # Continue with actual work
412
-    # Annotation case #1: provided strictly as a BED-like data frame with loci
413
-    if (is.data.frame(genome)) {
414
-        if (type=="rnaseq")
415
-            stop("When type=\"rnaseq\", only the usage of a supported or user-",
416
-                "imported organism from GTF file is allowed!")
417
-        rownames(genome) <- as.character(genome[,4])
418
-        genomeRanges <- makeGRangesFromDataFrame(
419
-            df=genome,
420
-            keep.extra.columns=TRUE
421
-        )
414
+    # If annotation already here and not removed from the input object because
415
+    # of change in region, flank or type, no need to reload it and thus, recoup
416
+    # object become even more mobile and portable as localDb is not needed.
417
+    if (!is.null(refRanges)) {
418
+        genomeRanges <- refRanges$genomeRanges
419
+        helperRanges <- refRanges$helperRanges
422 420
     }
423
-    # Annotation case #2: provided strictly as a BED-like file with loci
424
-    else if (!is.list(genome) && !is.data.frame(genome) 
425
-        && is.character(genome)) {
426
-        if (file.exists(genome)) {
421
+    else {
422
+        # Annotation case #1: provided as a BED-like data frame with loci
423
+        if (is.data.frame(genome)) {
427 424
             if (type=="rnaseq")
428 425
                 stop("When type=\"rnaseq\", only the usage of a supported or ",
429 426
                     "user-imported organism from GTF file is allowed!")
430
-            genome <- read.delim(genome)
431 427
             rownames(genome) <- as.character(genome[,4])
432 428
             genomeRanges <- makeGRangesFromDataFrame(
433 429
                 df=genome,
434 430
                 keep.extra.columns=TRUE
435 431
             )
436
-            # Need to assigne Seqinfo...
437
-            sf <- .chromInfoToSeqInfoDf(.chromInfoFromBAM(input[[1]]$file),
438
-                asSeqinfo=TRUE)
439
-            seqlevels(genomeRanges) <- seqlevels(sf)
440
-            seqinfo(genomeRanges) <- sf
432
+            helperRanges <- NULL
441 433
         }
442
-        # Annotation case #3: use local database or automatically on-the-fly
443
-        else {
444
-            if (type=="chipseq") {
445
-                if (region == "utr3")
434
+        # Annotation case #2: provided strictly as a BED-like file with loci
435
+        else if (!is.list(genome) && !is.data.frame(genome) 
436
+            && is.character(genome)) {
437
+            if (file.exists(genome)) {
438
+                if (type=="rnaseq")
439
+                    stop("When type=\"rnaseq\", only the usage of a supported ",
440
+                        "or user-imported organism from GTF file is allowed!")
441
+                genome <- read.delim(genome)
442
+                rownames(genome) <- as.character(genome[,4])
443
+                genomeRanges <- makeGRangesFromDataFrame(
444
+                    df=genome,
445
+                    keep.extra.columns=TRUE
446
+                )
447
+                # Need to assigne Seqinfo...
448
+                sf <- .chromInfoToSeqInfoDf(.chromInfoFromBAM(input[[1]]$file),
449
+                    asSeqinfo=TRUE)
450
+                seqlevels(genomeRanges) <- seqlevels(sf)
451
+                seqinfo(genomeRanges) <- sf
452
+                
453
+                helperRanges <- NULL
454
+            }
455
+            # Annotation case #3: use local database or automatically on-the-fly
456
+            else {
457
+                if (type=="chipseq") {
458
+                    if (region == "utr3")
459
+                        genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
460
+                            type="utr",version=version,summarized=TRUE,
461
+                            db=localDb,rc=rc),
462
+                            error=function(e) {
463
+                                tryCatch({
464
+                                    gtfFile <- genome$gtf
465
+                                    metadata <- genome
466
+                                    metadata$gtf <- NULL
467
+                                    genomeRanges <- importCustomAnnotation(
468
+                                        gtfFile,metadata,"utr")
469
+                                },error=function(e) {
470
+                                    message("Error ",e)
471
+                                    stop("Please provide an existing organism ",
472
+                                        "or a list with annotation metadata ",
473
+                                        "and GTF file!")
474
+                                },finally="")
475
+                            },finally="")
476
+                    else
477
+                        genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
478
+                            type="gene",version=version,db=localDb,rc=rc),
479
+                            error=function(e) {
480
+                                tryCatch({
481
+                                    gtfFile <- genome$gtf
482
+                                    metadata <- genome
483
+                                    metadata$gtf <- NULL
484
+                                    geneData <- importCustomAnnotation(gtfFile,
485
+                                        metadata,"gene")
486
+                                },error=function(e) {
487
+                                    message("Error ",e)
488
+                                    stop("Please provide an existing organism ",
489
+                                        "or a list with annotation metadata ",
490
+                                        "and GTF file!")
491
+                                },finally="")
492
+                            },finally="")
493
+
494
+                    helperRanges <- NULL
495
+                }
496
+                else if (type=="rnaseq") {
446 497
                     genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
447
-                        type="utr",version=version,summarized=TRUE,db=localDb,
498
+                        type="exon",version=version,summarized=TRUE,db=localDb,
448 499
                         rc=rc),
449 500
                         error=function(e) {
450 501
                             tryCatch({
... ...
@@ -452,16 +503,15 @@ recoup <- function(
452 503
                                 metadata <- genome
453 504
                                 metadata$gtf <- NULL
454 505
                                 genomeRanges <- importCustomAnnotation(gtfFile,
455
-                                    metadata,"utr")
506
+                                    metadata,"exon")
456 507
                             },error=function(e) {
457 508
                                 message("Error ",e)
458 509
                                 stop("Please provide an existing organism or ",
459
-                                    "a list with annotation metadata and GTF ",
460
-                                    "file!")
510
+                                        "a list with annotation metadata and ",
511
+                                        "GTF file!")
461 512
                             },finally="")
462 513
                         },finally="")
463
-                else
464
-                    genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
514
+                    helperRanges <- tryCatch(loadAnnotation(genome,refdb,
465 515
                         type="gene",version=version,db=localDb,rc=rc),
466 516
                         error=function(e) {
467 517
                             tryCatch({
... ...
@@ -477,54 +527,21 @@ recoup <- function(
477 527
                                     "file!")
478 528
                             },finally="")
479 529
                         },finally="")
480
-            
481
-                helperRanges <- NULL
482
-            }
483
-            else if (type=="rnaseq") {
484
-                genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
485
-                    type="exon",version=version,summarized=TRUE,db=localDb,
486
-                    rc=rc),
487
-                    error=function(e) {
488
-                        tryCatch({
489
-                            gtfFile <- genome$gtf
490
-                            metadata <- genome
491
-                            metadata$gtf <- NULL
492
-                            genomeRanges <- importCustomAnnotation(gtfFile,
493
-                                metadata,"exon")
494
-                        },error=function(e) {
495
-                            message("Error ",e)
496
-                            stop("Please provide an existing organism or ",
497
-                                    "a list with annotation metadata and GTF ",
498
-                                    "file!")
499
-                        },finally="")
500
-                    },finally="")
501
-                helperRanges <- tryCatch(loadAnnotation(genome,refdb,
502
-                    type="gene",version=version,db=localDb,rc=rc),
503
-                    error=function(e) {
504
-                        tryCatch({
505
-                            gtfFile <- genome$gtf
506
-                            metadata <- genome
507
-                            metadata$gtf <- NULL
508
-                            geneData <- importCustomAnnotation(gtfFile,
509
-                                metadata,"gene")
510
-                        },error=function(e) {
511
-                            message("Error ",e)
512
-                            stop("Please provide an existing organism or ",
513
-                                "a list with annotation metadata and GTF ",
514
-                                "file!")
515
-                        },finally="")
516
-                    },finally="")
517
-                if (!is(genomeRanges,"GRangesList"))
518
-                    genomeRanges <- split(genomeRanges,genomeRanges$gene_id)
519
-                helperRanges <- helperRanges[names(genomeRanges)]
530
+                    if (!is(genomeRanges,"GRangesList"))
531
+                        genomeRanges <- split(genomeRanges,genomeRanges$gene_id)
532
+                    helperRanges <- helperRanges[names(genomeRanges)]
533
+                }
520 534
             }
521 535
         }
536
+        
537
+        # For saving...
538
+        refRanges <- list(genomeRanges=genomeRanges,helperRanges=helperRanges)
522 539
     }
523 540
     
524 541
     # After genome read, check if we have a valid custom orderer
525 542
     if (!is.null(orderBy$custom)) {
526 543
         if (length(orderBy$custom) != length(genomeRanges)) {
527
-            warning("The custom orderer provide with orderBy parameter does ",
544
+            warning("The custom orderer provided with orderBy parameter does ",
528 545
                 "not have length equal to the number\nof elements in the ",
529 546
                 "interrogated genomic regions and will be ignored!",
530 547
                 immediate.=TRUE)
... ...
@@ -666,7 +683,7 @@ recoup <- function(
666 683
     else if (onFlankFail == "trim")
667 684
         warning("Off bound regions detected after flanking! Offending ",
668 685
             "reference regions will be trimmed\n  and the flanking regions ",
669
-            "will be merged with thre main...",immediate.=TRUE)
686
+            "will be merged with the main...",immediate.=TRUE)
670 687
     }
671 688
     
672 689
     # Here we must write code for the reading and normalization of bam files
... ...
@@ -675,7 +692,6 @@ recoup <- function(
675 692
     if (!onTheFly)
676 693
         input <- preprocessRanges(input,preprocessParams,genome,bamRanges,
677 694
             bamParams=bamParams,rc=rc)
678
-    
679 695
     # At this point we must apply the fraction parameter if <1. We choose this
680 696
     # point in order not to restrict later usage of the read ranges and since it
681 697
     # does not take much time to load into memory. In addition, this operation
... ...
@@ -723,17 +739,7 @@ recoup <- function(
723 739
             #helperRanges <- helperRanges[keeph]
724 740
             #mainRanges <- mainRanges[names(helperRanges)]
725 741
             helperRanges <- .subsetGRangesByChrs(helperRanges,chrs)
726
-            mainRanges <- .subsetGRangesByChrs(mainRanges,chrs)
727
-            
728
-            ####################################################################
729
-            ## There must be an R bug with `lengths` here as although it runs in
730
-            ## Rcmd, it does not pass package building or vignette kniting...  
731
-            ## But for the time being it seems that it is not needed as the name
732
-            ## filtering works
733
-            #lens <- which(lengths(genomeRanges)==0)
734
-            #if (length(lens)>0)
735
-            #    genomeRanges[lens] <- NULL
736
-            ####################################################################
742
+			mainRanges <- .subsetGRangesByChrs(mainRanges,chrs)
737 743
         }
738 744
     }
739 745
     
... ...
@@ -745,8 +751,10 @@ recoup <- function(
745 751
     # If normalization method is linear, we must adjust the coverages
746 752
     # TODO: Check for onTheFly in the beginning
747 753
     if (preprocessParams$normalize == "linear" && !onTheFly) {
754
+        message("Calculating normalization factors...")
748 755
         linFac <- calcLinearFactors(input,preprocessParams)
749 756
         for (n in names(input)) {
757
+            message("  normalizing ",n)
750 758
             if (linFac[n]==1)
751 759
                 next
752 760
             if (signal == "coverage")
... ...
@@ -812,12 +820,13 @@ recoup <- function(
812 820
     # Coverages and profiles calculated... Now depending on plot option, we go 
813 821
     # further or return the enriched input object for saving
814 822
     if (!plotParams$profile && !plotParams$heatmap && !plotParams$correlation) {
815
-        recoupObj <- toOutput(input,design,saveParams,callParams=callParams)
823
+        recoupObj <- toOutput(input,design,saveParams,callParams=callParams,
824
+            refRanges=refRanges)
816 825
         return(recoupObj)
817 826
     }
818 827
     else {
819 828
         recoupObj <- toOutput(input,design,list(ranges=TRUE,coverage=TRUE,
820
-            profile=TRUE),callParams=callParams)
829
+            profile=TRUE),callParams=callParams,refRanges=refRanges)
821 830
     }
822 831
             
823 832
     ## Our plot objects
... ...
@@ -961,7 +970,8 @@ recoup <- function(
961 970
     }
962 971
     
963 972
     # Overwrite final object so as to return it
964
-    recoupObj <- toOutput(input,design,saveParams,recoupPlots,callParams)
973
+    recoupObj <- toOutput(input,design,saveParams,recoupPlots,callParams,
974
+        refRanges)
965 975
     
966 976
     # Make any plots asked
967 977
     if (plotParams$plot) {
... ...
@@ -1009,7 +1019,7 @@ applySelectors <- function(ranges,selector,rc=NULL) {
1009 1019
 }
1010 1020
 
1011 1021
 toOutput <- function(input,design=NULL,saveParams,plotObjs=NULL,
1012
-    callParams=NULL) {
1022
+    callParams=NULL,refRanges=NULL) {
1013 1023
     if (!saveParams$ranges)
1014 1024
         input <- removeData(input,"ranges")
1015 1025
     if (!saveParams$coverage)
... ...
@@ -1027,6 +1037,7 @@ toOutput <- function(input,design=NULL,saveParams,plotObjs=NULL,
1027 1037
         data=input,
1028 1038
         design=design,
1029 1039
         plots=plots,
1030
-        callopts=callParams
1040
+        callopts=callParams,
1041
+        refRanges=refRanges
1031 1042
     ))
1032 1043
 }
Browse code

Fixed strand plot bug in RNA-Seq profiles, removed GenomeInfoDb deprecated function.

Panagiotis Moulos authored on 01/12/2020 17:08:27
Showing 1 changed files
... ...
@@ -216,10 +216,11 @@ recoup <- function(
216 216
     # If type is rnaseq, the only allowed genomes are the ones supported by
217 217
     # recoup for the time being. In the future, a custom RNA-Seq genome may be
218 218
     # constructed from a GFF or like...
219
-    if (type=="rnaseq" && !.userOrg(genome,localDb)
220
-        && !(genome %in% getSupportedOrganisms()))
221
-        stop("When type is \"rnaseq\", only the supported genomes are allowed!")
222
-    
219
+    if (type=="rnaseq" && is.character(localDb) && file.exists(localDb) 
220
+        && !.userOrg(genome,localDb) && !(genome %in% getSupportedOrganisms()))
221
+        stop("When type is \"rnaseq\", only the supported or user imported ",
222
+            "genomes are allowed!")
223
+
223 224
     # annotation must be a list to be fed to buildCustomAnnotation
224 225
     if (is.list(genome) && !is.data.frame(genome)) {
225 226
         # members are checked by buildCustomAnnotation if required
Browse code

Added options for out-of-bound reference regions. Added flexibility in GFF format. Added single facet grid/wrap.

Panagiotis Moulos authored on 19/11/2020 14:20:54
Showing 1 changed files
... ...
@@ -10,6 +10,7 @@ recoup <- function(
10 10
     version="auto",
11 11
     refdb=c("ensembl","ucsc","refseq"),
12 12
     flank=c(2000,2000),
13
+    onFlankFail=c("drop","trim"),
13 14
     fraction=1,
14 15
     orderBy=list(
15 16
         what=c("none","suma","sumn","maxa","maxn","avga","avgn","hcn"),
... ...
@@ -58,6 +59,7 @@ recoup <- function(
58 59
         corrSmoothPar=ifelse(is.null(design),0.1,0.5),
59 60
         singleFacet=c("none","wrap","grid"),
60 61
         multiFacet=c("wrap","grid"),
62
+        singleFacetDirection=c("horizontal","vertical"),
61 63
         conf=TRUE,
62 64
         device=c("x11","png","jpg","tiff","bmp","pdf","ps"),
63 65
         outputDir=".",
... ...
@@ -163,15 +165,25 @@ recoup <- function(
163 165
     region <- tolower(region[1])
164 166
     refdb <- tolower(refdb[1])
165 167
     type <- tolower(type[1])
168
+    onFlankFail <- tolower(onFlankFail[1])
166 169
     signal <- tolower(signal[1])
167 170
     if (!is.null(design) && !is.data.frame(design))
168 171
         checkFileArgs("design",design)
169 172
     if (!is.data.frame(genome) && !is.list(genome) && file.exists(genome))
170 173
         checkFileArgs("genome",genome)
171
-    else if (is.character(genome))
172
-        checkTextArgs("genome",genome,getSupportedOrganisms(),multiarg=FALSE)
173
-    checkTextArgs("refdb",refdb,getSupportedRefDbs(),multiarg=FALSE)
174
+    else if (is.character(genome)) {
175
+        if (is.character(localDb) && file.exists(localDb)) {
176
+            if (!.userOrg(genome,localDb))
177
+                checkTextArgs("genome",genome,getSupportedOrganisms(),
178
+                    multiarg=FALSE)
179
+        }
180
+    }
181
+    if (is.character(localDb) && file.exists(localDb)) {
182
+        if (!.userRefdb(refdb,localDb))
183
+            checkTextArgs("refdb",refdb,getSupportedRefDbs(),multiarg=FALSE)
184
+    }
174 185
     checkTextArgs("type",type,c("chipseq","rnaseq"),multiarg=FALSE)
186
+    checkTextArgs("onFlankFail",onFlankFail,c("drop","trim"),multiarg=FALSE)
175 187
     checkTextArgs("signal",signal,c("coverage","rpm"),multiarg=FALSE)
176 188
     checkNumArgs("fraction",fraction,"numeric",c(0,1),"botheq")
177 189
     if (any(flank<0))
... ...
@@ -204,7 +216,8 @@ recoup <- function(
204 216
     # If type is rnaseq, the only allowed genomes are the ones supported by
205 217
     # recoup for the time being. In the future, a custom RNA-Seq genome may be
206 218
     # constructed from a GFF or like...
207
-    if (type=="rnaseq" && !(genome %in% getSupportedOrganisms()))
219
+    if (type=="rnaseq" && !.userOrg(genome,localDb)
220
+        && !(genome %in% getSupportedOrganisms()))
208 221
         stop("When type is \"rnaseq\", only the supported genomes are allowed!")
209 222
     
210 223
     # annotation must be a list to be fed to buildCustomAnnotation
... ...
@@ -291,6 +304,8 @@ recoup <- function(
291 304
             region=if (is.null(thisCall$region)) prevCallParams$region else
292 305
                 region,
293 306
             type=if (is.null(thisCall$type)) prevCallParams$type else type,
307
+            onFlankFail=if (is.null(thisCall$onFlankFail)) 
308
+                prevCallParams$onFlankFail else onFlankFail,
294 309
             signal=if (is.null(thisCall$signal)) prevCallParams$signal 
295 310
                 else signal,
296 311
             genome=if (is.null(thisCall$genome)) prevCallParams$genome else
... ...
@@ -335,6 +350,7 @@ recoup <- function(
335 350
         callParams <- list(
336 351
             region=region,
337 352
             type=type,
353
+            onFlankFail=onFlankFail,
338 354
             signal=signal,
339 355
             genome=genome,
340 356
             version=version,
... ...
@@ -366,7 +382,8 @@ recoup <- function(
366 382
     # Redefine all final arguments for this call
367 383
     region <- callParams$region
368 384
     type <- callParams$type
369
-    singnal <- callParams$signal
385
+    onFlankFail <- callParams$onFlankFail
386
+    signal <- callParams$signal
370 387
     genome <- callParams$genome
371 388
     version <- callParams$version
372 389
     refdb <- callParams$refdb
... ...
@@ -517,7 +534,7 @@ recoup <- function(
517 534
     # Read and check design compatibilities. Check if k-means is requested and
518 535
     # message accordingly. If k-means is requested it will be added to the 
519 536
     # design data frame
520
-    hasDesign <- FALSE
537
+    #hasDesign <- FALSE
521 538
     if (!is.null(design)) {
522 539
         if (!is.data.frame(design))
523 540
             design <- read.delim(design,row.names=1)
... ...
@@ -617,7 +634,7 @@ recoup <- function(
617 634
     # Here we must determine the final ranges to pass to preprocessRanges and
618 635
     # then work with these later on
619 636
     intermRanges <- getMainRanges(genomeRanges,helperRanges=helperRanges,type,
620
-        region,flank,rc=rc)
637
+        region,flank,onFlankFail,rc=rc)
621 638
     # It is very important that all the Ranges have Seqinfo objects attached as
622 639
     # they have to be trimmed for potential extensions (due to flanking regions)
623 640
     # beyond chromosome lengths. The built-in annotations do provide this option
... ...
@@ -625,12 +642,32 @@ recoup <- function(
625 642
     # interest should be provided too. Or if not provided, the user will be
626 643
     # responsible for any crash.
627 644
     # TODO: When input ranges are custom, genome should be given to make Seqinfo
628
-    if (type == "chipseq") # Otherwise, throw error with GRangesList
629
-        mainRanges <- trim(intermRanges$mainRanges)
630
-    else if (type == "rnaseq")
631
-        mainRanges <- intermRanges$mainRanges
632
-    bamRanges <- trim(intermRanges$bamRanges)
633
-
645
+    
646
+    # Should not be needed anymore with the new onFlankFail option
647
+    #if (type == "chipseq") # Otherwise, throw error with GRangesList
648
+    #    mainRanges <- trim(intermRanges$mainRanges)
649
+    #else if (type == "rnaseq")
650
+    #    mainRanges <- intermRanges$mainRanges
651
+    #bamRanges <- trim(intermRanges$bamRanges)
652
+    
653
+    mainRanges <- intermRanges$mainRanges
654
+    bamRanges <- intermRanges$bamRanges
655
+    hadOffBound <- intermRanges$offBound
656
+    # We must also correct the design in case of drop
657
+    if (!is.null(design) && onFlankFail == "drop" && hadOffBound)
658
+        design <- design[names(mainRanges),,drop=FALSE]
659
+    # Say something if off bound detected
660
+    if (hadOffBound) {
661
+        if (onFlankFail == "drop")
662
+        warning("Off bound regions detected after flanking! Offending ",
663
+            "regions will be dropped from the reference\n  and the design if ",
664
+            "provided...",immediate.=TRUE)
665
+    else if (onFlankFail == "trim")
666
+        warning("Off bound regions detected after flanking! Offending ",
667
+            "reference regions will be trimmed\n  and the flanking regions ",
668
+            "will be merged with thre main...",immediate.=TRUE)
669
+    }
670
+    
634 671
     # Here we must write code for the reading and normalization of bam files
635 672
     # The preprocessRanges function looks if there is a valid (not null) ranges
636 673
     # field in input
... ...
@@ -648,7 +685,7 @@ recoup <- function(
648 685
         #set.seed(preprocessParams$seed)
649 686
         refIndex <- sort(sample(length(mainRanges),newSize))
650 687
         mainRanges <- mainRanges[refIndex]
651
-        for (i in 1:length(input)) {
688
+        for (i in seq_len(length(input))) {
652 689
             if (!is.null(input[[i]]$ranges)) {
653 690
                 newSize <- round(fraction*length(input[[i]]$ranges))
654 691
                 #set.seed(preprocessParams$seed)
... ...
@@ -745,13 +782,17 @@ recoup <- function(
745 782
     if (signal == "coverage") { # Otherwise, matrix calculate by rpm
746 783
         if (binParams$chunking) {
747 784
             if (type=="chipseq")
748
-                binParams$chunks <- split(1:length(genomeRanges),
785
+                binParams$chunks <- split(seq_len(length(genomeRanges)),
749 786
                     as.character(seqnames(genomeRanges)))
750 787
             else if (type=="rnaseq")
751
-                binParams$chunks <- split(1:length(helperRanges),
788
+                binParams$chunks <- split(seq_len(length(helperRanges)),
752 789
                     as.character(seqnames(helperRanges)))
753 790
         }
754
-        input <- profileMatrix(input,flank,binParams,rc)
791
+        # hadOffBounds must be passed on input at some point
792
+        fe0 <- FALSE
793
+        if (onFlankFail == "trim" && hadOffBound)
794
+            fe0 <- TRUE
795
+        input <- profileMatrix(input,flank,binParams,rc,.feNoSplit=fe0)
755 796
     }
756 797
     
757 798
     # In some strange glimpses, we are getting very few NaNs in profile matrix
Browse code

Fixed NaN problem in RNASeq profiles

Panagiotis Moulos authored on 08/10/2020 15:20:58
Showing 1 changed files
... ...
@@ -756,8 +756,11 @@ recoup <- function(
756 756
     
757 757
     # In some strange glimpses, we are getting very few NaNs in profile matrix
758 758
     # which I was unable to reproduce on a gene by gene basis. If no NaNs are
759
-    # detected, no action is performed in the input object.
760
-    input <- imputeProfile(input,method="simple",rc)
759
+    # detected, no action is performed in the input object. Also, these NaNs
760
+    # seem to appear only on zero count gene profiles, so it's safe to impute
761
+    # by zero.
762
+    #input <- imputeProfile(input,method="simple",rc)
763
+    input <- imputeZero(input)
761 764
     
762 765
     # Perform the k-means clustering if requested and append to design (which
763 766
     # has been checked, if we are allowed to do so)
Browse code

Fixed a bug with custom regions coverage naming

Panagiotis Moulos authored on 06/10/2020 15:49:52
Showing 1 changed files
... ...
@@ -415,6 +415,11 @@ recoup <- function(
415 415
                 df=genome,
416 416
                 keep.extra.columns=TRUE
417 417
             )
418
+            # Need to assigne Seqinfo...
419
+            sf <- .chromInfoToSeqInfoDf(.chromInfoFromBAM(input[[1]]$file),
420
+                asSeqinfo=TRUE)
421
+            seqlevels(genomeRanges) <- seqlevels(sf)
422
+            seqinfo(genomeRanges) <- sf
418 423
         }
419 424
         # Annotation case #3: use local database or automatically on-the-fly
420 425
         else {
... ...
@@ -752,7 +757,7 @@ recoup <- function(
752 757
     # In some strange glimpses, we are getting very few NaNs in profile matrix
753 758
     # which I was unable to reproduce on a gene by gene basis. If no NaNs are
754 759
     # detected, no action is performed in the input object.
755
-    input <- imputeProfile(input,rc)
760
+    input <- imputeProfile(input,method="simple",rc)
756 761
     
757 762
     # Perform the k-means clustering if requested and append to design (which
758 763
     # has been checked, if we are allowed to do so)
Browse code

Further changes to comply with check/BiocCheck

Panagiotis Moulos authored on 04/05/2020 08:23:36
Showing 1 changed files
... ...
@@ -212,7 +212,7 @@ recoup <- function(
212 212
         # members are checked by buildCustomAnnotation if required
213 213
         # We only need to check that the gtfFile exists here
214 214
         if (!("gtf" %in% names(genome)))
215
-            stopp("A gtf field must be provided with an existing GTF file ",
215
+            stop("A gtf field must be provided with an existing GTF file ",
216 216
                 "when providing a list with custom annotation elements!")
217 217
         if ("gtf" %in% names(genome) && is.character(genome$gtf)
218 218
             && !file.exists(genome$gtf))
... ...
@@ -479,8 +479,8 @@ recoup <- function(
479 479
                     type="gene",version=version,db=localDb,rc=rc),
480 480
                     error=function(e) {
481 481
                         tryCatch({
482
-                            gtfFile <- annotation$gtf
483
-                            metadata <- annotation
482
+                            gtfFile <- genome$gtf
483
+                            metadata <- genome
484 484
                             metadata$gtf <- NULL
485 485
                             geneData <- importCustomAnnotation(gtfFile,
486 486
                                 metadata,"gene")
... ...
@@ -749,6 +749,11 @@ recoup <- function(
749 749
         input <- profileMatrix(input,flank,binParams,rc)
750 750
     }
751 751
     
752
+    # In some strange glimpses, we are getting very few NaNs in profile matrix
753
+    # which I was unable to reproduce on a gene by gene basis. If no NaNs are
754
+    # detected, no action is performed in the input object.
755
+    input <- imputeProfile(input,rc)
756
+    
752 757
     # Perform the k-means clustering if requested and append to design (which
753 758
     # has been checked, if we are allowed to do so)
754 759
     if (kmParams$k>0)
Browse code

Changed the annotation system and several bug fixes and improvements

Panagiotis Moulos authored on 03/05/2020 06:02:45
Showing 1 changed files
... ...
@@ -1,7 +1,7 @@
1 1
 recoup <- function(
2 2
     input,
3 3
     design=NULL,
4
-    region=c("genebody","tss","tes","custom"),
4
+    region=c("genebody","tss","tes","utr3","custom"),
5 5
     type=c("chipseq","rnaseq"),
6 6
     signal=c("coverage","rpm"),
7 7
     genome=c("hg18","hg19","hg38","mm9","mm10","rn5","rn6","dm3","dm6",
... ...
@@ -126,9 +126,16 @@ recoup <- function(
126 126
     #complexHeatmapParams=getDefaultListArgs("complexHeatmapParams"),
127 127
     bamParams=NULL,
128 128
     onTheFly=FALSE, # Directly from BAM w/o Ranges storing, also N/A with BED,
129
-    localDbHome=file.path(path.expand("~"),".recoup"),
129
+    #localDb=file.path(path.expand("~"),".recoup"),
130
+    localDb=file.path(system.file(package="recoup"),"annotation.sqlite"),
130 131
     rc=NULL
131 132
 ) {
133
+    # Simple check to throw error if user us using the old databasing system
134
+    if (any(dir.exists(file.path(dirname(localDb),refdb))))
135
+        stop("Possible old recoup database system detected. Please rebuild ",
136
+            "using the new system or download a pre-built database. See also ",
137
+            "the vignettes.")
138
+    
132 139
     ############################################################################
133 140
     # Begin of simple parameter checking for a new or a restored object
134 141
     if (is.list(input) && !is.null(input$data)) { # Refeeding recoup object
... ...
@@ -147,7 +154,7 @@ recoup <- function(
147 154
         input <- readConfig(input)
148 155
     checkInput(input)
149 156
     if (is.null(names(input)))
150
-        names(input) <- sapply(input,function(x) return(x$id))
157
+        names(input) <- vapply(input,function(x) return(x$id),character(1))
151 158
     
152 159
     # Check if there are any mispelled or invalid parameters and throw a warning
153 160
     checkMainArgs(as.list(match.call()))
... ...
@@ -159,7 +166,7 @@ recoup <- function(
159 166
     signal <- tolower(signal[1])
160 167
     if (!is.null(design) && !is.data.frame(design))
161 168
         checkFileArgs("design",design)
162
-    if (!is.data.frame(genome) && file.exists(genome))
169
+    if (!is.data.frame(genome) && !is.list(genome) && file.exists(genome))
163 170
         checkFileArgs("genome",genome)
164 171
     else if (is.character(genome))
165 172
         checkTextArgs("genome",genome,getSupportedOrganisms(),multiarg=FALSE)
... ...
@@ -200,6 +207,33 @@ recoup <- function(
200 207
     if (type=="rnaseq" && !(genome %in% getSupportedOrganisms()))
201 208
         stop("When type is \"rnaseq\", only the supported genomes are allowed!")
202 209
     
210
+    # annotation must be a list to be fed to buildCustomAnnotation
211
+    if (is.list(genome) && !is.data.frame(genome)) {
212
+        # members are checked by buildCustomAnnotation if required
213
+        # We only need to check that the gtfFile exists here
214
+        if (!("gtf" %in% names(genome)))
215
+            stopp("A gtf field must be provided with an existing GTF file ",
216
+                "when providing a list with custom annotation elements!")
217
+        if ("gtf" %in% names(genome) && is.character(genome$gtf)
218
+            && !file.exists(genome$gtf))
219
+            stop("An existing GTF file must be provided when providing a ",
220
+                "list with custom annotation elements!")
221
+    }
222
+        
223
+    # Check what is going on with genome, first check if file, then localDb
224
+    if (!is.data.frame(genome) && !is.list(genome) && is.character(genome)
225
+        && !file.exists(genome) && is.character(localDb) 
226
+        && file.exists(localDb)) {
227
+        if (!.userOrg(genome,localDb))
228
+            checkTextArgs("genome",genome,c("hg18","hg19","hg38","mm9","mm10",
229
+                "rn5","rn6","dm3","dm6","danrer7","danrer10","pantro4",
230
+                "pantro5","susscr3","susscr11","ecucab2","tair10"),
231
+                multiarg=FALSE)
232
+        if (!.userRefdb(refdb,localDb))
233
+            checkTextArgs("refdb",refdb,c("ensembl","ucsc","refseq"),
234
+                multiarg=FALSE)
235
+    }
236
+    
203 237
     # and list arguments
204 238
     orderByDefault <- getDefaultListArgs("orderBy")
205 239
     binParamsDefault <- getDefaultListArgs("binParams")
... ...
@@ -232,12 +266,12 @@ recoup <- function(
232 266
     strandedParams <- validateListArgs("strandedParams",strandedParams)
233 267
     
234 268
     if (is.null(plotParams$outputBase))
235
-        plotParams$outputBase <- paste(sapply(input,function(x) return(x$id)),
236
-            collapse="-")
269
+        plotParams$outputBase <- paste(vapply(input,function(x) return(x$id),
270
+            character(1)),collapse="-")
237 271
     
238
-    if (any(sapply(input,function(x) return(tolower(x$format)=="bed")))
239
-        && !(preprocessParams$bedGenome %in% c("hg18","hg19","hg38","mm9",
240
-            "mm10","rn5","dm3","danrer7","pantro4","susscr3")))
272
+    bbb <- vapply(input,function(x) return(tolower(x$format)=="bed"),logical(1))
273
+    if (any(bbb) && !(preprocessParams$bedGenome %in% c("hg18","hg19","hg38",
274
+            "mm9","mm10","rn5","dm3","danrer7","pantro4","susscr3")))
241 275
         stop("When short read files are in BED format, either the genome ",
242 276
             "parameter should be one\nof the supported organisms, or ",
243 277
             "preprocessParams$bedGenome must be specified as one of them.")
... ...
@@ -291,8 +325,8 @@ recoup <- function(
291 325
             #    prevCallParams$bamParams,bamParams), ## Unused
292 326
             onTheFly=if (is.null(thisCall$onTheFly)) prevCallParams$onTheFly 
293 327
                 else onTheFly,
294
-            localDbHome=if (is.null(thisCall$localDbHome)) 
295
-                prevCallParams$localDbHome else localDbHome,
328
+            localDb=if (is.null(thisCall$localDb)) 
329
+                prevCallParams$localDb else localDb,
296 330
             rc=if (is.null(thisCall$rc)) prevCallParams$rc else rc,
297 331
             customIsBase=NULL # Additional placeholder
298 332
         )
... ...
@@ -319,7 +353,7 @@ recoup <- function(
319 353
             complexHeatmapParams=complexHeatmapParams,
320 354
             bamParams=bamParams,
321 355
             onTheFly=onTheFly,
322
-            localDbHome=localDbHome,
356
+            localDb=localDb,
323 357
             rc=rc,
324 358
             customIsBase=NULL # Additional placeholder
325 359
         )
... ...
@@ -350,152 +384,127 @@ recoup <- function(
350 384
     complexHeatmapParams <- callParams$complexHeatmapParams
351 385
     bamParams <- callParams$bamParams
352 386
     onTheFly <- callParams$onTheFly
353
-    localDbHome <- callParams$localDbHome
387
+    localDb <- callParams$localDb
354 388
     rc <- callParams$rc
355 389
     # End of complex parameter storage procedure and parameter recall from a
356 390
     # previous call
357 391
     ############################################################################
358 392
     
359 393
     # Continue with actual work
360
-    if (!is.data.frame(genome)) {
394
+    # Annotation case #1: provided strictly as a BED-like data frame with loci
395
+    if (is.data.frame(genome)) {
396
+        if (type=="rnaseq")
397
+            stop("When type=\"rnaseq\", only the usage of a supported or user-",
398
+                "imported organism from GTF file is allowed!")
399
+        rownames(genome) <- as.character(genome[,4])
400
+        genomeRanges <- makeGRangesFromDataFrame(
401
+            df=genome,
402
+            keep.extra.columns=TRUE
403
+        )
404
+    }
405
+    # Annotation case #2: provided strictly as a BED-like file with loci
406
+    else if (!is.list(genome) && !is.data.frame(genome) 
407
+        && is.character(genome)) {
361 408
         if (file.exists(genome)) {
362
-            genome <- read.delim(genome) # Must be bed like
409
+            if (type=="rnaseq")
410
+                stop("When type=\"rnaseq\", only the usage of a supported or ",
411
+                    "user-imported organism from GTF file is allowed!")
412
+            genome <- read.delim(genome)
363 413
             rownames(genome) <- as.character(genome[,4])
364 414
             genomeRanges <- makeGRangesFromDataFrame(
365 415
                 df=genome,
366 416
                 keep.extra.columns=TRUE
367 417
             )
368 418
         }
419
+        # Annotation case #3: use local database or automatically on-the-fly
369 420
         else {
370
-            # Check if local storage has been set
371
-            storageHome <- file.path(localDbHome,refdb,genome)
372
-            if (dir.exists(storageHome)) {
373
-                # Some compatibility with previous annoation stores
374
-                if ("gene.rda" %in% dir(storageHome)) { # Older version
375
-                    warning("Older annotation storage detected! Please ",
376
-                        "rebuild your annotation storage\nusing the ",
377
-                        "buildAnnotationStore function so as to have the\n",
378
-                        "ability of versioning genomic coordinate annotations.",
379
-                        "\nIn the future, older versions may become unusable.",
380
-                        "\n",immediate.=TRUE)
381
-                    storageHomeVersion <- storageHome
382
-                }
383
-                else { # Newer version
384
-                    # Decide on version
385
-                    if (version != "auto") {
386
-                        storageHomeVersion <- file.path(storageHome,version)
387
-                        if (!dir.exists(storageHomeVersion)) {
388
-                            warning("The annotation directory ",
389
-                                storageHomeVersion,
390
-                                " does not seem to exist! Have you run ",
391
-                                "buildAnnotationStorage? Will use newest ",
392
-                                "existing version.",immediate.=TRUE)
393
-                            version <- "auto"
394
-                        }
395
-                    }
396
-                    if (version == "auto") {
397
-                        vers <- suppressWarnings(as.numeric(dir(storageHome)))
398
-                        if (any(is.na(vers))) {
399
-                            of <- vers[which(is.na(vers))]
400
-                            stop("Corrupted annotation storage directory ",
401
-                                "contents! ->",of,"<- Either remove offending ",
402
-                                "files/directories or rebuild.")
403
-                        }
404
-                        vers <- sort(vers,decreasing=TRUE)
405
-                        version <- vers[1]
406
-                        storageHomeVersion <- file.path(storageHome,version)
407
-                    }
408
-                }
409
-                
410
-                if (type=="chipseq") {
411
-                    g <- load(file.path(storageHomeVersion,"gene.rda"))
412
-                    genomeRanges <- gene
413
-                    helperRanges <- NULL
414
-                }
415
-                else if (type=="rnaseq") {
416
-                    # Load the helper ranges anyway
417
-                    g <- load(file.path(storageHomeVersion,"gene.rda"))
418
-                    helperRanges <- gene
419
-                    if (all(flank==0)) {
420
-                        g <- load(file.path(storageHomeVersion,
421
-                            "summarized_exon.rda"))
422
-                        genomeRanges <- sexon
423
-                    }
424
-                    else if (all(flank==500)) {
425
-                        g <- load(file.path(storageHomeVersion,
426
-                            "summarized_exon_F500.rda"))
427
-                        genomeRanges <- flankedSexon
428
-                    }
429
-                    else if (all(flank==1000)) {
430
-                        g <- load(file.path(storageHomeVersion,
431
-                            "summarized_exon_F1000.rda"))
432
-                        genomeRanges <- flankedSexon
433
-                    }
434
-                    else if (all(flank==2000)) {
435
-                        g <- load(file.path(storageHomeVersion,
436
-                            "summarized_exon_F2000.rda"))
437
-                        genomeRanges <- flankedSexon
438
-                    }
439
-                    else if (all(flank==5000)) {
440
-                        g <- load(file.path(storageHomeVersion,
441
-                            "summarized_exon_F5000.rda"))
442
-                        genomeRanges <- flankedSexon
443
-                    }
444
-                    else {
445
-                        warning("When using recoup in RNA-Seq mode, it is ",
446
-                            "much faster to use a precalculated set of ",
447
-                            "regions and their flanks (0, 500, 1000, 2000 and ",
448
-                            "5000 bps supported) and subset this one for a ",
449
-                            "custom set of genes. Otherwise calculations may ",
450
-                            "take too long to complete.",immediate.=TRUE)
451
-                        genomeRanges <- 
452
-                            getMainRnaRangesOnTheFly(helperRanges,flank,rc=rc)
453
-                    }
454
-                }
421
+            if (type=="chipseq") {
422
+                if (region == "utr3")
423
+                    genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
424
+                        type="utr",version=version,summarized=TRUE,db=localDb,
425
+                        rc=rc),
426
+                        error=function(e) {
427
+                            tryCatch({
428
+                                gtfFile <- genome$gtf
429
+                                metadata <- genome
430
+                                metadata$gtf <- NULL
431
+                                genomeRanges <- importCustomAnnotation(gtfFile,
432
+                                    metadata,"utr")
433
+                            },error=function(e) {
434
+                                message("Error ",e)
435
+                                stop("Please provide an existing organism or ",
436
+                                    "a list with annotation metadata and GTF ",
437
+                                    "file!")
438
+                            },finally="")
439
+                        },finally="")
440
+                else
441
+                    genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
442
+                        type="gene",version=version,db=localDb,rc=rc),
443
+                        error=function(e) {
444
+                            tryCatch({
445
+                                gtfFile <- genome$gtf
446
+                                metadata <- genome
447
+                                metadata$gtf <- NULL
448
+                                geneData <- importCustomAnnotation(gtfFile,
449
+                                    metadata,"gene")
450
+                            },error=function(e) {
451
+                                message("Error ",e)
452
+                                stop("Please provide an existing organism or ",
453
+                                    "a list with annotation metadata and GTF ",
454
+                                    "file!")
455
+                            },finally="")
456
+                        },finally="")
457
+            
458
+                helperRanges <- NULL
455 459
             }
456
-            else { # On the fly
457
-                message("Getting annotation on the fly for ",genome," from ",
458
-                    refdb)
459
-                if (type=="chipseq") {
460
-                    genome <- getAnnotation(genome,"gene",refdb=refdb,rc=rc)
461
-                    helperRanges <- NULL
462
-                }
463
-                else if (type=="rnaseq") {
464
-                    helper <- getAnnotation(genome,"gene",refdb=refdb,rc=rc)
465
-                    helperRanges <- makeGRangesFromDataFrame(
466
-                        df=helper,
467
-                        keep.extra.columns=TRUE,
468
-                        seqnames.field="chromosome"
469
-                    )
470
-                    names(helperRanges) <- as.character(helperRanges$gene_id)
471
-                    genome <- getAnnotation(genome,"exon",refdb=refdb,rc=rc)
472
-                    annGr <- makeGRangesFromDataFrame(
473
-                        df=genome,
474
-                        keep.extra.columns=TRUE,
475
-                        seqnames.field="chromosome"
476
-                    )
477
-                    message("Merging exons")
478
-                    annGr <- reduceExons(annGr,rc=rc)
479
-                    names(annGr) <- as.character(annGr$exon_id)
480
-                    genomeRanges <- split(annGr,annGr$gene_id)
481
-                }
460
+            else if (type=="rnaseq") {
461
+                genomeRanges <- tryCatch(loadAnnotation(genome,refdb,
462
+                    type="exon",version=version,summarized=TRUE,db=localDb,
463
+                    rc=rc),
464
+                    error=function(e) {
465
+                        tryCatch({
466
+                            gtfFile <- genome$gtf
467
+                            metadata <- genome
468
+                            metadata$gtf <- NULL
469
+                            genomeRanges <- importCustomAnnotation(gtfFile,
470
+                                metadata,"exon")
471
+                        },error=function(e) {
472
+                            message("Error ",e)
473
+                            stop("Please provide an existing organism or ",
474
+                                    "a list with annotation metadata and GTF ",
475
+                                    "file!")
476
+                        },finally="")
477
+                    },finally="")
478
+                helperRanges <- tryCatch(loadAnnotation(genome,refdb,
479
+                    type="gene",version=version,db=localDb,rc=rc),
480
+                    error=function(e) {
481
+                        tryCatch({
482
+                            gtfFile <- annotation$gtf
483
+                            metadata <- annotation
484
+                            metadata$gtf <- NULL
485
+                            geneData <- importCustomAnnotation(gtfFile,
486
+                                metadata,"gene")
487
+                        },error=function(e) {
488
+                            message("Error ",e)
489
+                            stop("Please provide an existing organism or ",
490
+                                "a list with annotation metadata and GTF ",
491
+                                "file!")
492
+                        },finally="")
493
+                    },finally="")
494
+                if (!is(genomeRanges,"GRangesList"))
495
+                    genomeRanges <- split(genomeRanges,genomeRanges$gene_id)
496
+                helperRanges <- helperRanges[names(genomeRanges)]
482 497
             }
483 498
         }
484 499
     }
485
-    else {
486
-        rownames(genome) <- as.character(genome[,4])
487
-        genomeRanges <- makeGRangesFromDataFrame(
488
-            df=genome,
489
-            keep.extra.columns=TRUE
490
-        )
491
-    }
492 500
     
493 501
     # After genome read, check if we have a valid custom orderer
494 502
     if (!is.null(orderBy$custom)) {
495
-        if (length(orderBy$custom) != nrow(genome)) {
503
+        if (length(orderBy$custom) != length(genomeRanges)) {
496 504
             warning("The custom orderer provide with orderBy parameter does ",
497
-                "not have length equal to the number of elements in genome ",
498
-                "argument and will be ignored!",immediate.=TRUE)
505
+                "not have length equal to the number\nof elements in the ",
506
+                "interrogated genomic regions and will be ignored!",
507
+                immediate.=TRUE)
499 508
             orderBy$custom <- NULL
500 509
         }
501 510
     }
... ...
@@ -550,8 +559,8 @@ recoup <- function(
550 559
                     helperRanges[rownames(design)]
551 560
                 },error=function(e) {
552 561
                     stop("Unexpected error occured! Are you sure that element ",
553
-                        "(row) names in the design file are of the same type as ",
554
-                        "the genome file?")
562
+                        "(row) names in the design file are of the same type ",
563
+                        "as the genome file?")
555 564
                 },finally={})
556 565
         }
557 566
         # ...but maybe the names are incompatible
... ...
@@ -577,7 +586,8 @@ recoup <- function(
577 586
     }
578 587
     
579 588
     # Now we must follow two paths according to region type, genebody and custom
580
-    # areas with equal/unequal widths, or tss, tes and 1-width custom areas.
589
+    # areas with equal/unequal widths and utr3 or tss, tes and 1-width custom 
590
+    # areas.
581 591
     callParams$customIsBase <- FALSE
582 592
     if (region=="custom" && all(width(genomeRanges)==1)) {
583 593
         if (all(flank==0)) {
... ...
@@ -612,20 +622,23 @@ recoup <- function(
612 622
     # TODO: When input ranges are custom, genome should be given to make Seqinfo
613 623
     if (type == "chipseq") # Otherwise, throw error with GRangesList
614 624
         mainRanges <- trim(intermRanges$mainRanges)
625
+    else if (type == "rnaseq")
626
+        mainRanges <- intermRanges$mainRanges
615 627
     bamRanges <- trim(intermRanges$bamRanges)
616 628
 
617 629
     # Here we must write code for the reading and normalization of bam files
618 630
     # The preprocessRanges function looks if there is a valid (not null) ranges
619 631
     # field in input
620
-    #if (!onTheFly)
621
-    #input <- preprocessRanges(input,preprocessParams,bamParams=bamParams,rc=rc)
622
-    input <- preprocessRanges(input,preprocessParams,genome,bamRanges,
623
-        bamParams=bamParams,rc=rc)
632
+    if (!onTheFly)
633
+        input <- preprocessRanges(input,preprocessParams,genome,bamRanges,
634
+            bamParams=bamParams,rc=rc)
624 635
     
625 636
     # At this point we must apply the fraction parameter if <1. We choose this
626 637
     # point in order not to restrict later usage of the read ranges and since it
627
-    # does not take much time to load into memory.
628
-    if (fraction<1) {
638
+    # does not take much time to load into memory. In addition, this operation
639
+    # cannot be applied when streaming BAMs. In that case, user must subsample
640
+    # outside recoup.
641
+    if (!onTheFly && fraction<1) {
629 642
         newSize <- round(fraction*length(mainRanges))
630 643
         #set.seed(preprocessParams$seed)
631 644
         refIndex <- sort(sample(length(mainRanges),newSize))
... ...
@@ -644,70 +657,51 @@ recoup <- function(
644 657
         }
645 658
     }
646 659
 
647
-    # Remove unwanted seqnames from reference ranges
648
-    chrs <- unique(unlist(lapply(input,function(x) {
649
-        if (x$format %in% c("bam","bed"))
650
-            return(as.character(runValue(seqnames(x$ranges))))
651
-        else if (x$format=="bigwig") {
652
-            if (!requireNamespace("rtracklayer"))
653
-                stop("R package rtracklayer is required to read and import ",
654
-                    "BigWig files!")
655
-            return(as.character(seqnames(seqinfo(BigWigFile(x$file)))))
660
+    # Remove unwanted seqnames from reference ranges (if not on the fly) and
661
+    # properly subset ranges if not all chromosomes are provided with BAM
662
+    if (!onTheFly) {
663
+        chrs <- unique(unlist(lapply(input,function(x) {
664
+            if (x$format %in% c("bam","bed"))
665
+                return(as.character(runValue(seqnames(x$ranges))))
666
+            else if (x$format=="bigwig") {
667
+                if (!requireNamespace("rtracklayer"))
668
+                    stop("R package rtracklayer is required to read and ",
669
+                        "import BigWig files!")
670
+                return(as.character(seqnames(seqinfo(BigWigFile(x$file)))))
671
+            }
672
+        })))
673
+        if (type=="chipseq") {
674
+            #keep <- which(as.character(seqnames(mainRanges)) %in% chrs)
675
+            #mainRanges <- mainRanges[keep]
676
+            mainRanges <- .subsetGRangesByChrs(mainRanges,chrs)
677
+        }
678
+        else if (type=="rnaseq") {
679
+            #keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
680
+            #helperRanges <- helperRanges[keeph]
681
+            #mainRanges <- mainRanges[names(helperRanges)]
682
+            helperRanges <- .subsetGRangesByChrs(helperRanges,chrs)
683
+            mainRanges <- .subsetGRangesByChrs(mainRanges,chrs)
684
+            
685
+            ####################################################################
686
+            ## There must be an R bug with `lengths` here as although it runs in
687
+            ## Rcmd, it does not pass package building or vignette kniting...  
688
+            ## But for the time being it seems that it is not needed as the name
689
+            ## filtering works
690
+            #lens <- which(lengths(genomeRanges)==0)
691
+            #if (length(lens)>0)
692
+            #    genomeRanges[lens] <- NULL
693
+            ####################################################################
656 694
         }
657
-    })))
658
-    if (type=="chipseq") {
659
-        keep <- which(as.character(seqnames(mainRanges)) %in% chrs)
660
-        mainRanges <- mainRanges[keep]
661
-    }
662
-    else if (type=="rnaseq") {
663
-        keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
664
-        helperRanges <- helperRanges[keeph]
665
-        mainRanges <- mainRanges[names(helperRanges)]
666
-        ########################################################################
667
-        ## There must be an R bug with `lengths` here as although it runs in 
668
-        ## Rcmd, it does not pass package building or vignette kniting... But 
669
-        ## for the time being it seems that it is not needed as the name 
670
-        ## filtering works
671
-        #lens <- which(lengths(genomeRanges)==0)
672
-        #if (length(lens)>0)
673
-        #    genomeRanges[lens] <- NULL
674
-        ########################################################################
675 695
     }
676 696
     
677
-    #if (type=="chipseq")
678
-    #    input <- coverageRef(input,genomeRanges,region,flank,strandedParams,
679
-    #        rc=rc)#,bamParams)
680
-    #else if (type=="rnaseq")
681
-    #    input <- coverageRnaRef(input,genomeRanges,helperRanges,flank,
682
-    #        strandedParams,rc=rc)#,bamParams)
683
-    #if (type=="chipseq")
684
-    #    input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
685
-    #else if (type=="rnaseq") #{
686
-        #if (flank[1]==0 && flank[2]==0)
687
-        #    mainRnaRanges <- genomeRanges
688
-        #else {
689
-        #    mainRnaRanges <- tryCatch(
690
-        #        loadMainRnaRanges(genome,refdb,flank),
691
-        #        error=function(e) { # Insanely slow fallback switch
692
-        #            warning("The requested flanking regions are not ",
693
-        #                "compatible with any of the precalculated ones. They ",
694
-        #                "will be calculated on the fly which is a lenghty ",
695
-        #                "process. If you wish to use precalculated flanking ",
696
-        #                "regions, stop the execution and adjust the flanking ",
697
-        #                "parameter.",immediate.=TRUE)
698
-        #            getMainRnaRangesOnTheFly(helperRanges,flank,rc=rc)
699
-        #        },finally=""
700
-        #    )
701
-        #}
702
-        #input <- coverageRnaRef(input,mainRanges,strandedParams,rc=rc)
703
-    #}
704 697
     if(signal == "coverage")
705 698
         input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
706 699
     else if (signal == "rpm")
707 700
         input <- rpMatrix(input,mainRanges,flank,binParams,strandedParams,rc=rc)
708 701
     
709 702
     # If normalization method is linear, we must adjust the coverages
710
-    if (preprocessParams$normalize == "linear") {
703
+    # TODO: Check for onTheFly in the beginning
704
+    if (preprocessParams$normalize == "linear" && !onTheFly) {
711 705
         linFac <- calcLinearFactors(input,preprocessParams)
712 706
         for (n in names(input)) {
713 707
             if (linFac[n]==1)
... ...
@@ -724,13 +718,12 @@ recoup <- function(
724 718
         }
725 719
     }
726 720
     
727
-    # Now we must summarize and create the matrices. If genebody or unequal 
728
-    # custom lengths, bin is a must, else we leave to user
721
+    # Now we must summarize and create the matrices. If genebody, utr3 or 
722
+    # unequal custom lengths, bin is a must, else we leave to user
729 723
     mustBin <- FALSE
730
-    if (region=="genebody")
724
+    if (region=="genebody" || region=="utr3")
731 725
         mustBin <- TRUE
732 726
     if (region=="custom") {
733
-        #w <- width(genomeRanges)
734 727
         w <- width(mainRanges)
735 728
         if (any(w!=w[1]))
736 729
             mustBin <- TRUE
... ...
@@ -738,8 +731,8 @@ recoup <- function(
738 731
     if (mustBin) {
739 732
         if (binParams$regionBinSize==0) {
740 733
             warning("Central region bin size not set for a region that must ",
741
-                "be binned! Setting to 1000...",immediate.=TRUE)
742
-            binParams$regionBinSize <- 1000
734
+                "be binned! Setting to 200...",immediate.=TRUE)
735
+            binParams$regionBinSize <- 200
743 736
         }
744 737
     }
745 738
     
... ...
@@ -763,7 +756,7 @@ recoup <- function(
763 756
     
764 757
     # Coverages and profiles calculated... Now depending on plot option, we go 
765 758
     # further or return the enriched input object for saving
766
-    if (!plotParams$profile && !plotParams$heatmap) {
759
+    if (!plotParams$profile && !plotParams$heatmap && !plotParams$correlation) {
767 760
         recoupObj <- toOutput(input,design,saveParams,callParams=callParams)
768 761
         return(recoupObj)
769 762
     }
Browse code

Functionality and performance updates, fixes for scalar/vector comparisons

Panagiotis Moulos authored on 04/03/2020 15:28:39
Showing 1 changed files
1 1
old mode 100644
2 2
new mode 100755
... ...
@@ -3,6 +3,7 @@ recoup <- function(
3 3
     design=NULL,
4 4
     region=c("genebody","tss","tes","custom"),
5 5
     type=c("chipseq","rnaseq"),
6
+    signal=c("coverage","rpm"),
6 7
     genome=c("hg18","hg19","hg38","mm9","mm10","rn5","rn6","dm3","dm6",
7 8
         "danrer7","danrer10","pantro4","pantro5","susscr3","susscr11",
8 9
         "ecucab2","tair10"),
... ...
@@ -21,9 +22,11 @@ recoup <- function(
21 22
         regionBinSize=0,
22 23
         sumStat=c("mean","median"),
23 24
         interpolation=c("auto","spline","linear","neighborhood"),
25
+        binType=c("variable","fixed"),
24 26
         forceHeatmapBinning=TRUE,
25 27
         forcedBinSize=c(50,200),
26
-        chunking=FALSE
28
+        chunking=FALSE#,
29
+        #seed=42
27 30
     ),
28 31
     #binParams=getDefaultListArgs("binParams"),
29 32
     selector=NULL,
... ...
@@ -37,8 +40,8 @@ recoup <- function(
37 40
         spliceRemoveQ=0.75,
38 41
         #bedGenome=ifelse(genome %in% c("hg18","hg19","hg38","mm9","mm10","rn5",
39 42
         #    "dm3","danrer7","pantro4","susscr3"),genome,NA),
40
-        bedGenome=NA,
41
-        seed=42
43
+        bedGenome=NA#,
44
+        #seed=42
42 45
     ),
43 46
     #preprocessParams=getDefaultListArgs("preprocessParams"),
44 47
     plotParams=list(
... ...
@@ -75,8 +78,8 @@ recoup <- function(
75 78
         nstart=20,
76 79
         algorithm=c("Hartigan-Wong","Lloyd","Forgy","MacQueen"),
77 80
         iterMax=20,
78
-        reference=NULL,
79
-        seed=42
81
+        reference=NULL#,
82
+        #seed=42
80 83
     ),
81 84
     #kmParams=getDefaultListArgs("kmParams"),
82 85
     strandedParams=list(
... ...
@@ -93,7 +96,7 @@ recoup <- function(
93 96
         strip.text.x=element_text(size=10,face="bold"),
94 97
         strip.text.y=element_text(size=10,face="bold"),
95 98
         legend.position="bottom",
96
-        panel.margin=grid::unit(1,"lines")
99
+        panel.spacing=grid::unit(1,"lines")
97 100
     ),
98 101
     #ggplotParams=getDefaultListArgs("ggplotParams"),
99 102
     complexHeatmapParams=list(
... ...
@@ -153,6 +156,7 @@ recoup <- function(
153 156
     region <- tolower(region[1])
154 157
     refdb <- tolower(refdb[1])
155 158
     type <- tolower(type[1])
159
+    signal <- tolower(signal[1])
156 160
     if (!is.null(design) && !is.data.frame(design))
157 161
         checkFileArgs("design",design)
158 162
     if (!is.data.frame(genome) && file.exists(genome))
... ...
@@ -161,6 +165,7 @@ recoup <- function(
161 165
         checkTextArgs("genome",genome,getSupportedOrganisms(),multiarg=FALSE)
162 166
     checkTextArgs("refdb",refdb,getSupportedRefDbs(),multiarg=FALSE)
163 167
     checkTextArgs("type",type,c("chipseq","rnaseq"),multiarg=FALSE)
168
+    checkTextArgs("signal",signal,c("coverage","rpm"),multiarg=FALSE)
164 169
     checkNumArgs("fraction",fraction,"numeric",c(0,1),"botheq")
165 170
     if (any(flank<0))
166 171
         stop("The minimum flanking allowed is 0 bp")
... ...
@@ -189,6 +194,12 @@ recoup <- function(
189 194
         preprocessParams$fragLen <- NA
190 195
     }
191 196
     
197
+    # If type is rnaseq, the only allowed genomes are the ones supported by
198
+    # recoup for the time being. In the future, a custom RNA-Seq genome may be
199
+    # constructed from a GFF or like...
200
+    if (type=="rnaseq" && !(genome %in% getSupportedOrganisms()))
201
+        stop("When type is \"rnaseq\", only the supported genomes are allowed!")
202
+    
192 203
     # and list arguments
193 204
     orderByDefault <- getDefaultListArgs("orderBy")
194 205
     binParamsDefault <- getDefaultListArgs("binParams")
... ...
@@ -246,6 +257,8 @@ recoup <- function(
246 257
             region=if (is.null(thisCall$region)) prevCallParams$region else
247 258
                 region,
248 259
             type=if (is.null(thisCall$type)) prevCallParams$type else type,
260
+            signal=if (is.null(thisCall$signal)) prevCallParams$signal 
261
+                else signal,
249 262
             genome=if (is.null(thisCall$genome)) prevCallParams$genome else
250 263
                 genome,
251 264
             refdb=if (is.null(thisCall$refdb)) prevCallParams$refdb else
... ...
@@ -288,6 +301,7 @@ recoup <- function(
288 301
         callParams <- list(
289 302
             region=region,
290 303
             type=type,
304
+            signal=signal,
291 305
             genome=genome,
292 306
             version=version,
293 307
             refdb=refdb,
... ...
@@ -316,27 +330,28 @@ recoup <- function(
316 330
     input <- decideChanges(input,callParams,prevCallParams)
317 331
     
318 332
     # Redefine all final arguments for this call
319
-    region=callParams$region
320
-    type=callParams$type
321
-    genome=callParams$genome
322
-    version=callParams$version
323
-    refdb=callParams$refdb
324
-    flank=callParams$flank
325
-    fraction=callParams$fraction
326
-    orderBy=callParams$orderBy
327
-    binParams=callParams$binParams
328
-    selector=callParams$selector
329
-    preprocessParams=callParams$preprocessParams
330
-    plotParams=callParams$plotParams
331
-    saveParams=callParams$saveParams
332
-    kmParams=callParams$kmParams
333
-    strandedParams=callParams$strandedParams
334
-    ggplotParams=callParams$ggplotParams
335
-    complexHeatmapParams=callParams$complexHeatmapParams
336
-    bamParams=callParams$bamParams
337
-    onTheFly=callParams$onTheFly
338
-    localDbHome=callParams$localDbHome
339
-    rc=callParams$rc
333
+    region <- callParams$region
334
+    type <- callParams$type
335
+    singnal <- callParams$signal
336
+    genome <- callParams$genome
337
+    version <- callParams$version
338
+    refdb <- callParams$refdb
339
+    flank <- callParams$flank
340
+    fraction <- callParams$fraction
341
+    orderBy <- callParams$orderBy
342
+    binParams <- callParams$binParams
343
+    selector <- callParams$selector
344
+    preprocessParams <- callParams$preprocessParams
345
+    plotParams <- callParams$plotParams
346
+    saveParams <- callParams$saveParams
347
+    kmParams <- callParams$kmParams
348
+    strandedParams <- callParams$strandedParams
349
+    ggplotParams <- callParams$ggplotParams
350
+    complexHeatmapParams <- callParams$complexHeatmapParams
351
+    bamParams <- callParams$bamParams
352
+    onTheFly <- callParams$onTheFly
353
+    localDbHome <- callParams$localDbHome
354
+    rc <- callParams$rc
340 355
     # End of complex parameter storage procedure and parameter recall from a
341 356
     # previous call
342 357
     ############################################################################
... ...
@@ -358,11 +373,11 @@ recoup <- function(
358 373
                 # Some compatibility with previous annoation stores
359 374
                 if ("gene.rda" %in% dir(storageHome)) { # Older version
360 375
                     warning("Older annotation storage detected! Please ",
361
-                        "rebuild your annotation storage using the ",
362
-                        "buildAnnotationStore function\nso as to have the ",
376
+                        "rebuild your annotation storage\nusing the ",
377
+                        "buildAnnotationStore function so as to have the\n",
363 378
                         "ability of versioning genomic coordinate annotations.",
364 379
                         "\nIn the future, older versions may become unusable.",
365
-                        immediate.=TRUE)
380
+                        "\n",immediate.=TRUE)
366 381
                     storageHomeVersion <- storageHome
367 382
                 }
368 383
                 else { # Newer version
... ...
@@ -423,7 +438,7 @@ recoup <- function(
423 438
                     }
424 439
                     else if (all(flank==5000)) {
425 440
                         g <- load(file.path(storageHomeVersion,
426
-                            "summarized_exon_F2000.rda"))
441
+                            "summarized_exon_F5000.rda"))
427 442
                         genomeRanges <- flankedSexon
428 443
                     }
429 444
                     else {
... ...
@@ -588,8 +603,16 @@ recoup <- function(
588 603
     # then work with these later on
589 604
     intermRanges <- getMainRanges(genomeRanges,helperRanges=helperRanges,type,
590 605
         region,flank,rc=rc)
591
-    mainRanges <- intermRanges$mainRanges
592
-    bamRanges <- intermRanges$bamRanges
606
+    # It is very important that all the Ranges have Seqinfo objects attached as
607
+    # they have to be trimmed for potential extensions (due to flanking regions)
608
+    # beyond chromosome lengths. The built-in annotations do provide this option
609
+    # otherwise when a custom genome/areas is/are provided, the genome of 
610
+    # interest should be provided too. Or if not provided, the user will be
611
+    # responsible for any crash.
612
+    # TODO: When input ranges are custom, genome should be given to make Seqinfo
613
+    if (type == "chipseq") # Otherwise, throw error with GRangesList
614
+        mainRanges <- trim(intermRanges$mainRanges)
615
+    bamRanges <- trim(intermRanges$bamRanges)
593 616
 
594 617
     # Here we must write code for the reading and normalization of bam files
595 618
     # The preprocessRanges function looks if there is a valid (not null) ranges
... ...
@@ -602,41 +625,15 @@ recoup <- function(
602 625
     # At this point we must apply the fraction parameter if <1. We choose this
603 626
     # point in order not to restrict later usage of the read ranges and since it
604 627
     # does not take much time to load into memory.
605
-    #if (fraction<1) {
606
-    #    newSize <- round(fraction*length(genomeRanges))
607
-    #    set.seed(preprocessParams$seed)
608
-    #    refIndex <- sort(sample(length(genomeRanges),newSize))
609
-    #    genomeRanges <- genomeRanges[refIndex]
610
-    #    if (type=="rnaseq")
611
-    #        helperRanges <- helperRanges[refIndex]
612
-    #    for (i in 1:length(input)) {
613
-    #        if (!is.null(input[[i]]$ranges)) {
614
-    #            newSize <- round(fraction*length(input[[i]]$ranges))
615
-    #            set.seed(preprocessParams$seed)
616
-    #            fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
617
-    #            input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
618
-    #        }
619
-    #        if (!is.null(input[[i]]$coverage)) {
620
-    #            #if (!is.null(input[[i]]$coverage$center)) {
621
-    #            #    input[[i]]$coverage$center <- 
622
-    #            #        input[[i]]$coverage$center[refIndex]
623
-    #            #}
624
-    #            #else
625
-    #            input[[i]]$coverage <- input[[i]]$coverage[refIndex]
626
-    #        }
627
-    #        if (!is.null(input[[i]]$profile))
628
-    #            input[[i]]$profile <- input[[i]]$profile[refIndex,]
629
-    #    }
630
-    #}
631 628
     if (fraction<1) {
632 629
         newSize <- round(fraction*length(mainRanges))
633
-        set.seed(preprocessParams$seed)
630
+        #set.seed(preprocessParams$seed)
634 631
         refIndex <- sort(sample(length(mainRanges),newSize))
635 632
         mainRanges <- mainRanges[refIndex]
636 633
         for (i in 1:length(input)) {
637 634
             if (!is.null(input[[i]]$ranges)) {
638 635
                 newSize <- round(fraction*length(input[[i]]$ranges))
639
-                set.seed(preprocessParams$seed)
636
+                #set.seed(preprocessParams$seed)
640 637
                 fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
641 638
                 input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
642 639
             }
... ...
@@ -658,24 +655,6 @@ recoup <- function(
658 655
             return(as.character(seqnames(seqinfo(BigWigFile(x$file)))))
659 656
         }
660 657
     })))
661
-    #if (type=="chipseq") {
662
-    #    keep <- which(as.character(seqnames(genomeRanges)) %in% chrs)
663
-    #    genomeRanges <- genomeRanges[keep]
664
-    #}
665
-    #else if (type=="rnaseq") {
666
-    #    keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
667
-    #    helperRanges <- helperRanges[keeph]
668
-    #    genomeRanges <- genomeRanges[names(helperRanges)]
669
-    #    ########################################################################
670
-    #    ## There must be an R bug with `lengths` here as although it runs in 
671
-    #    ## Rcmd, it does not pass package building or vignette kniting... But 
672
-    #    ## for the time being it seems that it is not needed as the name 
673
-    #    ## filtering works
674
-    #    #lens <- which(lengths(genomeRanges)==0)
675
-    #    #if (length(lens)>0)
676
-    #    #    genomeRanges[lens] <- NULL
677
-    #    ########################################################################
678
-    #}
679 658
     if (type=="chipseq") {
680 659
         keep <- which(as.character(seqnames(mainRanges)) %in% chrs)
681 660
         mainRanges <- mainRanges[keep]
... ...
@@ -701,9 +680,9 @@ recoup <- function(
701 680
     #else if (type=="rnaseq")
702 681
     #    input <- coverageRnaRef(input,genomeRanges,helperRanges,flank,
703 682
     #        strandedParams,rc=rc)#,bamParams)
704
-    if (type=="chipseq")
705
-        input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
706
-    else if (type=="rnaseq") #{
683
+    #if (type=="chipseq")
684
+    #    input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
685
+    #else if (type=="rnaseq") #{
707 686
         #if (flank[1]==0 && flank[2]==0)
708 687
         #    mainRnaRanges <- genomeRanges
709 688
         #else {
... ...
@@ -720,26 +699,28 @@ recoup <- function(
720 699
         #        },finally=""
721 700
         #    )
722 701
         #}
723
-        input <- coverageRnaRef(input,mainRanges,strandedParams,rc=rc)
702
+        #input <- coverageRnaRef(input,mainRanges,strandedParams,rc=rc)
724 703
     #}
704
+    if(signal == "coverage")
705
+        input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
706
+    else if (signal == "rpm")
707
+        input <- rpMatrix(input,mainRanges,flank,binParams,strandedParams,rc=rc)
708
+    
725 709
     # If normalization method is linear, we must adjust the coverages
726
-    if (preprocessParams$normalize=="linear") {
710
+    if (preprocessParams$normalize == "linear") {
727 711
         linFac <- calcLinearFactors(input,preprocessParams)
728 712
         for (n in names(input)) {
729 713
             if (linFac[n]==1)
730 714
                 next
731
-            #if (is.null(input[[n]]$coverage$center))
732
-            input[[n]]$coverage <- cmclapply(input[[n]]$coverage,
715
+            if (signal == "coverage")
716
+                input[[n]]$coverage <- cmclapply(input[[n]]$coverage,
733 717
                 function(x,f) {
734 718
                     return(x*f)
735
-                },linFac[n]
736
-            )
737
-            #else {
738
-            #    input[[n]]$coverage$center <- 
739
-            #        cmclapply(input[[n]]$coverage$center,function(x,f) {
740
-            #            return(x*f)
741
-            #        },linFac[n])
742
-            #}
719
+                },linFac[n])
720
+            else if (signal == "rpm")
721
+                input[[n]]$profile <- apply(input[[n]]$profile,1,function(x,f) {
722
+                    return(x*f)
723
+                },linFac[n])
743 724
         }
744 725
     }
745 726
     
... ...
@@ -763,15 +744,17 @@ recoup <- function(
763 744
     }
764 745
     
765 746
     # Chunking?
766
-    if (binParams$chunking) {
767
-        if (type=="chipseq")
768
-            binParams$chunks <- split(1:length(genomeRanges),
769
-                as.character(seqnames(genomeRanges)))
770
-        else if (type=="rnaseq")
771
-            binParams$chunks <- split(1:length(helperRanges),
772
-                as.character(seqnames(helperRanges)))
747
+    if (signal == "coverage") { # Otherwise, matrix calculate by rpm
748
+        if (binParams$chunking) {
749
+            if (type=="chipseq")
750
+                binParams$chunks <- split(1:length(genomeRanges),
751
+                    as.character(seqnames(genomeRanges)))
752
+            else if (type=="rnaseq")
753
+                binParams$chunks <- split(1:length(helperRanges),
754
+                    as.character(seqnames(helperRanges)))
755
+        }
756
+        input <- profileMatrix(input,flank,binParams,rc)
773 757
     }
774
-    input <- profileMatrix(input,flank,binParams,rc)
775 758
     
776 759
     # Perform the k-means clustering if requested and append to design (which
777 760
     # has been checked, if we are allowed to do so)
Browse code

Redesigned the local annotation build system. Fixed minor bugs.

Panagiotis Moulos authored on 25/04/2018 18:49:45
Showing 1 changed files
... ...
@@ -3,8 +3,10 @@ recoup <- function(
3 3
     design=NULL,
4 4
     region=c("genebody","tss","tes","custom"),
5 5
     type=c("chipseq","rnaseq"),
6
-    genome=c("hg18","hg19","hg38","mm9","mm10","rn5","dm3","danrer7","pantro4",
7
-        "susscr3"),
6
+    genome=c("hg18","hg19","hg38","mm9","mm10","rn5","rn6","dm3","dm6",
7
+        "danrer7","danrer10","pantro4","pantro5","susscr3","susscr11",
8
+        "ecucab2","tair10"),
9
+    version="auto",
8 10
     refdb=c("ensembl","ucsc","refseq"),
9 11
     flank=c(2000,2000),
10 12
     fraction=1,
... ...
@@ -27,8 +29,8 @@ recoup <- function(
27 29
     selector=NULL,
28 30
     #selector=getDefaultListArgs("selector"),
29 31
     preprocessParams=list(
30
-		fragLen=NA,
31
-		cleanLevel=c(0,1,2,3),
32
+        fragLen=NA,
33
+        cleanLevel=c(0,1,2,3),
32 34
         normalize=c("none","linear","downsample","sampleto"),
33 35
         sampleTo=1e+6,
34 36
         spliceAction=c("split","keep","remove"),
... ...
@@ -156,16 +158,22 @@ recoup <- function(
156 158
     if (!is.data.frame(genome) && file.exists(genome))
157 159
         checkFileArgs("genome",genome)
158 160
     else if (is.character(genome))
159
-        checkTextArgs("genome",genome,c("hg18","hg19","hg38","mm9","mm10","rn5",
160
-            "dm3","danrer7","pantro4","susscr3","tair10"),multiarg=FALSE)
161
-    checkTextArgs("refdb",refdb,c("ensembl","ucsc","refseq"),multiarg=FALSE)
161
+        checkTextArgs("genome",genome,getSupportedOrganisms(),multiarg=FALSE)
162
+    checkTextArgs("refdb",refdb,getSupportedRefDbs(),multiarg=FALSE)
162 163
     checkTextArgs("type",type,c("chipseq","rnaseq"),multiarg=FALSE)
163 164
     checkNumArgs("fraction",fraction,"numeric",c(0,1),"botheq")
164 165
     if (any(flank<0))
165 166
         stop("The minimum flanking allowed is 0 bp")
166 167
     if (any(flank>50000))
167 168
         stop("The maximum flanking allowed is 50000 bp")
168
-        
169
+    # Check the version argument
170
+    version <- version[1]
171
+    if (is.character(version)) {
172
+        version <- tolower(version)
173
+        checkTextArgs("version",version,c("auto"),multiarg=FALSE)
174
+    }
175
+    else
176
+        checkNumArgs("version",version,"numeric")
169 177
     # If type is rnaseq, only genebody plots are valid
170 178
     if (type=="rnaseq" && region!="genebody") {
171 179
         warning("When type is \"rnaseq\", plots can be created only on ",
... ...
@@ -175,11 +183,11 @@ recoup <- function(
175 183
     
176 184
     # If type is rnaseq, read extension is illegal because of potential splicing
177 185
     if (type=="rnaseq" && !is.null(preprocessParams$fragLen)
178
-		&& !is.na(preprocessParams$fragLen)) {
179
-		warning("When type is \"rnaseq\", read extension/reduction should not ",
186
+        && !is.na(preprocessParams$fragLen)) {
187
+        warning("When type is \"rnaseq\", read extension/reduction should not ",
180 188
             "happen because of potential splicing! Ignoring...",immediate.=TRUE)
181
-		preprocessParams$fragLen <- NA
182
-	}
189
+        preprocessParams$fragLen <- NA
190
+    }
183 191
     
184 192
     # and list arguments
185 193
     orderByDefault <- getDefaultListArgs("orderBy")
... ...
@@ -242,6 +250,8 @@ recoup <- function(
242 250
                 genome,
243 251
             refdb=if (is.null(thisCall$refdb)) prevCallParams$refdb else
244 252
                 refdb,
253
+            version=if (is.null(thisCall$version)) prevCallParams$version else
254
+                version,
245 255
             flank=if (is.null(thisCall$flank)) prevCallParams$flank else flank,
246 256
             fraction=if (is.null(thisCall$fraction)) prevCallParams$fraction
247 257
                 else fraction,
... ...
@@ -279,6 +289,7 @@ recoup <- function(
279 289
             region=region,
280 290
             type=type,
281 291
             genome=genome,
292
+            version=version,
282 293
             refdb=refdb,
283 294
             flank=flank,
284 295
             fraction=fraction,
... ...
@@ -308,6 +319,7 @@ recoup <- function(
308 319
     region=callParams$region
309 320
     type=callParams$type
310 321
     genome=callParams$genome
322
+    version=callParams$version
311 323
     refdb=callParams$refdb
312 324
     flank=callParams$flank
313 325
     fraction=callParams$fraction
... ...
@@ -341,38 +353,76 @@ recoup <- function(
341 353
         }
342 354
         else {
343 355
             # Check if local storage has been set
344
-            if (dir.exists(file.path(localDbHome,refdb,genome))) {
356
+            storageHome <- file.path(localDbHome,refdb,genome)
357
+            if (dir.exists(storageHome)) {
358
+                # Some compatibility with previous annoation stores
359
+                if ("gene.rda" %in% dir(storageHome)) { # Older version
360
+                    warning("Older annotation storage detected! Please ",
361
+                        "rebuild your annotation storage using the ",
362
+                        "buildAnnotationStore function\nso as to have the ",
363
+                        "ability of versioning genomic coordinate annotations.",
364
+                        "\nIn the future, older versions may become unusable.",
365
+                        immediate.=TRUE)
366
+                    storageHomeVersion <- storageHome
367
+                }
368
+                else { # Newer version
369
+                    # Decide on version
370
+                    if (version != "auto") {
371
+                        storageHomeVersion <- file.path(storageHome,version)
372
+                        if (!dir.exists(storageHomeVersion)) {
373
+                            warning("The annotation directory ",
374
+                                storageHomeVersion,
375
+                                " does not seem to exist! Have you run ",
376
+                                "buildAnnotationStorage? Will use newest ",
377
+                                "existing version.",immediate.=TRUE)
378
+                            version <- "auto"
379
+                        }
380
+                    }
381
+                    if (version == "auto") {
382
+                        vers <- suppressWarnings(as.numeric(dir(storageHome)))
383
+                        if (any(is.na(vers))) {
384
+                            of <- vers[which(is.na(vers))]
385
+                            stop("Corrupted annotation storage directory ",
386
+                                "contents! ->",of,"<- Either remove offending ",
387
+                                "files/directories or rebuild.")
388
+                        }
389
+                        vers <- sort(vers,decreasing=TRUE)
390
+                        version <- vers[1]
391
+                        storageHomeVersion <- file.path(storageHome,version)
392
+                    }
393
+                }
394
+                
345 395
                 if (type=="chipseq") {
346
-                    g <- load(file.path(localDbHome,refdb,genome,"gene.rda"))
396
+                    g <- load(file.path(storageHomeVersion,"gene.rda"))
347 397
                     genomeRanges <- gene
348 398
                     helperRanges <- NULL
349 399
                 }
350 400
                 else if (type=="rnaseq") {
351 401
                     # Load the helper ranges anyway
352
-                    g <- load(file.path(localDbHome,refdb,genome,"gene.rda"))
402
+                    g <- load(file.path(storageHomeVersion,"gene.rda"))
353 403
                     helperRanges <- gene
354 404
                     if (all(flank==0)) {
355
-                        g <- load(file.path(localDbHome,refdb,genome,
405
+                        g <- load(file.path(storageHomeVersion,
356 406
                             "summarized_exon.rda"))
357 407
                         genomeRanges <- sexon
358 408
                     }
359 409
                     else if (all(flank==500)) {
360
-                        g <- load(file.path(localDbHome,refdb,genome,
410
+                        g <- load(file.path(storageHomeVersion,
361 411
                             "summarized_exon_F500.rda"))
362 412
                         genomeRanges <- flankedSexon
363 413
                     }
364 414
                     else if (all(flank==1000)) {
365
-                        g <- load(file.path(localDbHome,refdb,genome,
415
+                        g <- load(file.path(storageHomeVersion,
366 416
                             "summarized_exon_F1000.rda"))
367 417
                         genomeRanges <- flankedSexon
368 418
                     }
369 419
                     else if (all(flank==2000)) {
370
-                        g <- load(file.path(localDbHome,refdb,genome,
420
+                        g <- load(file.path(storageHomeVersion,
371 421
                             "summarized_exon_F2000.rda"))
372 422
                         genomeRanges <- flankedSexon
373 423
                     }
374 424
                     else if (all(flank==5000)) {
375
-                        g <- load(file.path(localDbHome,refdb,genome,
425
+                        g <- load(file.path(storageHomeVersion,
376 426
                             "summarized_exon_F2000.rda"))
377 427
                         genomeRanges <- flankedSexon
378 428
                     }
... ...
@@ -404,15 +454,15 @@ recoup <- function(
404 454
                     )
405 455
                     names(helperRanges) <- as.character(helperRanges$gene_id)
406 456
                     genome <- getAnnotation(genome,"exon",refdb=refdb,rc=rc)
407
-                    ann.gr <- makeGRangesFromDataFrame(
457
+                    annGr <- makeGRangesFromDataFrame(
408 458
                         df=genome,
409 459
                         keep.extra.columns=TRUE,
410 460
                         seqnames.field="chromosome"
411 461
                     )
412 462
                     message("Merging exons")
413
-                    ann.gr <- reduceExons(ann.gr,rc=rc)
414
-                    names(ann.gr) <- as.character(ann.gr$exon_id)
415
-                    genomeRanges <- split(ann.gr,ann.gr$gene_id)
463
+                    annGr <- reduceExons(annGr,rc=rc)
464
+                    names(annGr) <- as.character(annGr$exon_id)
465
+                    genomeRanges <- split(annGr,annGr$gene_id)
416 466
                 }
417 467
             }
418 468
         }
... ...
@@ -644,7 +694,7 @@ recoup <- function(
644 694
         #    genomeRanges[lens] <- NULL
645 695
         ########################################################################
646 696
     }
647
-	
697
+    
648 698
     #if (type=="chipseq")
649 699
     #    input <- coverageRef(input,genomeRanges,region,flank,strandedParams,
650 700
     #        rc=rc)#,bamParams)
Browse code

Improved run times by orders of magnitude. Added a function to merge runs for big datasets with memory issues. Fixed bugs. Updated documentation.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/recoup@128121 bc3139a8-67e5-0310-9ffc-ced21a209358

Panagiotis Moulos authored on 06/04/2017 09:32:51
Showing 1 changed files
... ...
@@ -20,12 +20,15 @@ recoup <- function(
20 20
         sumStat=c("mean","median"),
21 21
         interpolation=c("auto","spline","linear","neighborhood"),
22 22
         forceHeatmapBinning=TRUE,
23
-        forcedBinSize=c(50,200)
23
+        forcedBinSize=c(50,200),
24
+        chunking=FALSE
24 25
     ),
25 26
     #binParams=getDefaultListArgs("binParams"),
26 27
     selector=NULL,
27 28
     #selector=getDefaultListArgs("selector"),
28 29
     preprocessParams=list(
30
+		fragLen=NA,
31
+		cleanLevel=c(0,1,2,3),
29 32
         normalize=c("none","linear","downsample","sampleto"),
30 33
         sampleTo=1e+6,
31 34
         spliceAction=c("split","keep","remove"),
... ...
@@ -170,6 +173,14 @@ recoup <- function(
170 173
         region <- "genebody"
171 174
     }
172 175
     
176
+    # If type is rnaseq, read extension is illegal because of potential splicing
177
+    if (type=="rnaseq" && !is.null(preprocessParams$fragLen)
178
+		&& !is.na(preprocessParams$fragLen)) {
179
+		warning("When type is \"rnaseq\", read extension/reduction should not ",
180
+            "happen because of potential splicing! Ignoring...",immediate.=TRUE)
181
+		preprocessParams$fragLen <- NA
182
+	}
183
+    
173 184
     # and list arguments
174 185
     orderByDefault <- getDefaultListArgs("orderBy")
175 186
     binParamsDefault <- getDefaultListArgs("binParams")
... ...
@@ -334,20 +345,56 @@ recoup <- function(
334 345
                 if (type=="chipseq") {
335 346
                     g <- load(file.path(localDbHome,refdb,genome,"gene.rda"))
336 347
                     genomeRanges <- gene
348
+                    helperRanges <- NULL
337 349
                 }
338 350
                 else if (type=="rnaseq") {
339
-                    g <- load(file.path(localDbHome,refdb,genome,
340
-                        "summarized_exon.rda"))
341
-                    genomeRanges <- sexon
351
+                    # Load the helper ranges anyway
342 352
                     g <- load(file.path(localDbHome,refdb,genome,"gene.rda"))
343 353
                     helperRanges <- gene
354
+                    if (all(flank==0)) {
355
+                        g <- load(file.path(localDbHome,refdb,genome,
356
+                            "summarized_exon.rda"))
357
+                        genomeRanges <- sexon
358
+                    }
359
+                    else if (all(flank==500)) {
360
+                        g <- load(file.path(localDbHome,refdb,genome,
361
+                            "summarized_exon_F500.rda"))
362
+                        genomeRanges <- flankedSexon
363
+                    }
364
+                    else if (all(flank==1000)) {
365
+                        g <- load(file.path(localDbHome,refdb,genome,
366
+                            "summarized_exon_F1000.rda"))
367
+                        genomeRanges <- flankedSexon
368
+                    }
369
+                    else if (all(flank==2000)) {
370
+                        g <- load(file.path(localDbHome,refdb,genome,
371
+                            "summarized_exon_F2000.rda"))
372
+                        genomeRanges <- flankedSexon
373
+                    }
374
+                    else if (all(flank==5000)) {
375
+                        g <- load(file.path(localDbHome,refdb,genome,
376
+                            "summarized_exon_F2000.rda"))
377
+                        genomeRanges <- flankedSexon
378
+                    }
379
+                    else {
380
+                        warning("When using recoup in RNA-Seq mode, it is ",
381
+                            "much faster to use a precalculated set of ",
382
+                            "regions and their flanks (0, 500, 1000, 2000 and ",
383
+                            "5000 bps supported) and subset this one for a ",
384
+                            "custom set of genes. Otherwise calculations may ",
385
+                            "take too long to complete.",immediate.=TRUE)
386
+                        genomeRanges <- 
387
+                            getMainRnaRangesOnTheFly(helperRanges,flank,rc=rc)
388
+                    }
344 389
                 }
345 390
             }
346 391
             else { # On the fly
347 392
                 message("Getting annotation on the fly for ",genome," from ",
348 393
                     refdb)
349
-                if (type=="chipseq")
394
+                if (type=="chipseq") {
350 395
                     genome <- getAnnotation(genome,"gene",refdb=refdb,rc=rc)
396
+                    helperRanges <- NULL
397
+                }
351 398
                 else if (type=="rnaseq") {
352 399
                     helper <- getAnnotation(genome,"gene",refdb=refdb,rc=rc)
353 400
                     helperRanges <- makeGRangesFromDataFrame(
... ...
@@ -464,22 +511,78 @@ recoup <- function(
464 511
         }
465 512
     }
466 513
     
514
+    # Now we must follow two paths according to region type, genebody and custom
515
+    # areas with equal/unequal widths, or tss, tes and 1-width custom areas.
516
+    callParams$customIsBase <- FALSE
517
+    if (region=="custom" && all(width(genomeRanges)==1)) {
518
+        if (all(flank==0)) {
519
+            warning("Flanking cannot be zero bp in both directions when the ",
520
+                "reference region is only 1bp! Setting to default ",
521
+                "(2000,2000)...",immediate.=TRUE)
522
+            flank <- c(2000,2000)
523
+            callParams$flank <- flank
524
+        }
525
+        callParams$customIsBase <- TRUE
526
+    }
527
+    if (region %in% c("tss","tes")) {
528
+        if (all(flank==0)) {
529
+            warning("Flanking cannot be zero bp in both directions when the ",
530
+                "reference region is \"tss\" or \"tes\"! Setting to default ",
531
+                "(2000,2000)...", immediate.=TRUE)
532
+            flank <- c(2000,2000)
533
+            callParams$flank <- flank
534
+        }
535
+    }
536
+    
537
+    # Here we must determine the final ranges to pass to preprocessRanges and
538
+    # then work with these later on
539
+    intermRanges <- getMainRanges(genomeRanges,helperRanges=helperRanges,type,
540
+        region,flank,rc=rc)
541
+    mainRanges <- intermRanges$mainRanges
542
+    bamRanges <- intermRanges$bamRanges
543
+
467 544
     # Here we must write code for the reading and normalization of bam files
468 545
     # The preprocessRanges function looks if there is a valid (not null) ranges
469 546
     # field in input
470 547
     #if (!onTheFly)
471
-    input <- preprocessRanges(input,preprocessParams,bamParams=bamParams,rc=rc)
548
+    #input <- preprocessRanges(input,preprocessParams,bamParams=bamParams,rc=rc)
549
+    input <- preprocessRanges(input,preprocessParams,genome,bamRanges,
550
+        bamParams=bamParams,rc=rc)
472 551
     
473 552
     # At this point we must apply the fraction parameter if <1. We choose this
474 553
     # point in order not to restrict later usage of the read ranges and since it
475 554
     # does not take much time to load into memory.
555
+    #if (fraction<1) {
556
+    #    newSize <- round(fraction*length(genomeRanges))
557
+    #    set.seed(preprocessParams$seed)
558
+    #    refIndex <- sort(sample(length(genomeRanges),newSize))
559
+    #    genomeRanges <- genomeRanges[refIndex]
560
+    #    if (type=="rnaseq")
561
+    #        helperRanges <- helperRanges[refIndex]
562
+    #    for (i in 1:length(input)) {
563
+    #        if (!is.null(input[[i]]$ranges)) {
564
+    #            newSize <- round(fraction*length(input[[i]]$ranges))
565
+    #            set.seed(preprocessParams$seed)
566
+    #            fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
567
+    #            input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
568
+    #        }
569
+    #        if (!is.null(input[[i]]$coverage)) {
570
+    #            #if (!is.null(input[[i]]$coverage$center)) {
571
+    #            #    input[[i]]$coverage$center <- 
572
+    #            #        input[[i]]$coverage$center[refIndex]
573
+    #            #}
574
+    #            #else
575
+    #            input[[i]]$coverage <- input[[i]]$coverage[refIndex]
576
+    #        }
577
+    #        if (!is.null(input[[i]]$profile))
578
+    #            input[[i]]$profile <- input[[i]]$profile[refIndex,]
579
+    #    }
580
+    #}
476 581
     if (fraction<1) {
477
-        newSize <- round(fraction*length(genomeRanges))
582
+        newSize <- round(fraction*length(mainRanges))
478 583
         set.seed(preprocessParams$seed)
479
-        refIndex <- sort(sample(length(genomeRanges),newSize))
480
-        genomeRanges <- genomeRanges[refIndex]
481
-        if (type=="rnaseq")
482
-            helperRanges <- helperRanges[refIndex]
584
+        refIndex <- sort(sample(length(mainRanges),newSize))
585
+        mainRanges <- mainRanges[refIndex]
483 586
         for (i in 1:length(input)) {
484 587
             if (!is.null(input[[i]]$ranges)) {
485 588
                 newSize <- round(fraction*length(input[[i]]$ranges))
... ...
@@ -487,14 +590,8 @@ recoup <- function(
487 590
                 fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
488 591
                 input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
489 592
             }
490
-            if (!is.null(input[[i]]$coverage)) {
491
-                #if (!is.null(input[[i]]$coverage$center)) {
492
-                #    input[[i]]$coverage$center <- 
493
-                #        input[[i]]$coverage$center[refIndex]
494
-                #}
495
-                #else
593
+            if (!is.null(input[[i]]$coverage))
496 594
                 input[[i]]$coverage <- input[[i]]$coverage[refIndex]
497
-            }
498 595
             if (!is.null(input[[i]]$profile))
499 596
                 input[[i]]$profile <- input[[i]]$profile[refIndex,]
500 597
         }
... ...
@@ -504,17 +601,39 @@ recoup <- function(
504 601
     chrs <- unique(unlist(lapply(input,function(x) {
505 602
         if (x$format %in% c("bam","bed"))
506 603
             return(as.character(runValue(seqnames(x$ranges))))
507
-        else if (x$format=="bigwig")
604
+        else if (x$format=="bigwig") {
605
+            if (!requireNamespace("rtracklayer"))
606
+                stop("R package rtracklayer is required to read and import ",
607
+                    "BigWig files!")
508 608
             return(as.character(seqnames(seqinfo(BigWigFile(x$file)))))
609
+        }
509 610
     })))
611
+    #if (type=="chipseq") {
612
+    #    keep <- which(as.character(seqnames(genomeRanges)) %in% chrs)
613
+    #    genomeRanges <- genomeRanges[keep]
614
+    #}
615
+    #else if (type=="rnaseq") {
616
+    #    keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
617
+    #    helperRanges <- helperRanges[keeph]
618
+    #    genomeRanges <- genomeRanges[names(helperRanges)]
619
+    #    ########################################################################
620
+    #    ## There must be an R bug with `lengths` here as although it runs in 
621
+    #    ## Rcmd, it does not pass package building or vignette kniting... But 
622
+    #    ## for the time being it seems that it is not needed as the name 
623
+    #    ## filtering works
624
+    #    #lens <- which(lengths(genomeRanges)==0)
625
+    #    #if (length(lens)>0)
626
+    #    #    genomeRanges[lens] <- NULL
627
+    #    ########################################################################
628
+    #}
510 629
     if (type=="chipseq") {
511
-        keep <- which(as.character(seqnames(genomeRanges)) %in% chrs)
512
-        genomeRanges <- genomeRanges[keep]
630
+        keep <- which(as.character(seqnames(mainRanges)) %in% chrs)
631
+        mainRanges <- mainRanges[keep]
513 632
     }
514 633
     else if (type=="rnaseq") {
515 634
         keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
516 635
         helperRanges <- helperRanges[keeph]
517
-        genomeRanges <- genomeRanges[names(helperRanges)]
636
+        mainRanges <- mainRanges[names(helperRanges)]
518 637
         ########################################################################
519 638
         ## There must be an R bug with `lengths` here as although it runs in 
520 639
         ## Rcmd, it does not pass package building or vignette kniting... But 
... ...
@@ -525,36 +644,34 @@ recoup <- function(
525 644
         #    genomeRanges[lens] <- NULL
526 645
         ########################################################################
527 646
     }
528
-    
529
-    # Now we must follow two paths according to region type, genebody and custom
530
-    # areas with equal/unequal widths, or tss, tes and 1-width custom areas
531
-    callParams$customIsBase <- FALSE
532
-    if (region=="custom" && all(width(genomeRanges)==1)) {
533
-        if (all(flank==0)) {
534
-            warning("Flanking cannot be zero bp in both directions when the ",
535
-                "reference region is only 1bp! Setting to default ",
536
-                "(2000,2000)...",immediate.=TRUE)
537
-            flank <- c(2000,2000)
538
-            callParams$flank <- flank
539
-        }
540
-        callParams$customIsBase <- TRUE
541
-    }
542
-    if (region %in% c("tss","tes")) {
543
-        if (all(flank==0)) {
544
-            warning("Flanking cannot be zero bp in both directions when the ",
545
-                "reference region is \"tss\" or \"tes\"! Setting to default ",
546
-                "(2000,2000)...", immediate.=TRUE)
547
-            flank <- c(2000,2000)
548
-            callParams$flank <- flank
549
-        }
550
-    }
647
+	
648
+    #if (type=="chipseq")
649
+    #    input <- coverageRef(input,genomeRanges,region,flank,strandedParams,
650
+    #        rc=rc)#,bamParams)
651
+    #else if (type=="rnaseq")
652
+    #    input <- coverageRnaRef(input,genomeRanges,helperRanges,flank,
653
+    #        strandedParams,rc=rc)#,bamParams)
551 654
     if (type=="chipseq")
552
-        input <- coverageRef(input,genomeRanges,region,flank,strandedParams,
553
-            rc=rc)#,bamParams)
554
-    else if (type=="rnaseq")
555
-        input <- coverageRnaRef(input,genomeRanges,helperRanges,flank,
556
-            strandedParams,rc=rc)#,bamParams)
557
-    
655
+        input <- coverageRef(input,mainRanges,strandedParams,rc=rc)
656
+    else if (type=="rnaseq") #{
657
+        #if (flank[1]==0 && flank[2]==0)
658
+        #    mainRnaRanges <- genomeRanges
659
+        #else {
660
+        #    mainRnaRanges <- tryCatch(
661
+        #        loadMainRnaRanges(genome,refdb,flank),
662
+        #        error=function(e) { # Insanely slow fallback switch
663
+        #            warning("The requested flanking regions are not ",
664
+        #                "compatible with any of the precalculated ones. They ",
665
+        #                "will be calculated on the fly which is a lenghty ",
666
+        #                "process. If you wish to use precalculated flanking ",
667
+        #                "regions, stop the execution and adjust the flanking ",
668
+        #                "parameter.",immediate.=TRUE)
669
+        #            getMainRnaRangesOnTheFly(helperRanges,flank,rc=rc)
670
+        #        },finally=""
671
+        #    )
672
+        #}
673
+        input <- coverageRnaRef(input,mainRanges,strandedParams,rc=rc)
674
+    #}
558 675
     # If normalization method is linear, we must adjust the coverages
559 676
     if (preprocessParams$normalize=="linear") {
560 677
         linFac <- calcLinearFactors(input,preprocessParams)
... ...
@@ -582,11 +699,11 @@ recoup <- function(
582 699
     if (region=="genebody")
583 700
         mustBin <- TRUE
584 701
     if (region=="custom") {
585
-        w <- width(genomeRanges)
702
+        #w <- width(genomeRanges)
703
+        w <- width(mainRanges)
586 704
         if (any(w!=w[1]))
587 705
             mustBin <- TRUE
588 706
     }
589
-    
590 707
     if (mustBin) {
591 708
         if (binParams$regionBinSize==0) {
592 709
             warning("Central region bin size not set for a region that must ",
... ...
@@ -594,6 +711,16 @@ recoup <- function(
594 711
             binParams$regionBinSize <- 1000
595 712
         }
596 713
     }
714
+    
715
+    # Chunking?
716
+    if (binParams$chunking) {
717
+        if (type=="chipseq")
718
+            binParams$chunks <- split(1:length(genomeRanges),
719
+                as.character(seqnames(genomeRanges)))
720
+        else if (type=="rnaseq")
721
+            binParams$chunks <- split(1:length(helperRanges),
722
+                as.character(seqnames(helperRanges)))
723
+    }
597 724
     input <- profileMatrix(input,flank,binParams,rc)
598 725
     
599 726
     # Perform the k-means clustering if requested and append to design (which
... ...
@@ -700,7 +827,7 @@ recoup <- function(
700 827
                     #    binSize=binParams$forcedBinSize[1],
701 828
                     #    stat=binParams$sumStat,rc=rc)
702 829
                     right <- binCoverageMatrix(
703
-                        input[[n]]$coverage,binSize=forcedBinSize[1],
830
+                        input[[n]]$coverage,binSize=binParams$forcedBinSize[1],
704 831
                         stat=binParams$sumStat,
705 832
                         interpolation=binParams$interpolation,flank=flank,
706 833
                         where="downstream",rc=rc
Browse code

Moved sumStat and smooth options from binParams to plotParams as smoothing should be available for reusing/replotting recoup objects. sumStat remained in binParams to be used for region binning over e.g. gene bodies. Fixed bug in calculation of average profiles in recoupCorrelation when using mean/median instead of splines.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/recoup@115628 bc3139a8-67e5-0310-9ffc-ced21a209358

Panagiotis Moulos authored on 01/04/2016 14:11:23
Showing 1 changed files
... ...
@@ -18,7 +18,6 @@ recoup <- function(
18 18
         flankBinSize=0,
19 19
         regionBinSize=0,
20 20
         sumStat=c("mean","median"),
21
-        smooth=TRUE,
22 21
         interpolation=c("auto","spline","linear","neighborhood"),
23 22
         forceHeatmapBinning=TRUE,
24 23
         forcedBinSize=c(50,200)
... ...
@@ -46,6 +45,8 @@ recoup <- function(
46 45
         heatmapScale=c("common","each"),
47 46
         heatmapFactor=1,
48 47
         corrScale=c("normalized","each"),
48
+        sumStat=c("mean","median"),
49
+        smooth=TRUE,
49 50
         corrSmoothPar=ifelse(is.null(design),0.1,0.5),
50 51
         singleFacet=c("none","wrap","grid"),
51 52
         multiFacet=c("wrap","grid"),
Browse code

Added support for BigWig files which increases reading speed as coverages are precalculated.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/recoup@115380 bc3139a8-67e5-0310-9ffc-ced21a209358

Panagiotis Moulos authored on 29/03/2016 20:53:31
Showing 1 changed files
... ...
@@ -31,8 +31,9 @@ recoup <- function(
31 31
         sampleTo=1e+6,
32 32
         spliceAction=c("split","keep","remove"),
33 33
         spliceRemoveQ=0.75,
34
-        bedGenome=ifelse(genome %in% c("hg18","hg19","hg38","mm9","mm10","rn5",
35
-            "dm3","danrer7","pantro4","susscr3"),genome,NA),
34
+        #bedGenome=ifelse(genome %in% c("hg18","hg19","hg38","mm9","mm10","rn5",
35
+        #    "dm3","danrer7","pantro4","susscr3"),genome,NA),
36
+        bedGenome=NA,
36 37
         seed=42
37 38
     ),
38 39
     #preprocessParams=getDefaultListArgs("preprocessParams"),
... ...
@@ -470,7 +471,7 @@ recoup <- function(
470 471
     
471 472
     # At this point we must apply the fraction parameter if <1. We choose this
472 473
     # point in order not to restrict later usage of the read ranges and since it
473
-    # does not take much time to load into memory
474
+    # does not take much time to load into memory.
474 475
     if (fraction<1) {
475 476
         newSize <- round(fraction*length(genomeRanges))
476 477
         set.seed(preprocessParams$seed)
... ...
@@ -479,10 +480,12 @@ recoup <- function(
479 480
         if (type=="rnaseq")
480 481
             helperRanges <- helperRanges[refIndex]
481 482
         for (i in 1:length(input)) {
482
-            newSize <- round(fraction*length(input[[i]]$ranges))
483
-            set.seed(preprocessParams$seed)
484
-            fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
485
-            input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
483
+            if (!is.null(input[[i]]$ranges)) {
484
+                newSize <- round(fraction*length(input[[i]]$ranges))
485
+                set.seed(preprocessParams$seed)
486
+                fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
487
+                input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
488
+            }
486 489
             if (!is.null(input[[i]]$coverage)) {
487 490
                 #if (!is.null(input[[i]]$coverage$center)) {
488 491
                 #    input[[i]]$coverage$center <- 
... ...
@@ -498,7 +501,10 @@ recoup <- function(
498 501
 
499 502
     # Remove unwanted seqnames from reference ranges
500 503
     chrs <- unique(unlist(lapply(input,function(x) {
501
-        return(as.character(runValue(seqnames(x$ranges))))
504
+        if (x$format %in% c("bam","bed"))
505
+            return(as.character(runValue(seqnames(x$ranges))))
506
+        else if (x$format=="bigwig")
507
+            return(as.character(seqnames(seqinfo(BigWigFile(x$file)))))
502 508
     })))
503 509
     if (type=="chipseq") {
504 510
         keep <- which(as.character(seqnames(genomeRanges)) %in% chrs)
Browse code

Adding SpidermiR, DRIMSeq, debrowser, GMRP, ISoLDE, CountClust, OncoScore, AneuFinder, recoup, genphen, DEFormats, sscu

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/recoup@115200 bc3139a8-67e5-0310-9ffc-ced21a209358

Martin Morgan authored on 23/03/2016 22:36:02
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,817 @@
1
+recoup <- function(
2
+    input,
3
+    design=NULL,
4
+    region=c("genebody","tss","tes","custom"),
5
+    type=c("chipseq","rnaseq"),
6
+    genome=c("hg18","hg19","hg38","mm9","mm10","rn5","dm3","danrer7","pantro4",
7
+        "susscr3"),
8
+    refdb=c("ensembl","ucsc","refseq"),
9
+    flank=c(2000,2000),
10
+    fraction=1,
11
+    orderBy=list(
12
+        what=c("none","suma","sumn","maxa","maxn","avga","avgn","hcn"),
13
+        order=c("descending","ascending"),
14
+        custom=NULL
15
+    ),
16
+    #orderBy=getDefaultListArgs("orderBy"),
17
+    binParams=list(
18
+        flankBinSize=0,
19
+        regionBinSize=0,
20
+        sumStat=c("mean","median"),
21
+        smooth=TRUE,
22
+        interpolation=c("auto","spline","linear","neighborhood"),
23
+        forceHeatmapBinning=TRUE,
24
+        forcedBinSize=c(50,200)
25
+    ),
26
+    #binParams=getDefaultListArgs("binParams"),
27
+    selector=NULL,
28
+    #selector=getDefaultListArgs("selector"),
29
+    preprocessParams=list(
30
+        normalize=c("none","linear","downsample","sampleto"),
31
+        sampleTo=1e+6,
32
+        spliceAction=c("split","keep","remove"),
33
+        spliceRemoveQ=0.75,
34
+        bedGenome=ifelse(genome %in% c("hg18","hg19","hg38","mm9","mm10","rn5",
35
+            "dm3","danrer7","pantro4","susscr3"),genome,NA),
36
+        seed=42
37
+    ),
38
+    #preprocessParams=getDefaultListArgs("preprocessParams"),
39
+    plotParams=list(
40
+        plot=TRUE,
41
+        profile=TRUE,
42
+        heatmap=TRUE,
43
+        correlation=TRUE,
44
+        signalScale=c("natural","log2"),
45
+        heatmapScale=c("common","each"),
46
+        heatmapFactor=1,
47
+        corrScale=c("normalized","each"),
48
+        corrSmoothPar=ifelse(is.null(design),0.1,0.5),
49
+        singleFacet=c("none","wrap","grid"),
50
+        multiFacet=c("wrap","grid"),
51
+        conf=TRUE,
52
+        device=c("x11","png","jpg","tiff","bmp","pdf","ps"),
53
+        outputDir=".",
54
+        outputBase=NULL
55
+    ),
56
+    #plotParams=getDefaultListArgs("plotParams",design),
57
+    saveParams=list(
58
+        ranges=TRUE,
59
+        coverage=TRUE,
60
+        profile=TRUE,
61
+        profilePlot=TRUE,
62
+        heatmapPlot=TRUE,
63
+        correlationPlot=TRUE
64
+    ),
65
+    #saveParams=getDefaultListArgs("saveParams"),
66
+    kmParams=list(
67
+        k=0, # Do not perform kmeans
68
+        nstart=20,
69
+        algorithm=c("Hartigan-Wong","Lloyd","Forgy","MacQueen"),
70
+        iterMax=20,
71
+        reference=NULL,
72
+        seed=42
73
+    ),
74
+    #kmParams=getDefaultListArgs("kmParams"),
75
+    strandedParams=list(
76
+        strand=NULL,
77
+        ignoreStrand=TRUE
78
+    ),
79
+    #strandedParams=getDefaultListArgs("strandedParams"),
80
+    ggplotParams=list(
81
+        title=element_text(size=12),
82
+        axis.title.x=element_text(size=10,face="bold"),
83
+        axis.title.y=element_text(size=10,face="bold"),
84
+        axis.text.x=element_text(size=9,face="bold"),
85
+        axis.text.y=element_text(size=10,face="bold"),
86
+        strip.text.x=element_text(size=10,face="bold"),
87
+        strip.text.y=element_text(size=10,face="bold"),
88
+        legend.position="bottom",
89
+        panel.margin=grid::unit(1,"lines")
90
+    ),
91
+    #ggplotParams=getDefaultListArgs("ggplotParams"),
92
+    complexHeatmapParams=list(
93
+        main=list(
94
+            cluster_rows=ifelse(length(grep("hc",orderBy$what))>0,TRUE,FALSE),
95
+            cluster_columns=FALSE,
96
+            column_title_gp=grid::gpar(fontsize=10,font=2),
97
+            show_row_names=FALSE,
98
+            show_column_names=FALSE,
99
+            heatmap_legend_param=list(
100
+                color_bar="continuous"
101
+            )
102
+        ),
103
+        group=list(
104
+            cluster_rows=ifelse(length(grep("hc",orderBy$what))>0,TRUE,FALSE),
105
+            cluster_columns=FALSE,
106
+            column_title_gp=grid::gpar(fontsize=10,font=2),
107
+            show_row_names=FALSE,
108
+            show_column_names=FALSE,
109
+            row_title_gp=grid::gpar(fontsize=8,font=2),
110
+            gap=unit(5,"mm"),
111
+            heatmap_legend_param=list(
112
+                color_bar="continuous"
113
+            )
114
+        )
115
+    ),
116
+    #complexHeatmapParams=getDefaultListArgs("complexHeatmapParams"),
117
+    bamParams=NULL,
118
+    onTheFly=FALSE, # Directly from BAM w/o Ranges storing, also N/A with BED,
119
+    localDbHome=file.path(path.expand("~"),".recoup"),
120
+    rc=NULL
121
+) {
122
+    ############################################################################
123
+    # Begin of simple parameter checking for a new or a restored object
124
+    if (is.list(input) && !is.null(input$data)) { # Refeeding recoup object
125
+        message("Detected a previous recoup output as input. Any existing ",
126
+            "data requiring\nlengthy calculations (short reads, coverages and ",
127
+            "profile matrices will be\nreused depending on other parameters. ",
128
+            "If you want a complete recalculation,\neither input a fresh list ",
129
+            "of arguments or remove any of the above with the\nremoveData ",
130
+            "function.\n")
131
+        prevCallParams <- input$callopts
132
+        input <- input$data
133
+    }
134
+    else
135
+        prevCallParams <- NULL
136
+    if (!is.list(input) && file.exists(input))
137
+        input <- readConfig(input)
138
+    checkInput(input)
139
+    if (is.null(names(input)))
140
+        names(input) <- sapply(input,function(x) return(x$id))
141
+    
142
+    # Check if there are any mispelled or invalid parameters and throw a warning
143
+    checkMainArgs(as.list(match.call()))
144
+    
145
+    # Check rest of arguments
146
+    region <- tolower(region[1])
147
+    refdb <- tolower(refdb[1])
148
+    type <- tolower(type[1])
149
+    if (!is.null(design) && !is.data.frame(design))
150
+        checkFileArgs("design",design)
151
+    if (!is.data.frame(genome) && file.exists(genome))
152
+        checkFileArgs("genome",genome)
153
+    else if (is.character(genome))
154
+        checkTextArgs("genome",genome,c("hg18","hg19","hg38","mm9","mm10","rn5",
155
+            "dm3","danrer7","pantro4","susscr3","tair10"),multiarg=FALSE)
156
+    checkTextArgs("refdb",refdb,c("ensembl","ucsc","refseq"),multiarg=FALSE)
157
+    checkTextArgs("type",type,c("chipseq","rnaseq"),multiarg=FALSE)
158
+    checkNumArgs("fraction",fraction,"numeric",c(0,1),"botheq")
159
+    if (any(flank<0))
160
+        stop("The minimum flanking allowed is 0 bp")
161
+    if (any(flank>50000))
162
+        stop("The maximum flanking allowed is 50000 bp")
163
+        
164
+    # If type is rnaseq, only genebody plots are valid
165
+    if (type=="rnaseq" && region!="genebody") {
166
+        warning("When type is \"rnaseq\", plots can be created only on ",
167
+            "genebodies! Switching to genebody regions...",immediate.=TRUE)
168
+        region <- "genebody"
169
+    }
170
+    
171
+    # and list arguments
172
+    orderByDefault <- getDefaultListArgs("orderBy")
173
+    binParamsDefault <- getDefaultListArgs("binParams")
174
+    selectorDefault <- getDefaultListArgs("selector")
175
+    preprocessParamsDefault <- getDefaultListArgs("preprocessParams",
176
+        genome=genome)
177
+    plotParamsDefault <- getDefaultListArgs("plotParams",design=design)
178
+    saveParamsDefault <- getDefaultListArgs("saveParams")
179
+    kmParamsDefault <- getDefaultListArgs("kmParams")
180
+    strandedParamsDefault <- getDefaultListArgs("strandedParams")
181
+    
182
+    orderBy <- setArg(orderByDefault,orderBy)
183
+    binParams <- setArg(binParamsDefault,binParams)
184
+    if (!is.null(selector))
185
+        selector <- setArg(selectorDefault,selector)
186
+    preprocessParams <- setArg(preprocessParamsDefault,preprocessParams)
187
+    plotParams <- setArg(plotParamsDefault,plotParams)
188
+    saveParams <- setArg(saveParamsDefault,saveParams)
189
+    kmParams <- setArg(kmParamsDefault,kmParams)
190
+    strandedParams <- setArg(strandedParamsDefault,strandedParams)
191
+    
192
+    orderBy <- validateListArgs("orderBy",orderBy)
193
+    binParams <- validateListArgs("binParams",binParams)
194
+    if (!is.null(selector))
195
+        selector <- validateListArgs("selector",selector)
196
+    preprocessParams <- validateListArgs("preprocessParams",preprocessParams)
197
+    plotParams <- validateListArgs("plotParams",plotParams)
198
+    saveParams <- validateListArgs("saveParams",saveParams)
199
+    kmParams <- validateListArgs("kmParams",kmParams)
200
+    strandedParams <- validateListArgs("strandedParams",strandedParams)
201
+    
202
+    if (is.null(plotParams$outputBase))
203
+        plotParams$outputBase <- paste(sapply(input,function(x) return(x$id)),
204
+            collapse="-")
205
+    
206
+    if (any(sapply(input,function(x) return(tolower(x$format)=="bed")))
207
+        && !(preprocessParams$bedGenome %in% c("hg18","hg19","hg38","mm9",
208
+            "mm10","rn5","dm3","danrer7","pantro4","susscr3")))
209
+        stop("When short read files are in BED format, either the genome ",
210
+            "parameter should be one\nof the supported organisms, or ",
211
+            "preprocessParams$bedGenome must be specified as one of them.")
212
+   
213
+    # End of simple parameter checking for a new or a restored object
214
+    ############################################################################
215
+    
216
+    ############################################################################
217
+    # Begin of complex parameter storage procedure and parameter recall from a
218
+    # previous call
219
+    
220
+    # Store all parameters to an obect for later reference to a next call, after
221
+    # checking if something has changed (e.g. using setr)...
222
+    thisCall <- as.list(match.call())[-1]
223
+    if (!is.null(prevCallParams)) {
224
+        callParams <- list(
225
+            region=if (is.null(thisCall$region)) prevCallParams$region else
226
+                region,
227
+            type=if (is.null(thisCall$type)) prevCallParams$type else type,
228
+            genome=if (is.null(thisCall$genome)) prevCallParams$genome else
229
+                genome,
230
+            refdb=if (is.null(thisCall$refdb)) prevCallParams$refdb else
231
+                refdb,
232
+            flank=if (is.null(thisCall$flank)) prevCallParams$flank else flank,
233
+            fraction=if (is.null(thisCall$fraction)) prevCallParams$fraction
234
+                else fraction,
235
+            orderBy=if (is.null(thisCall$orderBy)) prevCallParams$orderBy else
236
+                orderBy,
237
+            binParams=if (is.null(thisCall$binParams)) prevCallParams$binParams
238
+                else binParams,
239
+            selector=selector, # selector is a special case
240
+            preprocessParams=if (is.null(thisCall$preprocessParams))
241
+                prevCallParams$preprocessParams else preprocessParams,
242
+            plotParams=if (is.null(thisCall$plotParams)) 
243
+                prevCallParams$plotParams else plotParams,
244
+            saveParams=if (is.null(thisCall$saveParams))
245
+                prevCallParams$saveParams else saveParams,
246
+            kmParams=if (is.null(thisCall$kmParams))
247
+                prevCallParams$kmParams else kmParams,
248
+            strandedParams=if (is.null(thisCall$strandedParams))
249
+                prevCallParams$strandedParams else strandedParams,
250
+            ggplotParams=if (is.null(thisCall$ggplotParams))
251
+                prevCallParams$ggplotParams else ggplotParams,
252
+            complexHeatmapParams=if (is.null(thisCall$complexHeatmapParams))
253
+                prevCallParams$complexHeatmapParams else complexHeatmapParams,
254
+            #bamParams=if (is.null(thisCall$bamParams),
255
+            #    prevCallParams$bamParams,bamParams), ## Unused
256
+            onTheFly=if (is.null(thisCall$onTheFly)) prevCallParams$onTheFly 
257
+                else onTheFly,
258
+            localDbHome=if (is.null(thisCall$localDbHome)) 
259
+                prevCallParams$localDbHome else localDbHome,
260
+            rc=if (is.null(thisCall$rc)) prevCallParams$rc else rc,
261
+            customIsBase=NULL # Additional placeholder
262
+        )
263
+    }
264
+    else {
265
+        callParams <- list(
266
+            region=region,
267
+            type=type,
268
+            genome=genome,
269
+            refdb=refdb,
270
+            flank=flank,
271
+            fraction=fraction,
272
+            orderBy=orderBy,
273
+            binParams=binParams,
274
+            selector=selector,
275
+            preprocessParams=preprocessParams,
276
+            plotParams=plotParams,
277
+            saveParams=saveParams,
278
+            kmParams=kmParams,
279
+            strandedParams=strandedParams,
280
+            ggplotParams=ggplotParams,
281
+            complexHeatmapParams=complexHeatmapParams,
282
+            bamParams=bamParams,
283
+            onTheFly=onTheFly,
284
+            localDbHome=localDbHome,
285
+            rc=rc,
286
+            customIsBase=NULL # Additional placeholder
287
+        )
288
+    }
289
+    
290
+    # ...and check if there has been a previous call and decide on big
291
+    # recalculations...
292
+    input <- decideChanges(input,callParams,prevCallParams)
293
+    
294
+    # Redefine all final arguments for this call
295
+    region=callParams$region
296
+    type=callParams$type
297
+    genome=callParams$genome
298
+    refdb=callParams$refdb
299
+    flank=callParams$flank
300
+    fraction=callParams$fraction
301
+    orderBy=callParams$orderBy
302
+    binParams=callParams$binParams
303
+    selector=callParams$selector
304
+    preprocessParams=callParams$preprocessParams
305
+    plotParams=callParams$plotParams
306
+    saveParams=callParams$saveParams
307
+    kmParams=callParams$kmParams
308
+    strandedParams=callParams$strandedParams
309
+    ggplotParams=callParams$ggplotParams
310
+    complexHeatmapParams=callParams$complexHeatmapParams
311
+    bamParams=callParams$bamParams
312
+    onTheFly=callParams$onTheFly
313
+    localDbHome=callParams$localDbHome
314
+    rc=callParams$rc
315
+    # End of complex parameter storage procedure and parameter recall from a
316
+    # previous call
317
+    ############################################################################
318
+    
319
+    # Continue with actual work
320
+    if (!is.data.frame(genome)) {
321
+        if (file.exists(genome)) {
322
+            genome <- read.delim(genome) # Must be bed like
323
+            rownames(genome) <- as.character(genome[,4])
324
+            genomeRanges <- makeGRangesFromDataFrame(
325
+                df=genome,
326
+                keep.extra.columns=TRUE
327
+            )
328
+        }
329
+        else {
330
+            # Check if local storage has been set
331
+            if (dir.exists(file.path(localDbHome,refdb,genome))) {
332
+                if (type=="chipseq") {
333
+                    g <- load(file.path(localDbHome,refdb,genome,"gene.rda"))
334
+                    genomeRanges <- gene
335
+                }
336
+                else if (type=="rnaseq") {
337
+                    g <- load(file.path(localDbHome,refdb,genome,
338
+                        "summarized_exon.rda"))
339
+                    genomeRanges <- sexon
340
+                    g <- load(file.path(localDbHome,refdb,genome,"gene.rda"))
341
+                    helperRanges <- gene
342
+                }
343
+            }
344
+            else { # On the fly
345
+                message("Getting annotation on the fly for ",genome," from ",
346
+                    refdb)
347
+                if (type=="chipseq")
348
+                    genome <- getAnnotation(genome,"gene",refdb=refdb,rc=rc)
349
+                else if (type=="rnaseq") {
350
+                    helper <- getAnnotation(genome,"gene",refdb=refdb,rc=rc)
351
+                    helperRanges <- makeGRangesFromDataFrame(
352
+                        df=helper,
353
+                        keep.extra.columns=TRUE,
354
+                        seqnames.field="chromosome"
355
+                    )
356
+                    names(helperRanges) <- as.character(helperRanges$gene_id)
357
+                    genome <- getAnnotation(genome,"exon",refdb=refdb,rc=rc)
358
+                    ann.gr <- makeGRangesFromDataFrame(
359
+                        df=genome,
360
+                        keep.extra.columns=TRUE,
361
+                        seqnames.field="chromosome"
362
+                    )
363
+                    message("Merging exons")
364
+                    ann.gr <- reduceExons(ann.gr,rc=rc)
365
+                    names(ann.gr) <- as.character(ann.gr$exon_id)
366
+                    genomeRanges <- split(ann.gr,ann.gr$gene_id)
367
+                }
368
+            }
369
+        }
370
+    }
371
+    else {
372
+        rownames(genome) <- as.character(genome[,4])
373
+        genomeRanges <- makeGRangesFromDataFrame(
374
+            df=genome,
375
+            keep.extra.columns=TRUE
376
+        )
377
+    }
378
+    
379
+    # After genome read, check if we have a valid custom orderer
380
+    if (!is.null(orderBy$custom)) {
381
+        if (length(orderBy$custom) != nrow(genome)) {
382
+            warning("The custom orderer provide with orderBy parameter does ",
383
+                "not have length equal to the number of elements in genome ",
384
+                "argument and will be ignored!",immediate.=TRUE)
385
+            orderBy$custom <- NULL
386
+        }
387
+    }
388
+    
389
+    # Read and check design compatibilities. Check if k-means is requested and
390
+    # message accordingly. If k-means is requested it will be added to the 
391
+    # design data frame
392
+    hasDesign <- FALSE
393
+    if (!is.null(design)) {
394
+        if (!is.data.frame(design))
395
+            design <- read.delim(design,row.names=1)
396
+        nfac <- ncol(design)
397
+        if (length(input)>1 && nfac>2)
398
+            stop("When more than one files are provided for coverage ",
399
+                "generation, the maximum number of allowed design factors is 2")
400
+        if (length(input)>1 && nfac>1 && kmParams$k>0)
401
+            stop("When more than one files are provided for coverage ",
402
+                "generation and k-means clustering is also requested, the ",
403
+                "maximum number of allowed design factors is 1")
404
+        if (length(input)==1 && nfac>3)
405
+            stop("The maximum number of allowed design factors is 3")
406
+        if (length(input)==1 && nfac>2 && kmParams$k>0)
407
+            stop("The maximum number of allowed design factors when k-means ",
408
+                "clustering is requested is 2")
409
+        if (length(input)==1 && nfac>2 && plotParams$singleFacet!="none")
410
+            stop("The maximum number of allowed design factors whith one ",
411
+                "sample but wishing a gridded profile layout is 2")
412
+        if (length(input)==1 && nfac>1 && kmParams$k>0 
413
+            && plotParams$singleFacet!="none")
414
+            stop("The maximum number of allowed design factors whith one ",
415
+                "sample but wishing for k-means clustering and gridded ",
416
+                "profile layout is 1")
417
+        # Reduce the genomeRanges according to design or the other way
418
+        if (nrow(design)>length(genomeRanges))
419
+            design <- tryCatch({
420
+                design[names(genomeRanges),,drop=FALSE]
421
+            },error=function(e) {
422
+                stop("Unexpected error occured! Are you sure that element ",
423
+                    "(row) names in the design file are of the same type as ",
424
+                    "the genome file?")
425
+            },finally={})
426
+        else if (nrow(design)<=length(genomeRanges)) {
427
+            genomeRanges <- tryCatch({
428
+                genomeRanges[rownames(design)]
429
+            },error=function(e) {
430
+                stop("Unexpected error occured! Are you sure that element ",
431
+                    "(row) names in the design file are of the same type as ",
432
+                    "the genome file?")
433
+            },finally={})
434
+            if (type=="rnaseq")
435
+                helperRanges <- tryCatch({
436
+                    helperRanges[rownames(design)]
437
+                },error=function(e) {
438
+                    stop("Unexpected error occured! Are you sure that element ",
439
+                        "(row) names in the design file are of the same type as ",
440
+                        "the genome file?")
441
+                },finally={})
442
+        }
443
+        # ...but maybe the names are incompatible
444
+        if (length(genomeRanges)==0)
445
+            stop("No ranges left after using the identifiers provided with ",
446
+                "the design file. Are you sure the identifiers between the ",
447
+                "two files are compatible?")
448
+        if (nrow(design)==0)
449
+            stop("No design elements left after using the identifiers ",
450
+                "provided with the genome file. Are you sure the identifiers ",
451
+                "between the two files are compatible?")
452
+    }
453
+    
454
+    # Apply the rest of the filters if any to reduce later computational burden
455
+    if (!is.null(selector)) {
456
+        if (type=="chipseq")
457
+            genomeRanges <- applySelectors(genomeRanges,selector,rc=rc)
458
+        if (type=="rnaseq") {
459
+            helperRanges <- applySelectors(helperRanges,selector)
460
+            # Filter and align names if we have helperRanges around
461
+            genomeRanges <- genomeRanges[names(helperRanges)]
462
+        }
463
+    }
464
+    
465
+    # Here we must write code for the reading and normalization of bam files
466
+    # The preprocessRanges function looks if there is a valid (not null) ranges
467
+    # field in input
468
+    #if (!onTheFly)
469
+    input <- preprocessRanges(input,preprocessParams,bamParams=bamParams,rc=rc)
470
+    
471
+    # At this point we must apply the fraction parameter if <1. We choose this
472
+    # point in order not to restrict later usage of the read ranges and since it
473
+    # does not take much time to load into memory
474
+    if (fraction<1) {
475
+        newSize <- round(fraction*length(genomeRanges))
476
+        set.seed(preprocessParams$seed)
477
+        refIndex <- sort(sample(length(genomeRanges),newSize))
478
+        genomeRanges <- genomeRanges[refIndex]
479
+        if (type=="rnaseq")
480
+            helperRanges <- helperRanges[refIndex]
481
+        for (i in 1:length(input)) {
482
+            newSize <- round(fraction*length(input[[i]]$ranges))
483
+            set.seed(preprocessParams$seed)
484
+            fracIndex <- sort(sample(length(input[[i]]$ranges),newSize))
485
+            input[[i]]$ranges <- input[[i]]$ranges[fracIndex]
486
+            if (!is.null(input[[i]]$coverage)) {
487
+                #if (!is.null(input[[i]]$coverage$center)) {
488
+                #    input[[i]]$coverage$center <- 
489
+                #        input[[i]]$coverage$center[refIndex]
490
+                #}
491
+                #else
492
+                input[[i]]$coverage <- input[[i]]$coverage[refIndex]
493
+            }
494
+            if (!is.null(input[[i]]$profile))
495
+                input[[i]]$profile <- input[[i]]$profile[refIndex,]
496
+        }
497
+    }
498
+
499
+    # Remove unwanted seqnames from reference ranges
500
+    chrs <- unique(unlist(lapply(input,function(x) {
501
+        return(as.character(runValue(seqnames(x$ranges))))
502
+    })))
503
+    if (type=="chipseq") {
504
+        keep <- which(as.character(seqnames(genomeRanges)) %in% chrs)
505
+        genomeRanges <- genomeRanges[keep]
506
+    }
507
+    else if (type=="rnaseq") {
508
+        keeph <- which(as.character(seqnames(helperRanges)) %in% chrs)
509
+        helperRanges <- helperRanges[keeph]
510
+        genomeRanges <- genomeRanges[names(helperRanges)]
511
+        ########################################################################
512
+        ## There must be an R bug with `lengths` here as although it runs in 
513
+        ## Rcmd, it does not pass package building or vignette kniting... But 
514
+        ## for the time being it seems that it is not needed as the name 
515
+        ## filtering works
516
+        #lens <- which(lengths(genomeRanges)==0)
517
+        #if (length(lens)>0)
518
+        #    genomeRanges[lens] <- NULL
519
+        ########################################################################
520
+    }
521
+    
522
+    # Now we must follow two paths according to region type, genebody and custom
523
+    # areas with equal/unequal widths, or tss, tes and 1-width custom areas
524
+    callParams$customIsBase <- FALSE
525
+    if (region=="custom" && all(width(genomeRanges)==1)) {
526
+        if (all(flank==0)) {
527
+            warning("Flanking cannot be zero bp in both directions when the ",
528
+                "reference region is only 1bp! Setting to default ",
529
+                "(2000,2000)...",immediate.=TRUE)
530
+            flank <- c(2000,2000)
531
+            callParams$flank <- flank
532
+        }
533
+        callParams$customIsBase <- TRUE
534
+    }
535
+    if (region %in% c("tss","tes")) {
536
+        if (all(flank==0)) {
537
+            warning("Flanking cannot be zero bp in both directions when the ",
538
+                "reference region is \"tss\" or \"tes\"! Setting to default ",
539
+                "(2000,2000)...", immediate.=TRUE)
540
+            flank <- c(2000,2000)
541
+            callParams$flank <- flank
542
+        }
543
+    }
544
+    if (type=="chipseq")
545
+        input <- coverageRef(input,genomeRanges,region,flank,strandedParams,
546
+            rc=rc)#,bamParams)
547
+    else if (type=="rnaseq")
548
+        input <- coverageRnaRef(input,genomeRanges,helperRanges,flank,
549
+            strandedParams,rc=rc)#,bamParams)
550
+    
551
+    # If normalization method is linear, we must adjust the coverages
552
+    if (preprocessParams$normalize=="linear") {
553
+        linFac <- calcLinearFactors(input,preprocessParams)
554
+        for (n in names(input)) {
555
+            if (linFac[n]==1)
556
+                next
557
+            #if (is.null(input[[n]]$coverage$center))
558
+            input[[n]]$coverage <- cmclapply(input[[n]]$coverage,
559
+                function(x,f) {
560
+                    return(x*f)
561
+                },linFac[n]
562
+            )
563
+            #else {
564
+            #    input[[n]]$coverage$center <- 
565
+            #        cmclapply(input[[n]]$coverage$center,function(x,f) {
566
+            #            return(x*f)
567
+            #        },linFac[n])
568
+            #}
569
+        }
570
+    }
571
+    
572
+    # Now we must summarize and create the matrices. If genebody or unequal 
573
+    # custom lengths, bin is a must, else we leave to user
574
+    mustBin <- FALSE
575
+    if (region=="genebody")
576
+        mustBin <- TRUE
577
+    if (region=="custom") {
578
+        w <- width(genomeRanges)
579
+        if (any(w!=w[1]))
580
+            mustBin <- TRUE
581
+    }
582
+    
583
+    if (mustBin) {
584
+        if (binParams$regionBinSize==0) {
585
+            warning("Central region bin size not set for a region that must ",
586
+                "be binned! Setting to 1000...",immediate.=TRUE)
587
+            binParams$regionBinSize <- 1000
588
+        }
589
+    }
590
+    input <- profileMatrix(input,flank,binParams,rc)
591
+    
592
+    # Perform the k-means clustering if requested and append to design (which
593
+    # has been checked, if we are allowed to do so)
594
+    if (kmParams$k>0)
595
+        design <- kmeansDesign(input,design,kmParams)
596
+    
597
+    # Coverages and profiles calculated... Now depending on plot option, we go 
598
+    # further or return the enriched input object for saving
599
+    if (!plotParams$profile && !plotParams$heatmap) {
600
+        recoupObj <- toOutput(input,design,saveParams,callParams=callParams)
601
+        return(recoupObj)
602
+    }
603
+    else {
604
+        recoupObj <- toOutput(input,design,list(ranges=TRUE,coverage=TRUE,
605
+            profile=TRUE),callParams=callParams)
606
+    }
607
+            
608
+    ## Our plot objects
609
+    recoupPlots <- list()
610
+    
611
+    # We must pass the matrices to plotting function
612
+    if (plotParams$profile) {
613
+        message("Constructing genomic coverage profile curve(s)")
614
+        #theProfile <- recoupProfile(recoupObj,rc=rc)
615
+        recoupObj <- recoupProfile(recoupObj,rc=rc)
616
+        theProfile <- getr(recoupObj,"profile")
617
+        recoupPlots$profilePlot <- theProfile
618
+    }
619
+    
620
+    # Some default binning MUST be applied for the heatmap... Otherwise it could
621
+    # take insanely long time and space to draw/store
622
+    if (plotParams$heatmap) {
623
+        # Inform the user about enforced binning (or not)
624
+        if (region %in% c("tss","tes") || callParams$customIsBase) {
625
+            if (binParams$regionBinSize==0 && binParams$forceHeatmapBinning) {
626
+                message("The resolution of the requested profiles will be ",
627
+                    "lowered to avoid\nincreased computation time and/or ",
628
+                    "storage space for heatmap profiles...")
629
+                
630
+            }
631
+            else if (binParams$regionBinSize==0
632
+                && !binParams$forceHeatmapBinning)
633
+                warning("forceHeatmapBinning is turned off for high ",
634
+                    "resolution plotting. Be prepared for\nlong computational ",
635
+                    "times and big figures!",immediate.=TRUE)
636
+        }
637
+        else {
638
+            if ((binParams$regionBinSize==0 || binParams$flankBinSize==0)
639
+                && binParams$forceHeatmapBinning) {
640
+                message("The resolution of the requested profiles will be ",
641
+                    "lowered to avoid\nincreased computation time and/or ",
642
+                    "storage space for heatmap profiles...")
643
+                
644
+            }
645
+            else if ((binParams$regionBinSize==0 || binParams$flankBinSize==0)
646
+                && !binParams$forceHeatmapBinning)
647
+                warning("forceHeatmapBinning is turned off for high ",
648
+                    "resolution plotting. Be prepared for\nlong computational ",
649
+                    "times and big figures!",immediate.=TRUE)
650
+        }
651
+        
652
+        if (binParams$forceHeatmapBinning 
653
+            && (binParams$regionBinSize==0 || binParams$flankBinSize==0)) {
654
+            helpInput <- recoupObj$data
655
+            if (region %in% c("tss","tes") || callParams$customIsBase) {
656
+                for (n in names(helpInput)) {
657
+                    message("Calculating ",region," profile for ",
658
+                        helpInput[[n]]$name)
659
+                    helpInput[[n]]$profile <- 
660
+                        binCoverageMatrix(helpInput[[n]]$coverage,
661
+                            binSize=binParams$forcedBinSize[2],
662
+                            stat=binParams$sumStat,rc=rc)
663
+                    helpInput[[n]]$profile <- helpInput[[n]]$profile
664
+                }
665
+            }
666
+            else {
667
+                for (n in names(helpInput)) {
668
+                    message("Calculating ",region," profile for ",
669
+                        helpInput[[n]]$name)
670
+                    message(" center")
671
+                    #center <- binCoverageMatrix(helpInput[[n]]$coverage$center,
672
+                    #    binSize=binParams$forcedBinSize[2],
673
+                    #    stat=binParams$sumStat,rc=rc)
674
+                    center <- binCoverageMatrix(
675
+                        input[[n]]$coverage,binSize=binParams$forcedBinSize[2],
676
+                        stat=binParams$sumStat,
677
+                        interpolation=binParams$interpolation,
678
+                        flank=flank,where="center",rc=rc
679
+                    )
680
+                    message(" upstream")
681
+                    #left <- binCoverageMatrix(helpInput[[n]]$coverage$upstream,
682
+                    #    binSize=binParams$forcedBinSize[1],
683
+                    #    stat=binParams$sumStat,rc=rc)
684
+                    left <- binCoverageMatrix(
685
+                        input[[n]]$coverage,binSize=binParams$forcedBinSize[1],
686
+                        stat=binParams$sumStat,
687
+                        interpolation=binParams$interpolation,flank=flank,
688
+                        where="upstream",rc=rc
689
+                    )
690
+                    message(" downstream")
691
+                    #right <- binCoverageMatrix(
692
+                    #    helpInput[[n]]$coverage$downstream,
693
+                    #    binSize=binParams$forcedBinSize[1],
694
+                    #    stat=binParams$sumStat,rc=rc)
695
+                    right <- binCoverageMatrix(
696
+                        input[[n]]$coverage,binSize=forcedBinSize[1],
697
+                        stat=binParams$sumStat,
698
+                        interpolation=binParams$interpolation,flank=flank,
699
+                        where="downstream",rc=rc
700
+                    )
701
+                    helpInput[[n]]$profile <- cbind(left,center,right)
702
+                    rownames(helpInput[[n]]$profile) <-
703
+                        names(input[[n]]$coverage)
704
+                    helpInput[[n]]$profile <- helpInput[[n]]$profile
705
+                }
706
+            }
707
+        }
708
+        else
709
+            helpInput <- recoupObj$data
710
+        
711
+        helpObj <- recoupObj
712
+        helpObj$data <- helpInput
713
+        message("Constructing genomic coverage heatmap(s)")
714
+        #theHeatmap <- recoupHeatmap(helpObj,rc=rc)
715
+        helpObj <- recoupHeatmap(helpObj,rc=rc)
716
+        theHeatmap <- getr(helpObj,"heatmap")
717
+        recoupObj <- setr(recoupObj,"heatmap",theHeatmap)
718
+        recoupPlots$heatmapPlot <- theHeatmap
719
+        
720
+        # Derive the main heatmap in case of hierarchical clustering        
721
+        mainh <- 1
722
+        if (length(grep("hc",orderBy$what))>0) {
723
+            nc <- nchar(orderBy$what)
724
+            mh <- suppressWarnings(as.numeric(substr(orderBy$what,nc,nc)))
725
+            if (is.na(mh))
726
+                warning("Reference profile for hierarchical clustering order ",
727
+                    "not recognized! Using the 1st...",immediate.=TRUE)
728
+            else if (mh > length(input)) {
729
+                warning("Reference profile (",mh,") for hierarchical ",
730
+                    "clustering order does not exist (the input has only ",
731
+                    length(input)," sources! Using the last...",
732
+                    immediate.=TRUE)
733
+                    mainh <- length(input)
734
+            }
735
+            else
736
+                mainh <- mh
737
+        }
738
+    }
739
+    
740
+    # We must pass the matrices to plotting function
741
+    if (plotParams$correlation) {
742
+        message("Constructing coverage correlation profile curve(s)")
743
+        recoupObj <- recoupCorrelation(recoupObj,rc=rc)
744
+        theCorrelation <- getr(recoupObj,"correlation")
745
+        recoupPlots$correlationPlot <- theCorrelation
746
+    }
747
+    
748
+    # Overwrite final object so as to return it
749
+    recoupObj <- toOutput(input,design,saveParams,recoupPlots,callParams)
750
+    
751
+    # Make any plots asked
752
+    if (plotParams$plot) {
753
+        what <- character(0)
754
+        if (plotParams$profile)
755
+            what <- c(what,"profile")
756
+        if (plotParams$heatmap)
757
+            what <- c(what,"heatmap")
758
+        if (plotParams$correlation)
759
+            what <- c(what,"correlation")
760
+        if (length(what)>0)
761
+            recoupPlot(recoupObj,what,plotParams$device,plotParams$outputDir,
762
+                plotParams$outputBase,mainh)
763
+    }
764
+    
765
+    # Return the enriched input object according to save options
766
+    return(recoupObj)
767
+}
768
+
769
+applySelectors <- function(ranges,selector,rc=NULL) {
770
+    if (!is.null(selector$id)) {
771
+        ranges <- ranges[selector$ids]
772
+        if (length(ranges)==0)
773
+            stop("No ranges left after using the identifiers provided with ",
774
+                "the selector field. Are you sure the identifiers between the ",
775
+                "two files are compatible?")
776
+    }
777
+    if (!is.null(selector$biotype) && !is.null(ranges$biotype)) {
778
+        good <- which(ranges$biotype %in% selector$biotype)
779
+        ranges <- ranges[good]
780
+        if (length(ranges)==0)
781
+            stop("No ranges left after using the biotypes provided with the ",
782
+                "selector field. Are you sure the identifiers between the two ",
783
+                "files are compatible?")
784
+    }
785
+    if (!is.null(selector$exonType) && !is.null(ranges$exon_type)) {
786
+        good <- which(ranges$exonType %in% selector$exonType)
787
+        ranges <- ranges[good]
788
+        if (length(ranges)==0)
789
+            stop("No ranges left after using the exon types provided with ",
790
+                "the selector field. Are you sure the identifiers between the ",
791
+                "two files are compatible?")
792
+    }
793
+    return(ranges)
794
+}
795
+
796
+toOutput <- function(input,design=NULL,saveParams,plotObjs=NULL,
797
+    callParams=NULL) {
798
+    if (!saveParams$ranges)
799
+        input <- removeData(input,"ranges")
800
+    if (!saveParams$coverage)
801
+        input <- removeData(input,"coverage")
802
+    if (!saveParams$profile)
803
+        input <- removeData(input,"profile")
804
+    plots <- list(profile=NULL,heatmap=NULL,correlation=NULL)
805
+    if (!is.null(plotObjs) && saveParams$profilePlot)
806
+        plots$profile <- plotObjs$profilePlot
807
+    if (!is.null(plotObjs) && saveParams$heatmapPlot)
808
+        plots$heatmap <- plotObjs$heatmapPlot
809
+    if (!is.null(plotObjs) && saveParams$correlationPlot)
810
+        plots$correlation <- plotObjs$correlationPlot
811
+    return(list(
812
+        data=input,
813
+        design=design,
814
+        plots=plots,
815
+        callopts=callParams
816
+    ))
817
+}