Browse code

bsseq now uses DelayedMatrix objects from the DelayedArray package for all matrix-like data

Peter Hickey authored on 07/04/2017 17:42:29
Showing 40 changed files

... ...
@@ -3,3 +3,4 @@
3 3
 .RData$
4 4
 auto/
5 5
 *~
6
+.Rhistory
... ...
@@ -1,19 +1,19 @@
1 1
 Package: bsseq
2
-Version: 1.9.2
2
+Version: 1.11.1
3
+Encoding: UTF-8
3 4
 Title: Analyze, manage and store bisulfite sequencing data
4 5
 Description: A collection of tools for analyzing and visualizing bisulfite
5 6
     sequencing data.
6 7
 Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("aut", "cre"),
7 8
     email = "kasperdanielhansen@gmail.com"),
8
-    person("Peter", "Hickey", role = "ctb", email = "peter.hickey@gmail.com"))
9
+    person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"))
9 10
 Depends:
10 11
     R (>= 3.3),
11 12
     methods,
12 13
     BiocGenerics,
13 14
     GenomicRanges (>= 1.23.7),
14 15
     SummarizedExperiment (>= 0.1.1),
15
-    parallel,
16
-    limma
16
+    parallel
17 17
 Imports:
18 18
     IRanges (>= 2.5.17),
19 19
     GenomeInfoDb,
... ...
@@ -27,19 +27,22 @@ Imports:
27 27
     S4Vectors,
28 28
     R.utils (>= 2.0.0),
29 29
     matrixStats (>= 0.50.0),
30
-    permute
30
+    permute,
31
+    limma,
32
+    DelayedArray,
33
+    HDF5Array
31 34
 Suggests:
32 35
     RUnit,
33 36
     bsseqData,
34 37
     BiocStyle,
35
-    knitr
38
+    knitr,
36 39
 Collate:
40
+    utils.R
37 41
     hasGRanges.R
38 42
     BSseq_class.R
39 43
     BSseqTstat_class.R
40 44
     BSseq_utils.R
41 45
     combine.R
42
-    utils.R
43 46
     read.bsmooth.R
44 47
     read.bismark.R
45 48
     BSmooth.R
... ...
@@ -52,6 +55,9 @@ Collate:
52 55
     BSmooth.fstat.R
53 56
     BSseqStat_class.R
54 57
     getStats.R
58
+    hdf5_utils.R
59
+    combine_utils.R
60
+    DelayedArray_utils.R
55 61
 License: Artistic-2.0
56 62
 VignetteBuilder: knitr
57 63
 URL: https://github.com/kasperdanielhansen/bsseq
... ...
@@ -10,7 +10,8 @@ importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames",
10 10
            "strand", "strand<-", "union", "unique", "updateObject")
11 11
 importFrom(stats, "approxfun", "fisher.test", "ppoints",
12 12
            "predict", "preplot", "qchisq",
13
-           "qnorm", "qqplot", "qunif", "cov2cor")
13
+           "qnorm", "qqplot", "qunif", "cov2cor",
14
+           "plogis")
14 15
 importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
15 16
            "mtext", "par", "plot", "points", "polygon", "rect", "rug", "text")
16 17
 import(parallel)
... ...
@@ -33,6 +34,9 @@ importFrom(data.table, "fread", "setnames")
33 34
 importFrom(R.utils, "isGzipped", "gunzip")
34 35
 import(limma)
35 36
 importFrom(permute, "shuffleSet", "how")
37
+importClassesFrom(DelayedArray, "DelayedArray", "DelayedMatrix")
38
+importFrom(DelayedArray, "DelayedArray", "plogis", "pmin2", "pmax2",
39
+           "setRealizeBackend", "type")
36 40
 
37 41
 ##
38 42
 ## Exporting
... ...
@@ -17,7 +17,7 @@ makeClusters <- function(hasGRanges, maxGap = 10^8) {
17 17
     names(clusterIdx) <- NULL
18 18
     clusterIdx
19 19
 }
20
-    
20
+
21 21
 
22 22
 BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"),
23 23
                     mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) {
... ...
@@ -29,8 +29,8 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam
29 29
         if(verbose >= 2)
30 30
             cat(sprintf("[BSmooth]   smoothing start: sample:%s, chr:%s, nLoci:%s\n",
31 31
                         this_sample_chr[1], this_sample_chr[2], length(idxes)))
32
-        Cov <- getCoverage(BSseq, type = "Cov")[idxes, sampleIdx]
33
-        M <- getCoverage(BSseq, type = "M")[idxes, sampleIdx]
32
+        Cov <- getCoverage(BSseq, type = "Cov")[idxes, sampleIdx, drop = FALSE]
33
+        M <- getCoverage(BSseq, type = "M")[idxes, sampleIdx, drop = FALSE]
34 34
         pos <- start(BSseq)[idxes]
35 35
         stopifnot(all(diff(pos) > 0))
36 36
         wh <- which(Cov != 0)
... ...
@@ -45,8 +45,9 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam
45 45
                         trans = NULL, h = h, nn = nn))
46 46
         }
47 47
         sdata <- data.frame(pos = pos[wh],
48
-                            M = pmin(pmax(M[wh], 0.01), Cov[wh] - 0.01),
49
-                            Cov = Cov[wh])
48
+                            M = as.array(pmin2(pmax2(M[wh, ], 0.01),
49
+                                               Cov[wh, ] - 0.01)),
50
+                            Cov = Cov[wh, ])
50 51
         fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata,
51 52
                       weights = Cov, family = "binomial", maxk = 10000)
52 53
         pp <- preplot(fit, where = "data", band = "local",
... ...
@@ -119,11 +120,15 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam
119 120
         coef <- do.call(rbind, lapply(out, function(xx) xx$coef))
120 121
         se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef))
121 122
     })
123
+    coef <- .DelayedMatrix(coef)
124
+    if (!is.null(se.coef)) {
125
+        se.coef <- .DelayedMatrix(se.coef)
126
+    }
122 127
     ptime.outer2 <- proc.time()
123 128
     stime.outer <- (ptime.outer2 - ptime.outer1)[3]
124 129
     if(verbose)
125 130
         cat(sprintf("[BSmooth] smoothing done in %.1f sec\n", stime.outer))
126
-    
131
+
127 132
     rownames(coef) <- NULL
128 133
     colnames(coef) <- sampleNames(BSseq)
129 134
     if(!is.null(se.coef)) {
... ...
@@ -135,16 +140,8 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam
135 140
         assay(BSseq, "coef") <- coef
136 141
     if(!is.null(se.coef))
137 142
         assay(BSseq, "se.coef") <- se.coef
138
-    mytrans <- function(x) {
139
-        y <- x
140
-        ix <- which(x < 0)
141
-        ix2 <- which(x > 0)
142
-        y[ix] <- exp(x[ix])/(1 + exp(x[ix]))
143
-        y[ix2] <- 1/(1 + exp(-x[ix2]))
144
-        y
145
-    }
146
-    environment(mytrans) <- baseenv()
147
-    BSseq@trans <- mytrans
143
+    BSseq@trans <- plogis
144
+
148 145
     parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap),
149 146
                        ns = ns, h = h, maxGap = maxGap)
150 147
     BSseq@parameters <- parameters
... ...
@@ -9,6 +9,7 @@ BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){
9 9
     ptime1 <- proc.time()
10 10
     allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
11 11
                      confint = FALSE)
12
+    allPs <- as.array(allPs)
12 13
     fit <- lmFit(allPs, design)
13 14
     fitC <- contrasts.fit(fit, contrasts)
14 15
     ## Need
... ...
@@ -17,7 +18,7 @@ BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){
17 18
     ##   tstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma
18 19
     ##   rawSds <- fitC$sigma
19 20
     ##   cor.coefficients <- cov2cor(fitC$cov.coefficients)
20
-    rawSds <- fitC$sigma
21
+    rawSds <- as.matrix(fitC$sigma)
21 22
     cor.coefficients <- cov2cor(fitC$cov.coefficients)
22 23
     rawTstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma
23 24
     names(dimnames(rawTstats)) <- NULL
... ...
@@ -31,7 +32,8 @@ BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){
31 32
                   cor.coefficients = cor.coefficients,
32 33
                   rawTstats = rawTstats)
33 34
     out <- BSseqStat(gr = granges(BSseq),
34
-                     stats = stats, parameters = parameters)
35
+                     stats = stats,
36
+                     parameters = parameters)
35 37
     out
36 38
 }
37 39
 
... ...
@@ -57,8 +59,12 @@ smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1,
57 59
     if(verbose) cat(sprintf("done in %.1f sec\n", stime))
58 60
     smoothSds <- do.call("c",
59 61
                          mclapply(clusterIdx, function(idx) {
60
-                             smoothSd(getStats(BSseqStat, what = "rawSds")[idx], k = k, qSd = qSd)
62
+                             rawSds <- getStats(BSseqStat,
63
+                                                what = "rawSds")[idx, ]
64
+                             rawSds <- as.array(rawSds)
65
+                             smoothSd(rawSds, k = k, qSd = qSd)
61 66
                          }, mc.cores = mc.cores))
67
+    smoothSds <- .DelayedMatrix(as.matrix(smoothSds))
62 68
     if("smoothSds" %in% names(getStats(BSseqStat)))
63 69
         BSseqStat@stats[["smoothSds"]] <- smoothSds
64 70
     else
... ...
@@ -68,21 +74,30 @@ smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1,
68 74
 
69 75
 # Quieten R CMD check
70 76
 globalVariables("tstat")
77
+
78
+# NOTE: Realises in memory a matrix with nrow = length(BSseqStat) and
79
+#       ncol = length(coef)
71 80
 computeStat <- function(BSseqStat, coef = NULL) {
72 81
     stopifnot(is(BSseqStat, "BSseqStat"))
73 82
     if(is.null(coef)) {
74 83
         coef <- 1:ncol(getStats(BSseqStat, what = "rawTstats"))
75 84
     }
76
-    tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE]
77
-    tstats <- tstats * getStats(BSseqStat, what = "rawSds") /
85
+    raw_tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE]
86
+    scaled_sds <- getStats(BSseqStat, what = "rawSds") /
78 87
         getStats(BSseqStat, what = "smoothSds")
88
+    scaled_sds_matrix <- do.call(cbind, replicate(ncol(raw_tstats), scaled_sds))
89
+    tstats <- raw_tstats * scaled_sds_matrix
79 90
     if(length(coef) > 1) {
80
-        cor.coefficients <- getStats(BSseqStat, what = "cor.coefficients")[coef,coef]
81
-        stat <- as.numeric(classifyTestsF(tstats, cor.coefficients,
82
-                                          fstat.only = TRUE))
91
+        cor.coefficients <- getStats(BSseqStat,
92
+                                     what = "cor.coefficients")[coef,coef]
93
+        # NOTE: classifyTestsF() calls as.matrix(tstats) and so realises this
94
+        #       array
95
+        stat <- .DelayedMatrix(as.matrix(classifyTestsF(tstats,
96
+                                                        cor.coefficients,
97
+                                                        fstat.only = TRUE)))
83 98
         stat.type <- "fstat"
84 99
     } else {
85
-        stat <- as.numeric(tstats)
100
+        stat <- .DelayedMatrix(tstats)
86 101
         stat.type <- "tstat"
87 102
     }
88 103
     if("stat" %in% names(getStats(BSseqStat))) {
... ...
@@ -114,9 +129,9 @@ localCorrectStat <- function(BSseqStat, threshold = c(-15,15), mc.cores = 1, ver
114 129
         xx.reg <- seq(from = min(xx), to = max(xx), by = 2000)
115 130
         yy.reg <- tstat.function(xx.reg)
116 131
         fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0),
117
-                      family = "huber", maxk = 50000) 
132
+                      family = "huber", maxk = 50000)
118 133
         correction <- predict(fit, newdata = data.frame(xx.reg = xx))
119
-        yy - correction 
134
+        yy - correction
120 135
     }
121 136
     maxGap <- BSseqStat$parameters$maxGap
122 137
     if(verbose) cat("[BSmooth.tstat] preprocessing ... ")
... ...
@@ -20,9 +20,9 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
20 20
         xx.reg <- seq(from = min(xx), to = max(xx), by = 2000)
21 21
         yy.reg <- tstat.function(xx.reg)
22 22
         fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0),
23
-                      family = "huber", maxk = 50000) 
23
+                      family = "huber", maxk = 50000)
24 24
         correction <- predict(fit, newdata = data.frame(xx.reg = xx))
25
-        yy - correction 
25
+        yy - correction
26 26
     }
27 27
 
28 28
     estimate.var <- match.arg(estimate.var)
... ...
@@ -38,7 +38,7 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
38 38
     if(is.character(group2)) {
39 39
         stopifnot(all(group2 %in% sampleNames(BSseq)))
40 40
         group2 <- match(group2, sampleNames(BSseq))
41
-    }    
41
+    }
42 42
     if(is.numeric(group2)) {
43 43
         stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
44 44
     } else stop("problems with argument 'group2'")
... ...
@@ -48,22 +48,29 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
48 48
     stopifnot(length(group1) + length(group2) >= 3)
49 49
     if(estimate.var == "paired")
50 50
         stopifnot(length(group1) == length(group2))
51
-    
52
-    if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0))
53
-        warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
54
-    
51
+
52
+    if (!.isHDF5ArrayBacked(getCoverage(BSseq))) {
53
+        ## TODO: This check can be super expensive if Cov is a HDF5Matrix, so
54
+        ##       we skip it for the time being. Really noticeable when Cov
55
+        ##       needs row subsetted/reordered (i.e. has Cov@index[[1]] exists)
56
+        ##
57
+        if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0)) {
58
+            warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
59
+        }
60
+    }
61
+
55 62
     if(is.null(maxGap))
56 63
         maxGap <- BSseq@parameters$maxGap
57 64
     if(is.null(maxGap))
58 65
         stop("need to set argument 'maxGap'")
59
-    
66
+
60 67
     if(verbose) cat("[BSmooth.tstat] preprocessing ... ")
61 68
     ptime1 <- proc.time()
62 69
     clusterIdx <- makeClusters(BSseq, maxGap = maxGap)
63 70
     ptime2 <- proc.time()
64 71
     stime <- (ptime2 - ptime1)[3]
65 72
     if(verbose) cat(sprintf("done in %.1f sec\n", stime))
66
-        
73
+
67 74
     if(verbose) cat("[BSmooth.tstat] computing stats within groups ... ")
68 75
     ptime1 <- proc.time()
69 76
     allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
... ...
@@ -73,12 +80,12 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
73 80
     ptime2 <- proc.time()
74 81
     stime <- (ptime2 - ptime1)[3]
75 82
     if(verbose) cat(sprintf("done in %.1f sec\n", stime))
76
-    
83
+
77 84
     if(verbose) cat("[BSmooth.tstat] computing stats across groups ... ")
78 85
     ptime1 <- proc.time()
79 86
     switch(estimate.var,
80 87
            "group2" = {
81
-               rawSds <- rowSds(allPs, cols = group2, na.rm = TRUE)
88
+               rawSds <- .rowSds(x = allPs, cols = group2, na.rm = TRUE)
82 89
                smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
83 90
                    smoothSd(rawSds[idx], k = k)
84 91
                }, mc.cores = mc.cores))
... ...
@@ -86,9 +93,9 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
86 93
                tstat.sd <- smoothSds * scale
87 94
            },
88 95
            "same" = {
89
-               rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs, cols = group1) +
90
-                                (length(group2) - 1) * rowVars(allPs, cols = group2)) /
91
-                              (length(group1) + length(group2) - 2))
96
+               rawSds <- sqrt( ((length(group1) - 1) * .rowVars(x = allPs, cols = group1) +
97
+                                    (length(group2) - 1) * .rowVars(x = allPs, cols = group2)) /
98
+                                   (length(group1) + length(group2) - 2))
92 99
                smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
93 100
                    smoothSd(rawSds[idx], k = k)
94 101
                }, mc.cores = mc.cores))
... ...
@@ -96,7 +103,7 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
96 103
                tstat.sd <- smoothSds * scale
97 104
            },
98 105
            "paired" = {
99
-               rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE])
106
+               rawSds <- .rowSds(x = allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE])
100 107
                smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
101 108
                    smoothSd(rawSds[idx], k = k)
102 109
                }, mc.cores = mc.cores))
... ...
@@ -113,24 +120,24 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
113 120
     ptime2 <- proc.time()
114 121
     stime <- (ptime2 - ptime1)[3]
115 122
     if(verbose) cat(sprintf("done in %.1f sec\n", stime))
116
-    
123
+
117 124
     if(local.correct) {
118 125
         stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
119 126
                        tstat, tstat.corrected)
120 127
         colnames(stats) <- c("rawSds", "tstat.sd",
121 128
                              "group2.means", "group1.means", "tstat",
122 129
                              "tstat.corrected")
123
- 
130
+
124 131
     } else {
125 132
         stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
126 133
                        tstat)
127 134
         colnames(stats) <- c("rawSds", "tstat.sd",
128 135
                              "group2.means", "group1.means", "tstat")
129 136
     }
130
-    
137
+
131 138
     parameters <- c(BSseq@parameters,
132 139
                     list(tstatText = sprintf("BSmooth.tstat (local.correct = %s, maxGap = %d)",
133
-                         local.correct, maxGap),
140
+                                             local.correct, maxGap),
134 141
                          group1 = group1, group2 = group2, k = k, qSd = qSd,
135 142
                          local.correct = local.correct, maxGap = maxGap))
136 143
     out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters)
... ...
@@ -1,4 +1,4 @@
1
-setClass("BSseqStat", contains = "hasGRanges", 
1
+setClass("BSseqStat", contains = "hasGRanges",
2 2
          representation(stats = "list",
3 3
                         parameters = "list")
4 4
          )
... ...
@@ -8,17 +8,12 @@ setValidity("BSseqStat", function(object) {
8 8
     if(is.null(names(object@stats)) || any(names(object@stats) == "") ||
9 9
        anyDuplicated(names(object@stats)))
10 10
         msg <- validMsg(msg, "the 'stats' list needs to be named with unique names.")
11
-    for(name in c("rawSds", "smoothsSds", "stat")) {
12
-        if(name %in% names(object@stats) && !is.vector(object@stats[[name]]))
13
-            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to be a vector", name))
14
-        if(name %in% names(object@stats) && length(object@stats[[name]]) != length(object@gr))
15
-            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to have the same length as slot 'gr'", name))
16
-    }
17
-    for(name in c("rawTstats")) {
18
-        if(name %in% names(object@stats) && !is.matrix(object@stats[[name]]))
19
-            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to be a matrix", name))
20
-        if(name %in% names(object@stats) && nrow(object@stats[[name]]) != length(object@gr))
21
-            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to have the same number of rows as slot 'gr' is long", name))
11
+    for(name in c("rawSds", "smoothsSds", "stat", "rawTstats")) {
12
+        if(name %in% names(object@stats) &&
13
+           !is(object@stats[[name]], "DelayedMatrix"))
14
+            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to be a DelayedMatrix object", name))
15
+        if(name %in% names(object@stats) && isTRUE(nrow(object@stats[[name]]) != length(object@gr)))
16
+            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to have the same number of rows as slot 'gr' is long", name))
22 17
     }
23 18
     if(is.null(msg)) TRUE else msg
24 19
 })
... ...
@@ -29,8 +24,17 @@ setMethod("show", signature(object = "BSseqStat"),
29 24
               cat(" ", length(object), "methylation loci\n")
30 25
               cat("based on smoothed data:\n")
31 26
               cat(" ", object@parameters$smoothText, "\n")
32
-              cat("with parameters\n")
33
-              cat(" ", object@parameters$tstatText, "\n")
27
+              not_delayed_matrices <- c("cor.coefficients", "stat.type")
28
+              delayed_matrices <- setdiff(names(object@stats),
29
+                                          not_delayed_matrices)
30
+              is_HDF5Array_backed <- vapply(object@stats[delayed_matrices],
31
+                                            .isHDF5ArrayBacked,
32
+                                            logical(1L))
33
+              if (any(is_HDF5Array_backed)) {
34
+                  cat("'stats' slot is HDF5Array-backed\n")
35
+              } else {
36
+                  cat("'stats' slot is in-memory\n")
37
+              }
34 38
           })
35 39
 
36 40
 setMethod("[", "BSseqStat", function(x, i, ...) {
... ...
@@ -42,14 +46,11 @@ setMethod("[", "BSseqStat", function(x, i, ...) {
42 46
     statnames <- names(x@stats)
43 47
     names(statnames) <- statnames
44 48
     x@stats <- lapply(statnames, function(nam) {
45
-        if(nam %in% c("rawTstats", "modelCoefficients")) {
46
-            stopifnot(is.matrix(x@stats[[nam]]))
49
+        if(nam %in% c("rawTstats", "modelCoefficients", "rawSds", "smoothSds",
50
+                      "stat")) {
51
+            stopifnot(is(x@stats[[nam]], "DelayedMatrix"))
47 52
             return(x@stats[[nam]][i,,drop=FALSE])
48 53
         }
49
-        if(nam %in% c("rawSds", "smoothSds", "stat")) {
50
-            stopifnot(is.vector(x@stats[[nam]]))
51
-            return(x@stats[[nam]][i])
52
-        }
53 54
         x@stats[[nam]]
54 55
     })
55 56
     x
... ...
@@ -58,11 +59,26 @@ setMethod("[", "BSseqStat", function(x, i, ...) {
58 59
 BSseqStat <- function(gr = NULL, stats = NULL, parameters = NULL) {
59 60
     out <- new("BSseqStat")
60 61
     out@gr <- gr
62
+    not_delayed_matrices <- c("cor.coefficients", "stat.type")
63
+    delayed_matrices <- setdiff(names(stats), not_delayed_matrices)
64
+    stats[delayed_matrices] <- endoapply(stats[delayed_matrices],
65
+                                         .DelayedMatrix)
61 66
     out@stats <- stats
62 67
     out@parameters <- parameters
63 68
     out
64 69
 }
65 70
 
71
+setMethod("updateObject", "BSseqStat",
72
+          function(object, ...) {
73
+              not_delayed_matrices <- c("cor.coefficients", "stat.type")
74
+              delayed_matrices <- setdiff(names(stats), not_delayed_matrices)
75
+              stats <- object@stats
76
+              stats[delayed_matrices] <- endoapply(stats[delayed_matrices],
77
+                                                   .DelayedMatrix)
78
+              object@stats <- stats
79
+              object
80
+          }
81
+)
66 82
 
67 83
 ## summary.BSseqStat <- function(object, ...) {
68 84
 ##     quant <- quantile(getStats(object)[, "tstat.corrected"],
... ...
@@ -1,5 +1,5 @@
1
-setClass("BSseqTstat", contains = "hasGRanges", 
2
-         representation(stats = "matrix",
1
+setClass("BSseqTstat", contains = "hasGRanges",
2
+         representation(stats = "DelayedMatrix",
3 3
                         parameters = "list")
4 4
          )
5 5
 setValidity("BSseqTstat", function(object) {
... ...
@@ -17,6 +17,11 @@ setMethod("show", signature(object = "BSseqTstat"),
17 17
               cat(" ", object@parameters$smoothText, "\n")
18 18
               cat("with parameters\n")
19 19
               cat(" ", object@parameters$tstatText, "\n")
20
+              if (.isHDF5ArrayBacked(object@stats)) {
21
+                  cat("'stats' slot is HDF5Array-backed\n")
22
+              } else {
23
+                  cat("'stats' slot is in-memory\n")
24
+              }
20 25
           })
21 26
 
22 27
 setMethod("[", "BSseqTstat", function(x, i, ...) {
... ...
@@ -32,14 +37,14 @@ setMethod("[", "BSseqTstat", function(x, i, ...) {
32 37
 BSseqTstat <- function(gr = NULL, stats = NULL, parameters = NULL) {
33 38
     out <- new("BSseqTstat")
34 39
     out@gr <- gr
35
-    out@stats <- stats
40
+    out@stats <- .DelayedMatrix(stats)
36 41
     out@parameters <- parameters
37 42
     out
38 43
 }
39 44
 
40 45
 summary.BSseqTstat <- function(object, ...) {
41
-    quant <- quantile(getStats(object)[, "tstat.corrected"],
42
-                      prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999))
46
+    quant <- .quantile(getStats(object)[, "tstat.corrected"],
47
+                       prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999))
43 48
     quant <- t(t(quant))
44 49
     colnames(quant) <- "quantiles"
45 50
     out <- list(quantiles = quant)
... ...
@@ -53,9 +58,11 @@ print.summary.BSseqTstat <- function(x, ...) {
53 58
 
54 59
 plot.BSseqTstat <- function(x, y, ...) {
55 60
     tstat <- getStats(x)[, "tstat"]
61
+    tstat <- as.array(tstat)
56 62
     plot(density(tstat), xlim = c(-10,10), col = "blue", main = "")
57 63
     if("tstat.corrected" %in% colnames(getStats(x))) {
58 64
         tstat.cor <- getStats(x)[, "tstat.corrected"]
65
+        tstat.cor <- as.array(tstat.cor)
59 66
         lines(density(tstat.cor), col = "black")
60 67
         legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1),
61 68
                col = c("blue", "black"))
... ...
@@ -65,3 +72,11 @@ plot.BSseqTstat <- function(x, y, ...) {
65 72
     }
66 73
 }
67 74
 
75
+setMethod("updateObject", "BSseqTstat",
76
+          function(object, ...) {
77
+              stats <- object@stats
78
+              stats <- .DelayedMatrix(stats)
79
+              object@stats <- stats
80
+              object
81
+          }
82
+)
... ...
@@ -1,22 +1,30 @@
1
-setClass("BSseq", contains = "RangedSummarizedExperiment", 
1
+setClass("BSseq", contains = "RangedSummarizedExperiment",
2 2
          representation(trans = "function",
3 3
                         parameters = "list"))
4 4
 
5 5
 setValidity("BSseq", function(object) {
6 6
     msg <- validMsg(NULL, .checkAssayNames(object, c("Cov", "M")))
7
+    msg <- validMsg(msg, .checkAssayClasses(object,
8
+                                            c("Cov", "M", "coef", "se.coef")))
7 9
     if(class(rowRanges(object)) != "GRanges")
8 10
         msg <- validMsg(msg, sprintf("object of class '%s' needs to have a 'GRanges' in slot 'rowRanges'", class(object)))
9 11
     ## benchmarking shows that min(assay()) < 0 is faster than any(assay() < 0) if it is false
10 12
     if(is.null(colnames(object)))
11 13
         msg <- validMsg(msg, "colnames (aka sampleNames) need to be set")
12
-    if(min(assay(object, "M")) < 0)
13
-        msg <- validMsg(msg, "the 'M' assay has negative entries")
14
-    if(min(assay(object, "Cov")) < 0)
15
-        msg <- validMsg(msg, "the 'Cov' assay has negative entries")
16
-    if(max(assay(object, "M") - assay(object, "Cov")) > 0.5)
17
-        msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay")
18
-    if(!is.null(rownames(assay(object, "M"))) ||
19
-       !is.null(rownames(assay(object, "Cov"))) ||
14
+    M <- assay(object, "M", withDimnames = FALSE)
15
+    Cov <- assay(object, "Cov", withDimnames = FALSE)
16
+    if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) {
17
+        ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so
18
+        ##       we skip it for the time being
19
+        if(min(assay(object, "M", withDimnames = FALSE)) < 0)
20
+            msg <- validMsg(msg, "the 'M' assay has negative entries")
21
+        if(min(assay(object, "Cov", withDimnames = FALSE)) < 0)
22
+            msg <- validMsg(msg, "the 'Cov' assay has negative entries")
23
+        if(max(assay(object, "M", withDimnames = FALSE) -
24
+               assay(object, "Cov", withDimnames = FALSE)) > 0.5)
25
+            msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay")
26
+    }
27
+    if(!is.null(rownames(M)) || !is.null(rownames(Cov)) ||
20 28
        ("coef" %in% assayNames(object) && !is.null(rownames(assay(object, "coef")))) ||
21 29
        ("se.coef" %in% assayNames(object) && !is.null(rownames(assay(object, "se.coef")))))
22 30
         warning("unnecessary rownames in object")
... ...
@@ -34,6 +42,11 @@ setMethod("show", signature(object = "BSseq"),
34 42
               } else {
35 43
                   cat("has not been smoothed\n")
36 44
               }
45
+              if (.isHDF5ArrayBacked(object)) {
46
+                  cat("Some assays are HDF5Array-backed\n")
47
+              } else {
48
+                  cat("All assays are in-memory\n")
49
+              }
37 50
           })
38 51
 
39 52
 setMethod("pData", "BSseq", function(object) {
... ...
@@ -93,7 +106,7 @@ getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans
93 106
         return(BSseq@parameters)
94 107
     if(type == "gr")
95 108
         return(BSseq@rowRanges)
96
-    
109
+
97 110
 }
98 111
 
99 112
 BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
... ...
@@ -111,10 +124,14 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
111 124
         stop("'gr' needs to have widths of 1")
112 125
     if(is.null(M) || is.null(Cov))
113 126
         stop("Need M and Cov")
114
-    if(!is.matrix(M))
115
-        stop("'M' needs to be a matrix")
116
-    if(!is.matrix(Cov))
117
-        stop("'Cov' needs to be a matrix")
127
+    M <- .DelayedMatrix(M)
128
+    Cov <- .DelayedMatrix(Cov)
129
+    if (!is.null(coef)) {
130
+        coef <- .DelayedMatrix(coef)
131
+    }
132
+    if (!is.null(se.coef)) {
133
+        se.coef <- .DelayedMatrix(se.coef)
134
+    }
118 135
     if(length(gr) != nrow(M) ||
119 136
        length(gr) != nrow(Cov) ||
120 137
        ncol(Cov) != ncol(M))
... ...
@@ -139,9 +156,15 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
139 156
     if(length(unique(sampleNames)) != ncol(M))
140 157
         stop("sampleNames need to be unique and of the right length.")
141 158
     ## check that 0 <= M <= Cov and remove positions with Cov = 0
142
-    if(any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) ||
143
-       any(is.infinite(Cov)))
144
-        stop("'M' and 'Cov' may not contain NA or infinite values and 0 <= M <= Cov")
159
+    if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) {
160
+        ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so
161
+        ##       we skip it for the time being
162
+        if (any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) ||
163
+            any(is.infinite(Cov))) {
164
+            stop("'M' and 'Cov' may not contain NA or infinite values and ",
165
+                 "0 <= M <= Cov")
166
+        }
167
+    }
145 168
     if(rmZeroCov) {
146 169
         wh <- which(rowSums(Cov) == 0)
147 170
         if(length(wh) > 0) {
... ...
@@ -156,7 +179,7 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
156 179
         mm <- as.matrix(findOverlaps(grR, gr))
157 180
         mm <- mm[order(mm[,1]),]
158 181
         if(length(grR) == length(gr)) {
159
-            ## only re-ordering is necessary 
182
+            ## only re-ordering is necessary
160 183
             gr <- grR
161 184
             M <- M[mm[,2],,drop = FALSE]
162 185
             Cov <- Cov[mm[,2],,drop = FALSE]
... ...
@@ -171,8 +194,31 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
171 194
             gr <- grR
172 195
             sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))]
173 196
             names(sp) <- NULL
174
-            M <- do.call(rbind, lapply(sp, function(ii) colSums(M[ii,, drop = FALSE])))
175
-            Cov <- do.call(rbind, lapply(sp, function(ii) colSums(Cov[ii,, drop = FALSE])))
197
+            # TODO: .collapseDelayedMatrix() always return numeric; it may be
198
+            #       worth coercing M and Cov to integer DelayedMatrix objects,
199
+            #       which would halve storage requirements and impose some more
200
+            #       structure on the BSseq class (via new validity method
201
+            #       checks)
202
+            # NOTE: Tries to be smart about how collapsed DelayedMatrix should
203
+            #       be realized
204
+            if (.isHDF5ArrayBacked(M)) {
205
+                M_BACKEND <- "HDF5Array"
206
+            } else {
207
+                M_BACKEND <- NULL
208
+            }
209
+            M <- .collapseDelayedMatrix(x = M,
210
+                                        sp = sp,
211
+                                        MARGIN = 2,
212
+                                        BACKEND = M_BACKEND)
213
+            if (.isHDF5ArrayBacked(Cov)) {
214
+                Cov_BACKEND <- "HDF5Array"
215
+            } else {
216
+                Cov_BACKEND <- NULL
217
+            }
218
+            Cov <- .collapseDelayedMatrix(x = Cov,
219
+                                          sp = sp,
220
+                                          MARGIN = 2,
221
+                                          BACKEND = Cov_BACKEND)
176 222
         }
177 223
     }
178 224
     if(is.null(colnames(M)) || any(sampleNames != colnames(M)))
... ...
@@ -180,8 +226,8 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
180 226
     if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov)))
181 227
         colnames(Cov) <- sampleNames
182 228
     if(!is.null(coef)) {
183
-        if(!is.matrix(coef) ||
184
-           nrow(coef) != nrow(M) ||
229
+
230
+        if(nrow(coef) != nrow(M) ||
185 231
            ncol(coef) != ncol(M))
186 232
             stop("'coef' does not have the right dimensions")
187 233
         if(is.null(colnames(coef)) || any(sampleNames != colnames(coef)))
... ...
@@ -190,8 +236,7 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
190 236
             rownames(coef) <- NULL
191 237
     }
192 238
     if(!is.null(se.coef)) {
193
-        if(!is.matrix(se.coef) ||
194
-           nrow(se.coef) != nrow(M) ||
239
+        if(nrow(se.coef) != nrow(M) ||
195 240
            ncol(se.coef) != ncol(M))
196 241
             stop("'se.coef' does not have the right dimensions")
197 242
         if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef)))
... ...
@@ -213,19 +258,30 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
213 258
     BSseq
214 259
 }
215 260
 
216
-
217 261
 setMethod("updateObject", "BSseq",
218 262
           function(object, ...) {
219
-               if(.hasSlot(object, "assays")) {
220
-                   # call method for RangedSummarizedExperiment objects
221
-                   callNextMethod()
222
-               } else {
223
-                   BSseq(gr = object@gr, M = object@M, Cov = object@Cov,
224
-                         coef = object@coef, se.coef = object@se.coef,
225
-                         trans = object@trans, parameters = object@parameters,
226
-                         pData = object@phenoData@data)
227
-               }
228
-           })
263
+              # NOTE: identical() is too strong
264
+              if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
265
+                  object@trans <- plogis
266
+              }
267
+              if (.hasSlot(object, "assays")) {
268
+                  assays(object) <- endoapply(
269
+                      assays(object, withDimnames = FALSE), function(assay) {
270
+                          if (is.null(assay)) {
271
+                              return(assay)
272
+                          } else {
273
+                              .DelayedMatrix(assay)
274
+                          }
275
+                      })
276
+                  # call method for RangedSummarizedExperiment objects
277
+                  callNextMethod()
278
+              } else {
279
+                  BSseq(gr = object@gr, M = object@M, Cov = object@Cov,
280
+                        coef = object@coef, se.coef = object@se.coef,
281
+                        trans = object@trans, parameters = object@parameters,
282
+                        pData = object@phenoData@data)
283
+              }
284
+          })
229 285
 
230 286
 
231 287
 strandCollapse <- function(BSseq, shift = TRUE) {
... ...
@@ -285,7 +341,7 @@ strandCollapse <- function(BSseq, shift = TRUE) {
285 341
 ##                ## Based on the delta method
286 342
 ##                se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 +
287 343
 ##                             (data$se.coef[, sample2] * p2 * (1-p2))^2)
288
-               
344
+
289 345
 ##                lower <- d - z * se.d
290 346
 ##                upper <- d + z * se.d
291 347
 ##                out <- data.frame(d = d, lower = lower, upper = upper)
... ...
@@ -10,12 +10,33 @@ collapseBSseq <- function(BSseq, columns) {
10 10
     else
11 11
         columns.idx <- match(names(columns), sampleNames(BSseq))
12 12
     sp <- split(columns.idx, columns)
13
-    M <- do.call(cbind, lapply(sp, function(ss) {
14
-        rowSums(getBSseq(BSseq, "M")[, ss, drop = FALSE])
15
-    }))
16
-    Cov <- do.call(cbind, lapply(sp, function(ss) {
17
-        rowSums(getBSseq(BSseq, "Cov")[, ss, drop = FALSE])
18
-    }))
13
+    # TODO: .collapseDelayedMatrix() always return numeric; it may be
14
+    #       worth coercing M and Cov to integer DelayedMatrix objects,
15
+    #       which would halve storage requirements and impose some more
16
+    #       structure on the BSseq class (via new validity method
17
+    #       checks)
18
+    # NOTE: Tries to be smart about how collapsed DelayedMatrix should
19
+    #       be realized
20
+    M <- getBSseq(BSseq, "M")
21
+    if (.isHDF5ArrayBacked(M)) {
22
+        M_BACKEND <- "HDF5Array"
23
+    } else {
24
+        M_BACKEND <- NULL
25
+    }
26
+    M <- .collapseDelayedMatrix(x = M,
27
+                                sp = sp,
28
+                                MARGIN = 1,
29
+                                BACKEND = M_BACKEND)
30
+    Cov <- getBSseq(BSseq, "Cov")
31
+    if (.isHDF5ArrayBacked(Cov)) {
32
+        Cov_BACKEND <- "HDF5Array"
33
+    } else {
34
+        Cov_BACKEND <- NULL
35
+    }
36
+    Cov <- .collapseDelayedMatrix(x = Cov,
37
+                                  sp = sp,
38
+                                  MARGIN = 1,
39
+                                  BACKEND = Cov_BACKEND)
19 40
     BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov, sampleNames = names(sp))
20 41
 }
21 42
 
... ...
@@ -108,11 +129,10 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
108 129
         outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
109 130
         colnames(outMatrix) <- sampleNames(BSseq)
110 131
         outMatrix[as.integer(rownames(out)),] <- out
111
-        return(outMatrix)
132
+        .DelayedMatrix(outMatrix)
112 133
     }
113 134
 }
114 135
 
115
-
116 136
 getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
117 137
                     what = c("perBase", "perRegionAverage", "perRegionTotal")) {
118 138
     stopifnot(is(BSseq, "BSseq"))
... ...
@@ -175,7 +195,7 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
175 195
     outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
176 196
     colnames(outMatrix) <- sampleNames(BSseq)
177 197
     outMatrix[as.integer(rownames(out)),] <- out
178
-    return(outMatrix)
198
+    .DelayedMatrix(outMatrix)
179 199
 }
180 200
 
181 201
 
182 202
new file mode 100644
... ...
@@ -0,0 +1,91 @@
1
+# Functions/methods that would be good to have in DelayedArray
2
+
3
+.rowVars <- function(x, rows = NULL, cols = NULL, ...) {
4
+    if (is(x, "DelayedArray")) {
5
+        if (!is.null(rows)) {
6
+            x <- x[rows, ]
7
+        }
8
+        if (!is.null(cols)) {
9
+            x <- x[, cols]
10
+        }
11
+        row_vars <- rowVars(as.array(x), ...)
12
+    } else {
13
+        row_vars <- rowVars(x, rows = rows, cols = cols, ...)
14
+    }
15
+    row_vars
16
+}
17
+
18
+.rowSds <- function(x, rows = NULL, cols = NULL, ...) {
19
+    row_vars <- .rowVars(x, rows = rows, cols = cols, ...)
20
+    sqrt(row_vars)
21
+}
22
+
23
+.quantile <- function(x, ...) {
24
+    if (is(x, "DelayedArray")) {
25
+        x <- as.array(x)
26
+    }
27
+    quantile(x, ...)
28
+}
29
+
30
+.DelayedMatrix <- function(x) {
31
+    x_name <- deparse(substitute(x))
32
+    X <- try(DelayedArray(x), silent = TRUE)
33
+    if (is(X, "try-error")) {
34
+        stop("Could not construct DelayedMatrix from '", x_name, "'",
35
+             call. = FALSE)
36
+    }
37
+    if (!is(X, "DelayedMatrix")) {
38
+        stop("'", x_name, "' must be matrix-like", call. = FALSE)
39
+    }
40
+    X
41
+}
42
+
43
+.isSimpleDelayedMatrix <- function(x) {
44
+    is(x@seed, "matrix")
45
+}
46
+
47
+# NOTE: Equivalent to rowSums(x[, j, drop = FALSE]) but does it using a
48
+#       delayed operation and always returns a nrow(x) x 1 DelayedMatrix
49
+.delayed_rowSums <- function(x, j) {
50
+    Reduce(`+`, lapply(j, function(jj) x[, jj, drop = FALSE]))
51
+}
52
+
53
+# NOTE: Equivalent to colSums(x[i, , drop = FALSE]) but does it using a
54
+#       delayed operation and always returns a 1 x ncol(x) DelayedMatrix
55
+.delayed_colSums <- function(x, i) {
56
+    Reduce(`+`, lapply(i, function(ii) x[ii, , drop = FALSE]))
57
+}
58
+
59
+# MARGIN = 1: collapse using rowSums
60
+# MARGIN = 2: collapse using colSums
61
+.collapseDelayedMatrix <- function(x, sp, MARGIN, BACKEND = NULL) {
62
+    stopifnot(is(x, "DelayedMatrix"))
63
+    if (MARGIN == 1) {
64
+        if (is.null(BACKEND)) {
65
+            collapsed_x <- do.call(cbind, lapply(sp, function(j) {
66
+                rowSums(x[, j, drop = FALSE])
67
+            }))
68
+        } else {
69
+            collapsed_x <- do.call(cbind, lapply(sp, function(j) {
70
+                .delayed_rowSums(x, j)
71
+            }))
72
+            # NOTE: Need to manually add colnames when using this method
73
+            colnames(collapsed_x) <- names(sp)
74
+        }
75
+    } else if (MARGIN == 2) {
76
+        if (is.null(BACKEND)) {
77
+            collapsed_x <- do.call(rbind, lapply(sp, function(i) {
78
+                colSums(x[i, , drop = FALSE])
79
+            }))
80
+        } else {
81
+            collapsed_x <- do.call(rbind, lapply(sp, function(i) {
82
+                .delayed_colSums(x, i)
83
+            }))
84
+            # NOTE: Need to manually add rownames when using this method
85
+            rownames(collapsed_x) <- names(sp)
86
+        }
87
+    } else {
88
+        stop("'MARGIN' must be 1 or 2")
89
+    }
90
+    realize(collapsed_x, BACKEND = BACKEND)
91
+}
... ...
@@ -1,106 +1,219 @@
1
+# TODO: Ultimately, combine() is just a special case of combineList(), so
2
+#       should simplify code
1 3
 setMethod("combine", signature(x = "BSseq", y = "BSseq"), function(x, y, ...) {
2 4
     ## All of this assumes that we are place x and y "next" to each other,
3 5
     ##  ie. we are not combining the same set of samples sequenced at different times
4
-    if (class(x) != class(y))
6
+    if (class(x) != class(y)) {
5 7
         stop(paste("objects must be the same class, but are ",
6
-                   class(x), ", ", class(y), sep=""))
7
-    if(hasBeenSmoothed(x) && hasBeenSmoothed(y) && !all.equal(x@trans, y@trans))
8
+                   class(x), ", ", class(y), se = ""))
9
+    }
10
+    if (hasBeenSmoothed(x) && hasBeenSmoothed(y) &&
11
+        !all.equal(getBSseq(x, "trans"), getBSseq(y, "trans")) &&
12
+        !all.equal(getBSseq(x, "parameters"), getBSseq(y, "parameters"))) {
8 13
         stop("'x' and 'y' need to be smoothed on the same scale")
14
+    }
9 15
     pData <- combine(as(pData(x), "data.frame"), as(pData(y), "data.frame"))
10
-    if(identical(granges(x), granges(y))) {
16
+    # TODO: Could relax this by sorting x and y before doing the next steps
17
+    if (identical(granges(x), granges(y))) {
18
+        # NOTE: Don't realize() M, Cov, coef, or se.coef because we are
19
+        #       cbind()-ing, which is very efficient for DelayedArray objects.
20
+        #       E.g., cbind()-ing a bunch of HDF5Matrix objects is super fast
21
+        #       and uses basically no memory whereas realize()-ing the final
22
+        #       object as a new HDF5Matrix can take a long time.
11 23
         gr <- granges(x)
12 24
         M <- cbind(getBSseq(x, "M"), getBSseq(y, "M"))
13 25
         Cov <- cbind(getBSseq(x, "Cov"), getBSseq(y, "Cov"))
14
-        if(!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) {
26
+        if (!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) {
15 27
             coef <- NULL
16 28
             se.coef <- NULL
17 29
             trans <- NULL
30
+            parameters <- NULL
18 31
         } else {
32
+            x_trans <- getBSseq(x, "trans")
33
+            y_trans <- getBSseq(y, "trans")
34
+            if (!identical(x_trans, y_trans)) {
35
+                stop("'x' and 'y' need to have the same 'trans'")
36
+            }
37
+            trans <- x_trans
38
+            x_parameters <- getBSseq(x, "parameters")
39
+            y_parameters <- getBSseq(y, "parameters")
40
+            if (!identical(x_parameters, y_parameters)) {
41
+                stop("'x' and 'y' need to have the same 'parameters'")
42
+            }
43
+            parameters <- x_parameters
19 44
             coef <- cbind(getBSseq(x, "coef"), getBSseq(y, "coef"))
20
-            se.coef <- cbind(getBSseq(x, "se.coef"), getBSseq(y, "se.coef"))
21
-            trans <- getBSseq(x, "trans")
45
+            if (!is.null(getBSseq(x, "se.coef")) &&
46
+                !is.null(getBSseq(y, "se.coef"))) {
47
+                se.coef <- cbind(getBSseq(x, "se.coef"), getBSseq(y, "se.coef"))
48
+            } else {
49
+                if (!is.null(getBSseq(x, "se.coef")) ||
50
+                    !is.null(getBSseq(y, "se.coef"))) {
51
+                    warning("Setting 'se.coef' to NULL: Cannot combine() ",
52
+                            "these because one of 'x' or 'y' is missing ",
53
+                            "'se.coef'")
54
+                    }
55
+                se.coef <- NULL
56
+            }
22 57
         }
23 58
     } else {
24 59
         gr <- reduce(c(granges(x), granges(y)), min.gapwidth = 0L)
25 60
         mm.x <- as.matrix(findOverlaps(gr, granges(x)))
26 61
         mm.y <- as.matrix(findOverlaps(gr, granges(y)))
27
-        sampleNames <- c(sampleNames(x), sampleNames(y))
62
+        I <-  list(mm.x[, 1], mm.y[, 1])
28 63
         ## FIXME: there is no check that the two sampleNames are disjoint.
29
-        M <- Cov <- matrix(0, nrow = length(gr), ncol = length(sampleNames))
30
-        colnames(M) <- colnames(Cov) <- sampleNames
31
-        M[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "M")[mm.x[,2],]
32
-        M[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "M")[mm.y[,2],]
33
-        Cov[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "Cov")[mm.x[,2],]
34
-        Cov[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "Cov")[mm.y[,2],]
35
-        if(!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) {
36
-            coef <- NULL
37
-            se.coef <- NULL
38
-            trans <- NULL
64
+        sampleNames <- c(sampleNames(x), sampleNames(y))
65
+        M_X <- list(getBSseq(x, "M"), getBSseq(y, "M"))
66
+        if (any(vapply(M_X, .isHDF5ArrayBacked, logical(1L)))) {
67
+            M_BACKEND <- "HDF5Array"
39 68
         } else {
40
-            trans <- x@trans
41
-            coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames))
42
-            colnames(coef) <- rownames(pData)
43
-            if(hasBeenSmoothed(x))
44
-                coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "coef")[mm.x[,2],]
45
-            if(hasBeenSmoothed(y))
46
-                coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "coef")[mm.y[,2],]
47
-            if(is.null(getBSseq(x, "se.coef")) && is.null(getBSseq(x, "se.coef")))
48
-                se.coef <- NULL
49
-            else {
50
-                se.coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames))
51
-                colnames(se.coef) <- sampleNames(pData)
52
-                if(!is.null(getBSseq(x, "se.coef")))
53
-                    se.coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "se.coef")[mm.x[,2],]
54
-                if(!is.null(getBSseq(y, "se.coef")))
55
-                    se.coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "se.coef")[mm.y[,2],]
56
-            }
69
+            M_BACKEND <- NULL
57 70
         }
71
+        # TODO: Figure out if fill should be 0 or 0L based on M_X
72
+        M <- .combineListOfDelayedMatrixObjects(
73
+            X = M_X,
74
+            I = I,
75
+            nrow = length(gr),
76
+            ncol = length(sampleNames),
77
+            dimnames = list(NULL, sampleNames),
78
+            fill = 0L,
79
+            BACKEND = M_BACKEND)
80
+        Cov_X <- list(getBSseq(x, "Cov"), getBSseq(y, "Cov"))
81
+        if (any(vapply(Cov_X, .isHDF5ArrayBacked, logical(1L)))) {
82
+            Cov_BACKEND <- "HDF5Array"
83
+        } else {
84
+            Cov_BACKEND <- NULL
85
+        }
86
+        Cov <- .combineListOfDelayedMatrixObjects(
87
+            X = list(getBSseq(x, "Cov"), getBSseq(y, "Cov")),
88
+            I = I,
89
+            nrow = length(gr),
90
+            ncol = length(sampleNames),
91
+            dimnames = list(NULL, sampleNames),
92
+            fill = 0L,
93
+            BACKEND = Cov_BACKEND)
94
+        if (hasBeenSmoothed(x) || hasBeenSmoothed(y)) {
95
+            warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ",
96
+                    "these because 'x' and 'y' have different rowRanges")
97
+        }
98
+        coef <- NULL
99
+        se.coef <- NULL
100
+        trans <- NULL
101
+        parameters <- NULL
58 102
     }
59 103
     BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef,
60
-          pData = pData, trans = trans, rmZeroCov = FALSE)
104
+          pData = pData, trans = trans, parameters = parameters,
105
+          rmZeroCov = FALSE)
61 106
 })
62 107
 
63
-combineList <- function(x, ...) {
64
-    if(class(x) == "BSseq")
108
+combineList <- function(x, ..., BACKEND = NULL) {
109
+    if (class(x) == "BSseq") {
65 110
         x <- list(x, ...)
111
+    }
66 112
     stopifnot(all(sapply(x, class) == "BSseq"))
67
-    gr <- getBSseq(x[[1]], "gr")
68 113
     trans <- getBSseq(x[[1]], "trans")
69 114
     sameTrans <- sapply(x[-1], function(xx) {
70 115
         identical(trans, getBSseq(xx, "trans"))
71 116
     })
72
-    if(!all(sameTrans))
73
-        stop("all elements of '...' in combineList needs to have the same trans")
117
+    if (!all(sameTrans)) {
118
+        stop("all elements of '...' in combineList needs to have the same ",
119
+             "'trans'")
120
+    }
121
+    parameters <- getBSseq(x[[1]], "parameters")
122
+    same_parameters <- vapply(x, function(xx) {
123
+        identical(getBSseq(xx, "parameters"), parameters)
124
+    }, logical(1L))
125
+    if (!all(same_parameters)) {
126
+        stop("all elements of '...' in combineList needs to have the same ",
127
+             "'parameters'")
128
+    }
129
+    # TODO: Could relax this by sorting all elements of x before doing the
130
+    #       next steps
131
+    gr <- getBSseq(x[[1]], "gr")
74 132
     sameGr <- sapply(x[-1], function(xx) {
75 133
         identical(gr, getBSseq(xx, "gr"))
76 134
     })
77
-    if(all(sameGr)) {
135
+    if (all(sameGr)) {
136
+        # NOTE: Don't realize() M, Cov, coef, or se.coef because we are
137
+        #       cbind()-ing, which is very efficient for DelayedArray objects.
138
+        #       E.g., cbind()-ing a bunch of HDF5Matrix objects is super fast
139
+        #       and uses basically no memory whereas realize()-ing the final
140
+        #       object as a new HDF5Matrix can take a long time.
78 141
         M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M")))
79 142
         Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov")))
143
+        if (any(!vapply(x, hasBeenSmoothed, logical(1L)))) {
144
+            warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ",
145
+                    "these because BSseq objects have different rowRanges")
146
+            coef <- NULL
147
+            se.coef <- NULL
148
+            trans <- NULL
149
+        } else {
150
+            list_of_trans <- lapply(x, getBSseq, "trans")
151
+            if (!all(vapply(list_of_trans, function(trans) {
152
+                identical(trans, list_of_trans[[1]])
153
+            }, logical(1L)))) {
154
+                stop("All BSseq objects need to have the same 'trans'")
155
+            }
156
+            trans <- list_of_trans[[1]]
157
+            list_of_parameters <- lapply(x, getBSseq, "parameters")
158
+            if (!all(vapply(list_of_parameters, function(parameters) {
159
+                identical(parameters, list_of_parameters[[1]])
160
+            }, logical(1L)))) {
161
+                stop("All BSseq objects need to have the same 'parameters'")
162
+            }
163
+            parameters <- list_of_parameters[[1]]
164
+            list_of_coef <- lapply(x, function(xx) getBSseq(xx, "coef"))
165
+            coef <- do.call(cbind, list_of_coef)
166
+            list_of_se.coef <- lapply(x, function(xx) getBSseq(xx, "se.coef"))
167
+            if (all(!vapply(list_of_coef, is.null, logical(1L)))) {
168
+                se.coef <- do.call(cbind, list_of_se.coef)
169
+            } else {
170
+                if (any(vapply(list_of_coef, is.null, logical(1L)))) {
171
+                    warning("Setting 'se.coef' to NULL: Cannot combine() ",
172
+                            "these because at least of the BSseq objects ",
173
+                            "is missing 'se.coef'")
174
+                }
175
+                se.coef <- NULL
176
+            }
177
+        }
80 178
     } else {
81
-        gr <- sort(reduce(do.call(c, unname(lapply(x, granges))), min.gapwidth = 0L))
179
+        gr <- sort(reduce(do.call(c, unname(lapply(x, granges))),
180
+                          min.gapwidth = 0L))
181
+        I <- lapply(x, function(xx) {
182
+            ov <- findOverlaps(gr, xx)
183
+            queryHits(ov)
184
+        })
82 185
         sampleNames <- do.call(c, unname(lapply(x, sampleNames)))
83
-        M <- matrix(0, ncol = length(sampleNames), nrow = length(gr))
84
-        colnames(M) <- sampleNames
85
-        Cov <- M
86
-        idxes <- split(seq(along = sampleNames), rep(seq(along = x), times = sapply(x, ncol)))
87
-        for(ii in seq(along = idxes)) {
88
-            ov <- findOverlaps(gr, granges(x[[ii]]))
89
-            idx <- idxes[[ii]]
90
-            M[queryHits(ov), idx] <- getBSseq(x[[ii]], "M")[subjectHits(ov),]
91
-            Cov[queryHits(ov), idx] <- getBSseq(x[[ii]], "Cov")[subjectHits(ov),]
186
+        ## FIXME: there is no check that the sampleNames are unique/disjoint.
187
+        # TODO: Figure out if fill should be 0 or 0L based on X (getting it
188
+        #       right will avoid a copy/cast)
189
+        M <- .combineListOfDelayedMatrixObjects(
190
+            X = lapply(x, getBSseq, "M"),
191
+            I = I,
192
+            nrow = length(gr),
193
+            ncol = length(sampleNames),
194
+            dimnames = list(NULL, sampleNames),
195
+            fill = 0,
196
+            BACKEND = BACKEND)
197
+        Cov <- .combineListOfDelayedMatrixObjects(
198
+            X = lapply(x, getBSseq, "Cov"),
199
+            I = I,
200
+            nrow = length(gr),
201
+            ncol = length(sampleNames),
202
+            dimnames = list(NULL, sampleNames),
203
+            fill = 0,
204
+            BACKEND = BACKEND)
205
+        if (any(vapply(x, hasBeenSmoothed, logical(1L)))) {
206
+            warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ",
207
+                    "these because BSseq objects have different rowRanges")
92 208
         }
93
-    }
94
-    if(any(!sapply(x, hasBeenSmoothed)) || !(all(sameGr))) {
95 209
         coef <- NULL
96 210
         se.coef <- NULL
97 211
         trans <- NULL
98
-    } else {
99
-        coef <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "coef")))
100
-        se.coef <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "se.coef")))
101 212
     }
102
-    pData <- as(Reduce(combine, lapply(x, function(xx) as.data.frame(pData(xx)))), "DataFrame")
213
+    pData <- as(Reduce(combine,
214
+                       lapply(x, function(xx) as.data.frame(pData(xx)))),
215
+                "DataFrame")
103 216
     BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef,
104
-          pData = pData, trans = trans, rmZeroCov = FALSE)
217
+          pData = pData, trans = trans, parameters = parameters,
218
+          rmZeroCov = FALSE)
105 219
 }
106
-
107 220
new file mode 100644
... ...
@@ -0,0 +1,79 @@
1
+# NOTE: DelayedMatrix objects don't suport subassignment ([<- or [[<-). This
2
+#       function supports a special case of subassignment for DelayedMatrix
3
+#       objects namely:
4
+#       - Create a DelayedMatrix, value, with `nrow` and `ncol`
5
+#       - Fill in value[i, ] with `x`
6
+#       - Fill in remaining rows with `fill`
7
+.subassignRowsDelayedMatrix <- function(x, i, nrow, fill = NA_integer_,
8
+                                        BACKEND = NULL, by_row = FALSE) {
9
+    if (is.null(BACKEND)) {
10
+        if (by_row) {
11
+            warning("'by_row' ignored when 'BACKEND' is NULL")
12
+        }
13
+        z <- matrix(fill, nrow = nrow, ncol = ncol(x))
14
+        z[i, ] <- as.array(x)
15
+        return(realize(z, BACKEND = BACKEND))
16
+    } else {
17
+        # NOTE: Believe by_row == FALSE is better for the situations it's being
18
+        #       called in by bsseq but not done extensive benchmarking
19
+        if (by_row) {
20
+            # Construct the 'missing' rows of the returned matrix in memory,
21
+            # then realize(), then rbind these rows, then reorder the rows,
22
+            # then realize() the returned matrix
23
+            missing_i <- setdiff(seq_len(nrow), i)
24
+            x2 <- .DelayedMatrix(matrix(fill,
25
+                                        nrow = nrow - nrow(x),
26
+                                        ncol = ncol(x)))
27
+            x2 <- realize(x2, BACKEND = BACKEND)
28
+            z <- rbind(x, x2)
29
+            z <- z[order(c(i, missing_i)), ]
30
+            return(realize(z, BACKEND = BACKEND))
31
+        } else {
32
+            # Construct each column of the returned matrix in memory, then
33
+            # realize(), then cbind these columns
34
+            zjs <- lapply(seq_len(ncol(x)), function(j) {
35
+                zj <- matrix(fill, nrow = nrow, ncol = 1)
36
+                zj[i] <- as.array(x[, j])
37
+                realize(zj, BACKEND = BACKEND)
38
+            })
39
+            do.call(cbind, zjs)
40
+        }
41
+    }
42
+}
43
+
44
+# X is a list of DelayedMatrix objects
45
+# I is a list of row indices where I[[k]] is the vector of row indices for
46
+# X[[k]]
47
+.combineListOfDelayedMatrixObjects <- function(X, I, nrow, ncol,
48
+                                               dimnames = NULL,
49
+                                               fill = NA_integer_,
50
+                                               BACKEND = NULL) {
51
+    stopifnot(length(X) == length(I))
52
+    is_delayed_matrix <- vapply(X, function(XX) is(XX, "DelayedMatrix"),
53
+                                logical(1L))
54
+    stopifnot(all(is_delayed_matrix))
55
+    is_simple_delayed_matrix <- vapply(X, .isSimpleDelayedMatrix, logical(1L))
56
+    # NOTE: If all DelayedMatrix objects are 'simple' (i.e. the @seed of each
57
+    #       is a base::matrix), then can use this much faster and more
58
+    #       memory-efficient method to combine the DelayedMatrix objects into
59
+    #       a single DelayedMatrix
60
+    if (all(is_simple_delayed_matrix) && is.null(BACKEND)) {
61
+        X <- endoapply(X, as.array)
62
+        z <- matrix(fill, nrow, ncol, dimnames = dimnames)
63
+        j0 <- 0
64
+        for (j in seq_along(X)) {
65
+            i <- I[[j]]
66
+            jj <- j0 + seq_len(ncol(X[[j]]))
67
+            z[i, jj] <- X[[j]]
68
+            j0 <- j0 + ncol(X[[j]])
69
+        }
70
+        return(realize(z, BACKEND = BACKEND))
71
+    } else {
72
+        z <- mapply(.subassignRowsDelayedMatrix, x = X, i = I,
73
+                    MoreArgs = list(nrow = nrow, fill = fill,
74
+                                    BACKEND = BACKEND))
75
+        z <- do.call("cbind", z)
76
+        dimnames(z) <- dimnames
77
+        z
78
+    }
79
+}
... ...
@@ -1,20 +1,21 @@
1
+# NOTE: Realises in memory a matrix with nrow = length(bstat) and ncol = 1
1 2
 dmrFinder <- function(bstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
2 3
                       maxGap = 300, stat = "tstat.corrected", verbose = TRUE) {
3 4
     if(is(bstat, "BSseqTstat")) {
4 5
         if(! stat %in% colnames(bstat@stats))
5 6
             stop("'stat' needs to be a column of 'bstat@stats'")
6
-        dmrStat <- bstat@stats[, stat]
7
+        dmrStat <- as.array(bstat@stats[, stat, drop = FALSE])
7 8
     }
8 9
     if(is(bstat, "BSseqStat")) {
9
-        dmrStat <- getStats(bstat, what = "stat")
10
+        dmrStat <- as.array(getStats(bstat, what = "stat"))
10 11
     }
11 12
     subverbose <- max(as.integer(verbose) - 1L, 0L)
12 13
     if(is.null(cutoff))
13
-        cutoff <- quantile(dmrStat, qcutoff)
14
+        cutoff <- .quantile(dmrStat, qcutoff)
14 15
     if(length(cutoff) == 1)
15 16
         cutoff <- c(-cutoff, cutoff)
16 17
     direction <- as.integer(dmrStat >= cutoff[2])
17
-    direction[dmrStat <= cutoff[1]] <- -1L
18
+    direction[as.array(dmrStat <= cutoff[1])] <- -1L
18 19
     direction[is.na(direction)] <- 0L
19 20
     chrs <- as.character(seqnames(bstat))
20 21
     positions <- start(bstat)
... ...
@@ -5,6 +5,16 @@ getStats <- function(bstat, regions = NULL, ...) {
5 5
     getStats_BSseqStat(bstat, regions = regions, ...)
6 6
 }
7 7
 
8
+# NOTE: Realises in memory a matrix with nrow = length(hits) and ncol = 1
9
+.getRegionStats <- function(stat, hits, na.rm = FALSE) {
10
+    stat_by_region <- split(as.array(stat[queryHits(hits), ]),
11
+                            subjectHits(hits))
12
+    data.frame(areaStat = vapply(stat_by_region, sum, numeric(1),
13
+                                 na.rm = na.rm, USE.NAMES = FALSE),
14
+               maxStat = vapply(stat_by_region, max, numeric(1),
15
+                                na.rm = na.rm, USE.NAMES = FALSE))
16
+}
17
+
8 18
 getStats_BSseqStat <- function(BSseqStat, regions = NULL, what = NULL) {
9 19
     stopifnot(is(BSseqStat, "BSseqStat"))
10 20
     if(!is.null(what)) {
... ...
@@ -16,24 +26,37 @@ getStats_BSseqStat <- function(BSseqStat, regions = NULL, what = NULL) {
16 26
     ## Now we have regions and no what
17 27
     if(class(regions) == "data.frame")
18 28
         regions <- data.frame2GRanges(regions)
19
-    ov <- findOverlaps(BSseqStat, regions)
20
-    ov.sp <- split(queryHits(ov), subjectHits(ov))
21
-    ## We need areaStat and maxStat
22
-    ## Could need averageMeth in each group?
23
-    ## Need to have a specific design for that
24
-    ## Could at least get contrast coefficient
25
-    getRegionStats <- function(idx) {
26
-        areaStat <- sum(BSseqStat@stats$stat[idx], na.rm = TRUE)
27
-        maxStat <- max(BSseqStat@stats$stat[idx], na.rm = TRUE)
28
-        c(areaStat, maxStat)
29
-    }
30
-    regionStats <- matrix(NA, ncol = 2, nrow = length(regions))
31
-    colnames(regionStats) <- c("areaStat", "maxStat")
32
-    tmp <- lapply(ov.sp, getRegionStats)
33
-    regionStats[as.integer(names(tmp)),] <- do.call(rbind, tmp)
29
+    hits <- findOverlaps(BSseqStat, regions)
30
+    regionStats <- .getRegionStats(stat = BSseqStat@stats$stat,
31
+                                   hits = hits)
34 32
     regionStats
35 33
 }
36 34
 
35
+# NOTE: Realises in memory a matrix with nrow = length(hits) and ncol = 3
36
+.getRegionStats_ttest <- function(stats, hits, na.rm = FALSE) {
37
+    stat_names <- c("group1.means", "group2.means", "tstat.sd")
38
+    stats <- as.array(stats[queryHits(hits), stat_names])
39
+    stats_by_region <- lapply(seq_along(stat_names), function(j) {
40
+        split(stats[, j], subjectHits(hits))
41
+    })
42
+    names(stats_by_region) <- stat_names
43
+    meanDiff <- mapply(function(g1, g2, na.rm) {
44
+        mean(g1 - g2, na.rm = na.rm)
45
+    }, g1 = stats_by_region[["group1.means"]],
46
+    g2 = stats_by_region[["group2.means"]],
47
+    MoreArgs = list(na.rm = na.rm), SIMPLIFY = TRUE, USE.NAMES = FALSE)
48
+    group1.mean <- vapply(stats_by_region[["group1.means"]], mean, numeric(1),
49
+                          na.rm = na.rm, USE.NAMES = FALSE)
50
+    group2.mean <- vapply(stats_by_region[["group2.means"]], mean, numeric(1),
51
+                          na.rm = na.rm, USE.NAMES = FALSE)
52
+    tstat.sd <- vapply(stats_by_region[["tstat.sd"]], mean, numeric(1),
53
+                       na.rm = na.rm, USE.NAMES = FALSE)
54
+    data.frame(meanDiff = meanDiff,
55
+               group1.mean = group1.mean,
56
+               group2.mean = group2.mean,
57
+               tstat.sd = tstat.sd)
58
+}
59
+
37 60
 getStats_BSseqTstat <- function(BSseqTstat, regions = NULL, stat = "tstat.corrected") {
38 61
     stopifnot(is(BSseqTstat, "BSseqTstat"))
39 62
     if(is.null(regions))
... ...
@@ -43,33 +66,13 @@ getStats_BSseqTstat <- function(BSseqTstat, regions = NULL, stat = "tstat.correc
43 66
     stopifnot(stat %in% colnames(BSseqTstat@stats))
44 67
     stopifnot(length(stat) == 1)
45 68
     stopifnot(is(regions, "GenomicRanges"))
46
-    ov <- findOverlaps(BSseqTstat, regions)
47
-    ov.sp <- split(queryHits(ov), subjectHits(ov))
48
-    getRegionStats <- function(idx) {
49
-        mat <- BSseqTstat@stats[idx,, drop=FALSE]
50
-        areaStat <- sum(mat[, stat])
51
-        maxStat <- max(mat[, stat])
52
-        c(areaStat, maxStat)
53
-    }
54
-    stats <- matrix(NA, ncol = 2, nrow = length(regions))
55
-    colnames(stats) <- c("areaStat", "maxStat")
56
-    tmp <- lapply(ov.sp, getRegionStats)
57
-    stats[as.integer(names(tmp)),] <- do.call(rbind, tmp)
58
-    out <- as.data.frame(stats)
69
+    hits <- findOverlaps(BSseqTstat, regions)
70
+    out <- .getRegionStats(stat = BSseqTstat@stats[, stat, drop = FALSE],
71
+                           hits = hits)
59 72
     if(! stat %in% c("tstat.corrected", "tstat"))
60 73
         return(out)
61
-    getRegionStats_ttest <- function(idx) {
62
-        mat <- BSseqTstat@stats[idx,, drop=FALSE]
63
-        group1.mean <- mean(mat[, "group1.means"])
64
-        group2.mean <- mean(mat[, "group2.means"])
65
-        meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"])
66
-        tstat.sd <- mean(mat[, "tstat.sd"])
67
-        c(meanDiff, group1.mean, group2.mean, tstat.sd)
68
-    }
69
-    stats_ttest<- matrix(NA, ncol = 4, nrow = length(regions))
70
-    colnames(stats_ttest) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd")
71
-    tmp <- lapply(ov.sp, getRegionStats_ttest)
72
-    stats_ttest[as.integer(names(tmp)),] <- do.call(rbind, tmp)
73
-    out <- cbind(out, as.data.frame(stats_ttest))
74
+    stats_ttest <- .getRegionStats_ttest(stats = BSseqTstat@stats,
75
+                                         hits = hits)
76
+    out <- cbind(out, stats_ttest)
74 77
     out