Browse code

Work in progress: refactoring bsseq

- BSseq objects can once again use ordinary matrix objects as assays.
- Reimplement `BSmooth()` more-or-less from scratch:
- Switch from 'parallel' to 'BiocParallel' for parallelization. This brings some notable improvements:
- Smoothed results can now be written directly to an on-disk realization backend by the worker. This dramatically reduces memory usage compared to previous versions of 'bsseq' that required all results be retained in-memory.
- Parallelization is now supported on Windows through the use of a 'SnowParam' object as the value of `BPPARAM`.
- Improved error handling makes it possible to gracefully resume `BSmooth()` jobs that encountered errors by re-doing only the necessary tasks.
- Detailed and extensive job logging facilities.
- Fix bug in `BSmooth()` with the `maxGap` parameter.
- Re-factor BSseq() constructor and add fast, internal .BSseq() constructor.
- Re-factor collapseBSseq() and combine(). Should be much more performant.
- Use beachmat to implement fast validity checking of 'M' and 'Cov' matrices.
- Resave BS.chr22 (supplied data) using integer for storage mode of assays to reduce size.
- Switch from RUnit to testthat. testthat has better integration with code coverage tools that help when refactoring.

Peter Hickey authored on 28/05/2018 23:42:18
Showing38 changed files

... ...
@@ -12,8 +12,7 @@ Depends:
12 12
     methods,
13 13
     BiocGenerics,
14 14
     GenomicRanges (>= 1.29.14),
15
-    SummarizedExperiment (>= 1.9.18),
16
-    parallel
15
+    SummarizedExperiment (>= 1.9.18)
17 16
 Imports:
18 17
     IRanges (>= 2.11.16),
19 18
     GenomeInfoDb,
... ...
@@ -30,17 +29,24 @@ Imports:
30 29
     permute,
31 30
     limma,
32 31
     DelayedArray (>= 0.5.34),
33
-    HDF5Array
32
+    Rcpp,
33
+    BiocParallel,
34
+    readr
34 35
 Suggests:
35
-    RUnit,
36
+    testthat,
36 37
     bsseqData,
37 38
     BiocStyle,
38 39
     rmarkdown,
39 40
     knitr,
41
+    beachmat,
42
+    Matrix,
43
+    HDF5Array,
44
+    doParallel,
45
+    testthat
40 46
 Collate:
41 47
     utils.R
42 48
     hasGRanges.R
43
-    BSseq_class.R
49
+    BSseq-class.R
44 50
     BSseqTstat_class.R
45 51
     BSseq_utils.R
46 52
     combine.R
... ...
@@ -57,8 +63,8 @@ Collate:
57 63
     BSseqStat_class.R
58 64
     getStats.R
59 65
     hdf5_utils.R
60
-    combine_utils.R
61 66
     DelayedArray_utils.R
67
+    collapseBSseq.R
62 68
 License: Artistic-2.0
63 69
 VignetteBuilder: knitr
64 70
 URL: https://github.com/kasperdanielhansen/bsseq
... ...
@@ -66,3 +72,6 @@ LazyData: yes
66 72
 LazyDataCompression: xz
67 73
 BugReports: https://github.com/kasperdanielhansen/bsseq/issues
68 74
 biocViews: DNAMethylation
75
+SystemRequirements: C++11
76
+LinkingTo: Rcpp, Rhdf5lib, beachmat
77
+NeedsCompilation: yes
... ...
@@ -17,7 +17,7 @@ importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
17 17
 import(parallel)
18 18
 importFrom(locfit, "locfit", "lp")
19 19
 importFrom(DelayedMatrixStats, "rowSds", "rowVars", "colMeans2", "colSums2",
20
-           "rowSums2", "rowMeans2")
20
+           "rowSums2", "rowMeans2", "rowAlls")
21 21
 import(IRanges)
22 22
 import(GenomicRanges)
23 23
 import(SummarizedExperiment)
... ...
@@ -35,10 +35,9 @@ importFrom(data.table, "fread", "setnames")
35 35
 importFrom(R.utils, "isGzipped", "gunzip")
36 36
 import(limma)
37 37
 importFrom(permute, "shuffleSet", "how")
38
-importClassesFrom(DelayedArray, "DelayedArray", "DelayedMatrix")
39
-importFrom(DelayedArray, "DelayedArray", "plogis", "pmin2", "pmax2", "rowMaxs",
40
-           "rowMins")
41
-importClassesFrom(HDF5Array, "HDF5Array")
38
+import(DelayedArray)
39
+import(BiocParallel)
40
+importFrom(Rcpp, "sourceCpp")
42 41
 
43 42
 ##
44 43
 ## Exporting
... ...
@@ -86,3 +85,5 @@ S3method("plot", "BSseqTstat")
86 85
 
87 86
 exportMethods("assays", "assayNames")
88 87
 
88
+# C++ code registration
89
+useDynLib(bsseq, .registration = TRUE, .fixes = "cxx_")
... ...
@@ -1,3 +1,15 @@
1
+# Internal functions -----------------------------------------------------------
2
+
3
+.rowTickmarks <- function(hasGRanges, maxGap) {
4
+    gr <- granges(hasGRanges)
5
+    # NOTE: This relies on 'gr' being sorted, so really want to be sure of this!
6
+    stopifnot(!is.unsorted(gr))
7
+    clusters <- reduce(gr, min.gapwidth = maxGap, with.revmap = TRUE)
8
+    cumsum(lengths(mcols(clusters)[["revmap"]]))
9
+}
10
+
11
+# TODO: Remove other uses of this function, if possible.
12
+# TODO: Rename to .makeClusters() since it's an internal function
1 13
 makeClusters <- function(hasGRanges, maxGap = 10^8) {
2 14
     chrOrder <- as.character(runValue(seqnames(hasGRanges)))
3 15
     if(anyDuplicated(chrOrder))
... ...
@@ -18,136 +30,320 @@ makeClusters <- function(hasGRanges, maxGap = 10^8) {
18 30
     clusterIdx
19 31
 }
20 32
 
33
+.BSmooth <- function(b, M, Cov, pos, coef_sink, se.coef_sink,
34
+                     coef_sink_lock, se.coef_sink_lock, grid, pos_grid,
35
+                     ns, h, keep.se) {
36
+    # Load packages on worker (required for SnowParam) -------------------------
37
+
38
+    suppressPackageStartupMessages({
39
+        library(BiocParallel)
40
+    })
41
+    suppressPackageStartupMessages({
42
+        library(locfit)
43
+    })
44
+    suppressPackageStartupMessages({
45
+        library(DelayedArray)
46
+    })
47
+
48
+    # Construct inputs for smoothing -------------------------------------------
49
+
50
+    # NOTE: 'bb' indexes over elements of pos_grid by cycling `ncol(M)` times
51
+    #       over 1, ..., nrow(pos_grid).
52
+    bb <- (b - 1L) %% nrow(pos_grid) + 1L
53
+    # NOTE: unname() is necessary because M and Cov may carry colnames
54
+    sdata <- data.frame(
55
+        pos = DelayedArray:::extract_block(pos, pos_grid[[bb]]),
56
+        M = unname(DelayedArray:::extract_block(M, grid[[b]])),
57
+        Cov = unname(DelayedArray:::extract_block(Cov, grid[[b]])))
58
+    # Ensure 0 < M < Cov to avoid boundary issues (only relevant at loci with
59
+    # non-zero coverage, so doesn't matter what M is for these loci).
60
+    sdata[["M"]] <- pmin(pmax(sdata[["M"]], 0.01), pmax(sdata[["Cov"]] - 0.01))
61
+    n_loci <- nrow(sdata)
62
+    n_loci_with_non_zero_coverage <- sum(sdata[["Cov"]] > 0)
21 63
 
22
-BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"),
23
-                    mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) {
24
-    smooth <- function(idxes, sampleIdx) {
25
-        ## Assuming that idxes is a set of indexes into the BSseq object
26
-        ## sampleIdx is a single character
27
-        this_sample_chr <- c(sampleNames(BSseq)[sampleIdx],
28
-                             as.character(seqnames(BSseq)[idxes[1]]))
29
-        if(verbose >= 2)
30
-            cat(sprintf("[BSmooth]   smoothing start: sample:%s, chr:%s, nLoci:%s\n",
31
-                        this_sample_chr[1], this_sample_chr[2], length(idxes)))
32
-        Cov <- unname(as.array(
33
-            getCoverage(BSseq, type = "Cov")[idxes, sampleIdx]))
34
-        M <- unname(as.array(
35
-            getCoverage(BSseq, type = "M")[idxes, sampleIdx]))
36
-        pos <- start(BSseq)[idxes]
37
-        stopifnot(all(diff(pos) > 0))
38
-        wh <- which(Cov != 0)
39
-        nn <- ns / length(wh)
40
-        if(length(wh) <= ns) {
41
-            if(keep.se)
42
-                se.coef <- rep(NA_real_, length(Cov))
43
-            else
44
-                se.coef <- NULL
45
-            return(list(coef = rep(NA_real_, length(Cov)),
46
-                        se.coef = se.coef,
47
-                        trans = NULL, h = h, nn = nn))
64
+    # Fit local binomial regression model --------------------------------------
65
+
66
+    # NOTE: Require (ns + 1) loci with non-zero coverage to run smooth.
67
+    if (n_loci_with_non_zero_coverage <= ns) {
68
+        coef <- rep(NA_real_, n_loci)
69
+        if (keep.se) {
70
+            se.coef <- rep(NA_real_, n_loci)
71
+        } else {
72
+            se.coef <- NULL
48 73
         }
49
-        sdata <- data.frame(pos = pos[wh],
50
-                            M = pmin(pmax(M[wh], 0.01),
51
-                                     Cov[wh] - 0.01),
52
-                            Cov = Cov[wh])
53
-        fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata,
54
-                      weights = Cov, family = "binomial", maxk = 10000)
55
-        pp <- preplot(fit, where = "data", band = "local",
56
-                      newdata = data.frame(pos = pos))
57
-        if(keep.se) {
74
+    } else {
75
+        # Set 'nearest neighbour' smoothing parameter.
76
+        nn <- ns / n_loci_with_non_zero_coverage
77
+        # Fit model only using loci with non-zero coverage.
78
+        fit <- locfit(
79
+            M ~ lp(pos, nn = nn, h = h),
80
+            data = sdata,
81
+            weights = Cov,
82
+            family = "binomial",
83
+            subset = Cov > 0,
84
+            maxk = 10000)
85
+        # Evaluate smooth at all loci (regardless of coverage).
86
+        pp <- preplot(
87
+            object = fit,
88
+            newdata = sdata["pos"],
89
+            band = "local")
90
+        coef <- pp$fit
91
+        if (keep.se) {
58 92
             se.coef <- pp$se.fit
59 93
         } else {
60 94
             se.coef <- NULL
61 95
         }
62
-        if(verbose >= 2)
63
-            cat(sprintf("[BSmooth]   smoothing end: sample:%s, chr:%s, nLoci:%s, nCoveredLoci:%s\n",
64
-                        this_sample_chr[1], this_sample_chr[2], length(idxes), nrow(sdata)))
65
-        return(list(coef = pp$fit, se.coef = se.coef,
66
-                    trans = pp$trans, h = h, nn = nn))
67 96
     }
68
-    stopifnot(class(BSseq) == "BSseq")
69
-    parallelBy <- match.arg(parallelBy)
70
-    if(verbose) cat("[BSmooth] preprocessing ... ")
71
-    ptime1 <- proc.time()
72
-    clusterIdx <- makeClusters(BSseq, maxGap = maxGap)
73
-    ptime2 <- proc.time()
74
-    stime <- (ptime2 - ptime1)[3]
75
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
76
-
77
-    sampleNames <- sampleNames(BSseq)
78
-    names(sampleNames) <- sampleNames
79
-
80
-    ptime.outer1 <- proc.time()
81
-    switch(parallelBy, "sample" = {
82
-        if(verbose) cat(sprintf("[BSmooth] smoothing by 'sample' (mc.cores = %d, mc.preschedule = %s)\n",
83
-                                mc.cores, mc.preschedule))
84
-        out <- mclapply(seq(along = sampleNames), function(sIdx) {
85
-            ptime1 <- proc.time()
86
-            tmp <- lapply(clusterIdx, function(jj) {
87
-                try(smooth(idxes = jj, sampleIdx = sIdx))
88
-            })
89
-            coef <- do.call(c, lapply(tmp, function(xx) xx$coef))
90
-            se.coef <- do.call(c, lapply(tmp, function(xx) xx$se.coef))
91
-            ptime2 <- proc.time()
92
-            stime <- (ptime2 - ptime1)[3]
93
-            if(verbose) {
94
-                cat(sprintf("[BSmooth] sample %s (out of %d), done in %.1f sec\n",
95
-                            sampleNames[sIdx], length(sampleNames), stime))
97
+
98
+    # Return the results of the smooth or write them to the RealizationSink ----
99
+
100
+    if (is.null(coef_sink)) {
101
+        return(list(coef = coef, se.coef = se.coef))
102
+    }
103
+    # Write to coef_sink and se.coef_sink while respecting the IPC lock(s).
104
+    ipclock(coef_sink_lock)
105
+    write_block_to_sink(as.matrix(coef), coef_sink, grid[[b]])
106
+    ipcunlock(coef_sink_lock)
107
+    if (keep.se) {
108
+        ipclock(se.coef_sink_lock)
109
+        write_block_to_sink(as.matrix(se.coef), se.coef_sink, grid[[b]])
110
+        ipcunlock(se.coef_sink_lock)
111
+    }
112
+}
113
+
114
+# Exported functions -----------------------------------------------------------
115
+
116
+# TODO: Make 'mc.cores', 'mc.preschedule', and 'verbose' defunct one release
117
+#       cycle with them deprecated.
118
+# TODO: Consider having BSmooth() create a 'smoothed' assay in addition to or
119
+#       instead of the 'coef' and 'se.coef' assays.
120
+# TODO: To benefit from error recovery requires that bpStopOnError(BPPARAM) is
121
+#       TRUE but the default is FALSE. How to help the user? Probably don't
122
+#       want to override the user-specified value.
123
+BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8,
124
+                    parallelBy = c("sample", "chromosome"),
125
+                    mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE,
126
+                    verbose = TRUE, BPREDO = list(), BPPARAM = bpparam()) {
127
+    # Argument checks-----------------------------------------------------------
128
+
129
+    # Check if this is a re-do.
130
+    # NOTE: Under the assumptions of a re-do (i.e. BSmooth() is being re-run
131
+    #       with the same arguments), we skip straight ahead to re-running
132
+    #       failed smoothing tasks with no further argument checks.
133
+    if (length(BPREDO)) {
134
+        if (!is.list(BPREDO) ||
135
+            identical(names(BPREDO), c("smooth", "coef_sink", "se.coef_sink",
136
+                                       "realization_backend"))) {
137
+            stop("'BPREDO' should be a list with elements 'smooth', ",
138
+                 "'coef_sink', 'se.coef_sink', and 'realization_backend'.")
139
+        }
140
+        is_redo <- TRUE
141
+        coef_sink <- BPREDO[["coef_sink"]]
142
+        se.coef_sink <- BPREDO[["se.coef_sink"]]
143
+        realization_backend <- BPREDO[["realization_sink"]]
144
+        BPREDO <- BPREDO[["smooth"]]
145
+    } else {
146
+        is_redo <- FALSE
147
+        # Check validity of 'BSseq' argument
148
+        if (!is(BSseq, "BSseq")) {
149
+            stop("'BSseq' must be a BSseq object.")
150
+        }
151
+        if (!isTRUE(all(width(BSseq) == 1L))) {
152
+            stop("All loci in 'BSseq' must have width == 1.")
153
+        }
154
+        if (is.unsorted(BSseq)) {
155
+            stop("'BSseq' must be sorted before smoothing. Use 'sort(BSseq)'.")
156
+        }
157
+        # Check for deprecated arguments and issue warning(s) if found.
158
+        if (!missing(parallelBy)) {
159
+            warning(
160
+                "'parallelBy' is deprecated.\n",
161
+                "See help(\"BSmooth\") for details.",
162
+                call. = FALSE,
163
+                immediate. = TRUE)
164
+        }
165
+        if (!missing(mc.preschedule)) {
166
+            warning(
167
+                "'mc.preschedule' is deprecated.\n",
168
+                "See help(\"BSmooth\") for details.",
169
+                call. = FALSE,
170
+                immediate. = TRUE)
171
+        }
172
+        if (!missing(mc.cores)) {
173
+            warning(
174
+                "'mc.cores' is deprecated.\n",
175
+                "See help(\"BSmooth\").",
176
+                call. = FALSE,
177
+                immediate. = TRUE)
178
+            BPPARAM <- MulticoreParam(workers = mc.cores)
179
+        }
180
+        if (!missing(verbose)) {
181
+            warning(
182
+                "'verbose' is deprecated.\n",
183
+                "See help(\"BSmooth\") for details.",
184
+                call. = FALSE,
185
+                immediate. = TRUE)
186
+            if (verbose) bpprogressbar(BPPARAM) <- TRUE
187
+        }
188
+        # Check compatability of realization backend with backend(s) of BSseq
189
+        # object.
190
+        realization_backend <- getRealizationBackend()
191
+        BSseq_backends <- .getBSseqBackends(BSseq)
192
+        if (.areBackendsInMemory(realization_backend) &&
193
+            !.areBackendsInMemory(BSseq_backends)) {
194
+            stop("Using an in-memory backend for a disk-backed BSseq object ",
195
+                 "is not supported.\n",
196
+                 "See help(\"BSmooth\") for details.",
197
+                 call. = FALSE)
198
+        }
199
+        # Check compatability of 'BPPARAM' with the realization backend.
200
+        if (!.areBackendsInMemory(realization_backend)) {
201
+            if (!.isSingleMachineBackend(BPPARAM)) {
202
+                stop("The parallelisation strategy must use a single machine ",
203
+                     "when using an on-disk realization backend.\n",
204
+                     "See help(\"BSmooth\") for details.",
205
+                     call. = FALSE)
206
+            }
207
+        } else {
208
+            if (!is.null(realization_backend)) {
209
+                # NOTE: Currently do not support any in-memory realization
210
+                #       backends. If the realization backend is NULL then an
211
+                #       ordinary matrix is returned rather than a matrix-backed
212
+                #       DelayedMatrix.
213
+                stop("The '", realization_backend, "' realization backend is ",
214
+                     "not supported.\n",
215
+                     "See help(\"BSmooth\") for details.",
216
+                     call. = FALSE)
217
+            }
218
+        }
219
+    }
220
+
221
+    # Smoothing ----------------------------------------------------------------
222
+
223
+    # Extract pieces of BSseq object required for smoothing
224
+    M <- getCoverage(BSseq, type = "M", withDimnames = FALSE)
225
+    Cov <- getCoverage(BSseq, type = "Cov", withDimnames = FALSE)
226
+    pos <- matrix(start(BSseq), ncol = 1)
227
+    # Set up ArrayGrid so that each block contains data for a single sample and
228
+    # single cluster of loci.
229
+    row_tickmarks <- .rowTickmarks(BSseq, maxGap)
230
+    col_tickmarks <- seq_len(ncol(M))
231
+    grid <- ArbitraryArrayGrid(list(row_tickmarks, col_tickmarks))
232
+    # Set up "parallel" ArrayGrid over pos
233
+    pos_grid <- ArbitraryArrayGrid(list(row_tickmarks, 1L))
234
+    # Construct RealizationSink objects (as required)
235
+    if (!is_redo) {
236
+        if (is.null(realization_backend)) {
237
+            coef_sink <- NULL
238
+            coef_sink_lock <- NULL
239
+            se.coef_sink <- NULL
240
+            se.coef_sink_lock <- NULL
241
+        } else {
242
+            coef_sink <- DelayedArray:::RealizationSink(dim(M), type = "double")
243
+            on.exit(close(coef_sink), add = TRUE)
244
+            coef_sink_lock <- ipcid()
245
+            on.exit(ipcremove(coef_sink_lock), add = TRUE)
246
+            if (keep.se) {
247
+                se.coef_sink <- DelayedArray:::RealizationSink(dim(M),
248
+                                                               type = "double")
249
+                on.exit(close(se.coef_sink), add = TRUE)
250
+                se.coef_sink_lock <- ipcid()
251
+                on.exit(ipcremove(se.coef_sink_lock), add = TRUE)
252
+            } else {
253
+                se.coef_sink <- NULL
254
+                se.coef_sink_lock <- NULL
96 255
             }
97
-            return(list(coef = coef, se.coef = se.coef))
98
-        }, mc.preschedule = mc.preschedule, mc.cores = mc.cores)
99
-        if(any(sapply(out, is, class2 = "try-error")))
100
-            stop("BSmooth encountered smoothing errors")
101
-        coef <- do.call(cbind, lapply(out, function(xx) xx$coef))
102
-        se.coef <- do.call(cbind, lapply(out, function(xx) xx$se.coef))
103
-    }, "chromosome" = {
104
-        if(verbose) cat(sprintf("[BSmooth] smoothing by 'chromosome' (mc.cores = %d, mc.preschedule = %s)\n",
105
-                                mc.cores, mc.preschedule))
106
-        out <- mclapply(1:length(clusterIdx), function(ii) {
107
-            ptime1 <- proc.time()
108
-            tmp <- lapply(seq(along = sampleNames), function(sIdx) {
109
-                smooth(idxes = clusterIdx[[ii]], sampleIdx = sIdx)
110
-            })
111
-            coef <- do.call(cbind, lapply(tmp, function(xx) xx$coef))
112
-            se.coef <- do.call(cbind, lapply(tmp, function(xx) xx$se.coef))
113
-            ptime2 <- proc.time()
114
-            stime <- (ptime2 - ptime1)[3]
115
-            if(verbose)
116
-                cat(sprintf("[BSmooth] chr idx %d (out of %d), done in %.1f sec\n",
117
-                            ii, length(clusterIdx), stime))
118
-            return(list(coef = coef, se.coef = se.coef))
119
-        }, mc.preschedule = mc.preschedule, mc.cores = mc.cores)
120
-        if(any(sapply(out, is, class2 = "try-error")))
121
-            stop("BSmooth encountered smoothing errors")
122
-        coef <- do.call(rbind, lapply(out, function(xx) xx$coef))
123
-        se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef))
124
-    })
125
-    coef <- .DelayedMatrix(coef)
126
-    if (!is.null(se.coef)) {
127
-        se.coef <- .DelayedMatrix(se.coef)
256
+        }
257
+    }
258
+
259
+    # Set number of tasks to ensure the progress bar gives frequent updates.
260
+    # NOTE: The progress bar increments once per task
261
+    #       (https://github.com/Bioconductor/BiocParallel/issues/54).
262
+    #       Although it is somewhat of a bad idea to overrides a user-specified
263
+    #       bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
264
+    #       performance in this instance, and so we opt for a useful progress
265
+    #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
266
+    #       bptasks<-() method.
267
+    if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
268
+        bptasks(BPPARAM) <- length(grid)
269
+    }
270
+    smooth <- bptry(bplapply(
271
+        X = seq_along(grid),
272
+        FUN = .BSmooth,
273
+        M = M,
274
+        Cov = Cov,
275
+        pos = pos,
276
+        coef_sink = coef_sink,
277
+        se.coef_sink = se.coef_sink,
278
+        coef_sink_lock = coef_sink_lock,
279
+        se.coef_sink_lock = se.coef_sink_lock,
280
+        grid = grid,
281
+        pos_grid = pos_grid,
282
+        ns = ns,
283
+        h = h,
284
+        keep.se = keep.se,
285
+        BPREDO = BPREDO,
286
+        BPPARAM = BPPARAM))
287
+    if (!all(bpok(smooth))) {
288
+        # TODO: Feels like stop() rather than warning() should be used, but
289
+        #       stop() doesn't allow for the return of partial results;
290
+        #       see https://support.bioconductor.org/p/109374/
291
+        warning("BSmooth() encountered errors: ",
292
+                sum(!bpok(smooth)), " of ", length(smooth),
293
+                " smoothing tasks failed.\n",
294
+                "BSmooth() has returned partial results, including errors, ",
295
+                "for debugging purposes.\n",
296
+                "It may be possible to re-run just these failed smoothing ",
297
+                "tasks.\nSee help(\"BSmooth\")",
298
+                call. = FALSE)
299
+        # NOTE: Return intermediate results as well as all derived variables.
300
+        return(list(smooth = smooth,
301
+                    coef_sink = coef_sink,
302
+                    se.coef_sink = se.coef_sink,
303
+                    realization_backend = realization_backend))
128 304
     }
129
-    ptime.outer2 <- proc.time()
130
-    stime.outer <- (ptime.outer2 - ptime.outer1)[3]
131
-    if(verbose)
132
-        cat(sprintf("[BSmooth] smoothing done in %.1f sec\n", stime.outer))
133
-
134
-    rownames(coef) <- NULL
135
-    colnames(coef) <- sampleNames(BSseq)
136
-    if(!is.null(se.coef)) {
137
-        rownames(se.coef) <- NULL
138
-        colnames(se.coef) <- sampleNames(BSseq)
305
+    # Construct coef and se.coef from results of smooth().
306
+    if (is.null(realization_backend)) {
307
+        # Returning matrix objects.
308
+        coef <- do.call(c, lapply(smooth, "[[", "coef"))
309
+        attr(coef, "dim") <- dim(M)
310
+        if (keep.se) {
311
+            se.coef <- do.call(c, lapply(smooth, "[[", "se.coef"))
312
+            attr(se.coef, "dim") <- dim(M)
313
+        } else {
314
+            se.coef <- NULL
315
+        }
316
+    } else {
317
+        # Returning DelayedMatrix objects.
318
+        coef <- as(coef_sink, "DelayedArray")
319
+        if (keep.se) {
320
+            se.coef <- as(se.coef_sink, "DelayedArray")
321
+        } else {
322
+            se.coef <- NULL
323
+        }
139 324
     }
140 325
 
141
-    if(!is.null(coef))
142
-        assay(BSseq, "coef") <- coef
143
-    if(!is.null(se.coef))
144
-        assay(BSseq, "se.coef") <- se.coef
145
-    BSseq@trans <- plogis
326
+    # Construct BSseq object ---------------------------------------------------
146 327
 
147
-    parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap),
148
-                       ns = ns, h = h, maxGap = maxGap)
149
-    BSseq@parameters <- parameters
150
-    BSseq
328
+    assays <- c(assays(BSseq, withDimnames = FALSE), SimpleList(coef = coef))
329
+    if (keep.se) assays <- c(assays, SimpleList(se.coef = se.coef))
330
+    BiocGenerics:::replaceSlots(
331
+        object = BSseq,
332
+        assays = Assays(assays),
333
+        trans = plogis,
334
+        parameters = list(
335
+            smoothText = sprintf(
336
+                "BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap),
337
+            ns = ns,
338
+            h = h,
339
+            maxGap = maxGap),
340
+        check = FALSE)
151 341
 }
152 342
 
343
+# TODOs ------------------------------------------------------------------------
153 344
 
345
+# TODO: Use the logging facilities of BiocParallel. This is a longterm goal.
346
+#       For example, we could set custom messages within .BSmooth() using the
347
+#       futile.logger syntax; see the BiocParalell vignette 'Errors, Logs and
348
+#       Debugging in BiocParallel'.
349
+# TODO: Remove NOTEs that are really documentation issues to the docs
154 350
new file mode 100644
... ...
@@ -0,0 +1,318 @@
1
+# Exported classes -------------------------------------------------------------
2
+
3
+# NOTE: This create a 'classGeneratorFunction' (internal constructor), .BSseq()
4
+.BSseq <- setClass(
5
+    "BSseq",
6
+    slots = representation(
7
+        trans = "function",
8
+        parameters = "list"),
9
+    contains = "RangedSummarizedExperiment"
10
+)
11
+
12
+# Validity methods -------------------------------------------------------------
13
+
14
+.checkAssayNames <- function(object, names) {
15
+    if (all(names %in% assayNames(object))) {
16
+        return(NULL)
17
+    } else {
18
+        paste0(
19
+            "object of class ",
20
+            class(object),
21
+            " needs to have assay slots with names ",
22
+            paste0(names, collapse = ", "))
23
+
24
+    }
25
+}
26
+
27
+.checkMandCov <- function(M, Cov) {
28
+    msg <- NULL
29
+    validMsg(msg, .Call(cxx_check_M_and_Cov, M, Cov))
30
+}
31
+
32
+setValidity2("BSseq", function(object) {
33
+    msg <- NULL
34
+
35
+    if (identical(object, .BSseq())) {
36
+        # No validity checks for object returned by internal constructor
37
+        return(msg)
38
+    }
39
+    msg <- validMsg(msg, .checkAssayNames(object, c("Cov", "M")))
40
+
41
+    # TODO: Are colnames strictly necessary?
42
+    if (is.null(colnames(object))) {
43
+        msg <- validMsg(msg, "colnames (aka sampleNames) need to be set")
44
+    }
45
+
46
+    assay_rownames <- lapply(assays(object), rownames)
47
+    if (!all(S4Vectors:::sapply_isNULL(assay_rownames))) {
48
+        msg <- validMsg(msg, "unnecessary rownames on one-or-more assays")
49
+    }
50
+
51
+    msg <- validMsg(msg,
52
+                    .checkMandCov(assay(object, "M", withDimnames = FALSE),
53
+                                  assay(object, "Cov", withDimnames = FALSE)))
54
+
55
+    if (is.null(msg)) TRUE else msg
56
+})
57
+
58
+# Internal functions -----------------------------------------------------------
59
+
60
+.oldTrans <- function(x) {
61
+    y <- x
62
+    ix <- which(x < 0)
63
+    ix2 <- which(x > 0)
64
+    y[ix] <- exp(x[ix])/(1 + exp(x[ix]))
65
+    y[ix2] <- 1/(1 + exp(-x[ix2]))
66
+    y
67
+}
68
+
69
+# Exported functions -----------------------------------------------------------
70
+
71
+# TODO: BSseq() is arguably a bad constructor. It doesn't return a valid BSseq
72
+#       object when called without any arguments. It also does some pretty
73
+#       complicated parsing of the inputs. Still, I think we're stuck with it
74
+#       because it's been around for a long time.
75
+BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
76
+                  trans = NULL, parameters = NULL, pData = NULL,
77
+                  gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL,
78
+                  rmZeroCov = FALSE) {
79
+
80
+    # Process 'gr', or 'pos' and 'chr'
81
+    if (is.null(gr)) {
82
+        if (is.null(pos) || is.null(chr)) {
83
+            stop("Need 'pos' and 'chr' if 'gr' not supplied.")
84
+        }
85
+        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1L))
86
+    }
87
+    if (!is(gr, "GRanges")) stop("'gr' needs to be a GRanges.")
88
+    if (any(width(gr) != 1L)) stop("'gr' needs to have widths of 1.")
89
+
90
+    # Process 'M' and 'Cov'
91
+    if (is.null(M) || is.null(Cov)) stop("Need 'M' and 'Cov'.")
92
+    if (length(gr) != nrow(M) ||
93
+        length(gr) != nrow(Cov) ||
94
+        ncol(Cov) != ncol(M)) {
95
+        stop("'gr', 'M' and 'Cov' need to have compatible dimensions.")
96
+    }
97
+    if (!is.null(rownames(M))) rownames(M) <- NULL
98
+    if (!is.null(rownames(Cov))) rownames(Cov) <- NULL
99
+    if (!is.null(names(gr))) names(gr) <- NULL
100
+
101
+    # Process 'sampleNames' and 'pData'
102
+    if (is.null(pData)) {
103
+        if (is.null(sampleNames)) {
104
+            if (!is.null(colnames(M))) {
105
+                sampleNames <- colnames(M)
106
+            } else if (!is.null(colnames(Cov))) {
107
+                sampleNames <- colnames(Cov)
108
+            } else {
109
+                sampleNames <- paste0("V", seq_len(ncol(M)))
110
+            }
111
+        }
112
+        pData <- DataFrame(row.names = sampleNames)
113
+    } else {
114
+        pData <- as(pData, "DataFrame")
115
+    }
116
+    if (is.null(sampleNames) && !is.null(pData) && !is.null(rownames(pData))) {
117
+        sampleNames <- rownames(pData)
118
+    }
119
+    if (length(unique(sampleNames)) != ncol(M)) {
120
+        stop("sampleNames need to be unique and of the right length.")
121
+    }
122
+
123
+    # TODO: Is this really necessary? It complicates things because HDF5Matrix
124
+    #       will be degraded to a DelayedMatrix
125
+    # Add column names to assays
126
+    if (is.null(colnames(M)) || any(sampleNames != colnames(M))) {
127
+        colnames(M) <- sampleNames
128
+    }
129
+    if (is.null(colnames(Cov)) || any(sampleNames != colnames(Cov))) {
130
+        colnames(Cov) <- sampleNames
131
+    }
132
+    if (!is.null(coef)) {
133
+        if (nrow(coef) != nrow(M) || ncol(coef) != ncol(M)) {
134
+            stop("'coef' does not have the right dimensions")
135
+        }
136
+        if (is.null(colnames(coef)) || any(sampleNames != colnames(coef))) {
137
+            colnames(coef) <- sampleNames
138
+        }
139
+        if (!is.null(rownames(coef))) {
140
+            rownames(coef) <- NULL
141
+        }
142
+    }
143
+    if (!is.null(se.coef)) {
144
+        if (nrow(se.coef) != nrow(M) || ncol(se.coef) != ncol(M)) {
145
+            stop("'se.coef' does not have the right dimensions")
146
+        }
147
+        if (is.null(colnames(se.coef)) ||
148
+            any(sampleNames != colnames(se.coef))) {
149
+            colnames(se.coef) <- sampleNames
150
+        }
151
+        if (!is.null(rownames(se.coef))) {
152
+            rownames(se.coef) <- NULL
153
+        }
154
+    }
155
+
156
+    # Optionally, remove positions with Cov == 0
157
+    if (rmZeroCov) {
158
+        loci_with_zero_cov <- rowAlls(Cov, value = 0)
159
+        if (any(loci_with_zero_cov)) {
160
+            gr <- gr[!loci_with_zero_cov]
161
+            M <- M[!loci_with_zero_cov, , drop = FALSE]
162
+            Cov <- Cov[!loci_with_zero_cov, , drop = FALSE]
163
+        }
164
+    }
165
+
166
+    # Collapse duplicate loci
167
+    if (any(duplicated(gr))) {
168
+        warning("Detected duplicate loci. Collapsing counts in 'M' and 'Cov' ",
169
+                "at these positions.")
170
+        if (!is.null(coef) || !is.null(se.coef)) {
171
+            stop("Cannot collapse when 'coef' or 'se.coef' are present.")
172
+        }
173
+        # NOTE: reduce() sorts the output
174
+        unique_gr <- unique(gr)
175
+        ol <- findOverlaps(unique_gr, gr)
176
+        gr <- unique(gr)
177
+        idx <- as.list(ol)
178
+        M <- .collapseMatrixLike(M, idx, MARGIN = 2L)
179
+        Cov <- .collapseMatrixLike(Cov, idx, MARGIN = 2L)
180
+    }
181
+
182
+    # Sort loci
183
+    if (is.unsorted(gr)) {
184
+        o <- order(gr)
185
+        gr <- gr[o]
186
+        M <- M[o, , drop = FALSE]
187
+        Cov <- Cov[o, , drop = FALSE]
188
+        coef <- coef[o, , drop = FALSE]
189
+        se.coef <- se.coef[o, , drop = FALSE]
190
+    }
191
+
192
+    # Construct BSseq object
193
+    assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
194
+    assays <- assays[!S4Vectors:::sapply_isNULL(assays)]
195
+    se <- SummarizedExperiment(assays = assays, rowRanges = gr, colData = pData)
196
+    if (is.null(parameters)) {
197
+        parameters <- list()
198
+    }
199
+    if (is.null(trans)) {
200
+        trans <- function(x) NULL
201
+    }
202
+    .BSseq(se, trans = trans, parameters = parameters)
203
+}
204
+
205
+hasBeenSmoothed <- function(BSseq) "coef" %in% assayNames(BSseq)
206
+
207
+# TODO: Document withDimnames
208
+getBSseq <- function(BSseq,
209
+                     type = c("Cov", "M", "gr", "coef", "se.coef", "trans",
210
+                              "parameters"),
211
+                     withDimnames = TRUE) {
212
+    type <- match.arg(type)
213
+    if (type %in% c("M", "Cov")) {
214
+        return(assay(BSseq, type, withDimnames = withDimnames))
215
+    }
216
+    if (type %in% c("coef", "se.coef") && type %in% assayNames(BSseq)) {
217
+        return(assay(BSseq, type, withDimnames = withDimnames))
218
+    }
219
+    if (type %in% c("coef", "se.coef")) return(NULL)
220
+    if (type == "trans") return(BSseq@trans)
221
+    if (type == "parameters") return(BSseq@parameters)
222
+    if (type == "gr") return(BSseq@rowRanges)
223
+
224
+}
225
+
226
+strandCollapse <- function(BSseq, shift = TRUE) {
227
+    if (all(runValue(strand(BSseq)) == "*")) {
228
+        warning("All loci are unstranded; nothing to collapse")
229
+        return(BSseq)
230
+    }
231
+    if (!(all(runValue(strand(BSseq)) %in% c("+", "-")))) {
232
+        stop("'BSseq' object has a mix of stranded and unstranded loci.")
233
+    }
234
+    BS.forward <- BSseq[strand(BSseq) == "+"]
235
+    strand(BS.forward) <- "*"
236
+    BS.reverse <- BSseq[strand(BSseq) == "-"]
237
+    strand(BS.reverse) <- "*"
238
+    if (shift) rowRanges(BS.reverse) <- shift(rowRanges(BS.reverse), -1L)
239
+    sampleNames(BS.reverse) <- paste0(sampleNames(BS.reverse), "_REVERSE")
240
+    BS.comb <- combine(BS.forward, BS.reverse)
241
+    newBSseq <- collapseBSseq(BS.comb, columns = rep(sampleNames(BSseq), 2))
242
+    newBSseq
243
+}
244
+
245
+# Exported methods -------------------------------------------------------------
246
+
247
+setMethod("show", signature(object = "BSseq"), function(object) {
248
+    cat("An object of type 'BSseq' with\n")
249
+    cat(" ", nrow(object), "methylation loci\n")
250
+    cat(" ", ncol(object), "samples\n")
251
+    if (hasBeenSmoothed(object)) {
252
+        cat("has been smoothed with\n")
253
+        cat(" ", object@parameters$smoothText, "\n")
254
+    } else {
255
+        cat("has not been smoothed\n")
256
+    }
257
+    if (.isHDF5ArrayBacked(object)) {
258
+        cat("Some assays are HDF5Array-backed\n")
259
+    } else {
260
+        cat("All assays are in-memory\n")
261
+    }
262
+})
263
+
264
+setMethod("pData", "BSseq", function(object) {
265
+    object@colData
266
+})
267
+
268
+setReplaceMethod(
269
+    "pData",
270
+    signature = signature(object = "BSseq", value = "data.frame"),
271
+    function(object, value) {
272
+        colData(object) <- as(value, "DataFrame")
273
+        object
274
+    }
275
+)
276
+
277
+setReplaceMethod(
278
+    "pData",
279
+    signature = signature(object = "BSseq", value = "DataFrame"),
280
+    function(object, value) {
281
+        colData(object) <- value
282
+        object
283
+    }
284
+)
285
+
286
+setMethod("sampleNames", "BSseq", function(object) colnames(object))
287
+
288
+setReplaceMethod(
289
+    "sampleNames",
290
+    signature = signature(object = "BSseq", value = "ANY"),
291
+    function(object, value) {
292
+        colnames(object) <- value
293
+        object
294
+    }
295
+)
296
+
297
+setMethod("updateObject", "BSseq",
298
+          function(object, ...) {
299
+              # NOTE: identical() is too strong
300
+              if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
301
+                  object@trans <- plogis
302
+              }
303
+              if (is(object, "SummarizedExperiment")) {
304
+                  # NOTE: Call method for SummarizedExperiment objects
305
+                  object <- callNextMethod()
306
+                  return(object)
307
+              } else {
308
+                  BSseq(gr = object@gr,
309
+                        M = object@M,
310
+                        Cov = object@Cov,
311
+                        coef = object@coef,
312
+                        se.coef = object@se.coef,
313
+                        trans = object@trans,
314
+                        parameters = object@parameters,
315
+                        pData = object@phenoData@data)
316
+              }
317
+          }
318
+)
0 319
deleted file mode 100644
... ...
@@ -1,355 +0,0 @@
1
-setClass("BSseq", contains = "RangedSummarizedExperiment",
2
-         representation(trans = "function",
3
-                        parameters = "list"))
4
-
5
-setValidity("BSseq", function(object) {
6
-    msg <- validMsg(NULL, .checkAssayNames(object, c("Cov", "M")))
7
-    msg <- validMsg(msg, .checkAssayClasses(object,
8
-                                            c("Cov", "M", "coef", "se.coef")))
9
-    if(class(rowRanges(object)) != "GRanges")
10
-        msg <- validMsg(msg, sprintf("object of class '%s' needs to have a 'GRanges' in slot 'rowRanges'", class(object)))
11
-    ## benchmarking shows that min(assay()) < 0 is faster than any(assay() < 0) if it is false
12
-    if(is.null(colnames(object)))
13
-        msg <- validMsg(msg, "colnames (aka sampleNames) need to be set")
14
-    M <- assay(object, "M", withDimnames = FALSE)
15
-    Cov <- assay(object, "Cov", withDimnames = FALSE)
16
-    if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) {
17
-        ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so
18
-        ##       we skip it for the time being
19
-        if(min(assay(object, "M", withDimnames = FALSE)) < 0)
20
-            msg <- validMsg(msg, "the 'M' assay has negative entries")
21
-        if(min(assay(object, "Cov", withDimnames = FALSE)) < 0)
22
-            msg <- validMsg(msg, "the 'Cov' assay has negative entries")
23
-        if(max(assay(object, "M", withDimnames = FALSE) -
24
-               assay(object, "Cov", withDimnames = FALSE)) > 0.5)
25
-            msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay")
26
-    }
27
-    if(!is.null(rownames(M)) || !is.null(rownames(Cov)) ||
28
-       ("coef" %in% assayNames(object) && !is.null(rownames(assay(object, "coef")))) ||
29
-       ("se.coef" %in% assayNames(object) && !is.null(rownames(assay(object, "se.coef")))))
30
-        warning("unnecessary rownames in object")
31
-    if (is.null(msg)) TRUE else msg
32
-})
33
-
34
-setMethod("show", signature(object = "BSseq"),
35
-          function(object) {
36
-              cat("An object of type 'BSseq' with\n")
37
-              cat(" ", nrow(object), "methylation loci\n")
38
-              cat(" ", ncol(object), "samples\n")
39
-              if(hasBeenSmoothed(object)) {
40
-                  cat("has been smoothed with\n")
41
-                  cat(" ", object@parameters$smoothText, "\n")
42
-              } else {
43
-                  cat("has not been smoothed\n")
44
-              }
45
-              if (.isHDF5ArrayBacked(object)) {
46
-                  cat("Some assays are HDF5Array-backed\n")
47
-              } else {
48
-                  cat("All assays are in-memory\n")
49
-              }
50
-          })
51
-
52
-setMethod("pData", "BSseq", function(object) {
53
-    object@colData
54
-})
55
-
56
-setReplaceMethod("pData",
57
-                 signature = signature(
58
-                      object = "BSseq",
59
-                      value = "data.frame"),
60
-                 function(object, value) {
61
-                     colData(object) <- as(value, "DataFrame")
62
-                     object
63
-                 })
64
-
65
-setReplaceMethod("pData",
66
-                 signature = signature(
67
-                      object = "BSseq",
68
-                      value = "DataFrame"),
69
-                 function(object, value) {
70
-                     colData(object) <- value
71
-                     object
72
-                 })
73
-
74
-setMethod("sampleNames", "BSseq", function(object) {
75
-    colnames(object)
76
-})
77
-
78
-setReplaceMethod("sampleNames",
79
-                 signature = signature(
80
-                     object = "BSseq",
81
-                     value = "ANY"),
82
-                 function(object, value) {
83
-                     colnames(object) <- value
84
-                     object
85
-                 })
86
-
87
-setMethod("length", "BSseq", function(x) {
88
-    length(granges(x))
89
-})
90
-
91
-hasBeenSmoothed <- function(BSseq) {
92
-    "coef" %in% assayNames(BSseq)
93
-}
94
-
95
-getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans", "parameters")) {
96
-    type <- match.arg(type)
97
-    if(type %in% c("M", "Cov"))
98
-        return(assay(BSseq, type))
99
-    if(type %in% c("coef", "se.coef") && type %in% assayNames(BSseq))
100
-        return(assay(BSseq, type))
101
-    if(type %in% c("coef", "se.coef"))
102
-       return(NULL)
103
-    if(type == "trans")
104
-        return(BSseq@trans)
105
-    if(type == "parameters")
106
-        return(BSseq@parameters)
107
-    if(type == "gr")
108
-        return(BSseq@rowRanges)
109
-
110
-}
111
-
112
-BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
113
-                  trans = NULL, parameters = NULL, pData = NULL,
114
-                  gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL,
115
-                  rmZeroCov = FALSE) {
116
-    if(is.null(gr)) {
117
-        if(is.null(pos) || is.null(chr))
118
-            stop("Need pos and chr")
119
-        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1))
120
-    }
121
-    if(!is(gr, "GRanges"))
122
-        stop("'gr' needs to be a GRanges")
123
-    if(any(width(gr) != 1))
124
-        stop("'gr' needs to have widths of 1")
125
-    if(is.null(M) || is.null(Cov))
126
-        stop("Need M and Cov")
127
-    M <- .DelayedMatrix(M)
128
-    Cov <- .DelayedMatrix(Cov)
129
-    if (!is.null(coef)) {
130
-        coef <- .DelayedMatrix(coef)
131
-    }
132
-    if (!is.null(se.coef)) {
133
-        se.coef <- .DelayedMatrix(se.coef)
134
-    }
135
-    if(length(gr) != nrow(M) ||
136
-       length(gr) != nrow(Cov) ||
137
-       ncol(Cov) != ncol(M))
138
-        stop("'gr', 'M' and 'Cov' need to have similar dimensions")
139
-    if(!is.null(rownames(M)))
140
-        rownames(M) <- NULL
141
-    if(!is.null(rownames(Cov)))
142
-        rownames(Cov) <- NULL
143
-    if(!is.null(names(gr)))
144
-        names(gr) <- NULL
145
-    ## deal with sampleNames
146
-    if(!is(pData, "DataFrame"))
147
-        pData <- as(pData, "DataFrame")
148
-    if(is.null(sampleNames) && !is.null(pData) && !is.null(rownames(pData)))
149
-        sampleNames <- rownames(pData)
150
-    if(is.null(sampleNames) && !is.null(colnames(M)))
151
-        sampleNames <- colnames(M)
152
-    if(is.null(sampleNames) && !is.null(colnames(Cov)))
153
-        sampleNames <- colnames(Cov)
154
-    if(is.null(sampleNames))
155
-        sampleNames <- paste("V", 1:ncol(M), sep = "")
156
-    if(length(unique(sampleNames)) != ncol(M))
157
-        stop("sampleNames need to be unique and of the right length.")
158
-    ## check that 0 <= M <= Cov and remove positions with Cov = 0
159
-    if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) {
160
-        ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so
161
-        ##       we skip it for the time being
162
-        if (any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) ||
163
-            any(is.infinite(Cov))) {
164
-            stop("'M' and 'Cov' may not contain NA or infinite values and ",
165
-                 "0 <= M <= Cov")
166
-        }
167
-    }
168
-    if(rmZeroCov) {
169
-        wh <- which(rowSums2(Cov) == 0)
170
-        if(length(wh) > 0) {
171
-            gr <- gr[-wh]
172
-            M <- M[-wh,,drop = FALSE]
173
-            Cov <- Cov[-wh,,drop = FALSE]
174
-        }
175
-    }
176
-    grR <- reduce(gr, min.gapwidth = 0L)
177
-    if(!identical(grR, gr)) {
178
-        ## Now we either need to re-order or collapse or both
179
-        mm <- as.matrix(findOverlaps(grR, gr))
180
-        mm <- mm[order(mm[,1]),]
181
-        if(length(grR) == length(gr)) {
182
-            ## only re-ordering is necessary
183
-            gr <- grR
184
-            M <- M[mm[,2],,drop = FALSE]
185
-            Cov <- Cov[mm[,2],,drop = FALSE]
186
-            if(!is.null(coef))
187
-                coef <- coef[mm[,2],,drop = FALSE]
188
-            if(!is.null(se.coef))
189
-                se.coef <- se.coef[mm[,2],, drop = FALSE]
190
-        } else {
191
-            warning("multiple positions, collapsing BSseq object\n")
192
-            if(!is.null(coef) || !is.null(se.coef))
193
-                stop("Cannot collapse when 'coef' or 'se.coef' are present")
194
-            gr <- grR
195
-            sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))]
196
-            names(sp) <- NULL
197
-            # TODO: .collapseDelayedMatrix() always return numeric; it may be
198
-            #       worth coercing M and Cov to integer DelayedMatrix objects,
199
-            #       which would halve storage requirements and impose some more
200
-            #       structure on the BSseq class (via new validity method
201
-            #       checks)
202
-            # NOTE: Tries to be smart about how collapsed DelayedMatrix should
203
-            #       be realized
204
-            if (.isHDF5ArrayBacked(M)) {
205
-                M_BACKEND <- "HDF5Array"
206
-            } else {
207
-                M_BACKEND <- NULL
208
-            }
209
-            M <- .collapseDelayedMatrix(x = M,
210
-                                        sp = sp,
211
-                                        MARGIN = 2,
212
-                                        BACKEND = M_BACKEND)
213
-            if (.isHDF5ArrayBacked(Cov)) {
214
-                Cov_BACKEND <- "HDF5Array"
215
-            } else {
216
-                Cov_BACKEND <- NULL
217
-            }
218
-            Cov <- .collapseDelayedMatrix(x = Cov,
219
-                                          sp = sp,
220
-                                          MARGIN = 2,
221
-                                          BACKEND = Cov_BACKEND)
222
-        }
223
-    }
224
-    if(is.null(colnames(M)) || any(sampleNames != colnames(M)))
225
-        colnames(M) <- sampleNames
226
-    if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov)))
227
-        colnames(Cov) <- sampleNames
228
-    if(!is.null(coef)) {
229
-
230
-        if(nrow(coef) != nrow(M) ||
231
-           ncol(coef) != ncol(M))
232
-            stop("'coef' does not have the right dimensions")
233
-        if(is.null(colnames(coef)) || any(sampleNames != colnames(coef)))
234
-            colnames(coef) <- sampleNames
235
-        if(!is.null(rownames(coef)))
236
-            rownames(coef) <- NULL
237
-    }
238
-    if(!is.null(se.coef)) {
239
-        if(nrow(se.coef) != nrow(M) ||
240
-           ncol(se.coef) != ncol(M))
241
-            stop("'se.coef' does not have the right dimensions")
242
-        if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef)))
243
-            colnames(se.coef) <- sampleNames
244
-        if(!is.null(rownames(se.coef)))
245
-            rownames(se.coef) <- NULL
246
-    }
247
-    assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
248
-    assays <- assays[!sapply(assays, is.null)]
249
-    if(is.null(pData) || all(dim(pData) == c(0,0)))
250
-        BSseq <- SummarizedExperiment(assays = assays, rowRanges = gr)
251
-    else
252
-        BSseq <- SummarizedExperiment(assays = assays, rowRanges = gr, colData = pData)
253
-    BSseq <- as(BSseq, "BSseq")
254
-    if(is.function(trans))
255
-        BSseq@trans <- trans
256
-    if(is.list(parameters))
257
-        BSseq@parameters <- parameters
258
-    BSseq
259
-}
260
-
261
-setMethod("updateObject", "BSseq",
262
-          function(object, ...) {
263
-              # NOTE: identical() is too strong
264
-              if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
265
-                  object@trans <- plogis
266
-              }
267
-              if (.hasSlot(object, "assays")) {
268
-                  # call method for RangedSummarizedExperiment objects
269
-                  object <- callNextMethod()
270
-                  assays(object) <- endoapply(
271
-                      assays(object, withDimnames = FALSE), function(assay) {
272
-                          if (is.null(assay)) {
273
-                              return(assay)
274
-                          } else {
275
-                              .DelayedMatrix(assay)
276
-                          }
277
-                      })
278
-                  object
279
-              } else {
280
-                  BSseq(gr = object@gr, M = object@M, Cov = object@Cov,
281
-                        coef = object@coef, se.coef = object@se.coef,
282
-                        trans = object@trans, parameters = object@parameters,
283
-                        pData = object@phenoData@data)
284
-              }
285
-          })
286
-
287
-
288
-strandCollapse <- function(BSseq, shift = TRUE) {
289
-    if(all(runValue(strand(BSseq)) == "*")) {
290
-        warning("All loci are unstranded; nothing to collapse")
291
-        return(BSseq)
292
-    }
293
-    if(!(all(runValue(strand(BSseq)) %in% c("+", "-"))))
294
-        stop("'BSseq' object has a mix of stranded and unstranded loci.")
295
-    BS.forward <- BSseq[strand(BSseq) == "+"]
296
-    strand(BS.forward) <- "*"
297
-    BS.reverse <- BSseq[strand(BSseq) == "-"]
298
-    strand(BS.reverse) <- "*"
299
-    if(shift)
300
-        rowRanges(BS.reverse) <- shift(granges(BS.reverse), -1L)
301
-    sampleNames(BS.reverse) <- paste0(sampleNames(BS.reverse), "_REVERSE")
302
-    BS.comb <- combine(BS.forward, BS.reverse)
303
-    newBSseq <- collapseBSseq(BS.comb, columns = rep(sampleNames(BSseq), 2))
304
-    newBSseq
305
-}
306
-
307
-## getD <- function(data, sample1, sample2, type = c("raw", "fit"),
308
-##                  addPositions = FALSE, alpha = 0.95) {
309
-##     d.conf <- function(p1, n1, p2, n2) {
310
-##         ## Method 10 from Newcombe (Stat in Med, 1998)
311
-##         getRoots <- function(p, n) {
312
-##             mat <- cbind(p^2, -(2*p + z^2/n), 1+z^2/n)
313
-##             roots <- t(Re(apply(mat, 1, polyroot)))
314
-##             roots
315
-##         }
316
-##         roots1 <- roots2 <- matrix(NA_real_, nrow = length(p1), ncol = 2)
317
-##         idx <- !is.na(p1)
318
-##         roots1[idx,] <- getRoots(p1[idx], n1[idx])
319
-##         idx <- !is.na(p2)
320
-##         roots2[idx,] <- getRoots(p2[idx], n2[idx])
321
-##         d <- p1 - p2
322
-##         suppressWarnings({
323
-##             lower <- d - z * sqrt(roots1[,1]*(1-roots1[,1])/n1 + roots2[,2]*(1-roots2[,2])/n2)
324
-##             upper <- d + z * sqrt(roots1[,2]*(1-roots1[,2])/n1 + roots2[,1]*(1-roots2[,1])/n2)
325
-##         })
326
-##         return(data.frame(d = d, lower = lower, upper = upper))
327
-##     }
328
-##     type <- match.arg(type)
329
-##     z <- abs(qnorm((1-alpha)/2, mean = 0, sd = 1))
330
-##     switch(type,
331
-##            raw = {
332
-##                conf <- d.conf(p1 = data$M[, sample1] / data$Cov[, sample1],
333
-##                               n1 = data$Cov[, sample1],
334
-##                               p2 = data$M[, sample2] / data$Cov[, sample2],
335
-##                               n2 = data$Cov[, sample2])
336
-##                out <- conf
337
-##            },
338
-##            fit = {
339
-##                p1 <- data$trans(data$coef[, sample1])
340
-##                p2 <- data$trans(data$coef[, sample2])
341
-##                d <- p1 - p2
342
-##                ## Based on the delta method
343
-##                se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 +
344
-##                             (data$se.coef[, sample2] * p2 * (1-p2))^2)
345
-
346
-##                lower <- d - z * se.d
347
-##                upper <- d + z * se.d
348
-##                out <- data.frame(d = d, lower = lower, upper = upper)
349
-##            })
350
-##     if(addPositions) {
351
-##         out$pos <- start(data$gr)
352
-##         out$chr <- seqnames(data$gr)
353
-##     }
354
-##     out
355
-## }
... ...
@@ -1,68 +1,25 @@
1
-collapseBSseq <- function(BSseq, columns) {
2
-    ## columns is a vector of new names, names(columns) is sampleNames or empty
3
-    stopifnot(is.character(columns))
4
-    if(is.null(names(columns)) && length(columns) != ncol(BSseq))
5
-        stop("if `columns' does not have names, it needs to be of the same length as `BSseq` has columns (samples)")
6
-    if(!is.null(names(columns)) && !all(names(columns) %in% sampleNames(BSseq)))
7
-        stop("if `columns` has names, they need to be sampleNames(BSseq)")
8
-    if(is.null(names(columns)))
9
-        columns.idx <- 1:ncol(BSseq)
10
-    else
11
-        columns.idx <- match(names(columns), sampleNames(BSseq))
12
-    sp <- split(columns.idx, columns)
13
-    # TODO: .collapseDelayedMatrix() always return numeric; it may be
14
-    #       worth coercing M and Cov to integer DelayedMatrix objects,
15
-    #       which would halve storage requirements and impose some more
16
-    #       structure on the BSseq class (via new validity method
17
-    #       checks)
18
-    # NOTE: Tries to be smart about how collapsed DelayedMatrix should
19
-    #       be realized
20
-    M <- getBSseq(BSseq, "M")
21
-    if (.isHDF5ArrayBacked(M)) {
22
-        M_BACKEND <- "HDF5Array"
23
-    } else {
24
-        M_BACKEND <- NULL
25
-    }
26
-    M <- .collapseDelayedMatrix(x = M,
27
-                                sp = sp,
28
-                                MARGIN = 1,
29
-                                BACKEND = M_BACKEND)
30
-    Cov <- getBSseq(BSseq, "Cov")
31
-    if (.isHDF5ArrayBacked(Cov)) {
32
-        Cov_BACKEND <- "HDF5Array"
33
-    } else {
34
-        Cov_BACKEND <- NULL
35
-    }
36
-    Cov <- .collapseDelayedMatrix(x = Cov,
37
-                                  sp = sp,
38
-                                  MARGIN = 1,
39
-                                  BACKEND = Cov_BACKEND)
40
-    BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov, sampleNames = names(sp))
41
-}
42
-
43 1
 chrSelectBSseq <- function(BSseq, seqnames = NULL, order = FALSE) {
44 2
     seqlevels(BSseq, pruning.mode = "coarse") <- seqnames
45
-    if(order)
46
-        BSseq <- orderBSseq(BSseq, seqOrder = seqnames)
3
+    if (order) BSseq <- orderBSseq(BSseq, seqOrder = seqnames)
47 4
     BSseq
48 5
 }
49 6
 
50
-
51 7
 orderBSseq <- function(BSseq, seqOrder = NULL) {
52
-    if(!is.null(seqOrder))
8
+    if (!is.null(seqOrder)) {
53 9
         seqlevels(BSseq, pruning.mode = "coarse") <- seqOrder
10
+    }
54 11
     BSseq[order(granges(BSseq))]
55 12
 }
56 13
 
57
-
58 14
 # TODO: getMeth() realises the result in memory iff regions is not NULL;
59 15
 #       discuss with Kasper
60 16
 # TODO: Whether or not colnames are added to returned value depends on whether
61 17
 #       regions is non-NULL; discuss with Kasper
62 18
 # TODO: Add parallel support
19
+# TODO: Document withDimnames
63 20
 getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
64 21
                     what = c("perBase", "perRegion"), confint = FALSE,
65
-                    alpha = 0.95) {
22
+                    alpha = 0.95, withDimnames = TRUE) {
66 23
     p.conf <- function(p, n, alpha) {
67 24
         z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
68 25
         upper <- (p + z ^ 2 / (2 * n) +
... ...
@@ -85,21 +42,21 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
85 42
     }
86 43
     z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
87 44
     if (is.null(regions) && type == "smooth") {
88
-        coef <- getBSseq(BSseq, type = "coef")
89
-        meth <- getBSseq(BSseq, type = "trans")(coef)
45
+        coef <- getBSseq(BSseq, "coef", withDimnames)
46
+        meth <- getBSseq(BSseq, "trans", withDimnames)(coef)
90 47
         if (confint) {
91
-            upper <- meth + z * getBSseq(BSseq, type = "se.coef")
92
-            lower <- meth - z * getBSseq(BSseq, type = "se.coef")
48
+            upper <- meth + z * getBSseq(BSseq, "se.coef", withDimnames)
49
+            lower <- meth - z * getBSseq(BSseq, "se.coef", withDimnames)
93 50
             return(list(meth = meth, lower = lower, upper = upper))
94 51
         } else {
95 52
             return(meth)
96 53
         }
97 54
     }
98 55
     if (is.null(regions) && type == "raw") {
99
-        meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov")
56
+        meth <- getBSseq(BSseq, "M", withDimnames) /
57
+            getBSseq(BSseq, "Cov", withDimnames)
100 58
         if (confint) {
101
-            return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"),
102
-                          alpha = alpha))
59
+            return(p.conf(meth, getBSseq(BSseq, "Cov", withDimnames), alpha))
103 60
         } else {
104 61
             return(meth)
105 62
         }
... ...
@@ -119,12 +76,14 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
119 76
     #       chunks if what = perRegion
120 77
     if (type == "smooth") {
121 78
         meth <- as.matrix(
122
-            getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef"))[queryHits(ov), ,
123
-                                                              drop = FALSE])
79
+            getBSseq(BSseq, "trans", withDimnames)(
80
+                getBSseq(BSseq, "coef", withDimnames))[
81
+                    queryHits(ov), , drop = FALSE])
124 82
     } else if (type == "raw") {
125 83
         meth <- as.matrix(
126
-            (getBSseq(BSseq, "M") / getBSseq(BSseq, "Cov"))[queryHits(ov), ,
127
-                                                            drop = FALSE])
84
+            (getBSseq(BSseq, "M", withDimnames) /
85
+                 getBSseq(BSseq, "Cov", withDimnames))[
86
+                     queryHits(ov), , drop = FALSE])
128 87
     }
129 88
     out <- lapply(split(meth, subjectHits(ov)), matrix, ncol = ncol(meth))
130 89
     if (what == "perBase") {
... ...
@@ -138,9 +97,9 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
138 97
         # TODO: Don't really understand the logic of the remaining code; how
139 98
         #       could the rows end up in the wrong order?
140 99
         outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
141
-        colnames(outMatrix) <- sampleNames(BSseq)
100
+        if (withDimnames) colnames(outMatrix) <- sampleNames(BSseq)
142 101
         outMatrix[as.integer(rownames(out)), ] <- out
143
-        .DelayedMatrix(outMatrix)
102
+        outMatrix
144 103
     }
145 104
 }
146 105
 
... ...
@@ -148,21 +107,23 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
148 107
 #       discuss with Kasper
149 108
 # TODO: Whether or not colnames are added to returned value depends on whether
150 109
 #       regions is non-NULL; discuss with Kasper
110
+# TODO: Document withDimnames
151 111
 getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
152 112
                         what = c("perBase", "perRegionAverage",
153
-                                 "perRegionTotal")) {
113
+                                 "perRegionTotal"),
114
+                        withDimnames = TRUE) {
154 115
     stopifnot(is(BSseq, "BSseq"))
155 116
     type <- match.arg(type)
156 117
     what <- match.arg(what)
157 118
     if (is.null(regions)) {
158 119
         if (what == "perBase") {
159
-            return(getBSseq(BSseq, type = type))
120
+            return(getBSseq(BSseq, type, withDimnames))
160 121
         }
161 122
         if (what == "perRegionTotal") {
162
-            return(colSums2(getBSseq(BSseq, type = type)))
123
+            return(colSums2(getBSseq(BSseq, type, withDimnames)))
163 124
         }
164 125
         if (what == "perRegionAverage") {
165
-            return(colMeans2(getBSseq(BSseq, type = type)))
126
+            return(colMeans2(getBSseq(BSseq, type, withDimnames)))
166 127
         }
167 128
     }
168 129
     if (class(regions) == "data.frame") {
... ...
@@ -171,11 +132,8 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
171 132
     stopifnot(is(regions, "GenomicRanges"))
172 133
     grBSseq <- granges(BSseq)
173 134
     ov <- findOverlaps(grBSseq, regions)
174
-    if (type == "Cov") {
175
-        coverage <- getBSseq(BSseq, "Cov")[queryHits(ov), , drop = FALSE]
176
-    } else if (type == "M") {
177
-        coverage <- getBSseq(BSseq, "M")[queryHits(ov), , drop = FALSE]
178
-    }
135
+    coverage <- getBSseq(BSseq, type, withDimnames)[
136
+        queryHits(ov), , drop = FALSE]
179 137
     out <- lapply(split(coverage, subjectHits(ov)), matrix,
180 138
                   ncol = ncol(coverage))
181 139
 
... ...
@@ -193,7 +151,7 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
193 151
     # TODO: Don't really understand the logic of the remaining code; how
194 152
     #       could the rows end up in the wrong order?
195 153
     outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
196
-    colnames(outMatrix) <- sampleNames(BSseq)
197
-    outMatrix[as.integer(rownames(out)),] <- out
198
-    .DelayedMatrix(outMatrix)
154
+    if (withDimnames) colnames(outMatrix) <- sampleNames(BSseq)
155
+    outMatrix[as.integer(rownames(out)), ] <- out
156
+    outMatrix
199 157
 }
... ...
@@ -44,48 +44,12 @@
44 44
     is(x@seed, "matrix")
45 45
 }
46 46
 
47
-# NOTE: Equivalent to rowSums2(x[, j, drop = FALSE]) but does it using a
48
-#       delayed operation and always returns a nrow(x) x 1 DelayedMatrix
49
-.delayed_rowSums2 <- function(x, j) {
50
-    Reduce(`+`, lapply(j, function(jj) x[, jj, drop = FALSE]))
51
-}
52
-
53
-# NOTE: Equivalent to colSums2(x[i, , drop = FALSE]) but does it using a
54
-#       delayed operation and always returns a 1 x ncol(x) DelayedMatrix
55
-.delayed_colSums2 <- function(x, i) {
56
-    Reduce(`+`, lapply(i, function(ii) x[ii, , drop = FALSE]))
57
-}
58
-
59
-# MARGIN = 1: collapse using rowSums
60
-# MARGIN = 2: collapse using colSums
61
-.collapseDelayedMatrix <- function(x, sp, MARGIN, BACKEND = NULL) {
62
-    stopifnot(is(x, "DelayedMatrix"))
63
-    if (MARGIN == 1) {
64
-        if (is.null(BACKEND)) {
65
-            collapsed_x <- do.call(cbind, lapply(sp, function(j) {
66
-                rowSums2(x[, j, drop = FALSE])
67
-            }))
68
-        } else {
69
-            collapsed_x <- do.call(cbind, lapply(sp, function(j) {
70
-                .delayed_rowSums2(x, j)
71
-            }))
72
-            # NOTE: Need to manually add colnames when using this method
73
-            colnames(collapsed_x) <- names(sp)
74
-        }
75
-    } else if (MARGIN == 2) {
76
-        if (is.null(BACKEND)) {
77
-            collapsed_x <- do.call(rbind, lapply(sp, function(i) {
78
-                colSums2(x[i, , drop = FALSE])
79
-            }))
80
-        } else {
81
-            collapsed_x <- do.call(rbind, lapply(sp, function(i) {
82
-                .delayed_colSums2(x, i)
83
-            }))
84
-            # NOTE: Need to manually add rownames when using this method
85
-            rownames(collapsed_x) <- names(sp)
86
-        }
47
+.zero_type <- function(type) {
48
+    if (identical(type, "integer")) {
49
+        fill <- 0L
50
+    } else if (identical(type, "double")) {
51
+        fill <- 0
87 52
     } else {
88
-        stop("'MARGIN' must be 1 or 2")
53
+        stop("'type' = ", type, " is not supported")
89 54
     }
90
-    realize(collapsed_x, BACKEND = BACKEND)
91 55
 }
92 56
new file mode 100644
... ...
@@ -0,0 +1,123 @@
1
+# Internal generics ------------------------------------------------------------
2
+
3
+# 'idx': A list of columns (MARGIN = 1) or rows (MARGIN = 2) to collapse over.
4
+# E.g., `idx = list(1:3, 4:5)` with `MARGIN = 1` means collapse columns 1-3
5
+# into a single columna dn columns 4-5 into a single column.
6
+# MARGIN = 1: collapse along columns using rowSums2
7
+# MARGIN = 2: collapse along rows using colSums2
8
+# `...` are additional arguments passed to methods.
9
+setGeneric(
10
+    ".collapseMatrixLike",
11
+    function(x, idx, MARGIN, ...) standardGeneric,
12
+    signature = "x")
13
+
14
+# Internal methods -------------------------------------------------------------
15
+
16
+setMethod(".collapseMatrixLike", "matrix", function(x, idx, MARGIN) {
17
+    if (MARGIN == 1L) {
18
+        # TODO: Need colnames?
19
+        return(do.call(cbind, lapply(idx, function(j) rowSums2(x, cols = j))))
20
+    } else if (MARGIN == 2L) {
21
+        # TODO: Need rownames?
22
+        do.call(rbind, lapply(idx, function(i) colSums2(x, rows = i)))
23
+    } else {
24
+        stop("'MARGIN' must be 1 or 2")
25
+    }
26
+})
27
+
28
+setMethod(
29
+    ".collapseMatrixLike",
30
+    "DelayedMatrix",
31
+    function(x, idx, MARGIN, BPREDO = list(), BPPARAM = SerialParam()) {
32
+        # Set up intermediate RealizationSink objects of appropriate
33
+        # dimensions and type
34
+        # NOTE: `type = "double"` because .collapseMatrixLike,matrix-method
35
+        #       uses colSums2()/rowSums2(), which returns a numeric vector.
36
+        # NOTE: This is ultimately coerced to the output DelayedMatrix
37
+        #       object
38
+        # Set up ArrayGrid instances over `x` as well as "parallel"
39
+        # ArrayGrid instances over `sink`.
40
+        if (MARGIN == 1L) {
41
+            sink <- DelayedArray:::RealizationSink(
42
+                dim = c(nrow(x), length(idx)),
43
+                dimnames = list(rownames(x), names(idx)),
44
+                type = "double")
45
+            x_grid <- minfi:::colGrid(x)
46
+            sink_grid <- RegularArrayGrid(
47
+                refdim = dim(sink),
48
+                spacings = c(nrow(sink), ncol(sink) / length(x_grid)))
49
+        } else if (MARGIN == 2L) {
50
+            # TODO: Check sink has correct dim and dimnames
51
+            sink <- DelayedArray:::RealizationSink(
52
+                dim = c(length(idx), ncol(x)),
53
+                dimnames = list(names(idx), colnames(x)),
54
+                type = "double")
55
+            on.exit(close(sink))
56
+            x_grid <- minfi:::rowGrid(x)
57
+            sink_grid <- RegularArrayGrid(
58
+                refdim = dim(sink),
59
+                spacings = c(nrow(sink) / length(x_grid), ncol(sink)))
60
+        } else {
61
+            stop("'MARGIN' must be 1 or 2")
62
+        }
63
+
64
+        # Loop over blocks of 'x' and write to 'sink'.
65
+        minfi:::blockApplyWithRealization(
66
+            x = x,
67
+            FUN = .collapseMatrixLike,
68
+            idx = idx,
69
+            MARGIN = MARGIN,
70
+            sink = sink,
71
+            x_grid = x_grid,
72
+            sink_grid = sink_grid,
73
+            BPREDO = BPREDO,
74
+            BPPARAM = BPPARAM)
75
+
76
+        # Return as DelayedMatrix object
77
+        as(sink, "DelayedArray")
78
+    }
79
+)
80
+
81
+# Exported functions -----------------------------------------------------------
82
+
83
+collapseBSseq <- function(BSseq, columns) {
84
+    if (hasBeenSmoothed(BSseq)) {
85
+        warning("Collapsing a smoothed BSseq object. You will need to ",
86
+                "re-smooth using 'BSmooth()' on the returned object.")
87
+    }
88
+
89
+    # Construct index between current samples and collapsed samples
90
+    stopifnot(is.character(columns))
91
+    if (is.null(names(columns)) && length(columns) != ncol(BSseq)) {
92
+        stop("if `columns' does not have names, it needs to be of the same ",
93
+             "length as `BSseq` has columns (samples)")
94
+    }
95
+    if (!is.null(names(columns)) &&
96
+        !all(names(columns) %in% sampleNames(BSseq))) {
97
+        stop("if `columns` has names, they need to be sampleNames(BSseq)")
98
+    }
99
+    if (is.null(names(columns))) {
100
+        columns.idx <- seq_len(ncol(BSseq))
101
+    } else {
102
+        columns.idx <- match(names(columns), sampleNames(BSseq))
103
+    }
104
+    idx <- split(columns.idx, columns)
105
+
106
+    # Collapse 'M' and 'Cov' matrices
107
+    M <- .collapseMatrixLike(
108
+        x = assay(BSseq, "M", withDimnames = FALSE),
109
+        idx = idx,
110
+        MARGIN = 1L)
111
+    Cov <- .collapseMatrixLike(
112
+        x = assay(BSseq, "Cov", withDimnames = FALSE),
113
+        idx = idx,
114
+        MARGIN = 1L)
115
+
116
+    # Construct BSseq object
117
+    # TODO: Check sampleNames are preserved (could extract from names(idx) and
118
+    #       pass down to constructors).
119
+    se <- SummarizedExperiment(
120
+        assays = SimpleList(M = M, Cov = Cov),
121
+        rowRanges = rowRanges(BSseq))
122
+    .BSseq(se)
123
+}
... ...
@@ -1,224 +1,215 @@
1
-# TODO: Ultimately, combine() is just a special case of combineList(), so
2
-#       should simplify code
3
-setMethod("combine", signature(x = "BSseq", y = "BSseq"), function(x, y, ...) {
4
-    ## All of this assumes that we are place x and y "next" to each other,
5
-    ##  ie. we are not combining the same set of samples sequenced at different times
6
-    if (class(x) != class(y)) {
7
-        stop(paste("objects must be the same class, but are ",
8
-                   class(x), ", ", class(y), se = ""))
1
+# Internal functions -----------------------------------------------------------
2
+
3
+# TODO: Option to specify that objects should be combined into DelayedArray
4
+#       with a given backend (e.g., even if all inputs are matrix)?
5
+.combineList <- function(x, x_rowRanges, ans_rowRanges) {
6
+    x_is_matrix <- vapply(x, is.matrix, logical(1L))
7
+    if (isTRUE(all(x_is_matrix))) {
8
+        return(.combineList_matrix(x, x_rowRanges, ans_rowRanges))
9 9
     }
10
-    if (hasBeenSmoothed(x) && hasBeenSmoothed(y) &&
11
-        !all.equal(getBSseq(x, "trans"), getBSseq(y, "trans")) &&
12
-        !all.equal(getBSseq(x, "parameters"), getBSseq(y, "parameters"))) {
13
-        stop("'x' and 'y' need to be smoothed on the same scale")
10
+    .combineList_matrix_like(x, x_rowRanges, ans_rowRanges)
11
+}
12
+
13
+.combineList_matrix <- function(x, x_rowRanges, ans_rowRanges) {
14
+    stopifnot(identical(length(x), length(x_rowRanges)))
15
+    # Set up output matrix with appropriate dimension and type
16
+    # TODO: Modify minfi:::.highestType() to accept a list. Can then do
17
+    #       minfi:::.highestType(x)
18
+    x_types <- vapply(x, DelayedArray::type, character(1L))
19
+    ans_type <- typeof(do.call("c", lapply(x_types, vector)))
20
+    ans <- matrix(
21
+        data = .zero_type(ans_type),
22
+        nrow = length(ans_rowRanges),
23
+        ncol = sum(vapply(x, ncol, integer(1L))))
24
+
25
+    # Fill output matrix
26
+    col_offset <- 0L
27
+    for (k in seq_along(x_rowRanges)) {
28
+        ol <- findOverlaps(x_rowRanges[[k]], ans_rowRanges, type = "equal")
29
+        # row_idx <- findOverlaps(
30
+        #     query = x_rowRanges[[k]],
31
+        #     subject = ans_rowRanges,
32
+        #     type = "equal",
33
+        #     select = "first")
34
+        col_idx <- col_offset + seq_len(ncol(x[[k]]))
35
+        ans[subjectHits(ol), col_idx] <- x[[k]][queryHits(ol), , drop = FALSE]
36
+        col_offset <- col_offset + ncol(x[[k]])
14 37
     }
15
-    pData <- combine(as(pData(x), "data.frame"), as(pData(y), "data.frame"))
16
-    # TODO: Could relax this by sorting x and y before doing the next steps
17
-    if (identical(granges(x), granges(y))) {
18
-        # NOTE: Don't realize() M, Cov, coef, or se.coef because we are
19
-        #       cbind()-ing, which is very efficient for DelayedArray objects.
20
-        #       E.g., cbind()-ing a bunch of HDF5Matrix objects is super fast
21
-        #       and uses basically no memory whereas realize()-ing the final
22
-        #       object as a new HDF5Matrix can take a long time.
23
-        gr <- granges(x)
24
-        M <- cbind(getBSseq(x, "M"), getBSseq(y, "M"))
25
-        Cov <- cbind(getBSseq(x, "Cov"), getBSseq(y, "Cov"))
26
-        if (!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) {
27
-            coef <- NULL
28
-            se.coef <- NULL
29
-            trans <- NULL
30
-            parameters <- NULL
31
-        } else {
32
-            trans_x <- getBSseq(x, "trans")
33
-            trans_y <- getBSseq(y, "trans")
34
-            if (!all.equal(trans_x, trans_y)) {
35
-                stop("'x' and 'y' need to have the same 'trans'")
36
-            }
37
-            trans <- trans_x
38
-            parameters_x <- getBSseq(x, "parameters")
39
-            parameters_y <- getBSseq(y, "parameters")
40
-            if (!identical(parameters_x, parameters_y)) {
41
-                stop("'x' and 'y' need to have the same 'parameters'")
42
-            }
43
-            parameters <- parameters_x
44
-            coef <- cbind(getBSseq(x, "coef"), getBSseq(y, "coef"))
45
-            if (!is.null(getBSseq(x, "se.coef")) &&
46
-                !is.null(getBSseq(y, "se.coef"))) {
47
-                se.coef <- cbind(getBSseq(x, "se.coef"), getBSseq(y, "se.coef"))
48
-            } else {
49
-                if (!is.null(getBSseq(x, "se.coef")) ||
50
-                    !is.null(getBSseq(y, "se.coef"))) {
51
-                    warning("Setting 'se.coef' to NULL: Cannot combine() ",
52
-                            "these because one of 'x' or 'y' is missing ",
53
-                            "'se.coef'")
54
-                    }
55
-                se.coef <- NULL
56
-            }
57
-        }
58
-    } else {
59
-        gr <- reduce(c(granges(x), granges(y)), min.gapwidth = 0L)
60
-        I <- lapply(list(x, y), function(xx) {
61
-            findOverlaps(xx, gr, type = "equal", select = "first")
38
+    # Return output matrix
39
+    ans
40
+}
41
+
42
+.combineList_matrix_like <- function(x, x_rowRanges, ans_rowRanges) {
43
+    # Argument checks
44
+    stopifnot(identical(length(x), length(x_rowRanges)))
45
+
46
+    # Construct data frame mapping each sample to an element of x and its
47
+    # corresponding column number.
48
+    x_ncol <- vapply(x, ncol, integer(1L))
49
+    x_samples_df <- data.frame(
50
+        sample = seq(1, sum(x_ncol)),
51
+        x_idx = rep(seq_along(x), times = x_ncol),
52
+        column_idx = unlist(lapply(x_ncol, function(end) seq(1, end))))
53
+
54
+    # Set up intermediate RealizationSink object of appropriate dimensions
55
+    # and type
56
+    # TODO: Modify minfi:::.highestType() to accept a list. Can then do
57
+    #       minfi:::.highestType(x)
58
+    x_types <- vapply(x, DelayedArray::type, character(1L))
59
+    ans_type <- typeof(do.call("c", lapply(x_types, vector)))
60
+    sink <- DelayedArray:::RealizationSink(
61
+        dim = c(length(ans_rowRanges), sum(vapply(x, ncol, integer(1L)))),
62
+        type = ans_type)
63
+    on.exit(close(sink))
64
+
65
+    # Set up ArrayGrid instance over columns of `sink`.
66
+    sink_grid <- minfi:::colGrid(sink)
67
+
68
+    # Loop over column grid of 'sink', identify samples required for that
69
+    # block, bring that data into memory, pass down to .combineList_matrix(),
70
+    # and write result to sink.
71
+    for (k in seq_along(sink_grid)) {
72
+        sink_viewport <- sink_grid[[k]]
73
+        block_samples <- seq(start(sink_viewport)[2L], end(sink_viewport)[2L])
74
+        block_samples_df <- x_samples_df[
75
+            x_samples_df[["sample"]] %in% block_samples, ]
76
+        x_to_load <- unique(block_samples_df[["x_idx"]])
77
+        block_x <- lapply(x_to_load, function(idx) {
78
+            columns_to_load <- block_samples_df[
79
+                block_samples_df[["x_idx"]] == idx, "column_idx"]
80
+            as.matrix(x[[idx]][, columns_to_load, drop = FALSE])
62 81
         })
63
-        ## FIXME: there is no check that the two sampleNames are disjoint.
64
-        sampleNames <- c(sampleNames(x), sampleNames(y))
65
-        X_M <- list(getBSseq(x, "M"), getBSseq(y, "M"))
66
-        if (any(vapply(X_M, .isHDF5ArrayBacked, logical(1L)))) {
67
-            BACKEND_M <- "HDF5Array"
68
-        } else {
69
-            BACKEND_M <- NULL
70
-        }
71
-        # TODO: Figure out if fill should be 0 or 0L based on X_M
72
-        M <- .combineListOfDelayedMatrixObjects(
73
-            X = X_M,
74
-            I = I,
75
-            nrow = length(gr),
76
-            ncol = length(sampleNames),
77
-            dimnames = list(NULL, sampleNames),
78
-            fill = 0L,
79
-            BACKEND = BACKEND_M)
80
-        X_Cov <- list(getBSseq(x, "Cov"), getBSseq(y, "Cov"))
81
-        if (any(vapply(X_Cov, .isHDF5ArrayBacked, logical(1L)))) {
82
-            BACKEND_Cov <- "HDF5Array"
83
-        } else {
84
-            BACKEND_Cov <- NULL
85
-        }
86
-        Cov <- .combineListOfDelayedMatrixObjects(
87
-            X = X_Cov,
88
-            I = I,
89
-            nrow = length(gr),
90
-            ncol = length(sampleNames),
91
-            dimnames = list(NULL, sampleNames),
92
-            fill = 0L,
93
-            BACKEND = BACKEND_Cov)
94
-        if (hasBeenSmoothed(x) || hasBeenSmoothed(y)) {
95
-            warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ",
96
-                    "these because 'x' and 'y' have different rowRanges")
97
-        }
98
-        coef <- NULL
99
-        se.coef <- NULL
100
-        trans <- NULL
101
-        parameters <- NULL
82
+        block_rowRanges <- x_rowRanges[x_to_load]
83
+        block_ans <- .combineList_matrix(
84
+            x = block_x,
85
+            x_rowRanges = block_rowRanges,
86
+            ans_rowRanges = ans_rowRanges)
87
+        write_block_to_sink(block_ans, sink, sink_viewport)
102 88
     }
103
-    BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef,
104
-          pData = pData, trans = trans, parameters = parameters,
105
-          rmZeroCov = FALSE)
89
+
90
+    as(sink, "DelayedArray")
91
+}
92
+
93
+# Exported methods -------------------------------------------------------------
94
+
95
+setMethod("combine", signature(x = "BSseq", y = "BSseq"), function(x, y, ...) {
96
+    combineList(list(x, y))
106 97
 })
107 98
 
99
+# Exported functions -----------------------------------------------------------
100
+
101
+# TODO: Check sampleNames are disjoint
102
+# TODO: Should this have a BACKEND argument?
108 103
 combineList <- function(x, ..., BACKEND = NULL) {
109
-    if (class(x) == "BSseq") {
110
-        x <- list(x, ...)
111
-    }
112
-    stopifnot(all(sapply(x, class) == "BSseq"))
113
-    trans <- getBSseq(x[[1]], "trans")
114
-    sameTrans <- sapply(x[-1], function(xx) {
115
-        all.equal(trans, getBSseq(xx, "trans"))
116
-    })
117
-    if (!all(sameTrans)) {
118
-        stop("all elements of '...' in combineList needs to have the same ",
119
-             "'trans'")
104
+    # Argument checks ----------------------------------------------------------
105
+
106
+    # Check inputs are BSseq objects
107
+    if (is(x, "BSseq")) x <- list(x, ...)
108
+    x_is_BSseq <- vapply(x, is, logical(1L), "BSseq")
109
+    stopifnot(isTRUE(all(x_is_BSseq)))
110
+    # Check inputs are combinable
111
+    x_trans <- lapply(x, getBSseq, "trans")
112
+    x_has_same_trans <- vapply(
113
+        x_trans,
114
+        function(t) isTRUE(all.equal(t, x_trans[[1L]])),
115
+        logical(1L))
116
+    stopifnot(isTRUE(all(x_has_same_trans)))
117
+    x_parameters <- lapply(x, getBSseq, "parameters")
118
+    x_has_same_parameters <- vapply(
119
+        x_parameters,
120
+        function(p) isTRUE(all.equal(p, x_parameters[[1L]])),
121
+        logical(1L))
122
+    stopifnot(isTRUE(all(x_has_same_parameters)))
123
+    # Check if all inputs have the same set of loci
124
+    x_rowRanges <- lapply(x, rowRanges)
125
+    ans_rowRanges <- Reduce(union, x_rowRanges)
126
+    intersect_rowRanges <- Reduce(intersect, x_rowRanges)
127
+    all_x_have_same_loci <- identical(ans_rowRanges, intersect_rowRanges)
128
+    # Check if safe to combine smoothed representations (coef and se.coef).
129
+    # It's only safe if all objects contain the same set of loci.
130
+    x_has_been_smoothed <- vapply(x, hasBeenSmoothed, logical(1L))
131
+    if (all_x_have_same_loci && all(x_has_been_smoothed)) {
132
+        combine_smooths <- TRUE
133
+        ans_trans <- x_trans[[1L]]
134
+        ans_parameters <- x_parameters[[1L]]
135
+    } else {
136
+        if (any(x_has_been_smoothed)) {
137
+            warning("Combining smoothed and unsmoothed BSseq objects. You ",
138
+                    "will need to re-smooth using 'BSmooth()' on the ",
139
+                    "returned object.")
140
+        }
141
+        if (!all_x_have_same_loci && any(x_has_been_smoothed)) {
142
+            warning("Combining BSseq objects with different loci. You ",
143
+                    "will need to re-smooth using 'BSmooth()' on the ",
144
+                    "returned object.")
145
+        }
146
+        combine_smooths <- FALSE
147
+        ans_coef <- NULL
148
+        ans_se.coef <- NULL
149
+        ans_trans <- function(x) NULL
150
+        ans_parameters <- list()
120 151
     }
121
-    parameters <- getBSseq(x[[1]], "parameters")
122
-    same_parameters <- vapply(x, function(xx) {
123
-        identical(getBSseq(xx, "parameters"), parameters)
124
-    }, logical(1L))
125
-    if (!all(same_parameters)) {
126
-        stop("all elements of '...' in combineList needs to have the same ",
127
-             "'parameters'")
152
+
153
+    # Combine elements of BSseq objects ----------------------------------------
154
+
155
+    # Combine colData
156
+    ans_colData <- as(
157
+        Reduce(combine, lapply(x, function(xx) as.data.frame(colData(xx)))),
158
+        "DataFrame")
159
+    # Extract assays to be combined
160
+    # NOTE: Use of `assay(..., withDimnames = FALSE)` is deliberate
161
+    x_M <- lapply(x, assay, "M", withDimnames = FALSE)
162
+    x_Cov <- lapply(x, assay, "Cov", withDimnames = FALSE)
163
+    if (combine_smooths) {
164
+        x_coef <- lapply(x, getBSseq, "coef")
165
+        x_se.coef <- lapply(x, getBSseq, "se.coef")
128 166
     }
129
-    # TODO: Could relax this by sorting all elements of x before doing the
130
-    #       next steps
131
-    gr <- getBSseq(x[[1]], "gr")
132
-    sameGr <- sapply(x[-1], function(xx) {
133
-        identical(gr, getBSseq(xx, "gr"))
134
-    })
135
-    if (all(sameGr)) {
136
-        # NOTE: Don't realize() M, Cov, coef, or se.coef because we are
137
-        #       cbind()-ing, which is very efficient for DelayedArray objects.
138
-        #       E.g., cbind()-ing a bunch of HDF5Matrix objects is super fast
139
-        #       and uses basically no memory whereas realize()-ing the final
140
-        #       object as a new HDF5Matrix can take a long time.
141
-        M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M")))
142
-        Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov")))
143
-        has_been_smoothed <- vapply(x, hasBeenSmoothed, logical(1L))
144
-        if (!all(has_been_smoothed)) {
145
-            if (any(has_been_smoothed)) {
146
-                warning("Setting 'coef' and 'se.coef' to NULL: Cannot ",
147
-                        "combine() these because not all BSseq objects have ",
148
-                        "been smoothed")
167
+    # Combine assay data
168
+    if (all_x_have_same_loci) {
169
+        x_has_same_rowRanges_as_ans_rowRanges <- vapply(
170
+            x_rowRanges,
171
+            function(r) isTRUE(all.equal(r, ans_rowRanges)),
172
+            logical(1L))
173
+        if (all(x_has_same_rowRanges_as_ans_rowRanges)) {
174
+            # If all rowRanges are identical then can just cbind assays.
175
+            ans_M <- do.call(cbind, x_M)
176
+            ans_Cov <- do.call(cbind, x_Cov)
177
+            if (combine_smooths) {
178
+                ans_coef <- do.call(cbind, x_coef)
179
+                ans_se.coef <- do.call(cbind, x_se.coef)
149 180
             }
150
-            coef <- NULL
151
-            se.coef <- NULL
152
-            trans <- NULL
153 181
         } else {
154
-            list_of_trans <- lapply(x, getBSseq, "trans")
155
-            if (!all(vapply(list_of_trans, function(trans) {
156
-                all.equal(trans, list_of_trans[[1]])
157
-            }, logical(1L)))) {
158
-                stop("All BSseq objects need to have the same 'trans'")
159
-            }
160
-            trans <- list_of_trans[[1]]
161
-            list_of_parameters <- lapply(x, getBSseq, "parameters")
162
-            if (!all(vapply(list_of_parameters, function(parameters) {
163
-                identical(parameters, list_of_parameters[[1]])
164
-            }, logical(1L)))) {
165
-                stop("All BSseq objects need to have the same 'parameters'")
166
-            }
167
-            parameters <- list_of_parameters[[1]]
168
-            list_of_coef <- lapply(x, function(xx) getBSseq(xx, "coef"))
169
-            coef <- do.call(cbind, list_of_coef)
170
-            list_of_se.coef <- lapply(x, function(xx) getBSseq(xx, "se.coef"))
171
-            has_se.coef <- !vapply(list_of_se.coef, is.null, logical(1L))
172
-            if (all(has_se.coef)) {
173
-                se.coef <- do.call(cbind, list_of_se.coef)
174
-            } else {
175
-                if (any(has_se.coef) && any(!has_se.coef)) {
176
-                    warning("Setting 'se.coef' to NULL: Cannot combine() ",
177
-                            "these because at least of the BSseq objects ",
178
-                            "is missing 'se.coef'")
179
-                }
180
-                se.coef <- NULL
182
+            # If all rowRanges contain same loci but aren't identical then they
183
+            # must differ by how loci are ordered. So order all inputs using a
184
+            # common order and then cbind assays.
185
+            x_order <- lapply(x_rowRanges, function(rr) {
186
+                seqlevels(rr) <- seqlevels(ans_rowRanges)
187
+                order(rr)
18