Browse code

Merge refactor branch into master

Peter Hickey authored on 17/09/2018 11:17:52
Showing52 changed files

... ...
@@ -8,12 +8,11 @@ Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("aut", "cre"),
8 8
     email = "kasperdanielhansen@gmail.com"),
9 9
     person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"))
10 10
 Depends:
11
-    R (>= 3.3),
11
+    R (>= 3.5),
12 12
     methods,
13 13
     BiocGenerics,
14 14
     GenomicRanges (>= 1.29.14),
15
-    SummarizedExperiment (>= 1.9.18),
16
-    parallel
15
+    SummarizedExperiment (>= 1.9.18)
17 16
 Imports:
18 17
     IRanges (>= 2.11.16),
19 18
     GenomeInfoDb,
... ...
@@ -26,21 +25,31 @@ Imports:
26 25
     data.table,
27 26
     S4Vectors,
28 27
     R.utils (>= 2.0.0),
29
-    DelayedMatrixStats (>= 1.1.12),
28
+    DelayedMatrixStats (>= 1.3.6),
30 29
     permute,
31 30
     limma,
32
-    DelayedArray (>= 0.5.34),
33
-    HDF5Array
31
+    DelayedArray (>= 0.7.15),
32
+    Rcpp,
33
+    BiocParallel,
34
+    BSgenome,
35
+    Biostrings,
36
+    utils,
37
+    HDF5Array,
38
+    beachmat
34 39
 Suggests:
35
-    RUnit,
40
+    testthat,
36 41
     bsseqData,
37 42
     BiocStyle,
38 43
     rmarkdown,
39 44
     knitr,
45
+    Matrix,
46
+    doParallel,
47
+    rtracklayer,
48
+    BSgenome.Hsapiens.UCSC.hg38
40 49
 Collate:
41 50
     utils.R
42 51
     hasGRanges.R
43
-    BSseq_class.R
52
+    BSseq-class.R
44 53
     BSseqTstat_class.R
45 54
     BSseq_utils.R
46 55
     combine.R
... ...
@@ -57,8 +66,11 @@ Collate:
57 66
     BSseqStat_class.R
58 67
     getStats.R
59 68
     hdf5_utils.R
60
-    combine_utils.R
61 69
     DelayedArray_utils.R
70
+    collapseBSseq.R
71
+    FWIRanges-class.R
72
+    FWGRanges-class.R
73
+    findLoci.R
62 74
 License: Artistic-2.0
63 75
 VignetteBuilder: knitr
64 76
 URL: https://github.com/kasperdanielhansen/bsseq
... ...
@@ -66,3 +78,7 @@ LazyData: yes
66 78
 LazyDataCompression: xz
67 79
 BugReports: https://github.com/kasperdanielhansen/bsseq/issues
68 80
 biocViews: DNAMethylation
81
+SystemRequirements: C++11
82
+LinkingTo: Rcpp, Rhdf5lib, beachmat
83
+NeedsCompilation: yes
84
+RoxygenNote: 6.1.0
... ...
@@ -3,48 +3,55 @@
3 3
 ##
4 4
 
5 5
 import(methods)
6
+import(S4Vectors)
7
+import(IRanges)
8
+import(GenomicRanges)
9
+import(SummarizedExperiment)
10
+import(DelayedArray)
11
+import(HDF5Array)
12
+import(BiocParallel)
13
+import(limma)
6 14
 importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames",
7 15
            "combine", "density", "intersect", "lapply", "ncol",
8 16
            "nrow", "order", "paste", "pmax", "pmin", "rbind",
9 17
            "Reduce", "rep.int", "rownames", "sapply", "setdiff",
10
-           "strand", "strand<-", "union", "unique", "updateObject")
18
+           "strand", "strand<-", "union", "unique", "updateObject", "unstrand")
11 19
 importFrom(stats, "approxfun", "fisher.test", "ppoints",
12 20
            "predict", "preplot", "qchisq",
13
-           "qnorm", "qqplot", "qunif", "cov2cor",
14
-           "plogis")
21
+           "qqplot", "qunif", "cov2cor",
22
+           "setNames")
15 23
 importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
16 24
            "mtext", "par", "plot", "points", "polygon", "rect", "rug", "text")
17 25
 import(parallel)
18 26
 importFrom(locfit, "locfit", "lp")
19 27
 importFrom(DelayedMatrixStats, "rowSds", "rowVars", "colMeans2", "colSums2",
20
-           "rowSums2", "rowMeans2")
21
-import(IRanges)
22
-import(GenomicRanges)
23
-import(SummarizedExperiment)
28
+           "rowSums2", "rowMeans2", "rowAlls", "rowsum", "colsum")
29
+
24 30
 importFrom(scales, "alpha")
25 31
 importClassesFrom(Biobase, "AnnotatedDataFrame")
26 32
 importMethodsFrom(Biobase, "annotatedDataFrameFrom",
27 33
                   "pData", "pData<-",
28 34
                   "sampleNames", "sampleNames<-")
29 35
 importFrom(Biobase, "validMsg")
30
-importMethodsFrom(GenomeInfoDb, "seqlengths", "seqlengths<-", "seqinfo", "seqinfo<-",
31
-                  "seqnames", "seqnames<-", "seqlevels", "seqlevels<-")
32
-import(S4Vectors)
36
+importMethodsFrom(GenomeInfoDb, "seqlengths", "seqlengths<-", "seqinfo",
37
+                  "seqinfo<-", "seqnames", "seqnames<-", "seqlevels",
38
+                  "seqlevels<-", "sortSeqlevels")
39
+importFrom(GenomeInfoDb, "Seqinfo")
33 40
 importFrom(gtools, "combinations")
34
-importFrom(data.table, "fread", "setnames")
35
-importFrom(R.utils, "isGzipped", "gunzip")
36
-import(limma)
41
+# NOTE: data.table has some NAMESPACE clashes with functions in Bioconductor,
42
+#       e.g., shift(). If new ones are discovered, add them to this list.
43
+import(data.table, except = c(shift, first, second))
37 44
 importFrom(permute, "shuffleSet", "how")
38
-importClassesFrom(DelayedArray, "DelayedArray", "DelayedMatrix")
39
-importFrom(DelayedArray, "DelayedArray", "plogis", "pmin2", "pmax2", "rowMaxs",
40
-           "rowMins")
41
-importClassesFrom(HDF5Array, "HDF5Array")
45
+importFrom(Biostrings, "DNAString", "vmatchPattern", "reverseComplement")
46
+importFrom(utils, "read.delim")
47
+importFrom(BSgenome, "vmatchPattern")
48
+importFrom(tools, "file_path_as_absolute")
49
+importFrom(R.utils, "isGzipped", "isBzipped", "gunzip", "bunzip2")
42 50
 
43 51
 ##
44 52
 ## Exporting
45 53
 ##
46 54
 
47
-
48 55
 exportClasses("hasGRanges",
49 56
               "BSseq",
50 57
               "BSseqTstat",
... ...
@@ -74,7 +81,8 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
74 81
        "read.umtab", "read.umtab2", "read.bsmooth", "read.bismark",
75 82
        "poissonGoodnessOfFit", "binomialGoodnessOfFit",
76 83
        "data.frame2GRanges", "BSseqTstat",
77
-       "BSseqStat")
84
+       "BSseqStat",
85
+       "findLoci")
78 86
 
79 87
 S3method("print", "chisqGoodnessOfFit")
80 88
 S3method("plot", "chisqGoodnessOfFit")
... ...
@@ -86,3 +94,5 @@ S3method("plot", "BSseqTstat")
86 94
 
87 95
 exportMethods("assays", "assayNames")
88 96
 
97
+# C++ code registration
98
+useDynLib(bsseq, .registration = TRUE, .fixes = "cxx_")
... ...
@@ -1,3 +1,15 @@
1
+# Internal functions -----------------------------------------------------------
2
+
3
+.rowTickmarks <- function(hasGRanges, maxGap) {
4
+    gr <- granges(hasGRanges)
5
+    # NOTE: This relies on 'gr' being sorted, so really want to be sure of this!
6
+    stopifnot(!is.unsorted(gr))
7
+    clusters <- reduce(gr, min.gapwidth = maxGap, with.revmap = TRUE)
8
+    cumsum(lengths(mcols(clusters)[["revmap"]]))
9
+}
10
+
11
+# TODO: Remove other uses of this function, if possible.
12
+# TODO: Rename to .makeClusters() since it's an internal function
1 13
 makeClusters <- function(hasGRanges, maxGap = 10^8) {
2 14
     chrOrder <- as.character(runValue(seqnames(hasGRanges)))
3 15
     if(anyDuplicated(chrOrder))
... ...
@@ -18,136 +30,310 @@ makeClusters <- function(hasGRanges, maxGap = 10^8) {
18 30
     clusterIdx
19 31
 }
20 32
 
33
+.BSmooth <- function(b, M, Cov, pos, coef_sink, se.coef_sink, sink_lock, grid,
34
+                     pos_grid, ns, h, keep.se) {
35
+    # Load packages on worker (required for SnowParam) -------------------------
36
+
37
+    suppressPackageStartupMessages({
38
+        library(BiocParallel)
39
+    })
40
+    suppressPackageStartupMessages({
41
+        library(locfit)
42
+    })
43
+    suppressPackageStartupMessages({
44
+        library(DelayedArray)
45
+    })
46
+
47
+    # Construct inputs for smoothing -------------------------------------------
48
+
49
+    # NOTE: 'bb' indexes over elements of pos_grid by cycling `ncol(M)` times
50
+    #       over 1, ..., nrow(pos_grid).
51
+    bb <- (b - 1L) %% nrow(pos_grid) + 1L
52
+    # NOTE: unname() is necessary because M and Cov may carry colnames
53
+    sdata <- data.frame(
54
+        pos = read_block(pos, pos_grid[[bb]]),
55
+        M = unname(read_block(M, grid[[b]])),
56
+        Cov = unname(read_block(Cov, grid[[b]])))
57
+    # Ensure 0 < M < Cov to avoid boundary issues (only relevant at loci with
58
+    # non-zero coverage, so doesn't matter what M is for these loci).
59
+    sdata[["M"]] <- pmin(pmax(sdata[["M"]], 0.01), pmax(sdata[["Cov"]] - 0.01))
60
+    n_loci <- nrow(sdata)
61
+    n_loci_with_non_zero_coverage <- sum(sdata[["Cov"]] > 0)
21 62
 
22
-BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"),
23
-                    mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) {
24
-    smooth <- function(idxes, sampleIdx) {
25
-        ## Assuming that idxes is a set of indexes into the BSseq object
26
-        ## sampleIdx is a single character
27
-        this_sample_chr <- c(sampleNames(BSseq)[sampleIdx],
28
-                             as.character(seqnames(BSseq)[idxes[1]]))
29
-        if(verbose >= 2)
30
-            cat(sprintf("[BSmooth]   smoothing start: sample:%s, chr:%s, nLoci:%s\n",
31
-                        this_sample_chr[1], this_sample_chr[2], length(idxes)))
32
-        Cov <- unname(as.array(
33
-            getCoverage(BSseq, type = "Cov")[idxes, sampleIdx]))
34
-        M <- unname(as.array(
35
-            getCoverage(BSseq, type = "M")[idxes, sampleIdx]))
36
-        pos <- start(BSseq)[idxes]
37
-        stopifnot(all(diff(pos) > 0))
38
-        wh <- which(Cov != 0)
39
-        nn <- ns / length(wh)
40
-        if(length(wh) <= ns) {
41
-            if(keep.se)
42
-                se.coef <- rep(NA_real_, length(Cov))
43
-            else
44
-                se.coef <- NULL
45
-            return(list(coef = rep(NA_real_, length(Cov)),
46
-                        se.coef = se.coef,
47
-                        trans = NULL, h = h, nn = nn))
63
+    # Fit local binomial regression model --------------------------------------
64
+
65
+    # NOTE: Require (ns + 1) loci with non-zero coverage to run smooth.
66
+    if (n_loci_with_non_zero_coverage <= ns) {
67
+        coef <- rep(NA_real_, n_loci)
68
+        if (keep.se) {
69
+            se.coef <- rep(NA_real_, n_loci)
70
+        } else {
71
+            se.coef <- NULL
48 72
         }
49
-        sdata <- data.frame(pos = pos[wh],
50
-                            M = pmin(pmax(M[wh], 0.01),
51
-                                     Cov[wh] - 0.01),
52
-                            Cov = Cov[wh])
53
-        fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata,
54
-                      weights = Cov, family = "binomial", maxk = 10000)
55
-        pp <- preplot(fit, where = "data", band = "local",
56
-                      newdata = data.frame(pos = pos))
57
-        if(keep.se) {
73
+    } else {
74
+        # Set 'nearest neighbour' smoothing parameter.
75
+        nn <- ns / n_loci_with_non_zero_coverage
76
+        # Fit model only using loci with non-zero coverage.
77
+        fit <- locfit(
78
+            M ~ lp(pos, nn = nn, h = h),
79
+            data = sdata,
80
+            weights = Cov,
81
+            family = "binomial",
82
+            subset = Cov > 0,
83
+            maxk = 10000)
84
+        # Evaluate smooth at all loci (regardless of coverage).
85
+        pp <- preplot(
86
+            object = fit,
87
+            newdata = sdata["pos"],
88
+            band = "local")
89
+        coef <- pp$fit
90
+        if (keep.se) {
58 91
             se.coef <- pp$se.fit
59 92
         } else {
60 93
             se.coef <- NULL
61 94
         }
62
-        if(verbose >= 2)
63
-            cat(sprintf("[BSmooth]   smoothing end: sample:%s, chr:%s, nLoci:%s, nCoveredLoci:%s\n",
64
-                        this_sample_chr[1], this_sample_chr[2], length(idxes), nrow(sdata)))
65
-        return(list(coef = pp$fit, se.coef = se.coef,
66
-                    trans = pp$trans, h = h, nn = nn))
67 95
     }
68
-    stopifnot(class(BSseq) == "BSseq")
69
-    parallelBy <- match.arg(parallelBy)
70
-    if(verbose) cat("[BSmooth] preprocessing ... ")
71
-    ptime1 <- proc.time()
72
-    clusterIdx <- makeClusters(BSseq, maxGap = maxGap)
73
-    ptime2 <- proc.time()
74
-    stime <- (ptime2 - ptime1)[3]
75
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
76
-
77
-    sampleNames <- sampleNames(BSseq)
78
-    names(sampleNames) <- sampleNames
79
-
80
-    ptime.outer1 <- proc.time()
81
-    switch(parallelBy, "sample" = {
82
-        if(verbose) cat(sprintf("[BSmooth] smoothing by 'sample' (mc.cores = %d, mc.preschedule = %s)\n",
83
-                                mc.cores, mc.preschedule))
84
-        out <- mclapply(seq(along = sampleNames), function(sIdx) {
85
-            ptime1 <- proc.time()
86
-            tmp <- lapply(clusterIdx, function(jj) {
87
-                try(smooth(idxes = jj, sampleIdx = sIdx))
88
-            })
89
-            coef <- do.call(c, lapply(tmp, function(xx) xx$coef))
90
-            se.coef <- do.call(c, lapply(tmp, function(xx) xx$se.coef))
91
-            ptime2 <- proc.time()
92
-            stime <- (ptime2 - ptime1)[3]
93
-            if(verbose) {
94
-                cat(sprintf("[BSmooth] sample %s (out of %d), done in %.1f sec\n",
95
-                            sampleNames[sIdx], length(sampleNames), stime))
96
+
97
+    # Return the results of the smooth or write them to the RealizationSink ----
98
+
99
+    if (is.null(coef_sink)) {
100
+        return(list(coef = coef, se.coef = se.coef))
101
+    }
102
+    # Write to coef_sink and se.coef_sink while respecting the IPC lock.
103
+    ipclock(sink_lock)
104
+    write_block(x = coef_sink, viewport = grid[[b]], block = as.matrix(coef))
105
+    if (keep.se) {
106
+        write_block(
107
+            x = se.coef_sink,
108
+            viewport = grid[[b]],
109
+            block = as.matrix(se.coef))
110
+    }
111
+    ipcunlock(sink_lock)
112
+    NULL
113
+}
114
+
115
+# Exported functions -----------------------------------------------------------
116
+
117
+# TODO: verbose = TRUE should report timings.
118
+# TODO: BSmooth() should warn if BSseq object contains mix of strands.
119
+# TODO: Consider having BSmooth() create a 'smoothed' assay in addition to or
120
+#       instead of the 'coef' and 'se.coef' assays.
121
+BSmooth <- function(BSseq,
122
+                    ns = 70,
123
+                    h = 1000,
124
+                    maxGap = 10^8,
125
+                    keep.se = FALSE,
126
+                    BPPARAM = bpparam(),
127
+                    chunkdim = NULL,
128
+                    level = NULL,
129
+                    verbose = getOption("verbose")) {
130
+
131
+    # Argument checks-----------------------------------------------------------
132
+
133
+    # Check validity of 'BSseq' argument
134
+    if (!is(BSseq, "BSseq")) {
135
+        stop("'BSseq' must be a BSseq object.")
136
+    }
137
+    if (!isTRUE(all(width(BSseq) == 1L))) {
138
+        stop("All loci in 'BSseq' must have width == 1.")
139
+    }
140
+    if (is.unsorted(BSseq)) {
141
+        stop("'BSseq' must be sorted before smoothing. Use 'sort(BSseq)'.")
142
+    }
143
+    stopifnot(isTRUEorFALSE(keep.se))
144
+    # Set appropriate BACKEND and check compatability with BPPARAM.
145
+    BACKEND <- .getBSseqBackends(BSseq)
146
+    if (!.areBackendsInMemory(BACKEND)) {
147
+        if (!.isSingleMachineBackend(BPPARAM)) {
148
+            stop("The parallelisation strategy must use a single machine ",
149
+                 "when using an on-disk realization backend.\n",
150
+                 "See help(\"BSmooth\") for details.",
151
+                 call. = FALSE)
152
+        }
153
+    } else {
154
+        if (!is.null(BACKEND)) {
155
+            # NOTE: Currently do not support any in-memory realization
156
+            #       backends. If 'BACKEND' is NULL then an ordinary matrix
157
+            #       is returned rather than a matrix-backed DelayedMatrix.
158
+            stop("The '", BACKEND, "' realization backend is not ",
159
+                 "supported.\n",
160
+                 "See help(\"BSmooth\") for details.",
161
+                 call. = FALSE)
162
+        }
163
+    }
164
+    # If using HDF5Array, check if BSseq object is updateable.
165
+    if (identical(BACKEND, "HDF5Array")) {
166
+        is_BSseq_updateable <- .isHDF5BackedBSseqUpdatable(BSseq)
167
+        if (is_BSseq_updateable) {
168
+            h5_path <- path(assay(BSseq, withDimnames = FALSE))
169
+            if (any(c("coef", "se.coef") %in% rhdf5::h5ls(h5_path)[, "name"])) {
170
+                # TODO: Better error message; what should be done in this case?
171
+                stop(wmsg("The HDF5 file '", h5_path, "' already contains a ",
172
+                          "dataset named 'coef' or 'se.coef'."))
96 173
             }
97
-            return(list(coef = coef, se.coef = se.coef))
98
-        }, mc.preschedule = mc.preschedule, mc.cores = mc.cores)
99
-        if(any(sapply(out, is, class2 = "try-error")))
100
-            stop("BSmooth encountered smoothing errors")
101
-        coef <- do.call(cbind, lapply(out, function(xx) xx$coef))
102
-        se.coef <- do.call(cbind, lapply(out, function(xx) xx$se.coef))
103
-    }, "chromosome" = {
104
-        if(verbose) cat(sprintf("[BSmooth] smoothing by 'chromosome' (mc.cores = %d, mc.preschedule = %s)\n",
105
-                                mc.cores, mc.preschedule))
106
-        out <- mclapply(1:length(clusterIdx), function(ii) {
107
-            ptime1 <- proc.time()
108
-            tmp <- lapply(seq(along = sampleNames), function(sIdx) {
109
-                smooth(idxes = clusterIdx[[ii]], sampleIdx = sIdx)
110
-            })
111
-            coef <- do.call(cbind, lapply(tmp, function(xx) xx$coef))
112
-            se.coef <- do.call(cbind, lapply(tmp, function(xx) xx$se.coef))
113
-            ptime2 <- proc.time()
114
-            stime <- (ptime2 - ptime1)[3]
115
-            if(verbose)
116
-                cat(sprintf("[BSmooth] chr idx %d (out of %d), done in %.1f sec\n",
117
-                            ii, length(clusterIdx), stime))
118
-            return(list(coef = coef, se.coef = se.coef))
119
-        }, mc.preschedule = mc.preschedule, mc.cores = mc.cores)
120
-        if(any(sapply(out, is, class2 = "try-error")))
121
-            stop("BSmooth encountered smoothing errors")
122
-        coef <- do.call(rbind, lapply(out, function(xx) xx$coef))
123
-        se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef))
124
-    })
125
-    coef <- .DelayedMatrix(coef)
126
-    if (!is.null(se.coef)) {
127
-        se.coef <- .DelayedMatrix(se.coef)
174
+        } else {
175
+            h5_path <- NULL
176
+            warning(
177
+                wmsg("'BSseq' was not created with either read.bismark() or ",
178
+                     "HDF5Array::saveHDF5SummarizedExperiment(). BSmooth() is ",
179
+                     "using automatically created HDF5 file(s) (see ",
180
+                     "?HDF5Array::setHDF5DumpFile) to store smoothing result. ",
181
+                     "You will need to run ",
182
+                     "HDF5Array::saveHDF5SummarizedExperiment() on the ",
183
+                     "returned object if you wish to save the returned ",
184
+                     "object."))
185
+        }
186
+    }
187
+    stopifnot(isTRUEorFALSE(verbose))
188
+
189
+    # Smoothing ----------------------------------------------------------------
190
+
191
+    # Extract pieces of BSseq object required for smoothing
192
+    M <- getCoverage(BSseq, type = "M", withDimnames = FALSE)
193
+    Cov <- getCoverage(BSseq, type = "Cov", withDimnames = FALSE)
194
+    pos <- matrix(start(BSseq), ncol = 1)
195
+    # Set up ArrayGrid so that each block contains data for a single sample and
196
+    # single cluster of loci.
197
+    row_tickmarks <- .rowTickmarks(BSseq, maxGap)
198
+    col_tickmarks <- seq_len(ncol(M))
199
+    grid <- ArbitraryArrayGrid(list(row_tickmarks, col_tickmarks))
200
+    # Set up "parallel" ArrayGrid over pos
201
+    pos_grid <- ArbitraryArrayGrid(list(row_tickmarks, 1L))
202
+    # Construct RealizationSink objects (as required)
203
+    if (is.null(BACKEND)) {
204
+        coef_sink <- NULL
205
+        se.coef_sink <- NULL
206
+        sink_lock <- NULL
207
+    } else if (BACKEND == "HDF5Array") {
208
+        coef_sink <- HDF5RealizationSink(
209
+            dim = dim(M),
210
+            dimnames = NULL,
211
+            type = "double",
212
+            filepath = h5_path,
213
+            name = "coef",
214
+            chunkdim = chunkdim,
215
+            level = level)
216
+        on.exit(close(coef_sink), add = TRUE)
217
+        sink_lock <- ipcid()
218
+        on.exit(ipcremove(sink_lock), add = TRUE)
219
+        if (keep.se) {
220
+            se.coef_sink <- HDF5RealizationSink(
221
+                dim = dim(M),
222
+                dimnames = NULL,
223
+                type = "double",
224
+                filepath = h5_path,
225
+                name = "se.coef",
226
+                chunkdim = chunkdim,
227
+                level = level)
228
+            on.exit(close(se.coef_sink), add = TRUE)
229
+        } else {
230
+            se.coef_sink <- NULL
231
+        }
232
+    } else {
233
+        # TODO: This branch should probably never be entered because we
234
+        #       (implicitly) only support in-memory or HDF5Array backends.
235
+        #       However, we retain it for now (e.g., fstArray backend would
236
+        #       use this until a dedicated branch was implemented).
237
+        coef_sink <- DelayedArray:::RealizationSink(dim(M), type = "double")
238
+        on.exit(close(coef_sink), add = TRUE)
239
+        sink_lock <- ipcid()
240
+        on.exit(ipcremove(sink_lock), add = TRUE)
241
+        if (keep.se) {
242
+            se.coef_sink <- DelayedArray:::RealizationSink(
243
+                dim(M),
244
+                type = "double")
245
+            on.exit(close(se.coef_sink), add = TRUE)
246
+        } else {
247
+            se.coef_sink <- NULL
248
+        }
249
+    }
250
+
251
+    # Set number of tasks to ensure the progress bar gives frequent updates.
252
+    # NOTE: The progress bar increments once per task
253
+    #       (https://github.com/Bioconductor/BiocParallel/issues/54).
254
+    #       Although it is somewhat of a bad idea to overrides a user-specified
255
+    #       bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
256
+    #       performance in this instance, and so we opt for a useful progress
257
+    #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
258
+    #       bptasks<-() method.
259
+    if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
260
+        bptasks(BPPARAM) <- length(grid)
128 261
     }
129
-    ptime.outer2 <- proc.time()
130
-    stime.outer <- (ptime.outer2 - ptime.outer1)[3]
131
-    if(verbose)
132
-        cat(sprintf("[BSmooth] smoothing done in %.1f sec\n", stime.outer))
133
-
134
-    rownames(coef) <- NULL
135
-    colnames(coef) <- sampleNames(BSseq)
136
-    if(!is.null(se.coef)) {
137
-        rownames(se.coef) <- NULL
138
-        colnames(se.coef) <- sampleNames(BSseq)
262
+    smooth <- bptry(bplapply(
263
+        X = seq_along(grid),
264
+        FUN = .BSmooth,
265
+        M = M,
266
+        Cov = Cov,
267
+        pos = pos,
268
+        coef_sink = coef_sink,
269
+        se.coef_sink = se.coef_sink,
270
+        sink_lock = sink_lock,
271
+        grid = grid,
272
+        pos_grid = pos_grid,
273
+        ns = ns,
274
+        h = h,
275
+        keep.se = keep.se,
276
+        BPPARAM = BPPARAM))
277
+    if (!all(bpok(smooth))) {
278
+        stop("BSmooth() encountered errors: ",
279
+             sum(!bpok(smooth)), " of ", length(smooth),
280
+             " smoothing tasks failed.")
281
+    }
282
+    # Construct coef and se.coef from results of smooth().
283
+    if (is.null(BACKEND)) {
284
+        # Returning matrix objects.
285
+        coef <- do.call(c, lapply(smooth, "[[", "coef"))
286
+        attr(coef, "dim") <- dim(M)
287
+        if (keep.se) {
288
+            se.coef <- do.call(c, lapply(smooth, "[[", "se.coef"))
289
+            attr(se.coef, "dim") <- dim(M)
290
+        } else {
291
+            se.coef <- NULL
292
+        }
293
+    } else {
294
+        # Returning DelayedMatrix objects.
295
+        coef <- as(coef_sink, "DelayedArray")
296
+        if (keep.se) {
297
+            se.coef <- as(se.coef_sink, "DelayedArray")
298
+        } else {
299
+            se.coef <- NULL
300
+        }
139 301
     }
140 302
 
141
-    if(!is.null(coef))
142
-        assay(BSseq, "coef") <- coef
143
-    if(!is.null(se.coef))
144
-        assay(BSseq, "se.coef") <- se.coef
145
-    BSseq@trans <- plogis
303
+    # Construct BSseq object, saving it if it is HDF5-backed -------------------
146 304
 
147
-    parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap),
148
-                       ns = ns, h = h, maxGap = maxGap)
149
-    BSseq@parameters <- parameters
305
+    # NOTE: Using BiocGenerics:::replaceSlots(..., check = FALSE) to avoid
306
+    #       triggering the validity method.
307
+    assays <- c(assays(BSseq, withDimnames = FALSE)[c("M", "Cov")],
308
+                SimpleList(coef = coef))
309
+    if (keep.se) {
310
+        assays <- c(assays, SimpleList(se.coef = se.coef))
311
+    }
312
+    BSseq <- BiocGenerics:::replaceSlots(
313
+        object = BSseq,
314
+        assays = Assays(assays),
315
+        trans = plogis,
316
+        parameters = list(
317
+            smoothText = sprintf(
318
+                "BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap),
319
+            ns = ns,
320
+            h = h,
321
+            maxGap = maxGap),
322
+        check = FALSE)
323
+    if (identical(BACKEND, "HDF5Array") && is_BSseq_updateable) {
324
+        # NOTE: Save BSseq object; mimicing
325
+        #       HDF5Array::saveHDF5SummarizedExperiment().
326
+        dir <- dirname(h5_path)
327
+        x <- BSseq
328
+        x@assays <- HDF5Array:::.shorten_h5_paths(x@assays)
329
+        saveRDS(x, file = file.path(dir, "se.rds"))
330
+    }
150 331
     BSseq
151 332
 }
152 333
 
334
+# TODOs ------------------------------------------------------------------------
153 335
 
336
+# TODO: Use the logging facilities of BiocParallel. This is a longterm goal.
337
+#       For example, we could set custom messages within .BSmooth() using the
338
+#       futile.logger syntax; see the BiocParalell vignette 'Errors, Logs and
339
+#       Debugging in BiocParallel'.
154 340
new file mode 100644
... ...
@@ -0,0 +1,389 @@
1
+# Exported classes -------------------------------------------------------------
2
+
3
+# NOTE: This create a 'classGeneratorFunction' (internal constructor), .BSseq()
4
+.BSseq <- setClass(
5
+    "BSseq",
6
+    slots = representation(
7
+        trans = "function",
8
+        parameters = "list"),
9
+    contains = "RangedSummarizedExperiment"
10
+)
11
+
12
+# Validity methods -------------------------------------------------------------
13
+
14
+.checkAssayNames <- function(object, names) {
15
+    if (all(names %in% assayNames(object))) {
16
+        return(NULL)
17
+    } else {
18
+        paste0(
19
+            "object of class ",
20
+            class(object),
21
+            " needs to have assay slots with names ",
22
+            paste0(names, collapse = ", "))
23
+
24
+    }
25
+}
26
+
27
+.checkMandCov <- function(M, Cov) {
28
+    msg <- NULL
29
+    validMsg(msg, .Call(cxx_check_M_and_Cov, M, Cov))
30
+}
31
+
32
+# TODO: Benchmark validity method
33
+setValidity2("BSseq", function(object) {
34
+    msg <- NULL
35
+
36
+    if (identical(object, .BSseq())) {
37
+        # No validity checks for object returned by internal constructor
38
+        return(msg)
39
+    }
40
+    msg <- validMsg(msg, .checkAssayNames(object, c("Cov", "M")))
41
+
42
+    msg <- validMsg(
43
+        msg = msg,
44
+        result = .checkMandCov(
45
+            M = assay(object, "M", withDimnames = FALSE),
46
+            Cov = assay(object, "Cov", withDimnames = FALSE)))
47
+
48
+    if (is.null(msg)) {
49
+        TRUE
50
+    } else {
51
+        msg
52
+    }
53
+})
54
+
55
+# Internal functions -----------------------------------------------------------
56
+
57
+.oldTrans <- function(x) {
58
+    y <- x
59
+    ix <- which(x < 0)
60
+    ix2 <- which(x > 0)
61
+    y[ix] <- exp(x[ix])/(1 + exp(x[ix]))
62
+    y[ix2] <- 1/(1 + exp(-x[ix2]))
63
+    y
64
+}
65
+
66
+# Exported functions -----------------------------------------------------------
67
+
68
+# TODO: BSseq() is arguably a bad constructor. It doesn't return a valid BSseq
69
+#       object when called without any arguments. It also does some pretty
70
+#       complicated parsing of the inputs. But we're stuck with it because it's
71
+#       been around for a long time.
72
+BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
73
+                  trans = NULL, parameters = NULL, pData = NULL, gr = NULL,
74
+                  pos = NULL, chr = NULL, sampleNames = NULL,
75
+                  rmZeroCov = FALSE) {
76
+
77
+    # Argument checks ----------------------------------------------------------
78
+
79
+    # Process assays.
80
+    # NOTE: Nothing to do for 'coef', and 'se.coef'.
81
+    if (is.null(M) || is.null(Cov)) {
82
+        stop("Need 'M' and 'Cov'.")
83
+    }
84
+    # Process 'trans' and 'parameters'.
85
+    if (is.null(trans)) {
86
+        trans <- function(x) NULL
87
+    }
88
+    if (is.null(parameters)) {
89
+        parameters <- list()
90
+    }
91
+    # Process 'sampleNames' and 'pData'.
92
+    if (is.null(sampleNames)) {
93
+        if (is.null(pData)) {
94
+            # BSseq object will have no colnames.
95
+            pData <- S4Vectors:::new_DataFrame(nrows = ncol(M))
96
+        } else {
97
+            # BSseq object will have 'sampleNames' as colnames.
98
+            pData <- DataFrame(row.names = sampleNames)
99
+        }
100
+    } else {
101
+        if (is.null(pData)) {
102
+            # BSseq object will have 'sampleNames' as colnames.
103
+            pData <- DataFrame(row.names = sampleNames)
104
+        } else {
105
+            if (is.null(rownames(pData))) {
106
+                rownames(pData) <- sampleNames
107
+            } else {
108
+                stopifnot(identical(rownames(pData), sampleNames))
109
+            }
110
+        }
111
+    }
112
+    # Process 'gr', 'pos', and 'chr'.
113
+    if (is.null(gr)) {
114
+        if (is.null(pos) || is.null(chr)) {
115
+            stop("Need 'pos' and 'chr' if 'gr' not supplied.")
116
+        }
117
+        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1L))
118
+    }
119
+    if (!is(gr, "GRanges")) {
120
+        stop("'gr' needs to be a GRanges.")
121
+    }
122
+    # Process 'rmZeroCov'.
123
+    stopifnot(isTRUEorFALSE(rmZeroCov))
124
+
125
+    # Collapse duplicate loci --------------------------------------------------
126
+
127
+    is_duplicated <- duplicated(gr)
128
+    if (any(is_duplicated)) {
129
+        warning("Detected duplicate loci. Collapsing counts in 'M' and 'Cov' ",
130
+                "at these positions.")
131
+        if (!is.null(coef) || !is.null(se.coef)) {
132
+            stop("Cannot collapse when 'coef' or 'se.coef' are non-NULL.")
133
+        }
134
+        loci <- gr[!is_duplicated]
135
+        ol <- findOverlaps(gr, loci, type = "equal")
136
+        M <- rowsum(x = M, group = subjectHits(ol), reorder = FALSE)
137
+        rownames(M) <- NULL
138
+        Cov <- rowsum(x = Cov, group = subjectHits(ol), reorder = FALSE)
139
+        rownames(Cov) <- NULL
140
+    } else {
141
+        loci <- gr
142
+    }
143
+
144
+    # Optionally, remove positions with zero coverage --------------------------
145
+
146
+    if (rmZeroCov) {
147
+        loci_with_zero_cov <- rowAlls(Cov, value = 0)
148
+        if (any(loci_with_zero_cov)) {
149
+            loci_with_nonzero_cov <- !loci_with_zero_cov
150
+            gr <- gr[loci_with_nonzero_cov]
151
+            M <- M[loci_with_nonzero_cov, , drop = FALSE]
152
+            Cov <- Cov[loci_with_nonzero_cov, , drop = FALSE]
153
+            coef <- coef[loci_with_nonzero_cov, , drop = FALSE]
154
+            se.coef <- se.coef[loci_with_nonzero_cov, , drop = FALSE]
155
+        }
156
+    }
157
+
158
+    # Construct BSseq object ---------------------------------------------------
159
+
160
+    assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
161
+    assays <- assays[!S4Vectors:::sapply_isNULL(assays)]
162
+    se <- SummarizedExperiment(
163
+        assays = assays,
164
+        rowRanges = loci,
165
+        colData = pData)
166
+    .BSseq(se, trans = trans, parameters = parameters)
167
+}
168
+
169
+# Move to BSseq-utils?
170
+hasBeenSmoothed <- function(BSseq) {
171
+    "coef" %in% assayNames(BSseq)
172
+}
173
+
174
+# Move to BSseq-utils?
175
+getBSseq <- function(BSseq,
176
+                     type = c("Cov", "M", "gr", "coef", "se.coef", "trans",
177
+                              "parameters"),
178
+                     withDimnames = TRUE) {
179
+    type <- match.arg(type)
180
+    if (type %in% c("M", "Cov")) {
181
+        return(assay(BSseq, type, withDimnames = withDimnames))
182
+    }
183
+    if (type %in% c("coef", "se.coef")) {
184
+        if (type %in% assayNames(BSseq)) {
185
+            return(assay(BSseq, type, withDimnames = withDimnames))
186
+        } else {
187
+            return(NULL)
188
+        }
189
+    }
190
+    if (type == "trans") {
191
+        return(BSseq@trans)
192
+    }
193
+    if (type == "parameters") {
194
+        return(BSseq@parameters)
195
+    }
196
+    if (type == "gr") {
197
+        return(BSseq@rowRanges)
198
+    }
199
+}
200
+
201
+# Move to BSseq-utils?
202
+strandCollapse <- function(BSseq, shift = TRUE, BPPARAM = bpparam(),
203
+                           BACKEND = getRealizationBackend(),
204
+                           dir = tempfile("BSseq"), replace = FALSE,
205
+                           chunkdim = NULL, level = NULL,
206
+                           type = c("double", "integer")) {
207
+
208
+    # Argument checks ----------------------------------------------------------
209
+
210
+    if (all(runValue(strand(BSseq)) == "*")) {
211
+        warning("All loci are unstranded, nothing to collapse.", call. = FALSE)
212
+        return(BSseq)
213
+    }
214
+    if (!(all(runValue(strand(BSseq)) %in% c("+", "-")))) {
215
+        stop("'BSseq' object has a mix of stranded and unstranded loci.")
216
+    }
217
+    # Register 'BACKEND' and return to current value on exit.
218
+    # TODO: Is this strictly necessary?
219
+    current_BACKEND <- getRealizationBackend()
220
+    on.exit(setRealizationBackend(current_BACKEND), add = TRUE)
221
+    setRealizationBackend(BACKEND)
222
+    # Check compatability of 'BPPARAM' with 'BACKEND'.
223
+    if (!.areBackendsInMemory(BACKEND)) {
224
+        if (!.isSingleMachineBackend(BPPARAM)) {
225
+            stop("The parallelisation strategy must use a single machine ",
226
+                 "when using an on-disk realization backend.\n",
227
+                 "See help(\"read.bismark\") for details.",
228
+                 call. = FALSE)
229
+        }
230
+    } else {
231
+        if (!is.null(BACKEND)) {
232
+            # NOTE: Currently do not support any in-memory realization
233
+            #       backends. If the realization backend is NULL then an
234
+            #       ordinary matrix is returned rather than a matrix-backed
235
+            #       DelayedMatrix.
236
+            stop("The '", BACKEND, "' realization backend is not supported.",
237
+                 "\n  See help(\"read.bismark\") for details.",
238
+                 call. = FALSE)
239
+        }
240
+    }
241
+    # If using HDF5Array as BACKEND, check remaining options are sensible.
242
+    if (identical(BACKEND, "HDF5Array")) {
243
+        # NOTE: Most of this copied from
244
+        #       HDF5Array::saveHDF5SummarizedExperiment().
245
+        if (!isSingleString(dir)) {
246
+            stop(wmsg("'dir' must be a single string specifying the path to ",
247
+                      "the directory where to save the BSseq object (the ",
248
+                      "directory will be created)."))
249
+        }
250
+        if (!isTRUEorFALSE(replace)) {
251
+            stop("'replace' must be TRUE or FALSE")
252
+        }
253
+        HDF5Array:::.create_dir(dir = dir, replace = replace)
254
+        h5_path <- file.path(dir, "assays.h5")
255
+    } else if (identical(BACKEND, NULL)) {
256
+        h5_path <- NULL
257
+    }
258
+
259
+    # Collapse loci ------------------------------------------------------------
260
+
261
+    loci <- rowRanges(BSseq)
262
+    if (shift) {
263
+        loci <- shift(loci, shift = as.integer(-1L * (strand(loci) == "-")))
264
+    }
265
+    collapsed_loci <- reduce(loci, min.gapwidth = 0L, ignore.strand = TRUE)
266
+
267
+    # Collapse 'M' and 'Cov' matrices ------------------------------------------
268
+
269
+    ol <- findOverlaps(loci, collapsed_loci, type = "equal")
270
+    group <- subjectHits(ol)
271
+    M <- rowsum(
272
+        x = assay(BSseq, "M", withDimnames = FALSE),
273
+        group = group,
274
+        # NOTE: reorder = TRUE to ensure same row-order as collapsed_loci.
275
+        reorder = TRUE,
276
+        BPPARAM = BPPARAM,
277
+        filepath = h5_path,
278
+        name = "Cov",
279
+        chunkdim = chunkdim,
280
+        level = level,
281
+        type = type)
282
+    Cov <- rowsum(
283
+        x = assay(BSseq, "Cov", withDimnames = FALSE),
284
+        group = group,
285
+        # NOTE: reorder = TRUE to ensure same row-order as collapsed_loci.
286
+        reorder = TRUE,
287
+        BPPARAM = BPPARAM,
288
+        filepath = h5_path,
289
+        name = "Cov",
290
+        chunkdim = chunkdim,
291
+        level = level,
292
+        type = type)
293
+
294
+    # Construct BSseq object, saving it if it is HDF5-backed -------------------
295
+
296
+    se <- SummarizedExperiment(
297
+        assays = SimpleList(M = unname(M), Cov = unname(Cov)),
298
+        rowRanges = collapsed_loci,
299
+        colData = colData(BSseq))
300
+    # TODO: Is there a way to use the internal constructor with `check = FALSE`?
301
+    #       Assuming input was valid, the output is valid, too.
302
+    # .BSseq(se, trans = function(x) NULL, parameters = list())
303
+    bsseq <- new2("BSseq", se, check = FALSE)
304
+    if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
305
+        # NOTE: Save BSseq object; mimicing
306
+        #       HDF5Array::saveHDF5SummarizedExperiment().
307
+        x <- bsseq
308
+        x@assays <- HDF5Array:::.shorten_h5_paths(x@assays)
309
+        saveRDS(x, file = file.path(dir, "se.rds"))
310
+    }
311
+    bsseq
312
+}
313
+
314
+# Exported methods -------------------------------------------------------------
315
+
316
+setMethod("show", signature(object = "BSseq"), function(object) {
317
+    cat("An object of type 'BSseq' with\n")
318
+    cat(" ", nrow(object), "methylation loci\n")
319
+    cat(" ", ncol(object), "samples\n")
320
+    if (hasBeenSmoothed(object)) {
321
+        cat("has been smoothed with\n")
322
+        cat(" ", object@parameters$smoothText, "\n")
323
+    } else {
324
+        cat("has not been smoothed\n")
325
+    }
326
+    if (.isHDF5ArrayBacked(object)) {
327
+        cat("Some assays are HDF5Array-backed\n")
328
+    } else {
329
+        cat("All assays are in-memory\n")
330
+    }
331
+})
332
+
333
+setMethod("pData", "BSseq", function(object) {
334
+    object@colData
335
+})
336
+
337
+setReplaceMethod(
338
+    "pData",
339
+    signature = signature(object = "BSseq", value = "data.frame"),
340
+    function(object, value) {
341
+        colData(object) <- as(value, "DataFrame")
342
+        object
343
+    }
344
+)
345
+
346
+setReplaceMethod(
347
+    "pData",
348
+    signature = signature(object = "BSseq", value = "DataFrame"),
349
+    function(object, value) {
350
+        colData(object) <- value
351
+        object
352
+    }
353
+)
354
+
355
+setMethod("sampleNames", "BSseq", function(object) colnames(object))
356
+
357
+setReplaceMethod(
358
+    "sampleNames",
359
+    signature = signature(object = "BSseq", value = "ANY"),
360
+    function(object, value) {
361
+        colnames(object) <- value
362
+        object
363
+    }
364
+)
365
+
366
+setMethod("updateObject", "BSseq",
367
+          function(object, ...) {
368
+              # NOTE: identical() is too strong
369
+              if (hasBeenSmoothed(object) &
370
+                  isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
371
+                  object@trans <- plogis
372
+              }
373
+              if (is(object, "SummarizedExperiment")) {
374
+                  # NOTE: Call method for SummarizedExperiment objects
375
+                  object <- callNextMethod()
376
+                  return(object)
377
+              } else {
378
+                  BSseq(
379
+                      gr = object@gr,
380
+                      M = object@M,
381
+                      Cov = object@Cov,
382
+                      coef = object@coef,
383
+                      se.coef = object@se.coef,
384
+                      trans = object@trans,
385
+                      parameters = object@parameters,
386
+                      pData = object@phenoData@data)
387
+              }
388
+          }
389
+)
0 390
deleted file mode 100644
... ...
@@ -1,355 +0,0 @@
1
-setClass("BSseq", contains = "RangedSummarizedExperiment",
2
-         representation(trans = "function",
3
-                        parameters = "list"))
4
-
5
-setValidity("BSseq", function(object) {
6
-    msg <- validMsg(NULL, .checkAssayNames(object, c("Cov", "M")))
7
-    msg <- validMsg(msg, .checkAssayClasses(object,
8
-                                            c("Cov", "M", "coef", "se.coef")))
9
-    if(class(rowRanges(object)) != "GRanges")
10
-        msg <- validMsg(msg, sprintf("object of class '%s' needs to have a 'GRanges' in slot 'rowRanges'", class(object)))
11
-    ## benchmarking shows that min(assay()) < 0 is faster than any(assay() < 0) if it is false
12
-    if(is.null(colnames(object)))
13
-        msg <- validMsg(msg, "colnames (aka sampleNames) need to be set")
14
-    M <- assay(object, "M", withDimnames = FALSE)
15
-    Cov <- assay(object, "Cov", withDimnames = FALSE)
16
-    if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) {
17
-        ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so
18
-        ##       we skip it for the time being
19
-        if(min(assay(object, "M", withDimnames = FALSE)) < 0)
20
-            msg <- validMsg(msg, "the 'M' assay has negative entries")
21
-        if(min(assay(object, "Cov", withDimnames = FALSE)) < 0)
22
-            msg <- validMsg(msg, "the 'Cov' assay has negative entries")
23
-        if(max(assay(object, "M", withDimnames = FALSE) -
24
-               assay(object, "Cov", withDimnames = FALSE)) > 0.5)
25
-            msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay")
26
-    }
27
-    if(!is.null(rownames(M)) || !is.null(rownames(Cov)) ||
28
-       ("coef" %in% assayNames(object) && !is.null(rownames(assay(object, "coef")))) ||
29
-       ("se.coef" %in% assayNames(object) && !is.null(rownames(assay(object, "se.coef")))))
30
-        warning("unnecessary rownames in object")
31
-    if (is.null(msg)) TRUE else msg
32
-})
33
-
34
-setMethod("show", signature(object = "BSseq"),
35
-          function(object) {
36
-              cat("An object of type 'BSseq' with\n")
37
-              cat(" ", nrow(object), "methylation loci\n")
38
-              cat(" ", ncol(object), "samples\n")
39
-              if(hasBeenSmoothed(object)) {
40
-                  cat("has been smoothed with\n")
41
-                  cat(" ", object@parameters$smoothText, "\n")
42
-              } else {
43
-                  cat("has not been smoothed\n")
44
-              }
45
-              if (.isHDF5ArrayBacked(object)) {
46
-                  cat("Some assays are HDF5Array-backed\n")
47
-              } else {
48
-                  cat("All assays are in-memory\n")
49
-              }
50
-          })
51
-
52
-setMethod("pData", "BSseq", function(object) {
53
-    object@colData
54
-})
55
-
56
-setReplaceMethod("pData",
57
-                 signature = signature(
58
-                      object = "BSseq",
59
-                      value = "data.frame"),
60
-                 function(object, value) {
61
-                     colData(object) <- as(value, "DataFrame")
62
-                     object
63
-                 })
64
-
65
-setReplaceMethod("pData",
66
-                 signature = signature(
67
-                      object = "BSseq",
68
-                      value = "DataFrame"),
69
-                 function(object, value) {
70
-                     colData(object) <- value
71
-                     object
72
-                 })
73
-
74
-setMethod("sampleNames", "BSseq", function(object) {
75
-    colnames(object)
76
-})
77
-
78
-setReplaceMethod("sampleNames",
79
-                 signature = signature(
80
-                     object = "BSseq",
81
-                     value = "ANY"),
82
-                 function(object, value) {
83
-                     colnames(object) <- value
84
-                     object
85
-                 })
86
-
87
-setMethod("length", "BSseq", function(x) {
88
-    length(granges(x))
89
-})
90
-
91
-hasBeenSmoothed <- function(BSseq) {
92
-    "coef" %in% assayNames(BSseq)
93
-}
94
-
95
-getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans", "parameters")) {
96
-    type <- match.arg(type)
97
-    if(type %in% c("M", "Cov"))
98
-        return(assay(BSseq, type))
99
-    if(type %in% c("coef", "se.coef") && type %in% assayNames(BSseq))
100
-        return(assay(BSseq, type))
101
-    if(type %in% c("coef", "se.coef"))
102
-       return(NULL)
103
-    if(type == "trans")
104
-        return(BSseq@trans)
105
-    if(type == "parameters")
106
-        return(BSseq@parameters)
107
-    if(type == "gr")
108
-        return(BSseq@rowRanges)
109
-
110
-}
111
-
112
-BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
113
-                  trans = NULL, parameters = NULL, pData = NULL,
114
-                  gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL,
115
-                  rmZeroCov = FALSE) {
116
-    if(is.null(gr)) {
117
-        if(is.null(pos) || is.null(chr))
118
-            stop("Need pos and chr")
119
-        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1))
120
-    }
121
-    if(!is(gr, "GRanges"))
122
-        stop("'gr' needs to be a GRanges")
123
-    if(any(width(gr) != 1))
124
-        stop("'gr' needs to have widths of 1")
125
-    if(is.null(M) || is.null(Cov))
126
-        stop("Need M and Cov")
127
-    M <- .DelayedMatrix(M)
128
-    Cov <- .DelayedMatrix(Cov)
129
-    if (!is.null(coef)) {
130
-        coef <- .DelayedMatrix(coef)
131
-    }
132
-    if (!is.null(se.coef)) {
133
-        se.coef <- .DelayedMatrix(se.coef)
134
-    }
135
-    if(length(gr) != nrow(M) ||
136
-       length(gr) != nrow(Cov) ||
137
-       ncol(Cov) != ncol(M))
138
-        stop("'gr', 'M' and 'Cov' need to have similar dimensions")
139
-    if(!is.null(rownames(M)))
140
-        rownames(M) <- NULL
141
-    if(!is.null(rownames(Cov)))
142
-        rownames(Cov) <- NULL
143
-    if(!is.null(names(gr)))
144
-        names(gr) <- NULL
145
-    ## deal with sampleNames
146
-    if(!is(pData, "DataFrame"))
147
-        pData <- as(pData, "DataFrame")
148
-    if(is.null(sampleNames) && !is.null(pData) && !is.null(rownames(pData)))
149
-        sampleNames <- rownames(pData)
150
-    if(is.null(sampleNames) && !is.null(colnames(M)))
151
-        sampleNames <- colnames(M)
152
-    if(is.null(sampleNames) && !is.null(colnames(Cov)))
153
-        sampleNames <- colnames(Cov)
154
-    if(is.null(sampleNames))
155
-        sampleNames <- paste("V", 1:ncol(M), sep = "")
156
-    if(length(unique(sampleNames)) != ncol(M))
157
-        stop("sampleNames need to be unique and of the right length.")
158
-    ## check that 0 <= M <= Cov and remove positions with Cov = 0
159
-    if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) {
160
-        ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so
161
-        ##       we skip it for the time being
162
-        if (any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) ||
163
-            any(is.infinite(Cov))) {
164
-            stop("'M' and 'Cov' may not contain NA or infinite values and ",
165
-                 "0 <= M <= Cov")
166
-        }
167
-    }
168
-    if(rmZeroCov) {
169
-        wh <- which(rowSums2(Cov) == 0)
170
-        if(length(wh) > 0) {
171
-            gr <- gr[-wh]
172
-            M <- M[-wh,,drop = FALSE]
173
-            Cov <- Cov[-wh,,drop = FALSE]
174
-        }
175
-    }
176
-    grR <- reduce(gr, min.gapwidth = 0L)
177
-    if(!identical(grR, gr)) {
178
-        ## Now we either need to re-order or collapse or both
179
-        mm <- as.matrix(findOverlaps(grR, gr))
180
-        mm <- mm[order(mm[,1]),]
181
-        if(length(grR) == length(gr)) {
182
-            ## only re-ordering is necessary
183
-            gr <- grR
184
-            M <- M[mm[,2],,drop = FALSE]
185
-            Cov <- Cov[mm[,2],,drop = FALSE]
186
-            if(!is.null(coef))
187
-                coef <- coef[mm[,2],,drop = FALSE]
188
-            if(!is.null(se.coef))
189
-                se.coef <- se.coef[mm[,2],, drop = FALSE]
190
-        } else {
191
-            warning("multiple positions, collapsing BSseq object\n")
192
-            if(!is.null(coef) || !is.null(se.coef))
193
-                stop("Cannot collapse when 'coef' or 'se.coef' are present")
194
-            gr <- grR
195
-            sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))]
196
-            names(sp) <- NULL
197
-            # TODO: .collapseDelayedMatrix() always return numeric; it may be
198
-            #       worth coercing M and Cov to integer DelayedMatrix objects,
199
-            #       which would halve storage requirements and impose some more
200
-            #       structure on the BSseq class (via new validity method
201
-            #       checks)
202
-            # NOTE: Tries to be smart about how collapsed DelayedMatrix should
203
-            #       be realized
204
-            if (.isHDF5ArrayBacked(M)) {
205
-                M_BACKEND <- "HDF5Array"
206
-            } else {
207
-                M_BACKEND <- NULL
208
-            }
209
-            M <- .collapseDelayedMatrix(x = M,
210
-                                        sp = sp,
211
-                                        MARGIN = 2,
212
-                                        BACKEND = M_BACKEND)
213
-            if (.isHDF5ArrayBacked(Cov)) {
214
-                Cov_BACKEND <- "HDF5Array"
215
-            } else {
216
-                Cov_BACKEND <- NULL
217
-            }
218
-            Cov <- .collapseDelayedMatrix(x = Cov,
219
-                                          sp = sp,
220
-                                          MARGIN = 2,
221
-                                          BACKEND = Cov_BACKEND)
222
-        }
223
-    }
224
-    if(is.null(colnames(M)) || any(sampleNames != colnames(M)))
225
-        colnames(M) <- sampleNames
226
-    if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov)))
227
-        colnames(Cov) <- sampleNames
228
-    if(!is.null(coef)) {
229
-
230
-        if(nrow(coef) != nrow(M) ||
231
-           ncol(coef) != ncol(M))
232
-            stop("'coef' does not have the right dimensions")
233
-        if(is.null(colnames(coef)) || any(sampleNames != colnames(coef)))
234
-            colnames(coef) <- sampleNames
235
-        if(!is.null(rownames(coef)))
236
-            rownames(coef) <- NULL
237
-    }
238
-    if(!is.null(se.coef)) {
239
-        if(nrow(se.coef) != nrow(M) ||
240
-           ncol(se.coef) != ncol(M))
241
-            stop("'se.coef' does not have the right dimensions")
242
-        if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef)))
243
-            colnames(se.coef) <- sampleNames
244
-        if(!is.null(rownames(se.coef)))
245
-            rownames(se.coef) <- NULL
246
-    }
247
-    assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
248
-    assays <- assays[!sapply(assays, is.null)]
249
-    if(is.null(pData) || all(dim(pData) == c(0,0)))
250
-        BSseq <- SummarizedExperiment(assays = assays, rowRanges = gr)
251
-    else
252
-        BSseq <- SummarizedExperiment(assays = assays, rowRanges = gr, colData = pData)
253
-    BSseq <- as(BSseq, "BSseq")
254
-    if(is.function(trans))
255
-        BSseq@trans <- trans
256
-    if(is.list(parameters))
257
-        BSseq@parameters <- parameters
258
-    BSseq
259
-}
260
-
261
-setMethod("updateObject", "BSseq",
262
-          function(object, ...) {
263
-              # NOTE: identical() is too strong
264
-              if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
265
-                  object@trans <- plogis
266
-              }
267
-              if (.hasSlot(object, "assays")) {
268
-                  # call method for RangedSummarizedExperiment objects
269
-                  object <- callNextMethod()
270
-                  assays(object) <- endoapply(
271
-                      assays(object, withDimnames = FALSE), function(assay) {
272
-                          if (is.null(assay)) {
273
-                              return(assay)
274
-                          } else {
275
-                              .DelayedMatrix(assay)
276
-                          }
277
-                      })
278
-                  object
279
-              } else {
280
-                  BSseq(gr = object@gr, M = object@M, Cov = object@Cov,
281
-                        coef = object@coef, se.coef = object@se.coef,
282
-                        trans = object@trans, parameters = object@parameters,
283
-                        pData = object@phenoData@data)
284
-              }
285
-          })
286
-
287
-
288
-strandCollapse <- function(BSseq, shift = TRUE) {
289
-    if(all(runValue(strand(BSseq)) == "*")) {
290
-        warning("All loci are unstranded; nothing to collapse")
291
-        return(BSseq)
292
-    }
293
-    if(!(all(runValue(strand(BSseq)) %in% c("+", "-"))))
294
-        stop("'BSseq' object has a mix of stranded and unstranded loci.")
295
-    BS.forward <- BSseq[strand(BSseq) == "+"]
296
-    strand(BS.forward) <- "*"
297
-    BS.reverse <- BSseq[strand(BSseq) == "-"]
298
-    strand(BS.reverse) <- "*"
299
-    if(shift)
300
-        rowRanges(BS.reverse) <- shift(granges(BS.reverse), -1L)
301
-    sampleNames(BS.reverse) <- paste0(sampleNames(BS.reverse), "_REVERSE")
302
-    BS.comb <- combine(BS.forward, BS.reverse)
303
-    newBSseq <- collapseBSseq(BS.comb, columns = rep(sampleNames(BSseq), 2))
304
-    newBSseq
305
-}
306
-
307
-## getD <- function(data, sample1, sample2, type = c("raw", "fit"),
308
-##                  addPositions = FALSE, alpha = 0.95) {
309
-##     d.conf <- function(p1, n1, p2, n2) {
310
-##         ## Method 10 from Newcombe (Stat in Med, 1998)
311
-##         getRoots <- function(p, n) {
312
-##             mat <- cbind(p^2, -(2*p + z^2/n), 1+z^2/n)
313
-##             roots <- t(Re(apply(mat, 1, polyroot)))
314
-##             roots
315
-##         }
316
-##         roots1 <- roots2 <- matrix(NA_real_, nrow = length(p1), ncol = 2)
317
-##         idx <- !is.na(p1)
318
-##         roots1[idx,] <- getRoots(p1[idx], n1[idx])
319
-##         idx <- !is.na(p2)
320
-##         roots2[idx,] <- getRoots(p2[idx], n2[idx])
321
-##         d <- p1 - p2
322
-##         suppressWarnings({
323
-##             lower <- d - z * sqrt(roots1[,1]*(1-roots1[,1])/n1 + roots2[,2]*(1-roots2[,2])/n2)
324
-##             upper <- d + z * sqrt(roots1[,2]*(1-roots1[,2])/n1 + roots2[,1]*(1-roots2[,1])/n2)
325
-##         })
326
-##         return(data.frame(d = d, lower = lower, upper = upper))
327
-##     }
328
-##     type <- match.arg(type)
329
-##     z <- abs(qnorm((1-alpha)/2, mean = 0, sd = 1))
330
-##     switch(type,
331
-##            raw = {
332
-##                conf <- d.conf(p1 = data$M[, sample1] / data$Cov[, sample1],
333
-##                               n1 = data$Cov[, sample1],
334
-##                               p2 = data$M[, sample2] / data$Cov[, sample2],
335
-##                               n2 = data$Cov[, sample2])
336
-##                out <- conf
337
-##            },
338
-##            fit = {
339
-##                p1 <- data$trans(data$coef[, sample1])
340
-##                p2 <- data$trans(data$coef[, sample2])
341
-##                d <- p1 - p2
342
-##                ## Based on the delta method
343
-##                se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 +
344
-##                             (data$se.coef[, sample2] * p2 * (1-p2))^2)
345
-
346
-##                lower <- d - z * se.d
347
-##                upper <- d + z * se.d
348
-##                out <- data.frame(d = d, lower = lower, upper = upper)
349
-##            })
350
-##     if(addPositions) {
351
-##         out$pos <- start(data$gr)
352
-##         out$chr <- seqnames(data$gr)
353
-##     }
354
-##     out
355
-## }
... ...
@@ -1,68 +1,26 @@
1
-collapseBSseq <- function(BSseq, columns) {
2
-    ## columns is a vector of new names, names(columns) is sampleNames or empty
3
-    stopifnot(is.character(columns))
4
-    if(is.null(names(columns)) && length(columns) != ncol(BSseq))
5
-        stop("if `columns' does not have names, it needs to be of the same length as `BSseq` has columns (samples)")
6
-    if(!is.null(names(columns)) && !all(names(columns) %in% sampleNames(BSseq)))
7
-        stop("if `columns` has names, they need to be sampleNames(BSseq)")
8
-    if(is.null(names(columns)))
9
-        columns.idx <- 1:ncol(BSseq)
10
-    else
11
-        columns.idx <- match(names(columns), sampleNames(BSseq))
12
-    sp <- split(columns.idx, columns)
13
-    # TODO: .collapseDelayedMatrix() always return numeric; it may be
14
-    #       worth coercing M and Cov to integer DelayedMatrix objects,
15
-    #       which would halve storage requirements and impose some more
16
-    #       structure on the BSseq class (via new validity method
17
-    #       checks)
18
-    # NOTE: Tries to be smart about how collapsed DelayedMatrix should
19
-    #       be realized
20
-    M <- getBSseq(BSseq, "M")
21
-    if (.isHDF5ArrayBacked(M)) {
22
-        M_BACKEND <- "HDF5Array"
23
-    } else {
24
-        M_BACKEND <- NULL
25
-    }
26
-    M <- .collapseDelayedMatrix(x = M,
27
-                                sp = sp,
28
-                                MARGIN = 1,
29
-                                BACKEND = M_BACKEND)
30
-    Cov <- getBSseq(BSseq, "Cov")
31
-    if (.isHDF5ArrayBacked(Cov)) {
32
-        Cov_BACKEND <- "HDF5Array"
33
-    } else {
34
-        Cov_BACKEND <- NULL
35
-    }
36
-    Cov <- .collapseDelayedMatrix(x = Cov,
37
-                                  sp = sp,
38
-                                  MARGIN = 1,
39
-                                  BACKEND = Cov_BACKEND)
40
-    BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov, sampleNames = names(sp))
41
-}
42
-
43 1
 chrSelectBSseq <- function(BSseq, seqnames = NULL, order = FALSE) {
44 2
     seqlevels(BSseq, pruning.mode = "coarse") <- seqnames
45
-    if(order)
46
-        BSseq <- orderBSseq(BSseq, seqOrder = seqnames)
3
+    if (order) BSseq <- orderBSseq(BSseq, seqOrder = seqnames)
47 4
     BSseq
48 5
 }
49 6
 
50
-
51 7
 orderBSseq <- function(BSseq, seqOrder = NULL) {
52
-    if(!is.null(seqOrder))
8
+    if (!is.null(seqOrder)) {
53 9
         seqlevels(BSseq, pruning.mode = "coarse") <- seqOrder
10
+    }
54 11
     BSseq[order(granges(BSseq))]
55 12
 }
56 13
 
57
-
58 14
 # TODO: getMeth() realises the result in memory iff regions is not NULL;
59 15
 #       discuss with Kasper
60 16
 # TODO: Whether or not colnames are added to returned value depends on whether
61 17
 #       regions is non-NULL; discuss with Kasper
62 18
 # TODO: Add parallel support
19
+# TODO: Document withDimnames
20
+# TODO: Should is be an explicit withDimnames arg or simply passed via `...`?
63 21
 getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
64 22
                     what = c("perBase", "perRegion"), confint = FALSE,
65
-                    alpha = 0.95) {
23
+                    alpha = 0.95, withDimnames = TRUE) {
66 24
     p.conf <- function(p, n, alpha) {
67 25
         z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
68 26
         upper <- (p + z ^ 2 / (2 * n) +
... ...
@@ -85,21 +43,21 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
85 43
     }
86 44
     z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
87 45
     if (is.null(regions) && type == "smooth") {
88
-        coef <- getBSseq(BSseq, type = "coef")
89
-        meth <- getBSseq(BSseq, type = "trans")(coef)
46
+        coef <- getBSseq(BSseq, "coef", withDimnames)
47
+        meth <- getBSseq(BSseq, "trans", withDimnames)(coef)
90 48
         if (confint) {
91
-            upper <- meth + z * getBSseq(BSseq, type = "se.coef")
92
-            lower <- meth - z * getBSseq(BSseq, type = "se.coef")
49
+            upper <- meth + z * getBSseq(BSseq, "se.coef", withDimnames)
50
+            lower <- meth - z * getBSseq(BSseq, "se.coef", withDimnames)
93 51
             return(list(meth = meth, lower = lower, upper = upper))
94 52
         } else {
95 53
             return(meth)
96 54
         }
97 55
     }
98 56
     if (is.null(regions) && type == "raw") {
99
-        meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov")
57
+        meth <- getBSseq(BSseq, "M", withDimnames) /
58
+            getBSseq(BSseq, "Cov", withDimnames)
100 59
         if (confint) {
101
-            return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"),
102
-                          alpha = alpha))
60
+            return(p.conf(meth, getBSseq(BSseq, "Cov", withDimnames), alpha))
103 61
         } else {
104 62
             return(meth)
105 63
         }
... ...
@@ -119,12 +77,14 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
119 77
     #       chunks if what = perRegion
120 78
     if (type == "smooth") {
121 79
         meth <- as.matrix(
122
-            getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef"))[queryHits(ov), ,
123
-                                                              drop = FALSE])
80
+            getBSseq(BSseq, "trans", withDimnames)(
81
+                getBSseq(BSseq, "coef", withDimnames))[
82
+                    queryHits(ov), , drop = FALSE])
124 83
     } else if (type == "raw") {
125 84
         meth <- as.matrix(
126
-            (getBSseq(BSseq, "M") / getBSseq(BSseq, "Cov"))[queryHits(ov), ,
127
-                                                            drop = FALSE])
85
+            (getBSseq(BSseq, "M", withDimnames) /
86
+                 getBSseq(BSseq, "Cov", withDimnames))[
87
+                     queryHits(ov), , drop = FALSE])
128 88
     }
129 89
     out <- lapply(split(meth, subjectHits(ov)), matrix, ncol = ncol(meth))
130 90
     if (what == "perBase") {
... ...
@@ -138,9 +98,9 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
138 98
         # TODO: Don't really understand the logic of the remaining code; how
139 99
         #       could the rows end up in the wrong order?
140 100
         outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
141
-        colnames(outMatrix) <- sampleNames(BSseq)
101
+        if (withDimnames) colnames(outMatrix) <- sampleNames(BSseq)
142 102
         outMatrix[as.integer(rownames(out)), ] <- out
143
-        .DelayedMatrix(outMatrix)
103
+        outMatrix
144 104
     }
145 105
 }
146 106
 
... ...
@@ -148,21 +108,24 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
148 108
 #       discuss with Kasper
149 109
 # TODO: Whether or not colnames are added to returned value depends on whether
150 110
 #       regions is non-NULL; discuss with Kasper
111
+# TODO: Document withDimnames
112
+# TODO: Should is be an explicit withDimnames arg or simply passed via `...`?
151 113
 getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
152 114
                         what = c("perBase", "perRegionAverage",
153
-                                 "perRegionTotal")) {
115
+                                 "perRegionTotal"),
116
+                        withDimnames = TRUE) {
154 117
     stopifnot(is(BSseq, "BSseq"))
155 118
     type <- match.arg(type)
156 119
     what <- match.arg(what)
157 120
     if (is.null(regions)) {
158 121
         if (what == "perBase") {
159
-            return(getBSseq(BSseq, type = type))
122
+            return(getBSseq(BSseq, type, withDimnames))
160 123
         }
161 124
         if (what == "perRegionTotal") {
162
-            return(colSums2(getBSseq(BSseq, type = type)))
125
+            return(colSums2(getBSseq(BSseq, type, withDimnames)))
163 126
         }
164 127
         if (what == "perRegionAverage") {
165
-            return(colMeans2(getBSseq(BSseq, type = type)))
128
+            return(colMeans2(getBSseq(BSseq, type, withDimnames)))
166 129
         }
167 130
     }
168 131
     if (class(regions) == "data.frame") {
... ...
@@ -171,11 +134,8 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
171 134
     stopifnot(is(regions, "GenomicRanges"))
172 135
     grBSseq <- granges(BSseq)
173 136
     ov <- findOverlaps(grBSseq, regions)
174
-    if (type == "Cov") {
175
-        coverage <- getBSseq(BSseq, "Cov")[queryHits(ov), , drop = FALSE]
176
-    } else if (type == "M") {
177
-        coverage <- getBSseq(BSseq, "M")[queryHits(ov), , drop = FALSE]
178
-    }
137
+    coverage <- getBSseq(BSseq, type, withDimnames)[
138
+        queryHits(ov), , drop = FALSE]
179 139
     out <- lapply(split(coverage, subjectHits(ov)), matrix,
180 140
                   ncol = ncol(coverage))
181 141
 
... ...
@@ -193,7 +153,7 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
193 153
     # TODO: Don't really understand the logic of the remaining code; how
194 154
     #       could the rows end up in the wrong order?
195 155
     outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
196
-    colnames(outMatrix) <- sampleNames(BSseq)
197
-    outMatrix[as.integer(rownames(out)),] <- out
198
-    .DelayedMatrix(outMatrix)
156
+    if (withDimnames) colnames(outMatrix) <- sampleNames(BSseq)
157
+    outMatrix[as.integer(rownames(out)), ] <- out
158
+    outMatrix
199 159
 }
... ...
@@ -44,48 +44,116 @@
44 44
     is(x@seed, "matrix")
45 45
 }
46 46
 
47
-# NOTE: Equivalent to rowSums2(x[, j, drop = FALSE]) but does it using a
48
-#       delayed operation and always returns a nrow(x) x 1 DelayedMatrix
49
-.delayed_rowSums2 <- function(x, j) {
50
-    Reduce(`+`, lapply(j, function(jj) x[, jj, drop = FALSE]))
47
+.zero_type <- function(type) {
48
+    if (identical(type, "integer")) {
49
+        fill <- 0L
50
+    } else if (identical(type, "double")) {
51
+        fill <- 0
52
+    } else {
53
+        stop("'type' = ", type, " is not supported")
54
+    }
51 55
 }
52 56
 
53
-# NOTE: Equivalent to colSums2(x[i, , drop = FALSE]) but does it using a
54
-#       delayed operation and always returns a 1 x ncol(x) DelayedMatrix
55
-.delayed_colSums2 <- function(x, i) {
56
-    Reduce(`+`, lapply(i, function(ii) x[ii, , drop = FALSE]))
57
-}
57
+# Missing methods --------------------------------------------------------------
58
+
59
+# NOTE: Copied from minfi
60
+# TODO: Perhaps move this to DelayedMatrixStats?
61
+# TODO: DelayedArray::type() for all RealizationSink subclasses
62
+setMethod("type", "HDF5RealizationSink", function(x) {
63
+    x@type
64
+})
65
+# NOTE: Copied from minfi
66
+# TODO: Perhaps move this to DelayedMatrixStats?
67
+setMethod("type", "arrayRealizationSink", function(x) {
68
+    DelayedArray::type(x@result_envir$result)
69
+})
70
+# NOTE: Copied from minfi
71
+# TODO: Perhaps move this to DelayedMatrixStats?
72
+setMethod("type", "RleRealizationSink", function(x) {
73
+    x@type
74
+})
75
+# NOTE: Copied from minfi
76
+# TODO: Perhaps move this to DelayedMatrixStats?
77
+# TODO: dimnames() for all RealizationSink subclasses
78
+setMethod("dimnames", "arrayRealizationSink", function(x) {
79
+    dimnames(x@result_envir$result)
80
+})
81
+
82
+# Advanced block processing routines -------------------------------------------
58 83
 
59
-# MARGIN = 1: collapse using rowSums
60
-# MARGIN = 2: collapse using colSums
61
-.collapseDelayedMatrix <- function(x, sp, MARGIN, BACKEND = NULL) {
62
-    stopifnot(is(x, "DelayedMatrix"))
63
-    if (MARGIN == 1) {
64
-        if (is.null(BACKEND)) {
65
-            collapsed_x <- do.call(cbind, lapply(sp, function(j) {
66
-                rowSums2(x[, j, drop = FALSE])
67
-            }))
68
-        } else {
69
-            collapsed_x <- do.call(cbind, lapply(sp, function(j) {
70
-                .delayed_rowSums2(x, j)
71
-            }))
72
-            # NOTE: Need to manually add colnames when using this method
73
-            colnames(collapsed_x) <- names(sp)
84
+# NOTE: Copy of minfi:::blockApplyWithRealization()
85
+# TODO: Perhaps move this to DelayedMatrixStats?
86
+# NOTE: DelayedArray::blockApply() with the option to write the blocks to
87
+#       'sink'. Useful, for example, to apply a function across column-blocks
88
+#       of a DelayedMatrix, write these results to disk, and then wrap
89
+#       these in a DelayedMatrix.
90
+# TODO: See https://github.com/Bioconductor/DelayedArray/issues/10
91
+blockApplyWithRealization <- function(x, FUN, ..., sink = NULL, x_grid = NULL,
92
+                                      sink_grid = NULL, BPREDO = list(),
93
+                                      BPPARAM = bpparam()) {
94
+    FUN <- match.fun(FUN)
95
+
96
+    # Check conformable dots_grids and sinks_grids
97
+    x_grid <- DelayedArray:::.normarg_grid(x_grid, x)
98
+    sink_grid <- DelayedArray:::.normarg_grid(sink_grid, sink)
99
+    if (!identical(dim(x_grid), dim(sink_grid))) {
100
+        stop("non-conformable 'x_grid' and 'sink_grid'")
101
+    }
102
+
103
+    # Loop over blocks of `x` and write to `sink`
104
+    nblock <- length(x_grid)
105
+    bplapply(seq_len(nblock), function(b) {
106
+        if (DelayedArray:::get_verbose_block_processing()) {
107
+            message("Processing block ", b, "/", nblock, " ... ",
108
+                    appendLF = FALSE)
74 109
         }
75
-    } else if (MARGIN == 2) {
76
-        if (is.null(BACKEND)) {
77
-            collapsed_x <- do.call(rbind, lapply(sp, function(i) {
78
-                colSums2(x[i, , drop = FALSE])
79
-            }))
80
-        } else {
81
-            collapsed_x <- do.call(rbind, lapply(sp, function(i) {
82
-                .delayed_colSums2(x, i)
83
-            }))
84
-            # NOTE: Need to manually add rownames when using this method
85
-            rownames(collapsed_x) <- names(sp)
110
+        x_viewport <- x_grid[[b]]
111
+        sink_viewport <- sink_grid[[b]]
112
+        block <- read_block(x, x_viewport)
113
+        attr(block, "from_grid") <- x_grid
114
+        attr(block, "block_id") <- b
115
+        block_ans <- FUN(block, ...)
116
+        # NOTE: This is the only part different from DelayedArray::blockApply()
117
+        if (!is.null(sink)) {
118
+            write_block(x = sink, viewport = sink_viewport, block = block_ans)
119
+            block_ans <- NULL
86 120
         }
87
-    } else {
88
-        stop("'MARGIN' must be 1 or 2")
121
+        if (DelayedArray:::get_verbose_block_processing()) {
122
+            message("OK")
123
+        }
124
+    },
125
+    BPREDO = BPREDO,
126
+    BPPARAM = BPPARAM)
127
+}
128
+
129
+# TODO: Needed?
130
+.getSEDir <- function(x) {
131
+    paths <- lapply(assays(x, withDimnames = FALSE), function(a) {
132
+        try(path(a), silent = TRUE)
133
+    })
134
+    if (any(vapply(paths, is, logical(1L), "try-error"))) {
135
+        stop("Cannot extract 'dir'.")
136
+    }
137
+    unique_paths <- unique(unlist(paths, use.names = FALSE))
138
+    if (length(unique_paths) > 1) {
139
+        stop("Assay data spread across multiple HDF5 files.")
140
+    }
141
+    dirs <- dirname(unlist(paths, use.names = FALSE))
142
+    unique(dirs)
143
+}
144
+
145
+# Should return TRUE for BSseq object created with read.bismark() or saved with
146
+# HDF5Array::saveHDF5SummarizedExperiment().
147
+# TODO: Check dirname(paths[[1L]]) also contains 'se.rds'? It looks like dir
148
+#       can contain other files besides these; check.
149
+.isHDF5BackedBSseqUpdatable <- function(x) {
150
+    stopifnot(is(x, "BSseq"))
151
+    if (!identical(.getBSseqBackends(x), "HDF5Array")) {
152
+        return(FALSE)
153
+    }
154
+    paths <- vapply(assays(x, withDimnames = FALSE), path, character(1L))
155
+    if (all(paths == paths[[1L]]) && all(basename(paths) == "assays.h5")) {
156
+        return(TRUE)
89 157
     }
90
-    realize(collapsed_x, BACKEND = BACKEND)
158
+    FALSE
91 159
 }
92 160
new file mode 100644
... ...
@@ -0,0 +1,384 @@
1
+# Internal classes -------------------------------------------------------------
2
+
3
+# The FWGRanges class is a container for the fixed-width genomic locations and
4
+# their associated annotations.
5
+# NOTE: The intention is to make this a fully-fledged GenomicRanges subclass
6
+#       that is part of the GenomicRRanges package. For now, this class is
7
+#       internal to bsseq and only a subset of methods are properly implemented.
8
+.FWGRanges <- setClass(
9
+    "FWGRanges",
10
+    contains = "GenomicRanges",
11
+    representation(
12
+        seqnames = "Rle",
13
+        ranges = "FWIRanges",
14
+        strand = "Rle",
15
+        seqinfo = "Seqinfo"
16
+    ),
17
+    prototype(
18
+        seqnames = Rle(factor()),
19
+        strand = Rle(strand())
20
+    )
21
+)
22
+
23
+# Internal methods -------------------------------------------------------------
24
+
25
+# NOTE: Combine the new parallel slots with those of the parent class. Make
26
+#       sure to put the new parallel slots *first*.
27
+setMethod(
28
+    "parallelSlotNames",
29
+    "FWGRanges",
30
+    function(x) {
31
+        c(GenomicRanges:::extraColumnSlotNames(x), "seqnames", "ranges",
32
+          "strand", callNextMethod())
33
+    }
34
+)
35
+
36
+setMethod("seqnames", "FWGRanges", function(x) x@seqnames)
37
+
38
+setMethod("strand", "FWGRanges", function(x) x@strand)
39
+
40
+setMethod("seqinfo", "FWGRanges", function(x) x@seqinfo)
41
+
42
+setMethod(
43
+    "ranges",
44
+    "FWGRanges",
45
+    function(x, use.names = TRUE, use.mcols = FALSE) {
46
+        if (!isTRUEorFALSE(use.names)) stop("'use.names' must be TRUE or FALSE")
47
+        if (!isTRUEorFALSE(use.mcols)) stop("'use.mcols' must be TRUE or FALSE")
48
+        # TODO: This is a work-around the requirement in the GenomicRanges
49
+        #       validity method that ranges(GenomicRanges) returns a IRanges or
50
+        #       IPos instance.
51
+        # ans <- x@ranges
52
+        ans <- as(x@ranges, "IRanges")
53
+        if (!use.names) names(ans) <- NULL
54
+        if (use.mcols) mcols(ans) <- mcols(x)
55
+        ans
56
+    }
57
+)
58
+
59
+# TODO: These wouldn't be necessary if ranges(FWGRanges) returned a FWIRanges
60
+#       instead of an IRanges.
61
+setMethod("start", "FWGRanges", function(x) start(x@ranges))
62
+setMethod("width", "FWGRanges", function(x) width(x@ranges))
63
+setMethod("end", "FWGRanges", function(x) {
64
+    if (x@ranges@width == 1L) {
65
+        start(x)
66
+    } else {
67
+        width(x) - 1L + start(x)
68
+    }
69
+})
70
+
71
+# TODO: Follow shift,GRanges-method
72
+setMethod("shift", "FWGRanges", function(x, shift = 0L, use.names = TRUE) {
73
+    stopifnot(use.names)
74
+    new_ranges <- shift(x@ranges, shift, use.names)
75
+    x@ranges <- new_ranges
76
+    validObject(x)
77
+    x
78
+})
79
+
80
+# NOTE: Needed for seqinfo<-,FWGRanges-method
81
+setMethod("update", "FWGRanges", function(object, ...) {
82
+    BiocGenerics:::replaceSlots(object, ...)
83
+})
84
+
85
+# NOTE: This is pieced together in order to get something up and running. It
86
+#       should, however, be possible to this 'properly' via inheritance. Most
87
+#       of this hack is to work around the fact that ranges(FWGRanges)
88
+#       currently has to return a IRanges instance instead of a FWIRanges
89
+#       instance.
90
+.findOverlaps_FWGRanges <- function(query, subject, maxgap = -1L,
91
+                                    minoverlap = 0L,
92
+                                    type = c("any", "start", "end", "within",
93
+                                             "equal"),
94
+                                    select = c("all", "first", "last",
95
+                                               "arbitrary"),
96
+                                    ignore.strand = FALSE) {
97
+    # findOverlaps,GenomicRanges,GenomicRanges-method ----------------------
98
+    type <- match.arg(type)
99
+    select <- match.arg(select)
100
+
101
+    # GenomicRanges:::findOverlaps_GNCList() -------------------------------
102
+    if (!(is(query, "FWGRanges") && is(subject, "FWGRanges"))) {
103
+        stop("'query' and 'subject' must be FWGRanges objects")
104
+    }
105
+    if (!isTRUEorFALSE(ignore.strand)) {
106
+        stop("'ignore.strand' must be TRUE or FALSE")
107
+    }
108
+
109
+    si <- merge(seqinfo(query), seqinfo(subject))
110
+    q_seqlevels <- seqlevels(query)
111
+    s_seqlevels <- seqlevels(subject)
112
+    common_seqlevels <- intersect(q_seqlevels, s_seqlevels)
113
+    NG <- length(common_seqlevels)
114
+    q_group_idx <- match(common_seqlevels, q_seqlevels)
115
+    s_group_idx <- match(common_seqlevels, s_seqlevels)
116
+
117
+    # Extract 'q_groups' and 's_groups' (both of length NG).
118
+    q_groups <- GenomicRanges:::.extract_groups_from_GenomicRanges(
119
+        query)[q_group_idx]
120
+    s_groups <- GenomicRanges:::.extract_groups_from_GenomicRanges(
121
+        subject)[s_group_idx]
122
+
123
+    # Extract 'nclists' and 'nclist_is_q' (both of length NG).
124
+    if (is(subject, "GNCList")) {
125
+        nclists <- subject@nclists[s_group_idx]
126
+        nclist_is_q <- rep.int(FALSE, NG)
127
+    } else if (is(query, "GNCList")) {
128
+        nclists <- query@nclists[q_group_idx]
129
+        nclist_is_q <- rep.int(TRUE, NG)
130
+    } else {
131
+        # We'll do "on-the-fly preprocessing".
132
+        nclists <- vector(mode = "list", length = NG)
133
+        nclist_is_q <- rep.int(NA, NG)
134
+    }
135
+
136
+    # Extract 'circle_length' (of length NG).
137
+    circle_length <- GenomicRanges:::.get_circle_length(si)[q_group_idx]
138
+
139
+    ##Extract 'q_space' and 's_space'.
140
+    if (ignore.strand) {
141
+        q_space <- s_space <- NULL
142
+    } else {
143
+        q_space <- as.integer(strand(query)) - 3L
144
+        s_space <- as.integer(strand(subject)) - 3L
145
+    }
146
+
147
+    # IRanges:::NCList_find_overlaps_in_groups() ---------------------------
148
+    q <- query@ranges
149
+    s <- subject@ranges
150
+    circle.length <- circle_length
151
+    if (!(is(q, "IntegerRanges") && is(s, "IntegerRanges"))) {
152
+        stop("'q' and 's' must be IntegerRanges object")
153
+    }
154
+    if (!is(q_groups, "CompressedIntegerList")) {
155
+        stop("'q_groups' must be a CompressedIntegerList object")
156
+    }
157
+    if (!is(s_groups, "CompressedIntegerList")) {
158
+        stop("'s_groups' must be a CompressedIntegerList object")
159
+    }
160
+
161
+    if (!isSingleNumber(maxgap)) {
162
+        stop("'maxgap' must be a single integer")
163
+    }
164
+    if (!is.integer(maxgap)) {
165
+        maxgap <- as.integer(maxgap)
166
+    }
167
+
168
+    if (!isSingleNumber(minoverlap)) {
169
+        stop("'minoverlap' must be a single integer")
170
+    }
171
+    if (!is.integer(minoverlap)) {
172
+        minoverlap <- as.integer(minoverlap)
173
+    }
174
+
175
+    q_circle_len <- circle.length
176
+    q_circle_len[which(nclist_is_q)] <- NA_integer_
177
+    q <- IRanges:::.shift_ranges_in_groups_to_first_circle(
178
+        x = q,
179
+        x_groups = q_groups,
180
+        circle.length = q_circle_len)
181
+    s_circle_len <- circle.length
182
+    s_circle_len[which(!nclist_is_q)] <- NA_integer_
183
+    s <- IRanges:::.shift_ranges_in_groups_to_first_circle(
184
+        x = s,
185
+        x_groups = s_groups,
186
+        circle.length = s_circle_len)
187
+    .Call2("NCList_find_overlaps_in_groups",
188
+           start(q), end(q), q_space, q_groups,
189
+           start(s), end(s), s_space, s_groups,
190
+           nclists, nclist_is_q,
191
+           maxgap, minoverlap, type, select, circle.length,
192
+           PACKAGE = "IRanges")
193
+}
194
+
195
+setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
196
+
197
+# TODO: A dedicated duplicated method would be faster than
198
+#       duplicated,GenomicRanges, because it could use end(x) instead of
199
+#       width(x) in call to duplicatedIntegerQuads().
200
+
201
+# Internal functions -----------------------------------------------------------
202
+
203
+# TODO: Warn if x contains mix of + and * or - and * loci?
204
+.strandCollapse <- function(x) {
205
+    stopifnot(is(x, "GenomicRanges"))
206
+    if (all(strand(x) == "*")) return(x)
207
+    # Shift loci on negative strand by 1 to the left
208
+    x[strand(x) == "-"] <- IRanges::shift(x[strand(x) == "-"], -1L)
209
+    # Unstrand all loci
210
+    x <- unstrand(x)
211
+    # Extract unique loci
212
+    unique(x)
213
+}
214
+
215
+# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels())
216
+#       to the output. This is the default behaviour because it results in
217
+#       the smallest returned object (albeit at the small cost of a sort).
218
+.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
219
+                                    strandCollapse = FALSE, sort = TRUE,
220
+                                    verbose = FALSE) {
221
+    # Argument checks ----------------------------------------------------------
222
+
223
+    stopifnot(isTRUEorFALSE(rmZeroCov))
224
+    stopifnot(isTRUEorFALSE(strandCollapse))
225
+    stopifnot(isTRUEorFALSE(sort))
226
+
227
+    # Quieten R CMD check about 'no visible binding for global variable'
228
+    M <- U <- NULL
229
+
230
+    # Read file to construct data.table of valid loci --------------------------
231
+    if (rmZeroCov) {
232
+        dt <- .readBismarkAsDT(
233
+            file = file,
234
+            col_spec = "BSseq",
235
+            check = TRUE,
236
+            verbose = verbose)
237
+        if (strandCollapse && !is.null(dt[["strand"]]) &&
238
+            !dt[, all(strand == "*")]) {
239
+            # Shift loci on negative strand by 1 to the left and then remove
240
+            # strand since no longer valid.
241
+            dt[strand == "-", start := start - 1L][, strand := NULL]
242
+            # Aggregate counts at loci with the same 'seqnames' and 'start'.
243
+            dt <- dt[,