Browse code

Modify BSmooth()

- Automatic selection of appropriate BACKEND and better control over HDF5 options (when applicable)
- Tweak unit tests

Peter Hickey authored on 20/07/2018 04:30:21
Showing3 changed files

... ...
@@ -122,10 +122,11 @@ BSmooth <- function(BSseq,
122 122
                     h = 1000,
123 123
                     maxGap = 10^8,
124 124
                     keep.se = FALSE,
125
-                    verbose = TRUE,
126 125
                     BPPARAM = bpparam(),
127
-                    BACKEND = getRealizationBackend(),
128
-                    ...) {
126
+                    chunkdim = NULL,
127
+                    level = NULL,
128
+                    verbose = getOption("verbose")) {
129
+
129 130
     # Argument checks-----------------------------------------------------------
130 131
 
131 132
     # Check validity of 'BSseq' argument
... ...
@@ -138,20 +139,9 @@ BSmooth <- function(BSseq,
138 139
     if (is.unsorted(BSseq)) {
139 140
         stop("'BSseq' must be sorted before smoothing. Use 'sort(BSseq)'.")
140 141
     }
141
-    # Register 'BACKEND' and return to current value on exit
142
-    current_BACKEND <- getRealizationBackend()
143
-    on.exit(setRealizationBackend(current_BACKEND), add = TRUE)
144
-    setRealizationBackend(BACKEND)
145
-    # Check compatability of 'BACKEND' with backend(s) of BSseq object.
146
-    BSseq_backends <- .getBSseqBackends(BSseq)
147
-    if (.areBackendsInMemory(BACKEND) &&
148
-        !.areBackendsInMemory(BSseq_backends)) {
149
-        stop("Using an in-memory backend for a disk-backed BSseq object ",
150
-             "is not supported.\n",
151
-             "See help(\"BSmooth\") for details.",
152
-             call. = FALSE)
153
-    }
154
-    # Check compatability of 'BPPARAM' with the realization backend.
142
+    stopifnot(isTRUEorFALSE(keep.se))
143
+    # Set appropriate BACKEND and check compatability with BPPARAM.
144
+    BACKEND <- .getBSseqBackends(BSseq)
155 145
     if (!.areBackendsInMemory(BACKEND)) {
156 146
         if (!.isSingleMachineBackend(BPPARAM)) {
157 147
             stop("The parallelisation strategy must use a single machine ",
... ...
@@ -170,6 +160,30 @@ BSmooth <- function(BSseq,
170 160
                  call. = FALSE)
171 161
         }
172 162
     }
163
+    # If using HDF5Array, check if BSseq object is updateable.
164
+    if (identical(BACKEND, "HDF5Array")) {
165
+        is_BSseq_updateable <- .isHDF5BackedBSseqUpdatable(BSseq)
166
+        if (is_BSseq_updateable) {
167
+            h5_path <- path(assay(BSseq, withDimnames = FALSE))
168
+            if (any(c("coef", "se.coef") %in% rhdf5::h5ls(h5_path)[, "name"])) {
169
+                # TODO: Better error message; what should be done in this case?
170
+                stop(wmsg("The HDF5 file '", h5_path, "' already contains a ",
171
+                          "dataset named 'coef' or 'se.coef'."))
172
+            }
173
+        } else {
174
+            h5_path <- NULL
175
+            warning(
176
+                wmsg("'BSseq' was not created with either read.bismark() or ",
177
+                     "HDF5Array::saveHDF5SummarizedExperiment(). BSmooth() is ",
178
+                     "using automatically created HDF5 file(s) (see ",
179
+                     "?HDF5Array::setHDF5DumpFile) to store smoothing result. ",
180
+                     "You will need to run ",
181
+                     "HDF5Array::saveHDF5SummarizedExperiment() on the ",
182
+                     "returned object if you wish to save the returned ",
183
+                     "object."))
184
+        }
185
+    }
186
+    stopifnot(isTRUEorFALSE(verbose))
173 187
 
174 188
     # Smoothing ----------------------------------------------------------------
175 189
 
... ...
@@ -192,22 +206,24 @@ BSmooth <- function(BSseq,
192 206
     } else if (BACKEND == "HDF5Array") {
193 207
         coef_sink <- HDF5RealizationSink(
194 208
             dim = dim(M),
195
-            # NOTE: Never allow dimnames.
196 209
             dimnames = NULL,
197 210
             type = "double",
211
+            filepath = h5_path,
198 212
             name = "coef",
199
-            ...)
213
+            chunkdim = chunkdim,
214
+            level = level)
200 215
         on.exit(close(coef_sink), add = TRUE)
201 216
         sink_lock <- ipcid()
202 217
         on.exit(ipcremove(sink_lock), add = TRUE)
203 218
         if (keep.se) {
204
-            se.coef_sink <- HDF5Array::HDF5RealizationSink(
219
+            se.coef_sink <- HDF5RealizationSink(
205 220
                 dim = dim(M),
206
-                # NOTE: Never allow dimnames.
207 221
                 dimnames = NULL,
208 222
                 type = "double",
223
+                filepath = h5_path,
209 224
                 name = "se.coef",
210
-                ...)
225
+                chunkdim = chunkdim,
226
+                level = level)
211 227
             on.exit(close(se.coef_sink), add = TRUE)
212 228
         } else {
213 229
             se.coef_sink <- NULL
... ...
@@ -283,11 +299,16 @@ BSmooth <- function(BSseq,
283 299
         }
284 300
     }
285 301
 
286
-    # Construct BSseq object ---------------------------------------------------
302
+    # Construct BSseq object, saving it if it is HDF5-backed -------------------
287 303
 
288
-    assays <- c(assays(BSseq, withDimnames = FALSE), SimpleList(coef = coef))
289
-    if (keep.se) assays <- c(assays, SimpleList(se.coef = se.coef))
290
-    BiocGenerics:::replaceSlots(
304
+    # NOTE: Using BiocGenerics:::replaceSlots(..., check = FALSE) to avoid
305
+    #       triggering the validity method.
306
+    assays <- c(assays(BSseq, withDimnames = FALSE)[c("M", "Cov")],
307
+                SimpleList(coef = coef))
308
+    if (keep.se) {
309
+        assays <- c(assays, SimpleList(se.coef = se.coef))
310
+    }
311
+    BSseq <- BiocGenerics:::replaceSlots(
291 312
         object = BSseq,
292 313
         assays = Assays(assays),
293 314
         trans = plogis,
... ...
@@ -298,6 +319,15 @@ BSmooth <- function(BSseq,
298 319
             h = h,
299 320
             maxGap = maxGap),
300 321
         check = FALSE)
322
+    if (identical(BACKEND, "HDF5Array") && is_BSseq_updateable) {
323
+        # NOTE: Save BSseq object; mimicing
324
+        #       HDF5Array::saveHDF5SummarizedExperiment().
325
+        dir <- dirname(h5_path)
326
+        x <- BSseq
327
+        x@assays <- HDF5Array:::.shorten_h5_paths(x@assays)
328
+        saveRDS(x, file = file.path(dir, "se.rds"))
329
+    }
330
+    BSseq
301 331
 }
302 332
 
303 333
 # TODOs ------------------------------------------------------------------------
... ...
@@ -8,24 +8,25 @@
8 8
   Parsing output from the Bismark alignment suite.
9 9
 }
10 10
 \usage{read.bismark(files,
11
-             sampleNames = basename(files),
12 11
              loci = NULL,
12
+             colData = NULL,
13 13
              rmZeroCov = FALSE,
14 14
              strandCollapse = TRUE,
15
-             verbose = TRUE,
16 15
              BPPARAM = bpparam(),
17
-             BACKEND = getRealizationBackend(),
18
-             ...,
19
-             fileType = c("cov", "oldBedGraph", "cytosineReport"),
20
-             mc.cores = 1)
16
+             BACKEND = getRealizationBackend()
17
+             dir = tempdir(),
18
+             replace = FALSE,
19
+             chunkdim = NULL,
20
+             level = NULL,
21
+             verbose = TRUE)
21 22
 }
22 23
 \arguments{
23 24
   \item{files}{The path to the files created by running Bismark's methylation
24 25
     extractor, one sample per file.
25
-    Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed.
26
+    Files ending in \code{.gz} or \code{.bz2} will be automatically
27
+    decompressed to \code{\link{tempfile}()}.
26 28
     We strongly recommend you use the 'genome wide cytosine report' output files.
27 29
     See section 'File formats' for further details.}
28
-  \item{sampleNames}{A character vector of sample names, based on the order of \code{files}.}
29 30
   \item{loci}{\code{NULL} (default) or a \linkS4class{GenomicRanges} instance
30 31
     containing methylation loci (all with width equal to 1).
31 32
     If \code{loci = NULL}, then \code{read.bismark()} will perform a first pass over the Bismark file to identify candidate loci.
... ...
@@ -33,6 +34,10 @@
33 34
     In either case, the candidate loci will be filtered if
34 35
     \code{rmZeroCov = TRUE} and collapsed if \code{strandCollapse = TRUE} to form the final set of methylation loci that form the \code{\link[SummarizedExperiment]{rowRanges}} of the returned \linkS4class{BSseq} object.
35 36
     See section 'Efficient use of \code{read.bismark()}' for further details.}
37
+  \item{colData}{An optional \linkS4class{DataFrame} describing the samples.
38
+    Row names, if present, become the column names of the \linkS4class{BSseq}
39
+    object. If \code{NULL}, then a \linkS4class{DataFrame} will be created with
40
+    \code{files} used as the row names.}
36 41
   \item{rmZeroCov}{A \code{logical(1)} indicating whether methylation loci that
37 42
     have zero coverage in all samples be removed.
38 43
     For several reasons, the default \code{rmZeroCov = FALSE} is recommended even in cases where you ultimately want to remove such loci.
... ...
@@ -41,9 +46,7 @@
41 46
     methylation loci (i.e. CpGs) should be collapsed across strands.
42 47
     This is only applicable for stranded methylation loci, e.g., loci extracted
43 48
     from 'genome wide cytosine reports' (see section 'File formats' for further details).}
44
-  \item{verbose}{A \code{logical(1)} indicating whether progress messages
45
-    should be printed (default \code{TRUE}).}
46
-\item{BPPARAM}{An optional \linkS4class{BiocParallelParam} instance
49
+  \item{BPPARAM}{An optional \linkS4class{BiocParallelParam} instance
47 50
     determining the parallel back-end to be used during evaluation.
48 51
     Currently supported are \linkS4class{SerialParam} (Unix, Mac, Windows),
49 52
     \linkS4class{MulticoreParam} (Unix and Mac), \linkS4class{SnowParam}
... ...
@@ -56,13 +59,25 @@
56 59
     When the backend is set to \code{NULL}, the \code{M} and \code{Cov} assays
57 60
     are realized in memory as ordinary matrices, otherwise these are realized with the given \code{BACKEND}.
58 61
     See section 'Realization backends' for further details.}
59
-  \item{...}{Additional arguments passed to the \linkS4class{RealizationSink}
60
-    constructor.
61
-    See section 'Realization backends' for further details.}
62
-  \item{fileType}{\strong{Deprecated}.
63
-    The format of each file is now automatically detected using the internal function \code{bsseq:::.guessBismarkFileType()}.}
64
-  \item{mc.cores}{\strong{Deprecated}.
65
-    See section 'Parallelization and progress monitoring' for further details.}
62
+  \item{dir}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.}
63
+    The path (as a single string) to the directory where to save the HDF5-based
64
+    \linkS4class{BSseq} object. The directory will be created so should not
65
+    already exist, unless \code{replace} is set to \code{TRUE}.}
66
+  \item{replace}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.}
67
+    If directory \code{dir} already exists, should it be replaced
68
+    with a new one? The content of the existing directory will be lost!}
69
+  \item{chunkdim}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.}
70
+    The dimensions of the chunks to use for writing the data to
71
+    disk. By default, \code{\link[HDF5Array]{getHDF5DumpChunkDim}()} using the
72
+    dimensions of the returned \linkS4class{BSseq} object will be used. See
73
+    \code{?\link[HDF5Array]{getHDF5DumpChunkDim}} for more information.}
74
+  \item{level}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.}
75
+    The compression level to use for writing the data to disk. By
76
+    default, \code{\link[HDF5Array]{getHDF5DumpCompressionLevel}()} will be
77
+    used. See \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} for more
78
+    information.}
79
+  \item{verbose}{A \code{logical(1)} indicating whether progress messages
80
+    should be printed (default \code{TRUE}).}
66 81
 }
67 82
 
68 83
 \section{Efficient use of \code{read.bismark()}}{
... ...
@@ -159,7 +174,7 @@
159 174
   Please see the examples at the bottom of the page.
160 175
 }
161 176
 
162
-\section{Parallelization and progress monitoring}{
177
+\section{Parallelization, progress monitoring, and logging}{
163 178
   \code{read.bismark()} now uses the \pkg{BiocParallel} package to implement
164 179
   parallelization. This brings some notable improvements:
165 180
 
... ...
@@ -170,10 +185,6 @@
170 185
       results be retained in-memory.
171 186
     \item Parallelization is now supported on Windows through the use of a
172 187
     \linkS4class{SnowParam} object as the value of \code{BPPARAM}.
173
-    % TODO: Uncomment when/if implemented.
174
-    %\item Improved error handling makes it possible to gracefully resume
175
-    %\code{read.bismark()} jobs that encountered errors by re-doing only the necessary
176
-    %tasks.
177 188
     \item Detailed and extensive job logging facilities.
178 189
   }
179 190
 
... ...
@@ -185,24 +196,14 @@
185 196
   single-machine \linkS4class{SnowParam}. Please consult the \pkg{BiocParallel}
186 197
   documentation to take full advantage of the more advanced features.
187 198
 
188
-  \subsection{Deprecated arguments}{
189
-    \code{mc.cores} is deprecated and will be removed in subsequent releases of \pkg{bsseq}.
190
-    This argument was necessary when \code{read.bismark()} used the \pkg{parallel} package to implement parallelization, but this functionality is superseded by the aforementioned use of \pkg{BiocParallel}.
191
-    We recommend that users who previously relied on these arguments switch to \code{BPPARAM = MulticoreParam(workers = mc.cores, progressbar = TRUE)}.
192
-  }
199
+  A useful feature of \pkg{BiocParallel} are progress bars to monitor the
200
+  status of long-running jobs, such as \code{BSmooth()}. Progress bars are
201
+  controlled via the \code{progressbar} argument in the
202
+  \linkS4class{BiocParallelParam} constructor.
193 203
 
194
-  \subsection{Progress monitoring}{
195
-    A useful feature of \pkg{BiocParallel} are progress bars to monitor the
196
-    status of long-running jobs, such as \code{BSmooth()}. Progress bars are
197
-    controlled via the \code{progressbar} argument in the
198
-    \linkS4class{BiocParallelParam} constructor. Progress bars replace the
199
-    use of the deprecated \code{verbose} argument to print out information on
200
-    the status of \code{BSmooth()}.
201
-
202
-    \pkg{BiocParallel} also supports extensive and detailed logging facilities.
203
-    Please consult the \pkg{BiocParallel} documentation to take full advantage
204
-    these advanced features.
205
-  }
204
+  \pkg{BiocParallel} also supports extensive and detailed logging facilities.
205
+  Please consult the \pkg{BiocParallel} documentation to take full advantage
206
+  these advanced features.
206 207
 }
207 208
 
208 209
 \value{
... ...
@@ -220,9 +221,9 @@
220 221
   # Run read.bismark() on a single sample to construct a matrix-backed BSseq
221 222
   # object.
222 223
   infile <- system.file("extdata/test_data.fastq_bismark.bismark.cov.gz",
223
-                        package = 'bsseq')
224
+                        package = "bsseq")
224 225
   bsseq <- read.bismark(files = infile,
225
-                        sampleNames = "test_data",
226
+                        colData = DataFrame(row.names = "test_data"),
226 227
                         rmZeroCov = FALSE,
227 228
                         strandCollapse = FALSE,
228 229
                         verbose = TRUE)
... ...
@@ -232,18 +233,18 @@
232 233
 
233 234
   \dontrun{
234 235
   # Run read.bismark() on a single sample to construct a HDF5Array-backed BSseq
235
-  # object (with data written to the file 'read.bismark_example.h5')
236
+  # object (with data written to 'test_dir')
237
+  test_dir <- tempfile("BSseq")
236 238
   bsseq <- read.bismark(files = infile,
237
-                        sampleNames = "test_data",
239
+                        colData = DataFrame(row.names = "test_data"),
238 240
                         rmZeroCov = FALSE,
239 241
                         strandCollapse = FALSE,
240
-                        verbose = TRUE,
241 242
                         BACKEND = "HDF5Array",
242
-                        filepath = "read.bismark_example.h5")
243
+                        dir = test_dir,
244
+                        verbose = TRUE)
243 245
   # This is a HDF5Array-backed BSseq object.
244 246
   sapply(assays(bsseq, withDimnames = FALSE), class)
245
-  # The 'M' and 'Cov' assays are in the HDF5 file 'read.bismark_example.h5' (in
246
-  # the current working directory).
247
+  # The 'M' and 'Cov' assays are in the HDF5 file 'assays.h5' (in 'test_dir').
247 248
   sapply(assays(bsseq, withDimnames = FALSE), path)
248 249
   }
249 250
 }
... ...
@@ -10,68 +10,32 @@ test_that("Errors on bad input", {
10 10
         "All loci in 'BSseq' must have width == 1")
11 11
 })
12 12
 
13
-test_that("Issues warning for deprecated arguments", {
14
-    expect_warning(
15
-        BSmooth(bsseq_test, parallelBy = "sample", BPPARAM = SerialParam()),
16
-        "'parallelBy' is deprecated")
17
-    expect_warning(
18
-        BSmooth(bsseq_test, mc.preschedule = FALSE, BPPARAM = SerialParam()),
19
-        "'mc.preschedule' is deprecated")
20
-    expect_warning(
21
-        BSmooth(bsseq_test, mc.cores = 1, BPPARAM = SerialParam()),
22
-        "'mc.cores' is deprecated")
23
-    expect_warning(
24
-        BSmooth(bsseq_test, verbose = FALSE, BPPARAM = SerialParam()),
25
-        "'verbose' is deprecated")
13
+test_that("BSmooth properly inherits 'dir'", {
14
+    infile <- system.file("extdata", "test_data.fastq_bismark.bismark.cov.gz",
15
+                          package = "bsseq")
16
+    bsseq <- read.bismark(files = infile, BACKEND = "HDF5Array")
17
+    bsseq <- BSmooth(bsseq)
18
+    expect_is(bsseq, "BSseq")
19
+    expect_true(hasBeenSmoothed(bsseq))
20
+    expect_error(
21
+        BSmooth(bsseq),
22
+        "already contains a dataset named 'coef' or 'se.coef'")
26 23
 })
27 24
 
28
-test_that(
29
-    "Checks compatability of realization backend with backend(s) of BSseq object", {
30
-        setRealizationBackend(NULL)
31
-        expect_error(
32
-            BSmooth(realize(bsseq_test, "HDF5Array"),
33
-                    "Using an in-memory backend for a disk-backed BSseq object is not supported")
34
-        )
35
-        bsseq_test2 <- bsseq_test
36
-        assay(bsseq_test2, withDimnames = FALSE) <- realize(
37
-            assay(bsseq_test2, withDimnames = FALSE),
38
-            "HDF5Array")
39
-        expect_error(
40
-            BSmooth(realize(bsseq_test2, "HDF5Array"),
41
-                    "Using an in-memory backend for a disk-backed BSseq object is not supported")
42
-        )
43
-
44
-        setRealizationBackend("RleArray")
45
-        expect_error(
46
-            BSmooth(realize(bsseq_test, "HDF5Array"),
47
-                    "Using an in-memory backend for a disk-backed BSseq object is not supported")
48
-        )
49
-        setRealizationBackend("HDF5Array")
50
-        expect_silent(
51
-            BSmooth(realize(bsseq_test, "HDF5Array"), BPPARAM = SerialParam()))
52
-    })
53
-
54 25
 test_that("Checks compatability of 'BPPARAM' with the realization backend", {
55
-        setRealizationBackend("HDF5Array")
56 26
         expect_error(
57
-            BSmooth(bsseq_test, BPPARAM = SnowParam(c("node1", "node2"))),
27
+            BSmooth(
28
+                BSseq = realize(bsseq_test, "HDF5Array"),
29
+                BPPARAM = SnowParam(c("node1", "node2"))),
58 30
             "The parallelisation strategy must use a single machine when using an on-disk realization backend")
59 31
     }
60 32
 )
61 33
 
62
-test_that(
63
-    "Expected realization backends work when using SerialParam parallelisation backend", {
64
-        setRealizationBackend(NULL)
65
-        bsseq_NULL <- BSmooth(bsseq_test, BPPARAM = SerialParam())
66
-
67
-        setRealizationBackend("HDF5Array")
68
-        bsseq_HDF5Array <- BSmooth(bsseq_test, BPPARAM = SerialParam())
69
-        expect_equivalent_SE(bsseq_NULL, bsseq_HDF5Array)
70
-
71
-        setRealizationBackend("RleArray")
72
-        expect_error(
73
-            BSmooth(bsseq_test, BPPARAM = SerialParam()),
74
-            "The 'RleArray' realization backend is not supported")
34
+test_that("Backends work when using SerialParam parallelisation backend", {
35
+    bsseq_NULL <- BSmooth(
36
+        BSseq = realize(bsseq_test, NULL), BPPARAM = SerialParam())
37
+    bsseq_HDF5Array <- BSmooth(bsseq_test, BPPARAM = SerialParam())
38
+    expect_equivalent_SE(bsseq_NULL, bsseq_HDF5Array)
75 39
 })
76 40
 
77 41
 test_that(
... ...
@@ -117,7 +81,7 @@ test_that(
117 81
 
118 82
 test_that(
119 83
     "Expected parallelisation backends work with on-disk realization backend", {
120
-        setRealizationBackend("HDF5Array")
84
+        bsseq_test <- realize(bsseq_test, "HDF5Array")
121 85
 
122 86
         bsseq_serial_param <- BSmooth(bsseq_test, BPPARAM = SerialParam())
123 87
         bsseq_multicore_param <- BSmooth(
... ...
@@ -167,7 +131,6 @@ test_that(
167 131
 )
168 132
 
169 133
 test_that("keep.se works", {
170
-    setRealizationBackend(NULL)
171 134
     expect_is(
172 135
         assay(BSmooth(bsseq_test, keep.se = TRUE, BPPARAM = SerialParam()),
173 136
               "se.coef"),
... ...
@@ -176,10 +139,14 @@ test_that("keep.se works", {
176 139
         assay(BSmooth(bsseq_test, keep.se = FALSE, BPPARAM = SerialParam()),
177 140
               "'se.coef' not in names"))
178 141
 
179
-    setRealizationBackend("HDF5Array")
180 142
     expect_is(
181
-        assay(BSmooth(bsseq_test, keep.se = TRUE, BPPARAM = SerialParam()),
182
-              "se.coef", withDimnames = FALSE),
143
+        assay(
144
+            BSmooth(
145
+                BSseq = realize(bsseq_test, "HDF5Array"),
146
+                keep.se = TRUE,
147
+                BPPARAM = SerialParam()),
148
+              "se.coef",
149
+            withDimnames = FALSE),
183 150
         "HDF5Matrix")
184 151
     expect_error(
185 152
         assay(BSmooth(bsseq_test, keep.se = FALSE, BPPARAM = SerialParam()),