Browse code

Modify read.bismark()

- Pass colData rather than sampleNames
- Better control over HDF5 file and its location
- Save BSseq object if HDF5-backed
- Tweak unit tests

Peter Hickey authored on 20/07/2018 04:28:37
Showing2 changed files

... ...
@@ -6,8 +6,7 @@
6 6
 #       (https://github.com/FelixKrueger/Bismark/tree/master/Docs#output-1)
7 7
 #       and error out (they're not supported by .readBismaskAsDT()).
8 8
 .guessBismarkFileType <- function(files, n_max = 10L) {
9
-    guessed_file_types <- setNames(vector("character", length(files)), files)
10
-    for (file in files) {
9
+    vapply(files, function(file) {
11 10
         x <- read.delim(
12 11
             file = file,
13 12
             header = FALSE,
... ...
@@ -15,14 +14,14 @@
15 14
             stringsAsFactors = FALSE)
16 15
         n_fields <- ncol(x)
17 16
         if (isTRUE(all(n_fields == 6L))) {
18
-            guessed_file_types[file] <- "cov"
19
-        } else if (isTRUE(all(n_fields == 7L))) {
20
-            guessed_file_types[file] <- "cytosineReport"
21
-        } else {
22
-            stop("Could not guess Bismark file type for '", file, "'")
17
+            return("cov")
23 18
         }
24
-    }
25
-    guessed_file_types
19
+        if (isTRUE(all(n_fields == 7L))) {
20
+            return("cytosineReport")
21
+        }
22
+        # Couldn't guess the file type.
23
+        NA_character_
24
+    }, character(1L))
26 25
 }
27 26
 
28 27
 # NOTE: In brief benchmarking, readr::read_csv() is ~1.3-1.6x faster than
... ...
@@ -32,26 +31,28 @@
32 31
 # TODO: (long term) Formalise benchmarks of utils::read.table(),
33 32
 #       data.table::fread(), and readr::read_tsv() as a document in the bsseq
34 33
 #       package so that we can readily re-visit these as needed.
35
-# TODO: Add `showProgress` instead of using `verbose` twice?
36 34
 .readBismarkAsDT <- function(file,
37 35
                              col_spec = c("all", "BSseq", "GRanges"),
38 36
                              check = FALSE,
39
-                             is_zcat_available = TRUE,
37
+                             showProgress = FALSE,
40 38
                              nThread = 1L,
41
-                             verbose = FALSE,
42
-                             ...) {
43
-    # Check arguments ----------------------------------------------------------
39
+                             verbose = FALSE) {
40
+    # Argument checks ----------------------------------------------------------
44 41
 
45 42
     file <- file_path_as_absolute(file)
46
-    col_spec <- match.arg(col_spec)
47 43
     file_type <- .guessBismarkFileType(file)
44
+    col_spec <- match.arg(col_spec)
48 45
     stopifnot(isTRUEorFALSE(check))
49
-    stopifnot(isTRUEorFALSE(is_zcat_available))
50
-    subverbose <- as.logical(max(verbose - 1L, 0L))
46
+    stopifnot(isTRUEorFALSE(showProgress))
47
+    # NOTE: 'verbose' controls the verbosity of .readBismarkAsDT() and
48
+    #       'fread_verbose' is derived from that to control the verbosity of
49
+    #       data.table::fread().
50
+    fread_verbose <- as.logical(max(verbose - 1L, 0L))
51 51
 
52 52
     # Construct the column spec ------------------------------------------------
53 53
 
54 54
     if (file_type == "cov") {
55
+        header <- FALSE
55 56
         if (col_spec == "BSseq") {
56 57
             colClasses <- c("factor", "integer", "NULL", "NULL", "integer",
57 58
                             "integer")
... ...
@@ -68,6 +69,7 @@
68 69
             col.names <- c("seqnames", "start", "end", "beta", "M", "U")
69 70
         }
70 71
     } else if (file_type == "cytosineReport") {
72
+        header <- FALSE
71 73
         if (col_spec == "BSseq") {
72 74
             colClasses <- c("factor", "integer", "factor", "integer",
73 75
                             "integer", "NULL", "NULL")
... ...
@@ -90,73 +92,42 @@
90 92
     # Read the file ------------------------------------------------------------
91 93
 
92 94
     if (verbose) {
93
-        message("[.readBismarkAsDT] Reading file '", file, "'")
95
+        message("[.readBismarkAsDT] Parsing '", file, "'")
94 96
     }
95 97
     ptime1 <- proc.time()
96
-    # TODO: Make copy if isGzipped() to avoid dependency on R.utils?
97
-    # TODO: Use isCompressedFile() and decompressFile().
98 98
     if (isGzipped(file)) {
99
-        message(
100
-            "[.readBismarkAsDT] Gunzipping file '", file, "' to tempfile().")
99
+        if (verbose) {
100
+            message("[.readBismarkAsDT] Decompressing to tempfile() ...")
101
+        }
102
+        file <- gunzip(
103
+            filename = file,
104
+            destname = tempfile(),
105
+            remove = FALSE)
106
+        on.exit(unlink(file), add = TRUE)
107
+    } else if (isBzipped(file)) {
108
+        if (verbose) {
109
+            message("[.readBismarkAsDT] Decompressing to tempfile() ...")
101 110
 
102
-        file <- gunzip(filename = file, destname = tempfile(), remove = FALSE)
111
+        }
112
+        file <- bunzip2(
113
+            filename = file,
114
+            destname = tempfile(),
115
+            remove = FALSE)
103 116
         on.exit(unlink(file), add = TRUE)
104 117
     }
105
-    # sysname <- Sys.info()[["sysname"]]
106
-    # if (sysname == "Linux") {
107
-    #     input <- sprintf("zcat %s", shQuote(file))
108
-    # } else if (sysname == "Darwin") {
109
-    #     # macOS/OS X needs a different command; see
110
-    #     # https://github.com/Rdatatable/data.table/issues/717#issuecomment-140028670
111
-    #     input <- sprintf("zcat < %s", shQuote(file))
112
-    # } else {
113
-    #     # TODO: Support other OSs (e.g, Windows, sunOS)
114
-    #     warning(
115
-    #         "Unable to find 'zcat' for use with 'data.table::fread()'.\n",
116
-    #         "Falling back to 'utils::read.table()'.")
117
-    #     is_zcat_available <- FALSE
118
-    # }
119
-    # TODO: Gracefully handle situation where user incorrectly sets
120
-    #       `is_zcat_available = TRUE` and it consequently fails.
121
-    # if (is_zcat_available) {
122
-    #     x <- fread(
123
-    #         input = input,
124
-    #         sep = "\t",
125
-    #         header = FALSE,
126
-    #         verbose = subverbose,
127
-    #         drop = drop,
128
-    #         colClasses = colClasses,
129
-    #         col.names = col.names,
130
-    #         # TODO: Check remainder of these arguments are optimal.
131
-    #         # TODO: Add `key` argument? Check other arguments available in
132
-    #         #       fread().
133
-    #         quote = "",
134
-    #         strip.white = FALSE,
135
-    #         showProgress = as.logical(verbose),
136
-    #         nThread = nThread)
137
-    # } else {
138
-    #     x <- read.table(
139
-    #         file = file,
140
-    #         header = FALSE,
141
-    #         sep = "\t",
142
-    #         quote = "",
143
-    #         col.names = col.names,
144
-    #         colClasses = colClasses,
145
-    #         strip.white = FALSE)
146
-    #     setDT(x)
147
-    # }
118
+    if (verbose) {
119
+        message("[.readBismarkAsDT] Reading file ...")
120
+    }
148 121
     x <- fread(
149 122
         file = file,
150 123
         sep = "\t",
151
-        header = FALSE,
152
-        verbose = subverbose,
124
+        header = header,
125
+        verbose = fread_verbose,
153 126
         drop = drop,
154 127
         colClasses = colClasses,
155 128
         col.names = col.names,
156
-        # TODO: Check remainder of these arguments are optimal.
157 129
         quote = "",
158
-        strip.white = FALSE,
159
-        showProgress = as.logical(verbose),
130
+        showProgress = showProgress,
160 131
         nThread = nThread)
161 132
 
162 133
     # Construct the result -----------------------------------------------------
... ...
@@ -172,8 +143,8 @@
172 143
         }
173 144
         valid <- x[, isTRUE(all(M >= 0L & U >= 0L))]
174 145
         if (!valid) {
175
-            stop("[.readBismarkAsDT] Invalid counts detected!\n",
176
-                 "M and U columns should be non-negative integers.")
146
+            stop("[.readBismarkAsDT] Invalid counts detected.\n",
147
+                 "'M' and 'U' columns should be non-negative integers.")
177 148
         } else {
178 149
             if (verbose) {
179 150
                 message("[.readBismarkAsDT] All counts in file are valid.")
... ...
@@ -183,17 +154,19 @@
183 154
     ptime2 <- proc.time()
184 155
     stime <- (ptime2 - ptime1)[3]
185 156
     if (verbose) {
186
-        cat(sprintf("done in %.1f secs\n", stime))
157
+        message("Done in ", round(stime, 1), " secs")
187 158
     }
188 159
     x
189 160
 }
190 161
 
191
-.constructCountsFromSingleFile <- function(b, files, strandCollapse, loci,
162
+.constructCountsFromSingleFile <- function(b, files, loci, strandCollapse,
192 163
                                            grid, M_sink, Cov_sink, sink_lock,
193
-                                           verbose, BPPARAM,
194
-                                           is_zcat_available, nThread) {
164
+                                           verbose) {
165
+
166
+    # Argument checks ----------------------------------------------------------
195 167
 
196
-    # Quieten R CMD check about 'no visible binding for global variable' -------
168
+    subverbose <- max(as.integer(verbose) - 1L, 0L)
169
+    # NOTE: Quieten R CMD check about 'no visible binding for global variable'.
197 170
     M <- U <- NULL
198 171
 
199 172
     # Read b-th file to construct data.table of valid loci and their counts ----
... ...
@@ -207,9 +180,7 @@
207 180
         file = file,
208 181
         col_spec = "BSseq",
209 182
         check = TRUE,
210
-        is_zcat_available = is_zcat_available,
211
-        nThread = nThread,
212
-        verbose = verbose)
183
+        verbose = subverbose)
213 184
     # NOTE: Can only collapse by strand if the data are stranded!
214 185
     if (strandCollapse && !is.null(dt[["strand"]])) {
215 186
         # Shift loci on negative strand by 1 to the left and then remove
... ...
@@ -254,48 +225,56 @@
254 225
         return(list(M = M, Cov = Cov))
255 226
     }
256 227
     # Write to M_sink and Cov_sink while respecting the IPC lock.
228
+    viewport <- grid[[b]]
257 229
     ipclock(sink_lock)
258
-    write_block(x = M_sink, viewport = grid[[b]], block = M)
259
-    write_block(x = Cov_sink, viewport = grid[[b]], block = Cov)
230
+    write_block(x = M_sink, viewport = viewport, block = M)
231
+    write_block(x = Cov_sink, viewport = viewport, block = Cov)
260 232
     ipcunlock(sink_lock)
261 233
     NULL
262 234
 }
263 235
 
264
-.constructCounts <- function(files, loci, strandCollapse, verbose, BPPARAM,
265
-                             BACKEND, is_zcat_available, nThread, ...) {
266
-    # Set up ArrayGrid so that each block contains data for a single sample.
236
+.constructCounts <- function(files, loci, strandCollapse, BPPARAM,
237
+                             BACKEND, dir, chunkdim, level, verbose = FALSE) {
238
+
239
+    # Argument checks ----------------------------------------------------------
240
+
241
+    subverbose <- max(as.integer(verbose) - 1L, 0L)
242
+
243
+    # Construct ArrayGrid ------------------------------------------------------
244
+
245
+    # NOTE: Each block contains data for a single sample.
267 246
     ans_nrow <- length(loci)
268 247
     ans_ncol <- length(files)
269
-    grid <- RegularArrayGrid(c(ans_nrow, ans_ncol), c(ans_nrow, 1L))
270
-    # Construct RealizationSink objects.
248
+    ans_dim <- c(ans_nrow, ans_ncol)
249
+    grid <- RegularArrayGrid(
250
+        refdim = ans_dim,
251
+        spacings = c(ans_nrow, 1L))
252
+
253
+    # Construct RealizationSink objects ----------------------------------------
254
+
271 255
     if (is.null(BACKEND)) {
272 256
         M_sink <- NULL
273 257
         Cov_sink <- NULL
274 258
         sink_lock <- NULL
275 259
     } else if (BACKEND == "HDF5Array") {
260
+        h5_path <- file.path(dir, "assays.h5")
276 261
         M_sink <- HDF5RealizationSink(
277
-            dim = c(ans_nrow, ans_ncol),
278
-            # NOTE: Never allow dimnames.
262
+            dim = ans_dim,
279 263
             dimnames = NULL,
280 264
             type = "integer",
265
+            filepath = h5_path,
281 266
             name = "M",
282
-            # TODO: Can chunkdim be specified if data are written to
283
-            #       column-by-column?
284
-            # chunkdim = NULL,
285
-            # level = NULL,
286
-            ...)
267
+            chunkdim = chunkdim,
268
+            level = level)
287 269
         on.exit(close(M_sink), add = TRUE)
288 270
         Cov_sink <- HDF5Array::HDF5RealizationSink(
289
-            dim = c(ans_nrow, ans_ncol),
290
-            # NOTE: Never allow dimnames.
271
+            dim = ans_dim,
291 272
             dimnames = NULL,
292 273
             type = "integer",
274
+            filepath = h5_path,
293 275
             name = "Cov",
294
-            # TODO: Can chunkdim be specified if data are written to
295
-            #       column-by-column?
296
-            # chunkdim = NULL,
297
-            # level = NULL,
298
-            ...)
276
+            chunkdim = chunkdim,
277
+            level = level)
299 278
         on.exit(close(Cov_sink), add = TRUE)
300 279
         sink_lock <- ipcid()
301 280
         on.exit(ipcremove(sink_lock), add = TRUE)
... ...
@@ -305,16 +284,19 @@
305 284
         #       However, we retain it for now (e.g., fstArray backend would use
306 285
         #       this until a dedicated branch was implemented).
307 286
         M_sink <- DelayedArray:::RealizationSink(
308
-            dim = c(ans_nrow, ans_ncol),
287
+            dim = ans_dim,
309 288
             type = "integer")
310 289
         on.exit(close(M_sink), add = TRUE)
311 290
         Cov_sink <- DelayedArray:::RealizationSink(
312
-            dim = c(ans_nrow, ans_ncol),
291
+            dim = ans_dim,
313 292
             type = "integer")
314 293
         on.exit(close(Cov_sink), add = TRUE)
315 294
         sink_lock <- ipcid()
316 295
         on.exit(ipcremove(sink_lock), add = TRUE)
317 296
     }
297
+
298
+    # Apply .constructCountsFromSingleFile() to each file ----------------------
299
+
318 300
     # Set number of tasks to ensure the progress bar gives frequent updates.
319 301
     # NOTE: The progress bar increments once per task
320 302
     #       (https://github.com/Bioconductor/BiocParallel/issues/54).
... ...
@@ -323,8 +305,6 @@
323 305
     #       performance in this instance, and so we opt for a useful progress
324 306
     #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
325 307
     #       bptasks<-() method.
326
-    # TODO: Check that setting number of tasks doesn't affect things (e.g.,
327
-    #       the cost of transfering loci to the workers may be substantial).
328 308
     if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
329 309
         bptasks(BPPARAM) <- length(grid)
330 310
     }
... ...
@@ -332,42 +312,27 @@
332 312
         X = seq_along(grid),
333 313
         FUN = .constructCountsFromSingleFile,
334 314
         files = files,
335
-        strandCollapse = strandCollapse,
336 315
         loci = loci,
316
+        strandCollapse = strandCollapse,
337 317
         grid = grid,
338 318
         M_sink = M_sink,
339 319
         Cov_sink = Cov_sink,
340 320
         sink_lock = sink_lock,
341
-        verbose = verbose,
342
-        BPPARAM = BPPARAM,
343
-        is_zcat_available = is_zcat_available,
344
-        nThread = nThread))
321
+        verbose = subverbose,
322
+        BPPARAM = BPPARAM))
345 323
     if (!all(bpok(counts))) {
346
-        # TODO: This isn't yet properly impelemented
347
-        # TODO: Feels like stop() rather than warning() should be used, but
348
-        #       stop() doesn't allow for the return of partial results;
349
-        #       see https://support.bioconductor.org/p/109374/
350
-        warning("read.bismark() encountered errors: ",
351
-                sum(!bpok(counts)), " of ", length(counts),
352
-                " files failed.\n",
353
-                "read.bismark() has returned partial results, including ",
354
-                "errors, for debugging purposes.\n",
355
-                "It may be possible to re-run just these failed files.\n",
356
-                "See help(\"read.bismark\")",
357
-                call. = FALSE)
358
-        # NOTE: Return intermediate results as well as all derived variables.
359
-        return(list(counts = counts,
360
-                    M_sink = M_sink,
361
-                    Cov_sink = Cov_sink,
362
-                    BACKEND = BACKEND))
324
+        stop(".constructCounts() encountered errors for these files:\n  ",
325
+             paste(files[!bpok], collapse = "\n  "))
363 326
     }
364
-    # Construct M and Cov from results of
327
+
328
+    # Construct 'M' and 'Cov' --------------------------------------------------
329
+
365 330
     if (is.null(BACKEND)) {
366 331
         # Returning matrix objects.
367 332
         M <- do.call(c, lapply(counts, "[[", "M"))
368
-        attr(M, "dim") <- c(ans_nrow, ans_ncol)
333
+        attr(M, "dim") <- ans_dim
369 334
         Cov <- do.call(c, lapply(counts, "[[", "Cov"))
370
-        attr(Cov, "dim") <- c(ans_nrow, ans_ncol)
335
+        attr(Cov, "dim") <- ans_dim
371 336
     } else {
372 337
         # Returning DelayedMatrix objects.
373 338
         M <- as(M_sink, "DelayedArray")
... ...
@@ -382,55 +347,39 @@
382 347
 # TODO: If you have N cores available, are you better off using
383 348
 #       bpworkers() = N in the BPPARAM or nThread = N and use
384 349
 #       data.table::fread()? Or something in between?
385
-# TODO: Support passing a colData so that metadata is automatically added to
386
-#       samples? Yes, do this.
387
-# TODO: Document that `...` are used to pass filepath, chunkdim, level, etc. to
388
-#       HDF5RealizationSink().
389
-# TODO: (long term) Formalise `...` by something analogous to the
390
-#       BiocParallelParam class (RealizationSinkParam) i.e. something that
391
-#       encapsulates the various arguments available when constructing a
392
-#       RealizationSink.
393
-# TODO: sampleNames = basename(files) isn't guaranteed to be unique.
350
+# TODO: Allow user to determine `nThread`?
394 351
 # TODO: Document that you can't use strandCollapse with files that lack strand
395 352
 #       (e.g., .cov files).
396 353
 # TODO: (long term) Report a run-time warning/message if strandCollapse = TRUE
397 354
 #       is used in conjunction with files without strand information.
398 355
 read.bismark <- function(files,
399
-                         sampleNames = basename(files),
400 356
                          loci = NULL,
357
+                         colData = NULL,
401 358
                          rmZeroCov = FALSE,
402 359
                          strandCollapse = TRUE,
403
-                         verbose = TRUE,
404 360
                          BPPARAM = bpparam(),
405 361
                          BACKEND = getRealizationBackend(),
406
-                         nThread = 1L,
407
-                         ...) {
362
+                         dir = tempfile("BSseq"),
363
+                         replace = FALSE,
364
+                         chunkdim = NULL,
365
+                         level = NULL,
366
+                         verbose = getOption("verbose")) {
408 367
     # Argument checks ----------------------------------------------------------
409 368
 
410 369
     # Check 'files' is valid.
411
-    # TODO: Allow duplicate files? Useful in testing/debugging but generally a
412
-    #       bad idea.
413
-    if (anyDuplicated(files)) stop("'files' cannot have duplicate entires")
370
+    if (anyDuplicated(files)) {
371
+        stop("'files' cannot have duplicate entries.")
372
+    }
414 373
     file_exists <- file.exists(files)
415 374
     if (!isTRUE(all(file_exists))) {
416
-        stop("These files cannot be found:\n",
417
-             "  ", paste(files[!file_exists], collapse = "\n  "))
375
+        stop("These files cannot be found:\n  ",
376
+             paste(files[!file_exists], collapse = "\n  "))
418 377
     }
419
-    # Check 'sampleNames' is valid.
420
-    # TODO: Check if this is still required
421
-    # NOTE: SummarizedExperiment validator makes calls to identical() that will
422
-    #       fail due to how sampleNames are propagated sometimes with and
423
-    #       without names. To make life simpler, we simply strip the names.
424
-    sampleNames <- unname(sampleNames)
425
-    if (length(sampleNames) != length(files)) {
426
-        stop("'sampleNames' must have the same length as 'files'.")
378
+    guessed_file_types <- .guessBismarkFileType(files)
379
+    if (anyNA(guessed_file_types)) {
380
+        stop("Could not guess Bismark file type for these files:\n  ",
381
+             "  ", paste(files[!is.na(guessed_file_types)], collapse = "\n  "))
427 382
     }
428
-    if (anyDuplicated(sampleNames)) {
429
-        stop("'sampleNames' cannot have duplicate entires")
430
-    }
431
-    # Check 'rmZeroCov' and 'strandCollapse' are valid.
432
-    stopifnot(isTRUEorFALSE(rmZeroCov))
433
-    stopifnot(isTRUEorFALSE(strandCollapse))
434 383
     # Check 'loci' is valid.
435 384
     if (!is.null(loci)) {
436 385
         if (!is(loci, "GenomicRanges")) {
... ...
@@ -440,9 +389,18 @@ read.bismark <- function(files,
440 389
             stop("All elements of 'loci' must have width equal to 1.")
441 390
         }
442 391
     }
443
-    # Set verbosity used by internal functions
444
-    subverbose <- as.logical(max(verbose - 1L, 0L))
445
-    # Register 'BACKEND' and return to current value on exit
392
+    # Check 'colData' is valid.
393
+    if (is.null(colData)) {
394
+        colData <- DataFrame(row.names = files)
395
+    }
396
+    if (nrow(colData) != length(files)) {
397
+        stop("Supplied 'colData' must have nrow(colData) == length(files).")
398
+    }
399
+    # Check 'rmZeroCov' and 'strandCollapse' are valid.
400
+    stopifnot(isTRUEorFALSE(rmZeroCov))
401
+    stopifnot(isTRUEorFALSE(strandCollapse))
402
+    # Register 'BACKEND' and return to current value on exit.
403
+    # TODO: Is this strictly necessary?
446 404
     current_BACKEND <- getRealizationBackend()
447 405
     on.exit(setRealizationBackend(current_BACKEND), add = TRUE)
448 406
     setRealizationBackend(BACKEND)
... ...
@@ -460,38 +418,55 @@ read.bismark <- function(files,
460 418
             #       backends. If the realization backend is NULL then an
461 419
             #       ordinary matrix is returned rather than a matrix-backed
462 420
             #       DelayedMatrix.
463
-            stop("The '", BACKEND, "' realization backend is not supported.\n",
464
-                 "See help(\"BSmooth\") for details.",
421
+            stop("The '", BACKEND, "' realization backend is not supported.",
422
+                 "\n  See help(\"BSmooth\") for details.",
465 423
                  call. = FALSE)
466 424
         }
467 425
     }
426
+    # If using HDF5Array as BACKEND, check remaining options are sensible.
427
+    if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
428
+        # NOTE: Most of this copied from
429
+        #       HDF5Array::saveHDF5SummarizedExperiment().
430
+        if (!isSingleString(dir)) {
431
+            stop(wmsg("'dir' must be a single string specifying the path to ",
432
+                      "the directory where to save the BSseq object (the ",
433
+                      "directory will be created)."))
434
+        }
435
+        if (!isTRUEorFALSE(replace)) {
436
+            stop("'replace' must be TRUE or FALSE")
437
+        }
438
+        HDF5Array:::.create_dir(dir = dir, replace = replace)
439
+    }
440
+    # Set verbosity used by internal functions.
441
+    subverbose <- as.logical(max(verbose - 1L, 0L))
468 442
 
469 443
     # Construct FWGRanges with all valid loci ----------------------------------
470 444
 
471
-    # NOTE: "Valid loci" are those that remain after collapsing by strand (if
445
+    # NOTE: "Valid" loci are those that remain after collapsing by strand (if
472 446
     #       strandCollapse == TRUE) and then removing loci with zero coverage
473 447
     #       in all samples (if rmZeroCov == TRUE).
474 448
     if (is.null(loci)) {
475 449
         ptime1 <- proc.time()
476 450
         if (verbose) {
477
-            message("[read.bismark] Parsing files to construct valid loci ...")
451
+            message(
452
+                "[read.bismark] Parsing files and constructing valid loci ...")
478 453
         }
479 454
         loci <- .contructFWGRangesFromBismarkFiles(
480 455
             files = files,
481 456
             rmZeroCov = rmZeroCov,
482 457
             strandCollapse = strandCollapse,
483 458
             verbose = subverbose,
484
-            BPPARAM = BPPARAM,
485
-            is_zcat_available = is_zcat_available,
486
-            nThread = nThread)
459
+            BPPARAM = BPPARAM)
487 460
         ptime2 <- proc.time()
488 461
         stime <- (ptime2 - ptime1)[3]
489 462
         if (verbose) {
490
-            cat(sprintf("done in %.1f secs\n", stime))
463
+            message("Done in ", round(stime, 1), " secs")
491 464
         }
492 465
     } else {
493 466
         ptime1 <- proc.time()
494
-        if (verbose) message("[read.bismark] Using 'loci' as candidate loci")
467
+        if (verbose) {
468
+            message("[read.bismark] Using 'loci' as candidate loci.")
469
+        }
495 470
         if (strandCollapse) {
496 471
             if (verbose) {
497 472
                 message("[read.bismark] Collapsing strand of 'loci' ...")
... ...
@@ -501,7 +476,7 @@ read.bismark <- function(files,
501 476
             ptime2 <- proc.time()
502 477
             stime <- (ptime2 - ptime1)[3]
503 478
             if (verbose) {
504
-                cat(sprintf("done in %.1f secs\n", stime))
479
+                message("Done in ", round(stime, 1), " secs")
505 480
             }
506 481
         }
507 482
         if (rmZeroCov) {
... ...
@@ -516,15 +491,13 @@ read.bismark <- function(files,
516 491
                 rmZeroCov = rmZeroCov,
517 492
                 strandCollapse = strandCollapse,
518 493
                 verbose = subverbose,
519
-                BPPARAM = BPPARAM,
520
-                is_zcat_available = is_zcat_available,
521
-                nThread = nThread)
494
+                BPPARAM = BPPARAM)
522 495
             # Retain those elements of 'loci' with non-zero coverage.
523 496
             loci <- subsetByOverlaps(loci, loci_from_files, type = "equal")
524 497
             ptime2 <- proc.time()
525 498
             stime <- (ptime2 - ptime1)[3]
526 499
             if (verbose) {
527
-                cat(sprintf("done in %.1f secs\n", stime))
500
+                message("Done in ", round(stime, 1), " secs")
528 501
             }
529 502
         }
530 503
     }
... ...
@@ -532,51 +505,53 @@ read.bismark <- function(files,
532 505
     # Construct 'M' and 'Cov' matrices -----------------------------------------
533 506
     ptime1 <- proc.time()
534 507
     if (verbose) {
535
-        message("[read.bismark] Parsing files to construct 'M' and 'Cov' ",
508
+        message("[read.bismark] Parsing files and constructing 'M' and 'Cov' ",
536 509
                 "matrices ...")
537 510
     }
538 511
     counts <- .constructCounts(
539 512
         files = files,
540 513
         loci = loci,
541 514
         strandCollapse = strandCollapse,
542
-        verbose = subverbose,
543 515
         BPPARAM = BPPARAM,
544 516
         BACKEND = BACKEND,
545
-        is_zcat_available = is_zcat_available,
546
-        nThread = nThread,
547
-        ...)
517
+        dir = dir,
518
+        chunkdim = chunkdim,
519
+        level = level,
520
+        verbose = subverbose)
548 521
     ptime2 <- proc.time()
549 522
     stime <- (ptime2 - ptime1)[3]
550 523
     if (verbose) {
551
-        cat(sprintf("done in %.1f secs\n", stime))
524
+        message("Done in ", round(stime, 1), " secs")
552 525
     }
553 526
 
554
-    # Construct BSseq object ---------------------------------------------------
527
+    # Construct BSseq object, saving it if it is HDF5-backed -------------------
555 528
 
556 529
     if (verbose) {
557
-        cat(sprintf("[read.bismark] Constructing BSseq object ... "))
530
+        message("[read.bismark] Constructing BSseq object ... ")
558 531
     }
559 532
     se <- SummarizedExperiment(
560 533
         assays = counts,
561 534
         # NOTE: For now, have to use GRanges instance as rowRanges but
562 535
         #       ultimately would like to use a FWGRanges instance.
563 536
         rowRanges = as(loci, "GRanges"),
564
-        colData = DataFrame(row.names = sampleNames))
537
+        colData = colData)
565 538
     # TODO: Is there a way to use the internal constructor with `check = FALSE`?
566 539
     #       Don't need to check M and Cov because this has already happened
567 540
     #       when files were parsed.
568 541
     # .BSseq(se, trans = function(x) NULL, parameters = list())
569
-    new2("BSseq", se, check = FALSE)
542
+    bsseq <- new2("BSseq", se, check = FALSE)
543
+    if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
544
+        # NOTE: Save BSseq object; mimicing
545
+        #       HDF5Array::saveHDF5SummarizedExperiment().
546
+        x <- bsseq
547
+        x@assays <- HDF5Array:::.shorten_h5_paths(x@assays)
548
+        saveRDS(x, file = file.path(dir, "se.rds"))
549
+    }
550
+    bsseq
570 551
 }
571 552
 
572 553
 # TODOs ------------------------------------------------------------------------
573 554
 
574
-# TODO: Consolidate use of message()/cat()/etc.
575
-# TODO: Add function like minfi::read.metharray.sheet()?
576
-# TODO: Should BACKEND really be an argument of read.bismark(); see related
577
-#       issue on minfi repo https://github.com/hansenlab/minfi/issues/140
578
-# TODO: Think about naming scheme for functions. Try to have the function that
579
-#       is bpapply()-ed have a similar name to its parent.
580 555
 # TODO: Document internal functions for my own sanity. Also, some may be useful
581 556
 #       to users of bsseq (although I still won't export these for the time
582 557
 #       being).
... ...
@@ -589,5 +564,3 @@ read.bismark <- function(files,
589 564
 #       .cytosineReport files and don't want this behaviour (i.e. you want to
590 565
 #       retain strand) then you'll need to construct your own 'gr' and pass
591 566
 #       this to the function. Add unit tests for this behaviour.
592
-# TODO: Use verbose = getOption("verbose") as default.
593
-# TODO: Tidy up argument order of functions and default values.
... ...
@@ -5,21 +5,7 @@ test_that("read.bismark() works for 'coverage' file", {
5 5
     infile <- system.file("extdata", "test_data.fastq_bismark.bismark.cov.gz",
6 6
                           package = "bsseq")
7 7
     bsseq <- read.bismark(files = infile,
8
-                          sampleNames = "test_data",
9
-                          rmZeroCov = FALSE,
10
-                          strandCollapse = FALSE,
11
-                          verbose = FALSE)
12
-    expect_is(bsseq, "BSseq")
13
-})
14
-
15
-test_that("read.bismark() works if sampleNames has names", {
16
-    infile <- system.file("extdata", "test_data.fastq_bismark.bismark.cov.gz",
17
-                          package = "bsseq")
18
-    # Regression test
19
-    # Should not fail if sampleNames have names()
20
-    # TODO: Check why this test is necessary by stepping through read.bismark()
21
-    bsseq <- read.bismark(files = infile,
22
-                          sampleNames = c(test = "test_data"),
8
+                          colData = DataFrame(row.names = "test_data"),
23 9
                           rmZeroCov = FALSE,
24 10
                           strandCollapse = FALSE,
25 11
                           verbose = FALSE)
... ...
@@ -31,7 +17,6 @@ test_that("read.bismark() works for 'genome wide cytosine report' file", {
31 17
     infile <- system.file("extdata", "test_data.cytosineReport.gz",
32 18
                           package = "bsseq")
33 19
     bsseq <- read.bismark(files = infile,
34
-                          sampleNames = "test_data",
35 20
                           rmZeroCov = FALSE,
36 21
                           strandCollapse = FALSE,
37 22
                           verbose = FALSE)
... ...
@@ -42,7 +27,6 @@ test_that("read.bismark() works for 'genome wide cytosine report' file", {
42 27
 
43 28
     # Check that strandCollapse = TRUE works
44 29
     bsseq <- read.bismark(files = infile,
45
-                          sampleNames = "test_data",
46 30
                           rmZeroCov = FALSE,
47 31
                           strandCollapse = TRUE,
48 32
                           verbose = FALSE)