Browse code

Merge branch 'refactor-read.bismark' into refactor

Peter Hickey authored on 29/05/2018 12:56:37
Showing 3 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: bsseq
2
-Version: 1.17.0
2
+Version: 1.17.1
3 3
 Encoding: UTF-8
4 4
 Title: Analyze, manage and store bisulfite sequencing data
5 5
 Description: A collection of tools for analyzing and visualizing bisulfite
... ...
@@ -31,14 +31,15 @@ importMethodsFrom(GenomeInfoDb, "seqlengths", "seqlengths<-", "seqinfo", "seqinf
31 31
                   "seqnames", "seqnames<-", "seqlevels", "seqlevels<-")
32 32
 import(S4Vectors)
33 33
 importFrom(gtools, "combinations")
34
-importFrom(data.table, "fread", "setnames")
34
+# importFrom(data.table, "setnames", "setDT", "data.table", ":=", "setkey")
35
+import(data.table)
35 36
 importFrom(R.utils, "isGzipped", "gunzip")
36 37
 import(limma)
37 38
 importFrom(permute, "shuffleSet", "how")
38 39
 import(DelayedArray)
39 40
 import(BiocParallel)
40
-importFrom(Rcpp, "sourceCpp")
41
-
41
+importFrom(readr, "count_fields", "cols_only", "col_character", "col_integer",
42
+           "col_skip", "col_double", "read_tsv", "tokenizer_tsv")
42 43
 ##
43 44
 ## Exporting
44 45
 ##
... ...
@@ -1,153 +1,455 @@
1
+# Internal functions -----------------------------------------------------------
2
+
3
+.guessBismarkFileType <- function(files, n_max = 10L) {
4
+    guessed_file_types <- setNames(vector("character", length(files)), files)
5
+    for (file in files) {
6
+        n_fields <- count_fields(file, tokenizer_tsv(), n_max = n_max)
7
+        if (isTRUE(all(n_fields == 6L))) {
8
+            guessed_file_types[file] <- "cov"
9
+        } else if (isTRUE(all(n_fields == 7L))) {
10
+            guessed_file_types[file] <- "cytosineReport"
11
+        } else {
12
+            stop("Could not guess Bismark file type for '", file, "'")
13
+        }
14
+    }
15
+    guessed_file_types
16
+}
17
+
18
+.readBismark <- function(file,
19
+                         col_spec = c("BSseq", "GRanges", "all"),
20
+                         verbose = FALSE) {
21
+    col_spec <- match.arg(col_spec)
22
+    file_type <- .guessBismarkFileType(file)
23
+    if (file_type == "cov") {
24
+        col_names <- c("seqnames", "start", "end", "beta", "M", "U")
25
+        if (col_spec == "BSseq") {
26
+            cols <- cols_only(
27
+                seqnames = col_character(),
28
+                start = col_integer(),
29
+                end = col_skip(),
30
+                beta = col_skip(),
31
+                M = col_integer(),
32
+                U = col_integer())
33
+        } else if (col_spec == "GRanges") {
34
+            cols <- cols(
35
+                seqnames = col_character(),
36
+                start = col_integer(),
37
+                end = col_skip(),
38
+                beta = col_skip(),
39
+                M = col_skip(),
40
+                U = col_skip())
41
+        } else if (col_spec == "all") {
42
+            cols <- cols(
43
+                seqnames = col_character(),
44
+                start = col_integer(),
45
+                end = col_integer(),
46
+                beta = col_double(),
47
+                M = col_integer(),
48
+                U = col_integer())
49
+        }
50
+    } else if (file_type == "cytosineReport") {
51
+        col_names = c("seqnames", "start", "strand", "M", "U",
52
+                      "dinucleotide_context", "trinucleotide_context")
53
+        if (col_spec == "BSseq") {
54
+            cols <- cols_only(
55
+                seqnames = col_character(),
56
+                start = col_integer(),
57
+                strand = col_character(),
58
+                M = col_integer(),
59
+                U = col_integer(),
60
+                dinucleotide_context = col_skip(),
61
+                trinucleotide_context = col_skip())
62
+        } else if (col_spec == "GRanges") {
63
+            cols <- cols_only(
64
+                seqnames = col_character(),
65
+                start = col_integer(),
66
+                strand = col_character(),
67
+                M = col_skip(),
68
+                U = col_skip(),
69
+                dinucleotide_context = col_skip(),
70
+                trinucleotide_context = col_skip())
71
+        } else if (col_spec == "all") {
72
+            cols <- cols_only(
73
+                seqnames = col_character(),
74
+                start = col_integer(),
75
+                strand = col_character(),
76
+                M = col_integer(),
77
+                U = col_integer(),
78
+                dinucleotide_context = col_character(),
79
+                trinucleotide_context = col_character())
80
+        }
81
+    }
82
+    if (verbose) {
83
+        message("[.readBismark] Reading file '", file, "'")
84
+    }
85
+    ptime1 <- proc.time()
86
+    x <- read_tsv(
87
+        file = file,
88
+        col_names = col_names,
89
+        col_types = cols,
90
+        progress = FALSE)
91
+    ptime2 <- proc.time()
92
+    stime <- (ptime2 - ptime1)[3]
93
+    if (verbose) {
94
+        cat(sprintf("done in %.1f secs\n", stime))
95
+    }
96
+    x
97
+}
98
+
99
+.readBismarkAsBSseqDT <- function(file, rmZeroCov, strandCollapse, verbose) {
100
+    x <- .readBismark(file, "BSseq", verbose)
101
+    setDT(x)
102
+    # Data is unstranded if none is provided
103
+    # TODO: What's the data.table way to check if column exists and if create
104
+    #       it if it doesn't exist?
105
+    if (is.null(x[["strand"]])) {
106
+        x[, strand := "*"]
107
+    }
108
+    if (strandCollapse) {
109
+        x <- .strandCollapseDT(x, has_counts = TRUE)
110
+    }
111
+    if (rmZeroCov) {
112
+        return(x[(M + U) > 0])
113
+    }
114
+    setkey(x, seqnames, strand, start)
115
+    x
116
+}
117
+
118
+.constructSeqinfoFromLociDT <- function(loci_dt) {
119
+    seqnames <- loci_dt[, as.character(unique(seqnames))]
120
+    sortSeqlevels(Seqinfo(seqnames = seqnames))
121
+}
122
+
123
+.contructLociDTFromBismarkFiles <- function(files,
124
+                                            rmZeroCov,
125
+                                            strandCollapse,
126
+                                            seqinfo,
127
+                                            verbose) {
128
+    subverbose <- max(as.integer(verbose) - 1L, 0L)
129
+
130
+    # Initalise `loci_dt` using the first file.
131
+    if (verbose) {
132
+        message("[.contructLociDTFromBismarkFiles] Extracting loci from ",
133
+                "'", files[1L], "'")
134
+    }
135
+    loci_dt <- .readBismarkAsBSseqDT(
136
+        file = files[1L],
137
+        rmZeroCov = rmZeroCov,
138
+        strandCollapse = strandCollapse,
139
+        verbose = subverbose)
140
+    # Drop 'M' and 'U'
141
+    loci_dt[, c("M", "U") := .(NULL, NULL)]
142
+
143
+    # TODO: Do in batches in parallel (n_batches = mc.cores)
144
+    # Loop over remaining files
145
+    for (file in files[-1L]) {
146
+        if (verbose) {
147
+            message("[.contructLociDTFromBismarkFiles] Extracting loci from ",
148
+                    "'", file, "'")
149
+        }
150
+        # Process this file
151
+        loci_from_this_file_dt <- .readBismarkAsBSseqDT(
152
+            file = file,
153
+            rmZeroCov = rmZeroCov,
154
+            strandCollapse = strandCollapse,
155
+            verbose = subverbose)
156
+        # Drop 'M' and 'U'
157
+        loci_from_this_file_dt[, c("M", "U") := .(NULL, NULL)]
158
+
159
+        # Update `loci_dt`
160
+        loci_dt <- merge(loci_dt, loci_from_this_file_dt, all = TRUE)
161
+    }
162
+
163
+    # Construct seqinfo if none supplied
164
+    if (is.null(seqinfo)) {
165
+        seqinfo <- .constructSeqinfoFromLociDT(loci_dt)
166
+    }
167
+    # Sort the loci using the "natural order" of ordering the elements of a
168
+    # GenomicRanges object (see ?`sort,GenomicRanges-method`)
169
+    loci_dt[,
170
+            c("seqnames", "strand") :=
171
+                .(factor(seqnames, levels = seqlevels(seqinfo)),
172
+                  strand(strand))]
173
+    setkey(loci_dt, seqnames, strand, start)
174
+    loci_dt
175
+}
176
+
177
+.constructCountsFromBismarkFilesAndLociDT <- function(files,
178
+                                                      loci_dt,
179
+                                                      strandCollapse,
180
+                                                      mc.cores,
181
+                                                      BACKEND,
182
+                                                      verbose = FALSE) {
183
+    ans_nrow <- nrow(loci_dt)
184
+    ans_ncol <- length(files)
185
+    if (is.null(BACKEND)) {
186
+        M <- matrix(NA_integer_, nrow = ans_nrow, ncol = ans_ncol)
187
+        Cov <- matrix(NA_integer_, nrow = ans_nrow, ncol = ans_ncol)
188
+    } else {
189
+        # Set up intermediate RealizationSink objects of appropriate dimensions
190
+        # and type. These are ultimately coerced to the output DelayedMatrix
191
+        # objects, `M` and `U`.
192
+        # NOTE: Need to register the supplied BACKEND while being sure to
193
+        #       re-register any existing backend upon exiting the function.
194
+        current_BACKEND <- getRealizationBackend()
195
+        on.exit(setRealizationBackend(current_BACKEND))
196
+        setRealizationBackend(BACKEND)
197
+        # NOTE: Don't do `U_sink <- M_sink` or else these will reference the
198
+        #       same object and clobber each other when written to!
199
+        M_sink <- DelayedArray:::RealizationSink(
200
+            dim = c(ans_nrow, ans_ncol),
201
+            type = "integer")
202
+        on.exit(close(M_sink), add = TRUE)
203
+        Cov_sink <- DelayedArray:::RealizationSink(
204
+            dim = c(ans_nrow, ans_ncol),
205
+            type = "integer")
206
+        on.exit(close(Cov_sink), add = TRUE)
207
+        grid <- RegularArrayGrid(dim(M_sink), c(ans_nrow, 1L))
208
+    }
209
+
210
+    # TODO: Load as many files as safe according to block.size (if using HDF5,
211
+    #       otherwise load them all up)
212
+    for (k in seq_along(files)) {
213
+        file <- files[k]
214
+        if (verbose) {
215
+            message("[.constructCounts] Extracting counts from ", "'", file,
216
+                    "'")
217
+        }
218
+        bsseq_dt <- .readBismarkAsBSseqDT(
219
+            file = file,
220
+            rmZeroCov = FALSE,
221
+            strandCollapse = strandCollapse,
222
+            verbose = verbose)
223
+        # TODO: Need to use GenomicRanges' strand matching behaviour
224
+        counts <- bsseq_dt[loci_dt, c("M", "U"), nomatch = NA]
225
+        counts[is.na(M), M := 0L]
226
+        counts[is.na(U), U := 0L]
227
+        counts[, c("Cov", "U") := .(M + U, NULL)]
228
+
229
+        if (is.null(BACKEND)) {
230
+            M[, k] <- counts[["M"]]
231
+            Cov[, k] <- counts[["Cov"]]
232
+        } else {
233
+            viewport <- grid[[k]]
234
+            # TODO: Does as.matrix() cause a copy?
235
+            write_block_to_sink(
236
+                block = as.matrix(counts[["M"]]),
237
+                sink = M_sink,
238
+                viewport = viewport)
239
+            write_block_to_sink(
240
+                block = as.matrix(counts[["Cov"]]),
241
+                sink = Cov_sink,
242
+                viewport = viewport)
243
+        }
244
+    }
245
+    if (!is.null(BACKEND)) {
246
+        M <- as(M_sink, "DelayedArray")
247
+        Cov <- as(Cov_sink, "DelayedArray")
248
+    }
249
+    list(M = M, Cov = Cov)
250
+}
251
+
252
+.lociDTAsGRanges <- function(loci_dt, seqinfo = NULL) {
253
+    GRanges(
254
+        seqnames = loci_dt[["seqnames"]],
255
+        ranges = IRanges(loci_dt[["start"]], width = 1L),
256
+        strand = loci_dt[["strand"]],
257
+        seqinfo = seqinfo)
258
+}
259
+
260
+.grAsLociDT <- function(gr) {
261
+    data.table(
262
+        seqnames = as.factor(seqnames(gr)),
263
+        start = start(gr),
264
+        strand = as.factor(strand(gr)),
265
+        key = c("seqnames", "strand", "start"))
266
+}
267
+
268
+.strandCollapseDT <- function(x, has_counts = TRUE) {
269
+    # Shift loci on negative strand by 1 to the left
270
+    x[strand == "-", start := start - 1L]
271
+    # Unstrand all loci
272
+    x[, strand := "*"]
273
+    # Set key
274
+    setkey(x, seqnames, strand, start)
275
+    # TODO: Is by
276
+    if (has_counts) return(x[, .(M = sum(M), U = sum(U)), by = key(x)])
277
+    unique(x)
278
+}
279
+
280
+# Exported functions -----------------------------------------------------------
281
+
1 282
 read.bismark <- function(files,
2
-                         sampleNames,
283
+                         sampleNames = basename(files),
3 284
                          rmZeroCov = FALSE,
4 285
                          strandCollapse = TRUE,
5 286
                          fileType = c("cov", "oldBedGraph", "cytosineReport"),
287
+                         seqinfo = NULL,
288
+                         gr = NULL,
6 289
                          mc.cores = 1,
7 290
                          verbose = TRUE,
8 291
                          BACKEND = NULL) {
9
-    ## Argument checking
10
-    if (!is.null(BACKEND) && mc.cores > 1) {
11
-        stop("Currently, 'mc.cores' must be 1 if 'BACKEND' is not NULL")
292
+    # Argument checking
293
+    if (anyDuplicated(files)) stop("'files' cannot have duplicate entires")
294
+    file_exists <- file.exists(files)
295
+    if (!isTRUE(all(file_exists))) {
296
+        stop("These files cannot be found:\n",
297
+             "  ", paste(files[!file_exists], collapse = "\n  "))
12 298
     }
13
-    if (anyDuplicated(files)) {
14
-        stop("duplicate entries in 'files'")
299
+    # TODO: Check if this is still required
300
+    # NOTE: SummarizedExperiment validator makes calls to identical() that will
301
+    #       fail due to how sampleNames are propagated sometimes with and
302
+    #       without names. To make life simpler, we simply strip the names.
303
+    sampleNames <- unname(sampleNames)
304
+    if (length(sampleNames) != length(files)) {
305
+        stop("'sampleNames' must have the same length as 'files'.")
15 306
     }
16
-    if (length(sampleNames) != length(files) || anyDuplicated(sampleNames)) {
17
-        stop("argument 'sampleNames' has to have the same length as argument 'files', without duplicate entries")
307
+    if (anyDuplicated(sampleNames)) {
308
+        stop("'sampleNames' cannot have duplicate entires")
18 309
     }
19
-    fileType <- match.arg(fileType)
20
-    if (verbose) {
21
-        message(paste0("Assuming file type is ", fileType))
310
+    # TODO: What's the proper way to deprecate a function argument?
311
+    if (!missing(fileType)) {
312
+        warning("'fileType' is deprecated and ignored.")
22 313
     }
23
-    if (strandCollapse && (fileType == "cov" || fileType == "oldBedGraph")) {
24
-        stop("'strandCollapse' must be 'FALSE' if 'fileType' is '", fileType, "'")
314
+    if (!is.null(gr) && !is.null(seqinfo)) {
315
+        warning("'seqinfo' is ignored when 'gr' is supplied.")
25 316
     }
26
-    ## SummarizedExperiment validator makes calls to identical() that will fail
27
-    ## due to how sampleNames are propagated sometimes with and without names().
28
-    ## To make life simpler, we simply strip the names()
29
-    sampleNames <- unname(sampleNames)
317
+    subverbose <- max(as.integer(verbose) - 1L, 0L)
30 318
 
31
-    ## Process each file
32
-    idxes <- seq_along(files)
33
-    names(idxes) <- sampleNames
34
-    allOut <- mclapply(idxes, function(ii) {
319
+    # Construct a data.table and a GRanges with all "valid loci".
320
+    # NOTE: "Valid loci" are those that remain after collapsing by strand (if
321
+    #       strandCollapse == TRUE) and then removing loci with zero coverage
322
+    #       in all samples (if rmZeroCov == TRUE).
323
+    if (is.null(gr)) {
35 324
         if (verbose) {
36
-            cat(sprintf("[read.bismark] Reading file '%s' ... ", files[ii]))
325
+            message("[read.bismark] Constructing GRanges with valid loci ...")
37 326
         }
38 327
         ptime1 <- proc.time()
39
-        if (fileType == "cov" || fileType == "oldBedGraph") {
40
-            out <- read.bismarkCovRaw(thisfile = files[ii],
41
-                                      thisSampleName = sampleNames[ii],
42
-                                      rmZeroCov = rmZeroCov,
43
-                                      BACKEND = BACKEND)
44
-        } else if (fileType == "cytosineReport") {
45
-            out <- read.bismarkCytosineReportRaw(thisfile = files[ii],
46
-                                                 thisSampleName = sampleNames[ii],
47
-                                                 rmZeroCov = rmZeroCov,
48
-                                                 keepContext = FALSE,
49
-                                                 BACKEND = BACKEND)
328
+        loci_dt <- .contructLociDTFromBismarkFiles(
329
+            files = files,
330
+            rmZeroCov = rmZeroCov,
331
+            strandCollapse = strandCollapse,
332
+            seqinfo = seqinfo,
333
+            verbose = subverbose)
334
+        if (is.null(seqinfo)) {
335
+            seqinfo <- .constructSeqinfoFromLociDT(loci_dt)
50 336
         }
51
-        if (strandCollapse) {
52
-            out <- strandCollapse(out)
337
+        gr <- .lociDTAsGRanges(loci_dt, seqinfo)
338
+        ptime2 <- proc.time()
339
+        stime <- (ptime2 - ptime1)[3]
340
+        if (verbose) {
341
+            cat(sprintf("done in %.1f secs\n", stime))
53 342
         }
343
+    } else {
344
+        if (verbose) message("[read.bismark] Using 'gr' as GRanges with loci")
345
+        # NOTE: Don't use as.data.table() because it will modify gr by
346
+        #       reference (https://github.com/Rdatatable/data.table/issues/2278)
347
+        ptime1 <- proc.time()
348
+        loci_dt <- .grAsLociDT(gr)
54 349
         ptime2 <- proc.time()
55 350
         stime <- (ptime2 - ptime1)[3]
56 351
         if (verbose) {
57 352
             cat(sprintf("done in %.1f secs\n", stime))
58 353
         }
59
-        out
60
-    }, mc.cores = mc.cores)
354
+        if (strandCollapse) {
355
+            if (verbose) {
356
+                message("[read.bismark] Collapsing strand of loci in 'gr' ...")
357
+            }
358
+            ptime1 <- proc.time()
359
+            loci_dt <- .strandCollapseDT(loci_dt, has_counts = FALSE)
360
+            ptime2 <- proc.time()
361
+            stime <- (ptime2 - ptime1)[3]
362
+            if (verbose) {
363
+                cat(sprintf("done in %.1f secs\n", stime))
364
+            }
365
+        }
366
+        if (rmZeroCov) {
367
+            # NOTE: Have to parse files in order to identify loci with zero
368
+            #       coverage in all samples.
369
+            if (verbose) {
370
+                message("[read.bismark] Identifying loci in 'gr' with zero ",
371
+                        "coverage in all samples ...")
372
+            }
373
+            ptime1 <- proc.time()
374
+            loci_from_files_dt <- .contructLociDTFromBismarkFiles(
375
+                files = files,
376
+                rmZeroCov = rmZeroCov,
377
+                strandCollapse = strandCollapse,
378
+                seqinfo = seqinfo,
379
+                verbose = subverbose)
380
+            # Retain the intersection of loci_dt and loci_from_files_dt
381
+            # TODO: Need to use GenomicRanges' strand matching behaviour
382
+            loci_dt <- loci_from_files_dt[loci_dt]
383
+            ptime2 <- proc.time()
384
+            stime <- (ptime2 - ptime1)[3]
385
+            if (verbose) {
386
+                cat(sprintf("done in %.1f secs\n", stime))
387
+            }
388
+        }
389
+        if (strandCollapse || rmZeroCov) {
390
+            # NOTE: Have to update 'gr' if loci have been collapsed by strand
391
+            #       or loci have been filtered to remove loci with zero
392
+            #       coverage.
393
+            if (verbose) {
394
+                message("[read.bismark] Filtering 'gr' to retain valid loci ",
395
+                        "...")
396
+            }
397
+            ptime1 <- proc.time()
398
+            gr <- .lociDTAsGRanges(loci_dt, seqinfo)
399
+            ptime2 <- proc.time()
400
+            stime <- (ptime2 - ptime1)[3]
401
+            if (verbose) {
402
+                cat(sprintf("done in %.1f secs\n", stime))
403
+            }
404
+        }
405
+    }
61 406
 
407
+    # Construct 'M' and 'Cov' matrices
62 408
     if (verbose) {
63
-        cat(sprintf("[read.bismark] Joining samples ... "))
409
+        message("[read.bismark] Constructing 'M' and 'Cov' matrices ...")
64 410
     }
65 411
     ptime1 <- proc.time()
66
-    allOut <- combineList(allOut)
412
+    counts <- .constructCountsFromBismarkFilesAndLociDT(
413
+        files = files,
414
+        loci_dt = loci_dt,
415
+        strandCollapse = strandCollapse,
416
+        mc.cores = mc.cores,
417
+        BACKEND = BACKEND,
418
+        verbose = subverbose)
67 419
     ptime2 <- proc.time()
68
-    stime <- (ptime2 - ptime1)[3L]
420
+    stime <- (ptime2 - ptime1)[3]
69 421
     if (verbose) {
70 422
         cat(sprintf("done in %.1f secs\n", stime))
71 423
     }
72
-    allOut
73
-}
74 424
 
75
-read.bismarkCovRaw <- function(thisfile,
76
-                               thisSampleName,
77
-                               rmZeroCov,
78
-                               BACKEND = NULL) {
79
-
80
-    ## data.table::fread() can't read directly from a gzipped file so, if
81
-    ## necessary, gunzip the file to a temporary location.
82
-    if (isGzipped(thisfile)) {
83
-        thisfile <- gunzip(thisfile,
84
-                           temporary = TRUE,
85
-                           overwrite = TRUE,
86
-                           remove = FALSE)
87
-    }
88
-
89
-    ## Read in the file
90
-    out <- fread(thisfile)
91
-    if (ncol(out) != 6L) {
92
-        stop("unknown file format")
93
-    }
94
-
95
-    ## Create GRanges instance from 'out'
96
-    gr <- GRanges(seqnames = out[[1L]],
97
-                  ranges = IRanges(start = out[[2L]], width = 1L))
98
-
99
-    ## Create BSseq instance from 'out'
100
-    ## TODO: Might be able to avoid the as.matrix()
101
-    M <- as.matrix(out[[5L]])
102
-    Cov <- as.matrix(out[[5L]] + out[[6L]])
103
-    M <- realize(M, BACKEND = BACKEND)
104
-    Cov <- realize(Cov, BACKEND = BACKEND)
105
-    BSseq(gr = gr,
106
-          M = M,
107
-          Cov = Cov,
108
-          sampleNames = thisSampleName,
109
-          rmZeroCov = rmZeroCov)
425
+    # Construct BSseq object
426
+    if (verbose) {
427
+        cat(sprintf("[read.bismark] Constructing BSseq object ... "))
428
+    }
429
+    ptime1 <- proc.time()
430
+    # TODO: Use new_BSseq(), an internal, fast/minimal BSseq constructor
431
+    #       (this function needs to be written).
432
+    BSseq <- BSseq(
433
+        M = counts[["M"]],
434
+        Cov = counts[["Cov"]],
435
+        gr = gr,
436
+        sampleNames = sampleNames)
437
+    ptime2 <- proc.time()
438
+    stime <- (ptime2 - ptime1)[3]
439
+    if (verbose) {
440
+        cat(sprintf("done in %.1f secs\n", stime))
441
+    }
442
+    BSseq
110 443
 }
111 444
 
112
-read.bismarkCytosineReportRaw <- function(thisfile,
113
-                                          thisSampleName,
114
-                                          rmZeroCov,
115
-                                          keepContext = FALSE,
116
-                                          BACKEND = NULL) {
117
-
118
-    ## NOTE: keepContext not yet implemented
119
-    if (keepContext) {
120
-        stop("'keepContext' argument not yet implemented")
121
-    }
122
-
123
-    ## data.table::fread() can't read directly from a gzipped file so, if
124
-    ## necessary, gunzip the file to a temporary location.
125
-    if (isGzipped(thisfile)) {
126
-        thisfile <- gunzip(thisfile,
127
-                           temporary = TRUE,
128
-                           overwrite = TRUE,
129
-                           remove = FALSE)
130
-    }
131
-
132
-    ## Read in the file
133
-    out <- fread(thisfile)
134
-    if (ncol(out) != 7L) {
135
-        stop("unknown file format")
136
-    }
137
-
138
-    ## Create GRanges instance from 'out'
139
-    gr <- GRanges(seqnames = out[[1]],
140
-                  ranges = IRanges(start = out[[2]], width = 1),
141
-                  strand = out[[3]])
142
-
143
-    ## Create BSseq instance from 'out'
144
-    ## TODO: Might be able to avoid the as.matrix()
145
-    M <- as.matrix(out[[4L]])
146
-    Cov <- as.matrix(out[[4L]] + out[[5L]])
147
-    M <- realize(M, BACKEND = BACKEND)
148
-    Cov <- realize(Cov, BACKEND = BACKEND)
149
-    BSseq(gr = gr, sampleNames = thisSampleName,
150
-          M = M,
151
-          Cov = Cov,
152
-          rmZeroCov = rmZeroCov)
153
-}
445
+# TODOs ------------------------------------------------------------------------
446
+
447
+# TODO: The documentation needs a complete overhaul and checked against Bismark
448
+#       docs (e.g., the description of the .cov file is wrong)
449
+# TODO: Consolidate use of message()/cat()/etc.
450
+# TODO: mc.cores isn't used anywhere; can it be?
451
+# TODO: Add function like minfi::read.metharray.sheet()?
452
+# TODO: Should BACKEND really be an argument of read.bismark(); see related
453
+#       issue on minfi repo https://github.com/hansenlab/minfi/issues/140
454
+# TODO: May receive warning "In read_tokens_(data, tokenizer, col_specs, col_names,  ... : length of NULL cannot be changed". This is fixed in devel version of
455
+#       readr (https://github.com/tidyverse/readr/issues/833)