Browse code

Remove readr dependency

Found that `readr::read_tsv()` was painfully slow for large gzipped Bismark files (e.g., mCH loci).
Instead, now use:
- If file is gzipped and `gzip` is available at the command line, then use `data.table::fread()`.
- If file is gzipped and `gzip` is **not** available at the command line, then use `utils::read.table()`.
- If file is not gzipped (plain text) use `data.table::fread()`.

There are still some kinks to work out and tradeoffs to explore.

Peter Hickey authored on 04/07/2018 02:41:15
Showing 4 changed files

... ...
@@ -31,7 +31,6 @@ Imports:
31 31
     DelayedArray (>= 0.7.15),
32 32
     Rcpp,
33 33
     BiocParallel,
34
-    readr,
35 34
     BSgenome,
36 35
     Biostrings,
37 36
     utils,
... ...
@@ -42,11 +42,11 @@ importFrom(gtools, "combinations")
42 42
 #       e.g., shift(). If new ones are discovered, add them to this list.
43 43
 import(data.table, except = c(shift, first, second))
44 44
 importFrom(permute, "shuffleSet", "how")
45
-importFrom(readr, "cols", "cols_only", "col_character", "col_integer",
46
-           "col_skip", "col_double", "col_factor", "read_tsv", "tokenizer_tsv")
47 45
 importFrom(Biostrings, "DNAString", "vmatchPattern", "reverseComplement")
48 46
 importFrom(utils, "read.delim")
49 47
 importFrom(BSgenome, "vmatchPattern")
48
+importFrom(tools, "file_path_as_absolute")
49
+importFrom(R.utils, "isGzipped")
50 50
 
51 51
 ##
52 52
 ## Exporting
... ...
@@ -217,6 +217,7 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
217 217
 #       the smallest returned object (albeit at the small cost of a sort).
218 218
 .readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
219 219
                                     strandCollapse = FALSE, sort = TRUE,
220
+                                    is_zcat_available = TRUE, nThread = 1L,
220 221
                                     verbose = FALSE) {
221 222
     # Quieten R CMD check about 'no visible binding for global variable' -------
222 223
     M <- U <- NULL
... ...
@@ -227,6 +228,8 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
227 228
             file = file,
228 229
             col_spec = "BSseq",
229 230
             check = TRUE,
231
+            is_zcat_available = is_zcat_available,
232
+            nThread = nThread,
230 233
             verbose = verbose)
231 234
         if (strandCollapse && !is.null(dt[["strand"]]) &&
232 235
             !dt[, all(strand == "*")]) {
... ...
@@ -245,6 +248,8 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
245 248
             file = file,
246 249
             col_spec = "GRanges",
247 250
             check = FALSE,
251
+            is_zcat_available = is_zcat_available,
252
+            nThread = nThread,
248 253
             verbose = verbose)
249 254
         if (strandCollapse && !is.null(dt[["strand"]]) &&
250 255
             !dt[, all(strand == "*")]) {
... ...
@@ -298,7 +303,9 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
298 303
                                                rmZeroCov,
299 304
                                                strandCollapse,
300 305
                                                verbose,
301
-                                               BPPARAM) {
306
+                                               BPPARAM,
307
+                                               is_zcat_available,
308
+                                               nThread) {
302 309
     subverbose <- max(as.integer(verbose) - 1L, 0L)
303 310
 
304 311
     # TODO: Instead of using the 'largest' file, use the largest
... ...
@@ -331,6 +338,8 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
331 338
         file = files[[1L]],
332 339
         rmZeroCov = rmZeroCov,
333 340
         strandCollapse = strandCollapse,
341
+        is_zcat_available = is_zcat_available,
342
+        nThread = nThread,
334 343
         verbose = subverbose)
335 344
     # Identify loci not found in first file.
336 345
     # TODO: Pre-process loci as a GNCList?
... ...
@@ -357,7 +366,8 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
357 366
                 file = file,
358 367
                 rmZeroCov = rmZeroCov,
359 368
                 strandCollapse = strandCollapse,
360
-                verbose = subverbose)
369
+                verbose = subverbose,
370
+                is_zcat_available = is_zcat_available)
361 371
             subsetByOverlaps(
362 372
                 x = loci_from_this_file,
363 373
                 ranges = loci_from_first_file,
... ...
@@ -4,14 +4,10 @@
4 4
 #       and more accurate heuristic to avoid 'passing' bad files.
5 5
 # TODO: Test for 'bismark_methylation_extractor' and 'bedGraph' formats
6 6
 #       (https://github.com/FelixKrueger/Bismark/tree/master/Docs#output-1)
7
+#       and error out (they're not supported by .readBismaskAsDT()).
7 8
 .guessBismarkFileType <- function(files, n_max = 10L) {
8 9
     guessed_file_types <- setNames(vector("character", length(files)), files)
9 10
     for (file in files) {
10
-        # NOTE: Not using readr::count_fields() because it is very slow on
11
-        #       large, compressed files. This is because it reads the entire
12
-        #       file into memory as a raw vector regardless of the value of
13
-        #       `n_max` (see https://github.com/tidyverse/readr/issues/610).
14
-        #       But good old utils::read.delim() doesn't have this limitation!
15 11
         x <- read.delim(
16 12
             file = file,
17 13
             header = FALSE,
... ...
@@ -29,92 +25,65 @@
29 25
     guessed_file_types
30 26
 }
31 27
 
32
-# TODO: (long term) Choose between utils::read.delim(), readr::read_tsv(), and
33
-#       data.table::fread() based on 'file'. If plain text, use fread(). If a
34
-#       compressed file, use readr::read_tsv() if available, otherwise
35
-#       utils::read_delim(). Longer term, combine data.table::fread() with
36
-#       shell commands (where available) to pass compressed files. Will need to
37
-#       be careful of the interaction between BPPARAM and fread()'s nThread.
38
-#       Once implemented, move readr to Suggests. Finally, allow user to
39
-#       specify which function to use.
40 28
 # NOTE: In brief  benchmarking, readr::read_csv() is ~1.3-1.6x faster than
41 29
 #       utils::read.delim() when reading a gzipped file, albeit it with ~1.6-2x
42 30
 #       more total memory allocated. Therefore, there may be times users prefer
43 31
 #       to trade off faster speed for lower memory usage.
44
-# TODO: (long term) Formalise these benchmarks as a document in the bsseq
32
+# TODO: (long term) Formalise benchmarks of utils::read.table(),
33
+#       data.table::fread(), and readr::read_tsv() as a document in the bsseq
45 34
 #       package so that we can readily re-visit these as needed.
35
+# TODO: Add `showProgress` instead of using `verbose` twice?
46 36
 .readBismarkAsDT <- function(file,
47 37
                              col_spec = c("all", "BSseq", "GRanges"),
48 38
                              check = FALSE,
39
+                             is_zcat_available = TRUE,
40
+                             nThread = 1L,
49 41
                              verbose = FALSE,
50 42
                              ...) {
51
-    # Quieten R CMD check about 'no visible binding for global variable' -------
52
-    M <- U <- NULL
53
-
54
-    # Construct the column spec ------------------------------------------------
43
+    # Check arguments ----------------------------------------------------------
55 44
 
45
+    file <- file_path_as_absolute(file)
56 46
     col_spec <- match.arg(col_spec)
57 47
     file_type <- .guessBismarkFileType(file)
58
-    # TODO: Test for 'bismark_methylation_extractor' and 'bedGraph' formats,
59
-    #       and error out (they're not supported).
60 48
     stopifnot(isTRUEorFALSE(check))
49
+    stopifnot(isTRUEorFALSE(is_zcat_available))
50
+    subverbose <- as.logical(max(verbose - 1L, 0L))
51
+
52
+    # Construct the column spec ------------------------------------------------
53
+
61 54
     if (file_type == "cov") {
62
-        col_names <- c("seqnames", "start", "end", "beta", "M", "U")
63 55
         if (col_spec == "BSseq") {
64
-            cols <- cols_only(
65
-                seqnames = col_factor(levels = NULL),
66
-                start = col_integer(),
67
-                end = col_skip(),
68
-                beta = col_skip(),
69
-                M = col_integer(),
70
-                U = col_integer())
56
+            colClasses <- c("factor", "integer", "NULL", "NULL", "integer",
57
+                            "integer")
58
+            drop <- c(3L, 4L)
59
+            col.names <- c("seqnames", "start", "M", "U")
71 60
         } else if (col_spec == "GRanges") {
72
-            cols <- cols(
73
-                seqnames = col_factor(levels = NULL),
74
-                start = col_integer(),
75
-                end = col_skip(),
76
-                beta = col_skip(),
77
-                M = col_skip(),
78
-                U = col_skip())
61
+            colClasses <- c("factor", "integer", "NULL", "NULL", "NULL", "NULL")
62
+            drop <- c(3L, 4L, 5L, 6L)
63
+            col.names <- c("seqnames", "start")
79 64
         } else if (col_spec == "all") {
80
-            cols <- cols(
81
-                seqnames = col_factor(levels = NULL),
82
-                start = col_integer(),
83
-                end = col_integer(),
84
-                beta = col_double(),
85
-                M = col_integer(),
86
-                U = col_integer())
65
+            colClasses <- c("factor", "integer", "integer", "numeric",
66
+                            "integer", "integer")
67
+            drop <- integer(0L)
68
+            col.names <- c("seqnames", "start", "end", "beta", "M", "U")
87 69
         }
88 70
     } else if (file_type == "cytosineReport") {
89
-        col_names = c("seqnames", "start", "strand", "M", "U",
90
-                      "dinucleotide_context", "trinucleotide_context")
91 71
         if (col_spec == "BSseq") {
92
-            cols <- cols_only(
93
-                seqnames = col_factor(levels = NULL),
94
-                start = col_integer(),
95
-                strand = col_factor(levels(strand())),
96
-                M = col_integer(),
97
-                U = col_integer(),
98
-                dinucleotide_context = col_skip(),
99
-                trinucleotide_context = col_skip())
72
+            colClasses <- c("factor", "integer", "factor", "integer",
73
+                            "integer", "NULL", "NULL")
74
+            drop <- c(6L, 7L)
75
+            col.names <- c("seqnames", "start", "strand", "M", "U")
100 76
         } else if (col_spec == "GRanges") {
101
-            cols <- cols_only(
102
-                seqnames = col_factor(levels = NULL),
103
-                start = col_integer(),
104
-                strand = col_factor(levels(strand())),
105
-                M = col_skip(),
106
-                U = col_skip(),
107
-                dinucleotide_context = col_skip(),
108
-                trinucleotide_context = col_skip())
77
+            colClasses <- c("factor", "integer", "factor", "NULL", "NULL",
78
+                            "NULL", "NULL")
79
+            drop <- c(4L, 5L, 6L, 7L)
80
+            col.names <- c("seqnames", "start", "strand")
109 81
         } else if (col_spec == "all") {
110
-            cols <- cols_only(
111
-                seqnames = col_factor(levels = NULL),
112
-                start = col_integer(),
113
-                strand = col_factor(levels(strand())),
114
-                M = col_integer(),
115
-                U = col_integer(),
116
-                dinucleotide_context = col_character(),
117
-                trinucleotide_context = col_character())
82
+            colClasses <- c("factor", "integer", "factor", "integer",
83
+                            "integer", "factor", "factor")
84
+            drop <- integer(0L)
85
+            col.names <- c("seqnames", "start", "strand", "M", "U",
86
+                           "dinucleotide_context", "trinucleotide_context")
118 87
         }
119 88
     }
120 89
 
... ...
@@ -124,37 +93,78 @@
124 93
         message("[.readBismarkAsDT] Reading file '", file, "'")
125 94
     }
126 95
     ptime1 <- proc.time()
127
-    x <- read_tsv(
128
-        file = file,
129
-        col_names = col_names,
130
-        col_types = cols,
131
-        na = character(),
132
-        quoted_na = FALSE,
133
-        progress = verbose,
134
-        ...)
96
+    # TODO: Make copy if isGzipped() to avoid dependency on R.utils?
97
+    if (isGzipped(file)) {
98
+        sysname <- Sys.info()[["sysname"]]
99
+        if (sysname == "Linux") {
100
+            input <- sprintf("zcat %s", shQuote(file))
101
+        } else if (sysname == "Darwin") {
102
+            # macOS/OS X needs a different command; see
103
+            # https://github.com/Rdatatable/data.table/issues/717#issuecomment-140028670
104
+            input <- sprintf("zcat < %s", shQuote(file))
105
+        } else {
106
+            # TODO: Support other OSs (e.g, Windows, sunOS)
107
+            warning(
108
+                "Unable to find 'zcat' for use with 'data.table::fread()'.\n",
109
+                "Falling back to 'utils::read.table()'.")
110
+            is_zcat_available <- FALSE
111
+        }
112
+        # TODO: Gracefully handle situation where user incorrectly sets
113
+        #       `is_zcat_available = TRUE` and it consequently fails.
114
+        if (is_zcat_available) {
115
+            x <- fread(
116
+                input = input,
117
+                sep = "\t",
118
+                header = FALSE,
119
+                verbose = subverbose,
120
+                drop = drop,
121
+                colClasses = colClasses,
122
+                col.names = col.names,
123
+                # TODO: Check remainder of these arguments are optimal.
124
+                # TODO: Add `key` argument? Check other arguments available in
125
+                #       fread().
126
+                quote = "",
127
+                strip.white = FALSE,
128
+                showProgress = as.logical(verbose),
129
+                nThread = nThread)
130
+        } else {
131
+            x <- read.table(
132
+                file = file,
133
+                header = FALSE,
134
+                sep = "\t",
135
+                quote = "",
136
+                col.names = col.names,
137
+                colClasses = colClasses,
138
+                strip.white = FALSE)
139
+            setDT(x)
140
+        }
141
+    } else {
142
+        x <- fread(
143
+            file = file,
144
+            sep = "\t",
145
+            header = FALSE,
146
+            verbose = subverbose,
147
+            drop = drop,
148
+            colClasses = colClasses,
149
+            col.names = col.names,
150
+            # TODO: Check remainder of these arguments are optimal.
151
+            quote = "",
152
+            strip.white = FALSE,
153
+            showProgress = as.logical(verbose),
154
+            nThread = nThread)
155
+    }
135 156
 
136 157
     # Construct the result -----------------------------------------------------
137 158
 
138
-    x <- setDT(x)
159
+    if (!is.null(x[["strand"]])) {
160
+        x[, strand := strand(strand)]
161
+    }
162
+    # NOTE: Quieten R CMD check about 'no visible binding for global variable'.
163
+    M <- U <- NULL
139 164
     if (check && all(c("M", "U") %in% colnames(x))) {
140 165
         if (verbose) {
141 166
             message("[.readBismarkAsDT] Checking validity of counts in file.")
142 167
         }
143
-        # NOTE: .checkMandCov() only accepts matrix input
144
-        # M <- as.matrix(x[["M"]])
145
-        # Cov <- as.matrix(x[["M"]] + x[["U"]])
146
-        # msg <- .checkMandCov(M, Cov)
147
-        # if (!is.null(msg)) {
148
-        #     stop(msg)
149
-        # } else {
150
-        #     if (verbose) {
151
-        #         message("[.readBismarkAsDT] Counts in file are valid!")
152
-        #     }
153
-        # }
154
-        # TODO: Write .checkMandU()? Should be written to avoid copying, use a
155
-        #       low amount of memory, stop as soon as an error is detected,
156
-        #       and be check the exact same stuff as .checkMandCov().
157
-        # TODO: Benchmark the below implementation of .checkMandU().
158 168
         valid <- x[, isTRUE(all(M >= 0L & U >= 0L))]
159 169
         if (!valid) {
160 170
             stop("[.readBismarkAsDT] Invalid counts detected!\n",
... ...
@@ -173,10 +183,10 @@
173 183
     x
174 184
 }
175 185
 
176
-
177 186
 .constructCountsFromSingleFile <- function(b, files, strandCollapse, loci,
178 187
                                            grid, M_sink, Cov_sink, sink_lock,
179
-                                           verbose, BPPARAM) {
188
+                                           verbose, BPPARAM,
189
+                                           is_zcat_available, nThread) {
180 190
 
181 191
     # Quieten R CMD check about 'no visible binding for global variable' -------
182 192
     M <- U <- NULL
... ...
@@ -192,6 +202,8 @@
192 202
         file = file,
193 203
         col_spec = "BSseq",
194 204
         check = TRUE,
205
+        is_zcat_available = is_zcat_available,
206
+        nThread = nThread,
195 207
         verbose = verbose)
196 208
     # NOTE: Can only collapse by strand if the data are stranded!
197 209
     if (strandCollapse && !is.null(dt[["strand"]])) {
... ...
@@ -245,7 +257,7 @@
245 257
 }
246 258
 
247 259
 .constructCounts <- function(files, loci, strandCollapse, verbose, BPPARAM,
248
-                             BACKEND, ...) {
260
+                             BACKEND, is_zcat_available, nThread, ...) {
249 261
     # Set up ArrayGrid so that each block contains data for a single sample.
250 262
     ans_nrow <- length(loci)
251 263
     ans_ncol <- length(files)
... ...
@@ -322,7 +334,9 @@
322 334
         Cov_sink = Cov_sink,
323 335
         sink_lock = sink_lock,
324 336
         verbose = verbose,
325
-        BPPARAM = BPPARAM))
337
+        BPPARAM = BPPARAM,
338
+        is_zcat_available = is_zcat_available,
339
+        nThread = nThread))
326 340
     if (!all(bpok(counts))) {
327 341
         # TODO: This isn't yet properly impelemented
328 342
         # TODO: Feels like stop() rather than warning() should be used, but
... ...
@@ -384,6 +398,8 @@ read.bismark <- function(files,
384 398
                          verbose = TRUE,
385 399
                          BPPARAM = bpparam(),
386 400
                          BACKEND = getRealizationBackend(),
401
+                         is_zcat_available = TRUE,
402
+                         nThread = 1L,
387 403
                          ...,
388 404
                          fileType = c("cov", "oldBedGraph", "cytosineReport"),
389 405
                          mc.cores = 1) {
... ...
@@ -482,7 +498,9 @@ read.bismark <- function(files,
482 498
             rmZeroCov = rmZeroCov,
483 499
             strandCollapse = strandCollapse,
484 500
             verbose = subverbose,
485
-            BPPARAM = BPPARAM)
501
+            BPPARAM = BPPARAM,
502
+            is_zcat_available = is_zcat_available,
503
+            nThread = nThread)
486 504
         ptime2 <- proc.time()
487 505
         stime <- (ptime2 - ptime1)[3]
488 506
         if (verbose) {
... ...
@@ -515,7 +533,9 @@ read.bismark <- function(files,
515 533
                 rmZeroCov = rmZeroCov,
516 534
                 strandCollapse = strandCollapse,
517 535
                 verbose = subverbose,
518
-                BPPARAM = BPPARAM)
536
+                BPPARAM = BPPARAM,
537
+                is_zcat_available = is_zcat_available,
538
+                nThread = nThread)
519 539
             # Retain those elements of 'loci' with non-zero coverage.
520 540
             loci <- subsetByOverlaps(loci, loci_from_files, type = "equal")
521 541
             ptime2 <- proc.time()
... ...
@@ -539,6 +559,8 @@ read.bismark <- function(files,
539 559
         verbose = subverbose,
540 560
         BPPARAM = BPPARAM,
541 561
         BACKEND = BACKEND,
562
+        is_zcat_available = is_zcat_available,
563
+        nThread = nThread,
542 564
         ...)
543 565
     ptime2 <- proc.time()
544 566
     stime <- (ptime2 - ptime1)[3]
... ...
@@ -570,9 +592,6 @@ read.bismark <- function(files,
570 592
 # TODO: Add function like minfi::read.metharray.sheet()?
571 593
 # TODO: Should BACKEND really be an argument of read.bismark(); see related
572 594
 #       issue on minfi repo https://github.com/hansenlab/minfi/issues/140
573
-# TODO: May receive warning "In read_tokens_(data, tokenizer, col_specs,
574
-#       col_names,  ... : length of NULL cannot be changed". This is fixed in
575
-#       devel version of readr (https://github.com/tidyverse/readr/issues/833).
576 595
 # TODO: Think about naming scheme for functions. Try to have the function that
577 596
 #       is bpapply()-ed have a similar name to its parent.
578 597
 # TODO: Document internal functions for my own sanity. Also, some may be useful
... ...
@@ -588,3 +607,4 @@ read.bismark <- function(files,
588 607
 #       retain strand) then you'll need to construct your own 'gr' and pass
589 608
 #       this to the function. Add unit tests for this behaviour.
590 609
 # TODO: Use verbose = getOption("verbose") as default.
610
+# TODO: Tidy up argument order of functions and default values.