
Split out R/read.bismark.R

Slightly formalise intermediate objects lociDT and BSseqDT.
Split these out into their own files.
Refine construction of 'gr' from Bismark files.

Peter Hickey authored on 31/05/2018 19:23:24
Showing 5 changed files

... ...
@@ -65,6 +65,8 @@ Collate:
65 65
     hdf5_utils.R
66 66
     DelayedArray_utils.R
67 67
     collapseBSseq.R
68
+    BSseqDT.R
69
+    lociDT.R
68 70
 License: Artistic-2.0
69 71
 VignetteBuilder: knitr
70 72
 URL: https://github.com/kasperdanielhansen/bsseq
... ...
@@ -32,14 +32,16 @@ importMethodsFrom(GenomeInfoDb, "seqlengths", "seqlengths<-", "seqinfo", "seqinf
32 32
 import(S4Vectors)
33 33
 importFrom(gtools, "combinations")
34 34
 # importFrom(data.table, "setnames", "setDT", "data.table", ":=", "setkey")
35
+# TODO: More careful use of imports to avoid clashes (e.g., S4Vectors::shift()
36
+#       and data.table::shift()).
35 37
 import(data.table)
36 38
 importFrom(R.utils, "isGzipped", "gunzip")
37 39
 import(limma)
38 40
 importFrom(permute, "shuffleSet", "how")
39 41
 import(DelayedArray)
40 42
 import(BiocParallel)
41
-importFrom(readr, "count_fields", "cols_only", "col_character", "col_integer",
42
-           "col_skip", "col_double", "read_tsv", "tokenizer_tsv")
43
+importFrom(readr, "cols", "cols_only", "col_character", "col_integer",
44
+           "col_skip", "col_double", "col_factor", "read_tsv", "tokenizer_tsv")
43 45
 ##
44 46
 ## Exporting
45 47
 ##
46 48
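
For illustration (not part of this commit), the clash flagged in the TODO above can be sidestepped by qualifying the calls; the TODO mentions S4Vectors::shift(), while the Bioconductor method below is reached through IRanges, and the example values are arbitrary:

library(IRanges)     # provides a Bioconductor shift() method
library(data.table)  # also exports shift(); whichever is attached/imported last masks the other

IRanges::shift(IRanges(1, 5), 2L)   # IRanges of [3, 7]
data.table::shift(1:5, n = 1L)      # c(NA, 1, 2, 3, 4)
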
new file mode 100644
... ...
@@ -0,0 +1,48 @@
1
+# Internal functions -----------------------------------------------------------
2
+
3
+.strandCollapseBSseqDT <- function(bsseq_dt, by_ref = FALSE) {
4
+    if (bsseq_dt[, all(strand == "*")]) return(bsseq_dt)
5
+    if (!by_ref) {
6
+        bsseq_dt <- copy(bsseq_dt)
7
+    }
8
+    # Shift loci on negative strand by 1 to the left
9
+    bsseq_dt[strand == "-", start := start - 1L]
10
+    # Unstrand all loci
11
+    bsseq_dt[, strand := strand("*")]
12
+    # Aggregate counts
13
+    bsseq_dt[, .(M = sum(M), U = sum(U)), by = c("seqnames", "strand", "start")]
14
+}
15
+
16
+
17
+.readBismarkAsBSseqDT <- function(file, rmZeroCov = FALSE,
18
+                                  strandCollapse = FALSE, check = TRUE,
19
+                                  verbose = FALSE) {
20
+    dt <- .readBismarkAsDT(
21
+        file = file,
22
+        col_spec = "BSseq",
23
+        check = check,
24
+        verbose = verbose)
25
+    # Data are treated as unstranded if no strand column is provided
26
+    # TODO: What's the data.table way to check if a column exists and to
27
+    #       create it if it doesn't?
28
+    if (is.null(dt[["strand"]])) {
29
+        dt[, strand := strand("*")]
30
+    }
31
+    if (strandCollapse) {
32
+        dt <- .strandCollapseBSseqDT(dt)
33
+    }
34
+    if (rmZeroCov) {
35
+        return(dt[(M + U) > 0])
36
+    }
37
+    dt
38
+}
39
+
40
+# TODOs ------------------------------------------------------------------------
41
+
42
+# TODO: Document internal functions for my own sanity. Also, some may be useful
43
+#       to users of bsseq (although I still won't export these for the time
44
+#       being).
45
+# TODO: (long term) Formalise 'lociDT' and 'BSseqDT' concepts as S4 classes
46
+#       (e.g., lociDT could be a concrete subclass of GenomicRanges). It also
47
+#       seems worth re-visiting the FWGenomicRanges idea (fixed-width GenomicRanges
48
+#       class), of which 'lociDT' would be a concrete subclass.
0 49
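
As a toy illustration (not part of the diff) of the strand-collapsing logic above: a CpG reported on both strands is shifted onto its forward-strand C and the counts are summed. Column names follow the 'BSseqDT' layout; the values are made up.

library(data.table)

bsseq_dt <- data.table(
    seqnames = c("chr1", "chr1"),
    strand   = c("+", "-"),
    start    = c(100L, 101L),   # the same CpG, reported on both strands
    M        = c(3L, 2L),
    U        = c(1L, 4L))

# Shift loci on the negative strand 1 bp to the left, unstrand, then sum counts.
bsseq_dt[strand == "-", start := start - 1L]
bsseq_dt[, strand := "*"]
bsseq_dt[, .(M = sum(M), U = sum(U)), by = c("seqnames", "strand", "start")]
#    seqnames strand start M U
# 1:     chr1      *   100 5 5
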
new file mode 100644
... ...
@@ -0,0 +1,199 @@
1
+# Internal functions -----------------------------------------------------------
2
+
3
+# TODO: Think through the logic of the 'sort' argument: where is it needed, and
4
+#       should it be part of the constructor?
5
+.constructSeqinfoFromLociDT <- function(loci_dt, sort = TRUE) {
6
+    unique_seqnames <- loci_dt[, as.character(unique(seqnames))]
7
+    si <- Seqinfo(seqnames = unique_seqnames)
8
+    if (sort) return(sortSeqlevels(si))
9
+    si
10
+}
11
+
12
+.lociDTAsGRanges <- function(loci_dt, seqinfo = NULL) {
13
+    GRanges(
14
+        seqnames = loci_dt[["seqnames"]],
15
+        ranges = IRanges(loci_dt[["start"]], width = 1L),
16
+        strand = loci_dt[["strand"]],
17
+        seqinfo = seqinfo)
18
+}
19
+
20
+.grAsLociDT <- function(gr) {
21
+    # NOTE: Don't use as.data.table(gr) because it will modify gr by
22
+    #       reference (https://github.com/Rdatatable/data.table/issues/2278)
23
+    data.table(
24
+        seqnames = as.factor(seqnames(gr)),
25
+        start = start(gr),
26
+        strand = as.factor(strand(gr)),
27
+        key = c("seqnames", "strand", "start"))
28
+}
29
+
30
+.strandCollapseLociDT <- function(loci_dt, by_ref = FALSE) {
31
+    if (loci_dt[, all(strand == "*")]) return(loci_dt)
32
+    if (!by_ref) {
33
+        loci_dt <- copy(loci_dt)
34
+    }
35
+    # Shift loci on negative strand by 1 to the left
36
+    loci_dt[strand == "-", start := start - 1L]
37
+    # Unstrand all loci
38
+    loci_dt[, strand := strand("*")]
39
+    # data.table:::funique(loci_dt)
40
+    unique(loci_dt, by = c("seqnames", "strand", "start"))
41
+}
42
+
43
+.readBismarkAsLociDT <- function(file, rmZeroCov = FALSE,
44
+                                 strandCollapse = FALSE, verbose = FALSE) {
45
+    if (!rmZeroCov) {
46
+        dt <- .readBismarkAsDT(
47
+            file = file,
48
+            col_spec = "GRanges",
49
+            check = FALSE,
50
+            verbose = verbose)
51
+        if (is.null(dt[["strand"]])) {
52
+            dt[, strand := strand("*")]
53
+        }
54
+        if (strandCollapse) {
55
+            return(.strandCollapseLociDT(dt))
56
+        } else {
57
+            return(dt)
58
+        }
59
+    } else {
60
+        # Require M and U to remove loci with zero coverage
61
+        dt <- .readBismarkAsBSseqDT(
62
+            file = file,
63
+            rmZeroCov = rmZeroCov,
64
+            strandCollapse = strandCollapse,
65
+            check = FALSE,
66
+            verbose = verbose)
67
+        # Drop 'M' and 'U'
68
+        dt[, c("M", "U") := .(NULL, NULL)]
69
+    }
70
+}
71
+
72
+# TODO: Allow the passing of query_seqinfo and subject_seqinfo if they are
73
+#       already available.
74
+.findOverlaps_lociDT <- function(query, subject, maxgap = -1L, minoverlap = 0L,
75
+                                 type = c("any", "start", "end", "within",
76
+                                          "equal"),
77
+                                 select = c("all", "first", "last",
78
+                                            "arbitrary"),
79
+                                 ignore.strand = FALSE) {
80
+    type <- match.arg(type)
81
+    select <- match.arg(select)
82
+
83
+    # NOTE: Modding GenomicRanges:::findOverlaps_GNCList()
84
+    if (!(is(query, "data.table") && is(subject, "data.table"))) {
85
+        stop("'query' and 'subject' must be data.table objects")
86
+    }
87
+    if (!isTRUEorFALSE(ignore.strand)) {
88
+        stop("'ignore.strand' must be TRUE or FALSE")
89
+    }
90
+    query_seqinfo <- .constructSeqinfoFromLociDT(query, FALSE)
91
+    subject_seqinfo <- .constructSeqinfoFromLociDT(subject, FALSE)
92
+    si <- merge(query_seqinfo, subject_seqinfo)
93
+    q_seqlevels <- seqlevels(query_seqinfo)
94
+    s_seqlevels <- seqlevels(subject_seqinfo)
95
+    common_seqlevels <- intersect(q_seqlevels, s_seqlevels)
96
+    NG <- length(common_seqlevels)
97
+    q_group_idx <- match(common_seqlevels, q_seqlevels)  # of length NG
98
+    s_group_idx <- match(common_seqlevels, s_seqlevels)  # of length NG
99
+
100
+    ## Extract 'q_groups' and 's_groups' (both of length NG).
101
+    # q_groups <- .extract_groups_from_GenomicRanges(query)[q_group_idx]
102
+    q_groups <- splitAsList(
103
+        x = seq.int(0, nrow(query) - 1L),
104
+        f = factor(query[["seqnames"]],
105
+                   seqlevels(query_seqinfo)))[q_group_idx]
106
+    # s_groups <- .extract_groups_from_GenomicRanges(subject)[s_group_idx]
107
+    s_groups <- splitAsList(
108
+        x = seq.int(0, nrow(subject) - 1L),
109
+        f = factor(subject[["seqnames"]],
110
+                   seqlevels(subject_seqinfo)))[s_group_idx]
111
+
112
+    ## Extract 'nclists' and 'nclist_is_q' (both of length NG).
113
+    ## We'll do "on-the-fly preprocessing".
114
+    nclists <- vector(mode = "list", length = NG)
115
+    nclist_is_q <- rep.int(NA, NG)
116
+
117
+    ## Extract 'circle_length' (of length NG).
118
+    circle_length <- GenomicRanges:::.get_circle_length(si)[q_group_idx]
119
+
120
+    ## Extract 'q_space' and 's_space'.
121
+    if (ignore.strand) {
122
+        q_space <- s_space <- NULL
123
+    } else {
124
+        q_space <- as.integer(query[["strand"]]) - 3L
125
+        s_space <- as.integer(subject[["strand"]]) - 3L
126
+    }
127
+
128
+    # NOTE: Modding IRanges:::NCList_find_overlaps_in_groups
129
+    if (!(is(query, "data.table") && is(subject, "data.table"))) {
130
+        stop("'q' and 's' must be data.table object")
131
+    }
132
+    if (!is(q_groups, "CompressedIntegerList")) {
133
+        stop("'q_groups' must be a CompressedIntegerList object")
134
+    }
135
+    if (!is(s_groups, "CompressedIntegerList")) {
136
+        stop("'s_groups' must be a CompressedIntegerList object")
137
+    }
138
+
139
+    if (!isSingleNumber(maxgap)) {
140
+        stop("'maxgap' must be a single integer")
141
+    }
142
+    if (!is.integer(maxgap)) {
143
+        maxgap <- as.integer(maxgap)
144
+    }
145
+
146
+    if (!isSingleNumber(minoverlap)) {
147
+        stop("'minoverlap' must be a single integer")
148
+    }
149
+    if (!is.integer(minoverlap)) {
150
+        minoverlap <- as.integer(minoverlap)
151
+    }
152
+
153
+    circle.length <- circle_length
154
+    q_circle_len <- circle.length
155
+    q_circle_len[which(nclist_is_q)] <- NA_integer_
156
+    # NOTE: No circular ranges
157
+    # q <- .shift_ranges_in_groups_to_first_circle(q, q_groups, q_circle_len)
158
+    s_circle_len <- circle.length
159
+    s_circle_len[which(!nclist_is_q)] <- NA_integer_
160
+    # NOTE: No circular ranges
161
+    # s <- .shift_ranges_in_groups_to_first_circle(s, s_groups, s_circle_len)
162
+
163
+    .Call2("NCList_find_overlaps_in_groups",
164
+           query[["start"]], query[["start"]], q_space, q_groups,
165
+           subject[["start"]], subject[["start"]], s_space, s_groups,
166
+           nclists, nclist_is_q,
167
+           maxgap, minoverlap, type, select, circle.length,
168
+           PACKAGE = "IRanges")
169
+}
170
+
171
+.overlapsAny_lociDT <- function(query, subject, maxgap = -1L, minoverlap = 0L,
172
+                                type = c("any", "start", "end", "within",
173
+                                         "equal"), ...) {
174
+    if (is.integer(query)) {
175
+        stop("Not yet implemented")
176
+    }
177
+    type <- match.arg(type)
178
+    if (missing(subject)) {
179
+        stop("Not yet implemented")
180
+    }
181
+    else {
182
+        ahit <- .findOverlaps_lociDT(query, subject, maxgap = maxgap,
183
+                                     minoverlap = minoverlap, type = type,
184
+                                     select = "arbitrary", ...)
185
+    }
186
+    !is.na(ahit)
187
+}
188
+
189
+.subsetByOverlaps_lociDT <- function(x, ranges, maxgap = -1L, minoverlap = 0L,
190
+                                     type = c("any", "start", "end", "within",
191
+                                              "equal"),
192
+                                     invert = FALSE, ...) {
193
+    ov_any <- .overlapsAny_lociDT(x, ranges, maxgap = maxgap,
194
+                                  minoverlap = minoverlap,
195
+                                  type = match.arg(type), ...)
196
+    if (invert)
197
+        ov_any <- !ov_any
198
+    x[ov_any]
199
+}
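
The conversion helpers at the top of this file amount to the following round trip (a standalone sketch, not part of the diff; the toy GRanges is arbitrary):

suppressPackageStartupMessages({
    library(GenomicRanges)
    library(data.table)
})

gr <- GRanges("chr1", IRanges(c(100L, 200L), width = 1L), strand = c("+", "-"))

# GRanges -> 'lociDT': plain columns keyed by (seqnames, strand, start).
loci_dt <- data.table(
    seqnames = as.factor(seqnames(gr)),
    start    = start(gr),
    strand   = as.factor(strand(gr)),
    key      = c("seqnames", "strand", "start"))

# 'lociDT' -> GRanges: width-1 ranges rebuilt from the start coordinate.
gr2 <- GRanges(
    seqnames = loci_dt[["seqnames"]],
    ranges   = IRanges(loci_dt[["start"]], width = 1L),
    strand   = loci_dt[["strand"]])
gr2  # the same two loci as 'gr', now in keyed order
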
... ...
@@ -1,9 +1,21 @@
1 1
 # Internal functions -----------------------------------------------------------
2 2
 
3
+# TODO: Test against some of Bismark's other output files. May need a stricter
4
+#       and more accurate heuristic to avoid 'passing' bad files.
5
+# NOTE: Not using readr::count_fields() because it is very slow on large,
6
+#       compressed files. This is because it reads the entire file into memory
7
+#       as a raw vector regardless of the value of `n_max` (see
8
+#       https://github.com/tidyverse/readr/issues/610). But good old
9
+#       utils::read.delim() doesn't have this limitation!
3 10
 .guessBismarkFileType <- function(files, n_max = 10L) {
4 11
     guessed_file_types <- setNames(vector("character", length(files)), files)
5 12
     for (file in files) {
6
-        n_fields <- count_fields(file, tokenizer_tsv(), n_max = n_max)
13
+        x <- read.delim(
14
+            file = file,
15
+            header = FALSE,
16
+            nrows = n_max,
17
+            stringsAsFactors = FALSE)
18
+        n_fields <- ncol(x)
7 19
         if (isTRUE(all(n_fields == 6L))) {
8 20
             guessed_file_types[file] <- "cov"
9 21
         } else if (isTRUE(all(n_fields == 7L))) {
... ...
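
The heuristic above reduces to counting tab-separated fields in the first few rows: six fields indicate a Bismark coverage (.cov) file, seven a genome-wide cytosine report. A standalone sketch (not part of the diff; the file name is hypothetical):

first_rows <- utils::read.delim("sample1.CpG_report.txt.gz", header = FALSE,
                                nrows = 10L, stringsAsFactors = FALSE)
n_fields <- ncol(first_rows)
file_type <- if (n_fields == 6L) {
    "cov"
} else if (n_fields == 7L) {
    "cytosineReport"
} else {
    NA_character_
}
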
@@ -15,15 +27,30 @@
15 27
     guessed_file_types
16 28
 }
17 29
 
30
+# TODO: (long term; see "Alternatively ..." for a better idea)
31
+#       .readBismarkAsDT2(): exact same as .readBismarkAsDT() but
32
+#       uses utils::read.delim() instead of readr::read_tsv(). In brief
33
+#       benchmarking, readr::read_csv() is ~1.3-1.6x faster than
34
+#       utils::read.delim() when reading a gzipped file, albeit with ~1.6-2x
35
+#       more total memory allocated. Therefore, there may be times users prefer
36
+#       to trade off faster speed for lower memory usage. When written, move
37
+#       readr to Suggests. Alternatively, re-write .readBismarkAsDT() using
38
+#       data.table::fread() for uncompressed files and utils::read.delim() for
39
+#       compressed files. This removes the dependency on readr, albeit with
40
+#       slightly slower reading of compressed files. Could even then use
41
+#       data.table::fread() coupled with shell commands (where available) to
42
+#       read compressed files. Ultimately, we want to use data.table beyond
43
+#       data.table::fread() whereas readr is only used for file input.
18 44
 # NOTE: This returns the file as a data.table. However, to do this it uses
19 45
 #       readr::read_tsv() + data.table::setDT() instead of data.table::fread()!
20 46
 #       Although the latter is faster, this uses the former because Bismark
21 47
 #       files are commonly compressed and readr::read_tsv() supports reading
22
-#       directly from compressed files  whereas data.table::fread() does not.
48
+#       directly from compressed files whereas data.table::fread() does not.
23 49
 .readBismarkAsDT <- function(file,
24
-                         col_spec = c("all", "BSseq", "GRanges"),
25
-                         check = FALSE,
26
-                         verbose = FALSE) {
50
+                             col_spec = c("all", "BSseq", "GRanges"),
51
+                             check = FALSE,
52
+                             verbose = FALSE,
53
+                             ...) {
27 54
     col_spec <- match.arg(col_spec)
28 55
     file_type <- .guessBismarkFileType(file)
29 56
     stopifnot(S4Vectors:::isTRUEorFALSE(check))
... ...
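
One of the options sketched in the TODO above, data.table::fread() fed by a shell command for compressed input, would look roughly like this (a sketch only: fread()'s 'cmd' argument requires a recent data.table, and the zcat dependency and helper name are assumptions):

library(data.table)

.freadBismark <- function(file) {
    if (grepl("\\.gz$", file)) {
        # Let an external tool decompress and stream the file into fread().
        fread(cmd = paste("zcat", shQuote(file)), header = FALSE, sep = "\t")
    } else {
        fread(file, header = FALSE, sep = "\t")
    }
}
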
@@ -61,7 +88,7 @@
61 88
             cols <- cols_only(
62 89
                 seqnames = col_character(),
63 90
                 start = col_integer(),
64
-                strand = col_character(),
91
+                strand = col_factor(levels(strand())),
65 92
                 M = col_integer(),
66 93
                 U = col_integer(),
67 94
                 dinucleotide_context = col_skip(),
... ...
@@ -70,7 +97,7 @@
70 97
             cols <- cols_only(
71 98
                 seqnames = col_character(),
72 99
                 start = col_integer(),
73
-                strand = col_character(),
100
+                strand = col_factor(levels(strand())),
74 101
                 M = col_skip(),
75 102
                 U = col_skip(),
76 103
                 dinucleotide_context = col_skip(),
... ...
@@ -79,7 +106,7 @@
79 106
             cols <- cols_only(
80 107
                 seqnames = col_character(),
81 108
                 start = col_integer(),
82
-                strand = col_character(),
109
+                strand = col_factor(levels(strand())),
83 110
                 M = col_integer(),
84 111
                 U = col_integer(),
85 112
                 dinucleotide_context = col_character(),
... ...
@@ -94,13 +121,11 @@
94 121
         file = file,
95 122
         col_names = col_names,
96 123
         col_types = cols,
97
-        progress = FALSE)
124
+        na = character(),
125
+        quoted_na = FALSE,
126
+        progress = verbose,
127
+        ...)
98 128
     x <- setDT(x)
99
-    ptime2 <- proc.time()
100
-    stime <- (ptime2 - ptime1)[3]
101
-    if (verbose) {
102
-        cat(sprintf("done in %.1f secs\n", stime))
103
-    }
104 129
     if (check && all(c("M", "U") %in% colnames(x))) {
105 130
         if (verbose) {
106 131
             message("[.readBismarkAsDT] Checking validity of counts in file.")
... ...
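
Putting the pieces of these read_tsv() hunks together, reading a cytosine report with an explicit column specification looks roughly like this (a sketch, not the package code verbatim; the file name and the seventh column name are assumptions):

library(readr)
library(data.table)

col_names <- c("seqnames", "start", "strand", "M", "U",
               "dinucleotide_context", "trinucleotide_context")
spec <- cols_only(
    seqnames = col_character(),
    start    = col_integer(),
    strand   = col_factor(levels = c("+", "-", "*")),  # levels(strand()) in the diff
    M        = col_integer(),
    U        = col_integer())
x <- read_tsv("sample1.CpG_report.txt.gz", col_names = col_names,
              col_types = spec, na = character(), quoted_na = FALSE,
              progress = FALSE)
setDT(x)  # convert to a data.table in place, as .readBismarkAsDT() does
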
@@ -130,67 +155,24 @@
130 155
             }
131 156
         }
132 157
     }
133
-    x
134
-}
135
-
136
-.strandCollapseDT <- function(dt, has_counts = TRUE) {
137
-    # TODO: Shortcut (and warning?) if all loci are unstranded.
138
-    # Shift loci on negative strand by 1 to the left
139
-    dt[strand == "-", start := start - 1L]
140
-    # Unstrand all loci
141
-    dt[, strand := "*"]
142
-    # Set key
143
-    setkey(dt, seqnames, strand, start)
144
-    # TODO: Is by correct?
145
-    if (has_counts) return(dt[, .(M = sum(M), U = sum(U)), by = key(dt)])
146
-    unique(dt)
147
-}
148
-
149
-.readBismarkAsBSseqDT <- function(file, rmZeroCov, strandCollapse, check,
150
-                                  verbose) {
151
-    dt <- .readBismarkAsDT(file, "BSseq", check, verbose)
152
-    # Data is unstranded if none is provided
153
-    # TODO: What's the data.table way to check if column exists and if create
154
-    #       it if it doesn't exist?
155
-    if (is.null(dt[["strand"]])) {
156
-        dt[, strand := "*"]
157
-    }
158
-    if (strandCollapse) {
159
-        dt <- .strandCollapseDT(dt, has_counts = TRUE)
160
-    }
161
-    if (rmZeroCov) {
162
-        # TODO: setkey()?
163
-        return(dt[(M + U) > 0])
158
+    ptime2 <- proc.time()
159
+    stime <- (ptime2 - ptime1)[3]
160
+    if (verbose) {
161
+        cat(sprintf("done in %.1f secs\n", stime))
164 162
     }
165
-    setkey(dt, seqnames, strand, start)
166
-    dt
167
-}
168
-
169
-.readBismarkAsLociDT <- function(file, rmZeroCov, strandCollapse, verbose) {
170
-    dt <- .readBismarkAsBSseqDT(
171
-        file = files[1L],
172
-        rmZeroCov = rmZeroCov,
173
-        strandCollapse = strandCollapse,
174
-        check = FALSE,
175
-        verbose = subverbose)
176
-    # Drop 'M' and 'U'
177
-    dt[, c("M", "U") := .(NULL, NULL)]
178
-}
179
-
180
-.constructSeqinfoFromLociDT <- function(loci_dt) {
181
-    unique_seqnames <- loci_dt[, as.character(unique(seqnames))]
182
-    sortSeqlevels(Seqinfo(seqnames = unique_seqnames))
163
+    x
183 164
 }
184 165
 
185
-.contructLociDTFromBismarkFiles <- function(files,
186
-                                            rmZeroCov,
187
-                                            strandCollapse,
188
-                                            seqinfo,
189
-                                            verbose,
190
-                                            BPPARAM) {
166
+.contructLociDTAndSeqinfoFromBismarkFiles <- function(files,
167
+                                                      rmZeroCov,
168
+                                                      strandCollapse,
169
+                                                      seqinfo,
170
+                                                      verbose,
171
+                                                      BPPARAM) {
191 172
     subverbose <- max(as.integer(verbose) - 1L, 0L)
192 173
 
193
-    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)?       #       Would like to do this without reading the data into memory.
174
+    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
175
+    #       Would like to do this without reading the data into memory.
194 176
     #       Some benchmarks can be found at
195 177
     #       https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
196 178
     #       My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
... ...
@@ -212,7 +194,7 @@
212 194
                 "'", files[1L], "'")
213 195
     }
214 196
     loci_dt <- .readBismarkAsLociDT(
215
-        file = files[1L],
197
+        file = files[[1L]],
216 198
         rmZeroCov = rmZeroCov,
217 199
         strandCollapse = strandCollapse,
218 200
         verbose = subverbose)
... ...
@@ -229,42 +211,22 @@
229 211
             rmZeroCov = rmZeroCov,
230 212
             strandCollapse = strandCollapse,
231 213
             verbose = subverbose)
232
-        # Identify loci unique to this file.
233
-        fsetdiff(loci_from_this_file_dt, loci_dt)
214
+        .subsetByOverlaps_lociDT(loci_from_this_file_dt, loci_dt,
215
+                                 type = "equal", invert = TRUE)
234 216
     }, loci_dt = loci_dt, BPPARAM = BPPARAM)
235 217
     # Take union of loci.
236 218
     loci_dt <- funion(loci_dt, Reduce(funion, loci_from_other_files_dt))
237 219
 
238 220
     # Construct seqinfo if none supplied
239 221
     if (is.null(seqinfo)) {
240
-        seqinfo <- .constructSeqinfoFromLociDT(loci_dt)
222
+        # TODO: Document that sort = TRUE is used.
223
+        seqinfo <- .constructSeqinfoFromLociDT(loci_dt, sort = TRUE)
241 224
     }
242 225
     # Sort the loci using the "natural order" of ordering the elements of a
243 226
     # GenomicRanges object (see ?`sort,GenomicRanges-method`)
244
-    loci_dt[,
245
-            c("seqnames", "strand") :=
246
-                .(factor(seqnames, levels = seqlevels(seqinfo)),
247
-                  strand(strand))]
227
+    loci_dt[, seqnames := factor(seqnames, levels = seqlevels(seqinfo))]
248 228
     setkey(loci_dt, seqnames, strand, start)
249
-    loci_dt
250
-}
251
-
252
-.lociDTAsGRanges <- function(loci_dt, seqinfo = NULL) {
253
-    GRanges(
254
-        seqnames = loci_dt[["seqnames"]],
255
-        ranges = IRanges(loci_dt[["start"]], width = 1L),
256
-        strand = loci_dt[["strand"]],
257
-        seqinfo = seqinfo)
258
-}
259
-
260
-.grAsLociDT <- function(gr) {
261
-    # NOTE: Don't use as.data.table(gr) because it will modify gr by
262
-    #       reference (https://github.com/Rdatatable/data.table/issues/2278)
263
-    data.table(
264
-        seqnames = as.factor(seqnames(gr)),
265
-        start = start(gr),
266
-        strand = as.factor(strand(gr)),
267
-        key = c("seqnames", "strand", "start"))
229
+    list(loci_dt = loci_dt, seqinfo = seqinfo)
268 230
 }
269 231
 
270 232
 .constructCountsFromBismarkFileAndLociDT <- function(b, files, strandCollapse,
... ...
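
The sorting step above works by making 'seqnames' a factor whose levels follow the sorted seqlevels, so that setkey() reproduces the GenomicRanges-style ordering. A toy version (not part of the diff):

suppressPackageStartupMessages(library(GenomeInfoDb))
library(data.table)

loci_dt <- data.table(
    seqnames = c("chr10", "chr2", "chr1"),
    strand   = "*",
    start    = c(500L, 300L, 100L))

si <- sortSeqlevels(Seqinfo(seqnames = unique(loci_dt[["seqnames"]])))
seqlevels(si)
# [1] "chr1"  "chr2"  "chr10"   (natural chromosome order, not alphabetical)

loci_dt[, seqnames := factor(seqnames, levels = seqlevels(si))]
setkey(loci_dt, seqnames, strand, start)  # rows now ordered chr1, chr2, chr10
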
@@ -280,14 +242,22 @@
280 242
     }
281 243
     bsseq_dt <- .readBismarkAsBSseqDT(
282 244
         file = file,
245
+        # NOTE: No need to remove zero coverage loci because we combine with
246
+        #       loci_dt, which by construction only contains loci with non-zero
247
+        #       coverage.
283 248
         rmZeroCov = FALSE,
284 249
         strandCollapse = strandCollapse,
285 250
         check = TRUE,
286 251
         verbose = verbose)
252
+    # Sort bsseq_dt to use the same order as loci_dt.
253
+    bsseq_dt[, c("seqnames", "strand") :=
254
+                 .(factor(seqnames, levels(loci_dt[["seqnames"]])),
255
+                   strand(strand))]
256
+    setkeyv(bsseq_dt, key(loci_dt))
287 257
 
288 258
     # Combine data for b-th file with loci_dt and construct counts -------------
289
-    # TODO: Need to use GenomicRanges' strand matching behaviour
290
-    # TODO: `nomatch = NA_integer_`?
259
+    # TODO: (UP TO HERE) Need to use GenomicRanges' strand matching behaviour
260
+    # NOTE: Can't use nomatch = 0, so have to do this in two steps.
291 261
     counts <- bsseq_dt[loci_dt, c("M", "U"), nomatch = NA]
292 262
     counts[is.na(M), M := 0L]
293 263
     counts[is.na(U), U := 0L]
... ...
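
The two-step fill above is the usual data.table idiom for a keyed left join followed by replacing missing counts with zero. In isolation (toy values, not part of the diff):

library(data.table)

bsseq_dt <- data.table(seqnames = "chr1", strand = "*", start = c(100L, 300L),
                       M = c(3L, 1L), U = c(1L, 2L),
                       key = c("seqnames", "strand", "start"))
loci_dt <- data.table(seqnames = "chr1", strand = "*",
                      start = c(100L, 200L, 300L),
                      key = c("seqnames", "strand", "start"))

# Left join onto the full set of loci; loci missing from this sample get NA ...
counts <- bsseq_dt[loci_dt, c("M", "U"), nomatch = NA]
# ... which is then interpreted as zero coverage.
counts[is.na(M), M := 0L]
counts[is.na(U), U := 0L]
counts
#    M U
# 1: 3 1
# 2: 0 0
# 3: 1 2
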
@@ -405,6 +375,8 @@
405 375
 # TODO: Support BPREDO?
406 376
 # TODO: Support passing a colData so that metadata is automatically added to
407 377
 #       samples?
378
+# TODO: Probably pass down verbose as subverbose to functions that take a
379
+#       'verbose' argument.
408 380
 read.bismark <- function(files,
409 381
                          sampleNames = basename(files),
410 382
                          rmZeroCov = FALSE,
... ...
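
For context, a typical call to the exported function being refactored might look like the following (file and sample names are hypothetical; only arguments visible in this diff are used):

library(bsseq)

files <- c("sample1.CpG_report.txt.gz", "sample2.CpG_report.txt.gz")
bs <- read.bismark(
    files          = files,
    sampleNames    = c("sample1", "sample2"),
    rmZeroCov      = FALSE,
    strandCollapse = TRUE,
    verbose        = TRUE)
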
@@ -489,16 +461,15 @@ read.bismark <- function(files,
489 461
             message("[read.bismark] Reading files to construct GRanges with ",
490 462
                     "valid loci ...")
491 463
         }
492
-        loci_dt <- .contructLociDTFromBismarkFiles(
464
+        loci_dt_and_seqinfo <- .contructLociDTAndSeqinfoFromBismarkFiles(
493 465
             files = files,
494 466
             rmZeroCov = rmZeroCov,
495 467
             strandCollapse = strandCollapse,
496 468
             seqinfo = seqinfo,
497 469
             verbose = subverbose,
498 470
             BPPARAM = BPPARAM)
499
-        if (is.null(seqinfo)) {
500
-            seqinfo <- .constructSeqinfoFromLociDT(loci_dt)
501
-        }
471
+        loci_dt <- loci_dt_and_seqinfo[["loci_dt"]]
472
+        seqinfo <- loci_dt_and_seqinfo[["seqinfo"]]
502 473
         gr <- .lociDTAsGRanges(loci_dt, seqinfo)
503 474
     } else {
504 475
         if (verbose) message("[read.bismark] Using 'gr' as GRanges with loci")
... ...
@@ -508,7 +479,7 @@ read.bismark <- function(files,
508 479
                 message("[read.bismark] Collapsing strand of loci in 'gr' ...")
509 480
             }
510 481
             # ptime1 <- proc.time()
511
-            loci_dt <- .strandCollapseDT(loci_dt, has_counts = FALSE)
482
+            loci_dt <- .strandCollapseLociDT(loci_dt)
512 483
             # ptime2 <- proc.time()
513 484
             # stime <- (ptime2 - ptime1)[3]
514 485
             # if (verbose) {
... ...
@@ -614,3 +585,19 @@ read.bismark <- function(files,
614 585
 # TODO: Document internal functions for my own sanity. Also, some may be useful
615 586
 #       to users of bsseq (although I still won't export these for the time
616 587
 #       being).
588
+# TODO: Allow user to specify HDF5 file and have both M and Cov written to that
589
+#       file.
590
+# TODO: Helper function to obtain CpX loci from BSgenome as GRanges to pass as
591
+#       'gr' argument in read.bismark().
592
+# TODO: (long term) Current implementation requires that user can load at least
593
+#       one sample's worth of data into memory per worker. Could instead read
594
+#       chunks of data, write to sink, load next chunk, etc.
595
+# TODO: Add big note to documentation that .cov file does not contain strand
596
+#       information, which means strandCollapse can't be used (unless 'gr' is
597
+#       supplied or one of the other files is a cytosineReport file).
598
+# TODO: Document that if 'gr' is NULL and any 'files' (especially the first
599
+#       file) are .cov files, then any loci present in the .cov files will have
600
+#       their strand set to *. If you are mixing and matching .cov and
601
+#       .cytosineReport files and don't want this behaviour (i.e. you want to
602
+#       retain strand) then you'll need to construct your own 'gr' and pass
603
+#       this to the function. Add unit tests for this behaviour.
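
A rough sketch of the helper the 'CpX loci from BSgenome' TODO above has in mind, i.e. deriving width-1 CpG loci to pass as 'gr' (the genome package, chromosome subset, and plus-strand-only simplification are all assumptions):

suppressPackageStartupMessages({
    library(BSgenome.Hsapiens.UCSC.hg19)
    library(GenomicRanges)
    library(Biostrings)
})

genome <- BSgenome.Hsapiens.UCSC.hg19
chrs <- c("chr21", "chr22")  # restrict to a couple of chromosomes for speed
gr <- do.call(c, lapply(chrs, function(chr) {
    # Position of the C in each CpG on the forward strand.
    starts <- start(matchPattern("CG", genome[[chr]]))
    GRanges(chr, IRanges(starts, width = 1L), strand = "+")
}))
gr <- sort(gr)
# 'gr' can then be passed as the 'gr' argument of read.bismark().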