Slightly formalise intermediate objects lociDT and BSseqDT.
Split these out into their own files.
Refine construction of 'gr' from Bismark files.
@@ -32,14 +32,16 @@ importMethodsFrom(GenomeInfoDb, "seqlengths", "seqlengths<-", "seqinfo", "seqinf
 import(S4Vectors)
 importFrom(gtools, "combinations")
 # importFrom(data.table, "setnames", "setDT", "data.table", ":=", "setkey")
+# TODO: More careful use of imports to avoid clashes (e.g., S4Vectors::shift()
+#       and data.table::shift()).
 import(data.table)
 importFrom(R.utils, "isGzipped", "gunzip")
 import(limma)
 importFrom(permute, "shuffleSet", "how")
 import(DelayedArray)
 import(BiocParallel)
-importFrom(readr, "count_fields", "cols_only", "col_character", "col_integer",
-           "col_skip", "col_double", "read_tsv", "tokenizer_tsv")
+importFrom(readr, "cols", "cols_only", "col_character", "col_integer",
+           "col_skip", "col_double", "col_factor", "read_tsv", "tokenizer_tsv")
 ##
 ## Exporting
 ##
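The shift() clash flagged in the TODO above can be made concrete. A minimal sketch, not part of the NAMESPACE changes: both the Bioconductor ranges stack and data.table export a shift(), so unqualified calls depend on load order and are best written fully qualified (the TODO names S4Vectors::shift(); this sketch goes through IRanges::shift(), and which package owns the generic may differ across Bioconductor releases).

    library(IRanges)
    library(data.table)

    ir <- IRanges(start = c(1L, 10L), width = 5L)
    IRanges::shift(ir, shift = 2L)    # ranges-style shift: move each range 2 positions right
    data.table::shift(1:5, n = 1L)    # data.table-style shift: lag a vector by one element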
new file mode 100644
@@ -0,0 +1,48 @@
+# Internal functions -----------------------------------------------------------
+
+.strandCollapseBSseqDT <- function(bsseq_dt, by_ref = FALSE) {
+    if (bsseq_dt[, all(strand == "*")]) return(bsseq_dt)
+    if (!by_ref) {
+        bsseq_dt <- copy(bsseq_dt)
+    }
+    # Shift loci on negative strand by 1 to the left
+    bsseq_dt[strand == "-", start := start - 1L]
+    # Unstrand all loci
+    bsseq_dt[, strand := strand("*")]
+    # Aggregate counts
+    bsseq_dt[, .(M = sum(M), U = sum(U)), by = c("seqnames", "strand", "start")]
+}
+
+
+.readBismarkAsBSseqDT <- function(file, rmZeroCov = FALSE,
+                                  strandCollapse = FALSE, check = TRUE,
+                                  verbose = FALSE) {
+    dt <- .readBismarkAsDT(
+        file = file,
+        col_spec = "BSseq",
+        check = check,
+        verbose = verbose)
+    # Data is unstranded if none is provided
+    # TODO: What's the data.table way to check if a column exists and create
+    #       it if it doesn't?
+    if (is.null(dt[["strand"]])) {
+        dt[, strand := strand("*")]
+    }
+    if (strandCollapse) {
+        dt <- .strandCollapseBSseqDT(dt)
+    }
+    if (rmZeroCov) {
+        return(dt[(M + U) > 0])
+    }
+    dt
+}
+
+# TODOs ------------------------------------------------------------------------
+
+# TODO: Document internal functions for my own sanity. Also, some may be useful
+#       to users of bsseq (although I still won't export these for the time
+#       being).
+# TODO: (long term) Formalise 'lociDT' and 'BSseqDT' concepts as S4 classes
+#       (e.g., lociDT could be a concrete subclass of a GenomicRanges). Also
+#       seems worth re-visiting the FWGenomicRanges idea (fixed-width
+#       GenomicRanges class), of which 'lociDT' would be a concrete subclass.
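As a quick sanity check of the collapsing logic above, a toy sketch (assumes data.table and GenomicRanges are attached and .strandCollapseBSseqDT() from this file is in scope; the expected result in the comment follows from the shift/unstrand/aggregate steps):

    library(data.table)
    suppressPackageStartupMessages(library(GenomicRanges))

    # A CpG measured on both strands: chr1:10 on '+' and chr1:11 on '-'.
    bsseq_dt <- data.table(
        seqnames = factor("chr1"),
        strand = factor(c("+", "-"), levels = c("+", "-", "*")),
        start = c(10L, 11L),
        M = c(3L, 2L),
        U = c(1L, 4L))
    collapsed <- .strandCollapseBSseqDT(bsseq_dt)
    # Expected: a single unstranded row at chr1:10 with M = 5, U = 5.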

new file mode 100644
@@ -0,0 +1,199 @@
+# Internal functions -----------------------------------------------------------
+
+# TODO: Think through logic of 'sort' argument; where is it needed, should it
+#       be part of the constructor?
+.constructSeqinfoFromLociDT <- function(loci_dt, sort = TRUE) {
+    unique_seqnames <- loci_dt[, as.character(unique(seqnames))]
+    si <- Seqinfo(seqnames = unique_seqnames)
+    if (sort) return(sortSeqlevels(si))
+    si
+}
+
+.lociDTAsGRanges <- function(loci_dt, seqinfo = NULL) {
+    GRanges(
+        seqnames = loci_dt[["seqnames"]],
+        ranges = IRanges(loci_dt[["start"]], width = 1L),
+        strand = loci_dt[["strand"]],
+        seqinfo = seqinfo)
+}
+
+.grAsLociDT <- function(gr) {
+    # NOTE: Don't use as.data.table(gr) because it will modify gr by
+    #       reference (https://github.com/Rdatatable/data.table/issues/2278)
+    data.table(
+        seqnames = as.factor(seqnames(gr)),
+        start = start(gr),
+        strand = as.factor(strand(gr)),
+        key = c("seqnames", "strand", "start"))
+}
+
+.strandCollapseLociDT <- function(loci_dt, by_ref = FALSE) {
+    if (loci_dt[, all(strand == "*")]) return(loci_dt)
+    if (!by_ref) {
+        loci_dt <- copy(loci_dt)
+    }
+    # Shift loci on negative strand by 1 to the left
+    loci_dt[strand == "-", start := start - 1L]
+    # Unstrand all loci
+    loci_dt[, strand := strand("*")]
+    # data.table:::funique(loci_dt)
+    unique(loci_dt, by = c("seqnames", "strand", "start"))
+}
+
+.readBismarkAsLociDT <- function(file, rmZeroCov = FALSE,
+                                 strandCollapse = FALSE, verbose = FALSE) {
+    if (!rmZeroCov) {
+        dt <- .readBismarkAsDT(
+            file = file,
+            col_spec = "GRanges",
+            check = FALSE,
+            verbose = verbose)
+        if (is.null(dt[["strand"]])) {
+            dt[, strand := strand("*")]
+        }
+        if (strandCollapse) {
+            return(.strandCollapseLociDT(dt))
+        } else {
+            return(dt)
+        }
+    } else {
+        # Require M and U to remove loci with zero coverage
+        dt <- .readBismarkAsBSseqDT(
+            file = file,
+            rmZeroCov = rmZeroCov,
+            strandCollapse = strandCollapse,
+            check = FALSE,
+            verbose = verbose)
+        # Drop 'M' and 'U'
+        dt[, c("M", "U") := .(NULL, NULL)]
+    }
+}
+
+# TODO: Allow the passing of query_seqinfo and subject_seqinfo if they are
+#       already available.
+.findOverlaps_lociDT <- function(query, subject, maxgap = -1L, minoverlap = 0L,
+                                 type = c("any", "start", "end", "within",
+                                          "equal"),
+                                 select = c("all", "first", "last",
+                                            "arbitrary"),
+                                 ignore.strand = FALSE) {
+    type <- match.arg(type)
+    select <- match.arg(select)
+
+    # NOTE: Modding GenomicRanges:::findOverlaps_GNCList()
+    if (!(is(query, "data.table") && is(subject, "data.table"))) {
+        stop("'query' and 'subject' must be data.table objects")
+    }
+    if (!isTRUEorFALSE(ignore.strand)) {
+        stop("'ignore.strand' must be TRUE or FALSE")
+    }
+    query_seqinfo <- .constructSeqinfoFromLociDT(query, FALSE)
+    subject_seqinfo <- .constructSeqinfoFromLociDT(subject, FALSE)
+    si <- merge(query_seqinfo, subject_seqinfo)
+    q_seqlevels <- seqlevels(query_seqinfo)
+    s_seqlevels <- seqlevels(subject_seqinfo)
+    common_seqlevels <- intersect(q_seqlevels, s_seqlevels)
+    NG <- length(common_seqlevels)
+    q_group_idx <- match(common_seqlevels, q_seqlevels)  # of length NG
+    s_group_idx <- match(common_seqlevels, s_seqlevels)  # of length NG
+
+    ## Extract 'q_groups' and 's_groups' (both of length NG).
+    # q_groups <- .extract_groups_from_GenomicRanges(query)[q_group_idx]
+    q_groups <- splitAsList(
+        x = seq.int(0, nrow(query) - 1L),
+        f = factor(query[["seqnames"]],
+                   seqlevels(query_seqinfo)))[q_group_idx]
+    # s_groups <- .extract_groups_from_GenomicRanges(subject)[s_group_idx]
+    s_groups <- splitAsList(
+        x = seq.int(0, nrow(subject) - 1L),
+        f = factor(subject[["seqnames"]],
+                   seqlevels(subject_seqinfo)))[s_group_idx]
+
+    ## Extract 'nclists' and 'nclist_is_q' (both of length NG).
+    ## We'll do "on-the-fly preprocessing".
+    nclists <- vector(mode = "list", length = NG)
+    nclist_is_q <- rep.int(NA, NG)
+
+    ## Extract 'circle_length' (of length NG).
+    circle_length <- GenomicRanges:::.get_circle_length(si)[q_group_idx]
+
+    ## Extract 'q_space' and 's_space'.
+    if (ignore.strand) {
+        q_space <- s_space <- NULL
+    } else {
+        q_space <- as.integer(query[["strand"]]) - 3L
+        s_space <- as.integer(subject[["strand"]]) - 3L
+    }
+
+    # NOTE: Modding IRanges:::NCList_find_overlaps_in_groups
+    if (!(is(query, "data.table") && is(subject, "data.table"))) {
+        stop("'q' and 's' must be data.table objects")
+    }
+    if (!is(q_groups, "CompressedIntegerList")) {
+        stop("'q_groups' must be a CompressedIntegerList object")
+    }
+    if (!is(s_groups, "CompressedIntegerList")) {
+        stop("'s_groups' must be a CompressedIntegerList object")
+    }
+
+    if (!isSingleNumber(maxgap)) {
+        stop("'maxgap' must be a single integer")
+    }
+    if (!is.integer(maxgap)) {
+        maxgap <- as.integer(maxgap)
+    }
+
+    if (!isSingleNumber(minoverlap)) {
+        stop("'minoverlap' must be a single integer")
+    }
+    if (!is.integer(minoverlap)) {
+        minoverlap <- as.integer(minoverlap)
+    }
+
+    circle.length <- circle_length
+    q_circle_len <- circle.length
+    q_circle_len[which(nclist_is_q)] <- NA_integer_
+    # NOTE: No circular ranges
+    # q <- .shift_ranges_in_groups_to_first_circle(q, q_groups, q_circle_len)
+    s_circle_len <- circle.length
+    s_circle_len[which(!nclist_is_q)] <- NA_integer_
+    # NOTE: No circular ranges
+    # s <- .shift_ranges_in_groups_to_first_circle(s, s_groups, s_circle_len)
+
+    .Call2("NCList_find_overlaps_in_groups",
+           query[["start"]], query[["start"]], q_space, q_groups,
+           subject[["start"]], subject[["start"]], s_space, s_groups,
+           nclists, nclist_is_q,
+           maxgap, minoverlap, type, select, circle.length,
+           PACKAGE = "IRanges")
+}
+
+.overlapsAny_lociDT <- function(query, subject, maxgap = -1L, minoverlap = 0L,
+                                type = c("any", "start", "end", "within",
+                                         "equal"), ...) {
+    if (is.integer(query)) {
+        stop("Not yet implemented")
+    }
+    type <- match.arg(type)
+    if (missing(subject)) {
+        stop("Not yet implemented")
+    }
+    else {
+        ahit <- .findOverlaps_lociDT(query, subject, maxgap = maxgap,
+                                     minoverlap = minoverlap, type = type,
+                                     select = "arbitrary", ...)
+    }
+    !is.na(ahit)
+}
+
+.subsetByOverlaps_lociDT <- function(x, ranges, maxgap = -1L, minoverlap = 0L,
+                                     type = c("any", "start", "end", "within",
+                                              "equal"),
+                                     invert = FALSE, ...) {
+    ov_any <- .overlapsAny_lociDT(x, ranges, maxgap = maxgap,
+                                  minoverlap = minoverlap,
+                                  type = match.arg(type), ...)
+    if (invert)
+        ov_any <- !ov_any
+    x[ov_any]
+}
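For orientation, .grAsLociDT() and .lociDTAsGRanges() above are intended to round-trip between the two representations; a minimal sketch (assumes GenomicRanges and data.table are attached and the helpers in this file are in scope):

    suppressPackageStartupMessages(library(GenomicRanges))
    library(data.table)

    gr <- GRanges(c("chr1", "chr2"), IRanges(c(100L, 200L), width = 1L), strand = c("+", "*"))
    loci_dt <- .grAsLociDT(gr)        # keyed data.table with seqnames/start/strand
    gr2 <- .lociDTAsGRanges(loci_dt)  # back to a width-1 GRanges
    # NOTE: .grAsLociDT() keys on (seqnames, strand, start), so the round trip
    #       may reorder loci relative to the input 'gr'.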

@@ -1,9 +1,21 @@
 # Internal functions -----------------------------------------------------------
 
+# TODO: Test against some of Bismark's other output files. May need a stricter
+#       and more accurate heuristic to avoid 'passing' bad files.
+# NOTE: Not using readr::count_fields() because it is very slow on large,
+#       compressed files. This is because it reads the entire file into memory
+#       as a raw vector regardless of the value of `n_max` (see
+#       https://github.com/tidyverse/readr/issues/610). But good old
+#       utils::read.delim() doesn't have this limitation!
 .guessBismarkFileType <- function(files, n_max = 10L) {
     guessed_file_types <- setNames(vector("character", length(files)), files)
     for (file in files) {
-        n_fields <- count_fields(file, tokenizer_tsv(), n_max = n_max)
+        x <- read.delim(
+            file = file,
+            header = FALSE,
+            nrows = n_max,
+            stringsAsFactors = FALSE)
+        n_fields <- ncol(x)
         if (isTRUE(all(n_fields == 6L))) {
             guessed_file_types[file] <- "cov"
         } else if (isTRUE(all(n_fields == 7L))) {
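The heuristic above keys off the number of tab-separated fields in the first few rows (6 for a .cov file, 7 for a genome-wide cytosine report). A standalone sketch of that check; 'my_sample.cov.gz' is a hypothetical path, not a file shipped with bsseq:

    first_rows <- utils::read.delim(
        file = "my_sample.cov.gz",  # read.delim() reads gzipped files transparently
        header = FALSE,
        nrows = 10L,
        stringsAsFactors = FALSE)
    ncol(first_rows)  # 6 fields => "cov"; 7 fields => cytosine report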
@@ -15,15 +27,30 @@
     guessed_file_types
 }
 
+# TODO: (long term, see "Alternatively ..." for a better idea)
+#       .readBismarkAsDT2(): exact same as .readBismarkAsDT() but
+#       uses utils::read.delim() instead of readr::read_tsv(). In brief
+#       benchmarking, readr::read_csv() is ~1.3-1.6x faster than
+#       utils::read.delim() when reading a gzipped file, albeit with ~1.6-2x
+#       more total memory allocated. Therefore, there may be times users prefer
+#       to trade off faster speed for lower memory usage. When written, move
+#       readr to Suggests. Alternatively, re-write .readBismarkAsDT() using
+#       data.table::fread() for uncompressed files and utils::read.delim() for
+#       compressed files. This removes the dependency on readr, albeit with
+#       slightly slower reading of compressed files. Could even then use
+#       data.table::fread() coupled with shell commands (where available) to
+#       pass compressed files. Ultimately, we want to use data.table beyond
+#       data.table::fread() whereas readr is only used for file input.
 # NOTE: This returns the file as a data.table. However, to do this it uses
 #       readr::read_tsv() + data.table::setDT() instead of data.table::fread()!
 #       Although the latter is faster, this uses the former because Bismark
 #       files are commonly compressed and readr::read_tsv() supports reading
-#       directly from compressed files whereas data.table::fread() does not.
+#       directly from compressed files whereas data.table::fread() does not.
 .readBismarkAsDT <- function(file,
-                             col_spec = c("all", "BSseq", "GRanges"),
-                             check = FALSE,
-                             verbose = FALSE) {
+                             col_spec = c("all", "BSseq", "GRanges"),
+                             check = FALSE,
+                             verbose = FALSE,
+                             ...) {
     col_spec <- match.arg(col_spec)
     file_type <- .guessBismarkFileType(file)
     stopifnot(S4Vectors:::isTRUEorFALSE(check))
@@ -61,7 +88,7 @@
         cols <- cols_only(
             seqnames = col_character(),
             start = col_integer(),
-            strand = col_character(),
+            strand = col_factor(levels(strand())),
             M = col_integer(),
             U = col_integer(),
             dinucleotide_context = col_skip(),
@@ -70,7 +97,7 @@
         cols <- cols_only(
             seqnames = col_character(),
             start = col_integer(),
-            strand = col_character(),
+            strand = col_factor(levels(strand())),
             M = col_skip(),
             U = col_skip(),
             dinucleotide_context = col_skip(),
@@ -79,7 +106,7 @@
         cols <- cols_only(
             seqnames = col_character(),
             start = col_integer(),
-            strand = col_character(),
+            strand = col_factor(levels(strand())),
             M = col_integer(),
             U = col_integer(),
             dinucleotide_context = col_character(),
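The col_factor(levels(strand())) change above reads the strand column directly as a factor with the standard '+', '-', '*' levels rather than as character. A minimal standalone sketch of such a column specification (assumes readr and GenomicRanges are attached; the full specs used by .readBismarkAsDT() also skip the context columns):

    suppressPackageStartupMessages(library(GenomicRanges))
    library(readr)

    col_spec <- cols_only(
        seqnames = col_character(),
        start = col_integer(),
        strand = col_factor(levels(strand())),  # levels(strand()) is c("+", "-", "*")
        M = col_integer(),
        U = col_integer())
    # Passed to read_tsv() as 'col_types', together with the full 'col_names' vector.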
@@ -94,13 +121,11 @@
         file = file,
         col_names = col_names,
         col_types = cols,
-        progress = FALSE)
+        na = character(),
+        quoted_na = FALSE,
+        progress = verbose,
+        ...)
     x <- setDT(x)
-    ptime2 <- proc.time()
-    stime <- (ptime2 - ptime1)[3]
-    if (verbose) {
-        cat(sprintf("done in %.1f secs\n", stime))
-    }
     if (check && all(c("M", "U") %in% colnames(x))) {
         if (verbose) {
             message("[.readBismarkAsDT] Checking validity of counts in file.")
@@ -130,67 +155,24 @@
             }
         }
     }
-    x
-}
-
-.strandCollapseDT <- function(dt, has_counts = TRUE) {
-    # TODO: Shortcut (and warning?) if all loci are unstranded.
-    # Shift loci on negative strand by 1 to the left
-    dt[strand == "-", start := start - 1L]
-    # Unstrand all loci
-    dt[, strand := "*"]
-    # Set key
-    setkey(dt, seqnames, strand, start)
-    # TODO: Is by correct?
-    if (has_counts) return(dt[, .(M = sum(M), U = sum(U)), by = key(dt)])
-    unique(dt)
-}
-
-.readBismarkAsBSseqDT <- function(file, rmZeroCov, strandCollapse, check,
-                                  verbose) {
-    dt <- .readBismarkAsDT(file, "BSseq", check, verbose)
-    # Data is unstranded if none is provided
-    # TODO: What's the data.table way to check if column exists and if create
-    #       it if it doesn't exist?
-    if (is.null(dt[["strand"]])) {
-        dt[, strand := "*"]
-    }
-    if (strandCollapse) {
-        dt <- .strandCollapseDT(dt, has_counts = TRUE)
-    }
-    if (rmZeroCov) {
-        # TODO: setkey()?
-        return(dt[(M + U) > 0])
+    ptime2 <- proc.time()
+    stime <- (ptime2 - ptime1)[3]
+    if (verbose) {
+        cat(sprintf("done in %.1f secs\n", stime))
     }
-    setkey(dt, seqnames, strand, start)
-    dt
-}
-
-.readBismarkAsLociDT <- function(file, rmZeroCov, strandCollapse, verbose) {
-    dt <- .readBismarkAsBSseqDT(
-        file = files[1L],
-        rmZeroCov = rmZeroCov,
-        strandCollapse = strandCollapse,
-        check = FALSE,
-        verbose = subverbose)
-    # Drop 'M' and 'U'
-    dt[, c("M", "U") := .(NULL, NULL)]
-}
-
-.constructSeqinfoFromLociDT <- function(loci_dt) {
-    unique_seqnames <- loci_dt[, as.character(unique(seqnames))]
-    sortSeqlevels(Seqinfo(seqnames = unique_seqnames))
+    x
 }
 
-.contructLociDTFromBismarkFiles <- function(files,
-                                            rmZeroCov,
-                                            strandCollapse,
-                                            seqinfo,
-                                            verbose,
-                                            BPPARAM) {
+.contructLociDTAndSeqinfoFromBismarkFiles <- function(files,
+                                                      rmZeroCov,
+                                                      strandCollapse,
+                                                      seqinfo,
+                                                      verbose,
+                                                      BPPARAM) {
     subverbose <- max(as.integer(verbose) - 1L, 0L)
 
-    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)? # Would like to do this without reading the data into memory.
+    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
+    #       Would like to do this without reading the data into memory.
     # Some benchmarks can be found at
     # https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
     # My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
@@ -212,7 +194,7 @@
                 "'", files[1L], "'")
     }
     loci_dt <- .readBismarkAsLociDT(
-        file = files[1L],
+        file = files[[1L]],
         rmZeroCov = rmZeroCov,
         strandCollapse = strandCollapse,
         verbose = subverbose)
@@ -229,42 +211,22 @@
             rmZeroCov = rmZeroCov,
             strandCollapse = strandCollapse,
             verbose = subverbose)
-        # Identify loci unique to this file.
-        fsetdiff(loci_from_this_file_dt, loci_dt)
+        .subsetByOverlaps_lociDT(loci_from_this_file_dt, loci_dt,
+                                 type = "equal", invert = TRUE)
     }, loci_dt = loci_dt, BPPARAM = BPPARAM)
     # Take union of loci.
     loci_dt <- funion(loci_dt, Reduce(funion, loci_from_other_files_dt))
 
     # Construct seqinfo if none supplied
     if (is.null(seqinfo)) {
-        seqinfo <- .constructSeqinfoFromLociDT(loci_dt)
+        # TODO: Document that sort = TRUE is used.
+        seqinfo <- .constructSeqinfoFromLociDT(loci_dt, sort = TRUE)
     }
     # Sort the loci using the "natural order" of ordering the elements of a
     # GenomicRanges object (see ?`sort,GenomicRanges-method`)
-    loci_dt[,
-            c("seqnames", "strand") :=
-                .(factor(seqnames, levels = seqlevels(seqinfo)),
-                  strand(strand))]
+    loci_dt[, seqnames := factor(seqnames, levels = seqlevels(seqinfo))]
     setkey(loci_dt, seqnames, strand, start)
-    loci_dt
-}
-
-.lociDTAsGRanges <- function(loci_dt, seqinfo = NULL) {
-    GRanges(
-        seqnames = loci_dt[["seqnames"]],
-        ranges = IRanges(loci_dt[["start"]], width = 1L),
-        strand = loci_dt[["strand"]],
-        seqinfo = seqinfo)
-}
-
-.grAsLociDT <- function(gr) {
-    # NOTE: Don't use as.data.table(gr) because it will modify gr by
-    #       reference (https://github.com/Rdatatable/data.table/issues/2278)
-    data.table(
-        seqnames = as.factor(seqnames(gr)),
-        start = start(gr),
-        strand = as.factor(strand(gr)),
-        key = c("seqnames", "strand", "start"))
+    list(loci_dt = loci_dt, seqinfo = seqinfo)
 }
 
 .constructCountsFromBismarkFileAndLociDT <- function(b, files, strandCollapse,
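The union step above leans on data.table's set operations; a toy sketch of how per-file loci tables are combined (standalone, with made-up positions):

    library(data.table)

    loci_file1 <- data.table(seqnames = "chr1", strand = "*", start = c(10L, 20L))
    loci_file2 <- data.table(seqnames = "chr1", strand = "*", start = c(20L, 30L))
    loci_file3 <- data.table(seqnames = "chr1", strand = "*", start = c(30L, 40L))

    # Union of loci seen in any file (duplicates dropped), as in the code above.
    loci_dt <- funion(loci_file1, Reduce(funion, list(loci_file2, loci_file3)))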
@@ -280,14 +242,22 @@
     }
     bsseq_dt <- .readBismarkAsBSseqDT(
         file = file,
+        # NOTE: No need to remove zero coverage loci because we combine with
+        #       loci_dt, which by construction only contains loci with non-zero
+        #       coverage.
        rmZeroCov = FALSE,
         strandCollapse = strandCollapse,
         check = TRUE,
         verbose = verbose)
+    # Sort bsseq_dt to use same order as loci_dt.
+    bsseq_dt[, c("seqnames", "strand") :=
+                 .(factor(seqnames, levels(loci_dt[["seqnames"]])),
+                   strand(strand))]
+    setkeyv(bsseq_dt, key(loci_dt))
 
     # Combine data for b-th file with loci_dt and construct counts -------------
-    # TODO: Need to use GenomicRanges' strand matching behaviour
-    # TODO: `nomatch = NA_integer_`?
+    # TODO: (UP TO HERE) Need to use GenomicRanges' strand matching behaviour
+    # NOTE: Can't use nomatch = 0, so have to do this in two steps.
     counts <- bsseq_dt[loci_dt, c("M", "U"), nomatch = NA]
     counts[is.na(M), M := 0L]
     counts[is.na(U), U := 0L]
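The two-step construction above (keyed join against loci_dt, then replace NA with 0 for loci absent from this file) can be illustrated with toy tables:

    library(data.table)

    loci_dt <- data.table(
        seqnames = "chr1", strand = "*", start = c(10L, 20L, 30L),
        key = c("seqnames", "strand", "start"))
    bsseq_dt <- data.table(
        seqnames = "chr1", strand = "*", start = c(10L, 30L),
        M = c(3L, 1L), U = c(2L, 5L),
        key = c("seqnames", "strand", "start"))

    # One row per locus in loci_dt; loci missing from bsseq_dt get NA counts.
    counts <- bsseq_dt[loci_dt, c("M", "U"), nomatch = NA]
    counts[is.na(M), M := 0L]
    counts[is.na(U), U := 0L]
    # counts is now M = c(3, 0, 1), U = c(2, 0, 5), in loci_dt order.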
@@ -405,6 +375,8 @@
 # TODO: Support BPREDO?
 # TODO: Support passing a colData so that metadata is automatically added to
 #       samples?
+# TODO: Probably pass down verbose as subverbose to functions that take a
+#       verbose argument.
 read.bismark <- function(files,
                          sampleNames = basename(files),
                          rmZeroCov = FALSE,
@@ -489,16 +461,15 @@ read.bismark <- function(files,
             message("[read.bismark] Reading files to construct GRanges with ",
                     "valid loci ...")
         }
-        loci_dt <- .contructLociDTFromBismarkFiles(
+        loci_dt_and_seqinfo <- .contructLociDTAndSeqinfoFromBismarkFiles(
             files = files,
             rmZeroCov = rmZeroCov,
             strandCollapse = strandCollapse,
             seqinfo = seqinfo,
             verbose = subverbose,
             BPPARAM = BPPARAM)
-        if (is.null(seqinfo)) {
-            seqinfo <- .constructSeqinfoFromLociDT(loci_dt)
-        }
+        loci_dt <- loci_dt_and_seqinfo[["loci_dt"]]
+        seqinfo <- loci_dt_and_seqinfo[["seqinfo"]]
         gr <- .lociDTAsGRanges(loci_dt, seqinfo)
     } else {
         if (verbose) message("[read.bismark] Using 'gr' as GRanges with loci")
@@ -508,7 +479,7 @@ read.bismark <- function(files,
             message("[read.bismark] Collapsing strand of loci in 'gr' ...")
         }
         # ptime1 <- proc.time()
-        loci_dt <- .strandCollapseDT(loci_dt, has_counts = FALSE)
+        loci_dt <- .strandCollapseLociDT(loci_dt)
         # ptime2 <- proc.time()
         # stime <- (ptime2 - ptime1)[3]
         # if (verbose) {
@@ -614,3 +585,19 @@ read.bismark <- function(files,
 # TODO: Document internal functions for my own sanity. Also, some may be useful
 #       to users of bsseq (although I still won't export these for the time
 #       being).
+# TODO: Allow user to specify HDF5 file and have both M and Cov written to that
+#       file.
+# TODO: Helper function to obtain CpX loci from BSgenome as GRanges to pass as
+#       'gr' argument in read.bismark().
+# TODO: (long term) Current implementation requires that user can load at least
+#       one sample's worth of data into memory per worker. Could instead read
+#       chunks of data, write to sink, load next chunk, etc.
+# TODO: Add big note to documentation that .cov file does not contain strand
+#       information, which means strandCollapse can't be used (unless 'gr' is
+#       supplied or one of the other files is a cytosineReport file).
+# TODO: Document that if 'gr' is NULL and any 'files' (especially the first
+#       file) are .cov files, then any loci present in the .cov files will have
+#       their strand set to *. If you are mixing and matching .cov and
+#       .cytosineReport files and don't want this behaviour (i.e. you want to
+#       retain strand) then you'll need to construct your own 'gr' and pass
+#       this to the function. Add unit tests for this behaviour.
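For the 'CpX loci from a BSgenome' TODO, a rough sketch of the kind of helper envisaged (hypothetical; restricted to plus-strand CpGs on one chromosome of a BSgenome the user supplies):

    suppressPackageStartupMessages({
        library(GenomicRanges)
        library(Biostrings)
        library(BSgenome.Hsapiens.UCSC.hg19)  # or any other installed BSgenome
    })

    # Plus-strand C positions of CpG dinucleotides on chr22.
    cg <- matchPattern("CG", BSgenome.Hsapiens.UCSC.hg19[["chr22"]])
    gr <- GRanges("chr22", IRanges(start(cg), width = 1L), strand = "+")
    # 'gr' could then be passed as the 'gr' argument of read.bismark().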