... | ... |
@@ -1,5 +1,5 @@ |
1 | 1 |
Package: bsseq |
2 |
-Version: 1.17.0 |
|
2 |
+Version: 1.17.1 |
|
3 | 3 |
Encoding: UTF-8 |
4 | 4 |
Title: Analyze, manage and store bisulfite sequencing data |
5 | 5 |
Description: A collection of tools for analyzing and visualizing bisulfite |
... | ... |
@@ -30,7 +30,8 @@ Imports: |
30 | 30 |
permute, |
31 | 31 |
limma, |
32 | 32 |
DelayedArray (>= 0.5.34), |
33 |
- HDF5Array |
|
33 |
+ HDF5Array, |
|
34 |
+ readr |
|
34 | 35 |
Suggests: |
35 | 36 |
RUnit, |
36 | 37 |
bsseqData, |
... | ... |
@@ -31,14 +31,18 @@ importMethodsFrom(GenomeInfoDb, "seqlengths", "seqlengths<-", "seqinfo", "seqinf |
31 | 31 |
"seqnames", "seqnames<-", "seqlevels", "seqlevels<-") |
32 | 32 |
import(S4Vectors) |
33 | 33 |
importFrom(gtools, "combinations") |
34 |
-importFrom(data.table, "fread", "setnames") |
|
34 |
+# importFrom(data.table, "setnames", "setDT", "data.table", ":=", "setkey") |
|
35 |
+import(data.table) |
|
35 | 36 |
importFrom(R.utils, "isGzipped", "gunzip") |
36 | 37 |
import(limma) |
37 | 38 |
importFrom(permute, "shuffleSet", "how") |
38 | 39 |
importClassesFrom(DelayedArray, "DelayedArray", "DelayedMatrix") |
39 | 40 |
importFrom(DelayedArray, "DelayedArray", "plogis", "pmin2", "pmax2", "rowMaxs", |
40 |
- "rowMins") |
|
41 |
+ "rowMins", "getRealizationBackend", "setRealizationBackend", "close", |
|
42 |
+ "RegularArrayGrid", "write_block_to_sink") |
|
41 | 43 |
importClassesFrom(HDF5Array, "HDF5Array") |
44 |
+importFrom(readr, "count_fields", "cols_only", "col_character", "col_integer", |
|
45 |
+ "col_skip", "col_double", "read_tsv", "tokenizer_tsv") |
|
42 | 46 |
|
43 | 47 |
## |
44 | 48 |
## Exporting |
... | ... |
@@ -1,153 +1,455 @@ |
1 |
+# Internal functions ----------------------------------------------------------- |
|
2 |
+ |
|
3 |
+.guessBismarkFileType <- function(files, n_max = 10L) { |
|
4 |
+ guessed_file_types <- setNames(vector("character", length(files)), files) |
|
5 |
+ for (file in files) { |
|
6 |
+ n_fields <- count_fields(file, tokenizer_tsv(), n_max = n_max) |
|
7 |
+ if (isTRUE(all(n_fields == 6L))) { |
|
8 |
+ guessed_file_types[file] <- "cov" |
|
9 |
+ } else if (isTRUE(all(n_fields == 7L))) { |
|
10 |
+ guessed_file_types[file] <- "cytosineReport" |
|
11 |
+ } else { |
|
12 |
+ stop("Could not guess Bismark file type for '", file, "'") |
|
13 |
+ } |
|
14 |
+ } |
|
15 |
+ guessed_file_types |
|
16 |
+} |
|
17 |
+ |
|
18 |
+.readBismark <- function(file, |
|
19 |
+ col_spec = c("BSseq", "GRanges", "all"), |
|
20 |
+ verbose = FALSE) { |
|
21 |
+ col_spec <- match.arg(col_spec) |
|
22 |
+ file_type <- .guessBismarkFileType(file) |
|
23 |
+ if (file_type == "cov") { |
|
24 |
+ col_names <- c("seqnames", "start", "end", "beta", "M", "U") |
|
25 |
+ if (col_spec == "BSseq") { |
|
26 |
+ cols <- cols_only( |
|
27 |
+ seqnames = col_character(), |
|
28 |
+ start = col_integer(), |
|
29 |
+ end = col_skip(), |
|
30 |
+ beta = col_skip(), |
|
31 |
+ M = col_integer(), |
|
32 |
+ U = col_integer()) |
|
33 |
+ } else if (col_spec == "GRanges") { |
|
34 |
+ cols <- cols( |
|
35 |
+ seqnames = col_character(), |
|
36 |
+ start = col_integer(), |
|
37 |
+ end = col_skip(), |
|
38 |
+ beta = col_skip(), |
|
39 |
+ M = col_skip(), |
|
40 |
+ U = col_skip()) |
|
41 |
+ } else if (col_spec == "all") { |
|
42 |
+ cols <- cols( |
|
43 |
+ seqnames = col_character(), |
|
44 |
+ start = col_integer(), |
|
45 |
+ end = col_integer(), |
|
46 |
+ beta = col_double(), |
|
47 |
+ M = col_integer(), |
|
48 |
+ U = col_integer()) |
|
49 |
+ } |
|
50 |
+ } else if (file_type == "cytosineReport") { |
|
51 |
+ col_names = c("seqnames", "start", "strand", "M", "U", |
|
52 |
+ "dinucleotide_context", "trinucleotide_context") |
|
53 |
+ if (col_spec == "BSseq") { |
|
54 |
+ cols <- cols_only( |
|
55 |
+ seqnames = col_character(), |
|
56 |
+ start = col_integer(), |
|
57 |
+ strand = col_character(), |
|
58 |
+ M = col_integer(), |
|
59 |
+ U = col_integer(), |
|
60 |
+ dinucleotide_context = col_skip(), |
|
61 |
+ trinucleotide_context = col_skip()) |
|
62 |
+ } else if (col_spec == "GRanges") { |
|
63 |
+ cols <- cols_only( |
|
64 |
+ seqnames = col_character(), |
|
65 |
+ start = col_integer(), |
|
66 |
+ strand = col_character(), |
|
67 |
+ M = col_skip(), |
|
68 |
+ U = col_skip(), |
|
69 |
+ dinucleotide_context = col_skip(), |
|
70 |
+ trinucleotide_context = col_skip()) |
|
71 |
+ } else if (col_spec == "all") { |
|
72 |
+ cols <- cols_only( |
|
73 |
+ seqnames = col_character(), |
|
74 |
+ start = col_integer(), |
|
75 |
+ strand = col_character(), |
|
76 |
+ M = col_integer(), |
|
77 |
+ U = col_integer(), |
|
78 |
+ dinucleotide_context = col_character(), |
|
79 |
+ trinucleotide_context = col_character()) |
|
80 |
+ } |
|
81 |
+ } |
|
82 |
+ if (verbose) { |
|
83 |
+ message("[.readBismark] Reading file '", file, "'") |
|
84 |
+ } |
|
85 |
+ ptime1 <- proc.time() |
|
86 |
+ x <- read_tsv( |
|
87 |
+ file = file, |
|
88 |
+ col_names = col_names, |
|
89 |
+ col_types = cols, |
|
90 |
+ progress = FALSE) |
|
91 |
+ ptime2 <- proc.time() |
|
92 |
+ stime <- (ptime2 - ptime1)[3] |
|
93 |
+ if (verbose) { |
|
94 |
+ cat(sprintf("done in %.1f secs\n", stime)) |
|
95 |
+ } |
|
96 |
+ x |
|
97 |
+} |
|
98 |
+ |
|
99 |
+.readBismarkAsBSseqDT <- function(file, rmZeroCov, strandCollapse, verbose) { |
|
100 |
+ x <- .readBismark(file, "BSseq", verbose) |
|
101 |
+ setDT(x) |
|
102 |
+ # Data is unstranded if none is provided |
|
103 |
+ # TODO: What's the data.table way to check if column exists and if create |
|
104 |
+ # it if it doesn't exist? |
|
105 |
+ if (is.null(x[["strand"]])) { |
|
106 |
+ x[, strand := "*"] |
|
107 |
+ } |
|
108 |
+ if (strandCollapse) { |
|
109 |
+ x <- .strandCollapseDT(x, has_counts = TRUE) |
|
110 |
+ } |
|
111 |
+ if (rmZeroCov) { |
|
112 |
+ return(x[(M + U) > 0]) |
|
113 |
+ } |
|
114 |
+ setkey(x, seqnames, strand, start) |
|
115 |
+ x |
|
116 |
+} |
|
117 |
+ |
|
118 |
+.constructSeqinfoFromLociDT <- function(loci_dt) { |
|
119 |
+ seqnames <- loci_dt[, as.character(unique(seqnames))] |
|
120 |
+ sortSeqlevels(Seqinfo(seqnames = seqnames)) |
|
121 |
+} |
|
122 |
+ |
|
123 |
+.contructLociDTFromBismarkFiles <- function(files, |
|
124 |
+ rmZeroCov, |
|
125 |
+ strandCollapse, |
|
126 |
+ seqinfo, |
|
127 |
+ verbose) { |
|
128 |
+ subverbose <- max(as.integer(verbose) - 1L, 0L) |
|
129 |
+ |
|
130 |
+ # Initalise `loci_dt` using the first file. |
|
131 |
+ if (verbose) { |
|
132 |
+ message("[.contructLociDTFromBismarkFiles] Extracting loci from ", |
|
133 |
+ "'", files[1L], "'") |
|
134 |
+ } |
|
135 |
+ loci_dt <- .readBismarkAsBSseqDT( |
|
136 |
+ file = files[1L], |
|
137 |
+ rmZeroCov = rmZeroCov, |
|
138 |
+ strandCollapse = strandCollapse, |
|
139 |
+ verbose = subverbose) |
|
140 |
+ # Drop 'M' and 'U' |
|
141 |
+ loci_dt[, c("M", "U") := .(NULL, NULL)] |
|
142 |
+ |
|
143 |
+ # TODO: Do in batches in parallel (n_batches = mc.cores) |
|
144 |
+ # Loop over remaining files |
|
145 |
+ for (file in files[-1L]) { |
|
146 |
+ if (verbose) { |
|
147 |
+ message("[.contructLociDTFromBismarkFiles] Extracting loci from ", |
|
148 |
+ "'", file, "'") |
|
149 |
+ } |
|
150 |
+ # Process this file |
|
151 |
+ loci_from_this_file_dt <- .readBismarkAsBSseqDT( |
|
152 |
+ file = file, |
|
153 |
+ rmZeroCov = rmZeroCov, |
|
154 |
+ strandCollapse = strandCollapse, |
|
155 |
+ verbose = subverbose) |
|
156 |
+ # Drop 'M' and 'U' |
|
157 |
+ loci_from_this_file_dt[, c("M", "U") := .(NULL, NULL)] |
|
158 |
+ |
|
159 |
+ # Update `loci_dt` |
|
160 |
+ loci_dt <- merge(loci_dt, loci_from_this_file_dt, all = TRUE) |
|
161 |
+ } |
|
162 |
+ |
|
163 |
+ # Construct seqinfo if none supplied |
|
164 |
+ if (is.null(seqinfo)) { |
|
165 |
+ seqinfo <- .constructSeqinfoFromLociDT(loci_dt) |
|
166 |
+ } |
|
167 |
+ # Sort the loci using the "natural order" of ordering the elements of a |
|
168 |
+ # GenomicRanges object (see ?`sort,GenomicRanges-method`) |
|
169 |
+ loci_dt[, |
|
170 |
+ c("seqnames", "strand") := |
|
171 |
+ .(factor(seqnames, levels = seqlevels(seqinfo)), |
|
172 |
+ strand(strand))] |
|
173 |
+ setkey(loci_dt, seqnames, strand, start) |
|
174 |
+ loci_dt |
|
175 |
+} |
|
176 |
+ |
|
177 |
+.constructCountsFromBismarkFilesAndLociDT <- function(files, |
|
178 |
+ loci_dt, |
|
179 |
+ strandCollapse, |
|
180 |
+ mc.cores, |
|
181 |
+ BACKEND, |
|
182 |
+ verbose = FALSE) { |
|
183 |
+ ans_nrow <- nrow(loci_dt) |
|
184 |
+ ans_ncol <- length(files) |
|
185 |
+ if (is.null(BACKEND)) { |
|
186 |
+ M <- matrix(NA_integer_, nrow = ans_nrow, ncol = ans_ncol) |
|
187 |
+ Cov <- matrix(NA_integer_, nrow = ans_nrow, ncol = ans_ncol) |
|
188 |
+ } else { |
|
189 |
+ # Set up intermediate RealizationSink objects of appropriate dimensions |
|
190 |
+ # and type. These are ultimately coerced to the output DelayedMatrix |
|
191 |
+ # objects, `M` and `U`. |
|
192 |
+ # NOTE: Need to register the supplied BACKEND while being sure to |
|
193 |
+ # re-register any existing backend upon exiting the function. |
|
194 |
+ current_BACKEND <- getRealizationBackend() |
|
195 |
+ on.exit(setRealizationBackend(current_BACKEND)) |
|
196 |
+ setRealizationBackend(BACKEND) |
|
197 |
+ # NOTE: Don't do `U_sink <- M_sink` or else these will reference the |
|
198 |
+ # same object and clobber each other when written to! |
|
199 |
+ M_sink <- DelayedArray:::RealizationSink( |
|
200 |
+ dim = c(ans_nrow, ans_ncol), |
|
201 |
+ type = "integer") |
|
202 |
+ on.exit(close(M_sink), add = TRUE) |
|
203 |
+ Cov_sink <- DelayedArray:::RealizationSink( |
|
204 |
+ dim = c(ans_nrow, ans_ncol), |
|
205 |
+ type = "integer") |
|
206 |
+ on.exit(close(Cov_sink), add = TRUE) |
|
207 |
+ grid <- RegularArrayGrid(dim(M_sink), c(ans_nrow, 1L)) |
|
208 |
+ } |
|
209 |
+ |
|
210 |
+ # TODO: Load as many files as safe according to block.size (if using HDF5, |
|
211 |
+ # otherwise load them all up) |
|
212 |
+ for (k in seq_along(files)) { |
|
213 |
+ file <- files[k] |
|
214 |
+ if (verbose) { |
|
215 |
+ message("[.constructCounts] Extracting counts from ", "'", file, |
|
216 |
+ "'") |
|
217 |
+ } |
|
218 |
+ bsseq_dt <- .readBismarkAsBSseqDT( |
|
219 |
+ file = file, |
|
220 |
+ rmZeroCov = FALSE, |
|
221 |
+ strandCollapse = strandCollapse, |
|
222 |
+ verbose = verbose) |
|
223 |
+ # TODO: Need to use GenomicRanges' strand matching behaviour |
|
224 |
+ counts <- bsseq_dt[loci_dt, c("M", "U"), nomatch = NA] |
|
225 |
+ counts[is.na(M), M := 0L] |
|
226 |
+ counts[is.na(U), U := 0L] |
|
227 |
+ counts[, c("Cov", "U") := .(M + U, NULL)] |
|
228 |
+ |
|
229 |
+ if (is.null(BACKEND)) { |
|
230 |
+ M[, k] <- counts[["M"]] |
|
231 |
+ Cov[, k] <- counts[["Cov"]] |
|
232 |
+ } else { |
|
233 |
+ viewport <- grid[[k]] |
|
234 |
+ # TODO: Does as.matrix() cause a copy? |
|
235 |
+ write_block_to_sink( |
|
236 |
+ block = as.matrix(counts[["M"]]), |
|
237 |
+ sink = M_sink, |
|
238 |
+ viewport = viewport) |
|
239 |
+ write_block_to_sink( |
|
240 |
+ block = as.matrix(counts[["Cov"]]), |
|
241 |
+ sink = Cov_sink, |
|
242 |
+ viewport = viewport) |
|
243 |
+ } |
|
244 |
+ } |
|
245 |
+ if (!is.null(BACKEND)) { |
|
246 |
+ M <- as(M_sink, "DelayedArray") |
|
247 |
+ Cov <- as(Cov_sink, "DelayedArray") |
|
248 |
+ } |
|
249 |
+ list(M = M, Cov = Cov) |
|
250 |
+} |
|
251 |
+ |
|
252 |
+.lociDTAsGRanges <- function(loci_dt, seqinfo = NULL) { |
|
253 |
+ GRanges( |
|
254 |
+ seqnames = loci_dt[["seqnames"]], |
|
255 |
+ ranges = IRanges(loci_dt[["start"]], width = 1L), |
|
256 |
+ strand = loci_dt[["strand"]], |
|
257 |
+ seqinfo = seqinfo) |
|
258 |
+} |
|
259 |
+ |
|
260 |
+.grAsLociDT <- function(gr) { |
|
261 |
+ data.table( |
|
262 |
+ seqnames = as.factor(seqnames(gr)), |
|
263 |
+ start = start(gr), |
|
264 |
+ strand = as.factor(strand(gr)), |
|
265 |
+ key = c("seqnames", "strand", "start")) |
|
266 |
+} |
|
267 |
+ |
|
268 |
+.strandCollapseDT <- function(x, has_counts = TRUE) { |
|
269 |
+ # Shift loci on negative strand by 1 to the left |
|
270 |
+ x[strand == "-", start := start - 1L] |
|
271 |
+ # Unstrand all loci |
|
272 |
+ x[, strand := "*"] |
|
273 |
+ # Set key |
|
274 |
+ setkey(x, seqnames, strand, start) |
|
275 |
+ # TODO: Is by |
|
276 |
+ if (has_counts) return(x[, .(M = sum(M), U = sum(U)), by = key(x)]) |
|
277 |
+ unique(x) |
|
278 |
+} |
|
279 |
+ |
|
280 |
+# Exported functions ----------------------------------------------------------- |
|
281 |
+ |
|
1 | 282 |
read.bismark <- function(files, |
2 |
- sampleNames, |
|
283 |
+ sampleNames = basename(files), |
|
3 | 284 |
rmZeroCov = FALSE, |
4 | 285 |
strandCollapse = TRUE, |
5 | 286 |
fileType = c("cov", "oldBedGraph", "cytosineReport"), |
287 |
+ seqinfo = NULL, |
|
288 |
+ gr = NULL, |
|
6 | 289 |
mc.cores = 1, |
7 | 290 |
verbose = TRUE, |
8 | 291 |
BACKEND = NULL) { |
9 |
- ## Argument checking |
|
10 |
- if (!is.null(BACKEND) && mc.cores > 1) { |
|
11 |
- stop("Currently, 'mc.cores' must be 1 if 'BACKEND' is not NULL") |
|
292 |
+ # Argument checking |
|
293 |
+ if (anyDuplicated(files)) stop("'files' cannot have duplicate entires") |
|
294 |
+ file_exists <- file.exists(files) |
|
295 |
+ if (!isTRUE(all(file_exists))) { |
|
296 |
+ stop("These files cannot be found:\n", |
|
297 |
+ " ", paste(files[!file_exists], collapse = "\n ")) |
|
12 | 298 |
} |
13 |
- if (anyDuplicated(files)) { |
|
14 |
- stop("duplicate entries in 'files'") |
|
299 |
+ # TODO: Check if this is still required |
|
300 |
+ # NOTE: SummarizedExperiment validator makes calls to identical() that will |
|
301 |
+ # fail due to how sampleNames are propagated sometimes with and |
|
302 |
+ # without names. To make life simpler, we simply strip the names. |
|
303 |
+ sampleNames <- unname(sampleNames) |
|
304 |
+ if (length(sampleNames) != length(files)) { |
|
305 |
+ stop("'sampleNames' must have the same length as 'files'.") |
|
15 | 306 |
} |
16 |
- if (length(sampleNames) != length(files) || anyDuplicated(sampleNames)) { |
|
17 |
- stop("argument 'sampleNames' has to have the same length as argument 'files', without duplicate entries") |
|
307 |
+ if (anyDuplicated(sampleNames)) { |
|
308 |
+ stop("'sampleNames' cannot have duplicate entires") |
|
18 | 309 |
} |
19 |
- fileType <- match.arg(fileType) |
|
20 |
- if (verbose) { |
|
21 |
- message(paste0("Assuming file type is ", fileType)) |
|
310 |
+ # TODO: What's the proper way to deprecate a function argument? |
|
311 |
+ if (!missing(fileType)) { |
|
312 |
+ warning("'fileType' is deprecated and ignored.") |
|
22 | 313 |
} |
23 |
- if (strandCollapse && (fileType == "cov" || fileType == "oldBedGraph")) { |
|
24 |
- stop("'strandCollapse' must be 'FALSE' if 'fileType' is '", fileType, "'") |
|
314 |
+ if (!is.null(gr) && !is.null(seqinfo)) { |
|
315 |
+ warning("'seqinfo' is ignored when 'gr' is supplied.") |
|
25 | 316 |
} |
26 |
- ## SummarizedExperiment validator makes calls to identical() that will fail |
|
27 |
- ## due to how sampleNames are propagated sometimes with and without names(). |
|
28 |
- ## To make life simpler, we simply strip the names() |
|
29 |
- sampleNames <- unname(sampleNames) |
|
317 |
+ subverbose <- max(as.integer(verbose) - 1L, 0L) |
|
30 | 318 |
|
31 |
- ## Process each file |
|
32 |
- idxes <- seq_along(files) |
|
33 |
- names(idxes) <- sampleNames |
|
34 |
- allOut <- mclapply(idxes, function(ii) { |
|
319 |
+ # Construct a data.table and a GRanges with all "valid loci". |
|
320 |
+ # NOTE: "Valid loci" are those that remain after collapsing by strand (if |
|
321 |
+ # strandCollapse == TRUE) and then removing loci with zero coverage |
|
322 |
+ # in all samples (if rmZeroCov == TRUE). |
|
323 |
+ if (is.null(gr)) { |
|
35 | 324 |
if (verbose) { |
36 |
- cat(sprintf("[read.bismark] Reading file '%s' ... ", files[ii])) |
|
325 |
+ message("[read.bismark] Constructing GRanges with valid loci ...") |
|
37 | 326 |
} |
38 | 327 |
ptime1 <- proc.time() |
39 |
- if (fileType == "cov" || fileType == "oldBedGraph") { |
|
40 |
- out <- read.bismarkCovRaw(thisfile = files[ii], |
|
41 |
- thisSampleName = sampleNames[ii], |
|
42 |
- rmZeroCov = rmZeroCov, |
|
43 |
- BACKEND = BACKEND) |
|
44 |
- } else if (fileType == "cytosineReport") { |
|
45 |
- out <- read.bismarkCytosineReportRaw(thisfile = files[ii], |
|
46 |
- thisSampleName = sampleNames[ii], |
|
47 |
- rmZeroCov = rmZeroCov, |
|
48 |
- keepContext = FALSE, |
|
49 |
- BACKEND = BACKEND) |
|
328 |
+ loci_dt <- .contructLociDTFromBismarkFiles( |
|
329 |
+ files = files, |
|
330 |
+ rmZeroCov = rmZeroCov, |
|
331 |
+ strandCollapse = strandCollapse, |
|
332 |
+ seqinfo = seqinfo, |
|
333 |
+ verbose = subverbose) |
|
334 |
+ if (is.null(seqinfo)) { |
|
335 |
+ seqinfo <- .constructSeqinfoFromLociDT(loci_dt) |
|
50 | 336 |
} |
51 |
- if (strandCollapse) { |
|
52 |
- out <- strandCollapse(out) |
|
337 |
+ gr <- .lociDTAsGRanges(loci_dt, seqinfo) |
|
338 |
+ ptime2 <- proc.time() |
|
339 |
+ stime <- (ptime2 - ptime1)[3] |
|
340 |
+ if (verbose) { |
|
341 |
+ cat(sprintf("done in %.1f secs\n", stime)) |
|
53 | 342 |
} |
343 |
+ } else { |
|
344 |
+ if (verbose) message("[read.bismark] Using 'gr' as GRanges with loci") |
|
345 |
+ # NOTE: Don't use as.data.table() because it will modify gr by |
|
346 |
+ # reference (https://github.com/Rdatatable/data.table/issues/2278) |
|
347 |
+ ptime1 <- proc.time() |
|
348 |
+ loci_dt <- .grAsLociDT(gr) |
|
54 | 349 |
ptime2 <- proc.time() |
55 | 350 |
stime <- (ptime2 - ptime1)[3] |
56 | 351 |
if (verbose) { |
57 | 352 |
cat(sprintf("done in %.1f secs\n", stime)) |
58 | 353 |
} |
59 |
- out |
|
60 |
- }, mc.cores = mc.cores) |
|
354 |
+ if (strandCollapse) { |
|
355 |
+ if (verbose) { |
|
356 |
+ message("[read.bismark] Collapsing strand of loci in 'gr' ...") |
|
357 |
+ } |
|
358 |
+ ptime1 <- proc.time() |
|
359 |
+ loci_dt <- .strandCollapseDT(loci_dt, has_counts = FALSE) |
|
360 |
+ ptime2 <- proc.time() |
|
361 |
+ stime <- (ptime2 - ptime1)[3] |
|
362 |
+ if (verbose) { |
|
363 |
+ cat(sprintf("done in %.1f secs\n", stime)) |
|
364 |
+ } |
|
365 |
+ } |
|
366 |
+ if (rmZeroCov) { |
|
367 |
+ # NOTE: Have to parse files in order to identify loci with zero |
|
368 |
+ # coverage in all samples. |
|
369 |
+ if (verbose) { |
|
370 |
+ message("[read.bismark] Identifying loci in 'gr' with zero ", |
|
371 |
+ "coverage in all samples ...") |
|
372 |
+ } |
|
373 |
+ ptime1 <- proc.time() |
|
374 |
+ loci_from_files_dt <- .contructLociDTFromBismarkFiles( |
|
375 |
+ files = files, |
|
376 |
+ rmZeroCov = rmZeroCov, |
|
377 |
+ strandCollapse = strandCollapse, |
|
378 |
+ seqinfo = seqinfo, |
|
379 |
+ verbose = subverbose) |
|
380 |
+ # Retain the intersection of loci_dt and loci_from_files_dt |
|
381 |
+ # TODO: Need to use GenomicRanges' strand matching behaviour |
|
382 |
+ loci_dt <- loci_from_files_dt[loci_dt] |
|
383 |
+ ptime2 <- proc.time() |
|
384 |
+ stime <- (ptime2 - ptime1)[3] |
|
385 |
+ if (verbose) { |
|
386 |
+ cat(sprintf("done in %.1f secs\n", stime)) |
|
387 |
+ } |
|
388 |
+ } |
|
389 |
+ if (strandCollapse || rmZeroCov) { |
|
390 |
+ # NOTE: Have to update 'gr' if loci have been collapsed by strand |
|
391 |
+ # or loci have been filtered to remove loci with zero |
|
392 |
+ # coverage. |
|
393 |
+ if (verbose) { |
|
394 |
+ message("[read.bismark] Filtering 'gr' to retain valid loci ", |
|
395 |
+ "...") |
|
396 |
+ } |
|
397 |
+ ptime1 <- proc.time() |
|
398 |
+ gr <- .lociDTAsGRanges(loci_dt, seqinfo) |
|
399 |
+ ptime2 <- proc.time() |
|
400 |
+ stime <- (ptime2 - ptime1)[3] |
|
401 |
+ if (verbose) { |
|
402 |
+ cat(sprintf("done in %.1f secs\n", stime)) |
|
403 |
+ } |
|
404 |
+ } |
|
405 |
+ } |
|
61 | 406 |
|
407 |
+ # Construct 'M' and 'Cov' matrices |
|
62 | 408 |
if (verbose) { |
63 |
- cat(sprintf("[read.bismark] Joining samples ... ")) |
|
409 |
+ message("[read.bismark] Constructing 'M' and 'Cov' matrices ...") |
|
64 | 410 |
} |
65 | 411 |
ptime1 <- proc.time() |
66 |
- allOut <- combineList(allOut) |
|
412 |
+ counts <- .constructCountsFromBismarkFilesAndLociDT( |
|
413 |
+ files = files, |
|
414 |
+ loci_dt = loci_dt, |
|
415 |
+ strandCollapse = strandCollapse, |
|
416 |
+ mc.cores = mc.cores, |
|
417 |
+ BACKEND = BACKEND, |
|
418 |
+ verbose = subverbose) |
|
67 | 419 |
ptime2 <- proc.time() |
68 |
- stime <- (ptime2 - ptime1)[3L] |
|
420 |
+ stime <- (ptime2 - ptime1)[3] |
|
69 | 421 |
if (verbose) { |
70 | 422 |
cat(sprintf("done in %.1f secs\n", stime)) |
71 | 423 |
} |
72 |
- allOut |
|
73 |
-} |
|
74 | 424 |
|
75 |
-read.bismarkCovRaw <- function(thisfile, |
|
76 |
- thisSampleName, |
|
77 |
- rmZeroCov, |
|
78 |
- BACKEND = NULL) { |
|
79 |
- |
|
80 |
- ## data.table::fread() can't read directly from a gzipped file so, if |
|
81 |
- ## necessary, gunzip the file to a temporary location. |
|
82 |
- if (isGzipped(thisfile)) { |
|
83 |
- thisfile <- gunzip(thisfile, |
|
84 |
- temporary = TRUE, |
|
85 |
- overwrite = TRUE, |
|
86 |
- remove = FALSE) |
|
87 |
- } |
|
88 |
- |
|
89 |
- ## Read in the file |
|
90 |
- out <- fread(thisfile) |
|
91 |
- if (ncol(out) != 6L) { |
|
92 |
- stop("unknown file format") |
|
93 |
- } |
|
94 |
- |
|
95 |
- ## Create GRanges instance from 'out' |
|
96 |
- gr <- GRanges(seqnames = out[[1L]], |
|
97 |
- ranges = IRanges(start = out[[2L]], width = 1L)) |
|
98 |
- |
|
99 |
- ## Create BSseq instance from 'out' |
|
100 |
- ## TODO: Might be able to avoid the as.matrix() |
|
101 |
- M <- as.matrix(out[[5L]]) |
|
102 |
- Cov <- as.matrix(out[[5L]] + out[[6L]]) |
|
103 |
- M <- realize(M, BACKEND = BACKEND) |
|
104 |
- Cov <- realize(Cov, BACKEND = BACKEND) |
|
105 |
- BSseq(gr = gr, |
|
106 |
- M = M, |
|
107 |
- Cov = Cov, |
|
108 |
- sampleNames = thisSampleName, |
|
109 |
- rmZeroCov = rmZeroCov) |
|
425 |
+ # Construct BSseq object |
|
426 |
+ if (verbose) { |
|
427 |
+ cat(sprintf("[read.bismark] Constructing BSseq object ... ")) |
|
428 |
+ } |
|
429 |
+ ptime1 <- proc.time() |
|
430 |
+ # TODO: Use new_BSseq(), an internal, fast/minimal BSseq constructor |
|
431 |
+ # (this function needs to be written). |
|
432 |
+ BSseq <- BSseq( |
|
433 |
+ M = counts[["M"]], |
|
434 |
+ Cov = counts[["Cov"]], |
|
435 |
+ gr = gr, |
|
436 |
+ sampleNames = sampleNames) |
|
437 |
+ ptime2 <- proc.time() |
|
438 |
+ stime <- (ptime2 - ptime1)[3] |
|
439 |
+ if (verbose) { |
|
440 |
+ cat(sprintf("done in %.1f secs\n", stime)) |
|
441 |
+ } |
|
442 |
+ BSseq |
|
110 | 443 |
} |
111 | 444 |
|
112 |
-read.bismarkCytosineReportRaw <- function(thisfile, |
|
113 |
- thisSampleName, |
|
114 |
- rmZeroCov, |
|
115 |
- keepContext = FALSE, |
|
116 |
- BACKEND = NULL) { |
|
117 |
- |
|
118 |
- ## NOTE: keepContext not yet implemented |
|
119 |
- if (keepContext) { |
|
120 |
- stop("'keepContext' argument not yet implemented") |
|
121 |
- } |
|
122 |
- |
|
123 |
- ## data.table::fread() can't read directly from a gzipped file so, if |
|
124 |
- ## necessary, gunzip the file to a temporary location. |
|
125 |
- if (isGzipped(thisfile)) { |
|
126 |
- thisfile <- gunzip(thisfile, |
|
127 |
- temporary = TRUE, |
|
128 |
- overwrite = TRUE, |
|
129 |
- remove = FALSE) |
|
130 |
- } |
|
131 |
- |
|
132 |
- ## Read in the file |
|
133 |
- out <- fread(thisfile) |
|
134 |
- if (ncol(out) != 7L) { |
|
135 |
- stop("unknown file format") |
|
136 |
- } |
|
137 |
- |
|
138 |
- ## Create GRanges instance from 'out' |
|
139 |
- gr <- GRanges(seqnames = out[[1]], |
|
140 |
- ranges = IRanges(start = out[[2]], width = 1), |
|
141 |
- strand = out[[3]]) |
|
142 |
- |
|
143 |
- ## Create BSseq instance from 'out' |
|
144 |
- ## TODO: Might be able to avoid the as.matrix() |
|
145 |
- M <- as.matrix(out[[4L]]) |
|
146 |
- Cov <- as.matrix(out[[4L]] + out[[5L]]) |
|
147 |
- M <- realize(M, BACKEND = BACKEND) |
|
148 |
- Cov <- realize(Cov, BACKEND = BACKEND) |
|
149 |
- BSseq(gr = gr, sampleNames = thisSampleName, |
|
150 |
- M = M, |
|
151 |
- Cov = Cov, |
|
152 |
- rmZeroCov = rmZeroCov) |
|
153 |
-} |
|
445 |
+# TODOs ------------------------------------------------------------------------ |
|
446 |
+ |
|
447 |
+# TODO: The documentation needs a complete overhaul and checked against Bismark |
|
448 |
+# docs (e.g., the description of the .cov file is wrong) |
|
449 |
+# TODO: Consolidate use of message()/cat()/etc. |
|
450 |
+# TODO: mc.cores isn't used anywhere; can it be? |
|
451 |
+# TODO: Add function like minfi::read.metharray.sheet()? |
|
452 |
+# TODO: Should BACKEND really be an argument of read.bismark(); see related |
|
453 |
+# issue on minfi repo https://github.com/hansenlab/minfi/issues/140 |
|
454 |
+# TODO: May receive warning "In read_tokens_(data, tokenizer, col_specs, col_names, ... : length of NULL cannot be changed". This is fixed in devel version of |
|
455 |
+# readr (https://github.com/tidyverse/readr/issues/833) |