- BSseq objects can once again use ordinary matrix objects as assays.
- Reimplement `BSmooth()` more-or-less from scratch:
- Switch from 'parallel' to 'BiocParallel' for parallelization. This brings some notable improvements:
- Smoothed results can now be written directly to an on-disk realization backend by the worker. This dramatically reduces memory usage compared to previous versions of 'bsseq' that required all results be retained in-memory.
- Parallelization is now supported on Windows through the use of a 'SnowParam' object as the value of `BPPARAM`.
- Improved error handling makes it possible to gracefully resume `BSmooth()` jobs that encountered errors by re-doing only the necessary tasks.
- Detailed and extensive job logging facilities.
- Fix bug in `BSmooth()` with the `maxGap` parameter.
- Re-factor BSseq() constructor and add fast, internal .BSseq() constructor.
- Re-factor collapseBSseq() and combine(). Should be much more performant.
- Use beachmat to implement fast validity checking of 'M' and 'Cov' matrices.
- Resave BS.chr22 (supplied data) using integer for storage mode of assays to reduce size.
- Switch from RUnit to testthat. testthat has better integration with code coverage tools that help when refactoring.
... | ... |
@@ -12,8 +12,7 @@ Depends: |
12 | 12 |
methods, |
13 | 13 |
BiocGenerics, |
14 | 14 |
GenomicRanges (>= 1.29.14), |
15 |
- SummarizedExperiment (>= 1.9.18), |
|
16 |
- parallel |
|
15 |
+ SummarizedExperiment (>= 1.9.18) |
|
17 | 16 |
Imports: |
18 | 17 |
IRanges (>= 2.11.16), |
19 | 18 |
GenomeInfoDb, |
... | ... |
@@ -30,17 +29,24 @@ Imports: |
30 | 29 |
permute, |
31 | 30 |
limma, |
32 | 31 |
DelayedArray (>= 0.5.34), |
33 |
- HDF5Array |
|
32 |
+ Rcpp, |
|
33 |
+ BiocParallel, |
|
34 |
+ readr |
|
34 | 35 |
Suggests: |
35 |
- RUnit, |
|
36 |
+ testthat, |
|
36 | 37 |
bsseqData, |
37 | 38 |
BiocStyle, |
38 | 39 |
rmarkdown, |
39 | 40 |
knitr, |
41 |
+ beachmat, |
|
42 |
+ Matrix, |
|
43 |
+ HDF5Array, |
|
44 |
+ doParallel, |
|
45 |
+ testthat |
|
40 | 46 |
Collate: |
41 | 47 |
utils.R |
42 | 48 |
hasGRanges.R |
43 |
- BSseq_class.R |
|
49 |
+ BSseq-class.R |
|
44 | 50 |
BSseqTstat_class.R |
45 | 51 |
BSseq_utils.R |
46 | 52 |
combine.R |
... | ... |
@@ -57,8 +63,8 @@ Collate: |
57 | 63 |
BSseqStat_class.R |
58 | 64 |
getStats.R |
59 | 65 |
hdf5_utils.R |
60 |
- combine_utils.R |
|
61 | 66 |
DelayedArray_utils.R |
67 |
+ collapseBSseq.R |
|
62 | 68 |
License: Artistic-2.0 |
63 | 69 |
VignetteBuilder: knitr |
64 | 70 |
URL: https://github.com/kasperdanielhansen/bsseq |
... | ... |
@@ -66,3 +72,6 @@ LazyData: yes |
66 | 72 |
LazyDataCompression: xz |
67 | 73 |
BugReports: https://github.com/kasperdanielhansen/bsseq/issues |
68 | 74 |
biocViews: DNAMethylation |
75 |
+SystemRequirements: C++11 |
|
76 |
+LinkingTo: Rcpp, Rhdf5lib, beachmat |
|
77 |
+NeedsCompilation: yes |
... | ... |
@@ -17,7 +17,7 @@ importFrom(graphics, "abline", "axis", "layout", "legend", "lines", |
17 | 17 |
import(parallel) |
18 | 18 |
importFrom(locfit, "locfit", "lp") |
19 | 19 |
importFrom(DelayedMatrixStats, "rowSds", "rowVars", "colMeans2", "colSums2", |
20 |
- "rowSums2", "rowMeans2") |
|
20 |
+ "rowSums2", "rowMeans2", "rowAlls") |
|
21 | 21 |
import(IRanges) |
22 | 22 |
import(GenomicRanges) |
23 | 23 |
import(SummarizedExperiment) |
... | ... |
@@ -35,10 +35,9 @@ importFrom(data.table, "fread", "setnames") |
35 | 35 |
importFrom(R.utils, "isGzipped", "gunzip") |
36 | 36 |
import(limma) |
37 | 37 |
importFrom(permute, "shuffleSet", "how") |
38 |
-importClassesFrom(DelayedArray, "DelayedArray", "DelayedMatrix") |
|
39 |
-importFrom(DelayedArray, "DelayedArray", "plogis", "pmin2", "pmax2", "rowMaxs", |
|
40 |
- "rowMins") |
|
41 |
-importClassesFrom(HDF5Array, "HDF5Array") |
|
38 |
+import(DelayedArray) |
|
39 |
+import(BiocParallel) |
|
40 |
+importFrom(Rcpp, "sourceCpp") |
|
42 | 41 |
|
43 | 42 |
## |
44 | 43 |
## Exporting |
... | ... |
@@ -86,3 +85,5 @@ S3method("plot", "BSseqTstat") |
86 | 85 |
|
87 | 86 |
exportMethods("assays", "assayNames") |
88 | 87 |
|
88 |
+# C++ code registration |
|
89 |
+useDynLib(bsseq, .registration = TRUE, .fixes = "cxx_") |
... | ... |
@@ -1,3 +1,15 @@ |
1 |
+# Internal functions ----------------------------------------------------------- |
|
2 |
+ |
|
3 |
+.rowTickmarks <- function(hasGRanges, maxGap) { |
|
4 |
+ gr <- granges(hasGRanges) |
|
5 |
+ # NOTE: This relies on 'gr' being sorted, so really want to be sure of this! |
|
6 |
+ stopifnot(!is.unsorted(gr)) |
|
7 |
+ clusters <- reduce(gr, min.gapwidth = maxGap, with.revmap = TRUE) |
|
8 |
+ cumsum(lengths(mcols(clusters)[["revmap"]])) |
|
9 |
+} |
|
10 |
+ |
|
11 |
+# TODO: Remove other uses of this function, if possible. |
|
12 |
+# TODO: Rename to .makeClusters() since it's an internal function |
|
1 | 13 |
makeClusters <- function(hasGRanges, maxGap = 10^8) { |
2 | 14 |
chrOrder <- as.character(runValue(seqnames(hasGRanges))) |
3 | 15 |
if(anyDuplicated(chrOrder)) |
... | ... |
@@ -18,136 +30,320 @@ makeClusters <- function(hasGRanges, maxGap = 10^8) { |
18 | 30 |
clusterIdx |
19 | 31 |
} |
20 | 32 |
|
33 |
+.BSmooth <- function(b, M, Cov, pos, coef_sink, se.coef_sink, |
|
34 |
+ coef_sink_lock, se.coef_sink_lock, grid, pos_grid, |
|
35 |
+ ns, h, keep.se) { |
|
36 |
+ # Load packages on worker (required for SnowParam) ------------------------- |
|
37 |
+ |
|
38 |
+ suppressPackageStartupMessages({ |
|
39 |
+ library(BiocParallel) |
|
40 |
+ }) |
|
41 |
+ suppressPackageStartupMessages({ |
|
42 |
+ library(locfit) |
|
43 |
+ }) |
|
44 |
+ suppressPackageStartupMessages({ |
|
45 |
+ library(DelayedArray) |
|
46 |
+ }) |
|
47 |
+ |
|
48 |
+ # Construct inputs for smoothing ------------------------------------------- |
|
49 |
+ |
|
50 |
+ # NOTE: 'bb' indexes over elements of pos_grid by cycling `ncol(M)` times |
|
51 |
+ # over 1, ..., nrow(pos_grid). |
|
52 |
+ bb <- (b - 1L) %% nrow(pos_grid) + 1L |
|
53 |
+ # NOTE: unname() is necessary because M and Cov may carry colnames |
|
54 |
+ sdata <- data.frame( |
|
55 |
+ pos = DelayedArray:::extract_block(pos, pos_grid[[bb]]), |
|
56 |
+ M = unname(DelayedArray:::extract_block(M, grid[[b]])), |
|
57 |
+ Cov = unname(DelayedArray:::extract_block(Cov, grid[[b]]))) |
|
58 |
+ # Ensure 0 < M < Cov to avoid boundary issues (only relevant at loci with |
|
59 |
+ # non-zero coverage, so doesn't matter what M is for these loci). |
|
60 |
+ sdata[["M"]] <- pmin(pmax(sdata[["M"]], 0.01), pmax(sdata[["Cov"]] - 0.01)) |
|
61 |
+ n_loci <- nrow(sdata) |
|
62 |
+ n_loci_with_non_zero_coverage <- sum(sdata[["Cov"]] > 0) |
|
21 | 63 |
|
22 |
-BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"), |
|
23 |
- mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) { |
|
24 |
- smooth <- function(idxes, sampleIdx) { |
|
25 |
- ## Assuming that idxes is a set of indexes into the BSseq object |
|
26 |
- ## sampleIdx is a single character |
|
27 |
- this_sample_chr <- c(sampleNames(BSseq)[sampleIdx], |
|
28 |
- as.character(seqnames(BSseq)[idxes[1]])) |
|
29 |
- if(verbose >= 2) |
|
30 |
- cat(sprintf("[BSmooth] smoothing start: sample:%s, chr:%s, nLoci:%s\n", |
|
31 |
- this_sample_chr[1], this_sample_chr[2], length(idxes))) |
|
32 |
- Cov <- unname(as.array( |
|
33 |
- getCoverage(BSseq, type = "Cov")[idxes, sampleIdx])) |
|
34 |
- M <- unname(as.array( |
|
35 |
- getCoverage(BSseq, type = "M")[idxes, sampleIdx])) |
|
36 |
- pos <- start(BSseq)[idxes] |
|
37 |
- stopifnot(all(diff(pos) > 0)) |
|
38 |
- wh <- which(Cov != 0) |
|
39 |
- nn <- ns / length(wh) |
|
40 |
- if(length(wh) <= ns) { |
|
41 |
- if(keep.se) |
|
42 |
- se.coef <- rep(NA_real_, length(Cov)) |
|
43 |
- else |
|
44 |
- se.coef <- NULL |
|
45 |
- return(list(coef = rep(NA_real_, length(Cov)), |
|
46 |
- se.coef = se.coef, |
|
47 |
- trans = NULL, h = h, nn = nn)) |
|
64 |
+ # Fit local binomial regression model -------------------------------------- |
|
65 |
+ |
|
66 |
+ # NOTE: Require (ns + 1) loci with non-zero coverage to run smooth. |
|
67 |
+ if (n_loci_with_non_zero_coverage <= ns) { |
|
68 |
+ coef <- rep(NA_real_, n_loci) |
|
69 |
+ if (keep.se) { |
|
70 |
+ se.coef <- rep(NA_real_, n_loci) |
|
71 |
+ } else { |
|
72 |
+ se.coef <- NULL |
|
48 | 73 |
} |
49 |
- sdata <- data.frame(pos = pos[wh], |
|
50 |
- M = pmin(pmax(M[wh], 0.01), |
|
51 |
- Cov[wh] - 0.01), |
|
52 |
- Cov = Cov[wh]) |
|
53 |
- fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata, |
|
54 |
- weights = Cov, family = "binomial", maxk = 10000) |
|
55 |
- pp <- preplot(fit, where = "data", band = "local", |
|
56 |
- newdata = data.frame(pos = pos)) |
|
57 |
- if(keep.se) { |
|
74 |
+ } else { |
|
75 |
+ # Set 'nearest neighbour' smoothing parameter. |
|
76 |
+ nn <- ns / n_loci_with_non_zero_coverage |
|
77 |
+ # Fit model only using loci with non-zero coverage. |
|
78 |
+ fit <- locfit( |
|
79 |
+ M ~ lp(pos, nn = nn, h = h), |
|
80 |
+ data = sdata, |
|
81 |
+ weights = Cov, |
|
82 |
+ family = "binomial", |
|
83 |
+ subset = Cov > 0, |
|
84 |
+ maxk = 10000) |
|
85 |
+ # Evaluate smooth at all loci (regardless of coverage). |
|
86 |
+ pp <- preplot( |
|
87 |
+ object = fit, |
|
88 |
+ newdata = sdata["pos"], |
|
89 |
+ band = "local") |
|
90 |
+ coef <- pp$fit |
|
91 |
+ if (keep.se) { |
|
58 | 92 |
se.coef <- pp$se.fit |
59 | 93 |
} else { |
60 | 94 |
se.coef <- NULL |
61 | 95 |
} |
62 |
- if(verbose >= 2) |
|
63 |
- cat(sprintf("[BSmooth] smoothing end: sample:%s, chr:%s, nLoci:%s, nCoveredLoci:%s\n", |
|
64 |
- this_sample_chr[1], this_sample_chr[2], length(idxes), nrow(sdata))) |
|
65 |
- return(list(coef = pp$fit, se.coef = se.coef, |
|
66 |
- trans = pp$trans, h = h, nn = nn)) |
|
67 | 96 |
} |
68 |
- stopifnot(class(BSseq) == "BSseq") |
|
69 |
- parallelBy <- match.arg(parallelBy) |
|
70 |
- if(verbose) cat("[BSmooth] preprocessing ... ") |
|
71 |
- ptime1 <- proc.time() |
|
72 |
- clusterIdx <- makeClusters(BSseq, maxGap = maxGap) |
|
73 |
- ptime2 <- proc.time() |
|
74 |
- stime <- (ptime2 - ptime1)[3] |
|
75 |
- if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
|
76 |
- |
|
77 |
- sampleNames <- sampleNames(BSseq) |
|
78 |
- names(sampleNames) <- sampleNames |
|
79 |
- |
|
80 |
- ptime.outer1 <- proc.time() |
|
81 |
- switch(parallelBy, "sample" = { |
|
82 |
- if(verbose) cat(sprintf("[BSmooth] smoothing by 'sample' (mc.cores = %d, mc.preschedule = %s)\n", |
|
83 |
- mc.cores, mc.preschedule)) |
|
84 |
- out <- mclapply(seq(along = sampleNames), function(sIdx) { |
|
85 |
- ptime1 <- proc.time() |
|
86 |
- tmp <- lapply(clusterIdx, function(jj) { |
|
87 |
- try(smooth(idxes = jj, sampleIdx = sIdx)) |
|
88 |
- }) |
|
89 |
- coef <- do.call(c, lapply(tmp, function(xx) xx$coef)) |
|
90 |
- se.coef <- do.call(c, lapply(tmp, function(xx) xx$se.coef)) |
|
91 |
- ptime2 <- proc.time() |
|
92 |
- stime <- (ptime2 - ptime1)[3] |
|
93 |
- if(verbose) { |
|
94 |
- cat(sprintf("[BSmooth] sample %s (out of %d), done in %.1f sec\n", |
|
95 |
- sampleNames[sIdx], length(sampleNames), stime)) |
|
97 |
+ |
|
98 |
+ # Return the results of the smooth or write them to the RealizationSink ---- |
|
99 |
+ |
|
100 |
+ if (is.null(coef_sink)) { |
|
101 |
+ return(list(coef = coef, se.coef = se.coef)) |
|
102 |
+ } |
|
103 |
+ # Write to coef_sink and se.coef_sink while respecting the IPC lock(s). |
|
104 |
+ ipclock(coef_sink_lock) |
|
105 |
+ write_block_to_sink(as.matrix(coef), coef_sink, grid[[b]]) |
|
106 |
+ ipcunlock(coef_sink_lock) |
|
107 |
+ if (keep.se) { |
|
108 |
+ ipclock(se.coef_sink_lock) |
|
109 |
+ write_block_to_sink(as.matrix(se.coef), se.coef_sink, grid[[b]]) |
|
110 |
+ ipcunlock(se.coef_sink_lock) |
|
111 |
+ } |
|
112 |
+} |
|
113 |
+ |
|
114 |
+# Exported functions ----------------------------------------------------------- |
|
115 |
+ |
|
116 |
+# TODO: Make 'mc.cores', 'mc.preschedule', and 'verbose' defunct one release |
|
117 |
+# cycle with them deprecated. |
|
118 |
+# TODO: Consider having BSmooth() create a 'smoothed' assay in addition to or |
|
119 |
+# instead of the 'coef' and 'se.coef' assays. |
|
120 |
+# TODO: To benefit from error recovery requires that bpStopOnError(BPPARAM) is |
|
121 |
+# TRUE but the default is FALSE. How to help the user? Probably don't |
|
122 |
+# want to override the user-specified value. |
|
123 |
+BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, |
|
124 |
+ parallelBy = c("sample", "chromosome"), |
|
125 |
+ mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, |
|
126 |
+ verbose = TRUE, BPREDO = list(), BPPARAM = bpparam()) { |
|
127 |
+ # Argument checks----------------------------------------------------------- |
|
128 |
+ |
|
129 |
+ # Check if this is a re-do. |
|
130 |
+ # NOTE: Under the assumptions of a re-do (i.e. BSmooth() is being re-run |
|
131 |
+ # with the same arguments), we skip straight ahead to re-running |
|
132 |
+ # failed smoothing tasks with no further argument checks. |
|
133 |
+ if (length(BPREDO)) { |
|
134 |
+ if (!is.list(BPREDO) || |
|
135 |
+ identical(names(BPREDO), c("smooth", "coef_sink", "se.coef_sink", |
|
136 |
+ "realization_backend"))) { |
|
137 |
+ stop("'BPREDO' should be a list with elements 'smooth', ", |
|
138 |
+ "'coef_sink', 'se.coef_sink', and 'realization_backend'.") |
|
139 |
+ } |
|
140 |
+ is_redo <- TRUE |
|
141 |
+ coef_sink <- BPREDO[["coef_sink"]] |
|
142 |
+ se.coef_sink <- BPREDO[["se.coef_sink"]] |
|
143 |
+ realization_backend <- BPREDO[["realization_sink"]] |
|
144 |
+ BPREDO <- BPREDO[["smooth"]] |
|
145 |
+ } else { |
|
146 |
+ is_redo <- FALSE |
|
147 |
+ # Check validity of 'BSseq' argument |
|
148 |
+ if (!is(BSseq, "BSseq")) { |
|
149 |
+ stop("'BSseq' must be a BSseq object.") |
|
150 |
+ } |
|
151 |
+ if (!isTRUE(all(width(BSseq) == 1L))) { |
|
152 |
+ stop("All loci in 'BSseq' must have width == 1.") |
|
153 |
+ } |
|
154 |
+ if (is.unsorted(BSseq)) { |
|
155 |
+ stop("'BSseq' must be sorted before smoothing. Use 'sort(BSseq)'.") |
|
156 |
+ } |
|
157 |
+ # Check for deprecated arguments and issue warning(s) if found. |
|
158 |
+ if (!missing(parallelBy)) { |
|
159 |
+ warning( |
|
160 |
+ "'parallelBy' is deprecated.\n", |
|
161 |
+ "See help(\"BSmooth\") for details.", |
|
162 |
+ call. = FALSE, |
|
163 |
+ immediate. = TRUE) |
|
164 |
+ } |
|
165 |
+ if (!missing(mc.preschedule)) { |
|
166 |
+ warning( |
|
167 |
+ "'mc.preschedule' is deprecated.\n", |
|
168 |
+ "See help(\"BSmooth\") for details.", |
|
169 |
+ call. = FALSE, |
|
170 |
+ immediate. = TRUE) |
|
171 |
+ } |
|
172 |
+ if (!missing(mc.cores)) { |
|
173 |
+ warning( |
|
174 |
+ "'mc.cores' is deprecated.\n", |
|
175 |
+ "See help(\"BSmooth\").", |
|
176 |
+ call. = FALSE, |
|
177 |
+ immediate. = TRUE) |
|
178 |
+ BPPARAM <- MulticoreParam(workers = mc.cores) |
|
179 |
+ } |
|
180 |
+ if (!missing(verbose)) { |
|
181 |
+ warning( |
|
182 |
+ "'verbose' is deprecated.\n", |
|
183 |
+ "See help(\"BSmooth\") for details.", |
|
184 |
+ call. = FALSE, |
|
185 |
+ immediate. = TRUE) |
|
186 |
+ if (verbose) bpprogressbar(BPPARAM) <- TRUE |
|
187 |
+ } |
|
188 |
+ # Check compatability of realization backend with backend(s) of BSseq |
|
189 |
+ # object. |
|
190 |
+ realization_backend <- getRealizationBackend() |
|
191 |
+ BSseq_backends <- .getBSseqBackends(BSseq) |
|
192 |
+ if (.areBackendsInMemory(realization_backend) && |
|
193 |
+ !.areBackendsInMemory(BSseq_backends)) { |
|
194 |
+ stop("Using an in-memory backend for a disk-backed BSseq object ", |
|
195 |
+ "is not supported.\n", |
|
196 |
+ "See help(\"BSmooth\") for details.", |
|
197 |
+ call. = FALSE) |
|
198 |
+ } |
|
199 |
+ # Check compatability of 'BPPARAM' with the realization backend. |
|
200 |
+ if (!.areBackendsInMemory(realization_backend)) { |
|
201 |
+ if (!.isSingleMachineBackend(BPPARAM)) { |
|
202 |
+ stop("The parallelisation strategy must use a single machine ", |
|
203 |
+ "when using an on-disk realization backend.\n", |
|
204 |
+ "See help(\"BSmooth\") for details.", |
|
205 |
+ call. = FALSE) |
|
206 |
+ } |
|
207 |
+ } else { |
|
208 |
+ if (!is.null(realization_backend)) { |
|
209 |
+ # NOTE: Currently do not support any in-memory realization |
|
210 |
+ # backends. If the realization backend is NULL then an |
|
211 |
+ # ordinary matrix is returned rather than a matrix-backed |
|
212 |
+ # DelayedMatrix. |
|
213 |
+ stop("The '", realization_backend, "' realization backend is ", |
|
214 |
+ "not supported.\n", |
|
215 |
+ "See help(\"BSmooth\") for details.", |
|
216 |
+ call. = FALSE) |
|
217 |
+ } |
|
218 |
+ } |
|
219 |
+ } |
|
220 |
+ |
|
221 |
+ # Smoothing ---------------------------------------------------------------- |
|
222 |
+ |
|
223 |
+ # Extract pieces of BSseq object required for smoothing |
|
224 |
+ M <- getCoverage(BSseq, type = "M", withDimnames = FALSE) |
|
225 |
+ Cov <- getCoverage(BSseq, type = "Cov", withDimnames = FALSE) |
|
226 |
+ pos <- matrix(start(BSseq), ncol = 1) |
|
227 |
+ # Set up ArrayGrid so that each block contains data for a single sample and |
|
228 |
+ # single cluster of loci. |
|
229 |
+ row_tickmarks <- .rowTickmarks(BSseq, maxGap) |
|
230 |
+ col_tickmarks <- seq_len(ncol(M)) |
|
231 |
+ grid <- ArbitraryArrayGrid(list(row_tickmarks, col_tickmarks)) |
|
232 |
+ # Set up "parallel" ArrayGrid over pos |
|
233 |
+ pos_grid <- ArbitraryArrayGrid(list(row_tickmarks, 1L)) |
|
234 |
+ # Construct RealizationSink objects (as required) |
|
235 |
+ if (!is_redo) { |
|
236 |
+ if (is.null(realization_backend)) { |
|
237 |
+ coef_sink <- NULL |
|
238 |
+ coef_sink_lock <- NULL |
|
239 |
+ se.coef_sink <- NULL |
|
240 |
+ se.coef_sink_lock <- NULL |
|
241 |
+ } else { |
|
242 |
+ coef_sink <- DelayedArray:::RealizationSink(dim(M), type = "double") |
|
243 |
+ on.exit(close(coef_sink), add = TRUE) |
|
244 |
+ coef_sink_lock <- ipcid() |
|
245 |
+ on.exit(ipcremove(coef_sink_lock), add = TRUE) |
|
246 |
+ if (keep.se) { |
|
247 |
+ se.coef_sink <- DelayedArray:::RealizationSink(dim(M), |
|
248 |
+ type = "double") |
|
249 |
+ on.exit(close(se.coef_sink), add = TRUE) |
|
250 |
+ se.coef_sink_lock <- ipcid() |
|
251 |
+ on.exit(ipcremove(se.coef_sink_lock), add = TRUE) |
|
252 |
+ } else { |
|
253 |
+ se.coef_sink <- NULL |
|
254 |
+ se.coef_sink_lock <- NULL |
|
96 | 255 |
} |
97 |
- return(list(coef = coef, se.coef = se.coef)) |
|
98 |
- }, mc.preschedule = mc.preschedule, mc.cores = mc.cores) |
|
99 |
- if(any(sapply(out, is, class2 = "try-error"))) |
|
100 |
- stop("BSmooth encountered smoothing errors") |
|
101 |
- coef <- do.call(cbind, lapply(out, function(xx) xx$coef)) |
|
102 |
- se.coef <- do.call(cbind, lapply(out, function(xx) xx$se.coef)) |
|
103 |
- }, "chromosome" = { |
|
104 |
- if(verbose) cat(sprintf("[BSmooth] smoothing by 'chromosome' (mc.cores = %d, mc.preschedule = %s)\n", |
|
105 |
- mc.cores, mc.preschedule)) |
|
106 |
- out <- mclapply(1:length(clusterIdx), function(ii) { |
|
107 |
- ptime1 <- proc.time() |
|
108 |
- tmp <- lapply(seq(along = sampleNames), function(sIdx) { |
|
109 |
- smooth(idxes = clusterIdx[[ii]], sampleIdx = sIdx) |
|
110 |
- }) |
|
111 |
- coef <- do.call(cbind, lapply(tmp, function(xx) xx$coef)) |
|
112 |
- se.coef <- do.call(cbind, lapply(tmp, function(xx) xx$se.coef)) |
|
113 |
- ptime2 <- proc.time() |
|
114 |
- stime <- (ptime2 - ptime1)[3] |
|
115 |
- if(verbose) |
|
116 |
- cat(sprintf("[BSmooth] chr idx %d (out of %d), done in %.1f sec\n", |
|
117 |
- ii, length(clusterIdx), stime)) |
|
118 |
- return(list(coef = coef, se.coef = se.coef)) |
|
119 |
- }, mc.preschedule = mc.preschedule, mc.cores = mc.cores) |
|
120 |
- if(any(sapply(out, is, class2 = "try-error"))) |
|
121 |
- stop("BSmooth encountered smoothing errors") |
|
122 |
- coef <- do.call(rbind, lapply(out, function(xx) xx$coef)) |
|
123 |
- se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef)) |
|
124 |
- }) |
|
125 |
- coef <- .DelayedMatrix(coef) |
|
126 |
- if (!is.null(se.coef)) { |
|
127 |
- se.coef <- .DelayedMatrix(se.coef) |
|
256 |
+ } |
|
257 |
+ } |
|
258 |
+ |
|
259 |
+ # Set number of tasks to ensure the progress bar gives frequent updates. |
|
260 |
+ # NOTE: The progress bar increments once per task |
|
261 |
+ # (https://github.com/Bioconductor/BiocParallel/issues/54). |
|
262 |
+ # Although it is somewhat of a bad idea to overrides a user-specified |
|
263 |
+ # bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect |
|
264 |
+ # performance in this instance, and so we opt for a useful progress |
|
265 |
+ # bar. Only SnowParam (and MulticoreParam by inheritance) have a |
|
266 |
+ # bptasks<-() method. |
|
267 |
+ if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) { |
|
268 |
+ bptasks(BPPARAM) <- length(grid) |
|
269 |
+ } |
|
270 |
+ smooth <- bptry(bplapply( |
|
271 |
+ X = seq_along(grid), |
|
272 |
+ FUN = .BSmooth, |
|
273 |
+ M = M, |
|
274 |
+ Cov = Cov, |
|
275 |
+ pos = pos, |
|
276 |
+ coef_sink = coef_sink, |
|
277 |
+ se.coef_sink = se.coef_sink, |
|
278 |
+ coef_sink_lock = coef_sink_lock, |
|
279 |
+ se.coef_sink_lock = se.coef_sink_lock, |
|
280 |
+ grid = grid, |
|
281 |
+ pos_grid = pos_grid, |
|
282 |
+ ns = ns, |
|
283 |
+ h = h, |
|
284 |
+ keep.se = keep.se, |
|
285 |
+ BPREDO = BPREDO, |
|
286 |
+ BPPARAM = BPPARAM)) |
|
287 |
+ if (!all(bpok(smooth))) { |
|
288 |
+ # TODO: Feels like stop() rather than warning() should be used, but |
|
289 |
+ # stop() doesn't allow for the return of partial results; |
|
290 |
+ # see https://support.bioconductor.org/p/109374/ |
|
291 |
+ warning("BSmooth() encountered errors: ", |
|
292 |
+ sum(!bpok(smooth)), " of ", length(smooth), |
|
293 |
+ " smoothing tasks failed.\n", |
|
294 |
+ "BSmooth() has returned partial results, including errors, ", |
|
295 |
+ "for debugging purposes.\n", |
|
296 |
+ "It may be possible to re-run just these failed smoothing ", |
|
297 |
+ "tasks.\nSee help(\"BSmooth\")", |
|
298 |
+ call. = FALSE) |
|
299 |
+ # NOTE: Return intermediate results as well as all derived variables. |
|
300 |
+ return(list(smooth = smooth, |
|
301 |
+ coef_sink = coef_sink, |
|
302 |
+ se.coef_sink = se.coef_sink, |
|
303 |
+ realization_backend = realization_backend)) |
|
128 | 304 |
} |
129 |
- ptime.outer2 <- proc.time() |
|
130 |
- stime.outer <- (ptime.outer2 - ptime.outer1)[3] |
|
131 |
- if(verbose) |
|
132 |
- cat(sprintf("[BSmooth] smoothing done in %.1f sec\n", stime.outer)) |
|
133 |
- |
|
134 |
- rownames(coef) <- NULL |
|
135 |
- colnames(coef) <- sampleNames(BSseq) |
|
136 |
- if(!is.null(se.coef)) { |
|
137 |
- rownames(se.coef) <- NULL |
|
138 |
- colnames(se.coef) <- sampleNames(BSseq) |
|
305 |
+ # Construct coef and se.coef from results of smooth(). |
|
306 |
+ if (is.null(realization_backend)) { |
|
307 |
+ # Returning matrix objects. |
|
308 |
+ coef <- do.call(c, lapply(smooth, "[[", "coef")) |
|
309 |
+ attr(coef, "dim") <- dim(M) |
|
310 |
+ if (keep.se) { |
|
311 |
+ se.coef <- do.call(c, lapply(smooth, "[[", "se.coef")) |
|
312 |
+ attr(se.coef, "dim") <- dim(M) |
|
313 |
+ } else { |
|
314 |
+ se.coef <- NULL |
|
315 |
+ } |
|
316 |
+ } else { |
|
317 |
+ # Returning DelayedMatrix objects. |
|
318 |
+ coef <- as(coef_sink, "DelayedArray") |
|
319 |
+ if (keep.se) { |
|
320 |
+ se.coef <- as(se.coef_sink, "DelayedArray") |
|
321 |
+ } else { |
|
322 |
+ se.coef <- NULL |
|
323 |
+ } |
|
139 | 324 |
} |
140 | 325 |
|
141 |
- if(!is.null(coef)) |
|
142 |
- assay(BSseq, "coef") <- coef |
|
143 |
- if(!is.null(se.coef)) |
|
144 |
- assay(BSseq, "se.coef") <- se.coef |
|
145 |
- BSseq@trans <- plogis |
|
326 |
+ # Construct BSseq object --------------------------------------------------- |
|
146 | 327 |
|
147 |
- parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap), |
|
148 |
- ns = ns, h = h, maxGap = maxGap) |
|
149 |
- BSseq@parameters <- parameters |
|
150 |
- BSseq |
|
328 |
+ assays <- c(assays(BSseq, withDimnames = FALSE), SimpleList(coef = coef)) |
|
329 |
+ if (keep.se) assays <- c(assays, SimpleList(se.coef = se.coef)) |
|
330 |
+ BiocGenerics:::replaceSlots( |
|
331 |
+ object = BSseq, |
|
332 |
+ assays = Assays(assays), |
|
333 |
+ trans = plogis, |
|
334 |
+ parameters = list( |
|
335 |
+ smoothText = sprintf( |
|
336 |
+ "BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap), |
|
337 |
+ ns = ns, |
|
338 |
+ h = h, |
|
339 |
+ maxGap = maxGap), |
|
340 |
+ check = FALSE) |
|
151 | 341 |
} |
152 | 342 |
|
343 |
+# TODOs ------------------------------------------------------------------------ |
|
153 | 344 |
|
345 |
+# TODO: Use the logging facilities of BiocParallel. This is a longterm goal. |
|
346 |
+# For example, we could set custom messages within .BSmooth() using the |
|
347 |
+# futile.logger syntax; see the BiocParalell vignette 'Errors, Logs and |
|
348 |
+# Debugging in BiocParallel'. |
|
349 |
+# TODO: Remove NOTEs that are really documentation issues to the docs |
154 | 350 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,318 @@ |
1 |
+# Exported classes ------------------------------------------------------------- |
|
2 |
+ |
|
3 |
+# NOTE: This create a 'classGeneratorFunction' (internal constructor), .BSseq() |
|
4 |
+.BSseq <- setClass( |
|
5 |
+ "BSseq", |
|
6 |
+ slots = representation( |
|
7 |
+ trans = "function", |
|
8 |
+ parameters = "list"), |
|
9 |
+ contains = "RangedSummarizedExperiment" |
|
10 |
+) |
|
11 |
+ |
|
12 |
+# Validity methods ------------------------------------------------------------- |
|
13 |
+ |
|
14 |
+.checkAssayNames <- function(object, names) { |
|
15 |
+ if (all(names %in% assayNames(object))) { |
|
16 |
+ return(NULL) |
|
17 |
+ } else { |
|
18 |
+ paste0( |
|
19 |
+ "object of class ", |
|
20 |
+ class(object), |
|
21 |
+ " needs to have assay slots with names ", |
|
22 |
+ paste0(names, collapse = ", ")) |
|
23 |
+ |
|
24 |
+ } |
|
25 |
+} |
|
26 |
+ |
|
27 |
+.checkMandCov <- function(M, Cov) { |
|
28 |
+ msg <- NULL |
|
29 |
+ validMsg(msg, .Call(cxx_check_M_and_Cov, M, Cov)) |
|
30 |
+} |
|
31 |
+ |
|
32 |
+setValidity2("BSseq", function(object) { |
|
33 |
+ msg <- NULL |
|
34 |
+ |
|
35 |
+ if (identical(object, .BSseq())) { |
|
36 |
+ # No validity checks for object returned by internal constructor |
|
37 |
+ return(msg) |
|
38 |
+ } |
|
39 |
+ msg <- validMsg(msg, .checkAssayNames(object, c("Cov", "M"))) |
|
40 |
+ |
|
41 |
+ # TODO: Are colnames strictly necessary? |
|
42 |
+ if (is.null(colnames(object))) { |
|
43 |
+ msg <- validMsg(msg, "colnames (aka sampleNames) need to be set") |
|
44 |
+ } |
|
45 |
+ |
|
46 |
+ assay_rownames <- lapply(assays(object), rownames) |
|
47 |
+ if (!all(S4Vectors:::sapply_isNULL(assay_rownames))) { |
|
48 |
+ msg <- validMsg(msg, "unnecessary rownames on one-or-more assays") |
|
49 |
+ } |
|
50 |
+ |
|
51 |
+ msg <- validMsg(msg, |
|
52 |
+ .checkMandCov(assay(object, "M", withDimnames = FALSE), |
|
53 |
+ assay(object, "Cov", withDimnames = FALSE))) |
|
54 |
+ |
|
55 |
+ if (is.null(msg)) TRUE else msg |
|
56 |
+}) |
|
57 |
+ |
|
58 |
+# Internal functions ----------------------------------------------------------- |
|
59 |
+ |
|
60 |
+.oldTrans <- function(x) { |
|
61 |
+ y <- x |
|
62 |
+ ix <- which(x < 0) |
|
63 |
+ ix2 <- which(x > 0) |
|
64 |
+ y[ix] <- exp(x[ix])/(1 + exp(x[ix])) |
|
65 |
+ y[ix2] <- 1/(1 + exp(-x[ix2])) |
|
66 |
+ y |
|
67 |
+} |
|
68 |
+ |
|
69 |
+# Exported functions ----------------------------------------------------------- |
|
70 |
+ |
|
71 |
+# TODO: BSseq() is arguably a bad constructor. It doesn't return a valid BSseq |
|
72 |
+# object when called without any arguments. It also does some pretty |
|
73 |
+# complicated parsing of the inputs. Still, I think we're stuck with it |
|
74 |
+# because it's been around for a long time. |
|
75 |
+BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
|
76 |
+ trans = NULL, parameters = NULL, pData = NULL, |
|
77 |
+ gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL, |
|
78 |
+ rmZeroCov = FALSE) { |
|
79 |
+ |
|
80 |
+ # Process 'gr', or 'pos' and 'chr' |
|
81 |
+ if (is.null(gr)) { |
|
82 |
+ if (is.null(pos) || is.null(chr)) { |
|
83 |
+ stop("Need 'pos' and 'chr' if 'gr' not supplied.") |
|
84 |
+ } |
|
85 |
+ gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1L)) |
|
86 |
+ } |
|
87 |
+ if (!is(gr, "GRanges")) stop("'gr' needs to be a GRanges.") |
|
88 |
+ if (any(width(gr) != 1L)) stop("'gr' needs to have widths of 1.") |
|
89 |
+ |
|
90 |
+ # Process 'M' and 'Cov' |
|
91 |
+ if (is.null(M) || is.null(Cov)) stop("Need 'M' and 'Cov'.") |
|
92 |
+ if (length(gr) != nrow(M) || |
|
93 |
+ length(gr) != nrow(Cov) || |
|
94 |
+ ncol(Cov) != ncol(M)) { |
|
95 |
+ stop("'gr', 'M' and 'Cov' need to have compatible dimensions.") |
|
96 |
+ } |
|
97 |
+ if (!is.null(rownames(M))) rownames(M) <- NULL |
|
98 |
+ if (!is.null(rownames(Cov))) rownames(Cov) <- NULL |
|
99 |
+ if (!is.null(names(gr))) names(gr) <- NULL |
|
100 |
+ |
|
101 |
+ # Process 'sampleNames' and 'pData' |
|
102 |
+ if (is.null(pData)) { |
|
103 |
+ if (is.null(sampleNames)) { |
|
104 |
+ if (!is.null(colnames(M))) { |
|
105 |
+ sampleNames <- colnames(M) |
|
106 |
+ } else if (!is.null(colnames(Cov))) { |
|
107 |
+ sampleNames <- colnames(Cov) |
|
108 |
+ } else { |
|
109 |
+ sampleNames <- paste0("V", seq_len(ncol(M))) |
|
110 |
+ } |
|
111 |
+ } |
|
112 |
+ pData <- DataFrame(row.names = sampleNames) |
|
113 |
+ } else { |
|
114 |
+ pData <- as(pData, "DataFrame") |
|
115 |
+ } |
|
116 |
+ if (is.null(sampleNames) && !is.null(pData) && !is.null(rownames(pData))) { |
|
117 |
+ sampleNames <- rownames(pData) |
|
118 |
+ } |
|
119 |
+ if (length(unique(sampleNames)) != ncol(M)) { |
|
120 |
+ stop("sampleNames need to be unique and of the right length.") |
|
121 |
+ } |
|
122 |
+ |
|
123 |
+ # TODO: Is this really necessary? It complicates things because HDF5Matrix |
|
124 |
+ # will be degraded to a DelayedMatrix |
|
125 |
+ # Add column names to assays |
|
126 |
+ if (is.null(colnames(M)) || any(sampleNames != colnames(M))) { |
|
127 |
+ colnames(M) <- sampleNames |
|
128 |
+ } |
|
129 |
+ if (is.null(colnames(Cov)) || any(sampleNames != colnames(Cov))) { |
|
130 |
+ colnames(Cov) <- sampleNames |
|
131 |
+ } |
|
132 |
+ if (!is.null(coef)) { |
|
133 |
+ if (nrow(coef) != nrow(M) || ncol(coef) != ncol(M)) { |
|
134 |
+ stop("'coef' does not have the right dimensions") |
|
135 |
+ } |
|
136 |
+ if (is.null(colnames(coef)) || any(sampleNames != colnames(coef))) { |
|
137 |
+ colnames(coef) <- sampleNames |
|
138 |
+ } |
|
139 |
+ if (!is.null(rownames(coef))) { |
|
140 |
+ rownames(coef) <- NULL |
|
141 |
+ } |
|
142 |
+ } |
|
143 |
+ if (!is.null(se.coef)) { |
|
144 |
+ if (nrow(se.coef) != nrow(M) || ncol(se.coef) != ncol(M)) { |
|
145 |
+ stop("'se.coef' does not have the right dimensions") |
|
146 |
+ } |
|
147 |
+ if (is.null(colnames(se.coef)) || |
|
148 |
+ any(sampleNames != colnames(se.coef))) { |
|
149 |
+ colnames(se.coef) <- sampleNames |
|
150 |
+ } |
|
151 |
+ if (!is.null(rownames(se.coef))) { |
|
152 |
+ rownames(se.coef) <- NULL |
|
153 |
+ } |
|
154 |
+ } |
|
155 |
+ |
|
156 |
+ # Optionally, remove positions with Cov == 0 |
|
157 |
+ if (rmZeroCov) { |
|
158 |
+ loci_with_zero_cov <- rowAlls(Cov, value = 0) |
|
159 |
+ if (any(loci_with_zero_cov)) { |
|
160 |
+ gr <- gr[!loci_with_zero_cov] |
|
161 |
+ M <- M[!loci_with_zero_cov, , drop = FALSE] |
|
162 |
+ Cov <- Cov[!loci_with_zero_cov, , drop = FALSE] |
|
163 |
+ } |
|
164 |
+ } |
|
165 |
+ |
|
166 |
+ # Collapse duplicate loci |
|
167 |
+ if (any(duplicated(gr))) { |
|
168 |
+ warning("Detected duplicate loci. Collapsing counts in 'M' and 'Cov' ", |
|
169 |
+ "at these positions.") |
|
170 |
+ if (!is.null(coef) || !is.null(se.coef)) { |
|
171 |
+ stop("Cannot collapse when 'coef' or 'se.coef' are present.") |
|
172 |
+ } |
|
173 |
+ # NOTE: reduce() sorts the output |
|
174 |
+ unique_gr <- unique(gr) |
|
175 |
+ ol <- findOverlaps(unique_gr, gr) |
|
176 |
+ gr <- unique(gr) |
|
177 |
+ idx <- as.list(ol) |
|
178 |
+ M <- .collapseMatrixLike(M, idx, MARGIN = 2L) |
|
179 |
+ Cov <- .collapseMatrixLike(Cov, idx, MARGIN = 2L) |
|
180 |
+ } |
|
181 |
+ |
|
182 |
+ # Sort loci |
|
183 |
+ if (is.unsorted(gr)) { |
|
184 |
+ o <- order(gr) |
|
185 |
+ gr <- gr[o] |
|
186 |
+ M <- M[o, , drop = FALSE] |
|
187 |
+ Cov <- Cov[o, , drop = FALSE] |
|
188 |
+ coef <- coef[o, , drop = FALSE] |
|
189 |
+ se.coef <- se.coef[o, , drop = FALSE] |
|
190 |
+ } |
|
191 |
+ |
|
192 |
+ # Construct BSseq object |
|
193 |
+ assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef) |
|
194 |
+ assays <- assays[!S4Vectors:::sapply_isNULL(assays)] |
|
195 |
+ se <- SummarizedExperiment(assays = assays, rowRanges = gr, colData = pData) |
|
196 |
+ if (is.null(parameters)) { |
|
197 |
+ parameters <- list() |
|
198 |
+ } |
|
199 |
+ if (is.null(trans)) { |
|
200 |
+ trans <- function(x) NULL |
|
201 |
+ } |
|
202 |
+ .BSseq(se, trans = trans, parameters = parameters) |
|
203 |
+} |
|
204 |
+ |
|
205 |
+hasBeenSmoothed <- function(BSseq) "coef" %in% assayNames(BSseq) |
|
206 |
+ |
|
207 |
+# TODO: Document withDimnames |
|
208 |
+getBSseq <- function(BSseq, |
|
209 |
+ type = c("Cov", "M", "gr", "coef", "se.coef", "trans", |
|
210 |
+ "parameters"), |
|
211 |
+ withDimnames = TRUE) { |
|
212 |
+ type <- match.arg(type) |
|
213 |
+ if (type %in% c("M", "Cov")) { |
|
214 |
+ return(assay(BSseq, type, withDimnames = withDimnames)) |
|
215 |
+ } |
|
216 |
+ if (type %in% c("coef", "se.coef") && type %in% assayNames(BSseq)) { |
|
217 |
+ return(assay(BSseq, type, withDimnames = withDimnames)) |
|
218 |
+ } |
|
219 |
+ if (type %in% c("coef", "se.coef")) return(NULL) |
|
220 |
+ if (type == "trans") return(BSseq@trans) |
|
221 |
+ if (type == "parameters") return(BSseq@parameters) |
|
222 |
+ if (type == "gr") return(BSseq@rowRanges) |
|
223 |
+ |
|
224 |
+} |
|
225 |
+ |
|
226 |
+strandCollapse <- function(BSseq, shift = TRUE) { |
|
227 |
+ if (all(runValue(strand(BSseq)) == "*")) { |
|
228 |
+ warning("All loci are unstranded; nothing to collapse") |
|
229 |
+ return(BSseq) |
|
230 |
+ } |
|
231 |
+ if (!(all(runValue(strand(BSseq)) %in% c("+", "-")))) { |
|
232 |
+ stop("'BSseq' object has a mix of stranded and unstranded loci.") |
|
233 |
+ } |
|
234 |
+ BS.forward <- BSseq[strand(BSseq) == "+"] |
|
235 |
+ strand(BS.forward) <- "*" |
|
236 |
+ BS.reverse <- BSseq[strand(BSseq) == "-"] |
|
237 |
+ strand(BS.reverse) <- "*" |
|
238 |
+ if (shift) rowRanges(BS.reverse) <- shift(rowRanges(BS.reverse), -1L) |
|
239 |
+ sampleNames(BS.reverse) <- paste0(sampleNames(BS.reverse), "_REVERSE") |
|
240 |
+ BS.comb <- combine(BS.forward, BS.reverse) |
|
241 |
+ newBSseq <- collapseBSseq(BS.comb, columns = rep(sampleNames(BSseq), 2)) |
|
242 |
+ newBSseq |
|
243 |
+} |
|
244 |
+ |
|
245 |
+# Exported methods ------------------------------------------------------------- |
|
246 |
+ |
|
247 |
+setMethod("show", signature(object = "BSseq"), function(object) { |
|
248 |
+ cat("An object of type 'BSseq' with\n") |
|
249 |
+ cat(" ", nrow(object), "methylation loci\n") |
|
250 |
+ cat(" ", ncol(object), "samples\n") |
|
251 |
+ if (hasBeenSmoothed(object)) { |
|
252 |
+ cat("has been smoothed with\n") |
|
253 |
+ cat(" ", object@parameters$smoothText, "\n") |
|
254 |
+ } else { |
|
255 |
+ cat("has not been smoothed\n") |
|
256 |
+ } |
|
257 |
+ if (.isHDF5ArrayBacked(object)) { |
|
258 |
+ cat("Some assays are HDF5Array-backed\n") |
|
259 |
+ } else { |
|
260 |
+ cat("All assays are in-memory\n") |
|
261 |
+ } |
|
262 |
+}) |
|
263 |
+ |
|
264 |
+setMethod("pData", "BSseq", function(object) { |
|
265 |
+ object@colData |
|
266 |
+}) |
|
267 |
+ |
|
268 |
+setReplaceMethod( |
|
269 |
+ "pData", |
|
270 |
+ signature = signature(object = "BSseq", value = "data.frame"), |
|
271 |
+ function(object, value) { |
|
272 |
+ colData(object) <- as(value, "DataFrame") |
|
273 |
+ object |
|
274 |
+ } |
|
275 |
+) |
|
276 |
+ |
|
277 |
+setReplaceMethod( |
|
278 |
+ "pData", |
|
279 |
+ signature = signature(object = "BSseq", value = "DataFrame"), |
|
280 |
+ function(object, value) { |
|
281 |
+ colData(object) <- value |
|
282 |
+ object |
|
283 |
+ } |
|
284 |
+) |
|
285 |
+ |
|
286 |
+setMethod("sampleNames", "BSseq", function(object) colnames(object)) |
|
287 |
+ |
|
288 |
+setReplaceMethod( |
|
289 |
+ "sampleNames", |
|
290 |
+ signature = signature(object = "BSseq", value = "ANY"), |
|
291 |
+ function(object, value) { |
|
292 |
+ colnames(object) <- value |
|
293 |
+ object |
|
294 |
+ } |
|
295 |
+) |
|
296 |
+ |
|
297 |
+setMethod("updateObject", "BSseq", |
|
298 |
+ function(object, ...) { |
|
299 |
+ # NOTE: identical() is too strong |
|
300 |
+ if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) { |
|
301 |
+ object@trans <- plogis |
|
302 |
+ } |
|
303 |
+ if (is(object, "SummarizedExperiment")) { |
|
304 |
+ # NOTE: Call method for SummarizedExperiment objects |
|
305 |
+ object <- callNextMethod() |
|
306 |
+ return(object) |
|
307 |
+ } else { |
|
308 |
+ BSseq(gr = object@gr, |
|
309 |
+ M = object@M, |
|
310 |
+ Cov = object@Cov, |
|
311 |
+ coef = object@coef, |
|
312 |
+ se.coef = object@se.coef, |
|
313 |
+ trans = object@trans, |
|
314 |
+ parameters = object@parameters, |
|
315 |
+ pData = object@phenoData@data) |
|
316 |
+ } |
|
317 |
+ } |
|
318 |
+) |
0 | 319 |
deleted file mode 100644 |
... | ... |
@@ -1,355 +0,0 @@ |
1 |
-setClass("BSseq", contains = "RangedSummarizedExperiment", |
|
2 |
- representation(trans = "function", |
|
3 |
- parameters = "list")) |
|
4 |
- |
|
5 |
-setValidity("BSseq", function(object) { |
|
6 |
- msg <- validMsg(NULL, .checkAssayNames(object, c("Cov", "M"))) |
|
7 |
- msg <- validMsg(msg, .checkAssayClasses(object, |
|
8 |
- c("Cov", "M", "coef", "se.coef"))) |
|
9 |
- if(class(rowRanges(object)) != "GRanges") |
|
10 |
- msg <- validMsg(msg, sprintf("object of class '%s' needs to have a 'GRanges' in slot 'rowRanges'", class(object))) |
|
11 |
- ## benchmarking shows that min(assay()) < 0 is faster than any(assay() < 0) if it is false |
|
12 |
- if(is.null(colnames(object))) |
|
13 |
- msg <- validMsg(msg, "colnames (aka sampleNames) need to be set") |
|
14 |
- M <- assay(object, "M", withDimnames = FALSE) |
|
15 |
- Cov <- assay(object, "Cov", withDimnames = FALSE) |
|
16 |
- if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) { |
|
17 |
- ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so |
|
18 |
- ## we skip it for the time being |
|
19 |
- if(min(assay(object, "M", withDimnames = FALSE)) < 0) |
|
20 |
- msg <- validMsg(msg, "the 'M' assay has negative entries") |
|
21 |
- if(min(assay(object, "Cov", withDimnames = FALSE)) < 0) |
|
22 |
- msg <- validMsg(msg, "the 'Cov' assay has negative entries") |
|
23 |
- if(max(assay(object, "M", withDimnames = FALSE) - |
|
24 |
- assay(object, "Cov", withDimnames = FALSE)) > 0.5) |
|
25 |
- msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay") |
|
26 |
- } |
|
27 |
- if(!is.null(rownames(M)) || !is.null(rownames(Cov)) || |
|
28 |
- ("coef" %in% assayNames(object) && !is.null(rownames(assay(object, "coef")))) || |
|
29 |
- ("se.coef" %in% assayNames(object) && !is.null(rownames(assay(object, "se.coef"))))) |
|
30 |
- warning("unnecessary rownames in object") |
|
31 |
- if (is.null(msg)) TRUE else msg |
|
32 |
-}) |
|
33 |
- |
|
34 |
-setMethod("show", signature(object = "BSseq"), |
|
35 |
- function(object) { |
|
36 |
- cat("An object of type 'BSseq' with\n") |
|
37 |
- cat(" ", nrow(object), "methylation loci\n") |
|
38 |
- cat(" ", ncol(object), "samples\n") |
|
39 |
- if(hasBeenSmoothed(object)) { |
|
40 |
- cat("has been smoothed with\n") |
|
41 |
- cat(" ", object@parameters$smoothText, "\n") |
|
42 |
- } else { |
|
43 |
- cat("has not been smoothed\n") |
|
44 |
- } |
|
45 |
- if (.isHDF5ArrayBacked(object)) { |
|
46 |
- cat("Some assays are HDF5Array-backed\n") |
|
47 |
- } else { |
|
48 |
- cat("All assays are in-memory\n") |
|
49 |
- } |
|
50 |
- }) |
|
51 |
- |
|
52 |
-setMethod("pData", "BSseq", function(object) { |
|
53 |
- object@colData |
|
54 |
-}) |
|
55 |
- |
|
56 |
-setReplaceMethod("pData", |
|
57 |
- signature = signature( |
|
58 |
- object = "BSseq", |
|
59 |
- value = "data.frame"), |
|
60 |
- function(object, value) { |
|
61 |
- colData(object) <- as(value, "DataFrame") |
|
62 |
- object |
|
63 |
- }) |
|
64 |
- |
|
65 |
-setReplaceMethod("pData", |
|
66 |
- signature = signature( |
|
67 |
- object = "BSseq", |
|
68 |
- value = "DataFrame"), |
|
69 |
- function(object, value) { |
|
70 |
- colData(object) <- value |
|
71 |
- object |
|
72 |
- }) |
|
73 |
- |
|
74 |
-setMethod("sampleNames", "BSseq", function(object) { |
|
75 |
- colnames(object) |
|
76 |
-}) |
|
77 |
- |
|
78 |
-setReplaceMethod("sampleNames", |
|
79 |
- signature = signature( |
|
80 |
- object = "BSseq", |
|
81 |
- value = "ANY"), |
|
82 |
- function(object, value) { |
|
83 |
- colnames(object) <- value |
|
84 |
- object |
|
85 |
- }) |
|
86 |
- |
|
87 |
-setMethod("length", "BSseq", function(x) { |
|
88 |
- length(granges(x)) |
|
89 |
-}) |
|
90 |
- |
|
91 |
-hasBeenSmoothed <- function(BSseq) { |
|
92 |
- "coef" %in% assayNames(BSseq) |
|
93 |
-} |
|
94 |
- |
|
95 |
-getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans", "parameters")) { |
|
96 |
- type <- match.arg(type) |
|
97 |
- if(type %in% c("M", "Cov")) |
|
98 |
- return(assay(BSseq, type)) |
|
99 |
- if(type %in% c("coef", "se.coef") && type %in% assayNames(BSseq)) |
|
100 |
- return(assay(BSseq, type)) |
|
101 |
- if(type %in% c("coef", "se.coef")) |
|
102 |
- return(NULL) |
|
103 |
- if(type == "trans") |
|
104 |
- return(BSseq@trans) |
|
105 |
- if(type == "parameters") |
|
106 |
- return(BSseq@parameters) |
|
107 |
- if(type == "gr") |
|
108 |
- return(BSseq@rowRanges) |
|
109 |
- |
|
110 |
-} |
|
111 |
- |
|
112 |
-BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
|
113 |
- trans = NULL, parameters = NULL, pData = NULL, |
|
114 |
- gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL, |
|
115 |
- rmZeroCov = FALSE) { |
|
116 |
- if(is.null(gr)) { |
|
117 |
- if(is.null(pos) || is.null(chr)) |
|
118 |
- stop("Need pos and chr") |
|
119 |
- gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1)) |
|
120 |
- } |
|
121 |
- if(!is(gr, "GRanges")) |
|
122 |
- stop("'gr' needs to be a GRanges") |
|
123 |
- if(any(width(gr) != 1)) |
|
124 |
- stop("'gr' needs to have widths of 1") |
|
125 |
- if(is.null(M) || is.null(Cov)) |
|
126 |
- stop("Need M and Cov") |
|
127 |
- M <- .DelayedMatrix(M) |
|
128 |
- Cov <- .DelayedMatrix(Cov) |
|
129 |
- if (!is.null(coef)) { |
|
130 |
- coef <- .DelayedMatrix(coef) |
|
131 |
- } |
|
132 |
- if (!is.null(se.coef)) { |
|
133 |
- se.coef <- .DelayedMatrix(se.coef) |
|
134 |
- } |
|
135 |
- if(length(gr) != nrow(M) || |
|
136 |
- length(gr) != nrow(Cov) || |
|
137 |
- ncol(Cov) != ncol(M)) |
|
138 |
- stop("'gr', 'M' and 'Cov' need to have similar dimensions") |
|
139 |
- if(!is.null(rownames(M))) |
|
140 |
- rownames(M) <- NULL |
|
141 |
- if(!is.null(rownames(Cov))) |
|
142 |
- rownames(Cov) <- NULL |
|
143 |
- if(!is.null(names(gr))) |
|
144 |
- names(gr) <- NULL |
|
145 |
- ## deal with sampleNames |
|
146 |
- if(!is(pData, "DataFrame")) |
|
147 |
- pData <- as(pData, "DataFrame") |
|
148 |
- if(is.null(sampleNames) && !is.null(pData) && !is.null(rownames(pData))) |
|
149 |
- sampleNames <- rownames(pData) |
|
150 |
- if(is.null(sampleNames) && !is.null(colnames(M))) |
|
151 |
- sampleNames <- colnames(M) |
|
152 |
- if(is.null(sampleNames) && !is.null(colnames(Cov))) |
|
153 |
- sampleNames <- colnames(Cov) |
|
154 |
- if(is.null(sampleNames)) |
|
155 |
- sampleNames <- paste("V", 1:ncol(M), sep = "") |
|
156 |
- if(length(unique(sampleNames)) != ncol(M)) |
|
157 |
- stop("sampleNames need to be unique and of the right length.") |
|
158 |
- ## check that 0 <= M <= Cov and remove positions with Cov = 0 |
|
159 |
- if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) { |
|
160 |
- ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so |
|
161 |
- ## we skip it for the time being |
|
162 |
- if (any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) || |
|
163 |
- any(is.infinite(Cov))) { |
|
164 |
- stop("'M' and 'Cov' may not contain NA or infinite values and ", |
|
165 |
- "0 <= M <= Cov") |
|
166 |
- } |
|
167 |
- } |
|
168 |
- if(rmZeroCov) { |
|
169 |
- wh <- which(rowSums2(Cov) == 0) |
|
170 |
- if(length(wh) > 0) { |
|
171 |
- gr <- gr[-wh] |
|
172 |
- M <- M[-wh,,drop = FALSE] |
|
173 |
- Cov <- Cov[-wh,,drop = FALSE] |
|
174 |
- } |
|
175 |
- } |
|
176 |
- grR <- reduce(gr, min.gapwidth = 0L) |
|
177 |
- if(!identical(grR, gr)) { |
|
178 |
- ## Now we either need to re-order or collapse or both |
|
179 |
- mm <- as.matrix(findOverlaps(grR, gr)) |
|
180 |
- mm <- mm[order(mm[,1]),] |
|
181 |
- if(length(grR) == length(gr)) { |
|
182 |
- ## only re-ordering is necessary |
|
183 |
- gr <- grR |
|
184 |
- M <- M[mm[,2],,drop = FALSE] |
|
185 |
- Cov <- Cov[mm[,2],,drop = FALSE] |
|
186 |
- if(!is.null(coef)) |
|
187 |
- coef <- coef[mm[,2],,drop = FALSE] |
|
188 |
- if(!is.null(se.coef)) |
|
189 |
- se.coef <- se.coef[mm[,2],, drop = FALSE] |
|
190 |
- } else { |
|
191 |
- warning("multiple positions, collapsing BSseq object\n") |
|
192 |
- if(!is.null(coef) || !is.null(se.coef)) |
|
193 |
- stop("Cannot collapse when 'coef' or 'se.coef' are present") |
|
194 |
- gr <- grR |
|
195 |
- sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))] |
|
196 |
- names(sp) <- NULL |
|
197 |
- # TODO: .collapseDelayedMatrix() always return numeric; it may be |
|
198 |
- # worth coercing M and Cov to integer DelayedMatrix objects, |
|
199 |
- # which would halve storage requirements and impose some more |
|
200 |
- # structure on the BSseq class (via new validity method |
|
201 |
- # checks) |
|
202 |
- # NOTE: Tries to be smart about how collapsed DelayedMatrix should |
|
203 |
- # be realized |
|
204 |
- if (.isHDF5ArrayBacked(M)) { |
|
205 |
- M_BACKEND <- "HDF5Array" |
|
206 |
- } else { |
|
207 |
- M_BACKEND <- NULL |
|
208 |
- } |
|
209 |
- M <- .collapseDelayedMatrix(x = M, |
|
210 |
- sp = sp, |
|
211 |
- MARGIN = 2, |
|
212 |
- BACKEND = M_BACKEND) |
|
213 |
- if (.isHDF5ArrayBacked(Cov)) { |
|
214 |
- Cov_BACKEND <- "HDF5Array" |
|
215 |
- } else { |
|
216 |
- Cov_BACKEND <- NULL |
|
217 |
- } |
|
218 |
- Cov <- .collapseDelayedMatrix(x = Cov, |
|
219 |
- sp = sp, |
|
220 |
- MARGIN = 2, |
|
221 |
- BACKEND = Cov_BACKEND) |
|
222 |
- } |
|
223 |
- } |
|
224 |
- if(is.null(colnames(M)) || any(sampleNames != colnames(M))) |
|
225 |
- colnames(M) <- sampleNames |
|
226 |
- if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov))) |
|
227 |
- colnames(Cov) <- sampleNames |
|
228 |
- if(!is.null(coef)) { |
|
229 |
- |
|
230 |
- if(nrow(coef) != nrow(M) || |
|
231 |
- ncol(coef) != ncol(M)) |
|
232 |
- stop("'coef' does not have the right dimensions") |
|
233 |
- if(is.null(colnames(coef)) || any(sampleNames != colnames(coef))) |
|
234 |
- colnames(coef) <- sampleNames |
|
235 |
- if(!is.null(rownames(coef))) |
|
236 |
- rownames(coef) <- NULL |
|
237 |
- } |
|
238 |
- if(!is.null(se.coef)) { |
|
239 |
- if(nrow(se.coef) != nrow(M) || |
|
240 |
- ncol(se.coef) != ncol(M)) |
|
241 |
- stop("'se.coef' does not have the right dimensions") |
|
242 |
- if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef))) |
|
243 |
- colnames(se.coef) <- sampleNames |
|
244 |
- if(!is.null(rownames(se.coef))) |
|
245 |
- rownames(se.coef) <- NULL |
|
246 |
- } |
|
247 |
- assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef) |
|
248 |
- assays <- assays[!sapply(assays, is.null)] |
|
249 |
- if(is.null(pData) || all(dim(pData) == c(0,0))) |
|
250 |
- BSseq <- SummarizedExperiment(assays = assays, rowRanges = gr) |
|
251 |
- else |
|
252 |
- BSseq <- SummarizedExperiment(assays = assays, rowRanges = gr, colData = pData) |
|
253 |
- BSseq <- as(BSseq, "BSseq") |
|
254 |
- if(is.function(trans)) |
|
255 |
- BSseq@trans <- trans |
|
256 |
- if(is.list(parameters)) |
|
257 |
- BSseq@parameters <- parameters |
|
258 |
- BSseq |
|
259 |
-} |
|
260 |
- |
|
261 |
-setMethod("updateObject", "BSseq", |
|
262 |
- function(object, ...) { |
|
263 |
- # NOTE: identical() is too strong |
|
264 |
- if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) { |
|
265 |
- object@trans <- plogis |
|
266 |
- } |
|
267 |
- if (.hasSlot(object, "assays")) { |
|
268 |
- # call method for RangedSummarizedExperiment objects |
|
269 |
- object <- callNextMethod() |
|
270 |
- assays(object) <- endoapply( |
|
271 |
- assays(object, withDimnames = FALSE), function(assay) { |
|
272 |
- if (is.null(assay)) { |
|
273 |
- return(assay) |
|
274 |
- } else { |
|
275 |
- .DelayedMatrix(assay) |
|
276 |
- } |
|
277 |
- }) |
|
278 |
- object |
|
279 |
- } else { |
|
280 |
- BSseq(gr = object@gr, M = object@M, Cov = object@Cov, |
|
281 |
- coef = object@coef, se.coef = object@se.coef, |
|
282 |
- trans = object@trans, parameters = object@parameters, |
|
283 |
- pData = object@phenoData@data) |
|
284 |
- } |
|
285 |
- }) |
|
286 |
- |
|
287 |
- |
|
288 |
-strandCollapse <- function(BSseq, shift = TRUE) { |
|
289 |
- if(all(runValue(strand(BSseq)) == "*")) { |
|
290 |
- warning("All loci are unstranded; nothing to collapse") |
|
291 |
- return(BSseq) |
|
292 |
- } |
|
293 |
- if(!(all(runValue(strand(BSseq)) %in% c("+", "-")))) |
|
294 |
- stop("'BSseq' object has a mix of stranded and unstranded loci.") |
|
295 |
- BS.forward <- BSseq[strand(BSseq) == "+"] |
|
296 |
- strand(BS.forward) <- "*" |
|
297 |
- BS.reverse <- BSseq[strand(BSseq) == "-"] |
|
298 |
- strand(BS.reverse) <- "*" |
|
299 |
- if(shift) |
|
300 |
- rowRanges(BS.reverse) <- shift(granges(BS.reverse), -1L) |
|
301 |
- sampleNames(BS.reverse) <- paste0(sampleNames(BS.reverse), "_REVERSE") |
|
302 |
- BS.comb <- combine(BS.forward, BS.reverse) |
|
303 |
- newBSseq <- collapseBSseq(BS.comb, columns = rep(sampleNames(BSseq), 2)) |
|
304 |
- newBSseq |
|
305 |
-} |
|
306 |
- |
|
307 |
-## getD <- function(data, sample1, sample2, type = c("raw", "fit"), |
|
308 |
-## addPositions = FALSE, alpha = 0.95) { |
|
309 |
-## d.conf <- function(p1, n1, p2, n2) { |
|
310 |
-## ## Method 10 from Newcombe (Stat in Med, 1998) |
|
311 |
-## getRoots <- function(p, n) { |
|
312 |
-## mat <- cbind(p^2, -(2*p + z^2/n), 1+z^2/n) |
|
313 |
-## roots <- t(Re(apply(mat, 1, polyroot))) |
|
314 |
-## roots |
|
315 |
-## } |
|
316 |
-## roots1 <- roots2 <- matrix(NA_real_, nrow = length(p1), ncol = 2) |
|
317 |
-## idx <- !is.na(p1) |
|
318 |
-## roots1[idx,] <- getRoots(p1[idx], n1[idx]) |
|
319 |
-## idx <- !is.na(p2) |
|
320 |
-## roots2[idx,] <- getRoots(p2[idx], n2[idx]) |
|
321 |
-## d <- p1 - p2 |
|
322 |
-## suppressWarnings({ |
|
323 |
-## lower <- d - z * sqrt(roots1[,1]*(1-roots1[,1])/n1 + roots2[,2]*(1-roots2[,2])/n2) |
|
324 |
-## upper <- d + z * sqrt(roots1[,2]*(1-roots1[,2])/n1 + roots2[,1]*(1-roots2[,1])/n2) |
|
325 |
-## }) |
|
326 |
-## return(data.frame(d = d, lower = lower, upper = upper)) |
|
327 |
-## } |
|
328 |
-## type <- match.arg(type) |
|
329 |
-## z <- abs(qnorm((1-alpha)/2, mean = 0, sd = 1)) |
|
330 |
-## switch(type, |
|
331 |
-## raw = { |
|
332 |
-## conf <- d.conf(p1 = data$M[, sample1] / data$Cov[, sample1], |
|
333 |
-## n1 = data$Cov[, sample1], |
|
334 |
-## p2 = data$M[, sample2] / data$Cov[, sample2], |
|
335 |
-## n2 = data$Cov[, sample2]) |
|
336 |
-## out <- conf |
|
337 |
-## }, |
|
338 |
-## fit = { |
|
339 |
-## p1 <- data$trans(data$coef[, sample1]) |
|
340 |
-## p2 <- data$trans(data$coef[, sample2]) |
|
341 |
-## d <- p1 - p2 |
|
342 |
-## ## Based on the delta method |
|
343 |
-## se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 + |
|
344 |
-## (data$se.coef[, sample2] * p2 * (1-p2))^2) |
|
345 |
- |
|
346 |
-## lower <- d - z * se.d |
|
347 |
-## upper <- d + z * se.d |
|
348 |
-## out <- data.frame(d = d, lower = lower, upper = upper) |
|
349 |
-## }) |
|
350 |
-## if(addPositions) { |
|
351 |
-## out$pos <- start(data$gr) |
|
352 |
-## out$chr <- seqnames(data$gr) |
|
353 |
-## } |
|
354 |
-## out |
|
355 |
-## } |
... | ... |
@@ -1,68 +1,25 @@ |
1 |
-collapseBSseq <- function(BSseq, columns) { |
|
2 |
- ## columns is a vector of new names, names(columns) is sampleNames or empty |
|
3 |
- stopifnot(is.character(columns)) |
|
4 |
- if(is.null(names(columns)) && length(columns) != ncol(BSseq)) |
|
5 |
- stop("if `columns' does not have names, it needs to be of the same length as `BSseq` has columns (samples)") |
|
6 |
- if(!is.null(names(columns)) && !all(names(columns) %in% sampleNames(BSseq))) |
|
7 |
- stop("if `columns` has names, they need to be sampleNames(BSseq)") |
|
8 |
- if(is.null(names(columns))) |
|
9 |
- columns.idx <- 1:ncol(BSseq) |
|
10 |
- else |
|
11 |
- columns.idx <- match(names(columns), sampleNames(BSseq)) |
|
12 |
- sp <- split(columns.idx, columns) |
|
13 |
- # TODO: .collapseDelayedMatrix() always return numeric; it may be |
|
14 |
- # worth coercing M and Cov to integer DelayedMatrix objects, |
|
15 |
- # which would halve storage requirements and impose some more |
|
16 |
- # structure on the BSseq class (via new validity method |
|
17 |
- # checks) |
|
18 |
- # NOTE: Tries to be smart about how collapsed DelayedMatrix should |
|
19 |
- # be realized |
|
20 |
- M <- getBSseq(BSseq, "M") |
|
21 |
- if (.isHDF5ArrayBacked(M)) { |
|
22 |
- M_BACKEND <- "HDF5Array" |
|
23 |
- } else { |
|
24 |
- M_BACKEND <- NULL |
|
25 |
- } |
|
26 |
- M <- .collapseDelayedMatrix(x = M, |
|
27 |
- sp = sp, |
|
28 |
- MARGIN = 1, |
|
29 |
- BACKEND = M_BACKEND) |
|
30 |
- Cov <- getBSseq(BSseq, "Cov") |
|
31 |
- if (.isHDF5ArrayBacked(Cov)) { |
|
32 |
- Cov_BACKEND <- "HDF5Array" |
|
33 |
- } else { |
|
34 |
- Cov_BACKEND <- NULL |
|
35 |
- } |
|
36 |
- Cov <- .collapseDelayedMatrix(x = Cov, |
|
37 |
- sp = sp, |
|
38 |
- MARGIN = 1, |
|
39 |
- BACKEND = Cov_BACKEND) |
|
40 |
- BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov, sampleNames = names(sp)) |
|
41 |
-} |
|
42 |
- |
|
43 | 1 |
chrSelectBSseq <- function(BSseq, seqnames = NULL, order = FALSE) { |
44 | 2 |
seqlevels(BSseq, pruning.mode = "coarse") <- seqnames |
45 |
- if(order) |
|
46 |
- BSseq <- orderBSseq(BSseq, seqOrder = seqnames) |
|
3 |
+ if (order) BSseq <- orderBSseq(BSseq, seqOrder = seqnames) |
|
47 | 4 |
BSseq |
48 | 5 |
} |
49 | 6 |
|
50 |
- |
|
51 | 7 |
orderBSseq <- function(BSseq, seqOrder = NULL) { |
52 |
- if(!is.null(seqOrder)) |
|
8 |
+ if (!is.null(seqOrder)) { |
|
53 | 9 |
seqlevels(BSseq, pruning.mode = "coarse") <- seqOrder |
10 |
+ } |
|
54 | 11 |
BSseq[order(granges(BSseq))] |
55 | 12 |
} |
56 | 13 |
|
57 |
- |
|
58 | 14 |
# TODO: getMeth() realises the result in memory iff regions is not NULL; |
59 | 15 |
# discuss with Kasper |
60 | 16 |
# TODO: Whether or not colnames are added to returned value depends on whether |
61 | 17 |
# regions is non-NULL; discuss with Kasper |
62 | 18 |
# TODO: Add parallel support |
19 |
+# TODO: Document withDimnames |
|
63 | 20 |
getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
64 | 21 |
what = c("perBase", "perRegion"), confint = FALSE, |
65 |
- alpha = 0.95) { |
|
22 |
+ alpha = 0.95, withDimnames = TRUE) { |
|
66 | 23 |
p.conf <- function(p, n, alpha) { |
67 | 24 |
z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1)) |
68 | 25 |
upper <- (p + z ^ 2 / (2 * n) + |
... | ... |
@@ -85,21 +42,21 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
85 | 42 |
} |
86 | 43 |
z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1)) |
87 | 44 |
if (is.null(regions) && type == "smooth") { |
88 |
- coef <- getBSseq(BSseq, type = "coef") |
|
89 |
- meth <- getBSseq(BSseq, type = "trans")(coef) |
|
45 |
+ coef <- getBSseq(BSseq, "coef", withDimnames) |
|
46 |
+ meth <- getBSseq(BSseq, "trans", withDimnames)(coef) |
|
90 | 47 |
if (confint) { |
91 |
- upper <- meth + z * getBSseq(BSseq, type = "se.coef") |
|
92 |
- lower <- meth - z * getBSseq(BSseq, type = "se.coef") |
|
48 |
+ upper <- meth + z * getBSseq(BSseq, "se.coef", withDimnames) |
|
49 |
+ lower <- meth - z * getBSseq(BSseq, "se.coef", withDimnames) |
|
93 | 50 |
return(list(meth = meth, lower = lower, upper = upper)) |
94 | 51 |
} else { |
95 | 52 |
return(meth) |
96 | 53 |
} |
97 | 54 |
} |
98 | 55 |
if (is.null(regions) && type == "raw") { |
99 |
- meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov") |
|
56 |
+ meth <- getBSseq(BSseq, "M", withDimnames) / |
|
57 |
+ getBSseq(BSseq, "Cov", withDimnames) |
|
100 | 58 |
if (confint) { |
101 |
- return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"), |
|
102 |
- alpha = alpha)) |
|
59 |
+ return(p.conf(meth, getBSseq(BSseq, "Cov", withDimnames), alpha)) |
|
103 | 60 |
} else { |
104 | 61 |
return(meth) |
105 | 62 |
} |
... | ... |
@@ -119,12 +76,14 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
119 | 76 |
# chunks if what = perRegion |
120 | 77 |
if (type == "smooth") { |
121 | 78 |
meth <- as.matrix( |
122 |
- getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef"))[queryHits(ov), , |
|
123 |
- drop = FALSE]) |
|
79 |
+ getBSseq(BSseq, "trans", withDimnames)( |
|
80 |
+ getBSseq(BSseq, "coef", withDimnames))[ |
|
81 |
+ queryHits(ov), , drop = FALSE]) |
|
124 | 82 |
} else if (type == "raw") { |
125 | 83 |
meth <- as.matrix( |
126 |
- (getBSseq(BSseq, "M") / getBSseq(BSseq, "Cov"))[queryHits(ov), , |
|
127 |
- drop = FALSE]) |
|
84 |
+ (getBSseq(BSseq, "M", withDimnames) / |
|
85 |
+ getBSseq(BSseq, "Cov", withDimnames))[ |
|
86 |
+ queryHits(ov), , drop = FALSE]) |
|
128 | 87 |
} |
129 | 88 |
out <- lapply(split(meth, subjectHits(ov)), matrix, ncol = ncol(meth)) |
130 | 89 |
if (what == "perBase") { |
... | ... |
@@ -138,9 +97,9 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
138 | 97 |
# TODO: Don't really understand the logic of the remaining code; how |
139 | 98 |
# could the rows end up in the wrong order? |
140 | 99 |
outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions)) |
141 |
- colnames(outMatrix) <- sampleNames(BSseq) |
|
100 |
+ if (withDimnames) colnames(outMatrix) <- sampleNames(BSseq) |
|
142 | 101 |
outMatrix[as.integer(rownames(out)), ] <- out |
143 |
- .DelayedMatrix(outMatrix) |
|
102 |
+ outMatrix |
|
144 | 103 |
} |
145 | 104 |
} |
146 | 105 |
|
... | ... |
@@ -148,21 +107,23 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
148 | 107 |
# discuss with Kasper |
149 | 108 |
# TODO: Whether or not colnames are added to returned value depends on whether |
150 | 109 |
# regions is non-NULL; discuss with Kasper |
110 |
+# TODO: Document withDimnames |
|
151 | 111 |
getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"), |
152 | 112 |
what = c("perBase", "perRegionAverage", |
153 |
- "perRegionTotal")) { |
|
113 |
+ "perRegionTotal"), |
|
114 |
+ withDimnames = TRUE) { |
|
154 | 115 |
stopifnot(is(BSseq, "BSseq")) |
155 | 116 |
type <- match.arg(type) |
156 | 117 |
what <- match.arg(what) |
157 | 118 |
if (is.null(regions)) { |
158 | 119 |
if (what == "perBase") { |
159 |
- return(getBSseq(BSseq, type = type)) |
|
120 |
+ return(getBSseq(BSseq, type, withDimnames)) |
|
160 | 121 |
} |
161 | 122 |
if (what == "perRegionTotal") { |
162 |
- return(colSums2(getBSseq(BSseq, type = type))) |
|
123 |
+ return(colSums2(getBSseq(BSseq, type, withDimnames))) |
|
163 | 124 |
} |
164 | 125 |
if (what == "perRegionAverage") { |
165 |
- return(colMeans2(getBSseq(BSseq, type = type))) |
|
126 |
+ return(colMeans2(getBSseq(BSseq, type, withDimnames))) |
|
166 | 127 |
} |
167 | 128 |
} |
168 | 129 |
if (class(regions) == "data.frame") { |
... | ... |
@@ -171,11 +132,8 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"), |
171 | 132 |
stopifnot(is(regions, "GenomicRanges")) |
172 | 133 |
grBSseq <- granges(BSseq) |
173 | 134 |
ov <- findOverlaps(grBSseq, regions) |