... | ... |
@@ -1,19 +1,19 @@ |
1 | 1 |
Package: bsseq |
2 |
-Version: 1.9.2 |
|
2 |
+Version: 1.11.1 |
|
3 |
+Encoding: UTF-8 |
|
3 | 4 |
Title: Analyze, manage and store bisulfite sequencing data |
4 | 5 |
Description: A collection of tools for analyzing and visualizing bisulfite |
5 | 6 |
sequencing data. |
6 | 7 |
Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("aut", "cre"), |
7 | 8 |
email = "kasperdanielhansen@gmail.com"), |
8 |
- person("Peter", "Hickey", role = "ctb", email = "peter.hickey@gmail.com")) |
|
9 |
+ person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com")) |
|
9 | 10 |
Depends: |
10 | 11 |
R (>= 3.3), |
11 | 12 |
methods, |
12 | 13 |
BiocGenerics, |
13 | 14 |
GenomicRanges (>= 1.23.7), |
14 | 15 |
SummarizedExperiment (>= 0.1.1), |
15 |
- parallel, |
|
16 |
- limma |
|
16 |
+ parallel |
|
17 | 17 |
Imports: |
18 | 18 |
IRanges (>= 2.5.17), |
19 | 19 |
GenomeInfoDb, |
... | ... |
@@ -27,19 +27,22 @@ Imports: |
27 | 27 |
S4Vectors, |
28 | 28 |
R.utils (>= 2.0.0), |
29 | 29 |
matrixStats (>= 0.50.0), |
30 |
- permute |
|
30 |
+ permute, |
|
31 |
+ limma, |
|
32 |
+ DelayedArray, |
|
33 |
+ HDF5Array |
|
31 | 34 |
Suggests: |
32 | 35 |
RUnit, |
33 | 36 |
bsseqData, |
34 | 37 |
BiocStyle, |
35 |
- knitr |
|
38 |
+ knitr, |
|
36 | 39 |
Collate: |
40 |
+ utils.R |
|
37 | 41 |
hasGRanges.R |
38 | 42 |
BSseq_class.R |
39 | 43 |
BSseqTstat_class.R |
40 | 44 |
BSseq_utils.R |
41 | 45 |
combine.R |
42 |
- utils.R |
|
43 | 46 |
read.bsmooth.R |
44 | 47 |
read.bismark.R |
45 | 48 |
BSmooth.R |
... | ... |
@@ -52,6 +55,9 @@ Collate: |
52 | 55 |
BSmooth.fstat.R |
53 | 56 |
BSseqStat_class.R |
54 | 57 |
getStats.R |
58 |
+ hdf5_utils.R |
|
59 |
+ combine_utils.R |
|
60 |
+ DelayedArray_utils.R |
|
55 | 61 |
License: Artistic-2.0 |
56 | 62 |
VignetteBuilder: knitr |
57 | 63 |
URL: https://github.com/kasperdanielhansen/bsseq |
... | ... |
@@ -10,7 +10,8 @@ importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames", |
10 | 10 |
"strand", "strand<-", "union", "unique", "updateObject") |
11 | 11 |
importFrom(stats, "approxfun", "fisher.test", "ppoints", |
12 | 12 |
"predict", "preplot", "qchisq", |
13 |
- "qnorm", "qqplot", "qunif", "cov2cor") |
|
13 |
+ "qnorm", "qqplot", "qunif", "cov2cor", |
|
14 |
+ "plogis") |
|
14 | 15 |
importFrom(graphics, "abline", "axis", "layout", "legend", "lines", |
15 | 16 |
"mtext", "par", "plot", "points", "polygon", "rect", "rug", "text") |
16 | 17 |
import(parallel) |
... | ... |
@@ -33,6 +34,9 @@ importFrom(data.table, "fread", "setnames") |
33 | 34 |
importFrom(R.utils, "isGzipped", "gunzip") |
34 | 35 |
import(limma) |
35 | 36 |
importFrom(permute, "shuffleSet", "how") |
37 |
+importClassesFrom(DelayedArray, "DelayedArray", "DelayedMatrix") |
|
38 |
+importFrom(DelayedArray, "DelayedArray", "plogis", "pmin2", "pmax2", |
|
39 |
+ "setRealizeBackend", "type") |
|
36 | 40 |
|
37 | 41 |
## |
38 | 42 |
## Exporting |
... | ... |
@@ -17,7 +17,7 @@ makeClusters <- function(hasGRanges, maxGap = 10^8) { |
17 | 17 |
names(clusterIdx) <- NULL |
18 | 18 |
clusterIdx |
19 | 19 |
} |
20 |
- |
|
20 |
+ |
|
21 | 21 |
|
22 | 22 |
BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"), |
23 | 23 |
mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) { |
... | ... |
@@ -29,8 +29,8 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam |
29 | 29 |
if(verbose >= 2) |
30 | 30 |
cat(sprintf("[BSmooth] smoothing start: sample:%s, chr:%s, nLoci:%s\n", |
31 | 31 |
this_sample_chr[1], this_sample_chr[2], length(idxes))) |
32 |
- Cov <- getCoverage(BSseq, type = "Cov")[idxes, sampleIdx] |
|
33 |
- M <- getCoverage(BSseq, type = "M")[idxes, sampleIdx] |
|
32 |
+ Cov <- getCoverage(BSseq, type = "Cov")[idxes, sampleIdx, drop = FALSE] |
|
33 |
+ M <- getCoverage(BSseq, type = "M")[idxes, sampleIdx, drop = FALSE] |
|
34 | 34 |
pos <- start(BSseq)[idxes] |
35 | 35 |
stopifnot(all(diff(pos) > 0)) |
36 | 36 |
wh <- which(Cov != 0) |
... | ... |
@@ -45,8 +45,9 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam |
45 | 45 |
trans = NULL, h = h, nn = nn)) |
46 | 46 |
} |
47 | 47 |
sdata <- data.frame(pos = pos[wh], |
48 |
- M = pmin(pmax(M[wh], 0.01), Cov[wh] - 0.01), |
|
49 |
- Cov = Cov[wh]) |
|
48 |
+ M = as.array(pmin2(pmax2(M[wh, ], 0.01), |
|
49 |
+ Cov[wh, ] - 0.01)), |
|
50 |
+ Cov = Cov[wh, ]) |
|
50 | 51 |
fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata, |
51 | 52 |
weights = Cov, family = "binomial", maxk = 10000) |
52 | 53 |
pp <- preplot(fit, where = "data", band = "local", |
... | ... |
@@ -119,11 +120,15 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam |
119 | 120 |
coef <- do.call(rbind, lapply(out, function(xx) xx$coef)) |
120 | 121 |
se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef)) |
121 | 122 |
}) |
123 |
+ coef <- .DelayedMatrix(coef) |
|
124 |
+ if (!is.null(se.coef)) { |
|
125 |
+ se.coef <- .DelayedMatrix(se.coef) |
|
126 |
+ } |
|
122 | 127 |
ptime.outer2 <- proc.time() |
123 | 128 |
stime.outer <- (ptime.outer2 - ptime.outer1)[3] |
124 | 129 |
if(verbose) |
125 | 130 |
cat(sprintf("[BSmooth] smoothing done in %.1f sec\n", stime.outer)) |
126 |
- |
|
131 |
+ |
|
127 | 132 |
rownames(coef) <- NULL |
128 | 133 |
colnames(coef) <- sampleNames(BSseq) |
129 | 134 |
if(!is.null(se.coef)) { |
... | ... |
@@ -135,16 +140,8 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam |
135 | 140 |
assay(BSseq, "coef") <- coef |
136 | 141 |
if(!is.null(se.coef)) |
137 | 142 |
assay(BSseq, "se.coef") <- se.coef |
138 |
- mytrans <- function(x) { |
|
139 |
- y <- x |
|
140 |
- ix <- which(x < 0) |
|
141 |
- ix2 <- which(x > 0) |
|
142 |
- y[ix] <- exp(x[ix])/(1 + exp(x[ix])) |
|
143 |
- y[ix2] <- 1/(1 + exp(-x[ix2])) |
|
144 |
- y |
|
145 |
- } |
|
146 |
- environment(mytrans) <- baseenv() |
|
147 |
- BSseq@trans <- mytrans |
|
143 |
+ BSseq@trans <- plogis |
|
144 |
+ |
|
148 | 145 |
parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap), |
149 | 146 |
ns = ns, h = h, maxGap = maxGap) |
150 | 147 |
BSseq@parameters <- parameters |
... | ... |
@@ -9,6 +9,7 @@ BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){ |
9 | 9 |
ptime1 <- proc.time() |
10 | 10 |
allPs <- getMeth(BSseq, type = "smooth", what = "perBase", |
11 | 11 |
confint = FALSE) |
12 |
+ allPs <- as.array(allPs) |
|
12 | 13 |
fit <- lmFit(allPs, design) |
13 | 14 |
fitC <- contrasts.fit(fit, contrasts) |
14 | 15 |
## Need |
... | ... |
@@ -17,7 +18,7 @@ BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){ |
17 | 18 |
## tstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma |
18 | 19 |
## rawSds <- fitC$sigma |
19 | 20 |
## cor.coefficients <- cov2cor(fitC$cov.coefficients) |
20 |
- rawSds <- fitC$sigma |
|
21 |
+ rawSds <- as.matrix(fitC$sigma) |
|
21 | 22 |
cor.coefficients <- cov2cor(fitC$cov.coefficients) |
22 | 23 |
rawTstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma |
23 | 24 |
names(dimnames(rawTstats)) <- NULL |
... | ... |
@@ -31,7 +32,8 @@ BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){ |
31 | 32 |
cor.coefficients = cor.coefficients, |
32 | 33 |
rawTstats = rawTstats) |
33 | 34 |
out <- BSseqStat(gr = granges(BSseq), |
34 |
- stats = stats, parameters = parameters) |
|
35 |
+ stats = stats, |
|
36 |
+ parameters = parameters) |
|
35 | 37 |
out |
36 | 38 |
} |
37 | 39 |
|
... | ... |
@@ -57,8 +59,12 @@ smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1, |
57 | 59 |
if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
58 | 60 |
smoothSds <- do.call("c", |
59 | 61 |
mclapply(clusterIdx, function(idx) { |
60 |
- smoothSd(getStats(BSseqStat, what = "rawSds")[idx], k = k, qSd = qSd) |
|
62 |
+ rawSds <- getStats(BSseqStat, |
|
63 |
+ what = "rawSds")[idx, ] |
|
64 |
+ rawSds <- as.array(rawSds) |
|
65 |
+ smoothSd(rawSds, k = k, qSd = qSd) |
|
61 | 66 |
}, mc.cores = mc.cores)) |
67 |
+ smoothSds <- .DelayedMatrix(as.matrix(smoothSds)) |
|
62 | 68 |
if("smoothSds" %in% names(getStats(BSseqStat))) |
63 | 69 |
BSseqStat@stats[["smoothSds"]] <- smoothSds |
64 | 70 |
else |
... | ... |
@@ -68,21 +74,30 @@ smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1, |
68 | 74 |
|
69 | 75 |
# Quieten R CMD check |
70 | 76 |
globalVariables("tstat") |
77 |
+ |
|
78 |
+# NOTE: Realises in memory a matrix with nrow = length(BSseqStat) and |
|
79 |
+# ncol = length(coef) |
|
71 | 80 |
computeStat <- function(BSseqStat, coef = NULL) { |
72 | 81 |
stopifnot(is(BSseqStat, "BSseqStat")) |
73 | 82 |
if(is.null(coef)) { |
74 | 83 |
coef <- 1:ncol(getStats(BSseqStat, what = "rawTstats")) |
75 | 84 |
} |
76 |
- tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE] |
|
77 |
- tstats <- tstats * getStats(BSseqStat, what = "rawSds") / |
|
85 |
+ raw_tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE] |
|
86 |
+ scaled_sds <- getStats(BSseqStat, what = "rawSds") / |
|
78 | 87 |
getStats(BSseqStat, what = "smoothSds") |
88 |
+ scaled_sds_matrix <- do.call(cbind, replicate(ncol(raw_tstats), scaled_sds)) |
|
89 |
+ tstats <- raw_tstats * scaled_sds_matrix |
|
79 | 90 |
if(length(coef) > 1) { |
80 |
- cor.coefficients <- getStats(BSseqStat, what = "cor.coefficients")[coef,coef] |
|
81 |
- stat <- as.numeric(classifyTestsF(tstats, cor.coefficients, |
|
82 |
- fstat.only = TRUE)) |
|
91 |
+ cor.coefficients <- getStats(BSseqStat, |
|
92 |
+ what = "cor.coefficients")[coef,coef] |
|
93 |
+ # NOTE: classifyTestsF() calls as.matrix(tstats) and so realises this |
|
94 |
+ # array |
|
95 |
+ stat <- .DelayedMatrix(as.matrix(classifyTestsF(tstats, |
|
96 |
+ cor.coefficients, |
|
97 |
+ fstat.only = TRUE))) |
|
83 | 98 |
stat.type <- "fstat" |
84 | 99 |
} else { |
85 |
- stat <- as.numeric(tstats) |
|
100 |
+ stat <- .DelayedMatrix(tstats) |
|
86 | 101 |
stat.type <- "tstat" |
87 | 102 |
} |
88 | 103 |
if("stat" %in% names(getStats(BSseqStat))) { |
... | ... |
@@ -114,9 +129,9 @@ localCorrectStat <- function(BSseqStat, threshold = c(-15,15), mc.cores = 1, ver |
114 | 129 |
xx.reg <- seq(from = min(xx), to = max(xx), by = 2000) |
115 | 130 |
yy.reg <- tstat.function(xx.reg) |
116 | 131 |
fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0), |
117 |
- family = "huber", maxk = 50000) |
|
132 |
+ family = "huber", maxk = 50000) |
|
118 | 133 |
correction <- predict(fit, newdata = data.frame(xx.reg = xx)) |
119 |
- yy - correction |
|
134 |
+ yy - correction |
|
120 | 135 |
} |
121 | 136 |
maxGap <- BSseqStat$parameters$maxGap |
122 | 137 |
if(verbose) cat("[BSmooth.tstat] preprocessing ... ") |
... | ... |
@@ -20,9 +20,9 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
20 | 20 |
xx.reg <- seq(from = min(xx), to = max(xx), by = 2000) |
21 | 21 |
yy.reg <- tstat.function(xx.reg) |
22 | 22 |
fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0), |
23 |
- family = "huber", maxk = 50000) |
|
23 |
+ family = "huber", maxk = 50000) |
|
24 | 24 |
correction <- predict(fit, newdata = data.frame(xx.reg = xx)) |
25 |
- yy - correction |
|
25 |
+ yy - correction |
|
26 | 26 |
} |
27 | 27 |
|
28 | 28 |
estimate.var <- match.arg(estimate.var) |
... | ... |
@@ -38,7 +38,7 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
38 | 38 |
if(is.character(group2)) { |
39 | 39 |
stopifnot(all(group2 %in% sampleNames(BSseq))) |
40 | 40 |
group2 <- match(group2, sampleNames(BSseq)) |
41 |
- } |
|
41 |
+ } |
|
42 | 42 |
if(is.numeric(group2)) { |
43 | 43 |
stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq)) |
44 | 44 |
} else stop("problems with argument 'group2'") |
... | ... |
@@ -48,22 +48,29 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
48 | 48 |
stopifnot(length(group1) + length(group2) >= 3) |
49 | 49 |
if(estimate.var == "paired") |
50 | 50 |
stopifnot(length(group1) == length(group2)) |
51 |
- |
|
52 |
- if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0)) |
|
53 |
- warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first") |
|
54 |
- |
|
51 |
+ |
|
52 |
+ if (!.isHDF5ArrayBacked(getCoverage(BSseq))) { |
|
53 |
+ ## TODO: This check can be super expensive if Cov is a HDF5Matrix, so |
|
54 |
+ ## we skip it for the time being. Really noticeable when Cov |
|
55 |
+ ## needs row subsetted/reordered (i.e. has Cov@index[[1]] exists) |
|
56 |
+ ## |
|
57 |
+ if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0)) { |
|
58 |
+ warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first") |
|
59 |
+ } |
|
60 |
+ } |
|
61 |
+ |
|
55 | 62 |
if(is.null(maxGap)) |
56 | 63 |
maxGap <- BSseq@parameters$maxGap |
57 | 64 |
if(is.null(maxGap)) |
58 | 65 |
stop("need to set argument 'maxGap'") |
59 |
- |
|
66 |
+ |
|
60 | 67 |
if(verbose) cat("[BSmooth.tstat] preprocessing ... ") |
61 | 68 |
ptime1 <- proc.time() |
62 | 69 |
clusterIdx <- makeClusters(BSseq, maxGap = maxGap) |
63 | 70 |
ptime2 <- proc.time() |
64 | 71 |
stime <- (ptime2 - ptime1)[3] |
65 | 72 |
if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
66 |
- |
|
73 |
+ |
|
67 | 74 |
if(verbose) cat("[BSmooth.tstat] computing stats within groups ... ") |
68 | 75 |
ptime1 <- proc.time() |
69 | 76 |
allPs <- getMeth(BSseq, type = "smooth", what = "perBase", |
... | ... |
@@ -73,12 +80,12 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
73 | 80 |
ptime2 <- proc.time() |
74 | 81 |
stime <- (ptime2 - ptime1)[3] |
75 | 82 |
if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
76 |
- |
|
83 |
+ |
|
77 | 84 |
if(verbose) cat("[BSmooth.tstat] computing stats across groups ... ") |
78 | 85 |
ptime1 <- proc.time() |
79 | 86 |
switch(estimate.var, |
80 | 87 |
"group2" = { |
81 |
- rawSds <- rowSds(allPs, cols = group2, na.rm = TRUE) |
|
88 |
+ rawSds <- .rowSds(x = allPs, cols = group2, na.rm = TRUE) |
|
82 | 89 |
smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) { |
83 | 90 |
smoothSd(rawSds[idx], k = k) |
84 | 91 |
}, mc.cores = mc.cores)) |
... | ... |
@@ -86,9 +93,9 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
86 | 93 |
tstat.sd <- smoothSds * scale |
87 | 94 |
}, |
88 | 95 |
"same" = { |
89 |
- rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs, cols = group1) + |
|
90 |
- (length(group2) - 1) * rowVars(allPs, cols = group2)) / |
|
91 |
- (length(group1) + length(group2) - 2)) |
|
96 |
+ rawSds <- sqrt( ((length(group1) - 1) * .rowVars(x = allPs, cols = group1) + |
|
97 |
+ (length(group2) - 1) * .rowVars(x = allPs, cols = group2)) / |
|
98 |
+ (length(group1) + length(group2) - 2)) |
|
92 | 99 |
smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) { |
93 | 100 |
smoothSd(rawSds[idx], k = k) |
94 | 101 |
}, mc.cores = mc.cores)) |
... | ... |
@@ -96,7 +103,7 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
96 | 103 |
tstat.sd <- smoothSds * scale |
97 | 104 |
}, |
98 | 105 |
"paired" = { |
99 |
- rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE]) |
|
106 |
+ rawSds <- .rowSds(x = allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE]) |
|
100 | 107 |
smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) { |
101 | 108 |
smoothSd(rawSds[idx], k = k) |
102 | 109 |
}, mc.cores = mc.cores)) |
... | ... |
@@ -113,24 +120,24 @@ BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire |
113 | 120 |
ptime2 <- proc.time() |
114 | 121 |
stime <- (ptime2 - ptime1)[3] |
115 | 122 |
if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
116 |
- |
|
123 |
+ |
|
117 | 124 |
if(local.correct) { |
118 | 125 |
stats <- cbind(rawSds, tstat.sd, group2.means, group1.means, |
119 | 126 |
tstat, tstat.corrected) |
120 | 127 |
colnames(stats) <- c("rawSds", "tstat.sd", |
121 | 128 |
"group2.means", "group1.means", "tstat", |
122 | 129 |
"tstat.corrected") |
123 |
- |
|
130 |
+ |
|
124 | 131 |
} else { |
125 | 132 |
stats <- cbind(rawSds, tstat.sd, group2.means, group1.means, |
126 | 133 |
tstat) |
127 | 134 |
colnames(stats) <- c("rawSds", "tstat.sd", |
128 | 135 |
"group2.means", "group1.means", "tstat") |
129 | 136 |
} |
130 |
- |
|
137 |
+ |
|
131 | 138 |
parameters <- c(BSseq@parameters, |
132 | 139 |
list(tstatText = sprintf("BSmooth.tstat (local.correct = %s, maxGap = %d)", |
133 |
- local.correct, maxGap), |
|
140 |
+ local.correct, maxGap), |
|
134 | 141 |
group1 = group1, group2 = group2, k = k, qSd = qSd, |
135 | 142 |
local.correct = local.correct, maxGap = maxGap)) |
136 | 143 |
out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters) |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-setClass("BSseqStat", contains = "hasGRanges", |
|
1 |
+setClass("BSseqStat", contains = "hasGRanges", |
|
2 | 2 |
representation(stats = "list", |
3 | 3 |
parameters = "list") |
4 | 4 |
) |
... | ... |
@@ -8,17 +8,12 @@ setValidity("BSseqStat", function(object) { |
8 | 8 |
if(is.null(names(object@stats)) || any(names(object@stats) == "") || |
9 | 9 |
anyDuplicated(names(object@stats))) |
10 | 10 |
msg <- validMsg(msg, "the 'stats' list needs to be named with unique names.") |
11 |
- for(name in c("rawSds", "smoothsSds", "stat")) { |
|
12 |
- if(name %in% names(object@stats) && !is.vector(object@stats[[name]])) |
|
13 |
- msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to be a vector", name)) |
|
14 |
- if(name %in% names(object@stats) && length(object@stats[[name]]) != length(object@gr)) |
|
15 |
- msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to have the same length as slot 'gr'", name)) |
|
16 |
- } |
|
17 |
- for(name in c("rawTstats")) { |
|
18 |
- if(name %in% names(object@stats) && !is.matrix(object@stats[[name]])) |
|
19 |
- msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to be a matrix", name)) |
|
20 |
- if(name %in% names(object@stats) && nrow(object@stats[[name]]) != length(object@gr)) |
|
21 |
- msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' have to have the same number of rows as slot 'gr' is long", name)) |
|
11 |
+ for(name in c("rawSds", "smoothsSds", "stat", "rawTstats")) { |
|
12 |
+ if(name %in% names(object@stats) && |
|
13 |
+ !is(object@stats[[name]], "DelayedMatrix")) |
|
14 |
+ msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to be a DelayedMatrix object", name)) |
|
15 |
+ if(name %in% names(object@stats) && isTRUE(nrow(object@stats[[name]]) != length(object@gr))) |
|
16 |
+ msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to have the same number of rows as slot 'gr' is long", name)) |
|
22 | 17 |
} |
23 | 18 |
if(is.null(msg)) TRUE else msg |
24 | 19 |
}) |
... | ... |
@@ -29,8 +24,17 @@ setMethod("show", signature(object = "BSseqStat"), |
29 | 24 |
cat(" ", length(object), "methylation loci\n") |
30 | 25 |
cat("based on smoothed data:\n") |
31 | 26 |
cat(" ", object@parameters$smoothText, "\n") |
32 |
- cat("with parameters\n") |
|
33 |
- cat(" ", object@parameters$tstatText, "\n") |
|
27 |
+ not_delayed_matrices <- c("cor.coefficients", "stat.type") |
|
28 |
+ delayed_matrices <- setdiff(names(object@stats), |
|
29 |
+ not_delayed_matrices) |
|
30 |
+ is_HDF5Array_backed <- vapply(object@stats[delayed_matrices], |
|
31 |
+ .isHDF5ArrayBacked, |
|
32 |
+ logical(1L)) |
|
33 |
+ if (any(is_HDF5Array_backed)) { |
|
34 |
+ cat("'stats' slot is HDF5Array-backed\n") |
|
35 |
+ } else { |
|
36 |
+ cat("'stats' slot is in-memory\n") |
|
37 |
+ } |
|
34 | 38 |
}) |
35 | 39 |
|
36 | 40 |
setMethod("[", "BSseqStat", function(x, i, ...) { |
... | ... |
@@ -42,14 +46,11 @@ setMethod("[", "BSseqStat", function(x, i, ...) { |
42 | 46 |
statnames <- names(x@stats) |
43 | 47 |
names(statnames) <- statnames |
44 | 48 |
x@stats <- lapply(statnames, function(nam) { |
45 |
- if(nam %in% c("rawTstats", "modelCoefficients")) { |
|
46 |
- stopifnot(is.matrix(x@stats[[nam]])) |
|
49 |
+ if(nam %in% c("rawTstats", "modelCoefficients", "rawSds", "smoothSds", |
|
50 |
+ "stat")) { |
|
51 |
+ stopifnot(is(x@stats[[nam]], "DelayedMatrix")) |
|
47 | 52 |
return(x@stats[[nam]][i,,drop=FALSE]) |
48 | 53 |
} |
49 |
- if(nam %in% c("rawSds", "smoothSds", "stat")) { |
|
50 |
- stopifnot(is.vector(x@stats[[nam]])) |
|
51 |
- return(x@stats[[nam]][i]) |
|
52 |
- } |
|
53 | 54 |
x@stats[[nam]] |
54 | 55 |
}) |
55 | 56 |
x |
... | ... |
@@ -58,11 +59,26 @@ setMethod("[", "BSseqStat", function(x, i, ...) { |
58 | 59 |
BSseqStat <- function(gr = NULL, stats = NULL, parameters = NULL) { |
59 | 60 |
out <- new("BSseqStat") |
60 | 61 |
out@gr <- gr |
62 |
+ not_delayed_matrices <- c("cor.coefficients", "stat.type") |
|
63 |
+ delayed_matrices <- setdiff(names(stats), not_delayed_matrices) |
|
64 |
+ stats[delayed_matrices] <- endoapply(stats[delayed_matrices], |
|
65 |
+ .DelayedMatrix) |
|
61 | 66 |
out@stats <- stats |
62 | 67 |
out@parameters <- parameters |
63 | 68 |
out |
64 | 69 |
} |
65 | 70 |
|
71 |
+setMethod("updateObject", "BSseqStat", |
|
72 |
+ function(object, ...) { |
|
73 |
+ not_delayed_matrices <- c("cor.coefficients", "stat.type") |
|
74 |
+ delayed_matrices <- setdiff(names(stats), not_delayed_matrices) |
|
75 |
+ stats <- object@stats |
|
76 |
+ stats[delayed_matrices] <- endoapply(stats[delayed_matrices], |
|
77 |
+ .DelayedMatrix) |
|
78 |
+ object@stats <- stats |
|
79 |
+ object |
|
80 |
+ } |
|
81 |
+) |
|
66 | 82 |
|
67 | 83 |
## summary.BSseqStat <- function(object, ...) { |
68 | 84 |
## quant <- quantile(getStats(object)[, "tstat.corrected"], |
... | ... |
@@ -1,5 +1,5 @@ |
1 |
-setClass("BSseqTstat", contains = "hasGRanges", |
|
2 |
- representation(stats = "matrix", |
|
1 |
+setClass("BSseqTstat", contains = "hasGRanges", |
|
2 |
+ representation(stats = "DelayedMatrix", |
|
3 | 3 |
parameters = "list") |
4 | 4 |
) |
5 | 5 |
setValidity("BSseqTstat", function(object) { |
... | ... |
@@ -17,6 +17,11 @@ setMethod("show", signature(object = "BSseqTstat"), |
17 | 17 |
cat(" ", object@parameters$smoothText, "\n") |
18 | 18 |
cat("with parameters\n") |
19 | 19 |
cat(" ", object@parameters$tstatText, "\n") |
20 |
+ if (.isHDF5ArrayBacked(object@stats)) { |
|
21 |
+ cat("'stats' slot is HDF5Array-backed\n") |
|
22 |
+ } else { |
|
23 |
+ cat("'stats' slot is in-memory\n") |
|
24 |
+ } |
|
20 | 25 |
}) |
21 | 26 |
|
22 | 27 |
setMethod("[", "BSseqTstat", function(x, i, ...) { |
... | ... |
@@ -32,14 +37,14 @@ setMethod("[", "BSseqTstat", function(x, i, ...) { |
32 | 37 |
BSseqTstat <- function(gr = NULL, stats = NULL, parameters = NULL) { |
33 | 38 |
out <- new("BSseqTstat") |
34 | 39 |
out@gr <- gr |
35 |
- out@stats <- stats |
|
40 |
+ out@stats <- .DelayedMatrix(stats) |
|
36 | 41 |
out@parameters <- parameters |
37 | 42 |
out |
38 | 43 |
} |
39 | 44 |
|
40 | 45 |
summary.BSseqTstat <- function(object, ...) { |
41 |
- quant <- quantile(getStats(object)[, "tstat.corrected"], |
|
42 |
- prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999)) |
|
46 |
+ quant <- .quantile(getStats(object)[, "tstat.corrected"], |
|
47 |
+ prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999)) |
|
43 | 48 |
quant <- t(t(quant)) |
44 | 49 |
colnames(quant) <- "quantiles" |
45 | 50 |
out <- list(quantiles = quant) |
... | ... |
@@ -53,9 +58,11 @@ print.summary.BSseqTstat <- function(x, ...) { |
53 | 58 |
|
54 | 59 |
plot.BSseqTstat <- function(x, y, ...) { |
55 | 60 |
tstat <- getStats(x)[, "tstat"] |
61 |
+ tstat <- as.array(tstat) |
|
56 | 62 |
plot(density(tstat), xlim = c(-10,10), col = "blue", main = "") |
57 | 63 |
if("tstat.corrected" %in% colnames(getStats(x))) { |
58 | 64 |
tstat.cor <- getStats(x)[, "tstat.corrected"] |
65 |
+ tstat.cor <- as.array(tstat.cor) |
|
59 | 66 |
lines(density(tstat.cor), col = "black") |
60 | 67 |
legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1), |
61 | 68 |
col = c("blue", "black")) |
... | ... |
@@ -65,3 +72,11 @@ plot.BSseqTstat <- function(x, y, ...) { |
65 | 72 |
} |
66 | 73 |
} |
67 | 74 |
|
75 |
+setMethod("updateObject", "BSseqTstat", |
|
76 |
+ function(object, ...) { |
|
77 |
+ stats <- object@stats |
|
78 |
+ stats <- .DelayedMatrix(stats) |
|
79 |
+ object@stats <- stats |
|
80 |
+ object |
|
81 |
+ } |
|
82 |
+) |
... | ... |
@@ -1,22 +1,30 @@ |
1 |
-setClass("BSseq", contains = "RangedSummarizedExperiment", |
|
1 |
+setClass("BSseq", contains = "RangedSummarizedExperiment", |
|
2 | 2 |
representation(trans = "function", |
3 | 3 |
parameters = "list")) |
4 | 4 |
|
5 | 5 |
setValidity("BSseq", function(object) { |
6 | 6 |
msg <- validMsg(NULL, .checkAssayNames(object, c("Cov", "M"))) |
7 |
+ msg <- validMsg(msg, .checkAssayClasses(object, |
|
8 |
+ c("Cov", "M", "coef", "se.coef"))) |
|
7 | 9 |
if(class(rowRanges(object)) != "GRanges") |
8 | 10 |
msg <- validMsg(msg, sprintf("object of class '%s' needs to have a 'GRanges' in slot 'rowRanges'", class(object))) |
9 | 11 |
## benchmarking shows that min(assay()) < 0 is faster than any(assay() < 0) if it is false |
10 | 12 |
if(is.null(colnames(object))) |
11 | 13 |
msg <- validMsg(msg, "colnames (aka sampleNames) need to be set") |
12 |
- if(min(assay(object, "M")) < 0) |
|
13 |
- msg <- validMsg(msg, "the 'M' assay has negative entries") |
|
14 |
- if(min(assay(object, "Cov")) < 0) |
|
15 |
- msg <- validMsg(msg, "the 'Cov' assay has negative entries") |
|
16 |
- if(max(assay(object, "M") - assay(object, "Cov")) > 0.5) |
|
17 |
- msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay") |
|
18 |
- if(!is.null(rownames(assay(object, "M"))) || |
|
19 |
- !is.null(rownames(assay(object, "Cov"))) || |
|
14 |
+ M <- assay(object, "M", withDimnames = FALSE) |
|
15 |
+ Cov <- assay(object, "Cov", withDimnames = FALSE) |
|
16 |
+ if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) { |
|
17 |
+ ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so |
|
18 |
+ ## we skip it for the time being |
|
19 |
+ if(min(assay(object, "M", withDimnames = FALSE)) < 0) |
|
20 |
+ msg <- validMsg(msg, "the 'M' assay has negative entries") |
|
21 |
+ if(min(assay(object, "Cov", withDimnames = FALSE)) < 0) |
|
22 |
+ msg <- validMsg(msg, "the 'Cov' assay has negative entries") |
|
23 |
+ if(max(assay(object, "M", withDimnames = FALSE) - |
|
24 |
+ assay(object, "Cov", withDimnames = FALSE)) > 0.5) |
|
25 |
+ msg <- validMsg(msg, "the 'M' assay has at least one entry bigger than the 'Cov' assay") |
|
26 |
+ } |
|
27 |
+ if(!is.null(rownames(M)) || !is.null(rownames(Cov)) || |
|
20 | 28 |
("coef" %in% assayNames(object) && !is.null(rownames(assay(object, "coef")))) || |
21 | 29 |
("se.coef" %in% assayNames(object) && !is.null(rownames(assay(object, "se.coef"))))) |
22 | 30 |
warning("unnecessary rownames in object") |
... | ... |
@@ -34,6 +42,11 @@ setMethod("show", signature(object = "BSseq"), |
34 | 42 |
} else { |
35 | 43 |
cat("has not been smoothed\n") |
36 | 44 |
} |
45 |
+ if (.isHDF5ArrayBacked(object)) { |
|
46 |
+ cat("Some assays are HDF5Array-backed\n") |
|
47 |
+ } else { |
|
48 |
+ cat("All assays are in-memory\n") |
|
49 |
+ } |
|
37 | 50 |
}) |
38 | 51 |
|
39 | 52 |
setMethod("pData", "BSseq", function(object) { |
... | ... |
@@ -93,7 +106,7 @@ getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans |
93 | 106 |
return(BSseq@parameters) |
94 | 107 |
if(type == "gr") |
95 | 108 |
return(BSseq@rowRanges) |
96 |
- |
|
109 |
+ |
|
97 | 110 |
} |
98 | 111 |
|
99 | 112 |
BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
... | ... |
@@ -111,10 +124,14 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
111 | 124 |
stop("'gr' needs to have widths of 1") |
112 | 125 |
if(is.null(M) || is.null(Cov)) |
113 | 126 |
stop("Need M and Cov") |
114 |
- if(!is.matrix(M)) |
|
115 |
- stop("'M' needs to be a matrix") |
|
116 |
- if(!is.matrix(Cov)) |
|
117 |
- stop("'Cov' needs to be a matrix") |
|
127 |
+ M <- .DelayedMatrix(M) |
|
128 |
+ Cov <- .DelayedMatrix(Cov) |
|
129 |
+ if (!is.null(coef)) { |
|
130 |
+ coef <- .DelayedMatrix(coef) |
|
131 |
+ } |
|
132 |
+ if (!is.null(se.coef)) { |
|
133 |
+ se.coef <- .DelayedMatrix(se.coef) |
|
134 |
+ } |
|
118 | 135 |
if(length(gr) != nrow(M) || |
119 | 136 |
length(gr) != nrow(Cov) || |
120 | 137 |
ncol(Cov) != ncol(M)) |
... | ... |
@@ -139,9 +156,15 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
139 | 156 |
if(length(unique(sampleNames)) != ncol(M)) |
140 | 157 |
stop("sampleNames need to be unique and of the right length.") |
141 | 158 |
## check that 0 <= M <= Cov and remove positions with Cov = 0 |
142 |
- if(any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) || |
|
143 |
- any(is.infinite(Cov))) |
|
144 |
- stop("'M' and 'Cov' may not contain NA or infinite values and 0 <= M <= Cov") |
|
159 |
+ if (!.isHDF5ArrayBacked(M) && !.isHDF5ArrayBacked(Cov)) { |
|
160 |
+ ## TODO: This check is super expensive if M or Cov is a HDF5Matrix, so |
|
161 |
+ ## we skip it for the time being |
|
162 |
+ if (any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) || |
|
163 |
+ any(is.infinite(Cov))) { |
|
164 |
+ stop("'M' and 'Cov' may not contain NA or infinite values and ", |
|
165 |
+ "0 <= M <= Cov") |
|
166 |
+ } |
|
167 |
+ } |
|
145 | 168 |
if(rmZeroCov) { |
146 | 169 |
wh <- which(rowSums(Cov) == 0) |
147 | 170 |
if(length(wh) > 0) { |
... | ... |
@@ -156,7 +179,7 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
156 | 179 |
mm <- as.matrix(findOverlaps(grR, gr)) |
157 | 180 |
mm <- mm[order(mm[,1]),] |
158 | 181 |
if(length(grR) == length(gr)) { |
159 |
- ## only re-ordering is necessary |
|
182 |
+ ## only re-ordering is necessary |
|
160 | 183 |
gr <- grR |
161 | 184 |
M <- M[mm[,2],,drop = FALSE] |
162 | 185 |
Cov <- Cov[mm[,2],,drop = FALSE] |
... | ... |
@@ -171,8 +194,31 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
171 | 194 |
gr <- grR |
172 | 195 |
sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))] |
173 | 196 |
names(sp) <- NULL |
174 |
- M <- do.call(rbind, lapply(sp, function(ii) colSums(M[ii,, drop = FALSE]))) |
|
175 |
- Cov <- do.call(rbind, lapply(sp, function(ii) colSums(Cov[ii,, drop = FALSE]))) |
|
197 |
+ # TODO: .collapseDelayedMatrix() always return numeric; it may be |
|
198 |
+ # worth coercing M and Cov to integer DelayedMatrix objects, |
|
199 |
+ # which would halve storage requirements and impose some more |
|
200 |
+ # structure on the BSseq class (via new validity method |
|
201 |
+ # checks) |
|
202 |
+ # NOTE: Tries to be smart about how collapsed DelayedMatrix should |
|
203 |
+ # be realized |
|
204 |
+ if (.isHDF5ArrayBacked(M)) { |
|
205 |
+ M_BACKEND <- "HDF5Array" |
|
206 |
+ } else { |
|
207 |
+ M_BACKEND <- NULL |
|
208 |
+ } |
|
209 |
+ M <- .collapseDelayedMatrix(x = M, |
|
210 |
+ sp = sp, |
|
211 |
+ MARGIN = 2, |
|
212 |
+ BACKEND = M_BACKEND) |
|
213 |
+ if (.isHDF5ArrayBacked(Cov)) { |
|
214 |
+ Cov_BACKEND <- "HDF5Array" |
|
215 |
+ } else { |
|
216 |
+ Cov_BACKEND <- NULL |
|
217 |
+ } |
|
218 |
+ Cov <- .collapseDelayedMatrix(x = Cov, |
|
219 |
+ sp = sp, |
|
220 |
+ MARGIN = 2, |
|
221 |
+ BACKEND = Cov_BACKEND) |
|
176 | 222 |
} |
177 | 223 |
} |
178 | 224 |
if(is.null(colnames(M)) || any(sampleNames != colnames(M))) |
... | ... |
@@ -180,8 +226,8 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
180 | 226 |
if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov))) |
181 | 227 |
colnames(Cov) <- sampleNames |
182 | 228 |
if(!is.null(coef)) { |
183 |
- if(!is.matrix(coef) || |
|
184 |
- nrow(coef) != nrow(M) || |
|
229 |
+ |
|
230 |
+ if(nrow(coef) != nrow(M) || |
|
185 | 231 |
ncol(coef) != ncol(M)) |
186 | 232 |
stop("'coef' does not have the right dimensions") |
187 | 233 |
if(is.null(colnames(coef)) || any(sampleNames != colnames(coef))) |
... | ... |
@@ -190,8 +236,7 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
190 | 236 |
rownames(coef) <- NULL |
191 | 237 |
} |
192 | 238 |
if(!is.null(se.coef)) { |
193 |
- if(!is.matrix(se.coef) || |
|
194 |
- nrow(se.coef) != nrow(M) || |
|
239 |
+ if(nrow(se.coef) != nrow(M) || |
|
195 | 240 |
ncol(se.coef) != ncol(M)) |
196 | 241 |
stop("'se.coef' does not have the right dimensions") |
197 | 242 |
if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef))) |
... | ... |
@@ -213,19 +258,30 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
213 | 258 |
BSseq |
214 | 259 |
} |
215 | 260 |
|
216 |
- |
|
217 | 261 |
setMethod("updateObject", "BSseq", |
218 | 262 |
function(object, ...) { |
219 |
- if(.hasSlot(object, "assays")) { |
|
220 |
- # call method for RangedSummarizedExperiment objects |
|
221 |
- callNextMethod() |
|
222 |
- } else { |
|
223 |
- BSseq(gr = object@gr, M = object@M, Cov = object@Cov, |
|
224 |
- coef = object@coef, se.coef = object@se.coef, |
|
225 |
- trans = object@trans, parameters = object@parameters, |
|
226 |
- pData = object@phenoData@data) |
|
227 |
- } |
|
228 |
- }) |
|
263 |
+ # NOTE: identical() is too strong |
|
264 |
+ if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) { |
|
265 |
+ object@trans <- plogis |
|
266 |
+ } |
|
267 |
+ if (.hasSlot(object, "assays")) { |
|
268 |
+ assays(object) <- endoapply( |
|
269 |
+ assays(object, withDimnames = FALSE), function(assay) { |
|
270 |
+ if (is.null(assay)) { |
|
271 |
+ return(assay) |
|
272 |
+ } else { |
|
273 |
+ .DelayedMatrix(assay) |
|
274 |
+ } |
|
275 |
+ }) |
|
276 |
+ # call method for RangedSummarizedExperiment objects |
|
277 |
+ callNextMethod() |
|
278 |
+ } else { |
|
279 |
+ BSseq(gr = object@gr, M = object@M, Cov = object@Cov, |
|
280 |
+ coef = object@coef, se.coef = object@se.coef, |
|
281 |
+ trans = object@trans, parameters = object@parameters, |
|
282 |
+ pData = object@phenoData@data) |
|
283 |
+ } |
|
284 |
+ }) |
|
229 | 285 |
|
230 | 286 |
|
231 | 287 |
strandCollapse <- function(BSseq, shift = TRUE) { |
... | ... |
@@ -285,7 +341,7 @@ strandCollapse <- function(BSseq, shift = TRUE) { |
285 | 341 |
## ## Based on the delta method |
286 | 342 |
## se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 + |
287 | 343 |
## (data$se.coef[, sample2] * p2 * (1-p2))^2) |
288 |
- |
|
344 |
+ |
|
289 | 345 |
## lower <- d - z * se.d |
290 | 346 |
## upper <- d + z * se.d |
291 | 347 |
## out <- data.frame(d = d, lower = lower, upper = upper) |
... | ... |
@@ -10,12 +10,33 @@ collapseBSseq <- function(BSseq, columns) { |
10 | 10 |
else |
11 | 11 |
columns.idx <- match(names(columns), sampleNames(BSseq)) |
12 | 12 |
sp <- split(columns.idx, columns) |
13 |
- M <- do.call(cbind, lapply(sp, function(ss) { |
|
14 |
- rowSums(getBSseq(BSseq, "M")[, ss, drop = FALSE]) |
|
15 |
- })) |
|
16 |
- Cov <- do.call(cbind, lapply(sp, function(ss) { |
|
17 |
- rowSums(getBSseq(BSseq, "Cov")[, ss, drop = FALSE]) |
|
18 |
- })) |
|
13 |
+ # TODO: .collapseDelayedMatrix() always return numeric; it may be |
|
14 |
+ # worth coercing M and Cov to integer DelayedMatrix objects, |
|
15 |
+ # which would halve storage requirements and impose some more |
|
16 |
+ # structure on the BSseq class (via new validity method |
|
17 |
+ # checks) |
|
18 |
+ # NOTE: Tries to be smart about how collapsed DelayedMatrix should |
|
19 |
+ # be realized |
|
20 |
+ M <- getBSseq(BSseq, "M") |
|
21 |
+ if (.isHDF5ArrayBacked(M)) { |
|
22 |
+ M_BACKEND <- "HDF5Array" |
|
23 |
+ } else { |
|
24 |
+ M_BACKEND <- NULL |
|
25 |
+ } |
|
26 |
+ M <- .collapseDelayedMatrix(x = M, |
|
27 |
+ sp = sp, |
|
28 |
+ MARGIN = 1, |
|
29 |
+ BACKEND = M_BACKEND) |
|
30 |
+ Cov <- getBSseq(BSseq, "Cov") |
|
31 |
+ if (.isHDF5ArrayBacked(Cov)) { |
|
32 |
+ Cov_BACKEND <- "HDF5Array" |
|
33 |
+ } else { |
|
34 |
+ Cov_BACKEND <- NULL |
|
35 |
+ } |
|
36 |
+ Cov <- .collapseDelayedMatrix(x = Cov, |
|
37 |
+ sp = sp, |
|
38 |
+ MARGIN = 1, |
|
39 |
+ BACKEND = Cov_BACKEND) |
|
19 | 40 |
BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov, sampleNames = names(sp)) |
20 | 41 |
} |
21 | 42 |
|
... | ... |
@@ -108,11 +129,10 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
108 | 129 |
outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions)) |
109 | 130 |
colnames(outMatrix) <- sampleNames(BSseq) |
110 | 131 |
outMatrix[as.integer(rownames(out)),] <- out |
111 |
- return(outMatrix) |
|
132 |
+ .DelayedMatrix(outMatrix) |
|
112 | 133 |
} |
113 | 134 |
} |
114 | 135 |
|
115 |
- |
|
116 | 136 |
getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"), |
117 | 137 |
what = c("perBase", "perRegionAverage", "perRegionTotal")) { |
118 | 138 |
stopifnot(is(BSseq, "BSseq")) |
... | ... |
@@ -175,7 +195,7 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"), |
175 | 195 |
outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions)) |
176 | 196 |
colnames(outMatrix) <- sampleNames(BSseq) |
177 | 197 |
outMatrix[as.integer(rownames(out)),] <- out |
178 |
- return(outMatrix) |
|
198 |
+ .DelayedMatrix(outMatrix) |
|
179 | 199 |
} |
180 | 200 |
|
181 | 201 |
|
182 | 202 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,91 @@ |
1 |
+# Functions/methods that would be good to have in DelayedArray |
|
2 |
+ |
|
3 |
+.rowVars <- function(x, rows = NULL, cols = NULL, ...) { |
|
4 |
+ if (is(x, "DelayedArray")) { |
|
5 |
+ if (!is.null(rows)) { |
|
6 |
+ x <- x[rows, ] |
|
7 |
+ } |
|
8 |
+ if (!is.null(cols)) { |
|
9 |
+ x <- x[, cols] |
|
10 |
+ } |
|
11 |
+ row_vars <- rowVars(as.array(x), ...) |
|
12 |
+ } else { |
|
13 |
+ row_vars <- rowVars(x, rows = rows, cols = cols, ...) |
|
14 |
+ } |
|
15 |
+ row_vars |
|
16 |
+} |
|
17 |
+ |
|
18 |
+.rowSds <- function(x, rows = NULL, cols = NULL, ...) { |
|
19 |
+ row_vars <- .rowVars(x, rows = rows, cols = cols, ...) |
|
20 |
+ sqrt(row_vars) |
|
21 |
+} |
|
22 |
+ |
|
23 |
+.quantile <- function(x, ...) { |
|
24 |
+ if (is(x, "DelayedArray")) { |
|
25 |
+ x <- as.array(x) |
|
26 |
+ } |
|
27 |
+ quantile(x, ...) |
|
28 |
+} |
|
29 |
+ |
|
30 |
+.DelayedMatrix <- function(x) { |
|
31 |
+ x_name <- deparse(substitute(x)) |
|
32 |
+ X <- try(DelayedArray(x), silent = TRUE) |
|
33 |
+ if (is(X, "try-error")) { |
|
34 |
+ stop("Could not construct DelayedMatrix from '", x_name, "'", |
|
35 |
+ call. = FALSE) |
|
36 |
+ } |
|
37 |
+ if (!is(X, "DelayedMatrix")) { |
|
38 |
+ stop("'", x_name, "' must be matrix-like", call. = FALSE) |
|
39 |
+ } |
|
40 |
+ X |
|
41 |
+} |
|
42 |
+ |
|
43 |
+.isSimpleDelayedMatrix <- function(x) { |
|
44 |
+ is(x@seed, "matrix") |
|
45 |
+} |
|
46 |
+ |
|
47 |
+# NOTE: Equivalent to rowSums(x[, j, drop = FALSE]) but does it using a |
|
48 |
+# delayed operation and always returns a nrow(x) x 1 DelayedMatrix |
|
49 |
+.delayed_rowSums <- function(x, j) { |
|
50 |
+ Reduce(`+`, lapply(j, function(jj) x[, jj, drop = FALSE])) |
|
51 |
+} |
|
52 |
+ |
|
53 |
+# NOTE: Equivalent to colSums(x[i, , drop = FALSE]) but does it using a |
|
54 |
+# delayed operation and always returns a 1 x ncol(x) DelayedMatrix |
|
55 |
+.delayed_colSums <- function(x, i) { |
|
56 |
+ Reduce(`+`, lapply(i, function(ii) x[ii, , drop = FALSE])) |
|
57 |
+} |
|
58 |
+ |
|
59 |
+# MARGIN = 1: collapse using rowSums |
|
60 |
+# MARGIN = 2: collapse using colSums |
|
61 |
+.collapseDelayedMatrix <- function(x, sp, MARGIN, BACKEND = NULL) { |
|
62 |
+ stopifnot(is(x, "DelayedMatrix")) |
|
63 |
+ if (MARGIN == 1) { |
|
64 |
+ if (is.null(BACKEND)) { |
|
65 |
+ collapsed_x <- do.call(cbind, lapply(sp, function(j) { |
|
66 |
+ rowSums(x[, j, drop = FALSE]) |
|
67 |
+ })) |
|
68 |
+ } else { |
|
69 |
+ collapsed_x <- do.call(cbind, lapply(sp, function(j) { |
|
70 |
+ .delayed_rowSums(x, j) |
|
71 |
+ })) |
|
72 |
+ # NOTE: Need to manually add colnames when using this method |
|
73 |
+ colnames(collapsed_x) <- names(sp) |
|
74 |
+ } |
|
75 |
+ } else if (MARGIN == 2) { |
|
76 |
+ if (is.null(BACKEND)) { |
|
77 |
+ collapsed_x <- do.call(rbind, lapply(sp, function(i) { |
|
78 |
+ colSums(x[i, , drop = FALSE]) |
|
79 |
+ })) |
|
80 |
+ } else { |
|
81 |
+ collapsed_x <- do.call(rbind, lapply(sp, function(i) { |
|
82 |
+ .delayed_colSums(x, i) |
|
83 |
+ })) |
|
84 |
+ # NOTE: Need to manually add rownames when using this method |
|
85 |
+ rownames(collapsed_x) <- names(sp) |
|
86 |
+ } |
|
87 |
+ } else { |
|
88 |
+ stop("'MARGIN' must be 1 or 2") |
|
89 |
+ } |
|
90 |
+ realize(collapsed_x, BACKEND = BACKEND) |
|
91 |
+} |
... | ... |
@@ -1,106 +1,219 @@ |
1 |
+# TODO: Ultimately, combine() is just a special case of combineList(), so |
|
2 |
+# should simplify code |
|
1 | 3 |
setMethod("combine", signature(x = "BSseq", y = "BSseq"), function(x, y, ...) { |
2 | 4 |
## All of this assumes that we are place x and y "next" to each other, |
3 | 5 |
## ie. we are not combining the same set of samples sequenced at different times |
4 |
- if (class(x) != class(y)) |
|
6 |
+ if (class(x) != class(y)) { |
|
5 | 7 |
stop(paste("objects must be the same class, but are ", |
6 |
- class(x), ", ", class(y), sep="")) |
|
7 |
- if(hasBeenSmoothed(x) && hasBeenSmoothed(y) && !all.equal(x@trans, y@trans)) |
|
8 |
+ class(x), ", ", class(y), se = "")) |
|
9 |
+ } |
|
10 |
+ if (hasBeenSmoothed(x) && hasBeenSmoothed(y) && |
|
11 |
+ !all.equal(getBSseq(x, "trans"), getBSseq(y, "trans")) && |
|
12 |
+ !all.equal(getBSseq(x, "parameters"), getBSseq(y, "parameters"))) { |
|
8 | 13 |
stop("'x' and 'y' need to be smoothed on the same scale") |
14 |
+ } |
|
9 | 15 |
pData <- combine(as(pData(x), "data.frame"), as(pData(y), "data.frame")) |
10 |
- if(identical(granges(x), granges(y))) { |
|
16 |
+ # TODO: Could relax this by sorting x and y before doing the next steps |
|
17 |
+ if (identical(granges(x), granges(y))) { |
|
18 |
+ # NOTE: Don't realize() M, Cov, coef, or se.coef because we are |
|
19 |
+ # cbind()-ing, which is very efficient for DelayedArray objects. |
|
20 |
+ # E.g., cbind()-ing a bunch of HDF5Matrix objects is super fast |
|
21 |
+ # and uses basically no memory whereas realize()-ing the final |
|
22 |
+ # object as a new HDF5Matrix can take a long time. |
|
11 | 23 |
gr <- granges(x) |
12 | 24 |
M <- cbind(getBSseq(x, "M"), getBSseq(y, "M")) |
13 | 25 |
Cov <- cbind(getBSseq(x, "Cov"), getBSseq(y, "Cov")) |
14 |
- if(!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) { |
|
26 |
+ if (!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) { |
|
15 | 27 |
coef <- NULL |
16 | 28 |
se.coef <- NULL |
17 | 29 |
trans <- NULL |
30 |
+ parameters <- NULL |
|
18 | 31 |
} else { |
32 |
+ x_trans <- getBSseq(x, "trans") |
|
33 |
+ y_trans <- getBSseq(y, "trans") |
|
34 |
+ if (!identical(x_trans, y_trans)) { |
|
35 |
+ stop("'x' and 'y' need to have the same 'trans'") |
|
36 |
+ } |
|
37 |
+ trans <- x_trans |
|
38 |
+ x_parameters <- getBSseq(x, "parameters") |
|
39 |
+ y_parameters <- getBSseq(y, "parameters") |
|
40 |
+ if (!identical(x_parameters, y_parameters)) { |
|
41 |
+ stop("'x' and 'y' need to have the same 'parameters'") |
|
42 |
+ } |
|
43 |
+ parameters <- x_parameters |
|
19 | 44 |
coef <- cbind(getBSseq(x, "coef"), getBSseq(y, "coef")) |
20 |
- se.coef <- cbind(getBSseq(x, "se.coef"), getBSseq(y, "se.coef")) |
|
21 |
- trans <- getBSseq(x, "trans") |
|
45 |
+ if (!is.null(getBSseq(x, "se.coef")) && |
|
46 |
+ !is.null(getBSseq(y, "se.coef"))) { |
|
47 |
+ se.coef <- cbind(getBSseq(x, "se.coef"), getBSseq(y, "se.coef")) |
|
48 |
+ } else { |
|
49 |
+ if (!is.null(getBSseq(x, "se.coef")) || |
|
50 |
+ !is.null(getBSseq(y, "se.coef"))) { |
|
51 |
+ warning("Setting 'se.coef' to NULL: Cannot combine() ", |
|
52 |
+ "these because one of 'x' or 'y' is missing ", |
|
53 |
+ "'se.coef'") |
|
54 |
+ } |
|
55 |
+ se.coef <- NULL |
|
56 |
+ } |
|
22 | 57 |
} |
23 | 58 |
} else { |
24 | 59 |
gr <- reduce(c(granges(x), granges(y)), min.gapwidth = 0L) |
25 | 60 |
mm.x <- as.matrix(findOverlaps(gr, granges(x))) |
26 | 61 |
mm.y <- as.matrix(findOverlaps(gr, granges(y))) |
27 |
- sampleNames <- c(sampleNames(x), sampleNames(y)) |
|
62 |
+ I <- list(mm.x[, 1], mm.y[, 1]) |
|
28 | 63 |
## FIXME: there is no check that the two sampleNames are disjoint. |
29 |
- M <- Cov <- matrix(0, nrow = length(gr), ncol = length(sampleNames)) |
|
30 |
- colnames(M) <- colnames(Cov) <- sampleNames |
|
31 |
- M[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "M")[mm.x[,2],] |
|
32 |
- M[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "M")[mm.y[,2],] |
|
33 |
- Cov[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "Cov")[mm.x[,2],] |
|
34 |
- Cov[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "Cov")[mm.y[,2],] |
|
35 |
- if(!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) { |
|
36 |
- coef <- NULL |
|
37 |
- se.coef <- NULL |
|
38 |
- trans <- NULL |
|
64 |
+ sampleNames <- c(sampleNames(x), sampleNames(y)) |
|
65 |
+ M_X <- list(getBSseq(x, "M"), getBSseq(y, "M")) |
|
66 |
+ if (any(vapply(M_X, .isHDF5ArrayBacked, logical(1L)))) { |
|
67 |
+ M_BACKEND <- "HDF5Array" |
|
39 | 68 |
} else { |
40 |
- trans <- x@trans |
|
41 |
- coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames)) |
|
42 |
- colnames(coef) <- rownames(pData) |
|
43 |
- if(hasBeenSmoothed(x)) |
|
44 |
- coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "coef")[mm.x[,2],] |
|
45 |
- if(hasBeenSmoothed(y)) |
|
46 |
- coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "coef")[mm.y[,2],] |
|
47 |
- if(is.null(getBSseq(x, "se.coef")) && is.null(getBSseq(x, "se.coef"))) |
|
48 |
- se.coef <- NULL |
|
49 |
- else { |
|
50 |
- se.coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames)) |
|
51 |
- colnames(se.coef) <- sampleNames(pData) |
|
52 |
- if(!is.null(getBSseq(x, "se.coef"))) |
|
53 |
- se.coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "se.coef")[mm.x[,2],] |
|
54 |
- if(!is.null(getBSseq(y, "se.coef"))) |
|
55 |
- se.coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "se.coef")[mm.y[,2],] |
|
56 |
- } |
|
69 |
+ M_BACKEND <- NULL |
|
57 | 70 |
} |
71 |
+ # TODO: Figure out if fill should be 0 or 0L based on M_X |
|
72 |
+ M <- .combineListOfDelayedMatrixObjects( |
|
73 |
+ X = M_X, |
|
74 |
+ I = I, |
|
75 |
+ nrow = length(gr), |
|
76 |
+ ncol = length(sampleNames), |
|
77 |
+ dimnames = list(NULL, sampleNames), |
|
78 |
+ fill = 0L, |
|
79 |
+ BACKEND = M_BACKEND) |
|
80 |
+ Cov_X <- list(getBSseq(x, "Cov"), getBSseq(y, "Cov")) |
|
81 |
+ if (any(vapply(Cov_X, .isHDF5ArrayBacked, logical(1L)))) { |
|
82 |
+ Cov_BACKEND <- "HDF5Array" |
|
83 |
+ } else { |
|
84 |
+ Cov_BACKEND <- NULL |
|
85 |
+ } |
|
86 |
+ Cov <- .combineListOfDelayedMatrixObjects( |
|
87 |
+ X = list(getBSseq(x, "Cov"), getBSseq(y, "Cov")), |
|
88 |
+ I = I, |
|
89 |
+ nrow = length(gr), |
|
90 |
+ ncol = length(sampleNames), |
|
91 |
+ dimnames = list(NULL, sampleNames), |
|
92 |
+ fill = 0L, |
|
93 |
+ BACKEND = Cov_BACKEND) |
|
94 |
+ if (hasBeenSmoothed(x) || hasBeenSmoothed(y)) { |
|
95 |
+ warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ", |
|
96 |
+ "these because 'x' and 'y' have different rowRanges") |
|
97 |
+ } |
|
98 |
+ coef <- NULL |
|
99 |
+ se.coef <- NULL |
|
100 |
+ trans <- NULL |
|
101 |
+ parameters <- NULL |
|
58 | 102 |
} |
59 | 103 |
BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef, |
60 |
- pData = pData, trans = trans, rmZeroCov = FALSE) |
|
104 |
+ pData = pData, trans = trans, parameters = parameters, |
|
105 |
+ rmZeroCov = FALSE) |
|
61 | 106 |
}) |
62 | 107 |
|
63 |
-combineList <- function(x, ...) { |
|
64 |
- if(class(x) == "BSseq") |
|
108 |
+combineList <- function(x, ..., BACKEND = NULL) { |
|
109 |
+ if (class(x) == "BSseq") { |
|
65 | 110 |
x <- list(x, ...) |
111 |
+ } |
|
66 | 112 |
stopifnot(all(sapply(x, class) == "BSseq")) |
67 |
- gr <- getBSseq(x[[1]], "gr") |
|
68 | 113 |
trans <- getBSseq(x[[1]], "trans") |
69 | 114 |
sameTrans <- sapply(x[-1], function(xx) { |
70 | 115 |
identical(trans, getBSseq(xx, "trans")) |
71 | 116 |
}) |
72 |
- if(!all(sameTrans)) |
|
73 |
- stop("all elements of '...' in combineList needs to have the same trans") |
|
117 |
+ if (!all(sameTrans)) { |
|
118 |
+ stop("all elements of '...' in combineList needs to have the same ", |
|
119 |
+ "'trans'") |
|
120 |
+ } |
|
121 |
+ parameters <- getBSseq(x[[1]], "parameters") |
|
122 |
+ same_parameters <- vapply(x, function(xx) { |
|
123 |
+ identical(getBSseq(xx, "parameters"), parameters) |
|
124 |
+ }, logical(1L)) |
|
125 |
+ if (!all(same_parameters)) { |
|
126 |
+ stop("all elements of '...' in combineList needs to have the same ", |
|
127 |
+ "'parameters'") |
|
128 |
+ } |
|
129 |
+ # TODO: Could relax this by sorting all elements of x before doing the |
|
130 |
+ # next steps |
|
131 |
+ gr <- getBSseq(x[[1]], "gr") |
|
74 | 132 |
sameGr <- sapply(x[-1], function(xx) { |
75 | 133 |
identical(gr, getBSseq(xx, "gr")) |
76 | 134 |
}) |
77 |
- if(all(sameGr)) { |
|
135 |
+ if (all(sameGr)) { |
|
136 |
+ # NOTE: Don't realize() M, Cov, coef, or se.coef because we are |
|
137 |
+ # cbind()-ing, which is very efficient for DelayedArray objects. |
|
138 |
+ # E.g., cbind()-ing a bunch of HDF5Matrix objects is super fast |
|
139 |
+ # and uses basically no memory whereas realize()-ing the final |
|
140 |
+ # object as a new HDF5Matrix can take a long time. |
|
78 | 141 |
M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M"))) |
79 | 142 |
Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov"))) |
143 |
+ if (any(!vapply(x, hasBeenSmoothed, logical(1L)))) { |
|
144 |
+ warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ", |
|
145 |
+ "these because BSseq objects have different rowRanges") |
|
146 |
+ coef <- NULL |
|
147 |
+ se.coef <- NULL |
|
148 |
+ trans <- NULL |
|
149 |
+ } else { |
|
150 |
+ list_of_trans <- lapply(x, getBSseq, "trans") |
|
151 |
+ if (!all(vapply(list_of_trans, function(trans) { |
|
152 |
+ identical(trans, list_of_trans[[1]]) |
|
153 |
+ }, logical(1L)))) { |
|
154 |
+ stop("All BSseq objects need to have the same 'trans'") |
|
155 |
+ } |
|
156 |
+ trans <- list_of_trans[[1]] |
|
157 |
+ list_of_parameters <- lapply(x, getBSseq, "parameters") |
|
158 |
+ if (!all(vapply(list_of_parameters, function(parameters) { |
|
159 |
+ identical(parameters, list_of_parameters[[1]]) |
|
160 |
+ }, logical(1L)))) { |
|
161 |
+ stop("All BSseq objects need to have the same 'parameters'") |
|
162 |
+ } |
|
163 |
+ parameters <- list_of_parameters[[1]] |
|
164 |
+ list_of_coef <- lapply(x, function(xx) getBSseq(xx, "coef")) |
|
165 |
+ coef <- do.call(cbind, list_of_coef) |
|
166 |
+ list_of_se.coef <- lapply(x, function(xx) getBSseq(xx, "se.coef")) |
|
167 |
+ if (all(!vapply(list_of_coef, is.null, logical(1L)))) { |
|
168 |
+ se.coef <- do.call(cbind, list_of_se.coef) |
|
169 |
+ } else { |
|
170 |
+ if (any(vapply(list_of_coef, is.null, logical(1L)))) { |
|
171 |
+ warning("Setting 'se.coef' to NULL: Cannot combine() ", |
|
172 |
+ "these because at least of the BSseq objects ", |
|
173 |
+ "is missing 'se.coef'") |
|
174 |
+ } |
|
175 |
+ se.coef <- NULL |
|
176 |
+ } |
|
177 |
+ } |
|
80 | 178 |
} else { |
81 |
- gr <- sort(reduce(do.call(c, unname(lapply(x, granges))), min.gapwidth = 0L)) |
|
179 |
+ gr <- sort(reduce(do.call(c, unname(lapply(x, granges))), |
|
180 |
+ min.gapwidth = 0L)) |
|
181 |
+ I <- lapply(x, function(xx) { |
|
182 |
+ ov <- findOverlaps(gr, xx) |
|
183 |
+ queryHits(ov) |
|
184 |
+ }) |
|
82 | 185 |
sampleNames <- do.call(c, unname(lapply(x, sampleNames))) |
83 |
- M <- matrix(0, ncol = length(sampleNames), nrow = length(gr)) |
|
84 |
- colnames(M) <- sampleNames |
|
85 |
- Cov <- M |
|
86 |
- idxes <- split(seq(along = sampleNames), rep(seq(along = x), times = sapply(x, ncol))) |
|
87 |
- for(ii in seq(along = idxes)) { |
|
88 |
- ov <- findOverlaps(gr, granges(x[[ii]])) |
|
89 |
- idx <- idxes[[ii]] |
|
90 |
- M[queryHits(ov), idx] <- getBSseq(x[[ii]], "M")[subjectHits(ov),] |
|
91 |
- Cov[queryHits(ov), idx] <- getBSseq(x[[ii]], "Cov")[subjectHits(ov),] |
|
186 |
+ ## FIXME: there is no check that the sampleNames are unique/disjoint. |
|
187 |
+ # TODO: Figure out if fill should be 0 or 0L based on X (getting it |
|
188 |
+ # right will avoid a copy/cast) |
|
189 |
+ M <- .combineListOfDelayedMatrixObjects( |
|
190 |
+ X = lapply(x, getBSseq, "M"), |
|
191 |
+ I = I, |
|
192 |
+ nrow = length(gr), |
|
193 |
+ ncol = length(sampleNames), |
|
194 |
+ dimnames = list(NULL, sampleNames), |
|
195 |
+ fill = 0, |
|
196 |
+ BACKEND = BACKEND) |
|
197 |
+ Cov <- .combineListOfDelayedMatrixObjects( |
|
198 |
+ X = lapply(x, getBSseq, "Cov"), |
|
199 |
+ I = I, |
|
200 |
+ nrow = length(gr), |
|
201 |
+ ncol = length(sampleNames), |
|
202 |
+ dimnames = list(NULL, sampleNames), |
|
203 |
+ fill = 0, |
|
204 |
+ BACKEND = BACKEND) |
|
205 |
+ if (any(vapply(x, hasBeenSmoothed, logical(1L)))) { |
|
206 |
+ warning("Setting 'coef' and 'se.coef' to NULL: Cannot combine() ", |
|
207 |
+ "these because BSseq objects have different rowRanges") |
|
92 | 208 |
} |
93 |
- } |
|
94 |
- if(any(!sapply(x, hasBeenSmoothed)) || !(all(sameGr))) { |
|
95 | 209 |
coef <- NULL |
96 | 210 |
se.coef <- NULL |
97 | 211 |
trans <- NULL |
98 |
- } else { |
|
99 |
- coef <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "coef"))) |
|
100 |
- se.coef <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "se.coef"))) |
|
101 | 212 |
} |
102 |
- pData <- as(Reduce(combine, lapply(x, function(xx) as.data.frame(pData(xx)))), "DataFrame") |
|
213 |
+ pData <- as(Reduce(combine, |
|
214 |
+ lapply(x, function(xx) as.data.frame(pData(xx)))), |
|
215 |
+ "DataFrame") |
|
103 | 216 |
BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef, |
104 |
- pData = pData, trans = trans, rmZeroCov = FALSE) |
|
217 |
+ pData = pData, trans = trans, parameters = parameters, |
|
218 |
+ rmZeroCov = FALSE) |
|
105 | 219 |
} |
106 |
- |
107 | 220 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,79 @@ |
1 |
+# NOTE: DelayedMatrix objects don't suport subassignment ([<- or [[<-). This |
|
2 |
+# function supports a special case of subassignment for DelayedMatrix |
|
3 |
+# objects namely: |
|
4 |
+# - Create a DelayedMatrix, value, with `nrow` and `ncol` |
|
5 |
+# - Fill in value[i, ] with `x` |
|
6 |
+# - Fill in remaining rows with `fill` |
|
7 |
+.subassignRowsDelayedMatrix <- function(x, i, nrow, fill = NA_integer_, |
|
8 |
+ BACKEND = NULL, by_row = FALSE) { |
|
9 |
+ if (is.null(BACKEND)) { |
|
10 |
+ if (by_row) { |
|
11 |
+ warning("'by_row' ignored when 'BACKEND' is NULL") |
|
12 |
+ } |
|
13 |
+ z <- matrix(fill, nrow = nrow, ncol = ncol(x)) |
|
14 |
+ z[i, ] <- as.array(x) |
|
15 |
+ return(realize(z, BACKEND = BACKEND)) |
|
16 |
+ } else { |
|
17 |
+ # NOTE: Believe by_row == FALSE is better for the situations it's being |
|
18 |
+ # called in by bsseq but not done extensive benchmarking |
|
19 |
+ if (by_row) { |
|
20 |
+ # Construct the 'missing' rows of the returned matrix in memory, |
|
21 |
+ # then realize(), then rbind these rows, then reorder the rows, |
|
22 |
+ # then realize() the returned matrix |
|
23 |
+ missing_i <- setdiff(seq_len(nrow), i) |
|
24 |
+ x2 <- .DelayedMatrix(matrix(fill, |
|
25 |
+ nrow = nrow - nrow(x), |
|
26 |
+ ncol = ncol(x))) |
|
27 |
+ x2 <- realize(x2, BACKEND = BACKEND) |
|
28 |
+ z <- rbind(x, x2) |
|
29 |
+ z <- z[order(c(i, missing_i)), ] |
|
30 |
+ return(realize(z, BACKEND = BACKEND)) |
|
31 |
+ } else { |
|
32 |
+ # Construct each column of the returned matrix in memory, then |
|
33 |
+ # realize(), then cbind these columns |
|
34 |
+ zjs <- lapply(seq_len(ncol(x)), function(j) { |
|
35 |
+ zj <- matrix(fill, nrow = nrow, ncol = 1) |
|
36 |
+ zj[i] <- as.array(x[, j]) |
|
37 |
+ realize(zj, BACKEND = BACKEND) |
|
38 |
+ }) |
|
39 |
+ do.call(cbind, zjs) |
|
40 |
+ } |
|
41 |
+ } |
|
42 |
+} |
|
43 |
+ |
|
44 |
+# X is a list of DelayedMatrix objects |
|
45 |
+# I is a list of row indices where I[[k]] is the vector of row indices for |
|
46 |
+# X[[k]] |
|
47 |
+.combineListOfDelayedMatrixObjects <- function(X, I, nrow, ncol, |
|
48 |
+ dimnames = NULL, |
|
49 |
+ fill = NA_integer_, |
|
50 |
+ BACKEND = NULL) { |
|
51 |
+ stopifnot(length(X) == length(I)) |
|
52 |
+ is_delayed_matrix <- vapply(X, function(XX) is(XX, "DelayedMatrix"), |
|
53 |
+ logical(1L)) |
|
54 |
+ stopifnot(all(is_delayed_matrix)) |
|
55 |
+ is_simple_delayed_matrix <- vapply(X, .isSimpleDelayedMatrix, logical(1L)) |
|
56 |
+ # NOTE: If all DelayedMatrix objects are 'simple' (i.e. the @seed of each |
|
57 |
+ # is a base::matrix), then can use this much faster and more |
|
58 |
+ # memory-efficient method to combine the DelayedMatrix objects into |
|
59 |
+ # a single DelayedMatrix |
|
60 |
+ if (all(is_simple_delayed_matrix) && is.null(BACKEND)) { |
|
61 |
+ X <- endoapply(X, as.array) |
|
62 |
+ z <- matrix(fill, nrow, ncol, dimnames = dimnames) |
|
63 |
+ j0 <- 0 |
|
64 |
+ for (j in seq_along(X)) { |
|
65 |
+ i <- I[[j]] |
|
66 |
+ jj <- j0 + seq_len(ncol(X[[j]])) |
|
67 |
+ z[i, jj] <- X[[j]] |
|
68 |
+ j0 <- j0 + ncol(X[[j]]) |
|
69 |
+ } |
|
70 |
+ return(realize(z, BACKEND = BACKEND)) |
|
71 |
+ } else { |
|
72 |
+ z <- mapply(.subassignRowsDelayedMatrix, x = X, i = I, |
|
73 |
+ MoreArgs = list(nrow = nrow, fill = fill, |
|
74 |
+ BACKEND = BACKEND)) |
|
75 |
+ z <- do.call("cbind", z) |
|
76 |
+ dimnames(z) <- dimnames |
|
77 |
+ z |
|
78 |
+ } |
|
79 |
+} |
... | ... |
@@ -1,20 +1,21 @@ |
1 |
+# NOTE: Realises in memory a matrix with nrow = length(bstat) and ncol = 1 |
|
1 | 2 |
dmrFinder <- function(bstat, cutoff = NULL, qcutoff = c(0.025, 0.975), |
2 | 3 |
maxGap = 300, stat = "tstat.corrected", verbose = TRUE) { |
3 | 4 |
if(is(bstat, "BSseqTstat")) { |
4 | 5 |
if(! stat %in% colnames(bstat@stats)) |
5 | 6 |
stop("'stat' needs to be a column of 'bstat@stats'") |
6 |
- dmrStat <- bstat@stats[, stat] |
|
7 |
+ dmrStat <- as.array(bstat@stats[, stat, drop = FALSE]) |
|
7 | 8 |
} |
8 | 9 |
if(is(bstat, "BSseqStat")) { |
9 |
- dmrStat <- getStats(bstat, what = "stat") |
|
10 |
+ dmrStat <- as.array(getStats(bstat, what = "stat")) |
|
10 | 11 |
} |
11 | 12 |
subverbose <- max(as.integer(verbose) - 1L, 0L) |
12 | 13 |
if(is.null(cutoff)) |
13 |
- cutoff <- quantile(dmrStat, qcutoff) |
|
14 |
+ cutoff <- .quantile(dmrStat, qcutoff) |
|
14 | 15 |
if(length(cutoff) == 1) |
15 | 16 |
cutoff <- c(-cutoff, cutoff) |
16 | 17 |
direction <- as.integer(dmrStat >= cutoff[2]) |
17 |
- direction[dmrStat <= cutoff[1]] <- -1L |
|
18 |
+ direction[as.array(dmrStat <= cutoff[1])] <- -1L |
|
18 | 19 |
direction[is.na(direction)] <- 0L |
19 | 20 |
chrs <- as.character(seqnames(bstat)) |
20 | 21 |
positions <- start(bstat) |
... | ... |
@@ -5,6 +5,16 @@ getStats <- function(bstat, regions = NULL, ...) { |
5 | 5 |
getStats_BSseqStat(bstat, regions = regions, ...) |
6 | 6 |
} |
7 | 7 |
|
8 |
+# NOTE: Realises in memory a matrix with nrow = length(hits) and ncol = 1 |
|
9 |
+.getRegionStats <- function(stat, hits, na.rm = FALSE) { |
|
10 |
+ stat_by_region <- split(as.array(stat[queryHits(hits), ]), |
|
11 |
+ subjectHits(hits)) |
|
12 |
+ data.frame(areaStat = vapply(stat_by_region, sum, numeric(1), |
|
13 |
+ na.rm = na.rm, USE.NAMES = FALSE), |
|
14 |
+ maxStat = vapply(stat_by_region, max, numeric(1), |
|
15 |
+ na.rm = na.rm, USE.NAMES = FALSE)) |
|
16 |
+} |
|
17 |
+ |
|
8 | 18 |
getStats_BSseqStat <- function(BSseqStat, regions = NULL, what = NULL) { |
9 | 19 |
stopifnot(is(BSseqStat, "BSseqStat")) |
10 | 20 |
if(!is.null(what)) { |
... | ... |
@@ -16,24 +26,37 @@ getStats_BSseqStat <- function(BSseqStat, regions = NULL, what = NULL) { |
16 | 26 |
## Now we have regions and no what |
17 | 27 |
if(class(regions) == "data.frame") |
18 | 28 |
regions <- data.frame2GRanges(regions) |
19 |
- ov <- findOverlaps(BSseqStat, regions) |
|
20 |
- ov.sp <- split(queryHits(ov), subjectHits(ov)) |
|
21 |
- ## We need areaStat and maxStat |
|
22 |
- ## Could need averageMeth in each group? |
|
23 |
- ## Need to have a specific design for that |
|
24 |
- ## Could at least get contrast coefficient |
|
25 |
- getRegionStats <- function(idx) { |
|
26 |
- areaStat <- sum(BSseqStat@stats$stat[idx], na.rm = TRUE) |
|
27 |
- maxStat <- max(BSseqStat@stats$stat[idx], na.rm = TRUE) |
|
28 |
- c(areaStat, maxStat) |
|
29 |
- } |
|
30 |
- regionStats <- matrix(NA, ncol = 2, nrow = length(regions)) |
|
31 |
- colnames(regionStats) <- c("areaStat", "maxStat") |
|
32 |
- tmp <- lapply(ov.sp, getRegionStats) |
|
33 |
- regionStats[as.integer(names(tmp)),] <- do.call(rbind, tmp) |
|
29 |
+ hits <- findOverlaps(BSseqStat, regions) |
|
30 |
+ regionStats <- .getRegionStats(stat = BSseqStat@stats$stat, |
|
31 |
+ hits = hits) |
|
34 | 32 |
regionStats |
35 | 33 |
} |
36 | 34 |
|
35 |
+# NOTE: Realises in memory a matrix with nrow = length(hits) and ncol = 3 |
|
36 |
+.getRegionStats_ttest <- function(stats, hits, na.rm = FALSE) { |
|
37 |
+ stat_names <- c("group1.means", "group2.means", "tstat.sd") |
|
38 |
+ stats <- as.array(stats[queryHits(hits), stat_names]) |
|
39 |
+ stats_by_region <- lapply(seq_along(stat_names), function(j) { |
|
40 |
+ split(stats[, j], subjectHits(hits)) |
|
41 |
+ }) |
|
42 |
+ names(stats_by_region) <- stat_names |
|
43 |
+ meanDiff <- mapply(function(g1, g2, na.rm) { |
|
44 |
+ mean(g1 - g2, na.rm = na.rm) |
|
45 |
+ }, g1 = stats_by_region[["group1.means"]], |
|
46 |
+ g2 = stats_by_region[["group2.means"]], |
|
47 |
+ MoreArgs = list(na.rm = na.rm), SIMPLIFY = TRUE, USE.NAMES = FALSE) |
|
48 |
+ group1.mean <- vapply(stats_by_region[["group1.means"]], mean, numeric(1), |
|
49 |
+ na.rm = na.rm, USE.NAMES = FALSE) |
|
50 |
+ group2.mean <- vapply(stats_by_region[["group2.means"]], mean, numeric(1), |
|
51 |
+ na.rm = na.rm, USE.NAMES = FALSE) |
|
52 |
+ tstat.sd <- vapply(stats_by_region[["tstat.sd"]], mean, numeric(1), |
|
53 |
+ na.rm = na.rm, USE.NAMES = FALSE) |
|
54 |
+ data.frame(meanDiff = meanDiff, |
|
55 |
+ group1.mean = group1.mean, |
|
56 |
+ group2.mean = group2.mean, |
|
57 |
+ tstat.sd = tstat.sd) |
|
58 |
+} |
|
59 |
+ |
|
37 | 60 |
getStats_BSseqTstat <- function(BSseqTstat, regions = NULL, stat = "tstat.corrected") { |
38 | 61 |
stopifnot(is(BSseqTstat, "BSseqTstat")) |
39 | 62 |
if(is.null(regions)) |
... | ... |
@@ -43,33 +66,13 @@ getStats_BSseqTstat <- function(BSseqTstat, regions = NULL, stat = "tstat.correc |
43 | 66 |
stopifnot(stat %in% colnames(BSseqTstat@stats)) |
44 | 67 |
stopifnot(length(stat) == 1) |
45 | 68 |
stopifnot(is(regions, "GenomicRanges")) |
46 |
- ov <- findOverlaps(BSseqTstat, regions) |
|
47 |
- ov.sp <- split(queryHits(ov), subjectHits(ov)) |
|
48 |
- getRegionStats <- function(idx) { |
|
49 |
- mat <- BSseqTstat@stats[idx,, drop=FALSE] |
|
50 |
- areaStat <- sum(mat[, stat]) |
|
51 |
- maxStat <- max(mat[, stat]) |
|
52 |
- c(areaStat, maxStat) |
|
53 |
- } |
|
54 |
- stats <- matrix(NA, ncol = 2, nrow = length(regions)) |
|
55 |
- colnames(stats) <- c("areaStat", "maxStat") |
|
56 |
- tmp <- lapply(ov.sp, getRegionStats) |
|
57 |
- stats[as.integer(names(tmp)),] <- do.call(rbind, tmp) |
|
58 |
- out <- as.data.frame(stats) |
|
69 |
+ hits <- findOverlaps(BSseqTstat, regions) |
|
70 |
+ out <- .getRegionStats(stat = BSseqTstat@stats[, stat, drop = FALSE], |
|
71 |
+ hits = hits) |
|
59 | 72 |
if(! stat %in% c("tstat.corrected", "tstat")) |
60 | 73 |
return(out) |
61 |
- getRegionStats_ttest <- function(idx) { |
|
62 |
- mat <- BSseqTstat@stats[idx,, drop=FALSE] |
|
63 |
- group1.mean <- mean(mat[, "group1.means"]) |
|
64 |
- group2.mean <- mean(mat[, "group2.means"]) |
|
65 |
- meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"]) |
|
66 |
- tstat.sd <- mean(mat[, "tstat.sd"]) |
|
67 |
- c(meanDiff, group1.mean, group2.mean, tstat.sd) |
|
68 |
- } |
|
69 |
- stats_ttest<- matrix(NA, ncol = 4, nrow = length(regions)) |
|
70 |
- colnames(stats_ttest) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd") |
|
71 |
- tmp <- lapply(ov.sp, getRegionStats_ttest) |
|
72 |
- stats_ttest[as.integer(names(tmp)),] <- do.call(rbind, tmp) |
|
73 |
- out <- cbind(out, as.data.frame(stats_ttest)) |
|
74 |
+ stats_ttest <- .getRegionStats_ttest(stats = BSseqTstat@stats, |
|
75 |
+ hits = hits) |
|
76 |
+ out <- cbind(out, stats_ttest) |
|
74 | 77 |
out |