Rather than 'split the overlaps' and then load in chunks of the data [as in the previous implementation of getMeth()], we load the methylation data in the regions into memory and split in memory. This is **much** more efficient but with the cost of loading more data into memory.
... | ... |
@@ -16,7 +16,7 @@ importFrom(graphics, "abline", "axis", "layout", "legend", "lines", |
16 | 16 |
"mtext", "par", "plot", "points", "polygon", "rect", "rug", "text") |
17 | 17 |
import(parallel) |
18 | 18 |
importFrom(locfit, "locfit", "lp") |
19 |
-importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins") |
|
19 |
+importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins", "colMeans2") |
|
20 | 20 |
import(IRanges) |
21 | 21 |
import(GenomicRanges) |
22 | 22 |
import(SummarizedExperiment) |
... | ... |
@@ -55,86 +55,97 @@ orderBSseq <- function(BSseq, seqOrder = NULL) { |
55 | 55 |
} |
56 | 56 |
|
57 | 57 |
|
58 |
+# TODO: getMeth() realises the result in memory iff regions is not NULL; |
|
59 |
+# discuss with Kasper |
|
60 |
+# TODO: Whether or not colnames are added to returned value depends on whether |
|
61 |
+# regions is non-NULL; discuss with Kasper |
|
62 |
+# TODO: Add parallel support |
|
58 | 63 |
getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
59 |
- what = c("perBase", "perRegion"), confint = FALSE, alpha = 0.95) { |
|
64 |
+ what = c("perBase", "perRegion"), confint = FALSE, |
|
65 |
+ alpha = 0.95) { |
|
60 | 66 |
p.conf <- function(p, n, alpha) { |
61 | 67 |
z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1)) |
62 |
- upper <- (p + z^2/(2*n) + z*sqrt( (p*(1-p) + z^2/(4*n)) / n)) / |
|
63 |
- (1+z^2/n) |
|
64 |
- lower <- (p + z^2/(2*n) - z*sqrt( (p*(1-p) + z^2/(4*n)) / n)) / |
|
65 |
- (1+z^2/n) |
|
68 |
+ upper <- (p + z ^ 2 / (2 * n) + |
|
69 |
+ z * sqrt((p * (1 - p) + z ^ 2 / (4 * n)) / n)) / |
|
70 |
+ (1 + z ^ 2 / n) |
|
71 |
+ lower <- (p + z ^ 2 / (2 * n) - |
|
72 |
+ z * sqrt((p * (1 - p) + z ^ 2 / (4 * n)) / n)) / |
|
73 |
+ (1 + z ^ 2 / n) |
|
66 | 74 |
return(list(meth = p, lower = lower, upper = upper)) |
67 | 75 |
} |
76 |
+ |
|
68 | 77 |
stopifnot(is(BSseq, "BSseq")) |
69 | 78 |
type <- match.arg(type) |
70 |
- if(type == "smooth" & !hasBeenSmoothed(BSseq)) |
|
79 |
+ if (type == "smooth" & !hasBeenSmoothed(BSseq)) { |
|
71 | 80 |
stop("'type=smooth' requires the object to have been smoothed.") |
81 |
+ } |
|
72 | 82 |
what <- match.arg(what) |
83 |
+ if (what == "perRegion" & is.null(regions)) { |
|
84 |
+ stop("'what=perRegion' but no 'regions' supplied") |
|
85 |
+ } |
|
73 | 86 |
z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1)) |
74 |
- if(is.null(regions) && type == "smooth") { |
|
75 |
- meth <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef")) |
|
76 |
- if(confint) { |
|
77 |
- upper <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") + |
|
78 |
- z * getBSseq(BSseq, type = "se.coef")) |
|
79 |
- lower <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") - |
|
80 |
- z * getBSseq(BSseq, type = "se.coef")) |
|
87 |
+ if (is.null(regions) && type == "smooth") { |
|
88 |
+ coef <- getBSseq(BSseq, type = "coef") |
|
89 |
+ meth <- getBSseq(BSseq, type = "trans")(coef) |
|
90 |
+ if (confint) { |
|
91 |
+ upper <- meth + z * getBSseq(BSseq, type = "se.coef") |
|
92 |
+ lower <- meth - z * getBSseq(BSseq, type = "se.coef") |
|
81 | 93 |
return(list(meth = meth, lower = lower, upper = upper)) |
82 | 94 |
} else { |
83 | 95 |
return(meth) |
84 | 96 |
} |
85 | 97 |
} |
86 |
- if(is.null(regions) && type == "raw") { |
|
98 |
+ if (is.null(regions) && type == "raw") { |
|
87 | 99 |
meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov") |
88 |
- if(confint) { |
|
89 |
- return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"), alpha = alpha)) |
|
100 |
+ if (confint) { |
|
101 |
+ return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"), |
|
102 |
+ alpha = alpha)) |
|
90 | 103 |
} else { |
91 | 104 |
return(meth) |
92 | 105 |
} |
93 | 106 |
} |
107 |
+ |
|
94 | 108 |
## At this point, regions have been specified |
95 |
- if(class(regions) == "data.frame") |
|
109 |
+ if (class(regions) == "data.frame") { |
|
96 | 110 |
regions <- data.frame2GRanges(regions) |
111 |
+ } |
|
97 | 112 |
stopifnot(is(regions, "GenomicRanges")) |
98 |
- if(confint) stop("'confint = TRUE' is not supported by 'getMeth' when regions is given") |
|
113 |
+ if (confint) { |
|
114 |
+ stop("'confint = TRUE' is not supported by 'getMeth' when regions is given") |
|
115 |
+ } |
|
99 | 116 |
grBSseq <- granges(BSseq) |
100 |
- mm <- as.matrix(findOverlaps(regions, grBSseq)) |
|
101 |
- mmsplit <- split(mm[,2], mm[,1]) |
|
102 |
- if(what == "perBase") { |
|
103 |
- if(type == "smooth") { |
|
104 |
- out <- lapply(mmsplit, function(xx) { |
|
105 |
- getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE]) |
|
106 |
- }) |
|
107 |
- } |
|
108 |
- if(type == "raw") { |
|
109 |
- out <- lapply(mmsplit, function(xx) { |
|
110 |
- getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE] |
|
111 |
- }) |
|
112 |
- } |
|
117 |
+ ov <- findOverlaps(grBSseq, regions) |
|
118 |
+ # NOTE: This realises a large object in memory (`meth`) - could do it in |
|
119 |
+ # chunks if what = perRegion |
|
120 |
+ if (type == "smooth") { |
|
121 |
+ meth <- as.matrix( |
|
122 |
+ getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef"))[queryHits(ov), , |
|
123 |
+ drop = FALSE]) |
|
124 |
+ } else if (type == "raw") { |
|
125 |
+ meth <- as.matrix( |
|
126 |
+ (getBSseq(BSseq, "M") / getBSseq(BSseq, "Cov"))[queryHits(ov), , |
|
127 |
+ drop = FALSE]) |
|
128 |
+ } |
|
129 |
+ out <- lapply(split(meth, subjectHits(ov)), matrix, ncol = ncol(meth)) |
|
130 |
+ if (what == "perBase") { |
|
131 |
+ # TODO: Don't really understand the logic of the remaining code; how |
|
132 |
+ # could the results end up in the wrong order wrt to regions? |
|
113 | 133 |
outList <- vector("list", length(regions)) |
114 |
- outList[as.integer(names(mmsplit))] <- out |
|
134 |
+ outList[as.integer(names(out))] <- out |
|
115 | 135 |
return(outList) |
116 |
- } |
|
117 |
- if(what == "perRegion") { |
|
118 |
- if(type == "smooth") { |
|
119 |
- out <- lapply(mmsplit, function(xx) { |
|
120 |
- colMeans(getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE]), na.rm = TRUE) |
|
121 |
- }) |
|
122 |
- } |
|
123 |
- if(type == "raw") { |
|
124 |
- out <- lapply(mmsplit, function(xx) { |
|
125 |
- colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE) |
|
126 |
- }) |
|
127 |
- } |
|
128 |
- out <- do.call(rbind, out) |
|
136 |
+ } else if (what == "perRegion") { |
|
137 |
+ out <- do.call(rbind, lapply(out, colMeans2, na.rm = TRUE)) |
|
138 |
+ # TODO: Don't really understand the logic of the remaining code; how |
|
139 |
+ # could the rows end up in the wrong order? |
|
129 | 140 |
outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions)) |
130 | 141 |
colnames(outMatrix) <- sampleNames(BSseq) |
131 |
- outMatrix[as.integer(rownames(out)),] <- out |
|
142 |
+ outMatrix[as.integer(rownames(out)), ] <- out |
|
132 | 143 |
.DelayedMatrix(outMatrix) |
133 | 144 |
} |
134 | 145 |
} |
135 | 146 |
|
136 | 147 |
getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"), |
137 |
- what = c("perBase", "perRegionAverage", "perRegionTotal")) { |
|
148 |
+ what = c("perBase", "perRegionAverage", "perRegionTotal")) { |
|
138 | 149 |
stopifnot(is(BSseq, "BSseq")) |
139 | 150 |
type <- match.arg(type) |
140 | 151 |
what <- match.arg(what) |