Browse code

Fix performance regression in getMeth() [https://support.bioconductor.org/p/97611/]

Rather than 'split the overlaps' and then load in chunks of the data [as in the previous implementation of getMeth()], we load the methylation data in the regions into memory and split in memory. This is **much** more efficient but with the cost of loading more data into memory.

Peter Hickey authored on 03/07/2017 02:19:32
Showing 2 changed files

... ...
@@ -16,7 +16,7 @@ importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
16 16
            "mtext", "par", "plot", "points", "polygon", "rect", "rug", "text")
17 17
 import(parallel)
18 18
 importFrom(locfit, "locfit", "lp")
19
-importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins")
19
+importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins", "colMeans2")
20 20
 import(IRanges)
21 21
 import(GenomicRanges)
22 22
 import(SummarizedExperiment)
... ...
@@ -55,86 +55,97 @@ orderBSseq <- function(BSseq, seqOrder = NULL) {
55 55
 }
56 56
 
57 57
 
58
+# TODO: getMeth() realises the result in memory iff regions is not NULL;
59
+#       discuss with Kasper
60
+# TODO: Whether or not colnames are added to returned value depends on whether
61
+#       regions is non-NULL; discuss with Kasper
62
+# TODO: Add parallel support
58 63
 getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
59
-                    what = c("perBase", "perRegion"), confint = FALSE, alpha = 0.95) {
64
+                    what = c("perBase", "perRegion"), confint = FALSE,
65
+                    alpha = 0.95) {
60 66
     p.conf <- function(p, n, alpha) {
61 67
         z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
62
-        upper <- (p + z^2/(2*n) + z*sqrt(  (p*(1-p) + z^2/(4*n)) / n)) /
63
-            (1+z^2/n)
64
-        lower <- (p + z^2/(2*n) - z*sqrt(  (p*(1-p) + z^2/(4*n)) / n)) /
65
-            (1+z^2/n)
68
+        upper <- (p + z ^ 2 / (2 * n) +
69
+                      z * sqrt((p * (1 - p) + z ^ 2 / (4 * n)) / n)) /
70
+            (1 + z ^ 2 / n)
71
+        lower <- (p + z ^ 2 / (2 * n) -
72
+                      z * sqrt((p * (1 - p) + z ^ 2 / (4 * n)) / n)) /
73
+            (1 + z ^ 2 / n)
66 74
         return(list(meth = p, lower = lower, upper = upper))
67 75
     }
76
+
68 77
     stopifnot(is(BSseq, "BSseq"))
69 78
     type <- match.arg(type)
70
-    if(type == "smooth" & !hasBeenSmoothed(BSseq))
79
+    if (type == "smooth" & !hasBeenSmoothed(BSseq)) {
71 80
         stop("'type=smooth' requires the object to have been smoothed.")
81
+    }
72 82
     what <- match.arg(what)
83
+    if (what == "perRegion" & is.null(regions)) {
84
+        stop("'what=perRegion' but no 'regions' supplied")
85
+    }
73 86
     z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
74
-    if(is.null(regions) && type == "smooth") {
75
-        meth <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef"))
76
-        if(confint) {
77
-            upper <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") +
78
-                                      z * getBSseq(BSseq, type = "se.coef"))
79
-            lower <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") -
80
-                                      z * getBSseq(BSseq, type = "se.coef"))
87
+    if (is.null(regions) && type == "smooth") {
88
+        coef <- getBSseq(BSseq, type = "coef")
89
+        meth <- getBSseq(BSseq, type = "trans")(coef)
90
+        if (confint) {
91
+            upper <- meth + z * getBSseq(BSseq, type = "se.coef")
92
+            lower <- meth - z * getBSseq(BSseq, type = "se.coef")
81 93
             return(list(meth = meth, lower = lower, upper = upper))
82 94
         } else {
83 95
             return(meth)
84 96
         }
85 97
     }
86
-    if(is.null(regions) && type == "raw") {
98
+    if (is.null(regions) && type == "raw") {
87 99
         meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov")
88
-        if(confint) {
89
-            return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"), alpha = alpha))
100
+        if (confint) {
101
+            return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"),
102
+                          alpha = alpha))
90 103
         } else {
91 104
             return(meth)
92 105
         }
93 106
     }
107
+
94 108
     ## At this point, regions have been specified
95
-    if(class(regions) == "data.frame")
109
+    if (class(regions) == "data.frame") {
96 110
         regions <- data.frame2GRanges(regions)
111
+    }
97 112
     stopifnot(is(regions, "GenomicRanges"))
98
-    if(confint) stop("'confint = TRUE' is not supported by 'getMeth' when regions is given")
113
+    if (confint) {
114
+        stop("'confint = TRUE' is not supported by 'getMeth' when regions is given")
115
+    }
99 116
     grBSseq <- granges(BSseq)
100
-    mm <- as.matrix(findOverlaps(regions, grBSseq))
101
-    mmsplit <- split(mm[,2], mm[,1])
102
-    if(what == "perBase") {
103
-        if(type == "smooth") {
104
-            out <- lapply(mmsplit, function(xx) {
105
-                getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE])
106
-            })
107
-        }
108
-        if(type == "raw") {
109
-            out <- lapply(mmsplit, function(xx) {
110
-                getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE]
111
-            })
112
-        }
117
+    ov <- findOverlaps(grBSseq, regions)
118
+    # NOTE: This realises a large object in memory (`meth`) - could do it in
119
+    #       chunks if what = perRegion
120
+    if (type == "smooth") {
121
+        meth <- as.matrix(
122
+            getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef"))[queryHits(ov), ,
123
+                                                              drop = FALSE])
124
+    } else if (type == "raw") {
125
+        meth <- as.matrix(
126
+            (getBSseq(BSseq, "M") / getBSseq(BSseq, "Cov"))[queryHits(ov), ,
127
+                                                            drop = FALSE])
128
+    }
129
+    out <- lapply(split(meth, subjectHits(ov)), matrix, ncol = ncol(meth))
130
+    if (what == "perBase") {
131
+        # TODO: Don't really understand the logic of the remaining code; how
132
+        #       could the results end up in the wrong order wrt to regions?
113 133
         outList <- vector("list", length(regions))
114
-        outList[as.integer(names(mmsplit))] <- out
134
+        outList[as.integer(names(out))] <- out
115 135
         return(outList)
116
-    }
117
-    if(what == "perRegion") {
118
-        if(type == "smooth") {
119
-            out <- lapply(mmsplit, function(xx) {
120
-                colMeans(getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE]), na.rm = TRUE)
121
-            })
122
-        }
123
-        if(type == "raw") {
124
-            out <- lapply(mmsplit, function(xx) {
125
-                colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE)
126
-            })
127
-        }
128
-        out <- do.call(rbind, out)
136
+    } else if (what == "perRegion") {
137
+        out <- do.call(rbind, lapply(out, colMeans2, na.rm = TRUE))
138
+        # TODO: Don't really understand the logic of the remaining code; how
139
+        #       could the rows end up in the wrong order?
129 140
         outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
130 141
         colnames(outMatrix) <- sampleNames(BSseq)
131
-        outMatrix[as.integer(rownames(out)),] <- out
142
+        outMatrix[as.integer(rownames(out)), ] <- out
132 143
         .DelayedMatrix(outMatrix)
133 144
     }
134 145
 }
135 146
 
136 147
 getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
137
-                    what = c("perBase", "perRegionAverage", "perRegionTotal")) {
148
+                        what = c("perBase", "perRegionAverage", "perRegionTotal")) {
138 149
     stopifnot(is(BSseq, "BSseq"))
139 150
     type <- match.arg(type)
140 151
     what <- match.arg(what)