Browse code

Fix same performance regression in getCoverage() as in getMeth()

Peter Hickey authored on 03/07/2017 02:26:39
Showing 3 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: bsseq
2
-Version: 1.13.3
2
+Version: 1.13.4
3 3
 Encoding: UTF-8
4 4
 Title: Analyze, manage and store bisulfite sequencing data
5 5
 Description: A collection of tools for analyzing and visualizing bisulfite
... ...
@@ -16,7 +16,8 @@ importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
16 16
            "mtext", "par", "plot", "points", "polygon", "rect", "rug", "text")
17 17
 import(parallel)
18 18
 importFrom(locfit, "locfit", "lp")
19
-importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins", "colMeans2")
19
+importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins", "colMeans2",
20
+           "colSums2")
20 21
 import(IRanges)
21 22
 import(GenomicRanges)
22 23
 import(SummarizedExperiment)
... ...
@@ -144,69 +144,56 @@ getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
144 144
     }
145 145
 }
146 146
 
147
+# TODO: getCoverage() realises the result in memory iff regions is not NULL;
148
+#       discuss with Kasper
149
+# TODO: Whether or not colnames are added to returned value depends on whether
150
+#       regions is non-NULL; discuss with Kasper
147 151
 getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
148
-                        what = c("perBase", "perRegionAverage", "perRegionTotal")) {
152
+                        what = c("perBase", "perRegionAverage",
153
+                                 "perRegionTotal")) {
149 154
     stopifnot(is(BSseq, "BSseq"))
150 155
     type <- match.arg(type)
151 156
     what <- match.arg(what)
152
-    if(is.null(regions)) {
153
-        if(what == "perBase")
157
+    if (is.null(regions)) {
158
+        if (what == "perBase") {
154 159
             return(getBSseq(BSseq, type = type))
155
-        if(what == "perRegionTotal")
160
+        }
161
+        if (what == "perRegionTotal") {
156 162
             return(colSums(getBSseq(BSseq, type = type)))
157
-        if(what == "perRegionAverage")
163
+        }
164
+        if (what == "perRegionAverage") {
158 165
             return(colMeans(getBSseq(BSseq, type = type)))
166
+        }
159 167
     }
160
-    if(class(regions) == "data.frame")
168
+    if (class(regions) == "data.frame") {
161 169
         regions <- data.frame2GRanges(regions)
170
+    }
162 171
     stopifnot(is(regions, "GenomicRanges"))
163 172
     grBSseq <- granges(BSseq)
164
-    mm <- as.matrix(findOverlaps(regions, grBSseq))
165
-    mmsplit <- split(mm[,2], mm[,1])
166
-    if(what == "perBase") {
167
-        if(type == "Cov") {
168
-            out <- lapply(mmsplit, function(xx) {
169
-                getBSseq(BSseq, "Cov")[xx,,drop = FALSE]
170
-            })
171
-        }
172
-        if(type == "M") {
173
-            out <- lapply(mmsplit, function(xx) {
174
-                getBSseq(BSseq, "M")[xx,,drop = FALSE]
175
-            })
176
-        }
173
+    ov <- findOverlaps(grBSseq, regions)
174
+    if (type == "Cov") {
175
+        coverage <- getBSseq(BSseq, "Cov")[queryHits(ov), , drop = FALSE]
176
+    } else if (type == "M") {
177
+        coverage <- getBSseq(BSseq, "M")[queryHits(ov), , drop = FALSE]
178
+    }
179
+    out <- lapply(split(coverage, subjectHits(ov)), matrix,
180
+                  ncol = ncol(coverage))
181
+
182
+    if (what == "perBase") {
183
+        # TODO: Don't really understand the logic of the remaining code; how
184
+        #       could the results end up in the wrong order wrt to regions?
177 185
         outList <- vector("list", length(regions))
178
-        outList[as.integer(names(mmsplit))] <- out
186
+        outList[as.integer(names(out))] <- out
179 187
         return(outList)
188
+    } else if (what == "perRegionAverage") {
189
+        out <- do.call(rbind, lapply(out, colMeans2, na.rm = TRUE))
190
+    } else if (what == "perRegionTotal") {
191
+        out <- do.call(rbind, lapply(out, colSums2, na.rm = TRUE))
180 192
     }
181
-    if(what == "perRegionAverage") {
182
-        if(type == "Cov") {
183
-            out <- lapply(mmsplit, function(xx) {
184
-                colMeans(getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE)
185
-            })
186
-        }
187
-        if(type == "M") {
188
-            out <- lapply(mmsplit, function(xx) {
189
-                colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE], na.rm = TRUE)
190
-            })
191
-        }
192
-    }
193
-    if(what == "perRegionTotal") {
194
-        if(type == "Cov") {
195
-            out <- lapply(mmsplit, function(xx) {
196
-                colSums(getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE)
197
-            })
198
-        }
199
-        if(type == "M") {
200
-            out <- lapply(mmsplit, function(xx) {
201
-                colSums(getBSseq(BSseq, "M")[xx,,drop = FALSE], na.rm = TRUE)
202
-            })
203
-        }
204
-    }
205
-    out <- do.call(rbind, out)
193
+    # TODO: Don't really understand the logic of the remaining code; how
194
+    #       could the rows end up in the wrong order?
206 195
     outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
207 196
     colnames(outMatrix) <- sampleNames(BSseq)
208 197
     outMatrix[as.integer(rownames(out)),] <- out
209 198
     .DelayedMatrix(outMatrix)
210 199
 }
211
-
212
-