Browse code

Speed up .collapseMatrixLike() and, in turn, BSseq()

- Addressed #71, thanks @scottgigante
- Removes undocumented behaviour of BSseq() sorting the output. In previous versions, this was done via `reduce(gr)`; see https://github.com/hansenlab/bsseq/blob/RELEASE_3_7/R/BSseq_class.R#L176-L223. In the refactor branch I had changed it to use order(gr). Now, it is completely removed.

Peter Hickey authored on 20/06/2018 16:19:24
Showing 5 changed files

... ...
@@ -69,10 +69,12 @@ setValidity2("BSseq", function(object) {
69 69
 
70 70
 # Exported functions -----------------------------------------------------------
71 71
 
72
+# TODO: Document that if duplicate loci are detected then the output `M` and
73
+#       `Cov` will be numeric even if input `M` and `Cov` were integer.
72 74
 # TODO: BSseq() is arguably a bad constructor. It doesn't return a valid BSseq
73 75
 #       object when called without any arguments. It also does some pretty
74
-#       complicated parsing of the inputs. Still, I think we're stuck with it
75
-#       because it's been around for a long time.
76
+#       complicated parsing of the inputs. But we're stuck with it because it's
77
+#       been around for a long time.
76 78
 BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
77 79
                   trans = NULL, parameters = NULL, pData = NULL,
78 80
                   gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL,
... ...
@@ -171,27 +173,15 @@ BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
171 173
         if (!is.null(coef) || !is.null(se.coef)) {
172 174
             stop("Cannot collapse when 'coef' or 'se.coef' are present.")
173 175
         }
174
-        # NOTE: reduce() sorts the output
175
-        # TODO: Clarify above comment.
176 176
         unique_gr <- unique(gr)
177
-        # TODO: type = "equal"?
178
-        ol <- findOverlaps(unique_gr, gr)
179
-        gr <- unique(gr)
177
+        ol <- findOverlaps(unique_gr, gr, type = "equal")
178
+        gr <- unique_gr
179
+        # TODO: Can we avoid making `idx`?
180 180
         idx <- as.list(ol)
181 181
         M <- .collapseMatrixLike(M, idx, MARGIN = 2L)
182 182
         Cov <- .collapseMatrixLike(Cov, idx, MARGIN = 2L)
183 183
     }
184 184
 
185
-    # Sort loci
186
-    if (is.unsorted(gr)) {
187
-        o <- order(gr)
188
-        gr <- gr[o]
189
-        M <- M[o, , drop = FALSE]
190
-        Cov <- Cov[o, , drop = FALSE]
191
-        coef <- coef[o, , drop = FALSE]
192
-        se.coef <- se.coef[o, , drop = FALSE]
193
-    }
194
-
195 185
     # Construct BSseq object
196 186
     assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
197 187
     assays <- assays[!S4Vectors:::sapply_isNULL(assays)]
... ...
@@ -13,16 +13,42 @@ setGeneric(
13 13
 
14 14
 # Internal methods -------------------------------------------------------------
15 15
 
16
+# TODO: Re-write in C/C++ to avoid allocating `ans` multiple times
16 17
 setMethod(".collapseMatrixLike", "matrix", function(x, idx, MARGIN) {
17 18
     if (MARGIN == 1L) {
18
-        # TODO: Need colnames?
19
-        return(do.call(cbind, lapply(idx, function(j) rowSums2(x, cols = j))))
19
+        ans <- matrix(
20
+            # NOTE: rowSums2() always returns numeric
21
+            data = NA_real_,
22
+            nrow = nrow(x),
23
+            ncol = length(idx),
24
+            dimnames = list(rownames(x), NULL))
25
+        l <- lengths(idx)
26
+        stopifnot(min(l) > 0)
27
+        l1 <- which(l == 1)
28
+        ans[, l1] <- x[, unlist(idx[l1])]
29
+        lmore <- which(l != 1)
30
+        ans[, lmore] <- do.call(cbind, lapply(idx[lmore], function(j) {
31
+            rowSums2(x, cols = j)
32
+        }))
20 33
     } else if (MARGIN == 2L) {
21
-        # TODO: Need rownames?
22
-        do.call(rbind, lapply(idx, function(i) colSums2(x, rows = i)))
34
+        ans <- matrix(
35
+            # NOTE: colSums2() always returns numeric
36
+            data = NA_real_,
37
+            nrow = length(idx),
38
+            ncol = ncol(x),
39
+            dimnames = list(NULL, colnames(x)))
40
+        l <- lengths(idx)
41
+        stopifnot(min(l) > 0)
42
+        l1 <- which(l == 1)
43
+        ans[l1, ] <- x[unlist(idx[l1]), ]
44
+        lmore <- which(l != 1)
45
+        ans[lmore, ] <- do.call(rbind, lapply(idx[lmore], function(i) {
46
+            colSums2(x, rows = i)
47
+        }))
23 48
     } else {
24 49
         stop("'MARGIN' must be 1 or 2")
25 50
     }
51
+    ans
26 52
 })
27 53
 
28 54
 setMethod(
... ...
@@ -3,6 +3,13 @@
3 3
 \encoding{UTF-8}
4 4
 \section{Version 1.17.x}{
5 5
   \itemize{
6
+    \item{\code{BSseq()} will no longer reorder inputs. Previously, the
7
+    returned \emph{BSseq} object was ordered by ordering the loci, although
8
+    this behaviour was not documented. \code{BSseq()} may still filter out loci
9
+    if \code{rmZeroCov = FALSE} or collapse loci if
10
+    \code{strandCollapse = FALSE} or duplicate loci are detected, but the
11
+    relative order of loci in the output will match that of the input.
12
+    }
6 13
     \item{Fix bug with \code{maxGap} argument of \code{BSmooth()}. The bug
7 14
     meant that the 'maximum gap between two methylation loci' was incorrectly
8 15
     set to \code{2 * maxGap + 1} instead of \code{maxGap}. This likely did not
... ...
@@ -47,7 +47,7 @@ BSseq(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
47 47
 
48 48
   In case one or more methylation loci appears multiple times, the
49 49
   \code{M} and \code{Cov} matrices are summed over rows linked to the
50
-  same methylation loci.  See the example below.
50
+  same methylation loci. See the example below.
51 51
 
52 52
   Users should never have to specify \code{coef}, \code{se.coef},
53 53
   \code{trans}, and \code{parameters}, this is for internal use (they
... ...
@@ -29,14 +29,18 @@ test_that("BSseq", {
29 29
 
30 30
     BStest2 <- BSseq(pos = 3:1, chr = rep("chr1", 3), M = M[3:1, ],
31 31
                      Cov = M[3:1, ] + 2)
32
-    expect_equal(BStest, BStest2)
32
+    # NOTE: BSseq() does not reorder inputs
33
+    expect_true(!isTRUE(all.equal(BStest, BStest2)))
34
+    expect_equal(BStest, sort(BStest2))
33 35
     M2 <- rbind(M[3:1, ], c(1, 1, 1))
34 36
     Cov2 <- rbind(M[3:1, ] + 2, c(1, 1, 1))
35 37
     M2[3, ] <- M2[3, ] - 1
36 38
     Cov2[3, ] <- Cov2[3, ] - 1
37 39
     BStest3 <- suppressWarnings(
38 40
         BSseq(pos = c(3:1,1), chr = rep("chr1", 4), M = M2, Cov = Cov2))
39
-    expect_equal(BStest, BStest3)
41
+    # NOTE: BSseq() does not reorder inputs
42
+    expect_true(!isTRUE(all.equal(BStest, BStest3)))
43
+    expect_equal(BStest, sort(BStest3))
40 44
 })
41 45
 
42 46
 test_that("BSseq with HDF5Backend", {