Browse code

Simplify BSseq() and BSseq validity method, refactor strandCollapse()

Peter Hickey authored on 17/09/2018 11:07:27
Showing1 changed files

... ...
@@ -39,21 +39,17 @@ setValidity2("BSseq", function(object) {
39 39
     }
40 40
     msg <- validMsg(msg, .checkAssayNames(object, c("Cov", "M")))
41 41
 
42
-    # TODO: Are colnames strictly necessary?
43
-    if (is.null(colnames(object))) {
44
-        msg <- validMsg(msg, "colnames (aka sampleNames) need to be set")
45
-    }
46
-
47
-    assay_rownames <- lapply(assays(object), rownames)
48
-    if (!all(S4Vectors:::sapply_isNULL(assay_rownames))) {
49
-        msg <- validMsg(msg, "unnecessary rownames on one-or-more assays")
42
+    msg <- validMsg(
43
+        msg = msg,
44
+        result = .checkMandCov(
45
+            M = assay(object, "M", withDimnames = FALSE),
46
+            Cov = assay(object, "Cov", withDimnames = FALSE)))
47
+
48
+    if (is.null(msg)) {
49
+        TRUE
50
+    } else {
51
+        msg
50 52
     }
51
-
52
-    msg <- validMsg(msg,
53
-                    .checkMandCov(assay(object, "M", withDimnames = FALSE),
54
-                                  assay(object, "Cov", withDimnames = FALSE)))
55
-
56
-    if (is.null(msg)) TRUE else msg
57 53
 })
58 54
 
59 55
 # Internal functions -----------------------------------------------------------
... ...
@@ -69,135 +65,113 @@ setValidity2("BSseq", function(object) {
69 65
 
70 66
 # Exported functions -----------------------------------------------------------
71 67
 
72
-# TODO: Document that if duplicate loci are detected then the output `M` and
73
-#       `Cov` will be numeric even if input `M` and `Cov` were integer.
74 68
 # TODO: BSseq() is arguably a bad constructor. It doesn't return a valid BSseq
75 69
 #       object when called without any arguments. It also does some pretty
76 70
 #       complicated parsing of the inputs. But we're stuck with it because it's
77 71
 #       been around for a long time.
78 72
 BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
79
-                  trans = NULL, parameters = NULL, pData = NULL,
80
-                  gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL,
73
+                  trans = NULL, parameters = NULL, pData = NULL, gr = NULL,
74
+                  pos = NULL, chr = NULL, sampleNames = NULL,
81 75
                   rmZeroCov = FALSE) {
82 76
 
83
-    # Process 'gr', or 'pos' and 'chr'
84
-    if (is.null(gr)) {
85
-        if (is.null(pos) || is.null(chr)) {
86
-            stop("Need 'pos' and 'chr' if 'gr' not supplied.")
87
-        }
88
-        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1L))
77
+    # Argument checks ----------------------------------------------------------
78
+
79
+    # Process assays.
80
+    # NOTE: Nothing to do for 'coef', and 'se.coef'.
81
+    if (is.null(M) || is.null(Cov)) {
82
+        stop("Need 'M' and 'Cov'.")
89 83
     }
90
-    if (!is(gr, "GRanges")) stop("'gr' needs to be a GRanges.")
91
-    if (any(width(gr) != 1L)) stop("'gr' needs to have widths of 1.")
92
-
93
-    # Process 'M' and 'Cov'
94
-    if (is.null(M) || is.null(Cov)) stop("Need 'M' and 'Cov'.")
95
-    if (length(gr) != nrow(M) ||
96
-        length(gr) != nrow(Cov) ||
97
-        ncol(Cov) != ncol(M)) {
98
-        stop("'gr', 'M' and 'Cov' need to have compatible dimensions.")
84
+    # Process 'trans' and 'parameters'.
85
+    if (is.null(trans)) {
86
+        trans <- function(x) NULL
99 87
     }
100
-    if (!is.null(rownames(M))) rownames(M) <- NULL
101
-    if (!is.null(rownames(Cov))) rownames(Cov) <- NULL
102
-    if (!is.null(names(gr))) names(gr) <- NULL
103
-
104
-    # Process 'sampleNames' and 'pData'
105
-    if (is.null(pData)) {
106
-        if (is.null(sampleNames)) {
107
-            if (!is.null(colnames(M))) {
108
-                sampleNames <- colnames(M)
109
-            } else if (!is.null(colnames(Cov))) {
110
-                sampleNames <- colnames(Cov)
88
+    if (is.null(parameters)) {
89
+        parameters <- list()
90
+    }
91
+    # Process 'sampleNames' and 'pData'.
92
+    if (is.null(sampleNames)) {
93
+        if (is.null(pData)) {
94
+            # BSseq object will have no colnames.
95
+            pData <- S4Vectors:::new_DataFrame(nrows = ncol(M))
96
+        } else {
97
+            # BSseq object will have 'sampleNames' as colnames.
98
+            pData <- DataFrame(row.names = sampleNames)
99
+        }
100
+    } else {
101
+        if (is.null(pData)) {
102
+            # BSseq object will have 'sampleNames' as colnames.
103
+            pData <- DataFrame(row.names = sampleNames)
104
+        } else {
105
+            if (is.null(rownames(pData))) {
106
+                rownames(pData) <- sampleNames
111 107
             } else {
112
-                sampleNames <- paste0("V", seq_len(ncol(M)))
108
+                stopifnot(identical(rownames(pData), sampleNames))
113 109
             }
114 110
         }
115
-        pData <- DataFrame(row.names = sampleNames)
116
-    } else {
117
-        pData <- as(pData, "DataFrame")
118 111
     }
119
-    if (is.null(sampleNames) && !is.null(pData) && !is.null(rownames(pData))) {
120
-        sampleNames <- rownames(pData)
112
+    # Process 'gr', 'pos', and 'chr'.
113
+    if (is.null(gr)) {
114
+        if (is.null(pos) || is.null(chr)) {
115
+            stop("Need 'pos' and 'chr' if 'gr' not supplied.")
116
+        }
117
+        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1L))
121 118
     }
122
-    if (length(unique(sampleNames)) != ncol(M)) {
123
-        stop("sampleNames need to be unique and of the right length.")
119
+    if (!is(gr, "GRanges")) {
120
+        stop("'gr' needs to be a GRanges.")
124 121
     }
122
+    # Process 'rmZeroCov'.
123
+    stopifnot(isTRUEorFALSE(rmZeroCov))
125 124
 
126
-    # TODO: Is this really necessary? It complicates things because HDF5Matrix
127
-    #       will be degraded to a DelayedMatrix
128
-    # Add column names to assays
129
-    if (is.null(colnames(M)) || any(sampleNames != colnames(M))) {
130
-        colnames(M) <- sampleNames
131
-    }
132
-    if (is.null(colnames(Cov)) || any(sampleNames != colnames(Cov))) {
133
-        colnames(Cov) <- sampleNames
134
-    }
135
-    if (!is.null(coef)) {
136
-        if (nrow(coef) != nrow(M) || ncol(coef) != ncol(M)) {
137
-            stop("'coef' does not have the right dimensions")
138
-        }
139
-        if (is.null(colnames(coef)) || any(sampleNames != colnames(coef))) {
140
-            colnames(coef) <- sampleNames
141
-        }
142
-        if (!is.null(rownames(coef))) {
143
-            rownames(coef) <- NULL
144
-        }
145
-    }
146
-    if (!is.null(se.coef)) {
147
-        if (nrow(se.coef) != nrow(M) || ncol(se.coef) != ncol(M)) {
148
-            stop("'se.coef' does not have the right dimensions")
149
-        }
150
-        if (is.null(colnames(se.coef)) ||
151
-            any(sampleNames != colnames(se.coef))) {
152
-            colnames(se.coef) <- sampleNames
153
-        }
154
-        if (!is.null(rownames(se.coef))) {
155
-            rownames(se.coef) <- NULL
125
+    # Collapse duplicate loci --------------------------------------------------
126
+
127
+    is_duplicated <- duplicated(gr)
128
+    if (any(is_duplicated)) {
129
+        warning("Detected duplicate loci. Collapsing counts in 'M' and 'Cov' ",
130
+                "at these positions.")
131
+        if (!is.null(coef) || !is.null(se.coef)) {
132
+            stop("Cannot collapse when 'coef' or 'se.coef' are non-NULL.")
156 133
         }
134
+        loci <- gr[!is_duplicated]
135
+        ol <- findOverlaps(gr, loci, type = "equal")
136
+        M <- rowsum(x = M, group = subjectHits(ol), reorder = FALSE)
137
+        rownames(M) <- NULL
138
+        Cov <- rowsum(x = Cov, group = subjectHits(ol), reorder = FALSE)
139
+        rownames(Cov) <- NULL
140
+    } else {
141
+        loci <- gr
157 142
     }
158 143
 
159
-    # Optionally, remove positions with Cov == 0
144
+    # Optionally, remove positions with zero coverage --------------------------
145
+
160 146
     if (rmZeroCov) {
161 147
         loci_with_zero_cov <- rowAlls(Cov, value = 0)
162 148
         if (any(loci_with_zero_cov)) {
163
-            gr <- gr[!loci_with_zero_cov]
164
-            M <- M[!loci_with_zero_cov, , drop = FALSE]
165
-            Cov <- Cov[!loci_with_zero_cov, , drop = FALSE]
149
+            loci_with_nonzero_cov <- !loci_with_zero_cov
150
+            gr <- gr[loci_with_nonzero_cov]
151
+            M <- M[loci_with_nonzero_cov, , drop = FALSE]
152
+            Cov <- Cov[loci_with_nonzero_cov, , drop = FALSE]
153
+            coef <- coef[loci_with_nonzero_cov, , drop = FALSE]
154
+            se.coef <- se.coef[loci_with_nonzero_cov, , drop = FALSE]
166 155
         }
167 156
     }
168 157
 
169
-    # Collapse duplicate loci
170
-    if (any(duplicated(gr))) {
171
-        warning("Detected duplicate loci. Collapsing counts in 'M' and 'Cov' ",
172
-                "at these positions.")
173
-        if (!is.null(coef) || !is.null(se.coef)) {
174
-            stop("Cannot collapse when 'coef' or 'se.coef' are present.")
175
-        }
176
-        unique_gr <- unique(gr)
177
-        ol <- findOverlaps(unique_gr, gr, type = "equal")
178
-        gr <- unique_gr
179
-        # TODO: Can we avoid making `idx`?
180
-        idx <- as.list(ol)
181
-        M <- .collapseMatrixLike(M, idx, MARGIN = 2L)
182
-        Cov <- .collapseMatrixLike(Cov, idx, MARGIN = 2L)
183
-    }
158
+    # Construct BSseq object ---------------------------------------------------
184 159
 
185
-    # Construct BSseq object
186 160
     assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
187 161
     assays <- assays[!S4Vectors:::sapply_isNULL(assays)]
188
-    se <- SummarizedExperiment(assays = assays, rowRanges = gr, colData = pData)
189
-    if (is.null(parameters)) {
190
-        parameters <- list()
191
-    }
192
-    if (is.null(trans)) {
193
-        trans <- function(x) NULL
194
-    }
162
+    se <- SummarizedExperiment(
163
+        assays = assays,
164
+        rowRanges = loci,
165
+        colData = pData)
195 166
     .BSseq(se, trans = trans, parameters = parameters)
196 167
 }
197 168
 
198
-hasBeenSmoothed <- function(BSseq) "coef" %in% assayNames(BSseq)
169
+# Move to BSseq-utils?
170
+hasBeenSmoothed <- function(BSseq) {
171
+    "coef" %in% assayNames(BSseq)
172
+}
199 173
 
200
-# TODO: Document withDimnames
174
+# Move to BSseq-utils?
201 175
 getBSseq <- function(BSseq,
202 176
                      type = c("Cov", "M", "gr", "coef", "se.coef", "trans",
203 177
                               "parameters"),
... ...
@@ -206,33 +180,135 @@ getBSseq <- function(BSseq,
206 180
     if (type %in% c("M", "Cov")) {
207 181
         return(assay(BSseq, type, withDimnames = withDimnames))
208 182
     }
209
-    if (type %in% c("coef", "se.coef") && type %in% assayNames(BSseq)) {
210
-        return(assay(BSseq, type, withDimnames = withDimnames))
183
+    if (type %in% c("coef", "se.coef")) {
184
+        if (type %in% assayNames(BSseq)) {
185
+            return(assay(BSseq, type, withDimnames = withDimnames))
186
+        } else {
187
+            return(NULL)
188
+        }
189
+    }
190
+    if (type == "trans") {
191
+        return(BSseq@trans)
192
+    }
193
+    if (type == "parameters") {
194
+        return(BSseq@parameters)
195
+    }
196
+    if (type == "gr") {
197
+        return(BSseq@rowRanges)
211 198
     }
212
-    if (type %in% c("coef", "se.coef")) return(NULL)
213
-    if (type == "trans") return(BSseq@trans)
214
-    if (type == "parameters") return(BSseq@parameters)
215
-    if (type == "gr") return(BSseq@rowRanges)
216
-
217 199
 }
218 200
 
219
-strandCollapse <- function(BSseq, shift = TRUE) {
201
+# Move to BSseq-utils?
202
+strandCollapse <- function(BSseq, shift = TRUE, BPPARAM = bpparam(),
203
+                           BACKEND = getRealizationBackend(),
204
+                           dir = tempfile("BSseq"), replace = FALSE,
205
+                           chunkdim = NULL, level = NULL,
206
+                           type = c("double", "integer")) {
207
+
208
+    # Argument checks ----------------------------------------------------------
209
+
220 210
     if (all(runValue(strand(BSseq)) == "*")) {
221
-        warning("All loci are unstranded; nothing to collapse")
211
+        warning("All loci are unstranded, nothing to collapse.", call. = FALSE)
222 212
         return(BSseq)
223 213
     }
224 214
     if (!(all(runValue(strand(BSseq)) %in% c("+", "-")))) {
225 215
         stop("'BSseq' object has a mix of stranded and unstranded loci.")
226 216
     }
227
-    BS.forward <- BSseq[strand(BSseq) == "+"]
228
-    strand(BS.forward) <- "*"
229
-    BS.reverse <- BSseq[strand(BSseq) == "-"]
230
-    strand(BS.reverse) <- "*"
231
-    if (shift) rowRanges(BS.reverse) <- shift(rowRanges(BS.reverse), -1L)
232
-    sampleNames(BS.reverse) <- paste0(sampleNames(BS.reverse), "_REVERSE")
233
-    BS.comb <- combine(BS.forward, BS.reverse)
234
-    newBSseq <- collapseBSseq(BS.comb, columns = rep(sampleNames(BSseq), 2))
235
-    newBSseq
217
+    # Register 'BACKEND' and return to current value on exit.
218
+    # TODO: Is this strictly necessary?
219
+    current_BACKEND <- getRealizationBackend()
220
+    on.exit(setRealizationBackend(current_BACKEND), add = TRUE)
221
+    setRealizationBackend(BACKEND)
222
+    # Check compatability of 'BPPARAM' with 'BACKEND'.
223
+    if (!.areBackendsInMemory(BACKEND)) {
224
+        if (!.isSingleMachineBackend(BPPARAM)) {
225
+            stop("The parallelisation strategy must use a single machine ",
226
+                 "when using an on-disk realization backend.\n",
227
+                 "See help(\"read.bismark\") for details.",
228
+                 call. = FALSE)
229
+        }
230
+    } else {
231
+        if (!is.null(BACKEND)) {
232
+            # NOTE: Currently do not support any in-memory realization
233
+            #       backends. If the realization backend is NULL then an
234
+            #       ordinary matrix is returned rather than a matrix-backed
235
+            #       DelayedMatrix.
236
+            stop("The '", BACKEND, "' realization backend is not supported.",
237
+                 "\n  See help(\"read.bismark\") for details.",
238
+                 call. = FALSE)
239
+        }
240
+    }
241
+    # If using HDF5Array as BACKEND, check remaining options are sensible.
242
+    if (identical(BACKEND, "HDF5Array")) {
243
+        # NOTE: Most of this copied from
244
+        #       HDF5Array::saveHDF5SummarizedExperiment().
245
+        if (!isSingleString(dir)) {
246
+            stop(wmsg("'dir' must be a single string specifying the path to ",
247
+                      "the directory where to save the BSseq object (the ",
248
+                      "directory will be created)."))
249
+        }
250
+        if (!isTRUEorFALSE(replace)) {
251
+            stop("'replace' must be TRUE or FALSE")
252
+        }
253
+        HDF5Array:::.create_dir(dir = dir, replace = replace)
254
+        h5_path <- file.path(dir, "assays.h5")
255
+    } else if (identical(BACKEND, NULL)) {
256
+        h5_path <- NULL
257
+    }
258
+
259
+    # Collapse loci ------------------------------------------------------------
260
+
261
+    loci <- rowRanges(BSseq)
262
+    if (shift) {
263
+        loci <- shift(loci, shift = as.integer(-1L * (strand(loci) == "-")))
264
+    }
265
+    collapsed_loci <- reduce(loci, min.gapwidth = 0L, ignore.strand = TRUE)
266
+
267
+    # Collapse 'M' and 'Cov' matrices ------------------------------------------
268
+
269
+    ol <- findOverlaps(loci, collapsed_loci, type = "equal")
270
+    group <- subjectHits(ol)
271
+    M <- rowsum(
272
+        x = assay(BSseq, "M", withDimnames = FALSE),
273
+        group = group,
274
+        # NOTE: reorder = TRUE to ensure same row-order as collapsed_loci.
275
+        reorder = TRUE,
276
+        BPPARAM = BPPARAM,
277
+        filepath = h5_path,
278
+        name = "Cov",
279
+        chunkdim = chunkdim,
280
+        level = level,
281
+        type = type)
282
+    Cov <- rowsum(
283
+        x = assay(BSseq, "Cov", withDimnames = FALSE),
284
+        group = group,
285
+        # NOTE: reorder = TRUE to ensure same row-order as collapsed_loci.
286
+        reorder = TRUE,
287
+        BPPARAM = BPPARAM,
288
+        filepath = h5_path,
289
+        name = "Cov",
290
+        chunkdim = chunkdim,
291
+        level = level,
292
+        type = type)
293
+
294
+    # Construct BSseq object, saving it if it is HDF5-backed -------------------
295
+
296
+    se <- SummarizedExperiment(
297
+        assays = SimpleList(M = unname(M), Cov = unname(Cov)),
298
+        rowRanges = collapsed_loci,
299
+        colData = colData(BSseq))
300
+    # TODO: Is there a way to use the internal constructor with `check = FALSE`?
301
+    #       Assuming input was valid, the output is valid, too.
302
+    # .BSseq(se, trans = function(x) NULL, parameters = list())
303
+    bsseq <- new2("BSseq", se, check = FALSE)
304
+    if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
305
+        # NOTE: Save BSseq object; mimicing
306
+        #       HDF5Array::saveHDF5SummarizedExperiment().
307
+        x <- bsseq
308
+        x@assays <- HDF5Array:::.shorten_h5_paths(x@assays)
309
+        saveRDS(x, file = file.path(dir, "se.rds"))
310
+    }
311
+    bsseq
236 312
 }
237 313
 
238 314
 # Exported methods -------------------------------------------------------------
... ...
@@ -290,7 +366,8 @@ setReplaceMethod(
290 366
 setMethod("updateObject", "BSseq",
291 367
           function(object, ...) {
292 368
               # NOTE: identical() is too strong
293
-              if (isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
369
+              if (hasBeenSmoothed(object) &
370
+                  isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
294 371
                   object@trans <- plogis
295 372
               }
296 373
               if (is(object, "SummarizedExperiment")) {
... ...
@@ -298,14 +375,15 @@ setMethod("updateObject", "BSseq",
298 375
                   object <- callNextMethod()
299 376
                   return(object)
300 377
               } else {
301
-                  BSseq(gr = object@gr,
302
-                        M = object@M,
303
-                        Cov = object@Cov,
304
-                        coef = object@coef,
305
-                        se.coef = object@se.coef,
306
-                        trans = object@trans,
307
-                        parameters = object@parameters,
308
-                        pData = object@phenoData@data)
378
+                  BSseq(
379
+                      gr = object@gr,
380
+                      M = object@M,
381
+                      Cov = object@Cov,
382
+                      coef = object@coef,
383
+                      se.coef = object@se.coef,
384
+                      trans = object@trans,
385
+                      parameters = object@parameters,
386
+                      pData = object@phenoData@data)
309 387
               }
310 388
           }
311 389
 )