R/BSseq-class.R
866ab76f
 # Exported classes -------------------------------------------------------------
 
 # NOTE: This create a 'classGeneratorFunction' (internal constructor), .BSseq()
 .BSseq <- setClass(
     "BSseq",
     slots = representation(
         trans = "function",
         parameters = "list"),
     contains = "RangedSummarizedExperiment"
 )
 
 # Validity methods -------------------------------------------------------------
 
 .checkAssayNames <- function(object, names) {
     if (all(names %in% assayNames(object))) {
         return(NULL)
     } else {
         paste0(
             "object of class ",
             class(object),
             " needs to have assay slots with names ",
             paste0(names, collapse = ", "))
 
     }
 }
 
 .checkMandCov <- function(M, Cov) {
     msg <- NULL
     validMsg(msg, .Call(cxx_check_M_and_Cov, M, Cov))
 }
 
588e496c
 # TODO: Benchmark validity method
866ab76f
 setValidity2("BSseq", function(object) {
     msg <- NULL
 
     if (identical(object, .BSseq())) {
         # No validity checks for object returned by internal constructor
         return(msg)
     }
     msg <- validMsg(msg, .checkAssayNames(object, c("Cov", "M")))
 
5469cca5
     msg <- validMsg(
         msg = msg,
         result = .checkMandCov(
             M = assay(object, "M", withDimnames = FALSE),
             Cov = assay(object, "Cov", withDimnames = FALSE)))
 
     if (is.null(msg)) {
         TRUE
     } else {
         msg
866ab76f
     }
 })
 
 # Internal functions -----------------------------------------------------------
 
 .oldTrans <- function(x) {
     y <- x
     ix <- which(x < 0)
     ix2 <- which(x > 0)
     y[ix] <- exp(x[ix])/(1 + exp(x[ix]))
     y[ix2] <- 1/(1 + exp(-x[ix2]))
     y
 }
 
 # Exported functions -----------------------------------------------------------
 
 # TODO: BSseq() is arguably a bad constructor. It doesn't return a valid BSseq
 #       object when called without any arguments. It also does some pretty
90182086
 #       complicated parsing of the inputs. But we're stuck with it because it's
 #       been around for a long time.
866ab76f
 BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
5469cca5
                   trans = NULL, parameters = NULL, pData = NULL, gr = NULL,
                   pos = NULL, chr = NULL, sampleNames = NULL,
866ab76f
                   rmZeroCov = FALSE) {
 
5469cca5
     # Argument checks ----------------------------------------------------------
 
     # Process assays.
     # NOTE: Nothing to do for 'coef', and 'se.coef'.
     if (is.null(M) || is.null(Cov)) {
         stop("Need 'M' and 'Cov'.")
866ab76f
     }
5469cca5
     # Process 'trans' and 'parameters'.
     if (is.null(trans)) {
         trans <- function(x) NULL
866ab76f
     }
5469cca5
     if (is.null(parameters)) {
         parameters <- list()
     }
     # Process 'sampleNames' and 'pData'.
     if (is.null(sampleNames)) {
         if (is.null(pData)) {
             # BSseq object will have no colnames.
             pData <- S4Vectors:::new_DataFrame(nrows = ncol(M))
         } else {
             # BSseq object will have 'sampleNames' as colnames.
             pData <- DataFrame(row.names = sampleNames)
         }
     } else {
         if (is.null(pData)) {
             # BSseq object will have 'sampleNames' as colnames.
             pData <- DataFrame(row.names = sampleNames)
         } else {
             if (is.null(rownames(pData))) {
                 rownames(pData) <- sampleNames
866ab76f
             } else {
5469cca5
                 stopifnot(identical(rownames(pData), sampleNames))
866ab76f
             }
         }
     }
5469cca5
     # Process 'gr', 'pos', and 'chr'.
     if (is.null(gr)) {
         if (is.null(pos) || is.null(chr)) {
             stop("Need 'pos' and 'chr' if 'gr' not supplied.")
         }
         gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1L))
866ab76f
     }
5469cca5
     if (!is(gr, "GRanges")) {
         stop("'gr' needs to be a GRanges.")
866ab76f
     }
5469cca5
     # Process 'rmZeroCov'.
     stopifnot(isTRUEorFALSE(rmZeroCov))
866ab76f
 
5469cca5
     # Collapse duplicate loci --------------------------------------------------
 
     is_duplicated <- duplicated(gr)
     if (any(is_duplicated)) {
         warning("Detected duplicate loci. Collapsing counts in 'M' and 'Cov' ",
                 "at these positions.")
         if (!is.null(coef) || !is.null(se.coef)) {
             stop("Cannot collapse when 'coef' or 'se.coef' are non-NULL.")
866ab76f
         }
5469cca5
         loci <- gr[!is_duplicated]
         ol <- findOverlaps(gr, loci, type = "equal")
         M <- rowsum(x = M, group = subjectHits(ol), reorder = FALSE)
         rownames(M) <- NULL
         Cov <- rowsum(x = Cov, group = subjectHits(ol), reorder = FALSE)
         rownames(Cov) <- NULL
     } else {
         loci <- gr
866ab76f
     }
 
5469cca5
     # Optionally, remove positions with zero coverage --------------------------
 
866ab76f
     if (rmZeroCov) {
         loci_with_zero_cov <- rowAlls(Cov, value = 0)
         if (any(loci_with_zero_cov)) {
5469cca5
             loci_with_nonzero_cov <- !loci_with_zero_cov
             gr <- gr[loci_with_nonzero_cov]
             M <- M[loci_with_nonzero_cov, , drop = FALSE]
             Cov <- Cov[loci_with_nonzero_cov, , drop = FALSE]
             coef <- coef[loci_with_nonzero_cov, , drop = FALSE]
             se.coef <- se.coef[loci_with_nonzero_cov, , drop = FALSE]
866ab76f
         }
     }
 
5469cca5
     # Construct BSseq object ---------------------------------------------------
866ab76f
 
     assays <- SimpleList(M = M, Cov = Cov, coef = coef, se.coef = se.coef)
     assays <- assays[!S4Vectors:::sapply_isNULL(assays)]
5469cca5
     se <- SummarizedExperiment(
         assays = assays,
         rowRanges = loci,
         colData = pData)
866ab76f
     .BSseq(se, trans = trans, parameters = parameters)
 }
 
5469cca5
 # Move to BSseq-utils?
 hasBeenSmoothed <- function(BSseq) {
     "coef" %in% assayNames(BSseq)
 }
866ab76f
 
5469cca5
 # Move to BSseq-utils?
866ab76f
 getBSseq <- function(BSseq,
                      type = c("Cov", "M", "gr", "coef", "se.coef", "trans",
                               "parameters"),
                      withDimnames = TRUE) {
     type <- match.arg(type)
     if (type %in% c("M", "Cov")) {
         return(assay(BSseq, type, withDimnames = withDimnames))
     }
5469cca5
     if (type %in% c("coef", "se.coef")) {
         if (type %in% assayNames(BSseq)) {
             return(assay(BSseq, type, withDimnames = withDimnames))
         } else {
             return(NULL)
         }
     }
     if (type == "trans") {
         return(BSseq@trans)
     }
     if (type == "parameters") {
         return(BSseq@parameters)
     }
     if (type == "gr") {
         return(BSseq@rowRanges)
866ab76f
     }
 }
 
5469cca5
 # Move to BSseq-utils?
 strandCollapse <- function(BSseq, shift = TRUE, BPPARAM = bpparam(),
                            BACKEND = getRealizationBackend(),
                            dir = tempfile("BSseq"), replace = FALSE,
                            chunkdim = NULL, level = NULL,
                            type = c("double", "integer")) {
 
     # Argument checks ----------------------------------------------------------
 
866ab76f
     if (all(runValue(strand(BSseq)) == "*")) {
5469cca5
         warning("All loci are unstranded, nothing to collapse.", call. = FALSE)
866ab76f
         return(BSseq)
     }
     if (!(all(runValue(strand(BSseq)) %in% c("+", "-")))) {
         stop("'BSseq' object has a mix of stranded and unstranded loci.")
     }
5469cca5
     # Register 'BACKEND' and return to current value on exit.
     # TODO: Is this strictly necessary?
     current_BACKEND <- getRealizationBackend()
     on.exit(setRealizationBackend(current_BACKEND), add = TRUE)
     setRealizationBackend(BACKEND)
     # Check compatability of 'BPPARAM' with 'BACKEND'.
     if (!.areBackendsInMemory(BACKEND)) {
         if (!.isSingleMachineBackend(BPPARAM)) {
             stop("The parallelisation strategy must use a single machine ",
                  "when using an on-disk realization backend.\n",
                  "See help(\"read.bismark\") for details.",
                  call. = FALSE)
         }
     } else {
         if (!is.null(BACKEND)) {
             # NOTE: Currently do not support any in-memory realization
             #       backends. If the realization backend is NULL then an
             #       ordinary matrix is returned rather than a matrix-backed
             #       DelayedMatrix.
             stop("The '", BACKEND, "' realization backend is not supported.",
                  "\n  See help(\"read.bismark\") for details.",
                  call. = FALSE)
         }
     }
     # If using HDF5Array as BACKEND, check remaining options are sensible.
     if (identical(BACKEND, "HDF5Array")) {
         # NOTE: Most of this copied from
         #       HDF5Array::saveHDF5SummarizedExperiment().
         if (!isSingleString(dir)) {
             stop(wmsg("'dir' must be a single string specifying the path to ",
                       "the directory where to save the BSseq object (the ",
                       "directory will be created)."))
         }
         if (!isTRUEorFALSE(replace)) {
             stop("'replace' must be TRUE or FALSE")
         }
         HDF5Array:::.create_dir(dir = dir, replace = replace)
         h5_path <- file.path(dir, "assays.h5")
     } else if (identical(BACKEND, NULL)) {
         h5_path <- NULL
     }
 
     # Collapse loci ------------------------------------------------------------
 
     loci <- rowRanges(BSseq)
     if (shift) {
         loci <- shift(loci, shift = as.integer(-1L * (strand(loci) == "-")))
     }
     collapsed_loci <- reduce(loci, min.gapwidth = 0L, ignore.strand = TRUE)
 
     # Collapse 'M' and 'Cov' matrices ------------------------------------------
 
     ol <- findOverlaps(loci, collapsed_loci, type = "equal")
     group <- subjectHits(ol)
     M <- rowsum(
         x = assay(BSseq, "M", withDimnames = FALSE),
         group = group,
         # NOTE: reorder = TRUE to ensure same row-order as collapsed_loci.
         reorder = TRUE,
         BPPARAM = BPPARAM,
         filepath = h5_path,
         name = "Cov",
         chunkdim = chunkdim,
         level = level,
         type = type)
     Cov <- rowsum(
         x = assay(BSseq, "Cov", withDimnames = FALSE),
         group = group,
         # NOTE: reorder = TRUE to ensure same row-order as collapsed_loci.
         reorder = TRUE,
         BPPARAM = BPPARAM,
         filepath = h5_path,
         name = "Cov",
         chunkdim = chunkdim,
         level = level,
         type = type)
 
     # Construct BSseq object, saving it if it is HDF5-backed -------------------
 
     se <- SummarizedExperiment(
         assays = SimpleList(M = unname(M), Cov = unname(Cov)),
         rowRanges = collapsed_loci,
         colData = colData(BSseq))
     # TODO: Is there a way to use the internal constructor with `check = FALSE`?
     #       Assuming input was valid, the output is valid, too.
     # .BSseq(se, trans = function(x) NULL, parameters = list())
     bsseq <- new2("BSseq", se, check = FALSE)
     if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
         # NOTE: Save BSseq object; mimicing
         #       HDF5Array::saveHDF5SummarizedExperiment().
         x <- bsseq
         x@assays <- HDF5Array:::.shorten_h5_paths(x@assays)
         saveRDS(x, file = file.path(dir, "se.rds"))
     }
     bsseq
866ab76f
 }
 
 # Exported methods -------------------------------------------------------------
 
 setMethod("show", signature(object = "BSseq"), function(object) {
     cat("An object of type 'BSseq' with\n")
     cat(" ", nrow(object), "methylation loci\n")
     cat(" ", ncol(object), "samples\n")
     if (hasBeenSmoothed(object)) {
         cat("has been smoothed with\n")
         cat(" ", object@parameters$smoothText, "\n")
     } else {
         cat("has not been smoothed\n")
     }
     if (.isHDF5ArrayBacked(object)) {
         cat("Some assays are HDF5Array-backed\n")
     } else {
         cat("All assays are in-memory\n")
     }
 })
 
 setMethod("pData", "BSseq", function(object) {
     object@colData
 })
 
 setReplaceMethod(
     "pData",
     signature = signature(object = "BSseq", value = "data.frame"),
     function(object, value) {
         colData(object) <- as(value, "DataFrame")
         object
     }
 )
 
 setReplaceMethod(
     "pData",
     signature = signature(object = "BSseq", value = "DataFrame"),
     function(object, value) {
         colData(object) <- value
         object
     }
 )
 
 setMethod("sampleNames", "BSseq", function(object) colnames(object))
 
 setReplaceMethod(
     "sampleNames",
     signature = signature(object = "BSseq", value = "ANY"),
     function(object, value) {
         colnames(object) <- value
         object
     }
 )
 
 setMethod("updateObject", "BSseq",
           function(object, ...) {
               # NOTE: identical() is too strong
5469cca5
               if (hasBeenSmoothed(object) &
                   isTRUE(all.equal(getBSseq(object, "trans"), .oldTrans))) {
866ab76f
                   object@trans <- plogis
               }
               if (is(object, "SummarizedExperiment")) {
                   # NOTE: Call method for SummarizedExperiment objects
                   object <- callNextMethod()
                   return(object)
               } else {
5469cca5
                   BSseq(
                       gr = object@gr,
                       M = object@M,
                       Cov = object@Cov,
                       coef = object@coef,
                       se.coef = object@se.coef,
                       trans = object@trans,
                       parameters = object@parameters,
                       pData = object@phenoData@data)
866ab76f
               }
           }
 )