#' @title Feature clustering with Celda #' @description Clusters the rows of a count matrix containing single-cell data #' into L modules. The #' \code{useAssay} \link{assay} slot in #' \code{altExpName} \link{altExp} slot will be used if #' it exists. Otherwise, the \code{useAssay} #' \link{assay} slot in \code{x} will be used if #' \code{x} is a \linkS4class{SingleCellExperiment} object. #' @param x A numeric \link{matrix} of counts or a #' \linkS4class{SingleCellExperiment} #' with the matrix located in the assay slot under \code{useAssay}. #' Rows represent features and columns represent cells. #' @param useAssay A string specifying the name of the #' \link{assay} slot to use. Default "counts". #' @param altExpName The name for the \link{altExp} slot #' to use. Default "featureSubset". #' @param L Integer. Number of feature modules. #' @param beta Numeric. Concentration parameter for Phi. Adds a pseudocount to #' each feature module in each cell. Default 1. #' @param delta Numeric. Concentration parameter for Psi. Adds a pseudocount to #' each feature in each module. Default 1. #' @param gamma Numeric. Concentration parameter for Eta. Adds a pseudocount to #' the number of features in each module. Default 1. #' @param stopIter Integer. Number of iterations without improvement in the #' log likelihood to stop inference. Default 10. #' @param maxIter Integer. Maximum number of iterations of Gibbs sampling to #' perform. Default 200. #' @param splitOnIter Integer. On every `splitOnIter` iteration, a heuristic #' will be applied to determine if a feature module should be reassigned and #' another feature module should be split into two clusters. To disable #' splitting, set to -1. Default 10. #' @param splitOnLast Integer. After `stopIter` iterations have been #' performed without improvement, a heuristic will be applied to determine if #' a cell population should be reassigned and another cell population should be #' split into two clusters. If a split occurs, then `stopIter` will be reset. #' Default TRUE. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. #' @param nchains Integer. Number of random cluster initializations. Default 3. #' @param yInitialize Chararacter. One of 'random', 'split', or 'predefined'. #' With 'random', features are randomly assigned to a modules. With 'split', #' features will be split into sqrt(L) modules and then each module will be #' subsequently split into another sqrt(L) modules. With 'predefined', values #' in `yInit` will be used to initialize `y`. Default 'split'. #' @param yInit Integer vector. Sets initial starting values of y. If NULL, #' starting values for each feature will be randomly sampled from `1:L`. #' `yInit` can only be used when `initialize = 'random'`. Default NULL. #' @param countChecksum Character. An MD5 checksum for the `counts` matrix. #' Default NULL. #' @param logfile Character. Messages will be redirected to a file named #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. #' @param ... Ignored. Placeholder to prevent check warning. #' @return A \linkS4class{SingleCellExperiment} object. Function #' parameter settings are stored in the \link{metadata} #' \code{"celda_parameters"} slot. Column \code{celda_feature_module} in #' \link{rowData} contains feature modules. #' @seealso \link{celda_C} for cell clustering and \link{celda_CG} for #' simultaneous clustering of features and cells. \link{celdaGridSearch} can #' be used to run multiple values of L and multiple chains in parallel. #' @examples #' data(celdaGSim) #' sce <- celda_G(celdaGSim$counts, L = celdaGSim$L, nchains = 1) #' @export setGeneric("celda_G", function(x, ...) { standardGeneric("celda_G")}) #' @rdname celda_G #' @export setMethod("celda_G", signature(x = "SingleCellExperiment"), function(x, useAssay = "counts", altExpName = "featureSubset", L, beta = 1, delta = 1, gamma = 1, stopIter = 10, maxIter = 200, splitOnIter = 10, splitOnLast = TRUE, seed = 12345, nchains = 3, yInitialize = c("split", "random", "predefined"), countChecksum = NULL, yInit = NULL, logfile = NULL, verbose = TRUE) { xClass <- "SingleCellExperiment" if (!altExpName %in% SingleCellExperiment::altExpNames(x)) { stop(altExpName, " not in 'altExpNames(x)'. Run ", "selectFeatures(x) first!") } altExp <- SingleCellExperiment::altExp(x, altExpName) if (!useAssay %in% SummarizedExperiment::assayNames(altExp)) { stop(useAssay, " not in assayNames(altExp(x, altExpName))") } counts <- SummarizedExperiment::assay(altExp, i = useAssay) altExp <- .celdaGWithSeed(counts = counts, xClass = xClass, useAssay = useAssay, sce = altExp, L = L, beta = beta, delta = delta, gamma = gamma, stopIter = stopIter, maxIter = maxIter, splitOnIter = splitOnIter, splitOnLast = splitOnLast, seed = seed, nchains = nchains, yInitialize = match.arg(yInitialize), countChecksum = countChecksum, yInit = yInit, logfile = logfile, verbose = verbose) SingleCellExperiment::altExp(x, altExpName) <- altExp return(x) } ) #' @rdname celda_G #' @export setMethod("celda_G", signature(x = "matrix"), function(x, useAssay = "counts", altExpName = "featureSubset", L, beta = 1, delta = 1, gamma = 1, stopIter = 10, maxIter = 200, splitOnIter = 10, splitOnLast = TRUE, seed = 12345, nchains = 3, yInitialize = c("split", "random", "predefined"), countChecksum = NULL, yInit = NULL, logfile = NULL, verbose = TRUE) { ls <- list() ls[[useAssay]] <- x sce <- SingleCellExperiment::SingleCellExperiment(assays = ls) SingleCellExperiment::altExp(sce, altExpName) <- sce xClass <- "matrix" altExp <- .celdaGWithSeed(counts = x, xClass = xClass, useAssay = useAssay, sce = SingleCellExperiment::altExp(sce, altExpName), L = L, beta = beta, delta = delta, gamma = gamma, stopIter = stopIter, maxIter = maxIter, splitOnIter = splitOnIter, splitOnLast = splitOnLast, seed = seed, nchains = nchains, yInitialize = match.arg(yInitialize), countChecksum = countChecksum, yInit = yInit, logfile = logfile, verbose = verbose) SingleCellExperiment::altExp(sce, altExpName) <- altExp return(sce) } ) .celdaGWithSeed <- function(counts, xClass, useAssay, sce, L, beta, delta, gamma, stopIter, maxIter, splitOnIter, splitOnLast, seed, nchains, yInitialize, countChecksum, yInit, logfile, verbose) { .validateCounts(counts) if (is.null(seed)) { celdaGMod <- .celda_G(counts = counts, L = L, beta = beta, delta = delta, gamma = gamma, stopIter = stopIter, maxIter = maxIter, splitOnIter = splitOnIter, splitOnLast = splitOnLast, nchains = nchains, yInitialize = yInitialize, countChecksum = countChecksum, yInit = yInit, logfile = logfile, verbose = verbose, reorder = TRUE) } else { with_seed( seed, celdaGMod <- .celda_G(counts = counts, L = L, beta = beta, delta = delta, gamma = gamma, stopIter = stopIter, maxIter = maxIter, splitOnIter = splitOnIter, splitOnLast = splitOnLast, nchains = nchains, yInitialize = yInitialize, countChecksum = countChecksum, yInit = yInit, logfile = logfile, verbose = verbose, reorder = TRUE) ) } sce <- .createSCEceldaG(celdaGMod = celdaGMod, sce = sce, xClass = xClass, useAssay = useAssay, stopIter = stopIter, maxIter = maxIter, splitOnIter = splitOnIter, splitOnLast = splitOnLast, nchains = nchains, yInitialize = yInitialize, yInit = yInit, logfile = logfile, verbose = verbose) return(sce) } .celda_G <- function(counts, L, beta = 1, delta = 1, gamma = 1, stopIter = 10, maxIter = 200, splitOnIter = 10, splitOnLast = TRUE, nchains = 3, yInitialize = c("split", "random", "predefined"), countChecksum = NULL, yInit = NULL, logfile = NULL, verbose = TRUE, reorder = TRUE) { .logMessages(paste(rep("-", 50), collapse = ""), logfile = logfile, append = FALSE, verbose = verbose ) .logMessages("Starting Celda_G: Clustering genes.", logfile = logfile, append = TRUE, verbose = verbose ) .logMessages(paste(rep("-", 50), collapse = ""), logfile = logfile, append = TRUE, verbose = verbose ) start.time <- Sys.time() ## Error checking and variable processing counts <- .processCounts(counts) if (is.null(countChecksum)) { countChecksum <- .createCountChecksum(counts) } yInitialize <- match.arg(yInitialize) allChains <- seq(nchains) # Pre-compute lgamma values lgbeta <- lgamma(seq(0, max(.colSums( counts, nrow(counts), ncol(counts) ))) + beta) lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma) lgdelta <- c(NA, lgamma((seq(nrow(counts) + L) * delta))) bestResult <- NULL for (i in allChains) { ## Randomly select y or y to supplied initial values ## Initialize cluster labels .logMessages(date(), ".. Initializing 'y' in chain", i, "with", paste0("'", yInitialize, "' "), logfile = logfile, append = TRUE, verbose = verbose ) if (yInitialize == "predefined") { if (is.null(yInit)) { stop("'yInit' needs to specified when initilize.y == 'given'.") } y <- .initializeCluster(L, nrow(counts), initial = yInit, fixed = NULL ) } else if (yInitialize == "split") { y <- .initializeSplitY(counts, L, beta = beta, delta = delta, gamma = gamma ) } else { y <- .initializeCluster(L, nrow(counts), initial = NULL, fixed = NULL ) } yBest <- y ## Calculate counts one time up front p <- .cGDecomposeCounts(counts = counts, y = y, L = L) nTSByC <- p$nTSByC nByG <- p$nByG nByTS <- p$nByTS nGByTS <- p$nGByTS nM <- p$nM nG <- p$nG rm(p) ## Calculate initial log likelihood ll <- .cGCalcLL( nTSByC = nTSByC, nByTS = nByTS, nByG = nByG, nGByTS = nGByTS, nM = nM, nG = nG, L = L, beta = beta, delta = delta, gamma = gamma ) iter <- 1L numIterWithoutImprovement <- 0L doGeneSplit <- TRUE while (iter <= maxIter & numIterWithoutImprovement <= stopIter) { nextY <- .cGCalcGibbsProbY( counts = counts, nTSByC = nTSByC, nByTS = nByTS, nGByTS = nGByTS, nByG = nByG, y = y, nG = nG, L = L, beta = beta, delta = delta, gamma = gamma, lgbeta = lgbeta, lggamma = lggamma, lgdelta = lgdelta ) nTSByC <- nextY$nTSByC nGByTS <- nextY$nGByTS nByTS <- nextY$nByTS y <- nextY$y ## Perform split on i-th iteration of no improvement in log ## likelihood tempLl <- .cGCalcLL( nTSByC = nTSByC, nByTS = nByTS, nByG = nByG, nGByTS = nGByTS, nM = nM, nG = nG, L = L, beta = beta, delta = delta, gamma = gamma ) if (L > 2 & iter != maxIter & ((((numIterWithoutImprovement == stopIter & !all(tempLl > ll))) & isTRUE(splitOnLast)) | (splitOnIter > 0 & iter %% splitOnIter == 0 & isTRUE(doGeneSplit)))) { .logMessages(date(), " .... Determining if any gene clusters should be split.", logfile = logfile, append = TRUE, sep = "", verbose = verbose ) res <- .cGSplitY(counts, y, nTSByC, nByTS, nByG, nGByTS, nM, nG, L, beta, delta, gamma, yProb = t(nextY$probs), minFeature = 3, maxClustersToTry = max(L / 2, 10) ) .logMessages(res$message, logfile = logfile, append = TRUE, verbose = verbose ) # Reset convergence counter if a split occured if (!isTRUE(all.equal(y, res$y))) { numIterWithoutImprovement <- 1L doGeneSplit <- TRUE } else { doGeneSplit <- FALSE } ## Re-calculate variables y <- res$y nTSByC <- res$nTSByC nByTS <- res$nByTS nGByTS <- res$nGByTS } ## Calculate complete likelihood tempLl <- .cGCalcLL( nTSByC = nTSByC, nByTS = nByTS, nByG = nByG, nGByTS = nGByTS, nM = nM, nG = nG, L = L, beta = beta, delta = delta, gamma = gamma ) if ((all(tempLl > ll)) | iter == 1) { yBest <- y llBest <- tempLl numIterWithoutImprovement <- 1L } else { numIterWithoutImprovement <- numIterWithoutImprovement + 1L } ll <- c(ll, tempLl) .logMessages(date(), ".... Completed iteration:", iter, "| logLik:", tempLl, logfile = logfile, append = TRUE, verbose = verbose ) iter <- iter + 1 } names <- list(row = rownames(counts), column = colnames(counts)) result <- list( y = yBest, completeLogLik = ll, finalLogLik = llBest, L = L, beta = beta, delta = delta, gamma = gamma, countChecksum = countChecksum, names = names ) if (is.null(bestResult) || result$finalLogLik > bestResult$finalLogLik) { bestResult <- result } .logMessages(date(), ".. Finished chain", i, logfile = logfile, append = TRUE, verbose = verbose ) } bestResult <- methods::new("celda_G", clusters = list(y = yBest), params = list( L = as.integer(L), beta = beta, delta = delta, gamma = gamma, countChecksum = countChecksum ), completeLogLik = ll, finalLogLik = llBest, names = names ) if (isTRUE(reorder)) { bestResult <- .reorderCeldaG(counts = counts, res = bestResult) } endTime <- Sys.time() .logMessages(paste0(rep("-", 50), collapse = ""), logfile = logfile, append = TRUE, verbose = verbose ) .logMessages("Completed Celda_G. Total time:", format(difftime(endTime, start.time)), logfile = logfile, append = TRUE, verbose = verbose ) .logMessages(paste0(rep("-", 50), collapse = ""), logfile = logfile, append = TRUE, verbose = verbose ) return(bestResult) } # Calculate Log Likelihood For Single Set of Cluster Assignments # (Gene Clustering) # This function calculates the log-likelihood of a given set of cluster # assigments for the samples # represented in the provided count matrix. # @param nTSByC Number of counts in each Transcriptional State per Cell. # @param nByTS Number of counts per Transcriptional State. # @param nGByTS Number of genes in each Transcriptional State. # @param nG.in.Y Number of genes in each of the cell cluster. # @param gamma Numeric. Concentration parameter for Eta. Adds a pseudocount to # the number of features in each module. Default 1. # @param delta Numeric. Concentration parameter for Psi. Adds a pseudocount to # each feature in each module. Default 1. # @param beta Numeric. Concentration parameter for Phi. Adds a pseudocount to # each feature module in each cell. Default 1. # @keywords log likelihood .cGCalcGibbsProbY <- function(counts, nTSByC, nByTS, nGByTS, nByG, y, L, nG, beta, delta, gamma, lgbeta, lggamma, lgdelta, doSample = TRUE) { ## Set variables up front outside of loop probs <- matrix(NA, ncol = nG, nrow = L) ix <- sample(seq(nG)) for (i in ix) { probs[, i] <- cG_CalcGibbsProbY( index = i, counts = counts, nTSbyC = nTSByC, nbyTS = nByTS, nGbyTS = nGByTS, nbyG = nByG, y = y, L = L, nG = nG, lg_beta = lgbeta, lg_gamma = lggamma, lg_delta = lgdelta, delta = delta ) ## Sample next state and add back counts if (isTRUE(doSample)) { prevY <- y[i] y[i] <- .sampleLl(probs[, i]) if (prevY != y[i]) { nTSByC[prevY, ] <- nTSByC[prevY, ] - counts[i, ] nGByTS[prevY] <- nGByTS[prevY] - 1L nByTS[prevY] <- nByTS[prevY] - nByG[i] nTSByC[y[i], ] <- nTSByC[y[i], ] + counts[i, ] nGByTS[y[i]] <- nGByTS[y[i]] + 1L nByTS[y[i]] <- nByTS[y[i]] + nByG[i] } } } return(list( nTSByC = nTSByC, nGByTS = nGByTS, nByTS = nByTS, y = y, probs = probs )) } # Calculate log-likelihood of celda_CG model .cGCalcLL <- function(nTSByC, nByTS, nByG, nGByTS, nM, nG, L, beta, delta, gamma) { nG <- sum(nGByTS) ## Calculate for "Phi" component a <- nM * lgamma(L * beta) b <- sum(lgamma(nTSByC + beta)) c <- -nM * L * lgamma(beta) d <- -sum(lgamma(colSums(nTSByC + beta))) phiLl <- a + b + c + d ## Calculate for "Psi" component a <- sum(lgamma(nGByTS * delta)) b <- sum(lgamma(nByG + delta)) c <- -nG * lgamma(delta) d <- -sum(lgamma(nByTS + (nGByTS * delta))) psiLl <- a + b + c + d ## Calculate for "Eta" component a <- lgamma(L * gamma) b <- sum(lgamma(nGByTS + gamma)) c <- -L * lgamma(gamma) d <- -sum(lgamma(sum(nGByTS + gamma))) etaLl <- a + b + c + d final <- phiLl + psiLl + etaLl return(final) } # Takes raw counts matrix and converts it to a series of matrices needed for # log likelihood calculation # @param counts Integer matrix. Rows represent features and columns represent # cells. # @param y Numeric vector. Denotes feature module labels. # @param L Integer. Number of feature modules. .cGDecomposeCounts <- function(counts, y, L) { if (any(y > L)) { stop("Assigned value of feature module greater than the total number", " of feature modules!") } nTSByC <- .rowSumByGroup(counts, group = y, L = L) nByG <- as.integer(.rowSums(counts, nrow(counts), ncol(counts))) nByTS <- as.integer(.rowSumByGroup(matrix(nByG, ncol = 1), group = y, L = L )) nGByTS <- tabulate(y, L) + 1 ## Add pseudogene to each state nM <- ncol(counts) nG <- nrow(counts) return(list( nTSByC = nTSByC, nByG = nByG, nByTS = nByTS, nGByTS = nGByTS, nM = nM, nG = nG )) } .cGReDecomposeCounts <- function(counts, y, previousY, nTSByC, nByG, L) { ## Recalculate counts based on new label nTSByC <- .rowSumByGroupChange(counts, nTSByC, y, previousY, L) nByTS <- as.integer(.rowSumByGroup(matrix(nByG, ncol = 1), group = y, L = L )) nGByTS <- tabulate(y, L) + 1 return(list( nTSByC = nTSByC, nByTS = nByTS, nGByTS = nGByTS )) } .reorderCeldaG <- function(counts, res) { if (params(res)$L > 2 & isTRUE(length(unique(celdaClusters(res)$y)) > 1)) { res@clusters$y <- as.integer(as.factor(celdaClusters(res)$y)) fm <- factorizeMatrix(counts, res) uniqueY <- sort(unique(celdaClusters(res)$y)) cs <- prop.table(t(fm$posterior$cell[uniqueY, ]), 2) d <- .cosineDist(cs) h <- stats::hclust(d, method = "complete") res <- .recodeClusterY(res, from = h$order, to = seq(length(h$order))) } return(res) } .prepareCountsForDimReductionCeldaG <- function(sce, useAssay, maxCells, minClusterSize, modules, normalize, scaleFactor, transformationFun) { counts <- SummarizedExperiment::assay(sce, i = useAssay) counts <- .processCounts(counts) if (is.null(maxCells) || maxCells > ncol(counts)) { maxCells <- ncol(counts) cellIx <- seq_len(ncol(counts)) } else { cellIx <- sample(seq(ncol(counts)), maxCells) } fm <- .factorizeMatrixCelda_G(sce, useAssay = useAssay, type = "counts") modulesToUse <- seq(nrow(fm$counts$cell)) if (!is.null(modules)) { if (!all(modules %in% modulesToUse)) { stop( "'modules' must be a vector of numbers between 1 and ", modulesToUse, "." ) } modulesToUse <- modules } norm <- t(normalizeCounts(fm$counts$cell[modulesToUse, cellIx], normalize = normalize, scaleFactor = scaleFactor, transformationFun = transformationFun)) return(list(norm = norm, cellIx = cellIx)) } .createSCEceldaG <- function(celdaGMod, sce, xClass, useAssay, stopIter, maxIter, splitOnIter, splitOnLast, nchains, yInitialize, yInit, logfile, verbose) { # add metadata S4Vectors::metadata(sce)[["celda_parameters"]] <- list( model = "celda_G", xClass = xClass, useAssay = useAssay, L = celdaGMod@params$L, beta = celdaGMod@params$beta, delta = celdaGMod@params$delta, gamma = celdaGMod@params$gamma, stopIter = stopIter, maxIter = maxIter, splitOnIter = splitOnIter, splitOnLast = splitOnLast, seed = celdaGMod@params$seed, nchains = nchains, yInitialize = yInitialize, countChecksum = celdaGMod@params$countChecksum, yInit = yInit, logfile = logfile, verbose = verbose, completeLogLik = celdaGMod@completeLogLik, finalLogLik = celdaGMod@finalLogLik, featureModuleLevels = sort(unique(celdaClusters(celdaGMod)$y))) SummarizedExperiment::rowData(sce)["rownames"] <- celdaGMod@names$row SummarizedExperiment::colData(sce)["colnames"] <- celdaGMod@names$column SummarizedExperiment::rowData(sce)["celda_feature_module"] <- celdaClusters(celdaGMod)$y return(sce) }