R/initialize_clusters.R
fdf53f53
 .initializeCluster <- function(N,
d1c540ac
     len,
     z = NULL,
     initial = NULL,
75664a2f
     fixed = NULL) {
8dcff611
 
d1c540ac
     # If initial values are given, then they will not be randomly initialized
     if (!is.null(initial)) {
ca5fb59d
         initValues <- sort(unique(initial))
d1c540ac
         if (length(unique(initial)) != N || length(initial) != len ||
ca5fb59d
             !all(initValues %in% seq(N))) {
3134ca62
                 stop("'initial' needs to be a vector of length 'len'",
                     " containing N unique values.")
d1c540ac
         }
af4c3cb8
         z <- as.integer(as.factor(initial))
d1c540ac
     } else {
         z <- rep(NA, len)
     }
8dcff611
 
d1c540ac
     # Set any values that need to be fixed during sampling
     if (!is.null(fixed)) {
ca5fb59d
         fixedValues <- sort(unique(fixed))
         if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
3134ca62
             stop("'fixed' to be a vector of length 'len' where each entry is",
                 " one of N unique values or NA.")
d1c540ac
         }
ca5fb59d
         fixedIx <- !is.na(fixed)
         z[fixedIx] <- fixed[fixedIx]
         zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
d1c540ac
     } else {
ca5fb59d
         zNotUsed <- seq(N)
         fixedIx <- rep(FALSE, len)
8dcff611
     }
d1c540ac
 
     # Randomly sample remaining values
ca5fb59d
     zNa <- which(is.na(z))
     if (length(zNa) > 0) {
         z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
8dcff611
     }
d1c540ac
 
     # Check to ensure each value is in the vector at least once
ca5fb59d
     missing <- setdiff(seq(N), z)
d1c540ac
     for (i in missing) {
ca5fb59d
         ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
d1c540ac
         if (ta[1] == 1) {
             stop("'len' is not long enough to accomodate 'N' unique values")
         }
af4c3cb8
         ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
d1c540ac
         z[sample(ix, 1)] <- i
8dcff611
     }
d1c540ac
 
     return(z)
8dcff611
 }
 
 
fdf53f53
 .initializeSplitZ <- function(counts,
d1c540ac
     K,
ca5fb59d
     KSubcluster = NULL,
d1c540ac
     alpha = 1,
     beta = 1,
75664a2f
     minCell = 3) {
d1c540ac
 
     s <- rep(1, ncol(counts))
ca5fb59d
     if (is.null(KSubcluster))
         KSubcluster <- ceiling(sqrt(K))
d1c540ac
 
ca5fb59d
     # Initialize the model with KSubcluster clusters
d1c540ac
     res <- .celda_C(
         counts,
ca5fb59d
         K = KSubcluster,
         maxIter = 20,
         zInitialize = "random",
d1c540ac
         alpha = alpha,
         beta = beta,
ca5fb59d
         splitOnIter = -1,
         splitOnLast = FALSE,
d1c540ac
         verbose = FALSE,
         reorder = FALSE
     )
06b0c870
     overallZ <- as.integer(as.factor(clusters(res)$z))
ca5fb59d
     currentK <- max(overallZ)
d1c540ac
 
ca5fb59d
     while (currentK < K) {
d1c540ac
         # Determine which clusters are split-able
501d7a5b
         # KRemaining <- K - currentK
ca5fb59d
         KPerCluster <- min(ceiling(K / currentK), KSubcluster)
         KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)
d1c540ac
 
ca5fb59d
         zTa <- tabulate(overallZ, max(overallZ))
b6cf56ae
 
acfdc7f3
         zToSplit <- which(zTa > minCell & zTa > KToUse)
         if (length(zToSplit) > 1) {
             zToSplit <- sample(zToSplit)
         } else if (length(zToSplit) == 0) {
b6cf56ae
             break
d1c540ac
         }
 
ca5fb59d
         # Cycle through each splitable cluster and split it up into
         # K.sublcusters
         for (i in zToSplit) {
             clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
                 K = KToUse,
                 zInitialize = "random",
d1c540ac
                 alpha = alpha,
                 beta = beta,
ca5fb59d
                 maxIter = 20,
                 splitOnIter = -1,
                 splitOnLast = FALSE,
d1c540ac
                 verbose = FALSE)
06b0c870
             tempZ <- as.integer(as.factor(clusters(clustLabel)$z))
8dcff611
 
d1c540ac
             # Reassign clusters with label > 1
ca5fb59d
             splitIx <- tempZ > 1
             ix <- overallZ == i
             newZ <- overallZ[ix]
             newZ[splitIx] <- currentK + tempZ[splitIx] - 1
8dcff611
 
ca5fb59d
             overallZ[ix] <- newZ
             currentK <- max(overallZ)
d1c540ac
 
             # Ensure that the maximum number of clusters does not get too large'
ca5fb59d
             if (currentK > K + 10) {
b6cf56ae
                 break
d1c540ac
             }
         }
8dcff611
     }
 
d1c540ac
     # Decompose counts for likelihood calculation
ca5fb59d
     p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
d1c540ac
     nS <- p$nS
     nG <- p$nG
     nM <- p$nM
ca5fb59d
     mCPByS <- p$mCPByS
     nGByCP <- p$nGByCP
     nCP <- p$nCP
     nByC <- p$nByC
d1c540ac
 
     # Remove clusters 1-by-1 until K is reached
ca5fb59d
     while (currentK > K) {
d1c540ac
         # Find second best assignment give current assignments for each cell
ca5fb59d
         probs <- .cCCalcEMProbZ(counts,
d1c540ac
                 s = s,
ca5fb59d
                 z = overallZ,
                 K = currentK,
                 mCPByS = mCPByS,
                 nGByCP = nGByCP,
                 nByC = nByC,
                 nCP = nCP,
d1c540ac
                 nG = nG,
                 nM = nM,
                 alpha = alpha,
                 beta = beta,
ca5fb59d
                 doSample = FALSE)
         zProb <- t(probs$probs)
         zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
         zSecond <- apply(zProb, 1, which.max)
d1c540ac
 
ca5fb59d
         zTa <- tabulate(overallZ, currentK)
         zNonEmpty <- which(zTa > 0)
d1c540ac
 
         # Find worst cluster by logLik to remove
ca5fb59d
         previousZ <- overallZ
         llShuffle <- rep(NA, currentK)
         for (i in zNonEmpty) {
             ix <- overallZ == i
             newZ <- overallZ
             newZ[ix] <- zSecond[ix]
 
             p <- .cCReDecomposeCounts(counts,
d1c540ac
                 s,
ca5fb59d
                 newZ,
                 previousZ,
                 nGByCP,
                 currentK)
             nGByCP <- p$nGByCP
             mCPByS <- p$mCPByS
             llShuffle[i] <- .cCCalcLL(mCPByS,
                     nGByCP,
d1c540ac
                     s,
ca5fb59d
                     newZ,
                     currentK,
d1c540ac
                     nS,
                     nG,
                     alpha,
                     beta)
ca5fb59d
             previousZ <- newZ
d1c540ac
         }
8dcff611
 
d1c540ac
         # Remove the cluster which had the the largest likelihood after removal
ca5fb59d
         zToRemove <- which.max(llShuffle)
8dcff611
 
ca5fb59d
         ix <- overallZ == zToRemove
         overallZ[ix] <- zSecond[ix]
98021745
 
ca5fb59d
         p <- .cCReDecomposeCounts(counts,
d1c540ac
             s,
ca5fb59d
             overallZ,
             previousZ,
             nGByCP,
             currentK)
         nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
         mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
         overallZ <- as.integer(as.factor(overallZ))
         currentK <- currentK - 1
8dcff611
     }
ca5fb59d
     return(overallZ)
98021745
 }
 
d1c540ac
 
fdf53f53
 .initializeSplitY <- function(counts,
d1c540ac
     L,
ca5fb59d
     LSubcluster = NULL,
     tempK = 100,
d1c540ac
     beta = 1,
     delta = 1,
     gamma = 1,
75664a2f
     minFeature = 3) {
ca5fb59d
 
     if (is.null(LSubcluster))
         LSubcluster <- ceiling(sqrt(L))
d1c540ac
 
     # Collapse cells to managable number of clusters
ca5fb59d
     if (!is.null(tempK) && ncol(counts) > tempK) {
75664a2f
         z <- .initializeSplitZ(counts, K = tempK)
2e877ffe
         counts <- .colSumByGroup(counts, z, length(unique(z)))
d1c540ac
     }
 
ca5fb59d
     # Initialize the model with KSubcluster clusters
d1c540ac
     res <- .celda_G(counts,
ca5fb59d
         L = LSubcluster,
         maxIter = 10,
         yInitialize = "random",
d1c540ac
         beta = beta,
         delta = delta,
         gamma = gamma,
ca5fb59d
         splitOnIter = -1,
         splitOnLast = FALSE,
d1c540ac
         verbose = FALSE,
         reorder = FALSE
     )
06b0c870
     overallY <- as.integer(as.factor(clusters(res)$y))
ca5fb59d
     currentL <- max(overallY)
d1c540ac
 
ca5fb59d
     while (currentL < L) {
d1c540ac
         # Determine which clusters are split-able
ca5fb59d
         yTa <- tabulate(overallY, max(overallY))
         yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))
d1c540ac
 
ca5fb59d
         if (length(yToSplit) == 0) {
b6cf56ae
             break
ca5fb59d
         }
d1c540ac
 
b6cf56ae
         # Cycle through each splitable cluster and split it up into
         # LSublcusters
ca5fb59d
         for (i in yToSplit) {
fc0f3024
             # make sure the colSums of subset counts is not 0
             countsY <- counts[overallY == i, , drop = FALSE]
             countsY <- countsY[, !(colSums(countsY) == 0)]
 
             if (ncol(countsY) == 0) {
                 next
             }
 
d1c540ac
             clustLabel <- .celda_G(
fc0f3024
                 countsY,
ca5fb59d
                 L = LSubcluster,
                 yInitialize = "random",
d1c540ac
                 beta = beta,
                 delta = delta,
                 gamma = gamma,
ca5fb59d
                 maxIter = 20,
                 splitOnIter = -1,
                 splitOnLast = FALSE,
f031bb72
                 verbose = FALSE)
06b0c870
             tempY <- as.integer(as.factor(clusters(clustLabel)$y))
d1c540ac
 
             # Reassign clusters with label > 1
ca5fb59d
             splitIx <- tempY > 1
             ix <- overallY == i
             newY <- overallY[ix]
             newY[splitIx] <- currentL + tempY[splitIx] - 1
d1c540ac
 
ca5fb59d
             overallY[ix] <- newY
             currentL <- max(overallY)
d1c540ac
 
             # Ensure that the maximum number of clusters does not get too large
ca5fb59d
             if (currentL > L + 10) {
b6cf56ae
                 break
d1c540ac
             }
         }
     }
 
     ## Decompose counts for likelihood calculation
ca5fb59d
     p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
     nTSByC <- p$nTSByC
     nByG <- p$nByG
     nByTS <- p$nByTS
     nGByTS <- p$nGByTS
d1c540ac
     nM <- p$nM
     nG <- p$nG
     rm(p)
 
     # Pre-compute lgamma values
ca5fb59d
     lgbeta <- lgamma((seq(0, max(.colSums(counts, nrow(counts),
         ncol(counts))))) + beta)
     lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
     lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))
d1c540ac
 
     # Remove clusters 1-by-1 until L is reached
ca5fb59d
     while (currentL > L) {
d1c540ac
         # Find second best assignment give current assignments for each cell
ca5fb59d
         probs <- .cGCalcGibbsProbY(
d1c540ac
             counts = counts,
ca5fb59d
             y = overallY,
             L = currentL,
             nTSByC = nTSByC,
             nByTS = nByTS,
             nGByTS = nGByTS,
             nByG = nByG,
d1c540ac
             nG = nG,
             beta = beta,
             delta = delta,
             gamma = gamma,
             lgbeta = lgbeta,
             lggamma = lggamma,
             lgdelta = lgdelta,
ca5fb59d
             doSample = FALSE)
         yProb <- t(probs$probs)
         yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
         ySecond <- apply(yProb, 1, which.max)
d1c540ac
 
ca5fb59d
         yTa <- tabulate(overallY, currentL)
         yNonEmpty <- which(yTa > 0)
d1c540ac
 
         # Find worst cluster by logLik to remove
ca5fb59d
         previousY <- overallY
         llShuffle <- rep(NA, currentL)
         for (i in yNonEmpty) {
             ix <- overallY == i
             newY <- overallY
             newY[ix] <- ySecond[ix]
d1c540ac
 
             # Move arounds counts for likelihood calculation
ca5fb59d
             p <- .cGReDecomposeCounts(counts,
                     newY,
                     previousY,
                     nTSByC,
                     nByG,
                     currentL)
             nTSByC <- p$nTSByC
             nGByTS <- p$nGByTS
             nByTS <- p$nByTS
             llShuffle[i] <- .cGCalcLL(nTSByC,
                     nByTS,
                     nByG,
                     nGByTS,
d1c540ac
                     nM,
                     nG,
ca5fb59d
                     currentL,
d1c540ac
                     beta,
                     delta,
                     gamma)
ca5fb59d
             previousY <- newY
d1c540ac
         }
 
         # Remove the cluster which had the the largest likelihood after removal
ca5fb59d
         yToRemove <- which.max(llShuffle)
d1c540ac
 
ca5fb59d
         ix <- overallY == yToRemove
         overallY[ix] <- ySecond[ix]
d1c540ac
 
         # Move around counts and remove module
ca5fb59d
         p <- .cGReDecomposeCounts(counts,
             overallY,
             previousY,
             nTSByC,
             nByG,
             currentL)
         nTSByC <- p$nTSByC[-yToRemove, , drop = FALSE]
         nGByTS <- p$nGByTS[-yToRemove]
         nByTS <- p$nByTS[-yToRemove]
         overallY <- as.integer(as.factor(overallY))
         currentL <- currentL - 1
d1c540ac
     }
ca5fb59d
     return(overallY)
98021745
 }