# .cCCalcLL = function(mCPByS, nGByCP, s, z, K, nS, nG, alpha, beta) .cCSplitZ <- function(counts, mCPByS, nGByCP, nCP, s, z, K, nS, nG, alpha, beta, zProb, maxClustersToTry = 10, minCell = 3) { ## Identify clusters to split zTa <- tabulate(z, K) zToSplit <- which(zTa >= minCell) zNonEmpty <- which(zTa > 0) if (length(zToSplit) == 0) { m <- paste0( date(), " .... Cluster sizes too small. No additional splitting was", " performed." ) return(list( z = z, mCPByS, nGByCP, nCP = nCP, message = m )) } ## Loop through each split-able Z and perform split clustSplit <- vector("list", K) for (i in zToSplit) { clustLabel <- .celda_C( counts[, z == i], K = 2, zInitialize = "random", maxIter = 5, splitOnIter = -1, splitOnLast = FALSE, verbose = FALSE ) clustSplit[[i]] <- celdaClusters(clustLabel)$z } ## Find second best assignment give current assignments for each cell zProb[cbind(seq(nrow(zProb)), z)] <- NA zSecond <- apply(zProb, 1, which.max) ## Set up initial variables zSplit <- matrix(NA, nrow = length(z), ncol = length(zToSplit) * maxClustersToTry ) zSplitLl <- rep(NA, times = length(zToSplit) * maxClustersToTry) zSplitLl[1] <- .cCCalcLL(mCPByS, nGByCP, s, z, K, nS, nG, alpha, beta) zSplit[, 1] <- z ## Select worst clusters to test for reshuffling previousZ <- z llShuffle <- rep(NA, K) for (i in zNonEmpty) { ix <- z == i newZ <- z newZ[ix] <- zSecond[ix] p <- .cCReDecomposeCounts(counts, s, newZ, previousZ, nGByCP, K) nGByCP <- p$nGByCP mCPByS <- p$mCPByS llShuffle[i] <- .cCCalcLL(mCPByS, nGByCP, s, z, K, nS, nG, alpha, beta) previousZ <- newZ } zToShuffle <- utils::head(order(llShuffle, decreasing = TRUE, na.last = NA), n = maxClustersToTry ) pairs <- c(NA, NA) splitIx <- 2 for (i in zToShuffle) { otherClusters <- setdiff(zToSplit, i) for (j in otherClusters) { newZ <- z ## Assign cluster i to the next most similar cluster (excluding ## cluster j) ## as defined above by the correlation ixToMove <- z == i newZ[ixToMove] <- zSecond[ixToMove] ## Split cluster j according to the clustering defined above ixToSplit <- z == j newZ[ixToSplit] <- ifelse(clustSplit[[j]] == 1, j, i) p <- .cCReDecomposeCounts(counts, s, newZ, previousZ, nGByCP, K) nGByCP <- p$nGByCP mCPByS <- p$mCPByS ## Calculate likelihood of split zSplitLl[splitIx] <- .cCCalcLL( mCPByS, nGByCP, s, z, K, nS, nG, alpha, beta ) zSplit[, splitIx] <- newZ splitIx <- splitIx + 1L previousZ <- newZ pairs <- rbind(pairs, c(i, j)) } } select <- which.max(zSplitLl) if (select == 1) { m <- paste0(date(), " .... No additional splitting was performed.") } else { m <- paste0( date(), " .... Cluster ", pairs[select, 1], " was reassigned and cluster ", pairs[select, 2], " was split in two." ) } p <- .cCReDecomposeCounts(counts, s, zSplit[, select], previousZ, nGByCP, K) return(list( z = zSplit[, select], mCPByS = p$mCPByS, nGByCP = p$nGByCP, nCP = p$nCP, message = m )) } # .cCGCalcLL = function(K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, # nS, nG, alpha, beta, delta, gamma) .cCGSplitZ <- function(counts, mCPByS, nTSByC, nTSByCP, nByG, nByTS, nGByTS, nCP, s, z, K, L, nS, nG, alpha, beta, delta, gamma, zProb, maxClustersToTry = 10, minCell = 3) { ## Identify clusters to split zTa <- tabulate(z, K) zToSplit <- which(zTa >= minCell) zNonEmpty <- which(zTa > 0) if (length(zToSplit) == 0) { m <- paste0( date(), " .... Cluster sizes too small. No additional splitting was", " performed." ) return(list( z = z, mCPByS = mCPByS, nTSByCP = nTSByCP, nCP = nCP, message = m )) } ## Loop through each split-able Z and perform split clustSplit <- vector("list", K) for (i in zToSplit) { clustLabel <- .celda_C(counts[, z == i], K = 2, zInitialize = "random", maxIter = 5, splitOnIter = -1, splitOnLast = FALSE, verbose = FALSE ) clustSplit[[i]] <- celdaClusters(clustLabel)$z } ## Find second best assignment give current assignments for each cell zProb[cbind(seq(nrow(zProb)), z)] <- NA zSecond <- apply(zProb, 1, which.max) ## Set up initial variables zSplit <- matrix(NA, nrow = length(z), ncol = length(zToSplit) * maxClustersToTry ) zSplitLl <- rep(NA, ncol = length(zToSplit) * maxClustersToTry) zSplitLl[1] <- .cCGCalcLL( K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, nS, nG, alpha, beta, delta, gamma ) zSplit[, 1] <- z ## Select worst clusters to test for reshuffling previousZ <- z llShuffle <- rep(NA, K) for (i in zNonEmpty) { ix <- z == i newZ <- z newZ[ix] <- zSecond[ix] p <- .cCReDecomposeCounts(nTSByC, s, newZ, previousZ, nTSByCP, K) nTSByCP <- p$nGByCP mCPByS <- p$mCPByS llShuffle[i] <- .cCGCalcLL( K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, nS, nG, alpha, beta, delta, gamma ) previousZ <- newZ } zToShuffle <- utils::head(order(llShuffle, decreasing = TRUE, na.last = NA), n = maxClustersToTry ) pairs <- c(NA, NA) splitIx <- 2 for (i in zToShuffle) { otherClusters <- setdiff(zToSplit, i) for (j in otherClusters) { newZ <- z ## Assign cluster i to the next most similar cluster (excluding ## cluster j) ## as defined above by the correlation ixToMove <- z == i newZ[ixToMove] <- zSecond[ixToMove] ## Split cluster j according to the clustering defined above ixToSplit <- z == j newZ[ixToSplit] <- ifelse(clustSplit[[j]] == 1, j, i) p <- .cCReDecomposeCounts(nTSByC, s, newZ, previousZ, nTSByCP, K) nTSByCP <- p$nGByCP mCPByS <- p$mCPByS ## Calculate likelihood of split zSplitLl[splitIx] <- .cCGCalcLL( K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, nS, nG, alpha, beta, delta, gamma ) zSplit[, splitIx] <- newZ splitIx <- splitIx + 1L previousZ <- newZ pairs <- rbind(pairs, c(i, j)) } } select <- which.max(zSplitLl) if (select == 1) { m <- paste0(date(), " .... No additional splitting was performed.") } else { m <- paste0( date(), " .... Cluster ", pairs[select, 1], " was reassigned and cluster ", pairs[select, 2], " was split in two." ) } p <- .cCReDecomposeCounts( nTSByC, s, zSplit[, select], previousZ, nTSByCP, K ) return(list( z = zSplit[, select], mCPByS = p$mCPByS, nTSByCP = p$nGByCP, nCP = p$nCP, message = m )) } .cCGSplitY <- function(counts, y, mCPByS, nGByCP, nTSByC, nTSByCP, nByG, nByTS, nGByTS, nCP, s, z, K, L, nS, nG, alpha, beta, delta, gamma, yProb, maxClustersToTry = 10, KSubclusters = 10, minCell = 3) { ######################### ## First, the cell dimension of the original matrix will be reduced by ## splitting each z cluster into 'KSubclusters'. ######################### ## This will not be as big as the original matrix (which can take a lot of ## time to process with large number of cells), but not as small as the ## 'nGByCP' with current z assignments zTa <- tabulate(z, K) zNonEmpty <- which(zTa > 0) tempZ <- rep(0, length(z)) currentTopZ <- 0 for (i in zNonEmpty) { ix <- z == i if (zTa[i] <= KSubclusters) { tempZ[ix] <- seq(currentTopZ + 1, currentTopZ + zTa[i]) } else { clustLabel <- .celda_C(counts[, z == i], K = KSubclusters, zInitialize = "random", maxIter = 5, splitOnIter = -1, splitOnLast = FALSE, verbose = FALSE ) tempZ[ix] <- celdaClusters(clustLabel)$z + currentTopZ } currentTopZ <- max(tempZ, na.rm = TRUE) } ## Decompose counts according to new/temp z labels tempNGByCP <- .colSumByGroup(counts, group = tempZ, K = currentTopZ) ######################### ## Second, different y splits will be estimated and tested ######################### ## Identify clusters to split yTa <- tabulate(y, L) yToSplit <- which(yTa >= minCell) yNonEmpty <- which(yTa > 0) if (length(yToSplit) == 0) { m <- paste0( date(), " .... Cluster sizes too small. No additional splitting was", " performed." ) return(list( y = y, mCPByS = mCPByS, nTSByCP = nTSByCP, nCP = nCP, message = m )) } ## Loop through each split-able Z and perform split clustSplit <- vector("list", L) for (i in yToSplit) { clustLabel <- .celda_G(tempNGByCP[y == i, ], L = 2, yInitialize = "random", maxIter = 5, splitOnIter = -1, splitOnLast = FALSE, verbose = FALSE ) clustSplit[[i]] <- celdaClusters(clustLabel)$y } ## Find second best assignment give current assignments for each cell yProb[cbind(seq(nrow(yProb)), y)] <- NA ySecond <- apply(yProb, 1, which.max) ## Set up initial variables ySplit <- matrix(NA, nrow = length(y), ncol = length(yToSplit) * maxClustersToTry ) ySplitLl <- rep(NA, ncol = length(yToSplit) * maxClustersToTry) ySplitLl[1] <- .cCGCalcLL( K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, nS, nG, alpha, beta, delta, gamma ) ySplit[, 1] <- y ## Select worst clusters to test for reshuffling previousY <- y llShuffle <- rep(NA, L) for (i in yNonEmpty) { ix <- y == i newY <- y newY[ix] <- ySecond[ix] p <- .cGReDecomposeCounts(nGByCP, newY, previousY, nTSByCP, nByG, L) nTSByCP <- p$nTSByC nByTS <- p$nByTS nGByTS <- p$nGByTS llShuffle[i] <- .cCGCalcLL( K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, nS, nG, alpha, beta, delta, gamma ) previousY <- newY } yToShuffle <- utils::head(order(llShuffle, decreasing = TRUE, na.last = NA), n = maxClustersToTry ) pairs <- c(NA, NA) splitIx <- 2 for (i in yToShuffle) { otherClusters <- setdiff(yToSplit, i) for (j in otherClusters) { newY <- y ## Assign cluster i to the next most similar cluster (excluding ## cluster j) ## as defined above by the correlation ixToMove <- y == i newY[ixToMove] <- ySecond[ixToMove] ## Split cluster j according to the clustering defined above ixToSplit <- y == j newY[ixToSplit] <- ifelse(clustSplit[[j]] == 1, j, i) p <- .cGReDecomposeCounts(nGByCP, newY, previousY, nTSByCP, nByG, L) nTSByCP <- p$nTSByC nByTS <- p$nByTS nGByTS <- p$nGByTS ## Calculate likelihood of split ySplitLl[splitIx] <- .cCGCalcLL( K, L, mCPByS, nTSByCP, nByG, nByTS, nGByTS, nS, nG, alpha, beta, delta, gamma ) ySplit[, splitIx] <- newY splitIx <- splitIx + 1L previousY <- newY pairs <- rbind(pairs, c(i, j)) } } select <- which.max(ySplitLl) if (select == 1) { m <- paste0(date(), " .... No additional splitting was performed.") } else { m <- paste0( date(), " .... Cluster ", pairs[select, 1], " was reassigned and cluster ", pairs[select, 2], " was split in two." ) } p <- .cGReDecomposeCounts( nGByCP, ySplit[, select], previousY, nTSByCP, nByG, L ) return(list( y = ySplit[, select], nTSByCP = p$nTSByC, nByTS = p$nByTS, nGByTS = p$nGByTS, message = m )) } # .cGCalcLL = function(nTSByC, nByTS, nByG, nGByTS, nM, nG, L, beta, delta, # gamma) { .cGSplitY <- function(counts, y, nTSByC, nByTS, nByG, nGByTS, nM, nG, L, beta, delta, gamma, yProb, minFeature = 3, maxClustersToTry = 10) { ## Identify clusters to split yTa <- table(factor(y, levels = seq(L))) yToSplit <- which(yTa >= minFeature) yNonEmpty <- which(yTa > 0) if (length(yToSplit) == 0) { m <- paste0( date(), " .... Cluster sizes too small. No additional splitting was", " performed." ) return(list( y = y, nTSByC = nTSByC, nByTS = nByTS, nGByTS = nGByTS, message = m )) } ## Loop through each split-able y and find best split clustSplit <- vector("list", L) for (i in yToSplit) { clustLabel <- .celda_G(counts[y == i, ], L = 2, yInitialize = "random", maxIter = 5, splitOnIter = -1, splitOnLast = FALSE, verbose = FALSE ) clustSplit[[i]] <- celdaClusters(clustLabel)$y } ## Find second best assignment give current assignments for each cell yProb[cbind(seq(nrow(yProb)), y)] <- NA ySecond <- apply(yProb, 1, which.max) ## Set up initial variables ySplit <- matrix(NA, nrow = length(y), ncol = length(yToSplit) * maxClustersToTry ) ySplitLl <- rep(NA, ncol = length(yToSplit) * maxClustersToTry) ySplitLl[1] <- .cGCalcLL( nTSByC, nByTS, nByG, nGByTS, nM, nG, L, beta, delta, gamma ) ySplit[, 1] <- y ## Select worst clusters to test for reshuffling llShuffle <- rep(NA, L) previousY <- y for (i in yNonEmpty) { ix <- y == i newY <- y newY[ix] <- ySecond[ix] p <- .cGReDecomposeCounts(counts, newY, previousY, nTSByC, nByG, L) llShuffle[i] <- .cGCalcLL( p$nTSByC, p$nByTS, nByG, p$nGByTS, nM, nG, L, beta, delta, gamma ) previousY <- newY } yToShuffle <- utils::head(order(llShuffle, decreasing = TRUE, na.last = NA), n = maxClustersToTry ) pairs <- c(NA, NA) splitIx <- 2 for (i in yToShuffle) { otherClusters <- setdiff(yToSplit, i) for (j in otherClusters) { newY <- y ## Assign cluster i to the next most similar cluster (excluding ## cluster j) ## as defined above by the spearman correlation ixToMove <- y == i newY[ixToMove] <- ySecond[ixToMove] ## Split cluster j according to the clustering defined above ixToSplit <- y == j newY[ixToSplit] <- ifelse(clustSplit[[j]] == 1, j, i) ## Calculate likelihood of split p <- .cGReDecomposeCounts(counts, newY, previousY, nTSByC, nByG, L) ySplitLl[splitIx] <- .cGCalcLL( p$nTSByC, p$nByTS, nByG, p$nGByTS, nM, nG, L, beta, delta, gamma ) ySplit[, splitIx] <- newY splitIx <- splitIx + 1L previousY <- newY pairs <- rbind(pairs, c(i, j)) } } select <- which.max(ySplitLl) if (select == 1) { m <- paste0(date(), " .... No additional splitting was performed.") } else { m <- paste0( date(), " .... Cluster ", pairs[select, 1], " was reassigned and cluster ", pairs[select, 2], " was split in two." ) } p <- .cGReDecomposeCounts( counts, ySplit[, select], previousY, nTSByC, nByG, L ) return(list( y = ySplit[, select], nTSByC = p$nTSByC, nByTS = p$nByTS, nGByTS = p$nGByTS, message = m )) }