.initializeCluster <- function(N,
    len,
    z = NULL,
    initial = NULL,
    fixed = NULL) {

    # If initial values are given, then they will not be randomly initialized
    if (!is.null(initial)) {
        initValues <- sort(unique(initial))
        if (length(unique(initial)) != N || length(initial) != len ||
            !all(initValues %in% seq(N))) {
            stop("'initial' needs to be a vector of length 'len'",
                " containing N unique values.")
        }
        z <- as.integer(as.factor(initial))
    } else {
        z <- rep(NA, len)
    }

    # Set any values that need to be fixed during sampling
    if (!is.null(fixed)) {
        fixedValues <- sort(unique(fixed))
        if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
            stop("'fixed' needs to be a vector of length 'len' where each",
                " entry is one of N unique values or NA.")
        }
        fixedIx <- !is.na(fixed)
        z[fixedIx] <- fixed[fixedIx]
        zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
    } else {
        zNotUsed <- seq(N)
        fixedIx <- rep(FALSE, len)
    }

    # Randomly sample remaining values
    zNa <- which(is.na(z))
    if (length(zNa) > 0) {
        z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
    }
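
    # Sampling with replacement above does not guarantee that every label in
    # seq(N) appears, so any missing labels are filled in below by pulling
    # elements out of the largest non-fixed cluster.
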
    # Check to ensure each value is in the vector at least once
    missing <- setdiff(seq(N), z)
    for (i in missing) {
        ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
        if (ta[1] == 1) {
            stop("'len' is not long enough to accommodate 'N' unique values")
        }
        ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
        z[sample(ix, 1)] <- i
    }

    return(z)
}
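
# Illustrative usage sketch for .initializeCluster (wrapped in `if (FALSE)` so
# it never runs when this file is sourced; the toy sizes and fixed labels are
# made up for the example): assign 100 items to N = 5 clusters while pinning
# the first three items to cluster 1.
if (FALSE) {
    fixedLabels <- c(1, 1, 1, rep(NA, 97))
    zInit <- .initializeCluster(N = 5, len = 100, fixed = fixedLabels)
    table(zInit) # every label in 1..5 appears at least once
    stopifnot(all(zInit[seq(3)] == 1)) # fixed entries are untouched
}
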
.initializeSplitZ <- function(counts,
    K,
    KSubcluster = NULL,
    alpha = 1,
    beta = 1,
    minCell = 3) {

    s <- rep(1, ncol(counts))
    if (is.null(KSubcluster))
        KSubcluster <- ceiling(sqrt(K))

    # Initialize the model with KSubcluster clusters
    res <- .celda_C(
        counts,
        K = KSubcluster,
        maxIter = 20,
        zInitialize = "random",
        alpha = alpha,
        beta = beta,
        splitOnIter = -1,
        splitOnLast = FALSE,
        verbose = FALSE,
        reorder = FALSE
    )
    overallZ <- as.integer(as.factor(clusters(res)$z))
    currentK <- max(overallZ)

    while (currentK < K) {
        # Determine which clusters are split-able
        # KRemaining <- K - currentK
        KPerCluster <- min(ceiling(K / currentK), KSubcluster)
        KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)

        zTa <- tabulate(overallZ, max(overallZ))

        zToSplit <- which(zTa > minCell & zTa > KToUse)
        if (length(zToSplit) > 1) {
            zToSplit <- sample(zToSplit)
        } else if (length(zToSplit) == 0) {
            break
        }

        # Cycle through each split-able cluster and split it up into
        # KToUse subclusters
        for (i in zToSplit) {
            clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
                K = KToUse,
                zInitialize = "random",
                alpha = alpha,
                beta = beta,
                maxIter = 20,
                splitOnIter = -1,
                splitOnLast = FALSE,
                verbose = FALSE)
            tempZ <- as.integer(as.factor(clusters(clustLabel)$z))

            # Reassign clusters with label > 1
            splitIx <- tempZ > 1
            ix <- overallZ == i
            newZ <- overallZ[ix]
            newZ[splitIx] <- currentK + tempZ[splitIx] - 1

            overallZ[ix] <- newZ
            currentK <- max(overallZ)

            # Ensure that the maximum number of clusters does not get too large
            if (currentK > K + 10) {
                break
            }
        }
    }
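
    # The split loop above may overshoot and produce up to K + 10 clusters.
    # The remainder of the function trims the model back down: for every cell
    # the "second best" cluster is found by masking the probability of its
    # current cluster with NA before taking which.max() (e.g. for
    # probabilities 0.7/0.2/0.1 with current cluster 1, the second best is
    # cluster 2), and the cluster whose removal costs the least log-likelihood
    # is merged away until exactly K labels remain.
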
    # Decompose counts for likelihood calculation
    p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
    nS <- p$nS
    nG <- p$nG
    nM <- p$nM
    mCPByS <- p$mCPByS
    nGByCP <- p$nGByCP
    nCP <- p$nCP
    nByC <- p$nByC

    # Remove clusters 1-by-1 until K is reached
    while (currentK > K) {
        # Find the second best assignment given the current assignments
        # for each cell
        probs <- .cCCalcEMProbZ(counts,
            s = s,
            z = overallZ,
            K = currentK,
            mCPByS = mCPByS,
            nGByCP = nGByCP,
            nByC = nByC,
            nCP = nCP,
            nG = nG,
            nM = nM,
            alpha = alpha,
            beta = beta,
            doSample = FALSE)
        zProb <- t(probs$probs)
        zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
        zSecond <- apply(zProb, 1, which.max)

        zTa <- tabulate(overallZ, currentK)
        zNonEmpty <- which(zTa > 0)

        # Find the worst cluster by logLik to remove
        previousZ <- overallZ
        llShuffle <- rep(NA, currentK)
        for (i in zNonEmpty) {
            ix <- overallZ == i
            newZ <- overallZ
            newZ[ix] <- zSecond[ix]
            p <- .cCReDecomposeCounts(counts,
                s,
                newZ,
                previousZ,
                nGByCP,
                currentK)
            nGByCP <- p$nGByCP
            mCPByS <- p$mCPByS
            llShuffle[i] <- .cCCalcLL(mCPByS,
                nGByCP,
                s,
                newZ,
                currentK,
                nS,
                nG,
                alpha,
                beta)
            previousZ <- newZ
        }

        # Remove the cluster which had the largest likelihood after removal
        zToRemove <- which.max(llShuffle)

        ix <- overallZ == zToRemove
        overallZ[ix] <- zSecond[ix]

        p <- .cCReDecomposeCounts(counts,
            s,
            overallZ,
            previousZ,
            nGByCP,
            currentK)
        nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
        mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
        overallZ <- as.integer(as.factor(overallZ))
        currentK <- currentK - 1
    }

    return(overallZ)
}
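
# Illustrative usage sketch for .initializeSplitZ (wrapped in `if (FALSE)` so
# it never runs on source; the simulated count matrix is hypothetical and only
# meant to show the calling convention): split-initialize cell cluster labels
# for K = 6.
if (FALSE) {
    set.seed(12345)
    toyCounts <- matrix(rpois(100 * 200, lambda = 2), nrow = 100, ncol = 200)
    zInit <- .initializeSplitZ(toyCounts, K = 6)
    length(zInit) # one label per cell (column)
    table(zInit) # typically exactly K = 6 labels after the trimming phase
}
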
.initializeSplitY <- function(counts,
    L,
    LSubcluster = NULL,
    tempK = 100,
    beta = 1,
    delta = 1,
    gamma = 1,
    minFeature = 3) {

    if (is.null(LSubcluster))
        LSubcluster <- ceiling(sqrt(L))

    # Collapse cells to a manageable number of clusters
    if (!is.null(tempK) && ncol(counts) > tempK) {
        z <- .initializeSplitZ(counts, K = tempK)
        counts <- .colSumByGroup(counts, z, length(unique(z)))
    }

    # Initialize the model with LSubcluster clusters
    res <- .celda_G(counts,
        L = LSubcluster,
        maxIter = 10,
        yInitialize = "random",
        beta = beta,
        delta = delta,
        gamma = gamma,
        splitOnIter = -1,
        splitOnLast = FALSE,
        verbose = FALSE,
        reorder = FALSE
    )
    overallY <- as.integer(as.factor(clusters(res)$y))
    currentL <- max(overallY)

    while (currentL < L) {
        # Determine which clusters are split-able
        yTa <- tabulate(overallY, max(overallY))
        yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))

        if (length(yToSplit) == 0) {
            break
        }

        # Cycle through each split-able cluster and split it up into
        # LSubcluster subclusters
        for (i in yToSplit) {
            # Make sure the column sums of the subsetted counts are not 0
            countsY <- counts[overallY == i, , drop = FALSE]
            countsY <- countsY[, !(colSums(countsY) == 0)]
            if (ncol(countsY) == 0) {
                next
            }

            clustLabel <- .celda_G(
                countsY,
                L = LSubcluster,
                yInitialize = "random",
                beta = beta,
                delta = delta,
                gamma = gamma,
                maxIter = 20,
                splitOnIter = -1,
                splitOnLast = FALSE,
                verbose = FALSE)
            tempY <- as.integer(as.factor(clusters(clustLabel)$y))

            # Reassign clusters with label > 1
            splitIx <- tempY > 1
            ix <- overallY == i
            newY <- overallY[ix]
            newY[splitIx] <- currentL + tempY[splitIx] - 1

            overallY[ix] <- newY
            currentL <- max(overallY)

            # Ensure that the maximum number of clusters does not get too large
            if (currentL > L + 10) {
                break
            }
        }
    }
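
    # As in .initializeSplitZ, the split loop can overshoot L by up to 10, so
    # the remainder of the function removes modules one at a time: lgamma
    # terms are pre-computed once, each feature's second best module is found
    # by masking the probability of its current module, and the module whose
    # removal costs the least log-likelihood is merged away until exactly L
    # labels remain.
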
    # Decompose counts for likelihood calculation
    p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
    nTSByC <- p$nTSByC
    nByG <- p$nByG
    nByTS <- p$nByTS
    nGByTS <- p$nGByTS
    nM <- p$nM
    nG <- p$nG
    rm(p)

    # Pre-compute lgamma values
    lgbeta <- lgamma((seq(0, max(.colSums(counts, nrow(counts),
        ncol(counts))))) + beta)
    lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
    lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))

    # Remove clusters 1-by-1 until L is reached
    while (currentL > L) {
        # Find the second best assignment given the current assignments
        # for each feature
        probs <- .cGCalcGibbsProbY(
            counts = counts,
            y = overallY,
            L = currentL,
            nTSByC = nTSByC,
            nByTS = nByTS,
            nGByTS = nGByTS,
            nByG = nByG,
            nG = nG,
            beta = beta,
            delta = delta,
            gamma = gamma,
            lgbeta = lgbeta,
            lggamma = lggamma,
            lgdelta = lgdelta,
            doSample = FALSE)
        yProb <- t(probs$probs)
        yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
        ySecond <- apply(yProb, 1, which.max)

        yTa <- tabulate(overallY, currentL)
        yNonEmpty <- which(yTa > 0)

        # Find the worst cluster by logLik to remove
        previousY <- overallY
        llShuffle <- rep(NA, currentL)
        for (i in yNonEmpty) {
            ix <- overallY == i
            newY <- overallY
            newY[ix] <- ySecond[ix]

            # Move around counts for likelihood calculation
            p <- .cGReDecomposeCounts(counts,
                newY,
                previousY,
                nTSByC,
                nByG,
                currentL)
            nTSByC <- p$nTSByC
            nGByTS <- p$nGByTS
            nByTS <- p$nByTS
            llShuffle[i] <- .cGCalcLL(nTSByC,
                nByTS,
                nByG,
                nGByTS,
                nM,
                nG,
                currentL,
                beta,
                delta,
                gamma)
            previousY <- newY
        }

        # Remove the cluster which had the largest likelihood after removal
        yToRemove <- which.max(llShuffle)

        ix <- overallY == yToRemove
        overallY[ix] <- ySecond[ix]

        # Move around counts and remove the module
        p <- .cGReDecomposeCounts(counts,
            overallY,
            previousY,
            nTSByC,
            nByG,
            currentL)
        nTSByC <- p$nTSByC[-yToRemove, , drop = FALSE]
        nGByTS <- p$nGByTS[-yToRemove]
        nByTS <- p$nByTS[-yToRemove]
        overallY <- as.integer(as.factor(overallY))
        currentL <- currentL - 1
    }

    return(overallY)
}
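
# Illustrative usage sketch for .initializeSplitY (wrapped in `if (FALSE)` so
# it never runs on source; the simulated count matrix is hypothetical):
# split-initialize feature module labels for L = 10, letting the function
# first collapse the cells down to at most tempK = 100 cell clusters.
if (FALSE) {
    set.seed(12345)
    toyCounts <- matrix(rpois(500 * 250, lambda = 1), nrow = 500, ncol = 250)
    yInit <- .initializeSplitY(toyCounts, L = 10, tempK = 100)
    length(yInit) # one label per feature (row)
    table(yInit)
}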