... | ... |
@@ -313,10 +313,7 @@ |
313 | 313 |
rm(p) |
314 | 314 |
|
315 | 315 |
# Pre-compute lgamma values |
316 |
- lgbeta <- lgamma((seq(0, max(.colSums( |
|
317 |
- counts, nrow(counts), |
|
318 |
- ncol(counts) |
|
319 |
- )))) + beta) |
|
316 |
+ lgbeta <- lgamma((seq(0, max(colSums(counts)))) + beta) |
|
320 | 317 |
lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma) |
321 | 318 |
lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta)) |
322 | 319 |
|
... | ... |
@@ -111,7 +111,8 @@ |
111 | 111 |
maxIter = 20, |
112 | 112 |
splitOnIter = -1, |
113 | 113 |
splitOnLast = FALSE, |
114 |
- verbose = FALSE) |
|
114 |
+ verbose = FALSE, |
|
115 |
+ reorder = FALSE) |
|
115 | 116 |
tempZ <- as.integer(as.factor(celdaClusters(clustLabel)$z)) |
116 | 117 |
|
117 | 118 |
# Reassign clusters with label > 1 |
... | ... |
@@ -157,7 +158,7 @@ |
157 | 158 |
alpha = alpha, |
158 | 159 |
beta = beta, |
159 | 160 |
doSample = FALSE) |
160 |
- zProb <- t(probs$probs) |
|
161 |
+ zProb <- t(as.matrix(probs$probs)) |
|
161 | 162 |
zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA |
162 | 163 |
zSecond <- apply(zProb, 1, which.max) |
163 | 164 |
|
... | ... |
@@ -279,7 +280,8 @@ |
279 | 280 |
maxIter = 20, |
280 | 281 |
splitOnIter = -1, |
281 | 282 |
splitOnLast = FALSE, |
282 |
- verbose = FALSE |
|
283 |
+ verbose = FALSE, |
|
284 |
+ reorder = FALSE |
|
283 | 285 |
) |
284 | 286 |
tempY <- as.integer(as.factor(celdaClusters(clustLabel)$y)) |
285 | 287 |
|
... | ... |
@@ -81,7 +81,7 @@ |
81 | 81 |
verbose = FALSE, |
82 | 82 |
reorder = FALSE |
83 | 83 |
) |
84 |
- overallZ <- as.integer(as.factor(res@clusters$z)) |
|
84 |
+ overallZ <- as.integer(as.factor(celdaClusters(res)$z)) |
|
85 | 85 |
currentK <- max(overallZ) |
86 | 86 |
|
87 | 87 |
counter <- 0 |
... | ... |
@@ -112,7 +112,7 @@ |
112 | 112 |
splitOnIter = -1, |
113 | 113 |
splitOnLast = FALSE, |
114 | 114 |
verbose = FALSE) |
115 |
- tempZ <- as.integer(as.factor(clustLabel@clusters$z)) |
|
115 |
+ tempZ <- as.integer(as.factor(celdaClusters(clustLabel)$z)) |
|
116 | 116 |
|
117 | 117 |
# Reassign clusters with label > 1 |
118 | 118 |
splitIx <- tempZ > 1 |
... | ... |
@@ -245,7 +245,7 @@ |
245 | 245 |
splitOnLast = FALSE, |
246 | 246 |
verbose = FALSE, |
247 | 247 |
reorder = FALSE) |
248 |
- overallY <- as.integer(as.factor(res@clusters$y)) |
|
248 |
+ overallY <- as.integer(as.factor(celdaClusters(res)$y)) |
|
249 | 249 |
currentL <- max(overallY) |
250 | 250 |
|
251 | 251 |
counter <- 0 |
... | ... |
@@ -281,7 +281,7 @@ |
281 | 281 |
splitOnLast = FALSE, |
282 | 282 |
verbose = FALSE |
283 | 283 |
) |
284 |
- tempY <- as.integer(as.factor(clustLabel@clusters$y)) |
|
284 |
+ tempY <- as.integer(as.factor(celdaClusters(clustLabel)$y)) |
|
285 | 285 |
|
286 | 286 |
# Reassign clusters with label > 1 |
287 | 287 |
splitIx <- tempY > 1 |
... | ... |
@@ -111,8 +111,7 @@ |
111 | 111 |
maxIter = 20, |
112 | 112 |
splitOnIter = -1, |
113 | 113 |
splitOnLast = FALSE, |
114 |
- verbose = FALSE |
|
115 |
- ) |
|
114 |
+ verbose = FALSE) |
|
116 | 115 |
tempZ <- as.integer(as.factor(clustLabel@clusters$z)) |
117 | 116 |
|
118 | 117 |
# Reassign clusters with label > 1 |
... | ... |
@@ -157,8 +156,7 @@ |
157 | 156 |
nM = nM, |
158 | 157 |
alpha = alpha, |
159 | 158 |
beta = beta, |
160 |
- doSample = FALSE |
|
161 |
- ) |
|
159 |
+ doSample = FALSE) |
|
162 | 160 |
zProb <- t(probs$probs) |
163 | 161 |
zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA |
164 | 162 |
zSecond <- apply(zProb, 1, which.max) |
... | ... |
@@ -180,8 +178,7 @@ |
180 | 178 |
newZ, |
181 | 179 |
previousZ, |
182 | 180 |
nGByCP, |
183 |
- currentK |
|
184 |
- ) |
|
181 |
+ currentK) |
|
185 | 182 |
nGByCP <- p$nGByCP |
186 | 183 |
mCPByS <- p$mCPByS |
187 | 184 |
llShuffle[i] <- .cCCalcLL( |
... | ... |
@@ -193,8 +190,7 @@ |
193 | 190 |
nS, |
194 | 191 |
nG, |
195 | 192 |
alpha, |
196 |
- beta |
|
197 |
- ) |
|
193 |
+ beta) |
|
198 | 194 |
previousZ <- newZ |
199 | 195 |
} |
200 | 196 |
|
... | ... |
@@ -204,14 +200,12 @@ |
204 | 200 |
ix <- overallZ == zToRemove |
205 | 201 |
overallZ[ix] <- zSecond[ix] |
206 | 202 |
|
207 |
- p <- .cCReDecomposeCounts( |
|
208 |
- counts, |
|
203 |
+ p <- .cCReDecomposeCounts(counts, |
|
209 | 204 |
s, |
210 | 205 |
overallZ, |
211 | 206 |
previousZ, |
212 | 207 |
nGByCP, |
213 |
- currentK |
|
214 |
- ) |
|
208 |
+ currentK) |
|
215 | 209 |
nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE] |
216 | 210 |
mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE] |
217 | 211 |
overallZ <- as.integer(as.factor(overallZ)) |
... | ... |
@@ -250,8 +244,7 @@ |
250 | 244 |
splitOnIter = -1, |
251 | 245 |
splitOnLast = FALSE, |
252 | 246 |
verbose = FALSE, |
253 |
- reorder = FALSE |
|
254 |
- ) |
|
247 |
+ reorder = FALSE) |
|
255 | 248 |
overallY <- as.integer(as.factor(res@clusters$y)) |
256 | 249 |
currentL <- max(overallY) |
257 | 250 |
|
... | ... |
@@ -343,8 +336,7 @@ |
343 | 336 |
lgbeta = lgbeta, |
344 | 337 |
lggamma = lggamma, |
345 | 338 |
lgdelta = lgdelta, |
346 |
- doSample = FALSE |
|
347 |
- ) |
|
339 |
+ doSample = FALSE) |
|
348 | 340 |
yProb <- t(probs$probs) |
349 | 341 |
yProb[cbind(seq(nrow(yProb)), overallY)] <- NA |
350 | 342 |
ySecond <- apply(yProb, 1, which.max) |
... | ... |
@@ -367,8 +359,7 @@ |
367 | 359 |
previousY, |
368 | 360 |
nTSByC, |
369 | 361 |
nByG, |
370 |
- currentL |
|
371 |
- ) |
|
362 |
+ currentL) |
|
372 | 363 |
nTSByC <- p$nTSByC |
373 | 364 |
nGByTS <- p$nGByTS |
374 | 365 |
nByTS <- p$nByTS |
... | ... |
@@ -382,8 +373,7 @@ |
382 | 373 |
currentL, |
383 | 374 |
beta, |
384 | 375 |
delta, |
385 |
- gamma |
|
386 |
- ) |
|
376 |
+ gamma) |
|
387 | 377 |
previousY <- newY |
388 | 378 |
} |
389 | 379 |
|
... | ... |
@@ -400,8 +390,7 @@ |
400 | 390 |
previousY, |
401 | 391 |
nTSByC, |
402 | 392 |
nByG, |
403 |
- currentL |
|
404 |
- ) |
|
393 |
+ currentL) |
|
405 | 394 |
nTSByC <- p$nTSByC[-yToRemove, , drop = FALSE] |
406 | 395 |
nGByTS <- p$nGByTS[-yToRemove] |
407 | 396 |
nByTS <- p$nByTS[-yToRemove] |
... | ... |
@@ -81,7 +81,7 @@ |
81 | 81 |
verbose = FALSE, |
82 | 82 |
reorder = FALSE |
83 | 83 |
) |
84 |
- overallZ <- as.integer(as.factor(clusters(res)$z)) |
|
84 |
+ overallZ <- as.integer(as.factor(res@clusters$z)) |
|
85 | 85 |
currentK <- max(overallZ) |
86 | 86 |
|
87 | 87 |
counter <- 0 |
... | ... |
@@ -113,7 +113,7 @@ |
113 | 113 |
splitOnLast = FALSE, |
114 | 114 |
verbose = FALSE |
115 | 115 |
) |
116 |
- tempZ <- as.integer(as.factor(clusters(clustLabel)$z)) |
|
116 |
+ tempZ <- as.integer(as.factor(clustLabel@clusters$z)) |
|
117 | 117 |
|
118 | 118 |
# Reassign clusters with label > 1 |
119 | 119 |
splitIx <- tempZ > 1 |
... | ... |
@@ -252,7 +252,7 @@ |
252 | 252 |
verbose = FALSE, |
253 | 253 |
reorder = FALSE |
254 | 254 |
) |
255 |
- overallY <- as.integer(as.factor(clusters(res)$y)) |
|
255 |
+ overallY <- as.integer(as.factor(res@clusters$y)) |
|
256 | 256 |
currentL <- max(overallY) |
257 | 257 |
|
258 | 258 |
counter <- 0 |
... | ... |
@@ -288,7 +288,7 @@ |
288 | 288 |
splitOnLast = FALSE, |
289 | 289 |
verbose = FALSE |
290 | 290 |
) |
291 |
- tempY <- as.integer(as.factor(clusters(clustLabel)$y)) |
|
291 |
+ tempY <- as.integer(as.factor(clustLabel@clusters$y)) |
|
292 | 292 |
|
293 | 293 |
# Reassign clusters with label > 1 |
294 | 294 |
splitIx <- tempY > 1 |
... | ... |
@@ -1,390 +1,412 @@ |
1 | 1 |
.initializeCluster <- function(N, |
2 |
- len, |
|
3 |
- z = NULL, |
|
4 |
- initial = NULL, |
|
5 |
- fixed = NULL) { |
|
6 |
- |
|
7 |
- # If initial values are given, then they will not be randomly initialized |
|
8 |
- if (!is.null(initial)) { |
|
9 |
- initValues <- sort(unique(initial)) |
|
10 |
- if (length(unique(initial)) != N || length(initial) != len || |
|
11 |
- !all(initValues %in% seq(N))) { |
|
12 |
- stop("'initial' needs to be a vector of length 'len'", |
|
13 |
- " containing N unique values.") |
|
14 |
- } |
|
15 |
- z <- as.integer(as.factor(initial)) |
|
16 |
- } else { |
|
17 |
- z <- rep(NA, len) |
|
2 |
+ len, |
|
3 |
+ z = NULL, |
|
4 |
+ initial = NULL, |
|
5 |
+ fixed = NULL) { |
|
6 |
+ |
|
7 |
+ # If initial values are given, then they will not be randomly initialized |
|
8 |
+ if (!is.null(initial)) { |
|
9 |
+ initValues <- sort(unique(initial)) |
|
10 |
+ if (length(unique(initial)) != N || length(initial) != len || |
|
11 |
+ !all(initValues %in% seq(N))) { |
|
12 |
+ stop( |
|
13 |
+ "'initial' needs to be a vector of length 'len'", |
|
14 |
+ " containing N unique values." |
|
15 |
+ ) |
|
18 | 16 |
} |
19 |
- |
|
20 |
- # Set any values that need to be fixed during sampling |
|
21 |
- if (!is.null(fixed)) { |
|
22 |
- fixedValues <- sort(unique(fixed)) |
|
23 |
- if (length(fixed) != len || !all(fixedValues %in% seq(N))) { |
|
24 |
- stop("'fixed' to be a vector of length 'len' where each entry is", |
|
25 |
- " one of N unique values or NA.") |
|
26 |
- } |
|
27 |
- fixedIx <- !is.na(fixed) |
|
28 |
- z[fixedIx] <- fixed[fixedIx] |
|
29 |
- zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx])) |
|
30 |
- } else { |
|
31 |
- zNotUsed <- seq(N) |
|
32 |
- fixedIx <- rep(FALSE, len) |
|
33 |
- } |
|
34 |
- |
|
35 |
- # Randomly sample remaining values |
|
36 |
- zNa <- which(is.na(z)) |
|
37 |
- if (length(zNa) > 0) { |
|
38 |
- z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE) |
|
17 |
+ z <- as.integer(as.factor(initial)) |
|
18 |
+ } else { |
|
19 |
+ z <- rep(NA, len) |
|
20 |
+ } |
|
21 |
+ |
|
22 |
+ # Set any values that need to be fixed during sampling |
|
23 |
+ if (!is.null(fixed)) { |
|
24 |
+ fixedValues <- sort(unique(fixed)) |
|
25 |
+ if (length(fixed) != len || !all(fixedValues %in% seq(N))) { |
|
26 |
+ stop( |
|
27 |
+ "'fixed' to be a vector of length 'len' where each entry is", |
|
28 |
+ " one of N unique values or NA." |
|
29 |
+ ) |
|
39 | 30 |
} |
40 |
- |
|
41 |
- # Check to ensure each value is in the vector at least once |
|
42 |
- missing <- setdiff(seq(N), z) |
|
43 |
- for (i in missing) { |
|
44 |
- ta <- sort(table(z[!fixedIx]), decreasing = TRUE) |
|
45 |
- if (ta[1] == 1) { |
|
46 |
- stop("'len' is not long enough to accomodate 'N' unique values") |
|
47 |
- } |
|
48 |
- ix <- which(z == as.integer(names(ta))[1] & !fixedIx) |
|
49 |
- z[sample(ix, 1)] <- i |
|
31 |
+ fixedIx <- !is.na(fixed) |
|
32 |
+ z[fixedIx] <- fixed[fixedIx] |
|
33 |
+ zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx])) |
|
34 |
+ } else { |
|
35 |
+ zNotUsed <- seq(N) |
|
36 |
+ fixedIx <- rep(FALSE, len) |
|
37 |
+ } |
|
38 |
+ |
|
39 |
+ # Randomly sample remaining values |
|
40 |
+ zNa <- which(is.na(z)) |
|
41 |
+ if (length(zNa) > 0) { |
|
42 |
+ z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE) |
|
43 |
+ } |
|
44 |
+ |
|
45 |
+ # Check to ensure each value is in the vector at least once |
|
46 |
+ missing <- setdiff(seq(N), z) |
|
47 |
+ for (i in missing) { |
|
48 |
+ ta <- sort(table(z[!fixedIx]), decreasing = TRUE) |
|
49 |
+ if (ta[1] == 1) { |
|
50 |
+ stop("'len' is not long enough to accomodate 'N' unique values") |
|
50 | 51 |
} |
52 |
+ ix <- which(z == as.integer(names(ta))[1] & !fixedIx) |
|
53 |
+ z[sample(ix, 1)] <- i |
|
54 |
+ } |
|
51 | 55 |
|
52 |
- return(z) |
|
56 |
+ return(z) |
|
53 | 57 |
} |
54 | 58 |
|
55 | 59 |
|
56 | 60 |
.initializeSplitZ <- function(counts, |
57 |
- K, |
|
58 |
- KSubcluster = NULL, |
|
59 |
- alpha = 1, |
|
60 |
- beta = 1, |
|
61 |
- minCell = 3) { |
|
62 |
- |
|
63 |
- s <- rep(1, ncol(counts)) |
|
64 |
- if (is.null(KSubcluster)) |
|
65 |
- KSubcluster <- ceiling(sqrt(K)) |
|
66 |
- |
|
67 |
- # Initialize the model with KSubcluster clusters |
|
68 |
- res <- .celda_C( |
|
69 |
- counts, |
|
70 |
- K = min(KSubcluster, ncol(counts)), |
|
71 |
- maxIter = 20, |
|
61 |
+ K, |
|
62 |
+ KSubcluster = NULL, |
|
63 |
+ alpha = 1, |
|
64 |
+ beta = 1, |
|
65 |
+ minCell = 3) { |
|
66 |
+ s <- rep(1, ncol(counts)) |
|
67 |
+ if (is.null(KSubcluster)) { |
|
68 |
+ KSubcluster <- ceiling(sqrt(K)) |
|
69 |
+ } |
|
70 |
+ |
|
71 |
+ # Initialize the model with KSubcluster clusters |
|
72 |
+ res <- .celda_C( |
|
73 |
+ counts, |
|
74 |
+ K = min(KSubcluster, ncol(counts)), |
|
75 |
+ maxIter = 20, |
|
76 |
+ zInitialize = "random", |
|
77 |
+ alpha = alpha, |
|
78 |
+ beta = beta, |
|
79 |
+ splitOnIter = -1, |
|
80 |
+ splitOnLast = FALSE, |
|
81 |
+ verbose = FALSE, |
|
82 |
+ reorder = FALSE |
|
83 |
+ ) |
|
84 |
+ overallZ <- as.integer(as.factor(clusters(res)$z)) |
|
85 |
+ currentK <- max(overallZ) |
|
86 |
+ |
|
87 |
+ counter <- 0 |
|
88 |
+ while (currentK < K & counter < 25) { |
|
89 |
+ # Determine which clusters are split-able |
|
90 |
+ # KRemaining <- K - currentK |
|
91 |
+ KPerCluster <- min(ceiling(K / currentK), KSubcluster) |
|
92 |
+ KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster) |
|
93 |
+ |
|
94 |
+ zTa <- tabulate(overallZ, max(overallZ)) |
|
95 |
+ |
|
96 |
+ zToSplit <- which(zTa > minCell & zTa > KToUse) |
|
97 |
+ if (length(zToSplit) > 1) { |
|
98 |
+ zToSplit <- sample(zToSplit) |
|
99 |
+ } else if (length(zToSplit) == 0) { |
|
100 |
+ break |
|
101 |
+ } |
|
102 |
+ |
|
103 |
+ # Cycle through each splitable cluster and split it up into |
|
104 |
+ # K.sublcusters |
|
105 |
+ for (i in zToSplit) { |
|
106 |
+ clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE], |
|
107 |
+ K = KToUse, |
|
72 | 108 |
zInitialize = "random", |
73 | 109 |
alpha = alpha, |
74 | 110 |
beta = beta, |
111 |
+ maxIter = 20, |
|
75 | 112 |
splitOnIter = -1, |
76 | 113 |
splitOnLast = FALSE, |
77 |
- verbose = FALSE, |
|
78 |
- reorder = FALSE |
|
114 |
+ verbose = FALSE |
|
115 |
+ ) |
|
116 |
+ tempZ <- as.integer(as.factor(clusters(clustLabel)$z)) |
|
117 |
+ |
|
118 |
+ # Reassign clusters with label > 1 |
|
119 |
+ splitIx <- tempZ > 1 |
|
120 |
+ ix <- overallZ == i |
|
121 |
+ newZ <- overallZ[ix] |
|
122 |
+ newZ[splitIx] <- currentK + tempZ[splitIx] - 1 |
|
123 |
+ |
|
124 |
+ overallZ[ix] <- newZ |
|
125 |
+ currentK <- max(overallZ) |
|
126 |
+ |
|
127 |
+ # Ensure that the maximum number of clusters does not get too large' |
|
128 |
+ if (currentK > K + 10) { |
|
129 |
+ break |
|
130 |
+ } |
|
131 |
+ } |
|
132 |
+ counter <- counter + 1 |
|
133 |
+ } |
|
134 |
+ |
|
135 |
+ # Decompose counts for likelihood calculation |
|
136 |
+ p <- .cCDecomposeCounts(counts, s, overallZ, currentK) |
|
137 |
+ nS <- p$nS |
|
138 |
+ nG <- p$nG |
|
139 |
+ nM <- p$nM |
|
140 |
+ mCPByS <- p$mCPByS |
|
141 |
+ nGByCP <- p$nGByCP |
|
142 |
+ nCP <- p$nCP |
|
143 |
+ nByC <- p$nByC |
|
144 |
+ |
|
145 |
+ # Remove clusters 1-by-1 until K is reached |
|
146 |
+ while (currentK > K) { |
|
147 |
+ # Find second best assignment give current assignments for each cell |
|
148 |
+ probs <- .cCCalcEMProbZ(counts, |
|
149 |
+ s = s, |
|
150 |
+ z = overallZ, |
|
151 |
+ K = currentK, |
|
152 |
+ mCPByS = mCPByS, |
|
153 |
+ nGByCP = nGByCP, |
|
154 |
+ nByC = nByC, |
|
155 |
+ nCP = nCP, |
|
156 |
+ nG = nG, |
|
157 |
+ nM = nM, |
|
158 |
+ alpha = alpha, |
|
159 |
+ beta = beta, |
|
160 |
+ doSample = FALSE |
|
79 | 161 |
) |
80 |
- overallZ <- as.integer(as.factor(clusters(res)$z)) |
|
81 |
- currentK <- max(overallZ) |
|
82 |
- |
|
83 |
- counter <- 0 |
|
84 |
- while (currentK < K & counter < 25) { |
|
85 |
- # Determine which clusters are split-able |
|
86 |
- # KRemaining <- K - currentK |
|
87 |
- KPerCluster <- min(ceiling(K / currentK), KSubcluster) |
|
88 |
- KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster) |
|
89 |
- |
|
90 |
- zTa <- tabulate(overallZ, max(overallZ)) |
|
91 |
- |
|
92 |
- zToSplit <- which(zTa > minCell & zTa > KToUse) |
|
93 |
- if (length(zToSplit) > 1) { |
|
94 |
- zToSplit <- sample(zToSplit) |
|
95 |
- } else if (length(zToSplit) == 0) { |
|
96 |
- break |
|
97 |
- } |
|
98 |
- |
|
99 |
- # Cycle through each splitable cluster and split it up into |
|
100 |
- # K.sublcusters |
|
101 |
- for (i in zToSplit) { |
|
102 |
- clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE], |
|
103 |
- K = KToUse, |
|
104 |
- zInitialize = "random", |
|
105 |
- alpha = alpha, |
|
106 |
- beta = beta, |
|
107 |
- maxIter = 20, |
|
108 |
- splitOnIter = -1, |
|
109 |
- splitOnLast = FALSE, |
|
110 |
- verbose = FALSE) |
|
111 |
- tempZ <- as.integer(as.factor(clusters(clustLabel)$z)) |
|
112 |
- |
|
113 |
- # Reassign clusters with label > 1 |
|
114 |
- splitIx <- tempZ > 1 |
|
115 |
- ix <- overallZ == i |
|
116 |
- newZ <- overallZ[ix] |
|
117 |
- newZ[splitIx] <- currentK + tempZ[splitIx] - 1 |
|
118 |
- |
|
119 |
- overallZ[ix] <- newZ |
|
120 |
- currentK <- max(overallZ) |
|
121 |
- |
|
122 |
- # Ensure that the maximum number of clusters does not get too large' |
|
123 |
- if (currentK > K + 10) { |
|
124 |
- break |
|
125 |
- } |
|
126 |
- } |
|
127 |
- counter <- counter + 1 |
|
162 |
+ zProb <- t(probs$probs) |
|
163 |
+ zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA |
|
164 |
+ zSecond <- apply(zProb, 1, which.max) |
|
165 |
+ |
|
166 |
+ zTa <- tabulate(overallZ, currentK) |
|
167 |
+ zNonEmpty <- which(zTa > 0) |
|
168 |
+ |
|
169 |
+ # Find worst cluster by logLik to remove |
|
170 |
+ previousZ <- overallZ |
|
171 |
+ llShuffle <- rep(NA, currentK) |
|
172 |
+ for (i in zNonEmpty) { |
|
173 |
+ ix <- overallZ == i |
|
174 |
+ newZ <- overallZ |
|
175 |
+ newZ[ix] <- zSecond[ix] |
|
176 |
+ |
|
177 |
+ p <- .cCReDecomposeCounts( |
|
178 |
+ counts, |
|
179 |
+ s, |
|
180 |
+ newZ, |
|
181 |
+ previousZ, |
|
182 |
+ nGByCP, |
|
183 |
+ currentK |
|
184 |
+ ) |
|
185 |
+ nGByCP <- p$nGByCP |
|
186 |
+ mCPByS <- p$mCPByS |
|
187 |
+ llShuffle[i] <- .cCCalcLL( |
|
188 |
+ mCPByS, |
|
189 |
+ nGByCP, |
|
190 |
+ s, |
|
191 |
+ newZ, |
|
192 |
+ currentK, |
|
193 |
+ nS, |
|
194 |
+ nG, |
|
195 |
+ alpha, |
|
196 |
+ beta |
|
197 |
+ ) |
|
198 |
+ previousZ <- newZ |
|
128 | 199 |
} |
129 | 200 |
|
130 |
- # Decompose counts for likelihood calculation |
|
131 |
- p <- .cCDecomposeCounts(counts, s, overallZ, currentK) |
|
132 |
- nS <- p$nS |
|
133 |
- nG <- p$nG |
|
134 |
- nM <- p$nM |
|
135 |
- mCPByS <- p$mCPByS |
|
136 |
- nGByCP <- p$nGByCP |
|
137 |
- nCP <- p$nCP |
|
138 |
- nByC <- p$nByC |
|
139 |
- |
|
140 |
- # Remove clusters 1-by-1 until K is reached |
|
141 |
- while (currentK > K) { |
|
142 |
- # Find second best assignment give current assignments for each cell |
|
143 |
- probs <- .cCCalcEMProbZ(counts, |
|
144 |
- s = s, |
|
145 |
- z = overallZ, |
|
146 |
- K = currentK, |
|
147 |
- mCPByS = mCPByS, |
|
148 |
- nGByCP = nGByCP, |
|
149 |
- nByC = nByC, |
|
150 |
- nCP = nCP, |
|
151 |
- nG = nG, |
|
152 |
- nM = nM, |
|
153 |
- alpha = alpha, |
|
154 |
- beta = beta, |
|
155 |
- doSample = FALSE) |
|
156 |
- zProb <- t(probs$probs) |
|
157 |
- zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA |
|
158 |
- zSecond <- apply(zProb, 1, which.max) |
|
159 |
- |
|
160 |
- zTa <- tabulate(overallZ, currentK) |
|
161 |
- zNonEmpty <- which(zTa > 0) |
|
162 |
- |
|
163 |
- # Find worst cluster by logLik to remove |
|
164 |
- previousZ <- overallZ |
|
165 |
- llShuffle <- rep(NA, currentK) |
|
166 |
- for (i in zNonEmpty) { |
|
167 |
- ix <- overallZ == i |
|
168 |
- newZ <- overallZ |
|
169 |
- newZ[ix] <- zSecond[ix] |
|
170 |
- |
|
171 |
- p <- .cCReDecomposeCounts(counts, |
|
172 |
- s, |
|
173 |
- newZ, |
|
174 |
- previousZ, |
|
175 |
- nGByCP, |
|
176 |
- currentK) |
|
177 |
- nGByCP <- p$nGByCP |
|
178 |
- mCPByS <- p$mCPByS |
|
179 |
- llShuffle[i] <- .cCCalcLL(mCPByS, |
|
180 |
- nGByCP, |
|
181 |
- s, |
|
182 |
- newZ, |
|
183 |
- currentK, |
|
184 |
- nS, |
|
185 |
- nG, |
|
186 |
- alpha, |
|
187 |
- beta) |
|
188 |
- previousZ <- newZ |
|
189 |
- } |
|
190 |
- |
|
191 |
- # Remove the cluster which had the the largest likelihood after removal |
|
192 |
- zToRemove <- which.max(llShuffle) |
|
193 |
- |
|
194 |
- ix <- overallZ == zToRemove |
|
195 |
- overallZ[ix] <- zSecond[ix] |
|
196 |
- |
|
197 |
- p <- .cCReDecomposeCounts(counts, |
|
198 |
- s, |
|
199 |
- overallZ, |
|
200 |
- previousZ, |
|
201 |
- nGByCP, |
|
202 |
- currentK) |
|
203 |
- nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE] |
|
204 |
- mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE] |
|
205 |
- overallZ <- as.integer(as.factor(overallZ)) |
|
206 |
- currentK <- currentK - 1 |
|
207 |
- } |
|
208 |
- return(overallZ) |
|
201 |
+ # Remove the cluster which had the the largest likelihood after removal |
|
202 |
+ zToRemove <- which.max(llShuffle) |
|
203 |
+ |
|
204 |
+ ix <- overallZ == zToRemove |
|
205 |
+ overallZ[ix] <- zSecond[ix] |
|
206 |
+ |
|
207 |
+ p <- .cCReDecomposeCounts( |
|
208 |
+ counts, |
|
209 |
+ s, |
|
210 |
+ overallZ, |
|
211 |
+ previousZ, |
|
212 |
+ nGByCP, |
|
213 |
+ currentK |
|
214 |
+ ) |
|
215 |
+ nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE] |
|
216 |
+ mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE] |
|
217 |
+ overallZ <- as.integer(as.factor(overallZ)) |
|
218 |
+ currentK <- currentK - 1 |
|
219 |
+ } |
|
220 |
+ return(overallZ) |
|
209 | 221 |
} |
210 | 222 |
|
211 | 223 |
|
212 | 224 |
.initializeSplitY <- function(counts, |
213 |
- L, |
|
214 |
- LSubcluster = NULL, |
|
215 |
- tempK = 100, |
|
216 |
- beta = 1, |
|
217 |
- delta = 1, |
|
218 |
- gamma = 1, |
|
219 |
- minFeature = 3) { |
|
220 |
- |
|
221 |
- if (is.null(LSubcluster)) |
|
222 |
- LSubcluster <- ceiling(sqrt(L)) |
|
223 |
- |
|
224 |
- # Collapse cells to managable number of clusters |
|
225 |
- if (!is.null(tempK) && ncol(counts) > tempK) { |
|
226 |
- z <- .initializeSplitZ(counts, K = tempK) |
|
227 |
- counts <- .colSumByGroup(counts, z, length(unique(z))) |
|
225 |
+ L, |
|
226 |
+ LSubcluster = NULL, |
|
227 |
+ tempK = 100, |
|
228 |
+ beta = 1, |
|
229 |
+ delta = 1, |
|
230 |
+ gamma = 1, |
|
231 |
+ minFeature = 3) { |
|
232 |
+ if (is.null(LSubcluster)) { |
|
233 |
+ LSubcluster <- ceiling(sqrt(L)) |
|
234 |
+ } |
|
235 |
+ |
|
236 |
+ # Collapse cells to managable number of clusters |
|
237 |
+ if (!is.null(tempK) && ncol(counts) > tempK) { |
|
238 |
+ z <- .initializeSplitZ(counts, K = tempK) |
|
239 |
+ counts <- .colSumByGroup(counts, z, length(unique(z))) |
|
240 |
+ } |
|
241 |
+ |
|
242 |
+ # Initialize the model with KSubcluster clusters |
|
243 |
+ res <- .celda_G(counts, |
|
244 |
+ L = LSubcluster, |
|
245 |
+ maxIter = 10, |
|
246 |
+ yInitialize = "random", |
|
247 |
+ beta = beta, |
|
248 |
+ delta = delta, |
|
249 |
+ gamma = gamma, |
|
250 |
+ splitOnIter = -1, |
|
251 |
+ splitOnLast = FALSE, |
|
252 |
+ verbose = FALSE, |
|
253 |
+ reorder = FALSE |
|
254 |
+ ) |
|
255 |
+ overallY <- as.integer(as.factor(clusters(res)$y)) |
|
256 |
+ currentL <- max(overallY) |
|
257 |
+ |
|
258 |
+ counter <- 0 |
|
259 |
+ while (currentL < L & counter < 25) { |
|
260 |
+ # Determine which clusters are split-able |
|
261 |
+ yTa <- tabulate(overallY, max(overallY)) |
|
262 |
+ yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster)) |
|
263 |
+ |
|
264 |
+ if (length(yToSplit) == 0) { |
|
265 |
+ break |
|
228 | 266 |
} |
229 | 267 |
|
230 |
- # Initialize the model with KSubcluster clusters |
|
231 |
- res <- .celda_G(counts, |
|
232 |
- L = LSubcluster, |
|
233 |
- maxIter = 10, |
|
268 |
+ # Cycle through each splitable cluster and split it up into |
|
269 |
+ # LSublcusters |
|
270 |
+ for (i in yToSplit) { |
|
271 |
+ # make sure the colSums of subset counts is not 0 |
|
272 |
+ countsY <- counts[overallY == i, , drop = FALSE] |
|
273 |
+ countsY <- countsY[, !(colSums(countsY) == 0)] |
|
274 |
+ |
|
275 |
+ if (ncol(countsY) == 0) { |
|
276 |
+ next |
|
277 |
+ } |
|
278 |
+ |
|
279 |
+ clustLabel <- .celda_G( |
|
280 |
+ countsY, |
|
281 |
+ L = min(LSubcluster, nrow(countsY)), |
|
234 | 282 |
yInitialize = "random", |
235 | 283 |
beta = beta, |
236 | 284 |
delta = delta, |
237 | 285 |
gamma = gamma, |
286 |
+ maxIter = 20, |
|
238 | 287 |
splitOnIter = -1, |
239 | 288 |
splitOnLast = FALSE, |
240 |
- verbose = FALSE, |
|
241 |
- reorder = FALSE |
|
289 |
+ verbose = FALSE |
|
290 |
+ ) |
|
291 |
+ tempY <- as.integer(as.factor(clusters(clustLabel)$y)) |
|
292 |
+ |
|
293 |
+ # Reassign clusters with label > 1 |
|
294 |
+ splitIx <- tempY > 1 |
|
295 |
+ ix <- overallY == i |
|
296 |
+ newY <- overallY[ix] |
|
297 |
+ newY[splitIx] <- currentL + tempY[splitIx] - 1 |
|
298 |
+ |
|
299 |
+ overallY[ix] <- newY |
|
300 |
+ currentL <- max(overallY) |
|
301 |
+ |
|
302 |
+ # Ensure that the maximum number of clusters does not get too large |
|
303 |
+ if (currentL > L + 10) { |
|
304 |
+ break |
|
305 |
+ } |
|
306 |
+ } |
|
307 |
+ counter <- counter + 1 |
|
308 |
+ } |
|
309 |
+ |
|
310 |
+ ## Decompose counts for likelihood calculation |
|
311 |
+ p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL) |
|
312 |
+ nTSByC <- p$nTSByC |
|
313 |
+ nByG <- p$nByG |
|
314 |
+ nByTS <- p$nByTS |
|
315 |
+ nGByTS <- p$nGByTS |
|
316 |
+ nM <- p$nM |
|
317 |
+ nG <- p$nG |
|
318 |
+ rm(p) |
|
319 |
+ |
|
320 |
+ # Pre-compute lgamma values |
|
321 |
+ lgbeta <- lgamma((seq(0, max(.colSums( |
|
322 |
+ counts, nrow(counts), |
|
323 |
+ ncol(counts) |
|
324 |
+ )))) + beta) |
|
325 |
+ lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma) |
|
326 |
+ lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta)) |
|
327 |
+ |
|
328 |
+ # Remove clusters 1-by-1 until L is reached |
|
329 |
+ while (currentL > L) { |
|
330 |
+ # Find second best assignment give current assignments for each cell |
|
331 |
+ probs <- .cGCalcGibbsProbY( |
|
332 |
+ counts = counts, |
|
333 |
+ y = overallY, |
|
334 |
+ L = currentL, |
|
335 |
+ nTSByC = nTSByC, |
|
336 |
+ nByTS = nByTS, |
|
337 |
+ nGByTS = nGByTS, |
|
338 |
+ nByG = nByG, |
|
339 |
+ nG = nG, |
|
340 |
+ beta = beta, |
|
341 |
+ delta = delta, |
|
342 |
+ gamma = gamma, |
|
343 |
+ lgbeta = lgbeta, |
|
344 |
+ lggamma = lggamma, |
|
345 |
+ lgdelta = lgdelta, |
|
346 |
+ doSample = FALSE |
|
242 | 347 |
) |
243 |
- overallY <- as.integer(as.factor(clusters(res)$y)) |
|
244 |
- currentL <- max(overallY) |
|
245 |
- |
|
246 |
- counter <- 0 |
|
247 |
- while (currentL < L & counter < 25) { |
|
248 |
- # Determine which clusters are split-able |
|
249 |
- yTa <- tabulate(overallY, max(overallY)) |
|
250 |
- yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster)) |
|
251 |
- |
|
252 |
- if (length(yToSplit) == 0) { |
|
253 |
- break |
|
254 |
- } |
|
255 |
- |
|
256 |
- # Cycle through each splitable cluster and split it up into |
|
257 |
- # LSublcusters |
|
258 |
- for (i in yToSplit) { |
|
259 |
- # make sure the colSums of subset counts is not 0 |
|
260 |
- countsY <- counts[overallY == i, , drop = FALSE] |
|
261 |
- countsY <- countsY[, !(colSums(countsY) == 0)] |
|
262 |
- |
|
263 |
- if (ncol(countsY) == 0) { |
|
264 |
- next |
|
265 |
- } |
|
266 |
- |
|
267 |
- clustLabel <- .celda_G( |
|
268 |
- countsY, |
|
269 |
- L = min(LSubcluster, nrow(countsY)), |
|
270 |
- yInitialize = "random", |
|
271 |
- beta = beta, |
|
272 |
- delta = delta, |
|
273 |
- gamma = gamma, |
|
274 |
- maxIter = 20, |
|
275 |
- splitOnIter = -1, |
|
276 |
- splitOnLast = FALSE, |
|
277 |
- verbose = FALSE) |
|
278 |
- tempY <- as.integer(as.factor(clusters(clustLabel)$y)) |
|
279 |
- |
|
280 |
- # Reassign clusters with label > 1 |
|
281 |
- splitIx <- tempY > 1 |
|
282 |
- ix <- overallY == i |
|
283 |
- newY <- overallY[ix] |
|
284 |
- newY[splitIx] <- currentL + tempY[splitIx] - 1 |
|
285 |
- |
|
286 |
- overallY[ix] <- newY |
|
287 |
- currentL <- max(overallY) |
|
288 |
- |
|
289 |
- # Ensure that the maximum number of clusters does not get too large |
|
290 |
- if (currentL > L + 10) { |
|
291 |
- break |
|
292 |
- } |
|
293 |
- } |
|
294 |
- counter <- counter + 1 |
|
348 |
+ yProb <- t(probs$probs) |
|
349 |
+ yProb[cbind(seq(nrow(yProb)), overallY)] <- NA |
|
350 |
+ ySecond <- apply(yProb, 1, which.max) |
|
351 |
+ |
|
352 |
+ yTa <- tabulate(overallY, currentL) |
|
353 |
+ yNonEmpty <- which(yTa > 0) |
|
354 |
+ |
|
355 |
+ # Find worst cluster by logLik to remove |
|
356 |
+ previousY <- overallY |
|
357 |
+ llShuffle <- rep(NA, currentL) |
|
358 |
+ for (i in yNonEmpty) { |
|
359 |
+ ix <- overallY == i |
|
360 |
+ newY <- overallY |
|
361 |
+ newY[ix] <- ySecond[ix] |
|
362 |
+ |
|
363 |
+ # Move arounds counts for likelihood calculation |
|
364 |
+ p <- .cGReDecomposeCounts( |
|
365 |
+ counts, |
|
366 |
+ newY, |
|
367 |
+ previousY, |
|
368 |
+ nTSByC, |
|
369 |
+ nByG, |
|
370 |
+ currentL |
|
371 |
+ ) |
|
372 |
+ nTSByC <- p$nTSByC |
|
373 |
+ nGByTS <- p$nGByTS |
|
374 |
+ nByTS <- p$nByTS |
|
375 |
+ llShuffle[i] <- .cGCalcLL( |
|
376 |
+ nTSByC, |
|
377 |
+ nByTS, |
|
378 |
+ nByG, |
|
379 |
+ nGByTS, |
|
380 |
+ nM, |
|
381 |
+ nG, |
|
382 |
+ currentL, |
|
383 |
+ beta, |
|
384 |
+ delta, |
|
385 |
+ gamma |
|
386 |
+ ) |
|
387 |
+ previousY <- newY |
|
295 | 388 |
} |
296 | 389 |
|
297 |
- ## Decompose counts for likelihood calculation |
|
298 |
- p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL) |
|
299 |
- nTSByC <- p$nTSByC |
|
300 |
- nByG <- p$nByG |
|
301 |
- nByTS <- p$nByTS |
|
302 |
- nGByTS <- p$nGByTS |
|
303 |
- nM <- p$nM |
|
304 |
- nG <- p$nG |
|
305 |
- rm(p) |
|
306 |
- |
|
307 |
- # Pre-compute lgamma values |
|
308 |
- lgbeta <- lgamma((seq(0, max(.colSums(counts, nrow(counts), |
|
309 |
- ncol(counts))))) + beta) |
|
310 |
- lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma) |
|
311 |
- lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta)) |
|
312 |
- |
|
313 |
- # Remove clusters 1-by-1 until L is reached |
|
314 |
- while (currentL > L) { |
|
315 |
- # Find second best assignment give current assignments for each cell |
|
316 |
- probs <- .cGCalcGibbsProbY( |
|
317 |
- counts = counts, |
|
318 |
- y = overallY, |
|
319 |
- L = currentL, |
|
320 |
- nTSByC = nTSByC, |
|
321 |
- nByTS = nByTS, |
|
322 |
- nGByTS = nGByTS, |
|
323 |
- nByG = nByG, |
|
324 |
- nG = nG, |
|
325 |
- beta = beta, |
|
326 |
- delta = delta, |
|
327 |
- gamma = gamma, |
|
328 |
- lgbeta = lgbeta, |
|
329 |
- lggamma = lggamma, |