Browse code

Fixed bugs when input is a sparse matrix

Joshua D. Campbell authored on 17/07/2021 13:56:39
Showing 1 changed files
... ...
@@ -313,10 +313,7 @@
313 313
   rm(p)
314 314
 
315 315
   # Pre-compute lgamma values
316
-  lgbeta <- lgamma((seq(0, max(.colSums(
317
-    counts, nrow(counts),
318
-    ncol(counts)
319
-  )))) + beta)
316
+  lgbeta <- lgamma((seq(0, max(colSums(counts)))) + beta)
320 317
   lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
321 318
   lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))
322 319
 
Browse code

Added support for dgCMatrix class to celda_C functions

Joshua D. Campbell authored on 05/04/2021 00:33:51
Showing 1 changed files
... ...
@@ -111,7 +111,8 @@
111 111
         maxIter = 20,
112 112
         splitOnIter = -1,
113 113
         splitOnLast = FALSE,
114
-        verbose = FALSE)
114
+        verbose = FALSE,
115
+        reorder = FALSE)
115 116
       tempZ <- as.integer(as.factor(celdaClusters(clustLabel)$z))
116 117
 
117 118
       # Reassign clusters with label > 1
... ...
@@ -157,7 +158,7 @@
157 158
       alpha = alpha,
158 159
       beta = beta,
159 160
       doSample = FALSE)
160
-    zProb <- t(probs$probs)
161
+    zProb <- t(as.matrix(probs$probs))
161 162
     zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
162 163
     zSecond <- apply(zProb, 1, which.max)
163 164
 
... ...
@@ -279,7 +280,8 @@
279 280
         maxIter = 20,
280 281
         splitOnIter = -1,
281 282
         splitOnLast = FALSE,
282
-        verbose = FALSE
283
+        verbose = FALSE,
284
+        reorder = FALSE
283 285
       )
284 286
       tempY <- as.integer(as.factor(celdaClusters(clustLabel)$y))
285 287
 
Browse code

update docs

zhewa authored on 21/05/2020 09:04:10
Showing 1 changed files
... ...
@@ -81,7 +81,7 @@
81 81
     verbose = FALSE,
82 82
     reorder = FALSE
83 83
   )
84
-  overallZ <- as.integer(as.factor(res@clusters$z))
84
+  overallZ <- as.integer(as.factor(celdaClusters(res)$z))
85 85
   currentK <- max(overallZ)
86 86
 
87 87
   counter <- 0
... ...
@@ -112,7 +112,7 @@
112 112
         splitOnIter = -1,
113 113
         splitOnLast = FALSE,
114 114
         verbose = FALSE)
115
-      tempZ <- as.integer(as.factor(clustLabel@clusters$z))
115
+      tempZ <- as.integer(as.factor(celdaClusters(clustLabel)$z))
116 116
 
117 117
       # Reassign clusters with label > 1
118 118
       splitIx <- tempZ > 1
... ...
@@ -245,7 +245,7 @@
245 245
     splitOnLast = FALSE,
246 246
     verbose = FALSE,
247 247
     reorder = FALSE)
248
-  overallY <- as.integer(as.factor(res@clusters$y))
248
+  overallY <- as.integer(as.factor(celdaClusters(res)$y))
249 249
   currentL <- max(overallY)
250 250
 
251 251
   counter <- 0
... ...
@@ -281,7 +281,7 @@
281 281
         splitOnLast = FALSE,
282 282
         verbose = FALSE
283 283
       )
284
-      tempY <- as.integer(as.factor(clustLabel@clusters$y))
284
+      tempY <- as.integer(as.factor(celdaClusters(clustLabel)$y))
285 285
 
286 286
       # Reassign clusters with label > 1
287 287
       splitIx <- tempY > 1
Browse code

sce geneSetEnrich, getDecisions, model_performance

zhewa authored on 17/05/2020 11:35:18
Showing 1 changed files
... ...
@@ -111,8 +111,7 @@
111 111
         maxIter = 20,
112 112
         splitOnIter = -1,
113 113
         splitOnLast = FALSE,
114
-        verbose = FALSE
115
-      )
114
+        verbose = FALSE)
116 115
       tempZ <- as.integer(as.factor(clustLabel@clusters$z))
117 116
 
118 117
       # Reassign clusters with label > 1
... ...
@@ -157,8 +156,7 @@
157 156
       nM = nM,
158 157
       alpha = alpha,
159 158
       beta = beta,
160
-      doSample = FALSE
161
-    )
159
+      doSample = FALSE)
162 160
     zProb <- t(probs$probs)
163 161
     zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
164 162
     zSecond <- apply(zProb, 1, which.max)
... ...
@@ -180,8 +178,7 @@
180 178
         newZ,
181 179
         previousZ,
182 180
         nGByCP,
183
-        currentK
184
-      )
181
+        currentK)
185 182
       nGByCP <- p$nGByCP
186 183
       mCPByS <- p$mCPByS
187 184
       llShuffle[i] <- .cCCalcLL(
... ...
@@ -193,8 +190,7 @@
193 190
         nS,
194 191
         nG,
195 192
         alpha,
196
-        beta
197
-      )
193
+        beta)
198 194
       previousZ <- newZ
199 195
     }
200 196
 
... ...
@@ -204,14 +200,12 @@
204 200
     ix <- overallZ == zToRemove
205 201
     overallZ[ix] <- zSecond[ix]
206 202
 
207
-    p <- .cCReDecomposeCounts(
208
-      counts,
203
+    p <- .cCReDecomposeCounts(counts,
209 204
       s,
210 205
       overallZ,
211 206
       previousZ,
212 207
       nGByCP,
213
-      currentK
214
-    )
208
+      currentK)
215 209
     nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
216 210
     mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
217 211
     overallZ <- as.integer(as.factor(overallZ))
... ...
@@ -250,8 +244,7 @@
250 244
     splitOnIter = -1,
251 245
     splitOnLast = FALSE,
252 246
     verbose = FALSE,
253
-    reorder = FALSE
254
-  )
247
+    reorder = FALSE)
255 248
   overallY <- as.integer(as.factor(res@clusters$y))
256 249
   currentL <- max(overallY)
257 250
 
... ...
@@ -343,8 +336,7 @@
343 336
       lgbeta = lgbeta,
344 337
       lggamma = lggamma,
345 338
       lgdelta = lgdelta,
346
-      doSample = FALSE
347
-    )
339
+      doSample = FALSE)
348 340
     yProb <- t(probs$probs)
349 341
     yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
350 342
     ySecond <- apply(yProb, 1, which.max)
... ...
@@ -367,8 +359,7 @@
367 359
         previousY,
368 360
         nTSByC,
369 361
         nByG,
370
-        currentL
371
-      )
362
+        currentL)
372 363
       nTSByC <- p$nTSByC
373 364
       nGByTS <- p$nGByTS
374 365
       nByTS <- p$nByTS
... ...
@@ -382,8 +373,7 @@
382 373
         currentL,
383 374
         beta,
384 375
         delta,
385
-        gamma
386
-      )
376
+        gamma)
387 377
       previousY <- newY
388 378
     }
389 379
 
... ...
@@ -400,8 +390,7 @@
400 390
       previousY,
401 391
       nTSByC,
402 392
       nByG,
403
-      currentL
404
-    )
393
+      currentL)
405 394
     nTSByC <- p$nTSByC[-yToRemove, , drop = FALSE]
406 395
     nGByTS <- p$nGByTS[-yToRemove]
407 396
     nByTS <- p$nByTS[-yToRemove]
Browse code

sce celdaGridSearch update docs

zhewa authored on 10/05/2020 13:27:03
Showing 1 changed files
... ...
@@ -81,7 +81,7 @@
81 81
     verbose = FALSE,
82 82
     reorder = FALSE
83 83
   )
84
-  overallZ <- as.integer(as.factor(clusters(res)$z))
84
+  overallZ <- as.integer(as.factor(res@clusters$z))
85 85
   currentK <- max(overallZ)
86 86
 
87 87
   counter <- 0
... ...
@@ -113,7 +113,7 @@
113 113
         splitOnLast = FALSE,
114 114
         verbose = FALSE
115 115
       )
116
-      tempZ <- as.integer(as.factor(clusters(clustLabel)$z))
116
+      tempZ <- as.integer(as.factor(clustLabel@clusters$z))
117 117
 
118 118
       # Reassign clusters with label > 1
119 119
       splitIx <- tempZ > 1
... ...
@@ -252,7 +252,7 @@
252 252
     verbose = FALSE,
253 253
     reorder = FALSE
254 254
   )
255
-  overallY <- as.integer(as.factor(clusters(res)$y))
255
+  overallY <- as.integer(as.factor(res@clusters$y))
256 256
   currentL <- max(overallY)
257 257
 
258 258
   counter <- 0
... ...
@@ -288,7 +288,7 @@
288 288
         splitOnLast = FALSE,
289 289
         verbose = FALSE
290 290
       )
291
-      tempY <- as.integer(as.factor(clusters(clustLabel)$y))
291
+      tempY <- as.integer(as.factor(clustLabel@clusters$y))
292 292
 
293 293
       # Reassign clusters with label > 1
294 294
       splitIx <- tempY > 1
Browse code

Ran styler and fixed lints

Joshua D. Campbell authored on 06/04/2020 23:58:56
Showing 1 changed files
... ...
@@ -1,390 +1,412 @@
1 1
 .initializeCluster <- function(N,
2
-    len,
3
-    z = NULL,
4
-    initial = NULL,
5
-    fixed = NULL) {
6
-
7
-    # If initial values are given, then they will not be randomly initialized
8
-    if (!is.null(initial)) {
9
-        initValues <- sort(unique(initial))
10
-        if (length(unique(initial)) != N || length(initial) != len ||
11
-            !all(initValues %in% seq(N))) {
12
-                stop("'initial' needs to be a vector of length 'len'",
13
-                    " containing N unique values.")
14
-        }
15
-        z <- as.integer(as.factor(initial))
16
-    } else {
17
-        z <- rep(NA, len)
2
+                               len,
3
+                               z = NULL,
4
+                               initial = NULL,
5
+                               fixed = NULL) {
6
+
7
+  # If initial values are given, then they will not be randomly initialized
8
+  if (!is.null(initial)) {
9
+    initValues <- sort(unique(initial))
10
+    if (length(unique(initial)) != N || length(initial) != len ||
11
+      !all(initValues %in% seq(N))) {
12
+      stop(
13
+        "'initial' needs to be a vector of length 'len'",
14
+        " containing N unique values."
15
+      )
18 16
     }
19
-
20
-    # Set any values that need to be fixed during sampling
21
-    if (!is.null(fixed)) {
22
-        fixedValues <- sort(unique(fixed))
23
-        if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
24
-            stop("'fixed' to be a vector of length 'len' where each entry is",
25
-                " one of N unique values or NA.")
26
-        }
27
-        fixedIx <- !is.na(fixed)
28
-        z[fixedIx] <- fixed[fixedIx]
29
-        zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
30
-    } else {
31
-        zNotUsed <- seq(N)
32
-        fixedIx <- rep(FALSE, len)
33
-    }
34
-
35
-    # Randomly sample remaining values
36
-    zNa <- which(is.na(z))
37
-    if (length(zNa) > 0) {
38
-        z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
17
+    z <- as.integer(as.factor(initial))
18
+  } else {
19
+    z <- rep(NA, len)
20
+  }
21
+
22
+  # Set any values that need to be fixed during sampling
23
+  if (!is.null(fixed)) {
24
+    fixedValues <- sort(unique(fixed))
25
+    if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
26
+      stop(
27
+        "'fixed' to be a vector of length 'len' where each entry is",
28
+        " one of N unique values or NA."
29
+      )
39 30
     }
40
-
41
-    # Check to ensure each value is in the vector at least once
42
-    missing <- setdiff(seq(N), z)
43
-    for (i in missing) {
44
-        ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
45
-        if (ta[1] == 1) {
46
-            stop("'len' is not long enough to accomodate 'N' unique values")
47
-        }
48
-        ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
49
-        z[sample(ix, 1)] <- i
31
+    fixedIx <- !is.na(fixed)
32
+    z[fixedIx] <- fixed[fixedIx]
33
+    zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
34
+  } else {
35
+    zNotUsed <- seq(N)
36
+    fixedIx <- rep(FALSE, len)
37
+  }
38
+
39
+  # Randomly sample remaining values
40
+  zNa <- which(is.na(z))
41
+  if (length(zNa) > 0) {
42
+    z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
43
+  }
44
+
45
+  # Check to ensure each value is in the vector at least once
46
+  missing <- setdiff(seq(N), z)
47
+  for (i in missing) {
48
+    ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
49
+    if (ta[1] == 1) {
50
+      stop("'len' is not long enough to accomodate 'N' unique values")
50 51
     }
52
+    ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
53
+    z[sample(ix, 1)] <- i
54
+  }
51 55
 
52
-    return(z)
56
+  return(z)
53 57
 }
54 58
 
55 59
 
56 60
 .initializeSplitZ <- function(counts,
57
-    K,
58
-    KSubcluster = NULL,
59
-    alpha = 1,
60
-    beta = 1,
61
-    minCell = 3) {
62
-
63
-    s <- rep(1, ncol(counts))
64
-    if (is.null(KSubcluster))
65
-        KSubcluster <- ceiling(sqrt(K))
66
-
67
-    # Initialize the model with KSubcluster clusters
68
-    res <- .celda_C(
69
-        counts,
70
-        K = min(KSubcluster, ncol(counts)),
71
-        maxIter = 20,
61
+                              K,
62
+                              KSubcluster = NULL,
63
+                              alpha = 1,
64
+                              beta = 1,
65
+                              minCell = 3) {
66
+  s <- rep(1, ncol(counts))
67
+  if (is.null(KSubcluster)) {
68
+    KSubcluster <- ceiling(sqrt(K))
69
+  }
70
+
71
+  # Initialize the model with KSubcluster clusters
72
+  res <- .celda_C(
73
+    counts,
74
+    K = min(KSubcluster, ncol(counts)),
75
+    maxIter = 20,
76
+    zInitialize = "random",
77
+    alpha = alpha,
78
+    beta = beta,
79
+    splitOnIter = -1,
80
+    splitOnLast = FALSE,
81
+    verbose = FALSE,
82
+    reorder = FALSE
83
+  )
84
+  overallZ <- as.integer(as.factor(clusters(res)$z))
85
+  currentK <- max(overallZ)
86
+
87
+  counter <- 0
88
+  while (currentK < K & counter < 25) {
89
+    # Determine which clusters are split-able
90
+    # KRemaining <- K - currentK
91
+    KPerCluster <- min(ceiling(K / currentK), KSubcluster)
92
+    KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)
93
+
94
+    zTa <- tabulate(overallZ, max(overallZ))
95
+
96
+    zToSplit <- which(zTa > minCell & zTa > KToUse)
97
+    if (length(zToSplit) > 1) {
98
+      zToSplit <- sample(zToSplit)
99
+    } else if (length(zToSplit) == 0) {
100
+      break
101
+    }
102
+
103
+    # Cycle through each splitable cluster and split it up into
104
+    # K.sublcusters
105
+    for (i in zToSplit) {
106
+      clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
107
+        K = KToUse,
72 108
         zInitialize = "random",
73 109
         alpha = alpha,
74 110
         beta = beta,
111
+        maxIter = 20,
75 112
         splitOnIter = -1,
76 113
         splitOnLast = FALSE,
77
-        verbose = FALSE,
78
-        reorder = FALSE
114
+        verbose = FALSE
115
+      )
116
+      tempZ <- as.integer(as.factor(clusters(clustLabel)$z))
117
+
118
+      # Reassign clusters with label > 1
119
+      splitIx <- tempZ > 1
120
+      ix <- overallZ == i
121
+      newZ <- overallZ[ix]
122
+      newZ[splitIx] <- currentK + tempZ[splitIx] - 1
123
+
124
+      overallZ[ix] <- newZ
125
+      currentK <- max(overallZ)
126
+
127
+      # Ensure that the maximum number of clusters does not get too large'
128
+      if (currentK > K + 10) {
129
+        break
130
+      }
131
+    }
132
+    counter <- counter + 1
133
+  }
134
+
135
+  # Decompose counts for likelihood calculation
136
+  p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
137
+  nS <- p$nS
138
+  nG <- p$nG
139
+  nM <- p$nM
140
+  mCPByS <- p$mCPByS
141
+  nGByCP <- p$nGByCP
142
+  nCP <- p$nCP
143
+  nByC <- p$nByC
144
+
145
+  # Remove clusters 1-by-1 until K is reached
146
+  while (currentK > K) {
147
+    # Find second best assignment give current assignments for each cell
148
+    probs <- .cCCalcEMProbZ(counts,
149
+      s = s,
150
+      z = overallZ,
151
+      K = currentK,
152
+      mCPByS = mCPByS,
153
+      nGByCP = nGByCP,
154
+      nByC = nByC,
155
+      nCP = nCP,
156
+      nG = nG,
157
+      nM = nM,
158
+      alpha = alpha,
159
+      beta = beta,
160
+      doSample = FALSE
79 161
     )
80
-    overallZ <- as.integer(as.factor(clusters(res)$z))
81
-    currentK <- max(overallZ)
82
-
83
-    counter <- 0
84
-    while (currentK < K & counter < 25) {
85
-        # Determine which clusters are split-able
86
-        # KRemaining <- K - currentK
87
-        KPerCluster <- min(ceiling(K / currentK), KSubcluster)
88
-        KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)
89
-
90
-        zTa <- tabulate(overallZ, max(overallZ))
91
-
92
-        zToSplit <- which(zTa > minCell & zTa > KToUse)
93
-        if (length(zToSplit) > 1) {
94
-            zToSplit <- sample(zToSplit)
95
-        } else if (length(zToSplit) == 0) {
96
-            break
97
-        }
98
-
99
-        # Cycle through each splitable cluster and split it up into
100
-        # K.sublcusters
101
-        for (i in zToSplit) {
102
-            clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
103
-                K = KToUse,
104
-                zInitialize = "random",
105
-                alpha = alpha,
106
-                beta = beta,
107
-                maxIter = 20,
108
-                splitOnIter = -1,
109
-                splitOnLast = FALSE,
110
-                verbose = FALSE)
111
-            tempZ <- as.integer(as.factor(clusters(clustLabel)$z))
112
-
113
-            # Reassign clusters with label > 1
114
-            splitIx <- tempZ > 1
115
-            ix <- overallZ == i
116
-            newZ <- overallZ[ix]
117
-            newZ[splitIx] <- currentK + tempZ[splitIx] - 1
118
-
119
-            overallZ[ix] <- newZ
120
-            currentK <- max(overallZ)
121
-
122
-            # Ensure that the maximum number of clusters does not get too large'
123
-            if (currentK > K + 10) {
124
-                break
125
-            }
126
-        }
127
-        counter <- counter + 1
162
+    zProb <- t(probs$probs)
163
+    zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
164
+    zSecond <- apply(zProb, 1, which.max)
165
+
166
+    zTa <- tabulate(overallZ, currentK)
167
+    zNonEmpty <- which(zTa > 0)
168
+
169
+    # Find worst cluster by logLik to remove
170
+    previousZ <- overallZ
171
+    llShuffle <- rep(NA, currentK)
172
+    for (i in zNonEmpty) {
173
+      ix <- overallZ == i
174
+      newZ <- overallZ
175
+      newZ[ix] <- zSecond[ix]
176
+
177
+      p <- .cCReDecomposeCounts(
178
+        counts,
179
+        s,
180
+        newZ,
181
+        previousZ,
182
+        nGByCP,
183
+        currentK
184
+      )
185
+      nGByCP <- p$nGByCP
186
+      mCPByS <- p$mCPByS
187
+      llShuffle[i] <- .cCCalcLL(
188
+        mCPByS,
189
+        nGByCP,
190
+        s,
191
+        newZ,
192
+        currentK,
193
+        nS,
194
+        nG,
195
+        alpha,
196
+        beta
197
+      )
198
+      previousZ <- newZ
128 199
     }
129 200
 
130
-    # Decompose counts for likelihood calculation
131
-    p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
132
-    nS <- p$nS
133
-    nG <- p$nG
134
-    nM <- p$nM
135
-    mCPByS <- p$mCPByS
136
-    nGByCP <- p$nGByCP
137
-    nCP <- p$nCP
138
-    nByC <- p$nByC
139
-
140
-    # Remove clusters 1-by-1 until K is reached
141
-    while (currentK > K) {
142
-        # Find second best assignment give current assignments for each cell
143
-        probs <- .cCCalcEMProbZ(counts,
144
-                s = s,
145
-                z = overallZ,
146
-                K = currentK,
147
-                mCPByS = mCPByS,
148
-                nGByCP = nGByCP,
149
-                nByC = nByC,
150
-                nCP = nCP,
151
-                nG = nG,
152
-                nM = nM,
153
-                alpha = alpha,
154
-                beta = beta,
155
-                doSample = FALSE)
156
-        zProb <- t(probs$probs)
157
-        zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
158
-        zSecond <- apply(zProb, 1, which.max)
159
-
160
-        zTa <- tabulate(overallZ, currentK)
161
-        zNonEmpty <- which(zTa > 0)
162
-
163
-        # Find worst cluster by logLik to remove
164
-        previousZ <- overallZ
165
-        llShuffle <- rep(NA, currentK)
166
-        for (i in zNonEmpty) {
167
-            ix <- overallZ == i
168
-            newZ <- overallZ
169
-            newZ[ix] <- zSecond[ix]
170
-
171
-            p <- .cCReDecomposeCounts(counts,
172
-                s,
173
-                newZ,
174
-                previousZ,
175
-                nGByCP,
176
-                currentK)
177
-            nGByCP <- p$nGByCP
178
-            mCPByS <- p$mCPByS
179
-            llShuffle[i] <- .cCCalcLL(mCPByS,
180
-                    nGByCP,
181
-                    s,
182
-                    newZ,
183
-                    currentK,
184
-                    nS,
185
-                    nG,
186
-                    alpha,
187
-                    beta)
188
-            previousZ <- newZ
189
-        }
190
-
191
-        # Remove the cluster which had the the largest likelihood after removal
192
-        zToRemove <- which.max(llShuffle)
193
-
194
-        ix <- overallZ == zToRemove
195
-        overallZ[ix] <- zSecond[ix]
196
-
197
-        p <- .cCReDecomposeCounts(counts,
198
-            s,
199
-            overallZ,
200
-            previousZ,
201
-            nGByCP,
202
-            currentK)
203
-        nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
204
-        mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
205
-        overallZ <- as.integer(as.factor(overallZ))
206
-        currentK <- currentK - 1
207
-    }
208
-    return(overallZ)
201
+    # Remove the cluster which had the the largest likelihood after removal
202
+    zToRemove <- which.max(llShuffle)
203
+
204
+    ix <- overallZ == zToRemove
205
+    overallZ[ix] <- zSecond[ix]
206
+
207
+    p <- .cCReDecomposeCounts(
208
+      counts,
209
+      s,
210
+      overallZ,
211
+      previousZ,
212
+      nGByCP,
213
+      currentK
214
+    )
215
+    nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
216
+    mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
217
+    overallZ <- as.integer(as.factor(overallZ))
218
+    currentK <- currentK - 1
219
+  }
220
+  return(overallZ)
209 221
 }
210 222
 
211 223
 
212 224
 .initializeSplitY <- function(counts,
213
-    L,
214
-    LSubcluster = NULL,
215
-    tempK = 100,
216
-    beta = 1,
217
-    delta = 1,
218
-    gamma = 1,
219
-    minFeature = 3) {
220
-
221
-    if (is.null(LSubcluster))
222
-        LSubcluster <- ceiling(sqrt(L))
223
-
224
-    # Collapse cells to managable number of clusters
225
-    if (!is.null(tempK) && ncol(counts) > tempK) {
226
-        z <- .initializeSplitZ(counts, K = tempK)
227
-        counts <- .colSumByGroup(counts, z, length(unique(z)))
225
+                              L,
226
+                              LSubcluster = NULL,
227
+                              tempK = 100,
228
+                              beta = 1,
229
+                              delta = 1,
230
+                              gamma = 1,
231
+                              minFeature = 3) {
232
+  if (is.null(LSubcluster)) {
233
+    LSubcluster <- ceiling(sqrt(L))
234
+  }
235
+
236
+  # Collapse cells to managable number of clusters
237
+  if (!is.null(tempK) && ncol(counts) > tempK) {
238
+    z <- .initializeSplitZ(counts, K = tempK)
239
+    counts <- .colSumByGroup(counts, z, length(unique(z)))
240
+  }
241
+
242
+  # Initialize the model with KSubcluster clusters
243
+  res <- .celda_G(counts,
244
+    L = LSubcluster,
245
+    maxIter = 10,
246
+    yInitialize = "random",
247
+    beta = beta,
248
+    delta = delta,
249
+    gamma = gamma,
250
+    splitOnIter = -1,
251
+    splitOnLast = FALSE,
252
+    verbose = FALSE,
253
+    reorder = FALSE
254
+  )
255
+  overallY <- as.integer(as.factor(clusters(res)$y))
256
+  currentL <- max(overallY)
257
+
258
+  counter <- 0
259
+  while (currentL < L & counter < 25) {
260
+    # Determine which clusters are split-able
261
+    yTa <- tabulate(overallY, max(overallY))
262
+    yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))
263
+
264
+    if (length(yToSplit) == 0) {
265
+      break
228 266
     }
229 267
 
230
-    # Initialize the model with KSubcluster clusters
231
-    res <- .celda_G(counts,
232
-        L = LSubcluster,
233
-        maxIter = 10,
268
+    # Cycle through each splitable cluster and split it up into
269
+    # LSublcusters
270
+    for (i in yToSplit) {
271
+      # make sure the colSums of subset counts is not 0
272
+      countsY <- counts[overallY == i, , drop = FALSE]
273
+      countsY <- countsY[, !(colSums(countsY) == 0)]
274
+
275
+      if (ncol(countsY) == 0) {
276
+        next
277
+      }
278
+
279
+      clustLabel <- .celda_G(
280
+        countsY,
281
+        L = min(LSubcluster, nrow(countsY)),
234 282
         yInitialize = "random",
235 283
         beta = beta,
236 284
         delta = delta,
237 285
         gamma = gamma,
286
+        maxIter = 20,
238 287
         splitOnIter = -1,
239 288
         splitOnLast = FALSE,
240
-        verbose = FALSE,
241
-        reorder = FALSE
289
+        verbose = FALSE
290
+      )
291
+      tempY <- as.integer(as.factor(clusters(clustLabel)$y))
292
+
293
+      # Reassign clusters with label > 1
294
+      splitIx <- tempY > 1
295
+      ix <- overallY == i
296
+      newY <- overallY[ix]
297
+      newY[splitIx] <- currentL + tempY[splitIx] - 1
298
+
299
+      overallY[ix] <- newY
300
+      currentL <- max(overallY)
301
+
302
+      # Ensure that the maximum number of clusters does not get too large
303
+      if (currentL > L + 10) {
304
+        break
305
+      }
306
+    }
307
+    counter <- counter + 1
308
+  }
309
+
310
+  ## Decompose counts for likelihood calculation
311
+  p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
312
+  nTSByC <- p$nTSByC
313
+  nByG <- p$nByG
314
+  nByTS <- p$nByTS
315
+  nGByTS <- p$nGByTS
316
+  nM <- p$nM
317
+  nG <- p$nG
318
+  rm(p)
319
+
320
+  # Pre-compute lgamma values
321
+  lgbeta <- lgamma((seq(0, max(.colSums(
322
+    counts, nrow(counts),
323
+    ncol(counts)
324
+  )))) + beta)
325
+  lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
326
+  lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))
327
+
328
+  # Remove clusters 1-by-1 until L is reached
329
+  while (currentL > L) {
330
+    # Find second best assignment give current assignments for each cell
331
+    probs <- .cGCalcGibbsProbY(
332
+      counts = counts,
333
+      y = overallY,
334
+      L = currentL,
335
+      nTSByC = nTSByC,
336
+      nByTS = nByTS,
337
+      nGByTS = nGByTS,
338
+      nByG = nByG,
339
+      nG = nG,
340
+      beta = beta,
341
+      delta = delta,
342
+      gamma = gamma,
343
+      lgbeta = lgbeta,
344
+      lggamma = lggamma,
345
+      lgdelta = lgdelta,
346
+      doSample = FALSE
242 347
     )
243
-    overallY <- as.integer(as.factor(clusters(res)$y))
244
-    currentL <- max(overallY)
245
-
246
-    counter <- 0
247
-    while (currentL < L & counter < 25) {
248
-        # Determine which clusters are split-able
249
-        yTa <- tabulate(overallY, max(overallY))
250
-        yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))
251
-
252
-        if (length(yToSplit) == 0) {
253
-            break
254
-        }
255
-
256
-        # Cycle through each splitable cluster and split it up into
257
-        # LSublcusters
258
-        for (i in yToSplit) {
259
-            # make sure the colSums of subset counts is not 0
260
-            countsY <- counts[overallY == i, , drop = FALSE]
261
-            countsY <- countsY[, !(colSums(countsY) == 0)]
262
-
263
-            if (ncol(countsY) == 0) {
264
-                next
265
-            }
266
-
267
-            clustLabel <- .celda_G(
268
-                countsY,
269
-                L = min(LSubcluster, nrow(countsY)),
270
-                yInitialize = "random",
271
-                beta = beta,
272
-                delta = delta,
273
-                gamma = gamma,
274
-                maxIter = 20,
275
-                splitOnIter = -1,
276
-                splitOnLast = FALSE,
277
-                verbose = FALSE)
278
-            tempY <- as.integer(as.factor(clusters(clustLabel)$y))
279
-
280
-            # Reassign clusters with label > 1
281
-            splitIx <- tempY > 1
282
-            ix <- overallY == i
283
-            newY <- overallY[ix]
284
-            newY[splitIx] <- currentL + tempY[splitIx] - 1
285
-
286
-            overallY[ix] <- newY
287
-            currentL <- max(overallY)
288
-
289
-            # Ensure that the maximum number of clusters does not get too large
290
-            if (currentL > L + 10) {
291
-                break
292
-            }
293
-        }
294
-        counter <- counter + 1
348
+    yProb <- t(probs$probs)
349
+    yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
350
+    ySecond <- apply(yProb, 1, which.max)
351
+
352
+    yTa <- tabulate(overallY, currentL)
353
+    yNonEmpty <- which(yTa > 0)
354
+
355
+    # Find worst cluster by logLik to remove
356
+    previousY <- overallY
357
+    llShuffle <- rep(NA, currentL)
358
+    for (i in yNonEmpty) {
359
+      ix <- overallY == i
360
+      newY <- overallY
361
+      newY[ix] <- ySecond[ix]
362
+
363
+      # Move arounds counts for likelihood calculation
364
+      p <- .cGReDecomposeCounts(
365
+        counts,
366
+        newY,
367
+        previousY,
368
+        nTSByC,
369
+        nByG,
370
+        currentL
371
+      )
372
+      nTSByC <- p$nTSByC
373
+      nGByTS <- p$nGByTS
374
+      nByTS <- p$nByTS
375
+      llShuffle[i] <- .cGCalcLL(
376
+        nTSByC,
377
+        nByTS,
378
+        nByG,
379
+        nGByTS,
380
+        nM,
381
+        nG,
382
+        currentL,
383
+        beta,
384
+        delta,
385
+        gamma
386
+      )
387
+      previousY <- newY
295 388
     }
296 389
 
297
-    ## Decompose counts for likelihood calculation
298
-    p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
299
-    nTSByC <- p$nTSByC
300
-    nByG <- p$nByG
301
-    nByTS <- p$nByTS
302
-    nGByTS <- p$nGByTS
303
-    nM <- p$nM
304
-    nG <- p$nG
305
-    rm(p)
306
-
307
-    # Pre-compute lgamma values
308
-    lgbeta <- lgamma((seq(0, max(.colSums(counts, nrow(counts),
309
-        ncol(counts))))) + beta)
310
-    lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
311
-    lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))
312
-
313
-    # Remove clusters 1-by-1 until L is reached
314
-    while (currentL > L) {
315
-        # Find second best assignment give current assignments for each cell
316
-        probs <- .cGCalcGibbsProbY(
317
-            counts = counts,
318
-            y = overallY,
319
-            L = currentL,
320
-            nTSByC = nTSByC,
321
-            nByTS = nByTS,
322
-            nGByTS = nGByTS,
323
-            nByG = nByG,
324
-            nG = nG,
325
-            beta = beta,
326
-            delta = delta,
327
-            gamma = gamma,
328
-            lgbeta = lgbeta,
329
-            lggamma = lggamma,