
Merging performance enhancements (#268)

* Changed default parameters for distinct_colors.

* Fixed bug in subtracting 1 to make the factor indices zero-based

* Changed EM function to use the faster colSumByGroupChange function to shift counts around after z labels have been altered (a sketch of this incremental update follows this list)

* Updated documentation

* Added 'src/*.so' statement

* Changed 'split.each.z' to 'cC.splitZ', which uses the colSumByGroupChange function to greatly increase speed in celda_C. However, a separate function will be needed for the cells in celda_CG

* Changed decomposed matrices to have TS as rows rather than columns (e.g. n.C.by.TS => n.TS.by.C)

* Added function for splitting clusters for celda_CG

* Reverted code so that any matrix with genes (G) or transcriptional states (TS) has them as the rows. This eliminates the need for the counts.t matrix

* Reverted code so that any matrix has genes (G) or transcriptional states (TS) as rows and cells (C) or cell populations (CP) as columns. This eliminates the need for transposing matrices back and forth

* Debugged the new function 'cCG.splitZ', which performs the cell splits for celda_CG in a much faster and more efficient way

* Refactored rowSumByGroupChange

* Modified the split for celda_G to use the faster rowSumByGroupChange function instead of rowSumByGroup

* Updated celda_CG y splitting function

* Fixed bug in using colSumByGroupChange

* Updated NAMESPACE

* Updated testthat to reflect new defaults for the distinct_colors function

* Needed to add a statement to load in previously simulated results

* Added missing lines in testthat for celda_C

* Removed dependency on Rmpfr by using functions from the matrixStats package

* Changed default gamma in simulateCells functions to 5. Removed the 'precision' argument in perplexity

* Added 'seed' parameter to celdaTsne

* Updated NAMESPACE
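The recurring theme in the commits above is replacing full recounts with incremental "sum by group change" updates. Below is a minimal pure-R sketch of the column-grouped case; `colSumByGroupR` and `colSumByGroupChangeR` are hypothetical stand-ins for the package's C implementations (`colSumByGroup`/`colSumByGroupChange`), shown only to illustrate the invariant the fast path must preserve.

```r
# Hypothetical pure-R sketch of the incremental grouped-sum update.
colSumByGroupR = function(counts, group, K) {
  # Sum the columns of 'counts' (genes x cells) within each of K cluster labels
  sapply(1:K, function(k) rowSums(counts[, group == k, drop = FALSE]))
}

colSumByGroupChangeR = function(counts, px, group, pgroup, K) {
  # Touch only the cells whose label changed: move their counts from the
  # old cluster's column of 'px' to the new cluster's column
  for (i in which(group != pgroup)) {
    px[, pgroup[i]] = px[, pgroup[i]] - counts[, i]
    px[, group[i]]  = px[, group[i]]  + counts[, i]
  }
  px
}

set.seed(1)
counts = matrix(rpois(50, 5), nrow = 10)  # 10 genes x 5 cells
z.old  = c(1, 2, 1, 3, 2)
z.new  = c(1, 2, 3, 3, 2)                 # one cell moved to another cluster
px     = colSumByGroupR(counts, z.old, K = 3)
stopifnot(all.equal(colSumByGroupChangeR(counts, px, z.new, z.old, K = 3),
                    colSumByGroupR(counts, z.new, K = 3)))
```

When only a handful of labels change per iteration, this turns an O(genes x cells) recount into work proportional to the number of changed cells.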


Former-commit-id: 3e7a458f955513bef374fc98ed23c567155b13eb

Joshua D. Campbell authored on 01/07/2018 20:40:59 • Sean committed on 01/07/2018 20:40:59
Showing 16 changed files

@@ -28,3 +28,4 @@ celda.Rproj
 .DS_Store
 src/*.o
 src/*.dll
+src/*.so

@@ -21,7 +21,7 @@ Imports:
     gtable,
     grDevices,
     graphics,
-    Rmpfr,
+    matrixStats,
     gplots,
     doParallel,
     cluster,

@@ -79,4 +79,3 @@ import(grid)
 import(gtable)
 import(scales)
 useDynLib(celda)
-importFrom(Rcpp, sourceCpp)

@@ -91,21 +91,21 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1,
   num.iter.without.improvement = 0L
   do.cell.split = TRUE
   while(iter <= max.iter & num.iter.without.improvement <= stop.iter) {
-    message(paste("Iteration: ", iter))
 
 #   next.z = cC.calcGibbsProbZ(counts=counts, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.by.C=n.by.C, n.CP=n.CP, z=z, s=s, K=K, nG=nG, nM=nM, alpha=alpha, beta=beta)
 #   next.z = cC.calcEMProbZ(counts=counts, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.by.C=n.by.C, n.CP=n.CP, z=z, s=s, K=K, nG=nG, nM=nM, alpha=alpha, beta=beta)
     next.z = do.call(algorithm.fun, list(counts=counts, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.by.C=n.by.C, n.CP=n.CP, z=z, s=s, K=K, nG=nG, nM=nM, alpha=alpha, beta=beta))
+
     m.CP.by.S = next.z$m.CP.by.S
     n.G.by.CP = next.z$n.G.by.CP
     n.CP = next.z$n.CP
     z = next.z$z
-    
+
     ## Perform split on i-th iteration of no improvement in log likelihood
     if(K > 2 & (((iter == max.iter | num.iter.without.improvement == stop.iter) & isTRUE(split.on.last)) | (split.on.iter > 0 & iter %% split.on.iter == 0 & isTRUE(do.cell.split)))) {
 
       logMessages(date(), " ... Determining if any cell clusters should be split.", logfile=logfile, append=TRUE, sep="")
-      res = split.each.z(counts=counts, z=z, K=K, z.prob=t(next.z$probs), alpha=alpha, beta=beta, s=s, LLFunction="calculateLoglikFromVariables.celda_C")
+      res = cC.splitZ(counts, m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta, z.prob=t(next.z$probs), max.clusters.to.try=10, min.cell=3)
       logMessages(res$message, logfile=logfile, append=TRUE)
 
       # Reset convergence counter if a split occured
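A note on the `do.call(algorithm.fun, ...)` line retained in this hunk: the Gibbs and EM updates share one argument list, so the sampler is chosen once by name and then invoked generically each iteration. A hedged sketch of the pattern with toy stand-in functions (the actual mapping from a user-facing `algorithm` argument to `algorithm.fun` happens outside this diff and is assumed here):

```r
# Toy update functions sharing one signature (stand-ins for
# cC.calcGibbsProbZ / cC.calcEMProbZ)
em.update    = function(x, k) list(z = max.col(matrix(runif(nrow(x) * k), ncol = k)))
gibbs.update = function(x, k) list(z = sample(k, nrow(x), replace = TRUE))

algorithm     = "EM"  # user-facing choice
algorithm.fun = switch(algorithm, EM = "em.update", Gibbs = "gibbs.update")

x = matrix(rpois(20, 3), nrow = 10)
next.z = do.call(algorithm.fun, list(x = x, k = 3))  # dispatch by name
```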
@@ -118,9 +118,12 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1,
 
       ## Re-calculate variables
       z = res$z
-      m.CP.by.S = matrix(as.integer(table(factor(z, levels=1:K), s)), ncol=nS)
-      n.G.by.CP = colSumByGroup(counts, group=z, K=K)
-      n.CP = as.integer(colSums(n.G.by.CP))
+      #m.CP.by.S = matrix(as.integer(table(factor(z, levels=1:K), s)), ncol=nS)
+      #n.G.by.CP = colSumByGroup(counts, group=z, K=K)
+      #n.CP = as.integer(colSums(n.G.by.CP))
+      m.CP.by.S = res$m.CP.by.S
+      n.G.by.CP = res$n.G.by.CP
+      n.CP = res$n.CP
     }
 
 

@@ -225,14 +228,16 @@ cC.calcEMProbZ = function(counts, m.CP.by.S, n.G.by.CP, n.by.C, n.CP, z, s, K, n
   ## Maximization to find best label for each cell
   probs = eigenMatMultInt(phi, counts) + theta[, s]  
   #probs = (t(phi) %*% counts) + theta[, s]  
+
+  z.previous = z
   z = apply(probs, 2, which.max)
 
   ## Recalculate counts based on new label
-  p = cC.decomposeCounts(counts, s, z, K)
+  #p = cC.decomposeCounts(counts, s, z, K)
+  p = cC.reDecomposeCounts(counts, s, z, z.previous, n.G.by.CP, K)
   m.CP.by.S = p$m.CP.by.S
   n.G.by.CP = p$n.G.by.CP
   n.CP = p$n.CP
-  n.by.C = p$n.by.C
 
   return(list(m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.CP=n.CP, z=z, probs=probs))
 }

@@ -400,10 +405,21 @@ cC.decomposeCounts = function(counts, s, z, K) {
   n.G.by.CP = colSumByGroup(counts, group=z, K=K)
   n.CP = as.integer(colSums(n.G.by.CP))
   n.by.C = as.integer(colSums(counts))
-  
+
   return(list(m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.CP=n.CP, n.by.C=n.by.C, nS=nS, nG=nG, nM=nM))
 }
 
+cC.reDecomposeCounts = function(counts, s, z, previous.z, n.G.by.CP, K) {
+
+  ## Recalculate counts based on new label
+  n.G.by.CP = colSumByGroupChange(counts, n.G.by.CP, z, previous.z, K)
+  nS = length(unique(s))
+  m.CP.by.S = matrix(as.integer(table(factor(z, levels=1:K), s)), ncol=nS)
+  n.CP = as.integer(colSums(n.G.by.CP))
+
+  return(list(m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.CP=n.CP))
+}
+
 
 #' Calculates the conditional probability of each cell belong to each cluster given all other cluster assignments
 #'
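Since `cC.reDecomposeCounts` replaces a full recount with a delta update keyed on `previous.z`, a cheap invariant worth keeping in the test suite is agreement with `cC.decomposeCounts` for any pair of labelings. A hedged sketch of such a check (the toy matrix and labels are illustrative; `colSumByGroupChange` may require the integer storage that `processCounts` normally guarantees, so treat this as a sketch rather than a drop-in test):

```r
# Sketch: the incremental recount should match the full recount
counts = matrix(rpois(200, 5), nrow = 20)  # 20 genes x 10 cells
s  = rep(1:2, each = 5)                    # two samples
K  = 4
z  = sample(K, 10, replace = TRUE)
z2 = z; z2[1] = (z[1] %% K) + 1            # perturb one cell's label

full  = cC.decomposeCounts(counts, s, z2, K)
delta = cC.reDecomposeCounts(counts, s, z2, z,
                             cC.decomposeCounts(counts, s, z, K)$n.G.by.CP, K)
stopifnot(all.equal(full$n.G.by.CP, delta$n.G.by.CP))
```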
@@ -439,22 +455,13 @@ clusterProbability.celda_C = function(celda.mod, counts, log=FALSE, ...) {
 #' @export
 calculatePerplexity.celda_C = function(counts, celda.mod, precision=128) {
 
-  # TODO Can try to turn into a single giant matrix multiplication by duplicating
-  #     phi / theta / sl
-  # TODO Cast to sparse matrices?
   factorized = factorizeMatrix(counts = counts, celda.mod = celda.mod, "posterior")
   theta = log(factorized$posterior$sample.states)
   phi = log(factorized$posterior$gene.states)
   sl = celda.mod$sample.label
 
   inner.log.prob = (t(phi) %*% counts) + theta[, sl]  
-  inner.log.prob = Rmpfr::mpfr(inner.log.prob, precision)
-  inner.log.prob.exp = exp(inner.log.prob)
-  
-  log.px = 0
-  for(i in 1:ncol(inner.log.prob.exp)) {
-    log.px = log.px + Rmpfr::asNumeric(log(sum(inner.log.prob.exp[, i])))
-  }
+  log.px = sum(apply(inner.log.prob, 2, matrixStats::logSumExp))
 
   perplexity = exp(-(log.px/sum(counts)))
   return(perplexity)
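Why this works without Rmpfr: log(sum(exp(x))) never needs arbitrary precision if it is evaluated with the log-sum-exp identity logSumExp(x) = max(x) + log(sum(exp(x - max(x)))), which is exactly what `matrixStats::logSumExp` computes. Very negative log probabilities that underflow under a naive exp() are handled stably:

```r
x = c(-1000, -1001, -1002)            # log probabilities far below exp() underflow
log(sum(exp(x)))                      # -Inf: exp() underflows in double precision
matrixStats::logSumExp(x)             # -999.5924: stable
max(x) + log(sum(exp(x - max(x))))    # same value, the identity computed by hand
```

The per-column apply() in the new code is the matrix version of the same trick, and it returns an ordinary double, so the mpfr-to-numeric conversions disappear as well.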
@@ -84,18 +84,19 @@ celda_CG = function(counts, sample.label=NULL, K, L,
   p = cCG.decomposeCounts(counts, s, z, y, K, L)
   m.CP.by.S = p$m.CP.by.S
   n.TS.by.C = p$n.TS.by.C
-  n.CP.by.TS = p$n.CP.by.TS
+  n.TS.by.CP = p$n.TS.by.CP
   n.CP = p$n.CP
   n.by.G = p$n.by.G
   n.by.C = p$n.by.C
   n.by.TS = p$n.by.TS
   nG.by.TS = p$nG.by.TS
+  n.G.by.CP = p$n.G.by.CP
   nM = p$nM
   nG = p$nG
   nS = p$nS
   rm(p)
 
-  ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
+  ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.TS.by.CP=n.TS.by.CP, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
 
   set.seed(seed)
   logMessages(date(), "... Starting celda_CG to cluster cells and genes.", logfile=logfile, append=FALSE)

@@ -106,29 +107,27 @@ celda_CG = function(counts, sample.label=NULL, K, L,
   do.gene.split = TRUE  
   while(iter <= max.iter & num.iter.without.improvement <= stop.iter) {
 
-    ## Gibbs sampling for each cell
-    n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
-    n.TS.by.CP = t(n.CP.by.TS)
-    #next.z = cC.calcGibbsProbZ(counts=n.TS.by.C, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.TS.by.CP, n.CP=n.CP, n.by.C=n.by.C, z=z, s=s, K=K, nG=L, nM=nM, alpha=alpha, beta=beta)
+    ## Gibbs or EM sampling for each cell
     next.z = do.call(algorithm.fun, list(counts=n.TS.by.C, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.TS.by.CP, n.CP=n.CP, n.by.C=n.by.C, z=z, s=s, K=K, nG=L, nM=nM, alpha=alpha, beta=beta))
     m.CP.by.S = next.z$m.CP.by.S
     n.TS.by.CP = next.z$n.G.by.CP
     n.CP = next.z$n.CP
+    n.G.by.CP = colSumByGroupChange(counts, n.G.by.CP, next.z$z, z, K)
     z = next.z$z
-
+
     ## Gibbs sampling for each gene
-    n.CP.by.G = t(colSumByGroup(counts, group=z, K=K))
-    n.CP.by.TS = t(n.TS.by.CP) 
-    next.y = cG.calcGibbsProbY(counts.t=n.CP.by.G, n.C.by.TS=n.CP.by.TS, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, n.by.G=n.by.G, y=y, L=L, nG=nG, beta=beta, delta=delta, gamma=gamma)
-    n.CP.by.TS = next.y$n.C.by.TS
+    next.y = cG.calcGibbsProbY(counts=n.G.by.CP, n.TS.by.C=n.TS.by.CP, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, n.by.G=n.by.G, y=y, L=L, nG=nG, beta=beta, delta=delta, gamma=gamma)
+    n.TS.by.CP = next.y$n.TS.by.C
    nG.by.TS = next.y$nG.by.TS
    n.by.TS = next.y$n.by.TS
+    n.TS.by.C = rowSumByGroupChange(counts, n.TS.by.C, next.y$y, y, L)
    y = next.y$y
 
+
     ## Perform split on i-th iteration defined by split.on.iter
    if(K > 2 & (((iter == max.iter | num.iter.without.improvement == stop.iter) & isTRUE(split.on.last)) | (split.on.iter > 0 & iter %% split.on.iter == 0 & isTRUE(do.cell.split)))) {
      logMessages(date(), " ... Determining if any cell clusters should be split.", logfile=logfile, append=TRUE, sep="")
-      res = split.each.z(counts=counts, z=z, y=y, z.prob=t(next.z$probs), K=K, L=L, alpha=alpha, delta=delta, beta=beta, gamma=gamma, s=s, LLFunction="calculateLoglikFromVariables.celda_CG")
+      res = cCG.splitZ(counts, m.CP.by.S, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, s, z, K, L, nS, nG, alpha, beta, delta, gamma, z.prob=t(next.z$probs), max.clusters.to.try=10, min.cell=3)
      logMessages(res$message, logfile=logfile, append=TRUE)
 
      # Reset convergence counter if a split occured

@@ -141,15 +140,14 @@ celda_CG = function(counts, sample.label=NULL, K, L,
 
      ## Re-calculate variables
      z = res$z      
-      m.CP.by.S = matrix(as.integer(table(factor(z, levels=1:K), s)), ncol=nS)
-      n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
-      n.CP.by.TS = t(colSumByGroup(n.TS.by.C, group=z, K=K))
-      n.CP = as.integer(rowSums(n.CP.by.TS))
-      n.CP.by.G = t(colSumByGroup(counts, group=z, K=K))
+      m.CP.by.S = res$m.CP.by.S
+      n.TS.by.CP = res$n.TS.by.CP
+      n.CP = res$n.CP
+      n.G.by.CP = colSumByGroup(counts, group=z, K=K)
    }  
    if(L > 2 & (((iter == max.iter | num.iter.without.improvement == stop.iter) & isTRUE(split.on.last)) | (split.on.iter > 0 & iter %% split.on.iter == 0 & isTRUE(do.gene.split)))) {
      logMessages(date(), " ... Determining if any gene clusters should be split.", logfile=logfile, append=TRUE, sep="")
-      res = split.each.y(counts=counts, z=z, y=y, y.prob=t(next.y$probs), K=K, L=L, alpha=alpha, beta=beta, delta=delta, gamma=gamma, s=s, LLFunction="calculateLoglikFromVariables.celda_CG")
+      res = cCG.splitY(counts, y, m.CP.by.S, n.G.by.CP, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, n.CP, s, z, K, L, nS, nG, alpha, beta, delta, gamma, y.prob=t(next.y$probs), max.clusters.to.try=10, min.cell=3)
      logMessages(res$message, logfile=logfile, append=TRUE)
 
      # Reset convergence counter if a split occured

@@ -162,15 +160,14 @@ celda_CG = function(counts, sample.label=NULL, K, L,
 
      ## Re-calculate variables
      y = res$y        
-      n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
-      n.CP.by.TS = t(colSumByGroup(n.TS.by.C, group=z, K=K))
-      n.CP = as.integer(rowSums(n.CP.by.TS))
-      n.by.TS = as.integer(rowSumByGroup(matrix(n.by.G,ncol=1), group=y, L=L))
-      nG.by.TS = as.integer(table(factor(y, levels=1:L)))
+      n.TS.by.CP = res$n.TS.by.CP
+      n.by.TS = res$n.by.TS
+      nG.by.TS = res$nG.by.TS
+      n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
    }      
 
     ## Calculate complete likelihood
-    temp.ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
+    temp.ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.TS.by.CP=n.TS.by.CP, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
     if((all(temp.ll > ll)) | iter == 1) {
       z.best = z
       y.best = y
@@ -223,7 +220,7 @@ celda_CG = function(counts, sample.label=NULL, K, L,
 #' @param ... Other arguments
 #' @export
 simulateCells.celda_CG = function(model, S=10, C.Range=c(50,100), N.Range=c(500,5000), 
-                                  G=1000, K=3, L=10, alpha=1, beta=1, gamma=1, 
+                                  G=1000, K=3, L=10, alpha=1, beta=1, gamma=5, 
                                   delta=1, seed=12345, ...) {
 
   set.seed(seed)

@@ -327,7 +324,7 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
   nM = p$nM
   m.CP.by.S = p$m.CP.by.S
   n.TS.by.C = p$n.TS.by.C
-  n.CP.by.TS = p$n.CP.by.TS
+  n.TS.by.CP = p$n.TS.by.CP
   n.by.G = p$n.by.G
   n.by.TS = p$n.by.TS
   nG.by.TS = p$nG.by.TS

@@ -345,8 +342,8 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
   rownames(n.G.by.TS) = celda.mod$names$row
   rownames(m.CP.by.S) = K.names
   colnames(m.CP.by.S) = celda.mod$names$sample
-  colnames(n.CP.by.TS) = L.names
-  rownames(n.CP.by.TS) = K.names
+  colnames(n.TS.by.CP) = K.names
+  rownames(n.TS.by.CP) = L.names
 
   counts.list = c()
   prop.list = c()

@@ -355,7 +352,7 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
 
   if(any("counts" %in% type)) {
     counts.list = list(sample.states = m.CP.by.S,
-                       population.states = t(n.CP.by.TS), 
+                       population.states = n.TS.by.CP, 
                        cell.states = n.TS.by.C,
                        gene.states = n.G.by.TS,
                        gene.distribution = nG.by.TS)

@@ -365,8 +362,8 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
 
     ## Need to avoid normalizing cell/gene states with zero cells/genes
     unique.z = sort(unique(z))
-    temp.n.CP.by.TS = t(n.CP.by.TS)
-    temp.n.CP.by.TS[,unique.z] = normalizeCounts(temp.n.CP.by.TS[,unique.z], scale.factor=1)
+    temp.n.TS.by.CP = n.TS.by.CP
+    temp.n.TS.by.CP[,unique.z] = normalizeCounts(temp.n.TS.by.CP[,unique.z], scale.factor=1)
 
     unique.y = sort(unique(y))
     temp.n.G.by.TS = n.G.by.TS

@@ -374,7 +371,7 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
     temp.nG.by.TS = nG.by.TS/sum(nG.by.TS)
 
     prop.list = list(sample.states = normalizeCounts(m.CP.by.S, scale.factor=1),
-                     population.states = temp.n.CP.by.TS, 
+                     population.states = temp.n.TS.by.CP, 
                      cell.states = normalizeCounts(n.TS.by.C, scale.factor=1),
                      gene.states = temp.n.G.by.TS, 
                      gene.distribution = temp.nG.by.TS)

@@ -388,7 +385,7 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
    temp.nG.by.TS = (nG.by.TS + gamma)/sum(nG.by.TS + gamma)
 
     post.list = list(sample.states = normalizeCounts(m.CP.by.S + alpha, scale.factor=1),
-                     population.states = normalizeCounts(t(n.CP.by.TS + beta), scale.factor=1), 
+                     population.states = normalizeCounts(n.TS.by.CP + beta, scale.factor=1), 
                      gene.states = gs,
                      gene.distribution = temp.nG.by.TS)
     res = c(res, posterior = list(post.list))

@@ -398,7 +395,7 @@ factorizeMatrix.celda_CG = function(counts, celda.mod, type=c("counts", "proport
 }
 
 # Calculate the loglikelihood for the celda_CG model
-cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) {
+cCG.calcLL = function(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) {
   nG.by.TS[nG.by.TS == 0] = 1
   nG = sum(nG.by.TS)
 

@@ -413,9 +410,9 @@ cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS
 
   ## Calculate for "Phi" component
   a = K * lgamma(L * beta)
-  b = sum(lgamma(n.CP.by.TS + beta))
+  b = sum(lgamma(n.TS.by.CP + beta))
   c = -K * L * lgamma(beta)
-  d = -sum(lgamma(rowSums(n.CP.by.TS + beta)))
+  d = -sum(lgamma(colSums(n.TS.by.CP + beta)))
 
   phi.ll = a + b + c + d
 

@@ -457,7 +454,7 @@ cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS
 calculateLoglikFromVariables.celda_CG = function(counts, sample.label, z, y, K, L, alpha, beta, delta, gamma) {  
   s = processSampleLabels(sample.label, ncol(counts))
   p = cCG.decomposeCounts(counts, s, z, y, K, L)
-  final = cCG.calcLL(K=K, L=L, m.CP.by.S=p$m.CP.by.S, n.CP.by.TS=p$n.CP.by.TS, n.by.G=p$n.by.G, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, nS=p$nS, nG=p$nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
+  final = cCG.calcLL(K=K, L=L, m.CP.by.S=p$m.CP.by.S, n.TS.by.CP=p$n.TS.by.CP, n.by.G=p$n.by.G, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, nS=p$nS, nG=p$nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
   return(final)
 }
 
@@ -472,18 +469,18 @@ calculateLoglikFromVariables.celda_CG = function(counts, sample.label, z, y, K, 
 cCG.decomposeCounts = function(counts, s, z, y, K, L) {
   nS = length(unique(s))
   m.CP.by.S = matrix(as.integer(table(factor(z, levels=1:K), s)), ncol=nS)
-    n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
-  n.CP.by.TS = t(colSumByGroup(n.TS.by.C, group=z, K=K))  # TODO Refactor celda_CG
-  n.CP = as.integer(rowSums(n.CP.by.TS))
+  n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
+  n.TS.by.CP = colSumByGroup(n.TS.by.C, group=z, K=K)
+  n.CP = as.integer(colSums(n.TS.by.CP))
   n.by.G = as.integer(rowSums(counts))
   n.by.C = as.integer(colSums(counts))
   n.by.TS = as.integer(rowSumByGroup(matrix(n.by.G,ncol=1), group=y, L=L))
-  nG.by.TS = as.integer(table(factor(y, 1:L)))
-  n.CP.by.G = colSumByGroup(counts, group=z, K=K)
+  nG.by.TS = tabulate(y, L)
+  n.G.by.CP = colSumByGroup(counts, group=z, K=K)
 
   nG = nrow(counts)
   nM = ncol(counts)
-  return(list(m.CP.by.S=m.CP.by.S, n.TS.by.C=n.TS.by.C, n.CP.by.TS=n.CP.by.TS, n.CP=n.CP, n.by.G=n.by.G, n.by.C=n.by.C, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, n.CP.by.G=n.CP.by.G, nM=nM, nG=nG, nS=nS))
+  return(list(m.CP.by.S=m.CP.by.S, n.TS.by.C=n.TS.by.C, n.TS.by.CP=n.TS.by.CP, n.CP=n.CP, n.by.G=n.by.G, n.by.C=n.by.C, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, n.G.by.CP=n.G.by.CP, nM=nM, nG=nG, nS=nS))
 }  
 
 

@@ -511,11 +508,11 @@ clusterProbability.celda_CG = function(celda.mod, counts, log=FALSE, ...) {
   p = cCG.decomposeCounts(counts, s, z, y, K, L)
 
   ## Gibbs sampling for each cell
-  next.z = cC.calcGibbsProbZ(counts=p$n.TS.by.C, m.CP.by.S=p$m.CP.by.S, n.G.by.CP=t(p$n.CP.by.TS), n.CP=p$n.CP, n.by.C=p$n.by.C, z=z, s=s, K=K, nG=L, nM=p$nM, alpha=alpha, beta=beta, do.sample=FALSE)
+  next.z = cC.calcGibbsProbZ(counts=p$n.TS.by.C, m.CP.by.S=p$m.CP.by.S, n.G.by.CP=p$n.TS.by.CP, n.CP=p$n.CP, n.by.C=p$n.by.C, z=z, s=s, K=K, nG=L, nM=p$nM, alpha=alpha, beta=beta, do.sample=FALSE)
   z.prob = t(next.z$probs)
 
   ## Gibbs sampling for each gene
-  next.y = cG.calcGibbsProbY(counts.t=p$n.CP.by.G, n.C.by.TS=p$n.CP.by.TS, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, n.by.G=p$n.by.G, y=y, L=L, nG=nG, beta=beta, delta=delta, gamma=gamma, do.sample=FALSE)
+  next.y = cG.calcGibbsProbY(counts=p$n.CP.by.G, n.TS.by.C=p$n.TS.by.CP, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, n.by.G=p$n.by.G, y=y, L=L, nG=nG, beta=beta, delta=delta, gamma=gamma, do.sample=FALSE)
   y.prob = t(next.y$probs)
 
   if(!isTRUE(log)) {

@@ -528,7 +525,7 @@ clusterProbability.celda_CG = function(celda.mod, counts, log=FALSE, ...) {
 
 
 #' @export
-calculatePerplexity.celda_CG = function(counts, celda.mod, precision=128) {
+calculatePerplexity.celda_CG = function(counts, celda.mod) {
 
   factorized = factorizeMatrix(counts = counts, celda.mod = celda.mod, "posterior")
   theta = log(factorized$posterior$sample.states)

@@ -541,14 +538,8 @@ calculatePerplexity.celda_CG = function(counts, celda.mod, precision=128) {
   eta.prob = log(eta) * nG.by.TS
   gene.by.pop.prob = log(psi %*% phi)
   inner.log.prob = (t(gene.by.pop.prob) %*% counts) + theta[, sl]  
-  inner.log.prob = Rmpfr::mpfr(inner.log.prob, precision)
-  inner.log.prob.exp = exp(inner.log.prob)
-  
-  log.px = sum(eta.prob)
-  for(i in 1:ncol(inner.log.prob.exp)) {
-    log.px = log.px + Rmpfr::asNumeric(log(sum(inner.log.prob.exp[, i])))
-  }
 
+  log.px = sum(apply(inner.log.prob, 2, matrixStats::logSumExp)) + sum(eta.prob)
   perplexity = exp(-(log.px/sum(counts)))
   return(perplexity)
 }
@@ -58,7 +58,6 @@ celda_G = function(counts, L, beta=1, delta=1, gamma=1,
 
   ## Error checking and variable processing
   counts = processCounts(counts)  
-  counts.t = t(counts)
 
   if(is.null(count.checksum)) {
     count.checksum = digest::digest(counts, algo="md5")

@@ -70,7 +69,7 @@
 
   ## Calculate counts one time up front
   p = cG.decomposeCounts(counts=counts, y=y, L=L)
-  n.C.by.TS = p$n.C.by.TS
+  n.TS.by.C = p$n.TS.by.C
   n.by.G = p$n.by.G
   n.by.TS = p$n.by.TS
   nG.by.TS = p$nG.by.TS

@@ -82,15 +81,15 @@
   logMessages(date(), "... Starting celda_G to cluster genes.", logfile=logfile, append=FALSE)
 
   ## Calculate initial log likelihood
-  ll <- cG.calcLL(n.C.by.TS=n.C.by.TS, n.by.TS=n.by.TS, n.by.G=n.by.G, nG.by.TS=nG.by.TS, nM=nM, nG=nG, L=L, beta=beta, delta=delta, gamma=gamma)
+  ll <- cG.calcLL(n.TS.by.C=n.TS.by.C, n.by.TS=n.by.TS, n.by.G=n.by.G, nG.by.TS=nG.by.TS, nM=nM, nG=nG, L=L, beta=beta, delta=delta, gamma=gamma)
 
   iter <- 1L
   num.iter.without.improvement = 0L
   do.gene.split = TRUE
   while(iter <= max.iter & num.iter.without.improvement <= stop.iter) {
 
-    next.y = cG.calcGibbsProbY(counts.t=counts.t, n.C.by.TS=n.C.by.TS, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, n.by.G=n.by.G, y=y, nG=nG, L=L, beta=beta, delta=delta, gamma=gamma)
-    n.C.by.TS = next.y$n.C.by.TS
+    next.y = cG.calcGibbsProbY(counts=counts, n.TS.by.C=n.TS.by.C, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, n.by.G=n.by.G, y=y, nG=nG, L=L, beta=beta, delta=delta, gamma=gamma)
+    n.TS.by.C = next.y$n.TS.by.C
    nG.by.TS = next.y$nG.by.TS
    n.by.TS = next.y$n.by.TS
    y = next.y$y

@@ -98,7 +97,7 @@
     ## Perform split on i-th iteration of no improvement in log likelihood
     if(L > 2 & (((iter == max.iter | num.iter.without.improvement == stop.iter) & isTRUE(split.on.last)) | (split.on.iter > 0 & iter %% split.on.iter == 0 & isTRUE(do.gene.split)))) {
       logMessages(date(), " ... Determining if any gene clusters should be split.", logfile=logfile, append=TRUE, sep="")
-      res = split.each.y(counts=counts, y=y, L=L, y.prob=t(next.y$probs), beta=beta, delta=delta, gamma=gamma, LLFunction="calculateLoglikFromVariables.celda_G")
+      res = cG.splitY(counts, y, n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delta, gamma, y.prob=t(next.y$probs), min=3, max.clusters.to.try=10)
       logMessages(res$message, logfile=logfile, append=TRUE)
       
      # Reset convergence counter if a split occured

@@ -111,13 +110,13 @@
 
       ## Re-calculate variables
       y = res$y
-      n.C.by.TS = t(rowSumByGroup(counts, group=y, L=L))
-      n.by.TS = as.integer(rowSumByGroup(matrix(n.by.G,ncol=1), group=y, L=L))
-      nG.by.TS = as.integer(table(factor(y, 1:L)))
+      n.TS.by.C = res$n.TS.by.C
+      n.by.TS = res$n.by.TS
+      nG.by.TS = res$nG.by.TS
     }
      
     ## Calculate complete likelihood
-    temp.ll <- cG.calcLL(n.C.by.TS=n.C.by.TS, n.by.TS=n.by.TS, n.by.G=n.by.G, nG.by.TS=nG.by.TS, nM=nM, nG=nG, L=L, beta=beta, delta=delta, gamma=gamma)
+    temp.ll <- cG.calcLL(n.TS.by.C=n.TS.by.C, n.by.TS=n.by.TS, n.by.G=n.by.G, nG.by.TS=nG.by.TS, nM=nM, nG=nG, L=L, beta=beta, delta=delta, gamma=gamma)
     if((all(temp.ll > ll)) | iter == 1) {
       y.best = y
       ll.best = temp.ll

@@ -158,13 +157,13 @@
 # @param delta The Dirichlet distribution parameter for Eta; adds a gene pseudocount to the numbers of genes each state.
 # @param beta Vector of non-zero concentration parameters for cluster <-> gene assignment Dirichlet distribution
 # @keywords log likelihood
-cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y, L, nG, beta, delta, gamma, do.sample=TRUE) {
+cG.calcGibbsProbY = function(counts, n.TS.by.C, n.by.TS, nG.by.TS, n.by.G, y, L, nG, beta, delta, gamma, do.sample=TRUE) {
 
   ## Set variables up front outside of loop
   probs = matrix(NA, ncol=nG, nrow=L)
   temp.nG.by.TS = nG.by.TS 
   temp.n.by.TS = n.by.TS 
-#  temp.n.C.by.TS = n.C.by.TS
+#  temp.n.TS.by.C = n.TS.by.C
 
   ix <- sample(1:nG)
   for(i in ix) {

@@ -173,25 +172,25 @@ cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y,
    nG.by.TS[y[i]] = nG.by.TS[y[i]] - 1L
    n.by.TS[y[i]] = n.by.TS[y[i]] - n.by.G[i]
 
-    n.C.by.TS_1 = n.C.by.TS
-    n.C.by.TS_1[,y[i]] = n.C.by.TS_1[,y[i]] - counts.t[,i]
-    n.C.by.TS_1 = .colSums(lgamma(n.C.by.TS_1 + beta), nrow(n.C.by.TS), ncol(n.C.by.TS))
+    n.TS.by.C_1 = n.TS.by.C
+    n.TS.by.C_1[y[i],] = n.TS.by.C_1[y[i],] - counts[i,]
+    n.TS.by.C_1 = .rowSums(lgamma(n.TS.by.C_1 + beta), nrow(n.TS.by.C), ncol(n.TS.by.C))
 
-    n.C.by.TS_2 = n.C.by.TS
+    n.TS.by.C_2 = n.TS.by.C
    other.ix = (1:L)[-y[i]]
-    n.C.by.TS_2[,other.ix] = n.C.by.TS_2[,other.ix] + counts.t[,i]
-    n.C.by.TS_2 = .colSums(lgamma(n.C.by.TS_2 + beta), nrow(n.C.by.TS), ncol(n.C.by.TS))
-    	
+    n.TS.by.C_2[other.ix,] = n.TS.by.C_2[other.ix,] + counts[rep(i, length(other.ix)),]
+    n.TS.by.C_2 = .rowSums(lgamma(n.TS.by.C_2 + beta), nrow(n.TS.by.C), ncol(n.TS.by.C))
+    
    ## Calculate probabilities for each state
    for(j in 1:L) {
 
      temp.nG.by.TS = nG.by.TS 
      temp.n.by.TS = n.by.TS 
-#      temp.n.C.by.TS = n.C.by.TS
+#      temp.n.TS.by.C = n.TS.by.C
 
      temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1L      
      temp.n.by.TS[j] = temp.n.by.TS[j] + n.by.G[i]
-#      temp.n.C.by.TS[,j] = temp.n.C.by.TS[,j] + counts.t[,i]
+#      temp.n.TS.by.C[j,] = temp.n.TS.by.C[j,] + counts[i,]
 
 
       pseudo.nG.by.TS = temp.nG.by.TS

@@ -201,8 +200,8 @@ cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y,
      other.ix = (1:L)[-j]
       probs[j,i] <- sum(lgamma(pseudo.nG.by.TS + gamma)) -
                     sum(lgamma(sum(pseudo.nG.by.TS + gamma))) +
-                    sum(n.C.by.TS_1[other.ix]) + n.C.by.TS_2[j] +
-#                    sum(lgamma(temp.n.C.by.TS + beta)) +
+                    sum(n.TS.by.C_1[other.ix]) + n.TS.by.C_2[j] +
+#                    sum(lgamma(temp.n.TS.by.C + beta)) +
                     sum(lgamma(pseudo.nG.by.TS * delta)) -
                     (pseudo.nG * lgamma(delta)) -
                     sum(lgamma(temp.n.by.TS + (pseudo.nG.by.TS * delta)))
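The orientation flip in this function (cells-by-states `n.C.by.TS` becoming states-by-cells `n.TS.by.C`, and `counts.t` becoming `counts`) means every per-gene update now reads row i of the same G x C matrix, so no transposed copy of the counts has to be built or kept in sync with the sampler. A tiny illustration of the indexing equivalence being exploited:

```r
set.seed(2)
counts = matrix(rpois(12, 4), nrow = 3)  # 3 genes x 4 cells
i = 2
old.style = t(counts)[, i]  # column i of a transposed copy (requires building t(counts))
new.style = counts[i, ]     # row i of the original matrix, no copy needed
stopifnot(identical(old.style, new.style))
```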
@@ -214,16 +213,16 @@ cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y,
    if(isTRUE(do.sample)) y[i] = sample.ll(probs[,i])
 
    if(prev.y != y[i]) {
-      n.C.by.TS[,prev.y] = n.C.by.TS[,prev.y] - counts.t[,i]
-      n.C.by.TS[,y[i]] = n.C.by.TS[,y[i]] + counts.t[,i]
+      n.TS.by.C[prev.y,] = n.TS.by.C[prev.y,] - counts[i,]
+      n.TS.by.C[y[i],] = n.TS.by.C[y[i],] + counts[i,]
    }
 
    nG.by.TS[y[i]] = nG.by.TS[y[i]] + 1L
    n.by.TS[y[i]] = n.by.TS[y[i]] + n.by.G[i]
-    #n.C.by.TS[,y[i]] = n.C.by.TS[,y[i]] + counts.t[,i]
+    #n.TS.by.C[y[i],] = n.TS.by.C[y[i],] + counts[i,]
   }
 
-  return(list(n.C.by.TS=n.C.by.TS, nG.by.TS=nG.by.TS, n.by.TS=n.by.TS, y=y, probs=probs))
+  return(list(n.TS.by.C=n.TS.by.C, nG.by.TS=nG.by.TS, n.by.TS=n.by.TS, y=y, probs=probs))
 }
 
 
@@ -244,7 +243,7 @@ cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y,
 #' @param ... Other arguments
 #' @export
 simulateCells.celda_G = function(model, C=100, N.Range=c(500,5000),  G=1000, 
-                                 L=5, beta=1, gamma=1, delta=1, seed=12345, ...) {
+                                 L=5, beta=1, gamma=5, delta=1, seed=12345, ...) {
   set.seed(seed)
   eta = rdirichlet(1, rep(gamma, L))
 

@@ -314,7 +313,7 @@ factorizeMatrix.celda_G = function(counts, celda.mod, type=c("counts", "proporti
   gamma = celda.mod$gamma
 
   p = cG.decomposeCounts(counts=counts, y=y, L=L)
-  n.TS.by.C = t(p$n.C.by.TS)
+  n.TS.by.C = p$n.TS.by.C
   n.by.G = p$n.by.G
   n.by.TS = p$n.by.TS
   nG.by.TS = p$nG.by.TS

@@ -370,16 +369,16 @@
 
 
 # Calculate log-likelihood of celda_CG model
-cG.calcLL = function(n.C.by.TS, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delta, gamma) {
+cG.calcLL = function(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delta, gamma) {
 
   nG.by.TS[nG.by.TS == 0] = 1
   nG = sum(nG.by.TS)
 
   ## Calculate for "Phi" component
   a <- nM * lgamma(L * beta)
-  b <- sum(lgamma(n.C.by.TS + beta))
+  b <- sum(lgamma(n.TS.by.C + beta))
   c <- -nM * L * lgamma(beta)
-  d <- -sum(lgamma(rowSums(n.C.by.TS + beta)))
+  d <- -sum(lgamma(colSums(n.TS.by.C + beta)))
 
   phi.ll <- a + b + c + d
 

@@ -422,7 +421,7 @@ cG.calcLL = function(n.C.by.TS, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delt
 calculateLoglikFromVariables.celda_G = function(counts, y, L, beta, delta, gamma) {
 
   p = cG.decomposeCounts(counts=counts, y=y, L=L)
-  final <- cG.calcLL(n.C.by.TS=p$n.C.by.TS, n.by.TS=p$n.by.TS, n.by.G=p$n.by.G, nG.by.TS=p$nG.by.TS, nM=p$nM, nG=p$nG, L=L, beta=beta, delta=delta, gamma=gamma)
+  final <- cG.calcLL(n.TS.by.C=p$n.TS.by.C, n.by.TS=p$n.by.TS, n.by.G=p$n.by.G, nG.by.TS=p$nG.by.TS, nM=p$nM, nG=p$nG, L=L, beta=beta, delta=delta, gamma=gamma)
 
   return(final)
 }

@@ -434,14 +433,24 @@ calculateLoglikFromVariables.celda_G = function(counts, y, L, beta, delta, gamma
 #' @param L The number of clusters being considered
 cG.decomposeCounts = function(counts, y, L) {
 
-  n.C.by.TS = t(rowSumByGroup(counts, group=y, L=L))
-  n.by.G = as.integer(rowSums(counts))
+  n.TS.by.C = rowSumByGroup(counts, group=y, L=L)
+  n.by.G = as.integer(.rowSums(counts, nrow(counts), ncol(counts)))
   n.by.TS = as.integer(rowSumByGroup(matrix(n.by.G,ncol=1), group=y, L=L))
-  nG.by.TS = as.integer(table(factor(y, 1:L)))
+  nG.by.TS = tabulate(y, L)
   nM = ncol(counts)
   nG = nrow(counts)
 
-  return(list(n.C.by.TS=n.C.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nM=nM, nG=nG))
+  return(list(n.TS.by.C=n.TS.by.C, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nM=nM, nG=nG))
+}
+
+
+cG.reDecomposeCounts = function(counts, y, previous.y, n.TS.by.C, n.by.G, L) {
+  ## Recalculate counts based on new label
+  n.TS.by.C = rowSumByGroupChange(counts, n.TS.by.C, y, previous.y, L)
+  n.by.TS = as.integer(rowSumByGroup(matrix(n.by.G,ncol=1), group=y, L=L))
+  nG.by.TS = tabulate(y, L)
+
+  return(list(n.TS.by.C=n.TS.by.C, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS))
 }
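The switch from `as.integer(table(factor(y, 1:L)))` to `tabulate(y, L)` in these helpers is a like-for-like replacement for positive integer labels, minus the factor conversion and table overhead. A quick equivalence check:

```r
y = c(3, 1, 3, 5, 1)
L = 6
stopifnot(identical(tabulate(y, L),
                    as.integer(table(factor(y, levels = 1:L)))))
tabulate(y, L)  # 2 0 2 0 1 0
```

Both forms silently drop labels outside 1:L, so behavior is unchanged there as well.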
 
 
@@ -463,7 +472,7 @@ clusterProbability.celda_G = function(celda.mod, counts, log=FALSE, ...) {
 
   ## Calculate counts one time up front
   p = cG.decomposeCounts(counts=counts, y=y, L=L)
-  next.y = cG.calcGibbsProbY(counts.t=t(counts), n.C.by.TS=p$n.C.by.TS, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, n.by.G=p$n.by.G, y=y, nG=p$nG, L=L, beta=beta, delta=delta, gamma=gamma, do.sample=FALSE)  
+  next.y = cG.calcGibbsProbY(counts=counts, n.TS.by.C=p$n.TS.by.C, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, n.by.G=p$n.by.G, y=y, nG=p$nG, L=L, beta=beta, delta=delta, gamma=gamma, do.sample=FALSE)  
   y.prob = t(next.y$probs)
 
   if(!isTRUE(log)) {

@@ -475,7 +484,7 @@ clusterProbability.celda_G = function(celda.mod, counts, log=FALSE, ...) {
 
 
 #' @export
-calculatePerplexity.celda_G = function(counts, celda.mod, precision=128) {
+calculatePerplexity.celda_G = function(counts, celda.mod) {
 
   factorized = factorizeMatrix(counts = counts, celda.mod = celda.mod, "posterior")
   phi <- factorized$posterior$gene.states

@@ -252,8 +252,8 @@ logMessages = function(..., sep = " ", logfile = NULL, append = FALSE) {
 #' @export
 distinct_colors = function(n,
                            hues = c("red", "cyan", "orange", "blue", "yellow", "purple", "green", "magenta"),
-                           saturation.range = c(0.4, 1),
-                           value.range = c(0.4, 1)) {
+                           saturation.range = c(0.7, 1),
+                           value.range = c(0.7, 1)) {
 
   if(!(all(hues %in% grDevices::colors()))) {
     stop("Only color names listed in the 'color' function can be used in 'hues'")

@@ -11,10 +11,9 @@
 #' 
 #' @param counts The count matrix modeled in the celdaRun parameter
 #' @param celda.mod A single celda run (usually from the _res.list_ property of a celda_list)
-#' @param precision The amount of bits of precision to pass to Rmpfr
-#' @return The perplexity for the provided chain as an mpfr number
+#' @return The perplexity for the provided data and model
 #' @export
-calculatePerplexity = function(counts, celda.mod, precision=128) {
+calculatePerplexity = function(counts, celda.mod) {
   UseMethod("calculatePerplexity", celda.mod)
 }
 

@@ -115,7 +115,7 @@ plotDrCluster <- function(dim1, dim2, cluster, size = 1, xlab = "Dimension_1", y
 #' @param max.iter Numeric vector; determines iterations for tsne. Default 1000.
 #' @param distance Character vector; determines which distance metric to use for tsne. Options: cosine, hellinger, spearman.
 #' @export
-celdaTsne = function(counts, celda.mod, states=NULL, perplexity=20, max.iter=2500, distance="hellinger") {
+celdaTsne = function(counts, celda.mod, states=NULL, perplexity=20, max.iter=2500, distance="hellinger", seed=12345) {
   if (!isTRUE(class(celda.mod) %in% c("celda_CG","celda_C","celda_G"))) {
     stop("celda.mod argument is not of class celda_C, celda_G or celda_CG")
   } 

@@ -148,6 +148,7 @@ celdaTsne = function(counts, celda.mod, states=NULL, perplexity=20, max.iter=250
   }
 
   do.pca = class(celda.mod) == "celda_C"
+  set.seed(seed)
   res = Rtsne::Rtsne(d, pca=do.pca, max_iter=max.iter, perplexity = perplexity, 
                      check_duplicates = FALSE, is_distance = TRUE)$Y
   colnames(res) = c("tsne_1", "tsne_2")
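Because Rtsne is stochastic, the new `seed` argument makes celdaTsne reproducible by default: calling set.seed() immediately before Rtsne::Rtsne pins the embedding. A minimal illustration with Rtsne directly (toy data; assumes the Rtsne package is installed):

```r
d = matrix(rnorm(200), ncol = 2)  # 100 toy points
set.seed(12345); a = Rtsne::Rtsne(d, pca = FALSE, perplexity = 5)$Y
set.seed(12345); b = Rtsne::Rtsne(d, pca = FALSE, perplexity = 5)$Y
stopifnot(identical(a, b))        # same seed, same embedding
```

One design note: seeding inside the function alters the caller's RNG stream as a side effect; exposing the seed in the signature, as done here, at least makes that visible.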
@@ -1,50 +1,146 @@
-split.each.z = function(counts, z, K, LLFunction, z.prob, min.cell=3, max.clusters.to.try = 10, ...) { 
+# cC.calcLL = function(m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta) 
+cC.splitZ = function(counts, m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta, z.prob, max.clusters.to.try=10, min.cell=3) {
 
-  dot.args = list(...)
-  
   ## Identify clusters to split
-  z.ta = table(factor(z, levels=1:K))
+  z.ta = tabulate(z, K)
   z.to.split = which(z.ta >= min.cell)
   z.non.empty = which(z.ta > 0)
 
+  if(length(z.to.split) == 0) {
+    m = paste0(date(), " ... Cluster sizes too small. No additional splitting was performed.") 
+    return(list(z=z, m.CP.by.S, n.G.by.CP, n.CP=n.CP, message=m))  
+  }
+  
   ## Loop through each split-able Z and perform split
-  clust.split = list()
-  for(i in 1:K) { 
-    if(i %in% z.to.split) {
-      clustLabel = suppressMessages(celda_C(counts[,z == i], K=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
-      clust.split = c(clust.split, list(clustLabel$z))
-    } else {
-      clust.split = c(clust.split, list(NA))
-    }  
+  clust.split = vector("list", K)
+  for(i in z.to.split) { 
+    clustLabel = suppressMessages(celda_C(counts[,z == i], K=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
+    clust.split[[i]] = clustLabel$z
   }
 
   ## Find second best assignment give current assignments for each cell
   z.prob[cbind(1:nrow(z.prob), z)] = NA
   z.second = apply(z.prob, 1, which.max)
 
+  ## Set up initial variables
+  z.split = matrix(NA, nrow=length(z), ncol=length(z.to.split) * max.clusters.to.try)
+  z.split.ll = rep(NA, ncol=length(z.to.split) * max.clusters.to.try)  
+  z.split.ll[1] = cC.calcLL(m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta) 
+  z.split[,1] = z
+
+  ## Select worst clusters to test for reshuffling  
+  previous.z = z
   ll.shuffle = rep(NA, K)
   for(i in z.non.empty) {
     ix = z == i
     new.z = z
     new.z[ix] = z.second[ix]
-    params = c(list(counts=counts, z=new.z, K=K), dot.args)
-    ll.shuffle[i] = do.call(LLFunction, params)
-  }
+    
+    p = cC.reDecomposeCounts(counts, s, new.z, previous.z, n.G.by.CP, K)
+    n.G.by.CP = p$n.G.by.CP
+    m.CP.by.S = p$m.CP.by.S
+    ll.shuffle[i] = cC.calcLL(m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta) 
+    previous.z = new.z
+  } 
   z.to.shuffle = head(order(ll.shuffle, decreasing = TRUE, na.last=NA), n = max.clusters.to.try)
 
-  if(length(z.to.split) == 0 | length(z.to.shuffle) == 0) {
+  
+  pairs = c(NA, NA)
+  split.ix = 2
+  for(i in z.to.shuffle) {
+  
+    other.clusters = setdiff(z.to.split, i)
+   
+    for(j in other.clusters) {
+      new.z = z
+      
+      ## Assign cluster i to the next most similar cluster (excluding cluster j) 
+      ## as defined above by the correlation      
+      ix.to.move = z == i
+      new.z[ix.to.move] = z.second[ix.to.move]
+            
+      ## Split cluster j according to the clustering defined above
+      ix.to.split = z == j
+      new.z[ix.to.split] = ifelse(clust.split[[j]] == 1, j, i)
+
+      p = cC.reDecomposeCounts(counts, s, new.z, previous.z, n.G.by.CP, K)
+      n.G.by.CP = p$n.G.by.CP
+      m.CP.by.S = p$m.CP.by.S
+      
+      ## Calculate likelihood of split
+      z.split.ll[split.ix] = cC.calcLL(m.CP.by.S, n.G.by.CP, s, z, K, nS, nG, alpha, beta) 
+      z.split[,split.ix] = new.z
+      split.ix = split.ix + 1L
+      previous.z = new.z
+      
+      pairs = rbind(pairs, c(i, j))
+    }  
+  }
+
+  select = which.max(z.split.ll) 
+
+  if(select == 1) {
+    m = paste0(date(), " ... No additional splitting was performed.") 
+  } else {
+    m = paste0(date(), " ... Cluster ", pairs[select,1], " was reassigned and cluster ", pairs[select,2], " was split in two.")
+  } 
+
+  p = cC.reDecomposeCounts(counts, s, z.split[,select], previous.z, n.G.by.CP, K)
+  return(list(z=z.split[,select], m.CP.by.S=p$m.CP.by.S, n.G.by.CP=p$n.G.by.CP, n.CP=p$n.CP, message=m))
+}
+
+
+# cCG.calcLL = function(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+cCG.splitZ = function(counts, m.CP.by.S, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, s, z, K, L, nS, nG, alpha, beta, delta, gamma, z.prob, max.clusters.to.try=10, min.cell=3) {
+
+  ## Identify clusters to split
+  z.ta = tabulate(z, K)
+  z.to.split = which(z.ta >= min.cell)
+  z.non.empty = which(z.ta > 0)
+  
+  if(length(z.to.split) == 0) {
     m = paste0(date(), " ... Cluster sizes too small. No additional splitting was performed.") 
-    return(list(z=z,message=m))
+    return(list(z=z, m.CP.by.S=m.CP.by.S, n.TS.by.CP=n.TS.by.CP, n.CP=n.CP, message=m))  
   }
 
-  ## Set up variables for holding results with current iteration of z
-  params = c(list(counts=counts, z=z, K=K), dot.args)
-  z.split.ll = do.call(LLFunction, params)
-  z.split = z
+  ## Loop through each split-able Z and perform split
+  clust.split = vector("list", K)
+  for(i in z.to.split) { 
+    clustLabel = suppressMessages(celda_C(counts[,z == i], K=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
+    clust.split[[i]] = clustLabel$z
+  }
 
+  ## Find second best assignment give current assignments for each cell
+  z.prob[cbind(1:nrow(z.prob), z)] = NA
+  z.second = apply(z.prob, 1, which.max)
+
+  ## Set up initial variables
+  z.split = matrix(NA, nrow=length(z), ncol=length(z.to.split) * max.clusters.to.try)
+  z.split.ll = rep(NA, ncol=length(z.to.split) * max.clusters.to.try)  
+  z.split.ll[1] = cCG.calcLL(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+  z.split[,1] = z
+
+  ## Select worst clusters to test for reshuffling  
+  previous.z = z
+  ll.shuffle = rep(NA, K)
+  for(i in z.non.empty) {
+    ix = z == i
+    new.z = z
+    new.z[ix] = z.second[ix]
+    
+    p = cC.reDecomposeCounts(n.TS.by.C, s, new.z, previous.z, n.TS.by.CP, K)
+    n.TS.by.CP = p$n.G.by.CP
+    m.CP.by.S = p$m.CP.by.S
+    ll.shuffle[i] = cCG.calcLL(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+    previous.z = new.z
+  } 
+  z.to.shuffle = head(order(ll.shuffle, decreasing = TRUE, na.last=NA), n = max.clusters.to.try)
+
+  
   pairs = c(NA, NA)
+  split.ix = 2
   for(i in z.to.shuffle) {
-    
+  
     other.clusters = setdiff(z.to.split, i)
   
    for(j in other.clusters) {
@@ -59,18 +155,116 @@ split.each.z = function(counts, z, K, LLFunction, z.prob, min.cell=3, max.cluste
       ix.to.split = z == j
       new.z[ix.to.split] = ifelse(clust.split[[j]] == 1, j, i)
 
+      p = cC.reDecomposeCounts(n.TS.by.C, s, new.z, previous.z, n.TS.by.CP, K)
+      n.TS.by.CP = p$n.G.by.CP
+      m.CP.by.S = p$m.CP.by.S
+      
      ## Calculate likelihood of split
-      params = c(list(counts=counts, z=new.z, K=K), dot.args)
-      new.ll = do.call(LLFunction, params)
+      z.split.ll[split.ix] = cCG.calcLL(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+      z.split[,split.ix] = new.z
+      split.ix = split.ix + 1L
+      previous.z = new.z
+      
+      pairs = rbind(pairs, c(i, j))
+    }  
+  }
+
+  select = which.max(z.split.ll) 
 
-      z.split.ll = c(z.split.ll, new.ll)
-      z.split = cbind(z.split, new.z)
+  if(select == 1) {
+    m = paste0(date(), " ... No additional splitting was performed.") 
+  } else {
+    m = paste0(date(), " ... Cluster ", pairs[select,1], " was reassigned and cluster ", pairs[select,2], " was split in two.")
+  } 
+
+  p = cC.reDecomposeCounts(n.TS.by.C, s, z.split[,select], previous.z, n.TS.by.CP, K)
+  return(list(z=z.split[,select], m.CP.by.S=p$m.CP.by.S, n.TS.by.CP=p$n.G.by.CP, n.CP=p$n.CP, message=m))
+}
+
+
+
+cCG.splitY = function(counts, y, m.CP.by.S, n.G.by.CP, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, n.CP, s, z, K, L, nS, nG, alpha, beta, delta, gamma, y.prob, max.clusters.to.try=10, min.cell=3) {
+
+  ## Identify clusters to split
+  y.ta = tabulate(y, L)
+  y.to.split = which(y.ta >= min.cell)
+  y.non.empty = which(y.ta > 0)
+  
+  if(length(y.to.split) == 0) {
+    m = paste0(date(), " ... Cluster sizes too small. No additional splitting was performed.") 
+    return(list(y=y, m.CP.by.S=m.CP.by.S, n.TS.by.CP=n.TS.by.CP, n.CP=n.CP, message=m))  
+  }
+  
+  ## Loop through each split-able Z and perform split
+  clust.split = vector("list", L)
+  for(i in y.to.split) { 
+    clustLabel = suppressMessages(celda_G(counts[y == i,], L=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
+    clust.split[[i]] = clustLabel$y
+  }
+
+  ## Find second best assignment give current assignments for each cell
+  y.prob[cbind(1:nrow(y.prob), y)] = NA
+  y.second = apply(y.prob, 1, which.max)
+
+  ## Set up initial variables
+  y.split = matrix(NA, nrow=length(y), ncol=length(y.to.split) * max.clusters.to.try)
+  y.split.ll = rep(NA, ncol=length(y.to.split) * max.clusters.to.try)  
+  y.split.ll[1] = cCG.calcLL(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+  y.split[,1] = y
+
+  ## Select worst clusters to test for reshuffling  
+  previous.y = y
+  ll.shuffle = rep(NA, L)
+  for(i in y.non.empty) {
+    ix = y == i
+    new.y = y
+    new.y[ix] = y.second[ix]
+    
+    p = cG.reDecomposeCounts(n.G.by.CP, new.y, previous.y, n.TS.by.CP, n.by.G, L)
+    n.TS.by.CP = p$n.TS.by.C
+    n.by.TS = p$n.by.TS
+    nG.by.TS = p$nG.by.TS
+    
+    ll.shuffle[i] = cCG.calcLL(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+    previous.y = new.y
+  } 
+  y.to.shuffle = head(order(ll.shuffle, decreasing = TRUE, na.last=NA), n = max.clusters.to.try)
+
+  
+  pairs = c(NA, NA)
+  split.ix = 2
+  for(i in y.to.shuffle) {
+  
+    other.clusters = setdiff(y.to.split, i)
   
+    for(j in other.clusters) {
+      new.y = y
+      
+      ## Assign cluster i to the next most similar cluster (excluding cluster j) 
+      ## as defined above by the correlation      
+      ix.to.move = y == i
+      new.y[ix.to.move] = y.second[ix.to.move]
+            
+      ## Split cluster j according to the clustering defined above
+      ix.to.split = y == j
+      new.y[ix.to.split] = ifelse(clust.split[[j]] == 1, j, i)
+
+      p = cG.reDecomposeCounts(n.G.by.CP, new.y, previous.y, n.TS.by.CP, n.by.G, L)
+      n.TS.by.CP = p$n.TS.by.C
+      n.by.TS = p$n.by.TS
+      nG.by.TS = p$nG.by.TS
+      
+      ## Calculate likelihood of split
+      y.split.ll[split.ix] = cCG.calcLL(K, L, m.CP.by.S, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) 
+      y.split[,split.ix] = new.y
+      split.ix = split.ix + 1L
+      previous.y = new.y
+      
      pairs = rbind(pairs, c(i, j))
    }  
   }
 
-  select = which.max(z.split.ll) 
+  select = which.max(y.split.ll) 
 
   if(select == 1) {
     m = paste0(date(), " ... No additional splitting was performed.") 
@@ -78,55 +272,59 @@ split.each.z = function(counts, z, K, LLFunction, z.prob, min.cell=3, max.cluste
     m = paste0(date(), " ... Cluster ", pairs[select,1], " was reassigned and cluster ", pairs[select,2], " was split in two.")
   } 
 
-  return(list(z=z.split[,select], message=m))
+  p = cG.reDecomposeCounts(n.G.by.CP, y.split[,select], previous.y, n.TS.by.CP, n.by.G, L)
+  return(list(y=y.split[,select], n.TS.by.CP=p$n.TS.by.C, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, message=m))
 }
 
 
 
-split.each.y = function(counts, y, L, LLFunction, y.prob, min=3, max.clusters.to.try=10, ...) { 
 
-  dot.args = list(...)
+
+#cG.calcLL = function(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delta, gamma) {
+cG.splitY = function(counts, y, n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delta, gamma, y.prob, min=3, max.clusters.to.try=10) { 
 
   ## Identify clusters to split
   y.ta = table(factor(y, levels=1:L))
   y.to.split = which(y.ta >= min)
   y.non.empty = which(y.ta > 0)
 
+  if(length(y.to.split) == 0) {
+    m = paste0(date(), " ... Cluster sizes too small. No additional splitting was performed.") 
+    return(list(y=y, n.TS.by.C=n.TS.by.C, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, message=m))
+  }
+
   ## Loop through each split-able y and find best split
-  clust.split = list()
-  for(i in 1:L) {
-    if(i %in% y.to.split) {    
-      clustLabel = suppressMessages(celda_G(counts[y == i,], L=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
-      clust.split = c(clust.split, list(clustLabel$y))
-    } else {
-      clust.split = c(clust.split, list(NA))
-    }  
+  clust.split = vector("list", L)
+  for(i in y.to.split) {
+    clustLabel = suppressMessages(celda_G(counts[y == i,], L=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
+    clust.split[[i]] = clustLabel$y
   }
-  
+
   ## Find second best assignment give current assignments for each cell
   y.prob[cbind(1:nrow(y.prob), y)] = NA
   y.second = apply(y.prob, 1, which.max)
+
+  ## Set up initial variables
+  y.split = matrix(NA, nrow=length(y), ncol=length(y.to.split) * max.clusters.to.try)
+  y.split.ll = rep(NA, ncol=length(y.to.split) * max.clusters.to.try)  
+  y.split.ll[1] = cG.calcLL(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delta, gamma)
+  y.split[,1] = y
+
+  ## Select worst clusters to test for reshuffling  
   ll.shuffle = rep(NA, L)
+  previous.y = y
   for(i in y.non.empty) {
     ix = y == i
     new.y = y
     new.y[ix] = y.second[ix]
-    params = c(list(counts=counts, y=new.y, L=L), dot.args)
-    ll.shuffle[i] = do.call(LLFunction, params)
+    p = cG.reDecomposeCounts(counts, new.y, previous.y, n.TS.by.C, n.by.G, L)
+    ll.shuffle[i] = cG.calcLL(p$n.TS.by.C, p$n.by.TS, n.by.G, p$nG.by.TS, nM, nG, L, beta, delta, gamma)
+    previous.y = new.y
   }
   y.to.shuffle = head(order(ll.shuffle, decreasing = TRUE, na.last=NA), n = max.clusters.to.try)
 
-  if(length(y.to.split) == 0 | length(y.to.shuffle) == 0) {
-    m = paste0(date(), " ... Cluster sizes too small. No additional splitting was performed.") 
-    return(list(y=y, message=m))
-  }
- 
-  ## Set up variables for holding results with current iteration of y
-  params = c(list(counts=counts, y=y, L=L), dot.args)
-  y.split.ll = do.call(LLFunction, params)
-  y.split = y
-
   pairs = c(NA, NA)
+  split.ix = 2  
   for(i in y.to.shuffle) {
 
     other.clusters = setdiff(y.to.split, i)
@@ -144,11 +342,11 @@ split.each.y = function(counts, y, L, LLFunction, y.prob, min=3, max.clusters.to
       new.y[ix.to.split] = ifelse(clust.split[[j]] == 1, j, i)
       
       ## Calculate likelihood of split
-      params = c(list(counts=counts, y=new.y, L=L), dot.args)
-      new.ll = do.call(LLFunction, params)
-
-      y.split.ll = c(y.split.ll, new.ll)
-      y.split = cbind(y.split, new.y)
+      p = cG.reDecomposeCounts(counts, new.y, previous.y, n.TS.by.C, n.by.G, L)
+      y.split.ll[split.ix] = cG.calcLL(p$n.TS.by.C, p$n.by.TS, n.by.G, p$nG.by.TS, nM, nG, L, beta, delta, gamma)
+      y.split[,split.ix] = new.y
+      split.ix = split.ix + 1L
+      previous.y = new.y
       
       pairs = rbind(pairs, c(i, j))
     }
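Alongside the incremental recounts, the rewritten split functions also stop growing their result holders inside the loop: `y.split` and `y.split.ll` are preallocated to `length(y.to.split) * max.clusters.to.try` slots and filled via `split.ix`, rather than the old `cbind()`/`c()` appends that copy the accumulated results on every pass. The pattern in miniature:

```r
n = 1e4
# Growing with c() copies the whole vector on every append: quadratic work
grow = c()
for (i in 1:n) grow = c(grow, i * 2)
# Preallocating and filling by index touches each slot once: linear work
fill = numeric(n)
for (i in 1:n) fill[i] = i * 2
stopifnot(identical(grow, fill))
```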
@@ -162,32 +360,11 @@ split.each.y = function(counts, y, L, LLFunction, y.prob, min=3, max.clusters.to
     m = paste0(date(), " ... Cluster ", pairs[select,1], " was reassigned and cluster ", pairs[select,2], " was split in two.")
   } 
  
-  return(list(y=y.split[,select], message=m))
+  p = cG.reDecomposeCounts(counts, y.split[,select], previous.y, n.TS.by.C, n.by.G, L)
+  return(list(y=y.split[,select], n.TS.by.C=p$n.TS.by.C, n.by.TS=p$n.by.TS, nG.by.TS=p$nG.by.TS, message=m))
 }
 
 
 
 
 
-clusterByHC = function(d, min=3, method="ward.D") {
-  label = cluster::pam(x = d, k=2)$clustering
-
-  ## If PAM split is too small, perform secondary hclust procedure to split into roughly equal groups
-  if(min(table(label)) < min) {
-    d.hclust = stats::hclust(d, method=method)
-
-    ## Get maximum sample size of each subcluster
-    d.hclust.size = sapply(1:length(d.hclust$height), function(i) max(table(stats::cutree(d.hclust, h=d.hclust$height[i]))))
-  
-    ## Find the height of the dendrogram that best splits the samples in "roughly" half
-    sample.size = round(length(d.hclust$order)/ 2)
-    d.hclust.select = which.min(abs(d.hclust.size - sample.size))
-    temp = stats::cutree(d.hclust, h=d.hclust$height[d.hclust.select])
-
-    ## Set half of the samples as the largest cluster and the other samples to the other cluster
-    label.max.cluster = which.max(table(temp))
-    label = ifelse(temp == label.max.cluster, 1, 2)
-  } 
-  
-  return(label)
-}
... ...
@@ -4,17 +4,15 @@
4 4
 \alias{calculatePerplexity}
5 5
 \title{Calculate the perplexity from a single celda run}
6 6
 \usage{
7
-calculatePerplexity(counts, celda.mod, precision = 128)
7
+calculatePerplexity(counts, celda.mod)
8 8
 }
9 9
 \arguments{
10 10
 \item{counts}{The count matrix modeled in the celdaRun parameter}
11 11
 
12 12
 \item{celda.mod}{A single celda run (usually from the _res.list_ property of a celda_list)}
13
-
14
-\item{precision}{The amount of bits of precision to pass to Rmpfr}
15 13
 }
16 14
 \value{
17
-The perplexity for the provided chain as an mpfr number
15
+The perplexity for the provided data and model
18 16
 }
19 17
 \description{
20 18
 Perplexity can be seen as a measure of how well a provided set of 
... ...
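The precision argument disappears because perplexity no longer needs Rmpfr's arbitrary-precision arithmetic: keeping likelihoods in log space with matrixStats avoids the underflow that motivated it. A minimal sketch, assuming logSumExp is the relevant matrixStats helper (the commit message only says matrixStats functions were substituted):

  library(matrixStats)
  ll = c(-1050, -1052, -1060)   # log-likelihoods far below exp() underflow
  log(sum(exp(ll)))             # -Inf: exp() underflows in double precision
  logSumExp(ll)                 # approx. -1049.87, computed stably in log space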
@@ -5,8 +5,8 @@
5 5
 \title{Generate a distinct palette for coloring different clusters}
6 6
 \usage{
7 7
 distinct_colors(n, hues = c("red", "cyan", "orange", "blue", "yellow",
8
-  "purple", "green", "magenta"), saturation.range = c(0.4, 1),
9
-  value.range = c(0.4, 1))
8
+  "purple", "green", "magenta"), saturation.range = c(0.7, 1),
9
+  value.range = c(0.7, 1))
10 10
 }
11 11
 \arguments{
12 12
 \item{n}{Integer; Number of colors to generate}
... ...
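Raising the lower bounds of saturation.range and value.range from 0.4 to 0.7 biases the palette toward brighter, more saturated colors, which is why the expected hex strings change in the test updates below. A rough illustration of the effect with grDevices::hsv (a sketch of the range endpoints, not distinct_colors' actual internals):

  hues = c(0, 0.5)               # red and cyan hue angles
  hsv(h = hues, s = 0.4, v = 1)  # "#FF9999" "#99FFFF": old, paler endpoints
  hsv(h = hues, s = 0.7, v = 1)  # "#FF4D4D" "#4DFFFF": new, brighter endpoints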
@@ -115,35 +115,16 @@ SEXP rowSumByGroupChange(SEXP R_x, SEXP R_px, SEXP R_group, SEXP R_pgroup)
115 115
     error("group label and previous group label must be the same length as the number of rows in x.");
116 116
   }
117 117
   
118
-  // Create vector containing indices where group is different than pgroup
119
-  // First, calculate the number of differences in group labels
120
-  int ndiff = 0;
121
-  for(i = 0; i < nr; i++) {
122
-    
118
+  int g_ix;
119
+  int pg_ix;
120
+  for (i = 0; i < nr; i++) {
123 121
     if(pgroup[i] != group[i]) {
124
-      ndiff++;
125
-    }
126
-  }
127
-  // Second, add the index of the differences to a vector
128
-  int group_diff_ix[ndiff-1];
129
-  j = 0;
130
-  for(i = 0; i < nr; i++) {
131
-    if(group[i] != pgroup[i]) {
132
-      group_diff_ix[j] = i;
133
-      j++;
134
-    }
135
-  }
136
-  
137
-  // Sum the totals for each element of the 'group' variable
138
-  // Note: columns are iterated over before rows because the compiler appears to store expressions like
139
-  //       'j * nr' in a temporary variable (as they do not change within inner loop);
140
-  //       swapping the order of the outer and inner loops slows down the code ~10X
141
-  int row_ix;
142
-  for (j = 0; j < nc; j++) {
143
-    for (i = 0; i < ndiff; i++) {
144
-      row_ix = group_diff_ix[i];
145
-      px[j * nl + (pgroup[row_ix] - 1)] -= x[j * nr + row_ix];
146
-      px[j * nl + (group[row_ix] - 1)] += x[j * nr + row_ix];      
122
+      for (j = 0; j < nc; j++) {
123
+        pg_ix = (j * nl) + (pgroup[i] - 1);
124
+        g_ix = (j * nl) + (group[i] - 1);
125
+        px[pg_ix] -= x[j * nr + i];
126
+        px[g_ix] += x[j * nr + i];      
127
+      }
147 128
     }
148 129
   }
149 130
 
... ...
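The refactor drops the two-pass scheme, which first gathered changed-row indices (into group_diff_ix[ndiff-1], an off-by-one under-allocation) and then iterated columns-outer over that list, in favor of a single pass that touches only rows whose label actually moved. Its semantics, sketched as a 1-based R reference (px enters as the grouped row sums under pgroup):

  rowSumByGroupChangeR = function(x, px, group, pgroup) {
    for (i in which(group != pgroup)) {              # only relabeled rows
      px[pgroup[i], ] = px[pgroup[i], ] - x[i, ]     # remove from old group
      px[group[i], ]  = px[group[i], ]  + x[i, ]     # add to new group
    }
    px                                               # now equals rowsum(x, group)
  }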
@@ -152,6 +133,7 @@ SEXP rowSumByGroupChange(SEXP R_x, SEXP R_px, SEXP R_group, SEXP R_pgroup)
152 133
 
153 134
 
154 135
 
136
+
155 137
 SEXP colSumByGroupChange(SEXP R_x, SEXP R_px, SEXP R_group, SEXP R_pgroup)
156 138
 {
157 139
   int i, j;
... ...
@@ -187,8 +169,8 @@ SEXP colSumByGroupChange(SEXP R_x, SEXP R_px, SEXP R_group, SEXP R_pgroup)
187 169
   for (j = 0; j < nc; j++) {
188 170
     if(group[j] != pgroup[j]) {
189 171
       for (i = 0; i < nr; i++) {
190
-        px[group[j] * nr + i - 1] += x[j * nr + i];
191
-        px[pgroup[j] * nr + i - 1] -= x[j * nr + i];
172
+        px[(group[j]-1) * nr + i] += x[j * nr + i];
173
+        px[(pgroup[j]-1) * nr + i] -= x[j * nr + i];
192 174
       }
193 175
     }
194 176
   }
... ...
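The two-line change here is an indexing bug fix: with 0-based row i and 1-based group label g, row i of group-column g sits at (g-1)*nr + i in the column-major result, while the old g*nr + i - 1 lands nr-1 elements too far whenever nr > 1. The same layout checked from R (1-based, illustrative):

  m = matrix(1:12, nrow = 3)         # 3 x 4, stored column-major
  i = 2; j = 3
  m[i, j] == m[(j - 1) * 3 + i]      # TRUE: index = (col-1)*nrow + row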
@@ -1,70 +1,16 @@
1
-# celda_C.R
1
+# celda_C
2 2
 library(celda)
3
-library(Rtsne)
4
-library(SummarizedExperiment)
5 3
 context("Testing celda_C")
6 4
 
7 5
 load("../celdaCsim.rda")
8 6
 load("../celdaC.rda")
9
-model_C = getModel(celdaC.res, K = 5)[[1]]
10
-factorized <- factorizeMatrix(celda.mod = model_C, counts = celdaC.sim$counts)
11 7
 counts.matrix <- celdaC.sim$counts
8
+model_C = getModel(celdaC.res, K = 5)[[1]]
9
+factorized = factorizeMatrix(counts=counts.matrix, celda.mod=model_C)
12 10
 
13
-#Making sure getModel if functioning correctly
14
-test_that(desc = "Sanity checking getModel",{
15
-  expect_equal(celdaC.res$content.type, class(model_C))
16
-})
17
-
18
-#Making sure relationship of counts vs proportions is correct in factorize matrix
19
-test_that(desc = "Checking factorize matrix, counts vs proportions",{
20
-  expect_equal(TRUE,all(factorized$counts$sample.states/sum(factorized$counts$sample.states) 
21
-                        == factorized$proportions$sample.states))
22
-})
23
-
24
-#Checking dimension of factorize matrix
25
-test_that(desc = "Checking factorize matrix dimension size",{
26
-  expect_equal(5, nrow(factorized$proportions$sample.states))  
27
-})
28
-
29
-
30
-# Ensure calculateLoglikFromVariables calculates the expected values
31
-test_that(desc = "calculateLoglikFromVariables.celda_C returns correct output for various params", {
32
-  expect_lt(calculateLoglikFromVariables(z = celdaC.sim$z, 
33
-                                            beta = 1, alpha = 1, K = 5, model="celda_C", 
34
-                                            s = celdaC.sim$sample.label, 
35
-                                            counts=celdaC.sim$counts),
36
-               0)
37
-})
38
-
39
-test_that(desc = "simulateCells.celda_C returns correctly typed output", {
40
-  sim.res = simulateCells(model="celda_C")
41
-  expect_equal(typeof(sim.res$counts), "integer")
42
-})
43
-
44
-#normalizeCounts
45
-test_that(desc = "Checking normalizeCounts doesn't change dimensions",{
46
-  norm.counts <- normalizeCounts(counts.matrix)
47
-  expect_equal(dim(norm.counts),dim(counts.matrix))
48
-  expect_equal(rownames(norm.counts),rownames(counts.matrix))
49
-  expect_equal(colnames(norm.counts),colnames(counts.matrix))
50
-})
51
-
52
-#recodeClusterZ
53
-test_that(desc = "Checking recodeClusterZ gets correct order",{
54
-  expect_error(recodeClusterZ(celda.mod = model_C, from = NULL, to = ))
55
-  expect_error(recodeClusterZ(celda.mod = model_C, from=c(1,2,3,4,5), to = c(1,2,3,4,6)))
56
-  new.recoded = recodeClusterZ(celda.mod = model_C, from=c(1,2,3,4,5), to = c(5,4,3,2,1))
57
-  expect_equal(model_C$z == 1, new.recoded$z == 5)
58
-})
59
-
60
-#compareCountMatrix
61
-test_that(desc = "Checking CompareCountMatrix",{
62
-  expect_true(compareCountMatrix(count.matrix = celdaC.sim$counts, celda.obj = model_C))
63
-})
64
-
65
-#distinct_colors
11
+#distinct_colors
66 12
 test_that(desc = "Checking distinct_colors",{
67
-  expect_equal(distinct_colors(2), c("#FF9999","#99FFFF"))
13
+  expect_equal(distinct_colors(2), c("#FF4D4D", "#4DFFFF"))
68 14
 })
69 15
 
70 16
 ###renderCeldaHeatmap###
... ...
@@ -85,8 +85,8 @@ test_that(desc = "Checking CompareCountMatrix",{
85 85
 
86 86
 #distinct_colors
87 87
 test_that(desc = "Making sure distinct_colors gives expected output",{
88
-  expect_equal(distinct_colors(2), c("#FF9999","#99FFFF"))
89
-  expect_equal(distinct_colors(4), c("#FF9999","#99FFFF","#FFDB99","#9999FF"))
88
+  expect_equal(distinct_colors(2), c("#FF4D4D", "#4DFFFF"))
89
+  expect_equal(distinct_colors(4), c("#FF4D4D", "#4DFFFF", "#FFC04D", "#4D4DFF"))
90 90
 })
91 91
 
92 92
 
... ...
@@ -63,7 +63,7 @@ test_that(desc = "Checking CompareCountMatrix",{
63 63
 
64 64
 #distinct_colors
65 65
 test_that(desc = "Checking distinct_colors",{
66
-  expect_equal(distinct_colors(2), c("#FF9999","#99FFFF"))
66
+  expect_equal(distinct_colors(2), c("#FF4D4D", "#4DFFFF"))
67 67
 })
68 68
 
69 69