Browse code

Devel (#282)

* Changed default parameters for distinct_colors.

* Fixed bug dealing with subtracting 1 to make the factor indices zero-based

* Changed EM function to use the faster colSumByGroupChange function to shift counts around after z labels have been altered

* updated documentation

* Added 'src/*.so' statement

* Changed 'split.each.z' to 'cC.splitZ' which utilizes the rowSumByGroupChange function to greatly increase speed in celda_C. However a separate function will be needed for the cells in celda_CG

* changed decomposed matrix to have TS as rows rather than columns (e.g. n.C.by.TS => n.TS.by.C).

* Adding function for splitting clusters for celda_CG

* Reverted code to making any matrix with genes (G) or transcriptional state (TS) as being the rows. This eliminates the need for the counts.t matrix

* Reverted code to making any matrix with genes (G) or transcriptional state (TS) as being the rows and cells (C) or cell populations (CP) as columns. This eliminates the need for transposing matrices back and forth

* Debugged the new function 'cCG.splitZ' which will perform the cell splits for celda_CG in a much faster and more efficient way

* Refactor rowSumByGroupChange

* Modified the split for celda_G to use the faster rowSumByGroupChange function compared to rowSumByGroup

* Updated celda_CG y splitting function

* Fixed bug in using colSumByGroupChange

* updated NAMESPACE

* Updated testthat to reflect new defaults for distinct_color function

* Needed to add a statement to load in previously simulated results

* Added missing lines in testthat for celda_C

* Removed dependency on Rmpfr by using functions from matrixStats package

* Changed default gamma in simulateCells functions to 5. Removed 'precision' argument in perplexity

* added 'seed' parameter to celdaTsne

* Updated NAMESPACE

* Added additional heuristic to cCG.splitY that reduces the size of the original counts matrix in the celda_G cluster splitting with L=2. It does this reduction by splitting each current cell cluster into 'K.subclusters'. So if K=10 and K.subclusters=10, then the counts matrix will be reduced by clustering each of the 10 K clusters into 10 additional subclusters (10 x 10 = 100 cell clusters). Therefore instead of performing celda_G on the original counts matrix with a large number of columns, the matrix will have at most K x K.subcluster columns.


Former-commit-id: b27139bbc2de25edb4c4e5689fa37c8cdb3c3df9

Joshua D. Campbell authored on 22/07/2018 17:49:39 • Sean committed on 22/07/2018 17:49:39
Showing 3 changed files

... ...
@@ -29,5 +29,4 @@ celda.Rproj
29 29
 src/*.o
30 30
 src/*.dll
31 31
 src/*.so
32
-
33 32
 etc/*
... ...
@@ -76,6 +76,6 @@ import(graphics)
76 76
 import(grid)
77 77
 import(gtable)
78 78
 import(scales)
79
-
79
+import(data.table)
80 80
 useDynLib(celda)
81
-import(data.table)
82 81
\ No newline at end of file
82
+
... ...
@@ -183,8 +183,35 @@ cCG.splitZ = function(counts, m.CP.by.S, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS,
183 183
 
184 184
 
185 185
 
186
-cCG.splitY = function(counts, y, m.CP.by.S, n.G.by.CP, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, n.CP, s, z, K, L, nS, nG, alpha, beta, delta, gamma, y.prob, max.clusters.to.try=10, min.cell=3) {
186
+cCG.splitY = function(counts, y, m.CP.by.S, n.G.by.CP, n.TS.by.C, n.TS.by.CP, n.by.G, n.by.TS, nG.by.TS, n.CP, s, z, K, L, nS, nG, alpha, beta, delta, gamma, y.prob, max.clusters.to.try=10, K.subclusters=10, min.cell=3) {
187 187
 
188
+  #########################
189
+  ## First, the cell dimension of the original matrix will be reduced by splitting each z cluster into 'K.subclusters'.
190
+  #########################
191
+  
192
+  ## This will not be as big as the original matrix (which can take a lot of time to process with large number of cells), but not as small as the 'n.G.by.CP' with current z assignments
193
+  z.ta = tabulate(z, K)
194
+  z.non.empty = which(z.ta > 0)
195
+  temp.z = rep(0, length(z))
196
+  current.top.z = 0
197
+  for(i in z.non.empty) { 
198
+    ix = z == i
199
+    if(z.ta[i] <= K.subclusters) {
200
+      temp.z[ix] = (current.top.z + 1):(current.top.z + z.ta[i])
201
+    } else {
202
+      clustLabel = suppressMessages(celda_C(counts[,z == i], K=K.subclusters, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
203
+      temp.z[ix] = clustLabel$z + current.top.z 
204
+    }
205
+    current.top.z = max(temp.z, na.rm=TRUE)
206
+  }
207
+  
208
+  ## Decompose counts according to new/temp z labels
209
+  temp.n.G.by.CP = colSumByGroup(counts, group=temp.z, K=current.top.z)
210
+
211
+  #########################
212
+  ## Second, different y splits will be estimated and tested
213
+  #########################
214
+  
188 215
   ## Identify clusters to split
189 216
   y.ta = tabulate(y, L)
190 217
   y.to.split = which(y.ta >= min.cell)
... ...
@@ -194,11 +221,11 @@ cCG.splitY = function(counts, y, m.CP.by.S, n.G.by.CP, n.TS.by.C, n.TS.by.CP, n.
194 221
     m = paste0(date(), " ... Cluster sizes too small. No additional splitting was performed.") 
195 222
     return(list(y=y, m.CP.by.S=m.CP.by.S, n.TS.by.CP=n.TS.by.CP, n.CP=n.CP, message=m))  
196 223
   }
197
-  
224
+
198 225
   ## Loop through each split-able Z and perform split
199 226
   clust.split = vector("list", L)
200 227
   for(i in y.to.split) { 
201
-    clustLabel = suppressMessages(celda_G(counts[y == i,], L=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
228
+    clustLabel = suppressMessages(celda_G(temp.n.G.by.CP[y == i,], L=2, max.iter=5, split.on.iter=-1, split.on.last=FALSE))
202 229
     clust.split[[i]] = clustLabel$y
203 230
   }
204 231
 
... ...
@@ -229,7 +256,6 @@ cCG.splitY = function(counts, y, m.CP.by.S, n.G.by.CP, n.TS.by.C, n.TS.by.CP, n.
229 256
     previous.y = new.y
230 257
   } 
231 258
   y.to.shuffle = head(order(ll.shuffle, decreasing = TRUE, na.last=NA), n = max.clusters.to.try)
232
-
233 259
   
234 260
   pairs = c(NA, NA)
235 261
   split.ix = 2