Browse code

add names of three new function to NAMESPACE and three Rd files in man folder

Former-commit-id: c511bbc2b07a5089863d5e7be25a8f69a1374bc0

Jiangyuan Liu authored on 07/12/2017 17:31:56
Showing 100 changed files

1 1
old mode 100644
2 2
new mode 100755
3 3
old mode 100644
4 4
new mode 100755
5 5
old mode 100644
6 6
new mode 100755
... ...
@@ -21,6 +21,7 @@ S3method(simulateCells,celda_G)
21 21
 S3method(visualizeModelPerformance,celda_C)
22 22
 S3method(visualizeModelPerformance,celda_CG)
23 23
 S3method(visualizeModelPerformance,celda_G)
24
+export(GiniPlot)
24 25
 export(available_models)
25 26
 export(calculateLoglikFromVariables)
26 27
 export(calculatePerplexity)
... ...
@@ -37,6 +38,7 @@ export(clusterProbability.celda_G)
37 38
 export(compareCountMatrix)
38 39
 export(completeClusterHistory)
39 40
 export(completeLogLikelihood)
41
+export(diffExp_MAST)
40 42
 export(distinct_colors)
41 43
 export(factorizeMatrix)
42 44
 export(finalClusterAssignment)
... ...
@@ -56,8 +58,10 @@ export(renderInteractiveKLPlot)
56 58
 export(renderIterationLikelihoodPlot)
57 59
 export(runParams)
58 60
 export(seed)
61
+export(selectModels)
59 62
 export(semi_pheatmap)
60 63
 export(simulateCells)
64
+export(stateHeatmap)
61 65
 export(topRank)
62 66
 export(visualizeModelPerformance)
63 67
 import(RColorBrewer)
64 68
old mode 100644
65 69
new mode 100755
66 70
old mode 100644
67 71
new mode 100755
... ...
@@ -19,7 +19,7 @@ available_models = c("celda_C", "celda_G", "celda_CG")
19 19
 #' @return Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters
20 20
 #' @import foreach
21 21
 #' @export
22
-celda = function(counts, model, sample.label=NULL, nchains=1, cores=1, seed=12345, verbose=TRUE, logfile_prefix="Celda", ...) {
22
+celda = function(counts, model, sample.label=NULL, nchains=1, cores=1, seed=12345, verbose=FALSE, logfile_prefix="Celda", ...) {
23 23
   message("Starting celda...")
24 24
   validateArgs(counts, model, sample.label, nchains, cores, seed, ...)
25 25
   
26 26
old mode 100644
27 27
new mode 100755
... ...
@@ -204,11 +204,9 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1,
204 204
     iter = iter + 1    
205 205
   }
206 206
     
207
-  reordered.labels = reorder.label.by.size(z.best, K)
208
-  z.final.reorder = reordered.labels$new.labels
209 207
   names = list(row=rownames(counts), column=colnames(counts), sample=levels(sample.label))
210 208
 
211
-  result = list(z=z.final.reorder, completeLogLik=ll,  
209
+  result = list(z=z.best, completeLogLik=ll,  
212 210
                 finalLogLik=ll.best, seed=seed, K=K, 
213 211
                 sample.label=sample.label, alpha=alpha, 
214 212
                 beta=beta, count.checksum=count.checksum, 
... ...
@@ -216,6 +214,8 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1,
216 214
   
217 215
   class(result) = "celda_C"
218 216
   
217
+  result = reorder.celdaC(counts = counts, res = result)
218
+  
219 219
   return(result)
220 220
 }
221 221
 
... ...
@@ -351,6 +351,16 @@ cC.calcLL = function(m.CP.by.S, n.CP.by.G, s, z, K, nS, alpha, beta) {
351 351
   return(final)
352 352
 }
353 353
 
354
+reorder.celdaC = function(counts,res){
355
+  #Reorder K
356
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
357
+  fm.norm <- t(normalizeCounts(t(fm$proportions$gene.states),scale.factor = 1))
358
+  d <- dist(t(fm.norm),diag = TRUE, upper = TRUE)
359
+  h <- hclust(d, method = "complete")
360
+  res <- recodeClusterZ(res,from = h$order,
361
+                        to = c(1:ncol(fm$counts$gene.states)))
362
+  return(res)
363
+}
354 364
 
355 365
 #' Generate factorized matrices showing each feature's influence on the celda_C model clustering 
356 366
 #' 
357 367
old mode 100644
358 368
new mode 100755
... ...
@@ -93,7 +93,7 @@ calculateLoglikFromVariables.celda_CG = function(counts, s, z, y, K, L, alpha, b
93 93
   return(final)
94 94
 }
95 95
 
96
-cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) {
96
+.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) {
97 97
 
98 98
   ## Determine if any TS has 0 genes
99 99
   ## Need to remove 0 gene states as this will cause the likelihood to fail
... ...
@@ -142,7 +142,7 @@ cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS
142 142
   return(final)
143 143
 }
144 144
 
145
-cCG.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) {
145
+.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) {
146 146
 
147 147
   ## Calculate for "Theta" component
148 148
   theta.ll = log(m.CP.by.S + alpha)
... ...
@@ -158,7 +158,7 @@ cCG.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) {
158 158
   return(final)
159 159
 }
160 160
 
161
-cCG.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delta, gamma) {
161
+.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delta, gamma) {
162 162
   
163 163
   ## Calculate for "Phi" component
164 164
   phi.ll = sum(lgamma(n.CP.by.TS + beta))
... ...
@@ -176,6 +176,25 @@ cCG.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delt
176 176
   return(final)
177 177
 }
178 178
 
179
+reorder.celdaCG = function(counts,res){
180
+  #Reorder K
181
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
182
+  fm.norm <- t(normalizeCounts(t(fm$proportions$population.states),scale.factor = 1))
183
+  d <- dist(t(fm.norm),diag = TRUE, upper = TRUE)
184
+  h <- hclust(d, method = "complete")
185
+  res <- recodeClusterZ(res,from = h$order,
186
+                        to = c(1:ncol(fm$counts$population.states)))
187
+  
188
+  #Reorder L
189
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
190
+  fm.norm <- t(normalizeCounts(t(fm$proportions$population.states),scale.factor = 1))
191
+  d <- dist((fm.norm),diag = TRUE, upper = TRUE)
192
+  h <- hclust(d, method = "complete")
193
+  res <- recodeClusterY(res,from = h$order,
194
+                        to = c(1:nrow(fm$counts$population.states)))
195
+  return(res)
196
+}
197
+
179 198
 #' Simulate cells from the cell/gene clustering generative model
180 199
 #' 
181 200
 #' @param S The number of samples
... ...
@@ -246,8 +265,8 @@ simulateCells.celda_CG = function(model, S=10, C.Range=c(50,100), N.Range=c(500,
246 265
   zero.row.idx = which(rowSums(cell.counts) == 0)
247 266
   if (length(zero.row.idx > 0)) {
248 267
     cell.counts = cell.counts[-zero.row.idx, ]
249
-  }
250
-  new$y = new$y[-zero.row.idx]
268
+    new$y = new$y[-zero.row.idx]
269
+  } 
251 270
  
252 271
   ## Assign gene/cell/sample names 
253 272
   rownames(cell.counts) = paste0("Gene_", 1:nrow(cell.counts))
... ...
@@ -332,7 +351,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
332 351
   nG = nrow(counts)
333 352
   nM = ncol(counts)
334 353
   
335
-  ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
354
+  ll = .calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
336 355
   
337 356
   iter = 1
338 357
   continue = TRUE
... ...
@@ -358,7 +377,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
358 377
         temp.n.CP = n.CP
359 378
         temp.n.CP[j] = temp.n.CP[j] + sum(counts[,i])
360 379
 
361
-        probs[j] = cCG.calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
380
+        probs[j] = .calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
362 381
       }  
363 382
     
364 383
       ## Sample next state and add back counts
... ...
@@ -410,7 +429,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
410 429
         temp.nG.by.TS = nG.by.TS + (1 * ADD_PSEUDO)
411 430
         temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1
412 431
 
413
-        probs[j] = cCG.calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
432
+        probs[j] = .calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
414 433
 			n.by.TS=temp.n.by.TS,
415 434
 			nG.by.TS=temp.nG.by.TS,
416 435
 			nG.in.Y=temp.nG.by.TS[j],
... ...
@@ -481,7 +500,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
481 500
    }
482 501
 
483 502
     ## Calculate complete likelihood
484
-    temp.ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
503
+    temp.ll = .calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma)
485 504
     if((all(temp.ll > ll)) | iter == 1) {
486 505
       z.best = z
487 506
       y.best = y
... ...
@@ -493,14 +512,12 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
493 512
     iter = iter + 1    
494 513
   }
495 514
     
496
-  ## Peform reordering on final Z and Y assigments:
497
-  reordered.labels = reorder.labels.by.size.then.counts(counts, z=z.best, 
498
-                                                        y=y.best, K=K, L=L)
515
+
499 516
   names = list(row=rownames(counts), column=colnames(counts), 
500 517
                sample=levels(sample.label))
501 518
   
502 519
   
503
-  result = list(z=reordered.labels$z, y=reordered.labels$y, completeLogLik=ll, 
520
+  result = list(z=z.best, y=y.best, completeLogLik=ll, 
504 521
                 finalLogLik=ll.best, K=K, L=L, alpha=alpha, 
505 522
                 beta=beta, delta=delta, gamma=gamma, seed=seed, 
506 523
                 sample.label=sample.label, names=names,
... ...
@@ -508,6 +525,8 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1,
508 525
   
509 526
   class(result) = "celda_CG" 
510 527
    
528
+  ## Peform reordering on final Z and Y assigments:
529
+  result = reorder.celdaCG(counts = counts, res = result)
511 530
   return(result)
512 531
 }
513 532
 
... ...
@@ -637,7 +656,7 @@ clusterProbability.celda_CG = function(counts, celda.mod) {
637 656
 	  temp.n.CP = n.CP
638 657
 	  temp.n.CP[j] = temp.n.CP[j] + sum(counts[,i])
639 658
 
640
-	  z.prob[i,j] = cCG.calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
659
+	  z.prob[i,j] = .calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta)
641 660
 	}  
642 661
   
643 662
 	m.CP.by.S[z[i],s[i]] = m.CP.by.S[z[i],s[i]] + 1
... ...
@@ -667,7 +686,7 @@ clusterProbability.celda_CG = function(counts, celda.mod) {
667 686
 	  temp.nG.by.TS = nG.by.TS + (1 * ADD_PSEUDO)
668 687
 	  temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1
669 688
 
670
-	  y.prob[i,j] = cCG.calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
689
+	  y.prob[i,j] = .calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS,
671 690
 		  n.by.TS=temp.n.by.TS,
672 691
 		  nG.by.TS=temp.nG.by.TS,
673 692
 		  nG.in.Y=temp.nG.by.TS[j],
674 693
old mode 100644
675 694
new mode 100755
... ...
@@ -123,7 +123,16 @@ cG.calcLL = function(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delt
123 123
   return(final)
124 124
 }
125 125
 
126
-
126
+reorder.celdaG = function(counts,res){
127
+  #Reorder L
128
+  fm <- factorizeMatrix(counts = counts, celda.mod = res)
129
+  fm.norm <- t(normalizeCounts(fm$proportions$gene.states,scale.factor = 1))
130
+  d <- dist((fm.norm),diag = TRUE, upper = TRUE)
131
+  h <- hclust(d, method = "complete")
132
+  res <- recodeClusterY(res,from = h$order,
133
+                        to = c(1:nrow(fm$counts$cell.states)))
134
+  return(res)
135
+}
127 136
 
128 137
 
129 138
 # Calculate Log Likelihood For Single Set of Cluster Assignments (Gene Clustering)
... ...
@@ -297,16 +306,16 @@ celda_G = function(counts, L, beta=1, delta=1, gamma=1, max.iter=25,
297 306
     iter <- iter + 1    
298 307
   }
299 308
     
300
-  reordered.labels = reorder.label.by.size(y.best, L)
301
-  y.final.order = reordered.labels$new.labels
302 309
   names = list(row=rownames(counts), column=colnames(counts))  
303 310
 
304
-  result = list(y=y.final.order, completeLogLik=ll, 
311
+  result = list(y=y.best, completeLogLik=ll, 
305 312
                 finalLogLik=ll.best, L=L, beta=beta, delta=delta, gamma=gamma,
306 313
                 count.checksum=count.checksum, seed=seed, names=names)
307 314
   
308 315
   class(result) = "celda_G"
309 316
   
317
+  result = reorder.celdaG(counts = counts, res = result)
318
+  
310 319
   return(result)
311 320
 }
312 321
 
... ...
@@ -361,8 +370,10 @@ simulateCells.celda_G = function(model, C=100, N.Range=c(500,5000),  G=1000,
361 370
   ## Ensure that there are no all-0 rows in the counts matrix, which violates a celda modeling
362 371
   ## constraint (columns are guarnteed at least one count):
363 372
   zero.row.idx = which(rowSums(cell.counts) == 0)
364
-  cell.counts = cell.counts[-zero.row.idx, ]
365
-  y = y[-zero.row.idx]
373
+  if (length(zero.row.idx > 0)) {
374
+    cell.counts = cell.counts[-zero.row.idx, ]
375
+    y = y[-zero.row.idx]
376
+  }
366 377
     
367 378
   rownames(cell.counts) = paste0("Gene_", 1:nrow(cell.counts))
368 379
   colnames(cell.counts) = paste0("Cell_", 1:ncol(cell.counts))
369 380
old mode 100644
370 381
new mode 100755
371 382
old mode 100644
372 383
new mode 100755
373 384
old mode 100644
374 385
new mode 100755
... ...
@@ -2,25 +2,26 @@
2 2
 # S3 Methods for celda_list objects                                            #
3 3
 ################################################################################
4 4
 
5
-#' Run the celda Bayesian hierarchical model on a matrix of counts.
6
-#' TODO: - If no chain provided, automatically choose best chain
7
-#'       - Smarter subsetting of the run.params to DRY this function up
5
+# TODO: - If no chain provided, automatically choose best chain
6
+#       - Smarter subsetting of the run.params to DRY this function up
7
+
8
+#' Select the best model amongst a list of models generated in a celda run.
8 9
 #' 
9 10
 #' @param celda.list A celda_list object returned from celda()
10
-#' @param K The K parameter for the desired model in the results list
11
-#' @param L The L parameter for the desired model in the results list
12
-#' @param chain The desired chain for the specified model
13
-#' @param best Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Overrides chain parameter if provided.
11
+#' @param K The K parameter for the desired model in the results list. Defaults to NULL.
12
+#' @param L The L parameter for the desired model in the results list. Defaults to NULL.
13
+#' @param chain The desired chain for the specified model. Defaults to NULL. Overrides best parameter if provided.
14
+#' @param best Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Defaults to "loglik."
14 15
 #' @return A celda model object matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found.
15 16
 #' @export
16
-getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
17
+getModel = function(celda.list, K=NULL, L=NULL, chain=NULL, best="loglik") {
17 18
   validateGetModelParams(celda.list, K, L, chain, best)  # Sanity check params
18 19
   
19 20
   requested.chain = NA
20 21
   run.params = celda.list$run.params
21 22
   
22 23
   if (celda.list$content.type == "celda_CG") {
23
-    if (!is.null(best)) {
24
+    if (is.null(chain)) {
24 25
       matching.chain.idx = run.params[run.params$K == K & run.params$L == L, "index"]
25 26
       requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best)
26 27
     } else {
... ...
@@ -32,7 +33,7 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
32 33
   
33 34
   
34 35
   if (celda.list$content.type == "celda_C") {
35
-    if (!is.null(best)) {
36
+    if (is.null(chain)) {
36 37
       matching.chain.idx = run.params[run.params$K == K, "index"]
37 38
       requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best)
38 39
     } else {
... ...
@@ -43,7 +44,7 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
43 44
   
44 45
   
45 46
   if (celda.list$content.type == "celda_G") {
46
-    if (!is.null(best)) {
47
+    if (is.null(chain)) {
47 48
       matching.chain.idx = run.params[run.params$L == L, "index"]
48 49
       requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best)
49 50
     } else { 
... ...
@@ -62,6 +63,30 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) {
62 63
 }
63 64
 
64 65
 
66
+#' Select a set of models from a celda_list objects based off of rows in its run.params attribute.
67
+#' 
68
+#' @param celda.list A celda_list object returned from celda()
69
+#' @param run.param.rows Row indices in the celda.list's run params corresponding to the desired models.
70
+#' @return A celda_list containing celda model objects matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found.
71
+#' @export
72
+selectModels = function(celda.list, run.param.rows) {
73
+  desired.models = lapply(run.param.rows,
74
+                          function(row.idx) {
75
+                            if (!is.numeric(row.idx) | 
76
+                                row.idx < 0 | 
77
+                                row.idx > nrow(celda.list$run.params)) {
78
+                                  stop("Invalid row index provided") 
79
+                            }
80
+                            params = celda.list$run.params[row.idx, ]
81
+                            getModel(celda.list, params$K, params$L, params$chain)
82
+                          })
83
+  subsetted.celda.list = list(run.params=celda.list$run.params[run.param.rows, ],
84
+                              res.list=desired.models, content.type=celda.list$content.type,
85
+                              count.checksum=celda.list$count.checksum)
86
+  return(subsetted.celda.list)
87
+}
88
+
89
+
65 90
 validateGetModelParams = function(celda.list, K, L, chain, best) {
66 91
   if (class(celda.list) != "celda_list") stop("First argument to getModel() should be an object of class 'celda_list'")
67 92
   
68 93
old mode 100644
69 94
new mode 100755
70 95
old mode 100644
71 96
new mode 100755
72 97
old mode 100644
73 98
new mode 100755
74 99
old mode 100644
75 100
new mode 100755
76 101
old mode 100644
77 102
new mode 100755
... ...
@@ -17,8 +17,8 @@ plotDrGrid <- function(dim1, dim2, matrix, size, xlab, ylab, color_low, color_mi
17 17
   colnames(m) <- c(xlab,ylab,"facet",var_label)
18 18
   ggplot2::ggplot(m, ggplot2::aes_string(x=xlab, y=ylab)) + ggplot2::geom_point(stat = "identity", size = size, ggplot2::aes_string(color = var_label)) + 
19 19
     ggplot2::facet_wrap(~facet) + ggplot2::theme_bw() + ggplot2::scale_colour_gradient2(low = color_low, high = color_high, mid = color_mid, midpoint = (max(m[,4])-min(m[,4]))/2 ,name = gsub("_"," ",var_label)) + 
20
-    ggplot2::theme(strip.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.spacing = unit(0,"lines"),
21
-                   panel.background = element_blank(), axis.line = ggplot2::element_line(colour = "black"))
20
+    ggplot2::theme(strip.background = ggplot2::element_blank(), panel.grid.major = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), panel.spacing = unit(0,"lines"),
21
+                   panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black"))
22 22
 }
23 23
 
24 24
 #' Create a scatterplot for each row of a normalized gene expression matrix where x and y axis are from a data dimensionality reduction tool.  
25 25
old mode 100644
26 26
new mode 100755
27 27
old mode 100644
28 28
new mode 100755
29 29
old mode 100644
30 30
new mode 100755
31 31
old mode 100644
32 32
new mode 100755
... ...
@@ -30,7 +30,7 @@ An analysis example using celda with RNASeq via vignette('celda-analysis')
30 30
 
31 31
 
32 32
 ## New Features and announcements
33
-The v0.2 release of celda represents a useable implementation of the various celda clustering models.
33
+The v0.3 release of celda represents a useable implementation of the various celda clustering models.
34 34
 Please submit any usability issues or bugs to the issue tracker at https://github.com/compbiomed/celda
35 35
 
36 36
 You can discuss celda, or ask the developers usage questions, in our [Google Group.](https://groups.google.com/forum/#!forum/celda-list)
37 37
deleted file mode 100644
... ...
@@ -1 +0,0 @@
1
-20892e2fcfc0c39f6f9e6333e5fd3523849044c3
2 0
\ No newline at end of file
3 1
old mode 100644
4 2
new mode 100755
5 3
old mode 100644
6 4
new mode 100755
... ...
@@ -14,9 +14,11 @@ library(celda)
14 14
 library(Matrix)
15 15
 library(gtools)
16 16
 library(ggplot2)
17
+library(Rtsne)
18
+library(reshape2)
17 19
 
18 20
 ## ------------------------------------------------------------------------
19
-sim_counts = simulateCells("celda_CG", K=3,L=10)
21
+sim_counts = simulateCells("celda_CG", K = 3, L = 10)
20 22
 
21 23
 ## ------------------------------------------------------------------------
22 24
 dim(sim_counts$counts)
... ...
@@ -35,11 +37,11 @@ table(sim_counts$z, sim_counts$sample.label)
35 37
 
36 38
 ## ---- fig.show='hold', warning = FALSE, message = FALSE------------------
37 39
 
38
-celdaCG.res = celda_CG(counts = sim_counts$counts, K =3, L = 10, max.iter = 10, sample.label = sim_counts$sample.label)
40
+celda.res = celda(counts = sim_counts$counts, model = "celda_CG", K = 3, L = 10, max.iter = 10, cores = 1, nchains = 1)
39 41
 
40 42
 ## ------------------------------------------------------------------------
41
-z = celdaCG.res$z
42
-y = celdaCG.res$y
43
+z = celda.res$res.list[[1]]$z
44
+y = celda.res$res.list[[1]]$y
43 45
 
44 46
 table(z, sim_counts$z)
45 47
 table(y, sim_counts$y)
... ...
@@ -47,28 +49,41 @@ table(y, sim_counts$y)
47 49
 
48 50
 ## ---- fig.width = 7, fig.height = 7, warning = FALSE, message = FALSE----
49 51
 norm.counts <- normalizeCounts(sim_counts$counts)
50
-render_celda_heatmap(counts=norm.counts,z=z,y=y, cluster.column = FALSE, cluster.row = FALSE)
52
+renderCeldaHeatmap(counts = norm.counts, z=z, y=y, normalize = NULL, color_scheme = "divergent",cluster_gene = TRUE, cluster_cell = TRUE)
51 53
 
52 54
 ## ------------------------------------------------------------------------
53
-factorized <- factorizeMatrix.celda_CG(counts = sim_counts$counts, celda.obj = celdaCG.res)
55
+model <- getModel(celda.res, K = 3, L = 10, best = "loglik")
56
+factorized <- factorizeMatrix(model, sim_counts$count)
54 57
 
55
-pop.states = t(factorized$proportions$population.states)
58
+## ------------------------------------------------------------------------
59
+dim(factorized$proportions$gene.states)
60
+head(factorized$proportions$gene.states)
61
+
62
+## ------------------------------------------------------------------------
63
+dim(factorized$proportions$cell.states)
64
+factorized$proportions$cell.states[1:10,1:3]
65
+
66
+## ------------------------------------------------------------------------
67
+pop.states = factorized$proportions$population.states
68
+dim(pop.states)
56 69
 head(pop.states)
57 70
 
58 71
 ## ---- fig.width = 7, fig.height = 7--------------------------------------
59
-render_celda_heatmap(pop.states, col=colorRampPalette(c("white", "blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10)
72
+data.pca <- prcomp(t(scale(t(factorized$proportions$cell.states))),scale = F, center = F)
73
+
74
+plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = celda.res$res.list[[1]]$z)
75
+
76
+plotDrState(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], matrix = factorized$proportions$cell.states, rescale = TRUE)
77
+
78
+## ---- fig.width = 7, fig.height = 7--------------------------------------
79
+renderCeldaHeatmap(pop.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(pop.states), y = 1:nrow(pop.states))
60 80
 
61 81
 ## ---- fig.width = 7, fig.height = 7--------------------------------------
62 82
 rel.states = sweep(pop.states, 1, rowSums(pop.states), "/")
63
-render_celda_heatmap(rel.states, col=colorRampPalette(c("white","blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10)
83
+renderCeldaHeatmap(rel.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(rel.states), y = 1:nrow(rel.states))
64 84
 
65 85
 ## ------------------------------------------------------------------------
66
-celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG")
67
-
68
-## ---- fig.width = 7, fig.height = 7, message = FALSE---------------------
69
-visualize.model = visualize_model_performance(celda.res.list,method="perplexity")
70
-
71
-visualize.model$K
86
+celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG", cores = 1)
72 87
 
73 88
 ## ------------------------------------------------------------------------
74 89
 model = getModel(celda.res.list, K = 3, L = 10, best = "loglik")
... ...
@@ -79,37 +94,47 @@ library(celda)
79 94
 library(Matrix)
80 95
 library(data.table)
81 96
 library(pheatmap)
82
-library(biomaRt)
97
+library(Rtsne)
83 98
 
84 99
 ## ------------------------------------------------------------------------
85
-load(file="../data/cmatp_data.rda")
86
-dim(cmatp_data)
87
-head(rownames(cmatp_data))
88
-head(colnames(cmatp_data))
100
+dim(pbmc_data)
101
+head(rownames(pbmc_data))
102
+head(colnames(pbmc_data))
89 103
 
90 104
 ## ------------------------------------------------------------------------
91
-cmatp_select <- cmatp_data[rowSums(cmatp_data>4) > 4,]
105
+pbmc_select <- pbmc_data[rowSums(pbmc_data>4) > 4,]
92 106
 
93 107
 ## ---- message = FALSE, verbose = FALSE-----------------------------------
94
-cmatp.res = celda_CG(cmatp_select, sample.label = rep(1,ncol(cmatp_select)), K = 15, L = 20)
108
+pbmc.res = celda(pbmc_select, K = 10, L = 20, cores = 1, model = "celda_CG", nchains = 8, max.iter = 100)
109
+model<-getModel(pbmc.res, K = 10, L = 20, best = "loglik")
95 110
 
96 111
 ## ------------------------------------------------------------------------
97
-factorize.matrix = factorizeMatrix.celda_CG(counts = cmatp_select, celda.obj = cmatp.res)
98
-top.genes <- topRank(factorize.matrix$proportions$gene.states)
112
+factorize.matrix = factorizeMatrix(model, counts=pbmc_select)
113
+top.genes <- topRank(factorize.matrix$proportions$gene.states, n = 25)
99 114
 
100 115
 ## ------------------------------------------------------------------------
101
-top.genes$names$L16
116
+top.genes$names$L3
102 117
 
103 118
 ## ------------------------------------------------------------------------
104
-top.genes$names$L19
119
+top.genes$names$L4
105 120
 
106 121
 ## ---- fig.width = 7, fig.height = 7--------------------------------------
107 122
 top.genes.ix <- unique(unlist(top.genes$index))
108
-norm.cmatp.counts <- normalizeCounts(cmatp_select)
109
-render_celda_heatmap(norm.cmatp.counts[top.genes.ix,], z = cmatp.res$z, y = cmatp.res$y[top.genes.ix], cluster.row = FALSE, cluster.column = FALSE)
123
+norm.pbmc.counts <- normalizeCounts(pbmc_select)
124
+renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model$z, y = model$y[top.genes.ix], normalize = NULL, color_scheme = "divergent")
125
+
126
+## ---- message = FALSE----------------------------------------------------
127
+norm_pbmc <- normalizeCounts(factorize.matrix$counts$cell.states)
128
+set.seed(123)
129
+tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 2000)
130
+pbmc_tsne <- tsne$Y
110 131
 
111 132
 ## ---- fig.width = 7, fig.height = 7--------------------------------------
112
-cmatp.states = t(factorize.matrix$proportions$population.states)
113
-cmatp.rel.states = sweep(cmatp.states, 1, rowSums(cmatp.states), "/")
114
-render_celda_heatmap(cmatp.rel.states, col=colorRampPalette(c("white","blue","darkgreen","green"))(100), scale.row=F, show_cellnames=T, show_genenames=TRUE, cluster.column=FALSE, cluster.row=FALSE, breaks = NA, z= 1:15, y = 1:20)
133
+plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model$z))
134
+
135
+plotDrState(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], matrix = factorize.matrix$proportions$cell.states, rescale = TRUE)
136
+
137
+marker.genes <- c("ENSG00000168685_IL7R","ENSG00000198851_CD3E","ENSG00000105374_NKG7","ENSG00000203747_FCGR3A","ENSG00000090382_LYZ","ENSG00000179639_FCER1A","ENSG00000156738_MS4A1", "ENSG00000163736_PPBP")
138
+gene.counts <- pbmc_select[marker.genes,]
139
+plotDrGene(dim1 = pbmc_tsne[,1],dim2 = pbmc_tsne[,2], matrix = gene.counts, rescale = TRUE)
115 140
 
... ...
@@ -1,6 +1,6 @@
1 1
 ---
2 2
 title: "Analyzing single-cell RNA-seq count data with Celda"
3
-author: "Josh D. Campbell, Sean Corbett, Masanao Yajima, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Yusuke Koga"
3
+author: "Josh D. Campbell, Masanao Yajima, Sean Corbett, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Zhe Wang, Yusuke Koga, Jiangyuan Liu"
4 4
 date: "`r Sys.Date()`"
5 5
 output: rmarkdown::html_vignette
6 6
 vignette: >
... ...
@@ -8,15 +8,15 @@ vignette: >
8 8
   %\VignetteEngine{knitr::rmarkdown}
9 9
   %\VignetteEncoding{UTF-8}
10 10
 ---
11
-
12
-# 1 Introduction
13
-
14
-"celda" stands for "**CE**llular **L**atent **D**irichlet **A**llocation", which is a suite of Bayesian hierarchical models and supporting functions to perform gene and cell clustering for count data generated by single cell RNA-seq platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications. Celda has advantages over other clustering frameworks:
15
-
16
-1) Celda can simultaneously cluster genes into transcriptional states and cells into subpopulations
11
+  
12
+## 1 Introduction
13
+  
14
+  "celda" stands for "**CE**llular **L**atent **D**irichlet **A**llocation", which is a suite of Bayesian hierarchical models and supporting functions to perform gene and cell clustering for count data generated by single cell RNA-seq platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications. Celda has advantages over other clustering frameworks:
15
+  
16
+1) Celda can simultaneously cluster cells into subpopulations and genes into transcriptional states, which are subgroups of genes that are likely to be expressed in similar proportions over different types of cells
17 17
 2) Celda uses count-based Dirichlet-multinomial distributions so no additional normalization is required for 3' DGE single cell RNA-seq
18 18
 3) These types of models have shown good performance with sparse data.
19
- 
19
+
20 20
 In this vignette we will demonstrate how to perform cell and gene clustering with simulated and real single cell RNA-seq data using the Bayesian hierarchical models within celda. 
21 21
 
22 22
 
... ...
@@ -57,29 +57,31 @@ library(celda)
57 57
 library(Matrix)
58 58
 library(gtools)
59 59
 library(ggplot2)
60
+library(Rtsne)
61
+library(reshape2)
60 62
 ```
61 63
 
62
-We use the built-in data generating function `simulateCells.celda_CG` to simulate a single-cell RNA-Seq dataset. `simulateCells.celda_CG` returns a list containing a count matrix where each row is a gene and each column is a cell. The K parameter determines the number of cell clusters while the L parameter determines the number of gene clusters within the dataset.
64
+We use the built-in data generating function `simulateCells` to simulate a single-cell RNA-Seq dataset. `simulateCells` returns a list containing a count matrix where each row is a gene and each column is a cell. The K parameter determines the number of cell clusters while the L parameter determines the number of gene clusters within the dataset.
63 65
 
64 66
 ```{r}
65
-sim_counts = simulateCells("celda_CG", K=3,L=10)
67
+sim_counts = simulateCells("celda_CG", K = 3, L = 10)
66 68
 ```
67 69
 
68
-The `counts` variable from `simulateCells.celda_CG` contains the counts matrix.
70
+The `counts` variable contains the counts matrix.
69 71
 The dimensions of counts matrix:
70 72
 
71 73
 ```{r}
72 74
 dim(sim_counts$counts)
73 75
 ```
74 76
 
75
-The `z` variable from `simulateCells.celda_CG` contains the cluster for each cell.
77
+The `z` variable contains the cluster for each cell.
76 78
 Here is the number of cells in each subpopulation:
77 79
 
78 80
 ```{r}
79 81
 table(sim_counts$z)
80 82
 ```
81 83
 
82
-The `y` variable from `simulateCells.celda_CG` contains the transcriptional state assignment for each gene. 
84
+The `y` variable contains the transcriptional state assignment for each gene. 
83 85
 Here is the number of genes in each transcriptional state:
84 86
 
85 87
 ```{r}
... ...
@@ -102,18 +104,18 @@ table(sim_counts$z, sim_counts$sample.label)
102 104
 
103 105
 ## 2.1.1 How to Run Celda
104 106
 
105
-There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. Here is the command for `celda_CG`:
107
+There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. The `celda` function will use these models and take in a counts matrix to create a `celda` object that contains the cluster labels for each cell and gene.
106 108
 
107 109
 ```{r, fig.show='hold', warning = FALSE, message = FALSE}
108 110
 
109
-celdaCG.res = celda_CG(counts = sim_counts$counts, K =3, L = 10, max.iter = 10, sample.label = sim_counts$sample.label)
111
+celda.res = celda(counts = sim_counts$counts, model = "celda_CG", K = 3, L = 10, max.iter = 10, cores = 1, nchains = 1)
110 112
 ```
111 113
 
112 114
 Here is a comparison between the true cluster labels and the estimated cluster labels.
113 115
 
114 116
 ```{r}
115
-z = celdaCG.res$z
116
-y = celdaCG.res$y
117
+z = celda.res$res.list[[1]]$z
118
+y = celda.res$res.list[[1]]$y
117 119
 
118 120
 table(z, sim_counts$z)
119 121
 table(y, sim_counts$y)
... ...
@@ -124,52 +126,71 @@ We can display the clustering results with a heatmap of the normalized counts:
124 126
 
125 127
 ```{r, fig.width = 7, fig.height = 7, warning = FALSE, message = FALSE}
126 128
 norm.counts <- normalizeCounts(sim_counts$counts)
127
-render_celda_heatmap(counts=norm.counts,z=z,y=y, cluster.column = FALSE, cluster.row = FALSE)
129
+renderCeldaHeatmap(counts = norm.counts, z=z, y=y, normalize = NULL, color_scheme = "divergent",cluster_gene = TRUE, cluster_cell = TRUE)
128 130
 ```
129 131
 
130 132
 ## 2.1.2 Matrix factorization
131 133
 
132
-`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. 
134
+`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. Matrix factorization decomposes the counts matrix into multiple matrices, using the given `celda` model. The gene.states contains each gene's contribution to the transcriptional state it belongs. The cell.states contains each sample's contribution to each transcriptional state. The population.states contains each transcriptional states' contribution to each of the cell states. Each of these matrices can be viewed as raw counts values or proportional values.
135
+
136
+```{r}
137
+model <- getModel(celda.res, K = 3, L = 10, best = "loglik")
138
+factorized <- factorizeMatrix(model, sim_counts$count)
139
+```
140
+
141
+```{r}
142
+dim(factorized$proportions$gene.states)
143
+head(factorized$proportions$gene.states)
144
+```
133 145
 
134 146
 ```{r}
135
-factorized <- factorizeMatrix.celda_CG(counts = sim_counts$counts, celda.obj = celdaCG.res)
147
+dim(factorized$proportions$cell.states)
148
+factorized$proportions$cell.states[1:10,1:3]
149
+```
150
+
151
+For instance, roughly 30% of the population in cell state K2 is in transcriptional state L1. 
136 152
 
137
-pop.states = t(factorized$proportions$population.states)
153
+```{r}
154
+pop.states = factorized$proportions$population.states
155
+dim(pop.states)
138 156
 head(pop.states)
139 157
 ```
140 158
 
141
-These states can also be visualized with a heatmap:
159
+`celda` contains several plotting tools for data dimensionality reduction tool outputs. The PCA result for the factorized matrix is plotted by the `plor_dr_cluster` and `plot_dr_state` function based off of the celda clusters and state probabilities for each cell:
142 160
 
143 161
 ```{r, fig.width = 7, fig.height = 7}
144
-render_celda_heatmap(pop.states, col=colorRampPalette(c("white", "blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10)
162
+data.pca <- prcomp(t(scale(t(factorized$proportions$cell.states))),scale = F, center = F)
163
+
164
+plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = celda.res$res.list[[1]]$z)
165
+
166
+plotDrState(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], matrix = factorized$proportions$cell.states, rescale = TRUE)
145 167
 ```
146 168
 
147
-Sometimes it may be helpful to visualize the relative probability of each transcriptional state in each cellular subpopulation. We can do this by normalizing the rows of the probability matrix to sum up to 1:
169
+The transcriptional and cell states can also be visualized with a heatmap:
170
+
171
+```{r, fig.width = 7, fig.height = 7}
172
+renderCeldaHeatmap(pop.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(pop.states), y = 1:nrow(pop.states))
173
+```
174
+
175
+Sometimes it may be helpful to visualize the relative probability of each transcriptional state in each cellular subpopulation. We can do this by z-score normalizing the probability matrix:
148 176
 
149 177
 ```{r, fig.width = 7, fig.height = 7}
150 178
 rel.states = sweep(pop.states, 1, rowSums(pop.states), "/")
151
-render_celda_heatmap(rel.states, col=colorRampPalette(c("white","blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10)
179
+renderCeldaHeatmap(rel.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(rel.states), y = 1:nrow(rel.states))
152 180
 ```
153 181
 
154 182
 The relative probabilities will tend to reflect the patterns observed in the heatmap as the genes in the heatmaps are Z-score transformed and thus on a relative scale as well.
155 183
 
184
+
156 185
 ## 2.1.3 Identifying the optimal number of states
157 186
 
158 187
 The optimal number of K and L will likely not be known a priori. Therefore multiple choices of K and L may need to tested and compared. Additionally, `celda` is sensitive to initial start conditions, so it is good practice to run multiple chains for each combination of K and L to increase the chances of finding the most optimal solution. We have built a wrapper function that will run multiple combinations of K and L with multiple chains for each combination in parallel. 
159 188
 
160 189
 ```{r}
161
-celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG")
162
-```
163
-
164
-Here, `celda` will fit the model for every combination of K between 2 and 4 and L between 9 and 11. Three chains will be run for each combination. The parameter "ncores" can be set to something more than 1 to run multiple chains in parallel.
165
-
166
-```{r, fig.width = 7, fig.height = 7, message = FALSE}
167
-visualize.model = visualize_model_performance(celda.res.list,method="perplexity")
168
-
169
-visualize.model$K
190
+celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG", cores = 1)
170 191
 ```
171 192
 
172
-Perplexity is a measure of goodness-of-fit for the model where lower perplexity is better. The perplexity of each chain can be visualized as a function of K. In this example, K=3 is much better than K=2, but K=4 does not give much improvement over K=3. We would therefore select K=3 in this case. Note: We are still actively testing various methods for assessing model performance. 
193
+Here, `celda` will fit the model for every combination of K between 2 and 4 and L between 9 and 11. Three chains will be run for each combination. The parameter "cores" can be set to something more than 1 to run multiple chains in parallel.
173 194
 
174 195
 We can use the function "getModel" to select a particular model from the list of models once we have selected the optimal K and L.
175 196
 
... ...
@@ -179,6 +200,8 @@ model = getModel(celda.res.list, K = 3, L = 10, best = "loglik")
179 200
 
180 201
 The chain with the best log likelihood will be returned from all models that were run with K = 3 and L = 10.
181 202
 
203
+
204
+
182 205
 # 3. PBMC Analysis with celda
183 206
 
184 207
 ```{r setup, include=FALSE}
... ...
@@ -187,59 +210,72 @@ library(celda)
187 210
 library(Matrix)
188 211
 library(data.table)
189 212
 library(pheatmap)
190
-library(biomaRt)
213
+library(Rtsne)
191 214
 ```
192 215
 
193
-We will use the "cmatp_data" dataset, which is a dataset of 3000 Peripheral Blood Mononuclear Cells (PBMC), available from 10X Genomics. The rows are organized by gene names, while the columns are organized by barcodes. For this tutorial, the dataset has been slightly modified so the rownames are comprised of the Ensembl gene ID as well as the gene name. The raw dataset can be found at https://support.10xgenomics.com/single-cell/software/pipelines/latest/rkit.
216
+We will use the "pbmc_data" dataset, which is a dataset of 3000 Peripheral Blood Mononuclear Cells (PBMC), available from 10X Genomics. The rows are organized by gene names, while the columns are organized by barcodes. For this tutorial, the dataset has been slightly modified so the rownames are comprised of the Ensembl gene ID as well as the gene name. The raw dataset can be found at https://support.10xgenomics.com/single-cell/software/pipelines/latest/rkit.
194 217
 
195 218
 We will only include genes with at least 5 counts in a least 5 cells for this analysis to reduce computational time. However, `celda` is capable of analyzing genes with fewer counts. 
196 219
 
197 220
 ```{r}
198
-load(file="../data/cmatp_data.rda")
199
-dim(cmatp_data)
200
-head(rownames(cmatp_data))
201
-head(colnames(cmatp_data))
221
+dim(pbmc_data)
222
+head(rownames(pbmc_data))
223
+head(colnames(pbmc_data))
202 224
 ```
203 225
 
204 226
 ```{r}
205
-cmatp_select <- cmatp_data[rowSums(cmatp_data>4) > 4,]
227
+pbmc_select <- pbmc_data[rowSums(pbmc_data>4) > 4,]
206 228
 ```
207 229
 
230
+
208 231
 ### 3.1.1 Analysis of 10X Genomics PBMC data
209 232
 
210 233
 
211 234
 ```{r, message = FALSE, verbose = FALSE}
212
-cmatp.res = celda_CG(cmatp_select, sample.label = rep(1,ncol(cmatp_select)), K = 15, L = 20)
235
+pbmc.res = celda(pbmc_select, K = 10, L = 20, cores = 1, model = "celda_CG", nchains = 8, max.iter = 100)
236
+model<-getModel(pbmc.res, K = 10, L = 20, best = "loglik")
213 237
 ```
214 238
 
215 239
 After matrix factorization, the top genes can be selected using the `topRank` function on the "gene.states" matrix:  
216 240
 
217 241
 ```{r}
218
-factorize.matrix = factorizeMatrix.celda_CG(counts = cmatp_select, celda.obj = cmatp.res)
219
-top.genes <- topRank(factorize.matrix$proportions$gene.states)
242
+factorize.matrix = factorizeMatrix(model, counts=pbmc_select)
243
+top.genes <- topRank(factorize.matrix$proportions$gene.states, n = 25)
220 244
 ```
221 245
 
222 246
 ```{r}
223
-top.genes$names$L16
247
+top.genes$names$L3
224 248
 ```
225 249
 
226 250
 ```{r}
227
-top.genes$names$L19
251
+top.genes$names$L4
228 252
 ```
229 253
 
230 254
 We can make a heatmap of these top genes as follows: 
231 255
 
232 256
 ```{r, fig.width = 7, fig.height = 7}
233 257
 top.genes.ix <- unique(unlist(top.genes$index))
234
-norm.cmatp.counts <- normalizeCounts(cmatp_select)
235
-render_celda_heatmap(norm.cmatp.counts[top.genes.ix,], z = cmatp.res$z, y = cmatp.res$y[top.genes.ix], cluster.row = FALSE, cluster.column = FALSE)
258
+norm.pbmc.counts <- normalizeCounts(pbmc_select)
259
+renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model$z, y = model$y[top.genes.ix], normalize = NULL, color_scheme = "divergent")
236 260
 ```
237 261
 
238
-We can also summarize the relative contribution of each transcriptional state in each cell population with a heatmap: 
262
+We can plot tSNE results via `plot_dr_cluster`, `plot_dr_state` and `plot_dr_gene`. These will determine how each cell is labeled via celda, visualize each cell's expression respectivelyof a specific transcriptional state and visualize each cell's expression of a set of genes, . 
263
+
264
+```{r, message = FALSE}
265
+norm_pbmc <- normalizeCounts(factorize.matrix$counts$cell.states)
266
+set.seed(123)
267
+tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 2000)
268
+pbmc_tsne <- tsne$Y
269
+```
239 270
 
240 271
 ```{r, fig.width = 7, fig.height = 7}
241
-cmatp.states = t(factorize.matrix$proportions$population.states)
242
-cmatp.rel.states = sweep(cmatp.states, 1, rowSums(cmatp.states), "/")
243
-render_celda_heatmap(cmatp.rel.states, col=colorRampPalette(c("white","blue","darkgreen","green"))(100), scale.row=F, show_cellnames=T, show_genenames=TRUE, cluster.column=FALSE, cluster.row=FALSE, breaks = NA, z= 1:15, y = 1:20)
272
+plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model$z))
273
+
274
+plotDrState(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], matrix = factorize.matrix$proportions$cell.states, rescale = TRUE)
275
+
276
+marker.genes <- c("ENSG00000168685_IL7R","ENSG00000198851_CD3E","ENSG00000105374_NKG7","ENSG00000203747_FCGR3A","ENSG00000090382_LYZ","ENSG00000179639_FCER1A","ENSG00000156738_MS4A1", "ENSG00000163736_PPBP")
277
+gene.counts <- pbmc_select[marker.genes,]
278
+plotDrGene(dim1 = pbmc_tsne[,1],dim2 = pbmc_tsne[,2], matrix = gene.counts, rescale = TRUE)
244 279
 ```
245 280
 
281
+With the use of the plots, it is also possible to determine which cell clusters express specific marker genes. For instance, the same cell cluster that highly expresses the transcriptional state L4 highly expresses LYZ, a known marker gene for monocytes. This is consistent with what we see in the above, in which transcriptional state L6 highly expresses monocytes.
246 282
new file mode 100644
... ...
@@ -0,0 +1,25 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/feature_selection.R
3
+\name{GiniPlot}
4
+\alias{GiniPlot}
5
+\title{Create a bar chart based on Gini coefficients of transcriptional states}
6
+\usage{
7
+GiniPlot(counts, celda.mod, cell_clusters = NULL, label_size = 4,
8
+  bar_col = NULL)
9
+}
10
+\arguments{
11
+\item{counts}{A numeric count matrix.}
12
+
13
+\item{celda.mod}{An object of class celda_CG.}
14
+
15
+\item{cell_clusters}{A list of cell clusters. Calculate Gini coefficients using cells from specific cell clusters. If NULL, uses all cells (NULL by default).}
16
+
17
+\item{label_size}{Numeric. The size of labels of transcrptional states on top of bars (4 by default).}
18
+
19
+\item{bar_col}{The color of bars (#FF8080FF by default).}
20
+}
21
+\description{
22
+A bar plot shows the ordered Gini coefficients of transcriptional states. The transcriptional states with high Gini coefficients (i.e. close to 1) 
23
+   indicates that only a few cell clusters highly express the genes in these transcriptional states. It is easier for users to define cell types based on these
24
+   genes in these significant states.
25
+}
0 26
old mode 100644
1 27
new mode 100755
2 28
old mode 100644
3 29
new mode 100755
4 30
old mode 100644
5 31
new mode 100755
6 32
old mode 100644
7 33
new mode 100755
8 34
old mode 100644
9 35
new mode 100755
10 36
old mode 100644
11 37
new mode 100755
12 38
old mode 100644
13 39
new mode 100755
... ...
@@ -5,7 +5,7 @@
5 5
 \title{Run the celda Bayesian hierarchical model on a matrix of counts.}
6 6
 \usage{
7 7
 celda(counts, model, sample.label = NULL, nchains = 1, cores = 1,
8
-  seed = 12345, verbose = TRUE, logfile_prefix = "Celda", ...)
8
+  seed = 12345, verbose = FALSE, logfile_prefix = "Celda", ...)
9 9
 }
10 10
 \arguments{
11 11
 \item{counts}{A count matrix}
12 12
old mode 100644
13 13
new mode 100755
14 14
old mode 100644
15 15
new mode 100755
16 16
old mode 100644
17 17
new mode 100755
18 18
old mode 100644
19 19
new mode 100755
20 20
old mode 100644
21 21
new mode 100755
22 22
old mode 100644
23 23
new mode 100755
24 24
old mode 100644
25 25
new mode 100755
26 26
old mode 100644
27 27
new mode 100755
28 28
old mode 100644
29 29
new mode 100755
30 30
old mode 100644
31 31
new mode 100755
32 32
old mode 100644
33 33
new mode 100755
34 34
old mode 100644
35 35
new mode 100755
36 36
old mode 100644
37 37
new mode 100755
38 38
old mode 100644
39 39
new mode 100755
40 40
old mode 100644
41 41
new mode 100755
42 42
new file mode 100644
... ...
@@ -0,0 +1,37 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/diffExp.R
3
+\name{diffExp_MAST}
4
+\alias{diffExp_MAST}
5
+\title{Gene expression markers for cell clusters using MAST}
6
+\usage{
7
+diffExp_MAST(counts, celda.mod, c1, c2 = NULL, only.pos = FALSE,
8
+  log2fc.threshold = 0, fdr.threshold = 1)
9
+}
10
+\arguments{
11
+\item{counts}{A numeric count matrix.}
12
+
13
+\item{celda.mod}{An object of class celda_C or celda_CG.}
14
+
15
+\item{c1}{Numeric. Cell cluster(s) to define markers for.}
16
+
17
+\item{c2}{Numeric. Second cell cluster(s) for comparison. If NULL (default) - use all
18
+other cells for comparison.}
19
+
20
+\item{only.pos}{Logical. Only return markers with positive log2fc (FALSE by default).}
21
+
22
+\item{log2fc.threshold}{Numeric. Only return markers whose the absolute values of log2fc
23
+are greater than this threshold (0 by default).}
24
+
25
+\item{fdr.threshold}{Numeric. Only return markers whose false discovery rates (FDRs) are less
26
+than this threshold (1 by default).}
27
+}
28
+\value{
29
+Data frame containing a ranked list (based on the absolute value of log2fc) of putative markers,
30
+   and associated statistics (p-value, log2fc and FDR).
31
+}
32
+\description{
33
+Finds markers (differentially expressed genes) for cell clusters
34
+   using MAST: a flexible statistical framework for assessing transcriptional
35
+   changes and characterizing heterogeneity in single-cell RNA sequencing data
36
+   (Finak et al, Genome Biology, 2015)
37
+}
0 38
old mode 100644
1 39
new mode 100755
2 40
old mode 100644
3 41
new mode 100755
4 42
old mode 100644
5 43
new mode 100755
6 44
old mode 100644
7 45
new mode 100755
8 46
old mode 100644
9 47
new mode 100755
10 48
old mode 100644
11 49
new mode 100755
12 50
old mode 100644
13 51
new mode 100755
14 52
old mode 100644
15 53
new mode 100755
16 54
old mode 100644
17 55
new mode 100755
18 56
old mode 100644
19 57
new mode 100755
20 58
old mode 100644
21 59
new mode 100755
22 60
old mode 100644
23 61
new mode 100755
24 62
old mode 100644
25 63
new mode 100755
26 64
old mode 100644
27 65
new mode 100755
28 66
old mode 100644
29 67
new mode 100755
30 68
old mode 100644
31 69
new mode 100755
32 70
old mode 100644
33 71
new mode 100755
34 72
old mode 100644
35 73
new mode 100755
36 74
old mode 100644
37 75
new mode 100755
... ...
@@ -2,28 +2,24 @@
2 2
 % Please edit documentation in R/celda_list.R
3 3
 \name{getModel}
4 4
 \alias{getModel}
5
-\title{Run the celda Bayesian hierarchical model on a matrix of counts.
6
-TODO: - If no chain provided, automatically choose best chain
7
-      - Smarter subsetting of the run.params to DRY this function up}
5
+\title{Select the best model amongst a list of models generated in a celda run.}
8 6
 \usage{
9
-getModel(celda.list, K = NULL, L = NULL, chain = 1, best = NULL)
7
+getModel(celda.list, K = NULL, L = NULL, chain = NULL, best = "loglik")
10 8
 }
11 9
 \arguments{
12 10
 \item{celda.list}{A celda_list object returned from celda()}
13 11
 
14
-\item{K}{The K parameter for the desired model in the results list}
12
+\item{K}{The K parameter for the desired model in the results list. Defaults to NULL.}
15 13
 
16
-\item{L}{The L parameter for the desired model in the results list}
14
+\item{L}{The L parameter for the desired model in the results list. Defaults to NULL.}
17 15
 
18
-\item{chain}{The desired chain for the specified model}
16
+\item{chain}{The desired chain for the specified model. Defaults to NULL. Overrides best parameter if provided.}
19 17
 
20
-\item{best}{Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Overrides chain parameter if provided.}
18
+\item{best}{Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Defaults to "loglik."}
21 19
 }
22 20
 \value{
23 21
 A celda model object matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found.
24 22
 }
25 23
 \description{
26
-Run the celda Bayesian hierarchical model on a matrix of counts.
27
-TODO: - If no chain provided, automatically choose best chain
28
-      - Smarter subsetting of the run.params to DRY this function up
24
+Select the best model amongst a list of models generated in a celda run.
29 25
 }
30 26
old mode 100644
31 27
new mode 100755
32 28
old mode 100644
33 29
new mode 100755
34 30
old mode 100644
35 31
new mode 100755
36 32
old mode 100644
37 33
new mode 100755
38 34
old mode 100644
39 35
new mode 100755
40 36
old mode 100644
41 37
new mode 100755
42 38
old mode 100644
43 39
new mode 100755
44 40
old mode 100644
45 41
new mode 100755
46 42
old mode 100644
47 43
new mode 100755
48 44
old mode 100644
49 45
new mode 100755
50 46
old mode 100644
51 47
new mode 100755
52 48
old mode 100644
53 49
new mode 100755
54 50
old mode 100644
55 51
new mode 100755
56 52
old mode 100644
57 53
new mode 100755
58 54
new file mode 100755
... ...
@@ -0,0 +1,19 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/celda_list.R
3
+\name{selectModels}
4
+\alias{selectModels}
5
+\title{Select a set of models from a celda_list objects based off of rows in its run.params attribute.}
6
+\usage{
7
+selectModels(celda.list, run.param.rows)
8
+}
9
+\arguments{
10
+\item{celda.list}{A celda_list object returned from celda()}
11
+
12
+\item{run.param.rows}{Row indices in the celda.list's run params corresponding to the desired models.}
13
+}
14
+\value{
15
+A celda_list containing celda model objects matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found.
16
+}
17
+\description{
18
+Select a set of models from a celda_list objects based off of rows in its run.params attribute.
19
+}
0 20
old mode 100644
1 21
new mode 100755
2 22
old mode 100644
3 23
new mode 100755
4 24
old mode 100644
5 25
new mode 100755
6 26
old mode 100644
7 27
new mode 100755
8 28
new file mode 100644
... ...
@@ -0,0 +1,40 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/StateHeatmap.R
3
+\name{stateHeatmap}
4
+\alias{stateHeatmap}
5
+\title{Transcriptional state heatmap}
6
+\usage{
7
+stateHeatmap(counts, celda.mod, state.use = 1, cells.use = NULL,
8
+  genes.use = NULL, normalize = TRUE, scale_row = scale,
9
+  show_genenames = TRUE)
10
+}
11
+\arguments{
12
+\item{counts}{A numeric count matrix.}
13
+
14
+\item{celda.mod}{An object of class celda_C or celda_CG.}
15
+
16
+\item{state.use}{Numeric. A transcriptional state to plot.}
17
+
18
+\item{cells.use}{Numeric or character. If a number, plot only this number of cells with respectively 
19
+highest and lowest proportions of counts in the transcriptional state. If a list of cell names, 
20
+plot only these cells orderd by their proportions of counts in the transcriptional state from 
21
+left to right. If NULL, plot all cells (NULL by default).}
22
+
23
+\item{genes.use}{Numeric or character. If a number, plot only this number of genes with respectively 
24
+highest and lowest proportions of counts in the transcriptional state. If a list of gene names, 
25
+plot only these genes orderd by their proportions of counts in the transcriptional state from 
26
+the bottom up. If NULL, plot all genes in the transcriptional state (NULL by default).}
27
+
28
+\item{normalize}{Logical. Should the columns be normmalized? Set to FALSE to disable. 'normalizeCounts' 
29
+normalizes to counts per million (CPM) (default by TRUE).}
30
+
31
+\item{scale_row}{Function; A function to scale each individual row. Set to NULL to disable. Occurs 
32
+after normalization and log transformation. Defualt is 'scale' and thus will Z-score transform each row.}
33
+
34
+\item{show_genenames}{Logical; specifying if gene names should be shown. Default to be TRUE.}
35
+}
36
+\description{
37
+Draws a heatmap focusing on a transcriptional state. Both cells and genes are sorted by 
38
+   their proportions of counts in a given transcriptional state. Allows for nice visualization of 
39
+   co-expression of those genes grouped into transcriptional states by Celda.
40
+}
0 41
old mode 100644
1 42
new mode 100755
2 43
old mode 100644
3 44
new mode 100755
4 45
old mode 100644
5 46
new mode 100755
6 47
old mode 100644
7 48
new mode 100755
8 49
old mode 100644
9 50
new mode 100755
10 51
old mode 100644
11 52
new mode 100755
12 53
old mode 100644
13 54
new mode 100755
14 55
old mode 100644
15 56
new mode 100755
16 57
old mode 100644
17 58
new mode 100755
18 59
old mode 100644
19 60
new mode 100755
<
20 61
old mode 100644
21 62
new mode 100755
... ...
@@ -1,6 +1,6 @@
1 1
 ---
2 2
 title: "Analyzing single-cell RNA-seq count data with Celda"
3
-author: "Josh D. Campbell, Masanao Yajima, Sean Corbett, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Zhe Wang, Yusuke Koga"
3
+author: "Josh D. Campbell, Masanao Yajima, Sean Corbett, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Zhe Wang, Yusuke Koga, Jiangyuan Liu"
4 4
 date: "`r Sys.Date()`"
5 5
 output: rmarkdown::html_vignette
6 6
 vignette: >
... ...
@@ -104,18 +104,18 @@ table(sim_counts$z, sim_counts$sample.label)
104 104
 
105 105
 ## 2.1.1 How to Run Celda
106 106
 
107
-There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. Here is the command for `celda_CG`:
107
+There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. The `celda` function will use these models and take in a counts matrix to create a `celda` object that contains the cluster labels for each cell and gene.
108 108
 
109 109
 ```{r, fig.show='hold', warning = FALSE, message = FALSE}
110 110
 
111
-celdaCG.res = celda_CG(counts = sim_counts$counts, K = 3, L = 10, max.iter = 10, sample.label = sim_counts$sample.label)
111
+celda.res = celda(counts = sim_counts$counts, model = "celda_CG", K = 3, L = 10, max.iter = 10, cores = 1, nchains = 1)
112 112
 ```
113 113
 
114 114
 Here is a comparison between the true cluster labels and the estimated cluster labels.
115 115
 
116 116
 ```{r}
117
-z = celdaCG.res$z
118
-y = celdaCG.res$y
117
+z = celda.res$res.list[[1]]$z
118
+y = celda.res$res.list[[1]]$y
119 119
 
120 120
 table(z, sim_counts$z)
121 121
 table(y, sim_counts$y)
... ...
@@ -131,12 +131,28 @@ renderCeldaHeatmap(counts = norm.counts, z=z, y=y, normalize = NULL, color_schem
131 131
 
132 132
 ## 2.1.2 Matrix factorization
133 133
 
134
-`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. 
134
+`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. Matrix factorization decomposes the counts matrix into multiple matrices, using the given `celda` model. The gene.states contains each gene's contribution to the transcriptional state it belongs. The cell.states contains each sample's contribution to each transcriptional state. The population.states contains each transcriptional states' contribution to each of the cell states. Each of these matrices can be viewed as raw counts values or proportional values.
135 135
 
136 136
 ```{r}
137
-factorized <- factorizeMatrix(celdaCG.res, sim_counts$count)
137
+model <- getModel(celda.res, K = 3, L = 10, best = "loglik")
138
+factorized <- factorizeMatrix(model, sim_counts$count)
139
+```
138 140
 
141
+```{r}
142
+dim(factorized$proportions$gene.states)
143
+head(factorized$proportions$gene.states)
144
+```
145
+
146
+```{r}
147
+dim(factorized$proportions$cell.states)
148
+factorized$proportions$cell.states[1:10,1:3]
149
+```
150
+
151
+For instance, roughly 30% of the population in cell state K2 is in transcriptional state L1. 
152
+
153
+```{r}
139 154
 pop.states = factorized$proportions$population.states
155
+dim(pop.states)
140 156
 head(pop.states)
141 157
 ```
142 158
 
... ...
@@ -145,7 +161,7 @@ head(pop.states)
145 161
 ```{r, fig.width = 7, fig.height = 7}
146 162
 data.pca <- prcomp(t(scale(t(factorized$proportions$cell.states))),scale = F, center = F)
147 163
 
148
-plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = as.factor(celdaCG.res$z))
164
+plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = celda.res$res.list[[1]]$z)
149 165
 
150 166
 plotDrState(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], matrix = factorized$proportions$cell.states, rescale = TRUE)
151 167
 ```
... ...
@@ -171,7 +187,7 @@ The relative probabilities will tend to reflect the patterns observed in the hea
171 187
 The optimal number of K and L will likely not be known a priori. Therefore multiple choices of K and L may need to tested and compared. Additionally, `celda` is sensitive to initial start conditions, so it is good practice to run multiple chains for each combination of K and L to increase the chances of finding the most optimal solution. We have built a wrapper function that will run multiple combinations of K and L with multiple chains for each combination in parallel. 
172 188
 
173 189
 ```{r}
174
-celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG")
190
+celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG", cores = 1)
175 191
 ```
176 192
 
177 193
 Here, `celda` will fit the model for every combination of K between 2 and 4 and L between 9 and 11. Three chains will be run for each combination. The parameter "cores" can be set to something more than 1 to run multiple chains in parallel.
... ...
@@ -216,23 +232,23 @@ pbmc_select <- pbmc_data[rowSums(pbmc_data>4) > 4,]
216 232
 
217 233
 
218 234
 ```{r, message = FALSE, verbose = FALSE}
219
-pbmc.res = celda(pbmc_select, K = 15, L = 20, cores = 1, model = "celda_CG", nchains = 10, max.iter = 100)
220
-model.res.list<-getModel(pbmc.res, K = 15, L = 20, best = "loglik")
235
+pbmc.res = celda(pbmc_select, K = 10, L = 20, cores = 1, model = "celda_CG", nchains = 8, max.iter = 100)
236
+model<-getModel(pbmc.res, K = 10, L = 20, best = "loglik")
221 237
 ```
222 238
 
223 239
 After matrix factorization, the top genes can be selected using the `topRank` function on the "gene.states" matrix:  
224 240
 
225 241
 ```{r}
226
-factorize.matrix = factorizeMatrix(model.res.list, counts=pbmc_select)
242
+factorize.matrix = factorizeMatrix(model, counts=pbmc_select)
227 243
 top.genes <- topRank(factorize.matrix$proportions$gene.states, n = 25)
228 244
 ```
229 245
 
230 246
 ```{r}
231
-top.genes$names$L6
247
+top.genes$names$L3
232 248
 ```
233 249
 
234 250
 ```{r}
235
-top.genes$names$L15
251
+top.genes$names$L4
236 252
 ```
237 253
 
238 254
 We can make a heatmap of these top genes as follows: 
... ...
@@ -240,7 +256,7 @@ We can make a heatmap of these top genes as follows:
240 256
 ```{r, fig.width = 7, fig.height = 7}
241 257
 top.genes.ix <- unique(unlist(top.genes$index))
242 258
 norm.pbmc.counts <- normalizeCounts(pbmc_select)
243
-renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model.res.list$z, y = model.res.list$y[top.genes.ix], normalize = NULL, color_scheme = "divergent")
259
+renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model$z, y = model$y[top.genes.ix], normalize = NULL, color_scheme = "divergent")
244 260
 ```
245 261
 
246 262
 We can plot tSNE results via `plot_dr_cluster`, `plot_dr_state` and `plot_dr_gene`. These will determine how each cell is labeled via celda, visualize each cell's expression respectivelyof a specific transcriptional state and visualize each cell's expression of a set of genes, . 
... ...
@@ -248,12 +264,12 @@ We can plot tSNE results via `plot_dr_cluster`, `plot_dr_state` and `plot_dr_gen
248 264
 ```{r, message = FALSE}
249 265
 norm_pbmc <- normalizeCounts(factorize.matrix$counts$cell.states)
250 266
 set.seed(123)
251
-tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 5000)
267
+tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 2000)
252 268
 pbmc_tsne <- tsne$Y
253 269
 ```
254 270
 
255 271
 ```{r, fig.width = 7, fig.height = 7}
256
-plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model.res.list$z))
272
+plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model$z))
257 273
 
258 274
 plotDrState(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], matrix = factorize.matrix$proportions$cell.states, rescale = TRUE)
259 275
 
... ...
@@ -262,4 +278,4 @@ gene.counts <- pbmc_select[marker.genes,]
262 278