Former-commit-id: c511bbc2b07a5089863d5e7be25a8f69a1374bc0
... | ... |
@@ -21,6 +21,7 @@ S3method(simulateCells,celda_G) |
21 | 21 |
S3method(visualizeModelPerformance,celda_C) |
22 | 22 |
S3method(visualizeModelPerformance,celda_CG) |
23 | 23 |
S3method(visualizeModelPerformance,celda_G) |
24 |
+export(GiniPlot) |
|
24 | 25 |
export(available_models) |
25 | 26 |
export(calculateLoglikFromVariables) |
26 | 27 |
export(calculatePerplexity) |
... | ... |
@@ -37,6 +38,7 @@ export(clusterProbability.celda_G) |
37 | 38 |
export(compareCountMatrix) |
38 | 39 |
export(completeClusterHistory) |
39 | 40 |
export(completeLogLikelihood) |
41 |
+export(diffExp_MAST) |
|
40 | 42 |
export(distinct_colors) |
41 | 43 |
export(factorizeMatrix) |
42 | 44 |
export(finalClusterAssignment) |
... | ... |
@@ -56,8 +58,10 @@ export(renderInteractiveKLPlot) |
56 | 58 |
export(renderIterationLikelihoodPlot) |
57 | 59 |
export(runParams) |
58 | 60 |
export(seed) |
61 |
+export(selectModels) |
|
59 | 62 |
export(semi_pheatmap) |
60 | 63 |
export(simulateCells) |
64 |
+export(stateHeatmap) |
|
61 | 65 |
export(topRank) |
62 | 66 |
export(visualizeModelPerformance) |
63 | 67 |
import(RColorBrewer) |
66 | 70 |
old mode 100644 |
67 | 71 |
new mode 100755 |
... | ... |
@@ -19,7 +19,7 @@ available_models = c("celda_C", "celda_G", "celda_CG") |
19 | 19 |
#' @return Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters |
20 | 20 |
#' @import foreach |
21 | 21 |
#' @export |
22 |
-celda = function(counts, model, sample.label=NULL, nchains=1, cores=1, seed=12345, verbose=TRUE, logfile_prefix="Celda", ...) { |
|
22 |
+celda = function(counts, model, sample.label=NULL, nchains=1, cores=1, seed=12345, verbose=FALSE, logfile_prefix="Celda", ...) { |
|
23 | 23 |
message("Starting celda...") |
24 | 24 |
validateArgs(counts, model, sample.label, nchains, cores, seed, ...) |
25 | 25 |
|
26 | 26 |
old mode 100644 |
27 | 27 |
new mode 100755 |
... | ... |
@@ -204,11 +204,9 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1, |
204 | 204 |
iter = iter + 1 |
205 | 205 |
} |
206 | 206 |
|
207 |
- reordered.labels = reorder.label.by.size(z.best, K) |
|
208 |
- z.final.reorder = reordered.labels$new.labels |
|
209 | 207 |
names = list(row=rownames(counts), column=colnames(counts), sample=levels(sample.label)) |
210 | 208 |
|
211 |
- result = list(z=z.final.reorder, completeLogLik=ll, |
|
209 |
+ result = list(z=z.best, completeLogLik=ll, |
|
212 | 210 |
finalLogLik=ll.best, seed=seed, K=K, |
213 | 211 |
sample.label=sample.label, alpha=alpha, |
214 | 212 |
beta=beta, count.checksum=count.checksum, |
... | ... |
@@ -216,6 +214,8 @@ celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1, |
216 | 214 |
|
217 | 215 |
class(result) = "celda_C" |
218 | 216 |
|
217 |
+ result = reorder.celdaC(counts = counts, res = result) |
|
218 |
+ |
|
219 | 219 |
return(result) |
220 | 220 |
} |
221 | 221 |
|
... | ... |
@@ -351,6 +351,16 @@ cC.calcLL = function(m.CP.by.S, n.CP.by.G, s, z, K, nS, alpha, beta) { |
351 | 351 |
return(final) |
352 | 352 |
} |
353 | 353 |
|
354 |
+reorder.celdaC = function(counts,res){ |
|
355 |
+ #Reorder K |
|
356 |
+ fm <- factorizeMatrix(counts = counts, celda.mod = res) |
|
357 |
+ fm.norm <- t(normalizeCounts(t(fm$proportions$gene.states),scale.factor = 1)) |
|
358 |
+ d <- dist(t(fm.norm),diag = TRUE, upper = TRUE) |
|
359 |
+ h <- hclust(d, method = "complete") |
|
360 |
+ res <- recodeClusterZ(res,from = h$order, |
|
361 |
+ to = c(1:ncol(fm$counts$gene.states))) |
|
362 |
+ return(res) |
|
363 |
+} |
|
354 | 364 |
|
355 | 365 |
#' Generate factorized matrices showing each feature's influence on the celda_C model clustering |
356 | 366 |
#' |
357 | 367 |
old mode 100644 |
358 | 368 |
new mode 100755 |
... | ... |
@@ -93,7 +93,7 @@ calculateLoglikFromVariables.celda_CG = function(counts, s, z, y, K, L, alpha, b |
93 | 93 |
return(final) |
94 | 94 |
} |
95 | 95 |
|
96 |
-cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) { |
|
96 |
+.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS, nG, alpha, beta, delta, gamma) { |
|
97 | 97 |
|
98 | 98 |
## Determine if any TS has 0 genes |
99 | 99 |
## Need to remove 0 gene states as this will cause the likelihood to fail |
... | ... |
@@ -142,7 +142,7 @@ cCG.calcLL = function(K, L, m.CP.by.S, n.CP.by.TS, n.by.G, n.by.TS, nG.by.TS, nS |
142 | 142 |
return(final) |
143 | 143 |
} |
144 | 144 |
|
145 |
-cCG.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) { |
|
145 |
+.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) { |
|
146 | 146 |
|
147 | 147 |
## Calculate for "Theta" component |
148 | 148 |
theta.ll = log(m.CP.by.S + alpha) |
... | ... |
@@ -158,7 +158,7 @@ cCG.calcGibbsProbZ = function(m.CP.by.S, n.CP.by.TS, n.CP, L, alpha, beta) { |
158 | 158 |
return(final) |
159 | 159 |
} |
160 | 160 |
|
161 |
-cCG.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delta, gamma) { |
|
161 |
+.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delta, gamma) { |
|
162 | 162 |
|
163 | 163 |
## Calculate for "Phi" component |
164 | 164 |
phi.ll = sum(lgamma(n.CP.by.TS + beta)) |
... | ... |
@@ -176,6 +176,25 @@ cCG.calcGibbsProbY = function(n.CP.by.TS, n.by.TS, nG.by.TS, nG.in.Y, beta, delt |
176 | 176 |
return(final) |
177 | 177 |
} |
178 | 178 |
|
179 |
+reorder.celdaCG = function(counts,res){ |
|
180 |
+ #Reorder K |
|
181 |
+ fm <- factorizeMatrix(counts = counts, celda.mod = res) |
|
182 |
+ fm.norm <- t(normalizeCounts(t(fm$proportions$population.states),scale.factor = 1)) |
|
183 |
+ d <- dist(t(fm.norm),diag = TRUE, upper = TRUE) |
|
184 |
+ h <- hclust(d, method = "complete") |
|
185 |
+ res <- recodeClusterZ(res,from = h$order, |
|
186 |
+ to = c(1:ncol(fm$counts$population.states))) |
|
187 |
+ |
|
188 |
+ #Reorder L |
|
189 |
+ fm <- factorizeMatrix(counts = counts, celda.mod = res) |
|
190 |
+ fm.norm <- t(normalizeCounts(t(fm$proportions$population.states),scale.factor = 1)) |
|
191 |
+ d <- dist((fm.norm),diag = TRUE, upper = TRUE) |
|
192 |
+ h <- hclust(d, method = "complete") |
|
193 |
+ res <- recodeClusterY(res,from = h$order, |
|
194 |
+ to = c(1:nrow(fm$counts$population.states))) |
|
195 |
+ return(res) |
|
196 |
+} |
|
197 |
+ |
|
179 | 198 |
#' Simulate cells from the cell/gene clustering generative model |
180 | 199 |
#' |
181 | 200 |
#' @param S The number of samples |
... | ... |
@@ -246,8 +265,8 @@ simulateCells.celda_CG = function(model, S=10, C.Range=c(50,100), N.Range=c(500, |
246 | 265 |
zero.row.idx = which(rowSums(cell.counts) == 0) |
247 | 266 |
if (length(zero.row.idx > 0)) { |
248 | 267 |
cell.counts = cell.counts[-zero.row.idx, ] |
249 |
- } |
|
250 |
- new$y = new$y[-zero.row.idx] |
|
268 |
+ new$y = new$y[-zero.row.idx] |
|
269 |
+ } |
|
251 | 270 |
|
252 | 271 |
## Assign gene/cell/sample names |
253 | 272 |
rownames(cell.counts) = paste0("Gene_", 1:nrow(cell.counts)) |
... | ... |
@@ -332,7 +351,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1, |
332 | 351 |
nG = nrow(counts) |
333 | 352 |
nM = ncol(counts) |
334 | 353 |
|
335 |
- ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma) |
|
354 |
+ ll = .calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma) |
|
336 | 355 |
|
337 | 356 |
iter = 1 |
338 | 357 |
continue = TRUE |
... | ... |
@@ -358,7 +377,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1, |
358 | 377 |
temp.n.CP = n.CP |
359 | 378 |
temp.n.CP[j] = temp.n.CP[j] + sum(counts[,i]) |
360 | 379 |
|
361 |
- probs[j] = cCG.calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta) |
|
380 |
+ probs[j] = .calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta) |
|
362 | 381 |
} |
363 | 382 |
|
364 | 383 |
## Sample next state and add back counts |
... | ... |
@@ -410,7 +429,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1, |
410 | 429 |
temp.nG.by.TS = nG.by.TS + (1 * ADD_PSEUDO) |
411 | 430 |
temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1 |
412 | 431 |
|
413 |
- probs[j] = cCG.calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS, |
|
432 |
+ probs[j] = .calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS, |
|
414 | 433 |
n.by.TS=temp.n.by.TS, |
415 | 434 |
nG.by.TS=temp.nG.by.TS, |
416 | 435 |
nG.in.Y=temp.nG.by.TS[j], |
... | ... |
@@ -481,7 +500,7 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1, |
481 | 500 |
} |
482 | 501 |
|
483 | 502 |
## Calculate complete likelihood |
484 |
- temp.ll = cCG.calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma) |
|
503 |
+ temp.ll = .calcLL(K=K, L=L, m.CP.by.S=m.CP.by.S, n.CP.by.TS=n.CP.by.TS, n.by.G=n.by.G, n.by.TS=n.by.TS, nG.by.TS=nG.by.TS, nS=nS, nG=nG, alpha=alpha, beta=beta, delta=delta, gamma=gamma) |
|
485 | 504 |
if((all(temp.ll > ll)) | iter == 1) { |
486 | 505 |
z.best = z |
487 | 506 |
y.best = y |
... | ... |
@@ -493,14 +512,12 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1, |
493 | 512 |
iter = iter + 1 |
494 | 513 |
} |
495 | 514 |
|
496 |
- ## Peform reordering on final Z and Y assigments: |
|
497 |
- reordered.labels = reorder.labels.by.size.then.counts(counts, z=z.best, |
|
498 |
- y=y.best, K=K, L=L) |
|
515 |
+ |
|
499 | 516 |
names = list(row=rownames(counts), column=colnames(counts), |
500 | 517 |
sample=levels(sample.label)) |
501 | 518 |
|
502 | 519 |
|
503 |
- result = list(z=reordered.labels$z, y=reordered.labels$y, completeLogLik=ll, |
|
520 |
+ result = list(z=z.best, y=y.best, completeLogLik=ll, |
|
504 | 521 |
finalLogLik=ll.best, K=K, L=L, alpha=alpha, |
505 | 522 |
beta=beta, delta=delta, gamma=gamma, seed=seed, |
506 | 523 |
sample.label=sample.label, names=names, |
... | ... |
@@ -508,6 +525,8 @@ celda_CG = function(counts, sample.label=NULL, K, L, alpha=1, beta=1, |
508 | 525 |
|
509 | 526 |
class(result) = "celda_CG" |
510 | 527 |
|
528 |
+ ## Peform reordering on final Z and Y assigments: |
|
529 |
+ result = reorder.celdaCG(counts = counts, res = result) |
|
511 | 530 |
return(result) |
512 | 531 |
} |
513 | 532 |
|
... | ... |
@@ -637,7 +656,7 @@ clusterProbability.celda_CG = function(counts, celda.mod) { |
637 | 656 |
temp.n.CP = n.CP |
638 | 657 |
temp.n.CP[j] = temp.n.CP[j] + sum(counts[,i]) |
639 | 658 |
|
640 |
- z.prob[i,j] = cCG.calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta) |
|
659 |
+ z.prob[i,j] = .calcGibbsProbZ(m.CP.by.S=m.CP.by.S[j,s[i]], n.CP.by.TS=temp.n.CP.by.TS, n.CP=temp.n.CP, L=L, alpha=alpha, beta=beta) |
|
641 | 660 |
} |
642 | 661 |
|
643 | 662 |
m.CP.by.S[z[i],s[i]] = m.CP.by.S[z[i],s[i]] + 1 |
... | ... |
@@ -667,7 +686,7 @@ clusterProbability.celda_CG = function(counts, celda.mod) { |
667 | 686 |
temp.nG.by.TS = nG.by.TS + (1 * ADD_PSEUDO) |
668 | 687 |
temp.nG.by.TS[j] = temp.nG.by.TS[j] + 1 |
669 | 688 |
|
670 |
- y.prob[i,j] = cCG.calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS, |
|
689 |
+ y.prob[i,j] = .calcGibbsProbY(n.CP.by.TS=temp.n.CP.by.TS, |
|
671 | 690 |
n.by.TS=temp.n.by.TS, |
672 | 691 |
nG.by.TS=temp.nG.by.TS, |
673 | 692 |
nG.in.Y=temp.nG.by.TS[j], |
674 | 693 |
old mode 100644 |
675 | 694 |
new mode 100755 |
... | ... |
@@ -123,7 +123,16 @@ cG.calcLL = function(n.TS.by.C, n.by.TS, n.by.G, nG.by.TS, nM, nG, L, beta, delt |
123 | 123 |
return(final) |
124 | 124 |
} |
125 | 125 |
|
126 |
- |
|
126 |
+reorder.celdaG = function(counts,res){ |
|
127 |
+ #Reorder L |
|
128 |
+ fm <- factorizeMatrix(counts = counts, celda.mod = res) |
|
129 |
+ fm.norm <- t(normalizeCounts(fm$proportions$gene.states,scale.factor = 1)) |
|
130 |
+ d <- dist((fm.norm),diag = TRUE, upper = TRUE) |
|
131 |
+ h <- hclust(d, method = "complete") |
|
132 |
+ res <- recodeClusterY(res,from = h$order, |
|
133 |
+ to = c(1:nrow(fm$counts$cell.states))) |
|
134 |
+ return(res) |
|
135 |
+} |
|
127 | 136 |
|
128 | 137 |
|
129 | 138 |
# Calculate Log Likelihood For Single Set of Cluster Assignments (Gene Clustering) |
... | ... |
@@ -297,16 +306,16 @@ celda_G = function(counts, L, beta=1, delta=1, gamma=1, max.iter=25, |
297 | 306 |
iter <- iter + 1 |
298 | 307 |
} |
299 | 308 |
|
300 |
- reordered.labels = reorder.label.by.size(y.best, L) |
|
301 |
- y.final.order = reordered.labels$new.labels |
|
302 | 309 |
names = list(row=rownames(counts), column=colnames(counts)) |
303 | 310 |
|
304 |
- result = list(y=y.final.order, completeLogLik=ll, |
|
311 |
+ result = list(y=y.best, completeLogLik=ll, |
|
305 | 312 |
finalLogLik=ll.best, L=L, beta=beta, delta=delta, gamma=gamma, |
306 | 313 |
count.checksum=count.checksum, seed=seed, names=names) |
307 | 314 |
|
308 | 315 |
class(result) = "celda_G" |
309 | 316 |
|
317 |
+ result = reorder.celdaG(counts = counts, res = result) |
|
318 |
+ |
|
310 | 319 |
return(result) |
311 | 320 |
} |
312 | 321 |
|
... | ... |
@@ -361,8 +370,10 @@ simulateCells.celda_G = function(model, C=100, N.Range=c(500,5000), G=1000, |
361 | 370 |
## Ensure that there are no all-0 rows in the counts matrix, which violates a celda modeling |
362 | 371 |
## constraint (columns are guarnteed at least one count): |
363 | 372 |
zero.row.idx = which(rowSums(cell.counts) == 0) |
364 |
- cell.counts = cell.counts[-zero.row.idx, ] |
|
365 |
- y = y[-zero.row.idx] |
|
373 |
+ if (length(zero.row.idx > 0)) { |
|
374 |
+ cell.counts = cell.counts[-zero.row.idx, ] |
|
375 |
+ y = y[-zero.row.idx] |
|
376 |
+ } |
|
366 | 377 |
|
367 | 378 |
rownames(cell.counts) = paste0("Gene_", 1:nrow(cell.counts)) |
368 | 379 |
colnames(cell.counts) = paste0("Cell_", 1:ncol(cell.counts)) |
373 | 384 |
old mode 100644 |
374 | 385 |
new mode 100755 |
... | ... |
@@ -2,25 +2,26 @@ |
2 | 2 |
# S3 Methods for celda_list objects # |
3 | 3 |
################################################################################ |
4 | 4 |
|
5 |
-#' Run the celda Bayesian hierarchical model on a matrix of counts. |
|
6 |
-#' TODO: - If no chain provided, automatically choose best chain |
|
7 |
-#' - Smarter subsetting of the run.params to DRY this function up |
|
5 |
+# TODO: - If no chain provided, automatically choose best chain |
|
6 |
+# - Smarter subsetting of the run.params to DRY this function up |
|
7 |
+ |
|
8 |
+#' Select the best model amongst a list of models generated in a celda run. |
|
8 | 9 |
#' |
9 | 10 |
#' @param celda.list A celda_list object returned from celda() |
10 |
-#' @param K The K parameter for the desired model in the results list |
|
11 |
-#' @param L The L parameter for the desired model in the results list |
|
12 |
-#' @param chain The desired chain for the specified model |
|
13 |
-#' @param best Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Overrides chain parameter if provided. |
|
11 |
+#' @param K The K parameter for the desired model in the results list. Defaults to NULL. |
|
12 |
+#' @param L The L parameter for the desired model in the results list. Defaults to NULL. |
|
13 |
+#' @param chain The desired chain for the specified model. Defaults to NULL. Overrides best parameter if provided. |
|
14 |
+#' @param best Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Defaults to "loglik." |
|
14 | 15 |
#' @return A celda model object matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found. |
15 | 16 |
#' @export |
16 |
-getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) { |
|
17 |
+getModel = function(celda.list, K=NULL, L=NULL, chain=NULL, best="loglik") { |
|
17 | 18 |
validateGetModelParams(celda.list, K, L, chain, best) # Sanity check params |
18 | 19 |
|
19 | 20 |
requested.chain = NA |
20 | 21 |
run.params = celda.list$run.params |
21 | 22 |
|
22 | 23 |
if (celda.list$content.type == "celda_CG") { |
23 |
- if (!is.null(best)) { |
|
24 |
+ if (is.null(chain)) { |
|
24 | 25 |
matching.chain.idx = run.params[run.params$K == K & run.params$L == L, "index"] |
25 | 26 |
requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best) |
26 | 27 |
} else { |
... | ... |
@@ -32,7 +33,7 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) { |
32 | 33 |
|
33 | 34 |
|
34 | 35 |
if (celda.list$content.type == "celda_C") { |
35 |
- if (!is.null(best)) { |
|
36 |
+ if (is.null(chain)) { |
|
36 | 37 |
matching.chain.idx = run.params[run.params$K == K, "index"] |
37 | 38 |
requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best) |
38 | 39 |
} else { |
... | ... |
@@ -43,7 +44,7 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) { |
43 | 44 |
|
44 | 45 |
|
45 | 46 |
if (celda.list$content.type == "celda_G") { |
46 |
- if (!is.null(best)) { |
|
47 |
+ if (is.null(chain)) { |
|
47 | 48 |
matching.chain.idx = run.params[run.params$L == L, "index"] |
48 | 49 |
requested.chain = chooseBestChain(celda.list$res.list[matching.chain.idx], best) |
49 | 50 |
} else { |
... | ... |
@@ -62,6 +63,30 @@ getModel = function(celda.list, K=NULL, L=NULL, chain=1, best=NULL) { |
62 | 63 |
} |
63 | 64 |
|
64 | 65 |
|
66 |
+#' Select a set of models from a celda_list objects based off of rows in its run.params attribute. |
|
67 |
+#' |
|
68 |
+#' @param celda.list A celda_list object returned from celda() |
|
69 |
+#' @param run.param.rows Row indices in the celda.list's run params corresponding to the desired models. |
|
70 |
+#' @return A celda_list containing celda model objects matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found. |
|
71 |
+#' @export |
|
72 |
+selectModels = function(celda.list, run.param.rows) { |
|
73 |
+ desired.models = lapply(run.param.rows, |
|
74 |
+ function(row.idx) { |
|
75 |
+ if (!is.numeric(row.idx) | |
|
76 |
+ row.idx < 0 | |
|
77 |
+ row.idx > nrow(celda.list$run.params)) { |
|
78 |
+ stop("Invalid row index provided") |
|
79 |
+ } |
|
80 |
+ params = celda.list$run.params[row.idx, ] |
|
81 |
+ getModel(celda.list, params$K, params$L, params$chain) |
|
82 |
+ }) |
|
83 |
+ subsetted.celda.list = list(run.params=celda.list$run.params[run.param.rows, ], |
|
84 |
+ res.list=desired.models, content.type=celda.list$content.type, |
|
85 |
+ count.checksum=celda.list$count.checksum) |
|
86 |
+ return(subsetted.celda.list) |
|
87 |
+} |
|
88 |
+ |
|
89 |
+ |
|
65 | 90 |
validateGetModelParams = function(celda.list, K, L, chain, best) { |
66 | 91 |
if (class(celda.list) != "celda_list") stop("First argument to getModel() should be an object of class 'celda_list'") |
67 | 92 |
|
76 | 101 |
old mode 100644 |
77 | 102 |
new mode 100755 |
... | ... |
@@ -17,8 +17,8 @@ plotDrGrid <- function(dim1, dim2, matrix, size, xlab, ylab, color_low, color_mi |
17 | 17 |
colnames(m) <- c(xlab,ylab,"facet",var_label) |
18 | 18 |
ggplot2::ggplot(m, ggplot2::aes_string(x=xlab, y=ylab)) + ggplot2::geom_point(stat = "identity", size = size, ggplot2::aes_string(color = var_label)) + |
19 | 19 |
ggplot2::facet_wrap(~facet) + ggplot2::theme_bw() + ggplot2::scale_colour_gradient2(low = color_low, high = color_high, mid = color_mid, midpoint = (max(m[,4])-min(m[,4]))/2 ,name = gsub("_"," ",var_label)) + |
20 |
- ggplot2::theme(strip.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.spacing = unit(0,"lines"), |
|
21 |
- panel.background = element_blank(), axis.line = ggplot2::element_line(colour = "black")) |
|
20 |
+ ggplot2::theme(strip.background = ggplot2::element_blank(), panel.grid.major = ggplot2::element_blank(), panel.grid.minor = ggplot2::element_blank(), panel.spacing = unit(0,"lines"), |
|
21 |
+ panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) |
|
22 | 22 |
} |
23 | 23 |
|
24 | 24 |
#' Create a scatterplot for each row of a normalized gene expression matrix where x and y axis are from a data dimensionality reduction tool. |
31 | 31 |
old mode 100644 |
32 | 32 |
new mode 100755 |
... | ... |
@@ -30,7 +30,7 @@ An analysis example using celda with RNASeq via vignette('celda-analysis') |
30 | 30 |
|
31 | 31 |
|
32 | 32 |
## New Features and announcements |
33 |
-The v0.2 release of celda represents a useable implementation of the various celda clustering models. |
|
33 |
+The v0.3 release of celda represents a useable implementation of the various celda clustering models. |
|
34 | 34 |
Please submit any usability issues or bugs to the issue tracker at https://github.com/compbiomed/celda |
35 | 35 |
|
36 | 36 |
You can discuss celda, or ask the developers usage questions, in our [Google Group.](https://groups.google.com/forum/#!forum/celda-list) |
... | ... |
@@ -14,9 +14,11 @@ library(celda) |
14 | 14 |
library(Matrix) |
15 | 15 |
library(gtools) |
16 | 16 |
library(ggplot2) |
17 |
+library(Rtsne) |
|
18 |
+library(reshape2) |
|
17 | 19 |
|
18 | 20 |
## ------------------------------------------------------------------------ |
19 |
-sim_counts = simulateCells("celda_CG", K=3,L=10) |
|
21 |
+sim_counts = simulateCells("celda_CG", K = 3, L = 10) |
|
20 | 22 |
|
21 | 23 |
## ------------------------------------------------------------------------ |
22 | 24 |
dim(sim_counts$counts) |
... | ... |
@@ -35,11 +37,11 @@ table(sim_counts$z, sim_counts$sample.label) |
35 | 37 |
|
36 | 38 |
## ---- fig.show='hold', warning = FALSE, message = FALSE------------------ |
37 | 39 |
|
38 |
-celdaCG.res = celda_CG(counts = sim_counts$counts, K =3, L = 10, max.iter = 10, sample.label = sim_counts$sample.label) |
|
40 |
+celda.res = celda(counts = sim_counts$counts, model = "celda_CG", K = 3, L = 10, max.iter = 10, cores = 1, nchains = 1) |
|
39 | 41 |
|
40 | 42 |
## ------------------------------------------------------------------------ |
41 |
-z = celdaCG.res$z |
|
42 |
-y = celdaCG.res$y |
|
43 |
+z = celda.res$res.list[[1]]$z |
|
44 |
+y = celda.res$res.list[[1]]$y |
|
43 | 45 |
|
44 | 46 |
table(z, sim_counts$z) |
45 | 47 |
table(y, sim_counts$y) |
... | ... |
@@ -47,28 +49,41 @@ table(y, sim_counts$y) |
47 | 49 |
|
48 | 50 |
## ---- fig.width = 7, fig.height = 7, warning = FALSE, message = FALSE---- |
49 | 51 |
norm.counts <- normalizeCounts(sim_counts$counts) |
50 |
-render_celda_heatmap(counts=norm.counts,z=z,y=y, cluster.column = FALSE, cluster.row = FALSE) |
|
52 |
+renderCeldaHeatmap(counts = norm.counts, z=z, y=y, normalize = NULL, color_scheme = "divergent",cluster_gene = TRUE, cluster_cell = TRUE) |
|
51 | 53 |
|
52 | 54 |
## ------------------------------------------------------------------------ |
53 |
-factorized <- factorizeMatrix.celda_CG(counts = sim_counts$counts, celda.obj = celdaCG.res) |
|
55 |
+model <- getModel(celda.res, K = 3, L = 10, best = "loglik") |
|
56 |
+factorized <- factorizeMatrix(model, sim_counts$count) |
|
54 | 57 |
|
55 |
-pop.states = t(factorized$proportions$population.states) |
|
58 |
+## ------------------------------------------------------------------------ |
|
59 |
+dim(factorized$proportions$gene.states) |
|
60 |
+head(factorized$proportions$gene.states) |
|
61 |
+ |
|
62 |
+## ------------------------------------------------------------------------ |
|
63 |
+dim(factorized$proportions$cell.states) |
|
64 |
+factorized$proportions$cell.states[1:10,1:3] |
|
65 |
+ |
|
66 |
+## ------------------------------------------------------------------------ |
|
67 |
+pop.states = factorized$proportions$population.states |
|
68 |
+dim(pop.states) |
|
56 | 69 |
head(pop.states) |
57 | 70 |
|
58 | 71 |
## ---- fig.width = 7, fig.height = 7-------------------------------------- |
59 |
-render_celda_heatmap(pop.states, col=colorRampPalette(c("white", "blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10) |
|
72 |
+data.pca <- prcomp(t(scale(t(factorized$proportions$cell.states))),scale = F, center = F) |
|
73 |
+ |
|
74 |
+plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = celda.res$res.list[[1]]$z) |
|
75 |
+ |
|
76 |
+plotDrState(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], matrix = factorized$proportions$cell.states, rescale = TRUE) |
|
77 |
+ |
|
78 |
+## ---- fig.width = 7, fig.height = 7-------------------------------------- |
|
79 |
+renderCeldaHeatmap(pop.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(pop.states), y = 1:nrow(pop.states)) |
|
60 | 80 |
|
61 | 81 |
## ---- fig.width = 7, fig.height = 7-------------------------------------- |
62 | 82 |
rel.states = sweep(pop.states, 1, rowSums(pop.states), "/") |
63 |
-render_celda_heatmap(rel.states, col=colorRampPalette(c("white","blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10) |
|
83 |
+renderCeldaHeatmap(rel.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(rel.states), y = 1:nrow(rel.states)) |
|
64 | 84 |
|
65 | 85 |
## ------------------------------------------------------------------------ |
66 |
-celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG") |
|
67 |
- |
|
68 |
-## ---- fig.width = 7, fig.height = 7, message = FALSE--------------------- |
|
69 |
-visualize.model = visualize_model_performance(celda.res.list,method="perplexity") |
|
70 |
- |
|
71 |
-visualize.model$K |
|
86 |
+celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG", cores = 1) |
|
72 | 87 |
|
73 | 88 |
## ------------------------------------------------------------------------ |
74 | 89 |
model = getModel(celda.res.list, K = 3, L = 10, best = "loglik") |
... | ... |
@@ -79,37 +94,47 @@ library(celda) |
79 | 94 |
library(Matrix) |
80 | 95 |
library(data.table) |
81 | 96 |
library(pheatmap) |
82 |
-library(biomaRt) |
|
97 |
+library(Rtsne) |
|
83 | 98 |
|
84 | 99 |
## ------------------------------------------------------------------------ |
85 |
-load(file="../data/cmatp_data.rda") |
|
86 |
-dim(cmatp_data) |
|
87 |
-head(rownames(cmatp_data)) |
|
88 |
-head(colnames(cmatp_data)) |
|
100 |
+dim(pbmc_data) |
|
101 |
+head(rownames(pbmc_data)) |
|
102 |
+head(colnames(pbmc_data)) |
|
89 | 103 |
|
90 | 104 |
## ------------------------------------------------------------------------ |
91 |
-cmatp_select <- cmatp_data[rowSums(cmatp_data>4) > 4,] |
|
105 |
+pbmc_select <- pbmc_data[rowSums(pbmc_data>4) > 4,] |
|
92 | 106 |
|
93 | 107 |
## ---- message = FALSE, verbose = FALSE----------------------------------- |
94 |
-cmatp.res = celda_CG(cmatp_select, sample.label = rep(1,ncol(cmatp_select)), K = 15, L = 20) |
|
108 |
+pbmc.res = celda(pbmc_select, K = 10, L = 20, cores = 1, model = "celda_CG", nchains = 8, max.iter = 100) |
|
109 |
+model<-getModel(pbmc.res, K = 10, L = 20, best = "loglik") |
|
95 | 110 |
|
96 | 111 |
## ------------------------------------------------------------------------ |
97 |
-factorize.matrix = factorizeMatrix.celda_CG(counts = cmatp_select, celda.obj = cmatp.res) |
|
98 |
-top.genes <- topRank(factorize.matrix$proportions$gene.states) |
|
112 |
+factorize.matrix = factorizeMatrix(model, counts=pbmc_select) |
|
113 |
+top.genes <- topRank(factorize.matrix$proportions$gene.states, n = 25) |
|
99 | 114 |
|
100 | 115 |
## ------------------------------------------------------------------------ |
101 |
-top.genes$names$L16 |
|
116 |
+top.genes$names$L3 |
|
102 | 117 |
|
103 | 118 |
## ------------------------------------------------------------------------ |
104 |
-top.genes$names$L19 |
|
119 |
+top.genes$names$L4 |
|
105 | 120 |
|
106 | 121 |
## ---- fig.width = 7, fig.height = 7-------------------------------------- |
107 | 122 |
top.genes.ix <- unique(unlist(top.genes$index)) |
108 |
-norm.cmatp.counts <- normalizeCounts(cmatp_select) |
|
109 |
-render_celda_heatmap(norm.cmatp.counts[top.genes.ix,], z = cmatp.res$z, y = cmatp.res$y[top.genes.ix], cluster.row = FALSE, cluster.column = FALSE) |
|
123 |
+norm.pbmc.counts <- normalizeCounts(pbmc_select) |
|
124 |
+renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model$z, y = model$y[top.genes.ix], normalize = NULL, color_scheme = "divergent") |
|
125 |
+ |
|
126 |
+## ---- message = FALSE---------------------------------------------------- |
|
127 |
+norm_pbmc <- normalizeCounts(factorize.matrix$counts$cell.states) |
|
128 |
+set.seed(123) |
|
129 |
+tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 2000) |
|
130 |
+pbmc_tsne <- tsne$Y |
|
110 | 131 |
|
111 | 132 |
## ---- fig.width = 7, fig.height = 7-------------------------------------- |
112 |
-cmatp.states = t(factorize.matrix$proportions$population.states) |
|
113 |
-cmatp.rel.states = sweep(cmatp.states, 1, rowSums(cmatp.states), "/") |
|
114 |
-render_celda_heatmap(cmatp.rel.states, col=colorRampPalette(c("white","blue","darkgreen","green"))(100), scale.row=F, show_cellnames=T, show_genenames=TRUE, cluster.column=FALSE, cluster.row=FALSE, breaks = NA, z= 1:15, y = 1:20) |
|
133 |
+plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model$z)) |
|
134 |
+ |
|
135 |
+plotDrState(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], matrix = factorize.matrix$proportions$cell.states, rescale = TRUE) |
|
136 |
+ |
|
137 |
+marker.genes <- c("ENSG00000168685_IL7R","ENSG00000198851_CD3E","ENSG00000105374_NKG7","ENSG00000203747_FCGR3A","ENSG00000090382_LYZ","ENSG00000179639_FCER1A","ENSG00000156738_MS4A1", "ENSG00000163736_PPBP") |
|
138 |
+gene.counts <- pbmc_select[marker.genes,] |
|
139 |
+plotDrGene(dim1 = pbmc_tsne[,1],dim2 = pbmc_tsne[,2], matrix = gene.counts, rescale = TRUE) |
|
115 | 140 |
|
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
--- |
2 | 2 |
title: "Analyzing single-cell RNA-seq count data with Celda" |
3 |
-author: "Josh D. Campbell, Sean Corbett, Masanao Yajima, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Yusuke Koga" |
|
3 |
+author: "Josh D. Campbell, Masanao Yajima, Sean Corbett, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Zhe Wang, Yusuke Koga, Jiangyuan Liu" |
|
4 | 4 |
date: "`r Sys.Date()`" |
5 | 5 |
output: rmarkdown::html_vignette |
6 | 6 |
vignette: > |
... | ... |
@@ -8,15 +8,15 @@ vignette: > |
8 | 8 |
%\VignetteEngine{knitr::rmarkdown} |
9 | 9 |
%\VignetteEncoding{UTF-8} |
10 | 10 |
--- |
11 |
- |
|
12 |
-# 1 Introduction |
|
13 |
- |
|
14 |
-"celda" stands for "**CE**llular **L**atent **D**irichlet **A**llocation", which is a suite of Bayesian hierarchical models and supporting functions to perform gene and cell clustering for count data generated by single cell RNA-seq platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications. Celda has advantages over other clustering frameworks: |
|
15 |
- |
|
16 |
-1) Celda can simultaneously cluster genes into transcriptional states and cells into subpopulations |
|
11 |
+ |
|
12 |
+## 1 Introduction |
|
13 |
+ |
|
14 |
+ "celda" stands for "**CE**llular **L**atent **D**irichlet **A**llocation", which is a suite of Bayesian hierarchical models and supporting functions to perform gene and cell clustering for count data generated by single cell RNA-seq platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications. Celda has advantages over other clustering frameworks: |
|
15 |
+ |
|
16 |
+1) Celda can simultaneously cluster cells into subpopulations and genes into transcriptional states, which are subgroups of genes that are likely to be expressed in similar proportions over different types of cells |
|
17 | 17 |
2) Celda uses count-based Dirichlet-multinomial distributions so no additional normalization is required for 3' DGE single cell RNA-seq |
18 | 18 |
3) These types of models have shown good performance with sparse data. |
19 |
- |
|
19 |
+ |
|
20 | 20 |
In this vignette we will demonstrate how to perform cell and gene clustering with simulated and real single cell RNA-seq data using the Bayesian hierarchical models within celda. |
21 | 21 |
|
22 | 22 |
|
... | ... |
@@ -57,29 +57,31 @@ library(celda) |
57 | 57 |
library(Matrix) |
58 | 58 |
library(gtools) |
59 | 59 |
library(ggplot2) |
60 |
+library(Rtsne) |
|
61 |
+library(reshape2) |
|
60 | 62 |
``` |
61 | 63 |
|
62 |
-We use the built-in data generating function `simulateCells.celda_CG` to simulate a single-cell RNA-Seq dataset. `simulateCells.celda_CG` returns a list containing a count matrix where each row is a gene and each column is a cell. The K parameter determines the number of cell clusters while the L parameter determines the number of gene clusters within the dataset. |
|
64 |
+We use the built-in data generating function `simulateCells` to simulate a single-cell RNA-Seq dataset. `simulateCells` returns a list containing a count matrix where each row is a gene and each column is a cell. The K parameter determines the number of cell clusters while the L parameter determines the number of gene clusters within the dataset. |
|
63 | 65 |
|
64 | 66 |
```{r} |
65 |
-sim_counts = simulateCells("celda_CG", K=3,L=10) |
|
67 |
+sim_counts = simulateCells("celda_CG", K = 3, L = 10) |
|
66 | 68 |
``` |
67 | 69 |
|
68 |
-The `counts` variable from `simulateCells.celda_CG` contains the counts matrix. |
|
70 |
+The `counts` variable contains the counts matrix. |
|
69 | 71 |
The dimensions of counts matrix: |
70 | 72 |
|
71 | 73 |
```{r} |
72 | 74 |
dim(sim_counts$counts) |
73 | 75 |
``` |
74 | 76 |
|
75 |
-The `z` variable from `simulateCells.celda_CG` contains the cluster for each cell. |
|
77 |
+The `z` variable contains the cluster for each cell. |
|
76 | 78 |
Here is the number of cells in each subpopulation: |
77 | 79 |
|
78 | 80 |
```{r} |
79 | 81 |
table(sim_counts$z) |
80 | 82 |
``` |
81 | 83 |
|
82 |
-The `y` variable from `simulateCells.celda_CG` contains the transcriptional state assignment for each gene. |
|
84 |
+The `y` variable contains the transcriptional state assignment for each gene. |
|
83 | 85 |
Here is the number of genes in each transcriptional state: |
84 | 86 |
|
85 | 87 |
```{r} |
... | ... |
@@ -102,18 +104,18 @@ table(sim_counts$z, sim_counts$sample.label) |
102 | 104 |
|
103 | 105 |
## 2.1.1 How to Run Celda |
104 | 106 |
|
105 |
-There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. Here is the command for `celda_CG`: |
|
107 |
+There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. The `celda` function will use these models and take in a counts matrix to create a `celda` object that contains the cluster labels for each cell and gene. |
|
106 | 108 |
|
107 | 109 |
```{r, fig.show='hold', warning = FALSE, message = FALSE} |
108 | 110 |
|
109 |
-celdaCG.res = celda_CG(counts = sim_counts$counts, K =3, L = 10, max.iter = 10, sample.label = sim_counts$sample.label) |
|
111 |
+celda.res = celda(counts = sim_counts$counts, model = "celda_CG", K = 3, L = 10, max.iter = 10, cores = 1, nchains = 1) |
|
110 | 112 |
``` |
111 | 113 |
|
112 | 114 |
Here is a comparison between the true cluster labels and the estimated cluster labels. |
113 | 115 |
|
114 | 116 |
```{r} |
115 |
-z = celdaCG.res$z |
|
116 |
-y = celdaCG.res$y |
|
117 |
+z = celda.res$res.list[[1]]$z |
|
118 |
+y = celda.res$res.list[[1]]$y |
|
117 | 119 |
|
118 | 120 |
table(z, sim_counts$z) |
119 | 121 |
table(y, sim_counts$y) |
... | ... |
@@ -124,52 +126,71 @@ We can display the clustering results with a heatmap of the normalized counts: |
124 | 126 |
|
125 | 127 |
```{r, fig.width = 7, fig.height = 7, warning = FALSE, message = FALSE} |
126 | 128 |
norm.counts <- normalizeCounts(sim_counts$counts) |
127 |
-render_celda_heatmap(counts=norm.counts,z=z,y=y, cluster.column = FALSE, cluster.row = FALSE) |
|
129 |
+renderCeldaHeatmap(counts = norm.counts, z=z, y=y, normalize = NULL, color_scheme = "divergent",cluster_gene = TRUE, cluster_cell = TRUE) |
|
128 | 130 |
``` |
129 | 131 |
|
130 | 132 |
## 2.1.2 Matrix factorization |
131 | 133 |
|
132 |
-`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. |
|
134 |
+`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. Matrix factorization decomposes the counts matrix into multiple matrices, using the given `celda` model. The gene.states contains each gene's contribution to the transcriptional state it belongs. The cell.states contains each sample's contribution to each transcriptional state. The population.states contains each transcriptional states' contribution to each of the cell states. Each of these matrices can be viewed as raw counts values or proportional values. |
|
135 |
+ |
|
136 |
+```{r} |
|
137 |
+model <- getModel(celda.res, K = 3, L = 10, best = "loglik") |
|
138 |
+factorized <- factorizeMatrix(model, sim_counts$count) |
|
139 |
+``` |
|
140 |
+ |
|
141 |
+```{r} |
|
142 |
+dim(factorized$proportions$gene.states) |
|
143 |
+head(factorized$proportions$gene.states) |
|
144 |
+``` |
|
133 | 145 |
|
134 | 146 |
```{r} |
135 |
-factorized <- factorizeMatrix.celda_CG(counts = sim_counts$counts, celda.obj = celdaCG.res) |
|
147 |
+dim(factorized$proportions$cell.states) |
|
148 |
+factorized$proportions$cell.states[1:10,1:3] |
|
149 |
+``` |
|
150 |
+ |
|
151 |
+For instance, roughly 30% of the population in cell state K2 is in transcriptional state L1. |
|
136 | 152 |
|
137 |
-pop.states = t(factorized$proportions$population.states) |
|
153 |
+```{r} |
|
154 |
+pop.states = factorized$proportions$population.states |
|
155 |
+dim(pop.states) |
|
138 | 156 |
head(pop.states) |
139 | 157 |
``` |
140 | 158 |
|
141 |
-These states can also be visualized with a heatmap: |
|
159 |
+`celda` contains several plotting tools for data dimensionality reduction tool outputs. The PCA result for the factorized matrix is plotted by the `plor_dr_cluster` and `plot_dr_state` function based off of the celda clusters and state probabilities for each cell: |
|
142 | 160 |
|
143 | 161 |
```{r, fig.width = 7, fig.height = 7} |
144 |
-render_celda_heatmap(pop.states, col=colorRampPalette(c("white", "blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10) |
|
162 |
+data.pca <- prcomp(t(scale(t(factorized$proportions$cell.states))),scale = F, center = F) |
|
163 |
+ |
|
164 |
+plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = celda.res$res.list[[1]]$z) |
|
165 |
+ |
|
166 |
+plotDrState(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], matrix = factorized$proportions$cell.states, rescale = TRUE) |
|
145 | 167 |
``` |
146 | 168 |
|
147 |
-Sometimes it may be helpful to visualize the relative probability of each transcriptional state in each cellular subpopulation. We can do this by normalizing the rows of the probability matrix to sum up to 1: |
|
169 |
+The transcriptional and cell states can also be visualized with a heatmap: |
|
170 |
+ |
|
171 |
+```{r, fig.width = 7, fig.height = 7} |
|
172 |
+renderCeldaHeatmap(pop.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(pop.states), y = 1:nrow(pop.states)) |
|
173 |
+``` |
|
174 |
+ |
|
175 |
+Sometimes it may be helpful to visualize the relative probability of each transcriptional state in each cellular subpopulation. We can do this by z-score normalizing the probability matrix: |
|
148 | 176 |
|
149 | 177 |
```{r, fig.width = 7, fig.height = 7} |
150 | 178 |
rel.states = sweep(pop.states, 1, rowSums(pop.states), "/") |
151 |
-render_celda_heatmap(rel.states, col=colorRampPalette(c("white","blue"))(70), scale.row=F, show_cellnames=T, show_genenames=T, cluster.column=F, cluster.row=F, breaks = NA, z= 1:3, y = 1:10) |
|
179 |
+renderCeldaHeatmap(rel.states, color_scheme = "sequential", show_cellnames=T, show_genenames=T, breaks = NA, z= 1:ncol(rel.states), y = 1:nrow(rel.states)) |
|
152 | 180 |
``` |
153 | 181 |
|
154 | 182 |
The relative probabilities will tend to reflect the patterns observed in the heatmap as the genes in the heatmaps are Z-score transformed and thus on a relative scale as well. |
155 | 183 |
|
184 |
+ |
|
156 | 185 |
## 2.1.3 Identifying the optimal number of states |
157 | 186 |
|
158 | 187 |
The optimal number of K and L will likely not be known a priori. Therefore multiple choices of K and L may need to tested and compared. Additionally, `celda` is sensitive to initial start conditions, so it is good practice to run multiple chains for each combination of K and L to increase the chances of finding the most optimal solution. We have built a wrapper function that will run multiple combinations of K and L with multiple chains for each combination in parallel. |
159 | 188 |
|
160 | 189 |
```{r} |
161 |
-celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG") |
|
162 |
-``` |
|
163 |
- |
|
164 |
-Here, `celda` will fit the model for every combination of K between 2 and 4 and L between 9 and 11. Three chains will be run for each combination. The parameter "ncores" can be set to something more than 1 to run multiple chains in parallel. |
|
165 |
- |
|
166 |
-```{r, fig.width = 7, fig.height = 7, message = FALSE} |
|
167 |
-visualize.model = visualize_model_performance(celda.res.list,method="perplexity") |
|
168 |
- |
|
169 |
-visualize.model$K |
|
190 |
+celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG", cores = 1) |
|
170 | 191 |
``` |
171 | 192 |
|
172 |
-Perplexity is a measure of goodness-of-fit for the model where lower perplexity is better. The perplexity of each chain can be visualized as a function of K. In this example, K=3 is much better than K=2, but K=4 does not give much improvement over K=3. We would therefore select K=3 in this case. Note: We are still actively testing various methods for assessing model performance. |
|
193 |
+Here, `celda` will fit the model for every combination of K between 2 and 4 and L between 9 and 11. Three chains will be run for each combination. The parameter "cores" can be set to something more than 1 to run multiple chains in parallel. |
|
173 | 194 |
|
174 | 195 |
We can use the function "getModel" to select a particular model from the list of models once we have selected the optimal K and L. |
175 | 196 |
|
... | ... |
@@ -179,6 +200,8 @@ model = getModel(celda.res.list, K = 3, L = 10, best = "loglik") |
179 | 200 |
|
180 | 201 |
The chain with the best log likelihood will be returned from all models that were run with K = 3 and L = 10. |
181 | 202 |
|
203 |
+ |
|
204 |
+ |
|
182 | 205 |
# 3. PBMC Analysis with celda |
183 | 206 |
|
184 | 207 |
```{r setup, include=FALSE} |
... | ... |
@@ -187,59 +210,72 @@ library(celda) |
187 | 210 |
library(Matrix) |
188 | 211 |
library(data.table) |
189 | 212 |
library(pheatmap) |
190 |
-library(biomaRt) |
|
213 |
+library(Rtsne) |
|
191 | 214 |
``` |
192 | 215 |
|
193 |
-We will use the "cmatp_data" dataset, which is a dataset of 3000 Peripheral Blood Mononuclear Cells (PBMC), available from 10X Genomics. The rows are organized by gene names, while the columns are organized by barcodes. For this tutorial, the dataset has been slightly modified so the rownames are comprised of the Ensembl gene ID as well as the gene name. The raw dataset can be found at https://support.10xgenomics.com/single-cell/software/pipelines/latest/rkit. |
|
216 |
+We will use the "pbmc_data" dataset, which is a dataset of 3000 Peripheral Blood Mononuclear Cells (PBMC), available from 10X Genomics. The rows are organized by gene names, while the columns are organized by barcodes. For this tutorial, the dataset has been slightly modified so the rownames are comprised of the Ensembl gene ID as well as the gene name. The raw dataset can be found at https://support.10xgenomics.com/single-cell/software/pipelines/latest/rkit. |
|
194 | 217 |
|
195 | 218 |
We will only include genes with at least 5 counts in a least 5 cells for this analysis to reduce computational time. However, `celda` is capable of analyzing genes with fewer counts. |
196 | 219 |
|
197 | 220 |
```{r} |
198 |
-load(file="../data/cmatp_data.rda") |
|
199 |
-dim(cmatp_data) |
|
200 |
-head(rownames(cmatp_data)) |
|
201 |
-head(colnames(cmatp_data)) |
|
221 |
+dim(pbmc_data) |
|
222 |
+head(rownames(pbmc_data)) |
|
223 |
+head(colnames(pbmc_data)) |
|
202 | 224 |
``` |
203 | 225 |
|
204 | 226 |
```{r} |
205 |
-cmatp_select <- cmatp_data[rowSums(cmatp_data>4) > 4,] |
|
227 |
+pbmc_select <- pbmc_data[rowSums(pbmc_data>4) > 4,] |
|
206 | 228 |
``` |
207 | 229 |
|
230 |
+ |
|
208 | 231 |
### 3.1.1 Analysis of 10X Genomics PBMC data |
209 | 232 |
|
210 | 233 |
|
211 | 234 |
```{r, message = FALSE, verbose = FALSE} |
212 |
-cmatp.res = celda_CG(cmatp_select, sample.label = rep(1,ncol(cmatp_select)), K = 15, L = 20) |
|
235 |
+pbmc.res = celda(pbmc_select, K = 10, L = 20, cores = 1, model = "celda_CG", nchains = 8, max.iter = 100) |
|
236 |
+model<-getModel(pbmc.res, K = 10, L = 20, best = "loglik") |
|
213 | 237 |
``` |
214 | 238 |
|
215 | 239 |
After matrix factorization, the top genes can be selected using the `topRank` function on the "gene.states" matrix: |
216 | 240 |
|
217 | 241 |
```{r} |
218 |
-factorize.matrix = factorizeMatrix.celda_CG(counts = cmatp_select, celda.obj = cmatp.res) |
|
219 |
-top.genes <- topRank(factorize.matrix$proportions$gene.states) |
|
242 |
+factorize.matrix = factorizeMatrix(model, counts=pbmc_select) |
|
243 |
+top.genes <- topRank(factorize.matrix$proportions$gene.states, n = 25) |
|
220 | 244 |
``` |
221 | 245 |
|
222 | 246 |
```{r} |
223 |
-top.genes$names$L16 |
|
247 |
+top.genes$names$L3 |
|
224 | 248 |
``` |
225 | 249 |
|
226 | 250 |
```{r} |
227 |
-top.genes$names$L19 |
|
251 |
+top.genes$names$L4 |
|
228 | 252 |
``` |
229 | 253 |
|
230 | 254 |
We can make a heatmap of these top genes as follows: |
231 | 255 |
|
232 | 256 |
```{r, fig.width = 7, fig.height = 7} |
233 | 257 |
top.genes.ix <- unique(unlist(top.genes$index)) |
234 |
-norm.cmatp.counts <- normalizeCounts(cmatp_select) |
|
235 |
-render_celda_heatmap(norm.cmatp.counts[top.genes.ix,], z = cmatp.res$z, y = cmatp.res$y[top.genes.ix], cluster.row = FALSE, cluster.column = FALSE) |
|
258 |
+norm.pbmc.counts <- normalizeCounts(pbmc_select) |
|
259 |
+renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model$z, y = model$y[top.genes.ix], normalize = NULL, color_scheme = "divergent") |
|
236 | 260 |
``` |
237 | 261 |
|
238 |
-We can also summarize the relative contribution of each transcriptional state in each cell population with a heatmap: |
|
262 |
+We can plot tSNE results via `plot_dr_cluster`, `plot_dr_state` and `plot_dr_gene`. These will determine how each cell is labeled via celda, visualize each cell's expression respectivelyof a specific transcriptional state and visualize each cell's expression of a set of genes, . |
|
263 |
+ |
|
264 |
+```{r, message = FALSE} |
|
265 |
+norm_pbmc <- normalizeCounts(factorize.matrix$counts$cell.states) |
|
266 |
+set.seed(123) |
|
267 |
+tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 2000) |
|
268 |
+pbmc_tsne <- tsne$Y |
|
269 |
+``` |
|
239 | 270 |
|
240 | 271 |
```{r, fig.width = 7, fig.height = 7} |
241 |
-cmatp.states = t(factorize.matrix$proportions$population.states) |
|
242 |
-cmatp.rel.states = sweep(cmatp.states, 1, rowSums(cmatp.states), "/") |
|
243 |
-render_celda_heatmap(cmatp.rel.states, col=colorRampPalette(c("white","blue","darkgreen","green"))(100), scale.row=F, show_cellnames=T, show_genenames=TRUE, cluster.column=FALSE, cluster.row=FALSE, breaks = NA, z= 1:15, y = 1:20) |
|
272 |
+plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model$z)) |
|
273 |
+ |
|
274 |
+plotDrState(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], matrix = factorize.matrix$proportions$cell.states, rescale = TRUE) |
|
275 |
+ |
|
276 |
+marker.genes <- c("ENSG00000168685_IL7R","ENSG00000198851_CD3E","ENSG00000105374_NKG7","ENSG00000203747_FCGR3A","ENSG00000090382_LYZ","ENSG00000179639_FCER1A","ENSG00000156738_MS4A1", "ENSG00000163736_PPBP") |
|
277 |
+gene.counts <- pbmc_select[marker.genes,] |
|
278 |
+plotDrGene(dim1 = pbmc_tsne[,1],dim2 = pbmc_tsne[,2], matrix = gene.counts, rescale = TRUE) |
|
244 | 279 |
``` |
245 | 280 |
|
281 |
+With the use of the plots, it is also possible to determine which cell clusters express specific marker genes. For instance, the same cell cluster that highly expresses the transcriptional state L4 highly expresses LYZ, a known marker gene for monocytes. This is consistent with what we see in the above, in which transcriptional state L6 highly expresses monocytes. |
246 | 282 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,25 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/feature_selection.R |
|
3 |
+\name{GiniPlot} |
|
4 |
+\alias{GiniPlot} |
|
5 |
+\title{Create a bar chart based on Gini coefficients of transcriptional states} |
|
6 |
+\usage{ |
|
7 |
+GiniPlot(counts, celda.mod, cell_clusters = NULL, label_size = 4, |
|
8 |
+ bar_col = NULL) |
|
9 |
+} |
|
10 |
+\arguments{ |
|
11 |
+\item{counts}{A numeric count matrix.} |
|
12 |
+ |
|
13 |
+\item{celda.mod}{An object of class celda_CG.} |
|
14 |
+ |
|
15 |
+\item{cell_clusters}{A list of cell clusters. Calculate Gini coefficients using cells from specific cell clusters. If NULL, uses all cells (NULL by default).} |
|
16 |
+ |
|
17 |
+\item{label_size}{Numeric. The size of labels of transcrptional states on top of bars (4 by default).} |
|
18 |
+ |
|
19 |
+\item{bar_col}{The color of bars (#FF8080FF by default).} |
|
20 |
+} |
|
21 |
+\description{ |
|
22 |
+A bar plot shows the ordered Gini coefficients of transcriptional states. The transcriptional states with high Gini coefficients (i.e. close to 1) |
|
23 |
+ indicates that only a few cell clusters highly express the genes in these transcriptional states. It is easier for users to define cell types based on these |
|
24 |
+ genes in these significant states. |
|
25 |
+} |
12 | 38 |
old mode 100644 |
13 | 39 |
new mode 100755 |
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
\title{Run the celda Bayesian hierarchical model on a matrix of counts.} |
6 | 6 |
\usage{ |
7 | 7 |
celda(counts, model, sample.label = NULL, nchains = 1, cores = 1, |
8 |
- seed = 12345, verbose = TRUE, logfile_prefix = "Celda", ...) |
|
8 |
+ seed = 12345, verbose = FALSE, logfile_prefix = "Celda", ...) |
|
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 | 11 |
\item{counts}{A count matrix} |
42 | 42 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,37 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/diffExp.R |
|
3 |
+\name{diffExp_MAST} |
|
4 |
+\alias{diffExp_MAST} |
|
5 |
+\title{Gene expression markers for cell clusters using MAST} |
|
6 |
+\usage{ |
|
7 |
+diffExp_MAST(counts, celda.mod, c1, c2 = NULL, only.pos = FALSE, |
|
8 |
+ log2fc.threshold = 0, fdr.threshold = 1) |
|
9 |
+} |
|
10 |
+\arguments{ |
|
11 |
+\item{counts}{A numeric count matrix.} |
|
12 |
+ |
|
13 |
+\item{celda.mod}{An object of class celda_C or celda_CG.} |
|
14 |
+ |
|
15 |
+\item{c1}{Numeric. Cell cluster(s) to define markers for.} |
|
16 |
+ |
|
17 |
+\item{c2}{Numeric. Second cell cluster(s) for comparison. If NULL (default) - use all |
|
18 |
+other cells for comparison.} |
|
19 |
+ |
|
20 |
+\item{only.pos}{Logical. Only return markers with positive log2fc (FALSE by default).} |
|
21 |
+ |
|
22 |
+\item{log2fc.threshold}{Numeric. Only return markers whose the absolute values of log2fc |
|
23 |
+are greater than this threshold (0 by default).} |
|
24 |
+ |
|
25 |
+\item{fdr.threshold}{Numeric. Only return markers whose false discovery rates (FDRs) are less |
|
26 |
+than this threshold (1 by default).} |
|
27 |
+} |
|
28 |
+\value{ |
|
29 |
+Data frame containing a ranked list (based on the absolute value of log2fc) of putative markers, |
|
30 |
+ and associated statistics (p-value, log2fc and FDR). |
|
31 |
+} |
|
32 |
+\description{ |
|
33 |
+Finds markers (differentially expressed genes) for cell clusters |
|
34 |
+ using MAST: a flexible statistical framework for assessing transcriptional |
|
35 |
+ changes and characterizing heterogeneity in single-cell RNA sequencing data |
|
36 |
+ (Finak et al, Genome Biology, 2015) |
|
37 |
+} |
36 | 74 |
old mode 100644 |
37 | 75 |
new mode 100755 |
... | ... |
@@ -2,28 +2,24 @@ |
2 | 2 |
% Please edit documentation in R/celda_list.R |
3 | 3 |
\name{getModel} |
4 | 4 |
\alias{getModel} |
5 |
-\title{Run the celda Bayesian hierarchical model on a matrix of counts. |
|
6 |
-TODO: - If no chain provided, automatically choose best chain |
|
7 |
- - Smarter subsetting of the run.params to DRY this function up} |
|
5 |
+\title{Select the best model amongst a list of models generated in a celda run.} |
|
8 | 6 |
\usage{ |
9 |
-getModel(celda.list, K = NULL, L = NULL, chain = 1, best = NULL) |
|
7 |
+getModel(celda.list, K = NULL, L = NULL, chain = NULL, best = "loglik") |
|
10 | 8 |
} |
11 | 9 |
\arguments{ |
12 | 10 |
\item{celda.list}{A celda_list object returned from celda()} |
13 | 11 |
|
14 |
-\item{K}{The K parameter for the desired model in the results list} |
|
12 |
+\item{K}{The K parameter for the desired model in the results list. Defaults to NULL.} |
|
15 | 13 |
|
16 |
-\item{L}{The L parameter for the desired model in the results list} |
|
14 |
+\item{L}{The L parameter for the desired model in the results list. Defaults to NULL.} |
|
17 | 15 |
|
18 |
-\item{chain}{The desired chain for the specified model} |
|
16 |
+\item{chain}{The desired chain for the specified model. Defaults to NULL. Overrides best parameter if provided.} |
|
19 | 17 |
|
20 |
-\item{best}{Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Overrides chain parameter if provided.} |
|
18 |
+\item{best}{Method for choosing best chain automatically. Options are c("perplexity", "loglik"). See documentation for chooseBestModel for details. Defaults to "loglik."} |
|
21 | 19 |
} |
22 | 20 |
\value{ |
23 | 21 |
A celda model object matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found. |
24 | 22 |
} |
25 | 23 |
\description{ |
26 |
-Run the celda Bayesian hierarchical model on a matrix of counts. |
|
27 |
-TODO: - If no chain provided, automatically choose best chain |
|
28 |
- - Smarter subsetting of the run.params to DRY this function up |
|
24 |
+Select the best model amongst a list of models generated in a celda run. |
|
29 | 25 |
} |
58 | 54 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,19 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/celda_list.R |
|
3 |
+\name{selectModels} |
|
4 |
+\alias{selectModels} |
|
5 |
+\title{Select a set of models from a celda_list objects based off of rows in its run.params attribute.} |
|
6 |
+\usage{ |
|
7 |
+selectModels(celda.list, run.param.rows) |
|
8 |
+} |
|
9 |
+\arguments{ |
|
10 |
+\item{celda.list}{A celda_list object returned from celda()} |
|
11 |
+ |
|
12 |
+\item{run.param.rows}{Row indices in the celda.list's run params corresponding to the desired models.} |
|
13 |
+} |
|
14 |
+\value{ |
|
15 |
+A celda_list containing celda model objects matching the provided parameters (of class "celda_C", "celda_G", "celda_CG" accordingly), or NA if one is not found. |
|
16 |
+} |
|
17 |
+\description{ |
|
18 |
+Select a set of models from a celda_list objects based off of rows in its run.params attribute. |
|
19 |
+} |
8 | 28 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,40 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/StateHeatmap.R |
|
3 |
+\name{stateHeatmap} |
|
4 |
+\alias{stateHeatmap} |
|
5 |
+\title{Transcriptional state heatmap} |
|
6 |
+\usage{ |
|
7 |
+stateHeatmap(counts, celda.mod, state.use = 1, cells.use = NULL, |
|
8 |
+ genes.use = NULL, normalize = TRUE, scale_row = scale, |
|
9 |
+ show_genenames = TRUE) |
|
10 |
+} |
|
11 |
+\arguments{ |
|
12 |
+\item{counts}{A numeric count matrix.} |
|
13 |
+ |
|
14 |
+\item{celda.mod}{An object of class celda_C or celda_CG.} |
|
15 |
+ |
|
16 |
+\item{state.use}{Numeric. A transcriptional state to plot.} |
|
17 |
+ |
|
18 |
+\item{cells.use}{Numeric or character. If a number, plot only this number of cells with respectively |
|
19 |
+highest and lowest proportions of counts in the transcriptional state. If a list of cell names, |
|
20 |
+plot only these cells orderd by their proportions of counts in the transcriptional state from |
|
21 |
+left to right. If NULL, plot all cells (NULL by default).} |
|
22 |
+ |
|
23 |
+\item{genes.use}{Numeric or character. If a number, plot only this number of genes with respectively |
|
24 |
+highest and lowest proportions of counts in the transcriptional state. If a list of gene names, |
|
25 |
+plot only these genes orderd by their proportions of counts in the transcriptional state from |
|
26 |
+the bottom up. If NULL, plot all genes in the transcriptional state (NULL by default).} |
|
27 |
+ |
|
28 |
+\item{normalize}{Logical. Should the columns be normmalized? Set to FALSE to disable. 'normalizeCounts' |
|
29 |
+normalizes to counts per million (CPM) (default by TRUE).} |
|
30 |
+ |
|
31 |
+\item{scale_row}{Function; A function to scale each individual row. Set to NULL to disable. Occurs |
|
32 |
+after normalization and log transformation. Defualt is 'scale' and thus will Z-score transform each row.} |
|
33 |
+ |
|
34 |
+\item{show_genenames}{Logical; specifying if gene names should be shown. Default to be TRUE.} |
|
35 |
+} |
|
36 |
+\description{ |
|
37 |
+Draws a heatmap focusing on a transcriptional state. Both cells and genes are sorted by |
|
38 |
+ their proportions of counts in a given transcriptional state. Allows for nice visualization of |
|
39 |
+ co-expression of those genes grouped into transcriptional states by Celda. |
|
40 |
+} |
20 | 61 |
old mode 100644 |
21 | 62 |
new mode 100755 |
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
--- |
2 | 2 |
title: "Analyzing single-cell RNA-seq count data with Celda" |
3 |
-author: "Josh D. Campbell, Masanao Yajima, Sean Corbett, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Zhe Wang, Yusuke Koga" |
|
3 |
+author: "Josh D. Campbell, Masanao Yajima, Sean Corbett, Zichun Liu, Shiyi Yang, Tianwen Huan, Anahita Bahri, Zhe Wang, Yusuke Koga, Jiangyuan Liu" |
|
4 | 4 |
date: "`r Sys.Date()`" |
5 | 5 |
output: rmarkdown::html_vignette |
6 | 6 |
vignette: > |
... | ... |
@@ -104,18 +104,18 @@ table(sim_counts$z, sim_counts$sample.label) |
104 | 104 |
|
105 | 105 |
## 2.1.1 How to Run Celda |
106 | 106 |
|
107 |
-There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. Here is the command for `celda_CG`: |
|
107 |
+There are currently three models within this package: `celda_C` will cluster cells, `celda_G` will cluster genes, and `celda_CG` will simultaneously cluster cells and genes. The `celda` function will use these models and take in a counts matrix to create a `celda` object that contains the cluster labels for each cell and gene. |
|
108 | 108 |
|
109 | 109 |
```{r, fig.show='hold', warning = FALSE, message = FALSE} |
110 | 110 |
|
111 |
-celdaCG.res = celda_CG(counts = sim_counts$counts, K = 3, L = 10, max.iter = 10, sample.label = sim_counts$sample.label) |
|
111 |
+celda.res = celda(counts = sim_counts$counts, model = "celda_CG", K = 3, L = 10, max.iter = 10, cores = 1, nchains = 1) |
|
112 | 112 |
``` |
113 | 113 |
|
114 | 114 |
Here is a comparison between the true cluster labels and the estimated cluster labels. |
115 | 115 |
|
116 | 116 |
```{r} |
117 |
-z = celdaCG.res$z |
|
118 |
-y = celdaCG.res$y |
|
117 |
+z = celda.res$res.list[[1]]$z |
|
118 |
+y = celda.res$res.list[[1]]$y |
|
119 | 119 |
|
120 | 120 |
table(z, sim_counts$z) |
121 | 121 |
table(y, sim_counts$y) |
... | ... |
@@ -131,12 +131,28 @@ renderCeldaHeatmap(counts = norm.counts, z=z, y=y, normalize = NULL, color_schem |
131 | 131 |
|
132 | 132 |
## 2.1.2 Matrix factorization |
133 | 133 |
|
134 |
-`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. |
|
134 |
+`celda` can also perform matrix factorization to summarize the contribution of each transcriptional state to each cellular subpopulation. Matrix factorization decomposes the counts matrix into multiple matrices, using the given `celda` model. The gene.states contains each gene's contribution to the transcriptional state it belongs. The cell.states contains each sample's contribution to each transcriptional state. The population.states contains each transcriptional states' contribution to each of the cell states. Each of these matrices can be viewed as raw counts values or proportional values. |
|
135 | 135 |
|
136 | 136 |
```{r} |
137 |
-factorized <- factorizeMatrix(celdaCG.res, sim_counts$count) |
|
137 |
+model <- getModel(celda.res, K = 3, L = 10, best = "loglik") |
|
138 |
+factorized <- factorizeMatrix(model, sim_counts$count) |
|
139 |
+``` |
|
138 | 140 |
|
141 |
+```{r} |
|
142 |
+dim(factorized$proportions$gene.states) |
|
143 |
+head(factorized$proportions$gene.states) |
|
144 |
+``` |
|
145 |
+ |
|
146 |
+```{r} |
|
147 |
+dim(factorized$proportions$cell.states) |
|
148 |
+factorized$proportions$cell.states[1:10,1:3] |
|
149 |
+``` |
|
150 |
+ |
|
151 |
+For instance, roughly 30% of the population in cell state K2 is in transcriptional state L1. |
|
152 |
+ |
|
153 |
+```{r} |
|
139 | 154 |
pop.states = factorized$proportions$population.states |
155 |
+dim(pop.states) |
|
140 | 156 |
head(pop.states) |
141 | 157 |
``` |
142 | 158 |
|
... | ... |
@@ -145,7 +161,7 @@ head(pop.states) |
145 | 161 |
```{r, fig.width = 7, fig.height = 7} |
146 | 162 |
data.pca <- prcomp(t(scale(t(factorized$proportions$cell.states))),scale = F, center = F) |
147 | 163 |
|
148 |
-plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = as.factor(celdaCG.res$z)) |
|
164 |
+plotDrCluster(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], cluster = celda.res$res.list[[1]]$z) |
|
149 | 165 |
|
150 | 166 |
plotDrState(dim1 = data.pca$rotation[,1], dim2 = data.pca$rotation[,2], matrix = factorized$proportions$cell.states, rescale = TRUE) |
151 | 167 |
``` |
... | ... |
@@ -171,7 +187,7 @@ The relative probabilities will tend to reflect the patterns observed in the hea |
171 | 187 |
The optimal number of K and L will likely not be known a priori. Therefore multiple choices of K and L may need to tested and compared. Additionally, `celda` is sensitive to initial start conditions, so it is good practice to run multiple chains for each combination of K and L to increase the chances of finding the most optimal solution. We have built a wrapper function that will run multiple combinations of K and L with multiple chains for each combination in parallel. |
172 | 188 |
|
173 | 189 |
```{r} |
174 |
-celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG") |
|
190 |
+celda.res.list <- celda(counts = sim_counts$counts,K = 2:4, L = 9:11, nchains = 3, max.iter = 10, model = "celda_CG", cores = 1) |
|
175 | 191 |
``` |
176 | 192 |
|
177 | 193 |
Here, `celda` will fit the model for every combination of K between 2 and 4 and L between 9 and 11. Three chains will be run for each combination. The parameter "cores" can be set to something more than 1 to run multiple chains in parallel. |
... | ... |
@@ -216,23 +232,23 @@ pbmc_select <- pbmc_data[rowSums(pbmc_data>4) > 4,] |
216 | 232 |
|
217 | 233 |
|
218 | 234 |
```{r, message = FALSE, verbose = FALSE} |
219 |
-pbmc.res = celda(pbmc_select, K = 15, L = 20, cores = 1, model = "celda_CG", nchains = 10, max.iter = 100) |
|
220 |
-model.res.list<-getModel(pbmc.res, K = 15, L = 20, best = "loglik") |
|
235 |
+pbmc.res = celda(pbmc_select, K = 10, L = 20, cores = 1, model = "celda_CG", nchains = 8, max.iter = 100) |
|
236 |
+model<-getModel(pbmc.res, K = 10, L = 20, best = "loglik") |
|
221 | 237 |
``` |
222 | 238 |
|
223 | 239 |
After matrix factorization, the top genes can be selected using the `topRank` function on the "gene.states" matrix: |
224 | 240 |
|
225 | 241 |
```{r} |
226 |
-factorize.matrix = factorizeMatrix(model.res.list, counts=pbmc_select) |
|
242 |
+factorize.matrix = factorizeMatrix(model, counts=pbmc_select) |
|
227 | 243 |
top.genes <- topRank(factorize.matrix$proportions$gene.states, n = 25) |
228 | 244 |
``` |
229 | 245 |
|
230 | 246 |
```{r} |
231 |
-top.genes$names$L6 |
|
247 |
+top.genes$names$L3 |
|
232 | 248 |
``` |
233 | 249 |
|
234 | 250 |
```{r} |
235 |
-top.genes$names$L15 |
|
251 |
+top.genes$names$L4 |
|
236 | 252 |
``` |
237 | 253 |
|
238 | 254 |
We can make a heatmap of these top genes as follows: |
... | ... |
@@ -240,7 +256,7 @@ We can make a heatmap of these top genes as follows: |
240 | 256 |
```{r, fig.width = 7, fig.height = 7} |
241 | 257 |
top.genes.ix <- unique(unlist(top.genes$index)) |
242 | 258 |
norm.pbmc.counts <- normalizeCounts(pbmc_select) |
243 |
-renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model.res.list$z, y = model.res.list$y[top.genes.ix], normalize = NULL, color_scheme = "divergent") |
|
259 |
+renderCeldaHeatmap(norm.pbmc.counts[top.genes.ix,], z = model$z, y = model$y[top.genes.ix], normalize = NULL, color_scheme = "divergent") |
|
244 | 260 |
``` |
245 | 261 |
|
246 | 262 |
We can plot tSNE results via `plot_dr_cluster`, `plot_dr_state` and `plot_dr_gene`. These will determine how each cell is labeled via celda, visualize each cell's expression respectivelyof a specific transcriptional state and visualize each cell's expression of a set of genes, . |
... | ... |
@@ -248,12 +264,12 @@ We can plot tSNE results via `plot_dr_cluster`, `plot_dr_state` and `plot_dr_gen |
248 | 264 |
```{r, message = FALSE} |
249 | 265 |
norm_pbmc <- normalizeCounts(factorize.matrix$counts$cell.states) |
250 | 266 |
set.seed(123) |
251 |
-tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 5000) |
|
267 |
+tsne <- Rtsne(t(norm_pbmc), pca = FALSE, max_iter = 2000) |
|
252 | 268 |
pbmc_tsne <- tsne$Y |
253 | 269 |
``` |
254 | 270 |
|
255 | 271 |
```{r, fig.width = 7, fig.height = 7} |
256 |
-plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model.res.list$z)) |
|
272 |
+plotDrCluster(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], cluster = as.factor(model$z)) |
|
257 | 273 |
|
258 | 274 |
plotDrState(dim1 = pbmc_tsne[,1], dim2 = pbmc_tsne[,2], matrix = factorize.matrix$proportions$cell.states, rescale = TRUE) |
259 | 275 |
|
... | ... |
@@ -262,4 +278,4 @@ gene.counts <- pbmc_select[marker.genes,] |
262 | 278 | <