... | ... |
@@ -1,14 +1,16 @@ |
1 | 1 |
Package: celda |
2 | 2 |
Title: CEllular Latent Dirichlet Allocation |
3 |
-Version: 0.99.8 |
|
3 |
+Version: 0.99.9 |
|
4 | 4 |
Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), |
5 | 5 |
person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), |
6 |
- person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut"))) |
|
6 |
+ person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), |
|
7 |
+ person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut"))) |
|
7 | 8 |
Description: celda leverages Bayesian hierarchical modeling to cluster genes, |
8 | 9 |
cells, or both simultaneously from single cell sequencing data. |
9 | 10 |
Depends: |
10 | 11 |
R (>= 3.5), |
11 | 12 |
SummarizedExperiment |
13 |
+VignetteBuilder: knitr |
|
12 | 14 |
Imports: |
13 | 15 |
plyr, |
14 | 16 |
foreach, |
... | ... |
@@ -31,23 +33,29 @@ Imports: |
31 | 33 |
GenomicRanges, |
32 | 34 |
data.table, |
33 | 35 |
Rcpp, |
34 |
- RcppEigen |
|
36 |
+ RcppEigen, |
|
37 |
+ umap, |
|
38 |
+ enrichR, |
|
39 |
+ vcd, |
|
40 |
+ ggpubr, |
|
41 |
+ stringi, |
|
42 |
+ SummarizedExperiment |
|
35 | 43 |
Suggests: |
36 | 44 |
testthat, |
37 | 45 |
knitr, |
38 | 46 |
roxygen2, |
39 | 47 |
rmarkdown, |
40 |
- vcd, |
|
41 | 48 |
corrplot, |
42 | 49 |
Matrix, |
43 | 50 |
biomaRt, |
44 | 51 |
Rtsne, |
45 |
- covr |
|
52 |
+ covr, |
|
53 |
+ M3DExampleData, |
|
54 |
+ BiocManager |
|
46 | 55 |
LinkingTo: Rcpp, RcppEigen |
47 |
-VignetteBuilder: knitr |
|
48 |
-License: MIT |
|
56 |
+License: MIT + file LICENSE |
|
49 | 57 |
Encoding: UTF-8 |
50 | 58 |
LazyData: true |
51 |
-RoxygenNote: 6.1.0 |
|
52 |
-BugReports: https://github.com/definitelysean/celda/issues |
|
59 |
+RoxygenNote: 6.1.1 |
|
60 |
+BugReports: https://github.com/campbio/celda/issues |
|
53 | 61 |
biocViews: SingleCell, GeneExpression, Clustering, Sequencing, Bayesian |
... | ... |
@@ -10,6 +10,7 @@ export(celdaHeatmap) |
10 | 10 |
export(celdaPerplexity) |
11 | 11 |
export(celdaProbabilityMap) |
12 | 12 |
export(celdaTsne) |
13 |
+export(celdaUmap) |
|
13 | 14 |
export(celda_C) |
14 | 15 |
export(celda_CG) |
15 | 16 |
export(celda_G) |
... | ... |
@@ -21,6 +22,7 @@ export(distinct_colors) |
21 | 22 |
export(factorizeMatrix) |
22 | 23 |
export(featureModuleLookup) |
23 | 24 |
export(featureModuleTable) |
25 |
+export(geneSetEnrich) |
|
24 | 26 |
export(logLikelihood) |
25 | 27 |
export(logLikelihood.celda_C) |
26 | 28 |
export(logLikelihood.celda_CG) |
... | ... |
@@ -59,6 +61,7 @@ export(violinPlot) |
59 | 61 |
exportMethods(celdaHeatmap) |
60 | 62 |
exportMethods(celdaProbabilityMap) |
61 | 63 |
exportMethods(celdaTsne) |
64 |
+exportMethods(celdaUmap) |
|
62 | 65 |
exportMethods(clusterProbability) |
63 | 66 |
exportMethods(factorizeMatrix) |
64 | 67 |
exportMethods(featureModuleLookup) |
... | ... |
@@ -70,6 +73,7 @@ import(grDevices) |
70 | 73 |
import(graphics) |
71 | 74 |
import(grid) |
72 | 75 |
import(gtable) |
76 |
+import(matrixStats) |
|
73 | 77 |
import(plyr) |
74 | 78 |
import(scales) |
75 | 79 |
useDynLib(celda,"_colSumByGroup") |
... | ... |
@@ -75,12 +75,12 @@ moduleHeatmap <- function(counts, celda.mod, feature.module = 1, top.cells = 100 |
75 | 75 |
gene_ix = match(rownames(filtered_norm.counts), celda.mod@names$row) |
76 | 76 |
cell_ix = match(colnames(filtered_norm.counts), celda.mod@names$column) |
77 | 77 |
z.to.plot = c() |
78 |
- if(methods::.hasSlot(celda.mod, "z")){ |
|
78 |
+ if (is(celda.mod, "celda_C") || is(celda.mod, "celda_CG")){ |
|
79 | 79 |
cell <- distinct_colors(length(unique(celda.mod@clusters$z)))[sort(unique(celda.mod@clusters$z[cell_ix]))] |
80 | 80 |
names(cell) <- sort(unique(celda.mod@clusters$z[cell_ix])) |
81 | 81 |
anno_cell_colors <- list(cell = cell) |
82 | 82 |
z.to.plot = celda.mod@clusters$z[cell.indices] |
83 |
- }else{ |
|
83 |
+ } else { |
|
84 | 84 |
anno_cell_colors <- NULL |
85 | 85 |
} |
86 | 86 |
plotHeatmap( |
... | ... |
@@ -300,7 +300,14 @@ setGeneric("celdaProbabilityMap", |
300 | 300 |
#' Embeds cells in two dimensions using tSNE based on celda_CG results. |
301 | 301 |
#' |
302 | 302 |
#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. |
303 |
-#' @param celda.mod Celda model. Options available in `celda::available.models`. |
|
303 |
+#' @param celda.mod Celda object of class `celda_CG`. |
|
304 |
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. |
|
305 |
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. |
|
306 |
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. |
|
307 |
+#' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. |
|
308 |
+#' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500. |
|
309 |
+#' @param seed Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made. |
|
310 |
+#' @param ... Additional parameters. |
|
304 | 311 |
#' @param ... Additional parameters. |
305 | 312 |
#' @return Numeric Matrix of dimension `ncol(counts)` x 2, colums representing the "X" and "Y" coordinates in the data's t-SNE represetation. |
306 | 313 |
#' @examples |
... | ... |
@@ -317,6 +324,30 @@ setGeneric("celdaTsne", |
317 | 324 |
}) |
318 | 325 |
|
319 | 326 |
|
327 |
+#' Embeds cells in two dimensions using umap. |
|
328 |
+#' |
|
329 |
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. |
|
330 |
+#' @param celda.mod Celda object of class `celda_CG`. |
|
331 |
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. |
|
332 |
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. |
|
333 |
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. |
|
334 |
+#' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. |
|
335 |
+#' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500. |
|
336 |
+#' @param seed Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made. |
|
337 |
+#' @param ... Additional parameters. |
|
338 |
+#' @param ... Additional parameters. |
|
339 |
+#' @return Numeric Matrix of dimension `ncol(counts)` x 2, colums representing the "X" and "Y" coordinates in the data's t-SNE represetation. |
|
340 |
+#' @examples |
|
341 |
+#' tsne.res = celdaTsne(celda.CG.sim$counts, celda.CG.mod) |
|
342 |
+#' @export |
|
343 |
+setGeneric("celdaUmap", |
|
344 |
+ signature = "celda.mod", |
|
345 |
+ function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
|
346 |
+ modules=NULL, umap.config=umap::umap.defaults) { |
|
347 |
+ standardGeneric("celdaUmap") |
|
348 |
+ }) |
|
349 |
+ |
|
350 |
+ |
|
320 | 351 |
#' Obtain the gene module of a gene of interest |
321 | 352 |
#' |
322 | 353 |
#' This function will output the corresponding feature module for a specified list of genes from a celda model. |
... | ... |
@@ -526,6 +526,7 @@ setMethod("clusterProbability", |
526 | 526 |
#' @seealso `celda_C()` for clustering cells |
527 | 527 |
#' @examples |
528 | 528 |
#' perplexity = perplexity(celda.C.sim$counts, celda.C.mod) |
529 |
+#' @import matrixStats |
|
529 | 530 |
#' @export |
530 | 531 |
setMethod("perplexity", |
531 | 532 |
signature(celda.mod = "celda_C"), |
... | ... |
@@ -617,55 +618,94 @@ setMethod("celdaTsne", |
617 | 618 |
initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, |
618 | 619 |
seed=12345, ...) { |
619 | 620 |
|
620 |
- counts = processCounts(counts) |
|
621 |
- compareCountMatrix(counts, celda.mod) |
|
622 |
- |
|
623 |
- ## Checking if max.cells and min.cluster.size will work |
|
624 |
- if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { |
|
625 |
- stop(paste0("Cannot distribute ", max.cells, " cells among ", |
|
626 |
- celda.mod@params$K, " clusters while maintaining a minumum of ", |
|
627 |
- min.cluster.size, |
|
628 |
- " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'.")) |
|
629 |
- } |
|
621 |
+ |
|
622 |
+ prepared.count.info = prepareCountsForDimReduction.celda_C(counts, celda.mod, max.cells, |
|
623 |
+ min.cluster.size, modules) |
|
630 | 624 |
|
631 |
- ## Select a subset of cells to sample if greater than 'max.cells' |
|
632 |
- total.cells.to.remove = ncol(counts) - max.cells |
|
633 |
- z.include = rep(TRUE, ncol(counts)) |
|
634 |
- if(total.cells.to.remove > 0) { |
|
635 |
- z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K) |
|
636 |
- |
|
637 |
- ## Number of cells that can be sampled from each cluster without |
|
638 |
- ## going below the minimum threshold |
|
639 |
- cluster.cells.to.sample = z.ta - min.cluster.size |
|
640 |
- cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0 |
|
641 |
- |
|
642 |
- ## Number of cells to sample after exluding smaller clusters |
|
643 |
- ## Rounding can cause number to be off by a few, so ceiling is |
|
644 |
- ## used with a second round of subtraction |
|
645 |
- cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove) |
|
646 |
- diff = sum(cluster.n.to.sample) - total.cells.to.remove |
|
647 |
- cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff |
|
648 |
- |
|
649 |
- ## Perform sampling for each cluster |
|
650 |
- for(i in which(cluster.n.to.sample > 0)) { |
|
651 |
- z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE |
|
652 |
- } |
|
653 |
- } |
|
654 |
- cell.ix = which(z.include) |
|
655 |
- |
|
656 |
- norm = t(normalizeCounts(counts[,cell.ix], normalize="proportion", |
|
657 |
- transformation.fun=sqrt)) |
|
658 |
- res = calculateTsne(norm, perplexity=perplexity, max.iter=max.iter, |
|
659 |
- seed=seed, do.pca=TRUE, |
|
660 |
- initial.dims = initial.dims) |
|
625 |
+ res = calculateTsne(prepared.count.info$norm, perplexity=perplexity, max.iter=max.iter, |
|
626 |
+ seed=seed, do.pca=TRUE, initial.dims = initial.dims) |
|
661 | 627 |
final = matrix(NA, nrow=ncol(counts), ncol=2) |
662 |
- final[cell.ix,] = res |
|
628 |
+ final[prepared.count.info$cell.ix,] = res |
|
663 | 629 |
rownames(final) = colnames(counts) |
664 | 630 |
colnames(final) = c("tsne_1", "tsne_2") |
665 | 631 |
return(final) |
666 | 632 |
}) |
667 | 633 |
|
668 | 634 |
|
635 |
+#' @title umap for celda_C |
|
636 |
+#' @description Embeds cells in two dimensions using umap based on a `celda_C` model. PCA on the normalized counts is used to reduce the number of features before applying umap. |
|
637 |
+#' |
|
638 |
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. |
|
639 |
+#' @param celda.mod Celda object of class `celda_C`. |
|
640 |
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. |
|
641 |
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. |
|
642 |
+#' @param initial.dims Integer. PCA will be used to reduce the dimentionality of the dataset. The top 'initial.dims' principal components will be used for tSNE. Default 20. |
|
643 |
+#' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. |
|
644 |
+#' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500. |
|
645 |
+#' @param seed Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made. |
|
646 |
+#' @param ... Additional parameters. |
|
647 |
+#' @seealso `celda_C()` for clustering cells and `celdaHeatmap()` for displaying expression |
|
648 |
+#' @examples |
|
649 |
+#' umap.res = celdaUmap(celda.C.sim$counts, celda.C.mod) |
|
650 |
+#' @return A two column matrix of t-SNE coordinates |
|
651 |
+#' @export |
|
652 |
+setMethod("celdaUmap", |
|
653 |
+ signature(celda.mod = "celda_C"), |
|
654 |
+ function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
|
655 |
+ modules=NULL, umap.config=umap::umap.defaults) { |
|
656 |
+ prepared.count.info = prepareCountsForDimReduction.celda_C(counts, celda.mod, max.cells, |
|
657 |
+ min.cluster.size, modules) |
|
658 |
+ res = calculateUmap(prepared.count.info$norm, umap.config) |
|
659 |
+ final = matrix(NA, nrow=ncol(counts), ncol=2) |
|
660 |
+ final[prepared.count.info$cell.ix,] = res |
|
661 |
+ rownames(final) = colnames(counts) |
|
662 |
+ colnames(final) = c("umap_1", "umap_2") |
|
663 |
+ return(final) |
|
664 |
+ }) |
|
665 |
+ |
|
666 |
+ |
|
667 |
+prepareCountsForDimReduction.celda_C = function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
|
668 |
+ modules=NULL) { |
|
669 |
+ counts = processCounts(counts) |
|
670 |
+ compareCountMatrix(counts, celda.mod) |
|
671 |
+ |
|
672 |
+ ## Checking if max.cells and min.cluster.size will work |
|
673 |
+ if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { |
|
674 |
+ stop(paste0("Cannot distribute ", max.cells, " cells among ", |
|
675 |
+ celda.mod@params$K, " clusters while maintaining a minumum of ", |
|
676 |
+ min.cluster.size, |
|
677 |
+ " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'.")) |
|
678 |
+ } |
|
679 |
+ |
|
680 |
+ ## Select a subset of cells to sample if greater than 'max.cells' |
|
681 |
+ total.cells.to.remove = ncol(counts) - max.cells |
|
682 |
+ z.include = rep(TRUE, ncol(counts)) |
|
683 |
+ if(total.cells.to.remove > 0) { |
|
684 |
+ z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K) |
|
685 |
+ |
|
686 |
+ ## Number of cells that can be sampled from each cluster without |
|
687 |
+ ## going below the minimum threshold |
|
688 |
+ cluster.cells.to.sample = z.ta - min.cluster.size |
|
689 |
+ cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0 |
|
690 |
+ |
|
691 |
+ ## Number of cells to sample after exluding smaller clusters |
|
692 |
+ ## Rounding can cause number to be off by a few, so ceiling is |
|
693 |
+ ## used with a second round of subtraction |
|
694 |
+ cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove) |
|
695 |
+ diff = sum(cluster.n.to.sample) - total.cells.to.remove |
|
696 |
+ cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff |
|
697 |
+ |
|
698 |
+ ## Perform sampling for each cluster |
|
699 |
+ for(i in which(cluster.n.to.sample > 0)) { |
|
700 |
+ z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE |
|
701 |
+ } |
|
702 |
+ } |
|
703 |
+ cell.ix = which(z.include) |
|
704 |
+ norm = t(normalizeCounts(counts[,cell.ix], normalize="proportion", |
|
705 |
+ transformation.fun=sqrt)) |
|
706 |
+ return(list(norm=norm, cell.ix=cell.ix)) |
|
707 |
+} |
|
708 |
+ |
|
669 | 709 |
|
670 | 710 |
#' @title Probability map for a celda_C model |
671 | 711 |
#' @description Renders probability and relative expression heatmaps to visualize the relationship between cell populations and samples. |
... | ... |
@@ -629,6 +629,7 @@ setMethod("clusterProbability", |
629 | 629 |
#' @seealso `celda_CG()` for clustering features and cells |
630 | 630 |
#' @examples |
631 | 631 |
#' perplexity = perplexity(celda.CG.sim$counts, celda.CG.mod) |
632 |
+#' @import matrixStats |
|
632 | 633 |
#' @export |
633 | 634 |
setMethod("perplexity", |
634 | 635 |
signature(celda.mod = "celda_CG"), |
... | ... |
@@ -738,62 +739,102 @@ setMethod("celdaTsne", |
738 | 739 |
function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
739 | 740 |
initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, |
740 | 741 |
seed=12345, ...) { |
741 |
- |
|
742 |
- ## Checking if max.cells and min.cluster.size will work |
|
743 |
- if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { |
|
744 |
- stop(paste0("Cannot distribute ", max.cells, " cells among ", |
|
745 |
- celda.mod@params$K, " clusters while maintaining a minumum of ", |
|
746 |
- min.cluster.size, " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'.")) |
|
747 |
- } |
|
748 |
- |
|
749 |
- fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, |
|
750 |
- type="counts") |
|
751 |
- modules.to.use = 1:nrow(fm$counts$cell) |
|
752 |
- if (!is.null(modules)) { |
|
753 |
- if (!all(modules %in% modules.to.use)) { |
|
754 |
- stop("'modules' must be a vector of numbers between 1 and ", |
|
755 |
- modules.to.use, ".") |
|
756 |
- } |
|
757 |
- modules.to.use = modules |
|
758 |
- } |
|
759 |
- |
|
760 |
- |
|
761 |
- ## Select a subset of cells to sample if greater than 'max.cells' |
|
762 |
- total.cells.to.remove = ncol(counts) - max.cells |
|
763 |
- z.include = rep(TRUE, ncol(counts)) |
|
764 |
- if(total.cells.to.remove > 0) { |
|
765 |
- z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K) |
|
766 |
- |
|
767 |
- ## Number of cells that can be sampled from each cluster without |
|
768 |
- ## going below the minimum threshold |
|
769 |
- cluster.cells.to.sample = z.ta - min.cluster.size |
|
770 |
- cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0 |
|
771 |
- |
|
772 |
- ## Number of cells to sample after exluding smaller clusters |
|
773 |
- ## Rounding can cause number to be off by a few, so ceiling is used |
|
774 |
- ## with a second round of subtraction |
|
775 |
- cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove) |
|
776 |
- diff = sum(cluster.n.to.sample) - total.cells.to.remove |
|
777 |
- cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff |
|
778 |
- |
|
779 |
- ## Perform sampling for each cluster |
|
780 |
- for(i in which(cluster.n.to.sample > 0)) { |
|
781 |
- z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE |
|
782 |
- } |
|
783 |
- } |
|
784 |
- cell.ix = which(z.include) |
|
785 |
- |
|
786 |
- norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], normalize="proportion", transformation.fun=sqrt)) |
|
787 | 742 |
|
743 |
+ prepared.count.info = prepareCountsForDimReduction.celda_CG(counts, celda.mod, |
|
744 |
+ max.cells, min.cluster.size, |
|
745 |
+ initial.dims, modules, ...) |
|
746 |
+ norm = prepared.count.info$norm |
|
788 | 747 |
res = calculateTsne(norm, do.pca=FALSE, perplexity=perplexity, max.iter=max.iter, seed=seed) |
789 | 748 |
final = matrix(NA, nrow=ncol(counts), ncol=2) |
790 |
- final[cell.ix,] = res |
|
749 |
+ final[prepared.count.info$cell.ix,] = res |
|
791 | 750 |
rownames(final) = colnames(counts) |
792 | 751 |
colnames(final) = c("tsne_1", "tsne_2") |
793 | 752 |
return(final) |
794 | 753 |
}) |
795 | 754 |
|
796 | 755 |
|
756 |
+#' @title umap for celda_CG |
|
757 |
+#' @description Embeds cells in two dimensions using umap based on a `celda_CG` model. umap is run on module probabilities to reduce the number of features instead of using PCA. Module probabilities square-root trasformed before applying tSNE. |
|
758 |
+#' |
|
759 |
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. |
|
760 |
+#' @param celda.mod Celda object of class `celda_CG`. |
|
761 |
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. |
|
762 |
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. |
|
763 |
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. |
|
764 |
+#' @param umap.config Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults` |
|
765 |
+#' @seealso `celda_CG()` for clustering features and cells and `celdaHeatmap()` for displaying expression |
|
766 |
+#' @examples |
|
767 |
+#' umap.res = celdaUmap(celda.CG.sim$counts, celda.CG.mod) |
|
768 |
+#' @return A two column matrix of t-SNE coordinates |
|
769 |
+#' @export |
|
770 |
+setMethod("celdaUmap", |
|
771 |
+ signature(celda.mod = "celda_CG"), |
|
772 |
+ function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
|
773 |
+ modules=NULL, umap.config=umap::umap.defaults) { |
|
774 |
+ prepared.count.info = prepareCountsForDimReduction.celda_CG(counts, celda.mod, |
|
775 |
+ max.cells, min.cluster.size, |
|
776 |
+ initial.dims, modules) |
|
777 |
+ umap.res = calculateUmap(prepared.count.info$norm, |
|
778 |
+ umap.config) |
|
779 |
+ final = matrix(NA, nrow=ncol(counts), ncol=2) |
|
780 |
+ final[prepared.count.info$cell.ix, ] = umap.res |
|
781 |
+ rownames(final) = colnames(counts) |
|
782 |
+ colnames(final) = c("umap_1", "umap_2") |
|
783 |
+ return(final) |
|
784 |
+ }) |
|
785 |
+ |
|
786 |
+ |
|
787 |
+prepareCountsForDimReduction.celda_CG = function(counts, celda.mod, max.cells=25000, |
|
788 |
+ min.cluster.size=100, |
|
789 |
+ initial.dims=20, modules=NULL, ...) { |
|
790 |
+ ## Checking if max.cells and min.cluster.size will work |
|
791 |
+ if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { |
|
792 |
+ stop(paste0("Cannot distribute ", max.cells, " cells among ", |
|
793 |
+ celda.mod@params$K, " clusters while maintaining a minumum of ", |
|
794 |
+ min.cluster.size, " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'.")) |
|
795 |
+ } |
|
796 |
+ |
|
797 |
+ fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, |
|
798 |
+ type="counts") |
|
799 |
+ modules.to.use = 1:nrow(fm$counts$cell) |
|
800 |
+ if (!is.null(modules)) { |
|
801 |
+ if (!all(modules %in% modules.to.use)) { |
|
802 |
+ stop("'modules' must be a vector of numbers between 1 and ", |
|
803 |
+ modules.to.use, ".") |
|
804 |
+ } |
|
805 |
+ modules.to.use = modules |
|
806 |
+ } |
|
807 |
+ |
|
808 |
+ |
|
809 |
+ ## Select a subset of cells to sample if greater than 'max.cells' |
|
810 |
+ total.cells.to.remove = ncol(counts) - max.cells |
|
811 |
+ z.include = rep(TRUE, ncol(counts)) |
|
812 |
+ if(total.cells.to.remove > 0) { |
|
813 |
+ z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K) |
|
814 |
+ |
|
815 |
+ ## Number of cells that can be sampled from each cluster without |
|
816 |
+ ## going below the minimum threshold |
|
817 |
+ cluster.cells.to.sample = z.ta - min.cluster.size |
|
818 |
+ cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0 |
|
819 |
+ |
|
820 |
+ ## Number of cells to sample after exluding smaller clusters |
|
821 |
+ ## Rounding can cause number to be off by a few, so ceiling is used |
|
822 |
+ ## with a second round of subtraction |
|
823 |
+ cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove) |
|
824 |
+ diff = sum(cluster.n.to.sample) - total.cells.to.remove |
|
825 |
+ cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff |
|
826 |
+ |
|
827 |
+ ## Perform sampling for each cluster |
|
828 |
+ for(i in which(cluster.n.to.sample > 0)) { |
|
829 |
+ z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE |
|
830 |
+ } |
|
831 |
+ } |
|
832 |
+ cell.ix = which(z.include) |
|
833 |
+ |
|
834 |
+ norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], normalize="proportion", transformation.fun=sqrt)) |
|
835 |
+ return(list(norm=norm, cell.ix=cell.ix)) |
|
836 |
+} |
|
837 |
+ |
|
797 | 838 |
|
798 | 839 |
#' @title Probability map for a celda_CG model |
799 | 840 |
#' @description Renders probability and relative expression heatmaps to visualize the relationship between features and cell populations or cell populations and samples. |
... | ... |
@@ -192,7 +192,7 @@ cG.calcGibbsProbY = function(counts, n.TS.by.C, n.by.TS, nG.by.TS, n.by.G, y, L, |
192 | 192 |
ix <- sample(1:nG) |
193 | 193 |
for(i in ix) { |
194 | 194 |
probs[,i] = cG_CalcGibbsProbY(index=i, counts=counts, nTSbyC=n.TS.by.C, nbyTS=n.by.TS, nGbyTS=nG.by.TS, nbyG=n.by.G, y=y, L=L, nG=nG, lg_beta=lgbeta, lg_gamma=lggamma, lg_delta=lgdelta, delta=delta) |
195 |
- if(any(is.na(probs[,i]))) browser() |
|
195 |
+ #if(any(is.na(probs[,i]))) browser() |
|
196 | 196 |
## Sample next state and add back counts |
197 | 197 |
if(isTRUE(do.sample)) { |
198 | 198 |
prev.y = y[i] |
... | ... |
@@ -603,28 +603,10 @@ setMethod("celdaTsne", |
603 | 603 |
initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, |
604 | 604 |
seed=12345, ...) { |
605 | 605 |
|
606 |
- if(max.cells > ncol(counts)) { |
|
607 |
- max.cells = ncol(counts) |
|
608 |
- } |
|
609 | 606 |
|
610 |
- fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, |
|
611 |
- type="counts") |
|
612 |
- |
|
613 |
- modules.to.use = 1:nrow(fm$counts$cell) |
|
614 |
- if (!is.null(modules)) { |
|
615 |
- if (!all(modules %in% modules.to.use)) { |
|
616 |
- stop("'modules' must be a vector of numbers between 1 and ", |
|
617 |
- modules.to.use, ".") |
|
618 |
- } |
|
619 |
- modules.to.use = modules |
|
620 |
- } |
|
621 |
- |
|
622 |
- cell.ix = sample(1:ncol(counts), max.cells) |
|
623 |
- norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], |
|
624 |
- normalize="proportion", |
|
625 |
- transformation.fun=sqrt)) |
|
626 |
- |
|
627 |
- res = calculateTsne(norm, do.pca=FALSE, perplexity=perplexity, |
|
607 |
+ prepared.count.info = prepareCountsForDimReduction.celda_G(counts, celda.mod, max.cells, |
|
608 |
+ min.cluster.size, modules) |
|
609 |
+ res = calculateTsne(prepared.count.info$norm, do.pca=FALSE, perplexity=perplexity, |
|
628 | 610 |
max.iter=max.iter, seed=seed) |
629 | 611 |
rownames(res) = colnames(counts) |
630 | 612 |
colnames(res) = c("tsne_1", "tsne_2") |
... | ... |
@@ -632,6 +614,62 @@ setMethod("celdaTsne", |
632 | 614 |
}) |
633 | 615 |
|
634 | 616 |
|
617 |
+#' @title umap for celda_G |
|
618 |
+#' @description Embeds cells in two dimensions using umap based on a `celda_G` model. umap is run on module probabilities to reduce the number of features instead of using PCA. Module probabilities square-root trasformed before applying tSNE. |
|
619 |
+#' |
|
620 |
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. |
|
621 |
+#' @param celda.mod Celda object of class `celda_CG`. |
|
622 |
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. |
|
623 |
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. |
|
624 |
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. |
|
625 |
+#' @param umap.config Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults` |
|
626 |
+#' @seealso `celda_G()` for clustering features and cells and `celdaHeatmap()` for displaying expression |
|
627 |
+#' @examples |
|
628 |
+#' umap.res = celdaUmap(celda.G.sim$counts, celda.G.mod) |
|
629 |
+#' @return A two column matrix of t-SNE coordinates |
|
630 |
+#' @export |
|
631 |
+setMethod("celdaUmap", |
|
632 |
+ signature(celda.mod = "celda_G"), |
|
633 |
+ function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
|
634 |
+ modules=NULL, umap.config=umap::umap.defaults) { |
|
635 |
+ prepared.count.info = prepareCountsForDimReduction.celda_G(counts, celda.mod, |
|
636 |
+ max.cells, min.cluster.size, |
|
637 |
+ modules) |
|
638 |
+ umap.res = calculateUmap(prepared.count.info$norm, umap.config) |
|
639 |
+ final = matrix(NA, nrow=ncol(counts), ncol=2) |
|
640 |
+ final[prepared.count.info$cell.ix, ] = umap.res |
|
641 |
+ rownames(final) = colnames(counts) |
|
642 |
+ colnames(final) = c("umap_1", "umap_2") |
|
643 |
+ return(final) |
|
644 |
+ }) |
|
645 |
+ |
|
646 |
+ |
|
647 |
+prepareCountsForDimReduction.celda_G = function(counts, celda.mod, max.cells=25000, min.cluster.size=100, |
|
648 |
+ modules=NULL) { |
|
649 |
+ if(max.cells > ncol(counts)) { |
|
650 |
+ max.cells = ncol(counts) |
|
651 |
+ } |
|
652 |
+ |
|
653 |
+ fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, |
|
654 |
+ type="counts") |
|
655 |
+ |
|
656 |
+ modules.to.use = 1:nrow(fm$counts$cell) |
|
657 |
+ if (!is.null(modules)) { |
|
658 |
+ if (!all(modules %in% modules.to.use)) { |
|
659 |
+ stop("'modules' must be a vector of numbers between 1 and ", |
|
660 |
+ modules.to.use, ".") |
|
661 |
+ } |
|
662 |
+ modules.to.use = modules |
|
663 |
+ } |
|
664 |
+ |
|
665 |
+ cell.ix = sample(1:ncol(counts), max.cells) |
|
666 |
+ norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], |
|
667 |
+ normalize="proportion", |
|
668 |
+ transformation.fun=sqrt)) |
|
669 |
+ return(list(norm=norm, cell.ix=cell.ix)) |
|
670 |
+} |
|
671 |
+ |
|
672 |
+ |
|
635 | 673 |
#' @title Lookup the module of a feature |
636 | 674 |
#' @description Finds the module assignments of given features in a `celda_G()` model |
637 | 675 |
#' |
... | ... |
@@ -308,7 +308,7 @@ featureModuleTable = function(counts, celda.mod, output.file = NULL){ |
308 | 308 |
if(is.null(output.file)){ |
309 | 309 |
return(res) |
310 | 310 |
}else{ |
311 |
- write.table(res, file = output.file, sep = "\t", row.names = F, quote = F) |
|
311 |
+ utils::write.table(res, file = output.file, sep = "\t", row.names = FALSE, quote = FALSE) |
|
312 | 312 |
} |
313 | 313 |
} |
314 | 314 |
|
... | ... |
@@ -334,7 +334,7 @@ violinPlot = function(counts, celda.mod, features){ |
334 | 334 |
colnames(m) = c("Cluster","Feature","Expression") |
335 | 335 |
color_pal = distinct_colors(length(unique(cluster))) |
336 | 336 |
p <- ggplot2::ggplot(m, ggplot2::aes(x=Cluster, y=Expression, fill=Cluster)) + |
337 |
- ggplot2::facet_wrap(~Feature) + ggplot2::geom_violin(trim=T, scale = "width") + |
|
337 |
+ ggplot2::facet_wrap(~Feature) + ggplot2::geom_violin(trim=TRUE, scale = "width") + |
|
338 | 338 |
ggplot2::geom_jitter(height = 0, size = 0.1) + |
339 | 339 |
ggplot2::scale_fill_manual(values = color_pal) |
340 | 340 |
return(p) |
... | ... |
@@ -8,6 +8,8 @@ |
8 | 8 |
#' @param beta Numeric. Concentration parameter for Phi. Default to be 0.5 |
9 | 9 |
#' @param delta Numeric / Numeric vector. Concentration parameter for Theta. If input as a single numeric value, symmetric values for beta distribution are specified; if input as a vector of lenght 2, the two values will be the shape1 and shape2 paramters of the beta distribution respectively |
10 | 10 |
#' @param seed Integer. Passed to set.seed(). Default to be 12345. If NULL, no calls to `set.seed()` are made. |
11 |
+#' @return A list object containing the real expression matrix and contamination |
|
12 |
+#' expression matrix as well as other parameters used in the simulation. |
|
11 | 13 |
#' @examples |
12 | 14 |
#' contamination.sim = simulateContaminatedMatrix( K=3, delta=c(1,9)) |
13 | 15 |
#' contamination.sim = simulateContaminatedMatrix( K=3, delta = 1) |
... | ... |
@@ -15,13 +17,13 @@ |
15 | 17 |
simulateContaminatedMatrix = function(C=300, G=100, K=3, N.Range=c(500,1000), beta = 0.5, delta=c(1,2), seed=12345) { |
16 | 18 |
|
17 | 19 |
if (!is.null(seed)) { |
18 |
- set.seed(seed) |
|
20 |
+ set.seed(seed) |
|
19 | 21 |
} |
20 | 22 |
|
21 |
- if(length(delta)==1) { |
|
22 |
- cp.byC = rbeta(n=C, shape1=delta, shape2=delta) |
|
23 |
+ if(length(delta) == 1 ) { |
|
24 |
+ cp.byC = rbeta(n=C, shape1=delta, shape2=delta) |
|
23 | 25 |
} else { |
24 |
- cp.byC = rbeta(n=C, shape1=delta[1], shape2=delta[2] ) |
|
26 |
+ cp.byC = rbeta(n=C, shape1=delta[1], shape2=delta[2] ) |
|
25 | 27 |
} |
26 | 28 |
|
27 | 29 |
z = sample(1:K, size=C, replace=TRUE) |
... | ... |
@@ -32,21 +34,21 @@ simulateContaminatedMatrix = function(C=300, G=100, K=3, N.Range=c(500,1000), be |
32 | 34 |
} |
33 | 35 |
|
34 | 36 |
|
35 |
- N.byC = sample(min(N.Range):max(N.Range), size=C , replace=TRUE ) |
|
37 |
+ N.byC = sample(min(N.Range):max(N.Range), size=C, replace=TRUE ) |
|
36 | 38 |
cN.byC = sapply(1:C, function(i) rbinom(n=1, size=N.byC[i], p=cp.byC[i] ) ) |
37 | 39 |
rN.byC = N.byC - cN.byC |
38 | 40 |
|
39 | 41 |
phi = rdirichlet(K, rep(beta, G)) |
40 | 42 |
|
41 | 43 |
|
42 |
- # sample real expressed count matrix |
|
44 |
+ ## sample real expressed count matrix |
|
43 | 45 |
cell.rmat = sapply(1:C, function(i) stats::rmultinom(1, size=rN.byC[i], prob=phi[z[i],] ) ) |
44 | 46 |
|
45 | 47 |
rownames(cell.rmat) = paste0("Gene_", 1:G) |
46 | 48 |
colnames(cell.rmat) = paste0("Cell_", 1:C) |
47 | 49 |
|
48 | 50 |
|
49 |
- # sample contamination count matrix |
|
51 |
+ ## sample contamination count matrix |
|
50 | 52 |
n.G.by.K = rowSums(cell.rmat) - colSumByGroup(cell.rmat, group=z, K=K) |
51 | 53 |
eta = normalizeCounts(counts=n.G.by.K, normalize="proportion") |
52 | 54 |
|
... | ... |
@@ -67,17 +69,17 @@ simulateContaminatedMatrix = function(C=300, G=100, K=3, N.Range=c(500,1000), be |
67 | 69 |
# eta Numeric matrix. Rows represent features and columns represent cell populations |
68 | 70 |
# theta Numeric vector. Proportion of truely expressed transcripts |
69 | 71 |
decon.calcLL = function(counts, z, phi, eta, theta){ |
70 |
- #ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + 1e-20 ) ) # when dist_mat are K x G matrices |
|
71 |
- ll = sum( t(counts) * log( theta * t(phi)[z,] + (1-theta) * t(eta)[z,] + 1e-20 )) |
|
72 |
- return(ll) |
|
72 |
+ #ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + 1e-20 ) ) # when dist_mat are K x G matrices |
|
73 |
+ ll = sum( t(counts) * log( theta * t(phi)[z,] + (1-theta) * t(eta)[z,] + 1e-20 )) |
|
74 |
+ return(ll) |
|
73 | 75 |
} |
74 | 76 |
|
75 | 77 |
# This function calculates the log-likelihood of background distribution decontamination |
76 | 78 |
# |
77 | 79 |
# bgDist Numeric matrix. Rows represent feature and columns are the times that the background-distribution has been replicated. |
78 | 80 |
bg.calcLL = function(counts, cellDist, bgDist, theta){ |
79 |
- ll = sum( t(counts) * log( theta * t(cellDist) + (1-theta) * t(bgDist) + 1e-20) ) |
|
80 |
- return(ll) |
|
81 |
+ ll = sum( t(counts) * log( theta * t(cellDist) + (1-theta) * t(bgDist) + 1e-20) ) |
|
82 |
+ return(ll) |
|
81 | 83 |
} |
82 | 84 |
|
83 | 85 |
|
... | ... |
@@ -88,24 +90,24 @@ bg.calcLL = function(counts, cellDist, bgDist, theta){ |
88 | 90 |
# theta Numeric vector. Proportion of truely expressed transctripts |
89 | 91 |
cD.calcEMDecontamination = function(counts, phi, eta, theta, z, K, beta, delta ) { |
90 | 92 |
|
91 |
- # Notes: use fix-point iteration to update prior for theta, no need to feed delta anymore |
|
92 |
- log.Pr = log( t(phi)[z,] + 1e-20) + log(theta + 1e-20 ) |
|
93 |
- log.Pc = log( t(eta)[z,] + 1e-20) + log(1-theta + 1e-20) |
|
93 |
+ ## Notes: use fix-point iteration to update prior for theta, no need to feed delta anymore |
|
94 |
+ log.Pr = log( t(phi)[z,] + 1e-20) + log(theta + 1e-20 ) |
|
95 |
+ log.Pc = log( t(eta)[z,] + 1e-20) + log(1-theta + 1e-20) |
|
94 | 96 |
|
95 |
- Pr = exp(log.Pr) / ( exp(log.Pr) + exp(log.Pc) ) |
|
96 |
- Pc = 1 - Pr |
|
97 |
- delta.v2 = MCMCprecision::fit_dirichlet( matrix( c( Pr, Pc) , ncol = 2 ) )$alpha |
|
98 |
- |
|
99 |
- est.rmat = t(Pr) * counts |
|
100 |
- rn.G.by.K = colSumByGroup.numeric(est.rmat, z, K) |
|
101 |
- cn.G.by.K = rowSums(rn.G.by.K) - rn.G.by.K |
|
97 |
+ Pr = exp(log.Pr) / ( exp(log.Pr) + exp(log.Pc) ) |
|
98 |
+ Pc = 1 - Pr |
|
99 |
+ delta.v2 = MCMCprecision::fit_dirichlet( matrix( c( Pr, Pc) , ncol = 2 ) )$alpha |
|
102 | 100 |
|
103 |
- #update parameters |
|
104 |
- theta = (colSums(est.rmat) + delta.v2[1]) / (colSums(counts) + sum(delta.v2) ) |
|
105 |
- phi = normalizeCounts(rn.G.by.K, normalize="proportion", pseudocount.normalize =beta ) |
|
106 |
- eta = normalizeCounts(cn.G.by.K, normalize="proportion", pseudocount.normalize = beta) |
|
101 |
+ est.rmat = t(Pr) * counts |
|
102 |
+ rn.G.by.K = colSumByGroup.numeric(est.rmat, z, K) |
|
103 |
+ cn.G.by.K = rowSums(rn.G.by.K) - rn.G.by.K |
|
107 | 104 |
|
108 |
- return( list("est.rmat"=est.rmat, "theta"=theta, "phi"=phi, "eta"=eta, "delta"=delta.v2 ) ) |
|
105 |
+ ## Update parameters |
|
106 |
+ theta = (colSums(est.rmat) + delta.v2[1]) / (colSums(counts) + sum(delta.v2) ) |
|
107 |
+ phi = normalizeCounts(rn.G.by.K, normalize="proportion", pseudocount.normalize =beta ) |
|
108 |
+ eta = normalizeCounts(cn.G.by.K, normalize="proportion", pseudocount.normalize = beta) |
|
109 |
+ |
|
110 |
+ return( list("est.rmat"=est.rmat, "theta"=theta, "phi"=phi, "eta"=eta, "delta"=delta.v2 ) ) |
|
109 | 111 |
} |
110 | 112 |
|
111 | 113 |
|
... | ... |
@@ -113,11 +115,11 @@ cD.calcEMDecontamination = function(counts, phi, eta, theta, z, K, beta, delta |
113 | 115 |
# |
114 | 116 |
cD.calcEMbgDecontamination = function(counts, cellDist, bgDist, theta, beta, delta){ |
115 | 117 |
|
116 |
- meanN.by.C = apply(counts, 2, mean) |
|
117 |
- log.Pr = log( t(cellDist) + 1e-20) + log( theta + 1e-20) # + log( t(counts) / meanN.by.C ) # better when without panelty |
|
118 |
- log.Pc = log( t(bgDist) + 1e-20) + log( 1-theta + 2e-20) |
|
118 |
+ meanN.by.C = apply(counts, 2, mean) |
|
119 |
+ log.Pr = log( t(cellDist) + 1e-20) + log( theta + 1e-20) # + log( t(counts) / meanN.by.C ) # better when without panelty |
|
120 |
+ log.Pc = log( t(bgDist) + 1e-20) + log( 1-theta + 2e-20) |
|
119 | 121 |
|
120 |
- Pr = exp( log.Pr) / (exp(log.Pr) + exp( log.Pc) ) |
|
122 |
+ Pr = exp( log.Pr) / (exp(log.Pr) + exp( log.Pc) ) |
|
121 | 123 |
|
122 | 124 |
est.rmat = t(Pr) * counts |
123 | 125 |
|
... | ... |
@@ -125,7 +127,7 @@ cD.calcEMbgDecontamination = function(counts, cellDist, bgDist, theta, beta, del |
125 | 127 |
theta = (colSums(est.rmat) + delta) / (colSums(counts) + 2*delta) |
126 | 128 |
cellDist = normalizeCounts(est.rmat, normalize="proportion", pseudocount.normalize =beta) |
127 | 129 |
|
128 |
- return( list("est.rmat"=est.rmat, "theta"=theta, "cellDist"=cellDist) ) |
|
130 |
+ return( list("est.rmat"=est.rmat, "theta"=theta, "cellDist"=cellDist) ) |
|
129 | 131 |
} |
130 | 132 |
|
131 | 133 |
|
... | ... |
@@ -139,48 +141,48 @@ cD.calcEMbgDecontamination = function(counts, cellDist, bgDist, theta, beta, del |
139 | 141 |
#' @param logfile Character. Messages will be redirected to a file named `logfile`. If NULL, messages will be printed to stdout. Default NULL |
140 | 142 |
#' @param verbose Logical. Whether to print log messages. Default TRUE |
141 | 143 |
#' @param seed Integer. Passed to set.seed(). Default to be 1234567. If NULL, no calls to `set.seed()` are made. |
144 |
+#' @return A list object which contains the decontaminated count matrix and related parameters |
|
142 | 145 |
#' @examples |
143 | 146 |
#' decon.c = DecontX( counts = contamination.sim$rmat + contamination.sim$cmat, z=contamination.sim$z, max.iter=3) |
144 | 147 |
#' decon.bg = DecontX( counts=contamination.sim$rmat + contamination.sim$cmat, max.iter=3 ) |
145 | 148 |
#' @export |
146 |
-DecontX = function( counts , z=NULL, batch=NULL, max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) { |
|
149 |
+DecontX = function( counts, z=NULL, batch=NULL, max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) { |
|
147 | 150 |
|
148 | 151 |
|
149 |
- if ( !is.null(batch) ) { |
|
150 |
- |
|
151 |
- # Set result lists upfront for all cells from different batches |
|
152 |
- logLikelihood = c() |
|
153 |
- est.rmat = matrix(NA, ncol = ncol(counts) , nrow = nrow(counts ) , dimnames = list( rownames( counts) , colnames( counts ) ) ) |
|
154 |
- theta = rep(NA, ncol(counts ) ) |
|
155 |
- est.conp = rep(NA, ncol(counts) ) |
|
156 |
- |
|
157 |
- batch.index = unique(batch) |
|
158 |
- |
|
159 |
- for ( BATCH in batch.index ) { |
|
160 |
- counts.BATCH = counts[, batch == BATCH ] |
|
161 |
- if( !is.null(z) ) { z.BATCH = z[ batch == BATCH ] } else { z.BATCH = z } |
|
162 |
- res.BATCH = DecontXoneBatch( counts = counts.BATCH, z = z.BATCH, batch = BATCH ,max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed ) |
|
163 |
- |
|
164 |
- est.rmat[, batch == BATCH ] = res.BATCH$res.list$est.rmat |
|
165 |
- est.conp[ batch == BATCH ] = res.BATCH$res.list$est.conp |
|
166 |
- theta[ batch == BATCH ] = res.BATCH$res.list$theta |
|
167 |
- |
|
168 |
- if( is.null(logLikelihood) ) { |
|
169 |
- logLikelihood = res.BATCH$res.list$logLikelihood |
|
170 |
- } else { |
|
171 |
- logLikelihood = addLogLikelihood( logLikelihood, res.BATCH$res.list$logLikelihood ) |
|
172 |
- } |
|
173 |
- } |
|
174 |
- |
|
175 |
- run.params = res.BATCH$run.params |
|
176 |
- method = res.BATCH$method |
|
177 |
- res.list = list( "logLikelihood"=logLikelihood, "est.rmat"=est.rmat,"est.conp"= est.conp, "theta"=theta ) |
|
152 |
+ if ( !is.null(batch) ) { |
|
178 | 153 |
|
179 |
- return( list("run.params"=run.params, "res.list"=res.list, "method"=method ) ) |
|
180 |
- } |
|
154 |
+ ## Set result lists upfront for all cells from different batches |
|
155 |
+ logLikelihood = c() |
|
156 |
+ est.rmat = matrix(NA, ncol = ncol(counts), nrow = nrow(counts ), dimnames = list( rownames( counts), colnames( counts ) ) ) |
|
157 |
+ theta = rep(NA, ncol(counts ) ) |
|
158 |
+ est.conp = rep(NA, ncol(counts) ) |
|
181 | 159 |
|
182 |
- return( DecontXoneBatch( counts = counts, z = z, max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed ) ) |
|
160 |
+ batch.index = unique(batch) |
|
183 | 161 |
|
162 |
+ for ( BATCH in batch.index ) { |
|
163 |
+ counts.BATCH = counts[, batch == BATCH ] |
|
164 |
+ if( !is.null(z) ) { z.BATCH = z[ batch == BATCH ] } else { z.BATCH = z } |
|
165 |
+ res.BATCH = DecontXoneBatch( counts = counts.BATCH, z = z.BATCH, batch = BATCH ,max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed ) |
|
166 |
+ |
|
167 |
+ est.rmat[, batch == BATCH ] = res.BATCH$res.list$est.rmat |
|
168 |
+ est.conp[ batch == BATCH ] = res.BATCH$res.list$est.conp |
|
169 |
+ theta[ batch == BATCH ] = res.BATCH$res.list$theta |
|
170 |
+ |
|
171 |
+ if( is.null(logLikelihood) ) { |
|
172 |
+ logLikelihood = res.BATCH$res.list$logLikelihood |
|
173 |
+ } else { |
|
174 |
+ logLikelihood = addLogLikelihood( logLikelihood, res.BATCH$res.list$logLikelihood ) |
|
175 |
+ } |
|
176 |
+ } |
|
177 |
+ |
|
178 |
+ run.params = res.BATCH$run.params |
|
179 |
+ method = res.BATCH$method |
|
180 |
+ res.list = list( "logLikelihood"=logLikelihood, "est.rmat"=est.rmat,"est.conp"= est.conp, "theta"=theta ) |
|
181 |
+ |
|
182 |
+ return( list("run.params"=run.params, "res.list"=res.list, "method"=method ) ) |
|
183 |
+ } |
|
184 |
+ |
|
185 |
+ return( DecontXoneBatch( counts = counts, z = z, max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed ) ) |
|
184 | 186 |
} |
185 | 187 |
|
186 | 188 |
|
... | ... |
@@ -188,185 +190,186 @@ DecontX = function( counts , z=NULL, batch=NULL, max.iter=200, beta=1e-6, delta |
188 | 190 |
|
189 | 191 |
# This function updates decontamination for one batch |
190 | 192 |
# |
191 |
-DecontXoneBatch = function(counts, z=NULL, batch= NULL, max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) { |
|
193 |
+DecontXoneBatch = function(counts, z=NULL, batch=NULL, max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) { |
|
192 | 194 |
|
193 |
- checkCounts.decon(counts) |
|
194 |
- checkParameters.decon(proportion.prior=delta, distribution.prior=beta) |
|
195 |
+ checkCounts.decon(counts) |
|
196 |
+ checkParameters.decon(proportionPrior=delta, distributionPrior=beta) |
|
195 | 197 |
|
196 |
- nG = nrow(counts) |
|
197 |
- nC = ncol(counts) |
|
198 |
- K = length(unique(z)) |
|
198 |
+ nG = nrow(counts) |
|
199 |
+ nC = ncol(counts) |
|
200 |
+ K = length(unique(z)) |
|
199 | 201 |
|
200 |
- if ( is.null(z) ) { |
|
201 |
- decon.method = "background" |
|
202 |
- } else { |
|
203 |
- decon.method = "clustering" |
|
204 |
- z = processCellLabels(z, num.cells= nC) |
|
205 |
- } |
|
202 |
+ if ( is.null(z) ) { |
|
203 |
+ decon.method = "background" |
|
204 |
+ } else { |
|
205 |
+ decon.method = "clustering" |
|
206 |
+ z = processCellLabels(z, num.cells= nC) |
|
207 |
+ } |
|
206 | 208 |
|
207 |
- # Set seed for initialization |
|
208 |
- if (!is.null(seed)) { |
|
209 |
- set.seed(seed) |
|
210 |
- } |
|
209 |
+ ## Set seed for initialization |
|
210 |
+ if (!is.null(seed)) { |
|
211 |
+ set.seed(seed) |
|
212 |
+ } |
|
211 | 213 |
|
212 | 214 |
|
213 |
- iter = 1L |
|
214 |
- num.iter.without.improvement = 0L |
|
215 |
- stop.iter = 3L |
|
215 |
+ iter = 1L |
|
216 |
+ num.iter.without.improvement = 0L |
|
217 |
+ stop.iter = 3L |
|
216 | 218 |
|
217 | 219 |
|
218 |
- logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
219 |
- logMessages("Start DecontX. Decontamination", logfile=logfile, append=TRUE, verbose=verbose) |
|
220 |
- if ( !is.null(batch) ) { logMessages("batch: ", batch, logfile=logfile, append=TRUE, verbose=verbose) } |
|
221 |
- logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
222 |
- start.time = Sys.time() |
|
220 |
+ logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
221 |
+ logMessages("Start DecontX. Decontamination", logfile=logfile, append=TRUE, verbose=verbose) |
|
222 |
+ if ( !is.null(batch) ) { logMessages("batch: ", batch, logfile=logfile, append=TRUE, verbose=verbose) } |
|
223 |
+ logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
224 |
+ start.time = Sys.time() |
|
223 | 225 |
|
224 |
- if( decon.method == "clustering") { |
|
226 |
+ if( decon.method == "clustering") { |
|
225 | 227 |
|
226 |
- # Initialization |
|
227 |
- theta = runif(nC, min = 0.1, max = 0.5) |
|
228 |
- est.rmat = t (t(counts) * theta ) |
|
229 |
- phi = colSumByGroup.numeric(est.rmat, z, K) |
|
230 |
- eta = rowSums(phi) - phi |
|
231 |
- phi = normalizeCounts(phi, normalize="proportion", pseudocount.normalize =beta ) |
|
232 |
- eta = normalizeCounts(eta, normalize="proportion", pseudocount.normalize = beta) |
|
233 |
- ll = c() |
|
228 |
+ ## Initialization |
|
229 |
+ theta = stats::runif(nC, min = 0.1, max = 0.5) |
|
230 |
+ est.rmat = t (t(counts) * theta ) |
|
231 |
+ phi = colSumByGroup.numeric(est.rmat, z, K) |
|
232 |
+ eta = rowSums(phi) - phi |
|
233 |
+ phi = normalizeCounts(phi, normalize="proportion", pseudocount.normalize =beta ) |
|
234 |
+ eta = normalizeCounts(eta, normalize="proportion", pseudocount.normalize = beta) |
|
235 |
+ ll = c() |
|
234 | 236 |
|
235 | 237 |
|
236 |
- ll.round = decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta ) |
|
238 |
+ ll.round = decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta ) |
|
237 | 239 |
|
238 | 240 |
|
239 |
- # EM updates |
|
240 |
- while (iter <= max.iter & num.iter.without.improvement <= stop.iter ) { |
|
241 |
+ ## EM updates |
|
242 |
+ while (iter <= max.iter & num.iter.without.improvement <= stop.iter ) { |
|
241 | 243 |
|
244 |
+ next.decon = cD.calcEMDecontamination(counts=counts, phi=phi, eta=eta, theta=theta, z=z, K=K, beta=beta, delta=delta) |
|
242 | 245 |
|
243 |
- next.decon = cD.calcEMDecontamination(counts=counts, phi=phi, eta=eta, theta=theta, z=z, K=K, beta=beta, delta=delta) |
|
246 |
+ theta = next.decon$theta |
|
247 |
+ phi = next.decon$phi |
|
248 |
+ eta = next.decon$eta |
|
249 |
+ delta = next.decon$delta |
|
244 | 250 |
|
245 |
- theta = next.decon$theta |
|
246 |
- phi = next.decon$phi |
|
247 |
- eta = next.decon$eta |
|
248 |
- delta = next.decon$delta |
|
251 |
+ ## Calculate log-likelihood |
|
252 |
+ ll.temp = decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta ) |
|
253 |
+ ll = c(ll, ll.temp) |
|
254 |
+ ll.round = c( ll.round, round( ll.temp, 2) ) |
|
249 | 255 |
|
250 |
- # Calculate log-likelihood |
|
251 |
- ll.temp = decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta ) |
|
252 |
- ll = c(ll, ll.temp) |
|
253 |
- ll.round = c( ll.round, round( ll.temp, 2) ) |
|
256 |
+ if ( round( ll.temp, 2) > ll.round[ iter ] | iter == 1 ) { |
|
257 |
+ num.iter.without.improvement = 1L |
|
258 |
+ } else { |
|
259 |
+ num.iter.without.improvement = num.iter.without.improvement + 1L |
|
260 |
+ } |
|
261 |
+ iter = iter + 1L |
|
254 | 262 |
|
255 |
- if ( round( ll.temp, 2) > ll.round[ iter ] | iter== 1 ) { |
|
256 |
- num.iter.without.improvement = 1L |
|
257 |
- } else { |
|
258 |
- num.iter.without.improvement = num.iter.without.improvement + 1L |
|
259 |
- } |
|
260 |
- iter = iter + 1L |
|
263 |
+ } |
|
261 | 264 |
|
262 |
- } |
|
265 |
+ } |
|
263 | 266 |
|
264 |
- } |
|
267 |
+ if ( decon.method == "background") { |
|
265 | 268 |
|
266 |
- if ( decon.method == "background") { |
|
269 |
+ # Initialization |
|
270 |
+ theta = runif( nC, min =0.1, max=0.5) |
|
271 |
+ est.rmat = t( t(counts) *theta) |
|
272 |
+ bgDist = rowSums( counts ) / sum( counts) |
|
273 |
+ bgDist = matrix( rep( bgDist, nC), ncol=nC) |
|
274 |
+ cellDist = normalizeCounts( est.rmat, normalize="proportion", pseudocount.normalize=beta) |
|
275 |
+ ll =c() |
|
267 | 276 |
|
268 |
- # Initialization |
|
269 |
- theta = runif( nC, min =0.1, max=0.5) |
|
270 |
- est.rmat = t( t(counts) *theta) |
|
271 |
- bgDist = rowSums( counts ) / sum( counts) |
|
272 |
- bgDist = matrix( rep( bgDist, nC), ncol=nC) |
|
273 |
- cellDist = normalizeCounts( est.rmat, normalize="proportion", pseudocount.normalize=beta) |
|
274 |
- ll =c() |
|
275 | 277 |
|
278 |
+ ll.round = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta) |
|
276 | 279 |
|
277 |
- ll.round = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta) |
|
280 |
+ ## EM updates |
|
281 |
+ while (iter <= max.iter & num.iter.without.improvement <= stop.iter ) { |
|
278 | 282 |
|
279 | 283 |
|
280 |
- # EM updates |
|
281 |
- while (iter <= max.iter & num.iter.without.improvement <= stop.iter ) { |
|
284 |
+ next.decon = cD.calcEMbgDecontamination(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta, beta=beta, delta=delta) |
|
282 | 285 |
|
286 |
+ theta = next.decon$theta |
|
287 |
+ cellDist = next.decon$cellDist |
|
283 | 288 |
|
284 |
- next.decon = cD.calcEMbgDecontamination(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta, beta=beta, delta=delta) |
|
285 | 289 |
|
286 |
- theta = next.decon$theta |
|
287 |
- cellDist = next.decon$cellDist |
|
290 |
+ ## Calculate log-likelihood |
|
291 |
+ ll.temp = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta) |
|
292 |
+ ll = c(ll, ll.temp) |
|
293 |
+ ll.round = c( ll.round, round( ll.temp, 2) ) |
|
288 | 294 |
|
289 |
- # Calculate log-likelihood |
|
290 |
- ll.temp = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta) |
|
291 |
- ll = c(ll, ll.temp) |
|
292 |
- ll.round = c( ll.round, round( ll.temp, 2) ) |
|
295 |
+ if ( round( ll.temp, 2) > ll.round[ iter ] | iter == 1 ) { |
|
296 |
+ num.iter.without.improvement = 1L |
|
297 |
+ } else { |
|
298 |
+ num.iter.without.improvement = num.iter.without.improvement + 1L |
|
299 |
+ } |
|
300 |
+ iter = iter + 1L |
|
301 |
+ } |
|
293 | 302 |
|
294 |
- if ( round( ll.temp, 2) > ll.round[ iter ] | iter== 1 ) { |
|
295 |
- num.iter.without.improvement = 1L |
|
296 |
- } else { |
|
297 |
- num.iter.without.improvement = num.iter.without.improvement + 1L |
|
298 | 303 |
} |
299 |
- iter = iter + 1L |
|
300 |
- } |
|
301 | 304 |
|
302 |
- } |
|
305 |
+ res.conp = 1- colSums(next.decon$est.rmat) / colSums(counts) |
|
303 | 306 |
|
304 |
- res.conp = 1- colSums(next.decon$est.rmat) / colSums(counts) |
|
307 |
+ end.time = Sys.time() |
|
308 |
+ logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
309 |
+ logMessages("Completed DecontX. Total time:", format(difftime(end.time, start.time)), logfile=logfile, append=TRUE, verbose=verbose) |
|
310 |
+ if ( !is.null(batch) ) { logMessages("batch: ", batch, logfile=logfile, append=TRUE, verbose=verbose) } |
|
311 |
+ logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
305 | 312 |
|
313 |
+ run.params = list("beta"=beta, "delta"=delta, "iteration"=iter-1L, "seed"=seed) |
|
306 | 314 |
|
307 |
- end.time = Sys.time() |
|
308 |
- logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
309 |
- logMessages("Completed DecontX. Total time:", format(difftime(end.time, start.time)), logfile=logfile, append=TRUE, verbose=verbose) |
|
310 |
- if ( !is.null(batch) ) { logMessages("batch: ", batch, logfile=logfile, append=TRUE, verbose=verbose) } |
|
311 |
- logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) |
|
312 |
- |
|
313 |
- run.params = list("beta" = beta, "delta" = delta, "iteration"=iter-1L, "seed"=seed) |
|
314 |
- |
|
315 |
- res.list = list("logLikelihood" = ll, "est.rmat"=next.decon$est.rmat , "est.conp"= res.conp, "theta"=theta , "delta"=delta) |
|
316 |
- if( decon.method=="clustering" ) { |
|
317 |
- posterior.params = list( "est.GeneDist"=phi, "est.ConDist"=eta ) |
|
318 |
- res.list = append( res.list , posterior.params ) |
|
315 |
+ res.list = list("logLikelihood" = ll, "est.rmat"=next.decon$est.rmat , "est.conp"= res.conp, "theta"=theta , "delta"=delta) |
|
316 |
+ if( decon.method=="clustering" ) { |
|
317 |
+ posterior.params = list( "est.GeneDist"=phi, "est.ConDist"=eta ) |
|
318 |
+ res.list = append( res.list , posterior.params ) |
|
319 | 319 |
} |
320 | 320 |
|
321 |
- return(list("run.params"=run.params, "res.list"=res.list, "method"=decon.method )) |
|
321 |
+ return(list("run.params"=run.params, "res.list"=res.list, "method"=decon.method )) |
|
322 | 322 |
} |
323 | 323 |
|
324 | 324 |
|
325 |
-### checking |
|
325 |
+ |
|
326 | 326 |
# Make sure provided parameters are the right type and value range |
327 |
-checkParameters.decon = function(proportion.prior, distribution.prior) { |
|
328 |
- if( length(proportion.prior) > 1 | proportion.prior <= 0 ) { |
|
329 |
- stop("'delta' should be a single positive value.") |
|
330 |
- } |
|
331 |
- if( length( distribution.prior) > 1 | distribution.prior <=0 ) { |
|
332 |
- stop("'beta' should be a single positive value.") |
|
333 |
- } |
|
327 |
+checkParameters.decon = function(proportionPrior, distributionPrior) { |
|
328 |
+ if( length(proportionPrior) > 1 | proportionPrior <= 0 ) { |
|
329 |
+ stop("'delta' should be a single positive value.") |
|
330 |
+ } |
|
331 |
+ if( length( distributionPrior) > 1 | distributionPrior <=0 ) { |
|
332 |
+ stop("'beta' should be a single positive value.") |
|
333 |
+ } |
|
334 | 334 |
} |
335 | 335 |
|
336 |
+ |
|
336 | 337 |
# Make sure provided rmat is the right type |
337 | 338 |
checkCounts.decon = function(counts) { |
338 |
- if ( sum(is.na(counts)) >0 ) { |
|
339 |
- stop("Missing value in 'counts' matrix.") |
|
340 |
- } |
|
339 |
+ if ( sum(is.na(counts)) >0 ) { |
|
340 |
+ stop("Missing value in 'counts' matrix.") |
|
341 |
+ } |
|
341 | 342 |
} |
342 | 343 |
|
344 |
+ |
|
343 | 345 |
# Make sure provided cell labels are the right type |
344 |
-processCellLabels = function(z, num.cells) { |
|
345 |
- if ( length(z) != num.cells ) { |
|
346 |
- stop("'z' must be of the same length as the number of cells in the 'counts' matrix.") |
|
347 |
- } |
|
348 |
- if( length(unique(z)) <2 ) { |
|
349 |
- stop("'z' must have at least 2 different values.") # Even though everything runs smoothly when length(unique(z)) == 1, result is not trustful |
|
350 |
- } |
|
351 |
- if( !is.factor(z) ) { |
|
352 |
- z = plyr::mapvalues(z, unique(z), 1:length(unique(z)) ) |
|
353 |
- z = as.factor(z) |
|
354 |
- } |
|
355 |
- return(z) |
|
346 |
+processCellLabels = function(z, num.cells) { |
|
347 |
+ if ( length(z) != num.cells ) { |
|
348 |
+ stop("'z' must be of the same length as the number of cells in the 'counts' matrix.") |
|
349 |
+ } |
|
350 |
+ if( length(unique(z)) <2 ) { |
|
351 |
+ stop("'z' must have at least 2 different values.") # Even though everything runs smoothly when length(unique(z)) == 1, result is not trustful |
|
352 |
+ } |
|
353 |
+ if( !is.factor(z) ) { |
|
354 |
+ z = plyr::mapvalues(z, unique(z), 1:length(unique(z)) ) |
|
355 |
+ z = as.factor(z) |
|
356 |
+ } |
|
357 |
+ return(z) |
|
356 | 358 |
} |
357 | 359 |
|
360 |
+ |
|
358 | 361 |
# Add two (veried-length) vectors of logLikelihood |
359 |
-addLogLikelihood = function( ll.a, ll.b ) { |
|
360 |
- length.a = length(ll.a ) |
|
361 |
- length.b = length(ll.b) |
|
362 |
- |
|
363 |
- if( length.a >= length.b ) { |
|
364 |
- ll.b = c( ll.b, rep( ll.b[length.b] , length.a - length.b ) ) |
|
365 |
- ll = ll.a + ll.b |
|
366 |
- } else { |
|
367 |
- ll.a = c( ll.a, rep( ll.a[length.a], length.b - length.a ) ) |
|
368 |
- ll = ll.a + ll.b |
|
369 |
- } |
|
362 |
+addLogLikelihood = function( ll.a, ll.b ) { |
|
363 |
+ length.a = length(ll.a ) |
|
364 |
+ length.b = length(ll.b) |
|
365 |
+ |
|
366 |
+ if( length.a >= length.b ) { |
|
367 |
+ ll.b = c( ll.b, rep( ll.b[length.b] , length.a - length.b ) ) |
|
368 |
+ ll = ll.a + ll.b |
|
369 |
+ } else { |
|
370 |
+ ll.a = c( ll.a, rep( ll.a[length.a], length.b - length.a ) ) |
|
371 |
+ ll = ll.a + ll.b |
|
372 |
+ } |
|
370 | 373 |
|
371 |
- return( ll ) |
|
374 |
+ return( ll ) |
|
372 | 375 |
} |
373 | 376 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,56 @@ |
1 |
+#' @title Gene set enrichment |
|
2 |
+#' @description Identify and return significantly-enriched terms for each gene |
|
3 |
+#' module in a Celda object. Performs gene set enrichment analysis for Celda |
|
4 |
+#' identified modules using the enrichR package. |
|
5 |
+#' @author Ahmed Youssef |
|
6 |
+#' @param counts Integer count matrix. Rows represent genes and columns |
|
7 |
+#' represent cells. Row names of the matrix should be gene names. |
|
8 |
+#' @param celdaModel Celda object of class `celda_G` or `celda_CG`. |
|
9 |
+#' @param databases Character vector. Name of reference database. Available |
|
10 |
+#' databases can be viewed by running \code{enrichR::listEnrichrDbs()}. |
|
11 |
+#' @param fdr False discovery rate (FDR). Numeric. Cutoff value for adjusted |
|
12 |
+#' p-value, terms with FDR below this value are considered significantly |
|
13 |
+#' enriched. |
|
14 |
+#' @return List of length 'L' where each member contains the significantly |
|
15 |
+#' enriched terms for the corresponding module. |
|
16 |
+#' @examples |
|
17 |
+#' library(M3DExampleData) |
|
18 |
+#' counts <- M3DExampleData::Mmus_example_list$data |
|
19 |
+#' #subset 100 genes for fast clustering |
|
20 |
+#' counts <- counts[sample(seq_len(nrow(counts)), |
|
21 |
+#' size = 100),] |
|
22 |
+#' #cluster genes into 10 modules for quick demo |
|
23 |
+#' cm <- celda_G(counts = as.matrix(counts), L = 10, verbose = FALSE) |
|
24 |
+#' gse <- geneSetEnrich(counts, |
|
25 |
+#' cm, |
|
26 |
+#' databases = c('GO_Biological_Process_2018','GO_Molecular_Function_2018')) |
|
27 |
+#' @export |
|
28 |
+geneSetEnrich <- function(counts, celdaModel, databases, fdr = 0.05) { |
|
29 |
+ #check for correct celda object |
|
30 |
+ if (!(class(celdaModel) %in% c("celda_G", "celda_CG"))) { |
|
31 |
+ stop("No gene modules in celda object. ", |
|
32 |
+ "Please provide object of class celda_G or celda_CG.") |
|
33 |
+ } |
|
34 |
+ |
|
35 |
+ #initialize list with one entry for each gene module |
|
36 |
+ modules <- vector("list", length = celdaModel@params$L) |
|
37 |
+ |
|
38 |
+ #create dataframe with gene-module associations |
|
39 |
+ genes <- data.frame(gene = rownames(counts), module = celdaModel@clusters$y) |
|
40 |
+ |
|
41 |
+ #iterate over each module, get genes in that module, add to list |
|
42 |
+ for (i in seq_len(celdaModel@params$L)) { |
|
43 |
+ modules[[i]] <- as.character(genes[genes$module == i, 'gene']) |
|
44 |
+ } |
|
45 |
+ |
|
46 |
+ #enrichment analysis |
|
47 |
+ enrichment <- lapply(modules, function(module) { |
|
48 |
+ invisible(capture.output(table <- enrichR::enrichr(genes = module, |
|
49 |
+ databases = databases))) |
|
50 |
+ table <- Reduce(f = rbind, x = table) |
|
51 |
+ table[table$Adjusted.P.value < fdr, 'Term'] |
|
52 |
+ }) |
|
53 |
+ |
|
54 |
+ #return results as a list |
|
55 |
+ return(enrichment) |
|
56 |
+} |
... | ... |
@@ -195,9 +195,6 @@ plotDimReduceCluster = function(dim1, dim2, cluster, size = 1, xlab = "Dimension |
195 | 195 |
} |
196 | 196 |
|
197 | 197 |
|
198 |
- |
|
199 |
- |
|
200 |
- |
|
201 | 198 |
# Run the t-SNE algorithm for dimensionality reduction |
202 | 199 |
# |
203 | 200 |
# @param norm Normalized count matrix. |
... | ... |
@@ -216,3 +213,12 @@ calculateTsne = function(norm, perplexity=20, max.iter=2500, seed=12345, do.pca= |
216 | 213 |
return(res) |
217 | 214 |
} |
218 | 215 |
|
216 |
+ |
|
217 |
+# Run the umap algorithm for dimensionality reduction |
|
218 |
+# |
|
219 |
+# @param norm Normalized count matrix. |
|
220 |
+# @param umap.config An object of class umap.config, containing configuration parameters to be passed to umap. Default umap::umap.defualts. |
|
221 |
+calculateUmap = function(norm, umap.config=umap::umap.defaults) { |
|
222 |
+ return(umap::umap(norm, umap.config)$layout) |
|
223 |
+} |
|
224 |
+ |
... | ... |
@@ -15,11 +15,11 @@ |
15 | 15 |
To install the most recent release of celda (used in the preprint version of the celda paper) via devtools: |
16 | 16 |
``` |
17 | 17 |
library(devtools) |
18 |
-install_github("compbiomed/celda@v0.6") |
|
18 |
+install_github("campbio/celda@v0.6") |
|
19 | 19 |
``` |
20 | 20 |
The most up-to-date (but potentially less stable) version of celda can similarly be installed with: |
21 | 21 |
``` |
22 |
-install_github("compbiomed/celda") |
|
22 |
+install_github("campbio/celda@devel") |
|
23 | 23 |
``` |
24 | 24 |
|
25 | 25 |
**NOTE** On OSX, devtools::install_github() requires installation of **libgit2.** This can be installed via homebrew: |
... | ... |
@@ -37,11 +37,34 @@ An analysis example using celda with RNASeq via vignette('celda-analysis') |
37 | 37 |
### Decontamination with DecontX |
38 | 38 |
Highly expressed genes from various cells clusters will be expressed at low levels in other clusters in droplet-based systems due to contamination. DecontX will decompose an observed count matrix into a decontaminated expression matrix and a contamination matrix. The only other parameter needed is a vector of cell cluster labels. |
39 | 39 |
|
40 |
+To simulate two 300 (gene) x 100 (cell) count matrices from 3 different cell types with total reads per cell ranged from 5000 to 40000: one matrix being ture expression matrix (rmat), the other matrix being contamination count matrix (cmat) |
|
40 | 41 |
``` |
41 |
-new.counts = DecontX( counts = counts, z = cell.label) |
|
42 |
+sim.con = simulateContaminatedMatrix( C = 100, G = 300, K = 3, N.Range= c(5000, 40000), seed = 9124) |
|
43 |
+true.contamination.percentage = colSums( sim.con$cmat ) / colSums( sim.con$cmat + sim.con$rmat ) |
|
44 |
+str(sim.con) |
|
45 |
+# rmat: simulated true expression (gene by cell) count matrix |
|
46 |
+# cmat: simulated contamination (gene by cell) count matrix |
|
47 |
+# N.by.C: total transcripts per cell |
|
48 |
+# z: cell type label |
|
49 |
+ |
|
50 |
+``` |
|
51 |
+Use DecontX to decompose the observed (contaminated) count matrix back into true expression matrix and a contamination matrix with specified cell label |
|
52 |
+``` |
|
53 |
+observed.mat = sim.con$rmat + sim.con$cmat |
|
54 |
+cell.label = sim.con$z |
|
55 |
+new.counts = DecontX( counts = observed.mat, z = cell.label, max.iter = 200, seed = 123) |
|
56 |
+str(new.counts) |
|
42 | 57 |
# Decontaminated matrix: new.counts$res.list$est.rmat |
43 | 58 |
# Percentage of contamination per cell: new.counts$res.list$est.conp |
59 |
+ |
|
44 | 60 |
``` |
61 |
+DecontX Performance check |
|
62 |
+``` |
|
63 |
+estimated.contamination.percentage = new.counts$res.list$est.conp |
|
64 |
+plot( true.contamination.percentage, estimated.contamination.percentage) ; abline(0,1) |
|
65 |
+``` |
|
66 |
+ |
|
67 |
+ |
|
45 | 68 |
|
46 | 69 |
## New Features and announcements |
47 | 70 |
The v0.4 release of celda represents a useable implementation of the various celda clustering models. |
... | ... |
@@ -27,6 +27,9 @@ DecontX(counts, z = NULL, batch = NULL, max.iter = 200, |
27 | 27 |
|
28 | 28 |
\item{seed}{Integer. Passed to set.seed(). Default to be 1234567. If NULL, no calls to `set.seed()` are made.} |
29 | 29 |
} |
30 |
+\value{ |
|
31 |
+A list object which contains the decontaminated count matrix and related parameters |
|
32 |
+} |
|
30 | 33 |
\description{ |
31 | 34 |
This function updates decontamination on dataset with multiple batches |
32 | 35 |
} |
... | ... |
@@ -11,7 +11,21 @@ celdaTsne(counts, celda.mod, max.cells = 25000, min.cluster.size = 100, |
11 | 11 |
\arguments{ |
12 | 12 |
\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.} |
13 | 13 |
|
14 |
-\item{celda.mod}{Celda model. Options available in `celda::available.models`.} |
|
14 |
+\item{celda.mod}{Celda object of class `celda_CG`.} |
|
15 |
+ |
|
16 |
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} |
|
17 |
+ |
|
18 |
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} |
|
19 |
+ |
|
20 |
+\item{modules}{Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.} |
|
21 |
+ |
|
22 |
+\item{perplexity}{Numeric. Perplexity parameter for tSNE. Default 20.} |
|
23 |
+ |
|
24 |
+\item{max.iter}{Integer. Maximum number of iterations in tSNE generation. Default 2500.} |
|
25 |
+ |
|
26 |
+\item{seed}{Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.} |
|
27 |
+ |
|
28 |
+\item{...}{Additional parameters.} |
|
15 | 29 |
|
16 | 30 |
\item{...}{Additional parameters.} |
17 | 31 |
} |
18 | 32 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,42 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/celda_C.R |
|
3 |
+\docType{methods} |
|
4 |
+\name{celdaUmap,celda_C-method} |
|
5 |
+\alias{celdaUmap,celda_C-method} |
|
6 |
+\title{umap for celda_C} |
|
7 |
+\usage{ |
|
8 |
+\S4method{celdaUmap}{celda_C}(counts, celda.mod, max.cells = 25000, |
|
9 |
+ min.cluster.size = 100, modules = NULL, |
|
10 |
+ umap.config = umap::umap.defaults) |
|
11 |
+} |
|
12 |
+\arguments{ |
|
13 |
+\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.} |
|
14 |
+ |
|
15 |
+\item{celda.mod}{Celda object of class `celda_C`.} |
|
16 |
+ |
|
17 |
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} |
|
18 |
+ |
|
19 |
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} |
|
20 |
+ |
|
21 |
+\item{initial.dims}{Integer. PCA will be used to reduce the dimentionality of the dataset. The top 'initial.dims' principal components will be used for tSNE. Default 20.} |
|
22 |
+ |
|
23 |
+\item{perplexity}{Numeric. Perplexity parameter for tSNE. Default 20.} |
|
24 |
+ |
|
25 |
+\item{max.iter}{Integer. Maximum number of iterations in tSNE generation. Default 2500.} |
|
26 |
+ |
|
27 |
+\item{seed}{Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.} |
|
28 |
+ |
|
29 |
+\item{...}{Additional parameters.} |
|
30 |
+} |
|
31 |
+\value{ |
|
32 |
+A two column matrix of t-SNE coordinates |
|
33 |
+} |
|
34 |
+\description{ |
|
35 |
+Embeds cells in two dimensions using umap based on a `celda_C` model. PCA on the normalized counts is used to reduce the number of features before applying umap. |
|
36 |
+} |
|
37 |
+\examples{ |
|
38 |
+umap.res = celdaUmap(celda.C.sim$counts, celda.C.mod) |
|
39 |
+} |
|
40 |
+\seealso{ |
|
41 |
+`celda_C()` for clustering cells and `celdaHeatmap()` for displaying expression |
|
42 |
+} |
0 | 43 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,36 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/celda_CG.R |
|
3 |
+\docType{methods} |
|
4 |
+\name{celdaUmap,celda_CG-method} |
|
5 |
+\alias{celdaUmap,celda_CG-method} |
|
6 |
+\title{umap for celda_CG} |
|
7 |
+\usage{ |
|
8 |
+\S4method{celdaUmap}{celda_CG}(counts, celda.mod, max.cells = 25000, |
|
9 |
+ min.cluster.size = 100, modules = NULL, |
|
10 |
+ umap.config = umap::umap.defaults) |
|
11 |
+} |
|
12 |
+\arguments{ |
|
13 |
+\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.} |
|
14 |
+ |
|
15 |
+\item{celda.mod}{Celda object of class `celda_CG`.} |
|
16 |
+ |
|
17 |
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} |
|
18 |
+ |
|
19 |
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} |
|
20 |
+ |
|
21 |
+\item{modules}{Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.} |
|
22 |
+ |
|
23 |
+\item{umap.config}{Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults`} |
|
24 |
+} |
|
25 |
+\value{ |
|
26 |
+A two column matrix of t-SNE coordinates |
|
27 |
+} |
|
28 |
+\description{ |
|
29 |
+Embeds cells in two dimensions using umap based on a `celda_CG` model. umap is run on module probabilities to reduce the number of features instead of using PCA. Module probabilities square-root trasformed before applying tSNE. |
|
30 |
+} |
|
31 |
+\examples{ |
|
32 |
+umap.res = celdaUmap(celda.CG.sim$counts, celda.CG.mod) |
|
33 |
+} |
|
34 |
+\seealso{ |
|
35 |
+`celda_CG()` for clustering features and cells and `celdaHeatmap()` for displaying expression |
|
36 |
+} |
0 | 37 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,36 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/celda_G.R |
|
3 |
+\docType{methods} |
|
4 |
+\name{celdaUmap,celda_G-method} |
|
5 |
+\alias{celdaUmap,celda_G-method} |
|
6 |
+\title{umap for celda_G} |
|
7 |
+\usage{ |
|
8 |
+\S4method{celdaUmap}{celda_G}(counts, celda.mod, max.cells = 25000, |
|
9 |
+ min.cluster.size = 100, modules = NULL, |
|
10 |
+ umap.config = umap::umap.defaults) |
|
11 |
+} |
|
12 |
+\arguments{ |
|
13 |
+\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.} |
|
14 |
+ |
|
15 |
+\item{celda.mod}{Celda object of class `celda_CG`.} |
|
16 |
+ |
|
17 |
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} |
|
18 |
+ |
|
19 |
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} |