Browse code

merge with master

Irisapo authored on 13/03/2019 19:21:30
Showing 27 changed files

... ...
@@ -1,14 +1,16 @@
1 1
 Package: celda
2 2
 Title: CEllular Latent Dirichlet Allocation
3
-Version: 0.99.8
3
+Version: 0.99.9
4 4
 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")),
5 5
              person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")),
6
-	     person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")))
6
+	     person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")),
7
+	     person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut")))
7 8
 Description: celda leverages Bayesian hierarchical modeling to cluster genes,
8 9
     cells, or both simultaneously from single cell sequencing data.
9 10
 Depends:
10 11
     R (>= 3.5),
11 12
     SummarizedExperiment
13
+VignetteBuilder: knitr
12 14
 Imports:
13 15
     plyr,
14 16
     foreach,
... ...
@@ -31,23 +33,29 @@ Imports:
31 33
     GenomicRanges,
32 34
     data.table,
33 35
     Rcpp,
34
-    RcppEigen
36
+    RcppEigen,
37
+    umap,
38
+    enrichR,
39
+    vcd,
40
+    ggpubr,
41
+    stringi,
42
+    SummarizedExperiment
35 43
 Suggests:
36 44
     testthat,
37 45
     knitr,
38 46
     roxygen2,
39 47
     rmarkdown,
40
-    vcd,
41 48
     corrplot,
42 49
     Matrix,
43 50
     biomaRt,
44 51
     Rtsne,
45
-    covr
52
+    covr,
53
+    M3DExampleData,
54
+    BiocManager
46 55
 LinkingTo: Rcpp, RcppEigen
47
-VignetteBuilder: knitr
48
-License: MIT
56
+License: MIT + file LICENSE
49 57
 Encoding: UTF-8
50 58
 LazyData: true
51
-RoxygenNote: 6.1.0
52
-BugReports: https://github.com/definitelysean/celda/issues
59
+RoxygenNote: 6.1.1
60
+BugReports: https://github.com/campbio/celda/issues
53 61
 biocViews: SingleCell, GeneExpression, Clustering, Sequencing, Bayesian
... ...
@@ -10,6 +10,7 @@ export(celdaHeatmap)
10 10
 export(celdaPerplexity)
11 11
 export(celdaProbabilityMap)
12 12
 export(celdaTsne)
13
+export(celdaUmap)
13 14
 export(celda_C)
14 15
 export(celda_CG)
15 16
 export(celda_G)
... ...
@@ -21,6 +22,7 @@ export(distinct_colors)
21 22
 export(factorizeMatrix)
22 23
 export(featureModuleLookup)
23 24
 export(featureModuleTable)
25
+export(geneSetEnrich)
24 26
 export(logLikelihood)
25 27
 export(logLikelihood.celda_C)
26 28
 export(logLikelihood.celda_CG)
... ...
@@ -59,6 +61,7 @@ export(violinPlot)
59 61
 exportMethods(celdaHeatmap)
60 62
 exportMethods(celdaProbabilityMap)
61 63
 exportMethods(celdaTsne)
64
+exportMethods(celdaUmap)
62 65
 exportMethods(clusterProbability)
63 66
 exportMethods(factorizeMatrix)
64 67
 exportMethods(featureModuleLookup)
... ...
@@ -70,6 +73,7 @@ import(grDevices)
70 73
 import(graphics)
71 74
 import(grid)
72 75
 import(gtable)
76
+import(matrixStats)
73 77
 import(plyr)
74 78
 import(scales)
75 79
 useDynLib(celda,"_colSumByGroup")
... ...
@@ -75,12 +75,12 @@ moduleHeatmap <- function(counts, celda.mod, feature.module = 1, top.cells = 100
75 75
   gene_ix = match(rownames(filtered_norm.counts), celda.mod@names$row)
76 76
   cell_ix = match(colnames(filtered_norm.counts), celda.mod@names$column)
77 77
   z.to.plot = c()
78
-  if(methods::.hasSlot(celda.mod, "z")){
78
+  if (is(celda.mod, "celda_C") || is(celda.mod, "celda_CG")){
79 79
     cell <- distinct_colors(length(unique(celda.mod@clusters$z)))[sort(unique(celda.mod@clusters$z[cell_ix]))]
80 80
     names(cell) <- sort(unique(celda.mod@clusters$z[cell_ix]))
81 81
     anno_cell_colors <- list(cell = cell)
82 82
     z.to.plot = celda.mod@clusters$z[cell.indices]
83
-  }else{
83
+  } else {
84 84
     anno_cell_colors <- NULL
85 85
   }
86 86
   plotHeatmap(
... ...
@@ -300,7 +300,14 @@ setGeneric("celdaProbabilityMap",
300 300
 #' Embeds cells in two dimensions using tSNE based on celda_CG results.
301 301
 #' 
302 302
 #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.
303
-#' @param celda.mod Celda model. Options available in `celda::available.models`.
303
+#' @param celda.mod Celda object of class `celda_CG`. 
304
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.
305
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. 
306
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.
307
+#' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20.
308
+#' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500.
309
+#' @param seed Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.
310
+#' @param ... Additional parameters.
304 311
 #' @param ... Additional parameters.
305 312
 #' @return Numeric Matrix of dimension `ncol(counts)` x 2, colums representing the "X" and "Y" coordinates in the data's t-SNE represetation.
306 313
 #' @examples 
... ...
@@ -317,6 +324,30 @@ setGeneric("celdaTsne",
317 324
            })
318 325
 
319 326
 
327
+#' Embeds cells in two dimensions using umap.
328
+#' 
329
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.
330
+#' @param celda.mod Celda object of class `celda_CG`. 
331
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.
332
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. 
333
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.
334
+#' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20.
335
+#' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500.
336
+#' @param seed Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.
337
+#' @param ... Additional parameters.
338
+#' @param ... Additional parameters.
339
+#' @return Numeric Matrix of dimension `ncol(counts)` x 2, colums representing the "X" and "Y" coordinates in the data's t-SNE represetation.
340
+#' @examples 
341
+#' tsne.res = celdaTsne(celda.CG.sim$counts, celda.CG.mod)
342
+#' @export
343
+setGeneric("celdaUmap",
344
+           signature = "celda.mod",
345
+           function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
346
+                   modules=NULL, umap.config=umap::umap.defaults) {
347
+             standardGeneric("celdaUmap")
348
+           })
349
+
350
+
320 351
 #' Obtain the gene module of a gene of interest
321 352
 #' 
322 353
 #' This function will output the corresponding feature module for a specified list of genes from a celda model.
... ...
@@ -526,6 +526,7 @@ setMethod("clusterProbability",
526 526
 #' @seealso `celda_C()` for clustering cells
527 527
 #' @examples
528 528
 #' perplexity = perplexity(celda.C.sim$counts, celda.C.mod)
529
+#' @import matrixStats
529 530
 #' @export
530 531
 setMethod("perplexity",
531 532
           signature(celda.mod = "celda_C"),
... ...
@@ -617,55 +618,94 @@ setMethod("celdaTsne",
617 618
                      initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, 
618 619
                      seed=12345, ...) {
619 620
 
620
-            counts = processCounts(counts)
621
-            compareCountMatrix(counts, celda.mod)
622
-            
623
-            ## Checking if max.cells and min.cluster.size will work
624
-            if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) {
625
-              stop(paste0("Cannot distribute ", max.cells, " cells among ", 
626
-                          celda.mod@params$K, " clusters while maintaining a minumum of ", 
627
-                          min.cluster.size, 
628
-                          " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'."))
629
-            }
621
+
622
+            prepared.count.info = prepareCountsForDimReduction.celda_C(counts, celda.mod, max.cells,
623
+                                                                       min.cluster.size, modules)
630 624
             
631
-            ## Select a subset of cells to sample if greater than 'max.cells'
632
-            total.cells.to.remove = ncol(counts) - max.cells
633
-            z.include = rep(TRUE, ncol(counts))
634
-            if(total.cells.to.remove > 0) {
635
-          	z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K)
636
-          	
637
-          	## Number of cells that can be sampled from each cluster without 
638
-          	## going below the minimum threshold
639
-          	cluster.cells.to.sample = z.ta - min.cluster.size          
640
-          	cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0
641
-          	
642
-          	## Number of cells to sample after exluding smaller clusters
643
-          	## Rounding can cause number to be off by a few, so ceiling is 
644
-          	## used with a second round of subtraction
645
-          	cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove)
646
-          	diff = sum(cluster.n.to.sample) - total.cells.to.remove 
647
-          	cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff
648
-          
649
-          	## Perform sampling for each cluster
650
-          	for(i in which(cluster.n.to.sample > 0)) {
651
-          	  z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE
652
-          	}
653
-            }   
654
-            cell.ix = which(z.include)
655
-          
656
-            norm = t(normalizeCounts(counts[,cell.ix], normalize="proportion", 
657
-                                     transformation.fun=sqrt))
658
-            res = calculateTsne(norm, perplexity=perplexity, max.iter=max.iter, 
659
-                                seed=seed, do.pca=TRUE, 
660
-                                initial.dims = initial.dims)
625
+            res = calculateTsne(prepared.count.info$norm, perplexity=perplexity, max.iter=max.iter, 
626
+                                seed=seed, do.pca=TRUE, initial.dims = initial.dims)
661 627
             final = matrix(NA, nrow=ncol(counts), ncol=2)
662
-            final[cell.ix,] = res
628
+            final[prepared.count.info$cell.ix,] = res
663 629
             rownames(final) = colnames(counts)
664 630
             colnames(final) = c("tsne_1", "tsne_2")
665 631
             return(final)
666 632
           })
667 633
 
668 634
 
635
+#' @title umap for celda_C
636
+#' @description Embeds cells in two dimensions using umap based on a `celda_C` model. PCA on the normalized counts is used to reduce the number of features before applying umap. 
637
+#' 
638
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.
639
+#' @param celda.mod Celda object of class `celda_C`. 
640
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.
641
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. 
642
+#' @param initial.dims Integer. PCA will be used to reduce the dimentionality of the dataset. The top 'initial.dims' principal components will be used for tSNE. Default 20.
643
+#' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20.
644
+#' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500.
645
+#' @param seed Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.
646
+#' @param ... Additional parameters.
647
+#' @seealso `celda_C()` for clustering cells and `celdaHeatmap()` for displaying expression
648
+#' @examples
649
+#' umap.res = celdaUmap(celda.C.sim$counts, celda.C.mod)
650
+#' @return A two column matrix of t-SNE coordinates
651
+#' @export
652
+setMethod("celdaUmap",
653
+          signature(celda.mod = "celda_C"),
654
+          function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
655
+                   modules=NULL, umap.config=umap::umap.defaults) {
656
+            prepared.count.info = prepareCountsForDimReduction.celda_C(counts, celda.mod, max.cells,
657
+                                                                       min.cluster.size, modules)
658
+            res = calculateUmap(prepared.count.info$norm, umap.config)
659
+            final = matrix(NA, nrow=ncol(counts), ncol=2)
660
+            final[prepared.count.info$cell.ix,] = res
661
+            rownames(final) = colnames(counts)
662
+            colnames(final) = c("umap_1", "umap_2")
663
+            return(final)
664
+          })
665
+
666
+
667
+prepareCountsForDimReduction.celda_C = function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
668
+                                                modules=NULL) {
669
+  counts = processCounts(counts)
670
+  compareCountMatrix(counts, celda.mod)
671
+  
672
+  ## Checking if max.cells and min.cluster.size will work
673
+  if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) {
674
+  stop(paste0("Cannot distribute ", max.cells, " cells among ", 
675
+              celda.mod@params$K, " clusters while maintaining a minumum of ", 
676
+              min.cluster.size, 
677
+              " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'."))
678
+  }
679
+  
680
+  ## Select a subset of cells to sample if greater than 'max.cells'
681
+  total.cells.to.remove = ncol(counts) - max.cells
682
+  z.include = rep(TRUE, ncol(counts))
683
+  if(total.cells.to.remove > 0) {
684
+    z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K)
685
+    
686
+    ## Number of cells that can be sampled from each cluster without 
687
+    ## going below the minimum threshold
688
+    cluster.cells.to.sample = z.ta - min.cluster.size          
689
+    cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0
690
+    
691
+    ## Number of cells to sample after exluding smaller clusters
692
+    ## Rounding can cause number to be off by a few, so ceiling is 
693
+    ## used with a second round of subtraction
694
+    cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove)
695
+    diff = sum(cluster.n.to.sample) - total.cells.to.remove 
696
+    cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff
697
+    
698
+    ## Perform sampling for each cluster
699
+    for(i in which(cluster.n.to.sample > 0)) {
700
+      z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE
701
+    }
702
+  }   
703
+  cell.ix = which(z.include)
704
+  norm = t(normalizeCounts(counts[,cell.ix], normalize="proportion", 
705
+                         transformation.fun=sqrt))
706
+  return(list(norm=norm, cell.ix=cell.ix))
707
+}
708
+
669 709
 
670 710
 #' @title Probability map for a celda_C model
671 711
 #' @description Renders probability and relative expression heatmaps to visualize the relationship between cell populations and samples.
... ...
@@ -629,6 +629,7 @@ setMethod("clusterProbability",
629 629
 #' @seealso `celda_CG()` for clustering features and cells
630 630
 #' @examples
631 631
 #' perplexity = perplexity(celda.CG.sim$counts, celda.CG.mod)
632
+#' @import matrixStats
632 633
 #' @export
633 634
 setMethod("perplexity",
634 635
           signature(celda.mod = "celda_CG"),
... ...
@@ -738,62 +739,102 @@ setMethod("celdaTsne",
738 739
           function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
739 740
                      initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, 
740 741
                      seed=12345, ...) {
741
-
742
-            ## Checking if max.cells and min.cluster.size will work
743
-            if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) {
744
-              stop(paste0("Cannot distribute ", max.cells, " cells among ",
745
-                          celda.mod@params$K, " clusters while maintaining a minumum of ", 
746
-                          min.cluster.size, " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'."))
747
-            }
748
-            
749
-            fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, 
750
-                                 type="counts")    
751
-            modules.to.use = 1:nrow(fm$counts$cell)
752
-            if (!is.null(modules)) {
753
-              if (!all(modules %in% modules.to.use)) {
754
-                stop("'modules' must be a vector of numbers between 1 and ", 
755
-                     modules.to.use, ".")
756
-              }
757
-              modules.to.use = modules 
758
-            }
759
-            
760
-          
761
-            ## Select a subset of cells to sample if greater than 'max.cells'
762
-            total.cells.to.remove = ncol(counts) - max.cells
763
-            z.include = rep(TRUE, ncol(counts))
764
-            if(total.cells.to.remove > 0) {
765
-            	z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K)
766
-            	
767
-            	## Number of cells that can be sampled from each cluster without 
768
-            	## going below the minimum threshold
769
-            	cluster.cells.to.sample = z.ta - min.cluster.size          
770
-            	cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0
771
-            	
772
-            	## Number of cells to sample after exluding smaller clusters
773
-            	## Rounding can cause number to be off by a few, so ceiling is used 
774
-            	## with a second round of subtraction
775
-            	cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove)
776
-            	diff = sum(cluster.n.to.sample) - total.cells.to.remove 
777
-            	cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff
778
-            
779
-            	## Perform sampling for each cluster
780
-            	for(i in which(cluster.n.to.sample > 0)) {
781
-            	  z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE
782
-            	}
783
-            }   
784
-            cell.ix = which(z.include)
785
-          
786
-            norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], normalize="proportion", transformation.fun=sqrt))
787 742
             
743
+            prepared.count.info = prepareCountsForDimReduction.celda_CG(counts, celda.mod,
744
+                                                         max.cells, min.cluster.size,
745
+                                                         initial.dims, modules, ...)
746
+            norm = prepared.count.info$norm
788 747
             res = calculateTsne(norm, do.pca=FALSE, perplexity=perplexity, max.iter=max.iter, seed=seed)
789 748
             final = matrix(NA, nrow=ncol(counts), ncol=2)
790
-            final[cell.ix,] = res
749
+            final[prepared.count.info$cell.ix,] = res
791 750
             rownames(final) = colnames(counts)
792 751
             colnames(final) = c("tsne_1", "tsne_2")
793 752
             return(final)
794 753
           })
795 754
 
796 755
 
756
+#' @title umap for celda_CG
757
+#' @description Embeds cells in two dimensions using umap based on a `celda_CG` model. umap is run on module probabilities to reduce the number of features instead of using PCA. Module probabilities square-root trasformed before applying tSNE. 
758
+#' 
759
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.
760
+#' @param celda.mod Celda object of class `celda_CG`. 
761
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.
762
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. 
763
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.
764
+#' @param umap.config Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults`
765
+#' @seealso `celda_CG()` for clustering features and cells  and `celdaHeatmap()` for displaying expression
766
+#' @examples
767
+#' umap.res = celdaUmap(celda.CG.sim$counts, celda.CG.mod)
768
+#' @return A two column matrix of t-SNE coordinates
769
+#' @export
770
+setMethod("celdaUmap",
771
+          signature(celda.mod = "celda_CG"),
772
+          function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
773
+                   modules=NULL, umap.config=umap::umap.defaults) {
774
+            prepared.count.info = prepareCountsForDimReduction.celda_CG(counts, celda.mod,
775
+                                                                        max.cells, min.cluster.size,
776
+                                                                        initial.dims, modules)
777
+            umap.res = calculateUmap(prepared.count.info$norm,
778
+                                     umap.config)
779
+            final = matrix(NA, nrow=ncol(counts), ncol=2)
780
+            final[prepared.count.info$cell.ix, ] = umap.res
781
+            rownames(final) = colnames(counts)
782
+            colnames(final) = c("umap_1", "umap_2")
783
+            return(final)
784
+          })
785
+
786
+
787
+prepareCountsForDimReduction.celda_CG =  function(counts, celda.mod, max.cells=25000, 
788
+                                                  min.cluster.size=100,
789
+                                                  initial.dims=20, modules=NULL, ...) {
790
+   ## Checking if max.cells and min.cluster.size will work
791
+    if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) {
792
+      stop(paste0("Cannot distribute ", max.cells, " cells among ",
793
+                  celda.mod@params$K, " clusters while maintaining a minumum of ", 
794
+                  min.cluster.size, " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'."))
795
+    }
796
+    
797
+    fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, 
798
+                         type="counts")    
799
+    modules.to.use = 1:nrow(fm$counts$cell)
800
+    if (!is.null(modules)) {
801
+      if (!all(modules %in% modules.to.use)) {
802
+        stop("'modules' must be a vector of numbers between 1 and ", 
803
+             modules.to.use, ".")
804
+      }
805
+      modules.to.use = modules 
806
+    }
807
+    
808
+  
809
+    ## Select a subset of cells to sample if greater than 'max.cells'
810
+    total.cells.to.remove = ncol(counts) - max.cells
811
+    z.include = rep(TRUE, ncol(counts))
812
+    if(total.cells.to.remove > 0) {
813
+    	z.ta = tabulate(celda.mod@clusters$z, celda.mod@params$K)
814
+    	
815
+    	## Number of cells that can be sampled from each cluster without 
816
+    	## going below the minimum threshold
817
+    	cluster.cells.to.sample = z.ta - min.cluster.size          
818
+    	cluster.cells.to.sample[cluster.cells.to.sample < 0] = 0
819
+    	
820
+    	## Number of cells to sample after exluding smaller clusters
821
+    	## Rounding can cause number to be off by a few, so ceiling is used 
822
+    	## with a second round of subtraction
823
+    	cluster.n.to.sample = ceiling((cluster.cells.to.sample / sum(cluster.cells.to.sample)) * total.cells.to.remove)
824
+    	diff = sum(cluster.n.to.sample) - total.cells.to.remove 
825
+    	cluster.n.to.sample[which.max(cluster.n.to.sample)] = cluster.n.to.sample[which.max(cluster.n.to.sample)] - diff
826
+    
827
+    	## Perform sampling for each cluster
828
+    	for(i in which(cluster.n.to.sample > 0)) {
829
+    	  z.include[sample(which(celda.mod@clusters$z == i), cluster.n.to.sample[i])] = FALSE
830
+    	}
831
+    }   
832
+    cell.ix = which(z.include)
833
+  
834
+    norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], normalize="proportion", transformation.fun=sqrt))
835
+    return(list(norm=norm, cell.ix=cell.ix))
836
+}
837
+
797 838
 
798 839
 #' @title Probability map for a celda_CG model
799 840
 #' @description Renders probability and relative expression heatmaps to visualize the relationship between features and cell populations or cell populations and samples.
... ...
@@ -192,7 +192,7 @@ cG.calcGibbsProbY = function(counts, n.TS.by.C, n.by.TS, nG.by.TS, n.by.G, y, L,
192 192
   ix <- sample(1:nG)
193 193
   for(i in ix) {
194 194
     probs[,i] = cG_CalcGibbsProbY(index=i, counts=counts, nTSbyC=n.TS.by.C, nbyTS=n.by.TS, nGbyTS=nG.by.TS, nbyG=n.by.G, y=y, L=L, nG=nG, lg_beta=lgbeta, lg_gamma=lggamma, lg_delta=lgdelta, delta=delta)
195
-    if(any(is.na(probs[,i]))) browser()
195
+    #if(any(is.na(probs[,i]))) browser()
196 196
 	## Sample next state and add back counts
197 197
 	if(isTRUE(do.sample)) {
198 198
 	  prev.y = y[i]
... ...
@@ -603,28 +603,10 @@ setMethod("celdaTsne",
603 603
                     initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, 
604 604
                     seed=12345, ...) {
605 605
   
606
-            if(max.cells > ncol(counts)) {
607
-              max.cells = ncol(counts)
608
-            }
609 606
             
610
-            fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, 
611
-                                 type="counts")
612
-              
613
-            modules.to.use = 1:nrow(fm$counts$cell)
614
-            if (!is.null(modules)) {
615
-          	if (!all(modules %in% modules.to.use)) {
616
-          	  stop("'modules' must be a vector of numbers between 1 and ", 
617
-          	       modules.to.use, ".")
618
-          	}
619
-          	modules.to.use = modules 
620
-            }
621
-             
622
-            cell.ix = sample(1:ncol(counts), max.cells)
623
-            norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], 
624
-                                     normalize="proportion", 
625
-                                     transformation.fun=sqrt))
626
-          
627
-            res = calculateTsne(norm, do.pca=FALSE, perplexity=perplexity, 
607
+            prepared.count.info =  prepareCountsForDimReduction.celda_G(counts, celda.mod, max.cells, 
608
+                                                                        min.cluster.size, modules) 
609
+            res = calculateTsne(prepared.count.info$norm, do.pca=FALSE, perplexity=perplexity, 
628 610
                                 max.iter=max.iter, seed=seed)
629 611
             rownames(res) = colnames(counts)
630 612
             colnames(res) = c("tsne_1", "tsne_2")
... ...
@@ -632,6 +614,62 @@ setMethod("celdaTsne",
632 614
           })
633 615
 
634 616
 
617
+#' @title umap for celda_G
618
+#' @description Embeds cells in two dimensions using umap based on a `celda_G` model. umap is run on module probabilities to reduce the number of features instead of using PCA. Module probabilities square-root trasformed before applying tSNE. 
619
+#' 
620
+#' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.
621
+#' @param celda.mod Celda object of class `celda_CG`. 
622
+#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.
623
+#' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. 
624
+#' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.
625
+#' @param umap.config Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults`
626
+#' @seealso `celda_G()` for clustering features and cells  and `celdaHeatmap()` for displaying expression
627
+#' @examples
628
+#' umap.res = celdaUmap(celda.G.sim$counts, celda.G.mod)
629
+#' @return A two column matrix of t-SNE coordinates
630
+#' @export
631
+setMethod("celdaUmap",
632
+          signature(celda.mod = "celda_G"),
633
+          function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
634
+                   modules=NULL, umap.config=umap::umap.defaults) {
635
+            prepared.count.info = prepareCountsForDimReduction.celda_G(counts, celda.mod,
636
+                                                                       max.cells, min.cluster.size,
637
+                                                                       modules)
638
+            umap.res = calculateUmap(prepared.count.info$norm, umap.config)
639
+            final = matrix(NA, nrow=ncol(counts), ncol=2)
640
+            final[prepared.count.info$cell.ix, ] = umap.res
641
+            rownames(final) = colnames(counts)
642
+            colnames(final) = c("umap_1", "umap_2")
643
+            return(final)
644
+          })
645
+
646
+
647
+prepareCountsForDimReduction.celda_G = function(counts, celda.mod, max.cells=25000, min.cluster.size=100,
648
+                                                modules=NULL) {
649
+  if(max.cells > ncol(counts)) {
650
+              max.cells = ncol(counts)
651
+            }
652
+            
653
+  fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, 
654
+                       type="counts")
655
+    
656
+  modules.to.use = 1:nrow(fm$counts$cell)
657
+  if (!is.null(modules)) {
658
+  if (!all(modules %in% modules.to.use)) {
659
+    stop("'modules' must be a vector of numbers between 1 and ", 
660
+         modules.to.use, ".")
661
+  }
662
+  modules.to.use = modules 
663
+  }
664
+   
665
+  cell.ix = sample(1:ncol(counts), max.cells)
666
+  norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], 
667
+                           normalize="proportion", 
668
+                           transformation.fun=sqrt))
669
+  return(list(norm=norm, cell.ix=cell.ix))
670
+}
671
+
672
+
635 673
 #' @title Lookup the module of a feature
636 674
 #' @description Finds the module assignments of given features in a `celda_G()` model
637 675
 #'  
... ...
@@ -308,7 +308,7 @@ featureModuleTable = function(counts, celda.mod, output.file = NULL){
308 308
   if(is.null(output.file)){
309 309
     return(res)
310 310
   }else{
311
-    write.table(res, file = output.file, sep = "\t", row.names = F, quote = F)
311
+    utils::write.table(res, file = output.file, sep = "\t", row.names = FALSE, quote = FALSE)
312 312
   }
313 313
 }
314 314
 
... ...
@@ -334,7 +334,7 @@ violinPlot = function(counts, celda.mod, features){
334 334
   colnames(m) = c("Cluster","Feature","Expression")
335 335
   color_pal = distinct_colors(length(unique(cluster)))
336 336
   p <- ggplot2::ggplot(m, ggplot2::aes(x=Cluster, y=Expression, fill=Cluster)) + 
337
-    ggplot2::facet_wrap(~Feature) + ggplot2::geom_violin(trim=T, scale = "width") + 
337
+    ggplot2::facet_wrap(~Feature) + ggplot2::geom_violin(trim=TRUE, scale = "width") + 
338 338
     ggplot2::geom_jitter(height = 0, size = 0.1) +
339 339
     ggplot2::scale_fill_manual(values = color_pal)
340 340
   return(p)
... ...
@@ -8,6 +8,8 @@
8 8
 #' @param beta Numeric. Concentration parameter for Phi. Default to be 0.5
9 9
 #' @param delta Numeric / Numeric vector. Concentration parameter for Theta. If input as a single numeric value, symmetric values for beta distribution are specified; if input as a vector of lenght 2, the two values will be the shape1 and shape2 paramters of the beta distribution respectively
10 10
 #' @param seed Integer. Passed to set.seed(). Default to be 12345. If NULL, no calls to `set.seed()` are made.
11
+#' @return A list object containing the real expression matrix and contamination
12
+#' expression matrix as well as other parameters used in the simulation.
11 13
 #' @examples 
12 14
 #' contamination.sim =  simulateContaminatedMatrix(  K=3,  delta=c(1,9)) 
13 15
 #' contamination.sim =  simulateContaminatedMatrix(  K=3,  delta = 1) 
... ...
@@ -15,13 +17,13 @@
15 17
 simulateContaminatedMatrix = function(C=300, G=100, K=3, N.Range=c(500,1000), beta = 0.5, delta=c(1,2),  seed=12345) {
16 18
   
17 19
     if (!is.null(seed)) {
18
-      set.seed(seed) 
20
+        set.seed(seed) 
19 21
     }
20 22
 
21
-    if(length(delta)==1) { 
22
-	    cp.byC = rbeta(n=C, shape1=delta, shape2=delta) 
23
+    if(length(delta) == 1 ) { 
24
+        cp.byC = rbeta(n=C, shape1=delta, shape2=delta) 
23 25
     } else {
24
-            cp.byC = rbeta(n=C, shape1=delta[1], shape2=delta[2] ) 
26
+        cp.byC = rbeta(n=C, shape1=delta[1], shape2=delta[2] ) 
25 27
     } 
26 28
  
27 29
     z = sample(1:K, size=C, replace=TRUE) 
... ...
@@ -32,21 +34,21 @@ simulateContaminatedMatrix = function(C=300, G=100, K=3, N.Range=c(500,1000), be
32 34
     }
33 35
 
34 36
 
35
-    N.byC = sample(min(N.Range):max(N.Range), size=C , replace=TRUE  ) 
37
+    N.byC = sample(min(N.Range):max(N.Range), size=C, replace=TRUE  ) 
36 38
     cN.byC = sapply(1:C, function(i)  rbinom(n=1, size=N.byC[i], p=cp.byC[i] )  ) 
37 39
     rN.byC = N.byC - cN.byC 
38 40
     
39 41
     phi = rdirichlet(K, rep(beta, G)) 
40 42
  
41 43
  
42
-    # sample real expressed count matrix 
44
+    ## sample real expressed count matrix 
43 45
     cell.rmat = sapply(1:C, function(i) stats::rmultinom(1, size=rN.byC[i], prob=phi[z[i],] )  )
44 46
 
45 47
     rownames(cell.rmat) = paste0("Gene_", 1:G) 
46 48
     colnames(cell.rmat) = paste0("Cell_", 1:C) 
47 49
     
48 50
     
49
-    # sample contamination count matrix 
51
+    ## sample contamination count matrix 
50 52
     n.G.by.K = rowSums(cell.rmat) - colSumByGroup(cell.rmat, group=z, K=K) 
51 53
     eta = normalizeCounts(counts=n.G.by.K, normalize="proportion") 
52 54
 
... ...
@@ -67,17 +69,17 @@ simulateContaminatedMatrix = function(C=300, G=100, K=3, N.Range=c(500,1000), be
67 69
 # eta Numeric matrix. Rows represent features and columns represent cell populations 
68 70
 # theta Numeric vector. Proportion of truely expressed transcripts
69 71
 decon.calcLL = function(counts, z,  phi, eta, theta){
70
-  #ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + 1e-20 ) )  # when dist_mat are K x G matrices
71
-  ll = sum( t(counts) * log( theta * t(phi)[z,] + (1-theta) * t(eta)[z,]  + 1e-20  ))   
72
-  return(ll)
72
+    #ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + 1e-20 ) )  # when dist_mat are K x G matrices
73
+    ll = sum( t(counts) * log( theta * t(phi)[z,] + (1-theta) * t(eta)[z,]  + 1e-20  ))   
74
+    return(ll)
73 75
 }
74 76
 
75 77
 # This function calculates the log-likelihood of background distribution decontamination
76 78
 #
77 79
 # bgDist Numeric matrix. Rows represent feature and columns are the times that the background-distribution has been replicated. 
78 80
 bg.calcLL = function(counts, cellDist, bgDist, theta){
79
-  ll = sum( t(counts) * log( theta * t(cellDist) + (1-theta) * t(bgDist) + 1e-20) ) 
80
-  return(ll) 
81
+    ll = sum( t(counts) * log( theta * t(cellDist) + (1-theta) * t(bgDist) + 1e-20) ) 
82
+    return(ll) 
81 83
 }
82 84
 
83 85
 
... ...
@@ -88,24 +90,24 @@ bg.calcLL = function(counts, cellDist, bgDist, theta){
88 90
 #  theta Numeric vector. Proportion of truely expressed transctripts
89 91
 cD.calcEMDecontamination = function(counts, phi, eta, theta,  z, K, beta, delta ) {
90 92
 
91
-   # Notes: use fix-point iteration to update prior for theta, no need to feed delta anymore
92
-   log.Pr = log(  t(phi)[z,] + 1e-20) + log(theta + 1e-20 )
93
-   log.Pc = log(  t(eta)[z,] + 1e-20) + log(1-theta + 1e-20)  
93
+    ## Notes: use fix-point iteration to update prior for theta, no need to feed delta anymore
94
+    log.Pr = log(  t(phi)[z,] + 1e-20) + log(theta + 1e-20 )
95
+    log.Pc = log(  t(eta)[z,] + 1e-20) + log(1-theta + 1e-20)  
94 96
 
95
-   Pr = exp(log.Pr) / (  exp(log.Pr) + exp(log.Pc) )   
96
-   Pc = 1 - Pr 
97
-   delta.v2 = MCMCprecision::fit_dirichlet( matrix( c( Pr, Pc) , ncol = 2 ) )$alpha 
98
-   
99
-   est.rmat = t(Pr) * counts 
100
-   rn.G.by.K = colSumByGroup.numeric(est.rmat, z, K)  
101
-   cn.G.by.K = rowSums(rn.G.by.K) - rn.G.by.K 
97
+    Pr = exp(log.Pr) / (  exp(log.Pr) + exp(log.Pc) )   
98
+    Pc = 1 - Pr 
99
+    delta.v2 = MCMCprecision::fit_dirichlet( matrix( c( Pr, Pc) , ncol = 2 ) )$alpha 
102 100
 
103
-   #update parameters 
104
-   theta = (colSums(est.rmat) + delta.v2[1]) / (colSums(counts) + sum(delta.v2)  )  
105
-   phi  = normalizeCounts(rn.G.by.K, normalize="proportion",  pseudocount.normalize =beta ) 
106
-   eta  = normalizeCounts(cn.G.by.K, normalize="proportion",  pseudocount.normalize = beta) 
101
+    est.rmat = t(Pr) * counts 
102
+    rn.G.by.K = colSumByGroup.numeric(est.rmat, z, K)  
103
+    cn.G.by.K = rowSums(rn.G.by.K) - rn.G.by.K 
107 104
 
108
-   return( list("est.rmat"=est.rmat,  "theta"=theta, "phi"=phi, "eta"=eta, "delta"=delta.v2 ) ) 
105
+    ## Update parameters 
106
+    theta = (colSums(est.rmat) + delta.v2[1]) / (colSums(counts) + sum(delta.v2)  )  
107
+    phi  = normalizeCounts(rn.G.by.K, normalize="proportion",  pseudocount.normalize =beta ) 
108
+    eta  = normalizeCounts(cn.G.by.K, normalize="proportion",  pseudocount.normalize = beta) 
109
+
110
+    return( list("est.rmat"=est.rmat,  "theta"=theta, "phi"=phi, "eta"=eta, "delta"=delta.v2 ) ) 
109 111
 }
110 112
 
111 113
 
... ...
@@ -113,11 +115,11 @@ cD.calcEMDecontamination = function(counts, phi, eta, theta,  z, K, beta, delta
113 115
 # 
114 116
 cD.calcEMbgDecontamination = function(counts, cellDist, bgDist, theta, beta, delta){
115 117
 
116
-  meanN.by.C = apply(counts, 2, mean) 
117
-  log.Pr = log( t(cellDist) + 1e-20) + log( theta + 1e-20) # + log( t(counts) / meanN.by.C )   # better when without panelty 
118
-  log.Pc = log( t(bgDist)  + 1e-20) + log( 1-theta + 2e-20) 
118
+    meanN.by.C = apply(counts, 2, mean) 
119
+    log.Pr = log( t(cellDist) + 1e-20) + log( theta + 1e-20) # + log( t(counts) / meanN.by.C )   # better when without panelty 
120
+    log.Pc = log( t(bgDist)  + 1e-20) + log( 1-theta + 2e-20) 
119 121
 
120
-  Pr = exp( log.Pr) / (exp(log.Pr) + exp( log.Pc) ) 
122
+    Pr = exp( log.Pr) / (exp(log.Pr) + exp( log.Pc) ) 
121 123
 
122 124
   est.rmat = t(Pr) * counts
123 125
 
... ...
@@ -125,7 +127,7 @@ cD.calcEMbgDecontamination = function(counts, cellDist, bgDist, theta, beta, del
125 127
   theta = (colSums(est.rmat) + delta) / (colSums(counts) + 2*delta) 
126 128
   cellDist = normalizeCounts(est.rmat, normalize="proportion", pseudocount.normalize =beta) 
127 129
 
128
-  return( list("est.rmat"=est.rmat, "theta"=theta, "cellDist"=cellDist) ) 
130
+    return( list("est.rmat"=est.rmat, "theta"=theta, "cellDist"=cellDist) ) 
129 131
 }
130 132
 
131 133
 
... ...
@@ -139,48 +141,48 @@ cD.calcEMbgDecontamination = function(counts, cellDist, bgDist, theta, beta, del
139 141
 #' @param logfile Character. Messages will be redirected to a file named `logfile`. If NULL, messages will be printed to stdout.  Default NULL
140 142
 #' @param verbose Logical. Whether to print log messages. Default TRUE
141 143
 #' @param seed Integer. Passed to set.seed(). Default to be 1234567. If NULL, no calls to `set.seed()` are made.
144
+#' @return A list object which contains the decontaminated count matrix and related parameters
142 145
 #' @examples 
143 146
 #' decon.c = DecontX( counts = contamination.sim$rmat + contamination.sim$cmat, z=contamination.sim$z, max.iter=3)
144 147
 #' decon.bg = DecontX( counts=contamination.sim$rmat + contamination.sim$cmat, max.iter=3 ) 
145 148
 #' @export 
146
-DecontX = function( counts , z=NULL, batch=NULL,  max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) {
149
+DecontX = function( counts, z=NULL, batch=NULL,  max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) {
147 150
   
148 151
 
149
-  if ( !is.null(batch) ) { 
150
-      
151
-      # Set result lists upfront for all cells from different batches 
152
-      logLikelihood = c() 
153
-      est.rmat = matrix(NA, ncol = ncol(counts) , nrow = nrow(counts ) , dimnames = list( rownames( counts) , colnames( counts ) )      ) 
154
-      theta = rep(NA, ncol(counts ) ) 
155
-      est.conp = rep(NA, ncol(counts) ) 
156
-
157
-      batch.index = unique(batch) 
158
-
159
-      for ( BATCH in batch.index ) {  
160
-          counts.BATCH = counts[, batch == BATCH  ]
161
-          if( !is.null(z) ) { z.BATCH = z[ batch == BATCH ]  }  else { z.BATCH = z } 
162
-          res.BATCH = DecontXoneBatch( counts = counts.BATCH, z = z.BATCH, batch = BATCH  ,max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed ) 
163
-
164
-          est.rmat[, batch == BATCH ] = res.BATCH$res.list$est.rmat
165
-          est.conp[ batch == BATCH ] = res.BATCH$res.list$est.conp 
166
-          theta[  batch == BATCH ] = res.BATCH$res.list$theta 
167
-
168
-          if( is.null(logLikelihood)  )   { 
169
-              logLikelihood = res.BATCH$res.list$logLikelihood  
170
-          } else { 
171
-              logLikelihood = addLogLikelihood( logLikelihood, res.BATCH$res.list$logLikelihood ) 
172
-          }
173
-      } 
174
-      
175
-      run.params = res.BATCH$run.params 
176
-      method = res.BATCH$method 
177
-      res.list = list( "logLikelihood"=logLikelihood, "est.rmat"=est.rmat,"est.conp"= est.conp,  "theta"=theta ) 
152
+    if ( !is.null(batch) ) { 
178 153
 
179
-      return( list("run.params"=run.params, "res.list"=res.list, "method"=method )  ) 
180
-  } 
154
+        ## Set result lists upfront for all cells from different batches 
155
+        logLikelihood = c() 
156
+        est.rmat = matrix(NA, ncol = ncol(counts), nrow = nrow(counts ), dimnames = list( rownames( counts), colnames( counts ) )      ) 
157
+        theta = rep(NA, ncol(counts ) ) 
158
+        est.conp = rep(NA, ncol(counts) ) 
181 159
 
182
-  return( DecontXoneBatch( counts = counts, z = z, max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed )  ) 
160
+        batch.index = unique(batch) 
183 161
 
162
+        for ( BATCH in batch.index ) {  
163
+            counts.BATCH = counts[, batch == BATCH  ]
164
+            if( !is.null(z) ) { z.BATCH = z[ batch == BATCH ]  }  else { z.BATCH = z } 
165
+            res.BATCH = DecontXoneBatch( counts = counts.BATCH, z = z.BATCH, batch = BATCH  ,max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed ) 
166
+
167
+            est.rmat[, batch == BATCH ] = res.BATCH$res.list$est.rmat
168
+            est.conp[ batch == BATCH ] = res.BATCH$res.list$est.conp 
169
+            theta[  batch == BATCH ] = res.BATCH$res.list$theta 
170
+
171
+            if( is.null(logLikelihood)  )   { 
172
+                logLikelihood = res.BATCH$res.list$logLikelihood  
173
+            } else { 
174
+                logLikelihood = addLogLikelihood( logLikelihood, res.BATCH$res.list$logLikelihood ) 
175
+            }
176
+        } 
177
+      
178
+        run.params = res.BATCH$run.params 
179
+        method = res.BATCH$method 
180
+        res.list = list( "logLikelihood"=logLikelihood, "est.rmat"=est.rmat,"est.conp"= est.conp,  "theta"=theta ) 
181
+
182
+        return( list("run.params"=run.params, "res.list"=res.list, "method"=method )  ) 
183
+    } 
184
+
185
+    return( DecontXoneBatch( counts = counts, z = z, max.iter = max.iter, beta = beta, delta = delta, logfile=logfile, verbose=verbose, seed=seed )  ) 
184 186
 }
185 187
 
186 188
 
... ...
@@ -188,185 +190,186 @@ DecontX = function( counts , z=NULL, batch=NULL,  max.iter=200, beta=1e-6, delta
188 190
 
189 191
 # This function updates decontamination for one batch  
190 192
 #
191
-DecontXoneBatch = function(counts, z=NULL, batch= NULL,  max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) {
193
+DecontXoneBatch = function(counts, z=NULL, batch=NULL, max.iter=200, beta=1e-6, delta=10, logfile=NULL, verbose=TRUE, seed=1234567 ) {
192 194
 
193
-  checkCounts.decon(counts)
194
-  checkParameters.decon(proportion.prior=delta, distribution.prior=beta) 
195
+    checkCounts.decon(counts)
196
+    checkParameters.decon(proportionPrior=delta, distributionPrior=beta) 
195 197
 
196
-  nG = nrow(counts)
197
-  nC = ncol(counts)
198
-  K =  length(unique(z)) 
198
+    nG = nrow(counts)
199
+    nC = ncol(counts)
200
+    K =  length(unique(z)) 
199 201
  
200
-  if ( is.null(z) ) {
201
-      decon.method = "background"
202
-  } else {
203
-      decon.method = "clustering"
204
-      z = processCellLabels(z, num.cells= nC) 
205
-  }
202
+    if ( is.null(z) ) {
203
+        decon.method = "background"
204
+    } else {
205
+        decon.method = "clustering"
206
+        z = processCellLabels(z, num.cells= nC) 
207
+    }
206 208
 
207
-  # Set seed for initialization 
208
-  if (!is.null(seed)) {
209
-    set.seed(seed)
210
-  }
209
+    ## Set seed for initialization 
210
+    if (!is.null(seed)) {
211
+        set.seed(seed)
212
+    }
211 213
 
212 214
 
213
-  iter = 1L 
214
-  num.iter.without.improvement = 0L
215
-  stop.iter = 3L 
215
+    iter = 1L 
216
+    num.iter.without.improvement = 0L
217
+    stop.iter = 3L 
216 218
 
217 219
 
218
-  logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
219
-  logMessages("Start DecontX. Decontamination", logfile=logfile, append=TRUE, verbose=verbose) 
220
-  if ( !is.null(batch) ) {  logMessages("batch: ",  batch, logfile=logfile, append=TRUE, verbose=verbose)    }
221
-  logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
222
-  start.time = Sys.time()
220
+    logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
221
+    logMessages("Start DecontX. Decontamination", logfile=logfile, append=TRUE, verbose=verbose) 
222
+    if ( !is.null(batch) ) {  logMessages("batch: ",  batch, logfile=logfile, append=TRUE, verbose=verbose)    }
223
+    logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
224
+    start.time = Sys.time()
223 225
 
224
-  if( decon.method == "clustering") {
226
+    if( decon.method == "clustering") {
225 227
 
226
-  # Initialization
227
-  theta  = runif(nC, min = 0.1, max = 0.5)  
228
-  est.rmat = t (t(counts) * theta )       
229
-  phi =   colSumByGroup.numeric(est.rmat, z, K)
230
-  eta =   rowSums(phi) - phi 
231
-  phi = normalizeCounts(phi, normalize="proportion", pseudocount.normalize =beta )
232
-  eta = normalizeCounts(eta, normalize="proportion", pseudocount.normalize = beta)
233
-  ll = c()
228
+        ## Initialization
229
+        theta  = stats::runif(nC, min = 0.1, max = 0.5)  
230
+        est.rmat = t (t(counts) * theta )       
231
+        phi =   colSumByGroup.numeric(est.rmat, z, K)
232
+        eta =   rowSums(phi) - phi 
233
+        phi = normalizeCounts(phi, normalize="proportion", pseudocount.normalize =beta )
234
+        eta = normalizeCounts(eta, normalize="proportion", pseudocount.normalize = beta)
235
+        ll = c()
234 236
   
235 237
 
236
-  ll.round =  decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta ) 
238
+        ll.round =  decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta ) 
237 239
 
238 240
 
239
-  # EM updates
240
-  while (iter <=  max.iter &  num.iter.without.improvement <= stop.iter  ) {
241
+        ## EM updates
242
+        while (iter <=  max.iter &  num.iter.without.improvement <= stop.iter  ) {
241 243
 
244
+            next.decon = cD.calcEMDecontamination(counts=counts, phi=phi, eta=eta, theta=theta, z=z, K=K,  beta=beta, delta=delta) 
242 245
 
243
-    next.decon = cD.calcEMDecontamination(counts=counts, phi=phi, eta=eta, theta=theta, z=z, K=K,  beta=beta, delta=delta) 
246
+            theta = next.decon$theta 
247
+            phi = next.decon$phi
248
+            eta = next.decon$eta 
249
+            delta = next.decon$delta
244 250
 
245
-    theta = next.decon$theta 
246
-    phi = next.decon$phi
247
-    eta = next.decon$eta 
248
-    delta = next.decon$delta
251
+            ## Calculate log-likelihood
252
+            ll.temp = decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta )
253
+            ll  = c(ll, ll.temp)
254
+            ll.round = c( ll.round, round( ll.temp, 2) )  
249 255
 
250
-    # Calculate log-likelihood
251
-    ll.temp = decon.calcLL(counts=counts, z=z, phi = phi, eta = eta, theta=theta )
252
-    ll  = c(ll, ll.temp)
253
-    ll.round = c( ll.round, round( ll.temp, 2) )  
256
+            if ( round( ll.temp, 2) > ll.round[ iter ]  |  iter == 1   )    {
257
+                num.iter.without.improvement = 1L 
258
+            }   else { 
259
+                num.iter.without.improvement = num.iter.without.improvement + 1L 
260
+            } 
261
+            iter = iter + 1L 
254 262
 
255
-    if ( round( ll.temp, 2) > ll.round[ iter ]  |  iter== 1   )    {
256
-        num.iter.without.improvement = 1L 
257
-    }   else { 
258
-        num.iter.without.improvement = num.iter.without.improvement + 1L 
259
-    } 
260
-    iter = iter + 1L 
263
+        }
261 264
 
262
-  }
265
+    }
263 266
 
264
-  }
267
+    if ( decon.method == "background") {
265 268
 
266
-  if ( decon.method == "background") {
269
+        # Initialization
270
+        theta = runif( nC, min =0.1, max=0.5) 
271
+        est.rmat = t( t(counts) *theta) 
272
+        bgDist = rowSums( counts ) / sum( counts) 
273
+        bgDist = matrix( rep( bgDist, nC), ncol=nC) 
274
+        cellDist = normalizeCounts( est.rmat, normalize="proportion", pseudocount.normalize=beta) 
275
+        ll =c()
267 276
 
268
-  # Initialization
269
-  theta = runif( nC, min =0.1, max=0.5) 
270
-  est.rmat = t( t(counts) *theta) 
271
-  bgDist = rowSums( counts ) / sum( counts) 
272
-  bgDist = matrix( rep( bgDist, nC), ncol=nC) 
273
-  cellDist = normalizeCounts( est.rmat, normalize="proportion", pseudocount.normalize=beta) 
274
-  ll =c()
275 277
 
278
+        ll.round = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta)  
276 279
 
277
-  ll.round = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta)  
280
+        ## EM updates
281
+        while (iter <=  max.iter &  num.iter.without.improvement <= stop.iter  ) { 
278 282
 
279 283
 
280
-  # EM updates
281
-    while (iter <=  max.iter &  num.iter.without.improvement <= stop.iter  ) { 
284
+            next.decon = cD.calcEMbgDecontamination(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta, beta=beta, delta=delta)  
282 285
 
286
+            theta = next.decon$theta
287
+            cellDist = next.decon$cellDist
283 288
 
284
-    next.decon = cD.calcEMbgDecontamination(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta, beta=beta, delta=delta)  
285 289
 
286
-    theta = next.decon$theta
287
-    cellDist = next.decon$cellDist
290
+            ## Calculate log-likelihood
291
+            ll.temp = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta) 
292
+            ll  = c(ll, ll.temp) 
293
+            ll.round = c( ll.round, round( ll.temp, 2) )
288 294
 
289
-    # Calculate log-likelihood
290
-    ll.temp = bg.calcLL(counts=counts, cellDist=cellDist, bgDist=bgDist, theta=theta) 
291
-    ll  = c(ll, ll.temp) 
292
-    ll.round = c( ll.round, round( ll.temp, 2) )
295
+            if ( round( ll.temp, 2) > ll.round[ iter ]  |  iter == 1   )    {
296
+                num.iter.without.improvement = 1L
297
+            }   else {
298
+                num.iter.without.improvement = num.iter.without.improvement + 1L
299
+            }
300
+            iter = iter + 1L
301
+        }
293 302
 
294
-    if ( round( ll.temp, 2) > ll.round[ iter ]  |  iter== 1   )    {
295
-        num.iter.without.improvement = 1L
296
-    }   else {
297
-        num.iter.without.improvement = num.iter.without.improvement + 1L
298 303
     }
299
-    iter = iter + 1L
300
-  }
301 304
 
302
-  }
305
+    res.conp  = 1- colSums(next.decon$est.rmat) / colSums(counts)
303 306
 
304
-  res.conp  = 1- colSums(next.decon$est.rmat) / colSums(counts)
307
+    end.time = Sys.time() 
308
+    logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
309
+    logMessages("Completed DecontX. Total time:", format(difftime(end.time, start.time)), logfile=logfile, append=TRUE, verbose=verbose) 
310
+    if ( !is.null(batch) ) {  logMessages("batch: ",  batch, logfile=logfile, append=TRUE, verbose=verbose)    }
311
+    logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
305 312
 
313
+    run.params = list("beta"=beta, "delta"=delta, "iteration"=iter-1L, "seed"=seed)
306 314
 
307
-  end.time = Sys.time() 
308
-  logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
309
-  logMessages("Completed DecontX. Total time:", format(difftime(end.time, start.time)), logfile=logfile, append=TRUE, verbose=verbose) 
310
-  if ( !is.null(batch) ) {  logMessages("batch: ",  batch, logfile=logfile, append=TRUE, verbose=verbose)    }
311
-  logMessages("----------------------------------------------------------------------", logfile=logfile, append=TRUE, verbose=verbose) 
312
-
313
-  run.params = list("beta" = beta, "delta" = delta, "iteration"=iter-1L, "seed"=seed)
314
-
315
-  res.list = list("logLikelihood" = ll, "est.rmat"=next.decon$est.rmat , "est.conp"= res.conp, "theta"=theta , "delta"=delta)
316
-  if( decon.method=="clustering" ) {
317
-    posterior.params = list( "est.GeneDist"=phi,  "est.ConDist"=eta  ) 
318
-    res.list = append( res.list , posterior.params ) 
315
+    res.list = list("logLikelihood" = ll, "est.rmat"=next.decon$est.rmat , "est.conp"= res.conp, "theta"=theta , "delta"=delta)
316
+    if( decon.method=="clustering" ) {
317
+        posterior.params = list( "est.GeneDist"=phi,  "est.ConDist"=eta  ) 
318
+        res.list = append( res.list , posterior.params ) 
319 319
   }
320 320
   
321
-  return(list("run.params"=run.params, "res.list"=res.list, "method"=decon.method  ))
321
+    return(list("run.params"=run.params, "res.list"=res.list, "method"=decon.method  ))
322 322
 }
323 323
 
324 324
 
325
-### checking 
325
+
326 326
 # Make sure provided parameters are the right type and value range 
327
-checkParameters.decon = function(proportion.prior, distribution.prior) {
328
-  if( length(proportion.prior) > 1 | proportion.prior <= 0   )  {
329
-    stop("'delta' should be a single positive value.")
330
-  }
331
-  if( length( distribution.prior) > 1 | distribution.prior <=0  ) {
332
-    stop("'beta' should be a single positive value.") 
333
-  }
327
+checkParameters.decon = function(proportionPrior, distributionPrior) {
328
+    if( length(proportionPrior) > 1 | proportionPrior <= 0   )  {
329
+        stop("'delta' should be a single positive value.")
330
+    }
331
+    if( length( distributionPrior) > 1 | distributionPrior <=0  ) {
332
+        stop("'beta' should be a single positive value.") 
333
+    }
334 334
 }
335 335
 
336
+
336 337
 # Make sure provided rmat is the right type
337 338
 checkCounts.decon = function(counts) {
338
-  if ( sum(is.na(counts)) >0   ) {
339
-    stop("Missing value in 'counts' matrix.") 
340
-  }
339
+    if ( sum(is.na(counts)) >0   ) {
340
+        stop("Missing value in 'counts' matrix.") 
341
+    }
341 342
 }
342 343
 
344
+
343 345
 # Make sure provided cell labels are the right type
344
-processCellLabels = function(z,  num.cells) {
345
-  if ( length(z) != num.cells )  {
346
-    stop("'z' must be of the same length as the number of cells in the 'counts' matrix.") 
347
-  }
348
-  if( length(unique(z)) <2 ) {
349
-    stop("'z' must have at least 2 different values.") # Even though everything runs smoothly when length(unique(z)) == 1, result is not trustful  
350
-  }
351
-  if( !is.factor(z) ) {
352
-    z = plyr::mapvalues(z, unique(z), 1:length(unique(z)) )
353
-    z = as.factor(z) 
354
-  }
355
-  return(z)
346
+processCellLabels = function(z, num.cells) {
347
+    if ( length(z) != num.cells )  {
348
+        stop("'z' must be of the same length as the number of cells in the 'counts' matrix.") 
349
+    }
350
+    if( length(unique(z)) <2 ) {
351
+        stop("'z' must have at least 2 different values.") # Even though everything runs smoothly when length(unique(z)) == 1, result is not trustful  
352
+    }
353
+    if( !is.factor(z) ) {
354
+        z = plyr::mapvalues(z, unique(z), 1:length(unique(z)) )
355
+        z = as.factor(z) 
356
+    }
357
+    return(z)
356 358
 }
357 359
 
360
+
358 361
 # Add two (veried-length) vectors of logLikelihood 
359
-addLogLikelihood = function( ll.a,  ll.b   ) { 
360
-  length.a = length(ll.a ) 
361
-  length.b = length(ll.b) 
362
-
363
-  if( length.a >= length.b  )  { 
364
-      ll.b = c( ll.b, rep( ll.b[length.b] , length.a - length.b )  ) 
365
-      ll = ll.a + ll.b 
366
-  } else { 
367
-      ll.a = c( ll.a, rep( ll.a[length.a], length.b - length.a ) ) 
368
-      ll = ll.a + ll.b
369
-  } 
362
+addLogLikelihood = function( ll.a, ll.b   ) { 
363
+    length.a = length(ll.a ) 
364
+    length.b = length(ll.b) 
365
+
366
+    if( length.a >= length.b  )  { 
367
+        ll.b = c( ll.b, rep( ll.b[length.b] , length.a - length.b )  ) 
368
+        ll = ll.a + ll.b 
369
+    } else { 
370
+        ll.a = c( ll.a, rep( ll.a[length.a], length.b - length.a ) ) 
371
+        ll = ll.a + ll.b
372
+    } 
370 373
   
371
-  return( ll ) 
374
+    return( ll ) 
372 375
 }
373 376
new file mode 100644
... ...
@@ -0,0 +1,56 @@
1
+#' @title Gene set enrichment
2
+#' @description Identify and return significantly-enriched terms for each gene
3
+#'  module in a Celda object. Performs gene set enrichment analysis for Celda
4
+#'  identified modules using the enrichR package.
5
+#' @author Ahmed Youssef
6
+#' @param counts Integer count matrix. Rows represent genes and columns
7
+#'  represent cells. Row names of the matrix should be gene names.
8
+#' @param celdaModel Celda object of class `celda_G` or `celda_CG`.
9
+#' @param databases Character vector. Name of reference database. Available
10
+#'  databases can be viewed by running \code{enrichR::listEnrichrDbs()}.
11
+#' @param fdr False discovery rate (FDR). Numeric. Cutoff value for adjusted
12
+#'  p-value, terms with FDR below this value are considered significantly
13
+#'  enriched.
14
+#' @return List of length 'L' where each member contains the significantly
15
+#'  enriched terms for the corresponding module.
16
+#' @examples
17
+#' library(M3DExampleData)
18
+#' counts <- M3DExampleData::Mmus_example_list$data
19
+#' #subset 100 genes for fast clustering
20
+#' counts <- counts[sample(seq_len(nrow(counts)),
21
+#'     size = 100),]
22
+#' #cluster genes into 10 modules for quick demo
23
+#' cm <- celda_G(counts = as.matrix(counts), L = 10, verbose = FALSE)
24
+#' gse <- geneSetEnrich(counts,
25
+#'     cm,
26
+#'     databases = c('GO_Biological_Process_2018','GO_Molecular_Function_2018'))
27
+#' @export
28
+geneSetEnrich <- function(counts, celdaModel, databases, fdr = 0.05) {
29
+    #check for correct celda object
30
+    if (!(class(celdaModel) %in% c("celda_G", "celda_CG"))) {
31
+        stop("No gene modules in celda object. ",
32
+            "Please provide object of class celda_G or celda_CG.")
33
+    }
34
+
35
+    #initialize list with one entry for each gene module
36
+    modules <- vector("list", length = celdaModel@params$L)
37
+
38
+    #create dataframe with gene-module associations
39
+    genes <- data.frame(gene = rownames(counts), module = celdaModel@clusters$y)
40
+
41
+    #iterate over each module, get genes in that module, add to list
42
+    for (i in seq_len(celdaModel@params$L)) {
43
+        modules[[i]] <- as.character(genes[genes$module == i, 'gene'])
44
+    }
45
+
46
+    #enrichment analysis
47
+    enrichment <- lapply(modules, function(module) {
48
+        invisible(capture.output(table <- enrichR::enrichr(genes = module,
49
+            databases = databases)))
50
+        table <- Reduce(f = rbind, x = table)
51
+        table[table$Adjusted.P.value < fdr, 'Term']
52
+    })
53
+
54
+    #return results as a list
55
+    return(enrichment)
56
+}
... ...
@@ -195,9 +195,6 @@ plotDimReduceCluster = function(dim1, dim2, cluster, size = 1, xlab = "Dimension
195 195
 }
196 196
 
197 197
 
198
-
199
-
200
-
201 198
 # Run the t-SNE algorithm for dimensionality reduction
202 199
 # 
203 200
 # @param norm Normalized count matrix.
... ...
@@ -216,3 +213,12 @@ calculateTsne = function(norm, perplexity=20, max.iter=2500, seed=12345, do.pca=
216 213
   return(res)                     
217 214
 }
218 215
 
216
+
217
+# Run the umap algorithm for dimensionality reduction
218
+# 
219
+# @param norm Normalized count matrix.
220
+# @param umap.config An object of class umap.config, containing configuration parameters to be passed to umap. Default umap::umap.defualts.
221
+calculateUmap = function(norm, umap.config=umap::umap.defaults) {
222
+  return(umap::umap(norm, umap.config)$layout)
223
+}
224
+
... ...
@@ -15,11 +15,11 @@
15 15
 To install the most recent release of celda (used in the preprint version of the celda paper) via devtools:
16 16
 ```
17 17
 library(devtools)
18
-install_github("compbiomed/celda@v0.6")
18
+install_github("campbio/celda@v0.6")
19 19
 ```
20 20
 The most up-to-date (but potentially less stable) version of celda can similarly be installed with:
21 21
 ```
22
-install_github("compbiomed/celda")
22
+install_github("campbio/celda@devel")
23 23
 ```
24 24
 
25 25
 **NOTE** On OSX, devtools::install_github() requires installation of **libgit2.** This can be installed via homebrew:
... ...
@@ -37,11 +37,34 @@ An analysis example using celda with RNASeq via vignette('celda-analysis')
37 37
 ### Decontamination with DecontX
38 38
 Highly expressed genes from various cells clusters will be expressed at low levels in other clusters in droplet-based systems due to contamination. DecontX will decompose an observed count matrix into a decontaminated expression matrix and a contamination matrix. The only other parameter needed is a vector of cell cluster labels. 
39 39
 
40
+To simulate two 300 (gene) x 100 (cell) count matrices from 3 different cell types with total reads per cell ranged from 5000 to 40000: one matrix being ture expression matrix (rmat), the other matrix being contamination count matrix (cmat)
40 41
 ```
41
-new.counts = DecontX( counts = counts, z = cell.label) 
42
+sim.con = simulateContaminatedMatrix( C = 100, G = 300, K = 3, N.Range= c(5000, 40000), seed = 9124) 
43
+true.contamination.percentage = colSums( sim.con$cmat ) / colSums( sim.con$cmat + sim.con$rmat ) 
44
+str(sim.con)   
45
+# rmat: simulated true expression (gene by cell) count matrix
46
+# cmat: simulated contamination (gene by cell) count matrix 
47
+# N.by.C: total transcripts per cell 
48
+# z: cell type label 
49
+
50
+```
51
+Use DecontX to decompose the observed (contaminated) count matrix back into true expression matrix and a contamination matrix with specified cell label
52
+```
53
+observed.mat = sim.con$rmat + sim.con$cmat 
54
+cell.label = sim.con$z
55
+new.counts = DecontX( counts = observed.mat, z = cell.label,  max.iter = 200, seed = 123) 
56
+str(new.counts) 
42 57
 # Decontaminated matrix: new.counts$res.list$est.rmat
43 58
 # Percentage of contamination per cell: new.counts$res.list$est.conp
59
+
44 60
 ```
61
+DecontX Performance check 
62
+```
63
+estimated.contamination.percentage = new.counts$res.list$est.conp
64
+plot( true.contamination.percentage, estimated.contamination.percentage) ; abline(0,1) 
65
+``` 
66
+
67
+
45 68
 
46 69
 ## New Features and announcements
47 70
 The v0.4 release of celda represents a useable implementation of the various celda clustering models.
... ...
@@ -27,6 +27,9 @@ DecontX(counts, z = NULL, batch = NULL, max.iter = 200,
27 27
 
28 28
 \item{seed}{Integer. Passed to set.seed(). Default to be 1234567. If NULL, no calls to `set.seed()` are made.}
29 29
 }
30
+\value{
31
+A list object which contains the decontaminated count matrix and related parameters
32
+}
30 33
 \description{
31 34
 This function updates decontamination on dataset with multiple batches
32 35
 }
... ...
@@ -11,7 +11,21 @@ celdaTsne(counts, celda.mod, max.cells = 25000, min.cluster.size = 100,
11 11
 \arguments{
12 12
 \item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.}
13 13
 
14
-\item{celda.mod}{Celda model. Options available in `celda::available.models`.}
14
+\item{celda.mod}{Celda object of class `celda_CG`.}
15
+
16
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.}
17
+
18
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.}
19
+
20
+\item{modules}{Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.}
21
+
22
+\item{perplexity}{Numeric. Perplexity parameter for tSNE. Default 20.}
23
+
24
+\item{max.iter}{Integer. Maximum number of iterations in tSNE generation. Default 2500.}
25
+
26
+\item{seed}{Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.}
27
+
28
+\item{...}{Additional parameters.}
15 29
 
16 30
 \item{...}{Additional parameters.}
17 31
 }
18 32
new file mode 100755
... ...
@@ -0,0 +1,42 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/celda_C.R
3
+\docType{methods}
4
+\name{celdaUmap,celda_C-method}
5
+\alias{celdaUmap,celda_C-method}
6
+\title{umap for celda_C}
7
+\usage{
8
+\S4method{celdaUmap}{celda_C}(counts, celda.mod, max.cells = 25000,
9
+  min.cluster.size = 100, modules = NULL,
10
+  umap.config = umap::umap.defaults)
11
+}
12
+\arguments{
13
+\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.}
14
+
15
+\item{celda.mod}{Celda object of class `celda_C`.}
16
+
17
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.}
18
+
19
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.}
20
+
21
+\item{initial.dims}{Integer. PCA will be used to reduce the dimentionality of the dataset. The top 'initial.dims' principal components will be used for tSNE. Default 20.}
22
+
23
+\item{perplexity}{Numeric. Perplexity parameter for tSNE. Default 20.}
24
+
25
+\item{max.iter}{Integer. Maximum number of iterations in tSNE generation. Default 2500.}
26
+
27
+\item{seed}{Integer. Passed to `set.seed()`. Default 12345. If NULL, no calls to `set.seed()` are made.}
28
+
29
+\item{...}{Additional parameters.}
30
+}
31
+\value{
32
+A two column matrix of t-SNE coordinates
33
+}
34
+\description{
35
+Embeds cells in two dimensions using umap based on a `celda_C` model. PCA on the normalized counts is used to reduce the number of features before applying umap.
36
+}
37
+\examples{
38
+umap.res = celdaUmap(celda.C.sim$counts, celda.C.mod)
39
+}
40
+\seealso{
41
+`celda_C()` for clustering cells and `celdaHeatmap()` for displaying expression
42
+}
0 43
new file mode 100755
... ...
@@ -0,0 +1,36 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/celda_CG.R
3
+\docType{methods}
4
+\name{celdaUmap,celda_CG-method}
5
+\alias{celdaUmap,celda_CG-method}
6
+\title{umap for celda_CG}
7
+\usage{
8
+\S4method{celdaUmap}{celda_CG}(counts, celda.mod, max.cells = 25000,
9
+  min.cluster.size = 100, modules = NULL,
10
+  umap.config = umap::umap.defaults)
11
+}
12
+\arguments{
13
+\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.}
14
+
15
+\item{celda.mod}{Celda object of class `celda_CG`.}
16
+
17
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.}
18
+
19
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.}
20
+
21
+\item{modules}{Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.}
22
+
23
+\item{umap.config}{Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults`}
24
+}
25
+\value{
26
+A two column matrix of t-SNE coordinates
27
+}
28
+\description{
29
+Embeds cells in two dimensions using umap based on a `celda_CG` model. umap is run on module probabilities to reduce the number of features instead of using PCA. Module probabilities square-root trasformed before applying tSNE.
30
+}
31
+\examples{
32
+umap.res = celdaUmap(celda.CG.sim$counts, celda.CG.mod)
33
+}
34
+\seealso{
35
+`celda_CG()` for clustering features and cells  and `celdaHeatmap()` for displaying expression
36
+}
0 37
new file mode 100755
... ...
@@ -0,0 +1,36 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/celda_G.R
3
+\docType{methods}
4
+\name{celdaUmap,celda_G-method}
5
+\alias{celdaUmap,celda_G-method}
6
+\title{umap for celda_G}
7
+\usage{
8
+\S4method{celdaUmap}{celda_G}(counts, celda.mod, max.cells = 25000,
9
+  min.cluster.size = 100, modules = NULL,
10
+  umap.config = umap::umap.defaults)
11
+}
12
+\arguments{
13
+\item{counts}{Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`.}
14
+
15
+\item{celda.mod}{Celda object of class `celda_CG`.}
16
+
17
+\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.}
18
+
19
+\item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.}