Browse code

Moved new Nonezero cpp function to different file. Fixed celda_heatmap bug causing vignette building to fail. Renamed vignettes. Started new draft of decontX vignette.

Joshua D. Campbell authored on 13/03/2020 00:31:44
Showing 22 changed files

... ...
@@ -4,3 +4,5 @@
4 4
 .travis.yml
5 5
 NOTICE
6 6
 _pkgdown.yml
7
+^doc$
8
+^Meta$
... ...
@@ -33,3 +33,5 @@ etc/*
33 33
 # Celda log files with default prefix
34 34
 Celda_chain.*log.txt
35 35
 inst/doc
36
+doc
37
+Meta
... ...
@@ -17,14 +17,6 @@ calculateNativeMatrix <- function(counts, native_counts, theta, eta, phi, z, row
17 17
     .Call('_celda_calculateNativeMatrix', PACKAGE = 'celda', counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount)
18 18
 }
19 19
 
20
-#' get row and column indices of none zero elements in the matrix
21
-#' 
22
-#' @param R_counts A matrix
23
-#' @return An integer matrix where each row is a row, column indices pair 
24
-nonzero <- function(R_counts) {
25
-    .Call('_celda_nonzero', PACKAGE = 'celda', R_counts)
26
-}
27
-
28 20
 cG_calcGibbsProbY_Simple <- function(counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) {
29 21
     .Call('_celda_cG_calcGibbsProbY_Simple', PACKAGE = 'celda', counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta)
30 22
 }
... ...
@@ -77,3 +69,11 @@ fastNormPropSqrt <- function(R_counts, R_alpha) {
77 69
     .Call('_celda_fastNormPropSqrt', PACKAGE = 'celda', R_counts, R_alpha)
78 70
 }
79 71
 
72
+#' get row and column indices of none zero elements in the matrix
73
+#' 
74
+#' @param R_counts A matrix
75
+#' @return An integer matrix where each row is a row, column indices pair 
76
+nonzero <- function(R_counts) {
77
+    .Call('_celda_nonzero', PACKAGE = 'celda', R_counts)
78
+}
79
+
... ...
@@ -162,9 +162,11 @@ plotHeatmap <- function(counts,
162 162
     ## Select subsets of features/cells
163 163
     if (!is.null(featureIx)) {
164 164
         counts <- counts[featureIx, , drop = FALSE]
165
-        if (length(annotationFeature) > 1 ||
166
-                (length(annotationFeature) == 1 & !is.na(annotationFeature))) {
165
+        if (!is.null(annotationFeature) & length(annotationFeature) > 0) {
166
+          if(length(annotationFeature) == 1 & !is.na(annotationFeature) | 
167
+             length(annotationFeature) > 1) {
167 168
             annotationFeature <- annotationFeature[featureIx, , drop = FALSE]
169
+          }  
168 170
         }
169 171
         if (!is.null(y)) {
170 172
             y <- y[featureIx]
... ...
@@ -173,9 +175,11 @@ plotHeatmap <- function(counts,
173 175
 
174 176
     if (!is.null(cellIx)) {
175 177
         counts <- counts[, cellIx, drop = FALSE]
176
-        if (length(annotationCell) > 1 ||
177
-                (length(annotationCell) == 1 & !is.na(annotationCell))) {
178
+        if (!is.null(annotationCell) & length(annotationCell) > 0) {
179
+          if(length(annotationCell) == 1 & !is.na(annotationCell) | 
180
+             length(annotationFeature) > 1) {
178 181
             annotationCell <- annotationCell[cellIx, , drop = FALSE]
182
+          }  
179 183
         }
180 184
         if (!is.null(z)) {
181 185
             z <- z[cellIx]
... ...
@@ -62,6 +62,7 @@
62 62
 #'  }
63 63
 #' }
64 64
 #' @examples
65
+#' \dontrun{
65 66
 #' library(M3DExampleData)
66 67
 #' counts <- M3DExampleData::Mmus_example_list$data
67 68
 #' # subset 100 genes for fast clustering
... ...
@@ -81,6 +82,7 @@
81 82
 #'
82 83
 #' # Plot dendrogram
83 84
 #' plotDendro(DecTree)
85
+#' }
84 86
 #' @import magrittr
85 87
 #' @importFrom methods hasArg
86 88
 #' @export
... ...
@@ -7,6 +7,7 @@
7 7
 #' @param features A L(features) by N(samples) numeric matrix.
8 8
 #' @return A character vector of label predicitions.
9 9
 #' @examples
10
+#' \dontrun{
10 11
 #' library(M3DExampleData)
11 12
 #' counts <- M3DExampleData::Mmus_example_list$data
12 13
 #' # Subset 500 genes for fast clustering
... ...
@@ -26,6 +27,7 @@
26 27
 #'
27 28
 #' # Get sample estimates in training data
28 29
 #' getDecisions(DecTree$rules, features)
30
+#' }
29 31
 #' @export
30 32
 getDecisions <- function(rules, features) {
31 33
     features <- t(features)
... ...
@@ -12,6 +12,7 @@
12 12
 #' @param boxSize A numeric value. Size of rule labels. Default is 7.
13 13
 #' @param boxColor A character value. Color of rule labels. Default is `black`.
14 14
 #' @examples
15
+#' \dontrun{
15 16
 #' library(M3DExampleData)
16 17
 #' counts <- M3DExampleData::Mmus_example_list$data
17 18
 #' # Subset 500 genes for fast clustering
... ...
@@ -31,6 +32,7 @@
31 32
 #'
32 33
 #' # Plot dendrogram
33 34
 #' plotDendro(decTree)
35
+#' }
34 36
 #' @return A ggplot2 object
35 37
 #' @import ggplot2
36 38
 #' @importFrom ggdendro dendro_data ggdendrogram
... ...
@@ -145,7 +145,7 @@ decontXMarkerPlot <- function(counts, z, geneMarkers, threshold = 1, color = "re
145 145
 
146 146
 # geneMarkers should be a dataframe,  w/ 2 column names being `cellType` and `geneMarkers`
147 147
 # convert both `cellType` and `geneMarkers` are factors with levels being integer
148
-.geneMarkerProcess <- function(geneMarkers, orders = None) {
148
+.geneMarkerProcess <- function(geneMarkers, orders = NULL) {
149 149
   geneMarkers[, "cellName"] <- geneMarkers[, "cellType"]
150 150
   geneMarkers[, "cellType"] <- factor(geneMarkers[, "cellType"])
151 151
   levels(geneMarkers[, "cellType"]) <- 1:length(levels(geneMarkers[, "cellType"]))
152 152
deleted file mode 100644
153 153
Binary files a/celdaTsne.ori.rds and /dev/null differ
154 154
deleted file mode 100644
155 155
Binary files a/celda_res_vignettes.rds and /dev/null differ
156 156
deleted file mode 100644
157 157
Binary files a/featureViolin.png and /dev/null differ
158 158
deleted file mode 100644
159 159
Binary files a/pltMarker.png and /dev/null differ
160 160
deleted file mode 100644
161 161
Binary files a/pltTsne.ori.png and /dev/null differ
162 162
deleted file mode 100644
163 163
Binary files a/pltViolin.ori.png and /dev/null differ
164 164
deleted file mode 100644
... ...
@@ -1,41 +0,0 @@
1
-// [[Rcpp::depends(RcppEigen)]]
2
-#include <Rcpp.h>
3
-using namespace Rcpp;
4
-
5
-//' get row and column indices of none zero elements in the matrix
6
-//' 
7
-//' @param R_counts A matrix
8
-//' @return An integer matrix where each row is a row, column indices pair 
9
-// [[Rcpp::export]]
10
-SEXP nonzero(NumericMatrix R_counts) {
11
-
12
-    IntegerVector row(1);
13
-		IntegerVector col(1);
14
-		NumericVector val(1);
15
-
16
-    int nR = R_counts.nrow();
17
-    int nC = R_counts.ncol();
18
-		double x;
19
-
20
-    for (int c = 0; c < nC; c++) {
21
-        for (int r = 0; r < nR; r++) {
22
-            x = R_counts[c * nR + r];
23
-            if (x != 0) {
24
-                row.push_back(r + 1);
25
-                col.push_back(c + 1);
26
-                val.push_back(x);
27
-            }
28
-        }
29
-    }
30
-
31
-		row.erase(0);
32
-		col.erase(0);
33
-		val.erase(0);
34
-
35
-    List res;
36
-		res["row"] = row;
37
-		res["col"] = col;
38
-		res["val"] = val;
39
-
40
-		return(res);
41
-}
... ...
@@ -73,17 +73,6 @@ BEGIN_RCPP
73 73
     return rcpp_result_gen;
74 74
 END_RCPP
75 75
 }
76
-// nonzero
77
-SEXP nonzero(NumericMatrix R_counts);
78
-RcppExport SEXP _celda_nonzero(SEXP R_countsSEXP) {
79
-BEGIN_RCPP
80
-    Rcpp::RObject rcpp_result_gen;
81
-    Rcpp::RNGScope rcpp_rngScope_gen;
82
-    Rcpp::traits::input_parameter< NumericMatrix >::type R_counts(R_countsSEXP);
83
-    rcpp_result_gen = Rcpp::wrap(nonzero(R_counts));
84
-    return rcpp_result_gen;
85
-END_RCPP
86
-}
87 76
 // cG_calcGibbsProbY_Simple
88 77
 NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, IntegerVector nGbyTS, IntegerMatrix nTSbyC, IntegerVector nbyTS, IntegerVector nbyG, const IntegerVector y, const int L, const int index, const double gamma, const double beta, const double delta);
89 78
 RcppExport SEXP _celda_cG_calcGibbsProbY_Simple(SEXP countsSEXP, SEXP nGbyTSSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP indexSEXP, SEXP gammaSEXP, SEXP betaSEXP, SEXP deltaSEXP) {
... ...
@@ -222,6 +211,17 @@ BEGIN_RCPP
222 211
     return rcpp_result_gen;
223 212
 END_RCPP
224 213
 }
214
+// nonzero
215
+SEXP nonzero(NumericMatrix R_counts);
216
+RcppExport SEXP _celda_nonzero(SEXP R_countsSEXP) {
217
+BEGIN_RCPP
218
+    Rcpp::RObject rcpp_result_gen;
219
+    Rcpp::RNGScope rcpp_rngScope_gen;
220
+    Rcpp::traits::input_parameter< NumericMatrix >::type R_counts(R_countsSEXP);
221
+    rcpp_result_gen = Rcpp::wrap(nonzero(R_counts));
222
+    return rcpp_result_gen;
223
+END_RCPP
224
+}
225 225
 
226 226
 RcppExport SEXP _colSumByGroup(SEXP, SEXP);
227 227
 RcppExport SEXP _colSumByGroup_numeric(SEXP, SEXP);
... ...
@@ -236,7 +236,6 @@ static const R_CallMethodDef CallEntries[] = {
236 236
     {"_celda_decontXLogLik", (DL_FUNC) &_celda_decontXLogLik, 6},
237 237
     {"_celda_decontXInitialize", (DL_FUNC) &_celda_decontXInitialize, 4},
238 238
     {"_celda_calculateNativeMatrix", (DL_FUNC) &_celda_calculateNativeMatrix, 9},
239
-    {"_celda_nonzero", (DL_FUNC) &_celda_nonzero, 1},
240 239
     {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11},
241 240
     {"_celda_cG_CalcGibbsProbY_ori", (DL_FUNC) &_celda_cG_CalcGibbsProbY_ori, 13},
242 241
     {"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13},
... ...
@@ -245,6 +244,7 @@ static const R_CallMethodDef CallEntries[] = {
245 244
     {"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2},
246 245
     {"_celda_fastNormPropLog", (DL_FUNC) &_celda_fastNormPropLog, 2},
247 246
     {"_celda_fastNormPropSqrt", (DL_FUNC) &_celda_fastNormPropSqrt, 2},
247
+    {"_celda_nonzero", (DL_FUNC) &_celda_nonzero, 1},
248 248
     {"_colSumByGroup",         (DL_FUNC) &_colSumByGroup,         2},
249 249
     {"_colSumByGroup_numeric", (DL_FUNC) &_colSumByGroup_numeric, 2},
250 250
     {"_colSumByGroupChange",   (DL_FUNC) &_colSumByGroupChange,   4},
... ...
@@ -84,3 +84,47 @@ SEXP fastNormPropSqrt(NumericMatrix R_counts, double R_alpha) {
84 84
   return Rcpp::wrap(res);
85 85
 }
86 86
 
87
+
88
+
89
+// [[Rcpp::depends(RcppEigen)]]
90
+#include <Rcpp.h>
91
+using namespace Rcpp ;
92
+
93
+//' get row and column indices of none zero elements in the matrix
94
+//' 
95
+//' @param R_counts A matrix
96
+//' @return An integer matrix where each row is a row, column indices pair 
97
+// [[Rcpp::export]]
98
+SEXP nonzero(NumericMatrix R_counts) {
99
+  
100
+  IntegerVector row(1);
101
+  IntegerVector col(1);
102
+  NumericVector val(1);
103
+  
104
+  int nR = R_counts.nrow();
105
+  int nC = R_counts.ncol();
106
+  double x;
107
+  
108
+  for (int c = 0; c < nC; c++) {
109
+    for (int r = 0; r < nR; r++) {
110
+      x = R_counts[c * nR + r];
111
+      if (x != 0) {
112
+        row.push_back(r + 1);
113
+        col.push_back(c + 1);
114
+        val.push_back(x);
115
+      }
116
+    }
117
+  }
118
+  
119
+  row.erase(0);
120
+  col.erase(0);
121
+  val.erase(0);
122
+  
123
+  List res;
124
+  res["row"] = row;
125
+  res["col"] = col;
126
+  res["val"] = val;
127
+  
128
+  return(res);
129
+}
130
+
87 131
deleted file mode 100644
... ...
@@ -1,154 +0,0 @@
1
-title: "Decontamination of ambient RNA in single-cell RNA-seq with DecontX on 10X PBMC4k data" 
2
-author: "Shiyi Yang, Sean Corbett, Yusuke Koga, Zhe Wang, W. Evan Johnson, Masanao Yajima, Joshua D. Campbell"
3
-date: "`r Sys.Date()`"
4
-output:
5
-  BiocStyle::html_document:
6
-    toc: true
7
-vignette: >
8
-  %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX}
9
-  %\VignetteEngine{knitr::rmarkdown}
10
-  %\VignetteEncoding{UTF-8}
11
-
12
-
13
-```{r, eval=TRUE, message=FALSE}
14
-library(celda)
15
-```
16
-
17
-DecontX can take either `SingleCellExperiment` object from [SingleCellExperiment package] (https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) or a count matrix in the format of rows being genesand columns being cells. When using SingleCellExperiment object, DecontX is using the counts matrix extracted from the SingleCellExperiment object. 
18
-
19
-# TODO: Load PBMC4k data from 10X CellRanger output
20
-# Load PBMC4k data from 10X
21
-```{r, eval=TRUE, message=FALSE}
22
-library(TENxPBMCData)  # selected single cell RNAseq datasets from 10X Genomics
23
-pbmc <- TENxPBMCData("pbmc4k")
24
-colnames(pbmc) <- paste(pbmc$Sample, pbmc$Barcode, sep="_") # Rename cells
25
-```
26
-
27
-Take a look at the count matrix in the `pbmc` singleCellExperiment object via `assay` function from [SummarizedExperiment] (https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html). There are total 33694 genes and 4340 cells in this PBMC data.
28
-```{r}
29
-library(SummarizedExperiment)
30
-pbmc_counts = SummarizedExperiment::assay(pbmc, i = "counts")
31
-print(pbmc_counts)
32
-```
33
-
34
-Among the 33694 genes, many of then are empty genes (i.e., not expressed by any cell), and many of the genes are likely to be noise due to extremely low expression among these 4340 cells.
35
-We filter out genes that are likely to be noise. We normally keep genes that have expressed at least 3 counts in at least 3 cells.
36
-After filtering, we have 4529 genes left.
37
-```{r}
38
-#pbmc_filterg = pbmc[rowSums(pbmc_counts > 2) > 2, ] 
39
-pbmc_filterg = pbmc[rowSums(pbmc_counts >= 3) >= 3, ] # Filtering can be applied directly on SingleCellExperiment object 
40
-print(pbmc_filterg)
41
-```
42
-
43
-
44
-
45
-# Estimate and remove contaminated counts using DecontX (2 scenarios -- with and without cell types specified)
46
-## Use DecontX for contamination estimation when there is cell type specified for each individual cells (Here we use celda_CG -- a Bayesian bi-clustering method on single cell RNAseq data -- to get cell clusters as well as gene modules, which enhances cell clustering. You can substitute `celda_CG` with any clustering method you have faith in. DecontX takes the expression count matrix (gene by cell count matrix) and a vector of the same length as number of cells in the count matrix specifying cell cluster. 
47
-```{r}
48
-pbmc_filterg_counts = SummarizedExperiment::assay(pbmc_filterg, "counts")
49
-pbmc_filterg_counts = as.matrix(pbmc_filterg_counts) # Change obejct type as basic matrix to pass into celda::celda_CG
50
-celda_res = celda_CG(counts = pbmc_filterg_counts, K = 19, L = 150) # specify number of cell clusters being K=19, number of gnee modules being L=150
51
-```
52
-P.s: we have analyzed this 4K PBMC data before, so we don't bother with the parameters choice for the bi-clustering method `celda_CG` here. For a detailed toturial of usage on `celda`, it is availabel [here] (celda-vignettes url) 
53
-
54
-
55
-
56
-Pass expression (gene by cell) count matrix and the cluster labels from `celda_CG` to decontX, it will return estimated results. Corresponding to the 2 types of input, output result is either in a `list`, or a SingleCellExperiment object with the result list added back to its `metadata` slot (?? Am I understanding the structure right? Is metadata a "slot" of the sce-object??). When `decontX` takes a SingleCellExperiment object, we can use function `metadata` to extract the result list and use `str` function to look at the 4 attributes ("runParams", "estimates", "contamination", "z") in it.
57
-```{r}
58
-cell_cluster = celda_res@clusters$z 
59
-decontx_res_sce = decontX( x = pbmc_filterg, z = cell_cluster) 
60
-print(decontx_res_sce)
61
-#decontx_res = decontX( x = pbmc_filterg_counts, z = cell_cluster) ## This works too. Difference is that the output is a list rathen than being a SingleCellExperiment object
62
-decontx_res = metadata(decontx_res_sce)$decontX
63
-str(decontx_res)
64
-```
65
-
66
-
67
-## Interpretation of DecontX results
68
-There are 4 attributes of DecontX results: 
69
-(1) runParams: parameters used for estimating contamination
70
-(2) estimates: parameters, decontaminated counts as well as other intermediate results estimated during contamination estimation
71
-(3) contamination: cell level contamination
72
-(4) z: cell cluster label 
73
-
74
-
75
-# Check DecontX decontaminated count to see if gene expressions are cleaner by looking at marker genes' expression
76
-## First we have to identify cell types, which is one step further from cell clustering, as we do not really know which cluster is corresponding to what cell type and multiple cell clusters can also be from the same cell type. We are looking at the 4 major cell types (T-cell, B-cell, NK-cell and monocytes) in this PBMC data.
77
-Combining both the clustering results from `celda_CG`, visualization on a reduced 2-dimention space using Tsne (T-stochastic Neighbor Embedding), and marker genes, we relate cell clusters (more specifically the cells in them) to cell types. The marker genes used to identify these 4 major cell types are CD3D (T-cell marker), MS4A1 and CD79A (B-cell markers), GNLY (NK-cell marker), CD14 or FCGR3A (monocytes markers). There are also other cell types (such as dendritic cells, plasma cells, etc.), we don't include these cell types in our demonstration here and we point you to [here] (celda-vignettes url on PBMC4k data) for cell type analysis using `celda`.
78
-```{r}
79
-celdaTsne.ori = celdaTsne( counts = pbmc_filterg_counts , celdaMod = celda_res ) 
80
-pltTsne.ori = plotDimReduceCluster( dim1 = celdaTsne.ori[,1], dim2 = celdaTsne.ori[, 2], cluster = cell_cluster, labelClusters = T)
81
-print(pltTsne.ori)
82
-```
83
-
84
-We use violin plot to look at the marker gene expression levels in each cluster identified before by `celda_CG`. Those gene names are stored in the "rowData" of the singleCellExperiment object. On the other hand, the rownames of the expression count matrix is using gene ID. We have to match marker genes' name to the gene ID to locate them in the expression count matrix.
85
-```{r}
86
-markerGenes = do.call(what = rbind, args = list(data.frame("celltype" = "Tcell", "markergenes"=c("CD3D" )), 
87
-		                                data.frame("celltype" = "Bcell", "markergenes"=c("MS4A1", "CD79A")),
88
-			                        data.frame("celltype" = "NKcell", "markergenes"=c("GNLY")),
89
-			                        data.frame("celltype" = "monocytes", "markergenes"=c("CD14", "FCGR3A")),
90
-						data.frame("celltype" = "megakaryocytes", "markergenes"=c("PPBP"))))
91
-print(markerGenes)
92
-head(rowData(pbmc_filterg))
93
-markerGenes_wID = merge( x = markerGenes, y = rowData(pbmc_filterg), by.x = "markergenes", by.y = "Symbol")
94
-markerGenes_wID[, "feature"] = paste( markerGenes_wID[, "ENSEMBL_ID"], markerGenes_wID[, "markergenes"], sep = "_")
95
-print(markerGenes_wID)
96
-
97
-pbmc_filterg_counts_rename = pbmc_filterg_counts
98
-rownames(pbmc_filterg_counts_rename) = paste(rownames(pbmc_filterg), rowData(pbmc_filterg)[, "Symbol"], sep = "_")
99
-pltViolin.ori = violinPlot(counts = pbmc_filterg_counts_rename, celdaMod=celda_res, features = markerGenes_wID[, "feature"])
100
-print(pltViolin.ori)
101
-```
102
-
103
-The violin plots of the marker genes' expression indicate that cluster 12, 13 are B-cells, cluster 7, 14, 15, 16, 18, 19 are T-cells, cluster 11 is NK-cells, cluster 2, 4, 5 are monocytes. For monocytes, we did not include cluster 3 as it is more likely to be megakaryocytes (PPBP+) or cluster 6 as it is likely to be doublets group (CD14+ and CD3D).
104
-```{r}
105
-cellType = list("Bcell"=c(12,13), "Tcell"=c(7, 14, 15, 16, 18, 19), "NKcell"=c(11), "monocytes"=c(2, 4, 5))
106
-print(cellType)
107
-```
108
-
109
-Now that we have identified the 4 major cell types, we can go ahead to see how DecontX makes the genes' expression clearer by comparing the decontaminated counts with the orinal counts. As we now look at properties at cell-type level than cell cluster level, again we will convert cell cluster labels into cell types. P.s: except for the aforementioned major 4 cell types, all other cell types in this PBMC data are indicated as "others" in this tutorial.
110
-```{r}
111
-# This function is to convert cluster label into cell-type label
112
-match.z.celltype = function( z, cellType.list   )  { 
113
-    z.cell = z
114
-    for ( i in 1:length(cellType.list) )  { 
115
-        z.cell[ z %in% cellType.list[[i]]  ] =  names( cellType.list[i] ) 
116
-    }
117
-    return( z.cell ) 
118
-}
119
-
120
-cellType_list = cellType
121
-cellType_list[["others"]] = c(1:19)[-unlist(cellType)]
122
-print(cellType_list)
123
-
124
-cell_type = match.z.celltype(z = cell_cluster, cellType.list = cellType_list)
125
-print(table(cell_cluster, cell_type))
126
-```
127
-
128
-We look at the gene expressions for these 4 major cell types by comparing the original expression count and the decontaminated counts by DecontX. Since dropout events are prevalent in single cell RNAseq data, we include more genes from now to show clearer cell-type expressions.
129
-```{r}
130
-ct_markergenes = do.call(what = rbind, args = list(data.frame("celltype" = "Tcell", "markergenes"=c("CD3D", "CD3E")),
131
-																									data.frame("celltype" = "Bcell", "markergenes"=c("MS4A1", "CD79A", "CD79B")),
132
-																									data.frame("celltype" = "NKcell", "markergenes"=c("GNLY")),
133
-																									data.frame("celltype" = "monocytes", "markergenes"=c("LYZ", "S100A8", "S100A9"))))
134
-ct_markergenes_wID = merge( x = ct_markergenes, y = rowData(pbmc_filterg), by.x = "markergenes", by.y = "Symbol")
135
-ct_markergenes_wID[, "feature"] = paste( ct_markergenes_wID[, "ENSEMBL_ID"], ct_markergenes_wID[, "markergenes"], sep = "_")
136
-print(head(ct_markergenes_wID))
137
-
138
-ct_markergenes = ct_markergenes_wID[, c("celltype", "feature")]
139
-colnames(ct_markergenes) = c("cellType", "geneMarkers")
140
-
141
-pltMarker = decontXMarkerPlot( counts = pbmc_filterg_counts_rename, z = cell_type, geneMarkers = ct_markergenes) 
142
-
143
-
144
-```
145
-
146
-
147
-## Use DecontX for contamination estimation when there is no cell type specified for the cells
148
-
149
-
150
-
151
-
152
-
153 0
deleted file mode 100644
... ...
@@ -1,99 +0,0 @@
1
-title: "Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX"
2
-author: "Shiyi Yang, Sean Corbett, Yusuke Koga, Zhe Wang, W. Evan Johnson, Masanao Yajima, Joshua D. Campbell"
3
-date: "`r Sys.Date()`"
4
-output:
5
-  BiocStyle::html_document:
6
-    toc: true
7
-vignette: >
8
-  %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX}
9
-  %\VignetteEngine{knitr::rmarkdown}
10
-  %\VignetteEncoding{UTF-8}
11
-
12
-# Introduction
13
-
14
-DecontX is a Bayesian hierarchical model to estimate and remove cross-contamination from ambient RNA in single-cell RNA-seq count data generated from droplet-based sequencing devices. DecontX will take the count matrix with/without the cell labels and estimate the contamination level and deliver a decontaminted count matrix for downstream analysis. 
15
-
16
-In this vignette we will demonstrate how to use DecontX to estimate and remove contamination.  
17
-
18
-# Installation
19
-
20
-celda can be installed from Bioconductor:
21
-
22
-```{r, eval= FALSE}
23
-if (!requireNamespace("BiocManager", quietly=TRUE)) {
24
-    install.packages("BiocManager")}
25
-BiocManager::install("celda")
26
-```
27
-
28
-The package can be loaded using the `library` command.
29
-
30
-```{r, eval=TRUE, message=FALSE} 
31
-library(celda)
32
-```
33
-
34
-To see the latest updates and releases or to post a bug, see our GitHub page at https://github.com/campbio/celda. To ask questions about running celda, post a thread on Bioconductor support site at https://support.bioconductor.org/.
35
-
36
-# Reproducibility note
37
-
38
-Many functions in *celda* make use of stochastic algorithms or procedures which require the use of random number generator (RNG) for simulation or sampling. To maintain reproducibility, all these functions use a **default seed of 12345** to make sure same results are generated each time one of these functions is called. Explicitly setting the `seed` arguments is needed for greater control and randomness.
39
-
40
-# Generation of a cross-contaminated dataset 
41
-DecontX will take a matrix of counts (referred as observed counts) where each row is a feature, each column is a cell, and each entry in the matrix is the number of counts of each feature in each cell. To illustrate the utility of DecontX, we will apply it to a simulated dataset.
42
-
43
-In the function `simulateContaminatedMatrix`, the K parameter designates the number of cell clusters, the C parameter determines the number of cells, the G parameter determines the number of genes in the simulated dataset.
44
-
45
-```{r}
46
-simCounts <- simulateContaminatedMatrix(G = 300, C = 100, K = 3)
47
-```
48
-
49
-The `nativeCounts` is the natively expressed counts matrix, and `observedCounts` is the observed counts matrix that contains both contaminated and natively expressed transctripts. The `NByC` is the total number of observed transcripts per cell. The counts matrix which only contains contamianted transcripts can be obtained by subtracting the observed counts matrix from the observed counts matrix. 
50
-
51
-```{r}
52
-contamination <- simCounts$observedCounts - simCounts$nativeCounts
53
-```
54
-
55
-The `z` variable contains the population label for each cell.
56
-
57
-```{r}
58
-table(simCounts$z)
59
-```
60
-
61
-The `phi` and `eta` variables contain the expression distributions and contamination distributions for each population, respectively. Each column corresponds to a population, each row represents a gene. The sum of the rows equal to 1.
62
-
63
-```{r}
64
-colSums(simCounts$phi)
65
-colSums(simCounts$eta)
66
-```
67
-
68
-
69
-# Decontamination using DecontX
70
-DecontX uses bayesian method to estimate and remove contamination via varitaional inference.
71
-
72
-```{r, warning = FALSE, message = FALSE}
73
-decontxModel <- decontX(x = simCounts$observedCounts, z = simCounts$z)
74
-```
75
-
76
-## Check convergence
77
-Use log-likelihood to check convergence
78
-
79
-```{r, eval = TRUE, fig.width = 5, fig.height = 5}
80
-plot(decontxModel$estimates$all_cells$logLikelihood)
81
-```
82
-
83
-## Evaluate model performance
84
-
85
-`decontX` estimates a contamination proportion for each cell. We compare the estimated contamination proportion with the real contamination proportion.
86
-
87
-```{r, eval = TRUE, fig.width = 5, fig.height = 5}
88
-plot(decontxModel$contamination,
89
-    colSums(contamination) / simCounts$NByC, col = simCounts$z)
90
-abline(0, 1)
91
-```
92
-
93
-# Session Information
94
-
95
-```{r}
96
-sessionInfo()
97
-```
98 0
similarity index 100%
99 1
rename from vignettes/FindMarkers-analysis.Rmd
100 2
rename to vignettes/FindMarkers.Rmd
101 3
similarity index 96%
102 4
rename from vignettes/celda-analysis.Rmd
103 5
rename to vignettes/celda.Rmd
... ...
@@ -1,6 +1,17 @@
1 1
 ---
2
-title: "Analysis of single-cell 'omic count data with celda"
3
-author: "Sean Corbett, Yusuke Koga, Shiyi Yang, Zhe Wang, Evan Johnson, Paola Sebastiani, Masanao Yajima, Joshua Campbell"
2
+title: "Analysis of single-cell genomic data with celda"
3
+author:
4
+- name: Sean Corbett
5
+  affiliation: &id Boston University School of Medicine
6
+- name: Yusuke Koga
7
+  affiliation: *id
8
+- name: Shiyi Yang
9
+  affiliation: *id
10
+- name: Zhe Wang
11
+  affiliation: *id
12
+- name: Joshua Campbell
13
+  affiliation: *id
14
+  email: camp@bu.edu
4 15
 date: "`r Sys.Date()`"
5 16
 output: 
6 17
   BiocStyle::html_document:
... ...
@@ -13,9 +24,9 @@ vignette: >
13 24
 
14 25
 # Introduction
15 26
 
16
-**CE**llular **L**atent **D**irichlet **A**llocation (celda) is a collection of Bayesian hierarchical models to perform feature and cell bi-clustering for count data generated by single-cell platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications and has shown good performance with sparse data. celda simultaneously clusters features (i.e. gene expression) into modules based on co-expression patterns across cells and cells into subpopulations based on the probabilities of the feature modules within each cell. celda uses Dirichlet-multinomial distributions to model cells and genes so no additional normalization is required for single-cell counts. 
27
+**CE**llular **L**atent **D**irichlet **A**llocation (celda) is a collection of Bayesian hierarchical models to perform feature and cell bi-clustering for count data generated by single-cell platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications and has shown good performance with sparse data. celda simultaneously clusters features (i.e. gene expression) into modules based on co-expression patterns across cells and cells into subpopulations based on the probabilities of the feature modules within each cell.  
17 28
 
18
-In this vignette we will demonstrate how to use celda to perform cell and feature clustering with simulated data.
29
+In this vignette we will demonstrate how to use celda to perform cell and feature clustering with a simple simulated dataset.
19 30
 
20 31
 # Installation
21 32
 
... ...
@@ -307,17 +318,18 @@ celdaModel <- subsetCeldaList(celdaList = cellSplit,
307 318
 
308 319
 ## celdaGridSearch
309 320
 
310
-celda is able to run multiple combinations of K and L with multiple chains in parallel via the `celdaGridSearch` function. 
321
+celda is able to run multiple combinations of K and L with multiple chains in parallel via the `celdaGridSearch` function. Setting `verbose` to `TRUE` will print the output of each model to a text file.
311 322
 
312 323
 The `resamplePerplexity` function "perturbs" the original counts matrix by resampling the counts of each cell according to its normalized probability distribution. Perplexity is calculated on the resampled matrix and the procedure is repeated `resample` times. These results can be visualized with `plotGridSearchPerplexity`. The major goal is to pick the lowest K and L combination with relatively good perplexity. In general, visual inspection of the plot can be used to select the number of modules (L) or cell populations (K) where the rate of decrease in the perplexity starts to drop off.
313 324
 
314 325
 ```{r, eval = TRUE, message = FALSE}
315 326
 cgs <- celdaGridSearch(simCounts$counts,
316 327
     paramsTest = list(K = seq(4, 6), L = seq(9, 11)),
317
-    cores = 4,
328
+    cores = 1,
318 329
     model = "celda_CG",
319 330
     nchains = 2,
320 331
     maxIter = 100,
332
+    verbose = FALSE,
321 333
     bestOnly = TRUE)
322 334
 ```
323 335
 
... ...
@@ -347,10 +359,11 @@ If the "bestOnly" parameter is set to FALSE in the `celdaGridSearch`, then the `
347 359
 ```{r, message=FALSE}
348 360
 cgs <- celdaGridSearch(simCounts$counts,
349 361
     paramsTest = list(K = seq(4, 6), L = seq(9, 11)),
350
-    cores = 4,
362
+    cores = 1,
351 363
     model = "celda_CG",
352 364
     nchains = 2,
353 365
     maxIter = 100,
366
+    verbose = FALSE,
354 367
     bestOnly = FALSE)
355 368
 
356 369
 cgs <- resamplePerplexity(counts = simCounts$counts,
357 370
new file mode 100644
... ...
@@ -0,0 +1,58 @@
1
+---
2
+title: "Decontamination of ambient RNA in single-cell RNA-seq with DecontX on 10X PBMC4k data" 
3
+author: "Shiyi Yang, Joshua D. Campbell"
4
+date: "`r Sys.Date()`"
5
+output:
6
+  BiocStyle::html_document:
7
+    toc: true
8
+vignette: >
9
+  %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA in single-cell data with DecontX}
10
+  %\VignetteEngine{knitr::rmarkdown}
11
+  %\VignetteEncoding{UTF-8}
12
+---
13
+
14
+# Installation
15
+
16
+celda can be installed from Bioconductor:
17
+
18
+```{r, eval= FALSE}
19
+if (!requireNamespace("BiocManager", quietly=TRUE)) {
20
+    install.packages("BiocManager")}
21
+BiocManager::install("celda")
22
+```
23
+
24
+The package can be loaded using the `library` command.
25
+
26
+```{r, eval=TRUE, message=FALSE}
27
+library(celda)
28
+```
29
+
30
+DecontX can take either `SingleCellExperiment` object from package [SingleCellExperiment package](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) or a single counts matrix as input. `decontX` will attempt to convert any input matrix to class `dgCMatrix` from package [Matrix](https://cran.r-project.org/web/packages/Matrix/index.html) before beginning any analyses.
31
+
32
+
33
+# Load PBMC4k data from 10X
34
+We will utlize the 10X PBMC 4K dataset as an example. This can be easily retrieved from the package [TENxPBMCData](http://bioconductor.org/packages/release/data/experiment/html/TENxPBMCData.html). Make sure the the column names are set before running decontX.
35
+
36
+```{r, eval=TRUE, message=FALSE}
37
+library(TENxPBMCData)  
38
+pbmc4k <- TENxPBMCData("pbmc4k")
39
+colnames(pbmc4k) <- paste(pbmc4k$Sample, pbmc4k$Barcode, sep="_")
40
+```
41
+
42
+
43
+To run decontX with a SingleCellExperiment object, simply use the following command. 
44
+
45
+```{r}
46
+pbmc4k = decontX(x = pbmc4k) 
47
+```
48
+
49
+
50
+
51
+# Examining decontX output
52
+There are 4 attributes of DecontX results: 
53
+(1) runParams: parameters used for estimating contamination
54
+(2) estimates: parameters, decontaminated counts as well as other intermediate results estimated during contamination estimation
55
+(3) contamination: cell level contamination
56
+(4) z: cell cluster label 
57
+
58
+# Plotting decontX results