... | ... |
@@ -17,14 +17,6 @@ calculateNativeMatrix <- function(counts, native_counts, theta, eta, phi, z, row |
17 | 17 |
.Call('_celda_calculateNativeMatrix', PACKAGE = 'celda', counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount) |
18 | 18 |
} |
19 | 19 |
|
20 |
-#' get row and column indices of none zero elements in the matrix |
|
21 |
-#' |
|
22 |
-#' @param R_counts A matrix |
|
23 |
-#' @return An integer matrix where each row is a row, column indices pair |
|
24 |
-nonzero <- function(R_counts) { |
|
25 |
- .Call('_celda_nonzero', PACKAGE = 'celda', R_counts) |
|
26 |
-} |
|
27 |
- |
|
28 | 20 |
cG_calcGibbsProbY_Simple <- function(counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) { |
29 | 21 |
.Call('_celda_cG_calcGibbsProbY_Simple', PACKAGE = 'celda', counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) |
30 | 22 |
} |
... | ... |
@@ -77,3 +69,11 @@ fastNormPropSqrt <- function(R_counts, R_alpha) { |
77 | 69 |
.Call('_celda_fastNormPropSqrt', PACKAGE = 'celda', R_counts, R_alpha) |
78 | 70 |
} |
79 | 71 |
|
72 |
+#' get row and column indices of none zero elements in the matrix |
|
73 |
+#' |
|
74 |
+#' @param R_counts A matrix |
|
75 |
+#' @return An integer matrix where each row is a row, column indices pair |
|
76 |
+nonzero <- function(R_counts) { |
|
77 |
+ .Call('_celda_nonzero', PACKAGE = 'celda', R_counts) |
|
78 |
+} |
|
79 |
+ |
... | ... |
@@ -162,9 +162,11 @@ plotHeatmap <- function(counts, |
162 | 162 |
## Select subsets of features/cells |
163 | 163 |
if (!is.null(featureIx)) { |
164 | 164 |
counts <- counts[featureIx, , drop = FALSE] |
165 |
- if (length(annotationFeature) > 1 || |
|
166 |
- (length(annotationFeature) == 1 & !is.na(annotationFeature))) { |
|
165 |
+ if (!is.null(annotationFeature) & length(annotationFeature) > 0) { |
|
166 |
+ if(length(annotationFeature) == 1 & !is.na(annotationFeature) | |
|
167 |
+ length(annotationFeature) > 1) { |
|
167 | 168 |
annotationFeature <- annotationFeature[featureIx, , drop = FALSE] |
169 |
+ } |
|
168 | 170 |
} |
169 | 171 |
if (!is.null(y)) { |
170 | 172 |
y <- y[featureIx] |
... | ... |
@@ -173,9 +175,11 @@ plotHeatmap <- function(counts, |
173 | 175 |
|
174 | 176 |
if (!is.null(cellIx)) { |
175 | 177 |
counts <- counts[, cellIx, drop = FALSE] |
176 |
- if (length(annotationCell) > 1 || |
|
177 |
- (length(annotationCell) == 1 & !is.na(annotationCell))) { |
|
178 |
+ if (!is.null(annotationCell) & length(annotationCell) > 0) { |
|
179 |
+ if(length(annotationCell) == 1 & !is.na(annotationCell) | |
|
180 |
+ length(annotationFeature) > 1) { |
|
178 | 181 |
annotationCell <- annotationCell[cellIx, , drop = FALSE] |
182 |
+ } |
|
179 | 183 |
} |
180 | 184 |
if (!is.null(z)) { |
181 | 185 |
z <- z[cellIx] |
... | ... |
@@ -62,6 +62,7 @@ |
62 | 62 |
#' } |
63 | 63 |
#' } |
64 | 64 |
#' @examples |
65 |
+#' \dontrun{ |
|
65 | 66 |
#' library(M3DExampleData) |
66 | 67 |
#' counts <- M3DExampleData::Mmus_example_list$data |
67 | 68 |
#' # subset 100 genes for fast clustering |
... | ... |
@@ -81,6 +82,7 @@ |
81 | 82 |
#' |
82 | 83 |
#' # Plot dendrogram |
83 | 84 |
#' plotDendro(DecTree) |
85 |
+#' } |
|
84 | 86 |
#' @import magrittr |
85 | 87 |
#' @importFrom methods hasArg |
86 | 88 |
#' @export |
... | ... |
@@ -7,6 +7,7 @@ |
7 | 7 |
#' @param features A L(features) by N(samples) numeric matrix. |
8 | 8 |
#' @return A character vector of label predicitions. |
9 | 9 |
#' @examples |
10 |
+#' \dontrun{ |
|
10 | 11 |
#' library(M3DExampleData) |
11 | 12 |
#' counts <- M3DExampleData::Mmus_example_list$data |
12 | 13 |
#' # Subset 500 genes for fast clustering |
... | ... |
@@ -26,6 +27,7 @@ |
26 | 27 |
#' |
27 | 28 |
#' # Get sample estimates in training data |
28 | 29 |
#' getDecisions(DecTree$rules, features) |
30 |
+#' } |
|
29 | 31 |
#' @export |
30 | 32 |
getDecisions <- function(rules, features) { |
31 | 33 |
features <- t(features) |
... | ... |
@@ -12,6 +12,7 @@ |
12 | 12 |
#' @param boxSize A numeric value. Size of rule labels. Default is 7. |
13 | 13 |
#' @param boxColor A character value. Color of rule labels. Default is `black`. |
14 | 14 |
#' @examples |
15 |
+#' \dontrun{ |
|
15 | 16 |
#' library(M3DExampleData) |
16 | 17 |
#' counts <- M3DExampleData::Mmus_example_list$data |
17 | 18 |
#' # Subset 500 genes for fast clustering |
... | ... |
@@ -31,6 +32,7 @@ |
31 | 32 |
#' |
32 | 33 |
#' # Plot dendrogram |
33 | 34 |
#' plotDendro(decTree) |
35 |
+#' } |
|
34 | 36 |
#' @return A ggplot2 object |
35 | 37 |
#' @import ggplot2 |
36 | 38 |
#' @importFrom ggdendro dendro_data ggdendrogram |
... | ... |
@@ -145,7 +145,7 @@ decontXMarkerPlot <- function(counts, z, geneMarkers, threshold = 1, color = "re |
145 | 145 |
|
146 | 146 |
# geneMarkers should be a dataframe, w/ 2 column names being `cellType` and `geneMarkers` |
147 | 147 |
# convert both `cellType` and `geneMarkers` are factors with levels being integer |
148 |
-.geneMarkerProcess <- function(geneMarkers, orders = None) { |
|
148 |
+.geneMarkerProcess <- function(geneMarkers, orders = NULL) { |
|
149 | 149 |
geneMarkers[, "cellName"] <- geneMarkers[, "cellType"] |
150 | 150 |
geneMarkers[, "cellType"] <- factor(geneMarkers[, "cellType"]) |
151 | 151 |
levels(geneMarkers[, "cellType"]) <- 1:length(levels(geneMarkers[, "cellType"])) |
164 | 164 |
deleted file mode 100644 |
... | ... |
@@ -1,41 +0,0 @@ |
1 |
-// [[Rcpp::depends(RcppEigen)]] |
|
2 |
-#include <Rcpp.h> |
|
3 |
-using namespace Rcpp; |
|
4 |
- |
|
5 |
-//' get row and column indices of none zero elements in the matrix |
|
6 |
-//' |
|
7 |
-//' @param R_counts A matrix |
|
8 |
-//' @return An integer matrix where each row is a row, column indices pair |
|
9 |
-// [[Rcpp::export]] |
|
10 |
-SEXP nonzero(NumericMatrix R_counts) { |
|
11 |
- |
|
12 |
- IntegerVector row(1); |
|
13 |
- IntegerVector col(1); |
|
14 |
- NumericVector val(1); |
|
15 |
- |
|
16 |
- int nR = R_counts.nrow(); |
|
17 |
- int nC = R_counts.ncol(); |
|
18 |
- double x; |
|
19 |
- |
|
20 |
- for (int c = 0; c < nC; c++) { |
|
21 |
- for (int r = 0; r < nR; r++) { |
|
22 |
- x = R_counts[c * nR + r]; |
|
23 |
- if (x != 0) { |
|
24 |
- row.push_back(r + 1); |
|
25 |
- col.push_back(c + 1); |
|
26 |
- val.push_back(x); |
|
27 |
- } |
|
28 |
- } |
|
29 |
- } |
|
30 |
- |
|
31 |
- row.erase(0); |
|
32 |
- col.erase(0); |
|
33 |
- val.erase(0); |
|
34 |
- |
|
35 |
- List res; |
|
36 |
- res["row"] = row; |
|
37 |
- res["col"] = col; |
|
38 |
- res["val"] = val; |
|
39 |
- |
|
40 |
- return(res); |
|
41 |
-} |
... | ... |
@@ -73,17 +73,6 @@ BEGIN_RCPP |
73 | 73 |
return rcpp_result_gen; |
74 | 74 |
END_RCPP |
75 | 75 |
} |
76 |
-// nonzero |
|
77 |
-SEXP nonzero(NumericMatrix R_counts); |
|
78 |
-RcppExport SEXP _celda_nonzero(SEXP R_countsSEXP) { |
|
79 |
-BEGIN_RCPP |
|
80 |
- Rcpp::RObject rcpp_result_gen; |
|
81 |
- Rcpp::RNGScope rcpp_rngScope_gen; |
|
82 |
- Rcpp::traits::input_parameter< NumericMatrix >::type R_counts(R_countsSEXP); |
|
83 |
- rcpp_result_gen = Rcpp::wrap(nonzero(R_counts)); |
|
84 |
- return rcpp_result_gen; |
|
85 |
-END_RCPP |
|
86 |
-} |
|
87 | 76 |
// cG_calcGibbsProbY_Simple |
88 | 77 |
NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, IntegerVector nGbyTS, IntegerMatrix nTSbyC, IntegerVector nbyTS, IntegerVector nbyG, const IntegerVector y, const int L, const int index, const double gamma, const double beta, const double delta); |
89 | 78 |
RcppExport SEXP _celda_cG_calcGibbsProbY_Simple(SEXP countsSEXP, SEXP nGbyTSSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP indexSEXP, SEXP gammaSEXP, SEXP betaSEXP, SEXP deltaSEXP) { |
... | ... |
@@ -222,6 +211,17 @@ BEGIN_RCPP |
222 | 211 |
return rcpp_result_gen; |
223 | 212 |
END_RCPP |
224 | 213 |
} |
214 |
+// nonzero |
|
215 |
+SEXP nonzero(NumericMatrix R_counts); |
|
216 |
+RcppExport SEXP _celda_nonzero(SEXP R_countsSEXP) { |
|
217 |
+BEGIN_RCPP |
|
218 |
+ Rcpp::RObject rcpp_result_gen; |
|
219 |
+ Rcpp::RNGScope rcpp_rngScope_gen; |
|
220 |
+ Rcpp::traits::input_parameter< NumericMatrix >::type R_counts(R_countsSEXP); |
|
221 |
+ rcpp_result_gen = Rcpp::wrap(nonzero(R_counts)); |
|
222 |
+ return rcpp_result_gen; |
|
223 |
+END_RCPP |
|
224 |
+} |
|
225 | 225 |
|
226 | 226 |
RcppExport SEXP _colSumByGroup(SEXP, SEXP); |
227 | 227 |
RcppExport SEXP _colSumByGroup_numeric(SEXP, SEXP); |
... | ... |
@@ -236,7 +236,6 @@ static const R_CallMethodDef CallEntries[] = { |
236 | 236 |
{"_celda_decontXLogLik", (DL_FUNC) &_celda_decontXLogLik, 6}, |
237 | 237 |
{"_celda_decontXInitialize", (DL_FUNC) &_celda_decontXInitialize, 4}, |
238 | 238 |
{"_celda_calculateNativeMatrix", (DL_FUNC) &_celda_calculateNativeMatrix, 9}, |
239 |
- {"_celda_nonzero", (DL_FUNC) &_celda_nonzero, 1}, |
|
240 | 239 |
{"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, |
241 | 240 |
{"_celda_cG_CalcGibbsProbY_ori", (DL_FUNC) &_celda_cG_CalcGibbsProbY_ori, 13}, |
242 | 241 |
{"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13}, |
... | ... |
@@ -245,6 +244,7 @@ static const R_CallMethodDef CallEntries[] = { |
245 | 244 |
{"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2}, |
246 | 245 |
{"_celda_fastNormPropLog", (DL_FUNC) &_celda_fastNormPropLog, 2}, |
247 | 246 |
{"_celda_fastNormPropSqrt", (DL_FUNC) &_celda_fastNormPropSqrt, 2}, |
247 |
+ {"_celda_nonzero", (DL_FUNC) &_celda_nonzero, 1}, |
|
248 | 248 |
{"_colSumByGroup", (DL_FUNC) &_colSumByGroup, 2}, |
249 | 249 |
{"_colSumByGroup_numeric", (DL_FUNC) &_colSumByGroup_numeric, 2}, |
250 | 250 |
{"_colSumByGroupChange", (DL_FUNC) &_colSumByGroupChange, 4}, |
... | ... |
@@ -84,3 +84,47 @@ SEXP fastNormPropSqrt(NumericMatrix R_counts, double R_alpha) { |
84 | 84 |
return Rcpp::wrap(res); |
85 | 85 |
} |
86 | 86 |
|
87 |
+ |
|
88 |
+ |
|
89 |
+// [[Rcpp::depends(RcppEigen)]] |
|
90 |
+#include <Rcpp.h> |
|
91 |
+using namespace Rcpp ; |
|
92 |
+ |
|
93 |
+//' get row and column indices of none zero elements in the matrix |
|
94 |
+//' |
|
95 |
+//' @param R_counts A matrix |
|
96 |
+//' @return An integer matrix where each row is a row, column indices pair |
|
97 |
+// [[Rcpp::export]] |
|
98 |
+SEXP nonzero(NumericMatrix R_counts) { |
|
99 |
+ |
|
100 |
+ IntegerVector row(1); |
|
101 |
+ IntegerVector col(1); |
|
102 |
+ NumericVector val(1); |
|
103 |
+ |
|
104 |
+ int nR = R_counts.nrow(); |
|
105 |
+ int nC = R_counts.ncol(); |
|
106 |
+ double x; |
|
107 |
+ |
|
108 |
+ for (int c = 0; c < nC; c++) { |
|
109 |
+ for (int r = 0; r < nR; r++) { |
|
110 |
+ x = R_counts[c * nR + r]; |
|
111 |
+ if (x != 0) { |
|
112 |
+ row.push_back(r + 1); |
|
113 |
+ col.push_back(c + 1); |
|
114 |
+ val.push_back(x); |
|
115 |
+ } |
|
116 |
+ } |
|
117 |
+ } |
|
118 |
+ |
|
119 |
+ row.erase(0); |
|
120 |
+ col.erase(0); |
|
121 |
+ val.erase(0); |
|
122 |
+ |
|
123 |
+ List res; |
|
124 |
+ res["row"] = row; |
|
125 |
+ res["col"] = col; |
|
126 |
+ res["val"] = val; |
|
127 |
+ |
|
128 |
+ return(res); |
|
129 |
+} |
|
130 |
+ |
87 | 131 |
deleted file mode 100644 |
... | ... |
@@ -1,154 +0,0 @@ |
1 |
-title: "Decontamination of ambient RNA in single-cell RNA-seq with DecontX on 10X PBMC4k data" |
|
2 |
-author: "Shiyi Yang, Sean Corbett, Yusuke Koga, Zhe Wang, W. Evan Johnson, Masanao Yajima, Joshua D. Campbell" |
|
3 |
-date: "`r Sys.Date()`" |
|
4 |
-output: |
|
5 |
- BiocStyle::html_document: |
|
6 |
- toc: true |
|
7 |
-vignette: > |
|
8 |
- %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX} |
|
9 |
- %\VignetteEngine{knitr::rmarkdown} |
|
10 |
- %\VignetteEncoding{UTF-8} |
|
11 |
- |
|
12 |
- |
|
13 |
-```{r, eval=TRUE, message=FALSE} |
|
14 |
-library(celda) |
|
15 |
-``` |
|
16 |
- |
|
17 |
-DecontX can take either `SingleCellExperiment` object from [SingleCellExperiment package] (https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) or a count matrix in the format of rows being genesand columns being cells. When using SingleCellExperiment object, DecontX is using the counts matrix extracted from the SingleCellExperiment object. |
|
18 |
- |
|
19 |
-# TODO: Load PBMC4k data from 10X CellRanger output |
|
20 |
-# Load PBMC4k data from 10X |
|
21 |
-```{r, eval=TRUE, message=FALSE} |
|
22 |
-library(TENxPBMCData) # selected single cell RNAseq datasets from 10X Genomics |
|
23 |
-pbmc <- TENxPBMCData("pbmc4k") |
|
24 |
-colnames(pbmc) <- paste(pbmc$Sample, pbmc$Barcode, sep="_") # Rename cells |
|
25 |
-``` |
|
26 |
- |
|
27 |
-Take a look at the count matrix in the `pbmc` singleCellExperiment object via `assay` function from [SummarizedExperiment] (https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html). There are total 33694 genes and 4340 cells in this PBMC data. |
|
28 |
-```{r} |
|
29 |
-library(SummarizedExperiment) |
|
30 |
-pbmc_counts = SummarizedExperiment::assay(pbmc, i = "counts") |
|
31 |
-print(pbmc_counts) |
|
32 |
-``` |
|
33 |
- |
|
34 |
-Among the 33694 genes, many of then are empty genes (i.e., not expressed by any cell), and many of the genes are likely to be noise due to extremely low expression among these 4340 cells. |
|
35 |
-We filter out genes that are likely to be noise. We normally keep genes that have expressed at least 3 counts in at least 3 cells. |
|
36 |
-After filtering, we have 4529 genes left. |
|
37 |
-```{r} |
|
38 |
-#pbmc_filterg = pbmc[rowSums(pbmc_counts > 2) > 2, ] |
|
39 |
-pbmc_filterg = pbmc[rowSums(pbmc_counts >= 3) >= 3, ] # Filtering can be applied directly on SingleCellExperiment object |
|
40 |
-print(pbmc_filterg) |
|
41 |
-``` |
|
42 |
- |
|
43 |
- |
|
44 |
- |
|
45 |
-# Estimate and remove contaminated counts using DecontX (2 scenarios -- with and without cell types specified) |
|
46 |
-## Use DecontX for contamination estimation when there is cell type specified for each individual cells (Here we use celda_CG -- a Bayesian bi-clustering method on single cell RNAseq data -- to get cell clusters as well as gene modules, which enhances cell clustering. You can substitute `celda_CG` with any clustering method you have faith in. DecontX takes the expression count matrix (gene by cell count matrix) and a vector of the same length as number of cells in the count matrix specifying cell cluster. |
|
47 |
-```{r} |
|
48 |
-pbmc_filterg_counts = SummarizedExperiment::assay(pbmc_filterg, "counts") |
|
49 |
-pbmc_filterg_counts = as.matrix(pbmc_filterg_counts) # Change obejct type as basic matrix to pass into celda::celda_CG |
|
50 |
-celda_res = celda_CG(counts = pbmc_filterg_counts, K = 19, L = 150) # specify number of cell clusters being K=19, number of gnee modules being L=150 |
|
51 |
-``` |
|
52 |
-P.s: we have analyzed this 4K PBMC data before, so we don't bother with the parameters choice for the bi-clustering method `celda_CG` here. For a detailed toturial of usage on `celda`, it is availabel [here] (celda-vignettes url) |
|
53 |
- |
|
54 |
- |
|
55 |
- |
|
56 |
-Pass expression (gene by cell) count matrix and the cluster labels from `celda_CG` to decontX, it will return estimated results. Corresponding to the 2 types of input, output result is either in a `list`, or a SingleCellExperiment object with the result list added back to its `metadata` slot (?? Am I understanding the structure right? Is metadata a "slot" of the sce-object??). When `decontX` takes a SingleCellExperiment object, we can use function `metadata` to extract the result list and use `str` function to look at the 4 attributes ("runParams", "estimates", "contamination", "z") in it. |
|
57 |
-```{r} |
|
58 |
-cell_cluster = celda_res@clusters$z |
|
59 |
-decontx_res_sce = decontX( x = pbmc_filterg, z = cell_cluster) |
|
60 |
-print(decontx_res_sce) |
|
61 |
-#decontx_res = decontX( x = pbmc_filterg_counts, z = cell_cluster) ## This works too. Difference is that the output is a list rathen than being a SingleCellExperiment object |
|
62 |
-decontx_res = metadata(decontx_res_sce)$decontX |
|
63 |
-str(decontx_res) |
|
64 |
-``` |
|
65 |
- |
|
66 |
- |
|
67 |
-## Interpretation of DecontX results |
|
68 |
-There are 4 attributes of DecontX results: |
|
69 |
-(1) runParams: parameters used for estimating contamination |
|
70 |
-(2) estimates: parameters, decontaminated counts as well as other intermediate results estimated during contamination estimation |
|
71 |
-(3) contamination: cell level contamination |
|
72 |
-(4) z: cell cluster label |
|
73 |
- |
|
74 |
- |
|
75 |
-# Check DecontX decontaminated count to see if gene expressions are cleaner by looking at marker genes' expression |
|
76 |
-## First we have to identify cell types, which is one step further from cell clustering, as we do not really know which cluster is corresponding to what cell type and multiple cell clusters can also be from the same cell type. We are looking at the 4 major cell types (T-cell, B-cell, NK-cell and monocytes) in this PBMC data. |
|
77 |
-Combining both the clustering results from `celda_CG`, visualization on a reduced 2-dimention space using Tsne (T-stochastic Neighbor Embedding), and marker genes, we relate cell clusters (more specifically the cells in them) to cell types. The marker genes used to identify these 4 major cell types are CD3D (T-cell marker), MS4A1 and CD79A (B-cell markers), GNLY (NK-cell marker), CD14 or FCGR3A (monocytes markers). There are also other cell types (such as dendritic cells, plasma cells, etc.), we don't include these cell types in our demonstration here and we point you to [here] (celda-vignettes url on PBMC4k data) for cell type analysis using `celda`. |
|
78 |
-```{r} |
|
79 |
-celdaTsne.ori = celdaTsne( counts = pbmc_filterg_counts , celdaMod = celda_res ) |
|
80 |
-pltTsne.ori = plotDimReduceCluster( dim1 = celdaTsne.ori[,1], dim2 = celdaTsne.ori[, 2], cluster = cell_cluster, labelClusters = T) |
|
81 |
-print(pltTsne.ori) |
|
82 |
-``` |
|
83 |
- |
|
84 |
-We use violin plot to look at the marker gene expression levels in each cluster identified before by `celda_CG`. Those gene names are stored in the "rowData" of the singleCellExperiment object. On the other hand, the rownames of the expression count matrix is using gene ID. We have to match marker genes' name to the gene ID to locate them in the expression count matrix. |
|
85 |
-```{r} |
|
86 |
-markerGenes = do.call(what = rbind, args = list(data.frame("celltype" = "Tcell", "markergenes"=c("CD3D" )), |
|
87 |
- data.frame("celltype" = "Bcell", "markergenes"=c("MS4A1", "CD79A")), |
|
88 |
- data.frame("celltype" = "NKcell", "markergenes"=c("GNLY")), |
|
89 |
- data.frame("celltype" = "monocytes", "markergenes"=c("CD14", "FCGR3A")), |
|
90 |
- data.frame("celltype" = "megakaryocytes", "markergenes"=c("PPBP")))) |
|
91 |
-print(markerGenes) |
|
92 |
-head(rowData(pbmc_filterg)) |
|
93 |
-markerGenes_wID = merge( x = markerGenes, y = rowData(pbmc_filterg), by.x = "markergenes", by.y = "Symbol") |
|
94 |
-markerGenes_wID[, "feature"] = paste( markerGenes_wID[, "ENSEMBL_ID"], markerGenes_wID[, "markergenes"], sep = "_") |
|
95 |
-print(markerGenes_wID) |
|
96 |
- |
|
97 |
-pbmc_filterg_counts_rename = pbmc_filterg_counts |
|
98 |
-rownames(pbmc_filterg_counts_rename) = paste(rownames(pbmc_filterg), rowData(pbmc_filterg)[, "Symbol"], sep = "_") |
|
99 |
-pltViolin.ori = violinPlot(counts = pbmc_filterg_counts_rename, celdaMod=celda_res, features = markerGenes_wID[, "feature"]) |
|
100 |
-print(pltViolin.ori) |
|
101 |
-``` |
|
102 |
- |
|
103 |
-The violin plots of the marker genes' expression indicate that cluster 12, 13 are B-cells, cluster 7, 14, 15, 16, 18, 19 are T-cells, cluster 11 is NK-cells, cluster 2, 4, 5 are monocytes. For monocytes, we did not include cluster 3 as it is more likely to be megakaryocytes (PPBP+) or cluster 6 as it is likely to be doublets group (CD14+ and CD3D). |
|
104 |
-```{r} |
|
105 |
-cellType = list("Bcell"=c(12,13), "Tcell"=c(7, 14, 15, 16, 18, 19), "NKcell"=c(11), "monocytes"=c(2, 4, 5)) |
|
106 |
-print(cellType) |
|
107 |
-``` |
|
108 |
- |
|
109 |
-Now that we have identified the 4 major cell types, we can go ahead to see how DecontX makes the genes' expression clearer by comparing the decontaminated counts with the orinal counts. As we now look at properties at cell-type level than cell cluster level, again we will convert cell cluster labels into cell types. P.s: except for the aforementioned major 4 cell types, all other cell types in this PBMC data are indicated as "others" in this tutorial. |
|
110 |
-```{r} |
|
111 |
-# This function is to convert cluster label into cell-type label |
|
112 |
-match.z.celltype = function( z, cellType.list ) { |
|
113 |
- z.cell = z |
|
114 |
- for ( i in 1:length(cellType.list) ) { |
|
115 |
- z.cell[ z %in% cellType.list[[i]] ] = names( cellType.list[i] ) |
|
116 |
- } |
|
117 |
- return( z.cell ) |
|
118 |
-} |
|
119 |
- |
|
120 |
-cellType_list = cellType |
|
121 |
-cellType_list[["others"]] = c(1:19)[-unlist(cellType)] |
|
122 |
-print(cellType_list) |
|
123 |
- |
|
124 |
-cell_type = match.z.celltype(z = cell_cluster, cellType.list = cellType_list) |
|
125 |
-print(table(cell_cluster, cell_type)) |
|
126 |
-``` |
|
127 |
- |
|
128 |
-We look at the gene expressions for these 4 major cell types by comparing the original expression count and the decontaminated counts by DecontX. Since dropout events are prevalent in single cell RNAseq data, we include more genes from now to show clearer cell-type expressions. |
|
129 |
-```{r} |
|
130 |
-ct_markergenes = do.call(what = rbind, args = list(data.frame("celltype" = "Tcell", "markergenes"=c("CD3D", "CD3E")), |
|
131 |
- data.frame("celltype" = "Bcell", "markergenes"=c("MS4A1", "CD79A", "CD79B")), |
|
132 |
- data.frame("celltype" = "NKcell", "markergenes"=c("GNLY")), |
|
133 |
- data.frame("celltype" = "monocytes", "markergenes"=c("LYZ", "S100A8", "S100A9")))) |
|
134 |
-ct_markergenes_wID = merge( x = ct_markergenes, y = rowData(pbmc_filterg), by.x = "markergenes", by.y = "Symbol") |
|
135 |
-ct_markergenes_wID[, "feature"] = paste( ct_markergenes_wID[, "ENSEMBL_ID"], ct_markergenes_wID[, "markergenes"], sep = "_") |
|
136 |
-print(head(ct_markergenes_wID)) |
|
137 |
- |
|
138 |
-ct_markergenes = ct_markergenes_wID[, c("celltype", "feature")] |
|
139 |
-colnames(ct_markergenes) = c("cellType", "geneMarkers") |
|
140 |
- |
|
141 |
-pltMarker = decontXMarkerPlot( counts = pbmc_filterg_counts_rename, z = cell_type, geneMarkers = ct_markergenes) |
|
142 |
- |
|
143 |
- |
|
144 |
-``` |
|
145 |
- |
|
146 |
- |
|
147 |
-## Use DecontX for contamination estimation when there is no cell type specified for the cells |
|
148 |
- |
|
149 |
- |
|
150 |
- |
|
151 |
- |
|
152 |
- |
153 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,99 +0,0 @@ |
1 |
-title: "Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX" |
|
2 |
-author: "Shiyi Yang, Sean Corbett, Yusuke Koga, Zhe Wang, W. Evan Johnson, Masanao Yajima, Joshua D. Campbell" |
|
3 |
-date: "`r Sys.Date()`" |
|
4 |
-output: |
|
5 |
- BiocStyle::html_document: |
|
6 |
- toc: true |
|
7 |
-vignette: > |
|
8 |
- %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX} |
|
9 |
- %\VignetteEngine{knitr::rmarkdown} |
|
10 |
- %\VignetteEncoding{UTF-8} |
|
11 |
- |
|
12 |
-# Introduction |
|
13 |
- |
|
14 |
-DecontX is a Bayesian hierarchical model to estimate and remove cross-contamination from ambient RNA in single-cell RNA-seq count data generated from droplet-based sequencing devices. DecontX will take the count matrix with/without the cell labels and estimate the contamination level and deliver a decontaminted count matrix for downstream analysis. |
|
15 |
- |
|
16 |
-In this vignette we will demonstrate how to use DecontX to estimate and remove contamination. |
|
17 |
- |
|
18 |
-# Installation |
|
19 |
- |
|
20 |
-celda can be installed from Bioconductor: |
|
21 |
- |
|
22 |
-```{r, eval= FALSE} |
|
23 |
-if (!requireNamespace("BiocManager", quietly=TRUE)) { |
|
24 |
- install.packages("BiocManager")} |
|
25 |
-BiocManager::install("celda") |
|
26 |
-``` |
|
27 |
- |
|
28 |
-The package can be loaded using the `library` command. |
|
29 |
- |
|
30 |
-```{r, eval=TRUE, message=FALSE} |
|
31 |
-library(celda) |
|
32 |
-``` |
|
33 |
- |
|
34 |
-To see the latest updates and releases or to post a bug, see our GitHub page at https://github.com/campbio/celda. To ask questions about running celda, post a thread on Bioconductor support site at https://support.bioconductor.org/. |
|
35 |
- |
|
36 |
-# Reproducibility note |
|
37 |
- |
|
38 |
-Many functions in *celda* make use of stochastic algorithms or procedures which require the use of random number generator (RNG) for simulation or sampling. To maintain reproducibility, all these functions use a **default seed of 12345** to make sure same results are generated each time one of these functions is called. Explicitly setting the `seed` arguments is needed for greater control and randomness. |
|
39 |
- |
|
40 |
-# Generation of a cross-contaminated dataset |
|
41 |
-DecontX will take a matrix of counts (referred as observed counts) where each row is a feature, each column is a cell, and each entry in the matrix is the number of counts of each feature in each cell. To illustrate the utility of DecontX, we will apply it to a simulated dataset. |
|
42 |
- |
|
43 |
-In the function `simulateContaminatedMatrix`, the K parameter designates the number of cell clusters, the C parameter determines the number of cells, the G parameter determines the number of genes in the simulated dataset. |
|
44 |
- |
|
45 |
-```{r} |
|
46 |
-simCounts <- simulateContaminatedMatrix(G = 300, C = 100, K = 3) |
|
47 |
-``` |
|
48 |
- |
|
49 |
-The `nativeCounts` is the natively expressed counts matrix, and `observedCounts` is the observed counts matrix that contains both contaminated and natively expressed transctripts. The `NByC` is the total number of observed transcripts per cell. The counts matrix which only contains contamianted transcripts can be obtained by subtracting the observed counts matrix from the observed counts matrix. |
|
50 |
- |
|
51 |
-```{r} |
|
52 |
-contamination <- simCounts$observedCounts - simCounts$nativeCounts |
|
53 |
-``` |
|
54 |
- |
|
55 |
-The `z` variable contains the population label for each cell. |
|
56 |
- |
|
57 |
-```{r} |
|
58 |
-table(simCounts$z) |
|
59 |
-``` |
|
60 |
- |
|
61 |
-The `phi` and `eta` variables contain the expression distributions and contamination distributions for each population, respectively. Each column corresponds to a population, each row represents a gene. The sum of the rows equal to 1. |
|
62 |
- |
|
63 |
-```{r} |
|
64 |
-colSums(simCounts$phi) |
|
65 |
-colSums(simCounts$eta) |
|
66 |
-``` |
|
67 |
- |
|
68 |
- |
|
69 |
-# Decontamination using DecontX |
|
70 |
-DecontX uses bayesian method to estimate and remove contamination via varitaional inference. |
|
71 |
- |
|
72 |
-```{r, warning = FALSE, message = FALSE} |
|
73 |
-decontxModel <- decontX(x = simCounts$observedCounts, z = simCounts$z) |
|
74 |
-``` |
|
75 |
- |
|
76 |
-## Check convergence |
|
77 |
-Use log-likelihood to check convergence |
|
78 |
- |
|
79 |
-```{r, eval = TRUE, fig.width = 5, fig.height = 5} |
|
80 |
-plot(decontxModel$estimates$all_cells$logLikelihood) |
|
81 |
-``` |
|
82 |
- |
|
83 |
-## Evaluate model performance |
|
84 |
- |
|
85 |
-`decontX` estimates a contamination proportion for each cell. We compare the estimated contamination proportion with the real contamination proportion. |
|
86 |
- |
|
87 |
-```{r, eval = TRUE, fig.width = 5, fig.height = 5} |
|
88 |
-plot(decontxModel$contamination, |
|
89 |
- colSums(contamination) / simCounts$NByC, col = simCounts$z) |
|
90 |
-abline(0, 1) |
|
91 |
-``` |
|
92 |
- |
|
93 |
-# Session Information |
|
94 |
- |
|
95 |
-```{r} |
|
96 |
-sessionInfo() |
|
97 |
-``` |
101 | 3 |
similarity index 96% |
102 | 4 |
rename from vignettes/celda-analysis.Rmd |
103 | 5 |
rename to vignettes/celda.Rmd |
... | ... |
@@ -1,6 +1,17 @@ |
1 | 1 |
--- |
2 |
-title: "Analysis of single-cell 'omic count data with celda" |
|
3 |
-author: "Sean Corbett, Yusuke Koga, Shiyi Yang, Zhe Wang, Evan Johnson, Paola Sebastiani, Masanao Yajima, Joshua Campbell" |
|
2 |
+title: "Analysis of single-cell genomic data with celda" |
|
3 |
+author: |
|
4 |
+- name: Sean Corbett |
|
5 |
+ affiliation: &id Boston University School of Medicine |
|
6 |
+- name: Yusuke Koga |
|
7 |
+ affiliation: *id |
|
8 |
+- name: Shiyi Yang |
|
9 |
+ affiliation: *id |
|
10 |
+- name: Zhe Wang |
|
11 |
+ affiliation: *id |
|
12 |
+- name: Joshua Campbell |
|
13 |
+ affiliation: *id |
|
14 |
+ email: camp@bu.edu |
|
4 | 15 |
date: "`r Sys.Date()`" |
5 | 16 |
output: |
6 | 17 |
BiocStyle::html_document: |
... | ... |
@@ -13,9 +24,9 @@ vignette: > |
13 | 24 |
|
14 | 25 |
# Introduction |
15 | 26 |
|
16 |
-**CE**llular **L**atent **D**irichlet **A**llocation (celda) is a collection of Bayesian hierarchical models to perform feature and cell bi-clustering for count data generated by single-cell platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications and has shown good performance with sparse data. celda simultaneously clusters features (i.e. gene expression) into modules based on co-expression patterns across cells and cells into subpopulations based on the probabilities of the feature modules within each cell. celda uses Dirichlet-multinomial distributions to model cells and genes so no additional normalization is required for single-cell counts. |
|
27 |
+**CE**llular **L**atent **D**irichlet **A**llocation (celda) is a collection of Bayesian hierarchical models to perform feature and cell bi-clustering for count data generated by single-cell platforms. This algorithm is an extension of the Latent Dirichlet Allocation (LDA) topic modeling framework that has been popular in text mining applications and has shown good performance with sparse data. celda simultaneously clusters features (i.e. gene expression) into modules based on co-expression patterns across cells and cells into subpopulations based on the probabilities of the feature modules within each cell. |
|
17 | 28 |
|
18 |
-In this vignette we will demonstrate how to use celda to perform cell and feature clustering with simulated data. |
|
29 |
+In this vignette we will demonstrate how to use celda to perform cell and feature clustering with a simple simulated dataset. |
|
19 | 30 |
|
20 | 31 |
# Installation |
21 | 32 |
|
... | ... |
@@ -307,17 +318,18 @@ celdaModel <- subsetCeldaList(celdaList = cellSplit, |
307 | 318 |
|
308 | 319 |
## celdaGridSearch |
309 | 320 |
|
310 |
-celda is able to run multiple combinations of K and L with multiple chains in parallel via the `celdaGridSearch` function. |
|
321 |
+celda is able to run multiple combinations of K and L with multiple chains in parallel via the `celdaGridSearch` function. Setting `verbose` to `TRUE` will print the output of each model to a text file. |
|
311 | 322 |
|
312 | 323 |
The `resamplePerplexity` function "perturbs" the original counts matrix by resampling the counts of each cell according to its normalized probability distribution. Perplexity is calculated on the resampled matrix and the procedure is repeated `resample` times. These results can be visualized with `plotGridSearchPerplexity`. The major goal is to pick the lowest K and L combination with relatively good perplexity. In general, visual inspection of the plot can be used to select the number of modules (L) or cell populations (K) where the rate of decrease in the perplexity starts to drop off. |
313 | 324 |
|
314 | 325 |
```{r, eval = TRUE, message = FALSE} |
315 | 326 |
cgs <- celdaGridSearch(simCounts$counts, |
316 | 327 |
paramsTest = list(K = seq(4, 6), L = seq(9, 11)), |
317 |
- cores = 4, |
|
328 |
+ cores = 1, |
|
318 | 329 |
model = "celda_CG", |
319 | 330 |
nchains = 2, |
320 | 331 |
maxIter = 100, |
332 |
+ verbose = FALSE, |
|
321 | 333 |
bestOnly = TRUE) |
322 | 334 |
``` |
323 | 335 |
|
... | ... |
@@ -347,10 +359,11 @@ If the "bestOnly" parameter is set to FALSE in the `celdaGridSearch`, then the ` |
347 | 359 |
```{r, message=FALSE} |
348 | 360 |
cgs <- celdaGridSearch(simCounts$counts, |
349 | 361 |
paramsTest = list(K = seq(4, 6), L = seq(9, 11)), |
350 |
- cores = 4, |
|
362 |
+ cores = 1, |
|
351 | 363 |
model = "celda_CG", |
352 | 364 |
nchains = 2, |
353 | 365 |
maxIter = 100, |
366 |
+ verbose = FALSE, |
|
354 | 367 |
bestOnly = FALSE) |
355 | 368 |
|
356 | 369 |
cgs <- resamplePerplexity(counts = simCounts$counts, |
357 | 370 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,58 @@ |
1 |
+--- |
|
2 |
+title: "Decontamination of ambient RNA in single-cell RNA-seq with DecontX on 10X PBMC4k data" |
|
3 |
+author: "Shiyi Yang, Joshua D. Campbell" |
|
4 |
+date: "`r Sys.Date()`" |
|
5 |
+output: |
|
6 |
+ BiocStyle::html_document: |
|
7 |
+ toc: true |
|
8 |
+vignette: > |
|
9 |
+ %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA in single-cell data with DecontX} |
|
10 |
+ %\VignetteEngine{knitr::rmarkdown} |
|
11 |
+ %\VignetteEncoding{UTF-8} |
|
12 |
+--- |
|
13 |
+ |
|
14 |
+# Installation |
|
15 |
+ |
|
16 |
+celda can be installed from Bioconductor: |
|
17 |
+ |
|
18 |
+```{r, eval= FALSE} |
|
19 |
+if (!requireNamespace("BiocManager", quietly=TRUE)) { |
|
20 |
+ install.packages("BiocManager")} |
|
21 |
+BiocManager::install("celda") |
|
22 |
+``` |
|
23 |
+ |
|
24 |
+The package can be loaded using the `library` command. |
|
25 |
+ |
|
26 |
+```{r, eval=TRUE, message=FALSE} |
|
27 |
+library(celda) |
|
28 |
+``` |
|
29 |
+ |
|
30 |
+DecontX can take either `SingleCellExperiment` object from package [SingleCellExperiment package](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) or a single counts matrix as input. `decontX` will attempt to convert any input matrix to class `dgCMatrix` from package [Matrix](https://cran.r-project.org/web/packages/Matrix/index.html) before beginning any analyses. |
|
31 |
+ |
|
32 |
+ |
|
33 |
+# Load PBMC4k data from 10X |
|
34 |
+We will utlize the 10X PBMC 4K dataset as an example. This can be easily retrieved from the package [TENxPBMCData](http://bioconductor.org/packages/release/data/experiment/html/TENxPBMCData.html). Make sure the the column names are set before running decontX. |
|
35 |
+ |
|
36 |
+```{r, eval=TRUE, message=FALSE} |
|
37 |
+library(TENxPBMCData) |
|
38 |
+pbmc4k <- TENxPBMCData("pbmc4k") |
|
39 |
+colnames(pbmc4k) <- paste(pbmc4k$Sample, pbmc4k$Barcode, sep="_") |
|
40 |
+``` |
|
41 |
+ |
|
42 |
+ |
|
43 |
+To run decontX with a SingleCellExperiment object, simply use the following command. |
|
44 |
+ |
|
45 |
+```{r} |
|
46 |
+pbmc4k = decontX(x = pbmc4k) |
|
47 |
+``` |
|
48 |
+ |
|
49 |
+ |
|
50 |
+ |
|
51 |
+# Examining decontX output |
|
52 |
+There are 4 attributes of DecontX results: |
|
53 |
+(1) runParams: parameters used for estimating contamination |
|
54 |
+(2) estimates: parameters, decontaminated counts as well as other intermediate results estimated during contamination estimation |
|
55 |
+(3) contamination: cell level contamination |
|
56 |
+(4) z: cell cluster label |
|
57 |
+ |
|
58 |
+# Plotting decontX results |