Browse code

Merge branch 'devel' of https://github.com/campbio/celda into devel

Joshua D. Campbell authored on 28/03/2022 20:41:49
Showing 5 changed files

... ...
@@ -165,9 +165,9 @@ recodeClusterZ <- function(sce, from, to, altExpName = "featureSubset") {
165 165
     new.clusters <- plyr::mapvalues(celdaClusters(sce,
166 166
                                                   altExpName = altExpName),
167 167
                                     from, to)
168
-    new.clusters <- factor(new.clusters, levels = 
168
+    new.clusters <- factor(new.clusters, levels =
169 169
                              sort(as.numeric(unique(new.clusters))))
170
-    
170
+
171 171
     celdaClusters(sce, altExpName = altExpName) <- new.clusters
172 172
     return(sce)
173 173
 }
... ...
@@ -218,9 +218,9 @@ recodeClusterY <- function(sce, from, to, altExpName = "featureSubset") {
218 218
     new.clusters <- plyr::mapvalues(celdaModules(sce,
219 219
                                                   altExpName = altExpName),
220 220
                                   from, to)
221
-    new.clusters <- factor(new.clusters, levels = 
221
+    new.clusters <- factor(new.clusters, levels =
222 222
                              sort(as.numeric(unique(new.clusters))))
223
-  
223
+
224 224
     celdaModules(sce, altExpName = altExpName) <- plyr::mapvalues(
225 225
         celdaModules(sce, altExpName = altExpName), from, to)
226 226
     return(sce)
... ...
@@ -26,13 +26,17 @@
26 26
 #' should be considered different batches. Default NULL.
27 27
 #' @param background A numeric matrix of counts or a
28 28
 #' \linkS4class{SingleCellExperiment} with the matrix located in the assay
29
-#' slot under \code{assayName}. It should have the same structure as \code{x}
30
-#' except it contains the matrix of empty droplets instead of cells. When
31
-#' supplied, empirical distribution of transcripts from these empty droplets
29
+#' slot under \code{assayName}. It should have the same data format as \code{x}
30
+#' except it contains the empty droplets instead of cells. When supplied,
31
+#' empirical distribution of transcripts from these empty droplets
32 32
 #' will be used as the contamination distribution. Default NULL.
33 33
 #' @param bgAssayName Character. Name of the assay to use if \code{background}
34 34
 #' is a \linkS4class{SingleCellExperiment}. Default to same as
35 35
 #' \code{assayName}.
36
+#' @param bgBatch Numeric or character vector. Batch labels for
37
+#' \code{background}. Its unique values should be the same as those in
38
+#' \code{batch}, such that each batch of cells have their corresponding batch
39
+#' of empty droplets as background, pointed by this parameter. Default to NULL.
36 40
 #' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500.
37 41
 #' @param convergence Numeric. The EM algorithm will be stopped if the maximum
38 42
 #' difference in the contamination estimates between the previous and
... ...
@@ -152,6 +156,7 @@ setMethod("decontX", "SingleCellExperiment", function(x,
152 156
                                                       batch = NULL,
153 157
                                                       background = NULL,
154 158
                                                       bgAssayName = NULL,
159
+                                                      bgBatch = NULL,
155 160
                                                       maxIter = 500,
156 161
                                                       delta = c(10, 10),
157 162
                                                       estimateDelta = TRUE,
... ...
@@ -165,8 +170,16 @@ setMethod("decontX", "SingleCellExperiment", function(x,
165 170
   countsBackground <- NULL
166 171
   if (!is.null(background)) {
167 172
     # Remove cells with the same ID between x and the background matrix
168
-    background <- .checkBackground(x = x, background = background,
169
-                                   logfile = logfile, verbose = verbose)
173
+    # Also update bgBatch when background is updated and bgBatch is not null
174
+    temp <- .checkBackground(x = x,
175
+                             background = background,
176
+                             bgBatch = bgBatch,
177
+                             logfile = logfile,
178
+                             verbose = verbose)
179
+
180
+    background <- temp$background
181
+    bgBatch <- temp$bgBatch
182
+
170 183
     if (is.null(bgAssayName)) {
171 184
       bgAssayName <- assayName
172 185
     }
... ...
@@ -180,6 +193,7 @@ setMethod("decontX", "SingleCellExperiment", function(x,
180 193
     z = z,
181 194
     batch = batch,
182 195
     countsBackground = countsBackground,
196
+    batchBackground = bgBatch,
183 197
     maxIter = maxIter,
184 198
     convergence = convergence,
185 199
     iterLogLik = iterLogLik,
... ...
@@ -232,6 +246,7 @@ setMethod("decontX", "ANY", function(x,
232 246
                                      z = NULL,
233 247
                                      batch = NULL,
234 248
                                      background = NULL,
249
+                                     bgBatch = NULL,
235 250
                                      maxIter = 500,
236 251
                                      delta = c(10, 10),
237 252
                                      estimateDelta = TRUE,
... ...
@@ -246,8 +261,18 @@ setMethod("decontX", "ANY", function(x,
246 261
   countsBackground <- NULL
247 262
   if (!is.null(background)) {
248 263
     # Remove cells with the same ID between x and the background matrix
249
-    background <- .checkBackground(x = x, background = background,
250
-                                   logfile = logfile, verbose = verbose)
264
+    # Also update bgBatch when background is updated and bgBatch is not null
265
+    temp <- .checkBackground(x = x,
266
+                             background = background,
267
+                             bgBatch = bgBatch,
268
+                             logfile = logfile,
269
+                             verbose = verbose)
270
+
271
+    background <- temp$background
272
+    countsBackground <- background
273
+    
274
+    bgBatch <- temp$bgBatch
275
+
251 276
   }
252 277
 
253 278
   .decontX(
... ...
@@ -255,6 +280,7 @@ setMethod("decontX", "ANY", function(x,
255 280
     z = z,
256 281
     batch = batch,
257 282
     countsBackground = countsBackground,
283
+    batchBackground = bgBatch,
258 284
     maxIter = maxIter,
259 285
     convergence = convergence,
260 286
     iterLogLik = iterLogLik,
... ...
@@ -337,6 +363,7 @@ setMethod(
337 363
                      z = NULL,
338 364
                      batch = NULL,
339 365
                      countsBackground = NULL,
366
+                     batchBackground = NULL,
340 367
                      maxIter = 200,
341 368
                      convergence = 0.001,
342 369
                      iterLogLik = 10,
... ...
@@ -367,6 +394,7 @@ setMethod(
367 394
   runParams <- list(
368 395
     z = z,
369 396
     batch = batch,
397
+    batchBackground = batchBackground,
370 398
     maxIter = maxIter,
371 399
     delta = delta,
372 400
     estimateDelta = estimateDelta,
... ...
@@ -384,7 +412,7 @@ setMethod(
384 412
   nC <- ncol(counts)
385 413
   allCellNames <- colnames(counts)
386 414
 
387
-  ## Set up final deconaminated matrix
415
+  ## Set up final decontaminated matrix
388 416
   estRmat <- Matrix::Matrix(
389 417
     data = 0,
390 418
     ncol = totalCells,
... ...
@@ -396,8 +424,32 @@ setMethod(
396 424
   ## Generate batch labels if none were supplied
397 425
   if (is.null(batch)) {
398 426
     batch <- rep("all_cells", nC)
427
+
428
+    # If batch null, bgBatch has to be null
429
+    if (!is.null(batchBackground)) {
430
+      stop(
431
+        "When experiment default to no bacth, background should ",
432
+        "also default to no batch."
433
+      )
434
+    }
435
+
436
+    if (!is.null(countsBackground)) {
437
+      batchBackground <- rep("all_cells", ncol(countsBackground))
438
+    }
439
+  } else {
440
+
441
+    # If batch not null and countsBackground supplied,
442
+    # user has to supply batchBackground as well
443
+    if (!is.null(countsBackground) & is.null(batchBackground)) {
444
+      stop(
445
+        "Cell batch, and background are supplied. Please also ",
446
+        "supply background batch."
447
+      )
448
+    }
449
+
399 450
   }
400 451
   runParams$batch <- batch
452
+  runParams$batchBackground <- batchBackground
401 453
   batchIndex <- unique(batch)
402 454
 
403 455
   ## Set result lists upfront for all cells from different batches
... ...
@@ -430,6 +482,7 @@ setMethod(
430 482
 
431 483
     zBat <- NULL
432 484
     countsBat <- counts[, batch == bat]
485
+    bgBat <- countsBackground[, batchBackground == bat]
433 486
 
434 487
     ## Convert to sparse matrix
435 488
     if (!inherits(countsBat, "dgCMatrix")) {
... ...
@@ -442,9 +495,9 @@ setMethod(
442 495
       )
443 496
       countsBat <- methods::as(countsBat, "dgCMatrix")
444 497
     }
445
-    if (!is.null(countsBackground)) {
446
-      if (!inherits(countsBackground, "dgCMatrix")) {
447
-        countsBackground <- methods::as(countsBackground, "dgCMatrix")
498
+    if (!is.null(bgBat)) {
499
+      if (!inherits(bgBat, "dgCMatrix")) {
500
+        bgBat <- methods::as(bgBat, "dgCMatrix")
448 501
       }
449 502
     }
450 503
 
... ...
@@ -456,7 +509,7 @@ setMethod(
456 509
         counts = countsBat,
457 510
         z = zBat,
458 511
         batch = bat,
459
-        countsBackground = countsBackground,
512
+        countsBackground = bgBat,
460 513
         maxIter = maxIter,
461 514
         delta = delta,
462 515
         estimateDelta = estimateDelta,
... ...
@@ -475,7 +528,7 @@ setMethod(
475 528
           counts = countsBat,
476 529
           z = zBat,
477 530
           batch = bat,
478
-          countsBackground = countsBackground,
531
+          countsBackground = bgBat,
479 532
           maxIter = maxIter,
480 533
           delta = delta,
481 534
           estimateDelta = estimateDelta,
... ...
@@ -491,7 +544,7 @@ setMethod(
491 544
     }
492 545
 
493 546
     ## Try to convert class of new matrix to class of original matrix
494
-    
547
+
495 548
     .logMessages(
496 549
       date(),
497 550
       ".. Calculating final decontaminated matrix",
... ...
@@ -563,7 +616,7 @@ setMethod(
563 616
       append = TRUE,
564 617
       verbose = verbose
565 618
     )
566
-    
619
+
567 620
     ## Determine class of seed in DelayedArray
568 621
     seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]]
569 622
     if (seed.class == "HDF5ArraySeed") {
... ...
@@ -1369,9 +1422,11 @@ simulateContamination <- function(C = 300,
1369 1422
 }
1370 1423
 
1371 1424
 
1372
-.checkBackground <- function(x, background, logfile = NULL, verbose = FALSE) {
1425
+.checkBackground <- function(x, background, bgBatch,
1426
+                             logfile = NULL, verbose = FALSE) {
1373 1427
   # Remove background barcodes that have already appeared in x
1374
-  if(!is.null(colnames(background))) {
1428
+  # If bgBatch param is supplied, also remove duplicate bgBatch
1429
+  if (!is.null(colnames(background))) {
1375 1430
     dupBarcode <- colnames(background) %in% colnames(x)
1376 1431
   } else {
1377 1432
     dupBarcode <- FALSE
... ...
@@ -1381,7 +1436,7 @@ simulateContamination <- function(C = 300,
1381 1436
             " Please ensure that no true cells are included in the background ",
1382 1437
             "matrix. Otherwise, results will be incorrect.")
1383 1438
   }
1384
-  
1439
+
1385 1440
   if (any(dupBarcode)) {
1386 1441
     .logMessages(
1387 1442
       date(),
... ...
@@ -1394,6 +1449,20 @@ simulateContamination <- function(C = 300,
1394 1449
       verbose = verbose
1395 1450
     )
1396 1451
     background <- background[, !(dupBarcode), drop = FALSE]
1452
+
1453
+    if (!is.null(bgBatch)) {
1454
+      if (length(bgBatch) != length(dupBarcode)) {
1455
+        stop(
1456
+          "Length of bgBatch must be equal to the number of columns",
1457
+          "of background matrix."
1458
+          )
1459
+      }
1460
+      bgBatch <- bgBatch[!(dupBarcode)]
1461
+    }
1397 1462
   }
1398
-  return(background)
1463
+
1464
+  re <- list(background = background,
1465
+            bgBatch = bgBatch)
1466
+
1467
+  return(re)
1399 1468
 }
... ...
@@ -15,6 +15,7 @@ decontX(x, ...)
15 15
   batch = NULL,
16 16
   background = NULL,
17 17
   bgAssayName = NULL,
18
+  bgBatch = NULL,
18 19
   maxIter = 500,
19 20
   delta = c(10, 10),
20 21
   estimateDelta = TRUE,
... ...
@@ -32,6 +33,7 @@ decontX(x, ...)
32 33
   z = NULL,
33 34
   batch = NULL,
34 35
   background = NULL,
36
+  bgBatch = NULL,
35 37
   maxIter = 500,
36 38
   delta = c(10, 10),
37 39
   estimateDelta = TRUE,
... ...
@@ -72,15 +74,20 @@ should be considered different batches. Default NULL.}
72 74
 
73 75
 \item{background}{A numeric matrix of counts or a
74 76
 \linkS4class{SingleCellExperiment} with the matrix located in the assay
75
-slot under \code{assayName}. It should have the same structure as \code{x}
76
-except it contains the matrix of empty droplets instead of cells. When
77
-supplied, empirical distribution of transcripts from these empty droplets
77
+slot under \code{assayName}. It should have the same data format as \code{x}
78
+except it contains the empty droplets instead of cells. When supplied,
79
+empirical distribution of transcripts from these empty droplets
78 80
 will be used as the contamination distribution. Default NULL.}
79 81
 
80 82
 \item{bgAssayName}{Character. Name of the assay to use if \code{background}
81 83
 is a \linkS4class{SingleCellExperiment}. Default to same as
82 84
 \code{assayName}.}
83 85
 
86
+\item{bgBatch}{Numeric or character vector. Batch labels for
87
+\code{background}. Its unique values should be the same as those in
88
+\code{batch}, such that each batch of cells have their corresponding batch
89
+of empty droplets as background, pointed by this parameter. Default to NULL.}
90
+
84 91
 \item{maxIter}{Integer. Maximum iterations of the EM algorithm. Default 500.}
85 92
 
86 93
 \item{delta}{Numeric Vector of length 2. Concentration parameters for
... ...
@@ -6,6 +6,11 @@
6 6
 
7 7
 using namespace Rcpp;
8 8
 
9
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
10
+Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
11
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
12
+#endif
13
+
9 14
 // decontXEM
10 15
 Rcpp::List decontXEM(const Eigen::MappedSparseMatrix<double>& counts, const NumericVector& counts_colsums, const NumericVector& theta, const bool& estimate_eta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const bool& estimate_delta, const NumericVector& delta, const double& pseudocount);
11 16
 RcppExport SEXP _celda_decontXEM(SEXP countsSEXP, SEXP counts_colsumsSEXP, SEXP thetaSEXP, SEXP estimate_etaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP estimate_deltaSEXP, SEXP deltaSEXP, SEXP pseudocountSEXP) {
... ...
@@ -59,19 +59,21 @@ rownames(sce) <- rowData(sce)$Symbol_TENx
59 59
 
60 60
 # Running decontX
61 61
 
62
-A SingleCellExperiment (SCE) object or a sparse matrix containing the counts for filtered cells can be passed to decontX via the `x` parameter. There are two major ways to run decontX: with and without the raw/droplet matrix containing empty droplets. The raw/droplet matrix can be used to empirically estimate the distribution of ambient RNA, which is especially useful when cells that contributed to the ambient RNA are not accurately represented in the filtered count matrix containing the cells. For example, cells that were removed via flow cytometry or that were more sensitive to lysis during dissociation may have contributed to the ambient RNA but were not measured in the filtered/cell matrix. The raw/droplet matrix can be input as a sparse matrix or SCE object using the `background` parameter:
62
+A SingleCellExperiment (SCE) object or a sparse matrix containing the counts for filtered cells can be passed to decontX via the `x` parameter. There are two major ways to run decontX: with and without the raw/droplet matrix containing empty droplets. The raw/droplet matrix can be used to empirically estimate the distribution of ambient RNA, which is especially useful when cells that contributed to the ambient RNA are not accurately represented in the filtered count matrix containing the cells. For example, cells that were removed via flow cytometry or that were more sensitive to lysis during dissociation may have contributed to the ambient RNA but were not measured in the filtered/cell matrix. The raw/droplet matrix can be input as an SCE object or a sparse matrix using the `background` parameter:
63 63
 
64 64
 ```{r decontX_background, eval=FALSE, message=FALSE}
65 65
 sce <- decontX(sce, background = raw)
66 66
 ```
67 67
 
68
-If cell/column names in the raw/droplet matrix are also found in the filtered counts matrix, then they will be excluded from the raw/droplet matrix before calculation of the ambient RNA distribution. If the raw matrix is not available, then `decontX` will estimate the contamination distribution for each cell cluster based on the profiles of the other cell clusters in the filtered dataset:
68
+We would like to stress that `background` input was designed to contain only empty droplets. In case the `background` input contains both cell and empty droplets, for example the raw output from 10X Genomics, the software will try to look up for the cell/column names in the raw matrix (`background`) that are also found in the filtered counts matrix (`x`), and exclude them from the raw matrix. When cell/column names are not available for the input objects, the software will treat the entire `background` input as empty droplets. This will render incorrect estimation of the ambient RNA profile.
69
+
70
+If the raw matrix is not available, then `decontX` will estimate the contamination distribution for each cell cluster based on the profiles of the other cell clusters in the filtered dataset:
69 71
 
70 72
 ```{r decontX, eval=TRUE, message=FALSE}
71 73
 sce <- decontX(sce)
72 74
 ```
73 75
 
74
-Note that in this case `decontX` will perform heuristic clustering to quickly define major cell clusters. However if you have your own cell cluster labels, they can be specified with the `z` parameter. If you supply a raw matrix via the `background` parameter, then the `z` parameter will not have an effect as clustering will not be performed.
76
+Note that in this case `decontX` will perform heuristic clustering to quickly define major cell clusters. However if you have your own cell cluster labels, they can be specified with the `z` parameter.
75 77
 
76 78
 The contamination can be found in the `colData(sce)$decontX_contamination` and the decontaminated counts can be accessed with `decontXcounts(sce)`. If the input object was a matrix, make sure to save the output into a variable with a different name (e.g. result). The result object will be a list with contamination in `result$contamination` and the decontaminated counts in `result$decontXcounts`. 
77 79
 
... ...
@@ -202,6 +204,43 @@ plot(sce$decontX_contamination, sce.delta$decontX_contamination,
202 204
 abline(0, 1, col = "red", lwd = 2)
203 205
 ```
204 206
 
207
+## Integration with packages such as Seurat and singleCellTK
208
+You can integrate decontX into your scRNA-seq analysis pipelines, such as the one provided by  [Seurat](https://cran.r-project.org/web/packages/Seurat/index.html). Both decontX and Seurat takes input count matrix, although the decontaminated matrix of decontX consists of floating point numbers. As heuristics, you can round the decontaminated matrix to integers before applying it to your Seurat pipeline. 
209
+
210
+```{r seuratIntegration, eval=FALSE}
211
+library(Seurat)
212
+
213
+counts <- Read10X("path/to/file")
214
+
215
+# Convert count matrix to SingleCellExperiment to run on decontX
216
+sce <- SingleCellExperiment(list(counts = counts))
217
+sce <- decontX(sce)
218
+
219
+# Retrieve decontaminated matrix, round to integer, and convert to Seurat object
220
+decontaminated.matrix <- decontXcounts(sce)
221
+decontaminated.counts <- round(decontaminated.matrix)
222
+seuratObject <- CreateSeuratObject(decontaminated.counts)
223
+```
224
+
225
+Conversely, if you have a Seurat object containing raw count matrix and would like to run decontX, simply retrieve the count matrix, convert to SingleCellExperiment, and run on decontX.
226
+
227
+```{r seuratIntegration2, eval=FALSE}
228
+counts <- GetAssayData(object = seuratObject, slot = "counts")
229
+sce <- SingleCellExperiment(list(counts = counts))
230
+sce <- decontX(sce)
231
+```
232
+
233
+
234
+To import datasets into SingleCellExperiment object, the [singleCellTK](https://bioconductor.org/packages/release/bioc/html/singleCellTK.html) package has several importing functions for different preprocessing tools including CellRanger, STARsolo, BUStools, Optimus, DropEST, SEQC, and Alevin/Salmon. For example, the following code can be used as a template to read in the filtered and raw matrices for multiple samples processed with CellRanger:
235
+
236
+```{r singleCellTKIntegration, eval=FALSE}
237
+library(singleCellTK)
238
+
239
+sce <- importCellRanger(sampleDirs = c("path/to/sample1/", "path/to/sample2/"))
240
+sce.raw <- importCellRanger(sampleDirs = c("path/to/sample1/", "path/to/sample2/"), dataType = "raw")
241
+```
242
+
243
+
205 244
 # Session Information
206 245
 
207 246
 ```{r}