Browse code

Updated perplexity functions to be able to subsample cells or run on the original matrix. Also fixed bug when running calculating perplexity on a new counts matrix

Joshua D. Campbell authored on 01/01/2022 01:03:13
Showing 10 changed files

... ...
@@ -80,8 +80,9 @@ setMethod("perplexity", signature(x = "SingleCellExperiment"),
80 80
             y <- celdaModules(x, altExpName = altExpName)
81 81
             p <- .perplexityCelda_G(counts,
82 82
                 factorized,
83
-                L,
84
-                y,
83
+                L = L,
84
+                y = y,
85
+                beta = S4Vectors::metadata(altExp)$celda_parameters$beta,
85 86
                 newCounts = newCounts)
86 87
         } else {
87 88
             stop("S4Vectors::metadata(altExp(x, altExpName))$",
... ...
@@ -198,15 +199,6 @@ setMethod(
198 199
          counts <- .processCounts(x)
199 200
          compareCountMatrix(counts, celdaMod)
200 201
 
201
-        if (is.null(newCounts)) {
202
-            newCounts <- counts
203
-        } else {
204
-            newCounts <- .processCounts(newCounts)
205
-        }
206
-        if (nrow(newCounts) != nrow(counts)) {
207
-            stop("newCounts should have the same number of rows as counts.")
208
-        }
209
-
210 202
         factorized <- factorizeMatrix(
211 203
             x = counts,
212 204
             celdaMod = celdaMod,
... ...
@@ -217,16 +209,27 @@ setMethod(
217 209
         eta <- factorized$posterior$geneDistribution
218 210
         nGByTS <- factorized$counts$geneDistribution
219 211
 
220
-        etaProb <- log(eta) * nGByTS
221
-        # gene.by.cell.prob = log(psi %*% phi)
222
-        # logPx = sum(gene.by.cell.prob * newCounts) # + sum(etaProb)
223
-        logPx <- .perplexityGLogPx(
224
-            newCounts,
225
-            phi,
226
-            psi,
227
-            celdaClusters(celdaMod)$y,
228
-            params(celdaMod)$L
229
-        ) # + sum(etaProb)
212
+        
213
+        if (is.null(newCounts)) {
214
+            newCounts <- counts
215
+        } else {
216
+            newCounts <- .processCounts(newCounts)
217
+            beta <- params(celdaMod)$beta
218
+            L <- params(celdaMod)$L
219
+            y <- celdaMod@clusters$y
220
+            phi <- .rowSumByGroup(newCounts, group = y, L = L) + beta
221
+            phi <- normalizeCounts(phi, normalize = "proportion")
222
+        }
223
+        if (nrow(newCounts) != nrow(counts)) {
224
+            stop("newCounts should have the same number of rows as counts.")
225
+        }
226
+        
227
+        #etaProb <- log(eta) * nGByTS
228
+        #gene.by.cell.prob = log(psi %*% phi)
229
+        #logPx = sum(gene.by.cell.prob * newCounts) # + sum(etaProb)
230
+        gene.by.cell.prob <- log(psi %*% phi)
231
+        logPx <- sum(gene.by.cell.prob * newCounts) # + sum(etaProb)
232
+
230 233
         perplexity <- exp(- (logPx / sum(newCounts)))
231 234
         return(perplexity)
232 235
     }
... ...
@@ -300,40 +303,37 @@ setMethod(
300 303
     factorized,
301 304
     L,
302 305
     y,
306
+    beta,
303 307
     newCounts) {
304 308
 
309
+    psi <- factorized$posterior$module
310
+    phi <- factorized$posterior$cell
311
+    #eta <- factorized$posterior$geneDistribution
312
+    #nGByTS <- factorized$counts$geneDistribution
313
+
305 314
     if (is.null(newCounts)) {
306 315
         newCounts <- counts
307 316
     } else {
308 317
         newCounts <- .processCounts(newCounts)
318
+        phi <- .rowSumByGroup(newCounts, group = y, L = L) + beta
319
+        phi <- normalizeCounts(phi, normalize = "proportion")
309 320
     }
310 321
     if (nrow(newCounts) != nrow(counts)) {
311
-        stop("newCounts should have the same number of rows as",
312
-            " 'assay(altExp(x, altExpName), i = useAssay)'.")
322
+        stop("newCounts should have the same number of rows as counts.")
313 323
     }
324
+    
325
+    #etaProb <- log(eta) * nGByTS
326
+    #gene.by.cell.prob <- log(psi %*% phi)
314 327
 
315
-    psi <- factorized$posterior$module
316
-    phi <- factorized$posterior$cell
317
-    eta <- factorized$posterior$geneDistribution
318
-    nGByTS <- factorized$counts$geneDistribution
319
-
320
-    etaProb <- log(eta) * nGByTS
321
-    # gene.by.cell.prob = log(psi %*% phi)
322
-    # logPx = sum(gene.by.cell.prob * newCounts) # + sum(etaProb)
323
-    logPx <- .perplexityGLogPx(
324
-        newCounts,
325
-        phi,
326
-        psi,
327
-        y,
328
-        L
329
-    ) # + sum(etaProb)
328
+    gene.by.cell.prob <- log(psi %*% phi)
329
+    logPx <- sum(gene.by.cell.prob * newCounts) # + sum(etaProb)
330
+    
330 331
     perplexity <- exp(- (logPx / sum(newCounts)))
331 332
     return(perplexity)
332 333
 }
333 334
 
334 335
 
335
-#' @title Calculate and visualize perplexity of all models in a celdaList, with
336
-#'  count resampling
336
+#' @title Calculate and visualize perplexity of all models in a celdaList
337 337
 #' @description Calculates the perplexity of each model's cluster assignments
338 338
 #'  given the provided countMatrix, as well as resamplings of that count
339 339
 #'  matrix, providing a distribution of perplexities and a better sense of the
... ...
@@ -351,10 +351,19 @@ setMethod(
351 351
 #'  to use. Default "featureSubset".
352 352
 #' @param celdaList Object of class 'celdaList'. Used only if \code{x} is a
353 353
 #'  matrix object.
354
-#' @param resample Integer. The number of times to resample the counts matrix
355
-#'  for evaluating perplexity. Default 5.
354
+#' @param doResampling Boolean. If \code{TRUE}, then each cell in the counts
355
+#' matrix will be resampled according to a multinomial distribution to introduce
356
+#' noise before caculating perplexity. Default \code{FALSE}.
357
+#' @param doSubsampling Boolean. If \code{TRUE}, then a subset of cells from
358
+#' the original counts matrix will be randomly selected. Default \code{TRUE}.
359
+#' @param numResample Integer. The number of times to resample the counts matrix
360
+#' for evaluating perplexity if \code{doSubsampling} is set to \code{TRUE}.
361
+#' Default \code{5}.
362
+#' @param numSubsample Integer. The number of cells to sample from the
363
+#' the counts matrix if \code{doSubsampling} is set to \code{TRUE}. 
364
+#' Default \code{5000}.
356 365
 #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility,
357
-#'  a default value of 12345 is used. If NULL, no calls to
366
+#'  a default value of \code{12345} is used. If \code{NULL}, no calls to
358 367
 #'  \link[withr]{with_seed} are made.
359 368
 #' @return A \linkS4class{SingleCellExperiment} object or
360 369
 #'  \code{celdaList} object with a \code{perplexity}
... ...
@@ -366,7 +375,10 @@ setGeneric("resamplePerplexity",
366 375
         celdaList,
367 376
         useAssay = "counts",
368 377
         altExpName = "featureSubset",
369
-        resample = 5,
378
+        doResampling = FALSE,
379
+        doSubsampling = TRUE,
380
+        numResample = 5,
381
+        numSubsample = 5000,
370 382
         seed = 12345) {
371 383
     standardGeneric("resamplePerplexity")})
372 384
 
... ...
@@ -382,7 +394,10 @@ setMethod("resamplePerplexity",
382 394
     function(x,
383 395
         useAssay = "counts",
384 396
         altExpName = "featureSubset",
385
-        resample = 5,
397
+        doResampling = FALSE,
398
+        doSubsampling = TRUE,
399
+        numResample = 5,
400
+        numSubsample = 5000,
386 401
         seed = 12345) {
387 402
 
388 403
         altExp <- SingleCellExperiment::altExp(x, altExpName)
... ...
@@ -393,14 +408,19 @@ setMethod("resamplePerplexity",
393 408
             res <- .resamplePerplexity(
394 409
                 counts = counts,
395 410
                 celdaList = celdaList,
396
-                resample = resample)
411
+                doResampling = doResampling,
412
+                doSubsampling = doSubsampling,
413
+                numResample = numResample,
414
+                numSubsample = numSubsample)
397 415
         } else {
398 416
             with_seed(seed,
399 417
                 res <- .resamplePerplexity(
400 418
                     counts = counts,
401 419
                     celdaList = celdaList,
402
-                    resample = resample)
403
-            )
420
+                    doResampling = doResampling,
421
+                    doSubsampling = doSubsampling,
422
+                    numResample = numResample,
423
+                    numSubsample = numSubsample))
404 424
         }
405 425
 
406 426
         S4Vectors::metadata(altExp)$celda_grid_search <- res
... ...
@@ -423,20 +443,29 @@ setMethod("resamplePerplexity",
423 443
     signature(x = "ANY"),
424 444
     function(x,
425 445
         celdaList,
426
-        resample = 5,
446
+        doResampling = FALSE,
447
+        doSubsampling = TRUE,
448
+        numResample = 5,
449
+        numSubsample = 5000,
427 450
         seed = 12345) {
428 451
 
429 452
         if (is.null(seed)) {
430 453
             res <- .resamplePerplexity(
431 454
                 counts = x,
432 455
                 celdaList = celdaList,
433
-                resample = resample)
456
+                doResampling = doResampling,
457
+                doSubsampling = doSubsampling,
458
+                numResample = numResample,
459
+                numSubsample = numSubsample)
434 460
         } else {
435 461
             with_seed(seed,
436 462
                 res <- .resamplePerplexity(
437 463
                     counts = x,
438 464
                     celdaList = celdaList,
439
-                    resample = resample))
465
+                    doResampling = doResampling,
466
+                    doSubsampling = doSubsampling,
467
+                    numResample = numResample,
468
+                    numSubsample = numSubsample))
440 469
         }
441 470
 
442 471
         return(res)
... ...
@@ -446,26 +475,64 @@ setMethod("resamplePerplexity",
446 475
 
447 476
 .resamplePerplexity <- function(counts,
448 477
     celdaList,
449
-    resample = 5) {
478
+    doResampling = FALSE,
479
+    doSubsampling = TRUE,
480
+    numResample = 5,
481
+    numSubsample = 5000) {
450 482
 
451 483
     if (!methods::is(celdaList, "celdaList")) {
452 484
         stop("celdaList parameter was not of class celdaList.")
453 485
     }
454
-    if (!isTRUE(is.numeric(resample))) {
455
-        stop("Provided resample parameter was not numeric.")
486
+    if (!isTRUE(is.logical(doResampling))) {
487
+        stop("The 'doResampling' parameter needs to be logical (TRUE/FALSE).")
488
+    } 
489
+    if (!isTRUE(is.logical(doSubsampling))) {
490
+        stop("The 'doSubsampling' parameter needs to be logical (TRUE/FALSE).")
491
+    } 
492
+    if (!isTRUE(doResampling) & (!is.numeric(numResample) || numResample < 1)) {
493
+        stop("The 'numResample' parameter needs to be an integer greater ",
494
+        "than 0.")
495
+    }
496
+    if (!isTRUE(doSubsampling) & (!is.numeric(numSubsample) || numSubsample < 1)) {
497
+        stop("The 'numResample' parameter needs to be an integer between ",
498
+             "1 and the number of cells.")
456 499
     }
457 500
 
458
-    perpRes <- matrix(NA, nrow = length(resList(celdaList)), ncol = resample)
459
-    for (j in seq(resample)) {
460
-        newCounts <- .resampleCountMatrix(counts)
501
+    if(isTRUE(doSubsampling) & numSubsample < ncol(counts)) {
502
+        ix <- sample(seq(ncol(counts)), size = numSubsample)
503
+        newCounts <- counts[,ix]
504
+        
505
+    } else {
506
+        newCounts <- counts
507
+    }
508
+    if(isTRUE(doResampling)) {
509
+        perpRes <- matrix(NA,
510
+                          nrow = length(resList(celdaList)),
511
+                          ncol = numResample)
512
+        for (j in seq(numResample)) {
513
+            newCounts <- .resampleCountMatrix(newCounts)
514
+            for (i in seq(length(resList(celdaList)))) {
515
+                perpRes[i, j] <- perplexity(x = counts,
516
+                    celdaMod = resList(celdaList)[[i]],
517
+                    newCounts = newCounts)
518
+            }
519
+        }
520
+        celdaList@perplexity <- perpRes
521
+        
522
+    } else {
523
+        perpRes <- matrix(NA,
524
+                          nrow = length(resList(celdaList)),
525
+                          ncol = 1)
461 526
         for (i in seq(length(resList(celdaList)))) {
462
-            perpRes[i, j] <- perplexity(x = counts,
463
-                celdaMod = resList(celdaList)[[i]],
464
-                newCounts = newCounts)
527
+            perpRes[i,1] <- perplexity(x = counts,
528
+                                       celdaMod = resList(celdaList)[[i]],
529
+                                       newCounts = newCounts)
465 530
         }
466
-    }
531
+    }    
532
+   
533
+    # Add perplexity data.frame to celda list object
467 534
     celdaList@perplexity <- perpRes
468
-
535
+    
469 536
     ## Add mean perplexity to runParams
470 537
     perpMean <- apply(perpRes, 1, mean)
471 538
     celdaList@runParams$mean_perplexity <- perpMean
... ...
@@ -736,15 +803,6 @@ setMethod("plotGridSearchPerplexity",
736 803
 
737 804
 
738 805
 # Resample a counts matrix for evaluating perplexity
739
-# Normalizes each column (cell) of a countMatrix by the column sum to
740
-# create a distribution of observing a given number of counts for a given
741
-# gene in that cell,
742
-# then samples across all cells.
743
-# This is primarily used to evaluate the stability of the perplexity for
744
-# a given K/L combination.
745
-# @param celda.mod A single celda run (usually from the _resList_ property
746
-# of a celdaList).
747
-# @return The perplexity for the provided chain as an mpfr number.
748 806
 #' @importFrom Matrix colSums t
749 807
 .resampleCountMatrix <- function(countMatrix) {
750 808
     colsums <- colSums(countMatrix)
751 809
deleted file mode 100644
... ...
@@ -1,19 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/RcppExports.R
3
-\name{eigenMatMultInt}
4
-\alias{eigenMatMultInt}
5
-\title{Fast matrix multiplication for double x int}
6
-\usage{
7
-eigenMatMultInt(A, B)
8
-}
9
-\arguments{
10
-\item{A}{a double matrix}
11
-
12
-\item{B}{an integer matrix}
13
-}
14
-\value{
15
-An integer matrix representing the product of A and B
16
-}
17
-\description{
18
-Fast matrix multiplication for double x int
19
-}
20 0
deleted file mode 100644
... ...
@@ -1,19 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/RcppExports.R
3
-\name{eigenMatMultNumeric}
4
-\alias{eigenMatMultNumeric}
5
-\title{Fast matrix multiplication for double x double}
6
-\usage{
7
-eigenMatMultNumeric(A, B)
8
-}
9
-\arguments{
10
-\item{A}{a double matrix}
11
-
12
-\item{B}{an integer matrix}
13
-}
14
-\value{
15
-An integer matrix representing the product of A and B
16
-}
17
-\description{
18
-Fast matrix multiplication for double x double
19
-}
20 0
deleted file mode 100644
... ...
@@ -1,19 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/RcppExports.R
3
-\name{fastNormProp}
4
-\alias{fastNormProp}
5
-\title{Fast normalization for numeric matrix}
6
-\usage{
7
-fastNormProp(R_counts, R_alpha)
8
-}
9
-\arguments{
10
-\item{R_counts}{An integer matrix}
11
-
12
-\item{R_alpha}{A double value to be added to the matrix as a pseudocount}
13
-}
14
-\value{
15
-A numeric matrix where the columns have been normalized to proportions
16
-}
17
-\description{
18
-Fast normalization for numeric matrix
19
-}
20 0
deleted file mode 100644
... ...
@@ -1,19 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/RcppExports.R
3
-\name{fastNormPropLog}
4
-\alias{fastNormPropLog}
5
-\title{Fast normalization for numeric matrix}
6
-\usage{
7
-fastNormPropLog(R_counts, R_alpha)
8
-}
9
-\arguments{
10
-\item{R_counts}{An integer matrix}
11
-
12
-\item{R_alpha}{A double value to be added to the matrix as a pseudocount}
13
-}
14
-\value{
15
-A numeric matrix where the columns have been normalized to proportions
16
-}
17
-\description{
18
-Fast normalization for numeric matrix
19
-}
20 0
deleted file mode 100644
... ...
@@ -1,19 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/RcppExports.R
3
-\name{fastNormPropSqrt}
4
-\alias{fastNormPropSqrt}
5
-\title{Fast normalization for numeric matrix}
6
-\usage{
7
-fastNormPropSqrt(R_counts, R_alpha)
8
-}
9
-\arguments{
10
-\item{R_counts}{An integer matrix}
11
-
12
-\item{R_alpha}{A double value to be added to the matrix as a pseudocount}
13
-}
14
-\value{
15
-A numeric matrix where the columns have been normalized to proportions
16
-}
17
-\description{
18
-Fast normalization for numeric matrix
19
-}
20 0
deleted file mode 100644
... ...
@@ -1,17 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/RcppExports.R
3
-\name{nonzero}
4
-\alias{nonzero}
5
-\title{get row and column indices of none zero elements in the matrix}
6
-\usage{
7
-nonzero(R_counts)
8
-}
9
-\arguments{
10
-\item{R_counts}{A matrix}
11
-}
12
-\value{
13
-An integer matrix where each row is a row, column indices pair
14
-}
15
-\description{
16
-get row and column indices of none zero elements in the matrix
17
-}
... ...
@@ -4,15 +4,17 @@
4 4
 \alias{resamplePerplexity}
5 5
 \alias{resamplePerplexity,SingleCellExperiment-method}
6 6
 \alias{resamplePerplexity,ANY-method}
7
-\title{Calculate and visualize perplexity of all models in a celdaList, with
8
- count resampling}
7
+\title{Calculate and visualize perplexity of all models in a celdaList}
9 8
 \usage{
10 9
 resamplePerplexity(
11 10
   x,
12 11
   celdaList,
13 12
   useAssay = "counts",
14 13
   altExpName = "featureSubset",
15
-  resample = 5,
14
+  doResampling = FALSE,
15
+  doSubsampling = TRUE,
16
+  numResample = 5,
17
+  numSubsample = 5000,
16 18
   seed = 12345
17 19
 )
18 20
 
... ...
@@ -20,11 +22,22 @@ resamplePerplexity(
20 22
   x,
21 23
   useAssay = "counts",
22 24
   altExpName = "featureSubset",
23
-  resample = 5,
25
+  doResampling = FALSE,
26
+  doSubsampling = TRUE,
27
+  numResample = 5,
28
+  numSubsample = 5000,
24 29
   seed = 12345
25 30
 )
26 31
 
27
-\S4method{resamplePerplexity}{ANY}(x, celdaList, resample = 5, seed = 12345)
32
+\S4method{resamplePerplexity}{ANY}(
33
+  x,
34
+  celdaList,
35
+  doResampling = FALSE,
36
+  doSubsampling = TRUE,
37
+  numResample = 5,
38
+  numSubsample = 5000,
39
+  seed = 12345
40
+)
28 41
 }
29 42
 \arguments{
30 43
 \item{x}{A numeric \link{matrix} of counts or a
... ...
@@ -44,11 +57,23 @@ slot to use if \code{x} is a
44 57
 \item{altExpName}{The name for the \link{altExp} slot
45 58
 to use. Default "featureSubset".}
46 59
 
47
-\item{resample}{Integer. The number of times to resample the counts matrix
48
-for evaluating perplexity. Default 5.}
60
+\item{doResampling}{Boolean. If \code{TRUE}, then each cell in the counts
61
+matrix will be resampled according to a multinomial distribution to introduce
62
+noise before caculating perplexity. Default \code{FALSE}.}
63
+
64
+\item{doSubsampling}{Boolean. If \code{TRUE}, then a subset of cells from
65
+the original counts matrix will be randomly selected. Default \code{TRUE}.}
66
+
67
+\item{numResample}{Integer. The number of times to resample the counts matrix
68
+for evaluating perplexity if \code{doSubsampling} is set to \code{TRUE}.
69
+Default \code{5}.}
70
+
71
+\item{numSubsample}{Integer. The number of cells to sample from the
72
+the counts matrix if \code{doSubsampling} is set to \code{TRUE}. 
73
+Default \code{5000}.}
49 74
 
50 75
 \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility,
51
-a default value of 12345 is used. If NULL, no calls to
76
+a default value of \code{12345} is used. If \code{NULL}, no calls to
52 77
 \link[withr]{with_seed} are made.}
53 78
 }
54 79
 \value{
... ...
@@ -87,13 +87,13 @@ test_that(desc = "Testing celdaGridSearch with celda_C", {
87 87
         c("index", "chain", "K", "seed", "logLikelihood"))
88 88
     expect_error(plotGridSearchPerplexity(celdaCResList))
89 89
 
90
-    celdaCResList <- resamplePerplexity(celdaCResList, resample = 2)
90
+    celdaCResList <- resamplePerplexity(celdaCResList, numResample = 2)
91 91
     expect_equal(is.null(altExp(
92 92
         celdaCResList)@metadata$celda_grid_search@perplexity),
93 93
         FALSE)
94 94
     expect_true(is(celdaCResList, "SingleCellExperiment"))
95
-    expect_error(resamplePerplexity(celdaCResList, resample = "2"),
96
-        "Provided resample parameter was not numeric.")
95
+    expect_error(resamplePerplexity(celdaCResList, numResample = "2"),
96
+        "The 'numResample' parameter needs to be an integer greater than 0.")
97 97
 
98 98
     plotObj <- plotGridSearchPerplexity(celdaCResList)
99 99
     expect_is(plotObj, "ggplot")
... ...
@@ -95,13 +95,13 @@
95 95
 # #         c("index", "chain", "L", "log_likelihood"))
96 96
 # #
97 97
 # #     celdaGRes <- resamplePerplexity(celdaGSim$counts, celdaGRes,
98
-# #     resample = 2)
98
+# #     numResample = 2)
99 99
 # #     expect_equal(is.null(celdaGRes@perplexity), FALSE)
100 100
 # #     expect_is(celdaGRes, "celdaList")
101 101
 # #     expect_error(resamplePerplexity(celdaGSim$counts,
102
-# #         celdaGRes, resample = "2"))
102
+# #         celdaGRes, numResample = "2"))
103 103
 # #     expect_error(resamplePerplexity(celdaGSim$counts,
104
-# #         "celdaGRes", resample = 2))
104
+# #         "celdaGRes", numResample = 2))
105 105
 # #
106 106
 # #     plotObj <- plotGridSearchPerplexity(celdaGRes)
107 107
 # #     expect_is(plotObj, "ggplot")