
merge master into sctk_qc

rz2333 authored on 15/10/2020 20:55:34
Showing 145 changed files

... ...
@@ -1,16 +1,16 @@
1 1
 language: r
2
-cache: 
2
+cache:
3 3
   - packages
4 4
   - pip
5 5
 r:
6
-  - oldrel
6
+  - release
7 7
 sudo: false
8 8
 warnings_are_errors: true
9 9
 bioc_required: true
10 10
 
11 11
 # Current version of xgboost fails install
12 12
 # Need to force install older version using 'devtools::install_version'
13
-#    which is done in 'before_install' (see below) 
13
+#    which is done in 'before_install' (see below)
14 14
 # However 'remotes::dev_package_deps' will try to auto upgrade packages
15 15
 #    Therefore, a new install command is used with 'upgrade="never"'
16 16
 # These parts can be deleted once the xgboost error is fixed
... ...
@@ -21,7 +21,7 @@ before_install:
21 21
   - Rscript -e 'install.packages("devtools")'
22 22
   - Rscript -e 'devtools::install_version("xgboost", version = "0.90.0.2", repos = "http://cran.us.r-project.org")'
23 23
   - pip install --upgrade pip
24
-  - pip install scipy scrublet scanpy scanorama scgen bbknn
24
+  - pip install scipy scrublet scanpy scanorama bbknn
25 25
 
26 26
 install:
27 27
 - R -e 'devtools::install_deps(dep = TRUE, upgrade="never")'
... ...
@@ -31,11 +31,9 @@ addons:
31 31
     packages:
32 32
       - libv8-dev
33 33
 
34
-r_build_args: "--no-build-vignettes"
35
-r_check_args: "--no-vignettes --no-build-vignettes --as-cran"
34
+r_check_args: "--as-cran"
36 35
 r_github_packages:
37 36
   - jimhester/lintr
38
-  - compbio/celda
39 37
 notifications:
40 38
   slack:
41 39
     secure: sw/He3WX7Fs61DTufgG9hx7fWDhwx57zv+dYB4sr30LgKyIJNbZYSU76ZN4oPwHFGAJ0eyk/22XCuKF8/seSB2ArQuc4kFlsaeWWg1CxDqW9iR0zs91MJL3lJoUOSayy5WcWbsUxD+x8GcB37FCgoTXcveN956bfgrDb2nGrFY6QMSVsnkWi/kiMdzhR4VPzOfmAUZUcr1aDU3WN4ROwbDg89zCfu9HbVKZwyQdYZk7L2Pgg8xGIcGZkaMN1MMD/r0Wc2aVYqCOcn5qO/9CaspEBf7cd6ejVzgAl/LM4dTIPlx9E6ekQGwJXpt1Qxyc0sqVSQWFZvFkWyV3wkqX8pfSXV6ojlP2cGJt+PSQXwLHj0TKWYx7yHbw+55da/b//SVLZVHl+P6VXxQBe2hmFD6sCk3V3uq2CkxzI52bK4Wv4RRncVKIZEpFZOc9P2ik5mf6pICmLHwttlbHeDjJbQ+raYftF704xb1sOxapLwuy+c2SWKPeqOiimfZNgOK+J7JbvQvpnaiHN+3CA3wQXs1auKBwI+swvsZPAzeKssnLv/0AQmX8ckVzvXJzUvl1J73c9V2zG2F44xmIsXwzruEp6leSbeXcLifrfbfQdmaHzq4M/Z83o/fwGRzdWfG7YXDyPIabik7DOMrSUvF+wLGVTjOart8HrA4eVEkKFnRE=
... ...
@@ -1,6 +1,6 @@
1 1
 Package: singleCellTK
2 2
 Type: Package
3
-Title: Interactive Analysis of Single Cell RNA-Seq Data
3
+Title: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data
4 4
 Version: 1.7.6
5 5
 Author: David Jenkins
6 6
 Authors@R: c(person(given="David", family="Jenkins", email="dfj@bu.edu", role=c("aut","cre"),
... ...
@@ -15,17 +15,14 @@ Authors@R: c(person(given="David", family="Jenkins", email="dfj@bu.edu", role=c(
15 15
                     comment = c(ORCID = "0000-0002-6247-6595")))
16 16
 Maintainer: David Jenkins <dfj@bu.edu>
17 17
 Depends:
18
-    R (>= 3.6.0),
18
+    R (>= 4.0),
19 19
     SummarizedExperiment,
20 20
     SingleCellExperiment,
21 21
     DelayedArray,
22 22
     Biobase
23
-Description: Run common single cell analysis directly through your browser
24
-    including differential expression, downsampling analysis, and clustering.
23
+Description: Run common single cell analysis in the R console or directly through your browser. Includes many functions for import, quality control, normalization, batch correction, clustering, differential expression, and visualization.
25 24
 License: MIT + file LICENSE
26 25
 Encoding: UTF-8
27
-cache:
28
-  - packages: true
29 26
 biocViews: SingleCell, GeneExpression, DifferentialExpression, Alignment,
30 27
     Clustering, ImmunoOncology
31 28
 LazyData: TRUE
... ...
@@ -35,6 +32,7 @@ Imports:
35 32
     BiocGenerics,
36 33
     BiocParallel,
37 34
     colourpicker,
35
+    colorspace,
38 36
     cowplot,
39 37
     cluster,
40 38
     ComplexHeatmap,
... ...
@@ -46,24 +44,25 @@ Imports:
46 44
     fields,
47 45
     ggplot2,
48 46
     ggplotify,
49
-    ggtree,
50 47
     ggrepel,
48
+    ggtree,
51 49
     gridExtra,
52 50
     GSVA (>= 1.26.0),
53 51
     GSVAdata,
54 52
     harmony,
55 53
     igraph,
54
+    KernSmooth,
56 55
     liger,
57 56
     limma,
58 57
     MAST,
59 58
     Matrix,
60
-    mnormt,
61 59
     matrixStats,
62 60
     methods,
63 61
     msigdbr,
64 62
     multtest,
65 63
     plotly,
66 64
     RColorBrewer,
65
+    ROCR,
67 66
     Rtsne,
68 67
     S4Vectors,
69 68
     scater,
... ...
@@ -86,7 +85,6 @@ Imports:
86 85
     celda,
87 86
     shinycssloaders,
88 87
     shinythemes,
89
-    umap,
90 88
     uwot,
91 89
     DropletUtils,
92 90
     scds (>= 1.2.0),
... ...
@@ -94,10 +92,8 @@ Imports:
94 92
     tools,
95 93
     withr,
96 94
     GSEABase,
97
-    DoubletFinder,
98 95
     R.utils,
99 96
     zinbwave,
100
-    V8,
101 97
     scRNAseq (>= 2.0.2),
102 98
     TENxPBMCData,
103 99
     yaml,
... ...
@@ -117,12 +113,9 @@ Suggests:
117 113
     org.Mm.eg.db,
118 114
     stringr
119 115
 Remotes: 
120
-    joshua-d-campbell/DoubletFinder,
121 116
     joshua-d-campbell/harmony,
122
-    rz2333/mnormt@1.5-6,
123 117
     joshua-d-campbell/liger,
124 118
     joshua-d-campbell/shiny-directory-input,
125
-    campbio/celda@RELEASE_3_11
126 119
 VignetteBuilder: knitr
127 120
 URL: https://compbiomed.github.io/sctk_docs/
128 121
 BugReports: https://github.com/compbiomed/singleCellTK/issues
... ...
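
A note on the Remotes field edited above: owner/repo entries tell the remotes/devtools toolchain to fetch those dependencies from GitHub rather than CRAN when the package is installed from source. A minimal sketch of generic remotes usage (not a command taken from this diff):

# From a source checkout of singleCellTK: install the GitHub-hosted dependencies
# listed under Remotes, then the remaining dependencies from CRAN/Bioconductor.
install.packages("remotes")
remotes::install_github("joshua-d-campbell/liger")  # one Remotes entry, fetched explicitly
remotes::install_deps(dependencies = TRUE)          # honors the Remotes field for the rest
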
@@ -1,5 +1,6 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3
+export(MAST)
3 4
 export(alignSingleCellData)
4 5
 export(calcEffectSizes)
5 6
 export(combineSCE)
... ...
@@ -8,6 +9,7 @@ export(constructSCE)
8 9
 export(convertGeneIDs)
9 10
 export(convertSCEToSeurat)
10 11
 export(convertSeuratToSCE)
12
+export(discreteColorPalette)
11 13
 export(distinctColors)
12 14
 export(downSampleCells)
13 15
 export(downSampleDepth)
... ...
@@ -16,7 +18,6 @@ export(exportSCE)
16 18
 export(exportSCEtoAnnData)
17 19
 export(exportSCEtoFlatFile)
18 20
 export(featureIndex)
19
-export(filterSCData)
20 21
 export(findMarkerDiffExp)
21 22
 export(generateMeta)
22 23
 export(generateSimulatedData)
... ...
@@ -43,6 +44,7 @@ export(importGeneSetsFromCollection)
43 44
 export(importGeneSetsFromGMT)
44 45
 export(importGeneSetsFromList)
45 46
 export(importGeneSetsFromMSigDB)
47
+export(importMultipleSources)
46 48
 export(importOptimus)
47 49
 export(importSEQC)
48 50
 export(importSTARsolo)
... ...
@@ -86,6 +88,7 @@ export(plotUMAP)
86 88
 export(qcInputProcess)
87 89
 export(readSingleCellMatrix)
88 90
 export(reportCellQC)
91
+export(reportDiffExp)
89 92
 export(reportDropletQC)
90 93
 export(reportQCTool)
91 94
 export(retrieveSCEIndex)
... ...
@@ -114,7 +117,6 @@ export(runMAST)
114 117
 export(runMNNCorrect)
115 118
 export(runPerCellQC)
116 119
 export(runSCANORAMA)
117
-export(runSCGEN)
118 120
 export(runSCMerge)
119 121
 export(runScranSNN)
120 122
 export(runScrublet)
... ...
@@ -160,8 +162,6 @@ import(DelayedArray)
160 162
 import(DropletUtils)
161 163
 import(GSVAdata)
162 164
 import(SingleCellExperiment)
163
-importFrom(DelayedArray,colSums)
164
-importFrom(Matrix,colSums)
165 165
 importFrom(SingleCellExperiment,"counts<-")
166 166
 importFrom(SingleCellExperiment,"reducedDim<-")
167 167
 importFrom(SingleCellExperiment,counts)
168 168
new file mode 100644
... ...
@@ -0,0 +1,136 @@
1
+#' MAST
2
+#'
3
+#' Run and visualize MAST analysis on a SCtkExperiment object.
4
+#'
5
+#' @param inSCE Input SCtkExperiment object. Required
6
+#' @param useAssay The assay to use for the MAST calculations. The default is
7
+#' "logcounts"
8
+#' @param condition The variable (from the colData) that is used for the
9
+#' model.
10
+#' @param interest.level If the condition of interest has more than two factors,
11
+#' indicate which level should be used to compare to all other samples.
12
+#' @param freqExpressed Filter genes that are expressed in at least this
13
+#' fraction of cells. The default is 0.1 (i.e., 10% of cells).
14
+#' @param fcThreshold Minimum absolute log2 fold change for differentially expressed genes.
15
+#' @param p.value Adjusted p-value (FDR) cutoff for selecting hurdle results. The default is 0.05.
16
+#' @param useThresh Use adaptive thresholding to filter genes. The default is
17
+#' FALSE.
18
+#'
19
+#' @describeIn MAST Run MAST analysis.
20
+#'
21
+#' @return MAST(): A data.frame of differentially expressed genes with p-values.
22
+#' @export
23
+MAST <- function(inSCE, condition = NULL, interest.level = NULL,
24
+                 freqExpressed = 0.1, fcThreshold=log2(1.5), p.value = 0.05,
25
+                 useThresh=FALSE, useAssay = "logcounts"){
26
+
27
+  if (is.null(condition)){
28
+    stop("specify the condition of interest")
29
+  }
30
+
31
+  if (length(unique(SingleCellExperiment::colData(inSCE)[, condition])) == 1) {
32
+    stop("only one level is in the condition")
33
+  }
34
+
35
+  if (is.null(interest.level) & length(
36
+    unique(SingleCellExperiment::colData(inSCE)[, condition])) > 2){
37
+    stop("You must specify a level of interest when more than 2 levels are in",
38
+         " the condition")
39
+  }
40
+
41
+  # Create MAST SingleCellAssay
42
+  pdata <- SingleCellExperiment::colData(inSCE)
43
+  expres <- SummarizedExperiment::assay(inSCE, useAssay)
44
+  fdata <- SingleCellExperiment::rowData(inSCE)
45
+  SCENew <- MAST::FromMatrix(expres, pdata, fdata)
46
+
47
+  # Calculate CDR (cellular detection rate) for the zlm model
48
+  SummarizedExperiment::colData(SCENew)$cngeneson <-
49
+    scale(colSums(SummarizedExperiment::assay(SCENew) > 0))
50
+
51
+  if (useThresh){
52
+    SCENew <- SCENew[which(MAST::freq(SCENew) > 0), ]
53
+    invisible(utils::capture.output(thresh <- MAST::thresholdSCRNACountMatrix(
54
+      SummarizedExperiment::assay(SCENew), nbins = 20, min_per_bin = 30)))
55
+    SummarizedExperiment::assays(SCENew) <-
56
+      list(thresh = thresh$counts_threshold,
57
+           tpm = SummarizedExperiment::assay(SCENew))
58
+  }
59
+
60
+  # filter based on frequency of expression across samples
61
+  if (sum(MAST::freq(SCENew) > freqExpressed) <= 1){
62
+    stop("Not enough genes pass frequency expressed filter of 1")
63
+  }
64
+  SCENewSample <- SCENew[which(MAST::freq(SCENew) > freqExpressed), ]
65
+
66
+  # If the condition of interest is numeric, convert it to a factor
67
+  if (is.numeric(SummarizedExperiment::colData(SCENewSample)[, condition])){
68
+    SummarizedExperiment::colData(SCENewSample)[, condition] <-
69
+      as.factor(SummarizedExperiment::colData(SCENewSample)[, condition])
70
+  }
71
+  # Check the condition for NAs; if any are present, throw an error
72
+  if (any(is.na(SingleCellExperiment::colData(inSCE)[, condition]))){
73
+     stop("Annotation data has NA values. Filter them to continue.")
74
+  }
75
+  # >2 levels in the condition
76
+  if (!is.null(interest.level) &
77
+      length(unique(SingleCellExperiment::colData(inSCE)[, condition])) > 2){
78
+    levels(SummarizedExperiment::colData(SCENewSample)[, condition]) <-
79
+      c(levels(SummarizedExperiment::colData(SCENewSample)[, condition]),
80
+        paste0("no_", interest.level))
81
+    SummarizedExperiment::colData(SCENewSample)[, condition][
82
+      SummarizedExperiment::colData(SCENewSample)[, condition] !=
83
+        interest.level] <- paste0("no_", interest.level)
84
+    SummarizedExperiment::colData(SCENewSample)[, condition] <-
85
+      droplevels(as.factor(
86
+        SummarizedExperiment::colData(SCENewSample)[, condition]))
87
+
88
+    hurdle1 <- MAST::zlm(stats::as.formula(paste0("~", condition,
89
+                                                  "+cngeneson")), SCENewSample)
90
+    summaryh1 <- MAST::summary(hurdle1, doLRT = paste0(condition, "no_",
91
+                                                       interest.level))
92
+
93
+    summaryDT <- summaryh1[["datatable"]]
94
+
95
+    fcHurdle <- merge(
96
+      summaryDT[summaryDT$contrast == paste0(condition, "no_", interest.level) &
97
+                  summaryDT$component == "H", c("primerid", "Pr(>Chisq)")],
98
+      summaryDT[summaryDT$contrast == paste0(condition, "no_", interest.level) &
99
+                  summaryDT$component == "logFC", c("primerid", "coef", "ci.hi",
100
+                                                    "ci.lo")]
101
+    )
102
+  } else {
103
+    SummarizedExperiment::colData(SCENewSample)[, condition] <-
104
+      droplevels(as.factor(
105
+        SummarizedExperiment::colData(SCENewSample)[, condition]))
106
+    level.cond  <- levels(
107
+      SummarizedExperiment::colData(SCENewSample)[, condition])
108
+
109
+    hurdle1 <- MAST::zlm(stats::as.formula(paste0("~", condition,
110
+                                                  "+cngeneson")), SCENewSample)
111
+    summaryh1 <- MAST::summary(hurdle1, doLRT = paste0(condition,
112
+                                                       level.cond[2]))
113
+
114
+    summaryDT <- summaryh1[["datatable"]]
115
+
116
+    fcHurdle <- merge(
117
+      summaryDT[summaryDT$contrast == paste0(condition, level.cond[2]) &
118
+                  summaryDT$component == "H", c("primerid", "Pr(>Chisq)")],
119
+      summaryDT[summaryDT$contrast == paste0(condition, level.cond[2]) &
120
+                  summaryDT$component == "logFC", c("primerid", "coef", "ci.hi",
121
+                                                    "ci.lo")]
122
+    )
123
+  }
124
+
125
+  # Adjust p-values for multiple testing; here we use FDR
126
+  fcHurdle$fdr <- stats::p.adjust(fcHurdle$"Pr(>Chisq)", "fdr")
127
+
128
+  # Filter again by the adjusted p-value (FDR) and the fold-change coefficient
129
+  fcHurdleSig <-  fcHurdle[fcHurdle$fdr < p.value & abs(fcHurdle$coef) >
130
+                             fcThreshold & !is.nan(fcHurdle$coef), ]
131
+  colnames(fcHurdleSig)[1] <- "Gene"
132
+
133
+  data.table::setorder(fcHurdleSig, "fdr")
134
+
135
+  return(fcHurdleSig)
136
+}
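
For context, a minimal usage sketch of the MAST() wrapper added above, based on its roxygen documentation; the "level1class" condition column and the presence of a "logcounts" assay in the example data are assumptions rather than facts from this diff:

# Hedged sketch: run the hurdle-model differential expression wrapper on the
# bundled example data and inspect the genes passing the FDR and fold-change filters.
library(singleCellTK)
data("mouseBrainSubsetSCE", package = "singleCellTK")

deg <- MAST(mouseBrainSubsetSCE,
            condition = "level1class",  # assumed colData column with two cell-type levels
            useAssay = "logcounts",     # default assay per the documentation above
            freqExpressed = 0.1,        # keep genes detected in more than 10% of cells
            fcThreshold = log2(1.5),    # minimum |coef| from the logFC component
            p.value = 0.05)             # FDR cutoff applied to the hurdle test
head(deg)                               # data.frame ordered by FDR
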
... ...
@@ -79,7 +79,7 @@
79 79
   
80 80
   ## Get merged rowData
81 81
   by.r <- unique(c('rownames', by.r))
82
-  unionFe <- Reduce(function(r1, r2) merge(r1, r2, by=by.r, all=T), feList)
82
+  unionFe <- Reduce(function(r1, r2) merge(r1, r2, by=by.r, all=TRUE), feList)
83 83
   allGenes <- unique(unlist(lapply(feList, rownames)))
84 84
   
85 85
   ## rowData
... ...
@@ -104,7 +104,7 @@
104 104
   })
105 105
   
106 106
   by.c <- unique(c("rownames", by.c))
107
-  unionCb <- Reduce(function(c1, c2) merge(c1, c2, by=by.c, all=T), cbList)
107
+  unionCb <- Reduce(function(c1, c2) merge(c1, c2, by=by.c, all=TRUE), cbList)
108 108
   rownames(unionCb) <- unionCb[['rownames']]
109 109
   newCbList <- list()
110 110
   for (i in seq_along(sceList)) {
... ...
@@ -234,7 +234,7 @@ combineSCE <- function(sceList, by.r, by.c, combined){
234 234
   assayList <- .mergeAssaySCE(sceList) 
235 235
   
236 236
   New_SCE <- list()
237
-  for (i in 1:length(sceList)) {
237
+  for (i in seq(length(sceList))) {
238 238
     ## create new sce
239 239
     New_SCE[[i]] <- .constructSCE(matrices = assayList[[i]], features = newFeList,
240 240
                                   barcodes = newCbList[[i]], 
... ...
@@ -249,4 +249,4 @@ combineSCE <- function(sceList, by.r, by.c, combined){
249 249
     return(sce)
250 250
   }
251 251
   return(New_SCE)
252
-}
253 252
\ No newline at end of file
253
+}
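
For reference, a small base-R illustration of the loop-index change in combineSCE() above; this is general R behaviour, not code from this diff:

# With an empty list, 1:length(x) evaluates to c(1, 0), so a for-loop body
# would still run twice; a zero-length index vector skips the loop entirely.
x <- list()
1:length(x)         # c(1, 0)
seq_len(length(x))  # integer(0): loop body is skipped
seq_along(x)        # integer(0): equivalent and the most idiomatic spelling
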
... ...
@@ -1,7 +1,7 @@
1 1
 #' Example Single Cell RNA-Seq data in SingleCellExperiment Object, GSE60361
2 2
 #' subset
3 3
 #'
4
-#' A subset of 30 samples from a single cell RNA-Seq experiment from Zeisel, et
4
+#' A subset of 30 cells from a single cell RNA-Seq experiment from Zeisel, et
5 5
 #' al. Science 2015. The data was produced from cells from the mouse
6 6
 #' somatosensory cortex (S1) and hippocampus (CA1). 15 of the cells were
7 7
 #' identified as oligodendrocytes and 15 of the cells were identified as
... ...
@@ -16,12 +16,12 @@
16 16
 #' data("mouseBrainSubsetSCE")
17 17
 "mouseBrainSubsetSCE"
18 18
 
19
-#' Example Single Cell RNA-Seq data in SingleCellExperiment Object, 
19
+#' Example Single Cell RNA-Seq data in SingleCellExperiment Object,
20 20
 #' subset of 10x public dataset
21 21
 #' https://support.10xgenomics.com/single-cell-gene-expression/datasets/2.1.0/pbmc4k
22 22
 #' A subset of 390 barcodes and top 200 genes were included in this example.
23 23
 #' Within the 390 barcodes, 195 are empty droplets, 150 are cell barcodes,
24
-#' and 45 barcodes are doublets predicted by scrublet and doubletFinder package. 
24
+#' and 45 are doublets predicted by the scrublet and DoubletFinder packages.
25 25
 #' This example only serves as a proof of concept and a tutorial on how to
26 26
 #' run the functions in this package. The results should not be
27 27
 #' used for drawing scientific conclusions.
... ...
@@ -40,38 +40,31 @@
40 40
 #' Two batches of pancreas scRNAseq dataset are combined with their original
41 41
 #' counts. Cell types and batches are annotated in `colData(sceBatches)`.
42 42
 #' Two batches came from Wang, et al., 2016, annotated as `'w'`; and Xin, et
43
-#' al., 2016, annotated as `'x'`. Two common cell types, `'alpha'` and 
44
-#' `'beta'`, that could be found in both original studies with relatively 
43
+#' al., 2016, annotated as `'x'`. Two common cell types, `'alpha'` and
44
+#' `'beta'`, that could be found in both original studies with relatively
45 45
 #' large population were kept for cleaner demonstration.
46
-#'
47
-#' @name sceBatches
48
-#' @docType data
49
-#' @format SingleCellExperiment
50
-#' @source DOI: 10.2337/db16-0405 and 10.1016/j.cmet.2016.08.018
51
-#' @keywords datasets
52
-#' @examples
53 46
 #' data('sceBatches')
54 47
 "sceBatches"
55 48
 
56 49
 #' Stably Expressed Gene (SEG) list object, with SEG sets for human and mouse.
57
-#' 
50
+#'
58 51
 #' The two gene sets come from the `segList` dataset of the `scMerge` package.
59 52
 #' @name SEG
60 53
 #' @docType data
61
-#' @format list, with two entries `"human"` and `"mouse"`, each is a charactor
62
-#' array.
63
-#' @source `data('segList', package='scMerge')``
54
+#' @format list, with two entries \code{"human"} and \code{"mouse"}, each a
55
+#' character vector.
56
+#' @source \code{data('segList', package='scMerge')}
64 57
 #' @keywords datasets
65
-#' @examples 
58
+#' @examples
66 59
 #' data('SEG')
67 60
 #' humanSEG <- SEG$human
68 61
 "SEG"
69 62
 
70 63
 #' MSigDB gene set category table
71
-#' 
72
-#' A table of gene set categories that can be download from MSigDB. The 
64
+#'
65
+#' A table of gene set categories that can be downloaded from MSigDB. The
73 66
 #' categories and descriptions can be found here:
74
-#' https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp. The IDs in the 
67
+#' https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp. The IDs in the
75 68
 #' first column can be used to retrieve the gene sets for these categories
76 69
 #' using the \link{importGeneSetsFromMSigDB} function.
77 70
 
... ...
@@ -1,427 +1,666 @@
1
-## Original code from DoubletFinder package
2
-## (https://github.com/chris-mcginnis-ucsf/DoubletFinder)
3
-
4
-.parallel_paramSweep <- function(n, n.real.cells, real.cells, pK, pN, data,
5
-                                   orig.commands, PCs, sct, verbose, seed) {
6
-    sweep.res.list <- list()
7
-    list.ind <- 0
8
-
9
-    ## Make merged real-artifical data
10
-    if (verbose) {
11
-        print(paste("Creating artificial doublets for pN = ",
12
-                    pN[n] * 100, "%", sep = ""))
13
-    }
14
-    n_doublets <- ceiling(n.real.cells / (1 - pN[n]) - n.real.cells)
15
-    real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
16
-    real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
17
-    doublets <- (data[, real.cells1] + data[, real.cells2]) / 2
18
-    colnames(doublets) <- paste("X", 1:n_doublets, sep = "")
19
-    data_wdoublets <- cbind(data, doublets)
20
-
21
-    ## Pre-process Seurat object
22
-    if (sct == FALSE) {
23
-        if (verbose) {
24
-            print("Creating Seurat object...")
25
-        }
26
-        seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
27
-
28
-        if (verbose) {
29
-            print("Normalizing Seurat object...")
30
-        }
31
-        seu_wdoublets <- Seurat::NormalizeData(seu_wdoublets,
32
-                                       normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
33
-                                       scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
34
-                                       margin = orig.commands$NormalizeData.RNA@params$margin,
35
-                                       verbose = verbose
36
-        )
37
-
38
-        if (verbose) {
39
-            print("Finding variable genes...")
40
-        }
41
-        seu_wdoublets <- Seurat::FindVariableFeatures(seu_wdoublets,
42
-                                              selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
43
-                                              loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
44
-                                              clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
45
-                                              mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
46
-                                              dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
47
-                                              num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
48
-                                              binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
49
-                                              nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
50
-                                              mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
51
-                                              dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff,
52
-                                              verbose = verbose
53
-        )
54
-
55
-        if (verbose) {
56
-            print("Scaling data...")
57
-        }
58
-        seu_wdoublets <- Seurat::ScaleData(
59
-            seu_wdoublets,
60
-            features = orig.commands$ScaleData.RNA$features,
61
-            model.use = orig.commands$ScaleData.RNA$model.use,
62
-            do.scale = orig.commands$ScaleData.RNA$do.scale,
63
-            do.center = orig.commands$ScaleData.RNA$do.center,
64
-            scale.max = orig.commands$ScaleData.RNA$scale.max,
65
-            block.size = orig.commands$ScaleData.RNA$block.size,
66
-            min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block,
67
-            verbose = verbose
68
-        )
69
-
70
-        if (verbose) {
71
-            print("Running PCA...")
72
-        }
73
-        seu_wdoublets <- Seurat::RunPCA(
74
-            seu_wdoublets,
75
-            features = orig.commands$ScaleData.RNA$features,
76
-            npcs = length(PCs),
77
-            rev.pca = orig.commands$RunPCA.RNA$rev.pca,
78
-            weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
79
-            seed.use = seed,
80
-            verbose = FALSE
81
-        )
82
-    }
83
-
84
-    if (sct == TRUE) {
85
-        if (verbose) {
86
-            print("Creating Seurat object...")
87
-        }
88
-        seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
89
-
90
-        if (verbose) {
91
-            print("Running SCTransform...")
92
-        }
93
-        seu_wdoublets <- Seurat::SCTransform(seu_wdoublets)
94
-
95
-        if (verbose) {
96
-            print("Running PCA...")
97
-        }
98
-        seu_wdoublets <- Seurat::RunPCA(seu_wdoublets,
99
-                                npcs = length(PCs), seed.use = seed,
100
-                                verbose = verbose
101
-        )
102
-    }
103
-
104
-    ## Compute PC distance matrix
105
-    if (verbose) {
106
-        print("Calculating PC distance matrix...")
107
-    }
108
-    nCells <- nrow(seu_wdoublets@meta.data)
109
-    pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[, PCs]
110
-    seu_wdoublets <- NULL
111
-    gc()
112
-    dist.mat <- fields::rdist(pca.coord)[, 1:n.real.cells]
113
-
114
-    ## Pre-order PC distance matrix prior to iterating across pK
115
-    ## for pANN computations
116
-    if (verbose) {
117
-        print("Defining neighborhoods...")
118
-    }
119
-
120
-    for (i in 1:n.real.cells) {
121
-        dist.mat[, i] <- order(dist.mat[, i])
122
-    }
123
-
124
-    ## Trim PC distance matrix for faster manipulations
125
-    ind <- round(nCells * max(pK)) + 5
126
-    dist.mat <- dist.mat[1:ind, ]
127
-
128
-    ## Compute pANN across pK sweep
129
-
130
-    if (verbose) {
131
-        print("Computing pANN across all pK...")
132
-    }
133
-    for (k in 1:length(pK)) {
134
-        if (verbose) {
135
-            print(paste("pK = ", pK[k], "...", sep = ""))
136
-        }
137
-        pk.temp <- round(nCells * pK[k])
138
-        pANN <- as.data.frame(matrix(0L, nrow = n.real.cells, ncol = 1))
139
-        colnames(pANN) <- "pANN"
140
-        rownames(pANN) <- real.cells
141
-        list.ind <- list.ind + 1
142
-
143
-        for (i in 1:n.real.cells) {
144
-            neighbors <- dist.mat[2:(pk.temp + 1), i]
145
-            pANN$pANN[i] <- length(which(neighbors > n.real.cells)) / pk.temp
146
-        }
147
-
148
-        sweep.res.list[[list.ind]] <- pANN
149
-    }
150
-
151
-    return(sweep.res.list)
152
-}
153
-
154
-.paramSweep <- function(seu, PCs = 1:10, sct = FALSE,
155
-                          verbose = FALSE, num.cores, seed) {
156
-    ## Set pN-pK param sweep ranges
157
-    pK <- c(0.0005, 0.001, 0.005, seq(0.01, 0.3, by = 0.01))
158
-    pN <- seq(0.05, 0.3, by = 0.05)
159
-    ## Remove pK values with too few cells
160
-    min.cells <- round(nrow(seu@meta.data) / (1 - 0.05) - nrow(seu@meta.data))
161
-    pK.test <- round(pK * min.cells)
162
-    pK <- pK[which(pK.test >= 1)]
163
-
164
-    ## Extract pre-processing parameters from original data analysis workflow
165
-    orig.commands <- seu@commands
166
-
167
-    ## Down-sample cells to 10000 (when applicable) for computational effiency
168
-    if (nrow(seu@meta.data) > 10000) {
169
-        real.cells <- rownames(seu@meta.data)[sample(1:nrow(seu@meta.data),
170
-                                                     10000, replace = FALSE)]
171
-        data <- seu@assays$RNA@counts[, real.cells]
172
-        n.real.cells <- ncol(data)
173
-    }
174
-
175
-    if (nrow(seu@meta.data) <= 10000) {
176
-        real.cells <- rownames(seu@meta.data)
177
-        data <- seu@assays$RNA@counts
178
-        n.real.cells <- ncol(data)
179
-    }
180
-    ## Iterate through pN, computing pANN vectors at varying pK
181
-    # no_cores <- detectCores()-1
182
-    if (is.null(num.cores)) {
183
-        num.cores <- 1
184
-    }
185
-
186
-    if (num.cores > 1) {
187
-        cl <- parallel::makeCluster(num.cores)
188
-
189
-        output2 <- withr::with_seed(
190
-            seed,
191
-            parallel::mclapply(as.list(1:length(pN)),
192
-                     FUN = .parallel_paramSweep,
193
-                     n.real.cells,
194
-                     real.cells,
195
-                     pK,
196
-                     pN,
197
-                     data,
198
-                     orig.commands,
199
-                     PCs,
200
-                     sct, mc.cores = num.cores,
201
-                     verbose = verbose,
202
-                     seed = seed,
203
-                     mc.set.seed = FALSE
204
-            )
205
-        )
206
-        parallel::stopCluster(cl)
207
-    } else {
208
-        output2 <- lapply(as.list(1:length(pN)),
209
-                          FUN = .parallel_paramSweep,
210
-                          n.real.cells,
211
-                          real.cells,
212
-                          pK,
213
-                          pN,
214
-                          data,
215
-                          orig.commands,
216
-                          PCs,
217
-                          sct,
218
-                          seed = seed,
219
-                          verbose = verbose
220
-        )
221
-    }
222
-
223
-    ## Write parallelized output into list
224
-    sweep.res.list <- list()
225
-    list.ind <- 0
226
-    for (i in 1:length(output2)) {
227
-        for (j in 1:length(output2[[i]])) {
228
-            list.ind <- list.ind + 1
229
-            sweep.res.list[[list.ind]] <- output2[[i]][[j]]
230
-        }
231
-    }
232
-    ## Assign names to list of results
233
-    name.vec <- NULL
234
-    for (j in 1:length(pN)) {
235
-        name.vec <- c(name.vec, paste("pN", pN[j], "pK", pK, sep = "_"))
236
-    }
237
-    names(sweep.res.list) <- name.vec
238
-    return(sweep.res.list)
239
-
240
-}
241
-
242
-.runDoubletFinder <- function(counts, seuratPcs, seuratRes, formationRate,
243
-                              seuratNfeatures, verbose = FALSE,
244
-                              nCores = NULL, seed = 12345) {
245
-
246
-  ## Convert to sparse matrix if not already in that format
247
-  counts <- .convertToMatrix(counts)
248
-  colnames(counts) <- gsub("_", "-", colnames(counts))
249
-
250
-  seurat <- suppressWarnings(Seurat::CreateSeuratObject(
251
-    counts = counts,
252
-    project = "seurat", min.features = 0
253
-  ))
254
-  seurat <- Seurat::NormalizeData(
255
-    object = seurat,
256
-    normalization.method = "LogNormalize", scale.factor = 10000,
257
-    verbose = verbose
258
-  )
259
-  seurat <- Seurat::FindVariableFeatures(seurat,
260
-    selection.method = "vst",
261
-    nfeatures = seuratNfeatures,
262
-    verbose = verbose
263
-  )
264
-
265
-  allGenes <- rownames(seurat)
266
-  seurat <- Seurat::ScaleData(seurat, features = allGenes, verbose = verbose)
267
-
268
-  numPc <- min(ncol(seurat@assays$RNA@scale.data) - 1, 50)
269
-  seurat <- Seurat::RunPCA(seurat,
270
-    features =
271
-      Seurat::VariableFeatures(object = seurat),
272
-    npcs = numPc, verbose = verbose, seed.use = seed
273
-  )
274
-  seurat <- Seurat::FindNeighbors(seurat, dims = seuratPcs, verbose = verbose)
275
-  seurat <- Seurat::FindClusters(seurat,
276
-    resolution = seuratRes,
277
-    verbose = verbose,
278
-    random.seed = seed
279
-  )
280
-  invisible(sweepResListSeurat <- .paramSweep(seurat,
281
-    PCs = seuratPcs,
282
-    sct = FALSE,
283
-    num.cores = nCores,
284
-    verbose = verbose,
285
-    seed = seed
286
-  ))
287
-  sweepStatsSeurat <- DoubletFinder::summarizeSweep(sweepResListSeurat,
288
-    GT = FALSE
289
-  )
290
-  bcmvnSeurat <- DoubletFinder::find.pK(sweepStatsSeurat, verbose = verbose)
291
-  pkOptimal <- as.numeric(as.matrix(bcmvnSeurat$pK[
292
-    which.max(bcmvnSeurat$MeanBC)
293
-  ]))
294
-  annotations <- seurat@meta.data$seurat_clusters
295
-  homotypicProp <- DoubletFinder::modelHomotypic(annotations)
296
-  nExpPoi <- round(formationRate * ncol(seurat@assays$RNA))
297
-  seurat <- DoubletFinder::doubletFinder_v3(seurat,
298
-    PCs = seuratPcs,
299
-    pN = 0.25,
300
-    pK = pkOptimal,
301
-    nExp = nExpPoi,
302
-    reuse.pANN = FALSE,
303
-    sct = FALSE,
304
-    verbose = FALSE
305
-  )
306
-  names(seurat@meta.data)[6] <- "doubletFinderAnnScore"
307
-  names(seurat@meta.data)[7] <- "doubletFinderLabel"
308
-  return(seurat)
309
-}
310
-
311
-#' @title Generates a doublet score for each cell via doubletFinder
312
-#' @description Uses doubletFinder to determine cells within the dataset
313
-#'  suspected to be doublets.
314
-#' @param inSCE Input SingleCellExperiment object. Must contain a counts matrix
315
-#' @param useAssay  Indicate which assay to use. Default "counts".
316
-#' @param sample Numeric vector. Each cell will be assigned a sample number.
317
-#' @param seed Seed for the random number generator. Default 12345.
318
-#' @param seuratNfeatures Integer. Number of highly variable genes to use.
319
-#'  Default 2000.
320
-#' @param seuratPcs Numeric vector. The PCs used in seurat function to
321
-#'   determine number of clusters. Default 1:15.
322
-#' @param seuratRes Numeric vector. The resolution parameter used in seurat,
323
-#'  which adjusts the number of clusters determined via the algorithm.
324
-#'  Default 1.5.
325
-#' @param formationRate Doublet formation rate used within algorithm.
326
-#'  Default 0.075.
327
-#' @param nCores Number of cores used for running the function.
328
-#' @param verbose Boolean. Wheter to print messages from Seurat and
329
-#'  DoubletFinder. Default FALSE.
330
-#' @return SingleCellExperiment object containing the
331
-#'  'doublet_finder_doublet_score'.
332
-#' @examples
333
-#' \donttest{
334
-#' data(scExample, package = "singleCellTK")
335
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
336
-#' sce <- runDoubletFinder(sce)
337
-#' }
338
-#' @export
339
-#' @importFrom SummarizedExperiment colData colData<-
340
-#' @importFrom SingleCellExperiment reducedDim<-
341
-runDoubletFinder <- function(inSCE,
342
-                             useAssay = "counts",
343
-                             sample = NULL,
344
-                             seed = 12345,
345
-                             seuratNfeatures = 2000,
346
-                             seuratPcs = 1:15,
347
-                             seuratRes = 1.5,
348
-                             formationRate = 0.075,
349
-                             nCores = NULL,
350
-                             verbose = FALSE) {
351
-
352
-  # argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
353
-  argsList <- mget(names(formals()), sys.frame(sys.nframe()))
354
-
355
-  if (!is.null(sample)) {
356
-    if (length(sample) != ncol(inSCE)) {
357
-      stop("'sample' must be the same length as the number of columns in 'inSCE'")
358
-    }
359
-  } else {
360
-    sample <- rep(1, ncol(inSCE))
361
-  }
362
-
363
-  message(paste0(date(), " ... Running 'doubletFinder'"))
364
-
365
-  doubletScore <- rep(NA, ncol(inSCE))
366
-  doubletLabel <- rep(NA, ncol(inSCE))
367
-  allSampleNumbers <- sort(unique(sample))
368
-
369
-
370
-  for (res in seuratRes) {
371
-    output <- S4Vectors::DataFrame(
372
-      row.names = colnames(inSCE),
373
-      doubletFinder_doublet_score = numeric(ncol(inSCE)),
374
-      doubletFinder_doublet_label = numeric(ncol(inSCE))
375
-    )
376
-    umapDims <- matrix(ncol = 2,
377
-                       nrow = ncol(inSCE))
378
-    rownames(umapDims) = colnames(inSCE)
379
-    colnames(umapDims) = c("UMAP_1", "UMAP_2")
380
-
381
-    ## Loop through each sample and run doubletFinder
382
-    samples <- unique(sample)
383
-
384
-    for (i in seq_len(length(samples))) {
385
-      sceSampleInd <- sample == samples[i]
386
-      sceSample <- inSCE[, sceSampleInd]
387
-      mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
388
-      if(length(seuratPcs) > ncol(mat)){
389
-        seuratPcs <- 1:ncol(mat)
390
-      }
391
-
392
-      result <- suppressMessages(withr::with_seed(
393
-        seed,
394
-        .runDoubletFinder(
395
-          counts = mat,
396
-          seuratPcs = seuratPcs,
397
-          seuratRes = res,
398
-          seuratNfeatures = seuratNfeatures,
399
-          formationRate = formationRate,
400
-          nCores = nCores,
401
-          verbose = verbose,
402
-          seed = seed
403
-        )
404
-      ))
405
-      result <- suppressMessages(Seurat::RunUMAP(result,
406
-                                                 dims = 1:10,
407
-                                                 verbose = verbose,
408
-                                                 umap.method = "umap-learn",
409
-                                                 metric = "correlation"))
410
-
411
-      seuratDims <- Seurat::Embeddings(result, reduction = "umap")
412
-      umapDims[sceSampleInd, 1] <- seuratDims[,1]
413
-      umapDims[sceSampleInd, 2] <- seuratDims[,2]
414
-      output[sceSampleInd, 1] <- result@meta.data$doubletFinderAnnScore
415
-      output[sceSampleInd, 2] <- result@meta.data$doubletFinderLabel
416
-    }
417
-
418
-    colnames(output) <- paste0(colnames(output), "_resolution_", res)
419
-
420
-    argsList <- argsList[!names(argsList) %in% ("...")]
421
-    inSCE@metadata$runDoubletFinder <- argsList[-1]
422
-    inSCE@metadata$runDoubletFinder$packageVersion <- utils::packageDescription("DoubletFinder")$Version
423
-    colData(inSCE) <- cbind(colData(inSCE), output)
424
-    reducedDim(inSCE,'doubletFinder_UMAP') <- umapDims
425
-  }
426
-  return(inSCE)
427
-}
1
+## Original code from DoubletFinder package
2
+## (https://github.com/chris-mcginnis-ucsf/DoubletFinder)
3
+
4
+.parallel_paramSweep <- function(n, n.real.cells, real.cells, pK, pN, data,
5
+                                   orig.commands, PCs, sct, verbose, seed) {
6
+    sweep.res.list <- list()
7
+    list.ind <- 0
8
+
9
+    ## Make merged real-artificial data
10
+    if (verbose) {
11
+        print(paste("Creating artificial doublets for pN = ",
12
+                    pN[n] * 100, "%", sep = ""))
13
+    }
14
+    n_doublets <- ceiling(n.real.cells / (1 - pN[n]) - n.real.cells)
15
+    real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
16
+    real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
17
+    doublets <- (data[, real.cells1] + data[, real.cells2]) / 2
18
+    colnames(doublets) <- paste("X", 1:n_doublets, sep = "")
19
+    data_wdoublets <- cbind(data, doublets)
20
+
21
+    ## Pre-process Seurat object
22
+    if (sct == FALSE) {
23
+        if (verbose) {
24
+            print("Creating Seurat object...")
25
+        }
26
+        seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
27
+
28
+        if (verbose) {
29
+            print("Normalizing Seurat object...")
30
+        }
31
+        seu_wdoublets <- Seurat::NormalizeData(seu_wdoublets,
32
+                                       normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
33
+                                       scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
34
+                                       margin = orig.commands$NormalizeData.RNA@params$margin,
35
+                                       verbose = verbose
36
+        )
37
+
38
+        if (verbose) {
39
+            print("Finding variable genes...")
40
+        }
41
+        seu_wdoublets <- Seurat::FindVariableFeatures(seu_wdoublets,
42
+                                              selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
43
+                                              loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
44
+                                              clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
45
+                                              mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
46
+                                              dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
47
+                                              num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
48
+                                              binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
49
+                                              nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
50
+                                              mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
51
+                                              dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff,
52
+                                              verbose = verbose
53
+        )
54
+
55
+        if (verbose) {
56
+            print("Scaling data...")
57
+        }
58
+        seu_wdoublets <- Seurat::ScaleData(
59
+            seu_wdoublets,
60
+            features = orig.commands$ScaleData.RNA$features,
61
+            model.use = orig.commands$ScaleData.RNA$model.use,
62
+            do.scale = orig.commands$ScaleData.RNA$do.scale,
63
+            do.center = orig.commands$ScaleData.RNA$do.center,
64
+            scale.max = orig.commands$ScaleData.RNA$scale.max,
65
+            block.size = orig.commands$ScaleData.RNA$block.size,
66
+            min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block,
67
+            verbose = verbose
68
+        )
69
+
70
+        if (verbose) {
71
+            print("Running PCA...")
72
+        }
73
+        seu_wdoublets <- Seurat::RunPCA(
74
+            seu_wdoublets,
75
+            features = orig.commands$ScaleData.RNA$features,
76
+            npcs = length(PCs),
77
+            rev.pca = orig.commands$RunPCA.RNA$rev.pca,
78
+            weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
79
+            seed.use = seed,
80
+            verbose = FALSE
81
+        )
82
+    }
83
+
84
+    if (sct == TRUE) {
85
+        if (verbose) {
86
+            print("Creating Seurat object...")
87
+        }
88
+        seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
89
+
90
+        if (verbose) {
91
+            print("Running SCTransform...")
92
+        }
93
+        seu_wdoublets <- Seurat::SCTransform(seu_wdoublets)
94
+
95
+        if (verbose) {
96
+            print("Running PCA...")
97
+        }
98
+        seu_wdoublets <- Seurat::RunPCA(seu_wdoublets,
99
+                                npcs = length(PCs), seed.use = seed,
100
+                                verbose = verbose
101
+        )
102
+    }
103
+
104
+    ## Compute PC distance matrix
105
+    if (verbose) {
106
+        print("Calculating PC distance matrix...")
107
+    }
108
+    nCells <- nrow(seu_wdoublets@meta.data)
109
+    pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[, PCs]
110
+    seu_wdoublets <- NULL
111
+    gc()
112
+    dist.mat <- fields::rdist(pca.coord)[, 1:n.real.cells]
113
+
114
+    ## Pre-order PC distance matrix prior to iterating across pK
115
+    ## for pANN computations
116
+    if (verbose) {
117
+        print("Defining neighborhoods...")
118
+    }
119
+
120
+    for (i in 1:n.real.cells) {
121
+        dist.mat[, i] <- order(dist.mat[, i])
122
+    }
123
+
124
+    ## Trim PC distance matrix for faster manipulations
125
+    ind <- round(nCells * max(pK)) + 5
126
+    dist.mat <- dist.mat[1:ind, ]
127
+
128
+    ## Compute pANN across pK sweep
129
+
130
+    if (verbose) {
131
+        print("Computing pANN across all pK...")
132
+    }
133
+    for (k in 1:length(pK)) {
134
+        if (verbose) {
135
+            print(paste("pK = ", pK[k], "...", sep = ""))
136
+        }
137
+        pk.temp <- round(nCells * pK[k])
138
+        pANN <- as.data.frame(matrix(0L, nrow = n.real.cells, ncol = 1))
139
+        colnames(pANN) <- "pANN"
140
+        rownames(pANN) <- real.cells
141
+        list.ind <- list.ind + 1
142
+
143
+        for (i in 1:n.real.cells) {
144
+            neighbors <- dist.mat[2:(pk.temp + 1), i]
145
+            pANN$pANN[i] <- length(which(neighbors > n.real.cells)) / pk.temp
146
+        }
147
+
148
+        sweep.res.list[[list.ind]] <- pANN
149
+    }
150
+
151
+    return(sweep.res.list)
152
+}
153
+
154
+.paramSweep <- function(seu, PCs = 1:10, sct = FALSE,
155
+                          verbose = FALSE, num.cores, seed) {
156
+    ## Set pN-pK param sweep ranges
157
+    pK <- c(0.0005, 0.001, 0.005, seq(0.01, 0.3, by = 0.01))
158
+    pN <- seq(0.05, 0.3, by = 0.05)
159
+    ## Remove pK values with too few cells
160
+    min.cells <- round(nrow(seu@meta.data) / (1 - 0.05) - nrow(seu@meta.data))
161
+    pK.test <- round(pK * min.cells)
162
+    pK <- pK[which(pK.test >= 1)]
163
+
164
+    ## Extract pre-processing parameters from original data analysis workflow
165
+    orig.commands <- seu@commands
166
+
167
+    ## Down-sample cells to 10000 (when applicable) for computational efficiency
168
+    if (nrow(seu@meta.data) > 10000) {
169
+        real.cells <- rownames(seu@meta.data)[sample(1:nrow(seu@meta.data),
170
+                                                     10000, replace = FALSE)]
171
+        data <- seu@assays$RNA@counts[, real.cells]
172
+        n.real.cells <- ncol(data)
173
+    }
174
+
175
+    if (nrow(seu@meta.data) <= 10000) {
176
+        real.cells <- rownames(seu@meta.data)
177
+        data <- seu@assays$RNA@counts
178
+        n.real.cells <- ncol(data)
179
+    }
180
+    ## Iterate through pN, computing pANN vectors at varying pK
181
+    # no_cores <- detectCores()-1
182
+    if (is.null(num.cores)) {
183
+        num.cores <- 1
184
+    }
185
+
186
+    if (num.cores > 1) {
187
+        cl <- parallel::makeCluster(num.cores)
188
+
189
+        output2 <- withr::with_seed(
190
+            seed,
191
+            parallel::mclapply(as.list(1:length(pN)),
192
+                     FUN = .parallel_paramSweep,
193
+                     n.real.cells,
194
+                     real.cells,
195
+                     pK,
196
+                     pN,
197
+                     data,
198
+                     orig.commands,
199
+                     PCs,
200
+                     sct, mc.cores = num.cores,
201
+                     verbose = verbose,
202
+                     seed = seed,
203
+                     mc.set.seed = FALSE
204
+            )
205
+        )
206
+        parallel::stopCluster(cl)
207
+    } else {
208
+        output2 <- lapply(as.list(1:length(pN)),
209
+                          FUN = .parallel_paramSweep,
210
+                          n.real.cells,
211
+                          real.cells,
212
+                          pK,
213
+                          pN,
214
+                          data,
215
+                          orig.commands,
216
+                          PCs,
217
+                          sct,
218
+                          seed = seed,
219
+                          verbose = verbose
220
+        )
221
+    }
222
+
223
+    ## Write parallelized output into list
224
+    sweep.res.list <- list()
225
+    list.ind <- 0
226
+    for (i in 1:length(output2)) {
227
+        for (j in 1:length(output2[[i]])) {
228
+            list.ind <- list.ind + 1
229
+            sweep.res.list[[list.ind]] <- output2[[i]][[j]]
230
+        }
231
+    }
232
+    ## Assign names to list of results
233
+    name.vec <- NULL
234
+    for (j in 1:length(pN)) {
235
+        name.vec <- c(name.vec, paste("pN", pN[j], "pK", pK, sep = "_"))
236
+    }
237
+    names(sweep.res.list) <- name.vec
238
+    return(sweep.res.list)
239
+
240
+}
241
+
242
+.runDoubletFinder <- function(counts, seuratPcs, seuratRes, formationRate,
243
+                              seuratNfeatures, verbose = FALSE,
244
+                              nCores = NULL, seed = 12345) {
245
+
246
+  ## Convert to sparse matrix if not already in that format
247
+  counts <- .convertToMatrix(counts)
248
+  colnames(counts) <- gsub("_", "-", colnames(counts))
249
+
250
+  seurat <- suppressWarnings(Seurat::CreateSeuratObject(
251
+    counts = counts,
252
+    project = "seurat", min.features = 0
253
+  ))
254
+  seurat <- Seurat::NormalizeData(
255
+    object = seurat,
256
+    normalization.method = "LogNormalize", scale.factor = 10000,
257
+    verbose = verbose
258
+  )
259
+  seurat <- Seurat::FindVariableFeatures(seurat,
260
+    selection.method = "vst",
261
+    nfeatures = seuratNfeatures,
262
+    verbose = verbose
263
+  )
264
+
265
+  allGenes <- rownames(seurat)
266
+  seurat <- Seurat::ScaleData(seurat, features = allGenes, verbose = verbose)
267
+
268
+  numPc <- min(ncol(seurat@assays$RNA@scale.data) - 1, 50)
269
+  seurat <- Seurat::RunPCA(seurat,
270
+    features =
271
+      Seurat::VariableFeatures(object = seurat),
272
+    npcs = numPc, verbose = verbose, seed.use = seed
273
+  )
274
+  seurat <- Seurat::FindNeighbors(seurat, dims = seuratPcs, verbose = verbose)
275
+  seurat <- Seurat::FindClusters(seurat,
276
+    resolution = seuratRes,
277
+    verbose = verbose,
278
+    random.seed = seed
279
+  )
280
+  invisible(sweepResListSeurat <- .paramSweep(seurat,
281
+    PCs = seuratPcs,
282
+    sct = FALSE,
283
+    num.cores = nCores,
284
+    verbose = verbose,
285
+    seed = seed
286
+  ))
287
+  sweepStatsSeurat <- .summarizeSweep(sweepResListSeurat,
288
+    GT = FALSE
289
+  )
290
+  bcmvnSeurat <- .find.pK(sweepStatsSeurat)
291
+  pkOptimal <- as.numeric(as.matrix(bcmvnSeurat$pK[
292
+    which.max(bcmvnSeurat$MeanBC)
293
+  ]))
294
+  annotations <- seurat@meta.data$seurat_clusters
295
+  homotypicProp <- .modelHomotypic(annotations)
296
+  nExpPoi <- round(formationRate * ncol(seurat@assays$RNA))
297
+  seurat <- .doubletFinder_v3(seurat,
298
+    PCs = seuratPcs,
299
+    pN = 0.25,
300
+    pK = pkOptimal,
301
+    nExp = nExpPoi,
302
+    reuse.pANN = FALSE,
303
+    sct = FALSE
304
+  )
305
+  names(seurat@meta.data)[6] <- "doubletFinderAnnScore"
306
+  names(seurat@meta.data)[7] <- "doubletFinderLabel"
307
+  return(seurat)
308
+}
309
+
310
+#' @title Generates a doublet score for each cell via doubletFinder
311
+#' @description Uses doubletFinder to determine cells within the dataset
312
+#'  suspected to be doublets.
313
+#' @param inSCE Input SingleCellExperiment object. Must contain a counts matrix
314
+#' @param useAssay  Indicate which assay to use. Default "counts".
315
+#' @param sample Numeric vector. Each cell will be assigned a sample number.
316
+#' @param seed Seed for the random number generator. Default 12345.
317
+#' @param seuratNfeatures Integer. Number of highly variable genes to use.
318
+#'  Default 2000.
319
+#' @param seuratPcs Numeric vector. The PCs used in seurat function to
320
+#'   determine number of clusters. Default 1:15.
321
+#' @param seuratRes Numeric vector. The resolution parameter used in seurat,
322
+#'  which adjusts the number of clusters determined via the algorithm.
323
+#'  Default 1.5.
324
+#' @param formationRate Doublet formation rate used within algorithm.
325
+#'  Default 0.075.
326
+#' @param nCores Number of cores used for running the function.
327
+#' @param verbose Boolean. Whether to print messages from Seurat and
328
+#'  DoubletFinder. Default FALSE.
329
+#' @return SingleCellExperiment object containing the
330
+#'  'doublet_finder_doublet_score'.
331
+#' @examples
332
+#' \donttest{
333
+#' data(scExample, package = "singleCellTK")
334
+#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
335
+#' sce <- runDoubletFinder(sce)
336
+#' }
337
+#' @export
338
+#' @importFrom SummarizedExperiment colData colData<-
339
+#' @importFrom SingleCellExperiment reducedDim<-
340
+runDoubletFinder <- function(inSCE,
341
+                             useAssay = "counts",
342
+                             sample = NULL,
343
+                             seed = 12345,
344
+                             seuratNfeatures = 2000,
345
+                             seuratPcs = 1:15,
346
+                             seuratRes = 1.5,
347
+                             formationRate = 0.075,
348
+                             nCores = NULL,
349
+                             verbose = FALSE) {
350
+
351
+  # argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
352
+  argsList <- mget(names(formals()), sys.frame(sys.nframe()))
353
+
354
+  if (!is.null(sample)) {
355
+    if (length(sample) != ncol(inSCE)) {
356
+      stop("'sample' must be the same length as the number of columns in 'inSCE'")
357
+    }
358
+  } else {
359
+    sample <- rep(1, ncol(inSCE))
360
+  }
361
+
362
+  message(paste0(date(), " ... Running 'doubletFinder'"))
363
+
364
+  doubletScore <- rep(NA, ncol(inSCE))
365
+  doubletLabel <- rep(NA, ncol(inSCE))
366
+  allSampleNumbers <- sort(unique(sample))
367
+
368
+  for (res in seuratRes) {
369
+    output <- S4Vectors::DataFrame(
370
+      row.names = colnames(inSCE),
371
+      doubletFinder_doublet_score = numeric(ncol(inSCE)),
372
+      doubletFinder_doublet_label = numeric(ncol(inSCE))
373
+    )
374
+    umapDims <- matrix(ncol = 2,
375
+                       nrow = ncol(inSCE))
376
+    rownames(umapDims) = colnames(inSCE)
377
+    colnames(umapDims) = c("UMAP_1", "UMAP_2")
378
+
379
+    ## Loop through each sample and run doubletFinder
380
+    samples <- unique(sample)
381
+
382
+    for (i in seq_len(length(samples))) {
383
+      sceSampleInd <- sample == samples[i]
384
+      sceSample <- inSCE[, sceSampleInd]
385
+      mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
386
+      if(length(seuratPcs) > ncol(mat)){
387
+        seuratPcs <- 1:ncol(mat)
388
+      }
389
+
390
+      result <- suppressMessages(withr::with_seed(
391
+        seed,
392
+        .runDoubletFinder(
393
+          counts = mat,
394
+          seuratPcs = seuratPcs,
395
+          seuratRes = res,
396
+          seuratNfeatures = seuratNfeatures,
397
+          formationRate = formationRate,
398
+          nCores = nCores,
399
+          verbose = verbose,
400
+          seed = seed
401
+        )
402
+      ))
403
+      result <- suppressMessages(Seurat::RunUMAP(result,
404
+                                                 dims = 1:10,
405
+                                                 verbose = verbose,
406
+                                                 umap.method = "umap-learn",
407
+                                                 metric = "correlation"))
408
+
409
+      seuratDims <- Seurat::Embeddings(result, reduction = "umap")
410
+      umapDims[sceSampleInd, 1] <- seuratDims[,1]
411
+      umapDims[sceSampleInd, 2] <- seuratDims[,2]
412
+      output[sceSampleInd, 1] <- result@meta.data$doubletFinderAnnScore
413
+      output[sceSampleInd, 2] <- result@meta.data$doubletFinderLabel
414
+    }
415
+
416
+    colnames(output) <- paste0(colnames(output), "_resolution_", res)
417
+
418
+    argsList <- argsList[!names(argsList) %in% ("...")]
419
+    inSCE@metadata$runDoubletFinder <- argsList[-1]
420
+    inSCE@metadata$runDoubletFinder$packageVersion <- "2.0.2"
421
+    colData(inSCE) <- cbind(colData(inSCE), output)
422
+    reducedDim(inSCE,'doubletFinder_UMAP') <- umapDims
423
+  }
424
+  return(inSCE)
425
+}
426
+
427
+
428
+.summarizeSweep <- function(sweep.list, GT = FALSE, GT.calls = NULL) {
429
+  #require(KernSmooth); require(ROCR)
430
+  ## Set pN-pK param sweep ranges
431
+  name.vec <- names(sweep.list)
432
+  name.vec <- unlist(strsplit(name.vec, split="pN_"))
433
+  name.vec <- name.vec[seq(2, length(name.vec), by=2)]
434
+  name.vec <- unlist(strsplit(name.vec, split="_pK_"))
435
+  pN <- as.numeric(unique(name.vec[seq(1, length(name.vec), by=2)]))
436
+  pK <- as.numeric(unique(name.vec[seq(2, length(name.vec), by=2)]))
437
+
438
+  ## Initialize data structure w/ or w/o AUC column, depending on whether ground-truth doublet classifications are available
439
+  if (GT == TRUE) {
440
+    sweep.stats <- as.data.frame(matrix(0L, nrow=length(sweep.list), ncol=4))
441
+    colnames(sweep.stats) <- c("pN","pK","AUC","BCreal")
442
+    sweep.stats$pN <- factor(rep(pN, each=length(pK), levels = pN))
443
+    sweep.stats$pK <- factor(rep(pK, length(pN),levels = pK))
444
+  }
445
+
446
+  if (GT == FALSE) {
447
+    sweep.stats <- as.data.frame(matrix(0L, nrow=length(sweep.list), ncol=3))
448
+    colnames(sweep.stats) <- c("pN","pK","BCreal")
449
+    sweep.stats$pN <- factor(rep(pN, each=length(pK), levels = pN))
450
+    sweep.stats$pK <- factor(rep(pK, length(pN),levels = pK))
451
+  }
452
+
453
+  ## Perform pN-pK parameter sweep summary
454
+  for (i in 1:length(sweep.list)) {
455
+    res.temp <- sweep.list[[i]]
456
+
457
+    ## Use gaussian kernel density estimation of pANN vector to compute bimodality coefficient
458
+    gkde <- stats::approxfun(KernSmooth::bkde(res.temp$pANN, kernel="normal"))
459
+    x <- seq(from=min(res.temp$pANN), to=max(res.temp$pANN), length.out=nrow(res.temp))
460
+    sweep.stats$BCreal[i] <- .bimodality_coefficient(gkde(x))
461
+
462
+    if (GT == FALSE) { next }
463
+
464
+    ## If ground-truth doublet classifications are available, perform ROC analysis on logistic
465
+    ## regression model trained using pANN vector
466
+    meta <- as.data.frame(matrix(0L, nrow=nrow(res.temp), ncol=2))
467
+    meta[,1] <- GT.calls
468
+    meta[,2] <- res.temp$pANN
469
+    train.ind <- sample(1:nrow(meta), round(nrow(meta)/2), replace=FALSE)
470
+    test.ind <- (1:nrow(meta))[-train.ind]
471
+    colnames(meta) <- c("SinDub","pANN")
472
+    meta$SinDub <- factor(meta$SinDub, levels = c("Doublet","Singlet"))
473
+    model.lm <- stats::glm(SinDub ~ pANN, family=stats::binomial(link='logit'), data=meta, subset=train.ind)
474
+    prob <- stats::predict(model.lm, newdata=meta[test.ind, ], type="response")
475
+    ROCpred <- ROCR::prediction(predictions=prob, labels=meta$SinDub[test.ind])
476
+    perf.auc <- ROCR::performance(ROCpred, measure="auc")
477
+    sweep.stats$AUC[i] <- perf.auc@y.values[[1]]
478
+  }
479
+
480
+  return(sweep.stats)
481
+}
482
+
483
+
484
+.bimodality_coefficient <- function(x) {
485
+  G <- .skewness(x)
486
+  sample.excess.kurtosis <- .kurtosis(x)
487
+  K <- sample.excess.kurtosis
488
+  n <- length(x)
489
+  B <- ((G^2)+1)/(K+((3*((n-1)^2))/((n-2)*(n-3))))
490
+  return(B)
491
+}
492
+
493
+.skewness <- function(x) {
494
+  n <- length(x)
495
+  S <- (1/n)*sum((x-mean(x))^3)/(((1/n)*sum((x-mean(x))^2))^1.5)
496
+  S <- S*(sqrt(n*(n-1)))/(n-2)