... | ... |
@@ -1,16 +1,16 @@ |
1 | 1 |
language: r |
2 |
-cache: |
|
2 |
+cache: |
|
3 | 3 |
- packages |
4 | 4 |
- pip |
5 | 5 |
r: |
6 |
- - oldrel |
|
6 |
+ - release |
|
7 | 7 |
sudo: false |
8 | 8 |
warnings_are_errors: true |
9 | 9 |
bioc_required: true |
10 | 10 |
|
11 | 11 |
# Current version of xgboost fails install |
12 | 12 |
# Need to force install older version using 'devtools::install_version' |
13 |
-# which is done in 'before_install' (see below) |
|
13 |
+# which is done in 'before_install' (see below) |
|
14 | 14 |
# However 'remotes::dev_package_deps' will try to auto upgrade packages |
15 | 15 |
# Therefore, a new install command is used with 'upgrade="never"' |
16 | 16 |
# These parts can be deleted once the xgboost error is fixed |
... | ... |
@@ -21,7 +21,7 @@ before_install: |
21 | 21 |
- Rscript -e 'install.packages("devtools")' |
22 | 22 |
- Rscript -e 'devtools::install_version("xgboost", version = "0.90.0.2", repos = "http://cran.us.r-project.org")' |
23 | 23 |
- pip install --upgrade pip |
24 |
- - pip install scipy scrublet scanpy scanorama scgen bbknn |
|
24 |
+ - pip install scipy scrublet scanpy scanorama bbknn |
|
25 | 25 |
|
26 | 26 |
install: |
27 | 27 |
- R -e 'devtools::install_deps(dep = TRUE, upgrade="never")' |
... | ... |
@@ -31,11 +31,9 @@ addons: |
31 | 31 |
packages: |
32 | 32 |
- libv8-dev |
33 | 33 |
|
34 |
-r_build_args: "--no-build-vignettes" |
|
35 |
-r_check_args: "--no-vignettes --no-build-vignettes --as-cran" |
|
34 |
+r_check_args: "--as-cran" |
|
36 | 35 |
r_github_packages: |
37 | 36 |
- jimhester/lintr |
38 |
- - compbio/celda |
|
39 | 37 |
notifications: |
40 | 38 |
slack: |
41 | 39 |
secure: sw/He3WX7Fs61DTufgG9hx7fWDhwx57zv+dYB4sr30LgKyIJNbZYSU76ZN4oPwHFGAJ0eyk/22XCuKF8/seSB2ArQuc4kFlsaeWWg1CxDqW9iR0zs91MJL3lJoUOSayy5WcWbsUxD+x8GcB37FCgoTXcveN956bfgrDb2nGrFY6QMSVsnkWi/kiMdzhR4VPzOfmAUZUcr1aDU3WN4ROwbDg89zCfu9HbVKZwyQdYZk7L2Pgg8xGIcGZkaMN1MMD/r0Wc2aVYqCOcn5qO/9CaspEBf7cd6ejVzgAl/LM4dTIPlx9E6ekQGwJXpt1Qxyc0sqVSQWFZvFkWyV3wkqX8pfSXV6ojlP2cGJt+PSQXwLHj0TKWYx7yHbw+55da/b//SVLZVHl+P6VXxQBe2hmFD6sCk3V3uq2CkxzI52bK4Wv4RRncVKIZEpFZOc9P2ik5mf6pICmLHwttlbHeDjJbQ+raYftF704xb1sOxapLwuy+c2SWKPeqOiimfZNgOK+J7JbvQvpnaiHN+3CA3wQXs1auKBwI+swvsZPAzeKssnLv/0AQmX8ckVzvXJzUvl1J73c9V2zG2F44xmIsXwzruEp6leSbeXcLifrfbfQdmaHzq4M/Z83o/fwGRzdWfG7YXDyPIabik7DOMrSUvF+wLGVTjOart8HrA4eVEkKFnRE= |
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
Package: singleCellTK |
2 | 2 |
Type: Package |
3 |
-Title: Interactive Analysis of Single Cell RNA-Seq Data |
|
3 |
+Title: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data |
|
4 | 4 |
Version: 1.7.6 |
5 | 5 |
Author: David Jenkins |
6 | 6 |
Authors@R: c(person(given="David", family="Jenkins", email="dfj@bu.edu", role=c("aut","cre"), |
... | ... |
@@ -15,17 +15,14 @@ Authors@R: c(person(given="David", family="Jenkins", email="dfj@bu.edu", role=c( |
15 | 15 |
comment = c(ORCID = "0000-0002-6247-6595"))) |
16 | 16 |
Maintainer: David Jenkins <dfj@bu.edu> |
17 | 17 |
Depends: |
18 |
- R (>= 3.6.0), |
|
18 |
+ R (>= 4.0), |
|
19 | 19 |
SummarizedExperiment, |
20 | 20 |
SingleCellExperiment, |
21 | 21 |
DelayedArray, |
22 | 22 |
Biobase |
23 |
-Description: Run common single cell analysis directly through your browser |
|
24 |
- including differential expression, downsampling analysis, and clustering. |
|
23 |
+Description: Run common single cell analysis in the R console or directly through your browser. Includes many functions for import, quality control, normalization, batch correction, clustering, differential expression, and visualization.. |
|
25 | 24 |
License: MIT + file LICENSE |
26 | 25 |
Encoding: UTF-8 |
27 |
-cache: |
|
28 |
- - packages: true |
|
29 | 26 |
biocViews: SingleCell, GeneExpression, DifferentialExpression, Alignment, |
30 | 27 |
Clustering, ImmunoOncology |
31 | 28 |
LazyData: TRUE |
... | ... |
@@ -35,6 +32,7 @@ Imports: |
35 | 32 |
BiocGenerics, |
36 | 33 |
BiocParallel, |
37 | 34 |
colourpicker, |
35 |
+ colorspace, |
|
38 | 36 |
cowplot, |
39 | 37 |
cluster, |
40 | 38 |
ComplexHeatmap, |
... | ... |
@@ -46,24 +44,25 @@ Imports: |
46 | 44 |
fields, |
47 | 45 |
ggplot2, |
48 | 46 |
ggplotify, |
49 |
- ggtree, |
|
50 | 47 |
ggrepel, |
48 |
+ ggtree, |
|
51 | 49 |
gridExtra, |
52 | 50 |
GSVA (>= 1.26.0), |
53 | 51 |
GSVAdata, |
54 | 52 |
harmony, |
55 | 53 |
igraph, |
54 |
+ KernSmooth, |
|
56 | 55 |
liger, |
57 | 56 |
limma, |
58 | 57 |
MAST, |
59 | 58 |
Matrix, |
60 |
- mnormt, |
|
61 | 59 |
matrixStats, |
62 | 60 |
methods, |
63 | 61 |
msigdbr, |
64 | 62 |
multtest, |
65 | 63 |
plotly, |
66 | 64 |
RColorBrewer, |
65 |
+ ROCR, |
|
67 | 66 |
Rtsne, |
68 | 67 |
S4Vectors, |
69 | 68 |
scater, |
... | ... |
@@ -86,7 +85,6 @@ Imports: |
86 | 85 |
celda, |
87 | 86 |
shinycssloaders, |
88 | 87 |
shinythemes, |
89 |
- umap, |
|
90 | 88 |
uwot, |
91 | 89 |
DropletUtils, |
92 | 90 |
scds (>= 1.2.0), |
... | ... |
@@ -94,10 +92,8 @@ Imports: |
94 | 92 |
tools, |
95 | 93 |
withr, |
96 | 94 |
GSEABase, |
97 |
- DoubletFinder, |
|
98 | 95 |
R.utils, |
99 | 96 |
zinbwave, |
100 |
- V8, |
|
101 | 97 |
scRNAseq (>= 2.0.2), |
102 | 98 |
TENxPBMCData, |
103 | 99 |
yaml, |
... | ... |
@@ -117,12 +113,9 @@ Suggests: |
117 | 113 |
org.Mm.eg.db, |
118 | 114 |
stringr |
119 | 115 |
Remotes: |
120 |
- joshua-d-campbell/DoubletFinder, |
|
121 | 116 |
joshua-d-campbell/harmony, |
122 |
- rz2333/mnormt@1.5-6, |
|
123 | 117 |
joshua-d-campbell/liger, |
124 | 118 |
joshua-d-campbell/shiny-directory-input, |
125 |
- campbio/celda@RELEASE_3_11 |
|
126 | 119 |
VignetteBuilder: knitr |
127 | 120 |
URL: https://compbiomed.github.io/sctk_docs/ |
128 | 121 |
BugReports: https://github.com/compbiomed/singleCellTK/issues |
... | ... |
@@ -1,5 +1,6 @@ |
1 | 1 |
# Generated by roxygen2: do not edit by hand |
2 | 2 |
|
3 |
+export(MAST) |
|
3 | 4 |
export(alignSingleCellData) |
4 | 5 |
export(calcEffectSizes) |
5 | 6 |
export(combineSCE) |
... | ... |
@@ -8,6 +9,7 @@ export(constructSCE) |
8 | 9 |
export(convertGeneIDs) |
9 | 10 |
export(convertSCEToSeurat) |
10 | 11 |
export(convertSeuratToSCE) |
12 |
+export(discreteColorPalette) |
|
11 | 13 |
export(distinctColors) |
12 | 14 |
export(downSampleCells) |
13 | 15 |
export(downSampleDepth) |
... | ... |
@@ -16,7 +18,6 @@ export(exportSCE) |
16 | 18 |
export(exportSCEtoAnnData) |
17 | 19 |
export(exportSCEtoFlatFile) |
18 | 20 |
export(featureIndex) |
19 |
-export(filterSCData) |
|
20 | 21 |
export(findMarkerDiffExp) |
21 | 22 |
export(generateMeta) |
22 | 23 |
export(generateSimulatedData) |
... | ... |
@@ -43,6 +44,7 @@ export(importGeneSetsFromCollection) |
43 | 44 |
export(importGeneSetsFromGMT) |
44 | 45 |
export(importGeneSetsFromList) |
45 | 46 |
export(importGeneSetsFromMSigDB) |
47 |
+export(importMultipleSources) |
|
46 | 48 |
export(importOptimus) |
47 | 49 |
export(importSEQC) |
48 | 50 |
export(importSTARsolo) |
... | ... |
@@ -86,6 +88,7 @@ export(plotUMAP) |
86 | 88 |
export(qcInputProcess) |
87 | 89 |
export(readSingleCellMatrix) |
88 | 90 |
export(reportCellQC) |
91 |
+export(reportDiffExp) |
|
89 | 92 |
export(reportDropletQC) |
90 | 93 |
export(reportQCTool) |
91 | 94 |
export(retrieveSCEIndex) |
... | ... |
@@ -114,7 +117,6 @@ export(runMAST) |
114 | 117 |
export(runMNNCorrect) |
115 | 118 |
export(runPerCellQC) |
116 | 119 |
export(runSCANORAMA) |
117 |
-export(runSCGEN) |
|
118 | 120 |
export(runSCMerge) |
119 | 121 |
export(runScranSNN) |
120 | 122 |
export(runScrublet) |
... | ... |
@@ -160,8 +162,6 @@ import(DelayedArray) |
160 | 162 |
import(DropletUtils) |
161 | 163 |
import(GSVAdata) |
162 | 164 |
import(SingleCellExperiment) |
163 |
-importFrom(DelayedArray,colSums) |
|
164 |
-importFrom(Matrix,colSums) |
|
165 | 165 |
importFrom(SingleCellExperiment,"counts<-") |
166 | 166 |
importFrom(SingleCellExperiment,"reducedDim<-") |
167 | 167 |
importFrom(SingleCellExperiment,counts) |
168 | 168 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,136 @@ |
1 |
+#' MAST |
|
2 |
+#' |
|
3 |
+#' Run and visualize MAST analysis on a SCtkExperiment object. |
|
4 |
+#' |
|
5 |
+#' @param inSCE Input SCtkExperiment object. Required |
|
6 |
+#' @param useAssay The assay to use for the MAST calculations. The default is |
|
7 |
+#' "logcounts" |
|
8 |
+#' @param condition select variable (from the colData) that is used for the |
|
9 |
+#' model. |
|
10 |
+#' @param interest.level If the condition of interest has more than two factors, |
|
11 |
+#' indicate which level should be used to compare to all other samples. |
|
12 |
+#' @param freqExpressed Filter genes that are expressed in at least this |
|
13 |
+#' fraction of cells. The default is expression in 0.1 of samples. |
|
14 |
+#' @param fcThreshold Minimum fold change for differentially expressed gene. |
|
15 |
+#' @param p.value p values for selecting the hurdle result, default is 0.05 |
|
16 |
+#' @param useThresh Use adaptive thresholding to filter genes. The default is |
|
17 |
+#' FALSE. |
|
18 |
+#' |
|
19 |
+#' @describeIn MAST Run MAST analysis. |
|
20 |
+#' |
|
21 |
+#' @return MAST(): A data.frame of differentially expressed genes with p-values. |
|
22 |
+#' @export |
|
23 |
+MAST <- function(inSCE, condition = NULL, interest.level = NULL, |
|
24 |
+ freqExpressed = 0.1, fcThreshold=log2(1.5), p.value = 0.05, |
|
25 |
+ useThresh=FALSE, useAssay = "logcounts"){ |
|
26 |
+ |
|
27 |
+ if (is.null(condition)){ |
|
28 |
+ stop("specify the condition of interest") |
|
29 |
+ } |
|
30 |
+ |
|
31 |
+ if (length(unique(SingleCellExperiment::colData(inSCE)[, condition])) == 1) { |
|
32 |
+ stop("only one level is in the condition") |
|
33 |
+ } |
|
34 |
+ |
|
35 |
+ if (is.null(interest.level) & length( |
|
36 |
+ unique(SingleCellExperiment::colData(inSCE)[, condition])) > 2){ |
|
37 |
+ stop("You must specify a level of interest when more than 2 levels are in", |
|
38 |
+ " the condition") |
|
39 |
+ } |
|
40 |
+ |
|
41 |
+ # Create MAST SingleCellAssay |
|
42 |
+ pdata <- SingleCellExperiment::colData(inSCE) |
|
43 |
+ expres <- SummarizedExperiment::assay(inSCE, useAssay) |
|
44 |
+ fdata <- SingleCellExperiment::rowData(inSCE) |
|
45 |
+ SCENew <- MAST::FromMatrix(expres, pdata, fdata) |
|
46 |
+ |
|
47 |
+ #Caculate CDR for zlm model |
|
48 |
+ SummarizedExperiment::colData(SCENew)$cngeneson <- |
|
49 |
+ scale(colSums(SummarizedExperiment::assay(SCENew) > 0)) |
|
50 |
+ |
|
51 |
+ if (useThresh){ |
|
52 |
+ SCENew <- SCENew[which(MAST::freq(SCENew) > 0), ] |
|
53 |
+ invisible(utils::capture.output(thresh <- MAST::thresholdSCRNACountMatrix( |
|
54 |
+ SummarizedExperiment::assay(SCENew), nbins = 20, min_per_bin = 30))) |
|
55 |
+ SummarizedExperiment::assays(SCENew) <- |
|
56 |
+ list(thresh = thresh$counts_threshold, |
|
57 |
+ tpm = SummarizedExperiment::assay(SCENew)) |
|
58 |
+ } |
|
59 |
+ |
|
60 |
+ # filter based on frequency of expression across samples |
|
61 |
+ if (sum(MAST::freq(SCENew) > freqExpressed) <= 1){ |
|
62 |
+ stop("Not enough genes pass frequency expressed filter of 1") |
|
63 |
+ } |
|
64 |
+ SCENewSample <- SCENew[which(MAST::freq(SCENew) > freqExpressed), ] |
|
65 |
+ |
|
66 |
+ # if the condition of interest is numeric, to change it to a factor |
|
67 |
+ if (is.numeric(SummarizedExperiment::colData(SCENewSample)[, condition])){ |
|
68 |
+ SummarizedExperiment::colData(SCENewSample)[, condition] <- |
|
69 |
+ as.factor(SummarizedExperiment::colData(SCENewSample)[, condition]) |
|
70 |
+ } |
|
71 |
+ #Check for NAs, if true throw error |
|
72 |
+ if (any(is.na(SingleCellExperiment::colData(inSCE)[, condition]))){ |
|
73 |
+ stop("Annotation data has NA values. Filter them to continue.") |
|
74 |
+ } |
|
75 |
+ # >2 levels in the condition |
|
76 |
+ if (!is.null(interest.level) & |
|
77 |
+ length(unique(SingleCellExperiment::colData(inSCE)[, condition])) > 2){ |
|
78 |
+ levels(SummarizedExperiment::colData(SCENewSample)[, condition]) <- |
|
79 |
+ c(levels(SummarizedExperiment::colData(SCENewSample)[, condition]), |
|
80 |
+ paste0("no_", interest.level)) |
|
81 |
+ SummarizedExperiment::colData(SCENewSample)[, condition][ |
|
82 |
+ SummarizedExperiment::colData(SCENewSample)[, condition] != |
|
83 |
+ interest.level] <- paste0("no_", interest.level) |
|
84 |
+ SummarizedExperiment::colData(SCENewSample)[, condition] <- |
|
85 |
+ droplevels(as.factor( |
|
86 |
+ SummarizedExperiment::colData(SCENewSample)[, condition])) |
|
87 |
+ |
|
88 |
+ hurdle1 <- MAST::zlm(stats::as.formula(paste0("~", condition, |
|
89 |
+ "+cngeneson")), SCENewSample) |
|
90 |
+ summaryh1 <- MAST::summary(hurdle1, doLRT = paste0(condition, "no_", |
|
91 |
+ interest.level)) |
|
92 |
+ |
|
93 |
+ summaryDT <- summaryh1[["datatable"]] |
|
94 |
+ |
|
95 |
+ fcHurdle <- merge( |
|
96 |
+ summaryDT[summaryDT$contrast == paste0(condition, "no_", interest.level) & |
|
97 |
+ summaryDT$component == "H", c("primerid", "Pr(>Chisq)")], |
|
98 |
+ summaryDT[summaryDT$contrast == paste0(condition, "no_", interest.level) & |
|
99 |
+ summaryDT$component == "logFC", c("primerid", "coef", "ci.hi", |
|
100 |
+ "ci.lo")] |
|
101 |
+ ) |
|
102 |
+ } else { |
|
103 |
+ SummarizedExperiment::colData(SCENewSample)[, condition] <- |
|
104 |
+ droplevels(as.factor( |
|
105 |
+ SummarizedExperiment::colData(SCENewSample)[, condition])) |
|
106 |
+ level.cond <- levels( |
|
107 |
+ SummarizedExperiment::colData(SCENewSample)[, condition]) |
|
108 |
+ |
|
109 |
+ hurdle1 <- MAST::zlm(stats::as.formula(paste0("~", condition, |
|
110 |
+ "+cngeneson")), SCENewSample) |
|
111 |
+ summaryh1 <- MAST::summary(hurdle1, doLRT = paste0(condition, |
|
112 |
+ level.cond[2])) |
|
113 |
+ |
|
114 |
+ summaryDT <- summaryh1[["datatable"]] |
|
115 |
+ |
|
116 |
+ fcHurdle <- merge( |
|
117 |
+ summaryDT[summaryDT$contrast == paste0(condition, level.cond[2]) & |
|
118 |
+ summaryDT$component == "H", c("primerid", "Pr(>Chisq)")], |
|
119 |
+ summaryDT[summaryDT$contrast == paste0(condition, level.cond[2]) & |
|
120 |
+ summaryDT$component == "logFC", c("primerid", "coef", "ci.hi", |
|
121 |
+ "ci.lo")] |
|
122 |
+ ) |
|
123 |
+ } |
|
124 |
+ |
|
125 |
+ # Use p-value correction method, here we use fdr |
|
126 |
+ fcHurdle$fdr <- stats::p.adjust(fcHurdle$"Pr(>Chisq)", "fdr") |
|
127 |
+ |
|
128 |
+ # Filter the data again by the adjusted pvalue and coef |
|
129 |
+ fcHurdleSig <- fcHurdle[fcHurdle$fdr < p.value & abs(fcHurdle$coef) > |
|
130 |
+ fcThreshold & !is.nan(fcHurdle$coef), ] |
|
131 |
+ colnames(fcHurdleSig)[1] <- "Gene" |
|
132 |
+ |
|
133 |
+ data.table::setorder(fcHurdleSig, "fdr") |
|
134 |
+ |
|
135 |
+ return(fcHurdleSig) |
|
136 |
+} |
... | ... |
@@ -79,7 +79,7 @@ |
79 | 79 |
|
80 | 80 |
## Get merged rowData |
81 | 81 |
by.r <- unique(c('rownames', by.r)) |
82 |
- unionFe <- Reduce(function(r1, r2) merge(r1, r2, by=by.r, all=T), feList) |
|
82 |
+ unionFe <- Reduce(function(r1, r2) merge(r1, r2, by=by.r, all=TRUE), feList) |
|
83 | 83 |
allGenes <- unique(unlist(lapply(feList, rownames))) |
84 | 84 |
|
85 | 85 |
## rowData |
... | ... |
@@ -104,7 +104,7 @@ |
104 | 104 |
}) |
105 | 105 |
|
106 | 106 |
by.c <- unique(c("rownames", by.c)) |
107 |
- unionCb <- Reduce(function(c1, c2) merge(c1, c2, by=by.c, all=T), cbList) |
|
107 |
+ unionCb <- Reduce(function(c1, c2) merge(c1, c2, by=by.c, all=TRUE), cbList) |
|
108 | 108 |
rownames(unionCb) <- unionCb[['rownames']] |
109 | 109 |
newCbList <- list() |
110 | 110 |
for (i in seq_along(sceList)) { |
... | ... |
@@ -234,7 +234,7 @@ combineSCE <- function(sceList, by.r, by.c, combined){ |
234 | 234 |
assayList <- .mergeAssaySCE(sceList) |
235 | 235 |
|
236 | 236 |
New_SCE <- list() |
237 |
- for (i in 1:length(sceList)) { |
|
237 |
+ for (i in seq(length(sceList))) { |
|
238 | 238 |
## create new sce |
239 | 239 |
New_SCE[[i]] <- .constructSCE(matrices = assayList[[i]], features = newFeList, |
240 | 240 |
barcodes = newCbList[[i]], |
... | ... |
@@ -249,4 +249,4 @@ combineSCE <- function(sceList, by.r, by.c, combined){ |
249 | 249 |
return(sce) |
250 | 250 |
} |
251 | 251 |
return(New_SCE) |
252 |
-} |
|
253 | 252 |
\ No newline at end of file |
253 |
+} |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
#' Example Single Cell RNA-Seq data in SingleCellExperiment Object, GSE60361 |
2 | 2 |
#' subset |
3 | 3 |
#' |
4 |
-#' A subset of 30 samples from a single cell RNA-Seq experiment from Zeisel, et |
|
4 |
+#' A subset of 30 cells from a single cell RNA-Seq experiment from Zeisel, et |
|
5 | 5 |
#' al. Science 2015. The data was produced from cells from the mouse |
6 | 6 |
#' somatosensory cortex (S1) and hippocampus (CA1). 15 of the cells were |
7 | 7 |
#' identified as oligodendrocytes and 15 of the cell were identified as |
... | ... |
@@ -16,12 +16,12 @@ |
16 | 16 |
#' data("mouseBrainSubsetSCE") |
17 | 17 |
"mouseBrainSubsetSCE" |
18 | 18 |
|
19 |
-#' Example Single Cell RNA-Seq data in SingleCellExperiment Object, |
|
19 |
+#' Example Single Cell RNA-Seq data in SingleCellExperiment Object, |
|
20 | 20 |
#' subset of 10x public dataset |
21 | 21 |
#' https://support.10xgenomics.com/single-cell-gene-expression/datasets/2.1.0/pbmc4k |
22 | 22 |
#' A subset of 390 barcodes and top 200 genes were included in this example. |
23 | 23 |
#' Within 390 barcodes, 195 barcodes are empty droplet, 150 barcodes are cell barcode |
24 |
-#' and 45 barcodes are doublets predicted by scrublet and doubletFinder package. |
|
24 |
+#' and 45 barcodes are doublets predicted by scrublet and doubletFinder package. |
|
25 | 25 |
#' This example only serves as a proof of concept and a tutoriol on how to |
26 | 26 |
#' run the functions in this package. The results should not be |
27 | 27 |
#' used for drawing scientific conclusions. |
... | ... |
@@ -40,38 +40,31 @@ |
40 | 40 |
#' Two batches of pancreas scRNAseq dataset are combined with their original |
41 | 41 |
#' counts. Cell types and batches are annotated in `colData(sceBatches)`. |
42 | 42 |
#' Two batches came from Wang, et al., 2016, annotated as `'w'`; and Xin, et |
43 |
-#' al., 2016, annotated as `'x'`. Two common cell types, `'alpha'` and |
|
44 |
-#' `'beta'`, that could be found in both original studies with relatively |
|
43 |
+#' al., 2016, annotated as `'x'`. Two common cell types, `'alpha'` and |
|
44 |
+#' `'beta'`, that could be found in both original studies with relatively |
|
45 | 45 |
#' large population were kept for cleaner demonstration. |
46 |
-#' |
|
47 |
-#' @name sceBatches |
|
48 |
-#' @docType data |
|
49 |
-#' @format SingleCellExperiment |
|
50 |
-#' @source DOI: 10.2337/db16-0405 and 10.1016/j.cmet.2016.08.018 |
|
51 |
-#' @keywords datasets |
|
52 |
-#' @examples |
|
53 | 46 |
#' data('sceBatches') |
54 | 47 |
"sceBatches" |
55 | 48 |
|
56 | 49 |
#' Stably Expressed Gene (SEG) list obect, with SEG sets for human and mouse. |
57 |
-#' |
|
50 |
+#' |
|
58 | 51 |
#' The two gene sets came from dataset called `segList` of package `scMerge`. |
59 | 52 |
#' @name SEG |
60 | 53 |
#' @docType data |
61 |
-#' @format list, with two entries `"human"` and `"mouse"`, each is a charactor |
|
62 |
-#' array. |
|
63 |
-#' @source `data('segList', package='scMerge')`` |
|
54 |
+#' @format list, with two entries \code{"human"} and \code{"mouse"}, each is a |
|
55 |
+#' charactor vector. |
|
56 |
+#' @source \code{data('segList', package='scMerge')} |
|
64 | 57 |
#' @keywords datasets |
65 |
-#' @examples |
|
58 |
+#' @examples |
|
66 | 59 |
#' data('SEG') |
67 | 60 |
#' humanSEG <- SEG$human |
68 | 61 |
"SEG" |
69 | 62 |
|
70 | 63 |
#' MSigDB gene get Cctegory table |
71 |
-#' |
|
72 |
-#' A table of gene set categories that can be download from MSigDB. The |
|
64 |
+#' |
|
65 |
+#' A table of gene set categories that can be download from MSigDB. The |
|
73 | 66 |
#' categories and descriptions can be found here: |
74 |
-#' https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp. The IDs in the |
|
67 |
+#' https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp. The IDs in the |
|
75 | 68 |
#' first column can be used to retrieve the gene sets for these categories |
76 | 69 |
#' using the \link{importGeneSetsFromMSigDB} function. |
77 | 70 |
|
... | ... |
@@ -1,427 +1,666 @@ |
1 |
-## Original code from DoubletFinder package |
|
2 |
-## (https://github.com/chris-mcginnis-ucsf/DoubletFinder) |
|
3 |
- |
|
4 |
-.parallel_paramSweep <- function(n, n.real.cells, real.cells, pK, pN, data, |
|
5 |
- orig.commands, PCs, sct, verbose, seed) { |
|
6 |
- sweep.res.list <- list() |
|
7 |
- list.ind <- 0 |
|
8 |
- |
|
9 |
- ## Make merged real-artifical data |
|
10 |
- if (verbose) { |
|
11 |
- print(paste("Creating artificial doublets for pN = ", |
|
12 |
- pN[n] * 100, "%", sep = "")) |
|
13 |
- } |
|
14 |
- n_doublets <- ceiling(n.real.cells / (1 - pN[n]) - n.real.cells) |
|
15 |
- real.cells1 <- sample(real.cells, n_doublets, replace = TRUE) |
|
16 |
- real.cells2 <- sample(real.cells, n_doublets, replace = TRUE) |
|
17 |
- doublets <- (data[, real.cells1] + data[, real.cells2]) / 2 |
|
18 |
- colnames(doublets) <- paste("X", 1:n_doublets, sep = "") |
|
19 |
- data_wdoublets <- cbind(data, doublets) |
|
20 |
- |
|
21 |
- ## Pre-process Seurat object |
|
22 |
- if (sct == FALSE) { |
|
23 |
- if (verbose) { |
|
24 |
- print("Creating Seurat object...") |
|
25 |
- } |
|
26 |
- seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets) |
|
27 |
- |
|
28 |
- if (verbose) { |
|
29 |
- print("Normalizing Seurat object...") |
|
30 |
- } |
|
31 |
- seu_wdoublets <- Seurat::NormalizeData(seu_wdoublets, |
|
32 |
- normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method, |
|
33 |
- scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor, |
|
34 |
- margin = orig.commands$NormalizeData.RNA@params$margin, |
|
35 |
- verbose = verbose |
|
36 |
- ) |
|
37 |
- |
|
38 |
- if (verbose) { |
|
39 |
- print("Finding variable genes...") |
|
40 |
- } |
|
41 |
- seu_wdoublets <- Seurat::FindVariableFeatures(seu_wdoublets, |
|
42 |
- selection.method = orig.commands$FindVariableFeatures.RNA$selection.method, |
|
43 |
- loess.span = orig.commands$FindVariableFeatures.RNA$loess.span, |
|
44 |
- clip.max = orig.commands$FindVariableFeatures.RNA$clip.max, |
|
45 |
- mean.function = orig.commands$FindVariableFeatures.RNA$mean.function, |
|
46 |
- dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function, |
|
47 |
- num.bin = orig.commands$FindVariableFeatures.RNA$num.bin, |
|
48 |
- binning.method = orig.commands$FindVariableFeatures.RNA$binning.method, |
|
49 |
- nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures, |
|
50 |
- mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff, |
|
51 |
- dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff, |
|
52 |
- verbose = verbose |
|
53 |
- ) |
|
54 |
- |
|
55 |
- if (verbose) { |
|
56 |
- print("Scaling data...") |
|
57 |
- } |
|
58 |
- seu_wdoublets <- Seurat::ScaleData( |
|
59 |
- seu_wdoublets, |
|
60 |
- features = orig.commands$ScaleData.RNA$features, |
|
61 |
- model.use = orig.commands$ScaleData.RNA$model.use, |
|
62 |
- do.scale = orig.commands$ScaleData.RNA$do.scale, |
|
63 |
- do.center = orig.commands$ScaleData.RNA$do.center, |
|
64 |
- scale.max = orig.commands$ScaleData.RNA$scale.max, |
|
65 |
- block.size = orig.commands$ScaleData.RNA$block.size, |
|
66 |
- min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block, |
|
67 |
- verbose = verbose |
|
68 |
- ) |
|
69 |
- |
|
70 |
- if (verbose) { |
|
71 |
- print("Running PCA...") |
|
72 |
- } |
|
73 |
- seu_wdoublets <- Seurat::RunPCA( |
|
74 |
- seu_wdoublets, |
|
75 |
- features = orig.commands$ScaleData.RNA$features, |
|
76 |
- npcs = length(PCs), |
|
77 |
- rev.pca = orig.commands$RunPCA.RNA$rev.pca, |
|
78 |
- weight.by.var = orig.commands$RunPCA.RNA$weight.by.var, |
|
79 |
- seed.use = seed, |
|
80 |
- verbose = FALSE |
|
81 |
- ) |
|
82 |
- } |
|
83 |
- |
|
84 |
- if (sct == TRUE) { |
|
85 |
- if (verbose) { |
|
86 |
- print("Creating Seurat object...") |
|
87 |
- } |
|
88 |
- seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets) |
|
89 |
- |
|
90 |
- if (verbose) { |
|
91 |
- print("Running SCTransform...") |
|
92 |
- } |
|
93 |
- seu_wdoublets <- Seurat::SCTransform(seu_wdoublets) |
|
94 |
- |
|
95 |
- if (verbose) { |
|
96 |
- print("Running PCA...") |
|
97 |
- } |
|
98 |
- seu_wdoublets <- Seurat::RunPCA(seu_wdoublets, |
|
99 |
- npcs = length(PCs), seed.use = seed, |
|
100 |
- verbose = verbose |
|
101 |
- ) |
|
102 |
- } |
|
103 |
- |
|
104 |
- ## Compute PC distance matrix |
|
105 |
- if (verbose) { |
|
106 |
- print("Calculating PC distance matrix...") |
|
107 |
- } |
|
108 |
- nCells <- nrow(seu_wdoublets@meta.data) |
|
109 |
- pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[, PCs] |
|
110 |
- seu_wdoublets <- NULL |
|
111 |
- gc() |
|
112 |
- dist.mat <- fields::rdist(pca.coord)[, 1:n.real.cells] |
|
113 |
- |
|
114 |
- ## Pre-order PC distance matrix prior to iterating across pK |
|
115 |
- ## for pANN computations |
|
116 |
- if (verbose) { |
|
117 |
- print("Defining neighborhoods...") |
|
118 |
- } |
|
119 |
- |
|
120 |
- for (i in 1:n.real.cells) { |
|
121 |
- dist.mat[, i] <- order(dist.mat[, i]) |
|
122 |
- } |
|
123 |
- |
|
124 |
- ## Trim PC distance matrix for faster manipulations |
|
125 |
- ind <- round(nCells * max(pK)) + 5 |
|
126 |
- dist.mat <- dist.mat[1:ind, ] |
|
127 |
- |
|
128 |
- ## Compute pANN across pK sweep |
|
129 |
- |
|
130 |
- if (verbose) { |
|
131 |
- print("Computing pANN across all pK...") |
|
132 |
- } |
|
133 |
- for (k in 1:length(pK)) { |
|
134 |
- if (verbose) { |
|
135 |
- print(paste("pK = ", pK[k], "...", sep = "")) |
|
136 |
- } |
|
137 |
- pk.temp <- round(nCells * pK[k]) |
|
138 |
- pANN <- as.data.frame(matrix(0L, nrow = n.real.cells, ncol = 1)) |
|
139 |
- colnames(pANN) <- "pANN" |
|
140 |
- rownames(pANN) <- real.cells |
|
141 |
- list.ind <- list.ind + 1 |
|
142 |
- |
|
143 |
- for (i in 1:n.real.cells) { |
|
144 |
- neighbors <- dist.mat[2:(pk.temp + 1), i] |
|
145 |
- pANN$pANN[i] <- length(which(neighbors > n.real.cells)) / pk.temp |
|
146 |
- } |
|
147 |
- |
|
148 |
- sweep.res.list[[list.ind]] <- pANN |
|
149 |
- } |
|
150 |
- |
|
151 |
- return(sweep.res.list) |
|
152 |
-} |
|
153 |
- |
|
154 |
-.paramSweep <- function(seu, PCs = 1:10, sct = FALSE, |
|
155 |
- verbose = FALSE, num.cores, seed) { |
|
156 |
- ## Set pN-pK param sweep ranges |
|
157 |
- pK <- c(0.0005, 0.001, 0.005, seq(0.01, 0.3, by = 0.01)) |
|
158 |
- pN <- seq(0.05, 0.3, by = 0.05) |
|
159 |
- ## Remove pK values with too few cells |
|
160 |
- min.cells <- round(nrow(seu@meta.data) / (1 - 0.05) - nrow(seu@meta.data)) |
|
161 |
- pK.test <- round(pK * min.cells) |
|
162 |
- pK <- pK[which(pK.test >= 1)] |
|
163 |
- |
|
164 |
- ## Extract pre-processing parameters from original data analysis workflow |
|
165 |
- orig.commands <- seu@commands |
|
166 |
- |
|
167 |
- ## Down-sample cells to 10000 (when applicable) for computational effiency |
|
168 |
- if (nrow(seu@meta.data) > 10000) { |
|
169 |
- real.cells <- rownames(seu@meta.data)[sample(1:nrow(seu@meta.data), |
|
170 |
- 10000, replace = FALSE)] |
|
171 |
- data <- seu@assays$RNA@counts[, real.cells] |
|
172 |
- n.real.cells <- ncol(data) |
|
173 |
- } |
|
174 |
- |
|
175 |
- if (nrow(seu@meta.data) <= 10000) { |
|
176 |
- real.cells <- rownames(seu@meta.data) |
|
177 |
- data <- seu@assays$RNA@counts |
|
178 |
- n.real.cells <- ncol(data) |
|
179 |
- } |
|
180 |
- ## Iterate through pN, computing pANN vectors at varying pK |
|
181 |
- # no_cores <- detectCores()-1 |
|
182 |
- if (is.null(num.cores)) { |
|
183 |
- num.cores <- 1 |
|
184 |
- } |
|
185 |
- |
|
186 |
- if (num.cores > 1) { |
|
187 |
- cl <- parallel::makeCluster(num.cores) |
|
188 |
- |
|
189 |
- output2 <- withr::with_seed( |
|
190 |
- seed, |
|
191 |
- parallel::mclapply(as.list(1:length(pN)), |
|
192 |
- FUN = .parallel_paramSweep, |
|
193 |
- n.real.cells, |
|
194 |
- real.cells, |
|
195 |
- pK, |
|
196 |
- pN, |
|
197 |
- data, |
|
198 |
- orig.commands, |
|
199 |
- PCs, |
|
200 |
- sct, mc.cores = num.cores, |
|
201 |
- verbose = verbose, |
|
202 |
- seed = seed, |
|
203 |
- mc.set.seed = FALSE |
|
204 |
- ) |
|
205 |
- ) |
|
206 |
- parallel::stopCluster(cl) |
|
207 |
- } else { |
|
208 |
- output2 <- lapply(as.list(1:length(pN)), |
|
209 |
- FUN = .parallel_paramSweep, |
|
210 |
- n.real.cells, |
|
211 |
- real.cells, |
|
212 |
- pK, |
|
213 |
- pN, |
|
214 |
- data, |
|
215 |
- orig.commands, |
|
216 |
- PCs, |
|
217 |
- sct, |
|
218 |
- seed = seed, |
|
219 |
- verbose = verbose |
|
220 |
- ) |
|
221 |
- } |
|
222 |
- |
|
223 |
- ## Write parallelized output into list |
|
224 |
- sweep.res.list <- list() |
|
225 |
- list.ind <- 0 |
|
226 |
- for (i in 1:length(output2)) { |
|
227 |
- for (j in 1:length(output2[[i]])) { |
|
228 |
- list.ind <- list.ind + 1 |
|
229 |
- sweep.res.list[[list.ind]] <- output2[[i]][[j]] |
|
230 |
- } |
|
231 |
- } |
|
232 |
- ## Assign names to list of results |
|
233 |
- name.vec <- NULL |
|
234 |
- for (j in 1:length(pN)) { |
|
235 |
- name.vec <- c(name.vec, paste("pN", pN[j], "pK", pK, sep = "_")) |
|
236 |
- } |
|
237 |
- names(sweep.res.list) <- name.vec |
|
238 |
- return(sweep.res.list) |
|
239 |
- |
|
240 |
-} |
|
241 |
- |
|
242 |
-.runDoubletFinder <- function(counts, seuratPcs, seuratRes, formationRate, |
|
243 |
- seuratNfeatures, verbose = FALSE, |
|
244 |
- nCores = NULL, seed = 12345) { |
|
245 |
- |
|
246 |
- ## Convert to sparse matrix if not already in that format |
|
247 |
- counts <- .convertToMatrix(counts) |
|
248 |
- colnames(counts) <- gsub("_", "-", colnames(counts)) |
|
249 |
- |
|
250 |
- seurat <- suppressWarnings(Seurat::CreateSeuratObject( |
|
251 |
- counts = counts, |
|
252 |
- project = "seurat", min.features = 0 |
|
253 |
- )) |
|
254 |
- seurat <- Seurat::NormalizeData( |
|
255 |
- object = seurat, |
|
256 |
- normalization.method = "LogNormalize", scale.factor = 10000, |
|
257 |
- verbose = verbose |
|
258 |
- ) |
|
259 |
- seurat <- Seurat::FindVariableFeatures(seurat, |
|
260 |
- selection.method = "vst", |
|
261 |
- nfeatures = seuratNfeatures, |
|
262 |
- verbose = verbose |
|
263 |
- ) |
|
264 |
- |
|
265 |
- allGenes <- rownames(seurat) |
|
266 |
- seurat <- Seurat::ScaleData(seurat, features = allGenes, verbose = verbose) |
|
267 |
- |
|
268 |
- numPc <- min(ncol(seurat@assays$RNA@scale.data) - 1, 50) |
|
269 |
- seurat <- Seurat::RunPCA(seurat, |
|
270 |
- features = |
|
271 |
- Seurat::VariableFeatures(object = seurat), |
|
272 |
- npcs = numPc, verbose = verbose, seed.use = seed |
|
273 |
- ) |
|
274 |
- seurat <- Seurat::FindNeighbors(seurat, dims = seuratPcs, verbose = verbose) |
|
275 |
- seurat <- Seurat::FindClusters(seurat, |
|
276 |
- resolution = seuratRes, |
|
277 |
- verbose = verbose, |
|
278 |
- random.seed = seed |
|
279 |
- ) |
|
280 |
- invisible(sweepResListSeurat <- .paramSweep(seurat, |
|
281 |
- PCs = seuratPcs, |
|
282 |
- sct = FALSE, |
|
283 |
- num.cores = nCores, |
|
284 |
- verbose = verbose, |
|
285 |
- seed = seed |
|
286 |
- )) |
|
287 |
- sweepStatsSeurat <- DoubletFinder::summarizeSweep(sweepResListSeurat, |
|
288 |
- GT = FALSE |
|
289 |
- ) |
|
290 |
- bcmvnSeurat <- DoubletFinder::find.pK(sweepStatsSeurat, verbose = verbose) |
|
291 |
- pkOptimal <- as.numeric(as.matrix(bcmvnSeurat$pK[ |
|
292 |
- which.max(bcmvnSeurat$MeanBC) |
|
293 |
- ])) |
|
294 |
- annotations <- seurat@meta.data$seurat_clusters |
|
295 |
- homotypicProp <- DoubletFinder::modelHomotypic(annotations) |
|
296 |
- nExpPoi <- round(formationRate * ncol(seurat@assays$RNA)) |
|
297 |
- seurat <- DoubletFinder::doubletFinder_v3(seurat, |
|
298 |
- PCs = seuratPcs, |
|
299 |
- pN = 0.25, |
|
300 |
- pK = pkOptimal, |
|
301 |
- nExp = nExpPoi, |
|
302 |
- reuse.pANN = FALSE, |
|
303 |
- sct = FALSE, |
|
304 |
- verbose = FALSE |
|
305 |
- ) |
|
306 |
- names(seurat@meta.data)[6] <- "doubletFinderAnnScore" |
|
307 |
- names(seurat@meta.data)[7] <- "doubletFinderLabel" |
|
308 |
- return(seurat) |
|
309 |
-} |
|
310 |
- |
|
311 |
-#' @title Generates a doublet score for each cell via doubletFinder |
|
312 |
-#' @description Uses doubletFinder to determine cells within the dataset |
|
313 |
-#' suspected to be doublets. |
|
314 |
-#' @param inSCE Input SingleCellExperiment object. Must contain a counts matrix |
|
315 |
-#' @param useAssay Indicate which assay to use. Default "counts". |
|
316 |
-#' @param sample Numeric vector. Each cell will be assigned a sample number. |
|
317 |
-#' @param seed Seed for the random number generator. Default 12345. |
|
318 |
-#' @param seuratNfeatures Integer. Number of highly variable genes to use. |
|
319 |
-#' Default 2000. |
|
320 |
-#' @param seuratPcs Numeric vector. The PCs used in seurat function to |
|
321 |
-#' determine number of clusters. Default 1:15. |
|
322 |
-#' @param seuratRes Numeric vector. The resolution parameter used in seurat, |
|
323 |
-#' which adjusts the number of clusters determined via the algorithm. |
|
324 |
-#' Default 1.5. |
|
325 |
-#' @param formationRate Doublet formation rate used within algorithm. |
|
326 |
-#' Default 0.075. |
|
327 |
-#' @param nCores Number of cores used for running the function. |
|
328 |
-#' @param verbose Boolean. Wheter to print messages from Seurat and |
|
329 |
-#' DoubletFinder. Default FALSE. |
|
330 |
-#' @return SingleCellExperiment object containing the |
|
331 |
-#' 'doublet_finder_doublet_score'. |
|
332 |
-#' @examples |
|
333 |
-#' \donttest{ |
|
334 |
-#' data(scExample, package = "singleCellTK") |
|
335 |
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet'] |
|
336 |
-#' sce <- runDoubletFinder(sce) |
|
337 |
-#' } |
|
338 |
-#' @export |
|
339 |
-#' @importFrom SummarizedExperiment colData colData<- |
|
340 |
-#' @importFrom SingleCellExperiment reducedDim<- |
|
341 |
-runDoubletFinder <- function(inSCE, |
|
342 |
- useAssay = "counts", |
|
343 |
- sample = NULL, |
|
344 |
- seed = 12345, |
|
345 |
- seuratNfeatures = 2000, |
|
346 |
- seuratPcs = 1:15, |
|
347 |
- seuratRes = 1.5, |
|
348 |
- formationRate = 0.075, |
|
349 |
- nCores = NULL, |
|
350 |
- verbose = FALSE) { |
|
351 |
- |
|
352 |
- # argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame())) |
|
353 |
- argsList <- mget(names(formals()), sys.frame(sys.nframe())) |
|
354 |
- |
|
355 |
- if (!is.null(sample)) { |
|
356 |
- if (length(sample) != ncol(inSCE)) { |
|
357 |
- stop("'sample' must be the same length as the number of columns in 'inSCE'") |
|
358 |
- } |
|
359 |
- } else { |
|
360 |
- sample <- rep(1, ncol(inSCE)) |
|
361 |
- } |
|
362 |
- |
|
363 |
- message(paste0(date(), " ... Running 'doubletFinder'")) |
|
364 |
- |
|
365 |
- doubletScore <- rep(NA, ncol(inSCE)) |
|
366 |
- doubletLabel <- rep(NA, ncol(inSCE)) |
|
367 |
- allSampleNumbers <- sort(unique(sample)) |
|
368 |
- |
|
369 |
- |
|
370 |
- for (res in seuratRes) { |
|
371 |
- output <- S4Vectors::DataFrame( |
|
372 |
- row.names = colnames(inSCE), |
|
373 |
- doubletFinder_doublet_score = numeric(ncol(inSCE)), |
|
374 |
- doubletFinder_doublet_label = numeric(ncol(inSCE)) |
|
375 |
- ) |
|
376 |
- umapDims <- matrix(ncol = 2, |
|
377 |
- nrow = ncol(inSCE)) |
|
378 |
- rownames(umapDims) = colnames(inSCE) |
|
379 |
- colnames(umapDims) = c("UMAP_1", "UMAP_2") |
|
380 |
- |
|
381 |
- ## Loop through each sample and run doubletFinder |
|
382 |
- samples <- unique(sample) |
|
383 |
- |
|
384 |
- for (i in seq_len(length(samples))) { |
|
385 |
- sceSampleInd <- sample == samples[i] |
|
386 |
- sceSample <- inSCE[, sceSampleInd] |
|
387 |
- mat <- SummarizedExperiment::assay(sceSample, i = useAssay) |
|
388 |
- if(length(seuratPcs) > ncol(mat)){ |
|
389 |
- seuratPcs <- 1:ncol(mat) |
|
390 |
- } |
|
391 |
- |
|
392 |
- result <- suppressMessages(withr::with_seed( |
|
393 |
- seed, |
|
394 |
- .runDoubletFinder( |
|
395 |
- counts = mat, |
|
396 |
- seuratPcs = seuratPcs, |
|
397 |
- seuratRes = res, |
|
398 |
- seuratNfeatures = seuratNfeatures, |
|
399 |
- formationRate = formationRate, |
|
400 |
- nCores = nCores, |
|
401 |
- verbose = verbose, |
|
402 |
- seed = seed |
|
403 |
- ) |
|
404 |
- )) |
|
405 |
- result <- suppressMessages(Seurat::RunUMAP(result, |
|
406 |
- dims = 1:10, |
|
407 |
- verbose = verbose, |
|
408 |
- umap.method = "umap-learn", |
|
409 |
- metric = "correlation")) |
|
410 |
- |
|
411 |
- seuratDims <- Seurat::Embeddings(result, reduction = "umap") |
|
412 |
- umapDims[sceSampleInd, 1] <- seuratDims[,1] |
|
413 |
- umapDims[sceSampleInd, 2] <- seuratDims[,2] |
|
414 |
- output[sceSampleInd, 1] <- result@meta.data$doubletFinderAnnScore |
|
415 |
- output[sceSampleInd, 2] <- result@meta.data$doubletFinderLabel |
|
416 |
- } |
|
417 |
- |
|
418 |
- colnames(output) <- paste0(colnames(output), "_resolution_", res) |
|
419 |
- |
|
420 |
- argsList <- argsList[!names(argsList) %in% ("...")] |
|
421 |
- inSCE@metadata$runDoubletFinder <- argsList[-1] |
|
422 |
- inSCE@metadata$runDoubletFinder$packageVersion <- utils::packageDescription("DoubletFinder")$Version |
|
423 |
- colData(inSCE) <- cbind(colData(inSCE), output) |
|
424 |
- reducedDim(inSCE,'doubletFinder_UMAP') <- umapDims |
|
425 |
- } |
|
426 |
- return(inSCE) |
|
427 |
-} |
|
1 |
+## Original code from DoubletFinder package |
|
2 |
+## (https://github.com/chris-mcginnis-ucsf/DoubletFinder) |
|
3 |
+ |
|
4 |
+.parallel_paramSweep <- function(n, n.real.cells, real.cells, pK, pN, data, |
|
5 |
+ orig.commands, PCs, sct, verbose, seed) { |
|
6 |
+ sweep.res.list <- list() |
|
7 |
+ list.ind <- 0 |
|
8 |
+ |
|
9 |
+ ## Make merged real-artifical data |
|
10 |
+ if (verbose) { |
|
11 |
+ print(paste("Creating artificial doublets for pN = ", |
|
12 |
+ pN[n] * 100, "%", sep = "")) |
|
13 |
+ } |
|
14 |
+ n_doublets <- ceiling(n.real.cells / (1 - pN[n]) - n.real.cells) |
|
15 |
+ real.cells1 <- sample(real.cells, n_doublets, replace = TRUE) |
|
16 |
+ real.cells2 <- sample(real.cells, n_doublets, replace = TRUE) |
|
17 |
+ doublets <- (data[, real.cells1] + data[, real.cells2]) / 2 |
|
18 |
+ colnames(doublets) <- paste("X", 1:n_doublets, sep = "") |
|
19 |
+ data_wdoublets <- cbind(data, doublets) |
|
20 |
+ |
|
21 |
+ ## Pre-process Seurat object |
|
22 |
+ if (sct == FALSE) { |
|
23 |
+ if (verbose) { |
|
24 |
+ print("Creating Seurat object...") |
|
25 |
+ } |
|
26 |
+ seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets) |
|
27 |
+ |
|
28 |
+ if (verbose) { |
|
29 |
+ print("Normalizing Seurat object...") |
|
30 |
+ } |
|
31 |
+ seu_wdoublets <- Seurat::NormalizeData(seu_wdoublets, |
|
32 |
+ normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method, |
|
33 |
+ scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor, |
|
34 |
+ margin = orig.commands$NormalizeData.RNA@params$margin, |
|
35 |
+ verbose = verbose |
|
36 |
+ ) |
|
37 |
+ |
|
38 |
+ if (verbose) { |
|
39 |
+ print("Finding variable genes...") |
|
40 |
+ } |
|
41 |
+ seu_wdoublets <- Seurat::FindVariableFeatures(seu_wdoublets, |
|
42 |
+ selection.method = orig.commands$FindVariableFeatures.RNA$selection.method, |
|
43 |
+ loess.span = orig.commands$FindVariableFeatures.RNA$loess.span, |
|
44 |
+ clip.max = orig.commands$FindVariableFeatures.RNA$clip.max, |
|
45 |
+ mean.function = orig.commands$FindVariableFeatures.RNA$mean.function, |
|
46 |
+ dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function, |
|
47 |
+ num.bin = orig.commands$FindVariableFeatures.RNA$num.bin, |
|
48 |
+ binning.method = orig.commands$FindVariableFeatures.RNA$binning.method, |
|
49 |
+ nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures, |
|
50 |
+ mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff, |
|
51 |
+ dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff, |
|
52 |
+ verbose = verbose |
|
53 |
+ ) |
|
54 |
+ |
|
55 |
+ if (verbose) { |
|
56 |
+ print("Scaling data...") |
|
57 |
+ } |
|
58 |
+ seu_wdoublets <- Seurat::ScaleData( |
|
59 |
+ seu_wdoublets, |
|
60 |
+ features = orig.commands$ScaleData.RNA$features, |
|
61 |
+ model.use = orig.commands$ScaleData.RNA$model.use, |
|
62 |
+ do.scale = orig.commands$ScaleData.RNA$do.scale, |
|
63 |
+ do.center = orig.commands$ScaleData.RNA$do.center, |
|
64 |
+ scale.max = orig.commands$ScaleData.RNA$scale.max, |
|
65 |
+ block.size = orig.commands$ScaleData.RNA$block.size, |
|
66 |
+ min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block, |
|
67 |
+ verbose = verbose |
|
68 |
+ ) |
|
69 |
+ |
|
70 |
+ if (verbose) { |
|
71 |
+ print("Running PCA...") |
|
72 |
+ } |
|
73 |
+ seu_wdoublets <- Seurat::RunPCA( |
|
74 |
+ seu_wdoublets, |
|
75 |
+ features = orig.commands$ScaleData.RNA$features, |
|
76 |
+ npcs = length(PCs), |
|
77 |
+ rev.pca = orig.commands$RunPCA.RNA$rev.pca, |
|
78 |
+ weight.by.var = orig.commands$RunPCA.RNA$weight.by.var, |
|
79 |
+ seed.use = seed, |
|
80 |
+ verbose = FALSE |
|
81 |
+ ) |
|
82 |
+ } |
|
83 |
+ |
|
84 |
+ if (sct == TRUE) { |
|
85 |
+ if (verbose) { |
|
86 |
+ print("Creating Seurat object...") |
|
87 |
+ } |
|
88 |
+ seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets) |
|
89 |
+ |
|
90 |
+ if (verbose) { |
|
91 |
+ print("Running SCTransform...") |
|
92 |
+ } |
|
93 |
+ seu_wdoublets <- Seurat::SCTransform(seu_wdoublets) |
|
94 |
+ |
|
95 |
+ if (verbose) { |
|
96 |
+ print("Running PCA...") |
|
97 |
+ } |
|
98 |
+ seu_wdoublets <- Seurat::RunPCA(seu_wdoublets, |
|
99 |
+ npcs = length(PCs), seed.use = seed, |
|
100 |
+ verbose = verbose |
|
101 |
+ ) |
|
102 |
+ } |
|
103 |
+ |
|
104 |
+ ## Compute PC distance matrix |
|
105 |
+ if (verbose) { |
|
106 |
+ print("Calculating PC distance matrix...") |
|
107 |
+ } |
|
108 |
+ nCells <- nrow(seu_wdoublets@meta.data) |
|
109 |
+ pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[, PCs] |
|
110 |
+ seu_wdoublets <- NULL |
|
111 |
+ gc() |
|
112 |
+ dist.mat <- fields::rdist(pca.coord)[, 1:n.real.cells] |
|
113 |
+ |
|
114 |
+ ## Pre-order PC distance matrix prior to iterating across pK |
|
115 |
+ ## for pANN computations |
|
116 |
+ if (verbose) { |
|
117 |
+ print("Defining neighborhoods...") |
|
118 |
+ } |
|
119 |
+ |
|
120 |
+ for (i in 1:n.real.cells) { |
|
121 |
+ dist.mat[, i] <- order(dist.mat[, i]) |
|
122 |
+ } |
|
123 |
+ |
|
124 |
+ ## Trim PC distance matrix for faster manipulations |
|
125 |
+ ind <- round(nCells * max(pK)) + 5 |
|
126 |
+ dist.mat <- dist.mat[1:ind, ] |
|
127 |
+ |
|
128 |
+ ## Compute pANN across pK sweep |
|
129 |
+ |
|
130 |
+ if (verbose) { |
|
131 |
+ print("Computing pANN across all pK...") |
|
132 |
+ } |
|
133 |
+ for (k in 1:length(pK)) { |
|
134 |
+ if (verbose) { |
|
135 |
+ print(paste("pK = ", pK[k], "...", sep = "")) |
|
136 |
+ } |
|
137 |
+ pk.temp <- round(nCells * pK[k]) |
|
138 |
+ pANN <- as.data.frame(matrix(0L, nrow = n.real.cells, ncol = 1)) |
|
139 |
+ colnames(pANN) <- "pANN" |
|
140 |
+ rownames(pANN) <- real.cells |
|
141 |
+ list.ind <- list.ind + 1 |
|
142 |
+ |
|
143 |
+ for (i in 1:n.real.cells) { |
|
144 |
+ neighbors <- dist.mat[2:(pk.temp + 1), i] |
|
145 |
+ pANN$pANN[i] <- length(which(neighbors > n.real.cells)) / pk.temp |
|
146 |
+ } |
|
147 |
+ |
|
148 |
+ sweep.res.list[[list.ind]] <- pANN |
|
149 |
+ } |
|
150 |
+ |
|
151 |
+ return(sweep.res.list) |
|
152 |
+} |
|
153 |
+ |
|
154 |
+.paramSweep <- function(seu, PCs = 1:10, sct = FALSE, |
|
155 |
+ verbose = FALSE, num.cores, seed) { |
|
156 |
+ ## Set pN-pK param sweep ranges |
|
157 |
+ pK <- c(0.0005, 0.001, 0.005, seq(0.01, 0.3, by = 0.01)) |
|
158 |
+ pN <- seq(0.05, 0.3, by = 0.05) |
|
159 |
+ ## Remove pK values with too few cells |
|
160 |
+ min.cells <- round(nrow(seu@meta.data) / (1 - 0.05) - nrow(seu@meta.data)) |
|
161 |
+ pK.test <- round(pK * min.cells) |
|
162 |
+ pK <- pK[which(pK.test >= 1)] |
|
163 |
+ |
|
164 |
+ ## Extract pre-processing parameters from original data analysis workflow |
|
165 |
+ orig.commands <- seu@commands |
|
166 |
+ |
|
167 |
+ ## Down-sample cells to 10000 (when applicable) for computational effiency |
|
168 |
+ if (nrow(seu@meta.data) > 10000) { |
|
169 |
+ real.cells <- rownames(seu@meta.data)[sample(1:nrow(seu@meta.data), |
|
170 |
+ 10000, replace = FALSE)] |
|
171 |
+ data <- seu@assays$RNA@counts[, real.cells] |
|
172 |
+ n.real.cells <- ncol(data) |
|
173 |
+ } |
|
174 |
+ |
|
175 |
+ if (nrow(seu@meta.data) <= 10000) { |
|
176 |
+ real.cells <- rownames(seu@meta.data) |
|
177 |
+ data <- seu@assays$RNA@counts |
|
178 |
+ n.real.cells <- ncol(data) |
|
179 |
+ } |
|
180 |
+ ## Iterate through pN, computing pANN vectors at varying pK |
|
181 |
+ # no_cores <- detectCores()-1 |
|
182 |
+ if (is.null(num.cores)) { |
|
183 |
+ num.cores <- 1 |
|
184 |
+ } |
|
185 |
+ |
|
186 |
+ if (num.cores > 1) { |
|
187 |
+ cl <- parallel::makeCluster(num.cores) |
|
188 |
+ |
|
189 |
+ output2 <- withr::with_seed( |
|
190 |
+ seed, |
|
191 |
+ parallel::mclapply(as.list(1:length(pN)), |
|
192 |
+ FUN = .parallel_paramSweep, |
|
193 |
+ n.real.cells, |
|
194 |
+ real.cells, |
|
195 |
+ pK, |
|
196 |
+ pN, |
|
197 |
+ data, |
|
198 |
+ orig.commands, |
|
199 |
+ PCs, |
|
200 |
+ sct, mc.cores = num.cores, |
|
201 |
+ verbose = verbose, |
|
202 |
+ seed = seed, |
|
203 |
+ mc.set.seed = FALSE |
|
204 |
+ ) |
|
205 |
+ ) |
|
206 |
+ parallel::stopCluster(cl) |
|
207 |
+ } else { |
|
208 |
+ output2 <- lapply(as.list(1:length(pN)), |
|
209 |
+ FUN = .parallel_paramSweep, |
|
210 |
+ n.real.cells, |
|
211 |
+ real.cells, |
|
212 |
+ pK, |
|
213 |
+ pN, |
|
214 |
+ data, |
|
215 |
+ orig.commands, |
|
216 |
+ PCs, |
|
217 |
+ sct, |
|
218 |
+ seed = seed, |
|
219 |
+ verbose = verbose |
|
220 |
+ ) |
|
221 |
+ } |
|
222 |
+ |
|
223 |
+ ## Write parallelized output into list |
|
224 |
+ sweep.res.list <- list() |
|
225 |
+ list.ind <- 0 |
|
226 |
+ for (i in 1:length(output2)) { |
|
227 |
+ for (j in 1:length(output2[[i]])) { |
|
228 |
+ list.ind <- list.ind + 1 |
|
229 |
+ sweep.res.list[[list.ind]] <- output2[[i]][[j]] |
|
230 |
+ } |
|
231 |
+ } |
|
232 |
+ ## Assign names to list of results |
|
233 |
+ name.vec <- NULL |
|
234 |
+ for (j in 1:length(pN)) { |
|
235 |
+ name.vec <- c(name.vec, paste("pN", pN[j], "pK", pK, sep = "_")) |
|
236 |
+ } |
|
237 |
+ names(sweep.res.list) <- name.vec |
|
238 |
+ return(sweep.res.list) |
|
239 |
+ |
|
240 |
+} |
|
241 |
+ |
|
242 |
+.runDoubletFinder <- function(counts, seuratPcs, seuratRes, formationRate, |
|
243 |
+ seuratNfeatures, verbose = FALSE, |
|
244 |
+ nCores = NULL, seed = 12345) { |
|
245 |
+ |
|
246 |
+ ## Convert to sparse matrix if not already in that format |
|
247 |
+ counts <- .convertToMatrix(counts) |
|
248 |
+ colnames(counts) <- gsub("_", "-", colnames(counts)) |
|
249 |
+ |
|
250 |
+ seurat <- suppressWarnings(Seurat::CreateSeuratObject( |
|
251 |
+ counts = counts, |
|
252 |
+ project = "seurat", min.features = 0 |
|
253 |
+ )) |
|
254 |
+ seurat <- Seurat::NormalizeData( |
|
255 |
+ object = seurat, |
|
256 |
+ normalization.method = "LogNormalize", scale.factor = 10000, |
|
257 |
+ verbose = verbose |
|
258 |
+ ) |
|
259 |
+ seurat <- Seurat::FindVariableFeatures(seurat, |
|
260 |
+ selection.method = "vst", |
|
261 |
+ nfeatures = seuratNfeatures, |
|
262 |
+ verbose = verbose |
|
263 |
+ ) |
|
264 |
+ |
|
265 |
+ allGenes <- rownames(seurat) |
|
266 |
+ seurat <- Seurat::ScaleData(seurat, features = allGenes, verbose = verbose) |
|
267 |
+ |
|
268 |
+ numPc <- min(ncol(seurat@assays$RNA@scale.data) - 1, 50) |
|
269 |
+ seurat <- Seurat::RunPCA(seurat, |
|
270 |
+ features = |
|
271 |
+ Seurat::VariableFeatures(object = seurat), |
|
272 |
+ npcs = numPc, verbose = verbose, seed.use = seed |
|
273 |
+ ) |
|
274 |
+ seurat <- Seurat::FindNeighbors(seurat, dims = seuratPcs, verbose = verbose) |
|
275 |
+ seurat <- Seurat::FindClusters(seurat, |
|
276 |
+ resolution = seuratRes, |
|
277 |
+ verbose = verbose, |
|
278 |
+ random.seed = seed |
|
279 |
+ ) |
|
280 |
+ invisible(sweepResListSeurat <- .paramSweep(seurat, |
|
281 |
+ PCs = seuratPcs, |
|
282 |
+ sct = FALSE, |
|
283 |
+ num.cores = nCores, |
|
284 |
+ verbose = verbose, |
|
285 |
+ seed = seed |
|
286 |
+ )) |
|
287 |
+ sweepStatsSeurat <- .summarizeSweep(sweepResListSeurat, |
|
288 |
+ GT = FALSE |
|
289 |
+ ) |
|
290 |
+ bcmvnSeurat <- .find.pK(sweepStatsSeurat) |
|
291 |
+ pkOptimal <- as.numeric(as.matrix(bcmvnSeurat$pK[ |
|
292 |
+ which.max(bcmvnSeurat$MeanBC) |
|
293 |
+ ])) |
|
294 |
+ annotations <- seurat@meta.data$seurat_clusters |
|
295 |
+ homotypicProp <- .modelHomotypic(annotations) |
|
296 |
+ nExpPoi <- round(formationRate * ncol(seurat@assays$RNA)) |
|
297 |
+ seurat <- .doubletFinder_v3(seurat, |
|
298 |
+ PCs = seuratPcs, |
|
299 |
+ pN = 0.25, |
|
300 |
+ pK = pkOptimal, |
|
301 |
+ nExp = nExpPoi, |
|
302 |
+ reuse.pANN = FALSE, |
|
303 |
+ sct = FALSE |
|
304 |
+ ) |
|
305 |
+ names(seurat@meta.data)[6] <- "doubletFinderAnnScore" |
|
306 |
+ names(seurat@meta.data)[7] <- "doubletFinderLabel" |
|
307 |
+ return(seurat) |
|
308 |
+} |
|
309 |
+ |
|
310 |
+#' @title Generates a doublet score for each cell via doubletFinder |
|
311 |
+#' @description Uses doubletFinder to determine cells within the dataset |
|
312 |
+#' suspected to be doublets. |
|
313 |
+#' @param inSCE Input SingleCellExperiment object. Must contain a counts matrix |
|
314 |
+#' @param useAssay Indicate which assay to use. Default "counts". |
|
315 |
+#' @param sample Numeric vector. Each cell will be assigned a sample number. |
|
316 |
+#' @param seed Seed for the random number generator. Default 12345. |
|
317 |
+#' @param seuratNfeatures Integer. Number of highly variable genes to use. |
|
318 |
+#' Default 2000. |
|
319 |
+#' @param seuratPcs Numeric vector. The PCs used in seurat function to |
|
320 |
+#' determine number of clusters. Default 1:15. |
|
321 |
+#' @param seuratRes Numeric vector. The resolution parameter used in seurat, |
|
322 |
+#' which adjusts the number of clusters determined via the algorithm. |
|
323 |
+#' Default 1.5. |
|
324 |
+#' @param formationRate Doublet formation rate used within algorithm. |
|
325 |
+#' Default 0.075. |
|
326 |
+#' @param nCores Number of cores used for running the function. |
|
327 |
+#' @param verbose Boolean. Wheter to print messages from Seurat and |
|
328 |
+#' DoubletFinder. Default FALSE. |
|
329 |
+#' @return SingleCellExperiment object containing the |
|
330 |
+#' 'doublet_finder_doublet_score'. |
|
331 |
+#' @examples |
|
332 |
+#' \donttest{ |
|
333 |
+#' data(scExample, package = "singleCellTK") |
|
334 |
+#' sce <- sce[, colData(sce)$type != 'EmptyDroplet'] |
|
335 |
+#' sce <- runDoubletFinder(sce) |
|
336 |
+#' } |
|
337 |
+#' @export |
|
338 |
+#' @importFrom SummarizedExperiment colData colData<- |
|
339 |
+#' @importFrom SingleCellExperiment reducedDim<- |
|
340 |
+runDoubletFinder <- function(inSCE, |
|
341 |
+ useAssay = "counts", |
|
342 |
+ sample = NULL, |
|
343 |
+ seed = 12345, |
|
344 |
+ seuratNfeatures = 2000, |
|
345 |
+ seuratPcs = 1:15, |
|
346 |
+ seuratRes = 1.5, |
|
347 |
+ formationRate = 0.075, |
|
348 |
+ nCores = NULL, |
|
349 |
+ verbose = FALSE) { |
|
350 |
+ |
|
351 |
+ # argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame())) |
|
352 |
+ argsList <- mget(names(formals()), sys.frame(sys.nframe())) |
|
353 |
+ |
|
354 |
+ if (!is.null(sample)) { |
|
355 |
+ if (length(sample) != ncol(inSCE)) { |
|
356 |
+ stop("'sample' must be the same length as the number of columns in 'inSCE'") |
|
357 |
+ } |
|
358 |
+ } else { |
|
359 |
+ sample <- rep(1, ncol(inSCE)) |
|
360 |
+ } |
|
361 |
+ |
|
362 |
+ message(paste0(date(), " ... Running 'doubletFinder'")) |
|
363 |
+ |
|
364 |
+ doubletScore <- rep(NA, ncol(inSCE)) |
|
365 |
+ doubletLabel <- rep(NA, ncol(inSCE)) |
|
366 |
+ allSampleNumbers <- sort(unique(sample)) |
|
367 |
+ |
|
368 |
+ for (res in seuratRes) { |
|
369 |
+ output <- S4Vectors::DataFrame( |
|
370 |
+ row.names = colnames(inSCE), |
|
371 |
+ doubletFinder_doublet_score = numeric(ncol(inSCE)), |
|
372 |
+ doubletFinder_doublet_label = numeric(ncol(inSCE)) |
|
373 |
+ ) |
|
374 |
+ umapDims <- matrix(ncol = 2, |
|
375 |
+ nrow = ncol(inSCE)) |
|
376 |
+ rownames(umapDims) = colnames(inSCE) |
|
377 |
+ colnames(umapDims) = c("UMAP_1", "UMAP_2") |
|
378 |
+ |
|
379 |
+ ## Loop through each sample and run doubletFinder |
|
380 |
+ samples <- unique(sample) |
|
381 |
+ |
|
382 |
+ for (i in seq_len(length(samples))) { |
|
383 |
+ sceSampleInd <- sample == samples[i] |
|
384 |
+ sceSample <- inSCE[, sceSampleInd] |
|
385 |
+ mat <- SummarizedExperiment::assay(sceSample, i = useAssay) |
|
386 |
+ if(length(seuratPcs) > ncol(mat)){ |
|
387 |
+ seuratPcs <- 1:ncol(mat) |
|
388 |
+ } |
|
389 |
+ |
|
390 |
+ result <- suppressMessages(withr::with_seed( |
|
391 |
+ seed, |
|
392 |
+ .runDoubletFinder( |
|
393 |
+ counts = mat, |
|
394 |
+ seuratPcs = seuratPcs, |
|
395 |
+ seuratRes = res, |
|
396 |
+ seuratNfeatures = seuratNfeatures, |
|
397 |
+ formationRate = formationRate, |
|
398 |
+ nCores = nCores, |
|
399 |
+ verbose = verbose, |
|
400 |
+ seed = seed |
|
401 |
+ ) |
|
402 |
+ )) |
|
403 |
+ result <- suppressMessages(Seurat::RunUMAP(result, |
|
404 |
+ dims = 1:10, |
|
405 |
+ verbose = verbose, |
|
406 |
+ umap.method = "umap-learn", |
|
407 |
+ metric = "correlation")) |
|
408 |
+ |
|
409 |
+ seuratDims <- Seurat::Embeddings(result, reduction = "umap") |
|
410 |
+ umapDims[sceSampleInd, 1] <- seuratDims[,1] |
|
411 |
+ umapDims[sceSampleInd, 2] <- seuratDims[,2] |
|
412 |
+ output[sceSampleInd, 1] <- result@meta.data$doubletFinderAnnScore |
|
413 |
+ output[sceSampleInd, 2] <- result@meta.data$doubletFinderLabel |
|
414 |
+ } |
|
415 |
+ |
|
416 |
+ colnames(output) <- paste0(colnames(output), "_resolution_", res) |
|
417 |
+ |
|
418 |
+ argsList <- argsList[!names(argsList) %in% ("...")] |
|
419 |
+ inSCE@metadata$runDoubletFinder <- argsList[-1] |
|
420 |
+ inSCE@metadata$runDoubletFinder$packageVersion <- "2.0.2" |
|
421 |
+ colData(inSCE) <- cbind(colData(inSCE), output) |
|
422 |
+ reducedDim(inSCE,'doubletFinder_UMAP') <- umapDims |
|
423 |
+ } |
|
424 |
+ return(inSCE) |
|
425 |
+} |
|
426 |
+ |
|
427 |
+ |
|
428 |
+.summarizeSweep <- function(sweep.list, GT = FALSE, GT.calls = NULL) { |
|
429 |
+ #require(KernSmooth); require(ROCR) |
|
430 |
+ ## Set pN-pK param sweep ranges |
|
431 |
+ name.vec <- names(sweep.list) |
|
432 |
+ name.vec <- unlist(strsplit(name.vec, split="pN_")) |
|
433 |
+ name.vec <- name.vec[seq(2, length(name.vec), by=2)] |
|
434 |
+ name.vec <- unlist(strsplit(name.vec, split="_pK_")) |
|
435 |
+ pN <- as.numeric(unique(name.vec[seq(1, length(name.vec), by=2)])) |
|
436 |
+ pK <- as.numeric(unique(name.vec[seq(2, length(name.vec), by=2)])) |
|
437 |
+ |
|
438 |
+ ## Initialize data structure w/ or w/o AUC column, depending on whether ground-truth doublet classifications are available |
|
439 |
+ if (GT == TRUE) { |
|
440 |
+ sweep.stats <- as.data.frame(matrix(0L, nrow=length(sweep.list), ncol=4)) |
|
441 |
+ colnames(sweep.stats) <- c("pN","pK","AUC","BCreal") |
|
442 |
+ sweep.stats$pN <- factor(rep(pN, each=length(pK), levels = pN)) |
|
443 |
+ sweep.stats$pK <- factor(rep(pK, length(pN),levels = pK)) |
|
444 |
+ } |
|
445 |
+ |
|
446 |
+ if (GT == FALSE) { |
|
447 |
+ sweep.stats <- as.data.frame(matrix(0L, nrow=length(sweep.list), ncol=3)) |
|
448 |
+ colnames(sweep.stats) <- c("pN","pK","BCreal") |
|
449 |
+ sweep.stats$pN <- factor(rep(pN, each=length(pK), levels = pN)) |
|
450 |
+ sweep.stats$pK <- factor(rep(pK, length(pN),levels = pK)) |
|
451 |
+ } |
|
452 |
+ |
|
453 |
+ ## Perform pN-pK parameter sweep summary |
|
454 |
+ for (i in 1:length(sweep.list)) { |
|
455 |
+ res.temp <- sweep.list[[i]] |
|
456 |
+ |
|
457 |
+ ## Use gaussian kernel density estimation of pANN vector to compute bimodality coefficient |
|
458 |
+ gkde <- stats::approxfun(KernSmooth::bkde(res.temp$pANN, kernel="normal")) |
|
459 |
+ x <- seq(from=min(res.temp$pANN), to=max(res.temp$pANN), length.out=nrow(res.temp)) |
|
460 |
+ sweep.stats$BCreal[i] <- .bimodality_coefficient(gkde(x)) |
|
461 |
+ |
|
462 |
+ if (GT == FALSE) { next } |
|
463 |
+ |
|
464 |
+ ## If ground-truth doublet classifications are available, perform ROC analysis on logistic |
|
465 |
+ ## regression model trained using pANN vector |
|
466 |
+ meta <- as.data.frame(matrix(0L, nrow=nrow(res.temp), ncol=2)) |
|
467 |
+ meta[,1] <- GT.calls |
|
468 |
+ meta[,2] <- res.temp$pANN |
|
469 |
+ train.ind <- sample(1:nrow(meta), round(nrow(meta)/2), replace=FALSE) |
|
470 |
+ test.ind <- (1:nrow(meta))[-train.ind] |
|
471 |
+ colnames(meta) <- c("SinDub","pANN") |
|
472 |
+ meta$SinDub <- factor(meta$SinDub, levels = c("Doublet","Singlet")) |
|
473 |
+ model.lm <- stats::glm(SinDub ~ pANN, family=stats::binomial(link='logit'), data=meta, subset=train.ind) |
|
474 |
+ prob <- stats::predict(model.lm, newdata=meta[test.ind, ], type="response") |
|
475 |
+ ROCpred <- ROCR::prediction(predictions=prob, labels=meta$SinDub[test.ind]) |
|
476 |
+ perf.auc <- ROCR::performance(ROCpred, measure="auc") |
|
477 |
+ sweep.stats$AUC[i] <- perf.auc@y.values[[1]] |
|
478 |
+ } |
|
479 |
+ |
|
480 |
+ return(sweep.stats) |
|
481 |
+} |
|
482 |
+ |
|
483 |
+ |
|
484 |
+.bimodality_coefficient <- function(x) { |
|
485 |
+ G <- .skewness(x) |
|
486 |
+ sample.excess.kurtosis <- .kurtosis(x) |
|
487 |
+ K <- sample.excess.kurtosis |
|
488 |
+ n <- length(x) |
|
489 |
+ B <- ((G^2)+1)/(K+((3*((n-1)^2))/((n-2)*(n-3)))) |
|
490 |
+ return(B) |
|
491 |
+} |
|
492 |
+ |
|
493 |
+.skewness <- function(x) { |
|
494 |
+ n <- length(x) |
|
495 |
+ S <- (1/n)*sum((x-mean(x))^3)/(((1/n)*sum((x-mean(x))^2))^1.5) |
|
496 |
+ S <- S*(sqrt(n*(n-1)))/(n-2) |
|