Merge branch 'master' of https://github.com/salzcamino/singleCellTK

salzcamino authored on 19/10/2022 15:00:10
Showing 35 changed files

... ...
@@ -255,6 +255,8 @@ importFrom(SingleCellExperiment,SingleCellExperiment)
255 255
 importFrom(SingleCellExperiment,altExp)
256 256
 importFrom(SingleCellExperiment,counts)
257 257
 importFrom(SingleCellExperiment,reducedDim)
258
+importFrom(SingleCellExperiment,reducedDimNames)
259
+importFrom(SingleCellExperiment,reducedDims)
258 260
 importFrom(SingleCellExperiment,rowSubset)
259 261
 importFrom(SummarizedExperiment,"assay<-")
260 262
 importFrom(SummarizedExperiment,"assayNames<-")
... ...
@@ -262,6 +264,7 @@ importFrom(SummarizedExperiment,"assays<-")
262 264
 importFrom(SummarizedExperiment,"colData<-")
263 265
 importFrom(SummarizedExperiment,"rowData<-")
264 266
 importFrom(SummarizedExperiment,assay)
267
+importFrom(SummarizedExperiment,assayNames)
265 268
 importFrom(SummarizedExperiment,assays)
266 269
 importFrom(SummarizedExperiment,colData)
267 270
 importFrom(SummarizedExperiment,rowData)
... ...
@@ -1,10 +1,10 @@
1 1
 .constructSCE <- function(
2
-  matrices,
3
-  features,
4
-  barcodes,
5
-  metadata,
6
-  reducedDims) {
7
-
2
+    matrices,
3
+    features,
4
+    barcodes,
5
+    metadata,
6
+    reducedDims) {
7
+  
8 8
   sce <- SingleCellExperiment::SingleCellExperiment(assays = matrices)
9 9
   SummarizedExperiment::rowData(sce) <- S4Vectors::DataFrame(features)
10 10
   SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(barcodes)
... ...
@@ -34,7 +34,7 @@
34 34
   } else {
35 35
     fill <- NA
36 36
   }
37
-
37
+  
38 38
   ### combine row
39 39
   if (isTRUE(combineRow) & (!is.null(row))) {
40 40
     missRow <- row[!row %in% rownames(x)]
... ...
@@ -43,14 +43,14 @@
43 43
     if (!isTRUE(sparse)) {
44 44
       missMat <- as.matrix(missMat)
45 45
     }
46
-
46
+    
47 47
     mat <- rbind(matOrigin, missMat)
48 48
     if (anyDuplicated(rownames(mat))) {
49 49
       mat <- mat[!duplicated(rownames(mat)), ]
50 50
     }
51 51
     matOrigin <- mat[row, ]
52 52
   }
53
-
53
+  
54 54
   ### combine cols
55 55
   if (isTRUE(combineCol) & (!is.null(col))) {
56 56
     missCol <- col[!col %in% colnames(x)]
... ...
@@ -59,7 +59,7 @@
59 59
     if (!isTRUE(sparse)) {
60 60
       missMat <- as.matrix(missMat)
61 61
     }
62
-
62
+    
63 63
     mat <- cbind(matOrigin, missMat)
64 64
     if (anyDuplicated(colnames(mat))) {
65 65
       mat <- mat[, !duplicated(colnames(mat))]
... ...
@@ -76,12 +76,12 @@
76 76
     rw[['rownames']] <- rownames(rw)
77 77
     return(rw)
78 78
   })
79
-
79
+  
80 80
   ## Get merged rowData
81 81
   by.r <- unique(c('rownames', by.r))
82 82
   unionFe <- Reduce(function(r1, r2) merge(r1, r2, by=by.r, all=TRUE), feList)
83 83
   allGenes <- unique(unlist(lapply(feList, rownames)))
84
-
84
+  
85 85
   ## rowData
86 86
   newFe <- unionFe
87 87
   if (nrow(newFe) != length(allGenes)) {
... ...
@@ -102,7 +102,7 @@
102 102
     cD[['rownames']] <- rownames(cD)
103 103
     return(cD)
104 104
   })
105
-
105
+  
106 106
   by.c <- unique(c("rownames", by.c))
107 107
   unionCb <- Reduce(function(c1, c2) merge(c1, c2, by=by.c, all=TRUE), cbList)
108 108
   rownames(unionCb) <- unionCb[['rownames']]
... ...
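
The rowData and colData merging above boils down to Reduce() over pairwise merge() calls keyed by a 'rownames' column. A self-contained toy illustration of that pattern (the data frames and column names are made up for the sketch):

df1 <- data.frame(rownames = c("g1", "g2"), symbol = c("A", "B"))
df2 <- data.frame(rownames = c("g2", "g3"), biotype = c("coding", "lncRNA"))
feList <- list(df1, df2)
by.r <- unique(c("rownames", NULL))   # as in the merged-rowData step above
unionFe <- Reduce(function(r1, r2) merge(r1, r2, by = by.r, all = TRUE), feList)
unionFe   # g1-g3, with NA where a feature is missing from one input
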
@@ -118,19 +118,19 @@
118 118
   reduceList <- lapply(sceList, SingleCellExperiment::reducedDims)
119 119
   ## get every reducedDim that exists in at least one SCE
120 120
   UnionReducedDims <- unique(unlist(lapply(sceList, SingleCellExperiment::reducedDimNames)))
121
-
121
+  
122 122
   ## for each reducedDim, get union row/cols
123 123
   reducedDims <- list()
124 124
   for (reduceDim in UnionReducedDims) {
125 125
     x <- lapply(sceList, function(x) {if (reduceDim %in% SingleCellExperiment::reducedDimNames(x)) {SingleCellExperiment::reducedDim(x, reduceDim)}})
126 126
     reducedDims[[reduceDim]] <- .getDimUnion(x)
127 127
   }
128
-
128
+  
129 129
   ## Merge reducedDim for each SCE
130 130
   redList <- list()
131 131
   for (idx in seq_along(sceList)){
132 132
     redMat <- reduceList[[idx]]
133
-
133
+    
134 134
     for (DimName in UnionReducedDims) {
135 135
       if (DimName %in% names(redMat)) {
136 136
         redMat[[DimName]] <- .getMatUnion(reducedDims[[DimName]], redMat[[DimName]],
... ...
@@ -142,10 +142,10 @@
142 142
                                           dimnames = list(colnames(sceList[[idx]]), reducedDims[[DimName]][[2]]))
143 143
       }
144 144
     }
145
-
145
+    
146 146
     redList[[idx]] <- redMat
147 147
   }
148
-
148
+  
149 149
   return(redList)
150 150
 }
151 151
 
... ...
@@ -157,7 +157,7 @@
157 157
     unique(unlist(lapply(sceList, rownames))),
158 158
     unique(unlist(lapply(sceList, colnames)))
159 159
   )
160
-
160
+  
161 161
   asList <- list()
162 162
   for (idx in seq_along(assayList)){
163 163
     assay <- assayList[[idx]]
... ...
@@ -174,7 +174,7 @@
174 174
     }
175 175
     asList[[idx]] <- assay
176 176
   }
177
-
177
+  
178 178
   return(asList)
179 179
 }
180 180
 
... ...
@@ -197,27 +197,48 @@
197 197
 # }
198 198
 
199 199
 .mergeMetaSCE <- function(SCE_list) {
200
+  # Merge highest level metadata entries (except "sctk") by sample names in
201
+  # SCE_list. For analysis results in "sctk", merge by SCE_list sample name if 
202
+  # given "all_cells" in one object; otherwise, use existing sample names.
203
+  samples <- names(SCE_list)
200 204
   sampleMeta <- lapply(SCE_list, S4Vectors::metadata)
201 205
   metaNames <- unique(unlist(lapply(sampleMeta, names)))
206
+  sampleSctkMeta <- lapply(SCE_list, function(x) {S4Vectors::metadata(x)$sctk})
207
+  sctkMetaNames <- unique(unlist(lapply(sampleMeta, 
208
+                                        function(x) names(x$sctk))))
209
+  sampleMeta$sctk <- NULL
210
+  metaNames <- metaNames[!metaNames %in% c("sctk")]
202 211
   NewMeta <- list()
203
-
212
+  for (meta in sctkMetaNames) {
213
+    for (i in seq_along(sampleSctkMeta)) {
214
+      # `i` is for identifying each SCE object, usually matching a sample in 
215
+      # the pipeline. `sampleAvail` holds the samples stored in the metadata entry, in
216
+      # case users are merging merged objects. 
217
+      sampleAvail <- names(sampleSctkMeta[[i]][[meta]])
218
+      if (length(sampleAvail) == 1) {
219
+        NewMeta$sctk[[meta]][[samples[i]]] <- sampleSctkMeta[[i]][[meta]][[1]]
220
+      } else if (length(sampleAvail) > 1) {
221
+        names(sampleSctkMeta[[i]][[meta]]) <- 
222
+          paste(names(sampleSctkMeta)[i], 
223
+                names(sampleSctkMeta[[i]][[meta]]), sep = "_")
224
+        NewMeta$sctk[[meta]] <- c(NewMeta$sctk[[meta]], 
225
+                                  sampleSctkMeta[[i]][[meta]])
226
+      }
227
+    }
228
+  }
204 229
   for (meta in metaNames) {
205 230
     for (i in seq_along(sampleMeta)) {
206
-      NewMeta[[meta]][[i]] <- sampleMeta[[i]][[meta]]
231
+      NewMeta[[meta]][[samples[i]]] <- sampleMeta[[i]][[meta]]
207 232
     }
208 233
   }
209
-
210
-  if ("runBarcodeRanksMetaOutput" %in% metaNames) {
211
-    NewMeta[["runBarcodeRanksMetaOutput"]] <- unlist(NewMeta[["runBarcodeRanksMetaOutput"]])
212
-  }
213
-
234
+  
214 235
   if ("assayType" %in% metaNames) {
215 236
     assayType <- lapply(SCE_list, function(x){S4Vectors::metadata(x)$assayType})
216 237
     assayType <- BiocGenerics::Reduce(dplyr::union, assayType)
217 238
     
218 239
     NewMeta[["assayType"]] <- assayType
219 240
   }
220
-
241
+  
221 242
   return(NewMeta)
222 243
 }
223 244
 
... ...
@@ -241,7 +262,7 @@
241 262
 #' @export
242 263
 
243 264
 combineSCE <- function(sceList, by.r = NULL, by.c = NULL, combined = TRUE){
244
-  if(length(sceList) == 1){
265
+  if (length(sceList) == 1) {
245 266
     return(sceList[[1]])
246 267
   }
247 268
   ##  rowData
... ...
@@ -252,16 +273,21 @@ combineSCE <- function(sceList, by.r = NULL, by.c = NULL, combined = TRUE){
252 273
   redMatList <- .mergeRedimSCE(sceList)
253 274
   ## assay
254 275
   assayList <- .mergeAssaySCE(sceList)
255
-
276
+  samples <- names(sceList)
277
+  if (is.null(samples)) {
278
+    samples <- paste0("sample", seq_along(sceList))
279
+  }
256 280
   New_SCE <- list()
257 281
   for (i in seq(length(sceList))) {
258 282
     ## create new sce
259
-    New_SCE[[i]] <- .constructSCE(matrices = assayList[[i]], features = newFeList,
260
-                                  barcodes = newCbList[[i]],
261
-                                  metadata = S4Vectors::metadata(sceList[[i]]),
262
-                                  reducedDims = redMatList[[i]])
283
+    sampleName <- samples[i]
284
+    New_SCE[[sampleName]] <- .constructSCE(matrices = assayList[[i]], 
285
+                                           features = newFeList,
286
+                                           barcodes = newCbList[[i]],
287
+                                           metadata = S4Vectors::metadata(sceList[[i]]),
288
+                                           reducedDims = redMatList[[i]])
263 289
   }
264
-
290
+  
265 291
   if (isTRUE(combined)) {
266 292
     sce <- do.call(SingleCellExperiment::cbind, New_SCE)
267 293
     meta <- .mergeMetaSCE(New_SCE)
... ...
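
As a usage sketch of the combineSCE() changes above (the object names qc1/qc2 are hypothetical), the names of the input list become the sample names used throughout the merge:

# Two per-sample SingleCellExperiment objects processed beforehand (assumed).
sceList <- list(sampleA = qc1, sampleB = qc2)
# Unnamed lists fall back to "sample1", "sample2", ... as added above.
merged <- combineSCE(sceList, by.r = NULL, by.c = NULL, combined = TRUE)
# Per-sample tool parameters are gathered under the "sctk" metadata entry, e.g.
# S4Vectors::metadata(merged)$sctk$runDoubletFinder$sampleA, provided the tool
# was run on each input first.
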
@@ -2,251 +2,255 @@
2 2
 ## (https://github.com/chris-mcginnis-ucsf/DoubletFinder)
3 3
 
4 4
 .parallel_paramSweep <- function(n, n.real.cells, real.cells, pK, pN, data,
5
-                                   orig.commands, PCs, sct, verbose, seed) {
6
-    sweep.res.list <- list()
7
-    list.ind <- 0
8
-
9
-    ## Make merged real-artificial data
5
+                                 orig.commands, PCs, sct, verbose, seed) {
6
+  sweep.res.list <- list()
7
+  list.ind <- 0
8
+  
9
+  ## Make merged real-artificial data
10
+  if (verbose) {
11
+    message(date(), " ...   Creating artificial doublets for pN = ",
12
+            pN[n] * 100, "%")
13
+  }
14
+  n_doublets <- ceiling(n.real.cells / (1 - pN[n]) - n.real.cells)
15
+  real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
16
+  real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
17
+  doublets <- (data[, real.cells1] + data[, real.cells2]) / 2
18
+  colnames(doublets) <- paste("X", seq(n_doublets), sep = "")
19
+  data_wdoublets <- cbind(data, doublets)
20
+  
21
+  ## Pre-process Seurat object
22
+  if (sct == FALSE) {
10 23
     if (verbose) {
11
-        message(date(), " ...   Creating artificial doublets for pN = ",
12
-                    pN[n] * 100, "%")
13
-    }
14
-    n_doublets <- ceiling(n.real.cells / (1 - pN[n]) - n.real.cells)
15
-    real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
16
-    real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
17
-    doublets <- (data[, real.cells1] + data[, real.cells2]) / 2
18
-    colnames(doublets) <- paste("X", seq(n_doublets), sep = "")
19
-    data_wdoublets <- cbind(data, doublets)
20
-
21
-    ## Pre-process Seurat object
22
-    if (sct == FALSE) {
23
-        if (verbose) {
24
-          message(date(), " ...   Creating Seurat object")
25
-        }
26
-        seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
27
-
28
-        if (verbose) {
29
-          message(date(), " ...   Normalizing Seurat object")
30
-        }
31
-        seu_wdoublets <- Seurat::NormalizeData(seu_wdoublets,
32
-                                       normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
33
-                                       scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
34
-                                       margin = orig.commands$NormalizeData.RNA@params$margin,
35
-                                       verbose = verbose
36
-        )
37
-
38
-        if (verbose) {
39
-          message(date(), " ...   Finding variable genes")
40
-        }
41
-        seu_wdoublets <- Seurat::FindVariableFeatures(seu_wdoublets,
42
-                                              selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
43
-                                              loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
44
-                                              clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
45
-                                              mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
46
-                                              dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
47
-                                              num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
48
-                                              binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
49
-                                              nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
50
-                                              mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
51
-                                              dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff,
52
-                                              verbose = verbose
53
-        )
54
-
55
-        if (verbose) {
56
-          message(date(), " ...   Scaling data")
57
-        }
58
-        seu_wdoublets <- Seurat::ScaleData(
59
-            seu_wdoublets,
60
-            features = orig.commands$ScaleData.RNA$features,
61
-            model.use = orig.commands$ScaleData.RNA$model.use,
62
-            do.scale = orig.commands$ScaleData.RNA$do.scale,
63
-            do.center = orig.commands$ScaleData.RNA$do.center,
64
-            scale.max = orig.commands$ScaleData.RNA$scale.max,
65
-            block.size = orig.commands$ScaleData.RNA$block.size,
66
-            min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block,
67
-            verbose = verbose
68
-        )
69
-
70
-        if (verbose) {
71
-          message(date(), " ...   Running PCA")
72
-        }
73
-        seu_wdoublets <- Seurat::RunPCA(
74
-            seu_wdoublets,
75
-            features = orig.commands$ScaleData.RNA$features,
76
-            npcs = length(PCs),
77
-            rev.pca = orig.commands$RunPCA.RNA$rev.pca,
78
-            weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
79
-            seed.use = seed,
80
-            verbose = FALSE
81
-        )
82
-    }
83
-
84
-    if (sct == TRUE) {
85
-        if (verbose) {
86
-            message(date(), " ...   Creating Seurat object")
87
-        }
88
-        seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
89
-
90
-        if (verbose) {
91
-          message(date(), " ...   Running SCTransform")
92
-        }
93
-        seu_wdoublets <- Seurat::SCTransform(seu_wdoublets)
94
-
95
-        if (verbose) {
96
-          message(date(), " ...   Running PCA")
97
-        }
98
-        seu_wdoublets <- Seurat::RunPCA(seu_wdoublets,
99
-                                npcs = length(PCs), seed.use = seed,
100
-                                verbose = verbose
101
-        )
24
+      message(date(), " ...   Creating Seurat object")
102 25
     }
103
-
104
-    ## Compute PC distance matrix
26
+    seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
27
+    
105 28
     if (verbose) {
106
-      message(date(), " ...   Calculating PC distance matrix")
29
+      message(date(), " ...   Normalizing Seurat object")
107 30
     }
108
-    nCells <- nrow(seu_wdoublets@meta.data)
109
-    pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[, PCs]
110
-    seu_wdoublets <- NULL
111
-    gc()
112
-    dist.mat <- fields::rdist(pca.coord)[, seq(n.real.cells)]
113
-
114
-    ## Pre-order PC distance matrix prior to iterating across pK
115
-    ## for pANN computations
31
+    seu_wdoublets <- Seurat::NormalizeData(
32
+      seu_wdoublets,
33
+      normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
34
+      scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
35
+      margin = orig.commands$NormalizeData.RNA@params$margin,
36
+      verbose = verbose
37
+    )
38
+    
116 39
     if (verbose) {
117
-      message(date(), " ...   Defining neighborhoods")
40
+      message(date(), " ...   Finding variable genes")
118 41
     }
119
-
120
-    for (i in seq(n.real.cells)) {
121
-        dist.mat[, i] <- order(dist.mat[, i])
42
+    seu_wdoublets <- Seurat::FindVariableFeatures(
43
+      seu_wdoublets,
44
+      selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
45
+      loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
46
+      clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
47
+      mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
48
+      dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
49
+      num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
50
+      binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
51
+      nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
52
+      mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
53
+      dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff,
54
+      verbose = verbose
55
+    )
56
+    
57
+    if (verbose) {
58
+      message(date(), " ...   Scaling data")
122 59
     }
123
-
124
-    ## Trim PC distance matrix for faster manipulations
125
-    ind <- round(nCells * max(pK)) + 5
126
-    dist.mat <- dist.mat[seq(ind), ]
127
-
128
-    ## Compute pANN across pK sweep
129
-
60
+    seu_wdoublets <- Seurat::ScaleData(
61
+      seu_wdoublets,
62
+      features = orig.commands$ScaleData.RNA$features,
63
+      model.use = orig.commands$ScaleData.RNA$model.use,
64
+      do.scale = orig.commands$ScaleData.RNA$do.scale,
65
+      do.center = orig.commands$ScaleData.RNA$do.center,
66
+      scale.max = orig.commands$ScaleData.RNA$scale.max,
67
+      block.size = orig.commands$ScaleData.RNA$block.size,
68
+      min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block,
69
+      verbose = verbose
70
+    )
71
+    
130 72
     if (verbose) {
131
-      message(date(), " ...   Computing pANN across all pK")
73
+      message(date(), " ...   Running PCA")
132 74
     }
133
-    for (k in seq_along(pK)) {
134
-        if (verbose) {
135
-            message(date(), " ...   pK = ", pK[k])
136
-        }
137
-        pk.temp <- round(nCells * pK[k])
138
-        pANN <- as.data.frame(matrix(0L, nrow = n.real.cells, ncol = 1))
139
-        colnames(pANN) <- "pANN"
140
-        rownames(pANN) <- real.cells
141
-        list.ind <- list.ind + 1
142
-
143
-        for (i in seq(n.real.cells)) {
144
-            neighbors <- dist.mat[2:(pk.temp + 1), i]
145
-            pANN$pANN[i] <- length(which(neighbors > n.real.cells)) / pk.temp
146
-        }
147
-
148
-        sweep.res.list[[list.ind]] <- pANN
75
+    seu_wdoublets <- Seurat::RunPCA(
76
+      seu_wdoublets,
77
+      features = orig.commands$ScaleData.RNA$features,
78
+      npcs = length(PCs),
79
+      rev.pca = orig.commands$RunPCA.RNA$rev.pca,
80
+      weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
81
+      seed.use = seed,
82
+      verbose = FALSE
83
+    )
84
+  }
85
+  
86
+  if (sct == TRUE) {
87
+    if (verbose) {
88
+      message(date(), " ...   Creating Seurat object")
149 89
     }
150
-
151
-    return(sweep.res.list)
152
-}
153
-
154
-.paramSweep <- function(seu, PCs = seq(10), sct = FALSE,
155
-                          verbose = FALSE, num.cores, seed) {
156
-    ## Set pN-pK param sweep ranges
157
-    pK <- c(0.0005, 0.001, 0.005, seq(0.01, 0.3, by = 0.01))
158
-    pN <- seq(0.05, 0.3, by = 0.05)
159
-    ## Remove pK values with too few cells
160
-    min.cells <- round(nrow(seu@meta.data) / (1 - 0.05) - nrow(seu@meta.data))
161
-    pK.test <- round(pK * min.cells)
162
-    pK <- pK[which(pK.test >= 1)]
163
-
164
-    ## Extract pre-processing parameters from original data analysis workflow
165
-    orig.commands <- seu@commands
166
-
167
-    ## Down-sample cells to 10000 (when applicable) for computational efficiency
168
-    if (nrow(seu@meta.data) > 10000) {
169
-        real.cells <- rownames(seu@meta.data)[sample(seq(nrow(seu@meta.data)),
170
-                                                     10000, replace = FALSE)]
171
-        data <- seu@assays$RNA@counts[, real.cells]
172
-        n.real.cells <- ncol(data)
90
+    seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
91
+    
92
+    if (verbose) {
93
+      message(date(), " ...   Running SCTransform")
173 94
     }
174
-
175
-    if (nrow(seu@meta.data) <= 10000) {
176
-        real.cells <- rownames(seu@meta.data)
177
-        data <- seu@assays$RNA@counts
178
-        n.real.cells <- ncol(data)
95
+    seu_wdoublets <- Seurat::SCTransform(seu_wdoublets)
96
+    
97
+    if (verbose) {
98
+      message(date(), " ...   Running PCA")
179 99
     }
180
-    ## Iterate through pN, computing pANN vectors at varying pK
181
-    # no_cores <- detectCores()-1
182
-    if (is.null(num.cores)) {
183
-        num.cores <- 1
100
+    seu_wdoublets <- Seurat::RunPCA(seu_wdoublets,
101
+                                    npcs = length(PCs), seed.use = seed,
102
+                                    verbose = verbose
103
+    )
104
+  }
105
+  
106
+  ## Compute PC distance matrix
107
+  if (verbose) {
108
+    message(date(), " ...   Calculating PC distance matrix")
109
+  }
110
+  nCells <- ncol(seu_wdoublets)
111
+  pca.coord <- Seurat::Reductions(seu_wdoublets, "pca")@cell.embeddings[, PCs]
112
+  #seu_wdoublets <- NULL
113
+  rm(seu_wdoublets)
114
+  gc()
115
+  dist.mat <- fields::rdist(pca.coord)[, seq(n.real.cells)]
116
+  
117
+  ## Pre-order PC distance matrix prior to iterating across pK
118
+  ## for pANN computations
119
+  if (verbose) {
120
+    message(date(), " ...   Defining neighborhoods")
121
+  }
122
+  
123
+  for (i in seq(n.real.cells)) {
124
+    dist.mat[, i] <- order(dist.mat[, i])
125
+  }
126
+  
127
+  ## Trim PC distance matrix for faster manipulations
128
+  ind <- round(nCells * max(pK)) + 5
129
+  dist.mat <- dist.mat[seq(ind), ]
130
+  
131
+  ## Compute pANN across pK sweep
132
+  
133
+  if (verbose) {
134
+    message(date(), " ...   Computing pANN across all pK")
135
+  }
136
+  for (k in seq_along(pK)) {
137
+    if (verbose) {
138
+      message(date(), " ...   pK = ", pK[k])
184 139
     }
185
-
186
-    if (num.cores > 1) {
187
-        cl <- parallel::makeCluster(num.cores)
188
-
189
-        output2 <- withr::with_seed(
190
-            seed,
191
-            parallel::mclapply(as.list(seq_along(pN)),
192
-                     FUN = .parallel_paramSweep,
193
-                     n.real.cells,
194
-                     real.cells,
195
-                     pK,
196
-                     pN,
197
-                     data,
198
-                     orig.commands,
199
-                     PCs,
200
-                     sct, mc.cores = num.cores,
201
-                     verbose = verbose,
202
-                     seed = seed,
203
-                     mc.set.seed = FALSE
204
-            )
205
-        )
206
-        parallel::stopCluster(cl)
207
-    } else {
208
-        output2 <- lapply(as.list(seq_along(pN)),
209
-                          FUN = .parallel_paramSweep,
210
-                          n.real.cells,
211
-                          real.cells,
212
-                          pK,
213
-                          pN,
214
-                          data,
215
-                          orig.commands,
216
-                          PCs,
217
-                          sct,
218
-                          seed = seed,
219
-                          verbose = verbose
220
-        )
140
+    pk.temp <- round(nCells * pK[k])
141
+    pANN <- as.data.frame(matrix(0L, nrow = n.real.cells, ncol = 1))
142
+    colnames(pANN) <- "pANN"
143
+    rownames(pANN) <- real.cells
144
+    list.ind <- list.ind + 1
145
+    
146
+    for (i in seq(n.real.cells)) {
147
+      neighbors <- dist.mat[2:(pk.temp + 1), i]
148
+      pANN$pANN[i] <- length(which(neighbors > n.real.cells)) / pk.temp
221 149
     }
150
+    
151
+    sweep.res.list[[list.ind]] <- pANN
152
+  }
153
+  
154
+  return(sweep.res.list)
155
+}
222 156
 
223
-    ## Write parallelized output into list
224
-    sweep.res.list <- list()
225
-    list.ind <- 0
226
-    for (i in seq_along(output2)) {
227
-        for (j in seq_along(output2[[i]])) {
228
-            list.ind <- list.ind + 1
229
-            sweep.res.list[[list.ind]] <- output2[[i]][[j]]
230
-        }
231
-    }
232
-    ## Assign names to list of results
233
-    name.vec <- NULL
234
-    for (j in seq_along(pN)) {
235
-        name.vec <- c(name.vec, paste("pN", pN[j], "pK", pK, sep = "_"))
157
+.paramSweep <- function(seu, PCs = seq(10), sct = FALSE,
158
+                        verbose = FALSE, num.cores, seed) {
159
+  ## Set pN-pK param sweep ranges
160
+  pK <- c(0.0005, 0.001, 0.005, seq(0.01, 0.3, by = 0.01))
161
+  pN <- seq(0.05, 0.3, by = 0.05)
162
+  ## Remove pK values with too few cells
163
+  min.cells <- round(nrow(seu[[]]) / (1 - 0.05) - nrow(seu[[]]))
164
+  pK.test <- round(pK * min.cells)
165
+  pK <- pK[which(pK.test >= 1)]
166
+  
167
+  ## Extract pre-processing parameters from original data analysis workflow
168
+  orig.commands <- seu@commands
169
+  
170
+  ## Down-sample cells to 10000 (when applicable) for computational efficiency
171
+  if (nrow(seu[[]]) > 10000) {
172
+    real.cells <- rownames(seu[[]])[sample(seq(nrow(seu[[]])),
173
+                                           10000, replace = FALSE)]
174
+    data <- Seurat::GetAssayData(seu, slot = "counts", 
175
+                                 assay = "RNA")[, real.cells]
176
+    n.real.cells <- ncol(data)
177
+  }
178
+  
179
+  if (ncol(seu) <= 10000) {
180
+    real.cells <- colnames(seu)
181
+    data <- Seurat::GetAssayData(seu, slot = "counts", assay = "RNA")
182
+    n.real.cells <- ncol(data)
183
+  }
184
+  ## Iterate through pN, computing pANN vectors at varying pK
185
+  # no_cores <- detectCores()-1
186
+  if (is.null(num.cores)) {
187
+    num.cores <- 1
188
+  }
189
+  
190
+  if (num.cores > 1) {
191
+    cl <- parallel::makeCluster(num.cores)
192
+    
193
+    output2 <- withr::with_seed(
194
+      seed,
195
+      parallel::mclapply(as.list(seq_along(pN)),
196
+                         FUN = .parallel_paramSweep,
197
+                         n.real.cells,
198
+                         real.cells,
199
+                         pK,
200
+                         pN,
201
+                         data,
202
+                         orig.commands,
203
+                         PCs,
204
+                         sct, mc.cores = num.cores,
205
+                         verbose = verbose,
206
+                         seed = seed,
207
+                         mc.set.seed = FALSE
208
+      )
209
+    )
210
+    parallel::stopCluster(cl)
211
+  } else {
212
+    output2 <- lapply(as.list(seq_along(pN)),
213
+                      FUN = .parallel_paramSweep,
214
+                      n.real.cells,
215
+                      real.cells,
216
+                      pK,
217
+                      pN,
218
+                      data,
219
+                      orig.commands,
220
+                      PCs,
221
+                      sct,
222
+                      seed = seed,
223
+                      verbose = verbose
224
+    )
225
+  }
226
+  
227
+  ## Write parallelized output into list
228
+  sweep.res.list <- list()
229
+  list.ind <- 0
230
+  for (i in seq_along(output2)) {
231
+    for (j in seq_along(output2[[i]])) {
232
+      list.ind <- list.ind + 1
233
+      sweep.res.list[[list.ind]] <- output2[[i]][[j]]
236 234
     }
237
-    names(sweep.res.list) <- name.vec
238
-    return(sweep.res.list)
239
-
235
+  }
236
+  ## Assign names to list of results
237
+  name.vec <- NULL
238
+  for (j in seq_along(pN)) {
239
+    name.vec <- c(name.vec, paste("pN", pN[j], "pK", pK, sep = "_"))
240
+  }
241
+  names(sweep.res.list) <- name.vec
242
+  return(sweep.res.list)
243
+  
240 244
 }
241 245
 
242 246
 .runDoubletFinder <- function(counts, seuratPcs, seuratRes, formationRate,
243 247
                               seuratNfeatures, verbose = FALSE,
244 248
                               nCores = NULL, seed = 12345) {
245
-
249
+  
246 250
   ## Convert to sparse matrix if not already in that format
247 251
   counts <- .convertToMatrix(counts)
248 252
   colnames(counts) <- gsub("_", "-", colnames(counts))
249
-
253
+  
250 254
   seurat <- suppressWarnings(Seurat::CreateSeuratObject(
251 255
     counts = counts,
252 256
     project = "seurat", min.features = 0
... ...
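
In the parameter sweep above, pANN for each real cell is the fraction of its pk.temp nearest neighbours in PCA space that are artificial doublets, i.e. neighbour indices greater than n.real.cells. A self-contained toy illustration of that counting step:

n.real.cells <- 5                                 # toy data: 5 real cells, 3 artificial doublets
ordered.neighbors <- c(1, 7, 2, 6, 8, 3, 4, 5)    # one pre-ordered column of dist.mat
pk.temp <- 3
neighbors <- ordered.neighbors[2:(pk.temp + 1)]   # drop self, keep pk.temp nearest
pANN <- length(which(neighbors > n.real.cells)) / pk.temp
pANN                                              # 2/3: two of the three neighbours are artificial
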
@@ -257,51 +261,53 @@
257 261
     verbose = verbose
258 262
   )
259 263
   seurat <- Seurat::FindVariableFeatures(seurat,
260
-    selection.method = "vst",
261
-    nfeatures = seuratNfeatures,
262
-    verbose = verbose
264
+                                         selection.method = "vst",
265
+                                         nfeatures = seuratNfeatures,
266
+                                         verbose = verbose
263 267
   )
264
-
268
+  
265 269
   allGenes <- rownames(seurat)
266 270
   seurat <- Seurat::ScaleData(seurat, features = allGenes, verbose = verbose)
267
-
268
-  numPc <- min(ncol(seurat@assays$RNA@scale.data) - 1, 50)
271
+  
272
+  numPc <- min(nrow(Seurat::GetAssayData(seurat, slot = "scale.data", 
273
+                                         assay = "RNA")) - 1, 
274
+               50)
269 275
   seurat <- Seurat::RunPCA(seurat,
270
-    features =
271
-      Seurat::VariableFeatures(object = seurat),
272
-    npcs = numPc, verbose = verbose, seed.use = seed
276
+                           features =
277
+                             Seurat::VariableFeatures(object = seurat),
278
+                           npcs = numPc, verbose = verbose, seed.use = seed
273 279
   )
274 280
   seurat <- Seurat::FindNeighbors(seurat, dims = seuratPcs, verbose = verbose)
275 281
   seurat <- Seurat::FindClusters(seurat,
276
-    resolution = seuratRes,
277
-    verbose = verbose,
278
-    random.seed = seed
282
+                                 resolution = seuratRes,
283
+                                 verbose = verbose,
284
+                                 random.seed = seed
279 285
   )
280 286
   invisible(sweepResListSeurat <- .paramSweep(seurat,
281
-    PCs = seuratPcs,
282
-    sct = FALSE,
283
-    num.cores = nCores,
284
-    verbose = verbose,
285
-    seed = seed
287
+                                              PCs = seuratPcs,
288
+                                              sct = FALSE,
289
+                                              num.cores = nCores,
290
+                                              verbose = verbose,
291
+                                              seed = seed
286 292
   ))
287 293
   sweepStatsSeurat <- .summarizeSweep(sweepResListSeurat,
288
-    GT = FALSE
294
+                                      GT = FALSE
289 295
   )
290 296
   bcmvnSeurat <- .find.pK(sweepStatsSeurat)
291 297
   pkOptimal <- as.numeric(as.matrix(bcmvnSeurat$pK[
292 298
     which.max(bcmvnSeurat$MeanBC)
293 299
   ]))
294
-  annotations <- seurat@meta.data$seurat_clusters
295
-  homotypicProp <- .modelHomotypic(annotations)
296
-  nExpPoi <- round(formationRate * ncol(seurat@assays$RNA))
300
+  #annotations <- seurat[[]]$seurat_clusters
301
+  #homotypicProp <- .modelHomotypic(annotations)
302
+  nExpPoi <- round(formationRate * ncol(seurat))
297 303
   seurat <- .doubletFinder_v3(seurat,
298
-    PCs = seuratPcs,
299
-    pN = 0.25,
300
-    pK = pkOptimal,
301
-    nExp = nExpPoi,
302
-    reuse.pANN = FALSE,
303
-    sct = FALSE,
304
-    verbose = FALSE
304
+                              PCs = seuratPcs,
305
+                              pN = 0.25,
306
+                              pK = pkOptimal,
307
+                              nExp = nExpPoi,
308
+                              reuse.pANN = FALSE,
309
+                              sct = FALSE,
310
+                              verbose = FALSE
305 311
   )
306 312
   names(seurat@meta.data)[6] <- "doubletFinderAnnScore"
307 313
   names(seurat@meta.data)[7] <- "doubletFinderLabel"
... ...
@@ -311,34 +317,39 @@
311 317
 #' @title Generates a doublet score for each cell via doubletFinder
312 318
 #' @description Uses doubletFinder to determine cells within the dataset
313 319
 #'  suspected to be doublets.
314
-#' @param inSCE Input SingleCellExperiment object. Must contain a counts matrix
315
-#' @param useAssay  Indicate which assay to use. Default "counts".
316
-#' @param sample Numeric vector. Each cell will be assigned a sample number.
317
-#' @param seed Seed for the random number generator. Default 12345.
320
+#' @param inSCE A \linkS4class{SingleCellExperiment} object.
321
+#' @param sample Character vector or colData variable name. Indicates which 
322
+#' sample each cell belongs to. Default \code{NULL}.
323
+#' @param useAssay  A string specifying which assay in the SCE to use. Default 
324
+#' \code{"counts"}.
325
+#' @param seed Seed for the random number generator, can be set to \code{NULL}. 
326
+#' Default \code{12345}.
318 327
 #' @param seuratNfeatures Integer. Number of highly variable genes to use.
319
-#'  Default 2000.
328
+#' Default \code{2000}.
320 329
 #' @param seuratPcs Numeric vector. The PCs used in seurat function to
321
-#'   determine number of clusters. Default 1:15.
322
-#' @param seuratRes Numeric vector. The resolution parameter used in seurat,
323
-#'  which adjusts the number of clusters determined via the algorithm.
324
-#'  Default 1.5.
325
-#' @param formationRate Doublet formation rate used within algorithm.
326
-#'  Default 0.075.
327
-#' @param nCores Number of cores used for running the function.
328
-#' @param verbose Boolean. Wheter to print messages from Seurat and
329
-#'  DoubletFinder. Default FALSE.
330
-#' @return SingleCellExperiment object containing the
331
-#'  'doublet_finder_doublet_score'.
330
+#' determine number of clusters. Default \code{1:15}.
331
+#' @param seuratRes Numeric vector. The resolution parameter used in Seurat,
332
+#' which adjusts the number of clusters determined via the algorithm. Default 
333
+#' \code{1.5}.
334
+#' @param formationRate Doublet formation rate used within algorithm. Default 
335
+#' \code{0.075}.
336
+#' @param nCores Number of cores used for running the function. Default 
337
+#' \code{NULL}.
338
+#' @param verbose Boolean. Whether to print messages from Seurat and 
339
+#' DoubletFinder. Default \code{FALSE}.
340
+#' @return \linkS4class{SingleCellExperiment} object containing the
341
+#' \code{doublet_finder_doublet_score} variable in the \code{colData} slot.
342
+#' @seealso \code{\link{runCellQC}}, \code{\link{plotDoubletFinderResults}}
332 343
 #' @examples
333 344
 #' data(scExample, package = "singleCellTK")
334 345
 #' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
335 346
 #' sce <- runDoubletFinder(sce)
336 347
 #' @export
337
-#' @importFrom SummarizedExperiment colData colData<-
348
+#' @importFrom SummarizedExperiment colData colData<- assayNames assayNames<-
338 349
 #' @importFrom SingleCellExperiment reducedDim<-
339 350
 runDoubletFinder <- function(inSCE,
340
-                             useAssay = "counts",
341 351
                              sample = NULL,
352
+                             useAssay = "counts",
342 353
                              seed = 12345,
343 354
                              seuratNfeatures = 2000,
344 355
                              seuratPcs = seq(15),
... ...
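
A minimal per-sample call following the @examples documented above; the sample value below is an assumed colData column and should be replaced by a real per-cell sample variable:

library(singleCellTK)
data(scExample, package = "singleCellTK")
sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
# "sample" is a hypothetical colData column; scores and labels are appended to
# colData with a "_resolution_<res>" suffix, and the call's parameters are
# recorded under metadata(sce)$sctk$runDoubletFinder.
sce <- runDoubletFinder(sce, sample = "sample", useAssay = "counts",
                        seuratRes = 1.5, seed = 12345)
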
@@ -346,28 +357,25 @@ runDoubletFinder <- function(inSCE,
346 357
                              formationRate = 0.075,
347 358
                              nCores = NULL,
348 359
                              verbose = FALSE) {
349
-
350
-  # argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
360
+  
351 361
   argsList <- mget(names(formals()), sys.frame(sys.nframe()))
352
-
362
+  argsList <- argsList[!names(argsList) %in% c("inSCE")]
363
+  argsList$packageVersion <- "2.0.2"
353 364
   tempSCE <- inSCE
354
-  SummarizedExperiment::assayNames(inSCE)[which(useAssay %in% SummarizedExperiment::assayNames(inSCE))] <- "counts"
355
-  useAssay <- "counts"
356
-
357
-  if (!is.null(sample)) {
358
-    if (length(sample) != ncol(inSCE)) {
359
-      stop("'sample' must be the same length as the number of columns in 'inSCE'")
360
-    }
361
-  } else {
365
+  #assayNames(inSCE)[which(useAssay %in% assayNames(inSCE))] <- "counts"
366
+  #useAssay <- "counts"
367
+  
368
+  sample <- .manageCellVar(inSCE, var = sample)
369
+  if (is.null(sample)) {
362 370
     sample <- rep(1, ncol(inSCE))
363 371
   }
364
-
372
+  
365 373
   message(date(), " ... Running 'doubletFinder'")
366
-
367
-  doubletScore <- rep(NA, ncol(inSCE))
368
-  doubletLabel <- rep(NA, ncol(inSCE))
369
-  allSampleNumbers <- sort(unique(sample))
370
-
374
+  
375
+  #doubletScore <- rep(NA, ncol(inSCE))
376
+  #doubletLabel <- rep(NA, ncol(inSCE))
377
+  #allSampleNumbers <- sort(unique(sample))
378
+  
371 379
   for (res in seuratRes) {
372 380
     output <- S4Vectors::DataFrame(
373 381
       row.names = colnames(inSCE),
... ...
@@ -378,21 +386,19 @@ runDoubletFinder <- function(inSCE,
378 386
                        nrow = ncol(inSCE))
379 387
     rownames(umapDims) = colnames(inSCE)
380 388
     colnames(umapDims) = c("UMAP_1", "UMAP_2")
381
-
389
+    
382 390
     ## Loop through each sample and run doubletFinder
383 391
     samples <- unique(sample)
384
-
385
-    for (i in seq_len(length(samples))) {
386
-      sceSampleInd <- sample == samples[i]
392
+    
393
+    for (s in samples) {
394
+      sceSampleInd <- sample == s
387 395
       sceSample <- inSCE[, sceSampleInd]
388 396
       mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
389
-      if(length(seuratPcs) > ncol(mat)){
397
+      if (length(seuratPcs) > ncol(mat)) {
390 398
         seuratPcs <- seq(ncol(mat))
391 399
       }
392
-
393
-      result <- suppressMessages(withr::with_seed(
394
-        seed,
395
-        .runDoubletFinder(
400
+      .withSeed(seed, {
401
+        result <- .runDoubletFinder(
396 402
           counts = mat,
397 403
           seuratPcs = seuratPcs,
398 404
           seuratRes = res,
... ...
@@ -402,34 +408,39 @@ runDoubletFinder <- function(inSCE,
402 408
           verbose = verbose,
403 409
           seed = seed
404 410
         )
405
-      ))
411
+      })
412
+      
406 413
       result <- suppressMessages(Seurat::RunUMAP(result,
407 414
                                                  dims = seq(10),
408 415
                                                  verbose = verbose,
409 416
                                                  umap.method = "uwot"))
410
-
417
+      
411 418
       seuratDims <- Seurat::Embeddings(result, reduction = "umap")
412 419
       umapDims[sceSampleInd, 1] <- seuratDims[,1]
413 420
       umapDims[sceSampleInd, 2] <- seuratDims[,2]
414
-      output[sceSampleInd, 1] <- result@meta.data$doubletFinderAnnScore
415
-      output[sceSampleInd, 2] <- result@meta.data$doubletFinderLabel
421
+      output[sceSampleInd, 1] <- result[[]]$doubletFinderAnnScore
422
+      output[sceSampleInd, 2] <- result[[]]$doubletFinderLabel
423
+      
424
+      if (!identical(samples, 1)) {
425
+        metadata(inSCE)$sctk$runDoubletFinder[[s]] <- argsList
426
+      }
427
+    }
428
+    if (identical(samples, 1)) {
429
+      metadata(inSCE)$sctk$runDoubletFinder$all_cells <- argsList
416 430
     }
417
-
418 431
     colData(inSCE)[, paste0(colnames(output), "_resolution_", res)] <- NULL
419
-
432
+    
420 433
     colnames(output) <- paste0(colnames(output), "_resolution_", res)
421 434
     output[,2] <- factor(output[,2], levels = c("Singlet", "Doublet"))
422
-    argsList <- argsList[!names(argsList) %in% ("...")]
423
-    inSCE@metadata$runDoubletFinder <- argsList[-1]
424
-    inSCE@metadata$runDoubletFinder$packageVersion <- "2.0.2"
435
+    
425 436
     colData(inSCE) <- cbind(colData(inSCE), output)
426 437
     reducedDim(inSCE,'doubletFinder_UMAP') <- umapDims
427 438
   }
428
-
429
-  tempSCE@colData <- inSCE@colData
430
-  tempSCE@metadata <- inSCE@metadata
439
+  
440
+  colData(tempSCE) <- colData(inSCE)
441
+  S4Vectors::metadata(tempSCE) <- S4Vectors::metadata(inSCE)
431 442
   reducedDims(tempSCE) <- reducedDims(inSCE)
432
-
443
+  
433 444
   return(tempSCE)
434 445
 }
435 446
 
... ...
@@ -438,54 +449,63 @@ runDoubletFinder <- function(inSCE,
438 449
   #require(KernSmooth); require(ROCR)
439 450
   ## Set pN-pK param sweep ranges
440 451
   name.vec <- names(sweep.list)
441
-  name.vec <- unlist(strsplit(name.vec, split="pN_"))
442
-  name.vec <- name.vec[seq(2, length(name.vec), by=2)]
443
-  name.vec <- unlist(strsplit(name.vec, split="_pK_"))
444
-  pN <- as.numeric(unique(name.vec[seq(1, length(name.vec), by=2)]))
445
-  pK <- as.numeric(unique(name.vec[seq(2, length(name.vec), by=2)]))
446
-
447
-  ## Initialize data structure w/ or w/o AUC column, depending on whether ground-truth doublet classifications are available
452
+  name.vec <- unlist(strsplit(name.vec, split = "pN_"))
453
+  name.vec <- name.vec[seq(2, length(name.vec), by = 2)]
454
+  name.vec <- unlist(strsplit(name.vec, split = "_pK_"))
455
+  pN <- as.numeric(unique(name.vec[seq(1, length(name.vec), by = 2)]))
456
+  pK <- as.numeric(unique(name.vec[seq(2, length(name.vec), by = 2)]))
457
+  
458
+  ## Initialize data structure w/ or w/o AUC column, depending on whether 
459
+  ## ground-truth doublet classifications are available
448 460
   if (GT == TRUE) {
449
-    sweep.stats <- as.data.frame(matrix(0L, nrow=length(sweep.list), ncol=4))
461
+    sweep.stats <- as.data.frame(matrix(0L, nrow = length(sweep.list), 
462
+                                        ncol = 4))
450 463
     colnames(sweep.stats) <- c("pN","pK","AUC","BCreal")
451
-    sweep.stats$pN <- factor(rep(pN, each=length(pK), levels = pN))
464
+    sweep.stats$pN <- factor(rep(pN, each = length(pK), levels = pN))
452 465
     sweep.stats$pK <- factor(rep(pK, length(pN),levels = pK))
453 466
   }
454
-
467
+  
455 468
   if (GT == FALSE) {
456
-    sweep.stats <- as.data.frame(matrix(0L, nrow=length(sweep.list), ncol=3))
469
+    sweep.stats <- as.data.frame(matrix(0L, nrow = length(sweep.list), 
470
+                                        ncol = 3))
457 471
     colnames(sweep.stats) <- c("pN","pK","BCreal")
458
-    sweep.stats$pN <- factor(rep(pN, each=length(pK), levels = pN))
472
+    sweep.stats$pN <- factor(rep(pN, each = length(pK), levels = pN))
459 473
     sweep.stats$pK <- factor(rep(pK, length(pN),levels = pK))
460 474
   }
461
-
475
+  
462 476
   ## Perform pN-pK parameter sweep summary
463 477
   for (i in seq_along(sweep.list)) {
464 478
     res.temp <- sweep.list[[i]]
465
-
466
-    ## Use gaussian kernel density estimation of pANN vector to compute bimodality coefficient
467
-    gkde <- stats::approxfun(KernSmooth::bkde(res.temp$pANN, kernel="normal"))
468
-    x <- seq(from=min(res.temp$pANN), to=max(res.temp$pANN), length.out=nrow(res.temp))
479
+    
480
+    ## Use gaussian kernel density estimation of pANN vector to compute 
481
+    ## bimodality coefficient
482
+    gkde <- stats::approxfun(KernSmooth::bkde(res.temp$pANN, kernel = "normal"))
483
+    x <- seq(from = min(res.temp$pANN), to = max(res.temp$pANN), 
484
+             length.out = nrow(res.temp))
469 485
     sweep.stats$BCreal[i] <- .bimodality_coefficient(gkde(x))
470
-
486
+    
471 487
     if (GT == FALSE) { next }
472
-
473
-    ## If ground-truth doublet classifications are available, perform ROC analysis on logistic
474
-    ## regression model trained using pANN vector
475
-    meta <- as.data.frame(matrix(0L, nrow=nrow(res.temp), ncol=2))
488
+    
489
+    ## If ground-truth doublet classifications are available, perform ROC 
490
+    ## analysis on logistic regression model trained using pANN vector
491
+    meta <- as.data.frame(matrix(0L, nrow = nrow(res.temp), ncol = 2))
476 492
     meta[,1] <- GT.calls
477 493
     meta[,2] <- res.temp$pANN
478
-    train.ind <- sample(seq(nrow(meta)), round(nrow(meta)/2), replace=FALSE)
494
+    train.ind <- sample(seq(nrow(meta)), round(nrow(meta)/2), replace = FALSE)
479 495
     test.ind <- seq(nrow(meta))[-train.ind]
480 496
     colnames(meta) <- c("SinDub","pANN")
481 497
     meta$SinDub <- factor(meta$SinDub, levels = c("Singlet","Doublet"))
482
-    model.lm <- stats::glm(SinDub ~ pANN, family=stats::binomial(link='logit'), data=meta, subset=train.ind)
483
-    prob <- stats::predict(model.lm, newdata=meta[test.ind, ], type="response")
484
-    ROCpred <- ROCR::prediction(predictions=prob, labels=meta$SinDub[test.ind])
485
-    perf.auc <- ROCR::performance(ROCpred, measure="auc")
498
+    model.lm <- stats::glm(SinDub ~ pANN, 
499
+                           family = stats::binomial(link = 'logit'), 
500
+                           data = meta, subset = train.ind)
501
+    prob <- stats::predict(model.lm, newdata = meta[test.ind, ], 
502
+                           type = "response")
503
+    ROCpred <- ROCR::prediction(predictions = prob, 
504
+                                labels = meta$SinDub[test.ind])
505
+    perf.auc <- ROCR::performance(ROCpred, measure = "auc")
486 506
     sweep.stats$AUC[i] <- perf.auc@y.values[[1]]
487 507
   }
488
-
508
+  
489 509
   return(sweep.stats)
490 510
 }
491 511
 
... ...
@@ -495,21 +515,21 @@ runDoubletFinder <- function(inSCE,
495 515
   sample.excess.kurtosis <- .kurtosis(x)
496 516
   K <- sample.excess.kurtosis
497 517
   n <- length(x)
498
-  B <- ((G^2)+1)/(K+((3*((n-1)^2))/((n-2)*(n-3))))
518
+  B <- ((G^2) + 1)/(K + ((3*((n - 1)^2))/((n - 2)*(n - 3))))
499 519
   return(B)
500 520
 }
501 521
 
502 522
 .skewness <- function(x) {
503 523
   n <- length(x)
504
-  S <- (1/n)*sum((x-mean(x))^3)/(((1/n)*sum((x-mean(x))^2))^1.5)
505
-  S <- S*(sqrt(n*(n-1)))/(n-2)
524
+  S <- (1/n)*sum((x - mean(x))^3)/(((1/n)*sum((x - mean(x))^2))^1.5)
525
+  S <- S*(sqrt(n*(n - 1)))/(n - 2)
506 526
   return(S)
507 527
 }
508 528
 
509
-.kurtosis <- function (x) {
529
+.kurtosis <- function(x) {
510 530
   n <- length(x)
511
-  K <- (1/n)*sum((x-mean(x))^4)/(((1/n)*sum((x-mean(x))^2))^2)-3
512
-  K <- ((n - 1)*((n+1)*K-3*(n-1))/((n-2)*(n-3)))+3
531
+  K <- (1/n)*sum((x - mean(x))^4)/(((1/n)*sum((x - mean(x))^2))^2) - 3
532
+  K <- ((n - 1)*((n + 1)*K - 3*(n - 1))/((n - 2)*(n - 3))) + 3
513 533
   return(K)
514 534
 }
515 535
 
... ...
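
These helpers compute the bimodality coefficient used to score each pK. A self-contained re-computation on toy data, mirroring the formulas above rather than calling the unexported helpers:

set.seed(1)
x <- c(rnorm(100, mean = 0), rnorm(100, mean = 5))   # clearly bimodal toy vector
n <- length(x)
S <- (1/n) * sum((x - mean(x))^3) / (((1/n) * sum((x - mean(x))^2))^1.5)
G <- S * sqrt(n * (n - 1)) / (n - 2)                 # corrected sample skewness
K0 <- (1/n) * sum((x - mean(x))^4) / (((1/n) * sum((x - mean(x))^2))^2) - 3
K <- ((n - 1) * ((n + 1) * K0 - 3 * (n - 1)) / ((n - 2) * (n - 3))) + 3
B <- (G^2 + 1) / (K + (3 * (n - 1)^2) / ((n - 2) * (n - 3)))
B   # larger values indicate a more bimodal pANN distribution
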
@@ -520,37 +540,43 @@ runDoubletFinder <- function(inSCE,
520 540
 }
521 541
 
522 542
 .find.pK <- function(sweep.stats) {
523
-
543
+  
524 544
   ## Implementation for data without ground-truth doublet classifications
525 545
   '%ni%' <- Negate('%in%')
526 546
   if ("AUC" %ni% colnames(sweep.stats) == TRUE) {
527 547
     ## Initialize data structure for results storage
528
-    bc.mvn <- as.data.frame(matrix(0L, nrow=length(unique(sweep.stats$pK)), ncol=5))
548
+    bc.mvn <- as.data.frame(matrix(0L, nrow = length(unique(sweep.stats$pK)), 
549
+                                   ncol = 5))
529 550
     colnames(bc.mvn) <- c("ParamID","pK","MeanBC","VarBC","BCmetric")
530 551
     bc.mvn$pK <- unique(sweep.stats$pK)
531 552
     bc.mvn$ParamID <- seq(nrow(bc.mvn))
532
-
533
-    ## Compute bimodality coefficient mean, variance, and BCmvn across pN-pK sweep results
553
+    
554
+    ## Compute bimodality coefficient mean, variance, and BCmvn across pN-pK 
555
+    ## sweep results
534 556
     x <- 0
535 557
     for (i in unique(bc.mvn$pK)) {
536 558
       x <- x + 1
537 559
       ind <- which(sweep.stats$pK == i)
538 560
       bc.mvn$MeanBC[x] <- mean(sweep.stats[ind, "BCreal"])
539 561
       bc.mvn$VarBC[x] <- stats::sd(sweep.stats[ind, "BCreal"])^2
540
-      bc.mvn$BCmetric[x] <- mean(sweep.stats[ind, "BCreal"])/(stats::sd(sweep.stats[ind, "BCreal"])^2)
562
+      bc.mvn$BCmetric[x] <- mean(sweep.stats[ind, "BCreal"]) / 
563
+        (stats::sd(sweep.stats[ind, "BCreal"])^2)
541 564
     }
542 565
     return(bc.mvn)
543 566
   }
544
-
545
-  ## Implementation for data with ground-truth doublet classifications (e.g., MULTI-seq, CellHashing, Demuxlet, etc.)
567
+  
568
+  ## Implementation for data with ground-truth doublet classifications (e.g., 
569
+  ## MULTI-seq, CellHashing, Demuxlet, etc.)
546 570
   if ("AUC" %in% colnames(sweep.stats) == TRUE) {
547 571
     ## Initialize data structure for results storage
548
-    bc.mvn <- as.data.frame(matrix(0L, nrow=length(unique(sweep.stats$pK)), ncol=6))
572
+    bc.mvn <- as.data.frame(matrix(0L, nrow = length(unique(sweep.stats$pK)), 
573
+                                   ncol = 6))
549 574
     colnames(bc.mvn) <- c("ParamID","pK","MeanAUC","MeanBC","VarBC","BCmetric")
550 575
     bc.mvn$pK <- unique(sweep.stats$pK)
551 576
     bc.mvn$ParamID <- seq(nrow(bc.mvn))
552
-
553
-    ## Compute bimodality coefficient mean, variance, and BCmvn across pN-pK sweep results
577
+    
578
+    ## Compute bimodality coefficient mean, variance, and BCmvn across pN-pK 
579
+    ## sweep results
554 580
     x <- 0
555 581
     for (i in unique(bc.mvn$pK)) {
556 582
       x <- x + 1
... ...
@@ -558,29 +584,34 @@ runDoubletFinder <- function(inSCE,
558 584
       bc.mvn$MeanAUC[x] <- mean(sweep.stats[ind, "AUC"])
559 585
       bc.mvn$MeanBC[x] <- mean(sweep.stats[ind, "BCreal"])
560 586
       bc.mvn$VarBC[x] <- stats::sd(sweep.stats[ind, "BCreal"])^2
561
-      bc.mvn$BCmetric[x] <- mean(sweep.stats[ind, "BCreal"])/(stats::sd(sweep.stats[ind, "BCreal"])^2)
587
+      bc.mvn$BCmetric[x] <- mean(sweep.stats[ind, "BCreal"]) / 
588
+        (stats::sd(sweep.stats[ind, "BCreal"])^2)
562 589
     }
563
-
590
+    
564 591
     return(bc.mvn)
565
-
592
+    
566 593
   }
567 594
 }
568 595
 
569
-.doubletFinder_v3 <- function(seu, PCs, pN = 0.25, pK, nExp, reuse.pANN = FALSE, sct = FALSE, verbose = FALSE) {
570
-
571
-  ## Generate new list of doublet classifications from existing pANN vector to save time
596
+.doubletFinder_v3 <- function(seu, PCs, pN = 0.25, pK, nExp, reuse.pANN = FALSE,
597
+                              sct = FALSE, verbose = FALSE) {
598
+  
599
+  ## Generate new list of doublet classifications from existing pANN vector to 
600
+  ## save time
572 601
   if (reuse.pANN != FALSE ) {
573
-    pANN.old <- seu@meta.data[ , reuse.pANN]
602
+    pANN.old <- seu[[]][ , reuse.pANN]
574 603
     classifications <- rep("Singlet", length(pANN.old))
575
-    classifications[order(pANN.old, decreasing=TRUE)[seq(nExp)]] <- "Doublet"
576
-    seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
604
+    classifications[order(pANN.old, decreasing = TRUE)[seq(nExp)]] <- "Doublet"
605
+    seu[[]][, paste("DF.classifications", pN, pK, nExp, sep = "_")] <- 
606
+      classifications
577 607
     return(seu)
578 608
   }
579
-
609
+  
580 610
   if (reuse.pANN == FALSE) {
581 611
     ## Make merged real-artifical data
582
-    real.cells <- rownames(seu@meta.data)
583
-    data <- seu@assays$RNA@counts[, real.cells]
612
+    real.cells <- colnames(seu)
613
+    data <- Seurat::GetAssayData(seu, slot = "counts", 
614
+                                 assay = "RNA")[, real.cells]
584 615
     n_real.cells <- length(real.cells)
585 616
     n_doublets <- round(n_real.cells/(1 - pN) - n_real.cells)
586 617
     if (verbose) {
... ...
@@ -591,71 +622,81 @@ runDoubletFinder <- function(inSCE,
591 622
     doublets <- (data[, real.cells1] + data[, real.cells2])/2
592 623
     colnames(doublets) <- paste("X", seq(n_doublets), sep = "")
593 624
     data_wdoublets <- cbind(data, doublets)
594
-
625
+    
595 626
     ## Store important pre-processing information
596 627
     orig.commands <- seu@commands
597
-
628
+    
598 629
     ## Pre-process Seurat object
599 630
     if (sct == FALSE) {
600 631
       seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
601
-
602
-      seu_wdoublets <- Seurat::NormalizeData(seu_wdoublets,
603
-                                     normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
604
-                                     scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
605
-                                     margin = orig.commands$NormalizeData.RNA@params$margin)
606
-
607
-      seu_wdoublets <- Seurat::FindVariableFeatures(seu_wdoublets,
608
-                                            selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
609
-                                            loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
610
-                                            clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
611
-                                            mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
612
-                                            dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
613
-                                            num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
614
-                                            binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
615
-                                            nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
616
-                                            mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
617
-                                            dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff)
618
-
619
-      seu_wdoublets <- Seurat::ScaleData(seu_wdoublets,
620
-                                 features = orig.commands$ScaleData.RNA$features,
621
-                                 model.use = orig.commands$ScaleData.RNA$model.use,
622
-                                 do.scale = orig.commands$ScaleData.RNA$do.scale,
623
-                                 do.center = orig.commands$ScaleData.RNA$do.center,
624
-                                 scale.max = orig.commands$ScaleData.RNA$scale.max,
625
-                                 block.size = orig.commands$ScaleData.RNA$block.size,
626
-                                 min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block)
627
-
628
-      seu_wdoublets <- Seurat::RunPCA(seu_wdoublets,
629
-                              features = orig.commands$ScaleData.RNA$features,
630
-                              npcs = length(PCs),
631
-                              rev.pca =  orig.commands$RunPCA.RNA$rev.pca,
632
-                              weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
633
-                              verbose=FALSE)
634
-      pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
635
-      cell.names <- rownames(seu_wdoublets@meta.data)
632
+      
633
+      seu_wdoublets <- Seurat::NormalizeData(
634
+        seu_wdoublets,
635
+        normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
636
+        scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
637
+        margin = orig.commands$NormalizeData.RNA@params$margin
638
+      )
639
+      
640
+      seu_wdoublets <- Seurat::FindVariableFeatures(
641
+        seu_wdoublets,
642
+        selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
643
+        loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
644
+        clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
645
+        mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
646
+        dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
647
+        num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
648
+        binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
649
+        nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
650
+        mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
651
+        dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff
652
+      )
653
+      
654
+      seu_wdoublets <- Seurat::ScaleData(
655
+        seu_wdoublets,
656
+        features = orig.commands$ScaleData.RNA$features,
657
+        model.use = orig.commands$ScaleData.RNA$model.use,
658
+        do.scale = orig.commands$ScaleData.RNA$do.scale,
659
+        do.center = orig.commands$ScaleData.RNA$do.center,
660
+        scale.max = orig.commands$ScaleData.RNA$scale.max,
661
+        block.size = orig.commands$ScaleData.RNA$block.size,
662
+        min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block
663
+      )
664
+      
665
+      seu_wdoublets <- Seurat::RunPCA(
666
+        seu_wdoublets,
667
+        features = orig.commands$ScaleData.RNA$features,
668
+        npcs = length(PCs),
669
+        rev.pca =  orig.commands$RunPCA.RNA$rev.pca,
670
+        weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
671
+        verbose = FALSE
672
+      )
673
+      pca.coord <- Seurat::Reductions(seu_wdoublets, 
674
+                                      "pca")@cell.embeddings[ , PCs]
675
+      cell.names <- rownames(seu_wdoublets[[]])
636 676
       nCells <- length(cell.names)
637 677
       rm(seu_wdoublets); gc() # Free up memory
638 678
     }
639
-
679
+    
640 680
     if (sct == TRUE) {
641 681
       #require(sctransform)
642 682
       message(date(), " ...   Creating Seurat object")
643 683
       seu_wdoublets <- Seurat::CreateSeuratObject(counts = data_wdoublets)
644
-
684
+      
645 685
       message(date(), " ...   Running SCTransform")
646 686
       seu_wdoublets <- Seurat::SCTransform(seu_wdoublets)
647
-
687
+      
648 688
       message(date(), " ...   Running PCA")
649 689
       seu_wdoublets <- Seurat::RunPCA(seu_wdoublets, npcs = length(PCs))
650
-      pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
651
-      cell.names <- rownames(seu_wdoublets@meta.data)
690
+      pca.coord <- Seurat::Reductions(seu_wdoublets, 
691
+                                      "pca")@cell.embeddings[ , PCs]
692
+      cell.names <- rownames(seu_wdoublets[[]])
652 693
       nCells <- length(cell.names)
653 694
       rm(seu_wdoublets); gc()
654 695
     }
655
-
696
+    
656 697
     ## Compute PC distance matrix
657 698
     dist.mat <- fields::rdist(pca.coord)
658
-
699
+    
659 700
     ## Compute pANN
660 701
     pANN <- as.data.frame(matrix(0L, nrow = n_real.cells, ncol = 1))
661 702
     rownames(pANN) <- real.cells
... ...
@@ -664,14 +705,17 @@ runDoubletFinder <- function(inSCE,
664 705
     for (i in seq(n_real.cells)) {
665 706
       neighbors <- order(dist.mat[, i])
666 707
       neighbors <- neighbors[2:(k + 1)]
667
-      neighbor.names <- rownames(dist.mat)[neighbors]
708
+      #neighbor.names <- rownames(dist.mat)[neighbors]
668 709
       pANN$pANN[i] <- length(which(neighbors > n_real.cells))/k
669 710
     }
670
-
711
+    
671 712
     classifications <- rep("Singlet",n_real.cells)
672
-    classifications[order(pANN$pANN[seq(n_real.cells)], decreasing=TRUE)[seq(nExp)]] <- "Doublet"
673
-    seu@meta.data[, paste("pANN",pN,pK,nExp,sep="_")] <- pANN[rownames(seu@meta.data), 1]
674
-    seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
713
+    classifications[order(pANN$pANN[seq(n_real.cells)], 
714
+                          decreasing = TRUE)[seq(nExp)]] <- "Doublet"
715
+    seu[[paste("pANN", pN, pK, nExp, sep = "_")]] <- 
716
+      pANN[colnames(seu), 1]
717
+    seu[[paste("DF.classifications", pN, pK, nExp, sep = "_")]] <- 
718
+      classifications
675 719
     return(seu)
676 720
   }
677 721
 }
... ...
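
The final doublet call above simply labels the nExp cells with the highest pANN as doublets. A toy illustration of that thresholding:

pANN <- c(0.1, 0.8, 0.3, 0.9, 0.2)   # toy per-cell artificial-neighbour fractions
nExp <- 2                            # expected number of doublets
classifications <- rep("Singlet", length(pANN))
classifications[order(pANN, decreasing = TRUE)[seq(nExp)]] <- "Doublet"
classifications                      # cells 4 and 2 (highest pANN) are called Doublet
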
@@ -1,21 +1,22 @@
1
-
2
-.runBarcodeRankDrops <- function(barcode.matrix, lower=lower,
3
-                                 fit.bounds=fit.bounds,
4
-                                 df=df) {
5
-
1
+.runBarcodeRankDrops <- function(barcode.matrix, lower = lower,
2
+                                 fit.bounds = fit.bounds,
3
+                                 df = df) {
4
+  
6 5
   ## Convert to sparse matrix if not already in that format
7 6
   barcode.matrix <- .convertToMatrix(barcode.matrix)
8
-
9
-  output <- DropletUtils::barcodeRanks(m = barcode.matrix, lower=lower,
10
-                                       fit.bounds=fit.bounds,
11
-                                       df=df)
12
-
13
-  knee.ix <- as.integer(output@listData$total >= S4Vectors::metadata(output)$knee)
14
-  inflection.ix <- as.integer(output@listData$total >= S4Vectors::metadata(output)$inflection)
15
-  rank.ix<- as.integer(output$rank)
16
-  total.ix<- as.integer(output$total)
17
-  fitted.ix<- as.integer(output$fitted)
18
-
7
+  
8
+  output <- DropletUtils::barcodeRanks(m = barcode.matrix, lower = lower,
9
+                                       fit.bounds = fit.bounds,
10
+                                       df = df)
11
+  
12
+  knee.ix <- as.integer(output@listData$total >= 
13
+                          S4Vectors::metadata(output)$knee)
14
+  inflection.ix <- as.integer(output@listData$total >= 
15
+                                S4Vectors::metadata(output)$inflection)
16
+  rank.ix <- as.integer(output$rank)
17
+  total.ix <- as.integer(output$total)
18
+  fitted.ix <- as.integer(output$fitted)
19
+  
19 20
   result <- cbind(knee.ix, inflection.ix, rank.ix, total.ix, fitted.ix)
20 21
   colnames(result) <- c("dropletUtils_barcodeRank_knee",
21 22
                         "dropletUtils_barcodeRank_inflection",
... ...
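
For context on the DropletUtils::barcodeRanks() call wrapped above, a minimal sketch on a small simulated matrix (sizes and Poisson rates are arbitrary):

set.seed(1)
cells <- matrix(rpois(100 * 200, lambda = 4), nrow = 100)     # droplets with cells
empty <- matrix(rpois(100 * 2000, lambda = 0.05), nrow = 100) # near-empty droplets
m <- cbind(cells, empty)
br <- DropletUtils::barcodeRanks(m = m, lower = 20)
# Knee/inflection thresholds sit in the result metadata; per-barcode rank, total
# and fitted values are columns, as used to build the result table above.
S4Vectors::metadata(br)$knee
S4Vectors::metadata(br)$inflection
keep <- br$total >= S4Vectors::metadata(br)$knee
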