Browse code

Merge devel branch (Oct 5) into master branch

Yusuke Koga authored on 09/10/2020 17:57:06
Showing 63 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,9 @@
1
+.Rproj.user
2
+.Rhistory
3
+.RData
4
+.Ruserdata
5
+singleCellTK.Rproj
6
+*.Rproj
7
+.DS_Store
8
+.Rprofile
9
+renv/
... ...
@@ -15,7 +15,7 @@ Authors@R: c(person(given="David", family="Jenkins", email="dfj@bu.edu", role=c(
15 15
                     comment = c(ORCID = "0000-0002-6247-6595")))
16 16
 Maintainer: David Jenkins <dfj@bu.edu>
17 17
 Depends:
18
-    R (>= 3.6.0),
18
+    R (>= 4.0),
19 19
     SummarizedExperiment,
20 20
     SingleCellExperiment,
21 21
     DelayedArray,
... ...
@@ -47,8 +47,8 @@ Imports:
47 47
     fields,
48 48
     ggplot2,
49 49
     ggplotify,
50
-    ggtree,
51 50
     ggrepel,
51
+    ggtree,
52 52
     gridExtra,
53 53
     GSVA (>= 1.26.0),
54 54
     GSVAdata,
... ...
@@ -118,7 +118,6 @@ Suggests:
118 118
     org.Mm.eg.db,
119 119
     stringr
120 120
 Remotes: 
121
-    joshua-d-campbell/DoubletFinder,
122 121
     joshua-d-campbell/harmony,
123 122
     rz2333/mnormt@1.5-6,
124 123
     joshua-d-campbell/liger,
... ...
@@ -1,10 +1,10 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3
-export(ComBatSCE)
4 3
 export(MAST)
5 4
 export(alignSingleCellData)
6 5
 export(calcEffectSizes)
7 6
 export(combineSCE)
7
+export(computeZScore)
8 8
 export(constructSCE)
9 9
 export(convertGeneIDs)
10 10
 export(convertSCEToSeurat)
... ...
@@ -15,9 +15,9 @@ export(downSampleCells)
15 15
 export(downSampleDepth)
16 16
 export(enrichRSCE)
17 17
 export(exportSCE)
18
+export(exportSCEtoAnnData)
18 19
 export(exportSCEtoFlatFile)
19 20
 export(featureIndex)
20
-export(filterSCData)
21 21
 export(findMarkerDiffExp)
22 22
 export(generateMeta)
23 23
 export(generateSimulatedData)
... ...
@@ -31,6 +31,7 @@ export(getUMAP)
31 31
 export(gsvaPlot)
32 32
 export(gsvaSCE)
33 33
 export(importAnnData)
34
+export(importBUStools)
34 35
 export(importCellRanger)
35 36
 export(importCellRangerV2)
36 37
 export(importCellRangerV2Sample)
... ...
@@ -43,6 +44,9 @@ export(importGeneSetsFromCollection)
43 44
 export(importGeneSetsFromGMT)
44 45
 export(importGeneSetsFromList)
45 46
 export(importGeneSetsFromMSigDB)
47
+export(importOptimus)
48
+export(importSEQC)
49
+export(importSTARsolo)
46 50
 export(iterateSimulations)
47 51
 export(mergeSCEColData)
48 52
 export(parseRsubreadLogs)
... ...
@@ -81,6 +85,7 @@ export(plotScrubletResults)
81 85
 export(plotTSNE)
82 86
 export(plotUMAP)
83 87
 export(qcInputProcess)
88
+export(readSingleCellMatrix)
84 89
 export(reportCellQC)
85 90
 export(reportDropletQC)
86 91
 export(reportQCTool)
... ...
@@ -149,6 +154,7 @@ export(subsetSCECols)
149 154
 export(subsetSCERows)
150 155
 export(summarizeSCE)
151 156
 export(thresholdGenes)
157
+export(trimCounts)
152 158
 export(visPlot)
153 159
 import(Biobase)
154 160
 import(DelayedArray)
155 161
deleted file mode 100644
... ...
@@ -1,79 +0,0 @@
1
-#' ComBatSCE
2
-#'
3
-#' Run ComBat on a SCtkExperiment object
4
-#'
5
-#' @param inSCE Input SCtkExperiment object. Required
6
-#' @param batch The name of a column in colData to use as the batch variable.
7
-#' Required
8
-#' @param useAssay The assay to use for ComBat. The default is "logcounts"
9
-#' @param par.prior TRUE indicates parametric adjustments will be used, FALSE
10
-#' indicates non-parametric adjustments will be used. Accepted parameters:
11
-#' "Parametric" or "Non-parametric"
12
-#' @param covariates List of other column names in colData to be added to the
13
-#' ComBat model as covariates
14
-#' @param mean.only If TRUE ComBat only corrects the mean of the batch effect
15
-#' @param ref.batch If given, will use the selected batch as a reference for
16
-#' batch adjustment.
17
-#'
18
-#' @return ComBat matrix based on inputs. You can save this matrix into the
19
-#' SCtkExperiment with assay()
20
-#' @export
21
-#' @examples
22
-#' if(requireNamespace("bladderbatch", quietly = TRUE)) {
23
-#'   library(bladderbatch)
24
-#'   data(bladderdata)
25
-#'
26
-#'   #subset for testing
27
-#'   dat <- bladderEset[1:50,]
28
-#'   dat <- as(as(dat, "SummarizedExperiment"), "SCtkExperiment")
29
-#'   mod <- stats::model.matrix(~as.factor(cancer), data = colData(dat))
30
-#'
31
-#'   # parametric adjustment
32
-#'   combat_edata1 <- ComBatSCE(inSCE = dat, useAssay = "exprs",
33
-#'                              batch = "batch", covariates = NULL)
34
-#'   assay(dat, "parametric_combat") <- combat_edata1
35
-#'
36
-#'   # non-parametric adjustment, mean-only version
37
-#'   combat_edata2 <- ComBatSCE(inSCE = dat, useAssay = "exprs",
38
-#'                              batch = "batch", par.prior = "Non-parametric",
39
-#'                              mean.only = TRUE, covariates = NULL)
40
-#'   assay(dat, "nonparametric_combat_meanonly") <- combat_edata2
41
-#'
42
-#'   # reference-batch version, with covariates
43
-#'   combat_edata3 <- ComBatSCE(inSCE = dat, useAssay = "exprs",
44
-#'                              batch = "batch", covariates = "cancer",
45
-#'                              ref.batch = 3)
46
-#'   assay(dat, "refbatch_combat_wcov") <- combat_edata3
47
-#'   assays(dat)
48
-#' }
49
-#'
50
-ComBatSCE <- function(inSCE, batch, useAssay="logcounts",
51
-                      par.prior="Parametric", covariates=NULL, mean.only=FALSE,
52
-                      ref.batch=NULL){
53
-
54
-  #prepare model matrix
55
-  mod <- NULL
56
-  if (length(covariates) > 0){
57
-    mod <- stats::model.matrix(
58
-      stats::as.formula(paste0("~", paste0(covariates, collapse = "+"))),
59
-      data = data.frame(SingleCellExperiment::colData(inSCE)[, covariates,
60
-                                                               drop = FALSE]))
61
-  }
62
-
63
-  #prepare parametric
64
-  if (par.prior == "Parametric"){
65
-    par.prior <- TRUE
66
-  } else if (par.prior == "Non-parametric") {
67
-    par.prior <- FALSE
68
-  } else {
69
-    stop("Invalid option given to par.prior. Accepted values are Parametric",
70
-         " and Non-parametric.")
71
-  }
72
-
73
-  resassay <-
74
-    sva::ComBat(dat = SummarizedExperiment::assay(inSCE, useAssay),
75
-                batch = SingleCellExperiment::colData(inSCE)[, batch],
76
-                mod = mod, par.prior = par.prior,
77
-                mean.only = mean.only, ref.batch = ref.batch)
78
-  return(resassay)
79
-}
80 0
new file mode 100644
... ...
@@ -0,0 +1,25 @@
1
+#' Compute Z-Score
2
+#'
3
+#' Computes Z-Score from an input count matrix using the formula ((x-mean(x))/sd(x))
4
+#' for each gene across all cells. The input count matrix can either be a base matrix,
5
+#' dgCMatrix or a DelayedMatrix. Computations are performed using DelayedMatrixStats
6
+#' package to efficiently compute the Z-Score matrix.
7
+#'
8
+#' @param counts matrix (base matrix, dgCMatrix or DelaymedMatrix)
9
+#'
10
+#' @return z-score computed counts matrix (DelayedMatrix)
11
+#' @export
12
+#'
13
+#' @examples
14
+#'
15
+#' data(sce_chcl, package = "scds")
16
+#' assay(sce_chcl, "countsZScore") <- computeZScore(assay(sce_chcl, "counts"))
17
+#'
18
+computeZScore <- function(counts) {
19
+    if (!methods::is(counts, "DelayedArray")) {
20
+        counts <- DelayedArray::DelayedArray(counts)
21
+    }
22
+    counts <- (counts - DelayedMatrixStats::rowMeans2(counts)) / DelayedMatrixStats::rowSds(counts)
23
+    counts[base::is.nan(counts)] <- 0
24
+    return(counts)
25
+}
0 26
new file mode 100644
... ...
@@ -0,0 +1,75 @@
1
+#' @title Export a \link[SingleCellExperiment]{SingleCellExperiment} R object as 
2
+#' Python annData object 
3
+#' @description Writes all assays, colData, rowData, reducedDims, and altExps objects in a
4
+#' \link[SingleCellExperiment]{SingleCellExperiment} to a Python annData object in the .h5ad format
5
+#' All parameters of Anndata.write_h5ad function (https://icb-anndata.readthedocs-hosted.com/en/stable/anndata.AnnData.write_h5ad.html)
6
+#' are available as parameters to this export function and set to defaults. Defaults can be
7
+#' overridden at function call.
8
+#' @param sce \link[SingleCellExperiment]{SingleCellExperiment} R object to be
9
+#'  exported.
10
+#' @param useAssay Character. The name of assay of
11
+#' interests that will be set as the primary matrix of the output AnnData.
12
+#' Default \code{"counts"}. 
13
+#' @param outputDir Path to the directory where .h5ad outputs will be written. Default is the current working directory.
14
+#' @param prefix Prefix to use for the name of the output file. Default \code{"sample"}.
15
+#' @param overwrite Boolean. Default \code{TRUE}.
16
+#' @param compression If output file compression is required, this variable accepts
17
+#' 'gzip' or 'lzf' as inputs. Default \code{None}.
18
+#' @param compressionOpts Integer. Sets the compression level
19
+#' @param forceDense Default \code{False} Write sparse data as a dense matrix.
20
+#' Refer \code{anndata.write_h5ad} documentation for details. Default \code{NULL}. 
21
+#' @examples
22
+#' \dontrun{
23
+#' data(sce_chcl, package = "scds")
24
+#' exportSCEtoAnnData(sce=sce_chcl, compression="gzip")
25
+#' }
26
+#' 
27
+#' @export
28
+exportSCEtoAnnData <- function(sce, 
29
+                                useAssay = 'counts',
30
+                                outputDir = "./",
31
+                                prefix = "sample",
32
+                                overwrite = TRUE,
33
+                                compression = c('None','lzf','gzip'),
34
+                                compressionOpts = NULL,
35
+                                forceDense = c('False','True')){
36
+  compression <- match.arg(compression)
37
+  forceDense <- match.arg(forceDense)
38
+  if (compression == 'None'){
39
+    compression <- NULL
40
+  }
41
+
42
+  if (!reticulate::py_module_available(module = "scanpy")) {
43
+    warning("Cannot find python module 'scanpy', please install Conda and",
44
+            " run sctkPythonInstallConda() or run sctkPythonInstallVirtualEnv().",
45
+            "If one of these have been previously run to install the modules,",
46
+            "make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(),",
47
+            " respectively, if R has been restarted since the module installation.",
48
+            " Alternatively, Scrublet can be installed on the local machine",
49
+            "with pip (e.g. pip install --user scanpy) and then the 'use_python()'",
50
+            " function from the 'reticulate' package can be used to select the",
51
+            " correct Python environment.")
52
+    return(sce)}
53
+  
54
+  AssayName <- SummarizedExperiment::assayNames(sce)
55
+  for (assay in AssayName){
56
+    if (!methods::is(SummarizedExperiment::assay(sce, assay), 'dgCMatrix')) {
57
+      SummarizedExperiment::assay(sce, assay) <- .convertToMatrix(SummarizedExperiment::assay(sce, assay))
58
+    }
59
+  }
60
+
61
+  
62
+  dir.create(outputDir, showWarnings = FALSE, recursive = TRUE)
63
+  annData <- .sce2adata(sce,useAssay)
64
+  fileName <- paste0(prefix,".h5ad")
65
+  filePath <- file.path(outputDir,fileName)
66
+  
67
+  if (file.exists(filePath) && !isTRUE(overwrite)) {
68
+    stop(paste0(path, " already exists. Change 'outputDir' or set 'overwrite' to TRUE."))
69
+    }
70
+
71
+  annData$write_h5ad(filePath,
72
+                     compression = compression, 
73
+                     compression_opts = compressionOpts,
74
+                     force_dense = forceDense)
75
+}
... ...
@@ -1,10 +1,10 @@
1
-#' @title Get runDropletQC .html report 
1
+#' @title Get runDropletQC .html report
2 2
 #' @description A  function to generate .html Rmarkdown report containing the visualizations of the runDropletQC function output
3 3
 #' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object containing
4 4
 #' the full droplet count matrix with the output from runDropletQC function
5
-#' @param subTitle subtitle of the QC HTML report. Default is NULL. 
6
-#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL. 
7
-#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template 
5
+#' @param subTitle subtitle of the QC HTML report. Default is NULL.
6
+#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL.
7
+#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template
8 8
 #' @param output_dir name of the output directory to save the rendered file. If NULL/default the file is stored to the current working directory
9 9
 #' @return .html file
10 10
 #' @examples
... ...
@@ -16,33 +16,33 @@
16 16
 #' @export
17 17
 reportDropletQC <- function(inSCE, output_file = NULL,
18 18
                                    output_dir = NULL,
19
-                                   subTitle = NULL, 
19
+                                   subTitle = NULL,
20 20
                                    studyDesign = NULL) {
21
-  
21
+
22 22
   if (is.null(output_dir)){
23 23
     output_dir<- getwd()
24 24
     }
25
-  
25
+
26 26
   #report_path <- tempfile(fileext = ".Rmd")
27 27
   #file.copy(system.file("rmarkdown/qc/DropletQC.Rmd", package = "singleCellTK"), report_path, overwrite = TRUE)
28 28
 
29 29
   ## create temp Rmd file to bypass permission issue on server
30
-  rmarkdown::render(system.file("rmarkdown/qc/DropletQC.Rmd", package = "singleCellTK"), 
31
-    params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign), 
32
-    output_file = output_file, 
30
+  rmarkdown::render(system.file("rmarkdown/qc/DropletQC.Rmd", package = "singleCellTK"),
31
+    params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign),
32
+    output_file = output_file,
33 33
     output_dir = output_dir,
34 34
     intermediates_dir = output_dir,
35 35
     knit_root_dir = output_dir)
36 36
  }
37 37
 
38 38
 
39
-#' @title Get runCellQC .html report 
39
+#' @title Get runCellQC .html report
40 40
 #' @description A  function to generate .html Rmarkdown report containing the visualizations of the runCellQC function output
41 41
 #' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object containing
42 42
 #' the filtered count matrix with the output from runCellQC function
43
-#' @param subTitle subtitle of the QC HTML report. Default is NULL. 
44
-#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL. 
45
-#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template. 
43
+#' @param subTitle subtitle of the QC HTML report. Default is NULL.
44
+#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL.
45
+#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template.
46 46
 #' @param output_dir name of the output directory to save the rendered file. If NULL/default the file is stored to the current working directory
47 47
 #' @return .html file
48 48
 #' @examples
... ...
@@ -55,7 +55,7 @@ reportDropletQC <- function(inSCE, output_file = NULL,
55 55
 #' @export
56 56
 reportCellQC <- function(inSCE, output_file = NULL,
57 57
                                 output_dir = NULL,
58
-                                subTitle = NULL, 
58
+                                subTitle = NULL,
59 59
                                 studyDesign = NULL) {
60 60
   if (is.null(output_dir)){
61 61
     output_dir<- getwd()
... ...
@@ -63,9 +63,9 @@ reportCellQC <- function(inSCE, output_file = NULL,
63 63
   #report_path <- tempfile(fileext = ".Rmd")
64 64
   #file.copy(system.file("rmarkdown/qc/CellQC.Rmd", package = "singleCellTK"), report_path, overwrite = TRUE)
65 65
 
66
-  rmarkdown::render(system.file("rmarkdown/qc/CellQC.Rmd", package = "singleCellTK"), 
67
-    params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign), 
68
-    output_file = output_file, 
66
+  rmarkdown::render(system.file("rmarkdown/qc/CellQC.Rmd", package = "singleCellTK"),
67
+    params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign),
68
+    output_file = output_file,
69 69
     output_dir = output_dir,
70 70
     intermediates_dir = output_dir,
71 71
     knit_root_dir = output_dir)
... ...
@@ -79,7 +79,7 @@ reportCellQC <- function(inSCE, output_file = NULL,
79 79
 #' runQCMetrics, runScrublet, runDoubletCells, runCxds, runBcds, runCxdsBcdsHybrid, runDecontX, runBarcodeRankDrops, runEmptyDrops
80 80
 #' @param algorithm Character. Specifies which QC algorithm report to generate.
81 81
 #'  Available options are "BarcodeRankDrops", "EmptyDrops", "QCMetrics", "Scrublet", "DoubletCells", "Cxds", "Bcds", "CxdsBcdsHybrid", "DoubletFinder"  and "DecontX".
82
-#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the selected QC algorithm name . 
82
+#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the selected QC algorithm name .
83 83
 #' @param output_dir name of the output directory to save the rendered file. If NULL/default the file is stored to the current working directory
84 84
 #' @return .html file
85 85
 #' @examples
... ...
@@ -87,6 +87,7 @@ reportCellQC <- function(inSCE, output_file = NULL,
87 87
 #' data(scExample, package = "singleCellTK")
88 88
 #' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
89 89
 #' sce <- runDecontX(sce)
90
+#' sce <- getUMAP(sce)
90 91
 #' reportQCTool(inSCE = sce, algorithm = "DecontX")
91 92
 #' }
92 93
 #' @export
... ...
@@ -102,13 +103,13 @@ reportQCTool <- function(inSCE, algorithm=c("BarcodeRankDrops",
102 103
                                             "DecontX"),
103 104
                          output_file = NULL,
104 105
                             output_dir = NULL) {
105
-  
106
+
106 107
   algorithm <- match.arg(algorithm)
107
-  
108
+
108 109
   if (is.null(output_dir)){
109 110
     output_dir<- getwd()
110 111
   }
111
-  
112
+
112 113
   if (algorithm =="BarcodeRankDrops"){
113 114
     rmarkdown::render(system.file("rmarkdown/qc/BarcodeRankDrops.Rmd", package="singleCellTK"), params = list(
114 115
       object=inSCE), output_file = output_file, output_dir = output_dir)
... ...
@@ -152,4 +153,4 @@ reportQCTool <- function(inSCE, algorithm=c("BarcodeRankDrops",
152 153
  }
153 154
 
154 155
 
155
-    
156
+
156 157
new file mode 100644
... ...
@@ -0,0 +1,156 @@
1
+
2
+# dir <- "genecount"
3
+.constructSCEFromBUStoolsOutputs <- function(dir,
4
+    sample,
5
+    matrixFileName,
6
+    featuresFileName,
7
+    barcodesFileName,
8
+    gzipped,
9
+    class,
10
+    delayedArray) {
11
+
12
+    cb <- .readBarcodes(file.path(dir, barcodesFileName))
13
+    fe <- .readFeatures(file.path(dir, featuresFileName))
14
+    ma <- .readMatrixMM(file.path(dir, matrixFileName),
15
+        gzipped = gzipped,
16
+        class = class,
17
+        delayedArray = delayedArray)
18
+    ma <- t(ma)
19
+
20
+    coln <- paste(sample, cb[[1]], sep = "_")
21
+    rownames(ma) <- fe[[1]]
22
+
23
+    sce <- SingleCellExperiment::SingleCellExperiment(
24
+        assays = list(counts = ma))
25
+    SummarizedExperiment::rowData(sce) <- fe
26
+    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb,
27
+        column_name = coln,
28
+        sample = sample,
29
+        row.names = coln)
30
+
31
+    return(sce)
32
+}
33
+
34
+
35
+# main function
36
+.importBUStools <- function(
37
+    BUStoolsDirs,
38
+    samples,
39
+    matrixFileNames,
40
+    featuresFileNames,
41
+    barcodesFileNames,
42
+    gzipped,
43
+    class,
44
+    delayedArray) {
45
+
46
+    if (length(BUStoolsDirs) != length(samples)) {
47
+        stop("'BUStoolsDirs' and 'samples' have unequal lengths!")
48
+    }
49
+
50
+    res <- vector("list", length = length(samples))
51
+
52
+    matrixFileNames <- .getVectorized(matrixFileNames, length(samples))
53
+    featuresFileNames <- .getVectorized(featuresFileNames, length(samples))
54
+    barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples))
55
+    gzipped <- .getVectorized(gzipped, length(samples))
56
+
57
+    for (i in seq_along(samples)) {
58
+        dir <- file.path(BUStoolsDirs[i])
59
+        scei <- .constructSCEFromBUStoolsOutputs(dir,
60
+            sample = samples[i],
61
+            matrixFileName = matrixFileNames[i],
62
+            featuresFileName = featuresFileNames[i],
63
+            barcodesFileName = barcodesFileNames[i],
64
+            gzipped = gzipped[i],
65
+            class = class,
66
+            delayedArray = delayedArray)
67
+        res[[i]] <- scei
68
+    }
69
+
70
+    sce <- do.call(SingleCellExperiment::cbind, res)
71
+    return(sce)
72
+}
73
+
74
+
75
+#' @name importBUStools
76
+#' @rdname importBUStools
77
+#' @title Construct SCE object from BUStools output
78
+#' @description Read the barcodes, features (genes), and matrix from BUStools
79
+#'  output. Import them
80
+#'  as one \link[SingleCellExperiment]{SingleCellExperiment} object. Note the
81
+#'  cells in the output files for BUStools 0.39.4 are not filtered.
82
+#' @param BUStoolsDirs A vector of paths to BUStools output files. Each sample
83
+#'  should have its own path. For example: \code{./genecount}.
84
+#'  Must have the same length as \code{samples}.
85
+#' @param samples A vector of user-defined sample names for the samples to be
86
+#'  imported. Must have the same length as \code{BUStoolsDirs}.
87
+#' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse
88
+#'  matrix files (.mtx files). Must have length 1 or the same
89
+#'  length as \code{samples}.
90
+#' @param featuresFileNames Filenames for the feature annotation files.
91
+#'  Must have length 1 or the same length as \code{samples}.
92
+#' @param barcodesFileNames Filenames for the cell barcode list file.
93
+#'  Must have length 1 or the same length as \code{samples}.
94
+#' @param gzipped Boolean. \code{TRUE} if the BUStools output files
95
+#'  (barcodes.txt, genes.txt, and genes.mtx) were
96
+#'  gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in BUStools
97
+#'  0.39.4. Default \code{"auto"} which automatically detects if the
98
+#'  files are gzip compressed. Must have length 1 or the same length as
99
+#'  \code{samples}.
100
+#' @param class Character. The class of the expression matrix stored in the SCE
101
+#'  object. Can be one of "Matrix" (as returned by
102
+#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
103
+#'  \link[base]{matrix} function). Default "Matrix".
104
+#' @param delayedArray Boolean. Whether to read the expression matrix as
105
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
106
+#' @return A \code{SingleCellExperiment} object containing the count
107
+#'  matrix, the gene annotation, and the cell annotation.
108
+#' @examples
109
+#' # Example #1
110
+#' # FASTQ files were downloaded from
111
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0
112
+#' # /pbmc_1k_v3
113
+#' # They were concatenated as follows:
114
+#' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz >
115
+#' # pbmc_1k_v3_R1.fastq.gz
116
+#' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz >
117
+#' # pbmc_1k_v3_R2.fastq.gz
118
+#' # The following BUStools command generates the gene, cell, and
119
+#' # matrix files
120
+#'
121
+#' # bustools correct -w ./3M-february-2018.txt -p output.bus | \
122
+#' #   bustools sort -T tmp/ -t 4 -p - | \
123
+#' #   bustools count -o genecount/genes \
124
+#' #     -g ./transcripts_to_genes.txt \
125
+#' #     -e matrix.ec \
126
+#' #     -t transcripts.txt \
127
+#' #     --genecounts -
128
+#'
129
+#' # The top 20 genes and the first 20 cells are included in this example.
130
+#' sce <- importBUStools(
131
+#'   BUStoolsDirs = system.file("extdata/BUStools_PBMC_1k_v3_20x20/genecount/",
132
+#'     package = "singleCellTK"),
133
+#'   samples = "PBMC_1k_v3_20x20")
134
+#' @export
135
+importBUStools <- function(
136
+    BUStoolsDirs,
137
+    samples,
138
+    matrixFileNames = "genes.mtx",
139
+    featuresFileNames = "genes.genes.txt",
140
+    barcodesFileNames = "genes.barcodes.txt",
141
+    gzipped = "auto",
142
+    class = c("Matrix", "matrix"),
143
+    delayedArray = TRUE) {
144
+
145
+    class <- match.arg(class)
146
+
147
+    .importBUStools(
148
+        BUStoolsDirs = BUStoolsDirs,
149
+        samples = samples,
150
+        matrixFileNames = matrixFileNames,
151
+        featuresFileNames = featuresFileNames,
152
+        barcodesFileNames = barcodesFileNames,
153
+        gzipped = gzipped,
154
+        class = class,
155
+        delayedArray = delayedArray)
156
+}
0 157
new file mode 100644
... ...
@@ -0,0 +1,285 @@
1
+
2
+#' @importFrom reticulate import
3
+.readMatrixNpz <- function(matrixLocation,
4
+  colIndexLocation,
5
+  rowIndexLocation,
6
+  class,
7
+  delayedArray) {
8
+
9
+  ## Now importing these functions in 'reticulate_setup.R' file
10
+  #  sparse <- reticulate::import("scipy.sparse")
11
+  #  numpy <- reticulate::import("numpy")
12
+  if (!reticulate::py_module_available(module = "scipy.sparse")) {
13
+    stop("Error!", "Cannot find python module 'scipy.sparse', please install Conda and run sctkPythonInstallConda() 
14
+         or run sctkPythonInstallVirtualEnv(). If one of these have been previously run to install the modules,
15
+         make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), respectively, if R has been
16
+         restarted since the module installation. Alternatively, scipy can be installed on the local machine
17
+         with pip (e.g. pip install scipy) and then the 'use_python()' function from the 'reticulate' package
18
+         can be used to select the correct Python environment.")
19
+  }
20
+  if (!reticulate::py_module_available(module = "numpy")) {
21
+    stop("Error!", "Cannot find python module 'numpy', please install Conda and run sctkPythonInstallConda() 
22
+         or run sctkPythonInstallVirtualEnv(). If one of these have been previously run to install the modules,
23
+         make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), respectively, if R has been
24
+         restarted since the module installation. Alternatively, numpy can be installed on the local machine
25
+         with pip (e.g. pip install numpy) and then the 'use_python()' function from the 'reticulate' package
26
+         can be used to select the correct Python environment.")
27
+  }
28
+  
29
+  error <- try({
30
+    mat <- sparse$load_npz(matrixLocation)
31
+    colIndex <- as.vector(numpy$load(colIndexLocation, allow_pickle = TRUE))
32
+    rowIndex <- as.vector(numpy$load(rowIndexLocation, allow_pickle = TRUE))
33
+    colnames(mat) <- colIndex
34
+    rownames(mat) <- rowIndex
35
+    mat <- t(mat)
36
+
37
+    ## Convert to "dgCMatrix"
38
+    newM <- Matrix::Matrix(mat[,1], nrow=nrow(mat))
39
+    newM <- methods::as(newM, "dgCMatrix")
40
+    breaks <- seq(2, ncol(mat), by=1000)
41
+    if(length(breaks) > 2) {
42
+      for(i in seq(2, length(breaks))) {
43
+        ix <- seq(breaks[i-1], (breaks[i]-1))
44
+        newM <- cbind(newM, mat[,ix])
45
+      }
46
+      ix <- seq(utils::tail(breaks, n = 1), ncol(mat))
47
+      newM <- cbind(newM, mat[,ix])
48
+    } else {
49
+      ix <- seq(2, ncol(mat))
50
+      newM <- cbind(newM, mat[,ix])
51
+    }
52
+
53
+    colnames(newM) <- colnames(mat)
54
+    rownames(newM) <- rownames(mat)
55
+    mat <- newM
56
+  }, silent = TRUE)
57
+  
58
+  if(inherits(error, "try-error")) {
59
+    stop(paste0("importOptimus did not complete successfully. SCE could not be generated. Error given during the import process: \n\n", error))
60
+  }
61
+  
62
+  if (class == "matrix") {
63
+    mat <- as.matrix(mat)
64
+  }
65
+
66
+  if (isTRUE(delayedArray)) {
67
+    mat <- DelayedArray::DelayedArray(mat)
68
+  }
69
+  return(mat)
70
+}
71
+
72
+
73
+.readMetricsOptimus <- function(path) {
74
+  metrics <- data.table::fread(path)
75
+  return(metrics)
76
+}
77
+
78
+
79
+.readEmptyDrops <- function(path) {
80
+  emptyDrops <- data.table::fread(path)
81
+  colnames(emptyDrops) <- paste0("dropletUtils_emptyDrops_",
82
+    colnames(emptyDrops))
83
+  return(emptyDrops)
84
+}
85
+
86
+
87
+.combineColData <- function(colnames, cellMetrics, emptyDrops) {
88
+  cd <- data.table::data.table(CellId = colnames)
89
+  cd <- merge(cd,
90
+    cellMetrics,
91
+    by.x = "CellId",
92
+    by.y = "V1",
93
+    all.x = TRUE,
94
+    all.y = FALSE,
95
+    sort = FALSE)
96
+
97
+  if (!is.null(emptyDrops)) {
98
+    cd <- merge(cd,
99
+      emptyDrops,
100
+      by.x = "CellId",
101
+      by.y = "dropletUtils_emptyDrops_CellId",
102
+      all.x = TRUE,
103
+      all.y = FALSE,
104
+      sort = FALSE)
105
+  }
106
+
107
+  return(cd)
108
+}
109
+
110
+
111
+.combineRowData <- function(rownames, geneMetrics) {
112
+  rd <- data.table::data.table(feature_ID = rownames)
113
+  rd <- merge(rd,
114
+    geneMetrics,
115
+    by.x = "feature_ID",
116
+    by.y = "V1",
117
+    all.x = TRUE,
118
+    all.y = FALSE,
119
+    sort = FALSE)
120
+  return(rd)
121
+}
122
+
123
+
124
+.constructSCEFromOptimusOutputs <- function(dir,
125
+  sample,
126
+  matrixLocation,
127
+  colIndexLocation,
128
+  rowIndexLocation,
129
+  cellMetricsLocation,
130
+  geneMetricsLocation,
131
+  emptyDropsLocation,
132
+  class,
133
+  delayedArray) {
134
+
135
+  mat <- .readMatrixNpz(file.path(dir, matrixLocation),
136
+    file.path(dir, colIndexLocation),
137
+    file.path(dir, rowIndexLocation),
138
+    class,
139
+    delayedArray)
140
+
141
+  cellMetrics <- .readMetricsOptimus(file.path(dir, cellMetricsLocation))
142
+  geneMetrics <- .readMetricsOptimus(file.path(dir, geneMetricsLocation))
143
+
144
+  if (!is.null(geneMetricsLocation)) {
145
+    emptyDrops <- .readEmptyDrops(file.path(dir, emptyDropsLocation))
146
+  }
147
+
148
+  cd <- .combineColData(colnames(mat), cellMetrics, emptyDrops)
149
+  rd <- .combineRowData(rownames(mat), geneMetrics)
150
+
151
+  coln <- paste(sample, colnames(mat), sep = "_")
152
+
153
+  sce <- SingleCellExperiment::SingleCellExperiment(
154
+    assays = list(counts = mat))
155
+  SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cd,
156
+    column_name = coln,
157
+    sample = sample,
158
+    row.names = coln)
159
+  SummarizedExperiment::rowData(sce) <- S4Vectors::DataFrame(rd)
160
+
161
+  return(sce)
162
+}
163
+
164
+
165
+.checkArgsImportOptimus <- function(OptimusDirs,
166
+  samples) {
167
+
168
+  if (length(OptimusDirs) != length(samples)) {
169
+    stop("'OptimusDirs' and 'samples' have unequal lengths!")
170
+  }
171
+}
172
+
173
+
174
+.importOptimus <- function(OptimusDirs,
175
+  samples,
176
+  matrixLocation,
177
+  colIndexLocation,
178
+  rowIndexLocation,
179
+  cellMetricsLocation,
180
+  geneMetricsLocation,
181
+  emptyDropsLocation,
182
+  class,
183
+  delayedArray) {
184
+
185
+  .checkArgsImportOptimus(OptimusDirs, samples)
186
+
187
+  res <- vector("list", length = length(samples))
188
+
189
+  for (i in seq_along(samples)) {
190
+    scei <- .constructSCEFromOptimusOutputs(OptimusDirs[i],
191
+      sample = samples[i],
192
+      matrixLocation = matrixLocation,
193
+      colIndexLocation = colIndexLocation,
194
+      rowIndexLocation = rowIndexLocation,
195
+      cellMetricsLocation = cellMetricsLocation,
196
+      geneMetricsLocation = geneMetricsLocation,
197
+      emptyDropsLocation = emptyDropsLocation,
198
+      class = class,
199
+      delayedArray = delayedArray)
200
+    res[[i]] <- scei
201
+  }
202
+
203
+  sce <- do.call(SingleCellExperiment::cbind, res)
204
+  return(sce)
205
+}
206
+
207
+
208
+#' @name importOptimus
209
+#' @rdname importOptimus
210
+#' @title Construct SCE object from Optimus output
211
+#' @description Read the barcodes, features (genes), and matrices from Optimus
212
+#'  outputs. Import them
213
+#'  as one \link[SingleCellExperiment]{SingleCellExperiment} object.
214
+#' @param OptimusDirs A vector of root directories of Optimus output files.
215
+#'  The paths should be something like this:
216
+#'  \code{/PATH/TO/bb4a2a5e-ff34-41b6-97d2-0c0c0c534530}.
217
+#'  Each entry in \code{OptimusDirs} is considered a sample and should have
218
+#'  its own path. Must have the same length as \code{samples}.
219
+#' @param samples A vector of user-defined sample names for the sample to be
220
+#'  imported. Must have the same length as \code{OptimusDirs}.
221
+#' @param matrixLocation Character. It is the intermediate
222
+#'  path to the filtered count maxtrix file saved in sparse matrix format
223
+#'  (\code{.npz}). Default
224
+#'  \code{call-MergeCountFiles/sparse_counts.npz} which works for
225
+#'  optimus_v1.4.0.
226
+#' @param colIndexLocation Character. The intermediate path to the barcode
227
+#'  index file. Default \code{call-MergeCountFiles/sparse_counts_col_index.npy}.
228
+#' @param rowIndexLocation Character. The intermediate path to the feature
229
+#'  (gene) index file. Default
230
+#'  \code{call-MergeCountFiles/sparse_counts_row_index.npy}.
231
+#' @param cellMetricsLocation Character. It is the intermediate
232
+#'  path to the cell metrics file (\code{merged-cell-metrics.csv.gz}). Default
233
+#'  \code{call-MergeCellMetrics/merged-cell-metrics.csv.gz} which works for
234
+#'  optimus_v1.4.0.
235
+#' @param geneMetricsLocation Character. It is the intermediate
236
+#'  path to the feature (gene) metrics file (\code{merged-gene-metrics.csv.gz}).
237
+#'  Default \code{call-MergeGeneMetrics/merged-gene-metrics.csv.gz} which works
238
+#'  for optimus_v1.4.0.
239
+#' @param emptyDropsLocation Character. It is the intermediate
240
+#'  path to \link[DropletUtils]{emptyDrops} metrics file
241
+#'  (\code{empty_drops_result.csv}).
242
+#'  Default \code{call-RunEmptyDrops/empty_drops_result.csv} which works for
243
+#'  optimus_v1.4.0.
244
+#' @param class Character. The class of the expression matrix stored in the SCE
245
+#'  object. Can be one of "Matrix" (as returned by
246
+#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
247
+#'  \link[base]{matrix} function). Default "Matrix".
248
+#' @param delayedArray Boolean. Whether to read the expression matrix as
249
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
250
+#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object
251
+#'  containing the count
252
+#'  matrix, the gene annotation, and the cell annotation.
253
+#' @examples
254
+#' \dontrun{
255
+#' sce <- importOptimus(OptimusDirs =
256
+#'   system.file("extdata/Optimus_20x1000",
257
+#'   package = "singleCellTK"),
258
+#'   samples = "Optimus_20x1000")
259
+#' }
260
+#' @export
261
+importOptimus <- function(OptimusDirs,
262
+  samples,
263
+  matrixLocation = "call-MergeCountFiles/sparse_counts.npz",
264
+  colIndexLocation = "call-MergeCountFiles/sparse_counts_col_index.npy",
265
+  rowIndexLocation = "call-MergeCountFiles/sparse_counts_row_index.npy",
266
+  cellMetricsLocation = "call-MergeCellMetrics/merged-cell-metrics.csv.gz",
267
+  geneMetricsLocation = "call-MergeGeneMetrics/merged-gene-metrics.csv.gz",
268
+  emptyDropsLocation = "call-RunEmptyDrops/empty_drops_result.csv",
269
+  class = c("Matrix", "matrix"),
270
+  delayedArray = TRUE) {
271
+
272
+  class <- match.arg(class)
273
+
274
+  .importOptimus(OptimusDirs = OptimusDirs,
275
+    samples = samples,
276
+    matrixLocation = matrixLocation,
277
+    colIndexLocation = colIndexLocation,
278
+    rowIndexLocation = rowIndexLocation,
279
+    cellMetricsLocation = cellMetricsLocation,
280
+    geneMetricsLocation = geneMetricsLocation,
281
+    emptyDropsLocation = emptyDropsLocation,
282
+    class = class,
283
+    delayedArray = delayedArray)
284
+
285
+}
0 286
new file mode 100644
... ...
@@ -0,0 +1,168 @@
1
+
2
+# dir <- "outs/filtered_feature_bc_matrix/"
3
+.constructSCEFromSTARsoloOutputs <- function(dir,
4
+    sample,
5
+    matrixFileName,
6
+    featuresFileName,
7
+    barcodesFileName,
8
+    gzipped,
9
+    class,
10
+    delayedArray) {
11
+
12
+    cb <- .readBarcodes(file.path(dir, barcodesFileName))
13
+    fe <- .readFeatures(file.path(dir, featuresFileName))
14
+    ma <- .readMatrixMM(file.path(dir, matrixFileName),
15
+        gzipped = gzipped,
16
+        class = class,
17
+        delayedArray = delayedArray)
18
+
19
+    coln <- paste(sample, cb[[1]], sep = "_")
20
+    rownames(ma) <- fe[[1]]
21
+
22
+    sce <- SingleCellExperiment::SingleCellExperiment(
23
+        assays = list(counts = ma))
24
+    SummarizedExperiment::rowData(sce) <- fe
25
+    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb,
26
+        column_name = coln,
27
+        sample = sample,
28
+        row.names = coln)
29
+
30
+    return(sce)
31
+}
32
+
33
+
34
+# main function
35
+.importSTARsolo <- function(STARsoloDirs,
36
+    samples,
37
+    STARsoloOuts,
38
+    matrixFileNames,
39
+    featuresFileNames,
40
+    barcodesFileNames,
41
+    gzipped,
42
+    class,
43
+    delayedArray) {
44
+
45
+    if (length(STARsoloDirs) != length(samples)) {
46
+        stop("'STARsoloDirs' and 'samples' have unequal lengths!")
47
+    }
48
+
49
+    res <- vector("list", length = length(samples))
50
+
51
+    STARsoloOuts <- .getVectorized(STARsoloOuts, length(samples))
52
+    matrixFileNames <- .getVectorized(matrixFileNames, length(samples))
53
+    featuresFileNames <- .getVectorized(featuresFileNames, length(samples))
54
+    barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples))
55
+    gzipped <- .getVectorized(gzipped, length(samples))
56
+
57
+    for (i in seq_along(samples)) {
58
+        dir <- file.path(STARsoloDirs[i], STARsoloOuts[i])
59
+        scei <- .constructSCEFromSTARsoloOutputs(dir,
60
+            sample = samples[i],
61
+            matrixFileName = matrixFileNames[i],
62
+            featuresFileName = featuresFileNames[i],
63
+            barcodesFileName = barcodesFileNames[i],
64
+            gzipped = gzipped[i],
65
+            class = class,
66
+            delayedArray = delayedArray)
67
+        res[[i]] <- scei
68
+    }
69
+
70
+    sce <- do.call(SingleCellExperiment::cbind, res)
71
+    return(sce)
72
+}
73
+
74
+
75
+#' @name importSTARsolo
76
+#' @rdname importSTARsolo
77
+#' @title Construct SCE object from STARsolo outputs
78
+#' @description Read the barcodes, features (genes), and matrices from STARsolo
79
+#'  outputs. Import them
80
+#'  as one \link[SingleCellExperiment]{SingleCellExperiment} object.
81
+#' @param STARsoloDirs A vector of root directories of STARsolo output files.
82
+#'  The paths should be something like this:
83
+#'  \bold{/PATH/TO/\emph{prefix}Solo.out}. For example: \code{./Solo.out}.
84
+#'  Each sample should have its own path. Must have the same length as
85
+#'  \code{samples}.
86
+#' @param samples A vector of user-defined sample names for the sample to be
87
+#'  imported. Must have the same length as \code{STARsoloDirs}.
88
+#' @param STARsoloOuts Character vector. The intermediate
89
+#'  paths to filtered or raw cell barcode, feature, and matrix files
90
+#'  for each of \code{samples}. Default \code{"Gene/filtered"}  which works
91
+#'  for STAR 2.7.3a. Must have length 1 or the same
92
+#'  length as \code{samples}.
93
+#' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse
94
+#'  matrix file (.mtx file). Must have length 1 or the same
95
+#'  length as \code{samples}.
96
+#' @param featuresFileNames Filenames for the feature annotation file.
97
+#'  Must have length 1 or the same
98
+#'  length as \code{samples}.
99
+#' @param barcodesFileNames Filenames for the cell barcode list file.
100
+#'  Must have length 1 or the same
101
+#'  length as \code{samples}.
102
+#' @param gzipped Boolean. \code{TRUE} if the STARsolo output files
103
+#'  (barcodes.tsv, features.tsv, and matrix.mtx) were
104
+#'  gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in STAR
105
+#'  2.7.3a. Default \code{"auto"} which automatically detects if the
106
+#'  files are gzip compressed. Must have length 1 or the same
107
+#'  length as \code{samples}.
108
+#' @param class Character. The class of the expression matrix stored in the SCE
109
+#'  object. Can be one of "Matrix" (as returned by
110
+#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
111
+#'  \link[base]{matrix} function). Default "Matrix".
112
+#' @param delayedArray Boolean. Whether to read the expression matrix as
113
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
114
+#' @return A \code{SingleCellExperiment} object containing the count
115
+#'  matrix, the gene annotation, and the cell annotation.
116
+#' @examples
117
+#' # Example #1
118
+#' # FASTQ files were downloaded from
119
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0
120
+#' # /pbmc_1k_v3
121
+#' # They were concatenated as follows:
122
+#' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz >
123
+#' # pbmc_1k_v3_R1.fastq.gz
124
+#' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz >
125
+#' # pbmc_1k_v3_R2.fastq.gz
126
+#' # The following STARsolo command generates the filtered feature, cell, and
127
+#' # matrix files
128
+#' # STAR \
129
+#' #   --genomeDir ./index \
130
+#' #   --readFilesIn ./pbmc_1k_v3_R2.fastq.gz \
131
+#' #                 ./pbmc_1k_v3_R1.fastq.gz \
132
+#' #   --readFilesCommand zcat \
133
+#' #   --outSAMtype BAM Unsorted \
134
+#' #   --outBAMcompression -1 \
135
+#' #   --soloType CB_UMI_Simple \
136
+#' #   --soloCBwhitelist ./737K-august-2016.txt \
137
+#' #   --soloUMIlen 12
138
+#'
139
+#' # The top 20 genes and the first 20 cells are included in this example.
140
+#' sce <- importSTARsolo(
141
+#'   STARsoloDirs = system.file("extdata/STARsolo_PBMC_1k_v3_20x20",
142
+#'     package = "singleCellTK"),
143
+#'   samples = "PBMC_1k_v3_20x20")
144
+#' @export
145
+importSTARsolo <- function(
146
+    STARsoloDirs,
147
+    samples,
148
+    STARsoloOuts = "Gene/filtered",
149
+    matrixFileNames = "matrix.mtx",
150
+    featuresFileNames = "features.tsv",
151
+    barcodesFileNames = "barcodes.tsv",
152
+    gzipped = "auto",
153
+    class = c("Matrix", "matrix"),
154
+    delayedArray = TRUE) {
155
+
156
+    class <- match.arg(class)
157
+
158
+    .importSTARsolo(
159
+        STARsoloDirs = STARsoloDirs,
160
+        samples = samples,
161
+        STARsoloOuts = STARsoloOuts,
162
+        matrixFileNames = matrixFileNames,
163
+        featuresFileNames = featuresFileNames,
164
+        barcodesFileNames = barcodesFileNames,
165
+        gzipped = gzipped,
166
+        class = class,
167
+        delayedArray = delayedArray)
168
+}
0 169
new file mode 100644
... ...
@@ -0,0 +1,236 @@
1
+.constructSCEFromSeqcOutputs <- function(
2
+    sampleName,
3
+    matrix,
4
+    features,
5
+    barcodes) {
6
+
7
+    coln <- paste(sampleName, barcodes[[1]], sep = "_")
8
+    rownames(matrix) <- features[[1]]
9
+
10
+    sce <- SingleCellExperiment::SingleCellExperiment(
11
+        assays = list(counts = matrix))
12
+    SummarizedExperiment::rowData(sce) <- features
13
+    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
14
+        cell_barcode = as.character(barcodes[[1]]),
15
+        column_name = coln,
16
+        sample = sampleName,
17
+        row.names = coln)
18
+
19
+    return(sce)
20
+}
21
+
22
+
23
+.unionGeneMatrix <- function(geneUnion, matrix){
24
+    missGene <- geneUnion[!geneUnion %in% rownames(matrix)]
25
+    missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
26
+        dimnames = list(missGene, NULL))
27
+
28
+    matb <- methods::as(matrix, "dgCMatrix")
29
+    rownames(matb) <- rownames(matrix)
30
+
31
+    mat <- rbind(matb, missMat)
32
+    if (anyDuplicated(rownames(mat))) {
33
+        mat <- mat[!duplicated(rownames(mat)), ]
34
+        warning("Duplicated genes exist in count matrix. Filtered",
35
+            " duplicated genes.")
36
+    }
37
+    return(mat)
38
+}
39
+
40
+
41
+.getGeneUnion <- function(geneList){
42
+    gene <- geneList
43
+    for (i in seq_along(geneList)){
44
+        gene[[i]] <- geneList[[i]][[1]]
45
+    }
46
+
47
+    geneUnion <- base::Reduce(union, gene)
48
+    return(geneUnion)
49
+}
50
+
51
+
52
+.readBarcodesSEQC <- function(path) {
53
+    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
54
+    res <- res[,-1,drop = FALSE]
55
+    colnames(res) <- "cell_barcode"
56
+    return(res)
57
+}
58
+
59
+
60
+.readFeaturesSEQC <- function(path) {
61
+    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
62
+    res <- res[,-1,drop = FALSE]
63
+    colnames(res) <- "feature_name"
64
+    return(res)
65
+}
66
+
67
+
68
+.importSEQC <- function(
69
+    seqcDirs,
70
+    samples,
71
+    prefix,
72
+    gzipped,
73
+    class,
74
+    delayedArray,
75
+    cbNotFirstCol,
76
+    feNotFirstCol,
77
+    combinedSample) {
78
+
79
+    if (length(seqcDirs) != length(samples)) {
80
+        stop("'seqcDirs' and 'samples' have unequal lengths!")
81
+    }
82
+
83
+    if (length(seqcDirs) != length(prefix)) {
84
+        stop("'seqcDirs' and 'prefix' have unequal lengths!")
85
+    }
86
+
87
+    res <- vector("list", length = length(seqcDirs))
88
+    cb <- vector("list", length = length(seqcDirs))
89
+    fe <- vector("list", length = length(seqcDirs))
90
+    mat <- vector("list", length = length(seqcDirs))
91
+
92
+    for (i in seq_along(seqcDirs)) {
93
+        dir <- seqcDirs[i]
94
+        matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_")
95
+        featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
96
+        barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv',
97
+            sep = "_")
98
+
99
+        cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile))
100
+        fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile))
101
+
102
+        mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
103
+            gzipped = gzipped, class = class, delayedArray = delayedArray)
104
+        mat[[i]] <- t(mat[[i]])
105
+        rownames(mat[[i]]) <- fe[[i]][[1]]
106
+    }
107
+
108
+    if (isTRUE(combinedSample) & length(seqcDirs) > 1) {
109
+        geneUnion <- .getGeneUnion(fe)
110
+        for (i in seq_along(seqcDirs)) {
111
+            matrix <- .unionGeneMatrix(geneUnion = geneUnion, matrix = mat[[i]])
112
+            matrix <- matrix[geneUnion, ]
113
+            feature <- S4Vectors::DataFrame('feature_name' = rownames(matrix))
114
+
115
+            scei <- .constructSCEFromSeqcOutputs(
116
+                sampleName = samples[i],
117
+                matrix = matrix,
118
+                features = feature,
119
+                barcodes = cb[[i]])
120
+            res[[i]] <- scei
121
+
122
+        }
123
+        sce <- do.call(SingleCellExperiment::cbind, res)
124
+        return(sce)
125
+
126
+    } else {
127
+        for (i in seq_along(seqcDirs)) {
128
+            scei <- .constructSCEFromSeqcOutputs(
129
+                sampleName = samples[i],
130
+                matrix = mat[[i]],
131
+                features = fe[[i]],
132
+                barcodes = cb[[i]])
133
+            res[[i]] <- scei
134
+        }
135
+        if (length(seqcDirs) == 1) {
136
+            return(res[[1]])
137
+        } else {
138
+            return(res)
139
+        }
140
+    }
141
+}
142
+
143
+
144
+#' @name importSEQC
145
+#' @rdname importSEQC
146
+#' @title Construct SCE object from seqc output
147
+#' @description Read the filtered barcodes, features, and matrices for all
148
+#'  samples from (preferably a single run of) seqc output. Import and
149
+#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
150
+#'  object.
151
+#' @param seqcDirs A vector of paths to seqc output files. Each sample
152
+#'  should have its own path. For example: \code{./pbmc_1k_50x50}.
153
+#'  Must have the same length as \code{samples}.
154
+#' @param samples A vector of user-defined sample names for the samples to be
155
+#'  imported. Must have the same length as \code{seqcDirs}.
156
+#' @param prefix A vector containing the prefix of file names within each
157
+#'  sample directory. It cannot be null and the vector should have the same
158
+#'  length as \emph{samples}.
159
+#' @param gzipped Boolean. \code{TRUE} if the seqc output files
160
+#'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
161
+#'  sparse_molecule_counts.mtx)
162
+#'  were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
163
+#'  not gzipped.
164
+#' Default \code{FALSE}.
165
+#' @param class Character. The class of the expression matrix stored in the SCE
166
+#'  object. Can be one of "Matrix" (as returned by
167
+#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
168
+#'  \link[base]{matrix} function). Default "Matrix".
169
+#' @param delayedArray Boolean. Whether to read the expression matrix as
170
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
171
+#' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172
+#'  sparse_counts_genes.csv
173
+#' is row index and it will be removed. \code{FALSE} the first column will
174
+#'  be kept.
175
+#' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
176
+#'  sparse_counts_barcode.csv
177
+#' is row index and it will be removed. \code{FALSE} the first column will
178
+#'  be kept.
179
+#' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
180
+#' \code{SingleCellExperiment} object containing the combined count matrix,
181
+#'  feature annotations
182
+#'  and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
183
+#'  list containing multiple
184
+#'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
185
+#'  contains count matrix, feature annotations and cell annotations for
186
+#'  each sample.
187
+#' @details
188
+#' \code{importSEQC} imports output from seqc.
189
+#'  The default sparse_counts_barcode.csv or sparse_counts_genes.csv from
190
+#'  seqc output
191
+#'  contains two columns. The first column is row index and the second column
192
+#'  is cell-barcode
193
+#'  or gene symbol. \code{importSEQC} will remove first column. Alternatively,
194
+#'  user can call
195
+#'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
196
+#'  column of these files.
197
+#'  When \code{combinedSample} is TRUE, \code{importSEQC} will combined count
198
+#'  matrix with genes detected in at least one sample.
199
+#' @return A \code{SingleCellExperiment} object containing the combined count
200
+#'  matrix, the feature annotations, and the cell annotation.
201
+#' @examples
202
+#' # Example #1
203
+#' # The following filtered feature, cell, and matrix files were downloaded from
204
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/
205
+#' # 3.0.0/pbmc_1k_v3
206
+#' # The top 50 hg38 genes are included in this example.
207
+#' # Only the top 50 cells are included.
208
+#' sce <- importSEQC(
209
+#'     seqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"),
210
+#'     samples = "pbmc_1k_50x50",
211
+#'     prefix = "pbmc_1k",
212
+#'     combinedSample = FALSE)
213
+#' @export
214
+importSEQC <- function(
215
+    seqcDirs = NULL,
216
+    samples = NULL,
217
+    prefix = NULL,
218
+    gzipped = FALSE,
219
+    class = c("Matrix", "matrix"),
220
+    delayedArray = TRUE,
221
+    cbNotFirstCol = TRUE,
222
+    feNotFirstCol = TRUE,
223
+    combinedSample = TRUE) {
224
+
225
+    class <- match.arg(class)
226
+
227
+    .importSEQC(seqcDirs = seqcDirs,
228
+        samples = samples,
229
+        prefix = prefix,
230
+        gzipped = gzipped,
231
+        class = class,
232
+        delayedArray = delayedArray,
233
+        cbNotFirstCol = cbNotFirstCol,
234
+        feNotFirstCol = feNotFirstCol,
235
+        combinedSample = combinedSample)
236
+}
... ...
@@ -49,68 +49,6 @@ summarizeSCE <- function(inSCE, useAssay = NULL, sampleVariableName = NULL){
49 49
   return(df)
50 50
 }
51 51
 
52
-
53
-#' Filter Genes and Samples from a Single Cell Object
54
-#'
55
-#' @param inSCE Input \linkS4class{SingleCellExperiment} object. Required
56
-#' @param useAssay Indicate which assay to use for filtering. Default is
57
-#' "counts"
58
-#' @param deletesamples List of samples to delete from the object.
59
-#' @param removeNoExpress Remove genes that have no expression across all
60
-#' samples. The default is true
61
-#' @param removeBottom Fraction of low expression genes to remove from the
62
-#' single cell object. This occurs after removeNoExpress. The default is 0.50.
63
-#' @param minimumDetectGenes Minimum number of genes with at least 1
64
-#' count to include a sample in the single cell object. The default is 1700.
65
-#' @param filterSpike Apply filtering to Spike in controls (indicated by
66
-#' isSpike).
67
-#' The default is TRUE.
68
-#'
69
-#' @return The filtered single cell object.
70
-#' @export
71
-#' @examples
72
-#' data("mouseBrainSubsetSCE")
73
-#' mouseBrainSubsetSCE <- filterSCData(mouseBrainSubsetSCE,
74
-#'                                     deletesamples="X1772063061_G11")
75
-filterSCData <- function(inSCE, useAssay="counts", deletesamples=NULL,
76
-                         removeNoExpress=TRUE, removeBottom=0.5,
77
-                         minimumDetectGenes=1700, filterSpike=TRUE){
78
-  #delete specified samples
79
-  inSCE <- inSCE[, !(colnames(inSCE) %in% deletesamples)]
80
-
81
-  if (filterSpike){
82
-    nkeeprows <- ceiling((1 - removeBottom) * as.numeric(nrow(inSCE)))
83
-    tokeeprow <- order(rowSums(SummarizedExperiment::assay(inSCE, useAssay)),
84
-                       decreasing = TRUE)[seq_len(nkeeprows)]
85
-  } else {
86
-    nkeeprows <- ceiling((1 - removeBottom) * as.numeric(nrow(inSCE))) -
87
-      sum(SingleCellExperiment::isSpike(inSCE))
88
-    tokeeprow <- order(rowSums(SummarizedExperiment::assay(inSCE, useAssay)),
89
-                       decreasing = TRUE)
90
-    tokeeprow <- setdiff(tokeeprow,
91
-                         which(SingleCellExperiment::isSpike(inSCE)))
92
-    tokeeprow <- tokeeprow[seq_len(nkeeprows)]
93
-    tokeeprow <- c(tokeeprow, which(SingleCellExperiment::isSpike(inSCE)))
94
-  }
95
-  tokeepcol <- colSums(SummarizedExperiment::assay(inSCE, useAssay) != 0) >=
96
-    minimumDetectGenes
97
-  inSCE <- inSCE[tokeeprow, tokeepcol]
98
-
99
-  #remove genes with no expression
100
-  if (removeNoExpress){
101
-    if (filterSpike){
102
-      inSCE <- inSCE[rowSums(SummarizedExperiment::assay(inSCE,
103
-                                                              useAssay)) != 0, ]
104
-    } else {
105
-      inSCE <- inSCE[(rowSums(
106
-        SummarizedExperiment::assay(inSCE, useAssay)) != 0 |
107
-          SingleCellExperiment::isSpike(inSCE)), ]
108
-    }
109
-  }
110
-
111
-  return(inSCE)
112
-}
113
-
114 52
 #' Generate a distinct palette for coloring different clusters
115 53
 #'
116 54
 #' @param n Integer; Number of colors to generate
... ...
@@ -168,7 +106,7 @@ distinctColors <- function(n, hues = c("red", "cyan", "orange", "blue",
168 106
 #'
169 107
 #' @description Three different generation methods are wrapped, including
170 108
 #' \code{\link[celda]{distinctColors}},
171
-#' \code{\link[randomcoloR]{distinctColorPalette}} and the \code{ggplot}
109
+#' [randomcoloR](SCTK_PerformingQC_Cell_V3.Rmd) and the \code{ggplot}
172 110
 #' default color generation.
173 111
 #' @param n An integer, the number of color codes to generate.
174 112
 #' @param palette A single character string. Select the method, available
175 113
new file mode 100644
... ...
@@ -0,0 +1,91 @@
1
+
2
+#' @importFrom tools file_ext
3
+.getFileExt <- function(file) {
4
+    ext1 <- tools::file_ext(file)
5
+    if (!(ext1 %in% c("mtx", "txt", "csv", "tab", "tsv",
6
+        "npz", "gz", "bz2", "xz"))) {
7
+
8
+        warning("Unknown extension ", ext1, ". Treat as text file.")
9
+        return(c("unknown"))
10
+    }
11
+
12
+    # if compressed
13
+    if (ext1 %in% c("gz", "bz2", "xz")) {
14
+        bn <- substr(file, start = 1, stop = nchar(file) - nchar(ext1) - 1)
15
+        ext2 <- tools::file_ext(bn)
16
+        if (!(ext2 %in% c("mtx", "txt", "csv", "tab", "npz"))) {
17
+            warning("Unknown extension ", ext2, ". Treat as text file.")
18
+            return(c(ext1, "unknown"))
19
+        }
20
+        return(c(ext1, ext2))
21
+    } else {
22
+        return(ext1)
23
+    }
24
+}
25
+
26
+
27
+#' @name readSingleCellMatrix
28
+#' @title Read single cell expression matrix
29
+#' @description Automatically detact the format of the input file and read
30
+#'  the file.
31
+#' @param file Path to input file. Supported file endings include .mtx, .txt,
32
+#'  .csv, .tab, .tsv, .npz, and their corresponding \code{gzip},
33
+#'  \code{bzip2}, or \code{xz} compressed extensions (\code{*.gz},
34
+#'  \code{*.bz2}, or \code{*.xz}).
35
+#' @param class Character. Class of matrix. One of "Matrix" or "matrix". Specifying "Matrix"
36
+#'  will convert to a sparse format which should be used
37
+#'  for datasets with large numbers of cells.  Default "Matrix".
38
+#' @param delayedArray Boolean. Whether to read the expression matrix as
39
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
40
+#' @param colIndexLocation Character. For Optimus output, the path to the
41
+#'  barcode index .npy file. Used only if \code{file} has .npz extension.
42
+#'  Default \code{NULL}.
43
+#' @param rowIndexLocation Character. For Optimus output, The path to the
44
+#'  feature (gene) index .npy file. Used only if \code{file} has .npz extension.
45
+#'  Default \code{NULL}.
46
+#' @examples
47