... | ... |
@@ -15,7 +15,7 @@ Authors@R: c(person(given="David", family="Jenkins", email="dfj@bu.edu", role=c( |
15 | 15 |
comment = c(ORCID = "0000-0002-6247-6595"))) |
16 | 16 |
Maintainer: David Jenkins <dfj@bu.edu> |
17 | 17 |
Depends: |
18 |
- R (>= 3.6.0), |
|
18 |
+ R (>= 4.0), |
|
19 | 19 |
SummarizedExperiment, |
20 | 20 |
SingleCellExperiment, |
21 | 21 |
DelayedArray, |
... | ... |
@@ -47,8 +47,8 @@ Imports: |
47 | 47 |
fields, |
48 | 48 |
ggplot2, |
49 | 49 |
ggplotify, |
50 |
- ggtree, |
|
51 | 50 |
ggrepel, |
51 |
+ ggtree, |
|
52 | 52 |
gridExtra, |
53 | 53 |
GSVA (>= 1.26.0), |
54 | 54 |
GSVAdata, |
... | ... |
@@ -118,7 +118,6 @@ Suggests: |
118 | 118 |
org.Mm.eg.db, |
119 | 119 |
stringr |
120 | 120 |
Remotes: |
121 |
- joshua-d-campbell/DoubletFinder, |
|
122 | 121 |
joshua-d-campbell/harmony, |
123 | 122 |
rz2333/mnormt@1.5-6, |
124 | 123 |
joshua-d-campbell/liger, |
... | ... |
@@ -1,10 +1,10 @@ |
1 | 1 |
# Generated by roxygen2: do not edit by hand |
2 | 2 |
|
3 |
-export(ComBatSCE) |
|
4 | 3 |
export(MAST) |
5 | 4 |
export(alignSingleCellData) |
6 | 5 |
export(calcEffectSizes) |
7 | 6 |
export(combineSCE) |
7 |
+export(computeZScore) |
|
8 | 8 |
export(constructSCE) |
9 | 9 |
export(convertGeneIDs) |
10 | 10 |
export(convertSCEToSeurat) |
... | ... |
@@ -15,9 +15,9 @@ export(downSampleCells) |
15 | 15 |
export(downSampleDepth) |
16 | 16 |
export(enrichRSCE) |
17 | 17 |
export(exportSCE) |
18 |
+export(exportSCEtoAnnData) |
|
18 | 19 |
export(exportSCEtoFlatFile) |
19 | 20 |
export(featureIndex) |
20 |
-export(filterSCData) |
|
21 | 21 |
export(findMarkerDiffExp) |
22 | 22 |
export(generateMeta) |
23 | 23 |
export(generateSimulatedData) |
... | ... |
@@ -31,6 +31,7 @@ export(getUMAP) |
31 | 31 |
export(gsvaPlot) |
32 | 32 |
export(gsvaSCE) |
33 | 33 |
export(importAnnData) |
34 |
+export(importBUStools) |
|
34 | 35 |
export(importCellRanger) |
35 | 36 |
export(importCellRangerV2) |
36 | 37 |
export(importCellRangerV2Sample) |
... | ... |
@@ -43,6 +44,9 @@ export(importGeneSetsFromCollection) |
43 | 44 |
export(importGeneSetsFromGMT) |
44 | 45 |
export(importGeneSetsFromList) |
45 | 46 |
export(importGeneSetsFromMSigDB) |
47 |
+export(importOptimus) |
|
48 |
+export(importSEQC) |
|
49 |
+export(importSTARsolo) |
|
46 | 50 |
export(iterateSimulations) |
47 | 51 |
export(mergeSCEColData) |
48 | 52 |
export(parseRsubreadLogs) |
... | ... |
@@ -81,6 +85,7 @@ export(plotScrubletResults) |
81 | 85 |
export(plotTSNE) |
82 | 86 |
export(plotUMAP) |
83 | 87 |
export(qcInputProcess) |
88 |
+export(readSingleCellMatrix) |
|
84 | 89 |
export(reportCellQC) |
85 | 90 |
export(reportDropletQC) |
86 | 91 |
export(reportQCTool) |
... | ... |
@@ -149,6 +154,7 @@ export(subsetSCECols) |
149 | 154 |
export(subsetSCERows) |
150 | 155 |
export(summarizeSCE) |
151 | 156 |
export(thresholdGenes) |
157 |
+export(trimCounts) |
|
152 | 158 |
export(visPlot) |
153 | 159 |
import(Biobase) |
154 | 160 |
import(DelayedArray) |
155 | 161 |
deleted file mode 100644 |
... | ... |
@@ -1,79 +0,0 @@ |
1 |
-#' ComBatSCE |
|
2 |
-#' |
|
3 |
-#' Run ComBat on a SCtkExperiment object |
|
4 |
-#' |
|
5 |
-#' @param inSCE Input SCtkExperiment object. Required |
|
6 |
-#' @param batch The name of a column in colData to use as the batch variable. |
|
7 |
-#' Required |
|
8 |
-#' @param useAssay The assay to use for ComBat. The default is "logcounts" |
|
9 |
-#' @param par.prior TRUE indicates parametric adjustments will be used, FALSE |
|
10 |
-#' indicates non-parametric adjustments will be used. Accepted parameters: |
|
11 |
-#' "Parametric" or "Non-parametric" |
|
12 |
-#' @param covariates List of other column names in colData to be added to the |
|
13 |
-#' ComBat model as covariates |
|
14 |
-#' @param mean.only If TRUE ComBat only corrects the mean of the batch effect |
|
15 |
-#' @param ref.batch If given, will use the selected batch as a reference for |
|
16 |
-#' batch adjustment. |
|
17 |
-#' |
|
18 |
-#' @return ComBat matrix based on inputs. You can save this matrix into the |
|
19 |
-#' SCtkExperiment with assay() |
|
20 |
-#' @export |
|
21 |
-#' @examples |
|
22 |
-#' if(requireNamespace("bladderbatch", quietly = TRUE)) { |
|
23 |
-#' library(bladderbatch) |
|
24 |
-#' data(bladderdata) |
|
25 |
-#' |
|
26 |
-#' #subset for testing |
|
27 |
-#' dat <- bladderEset[1:50,] |
|
28 |
-#' dat <- as(as(dat, "SummarizedExperiment"), "SCtkExperiment") |
|
29 |
-#' mod <- stats::model.matrix(~as.factor(cancer), data = colData(dat)) |
|
30 |
-#' |
|
31 |
-#' # parametric adjustment |
|
32 |
-#' combat_edata1 <- ComBatSCE(inSCE = dat, useAssay = "exprs", |
|
33 |
-#' batch = "batch", covariates = NULL) |
|
34 |
-#' assay(dat, "parametric_combat") <- combat_edata1 |
|
35 |
-#' |
|
36 |
-#' # non-parametric adjustment, mean-only version |
|
37 |
-#' combat_edata2 <- ComBatSCE(inSCE = dat, useAssay = "exprs", |
|
38 |
-#' batch = "batch", par.prior = "Non-parametric", |
|
39 |
-#' mean.only = TRUE, covariates = NULL) |
|
40 |
-#' assay(dat, "nonparametric_combat_meanonly") <- combat_edata2 |
|
41 |
-#' |
|
42 |
-#' # reference-batch version, with covariates |
|
43 |
-#' combat_edata3 <- ComBatSCE(inSCE = dat, useAssay = "exprs", |
|
44 |
-#' batch = "batch", covariates = "cancer", |
|
45 |
-#' ref.batch = 3) |
|
46 |
-#' assay(dat, "refbatch_combat_wcov") <- combat_edata3 |
|
47 |
-#' assays(dat) |
|
48 |
-#' } |
|
49 |
-#' |
|
50 |
-ComBatSCE <- function(inSCE, batch, useAssay="logcounts", |
|
51 |
- par.prior="Parametric", covariates=NULL, mean.only=FALSE, |
|
52 |
- ref.batch=NULL){ |
|
53 |
- |
|
54 |
- #prepare model matrix |
|
55 |
- mod <- NULL |
|
56 |
- if (length(covariates) > 0){ |
|
57 |
- mod <- stats::model.matrix( |
|
58 |
- stats::as.formula(paste0("~", paste0(covariates, collapse = "+"))), |
|
59 |
- data = data.frame(SingleCellExperiment::colData(inSCE)[, covariates, |
|
60 |
- drop = FALSE])) |
|
61 |
- } |
|
62 |
- |
|
63 |
- #prepare parametric |
|
64 |
- if (par.prior == "Parametric"){ |
|
65 |
- par.prior <- TRUE |
|
66 |
- } else if (par.prior == "Non-parametric") { |
|
67 |
- par.prior <- FALSE |
|
68 |
- } else { |
|
69 |
- stop("Invalid option given to par.prior. Accepted values are Parametric", |
|
70 |
- " and Non-parametric.") |
|
71 |
- } |
|
72 |
- |
|
73 |
- resassay <- |
|
74 |
- sva::ComBat(dat = SummarizedExperiment::assay(inSCE, useAssay), |
|
75 |
- batch = SingleCellExperiment::colData(inSCE)[, batch], |
|
76 |
- mod = mod, par.prior = par.prior, |
|
77 |
- mean.only = mean.only, ref.batch = ref.batch) |
|
78 |
- return(resassay) |
|
79 |
-} |
80 | 0 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,25 @@ |
1 |
+#' Compute Z-Score |
|
2 |
+#' |
|
3 |
+#' Computes Z-Score from an input count matrix using the formula ((x-mean(x))/sd(x)) |
|
4 |
+#' for each gene across all cells. The input count matrix can either be a base matrix, |
|
5 |
+#' dgCMatrix or a DelayedMatrix. Computations are performed using DelayedMatrixStats |
|
6 |
+#' package to efficiently compute the Z-Score matrix. |
|
7 |
+#' |
|
8 |
+#' @param counts matrix (base matrix, dgCMatrix or DelaymedMatrix) |
|
9 |
+#' |
|
10 |
+#' @return z-score computed counts matrix (DelayedMatrix) |
|
11 |
+#' @export |
|
12 |
+#' |
|
13 |
+#' @examples |
|
14 |
+#' |
|
15 |
+#' data(sce_chcl, package = "scds") |
|
16 |
+#' assay(sce_chcl, "countsZScore") <- computeZScore(assay(sce_chcl, "counts")) |
|
17 |
+#' |
|
18 |
+computeZScore <- function(counts) { |
|
19 |
+ if (!methods::is(counts, "DelayedArray")) { |
|
20 |
+ counts <- DelayedArray::DelayedArray(counts) |
|
21 |
+ } |
|
22 |
+ counts <- (counts - DelayedMatrixStats::rowMeans2(counts)) / DelayedMatrixStats::rowSds(counts) |
|
23 |
+ counts[base::is.nan(counts)] <- 0 |
|
24 |
+ return(counts) |
|
25 |
+} |
0 | 26 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,75 @@ |
1 |
+#' @title Export a \link[SingleCellExperiment]{SingleCellExperiment} R object as |
|
2 |
+#' Python annData object |
|
3 |
+#' @description Writes all assays, colData, rowData, reducedDims, and altExps objects in a |
|
4 |
+#' \link[SingleCellExperiment]{SingleCellExperiment} to a Python annData object in the .h5ad format |
|
5 |
+#' All parameters of Anndata.write_h5ad function (https://icb-anndata.readthedocs-hosted.com/en/stable/anndata.AnnData.write_h5ad.html) |
|
6 |
+#' are available as parameters to this export function and set to defaults. Defaults can be |
|
7 |
+#' overridden at function call. |
|
8 |
+#' @param sce \link[SingleCellExperiment]{SingleCellExperiment} R object to be |
|
9 |
+#' exported. |
|
10 |
+#' @param useAssay Character. The name of assay of |
|
11 |
+#' interests that will be set as the primary matrix of the output AnnData. |
|
12 |
+#' Default \code{"counts"}. |
|
13 |
+#' @param outputDir Path to the directory where .h5ad outputs will be written. Default is the current working directory. |
|
14 |
+#' @param prefix Prefix to use for the name of the output file. Default \code{"sample"}. |
|
15 |
+#' @param overwrite Boolean. Default \code{TRUE}. |
|
16 |
+#' @param compression If output file compression is required, this variable accepts |
|
17 |
+#' 'gzip' or 'lzf' as inputs. Default \code{None}. |
|
18 |
+#' @param compressionOpts Integer. Sets the compression level |
|
19 |
+#' @param forceDense Default \code{False} Write sparse data as a dense matrix. |
|
20 |
+#' Refer \code{anndata.write_h5ad} documentation for details. Default \code{NULL}. |
|
21 |
+#' @examples |
|
22 |
+#' \dontrun{ |
|
23 |
+#' data(sce_chcl, package = "scds") |
|
24 |
+#' exportSCEtoAnnData(sce=sce_chcl, compression="gzip") |
|
25 |
+#' } |
|
26 |
+#' |
|
27 |
+#' @export |
|
28 |
+exportSCEtoAnnData <- function(sce, |
|
29 |
+ useAssay = 'counts', |
|
30 |
+ outputDir = "./", |
|
31 |
+ prefix = "sample", |
|
32 |
+ overwrite = TRUE, |
|
33 |
+ compression = c('None','lzf','gzip'), |
|
34 |
+ compressionOpts = NULL, |
|
35 |
+ forceDense = c('False','True')){ |
|
36 |
+ compression <- match.arg(compression) |
|
37 |
+ forceDense <- match.arg(forceDense) |
|
38 |
+ if (compression == 'None'){ |
|
39 |
+ compression <- NULL |
|
40 |
+ } |
|
41 |
+ |
|
42 |
+ if (!reticulate::py_module_available(module = "scanpy")) { |
|
43 |
+ warning("Cannot find python module 'scanpy', please install Conda and", |
|
44 |
+ " run sctkPythonInstallConda() or run sctkPythonInstallVirtualEnv().", |
|
45 |
+ "If one of these have been previously run to install the modules,", |
|
46 |
+ "make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(),", |
|
47 |
+ " respectively, if R has been restarted since the module installation.", |
|
48 |
+ " Alternatively, Scrublet can be installed on the local machine", |
|
49 |
+ "with pip (e.g. pip install --user scanpy) and then the 'use_python()'", |
|
50 |
+ " function from the 'reticulate' package can be used to select the", |
|
51 |
+ " correct Python environment.") |
|
52 |
+ return(sce)} |
|
53 |
+ |
|
54 |
+ AssayName <- SummarizedExperiment::assayNames(sce) |
|
55 |
+ for (assay in AssayName){ |
|
56 |
+ if (!methods::is(SummarizedExperiment::assay(sce, assay), 'dgCMatrix')) { |
|
57 |
+ SummarizedExperiment::assay(sce, assay) <- .convertToMatrix(SummarizedExperiment::assay(sce, assay)) |
|
58 |
+ } |
|
59 |
+ } |
|
60 |
+ |
|
61 |
+ |
|
62 |
+ dir.create(outputDir, showWarnings = FALSE, recursive = TRUE) |
|
63 |
+ annData <- .sce2adata(sce,useAssay) |
|
64 |
+ fileName <- paste0(prefix,".h5ad") |
|
65 |
+ filePath <- file.path(outputDir,fileName) |
|
66 |
+ |
|
67 |
+ if (file.exists(filePath) && !isTRUE(overwrite)) { |
|
68 |
+ stop(paste0(path, " already exists. Change 'outputDir' or set 'overwrite' to TRUE.")) |
|
69 |
+ } |
|
70 |
+ |
|
71 |
+ annData$write_h5ad(filePath, |
|
72 |
+ compression = compression, |
|
73 |
+ compression_opts = compressionOpts, |
|
74 |
+ force_dense = forceDense) |
|
75 |
+} |
... | ... |
@@ -1,10 +1,10 @@ |
1 |
-#' @title Get runDropletQC .html report |
|
1 |
+#' @title Get runDropletQC .html report |
|
2 | 2 |
#' @description A function to generate .html Rmarkdown report containing the visualizations of the runDropletQC function output |
3 | 3 |
#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object containing |
4 | 4 |
#' the full droplet count matrix with the output from runDropletQC function |
5 |
-#' @param subTitle subtitle of the QC HTML report. Default is NULL. |
|
6 |
-#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL. |
|
7 |
-#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template |
|
5 |
+#' @param subTitle subtitle of the QC HTML report. Default is NULL. |
|
6 |
+#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL. |
|
7 |
+#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template |
|
8 | 8 |
#' @param output_dir name of the output directory to save the rendered file. If NULL/default the file is stored to the current working directory |
9 | 9 |
#' @return .html file |
10 | 10 |
#' @examples |
... | ... |
@@ -16,33 +16,33 @@ |
16 | 16 |
#' @export |
17 | 17 |
reportDropletQC <- function(inSCE, output_file = NULL, |
18 | 18 |
output_dir = NULL, |
19 |
- subTitle = NULL, |
|
19 |
+ subTitle = NULL, |
|
20 | 20 |
studyDesign = NULL) { |
21 |
- |
|
21 |
+ |
|
22 | 22 |
if (is.null(output_dir)){ |
23 | 23 |
output_dir<- getwd() |
24 | 24 |
} |
25 |
- |
|
25 |
+ |
|
26 | 26 |
#report_path <- tempfile(fileext = ".Rmd") |
27 | 27 |
#file.copy(system.file("rmarkdown/qc/DropletQC.Rmd", package = "singleCellTK"), report_path, overwrite = TRUE) |
28 | 28 |
|
29 | 29 |
## create temp Rmd file to bypass permission issue on server |
30 |
- rmarkdown::render(system.file("rmarkdown/qc/DropletQC.Rmd", package = "singleCellTK"), |
|
31 |
- params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign), |
|
32 |
- output_file = output_file, |
|
30 |
+ rmarkdown::render(system.file("rmarkdown/qc/DropletQC.Rmd", package = "singleCellTK"), |
|
31 |
+ params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign), |
|
32 |
+ output_file = output_file, |
|
33 | 33 |
output_dir = output_dir, |
34 | 34 |
intermediates_dir = output_dir, |
35 | 35 |
knit_root_dir = output_dir) |
36 | 36 |
} |
37 | 37 |
|
38 | 38 |
|
39 |
-#' @title Get runCellQC .html report |
|
39 |
+#' @title Get runCellQC .html report |
|
40 | 40 |
#' @description A function to generate .html Rmarkdown report containing the visualizations of the runCellQC function output |
41 | 41 |
#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object containing |
42 | 42 |
#' the filtered count matrix with the output from runCellQC function |
43 |
-#' @param subTitle subtitle of the QC HTML report. Default is NULL. |
|
44 |
-#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL. |
|
45 |
-#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template. |
|
43 |
+#' @param subTitle subtitle of the QC HTML report. Default is NULL. |
|
44 |
+#' @param studyDesign description of the data set and experiment design. It would be shown at the top of QC HTML report. Default is NULL. |
|
45 |
+#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the Rmarkdown template. |
|
46 | 46 |
#' @param output_dir name of the output directory to save the rendered file. If NULL/default the file is stored to the current working directory |
47 | 47 |
#' @return .html file |
48 | 48 |
#' @examples |
... | ... |
@@ -55,7 +55,7 @@ reportDropletQC <- function(inSCE, output_file = NULL, |
55 | 55 |
#' @export |
56 | 56 |
reportCellQC <- function(inSCE, output_file = NULL, |
57 | 57 |
output_dir = NULL, |
58 |
- subTitle = NULL, |
|
58 |
+ subTitle = NULL, |
|
59 | 59 |
studyDesign = NULL) { |
60 | 60 |
if (is.null(output_dir)){ |
61 | 61 |
output_dir<- getwd() |
... | ... |
@@ -63,9 +63,9 @@ reportCellQC <- function(inSCE, output_file = NULL, |
63 | 63 |
#report_path <- tempfile(fileext = ".Rmd") |
64 | 64 |
#file.copy(system.file("rmarkdown/qc/CellQC.Rmd", package = "singleCellTK"), report_path, overwrite = TRUE) |
65 | 65 |
|
66 |
- rmarkdown::render(system.file("rmarkdown/qc/CellQC.Rmd", package = "singleCellTK"), |
|
67 |
- params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign), |
|
68 |
- output_file = output_file, |
|
66 |
+ rmarkdown::render(system.file("rmarkdown/qc/CellQC.Rmd", package = "singleCellTK"), |
|
67 |
+ params = list(object = inSCE, subTitle = subTitle, studyDesign = studyDesign), |
|
68 |
+ output_file = output_file, |
|
69 | 69 |
output_dir = output_dir, |
70 | 70 |
intermediates_dir = output_dir, |
71 | 71 |
knit_root_dir = output_dir) |
... | ... |
@@ -79,7 +79,7 @@ reportCellQC <- function(inSCE, output_file = NULL, |
79 | 79 |
#' runQCMetrics, runScrublet, runDoubletCells, runCxds, runBcds, runCxdsBcdsHybrid, runDecontX, runBarcodeRankDrops, runEmptyDrops |
80 | 80 |
#' @param algorithm Character. Specifies which QC algorithm report to generate. |
81 | 81 |
#' Available options are "BarcodeRankDrops", "EmptyDrops", "QCMetrics", "Scrublet", "DoubletCells", "Cxds", "Bcds", "CxdsBcdsHybrid", "DoubletFinder" and "DecontX". |
82 |
-#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the selected QC algorithm name . |
|
82 |
+#' @param output_file name of the generated file. If NULL/default then the output file name will be based on the name of the selected QC algorithm name . |
|
83 | 83 |
#' @param output_dir name of the output directory to save the rendered file. If NULL/default the file is stored to the current working directory |
84 | 84 |
#' @return .html file |
85 | 85 |
#' @examples |
... | ... |
@@ -87,6 +87,7 @@ reportCellQC <- function(inSCE, output_file = NULL, |
87 | 87 |
#' data(scExample, package = "singleCellTK") |
88 | 88 |
#' sce <- sce[, colData(sce)$type != 'EmptyDroplet'] |
89 | 89 |
#' sce <- runDecontX(sce) |
90 |
+#' sce <- getUMAP(sce) |
|
90 | 91 |
#' reportQCTool(inSCE = sce, algorithm = "DecontX") |
91 | 92 |
#' } |
92 | 93 |
#' @export |
... | ... |
@@ -102,13 +103,13 @@ reportQCTool <- function(inSCE, algorithm=c("BarcodeRankDrops", |
102 | 103 |
"DecontX"), |
103 | 104 |
output_file = NULL, |
104 | 105 |
output_dir = NULL) { |
105 |
- |
|
106 |
+ |
|
106 | 107 |
algorithm <- match.arg(algorithm) |
107 |
- |
|
108 |
+ |
|
108 | 109 |
if (is.null(output_dir)){ |
109 | 110 |
output_dir<- getwd() |
110 | 111 |
} |
111 |
- |
|
112 |
+ |
|
112 | 113 |
if (algorithm =="BarcodeRankDrops"){ |
113 | 114 |
rmarkdown::render(system.file("rmarkdown/qc/BarcodeRankDrops.Rmd", package="singleCellTK"), params = list( |
114 | 115 |
object=inSCE), output_file = output_file, output_dir = output_dir) |
... | ... |
@@ -152,4 +153,4 @@ reportQCTool <- function(inSCE, algorithm=c("BarcodeRankDrops", |
152 | 153 |
} |
153 | 154 |
|
154 | 155 |
|
155 |
- |
|
156 |
+ |
156 | 157 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,156 @@ |
1 |
+ |
|
2 |
+# dir <- "genecount" |
|
3 |
+.constructSCEFromBUStoolsOutputs <- function(dir, |
|
4 |
+ sample, |
|
5 |
+ matrixFileName, |
|
6 |
+ featuresFileName, |
|
7 |
+ barcodesFileName, |
|
8 |
+ gzipped, |
|
9 |
+ class, |
|
10 |
+ delayedArray) { |
|
11 |
+ |
|
12 |
+ cb <- .readBarcodes(file.path(dir, barcodesFileName)) |
|
13 |
+ fe <- .readFeatures(file.path(dir, featuresFileName)) |
|
14 |
+ ma <- .readMatrixMM(file.path(dir, matrixFileName), |
|
15 |
+ gzipped = gzipped, |
|
16 |
+ class = class, |
|
17 |
+ delayedArray = delayedArray) |
|
18 |
+ ma <- t(ma) |
|
19 |
+ |
|
20 |
+ coln <- paste(sample, cb[[1]], sep = "_") |
|
21 |
+ rownames(ma) <- fe[[1]] |
|
22 |
+ |
|
23 |
+ sce <- SingleCellExperiment::SingleCellExperiment( |
|
24 |
+ assays = list(counts = ma)) |
|
25 |
+ SummarizedExperiment::rowData(sce) <- fe |
|
26 |
+ SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb, |
|
27 |
+ column_name = coln, |
|
28 |
+ sample = sample, |
|
29 |
+ row.names = coln) |
|
30 |
+ |
|
31 |
+ return(sce) |
|
32 |
+} |
|
33 |
+ |
|
34 |
+ |
|
35 |
+# main function |
|
36 |
+.importBUStools <- function( |
|
37 |
+ BUStoolsDirs, |
|
38 |
+ samples, |
|
39 |
+ matrixFileNames, |
|
40 |
+ featuresFileNames, |
|
41 |
+ barcodesFileNames, |
|
42 |
+ gzipped, |
|
43 |
+ class, |
|
44 |
+ delayedArray) { |
|
45 |
+ |
|
46 |
+ if (length(BUStoolsDirs) != length(samples)) { |
|
47 |
+ stop("'BUStoolsDirs' and 'samples' have unequal lengths!") |
|
48 |
+ } |
|
49 |
+ |
|
50 |
+ res <- vector("list", length = length(samples)) |
|
51 |
+ |
|
52 |
+ matrixFileNames <- .getVectorized(matrixFileNames, length(samples)) |
|
53 |
+ featuresFileNames <- .getVectorized(featuresFileNames, length(samples)) |
|
54 |
+ barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples)) |
|
55 |
+ gzipped <- .getVectorized(gzipped, length(samples)) |
|
56 |
+ |
|
57 |
+ for (i in seq_along(samples)) { |
|
58 |
+ dir <- file.path(BUStoolsDirs[i]) |
|
59 |
+ scei <- .constructSCEFromBUStoolsOutputs(dir, |
|
60 |
+ sample = samples[i], |
|
61 |
+ matrixFileName = matrixFileNames[i], |
|
62 |
+ featuresFileName = featuresFileNames[i], |
|
63 |
+ barcodesFileName = barcodesFileNames[i], |
|
64 |
+ gzipped = gzipped[i], |
|
65 |
+ class = class, |
|
66 |
+ delayedArray = delayedArray) |
|
67 |
+ res[[i]] <- scei |
|
68 |
+ } |
|
69 |
+ |
|
70 |
+ sce <- do.call(SingleCellExperiment::cbind, res) |
|
71 |
+ return(sce) |
|
72 |
+} |
|
73 |
+ |
|
74 |
+ |
|
75 |
+#' @name importBUStools |
|
76 |
+#' @rdname importBUStools |
|
77 |
+#' @title Construct SCE object from BUStools output |
|
78 |
+#' @description Read the barcodes, features (genes), and matrix from BUStools |
|
79 |
+#' output. Import them |
|
80 |
+#' as one \link[SingleCellExperiment]{SingleCellExperiment} object. Note the |
|
81 |
+#' cells in the output files for BUStools 0.39.4 are not filtered. |
|
82 |
+#' @param BUStoolsDirs A vector of paths to BUStools output files. Each sample |
|
83 |
+#' should have its own path. For example: \code{./genecount}. |
|
84 |
+#' Must have the same length as \code{samples}. |
|
85 |
+#' @param samples A vector of user-defined sample names for the samples to be |
|
86 |
+#' imported. Must have the same length as \code{BUStoolsDirs}. |
|
87 |
+#' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse |
|
88 |
+#' matrix files (.mtx files). Must have length 1 or the same |
|
89 |
+#' length as \code{samples}. |
|
90 |
+#' @param featuresFileNames Filenames for the feature annotation files. |
|
91 |
+#' Must have length 1 or the same length as \code{samples}. |
|
92 |
+#' @param barcodesFileNames Filenames for the cell barcode list file. |
|
93 |
+#' Must have length 1 or the same length as \code{samples}. |
|
94 |
+#' @param gzipped Boolean. \code{TRUE} if the BUStools output files |
|
95 |
+#' (barcodes.txt, genes.txt, and genes.mtx) were |
|
96 |
+#' gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in BUStools |
|
97 |
+#' 0.39.4. Default \code{"auto"} which automatically detects if the |
|
98 |
+#' files are gzip compressed. Must have length 1 or the same length as |
|
99 |
+#' \code{samples}. |
|
100 |
+#' @param class Character. The class of the expression matrix stored in the SCE |
|
101 |
+#' object. Can be one of "Matrix" (as returned by |
|
102 |
+#' \link[Matrix]{readMM} function), or "matrix" (as returned by |
|
103 |
+#' \link[base]{matrix} function). Default "Matrix". |
|
104 |
+#' @param delayedArray Boolean. Whether to read the expression matrix as |
|
105 |
+#' \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}. |
|
106 |
+#' @return A \code{SingleCellExperiment} object containing the count |
|
107 |
+#' matrix, the gene annotation, and the cell annotation. |
|
108 |
+#' @examples |
|
109 |
+#' # Example #1 |
|
110 |
+#' # FASTQ files were downloaded from |
|
111 |
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0 |
|
112 |
+#' # /pbmc_1k_v3 |
|
113 |
+#' # They were concatenated as follows: |
|
114 |
+#' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz > |
|
115 |
+#' # pbmc_1k_v3_R1.fastq.gz |
|
116 |
+#' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz > |
|
117 |
+#' # pbmc_1k_v3_R2.fastq.gz |
|
118 |
+#' # The following BUStools command generates the gene, cell, and |
|
119 |
+#' # matrix files |
|
120 |
+#' |
|
121 |
+#' # bustools correct -w ./3M-february-2018.txt -p output.bus | \ |
|
122 |
+#' # bustools sort -T tmp/ -t 4 -p - | \ |
|
123 |
+#' # bustools count -o genecount/genes \ |
|
124 |
+#' # -g ./transcripts_to_genes.txt \ |
|
125 |
+#' # -e matrix.ec \ |
|
126 |
+#' # -t transcripts.txt \ |
|
127 |
+#' # --genecounts - |
|
128 |
+#' |
|
129 |
+#' # The top 20 genes and the first 20 cells are included in this example. |
|
130 |
+#' sce <- importBUStools( |
|
131 |
+#' BUStoolsDirs = system.file("extdata/BUStools_PBMC_1k_v3_20x20/genecount/", |
|
132 |
+#' package = "singleCellTK"), |
|
133 |
+#' samples = "PBMC_1k_v3_20x20") |
|
134 |
+#' @export |
|
135 |
+importBUStools <- function( |
|
136 |
+ BUStoolsDirs, |
|
137 |
+ samples, |
|
138 |
+ matrixFileNames = "genes.mtx", |
|
139 |
+ featuresFileNames = "genes.genes.txt", |
|
140 |
+ barcodesFileNames = "genes.barcodes.txt", |
|
141 |
+ gzipped = "auto", |
|
142 |
+ class = c("Matrix", "matrix"), |
|
143 |
+ delayedArray = TRUE) { |
|
144 |
+ |
|
145 |
+ class <- match.arg(class) |
|
146 |
+ |
|
147 |
+ .importBUStools( |
|
148 |
+ BUStoolsDirs = BUStoolsDirs, |
|
149 |
+ samples = samples, |
|
150 |
+ matrixFileNames = matrixFileNames, |
|
151 |
+ featuresFileNames = featuresFileNames, |
|
152 |
+ barcodesFileNames = barcodesFileNames, |
|
153 |
+ gzipped = gzipped, |
|
154 |
+ class = class, |
|
155 |
+ delayedArray = delayedArray) |
|
156 |
+} |
0 | 157 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,285 @@ |
1 |
+ |
|
2 |
+#' @importFrom reticulate import |
|
3 |
+.readMatrixNpz <- function(matrixLocation, |
|
4 |
+ colIndexLocation, |
|
5 |
+ rowIndexLocation, |
|
6 |
+ class, |
|
7 |
+ delayedArray) { |
|
8 |
+ |
|
9 |
+ ## Now importing these functions in 'reticulate_setup.R' file |
|
10 |
+ # sparse <- reticulate::import("scipy.sparse") |
|
11 |
+ # numpy <- reticulate::import("numpy") |
|
12 |
+ if (!reticulate::py_module_available(module = "scipy.sparse")) { |
|
13 |
+ stop("Error!", "Cannot find python module 'scipy.sparse', please install Conda and run sctkPythonInstallConda() |
|
14 |
+ or run sctkPythonInstallVirtualEnv(). If one of these have been previously run to install the modules, |
|
15 |
+ make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), respectively, if R has been |
|
16 |
+ restarted since the module installation. Alternatively, scipy can be installed on the local machine |
|
17 |
+ with pip (e.g. pip install scipy) and then the 'use_python()' function from the 'reticulate' package |
|
18 |
+ can be used to select the correct Python environment.") |
|
19 |
+ } |
|
20 |
+ if (!reticulate::py_module_available(module = "numpy")) { |
|
21 |
+ stop("Error!", "Cannot find python module 'numpy', please install Conda and run sctkPythonInstallConda() |
|
22 |
+ or run sctkPythonInstallVirtualEnv(). If one of these have been previously run to install the modules, |
|
23 |
+ make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), respectively, if R has been |
|
24 |
+ restarted since the module installation. Alternatively, numpy can be installed on the local machine |
|
25 |
+ with pip (e.g. pip install numpy) and then the 'use_python()' function from the 'reticulate' package |
|
26 |
+ can be used to select the correct Python environment.") |
|
27 |
+ } |
|
28 |
+ |
|
29 |
+ error <- try({ |
|
30 |
+ mat <- sparse$load_npz(matrixLocation) |
|
31 |
+ colIndex <- as.vector(numpy$load(colIndexLocation, allow_pickle = TRUE)) |
|
32 |
+ rowIndex <- as.vector(numpy$load(rowIndexLocation, allow_pickle = TRUE)) |
|
33 |
+ colnames(mat) <- colIndex |
|
34 |
+ rownames(mat) <- rowIndex |
|
35 |
+ mat <- t(mat) |
|
36 |
+ |
|
37 |
+ ## Convert to "dgCMatrix" |
|
38 |
+ newM <- Matrix::Matrix(mat[,1], nrow=nrow(mat)) |
|
39 |
+ newM <- methods::as(newM, "dgCMatrix") |
|
40 |
+ breaks <- seq(2, ncol(mat), by=1000) |
|
41 |
+ if(length(breaks) > 2) { |
|
42 |
+ for(i in seq(2, length(breaks))) { |
|
43 |
+ ix <- seq(breaks[i-1], (breaks[i]-1)) |
|
44 |
+ newM <- cbind(newM, mat[,ix]) |
|
45 |
+ } |
|
46 |
+ ix <- seq(utils::tail(breaks, n = 1), ncol(mat)) |
|
47 |
+ newM <- cbind(newM, mat[,ix]) |
|
48 |
+ } else { |
|
49 |
+ ix <- seq(2, ncol(mat)) |
|
50 |
+ newM <- cbind(newM, mat[,ix]) |
|
51 |
+ } |
|
52 |
+ |
|
53 |
+ colnames(newM) <- colnames(mat) |
|
54 |
+ rownames(newM) <- rownames(mat) |
|
55 |
+ mat <- newM |
|
56 |
+ }, silent = TRUE) |
|
57 |
+ |
|
58 |
+ if(inherits(error, "try-error")) { |
|
59 |
+ stop(paste0("importOptimus did not complete successfully. SCE could not be generated. Error given during the import process: \n\n", error)) |
|
60 |
+ } |
|
61 |
+ |
|
62 |
+ if (class == "matrix") { |
|
63 |
+ mat <- as.matrix(mat) |
|
64 |
+ } |
|
65 |
+ |
|
66 |
+ if (isTRUE(delayedArray)) { |
|
67 |
+ mat <- DelayedArray::DelayedArray(mat) |
|
68 |
+ } |
|
69 |
+ return(mat) |
|
70 |
+} |
|
71 |
+ |
|
72 |
+ |
|
73 |
+.readMetricsOptimus <- function(path) { |
|
74 |
+ metrics <- data.table::fread(path) |
|
75 |
+ return(metrics) |
|
76 |
+} |
|
77 |
+ |
|
78 |
+ |
|
79 |
+.readEmptyDrops <- function(path) { |
|
80 |
+ emptyDrops <- data.table::fread(path) |
|
81 |
+ colnames(emptyDrops) <- paste0("dropletUtils_emptyDrops_", |
|
82 |
+ colnames(emptyDrops)) |
|
83 |
+ return(emptyDrops) |
|
84 |
+} |
|
85 |
+ |
|
86 |
+ |
|
87 |
+.combineColData <- function(colnames, cellMetrics, emptyDrops) { |
|
88 |
+ cd <- data.table::data.table(CellId = colnames) |
|
89 |
+ cd <- merge(cd, |
|
90 |
+ cellMetrics, |
|
91 |
+ by.x = "CellId", |
|
92 |
+ by.y = "V1", |
|
93 |
+ all.x = TRUE, |
|
94 |
+ all.y = FALSE, |
|
95 |
+ sort = FALSE) |
|
96 |
+ |
|
97 |
+ if (!is.null(emptyDrops)) { |
|
98 |
+ cd <- merge(cd, |
|
99 |
+ emptyDrops, |
|
100 |
+ by.x = "CellId", |
|
101 |
+ by.y = "dropletUtils_emptyDrops_CellId", |
|
102 |
+ all.x = TRUE, |
|
103 |
+ all.y = FALSE, |
|
104 |
+ sort = FALSE) |
|
105 |
+ } |
|
106 |
+ |
|
107 |
+ return(cd) |
|
108 |
+} |
|
109 |
+ |
|
110 |
+ |
|
111 |
+.combineRowData <- function(rownames, geneMetrics) { |
|
112 |
+ rd <- data.table::data.table(feature_ID = rownames) |
|
113 |
+ rd <- merge(rd, |
|
114 |
+ geneMetrics, |
|
115 |
+ by.x = "feature_ID", |
|
116 |
+ by.y = "V1", |
|
117 |
+ all.x = TRUE, |
|
118 |
+ all.y = FALSE, |
|
119 |
+ sort = FALSE) |
|
120 |
+ return(rd) |
|
121 |
+} |
|
122 |
+ |
|
123 |
+ |
|
124 |
+.constructSCEFromOptimusOutputs <- function(dir, |
|
125 |
+ sample, |
|
126 |
+ matrixLocation, |
|
127 |
+ colIndexLocation, |
|
128 |
+ rowIndexLocation, |
|
129 |
+ cellMetricsLocation, |
|
130 |
+ geneMetricsLocation, |
|
131 |
+ emptyDropsLocation, |
|
132 |
+ class, |
|
133 |
+ delayedArray) { |
|
134 |
+ |
|
135 |
+ mat <- .readMatrixNpz(file.path(dir, matrixLocation), |
|
136 |
+ file.path(dir, colIndexLocation), |
|
137 |
+ file.path(dir, rowIndexLocation), |
|
138 |
+ class, |
|
139 |
+ delayedArray) |
|
140 |
+ |
|
141 |
+ cellMetrics <- .readMetricsOptimus(file.path(dir, cellMetricsLocation)) |
|
142 |
+ geneMetrics <- .readMetricsOptimus(file.path(dir, geneMetricsLocation)) |
|
143 |
+ |
|
144 |
+ if (!is.null(geneMetricsLocation)) { |
|
145 |
+ emptyDrops <- .readEmptyDrops(file.path(dir, emptyDropsLocation)) |
|
146 |
+ } |
|
147 |
+ |
|
148 |
+ cd <- .combineColData(colnames(mat), cellMetrics, emptyDrops) |
|
149 |
+ rd <- .combineRowData(rownames(mat), geneMetrics) |
|
150 |
+ |
|
151 |
+ coln <- paste(sample, colnames(mat), sep = "_") |
|
152 |
+ |
|
153 |
+ sce <- SingleCellExperiment::SingleCellExperiment( |
|
154 |
+ assays = list(counts = mat)) |
|
155 |
+ SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cd, |
|
156 |
+ column_name = coln, |
|
157 |
+ sample = sample, |
|
158 |
+ row.names = coln) |
|
159 |
+ SummarizedExperiment::rowData(sce) <- S4Vectors::DataFrame(rd) |
|
160 |
+ |
|
161 |
+ return(sce) |
|
162 |
+} |
|
163 |
+ |
|
164 |
+ |
|
165 |
+.checkArgsImportOptimus <- function(OptimusDirs, |
|
166 |
+ samples) { |
|
167 |
+ |
|
168 |
+ if (length(OptimusDirs) != length(samples)) { |
|
169 |
+ stop("'OptimusDirs' and 'samples' have unequal lengths!") |
|
170 |
+ } |
|
171 |
+} |
|
172 |
+ |
|
173 |
+ |
|
174 |
+.importOptimus <- function(OptimusDirs, |
|
175 |
+ samples, |
|
176 |
+ matrixLocation, |
|
177 |
+ colIndexLocation, |
|
178 |
+ rowIndexLocation, |
|
179 |
+ cellMetricsLocation, |
|
180 |
+ geneMetricsLocation, |
|
181 |
+ emptyDropsLocation, |
|
182 |
+ class, |
|
183 |
+ delayedArray) { |
|
184 |
+ |
|
185 |
+ .checkArgsImportOptimus(OptimusDirs, samples) |
|
186 |
+ |
|
187 |
+ res <- vector("list", length = length(samples)) |
|
188 |
+ |
|
189 |
+ for (i in seq_along(samples)) { |
|
190 |
+ scei <- .constructSCEFromOptimusOutputs(OptimusDirs[i], |
|
191 |
+ sample = samples[i], |
|
192 |
+ matrixLocation = matrixLocation, |
|
193 |
+ colIndexLocation = colIndexLocation, |
|
194 |
+ rowIndexLocation = rowIndexLocation, |
|
195 |
+ cellMetricsLocation = cellMetricsLocation, |
|
196 |
+ geneMetricsLocation = geneMetricsLocation, |
|
197 |
+ emptyDropsLocation = emptyDropsLocation, |
|
198 |
+ class = class, |
|
199 |
+ delayedArray = delayedArray) |
|
200 |
+ res[[i]] <- scei |
|
201 |
+ } |
|
202 |
+ |
|
203 |
+ sce <- do.call(SingleCellExperiment::cbind, res) |
|
204 |
+ return(sce) |
|
205 |
+} |
|
206 |
+ |
|
207 |
+ |
|
208 |
+#' @name importOptimus |
|
209 |
+#' @rdname importOptimus |
|
210 |
+#' @title Construct SCE object from Optimus output |
|
211 |
+#' @description Read the barcodes, features (genes), and matrices from Optimus |
|
212 |
+#' outputs. Import them |
|
213 |
+#' as one \link[SingleCellExperiment]{SingleCellExperiment} object. |
|
214 |
+#' @param OptimusDirs A vector of root directories of Optimus output files. |
|
215 |
+#' The paths should be something like this: |
|
216 |
+#' \code{/PATH/TO/bb4a2a5e-ff34-41b6-97d2-0c0c0c534530}. |
|
217 |
+#' Each entry in \code{OptimusDirs} is considered a sample and should have |
|
218 |
+#' its own path. Must have the same length as \code{samples}. |
|
219 |
+#' @param samples A vector of user-defined sample names for the sample to be |
|
220 |
+#' imported. Must have the same length as \code{OptimusDirs}. |
|
221 |
+#' @param matrixLocation Character. It is the intermediate |
|
222 |
+#' path to the filtered count maxtrix file saved in sparse matrix format |
|
223 |
+#' (\code{.npz}). Default |
|
224 |
+#' \code{call-MergeCountFiles/sparse_counts.npz} which works for |
|
225 |
+#' optimus_v1.4.0. |
|
226 |
+#' @param colIndexLocation Character. The intermediate path to the barcode |
|
227 |
+#' index file. Default \code{call-MergeCountFiles/sparse_counts_col_index.npy}. |
|
228 |
+#' @param rowIndexLocation Character. The intermediate path to the feature |
|
229 |
+#' (gene) index file. Default |
|
230 |
+#' \code{call-MergeCountFiles/sparse_counts_row_index.npy}. |
|
231 |
+#' @param cellMetricsLocation Character. It is the intermediate |
|
232 |
+#' path to the cell metrics file (\code{merged-cell-metrics.csv.gz}). Default |
|
233 |
+#' \code{call-MergeCellMetrics/merged-cell-metrics.csv.gz} which works for |
|
234 |
+#' optimus_v1.4.0. |
|
235 |
+#' @param geneMetricsLocation Character. It is the intermediate |
|
236 |
+#' path to the feature (gene) metrics file (\code{merged-gene-metrics.csv.gz}). |
|
237 |
+#' Default \code{call-MergeGeneMetrics/merged-gene-metrics.csv.gz} which works |
|
238 |
+#' for optimus_v1.4.0. |
|
239 |
+#' @param emptyDropsLocation Character. It is the intermediate |
|
240 |
+#' path to \link[DropletUtils]{emptyDrops} metrics file |
|
241 |
+#' (\code{empty_drops_result.csv}). |
|
242 |
+#' Default \code{call-RunEmptyDrops/empty_drops_result.csv} which works for |
|
243 |
+#' optimus_v1.4.0. |
|
244 |
+#' @param class Character. The class of the expression matrix stored in the SCE |
|
245 |
+#' object. Can be one of "Matrix" (as returned by |
|
246 |
+#' \link[Matrix]{readMM} function), or "matrix" (as returned by |
|
247 |
+#' \link[base]{matrix} function). Default "Matrix". |
|
248 |
+#' @param delayedArray Boolean. Whether to read the expression matrix as |
|
249 |
+#' \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}. |
|
250 |
+#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object |
|
251 |
+#' containing the count |
|
252 |
+#' matrix, the gene annotation, and the cell annotation. |
|
253 |
+#' @examples |
|
254 |
+#' \dontrun{ |
|
255 |
+#' sce <- importOptimus(OptimusDirs = |
|
256 |
+#' system.file("extdata/Optimus_20x1000", |
|
257 |
+#' package = "singleCellTK"), |
|
258 |
+#' samples = "Optimus_20x1000") |
|
259 |
+#' } |
|
260 |
+#' @export |
|
261 |
+importOptimus <- function(OptimusDirs, |
|
262 |
+ samples, |
|
263 |
+ matrixLocation = "call-MergeCountFiles/sparse_counts.npz", |
|
264 |
+ colIndexLocation = "call-MergeCountFiles/sparse_counts_col_index.npy", |
|
265 |
+ rowIndexLocation = "call-MergeCountFiles/sparse_counts_row_index.npy", |
|
266 |
+ cellMetricsLocation = "call-MergeCellMetrics/merged-cell-metrics.csv.gz", |
|
267 |
+ geneMetricsLocation = "call-MergeGeneMetrics/merged-gene-metrics.csv.gz", |
|
268 |
+ emptyDropsLocation = "call-RunEmptyDrops/empty_drops_result.csv", |
|
269 |
+ class = c("Matrix", "matrix"), |
|
270 |
+ delayedArray = TRUE) { |
|
271 |
+ |
|
272 |
+ class <- match.arg(class) |
|
273 |
+ |
|
274 |
+ .importOptimus(OptimusDirs = OptimusDirs, |
|
275 |
+ samples = samples, |
|
276 |
+ matrixLocation = matrixLocation, |
|
277 |
+ colIndexLocation = colIndexLocation, |
|
278 |
+ rowIndexLocation = rowIndexLocation, |
|
279 |
+ cellMetricsLocation = cellMetricsLocation, |
|
280 |
+ geneMetricsLocation = geneMetricsLocation, |
|
281 |
+ emptyDropsLocation = emptyDropsLocation, |
|
282 |
+ class = class, |
|
283 |
+ delayedArray = delayedArray) |
|
284 |
+ |
|
285 |
+} |
0 | 286 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,168 @@ |
1 |
+ |
|
2 |
+# dir <- "outs/filtered_feature_bc_matrix/" |
|
3 |
+.constructSCEFromSTARsoloOutputs <- function(dir, |
|
4 |
+ sample, |
|
5 |
+ matrixFileName, |
|
6 |
+ featuresFileName, |
|
7 |
+ barcodesFileName, |
|
8 |
+ gzipped, |
|
9 |
+ class, |
|
10 |
+ delayedArray) { |
|
11 |
+ |
|
12 |
+ cb <- .readBarcodes(file.path(dir, barcodesFileName)) |
|
13 |
+ fe <- .readFeatures(file.path(dir, featuresFileName)) |
|
14 |
+ ma <- .readMatrixMM(file.path(dir, matrixFileName), |
|
15 |
+ gzipped = gzipped, |
|
16 |
+ class = class, |
|
17 |
+ delayedArray = delayedArray) |
|
18 |
+ |
|
19 |
+ coln <- paste(sample, cb[[1]], sep = "_") |
|
20 |
+ rownames(ma) <- fe[[1]] |
|
21 |
+ |
|
22 |
+ sce <- SingleCellExperiment::SingleCellExperiment( |
|
23 |
+ assays = list(counts = ma)) |
|
24 |
+ SummarizedExperiment::rowData(sce) <- fe |
|
25 |
+ SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb, |
|
26 |
+ column_name = coln, |
|
27 |
+ sample = sample, |
|
28 |
+ row.names = coln) |
|
29 |
+ |
|
30 |
+ return(sce) |
|
31 |
+} |
|
32 |
+ |
|
33 |
+ |
|
34 |
+# main function |
|
35 |
+.importSTARsolo <- function(STARsoloDirs, |
|
36 |
+ samples, |
|
37 |
+ STARsoloOuts, |
|
38 |
+ matrixFileNames, |
|
39 |
+ featuresFileNames, |
|
40 |
+ barcodesFileNames, |
|
41 |
+ gzipped, |
|
42 |
+ class, |
|
43 |
+ delayedArray) { |
|
44 |
+ |
|
45 |
+ if (length(STARsoloDirs) != length(samples)) { |
|
46 |
+ stop("'STARsoloDirs' and 'samples' have unequal lengths!") |
|
47 |
+ } |
|
48 |
+ |
|
49 |
+ res <- vector("list", length = length(samples)) |
|
50 |
+ |
|
51 |
+ STARsoloOuts <- .getVectorized(STARsoloOuts, length(samples)) |
|
52 |
+ matrixFileNames <- .getVectorized(matrixFileNames, length(samples)) |
|
53 |
+ featuresFileNames <- .getVectorized(featuresFileNames, length(samples)) |
|
54 |
+ barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples)) |
|
55 |
+ gzipped <- .getVectorized(gzipped, length(samples)) |
|
56 |
+ |
|
57 |
+ for (i in seq_along(samples)) { |
|
58 |
+ dir <- file.path(STARsoloDirs[i], STARsoloOuts[i]) |
|
59 |
+ scei <- .constructSCEFromSTARsoloOutputs(dir, |
|
60 |
+ sample = samples[i], |
|
61 |
+ matrixFileName = matrixFileNames[i], |
|
62 |
+ featuresFileName = featuresFileNames[i], |
|
63 |
+ barcodesFileName = barcodesFileNames[i], |
|
64 |
+ gzipped = gzipped[i], |
|
65 |
+ class = class, |
|
66 |
+ delayedArray = delayedArray) |
|
67 |
+ res[[i]] <- scei |
|
68 |
+ } |
|
69 |
+ |
|
70 |
+ sce <- do.call(SingleCellExperiment::cbind, res) |
|
71 |
+ return(sce) |
|
72 |
+} |
|
73 |
+ |
|
74 |
+ |
|
75 |
+#' @name importSTARsolo |
|
76 |
+#' @rdname importSTARsolo |
|
77 |
+#' @title Construct SCE object from STARsolo outputs |
|
78 |
+#' @description Read the barcodes, features (genes), and matrices from STARsolo |
|
79 |
+#' outputs. Import them |
|
80 |
+#' as one \link[SingleCellExperiment]{SingleCellExperiment} object. |
|
81 |
+#' @param STARsoloDirs A vector of root directories of STARsolo output files. |
|
82 |
+#' The paths should be something like this: |
|
83 |
+#' \bold{/PATH/TO/\emph{prefix}Solo.out}. For example: \code{./Solo.out}. |
|
84 |
+#' Each sample should have its own path. Must have the same length as |
|
85 |
+#' \code{samples}. |
|
86 |
+#' @param samples A vector of user-defined sample names for the sample to be |
|
87 |
+#' imported. Must have the same length as \code{STARsoloDirs}. |
|
88 |
+#' @param STARsoloOuts Character vector. The intermediate |
|
89 |
+#' paths to filtered or raw cell barcode, feature, and matrix files |
|
90 |
+#' for each of \code{samples}. Default \code{"Gene/filtered"} which works |
|
91 |
+#' for STAR 2.7.3a. Must have length 1 or the same |
|
92 |
+#' length as \code{samples}. |
|
93 |
+#' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse |
|
94 |
+#' matrix file (.mtx file). Must have length 1 or the same |
|
95 |
+#' length as \code{samples}. |
|
96 |
+#' @param featuresFileNames Filenames for the feature annotation file. |
|
97 |
+#' Must have length 1 or the same |
|
98 |
+#' length as \code{samples}. |
|
99 |
+#' @param barcodesFileNames Filenames for the cell barcode list file. |
|
100 |
+#' Must have length 1 or the same |
|
101 |
+#' length as \code{samples}. |
|
102 |
+#' @param gzipped Boolean. \code{TRUE} if the STARsolo output files |
|
103 |
+#' (barcodes.tsv, features.tsv, and matrix.mtx) were |
|
104 |
+#' gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in STAR |
|
105 |
+#' 2.7.3a. Default \code{"auto"} which automatically detects if the |
|
106 |
+#' files are gzip compressed. Must have length 1 or the same |
|
107 |
+#' length as \code{samples}. |
|
108 |
+#' @param class Character. The class of the expression matrix stored in the SCE |
|
109 |
+#' object. Can be one of "Matrix" (as returned by |
|
110 |
+#' \link[Matrix]{readMM} function), or "matrix" (as returned by |
|
111 |
+#' \link[base]{matrix} function). Default "Matrix". |
|
112 |
+#' @param delayedArray Boolean. Whether to read the expression matrix as |
|
113 |
+#' \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}. |
|
114 |
+#' @return A \code{SingleCellExperiment} object containing the count |
|
115 |
+#' matrix, the gene annotation, and the cell annotation. |
|
116 |
+#' @examples |
|
117 |
+#' # Example #1 |
|
118 |
+#' # FASTQ files were downloaded from |
|
119 |
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0 |
|
120 |
+#' # /pbmc_1k_v3 |
|
121 |
+#' # They were concatenated as follows: |
|
122 |
+#' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz > |
|
123 |
+#' # pbmc_1k_v3_R1.fastq.gz |
|
124 |
+#' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz > |
|
125 |
+#' # pbmc_1k_v3_R2.fastq.gz |
|
126 |
+#' # The following STARsolo command generates the filtered feature, cell, and |
|
127 |
+#' # matrix files |
|
128 |
+#' # STAR \ |
|
129 |
+#' # --genomeDir ./index \ |
|
130 |
+#' # --readFilesIn ./pbmc_1k_v3_R2.fastq.gz \ |
|
131 |
+#' # ./pbmc_1k_v3_R1.fastq.gz \ |
|
132 |
+#' # --readFilesCommand zcat \ |
|
133 |
+#' # --outSAMtype BAM Unsorted \ |
|
134 |
+#' # --outBAMcompression -1 \ |
|
135 |
+#' # --soloType CB_UMI_Simple \ |
|
136 |
+#' # --soloCBwhitelist ./737K-august-2016.txt \ |
|
137 |
+#' # --soloUMIlen 12 |
|
138 |
+#' |
|
139 |
+#' # The top 20 genes and the first 20 cells are included in this example. |
|
140 |
+#' sce <- importSTARsolo( |
|
141 |
+#' STARsoloDirs = system.file("extdata/STARsolo_PBMC_1k_v3_20x20", |
|
142 |
+#' package = "singleCellTK"), |
|
143 |
+#' samples = "PBMC_1k_v3_20x20") |
|
144 |
+#' @export |
|
145 |
+importSTARsolo <- function( |
|
146 |
+ STARsoloDirs, |
|
147 |
+ samples, |
|
148 |
+ STARsoloOuts = "Gene/filtered", |
|
149 |
+ matrixFileNames = "matrix.mtx", |
|
150 |
+ featuresFileNames = "features.tsv", |
|
151 |
+ barcodesFileNames = "barcodes.tsv", |
|
152 |
+ gzipped = "auto", |
|
153 |
+ class = c("Matrix", "matrix"), |
|
154 |
+ delayedArray = TRUE) { |
|
155 |
+ |
|
156 |
+ class <- match.arg(class) |
|
157 |
+ |
|
158 |
+ .importSTARsolo( |
|
159 |
+ STARsoloDirs = STARsoloDirs, |
|
160 |
+ samples = samples, |
|
161 |
+ STARsoloOuts = STARsoloOuts, |
|
162 |
+ matrixFileNames = matrixFileNames, |
|
163 |
+ featuresFileNames = featuresFileNames, |
|
164 |
+ barcodesFileNames = barcodesFileNames, |
|
165 |
+ gzipped = gzipped, |
|
166 |
+ class = class, |
|
167 |
+ delayedArray = delayedArray) |
|
168 |
+} |
0 | 169 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,236 @@ |
1 |
+.constructSCEFromSeqcOutputs <- function( |
|
2 |
+ sampleName, |
|
3 |
+ matrix, |
|
4 |
+ features, |
|
5 |
+ barcodes) { |
|
6 |
+ |
|
7 |
+ coln <- paste(sampleName, barcodes[[1]], sep = "_") |
|
8 |
+ rownames(matrix) <- features[[1]] |
|
9 |
+ |
|
10 |
+ sce <- SingleCellExperiment::SingleCellExperiment( |
|
11 |
+ assays = list(counts = matrix)) |
|
12 |
+ SummarizedExperiment::rowData(sce) <- features |
|
13 |
+ SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame( |
|
14 |
+ cell_barcode = as.character(barcodes[[1]]), |
|
15 |
+ column_name = coln, |
|
16 |
+ sample = sampleName, |
|
17 |
+ row.names = coln) |
|
18 |
+ |
|
19 |
+ return(sce) |
|
20 |
+} |
|
21 |
+ |
|
22 |
+ |
|
23 |
+.unionGeneMatrix <- function(geneUnion, matrix){ |
|
24 |
+ missGene <- geneUnion[!geneUnion %in% rownames(matrix)] |
|
25 |
+ missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix), |
|
26 |
+ dimnames = list(missGene, NULL)) |
|
27 |
+ |
|
28 |
+ matb <- methods::as(matrix, "dgCMatrix") |
|
29 |
+ rownames(matb) <- rownames(matrix) |
|
30 |
+ |
|
31 |
+ mat <- rbind(matb, missMat) |
|
32 |
+ if (anyDuplicated(rownames(mat))) { |
|
33 |
+ mat <- mat[!duplicated(rownames(mat)), ] |
|
34 |
+ warning("Duplicated genes exist in count matrix. Filtered", |
|
35 |
+ " duplicated genes.") |
|
36 |
+ } |
|
37 |
+ return(mat) |
|
38 |
+} |
|
39 |
+ |
|
40 |
+ |
|
41 |
+.getGeneUnion <- function(geneList){ |
|
42 |
+ gene <- geneList |
|
43 |
+ for (i in seq_along(geneList)){ |
|
44 |
+ gene[[i]] <- geneList[[i]][[1]] |
|
45 |
+ } |
|
46 |
+ |
|
47 |
+ geneUnion <- base::Reduce(union, gene) |
|
48 |
+ return(geneUnion) |
|
49 |
+} |
|
50 |
+ |
|
51 |
+ |
|
52 |
+.readBarcodesSEQC <- function(path) { |
|
53 |
+ res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character") |
|
54 |
+ res <- res[,-1,drop = FALSE] |
|
55 |
+ colnames(res) <- "cell_barcode" |
|
56 |
+ return(res) |
|
57 |
+} |
|
58 |
+ |
|
59 |
+ |
|
60 |
+.readFeaturesSEQC <- function(path) { |
|
61 |
+ res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character") |
|
62 |
+ res <- res[,-1,drop = FALSE] |
|
63 |
+ colnames(res) <- "feature_name" |
|
64 |
+ return(res) |
|
65 |
+} |
|
66 |
+ |
|
67 |
+ |
|
68 |
+.importSEQC <- function( |
|
69 |
+ seqcDirs, |
|
70 |
+ samples, |
|
71 |
+ prefix, |
|
72 |
+ gzipped, |
|
73 |
+ class, |
|
74 |
+ delayedArray, |
|
75 |
+ cbNotFirstCol, |
|
76 |
+ feNotFirstCol, |
|
77 |
+ combinedSample) { |
|
78 |
+ |
|
79 |
+ if (length(seqcDirs) != length(samples)) { |
|
80 |
+ stop("'seqcDirs' and 'samples' have unequal lengths!") |
|
81 |
+ } |
|
82 |
+ |
|
83 |
+ if (length(seqcDirs) != length(prefix)) { |
|
84 |
+ stop("'seqcDirs' and 'prefix' have unequal lengths!") |
|
85 |
+ } |
|
86 |
+ |
|
87 |
+ res <- vector("list", length = length(seqcDirs)) |
|
88 |
+ cb <- vector("list", length = length(seqcDirs)) |
|
89 |
+ fe <- vector("list", length = length(seqcDirs)) |
|
90 |
+ mat <- vector("list", length = length(seqcDirs)) |
|
91 |
+ |
|
92 |
+ for (i in seq_along(seqcDirs)) { |
|
93 |
+ dir <- seqcDirs[i] |
|
94 |
+ matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_") |
|
95 |
+ featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_") |
|
96 |
+ barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv', |
|
97 |
+ sep = "_") |
|
98 |
+ |
|
99 |
+ cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile)) |
|
100 |
+ fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile)) |
|
101 |
+ |
|
102 |
+ mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile), |
|
103 |
+ gzipped = gzipped, class = class, delayedArray = delayedArray) |
|
104 |
+ mat[[i]] <- t(mat[[i]]) |
|
105 |
+ rownames(mat[[i]]) <- fe[[i]][[1]] |
|
106 |
+ } |
|
107 |
+ |
|
108 |
+ if (isTRUE(combinedSample) & length(seqcDirs) > 1) { |
|
109 |
+ geneUnion <- .getGeneUnion(fe) |
|
110 |
+ for (i in seq_along(seqcDirs)) { |
|
111 |
+ matrix <- .unionGeneMatrix(geneUnion = geneUnion, matrix = mat[[i]]) |
|
112 |
+ matrix <- matrix[geneUnion, ] |
|
113 |
+ feature <- S4Vectors::DataFrame('feature_name' = rownames(matrix)) |
|
114 |
+ |
|
115 |
+ scei <- .constructSCEFromSeqcOutputs( |
|
116 |
+ sampleName = samples[i], |
|
117 |
+ matrix = matrix, |
|
118 |
+ features = feature, |
|
119 |
+ barcodes = cb[[i]]) |
|
120 |
+ res[[i]] <- scei |
|
121 |
+ |
|
122 |
+ } |
|
123 |
+ sce <- do.call(SingleCellExperiment::cbind, res) |
|
124 |
+ return(sce) |
|
125 |
+ |
|
126 |
+ } else { |
|
127 |
+ for (i in seq_along(seqcDirs)) { |
|
128 |
+ scei <- .constructSCEFromSeqcOutputs( |
|
129 |
+ sampleName = samples[i], |
|
130 |
+ matrix = mat[[i]], |
|
131 |
+ features = fe[[i]], |
|
132 |
+ barcodes = cb[[i]]) |
|
133 |
+ res[[i]] <- scei |
|
134 |
+ } |
|
135 |
+ if (length(seqcDirs) == 1) { |
|
136 |
+ return(res[[1]]) |
|
137 |
+ } else { |
|
138 |
+ return(res) |
|
139 |
+ } |
|
140 |
+ } |
|
141 |
+} |
|
142 |
+ |
|
143 |
+ |
|
144 |
+#' @name importSEQC |
|
145 |
+#' @rdname importSEQC |
|
146 |
+#' @title Construct SCE object from seqc output |
|
147 |
+#' @description Read the filtered barcodes, features, and matrices for all |
|
148 |
+#' samples from (preferably a single run of) seqc output. Import and |
|
149 |
+#' combine them as one big \link[SingleCellExperiment]{SingleCellExperiment} |
|
150 |
+#' object. |
|
151 |
+#' @param seqcDirs A vector of paths to seqc output files. Each sample |
|
152 |
+#' should have its own path. For example: \code{./pbmc_1k_50x50}. |
|
153 |
+#' Must have the same length as \code{samples}. |
|
154 |
+#' @param samples A vector of user-defined sample names for the samples to be |
|
155 |
+#' imported. Must have the same length as \code{seqcDirs}. |
|
156 |
+#' @param prefix A vector containing the prefix of file names within each |
|
157 |
+#' sample directory. It cannot be null and the vector should have the same |
|
158 |
+#' length as \emph{samples}. |
|
159 |
+#' @param gzipped Boolean. \code{TRUE} if the seqc output files |
|
160 |
+#' (sparse_counts_barcode.csv, sparse_counts_genes.csv, and |
|
161 |
+#' sparse_molecule_counts.mtx) |
|
162 |
+#' were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are |
|
163 |
+#' not gzipped. |
|
164 |
+#' Default \code{FALSE}. |
|
165 |
+#' @param class Character. The class of the expression matrix stored in the SCE |
|
166 |
+#' object. Can be one of "Matrix" (as returned by |
|
167 |
+#' \link[Matrix]{readMM} function), or "matrix" (as returned by |
|
168 |
+#' \link[base]{matrix} function). Default "Matrix". |
|
169 |
+#' @param delayedArray Boolean. Whether to read the expression matrix as |
|
170 |
+#' \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}. |
|
171 |
+#' @param feNotFirstCol Boolean. \code{TRUE} if first column of |
|
172 |
+#' sparse_counts_genes.csv |
|
173 |
+#' is row index and it will be removed. \code{FALSE} the first column will |
|
174 |
+#' be kept. |
|
175 |
+#' @param cbNotFirstCol Boolean. \code{TRUE} if first column of |
|
176 |
+#' sparse_counts_barcode.csv |
|
177 |
+#' is row index and it will be removed. \code{FALSE} the first column will |
|
178 |
+#' be kept. |
|
179 |
+#' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a |
|
180 |
+#' \code{SingleCellExperiment} object containing the combined count matrix, |
|
181 |
+#' feature annotations |
|
182 |
+#' and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a |
|
183 |
+#' list containing multiple |
|
184 |
+#' \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment} |
|
185 |
+#' contains count matrix, feature annotations and cell annotations for |
|
186 |
+#' each sample. |
|
187 |
+#' @details |
|
188 |
+#' \code{importSEQC} imports output from seqc. |
|
189 |
+#' The default sparse_counts_barcode.csv or sparse_counts_genes.csv from |
|
190 |
+#' seqc output |
|
191 |
+#' contains two columns. The first column is row index and the second column |
|
192 |
+#' is cell-barcode |
|
193 |
+#' or gene symbol. \code{importSEQC} will remove first column. Alternatively, |
|
194 |
+#' user can call |
|
195 |
+#' \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first |
|
196 |
+#' column of these files. |
|
197 |
+#' When \code{combinedSample} is TRUE, \code{importSEQC} will combined count |
|
198 |
+#' matrix with genes detected in at least one sample. |
|
199 |
+#' @return A \code{SingleCellExperiment} object containing the combined count |
|
200 |
+#' matrix, the feature annotations, and the cell annotation. |
|
201 |
+#' @examples |
|
202 |
+#' # Example #1 |
|
203 |
+#' # The following filtered feature, cell, and matrix files were downloaded from |
|
204 |
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/ |
|
205 |
+#' # 3.0.0/pbmc_1k_v3 |
|
206 |
+#' # The top 50 hg38 genes are included in this example. |
|
207 |
+#' # Only the top 50 cells are included. |
|
208 |
+#' sce <- importSEQC( |
|
209 |
+#' seqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"), |
|
210 |
+#' samples = "pbmc_1k_50x50", |
|
211 |
+#' prefix = "pbmc_1k", |
|
212 |
+#' combinedSample = FALSE) |
|
213 |
+#' @export |
|
214 |
+importSEQC <- function( |
|
215 |
+ seqcDirs = NULL, |
|
216 |
+ samples = NULL, |
|
217 |
+ prefix = NULL, |
|
218 |
+ gzipped = FALSE, |
|
219 |
+ class = c("Matrix", "matrix"), |
|
220 |
+ delayedArray = TRUE, |
|
221 |
+ cbNotFirstCol = TRUE, |
|
222 |
+ feNotFirstCol = TRUE, |
|
223 |
+ combinedSample = TRUE) { |
|
224 |
+ |
|
225 |
+ class <- match.arg(class) |
|
226 |
+ |
|
227 |
+ .importSEQC(seqcDirs = seqcDirs, |
|
228 |
+ samples = samples, |
|
229 |
+ prefix = prefix, |
|
230 |
+ gzipped = gzipped, |
|
231 |
+ class = class, |
|
232 |
+ delayedArray = delayedArray, |
|
233 |
+ cbNotFirstCol = cbNotFirstCol, |
|
234 |
+ feNotFirstCol = feNotFirstCol, |
|
235 |
+ combinedSample = combinedSample) |
|
236 |
+} |
... | ... |
@@ -49,68 +49,6 @@ summarizeSCE <- function(inSCE, useAssay = NULL, sampleVariableName = NULL){ |
49 | 49 |
return(df) |
50 | 50 |
} |
51 | 51 |
|
52 |
- |
|
53 |
-#' Filter Genes and Samples from a Single Cell Object |
|
54 |
-#' |
|
55 |
-#' @param inSCE Input \linkS4class{SingleCellExperiment} object. Required |
|
56 |
-#' @param useAssay Indicate which assay to use for filtering. Default is |
|
57 |
-#' "counts" |
|
58 |
-#' @param deletesamples List of samples to delete from the object. |
|
59 |
-#' @param removeNoExpress Remove genes that have no expression across all |
|
60 |
-#' samples. The default is true |
|
61 |
-#' @param removeBottom Fraction of low expression genes to remove from the |
|
62 |
-#' single cell object. This occurs after removeNoExpress. The default is 0.50. |
|
63 |
-#' @param minimumDetectGenes Minimum number of genes with at least 1 |
|
64 |
-#' count to include a sample in the single cell object. The default is 1700. |
|
65 |
-#' @param filterSpike Apply filtering to Spike in controls (indicated by |
|
66 |
-#' isSpike). |
|
67 |
-#' The default is TRUE. |
|
68 |
-#' |
|
69 |
-#' @return The filtered single cell object. |
|
70 |
-#' @export |
|
71 |
-#' @examples |
|
72 |
-#' data("mouseBrainSubsetSCE") |
|
73 |
-#' mouseBrainSubsetSCE <- filterSCData(mouseBrainSubsetSCE, |
|
74 |
-#' deletesamples="X1772063061_G11") |
|
75 |
-filterSCData <- function(inSCE, useAssay="counts", deletesamples=NULL, |
|
76 |
- removeNoExpress=TRUE, removeBottom=0.5, |
|
77 |
- minimumDetectGenes=1700, filterSpike=TRUE){ |
|
78 |
- #delete specified samples |
|
79 |
- inSCE <- inSCE[, !(colnames(inSCE) %in% deletesamples)] |
|
80 |
- |
|
81 |
- if (filterSpike){ |
|
82 |
- nkeeprows <- ceiling((1 - removeBottom) * as.numeric(nrow(inSCE))) |
|
83 |
- tokeeprow <- order(rowSums(SummarizedExperiment::assay(inSCE, useAssay)), |
|
84 |
- decreasing = TRUE)[seq_len(nkeeprows)] |
|
85 |
- } else { |
|
86 |
- nkeeprows <- ceiling((1 - removeBottom) * as.numeric(nrow(inSCE))) - |
|
87 |
- sum(SingleCellExperiment::isSpike(inSCE)) |
|
88 |
- tokeeprow <- order(rowSums(SummarizedExperiment::assay(inSCE, useAssay)), |
|
89 |
- decreasing = TRUE) |
|
90 |
- tokeeprow <- setdiff(tokeeprow, |
|
91 |
- which(SingleCellExperiment::isSpike(inSCE))) |
|
92 |
- tokeeprow <- tokeeprow[seq_len(nkeeprows)] |
|
93 |
- tokeeprow <- c(tokeeprow, which(SingleCellExperiment::isSpike(inSCE))) |
|
94 |
- } |
|
95 |
- tokeepcol <- colSums(SummarizedExperiment::assay(inSCE, useAssay) != 0) >= |
|
96 |
- minimumDetectGenes |
|
97 |
- inSCE <- inSCE[tokeeprow, tokeepcol] |
|
98 |
- |
|
99 |
- #remove genes with no expression |
|
100 |
- if (removeNoExpress){ |
|
101 |
- if (filterSpike){ |
|
102 |
- inSCE <- inSCE[rowSums(SummarizedExperiment::assay(inSCE, |
|
103 |
- useAssay)) != 0, ] |
|
104 |
- } else { |
|
105 |
- inSCE <- inSCE[(rowSums( |
|
106 |
- SummarizedExperiment::assay(inSCE, useAssay)) != 0 | |
|
107 |
- SingleCellExperiment::isSpike(inSCE)), ] |
|
108 |
- } |
|
109 |
- } |
|
110 |
- |
|
111 |
- return(inSCE) |
|
112 |
-} |
|
113 |
- |
|
114 | 52 |
#' Generate a distinct palette for coloring different clusters |
115 | 53 |
#' |
116 | 54 |
#' @param n Integer; Number of colors to generate |
... | ... |
@@ -168,7 +106,7 @@ distinctColors <- function(n, hues = c("red", "cyan", "orange", "blue", |
168 | 106 |
#' |
169 | 107 |
#' @description Three different generation methods are wrapped, including |
170 | 108 |
#' \code{\link[celda]{distinctColors}}, |
171 |
-#' \code{\link[randomcoloR]{distinctColorPalette}} and the \code{ggplot} |
|
109 |
+#' [randomcoloR](SCTK_PerformingQC_Cell_V3.Rmd) and the \code{ggplot} |
|
172 | 110 |
#' default color generation. |
173 | 111 |
#' @param n An integer, the number of color codes to generate. |
174 | 112 |
#' @param palette A single character string. Select the method, available |
175 | 113 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,91 @@ |
1 |
+ |
|
2 |
+#' @importFrom tools file_ext |
|
3 |
+.getFileExt <- function(file) { |
|
4 |
+ ext1 <- tools::file_ext(file) |
|
5 |
+ if (!(ext1 %in% c("mtx", "txt", "csv", "tab", "tsv", |
|
6 |
+ "npz", "gz", "bz2", "xz"))) { |
|
7 |
+ |
|
8 |
+ warning("Unknown extension ", ext1, ". Treat as text file.") |
|
9 |
+ return(c("unknown")) |
|
10 |
+ } |
|
11 |
+ |
|
12 |
+ # if compressed |
|
13 |
+ if (ext1 %in% c("gz", "bz2", "xz")) { |
|
14 |
+ bn <- substr(file, start = 1, stop = nchar(file) - nchar(ext1) - 1) |
|
15 |
+ ext2 <- tools::file_ext(bn) |
|
16 |
+ if (!(ext2 %in% c("mtx", "txt", "csv", "tab", "npz"))) { |
|
17 |
+ warning("Unknown extension ", ext2, ". Treat as text file.") |
|
18 |
+ return(c(ext1, "unknown")) |
|
19 |
+ } |
|
20 |
+ return(c(ext1, ext2)) |
|
21 |
+ } else { |
|
22 |
+ return(ext1) |
|
23 |
+ } |
|
24 |
+} |
|
25 |
+ |
|
26 |
+ |
|
27 |
+#' @name readSingleCellMatrix |
|
28 |
+#' @title Read single cell expression matrix |
|
29 |
+#' @description Automatically detact the format of the input file and read |
|
30 |
+#' the file. |
|
31 |
+#' @param file Path to input file. Supported file endings include .mtx, .txt, |
|
32 |
+#' .csv, .tab, .tsv, .npz, and their corresponding \code{gzip}, |
|
33 |
+#' \code{bzip2}, or \code{xz} compressed extensions (\code{*.gz}, |
|
34 |
+#' \code{*.bz2}, or \code{*.xz}). |
|
35 |
+#' @param class Character. Class of matrix. One of "Matrix" or "matrix". Specifying "Matrix" |
|
36 |
+#' will convert to a sparse format which should be used |
|
37 |
+#' for datasets with large numbers of cells. Default "Matrix". |
|
38 |
+#' @param delayedArray Boolean. Whether to read the expression matrix as |
|
39 |
+#' \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}. |
|
40 |
+#' @param colIndexLocation Character. For Optimus output, the path to the |
|
41 |
+#' barcode index .npy file. Used only if \code{file} has .npz extension. |
|
42 |
+#' Default \code{NULL}. |
|
43 |
+#' @param rowIndexLocation Character. For Optimus output, The path to the |
|
44 |
+#' feature (gene) index .npy file. Used only if \code{file} has .npz extension. |
|
45 |
+#' Default \code{NULL}. |
|
46 |
+#' @examples |
|
47 |