R/importSeqc.R
3a66a102
 .constructSCEFromSeqcOutputs <- function(
     sampleName,
     matrix,
     features,
     barcodes) {
 
     coln <- paste(sampleName, barcodes[[1]], sep = "_")
     rownames(matrix) <- features[[1]]
 
     sce <- SingleCellExperiment::SingleCellExperiment(
         assays = list(counts = matrix))
     SummarizedExperiment::rowData(sce) <- features
     SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
         cell_barcode = as.character(barcodes[[1]]),
         column_name = coln,
         sample = sampleName,
         row.names = coln)
 
     return(sce)
 }
 
 
 .unionGeneMatrix <- function(geneUnion, matrix){
     missGene <- geneUnion[!geneUnion %in% rownames(matrix)]
     missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
         dimnames = list(missGene, NULL))
 
     matb <- methods::as(matrix, "dgCMatrix")
     rownames(matb) <- rownames(matrix)
 
     mat <- rbind(matb, missMat)
     if (anyDuplicated(rownames(mat))) {
         mat <- mat[!duplicated(rownames(mat)), ]
         warning("Duplicated genes exist in count matrix. Filtered",
             " duplicated genes.")
     }
     return(mat)
 }
 
 
 .getGeneUnion <- function(geneList){
     gene <- geneList
     for (i in seq_along(geneList)){
         gene[[i]] <- geneList[[i]][[1]]
     }
 
     geneUnion <- base::Reduce(union, gene)
     return(geneUnion)
 }
 
 
 .readBarcodesSEQC <- function(path) {
     res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
     res <- res[,-1,drop = FALSE]
     colnames(res) <- "cell_barcode"
     return(res)
 }
 
 
 .readFeaturesSEQC <- function(path) {
     res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
     res <- res[,-1,drop = FALSE]
     colnames(res) <- "feature_name"
     return(res)
 }
 
 
 .importSEQC <- function(
     seqcDirs,
     samples,
     prefix,
     gzipped,
     class,
     delayedArray,
     cbNotFirstCol,
     feNotFirstCol,
dfe50c33
     combinedSample,
     rowNamesDedup) {
3a66a102
 
     if (length(seqcDirs) != length(samples)) {
         stop("'seqcDirs' and 'samples' have unequal lengths!")
     }
 
     if (length(seqcDirs) != length(prefix)) {
         stop("'seqcDirs' and 'prefix' have unequal lengths!")
     }
 
     res <- vector("list", length = length(seqcDirs))
     cb <- vector("list", length = length(seqcDirs))
     fe <- vector("list", length = length(seqcDirs))
     mat <- vector("list", length = length(seqcDirs))
 
     for (i in seq_along(seqcDirs)) {
         dir <- seqcDirs[i]
         matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_")
         featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
         barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv',
             sep = "_")
 
         cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile))
         fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile))
 
         mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
             gzipped = gzipped, class = class, delayedArray = delayedArray)
         mat[[i]] <- t(mat[[i]])
         rownames(mat[[i]]) <- fe[[i]][[1]]
     }
 
     if (isTRUE(combinedSample) & length(seqcDirs) > 1) {
         geneUnion <- .getGeneUnion(fe)
         for (i in seq_along(seqcDirs)) {
             matrix <- .unionGeneMatrix(geneUnion = geneUnion, matrix = mat[[i]])
             matrix <- matrix[geneUnion, ]
             feature <- S4Vectors::DataFrame('feature_name' = rownames(matrix))
 
             scei <- .constructSCEFromSeqcOutputs(
                 sampleName = samples[i],
                 matrix = matrix,
                 features = feature,
                 barcodes = cb[[i]])
             res[[i]] <- scei
 
         }
         sce <- do.call(SingleCellExperiment::cbind, res)
dfe50c33
         if (isTRUE(rowNamesDedup)) {
             if (any(duplicated(rownames(sce)))) {
                 message("Duplicated gene names found, adding '-1', '-2', ",
                         "... suffix to them.")
             }
             sce <- dedupRowNames(sce)
         }
3a66a102
         return(sce)
 
     } else {
         for (i in seq_along(seqcDirs)) {
             scei <- .constructSCEFromSeqcOutputs(
                 sampleName = samples[i],
                 matrix = mat[[i]],
                 features = fe[[i]],
                 barcodes = cb[[i]])
             res[[i]] <- scei
         }
         if (length(seqcDirs) == 1) {
dfe50c33
             sce <- res[[1]]
             if (isTRUE(rowNamesDedup)) {
                 if (any(duplicated(rownames(sce)))) {
                     message("Duplicated gene names found, adding '-1', '-2', ",
                             "... suffix to them.")
                 }
                 sce <- dedupRowNames(sce)
             }
             return(sce)
3a66a102
         } else {
             return(res)
         }
     }
 }
 
 
 #' @name importSEQC
 #' @rdname importSEQC
 #' @title Construct SCE object from seqc output
 #' @description Read the filtered barcodes, features, and matrices for all
dfe50c33
 #'  samples from (preferably a single run of) seqc output. Import and combine
 #'  them as one big \link[SingleCellExperiment]{SingleCellExperiment} object.
3a66a102
 #' @param seqcDirs A vector of paths to seqc output files. Each sample
dfe50c33
 #'  should have its own path. For example: \code{"./pbmc_1k_50x50"}. Must have 
 #'  the same length as \code{samples}.
3a66a102
 #' @param samples A vector of user-defined sample names for the samples to be
 #'  imported. Must have the same length as \code{seqcDirs}.
 #' @param prefix A vector containing the prefix of file names within each
 #'  sample directory. It cannot be null and the vector should have the same
 #'  length as \emph{samples}.
 #' @param gzipped Boolean. \code{TRUE} if the seqc output files
 #'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
dfe50c33
 #'  sparse_molecule_counts.mtx) were gzip compressed. \code{FALSE} otherwise. 
 #'  Default seqc outputs are not gzipped. Default \code{FALSE}.
3a66a102
 #' @param class Character. The class of the expression matrix stored in the SCE
dfe50c33
 #'  object. Can be one of \code{"Matrix"} (as returned by \link{readMM} 
 #'  function), or \code{"matrix"} (as returned by \link[base]{matrix} function).
 #'  Default \code{"Matrix"}.
3a66a102
 #' @param delayedArray Boolean. Whether to read the expression matrix as
969718f9
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
3a66a102
 #' @param feNotFirstCol Boolean. \code{TRUE} if first column of
dfe50c33
 #'  sparse_counts_genes.csv is row index and it will be removed. \code{FALSE} 
 #'  the first column will be kept.
3a66a102
 #' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
dfe50c33
 #'  sparse_counts_barcode.csv is row index and it will be removed. \code{FALSE} 
 #'  the first column will be kept.
3a66a102
 #' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
 #' \code{SingleCellExperiment} object containing the combined count matrix,
dfe50c33
 #'  feature annotations and the cell annotations. If \code{FALSE}, 
 #'  \code{importSEQC} returns a list containing multiple
3a66a102
 #'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
 #'  contains count matrix, feature annotations and cell annotations for
 #'  each sample.
dfe50c33
 #' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Only applied 
 #' if \code{combinedSample} is \code{TRUE} or only one \code{seqcDirs} 
 #' specified. Default \code{TRUE}.
3a66a102
 #' @details
dfe50c33
 #' \code{importSEQC} imports output from seqc. The default 
 #'  sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
3a66a102
 #'  contains two columns. The first column is row index and the second column
dfe50c33
 #'  is cell-barcode or gene symbol. \code{importSEQC} will remove first column. 
 #'  Alternatively, user can call
3a66a102
 #'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
dfe50c33
 #'  column of these files. When \code{combinedSample} is TRUE, \code{importSEQC}
 #'  will combined count matrix with genes detected in at least one sample.
3a66a102
 #' @return A \code{SingleCellExperiment} object containing the combined count
 #'  matrix, the feature annotations, and the cell annotation.
 #' @examples
 #' # Example #1
 #' # The following filtered feature, cell, and matrix files were downloaded from
 #' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/
 #' # 3.0.0/pbmc_1k_v3
 #' # The top 50 hg38 genes are included in this example.
 #' # Only the top 50 cells are included.
 #' sce <- importSEQC(
 #'     seqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"),
 #'     samples = "pbmc_1k_50x50",
 #'     prefix = "pbmc_1k",
 #'     combinedSample = FALSE)
 #' @export
 importSEQC <- function(
     seqcDirs = NULL,
     samples = NULL,
     prefix = NULL,
     gzipped = FALSE,
     class = c("Matrix", "matrix"),
969718f9
     delayedArray = FALSE,
3a66a102
     cbNotFirstCol = TRUE,
     feNotFirstCol = TRUE,
dfe50c33
     combinedSample = TRUE,
     rowNamesDedup = TRUE) {
3a66a102
 
     class <- match.arg(class)
 
     .importSEQC(seqcDirs = seqcDirs,
         samples = samples,
         prefix = prefix,
         gzipped = gzipped,
         class = class,
         delayedArray = delayedArray,
         cbNotFirstCol = cbNotFirstCol,
         feNotFirstCol = feNotFirstCol,
dfe50c33
         combinedSample = combinedSample,
         rowNamesDedup = rowNamesDedup)
3a66a102
 }