R/importBUStools.R
3a66a102
 
 # dir <- "genecount"
 .constructSCEFromBUStoolsOutputs <- function(dir,
     sample,
     matrixFileName,
     featuresFileName,
     barcodesFileName,
     gzipped,
     class,
     delayedArray) {
 
     cb <- .readBarcodes(file.path(dir, barcodesFileName))
     fe <- .readFeatures(file.path(dir, featuresFileName))
     ma <- .readMatrixMM(file.path(dir, matrixFileName),
         gzipped = gzipped,
         class = class,
         delayedArray = delayedArray)
     ma <- t(ma)
 
     coln <- paste(sample, cb[[1]], sep = "_")
     rownames(ma) <- fe[[1]]
 
     sce <- SingleCellExperiment::SingleCellExperiment(
         assays = list(counts = ma))
     SummarizedExperiment::rowData(sce) <- fe
     SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb,
         column_name = coln,
         sample = sample,
         row.names = coln)
dfe50c33
     
3a66a102
     return(sce)
 }
 
 
 # main function
 .importBUStools <- function(
     BUStoolsDirs,
     samples,
     matrixFileNames,
     featuresFileNames,
     barcodesFileNames,
     gzipped,
     class,
dfe50c33
     delayedArray,
     rowNamesDedup) {
3a66a102
 
     if (length(BUStoolsDirs) != length(samples)) {
         stop("'BUStoolsDirs' and 'samples' have unequal lengths!")
     }
 
     res <- vector("list", length = length(samples))
 
     matrixFileNames <- .getVectorized(matrixFileNames, length(samples))
     featuresFileNames <- .getVectorized(featuresFileNames, length(samples))
     barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples))
     gzipped <- .getVectorized(gzipped, length(samples))
 
     for (i in seq_along(samples)) {
         dir <- file.path(BUStoolsDirs[i])
         scei <- .constructSCEFromBUStoolsOutputs(dir,
             sample = samples[i],
             matrixFileName = matrixFileNames[i],
             featuresFileName = featuresFileNames[i],
             barcodesFileName = barcodesFileNames[i],
             gzipped = gzipped[i],
             class = class,
             delayedArray = delayedArray)
         res[[i]] <- scei
     }
 
     sce <- do.call(SingleCellExperiment::cbind, res)
dfe50c33
     
     if (isTRUE(rowNamesDedup)) {
         if (any(duplicated(rownames(sce)))) {
             message("Duplicated gene names found, adding '-1', '-2', ",
                     "... suffix to them.")
         }
         sce <- dedupRowNames(sce)
     }
     
3a66a102
     return(sce)
 }
 
 
 #' @name importBUStools
 #' @rdname importBUStools
 #' @title Construct SCE object from BUStools output
 #' @description Read the barcodes, features (genes), and matrix from BUStools
 #'  output. Import them
 #'  as one \link[SingleCellExperiment]{SingleCellExperiment} object. Note the
 #'  cells in the output files for BUStools 0.39.4 are not filtered.
 #' @param BUStoolsDirs A vector of paths to BUStools output files. Each sample
 #'  should have its own path. For example: \code{./genecount}.
 #'  Must have the same length as \code{samples}.
 #' @param samples A vector of user-defined sample names for the samples to be
 #'  imported. Must have the same length as \code{BUStoolsDirs}.
 #' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse
 #'  matrix files (.mtx files). Must have length 1 or the same
 #'  length as \code{samples}.
 #' @param featuresFileNames Filenames for the feature annotation files.
 #'  Must have length 1 or the same length as \code{samples}.
 #' @param barcodesFileNames Filenames for the cell barcode list file.
 #'  Must have length 1 or the same length as \code{samples}.
 #' @param gzipped Boolean. \code{TRUE} if the BUStools output files
 #'  (barcodes.txt, genes.txt, and genes.mtx) were
 #'  gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in BUStools
 #'  0.39.4. Default \code{"auto"} which automatically detects if the
 #'  files are gzip compressed. Must have length 1 or the same length as
 #'  \code{samples}.
 #' @param class Character. The class of the expression matrix stored in the SCE
 #'  object. Can be one of "Matrix" (as returned by
6fe21ae1
 #'  \link{readMM} function), or "matrix" (as returned by
3a66a102
 #'  \link[base]{matrix} function). Default "Matrix".
 #' @param delayedArray Boolean. Whether to read the expression matrix as
969718f9
 #'  \link[DelayedArray]{DelayedArray-class} object or not. Default \code{FALSE}.
dfe50c33
 #' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
 #'  \code{TRUE}.
3a66a102
 #' @return A \code{SingleCellExperiment} object containing the count
 #'  matrix, the gene annotation, and the cell annotation.
 #' @examples
 #' # Example #1
 #' # FASTQ files were downloaded from
 #' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0
 #' # /pbmc_1k_v3
 #' # They were concatenated as follows:
 #' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz >
 #' # pbmc_1k_v3_R1.fastq.gz
 #' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz >
 #' # pbmc_1k_v3_R2.fastq.gz
 #' # The following BUStools command generates the gene, cell, and
 #' # matrix files
 #'
 #' # bustools correct -w ./3M-february-2018.txt -p output.bus | \
 #' #   bustools sort -T tmp/ -t 4 -p - | \
 #' #   bustools count -o genecount/genes \
 #' #     -g ./transcripts_to_genes.txt \
 #' #     -e matrix.ec \
 #' #     -t transcripts.txt \
 #' #     --genecounts -
 #'
 #' # The top 20 genes and the first 20 cells are included in this example.
 #' sce <- importBUStools(
 #'   BUStoolsDirs = system.file("extdata/BUStools_PBMC_1k_v3_20x20/genecount/",
 #'     package = "singleCellTK"),
 #'   samples = "PBMC_1k_v3_20x20")
 #' @export
 importBUStools <- function(
     BUStoolsDirs,
     samples,
     matrixFileNames = "genes.mtx",
     featuresFileNames = "genes.genes.txt",
     barcodesFileNames = "genes.barcodes.txt",
     gzipped = "auto",
     class = c("Matrix", "matrix"),
dfe50c33
     delayedArray = FALSE,
     rowNamesDedup = TRUE) {
3a66a102
 
     class <- match.arg(class)
 
     .importBUStools(
         BUStoolsDirs = BUStoolsDirs,
         samples = samples,
         matrixFileNames = matrixFileNames,
         featuresFileNames = featuresFileNames,
         barcodesFileNames = barcodesFileNames,
         gzipped = gzipped,
         class = class,
dfe50c33
         delayedArray = delayedArray,
         rowNamesDedup = rowNamesDedup)
3a66a102
 }