# dir <- "genecount" .constructSCEFromBUStoolsOutputs <- function(dir, sample, matrixFileName, featuresFileName, barcodesFileName, gzipped, class, delayedArray) { cb <- .readBarcodes(file.path(dir, barcodesFileName)) fe <- .readFeatures(file.path(dir, featuresFileName)) ma <- .readMatrixMM(file.path(dir, matrixFileName), gzipped = gzipped, class = class, delayedArray = delayedArray) ma <- t(ma) coln <- paste(sample, cb[[1]], sep = "_") rownames(ma) <- fe[[1]] sce <- SingleCellExperiment::SingleCellExperiment( assays = list(counts = ma)) SummarizedExperiment::rowData(sce) <- fe SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb, column_name = coln, sample = sample, row.names = coln) return(sce) } # main function .importBUStools <- function( BUStoolsDirs, samples, matrixFileNames, featuresFileNames, barcodesFileNames, gzipped, class, delayedArray, rowNamesDedup) { if (length(BUStoolsDirs) != length(samples)) { stop("'BUStoolsDirs' and 'samples' have unequal lengths!") } res <- vector("list", length = length(samples)) matrixFileNames <- .getVectorized(matrixFileNames, length(samples)) featuresFileNames <- .getVectorized(featuresFileNames, length(samples)) barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples)) gzipped <- .getVectorized(gzipped, length(samples)) for (i in seq_along(samples)) { dir <- file.path(BUStoolsDirs[i]) scei <- .constructSCEFromBUStoolsOutputs(dir, sample = samples[i], matrixFileName = matrixFileNames[i], featuresFileName = featuresFileNames[i], barcodesFileName = barcodesFileNames[i], gzipped = gzipped[i], class = class, delayedArray = delayedArray) res[[i]] <- scei } sce <- do.call(SingleCellExperiment::cbind, res) if (isTRUE(rowNamesDedup)) { if (any(duplicated(rownames(sce)))) { message("Duplicated gene names found, adding '-1', '-2', ", "... suffix to them.") } sce <- dedupRowNames(sce) } return(sce) } #' @name importBUStools #' @rdname importBUStools #' @title Construct SCE object from BUStools output #' @description Read the barcodes, features (genes), and matrix from BUStools #' output. Import them #' as one \link[SingleCellExperiment]{SingleCellExperiment} object. Note the #' cells in the output files for BUStools 0.39.4 are not filtered. #' @param BUStoolsDirs A vector of paths to BUStools output files. Each sample #' should have its own path. For example: \code{./genecount}. #' Must have the same length as \code{samples}. #' @param samples A vector of user-defined sample names for the samples to be #' imported. Must have the same length as \code{BUStoolsDirs}. #' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse #' matrix files (.mtx files). Must have length 1 or the same #' length as \code{samples}. #' @param featuresFileNames Filenames for the feature annotation files. #' Must have length 1 or the same length as \code{samples}. #' @param barcodesFileNames Filenames for the cell barcode list file. #' Must have length 1 or the same length as \code{samples}. #' @param gzipped Boolean. \code{TRUE} if the BUStools output files #' (barcodes.txt, genes.txt, and genes.mtx) were #' gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in BUStools #' 0.39.4. Default \code{"auto"} which automatically detects if the #' files are gzip compressed. Must have length 1 or the same length as #' \code{samples}. #' @param class Character. The class of the expression matrix stored in the SCE #' object. Can be one of "Matrix" (as returned by #' \link{readMM} function), or "matrix" (as returned by #' \link[base]{matrix} function). Default "Matrix". #' @param delayedArray Boolean. Whether to read the expression matrix as #' \link[DelayedArray]{DelayedArray-class} object or not. Default \code{FALSE}. #' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default #' \code{TRUE}. #' @return A \code{SingleCellExperiment} object containing the count #' matrix, the gene annotation, and the cell annotation. #' @examples #' # Example #1 #' # FASTQ files were downloaded from #' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0 #' # /pbmc_1k_v3 #' # They were concatenated as follows: #' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz > #' # pbmc_1k_v3_R1.fastq.gz #' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz > #' # pbmc_1k_v3_R2.fastq.gz #' # The following BUStools command generates the gene, cell, and #' # matrix files #' #' # bustools correct -w ./3M-february-2018.txt -p output.bus | \ #' # bustools sort -T tmp/ -t 4 -p - | \ #' # bustools count -o genecount/genes \ #' # -g ./transcripts_to_genes.txt \ #' # -e matrix.ec \ #' # -t transcripts.txt \ #' # --genecounts - #' #' # The top 20 genes and the first 20 cells are included in this example. #' sce <- importBUStools( #' BUStoolsDirs = system.file("extdata/BUStools_PBMC_1k_v3_20x20/genecount/", #' package = "singleCellTK"), #' samples = "PBMC_1k_v3_20x20") #' @export importBUStools <- function( BUStoolsDirs, samples, matrixFileNames = "genes.mtx", featuresFileNames = "genes.genes.txt", barcodesFileNames = "genes.barcodes.txt", gzipped = "auto", class = c("Matrix", "matrix"), delayedArray = FALSE, rowNamesDedup = TRUE) { class <- match.arg(class) .importBUStools( BUStoolsDirs = BUStoolsDirs, samples = samples, matrixFileNames = matrixFileNames, featuresFileNames = featuresFileNames, barcodesFileNames = barcodesFileNames, gzipped = gzipped, class = class, delayedArray = delayedArray, rowNamesDedup = rowNamesDedup) }