R/importDropEst.R
1ff7e374
 # Read the DropEst RDS output for one sample.
 #
 # sampleDir   : directory that holds the DropEst output.
 # dataType    : accepted so the signature matches the other helpers; it is
 #               not used when locating or reading the file.
 # rdsFileName : file name without the ".rds" extension.
 #
 # Returns whatever object readRDS() yields for
 # <sampleDir>/<rdsFileName>.rds, and stops if that path does not exist.
 .readDropEstFile <- function(sampleDir, dataType,rdsFileName){
   rds_path <- file.path(sampleDir, paste0(rdsFileName, '.rds'))
   if (file.exists(rds_path)) {
     return(readRDS(rds_path))
   }
   stop("DropEst output not found at location specified. Please check path provided and/or filename.")
 }
 
 # Build the per-cell annotation table (colData) from a DropEst result list.
 #
 # dropEst_rds   : list read from the DropEst .rds output.
 # counts_matrix : counts matrix; its column names are used as cell barcodes.
 # dataType      : "filtered" or "raw". For "filtered" only cells present in
 #                 the counts matrix are kept, in counts-matrix column order.
 #
 # Returns an S4Vectors::DataFrame with a "cell" column plus one column for
 # each of the known per-cell fields found in dropEst_rds; row names are the
 # cell barcodes. Field columns keep their original (numeric) type.
 .constructColdata <- function(dropEst_rds,counts_matrix, dataType){
   coldata_fields <- c("mean_reads_per_umi","aligned_reads_per_cell","aligned_umis_per_cell","requested_umis_per_cb","requested_reads_per_cb")
   present_fields <- intersect(coldata_fields, names(dropEst_rds))

   # One two-column data.frame (value + cell barcode) per available field.
   per_field <- lapply(present_fields, function(field) {
     field_df <- data.frame(as.matrix(dropEst_rds[[field]]))
     names(field_df)[1] <- field
     field_df$cell <- row.names(field_df)
     field_df
   })

   if (length(per_field) == 0) {
     # No per-cell field is available: fall back to a table that only lists
     # the barcodes of the counts matrix instead of failing on an empty merge.
     cells <- colnames(counts_matrix)
     if (is.null(cells)) {
       cells <- character(0)
     }
     coldata_df_merged <- data.frame(cell = cells, stringsAsFactors = FALSE)
   } else {
     coldata_df_merged <- Reduce(function(x, y) merge(x, y, all=TRUE,by="cell"), per_field)
   }
   row.names(coldata_df_merged) <- coldata_df_merged$cell

   if (dataType == 'filtered'){
     # merge() sorts rows by barcode, so select rows by matching against the
     # counts-matrix columns; this keeps the rows in column order.
     keep <- match(colnames(counts_matrix), coldata_df_merged$cell)
     keep <- unique(keep[!is.na(keep)])
     coldata_df_merged <- coldata_df_merged[keep, , drop = FALSE]
   }

   # Convert the data.frame directly. Going through as.matrix() would coerce
   # every column to character because of the "cell" column.
   coldata_df_merged <- S4Vectors::DataFrame(coldata_df_merged,
                                             row.names = row.names(coldata_df_merged))
   return(coldata_df_merged)
 }
 
 # Collect the sample-level DropEst entries that are stored as SCE metadata.
 #
 # dropEst_rds : list read from the DropEst .rds output.
 #
 # Returns a list holding whichever of "saturation_info", "merge_targets" and
 # "reads_per_umi_per_cell" are present in dropEst_rds (an empty list if none).
 .extractMetadata <- function(dropEst_rds){
   metadata_fields <- c("saturation_info","merge_targets","reads_per_umi_per_cell")
   # Start from a list, not c(): assigning into NULL with [[<- produces an
   # atomic vector for length-one values and errors for zero-length values.
   metadata <- list()
   for (md in metadata_fields){
     if (md %in% names(dropEst_rds)){
       metadata[[md]] <- dropEst_rds[[md]]
     }
   }
   return(metadata)
 }
 
005bba57
 # Import one DropEst sample as a SingleCellExperiment.
 #
 # sampleDir    : directory containing the DropEst RDS output.
 # dataType     : "filtered" (uses the "cm" entry) or "raw" (uses "cm_raw").
 # rdsFileName  : RDS file name without the ".rds" extension.
 # sampleName   : prefix prepended (with "_") to every cell barcode.
 # delayedArray : if TRUE, wrap the counts matrix in a DelayedArray.
 # class        : "Matrix" (converted with the package helper
 #                .convertToMatrix) or "matrix" (base::as.matrix).
 #
 # Returns a SingleCellExperiment with a "counts" assay, colData built by
 # .constructColdata (when every cell has an annotation row) and the DropEst
 # metadata stored under metadata$dropEst. Stops if the requested counts
 # matrix is not present in the RDS.
 .importDropEstSample <- function(sampleDir = './',
                                  dataType,
                                  rdsFileName,
                                  sampleName = 'sample',
                                  delayedArray = FALSE,
                                  class){
   ## Read DropEst RDS
   dropEst_rds <- .readDropEstFile(sampleDir,dataType,rdsFileName)
   if (dataType == 'filtered' && 'cm' %in% names(dropEst_rds)) {
     counts_matrix <- dropEst_rds$cm
   } else if (dataType == 'raw' && 'cm_raw' %in% names(dropEst_rds)) {
     counts_matrix <- dropEst_rds$cm_raw
   } else {
     stop("No counts matrix found in the .rds provided! Exiting.")
   }

   if (class == "Matrix") {
     counts_matrix <- .convertToMatrix(counts_matrix)
   } else if (class == "matrix") {
     counts_matrix <- base::as.matrix(counts_matrix)
   }

   if (isTRUE(delayedArray)) {
     counts_matrix <- DelayedArray::DelayedArray(counts_matrix)
   }

   ## Create SingleCellExperiment object and prepend the sample name to cells
   sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = counts_matrix))
   colnames(sce) <- paste0(sampleName,"_",colnames(sce))

   ## Add SCE ColData. If using filtered counts matrix, colData is subset to
   ## include filtered cells.
   sce_coldata <- .constructColdata(dropEst_rds, counts_matrix, dataType)
   row.names(sce_coldata) <- paste0(sampleName,"_",row.names(sce_coldata))

   ## Align annotation rows to the SCE columns by barcode. The colData table is
   ## built with merge(), which sorts rows by cell, so equal dimensions alone
   ## do not guarantee that row i describes column i.
   coldata_idx <- match(colnames(sce), row.names(sce_coldata))
   if (length(coldata_idx) == ncol(sce) && !anyNA(coldata_idx)){
     SummarizedExperiment::colData(sce) <- sce_coldata[coldata_idx, , drop = FALSE]
   } else {
     warning("Unable to add ColData to SCE. Not every cell in the Counts Matrix has a matching row in the ColData matrix.")
   }

   ## Add SCE metadata
   sce_metadata <- .extractMetadata(dropEst_rds)
   sce@metadata$dropEst <- sce_metadata

   return(sce)
 }
 
 #' @name importDropEst
 #' @rdname importDropEst
 #' @title Create a SingleCellExperiment Object from DropEst output
 #' @description Imports the RDS file created by DropEst
 #'  (https://github.com/hms-dbmi/dropEst) and creates a SingleCellExperiment
 #'  object from either the raw or filtered counts matrix. Additionally parses
 #'  the RDS to obtain cell annotations as SCE colData, in addition to any
 #'  metadata.
 #' @param sampleDirs A character vector of paths to the directories containing
 #'  the DropEst RDS files, one per sample. Must be supplied; the default
 #'  \code{NULL} results in an error.
 #' @param sampleNames A character vector of user-defined sample names, one for
 #'  each entry of \code{sampleDirs}. Each name is prepended to the cell barcode
 #'  IDs of its sample. Default \code{NULL}.
 #' @param dataType can be "filtered" or "raw". Default \code{"filtered"}.
 #' @param rdsFileName File name prefix of the DropEst RDS output, i.e. the file
 #'  name without the ".rds" extension. Default \code{"cell.counts"}.
 #' @param delayedArray Boolean. Whether to read the expression matrix as
 #'  \link[DelayedArray]{DelayedArray} object or not. Default \code{FALSE}.
 #' @param class Character. The class of the expression matrix stored in the SCE
 #'  object. Can be one of "Matrix" (as returned by
 #'  \link[Matrix]{readMM} function), or "matrix" (as returned by
 #'  \link[base]{matrix} function). Default \code{"Matrix"}.
 #' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default
 #'  \code{TRUE}.
 #' @details
 #' \code{importDropEst} expects either raw counts matrix stored as "cm_raw" or
 #' filtered counts matrix stored as "cm" in the DropEst rds output.
 #' ColData is obtained from the DropEst entries "mean_reads_per_umi",
 #' "aligned_reads_per_cell", "aligned_umis_per_cell", "requested_umis_per_cb"
 #' and "requested_reads_per_cb".
 #' If using filtered counts matrix, the colData dataframe is
 #' subset to contain cells from the filtered counts matrix alone.
 #' If any annotations of ("saturation_info","merge_targets","reads_per_umi_per_cell") are
 #' found in the DropEst rds, they will be added to the SCE metadata field.
 #' @return A \code{SingleCellExperiment} object containing the count matrix,
 #'  the cell annotations from DropEst as ColData, and any metadata from DropEst
 #' @examples
 #' # Example results were generated as per instructions from the developers of dropEst described in
 #' # https://github.com/hms-dbmi/dropEst/blob/master/examples/EXAMPLES.md
 #' sce <- importDropEst(sampleDirs = system.file("extdata/dropEst_scg71", package = "singleCellTK"),
 #'                      sampleNames = 'scg71')
 #' @export
 importDropEst <- function(sampleDirs = NULL,
                           dataType = c('filtered','raw'),
                           rdsFileName = 'cell.counts',
                           sampleNames = NULL,
                           delayedArray = FALSE,
                           class = c("Matrix", "matrix"),
                           rowNamesDedup = TRUE) {
   dataType <- match.arg(dataType)
   class <- match.arg(class)

   if (length(sampleDirs)!=length(sampleNames)){
     stop("Please provide sample names for all input directories")
   }
   # With no directories there is nothing to import; fail here with a clear
   # message instead of combining an empty list of samples further down.
   if (length(sampleDirs) == 0){
     stop("Please provide at least one DropEst output directory in 'sampleDirs'.")
   }

   # Import every sample separately, then combine them column-wise.
   res <- vector("list", length = length(sampleDirs))
   for (i in seq_along(sampleDirs)){
     res[[i]] <- .importDropEstSample(sampleDir = sampleDirs[[i]],
                                      sampleName = sampleNames[[i]],
                                      dataType = dataType,
                                      rdsFileName = rdsFileName,
                                      delayedArray = delayedArray,
                                      class = class)
   }
   sce <- do.call(SingleCellExperiment::cbind, res)

   if (isTRUE(rowNamesDedup)) {
     if (any(duplicated(rownames(sce)))) {
       message("Duplicated gene names found, adding '-1', '-2', ",
               "... suffix to them.")
     }
     sce <- dedupRowNames(sce)
   }

   return(sce)
 }
005bba57