R/diffExp.R
f27f2aab
 #' @title Differential expression for cell subpopulations using MAST
1786be55
 #' @description Uses MAST to find differentially expressed features for
 #'  specified cell subpopulations.
 #' @param counts Integer matrix. Rows represent features and columns represent
 #'  cells. This matrix should be the same as the one used to generate
 #'  `celdaMod`.
 #' @param celdaMod Celda object of class `celda_C` or `celda_CG`.
 #' @param c1 Integer vector. Cell populations to include in group 1 for the
 #'  differential expression analysis.
 #' @param c2 Integer vector. Cell populations to include in group 2 for the
 #'  differential expression analysis. If NULL, the clusters in the c1 group are
 #'  compared to all other clusters. Default NULL.
 #' @param onlyPos Logical. Whether to only return markers with positive log2
 #'  fold change. Default FALSE.
 #' @param log2fcThreshold Numeric. A number greater than 0 that specifies the
 #'  absolute log2 fold change threshold. Only features with absolute value
 #'  above this threshold will be returned. If NULL, this filter will not be
 #'  applied. Default NULL.
 #' @param fdrThreshold Numeric. A number between 0 and 1 that specifies the
 #'  false discovery rate (FDR) threshold. Only features below this threshold
 #'  will be returned. Default 1.
 #' @return Data frame containing MAST results including statistics such as
 #'  p-value, log2 fold change, and FDR.
966185dd
 #' @examples
a49fff03
 #' data(celdaCGSim, celdaCGMod)
1786be55
 #' clusterDiffexpRes = differentialExpression(celdaCGSim$counts,
 #'     celdaCGMod, c1 = c(1, 2))
209dc834
 #' @export
5c12967f
 #' @rawNamespace import(data.table, except = c(melt, shift))
d7196f24
 #' @importFrom MAST FromMatrix
 #' @importFrom MAST zlm
 #' @importFrom MAST summary
 #' @importFrom S4Vectors mcols
 #' @importFrom SummarizedExperiment assay
 #' @importFrom SummarizedExperiment colData
 #' @importFrom SummarizedExperiment assayNames
aa3724e8
 #' @importFrom plyr .
5c12967f
 #' @import SummarizedExperiment
1786be55
 differentialExpression <- function(counts,
     celdaMod,
     c1,
     c2 = NULL,
     onlyPos = FALSE,
     log2fcThreshold = NULL,
     fdrThreshold = 1) {
 
     if (!is.matrix(counts)) {
         stop("'counts' should be a numeric count matrix")
209dc834
     }
1786be55
     if (!(methods::is(celdaMod, "celda_C") ||
             methods::is(celdaMod, "celda_CG"))) {
         stop("'celdaMod' should be an object of class celda_C or celda_CG")
     }
     if (is.null(c1)) {
         stop("'c1' should be a numeric vector of cell cluster(s)")
     }
     compareCountMatrix(counts, celdaMod)
209dc834
 
1786be55
     if (is.null(c2)) {
06b0c870
         c2 <- sort(setdiff(unique(clusters(celdaMod)$z), c1))
1786be55
     }
     if (length(c1) > 1) {
06b0c870
         cells1 <- matrixNames(celdaMod)$column[which(
             clusters(celdaMod)$z %in% c1)]
1786be55
     } else {
06b0c870
         cells1 <- matrixNames(celdaMod)$column[which(
             clusters(celdaMod)$z == c1)]
1786be55
     }
 
     if (length(c2) > 1) {
06b0c870
         cells2 <- matrixNames(celdaMod)$column[which(
             clusters(celdaMod)$z %in% c2)]
1786be55
     } else {
06b0c870
         cells2 <- matrixNames(celdaMod)$column[which(
             clusters(celdaMod)$z == c2)]
1786be55
     }
 
     mat <- counts[, c(cells1, cells2)]
     log_normalized_mat <- normalizeCounts(mat,
         normalize = "cpm",
         transformationFun = log1p)
     cdat <- data.frame(wellKey = c(cells1, cells2),
         condition = c(rep("c1", length(cells1)), rep("c2", length(cells2))),
         ngeneson = rep("", (length(cells1) + length(cells2))),
         stringsAsFactors = FALSE)
 
7890fdf1
     # explicitly load library SummarizedExperiment due to MAST package
     # dependency error
5c12967f
     # requireNamespace(SummarizedExperiment)
e3638ca1
 
1786be55
     sca <- suppressMessages(MAST::FromMatrix(log_normalized_mat, cdat))
     cdr2 <- colSums(SummarizedExperiment::assay(sca) > 0)
     SummarizedExperiment::colData(sca)$cngeneson <- scale(cdr2)
     cond <- factor(SummarizedExperiment::colData(sca)$condition)
     cond <- stats::relevel(cond, "c2")
     SummarizedExperiment::colData(sca)$condition <- cond
     zlmCond <- MAST::zlm(~ condition + cngeneson, sca)
     summaryCond <- MAST::summary(zlmCond, doLRT = "conditionc1")
     summaryDt <- summaryCond$datatable
     contrast <-
         component <-
         primerid <-
         coef <-
         ci.hi <-
         ci.lo <-
         `Pr(>Chisq)` <-
         fdr <- NULL # Avoid NSE notes in check
 
     fcHurdle <- merge(summaryDt[contrast == "conditionc1" &
             component == "H", .(primerid, `Pr(>Chisq)`)],
         summaryDt[contrast == "conditionc1" & component == "logFC",
             .(primerid, coef, ci.hi, ci.lo)], by = "primerid")
 
     fcHurdle[, fdr := stats::p.adjust(`Pr(>Chisq)`, "fdr")]
     ### Some genes aren't outputted because log2FC gets NaN if one or both
     ### clusters have 0 counts for a gene
     ### and then they're discarded because NaN !> 0
     if (is.null(log2fcThreshold)) {
         fcHurdleSig <- fcHurdle
     } else {
         fcHurdleSig <- merge(fcHurdle[fdr < fdrThreshold &
                 abs(coef) > log2fcThreshold],
461b8542
             data.table::as.data.table(S4Vectors::mcols(sca)),
b6cf56ae
             by = "primerid")
1786be55
         if (onlyPos) {
             fcHurdleSig <- fcHurdleSig[which(fcHurdleSig$log2fc > 0), ]
         }
     }
     fcHurdleSig <- fcHurdleSig[, -c(4, 5)]
     names(fcHurdleSig)[c(1, 2, 3, 4)] <- c("Gene", "Pvalue", "Log2_FC", "FDR")
     fcHurdleSig <- fcHurdleSig[order(fcHurdleSig$Pvalue, decreasing = FALSE), ]
 
     return(fcHurdleSig)
 }