R/plot_decontx.R
88f9723b
 #' @title Plots contamination on UMAP coordinates
 #' @description A scatter plot of the UMAP dimensions generated by DecontX with
 #' cells colored by the estimated percentation of contamation.
f2a1c1ba
 #' @param x Either a \linkS4class{SingleCellExperiment} with \code{decontX}
88f9723b
 #' results stored in \code{metadata(x)$decontX} or the result from running
 #' decontX on a count matrix.
 #' @param batch Character. Batch of cells to plot. If \code{NULL}, then
 #' the first batch in the list will be selected. Default \code{NULL}.
 #' @param colorScale Character vector. Contains the color spectrum to be passed
b0efacb1
 #' to \code{scale_colour_gradientn} from package 'ggplot2'. Default
88f9723b
 #' c("blue","green","yellow","orange","red").
 #' @param size Numeric. Size of points in the scatterplot. Default 1.
 #' @return Returns a \code{ggplot} object.
 #' @author Shiyi Yang, Joshua Campbell
 #' @seealso See \code{\link{decontX}} for a full example of how to estimate
f2a1c1ba
 #' and plot contamination.
c5d0a344
 #' @export
88f9723b
 plotDecontXContamination <- function(x,
                                      batch = NULL,
f2a1c1ba
                                      colorScale = c(
                                        "blue",
                                        "green",
                                        "yellow",
                                        "orange",
                                        "red"
                                      ),
88f9723b
                                      size = 1) {
f2a1c1ba
   if (inherits(x, "SingleCellExperiment")) {
     estimates <- S4Vectors::metadata(x)$decontX$estimates
88f9723b
   } else {
     estimates <- x$estimates
   }
f2a1c1ba
   if (is.null(estimates)) {
88f9723b
     stop("decontX estimates not found. Estimates will be found in
           'metadata(x)$decontX$estimates' if 'x' is a
           SingleCellExperiment or 'x$estimates' if decontX was run
           on a count matrix. Are you sure 'x' is output from decontX?")
   }
   batches <- names(estimates)
f2a1c1ba
 
   if (is.null(batch)) {
88f9723b
     i <- batches[1]
   } else {
f2a1c1ba
     if (!(batch %in% batches)) {
       stop(
         "'", batch, "' is not one of the batches in 'x'. Batches available",
         " for plotting: '", paste(batches, collapse = ","), "'"
       )
88f9723b
     }
     i <- batch
   }
 
f2a1c1ba
   contamin <- estimates[[i]]$contamination
88f9723b
   umap <- estimates[[i]]$UMAP
 
   ## Create data.frame
   df <- data.frame(umap, "Contamination" = contamin)
f2a1c1ba
   naIx <- is.na(umap[, 1]) | is.na(umap[, 2])
88f9723b
   df <- df[!naIx, ]
 
   ## Generate ggplot scatterplot
f2a1c1ba
   gg <- ggplot2::ggplot(
     df,
     ggplot2::aes_string(
       x = colnames(umap)[1],
       y = colnames(umap)[2]
     )
   ) +
     ggplot2::geom_point(
       stat = "identity",
       size = size,
       ggplot2::aes_string(color = "Contamination")
     ) +
     ggplot2::theme_bw() +
     ggplot2::scale_colour_gradientn(
       colors = colorScale,
       name = "Contamination",
       limits = c(0, 1)
     ) +
     ggplot2::theme(
       panel.grid.major = ggplot2::element_blank(),
       panel.grid.minor = ggplot2::element_blank(),
       axis.text = ggplot2::element_text(size = 15),
       axis.title = ggplot2::element_text(size = 15)
     )
88f9723b
   return(gg)
 }
e7fe6396
 
 
88f9723b
 #' @title Plots percentage of cells cell types expressing markers
 #' @description Generates a barplot that shows the percentage of
 #' cells within clusters or cell types that have detectable levels
f2a1c1ba
 #' of given marker genes. Can be used to view the expression of
88f9723b
 #' marker genes in different cell types before and after
f2a1c1ba
 #' decontamination with \code{\link{decontX}}.
 #' @param x Either a \linkS4class{SingleCellExperiment} or
 #' a matrix-like object of counts.
 #' @param markers List. A named list indicating the marker genes
 #' for each cell type of
88f9723b
 #' interest. Multiple markers can be supplied for each cell type. For example,
f2a1c1ba
 #' \code{list(Tcell_Markers=c("CD3E", "CD3D"),
 #' Bcell_Markers=c("CD79A", "CD79B", "MS4A1")}
 #' would specify markers for human T-cells and B-cells.
 #' A cell will be considered
 #' "positive" for a cell type if it has a count greater than \code{threshold}
 #' for at least one of the marker genes in the list.
 #' @param groupClusters List. A named list that allows
 #' cell clusters labels coded in
88f9723b
 #' \code{z} to be regrouped and renamed on the fly. For example,
f2a1c1ba
 #' \code{list(Tcells=c(1, 2), Bcells=7)} would recode
 #' clusters 1 and 2 to "Tcells"
 #' and cluster 7 to "Bcells". Note that if this is
 #' used, clusters in \code{z} not found
 #' in \code{groupClusters} will be excluded from the barplot.
 #' Default \code{NULL}.
 #' @param assayName Character vector. Name(s) of the assay(s) to
 #' plot if \code{x} is a
 #' \linkS4class{SingleCellExperiment}. If more than one assay
 #' is listed, then side-by-side barplots will be generated.
 #' Default \code{c("counts", "decontXcounts")}.
 #' @param z Character, Integer, or Vector. Indicates the cluster labels
 #' for each cell.
 #' If \code{x} is a \linkS4class{SingleCellExperiment} and \code{z = NULL},
 #' then the cluster labels from \code{\link{decontX}} will be retived from the
88f9723b
 #' \code{colData} of \code{x} (i.e. \code{colData(x)$decontX_clusters}).
f2a1c1ba
 #' If \code{z} is a single character or integer,
 #' then that column will be retrived
88f9723b
 #' from \code{colData} of \code{x}. (i.e. \code{colData(x)[,z]}). If \code{x}
f2a1c1ba
 #' is a counts matrix, then \code{z} will need
 #' to be a vector the same length as
 #' the number of columns in \code{x} that indicate
 #' the cluster to which each cell
88f9723b
 #' belongs. Default \code{NULL}.
529d38af
 #' @param threshold Numeric. Markers greater than or equal to this value will
 #' be considered detected in a cell. Default 1.
 #' @param exactMatch Boolean. Whether to only identify exact matches
 #' for the markers or to identify partial matches using \code{\link{grep}}. See
 #' \code{\link{retrieveFeatureIndex}} for more details. Default \code{TRUE}.
88f9723b
 #' @param by Character. Where to search for the markers if \code{x} is a
f2a1c1ba
 #' \linkS4class{SingleCellExperiment}. See \code{\link{retrieveFeatureIndex}}
 #' for more details. If \code{x} is a matrix,
 #' then this must be set to \code{"rownames"}.Default \code{"rownames"}.
88f9723b
 #' @param ncol Integer. Number of columns to make in the plot.
f2a1c1ba
 #' Default \code{round(sqrt(length(markers))}.
 #' @param labelBars Boolean. Whether to display percentages above each bar
 #' Default \code{TRUE}.
 #' @param labelSize Numeric. Size of the percentage labels in the barplot.
 #' Default 3.
88f9723b
 #' @return Returns a \code{ggplot} object.
 #' @author Shiyi Yang, Joshua Campbell
 #' @seealso See \code{\link{decontX}} for a full example of how to estimate
f2a1c1ba
 #' and plot contamination.
88f9723b
 #' @export
51477d47
 plotDecontXMarkerPercentage <- function(x, markers, groupClusters = NULL,
f2a1c1ba
                                         assayName = c(
                                           "counts",
                                           "decontXcounts"
                                         ),
                                         z = NULL, threshold = 1,
                                         exactMatch = TRUE, by = "rownames",
                                         ncol = round(sqrt(length(markers))),
                                         labelBars = TRUE, labelSize = 3) {
bec50d46
   cellTypeLabels <- percent <- NULL # fix check note
 
f2a1c1ba
   legend <- "none"
   # Check that list arguments are named
   if (!is(markers, "list") || is.null(names(markers))) {
     stop("'markers' needs to be a named list.")
   }
88f9723b
 
f2a1c1ba
   temp <- .processPlotDecontXMarkerInupt(
     x = x,
     z = z,
     markers = markers,
     groupClusters = groupClusters,
     by = by,
     exactMatch = exactMatch
   )
   x <- temp$x
   z <- temp$z
   geneMarkerIndex <- temp$geneMarkerIndex
   geneMarkerCellTypeIndex <- temp$geneMarkerCellTypeIndex
   groupClusters <- temp$groupClusters
   xlab <- temp$xlab
 
   if (inherits(x, "SingleCellExperiment")) {
     # If 'x' is SingleCellExperiment, then get percentage
     # for each matrix in 'assayName'
     df.list <- list()
     for (i in seq_along(assayName)) {
       counts <- SummarizedExperiment::assay(
         x[geneMarkerIndex, ],
         assayName[i]
       )
       df <- .calculateDecontXBarplotPercent(
         counts,
         z,
         geneMarkerCellTypeIndex,
         threshold
       )
       df.list[[i]] <- cbind(df, assay = assayName[i])
88f9723b
     }
f2a1c1ba
     df <- do.call(rbind, df.list)
     assay <- as.factor(df$assay)
     if (length(assayName) > 1) {
       legend <- "right"
88f9723b
     }
f2a1c1ba
   } else {
     ## If 'x' is matrix, then calculate percentages directly
     counts <- x[geneMarkerIndex, ]
     df <- .calculateDecontXBarplotPercent(
       counts,
       z,
       geneMarkerCellTypeIndex,
       threshold
     )
     assay <- "red3"
     legend <- "none"
   }
e14601cd
 
f2a1c1ba
   # Build data.frame for ggplots
   df <- cbind(df, cellTypeLabels = names(groupClusters)[df$cellType])
   df$cellTypeLabels <- factor(df$cellTypeLabels,
     levels = names(groupClusters)
   )
   df <- cbind(df, markerLabels = names(markers)[df$markers])
   df$markerLabels <- factor(df$markerLabels, levels = names(markers))
 
4c965367
   plt <- ggplot2::ggplot(df, ggplot2::aes_string(
     x = "cellTypeLabels",
     y = "percent", fill = "assay"
f2a1c1ba
   )) +
     ggplot2::geom_bar(
       stat = "identity",
       position = ggplot2::position_dodge2(width = 0.9, preserve = "single")
     ) +
     ggplot2::xlab(xlab) +
     ggplot2::ylab(paste0("Percentage of cells expressing markers")) +
     ggplot2::facet_wrap(. ~ df$markerLabels, ncol = ncol) +
     ggplot2::theme(
       panel.background = ggplot2::element_rect(
         fill = "white",
         color = "grey"
       ),
       panel.grid = ggplot2::element_line("grey"),
       legend.position = legend,
       legend.key = ggplot2::element_rect(
         fill = "white",
         color = "white"
       ),
       panel.grid.minor = ggplot2::element_blank(),
       panel.grid.major = ggplot2::element_blank(),
       text = ggplot2::element_text(size = 10),
       axis.text.x = ggplot2::element_text(
         size = 8, angle = 45,
         hjust = 1
       ),
       axis.text.y = ggplot2::element_text(size = 9),
       legend.key.size = grid::unit(8, "mm"),
       legend.text = ggplot2::element_text(size = 10),
       strip.text.x = ggplot2::element_text(size = 10)
     )
 
   if (isTRUE(labelBars)) {
     plt <- plt + ggplot2::geom_text(ggplot2::aes(
43929492
       x = cellTypeLabels,
       y = percent + 2.5,
       label = percent
f2a1c1ba
     ),
     position = ggplot2::position_dodge2(width = 0.9, preserve = "single"),
     size = labelSize
     )
   }
   return(plt)
e14601cd
 }
 
51477d47
 
 #' @title Plots expression of marker genes before and after decontamination
 #' @description Generates a violin plot that shows the counts of marker
 #' genes in cells across specific clusters or cell types. Can be used to view
 #' the expression of marker genes in different cell types before and after
f2a1c1ba
 #' decontamination with \code{\link{decontX}}.
 #' @param x Either a \linkS4class{SingleCellExperiment}
 #' or a matrix-like object of counts.
 #' @param markers Character Vector or List. A character vector
 #' or list of character vectors
 #' with the names of the marker genes of interest.
 #' @param groupClusters List. A named list that allows
 #' cell clusterslabels coded in
51477d47
 #' \code{z} to be regrouped and renamed on the fly. For example,
f2a1c1ba
 #' \code{list(Tcells=c(1, 2), Bcells=7)} would recode clusters
 #' 1 and 2 to "Tcells"
 #' and cluster 7 to "Bcells". Note that if this is used, clusters
 #' in \code{z} not found
51477d47
 #' in \code{groupClusters} will be excluded. Default \code{NULL}.
f2a1c1ba
 #' @param assayName Character vector. Name(s) of the assay(s) to
 #' plot if \code{x} is a
 #' \linkS4class{SingleCellExperiment}. If more than one assay is listed, then
 #' side-by-side violin plots will be generated.
 #' Default \code{c("counts", "decontXcounts")}.
 #' @param z Character, Integer, or Vector.
 #' Indicates the cluster labels for each cell.
 #' If \code{x} is a \linkS4class{SingleCellExperiment} and \code{z = NULL},
 #' then the cluster labels from \code{\link{decontX}} will be retived from the
51477d47
 #' \code{colData} of \code{x} (i.e. \code{colData(x)$decontX_clusters}).
f2a1c1ba
 #' If \code{z} is a single character or integer, then that column will be
 #' retrived from \code{colData} of \code{x}. (i.e. \code{colData(x)[,z]}).
 #' If \code{x} is a counts matrix, then \code{z} will need to be a vector
 #' the same length as the number of columns in \code{x} that indicate
 #' the cluster to which each cell belongs. Default \code{NULL}.
529d38af
 #' @param exactMatch Boolean. Whether to only identify exact matches
f2a1c1ba
 #' for the markers or to identify partial matches using \code{\link{grep}}.
 #' See \code{\link{retrieveFeatureIndex}} for more details.
 #' Default \code{TRUE}.
51477d47
 #' @param by Character. Where to search for the markers if \code{x} is a
f2a1c1ba
 #' \linkS4class{SingleCellExperiment}. See \code{\link{retrieveFeatureIndex}}
 #' for more details. If \code{x} is a matrix, then this must be set to
 #' \code{"rownames"}. Default \code{"rownames"}.
529d38af
 #' @param log1p Boolean. Whether to apply the function \code{log1p} to the data
f2a1c1ba
 #' before plotting. This function will add a pseudocount of 1 and then log
 #' transform the expression values. Default \code{FALSE}.
51477d47
 #' @param ncol Integer. Number of columns to make in the plot.
f2a1c1ba
 #' Default \code{NULL}.
51477d47
 #' @param plotDots Boolean. If \code{TRUE}, the
 #'  expression of features will be plotted as points in addition to the violin
 #'  curve. Default \code{FALSE}.
 #' @param dotSize Numeric. Size of points if \code{plotDots = TRUE}.
 #' Default \code{0.1}.
 #' @return Returns a \code{ggplot} object.
 #' @author Shiyi Yang, Joshua Campbell
 #' @seealso See \code{\link{decontX}} for a full example of how to estimate
f2a1c1ba
 #' and plot contamination.
51477d47
 #' @export
 plotDecontXMarkerExpression <- function(x, markers, groupClusters = NULL,
f2a1c1ba
                                         assayName = c(
                                           "counts",
                                           "decontXcounts"
                                         ),
529d38af
                                         z = NULL, exactMatch = TRUE,
                                         by = "rownames", log1p = FALSE,
                                         ncol = NULL,
f2a1c1ba
                                         plotDots = FALSE, dotSize = 0.1) {
51477d47
   legend <- "none"
f2a1c1ba
   temp <- .processPlotDecontXMarkerInupt(
     x = x,
     z = z,
     markers = markers,
     groupClusters = groupClusters,
     by = by,
     exactMatch = exactMatch
   )
51477d47
   x <- temp$x
   z <- temp$z
   geneMarkerIndex <- temp$geneMarkerIndex
   groupClusters <- temp$groupClusters
   xlab <- temp$xlab
f2a1c1ba
 
   if (inherits(x, "SingleCellExperiment")) {
51477d47
     # If 'x' is SingleCellExperiment, then get percentage
     # for each matrix in 'assayName'
     df.list <- list()
f2a1c1ba
     for (i in seq_along(assayName)) {
       counts <- SummarizedExperiment::assay(
         x[geneMarkerIndex, ],
         assayName[i]
       )
51477d47
       df <- reshape2::melt(as.matrix(counts),
f2a1c1ba
         varnames = c("Marker", "Cell"),
         value.name = "Expression"
       )
51477d47
       df.list[[i]] <- cbind(df, assay = assayName[i])
     }
     df <- do.call(rbind, df.list)
     assay <- as.factor(df$assay)
f2a1c1ba
     if (length(assayName) > 1) {
51477d47
       legend <- "right"
     }
   } else {
     ## If 'x' is matrix, then calculate percentages directly
f2a1c1ba
     counts <- x[geneMarkerIndex, ]
51477d47
     df <- reshape2::melt(counts,
f2a1c1ba
       varnames = c("Marker", "Cell"),
       value.name = "Expression"
     )
51477d47
     assay <- "red3"
     legend <- "none"
   }
 
f2a1c1ba
   # Create data.frame and add cell type groups back in
51477d47
   names(z) <- colnames(x)
f2a1c1ba
   df <- cbind(df, Cluster = z[df$Cell])
 
51477d47
   ylab <- "Expression"
f2a1c1ba
   if (isTRUE(log1p)) {
51477d47
     df$Expression <- log1p(df$Expression)
     ylab <- "Expression (log1p)"
   }
529d38af
   Expression <- df$Expression
   Marker <- df$Marker
   Assay <- df$assay
   Cluster <- df$Cluster
f2a1c1ba
   if (!is.null(groupClusters)) {
529d38af
     df <- cbind(df, Cell_Type = names(groupClusters)[Cluster])
     Cell_Type <- factor(df$Cell_Type, levels = names(groupClusters))
f2a1c1ba
     plt <- ggplot2::ggplot(df, ggplot2::aes(
       x = Cell_Type,
       y = Expression,
       fill = Assay
     )) +
       ggplot2::facet_wrap(~ Cell_Type + Marker,
         scales = "free",
         labeller = ggplot2::label_context,
         ncol = ncol
       )
51477d47
   } else {
f2a1c1ba
     plt <- ggplot2::ggplot(df, ggplot2::aes(
       x = Cluster,
       y = Expression,
       fill = Assay
     )) +
       ggplot2::facet_wrap(~ Cluster + Marker,
         scales = "free",
         labeller = ggplot2::label_context,
         ncol = ncol
       )
   }
   plt <- plt + ggplot2::geom_violin(
     trim = TRUE,
     scale = "width"
   ) +
     ggplot2::theme_bw() + ggplot2::theme(
       axis.text.x = ggplot2::element_blank(),
       axis.ticks.x = ggplot2::element_blank(),
       axis.title.x = ggplot2::element_blank(),
       strip.text = ggplot2::element_text(size = 8),
       panel.grid = ggplot2::element_blank(),
       legend.position = legend
     ) + ggplot2::ylab(ylab)
 
 
   if (isTRUE(plotDots)) {
51477d47
     plt <- plt + ggplot2::geom_jitter(height = 0, size = dotSize)
   }
f2a1c1ba
 
51477d47
   return(plt)
f2a1c1ba
 }
51477d47
 
 
 
529d38af
 .processPlotDecontXMarkerInupt <- function(x, z, markers, groupClusters,
                                            by, exactMatch) {
f2a1c1ba
 
51477d47
   # Process z and convert to a factor
f2a1c1ba
   if (is.null(z) & inherits(x, "SingleCellExperiment")) {
     cn <- colnames(SummarizedExperiment::colData(x))
     if (!("decontX_clusters" %in% cn)) {
51477d47
       stop("'decontX_clusters' not found in 'colData(x)'. Make sure you have
            run 'decontX' or supply 'z' directly.")
     }
     z <- SummarizedExperiment::colData(x)$decontX_clusters
f2a1c1ba
   } else if (length(z) == 1 & inherits(x, "SingleCellExperiment")) {
     if (!(z %in% colnames(SummarizedExperiment::colData(x)))) {
       stop("'", z, "' not found in 'colData(x)'.")
51477d47
     }
f2a1c1ba
     z <- SummarizedExperiment::colData(x)[, z]
   } else if (length(z) != ncol(x)) {
     stop("If 'x' is a SingleCellExperiment, then 'z' needs to be",
           " a single character or integer specifying the column in",
           " 'colData(x)'. Alternatively to specify the cell cluster",
           " labels directly as a vector, the length of 'z' needs to",
           " be the same as the number of columns in 'x'. This is",
           " required if 'x' is a matrix.")
   }
   if (!is.factor(z)) {
51477d47
     z <- as.factor(z)
   }
f2a1c1ba
 
   if (!is.null(groupClusters)) {
     if (!is(groupClusters, "list") || is.null(names(groupClusters))) {
51477d47
       stop("'groupClusters' needs to be a named list.")
     }
f2a1c1ba
 
     # Check that groupClusters are found in 'z'
51477d47
     cellMappings <- unlist(groupClusters)
f2a1c1ba
     if (any(!(cellMappings %in% z))) {
51477d47
       missing <- cellMappings[!(cellMappings %in% z)]
f2a1c1ba
       stop(
         "'groupClusters' not found in 'z': ",
         paste(missing, collapse = ",")
       )
51477d47
     }
f2a1c1ba
 
51477d47
     labels <- rep(NA, ncol(x))
f2a1c1ba
     for (i in seq_along(groupClusters)) {
51477d47
       labels[z %in% groupClusters[i]] <- names(groupClusters)[i]
     }
     na.ix <- is.na(labels)
     labels <- labels[!na.ix]
f2a1c1ba
     x <- x[, !na.ix]
     z <- as.integer(factor(labels, levels = names(groupClusters)))
51477d47
     xlab <- "Cell types"
   } else {
     labels <- as.factor(z)
     groupClusters <- levels(labels)
     names(groupClusters) <- levels(labels)
     xlab <- "Clusters"
   }
f2a1c1ba
 
51477d47
   # Find index of each feature in 'x'
f2a1c1ba
   geneMarkerCellTypeIndex <- rep(
     seq(length(markers)),
     lapply(markers, length)
   )
51477d47
   geneMarkerIndex <- retrieveFeatureIndex(unlist(markers),
f2a1c1ba
     x,
     by = by,
     removeNA = FALSE,
     exactMatch = exactMatch
   )
 
51477d47
   # Remove genes that did not match
   na.ix <- is.na(geneMarkerIndex)
   geneMarkerCellTypeIndex <- geneMarkerCellTypeIndex[!na.ix]
   geneMarkerIndex <- geneMarkerIndex[!na.ix]
f2a1c1ba
 
   return(list(
     x = x,
     z = z,
     geneMarkerIndex = geneMarkerIndex,
     geneMarkerCellTypeIndex = geneMarkerCellTypeIndex,
     groupClusters = groupClusters,
     xlab = xlab
   ))
51477d47
 }
 
 
f2a1c1ba
 .calculateDecontXBarplotPercent <- function(counts,
                                             z,
                                             geneMarkerCellTypeIndex,
                                             threshold) {
e14601cd
 
   # Get counts matrix and convert to DelayedMatrix
   counts <- DelayedArray::DelayedArray(counts)
f2a1c1ba
 
e14601cd
   # Convert to boolean matrix and sum markers in same cell type
   # The "+ 0" is to convert boolean to numeric
   counts <- counts >= threshold
   countsByMarker <- DelayedArray::rowsum(counts + 0, geneMarkerCellTypeIndex)
   countsByCellType <- DelayedArray::colsum((countsByMarker > 0) + 0, z)
f2a1c1ba
 
e14601cd
   # Calculate percentages within each cell cluster
   zTotals <- tabulate(z)
   percentByCellType <- round(sweep(countsByCellType, 2, zTotals, "/") * 100)
   df <- reshape2::melt(percentByCellType,
f2a1c1ba
     varnames = c("markers", "cellType"),
     value.name = "percent"
   )
 
e14601cd
   return(df)
f2a1c1ba
 }