R/pca_cor_samplevar.R
1e8ab7f0
 #' @include pca.R
 #' @include HermesData-methods.R
 NULL
 
 #' Calculation of R2 between Sample Variable and Principal Components
40a94888
 #'
5a49cd35
 #' @description `r lifecycle::badge("stable")`
 #'
 #' This helper function calculates R2 values between one sample variable from [`AnyHermesData`]
1e8ab7f0
 #' and all Principal Components (PCs) separately (one linear model is fit for each PC).
40a94888
 #'
5a49cd35
 #' @details Note that in case there are estimation problems for any of the PCs, then `NA` will
07f12df2
 #' be returned for those.
 #'
1e8ab7f0
 #' @param pca (`matrix`)\cr principal components matrix generated by [calc_pca()].
5a49cd35
 #' @param x (`vector`)\cr values of one sample variable from a [`AnyHermesData`] object.
40a94888
 #'
1e8ab7f0
 #' @return A vector with R2 values for each principal component.
40a94888
 #'
 #' @export
 #'
 #' @examples
6d249ef9
 #' object <- hermes_data %>%
1e8ab7f0
 #'   add_quality_flags() %>%
 #'   filter() %>%
 #'   normalize()
073ebe65
 #'
 #' # Obtain the principal components.
40a94888
 #' pca <- calc_pca(object)$x
073ebe65
 #'
 #' # Obtain the sample variable.
6d249ef9
 #' x <- colData(object)$AGE18
073ebe65
 #'
 #' # Correlate them.
40a94888
 #' r2 <- h_pca_var_rsquared(pca, x)
 h_pca_var_rsquared <- function(pca, x) {
   assert_that(
     is.matrix(pca),
     is.numeric(x) || is.factor(x) || is.character(x) || is.logical(x),
     identical(length(x), nrow(pca)),
     all(abs(colMeans(pca)) < 1e-10)
   )
   use_sample <- !is.na(x)
   x <- x[use_sample]
93a0a100
   if (is_constant(x)) {
2c07f9f8
     warning("sample variable is constant and R2 values cannot be calculated")
   }
40a94888
   pca <- pca[use_sample, ]
eb3ec4a3
   design <- stats::model.matrix(~x)
40a94888
   # Transpose such that PCs are in rows, and samples in columns.
   y0 <- t(pca)
f9f0744c
   utils::capture.output(fit <- limma::lmFit(y0, design = design))
07f12df2
   had_problems <- apply(fit$coefficients, 1L, function(row) any(is.na(row)))
40a94888
   sst <- rowSums(y0^2)
   ssr <- sst - fit$df.residual * fit$sigma^2
07f12df2
   result <- ssr / sst
   result[had_problems] <- NA
   result
40a94888
 }
 
1e8ab7f0
 #' Calculation of R2 Matrix between Sample Variables and Principal Components
40a94888
 #'
5a49cd35
 #' @description `r lifecycle::badge("stable")`
 #'
 #' This function processes sample variables from [`AnyHermesData`] and the
1e8ab7f0
 #' corresponding principal components matrix, and then generates the matrix of R2 values.
40a94888
 #'
073ebe65
 #' @details
 #'   - Note that only the `df` columns which are `numeric`, `character`, `factor` or
 #'     `logical` are included in the resulting matrix, because other variable types are not
 #'     supported.
 #'   - In addition, `df` columns which are constant, all `NA`, or `character` or `factor`
 #'     columns with too many levels are also dropped before the analysis.
 #'
1e8ab7f0
 #' @param pca (`matrix`)\cr comprises principal components generated by [calc_pca()].
073ebe65
 #' @param df (`data.frame`)\cr from the [SummarizedExperiment::colData()] of a
 #'   [`AnyHermesData`] object.
40a94888
 #'
073ebe65
 #' @return A matrix with R2 values for all combinations of sample variables and principal
1e8ab7f0
 #'   components.
40a94888
 #'
073ebe65
 #' @seealso [h_pca_var_rsquared()] which is used internally to calculate the R2 for one
 #'   sample variable.
 #'
40a94888
 #' @export
 #'
 #' @examples
6d249ef9
 #' object <- hermes_data %>%
1e8ab7f0
 #'   add_quality_flags() %>%
 #'   filter() %>%
 #'   normalize()
073ebe65
 #'
 #' # Obtain the principal components.
40a94888
 #' pca <- calc_pca(object)$x
073ebe65
 #'
 #' # Obtain the `colData` as a `data.frame`.
 #' df <- as.data.frame(colData(object))
 #'
 #' # Correlate them.
 #' r2_all <- h_pca_df_r2_matrix(pca, df)
 #' str(r2_all)
 #'
 #' # We can see that only about half of the columns from `df` were
 #' # used for the correlations.
 #' ncol(r2_all)
 #' ncol(df)
40a94888
 h_pca_df_r2_matrix <- function(pca, df) {
   assert_that(
     is.matrix(pca),
     is.data.frame(df),
     identical(nrow(pca), nrow(df))
   )
   # Sequentially filter down the columns in `df`.
   # Sample variable must be numeric, character, factor or logical.
   is_accepted_type <- vapply(df, function(x) {
     is.numeric(x) || is.character(x) || is.factor(x) || is.logical(x)
   }, TRUE)
   df <- df[, is_accepted_type]
1e8ab7f0
   # Sample variable cannot be completely `NA`.
40a94888
   is_all_na <- vapply(df, all_na, TRUE)
   df <- df[, !is_all_na]
1e8ab7f0
   # Sample variable cannot have a constant value.
40a94888
   is_all_constant <- vapply(df, is_constant, TRUE)
   df <- df[, !is_all_constant]
a00b4df0
   # Filter character or factor sample variable that has too many (more than half the
1e8ab7f0
   # number of samples) unique values.
40a94888
   too_many_levels <- vapply(df, function(x) {
eb3ec4a3
     (is.character(x) || is.factor(x)) && (length(unique(x)) > nrow(df) / 2)
40a94888
   }, TRUE)
   df <- df[, !too_many_levels]
   # On all remaining columns, run R2 analysis vs. all principal components.
   vapply(
     X = df,
     FUN = h_pca_var_rsquared,
     pca = pca,
     FUN.VALUE = rep(0.5, ncol(pca))
   )
 }
 
1e8ab7f0
 # correlate-HermesDataPca ----
40a94888
 
1e8ab7f0
 #' Correlation of Principal Components with Sample Variables
a00b4df0
 #'
5a49cd35
 #' @description `r lifecycle::badge("stable")`
 #'
073ebe65
 #' This `correlate()` method analyses the correlations (in R2 values) between all sample variables
 #' in a [`AnyHermesData`] object and the principal components of the samples.
 #'
 #' A corresponding `autoplot()` method then can visualize the results in a heatmap.
40a94888
 #'
1e8ab7f0
 #' @rdname pca_cor_samplevar
 #' @aliases pca_cor_samplevar
a00b4df0
 #'
1e7a40be
 #' @param object (`HermesDataPca`)\cr input. It can be generated using [calc_pca()] function
90e38b04
 #'   on [`AnyHermesData`].
1e8ab7f0
 #' @param data (`AnyHermesData`)\cr input that was used originally for the PCA.
40a94888
 #'
5a49cd35
 #' @return A [`HermesDataPcaCor`] object with R2 values for all sample variables.
a00b4df0
 #'
073ebe65
 #' @seealso [h_pca_df_r2_matrix()] which is used internally for the details.
 #'
40a94888
 #' @export
 #'
 #' @examples
6d249ef9
 #' object <- hermes_data %>%
1e8ab7f0
 #'   add_quality_flags() %>%
 #'   filter() %>%
 #'   normalize()
073ebe65
 #'
 #' # Perform PCA and then correlate the prinicipal components with the sample variables.
40a94888
 #' object_pca <- calc_pca(object)
 #' result <- correlate(object_pca, object)
 setMethod(
   f = "correlate",
1e8ab7f0
   signature = c(object = "HermesDataPca"),
   definition = function(object, data) {
     pca <- object$x
40a94888
     assert_that(
       is_hermes_data(data),
1e8ab7f0
       is(object, "HermesDataPca"),
40a94888
       identical(rownames(pca), colnames(data))
     )
     df <- as.data.frame(colData(data))
     r2_matrix <- h_pca_df_r2_matrix(pca, df)
     .HermesDataPcaCor(r2_matrix)
   }
 )
 
1e8ab7f0
 # HermesDataPcaCor ----
 
 #' @rdname pca_cor_samplevar
 #' @aliases HermesDataPcaCor
 #' @exportClass HermesDataPcaCor
eb3ec4a3
 .HermesDataPcaCor <- setClass( # nolint
1e8ab7f0
   Class = "HermesDataPcaCor",
   contains = "matrix"
 )
40a94888
 
1e8ab7f0
 # autoplot-HermesDataPcaCor ----
40a94888
 
a00b4df0
 #' @describeIn pca_cor_samplevar This plot method uses the [ComplexHeatmap::Heatmap()] function
5a49cd35
 #'   to visualize a [`HermesDataPcaCor`] object.
40a94888
 #'
a00b4df0
 #' @param cor_colors (`function`)\cr color scale function for the correlation values in the heatmap,
40a94888
 #'   produced by [circlize::colorRamp2()].
 #' @param ... other arguments to be passed to [ComplexHeatmap::Heatmap()].
 #'
dca37215
 #' @export
 #'
40a94888
 #' @examples
073ebe65
 #'
 #' # Visualize the correlations in a heatmap.
40a94888
 #' autoplot(result)
a00b4df0
 #'
1e8ab7f0
 #' # We can also choose to not reorder the columns.
 #' autoplot(result, cluster_columns = FALSE)
7895a53a
 #'
 #' # We can also choose break-points for color customization.
93a0a100
 #' autoplot(
 #'   result,
 #'   cor_colors = circlize::colorRamp2(
 #'     c(-0.5, -0.25, 0, 0.25, 0.5, 0.75, 1),
 #'     c("blue", "green", "purple", "yellow", "orange", "red", "brown")
 #'   )
 #' )
40a94888
 setMethod(
   f = "autoplot",
   signature = c(object = "HermesDataPcaCor"),
eb3ec4a3
   definition = function(object,
                         cor_colors = circlize::colorRamp2(
7895a53a
                           c(-1, 0, 1),
                           c("blue", "white", "red")
eb3ec4a3
                         ),
                         ...) {
b7e7bc28
     mat <- as(object, "matrix")
40a94888
     ComplexHeatmap::Heatmap(
b7e7bc28
       matrix = t(mat),
40a94888
       col = cor_colors,
       name = "R2",
       ...
     )
   }
 )