Bioconductor Code: mia

Browse code

NMF ordination with feature loadings computation (#616)

Co-authored-by: Leo Lahti <leo.lahti@iki.fi>
Co-authored-by: TuomasBorman <tvborm@utu.fi>

Théotime Pralas authored on 09/08/2024 11:25:41 • GitHub committed on 09/08/2024 11:25:41
Showing 9 changed files

DESCRIPTION index fe74ddd..a7a376a 100644
NAMESPACE index c44f52e..e9b9009 100644
NEWS index f06e280..087332a 100644
R/addLDA.R index fb18f6b..0111d3f 100644
R/addNMF.R index 0000000..1bf9bb2
man/addLDA.Rd index 9183fab..636bde6 100644
man/addNMF.Rd index 0000000..8fb7466
pkgdown/_pkgdown.yml index c9ef379..0a79e72 100644
tests/testthat/test-5addNMF.R index 0000000..881ea7a

History View file @ 552a952

@@ -1,6 +1,6 @@
                      Package: mia
                      Type: Package
                     -Version: 1.13.34
                     +Version: 1.13.35
                      Authors@R:
                          c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
                                   email = "felix.gm.ernst@outlook.com",
@@ -102,7 +102,8 @@ Suggests:
                          rmarkdown,
                          rhdf5,
                          topicmodels,
                     -    topicdoc
                     +    topicdoc,
                     +    NMF
                      URL: https://github.com/microbiome/mia
                      BugReports: https://github.com/microbiome/mia/issues
                      Roxygen: list(markdown = TRUE)

NAMESPACE

History View file @ 552a952

@@ -12,6 +12,7 @@ export(addDominant)
                      export(addLDA)
                      export(addMediation)
                      export(addNMDS)
                     +export(addNMF)
                      export(addNotContaminantQC)
                      export(addPerSampleDominantFeatures)
                      export(addPerSampleDominantTaxa)
@@ -55,6 +56,7 @@ export(getExperimentCrossCorrelation)
                      export(getLDA)
                      export(getMediation)
                      export(getNMDS)
                     +export(getNMF)
                      export(getPrevalence)
                      export(getPrevalent)
                      export(getPrevalentAbundance)
@@ -152,6 +154,7 @@ exportMethods(addDominant)
                      exportMethods(addHierarchyTree)
                      exportMethods(addLDA)
                      exportMethods(addMediation)
                     +exportMethods(addNMF)
                      exportMethods(addNotContaminantQC)
                      exportMethods(addPerSampleDominantFeatures)
                      exportMethods(addPerSampleDominantTaxa)
@@ -192,6 +195,7 @@ exportMethods(getHierarchyTree)
                      exportMethods(getLDA)
                      exportMethods(getMediation)
                      exportMethods(getNMDS)
                     +exportMethods(getNMF)
                      exportMethods(getPrevalence)
                      exportMethods(getPrevalent)
                      exportMethods(getPrevalentAbundance)

NEWS

History View file @ 552a952

@@ -148,4 +148,7 @@ convertToPhyloseq
                      + Changes in default taxonomy ranks; more ranks supported
                      + Added Tito2024QMP dataset
                      + Added convertToBIOM
                     -+ new methods getLDA and addLDA for LDA ordination with feature loadings computation
                     ++ new methods getLDA and addLDA for LDA ordination with feature loadings
                     +computation
                     ++ new methods getNMF and addNMF for NMF ordination with feature loadings
                     +computation

R/addLDA.R

History View file @ 552a952

@@ -1,11 +1,12 @@
                      #' Latent Dirichlet Allocation
                      #'
                      #' These functions perform Latent Dirichlet Allocation on data stored in a
                     -#'  \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     -#'  object.
                     +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +#' object.
                      #'
                     -#' @param x a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     -#'  object.
                     +#' @param x a
                     +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +#' object.
                      #'
                      #' @param k \code{Integer vector}. A number of latent vectors/topics.
                      #'  (Default: \code{2})
@@ -26,16 +27,18 @@
                      #'
                      #' @return
                      #' For \code{getLDA}, the ordination matrix with feature loadings matrix
                     -#'  as attribute \code{"loadings"}.
                     +#' as attribute \code{"loadings"}.
                      #'
                     -#' For \code{addLDA}, a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     -#'  object is returned containing the ordination matrix in reducedDims(..., name)
                     -#'  with feature loadings matrix as attribute \code{"loadings"}.
                     +#' For \code{addLDA}, a
                     +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +#' object is returned containing the ordination matrix in
                     +#' \code{reducedDim(..., name)} with feature loadings matrix as attribute
                     +#' \code{"loadings"}.
                      #'
                      #' @details
                      #' The functions \code{getLDA} and \code{addLDA} internally use
                     -#'  \code{\link[topicmodels:LDA]{LDA}} to compute the ordination matrix and
                     -#'  feature loadings.
                     +#' \code{\link[topicmodels:LDA]{LDA}} to compute the ordination matrix and
                     +#' feature loadings.
                      #'
                      #' @name addLDA
                      #'
@@ -63,15 +66,13 @@ NULL
                      #' @rdname addLDA
                      #' @export
                     -setGeneric("getLDA", signature = c("x"),
                     -           function(x, ...)
                     -             standardGeneric("getLDA"))
                     +setGeneric(
                     +    "getLDA", signature = c("x"), function(x, ...) standardGeneric("getLDA"))
                      #' @rdname addLDA
                      #' @export
                     -setGeneric("addLDA", signature = c("x"),
                     -           function(x, ...)
                     -             standardGeneric("addLDA"))
                     +setGeneric(
                     +    "addLDA", signature = c("x"), function(x, ...) standardGeneric("addLDA"))
                      #' @export
                      #' @rdname addLDA

R/addNMF.R

History View file @ 552a952

                     new file mode 100644
@@ -0,0 +1,181 @@
                     +#' Non-negative Matrix Factorization
                     +#'
                     +#' These functions perform Non-negative Matrix Factorization on data stored in a
                     +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +#' object.
                     +#'
                     +#' @param x a
                     +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +#' object.
                     +#'
                     +#' @param k \code{numeric vector}. A number of latent vectors/topics.
                     +#' (Default: \code{2})
                     +#'
                     +#' @param name \code{Character scalar}. The name to be used to store the result
                     +#' in the reducedDims of the output. (Default: \code{"NMF"})
                     +#'
                     +#' @param assay.type \code{Character scalar}. Specifies which assay to use for
                     +#' NMF ordination. (Default: \code{"counts"})
                     +#'
                     +#' @param eval.metric \code{Character scalar}. Specifies the evaluation metric
                     +#' that will be used to select the model with the best fit. Must be one of the
                     +#' following options: \code{"evar"} (explained variance; maximized),
                     +#' \code{"sparseness.basis"} (degree of sparsity in the basis matrix;
                     +#' maximized), \code{"sparseness.coef"} (degree of sparsity in the coefficient
                     +#' matrix; maximized), \code{"rss"} (residual sum of squares; minimized),
                     +#' \code{"silhouette.coef"} (quality of clustering based on the coefficient
                     +#' matrix; maximized), \code{"silhouette.basis"} (quality of clustering based
                     +#' on the basis matrix; maximized), \code{"cophenetic"} (correlation between
                     +#' cophenetic distances and original distances; maximized), \code{"dispersion"}
                     +#' (spread of data points within clusters; minimized). (Default: \code{"evar"})
                     +#'
                     +#' @param ... optional arguments passed to \code{NMF::nmf}.
                     +#'
                     +#' @return
                     +#' For \code{getNMF}, the ordination matrix with feature loadings matrix
                     +#' as attribute \code{"loadings"}.
                     +#'
                     +#' For \code{addNMF}, a
                     +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +#' object is returned containing the ordination matrix in
                     +#' \code{reducedDim(x, name)} with the following attributes:
                     +#' \itemize{
                     +#'   \item "loadings" which is a matrix containing the feature loadings
                     +#'   \item "NMF_output" which is the output of function \code{NMF::nmf}
                     +#'   \item "best_fit" which is the result of the best fit if k is a vector of
                     +#'   integers
                     +#' }
                     +#'
                     +#' @details
                     +#' The functions \code{getNMF} and \code{addNMF} internally use \code{NMF::nmf}
                     +#' to compute the ordination matrix and
                     +#' feature loadings.
                     +#'
                     +#' If k is a vector of integers, NMF output is calculated for all the rank
                     +#' values contained in k, and the best fit is selected based on
                     +#' \code{eval.metric} value.
                     +#'
                     +#' @name addNMF
                     +#'
                     +#' @examples
                     +#' data(GlobalPatterns)
                     +#' tse <- GlobalPatterns
                     +#'
                     +#' # Reduce the number of features
                     +#' tse <- agglomerateByPrevalence(tse, rank = "Phylum")
                     +#'
                     +#' # Run NMF and add the result to reducedDim(tse, "NMF").
                     +#' tse <- addNMF(tse, k = 2, name = "NMF")
                     +#'
                     +#' # Extract feature loadings
                     +#' loadings_NMF <- attr(reducedDim(tse, "NMF"), "loadings")
                     +#' head(loadings_NMF)
                     +#'
                     +#' # Estimate models with number of topics from 2 to 4. Perform 2 runs.
                     +#' tse <- addNMF(tse, k = c(2, 3, 4), name = "NMF_4", nrun = 2)
                     +#'
                     +#' # Extract feature loadings
                     +#' loadings_NMF_4 <- attr(reducedDim(tse, "NMF_4"), "loadings")
                     +#' head(loadings_NMF_4)
                     +#'
                     +NULL
+                    +
                     +#' @rdname addNMF
                     +#' @export
                     +setGeneric(
                     +    "getNMF", signature = c("x"), function(x, ...) standardGeneric("getNMF"))
+                    +
                     +#' @rdname addNMF
                     +#' @export
                     +setGeneric(
                     +    "addNMF", signature = c("x"), function(x, ...) standardGeneric("addNMF"))
+                    +
                     +#' @export
                     +#' @rdname addNMF
                     +setMethod("getNMF", "SummarizedExperiment",
                     +    function(
                     +        x, k = 2, assay.type = "counts", eval.metric = "evar", ...){
                     +    # Run NMF on the transposed assay and return the score matrix (W of the
                     +    # selected fit). The transposed coefficient matrix (H) is attached as
                     +    # attribute "loadings", the raw NMF::nmf() output as "NMF_output" and,
                     +    # when several ranks were fitted, the selected model as "best_fit".
                     +    .require_package("NMF")
                     +    # Validate the assay before the search path is modified, so that an
                     +    # invalid 'assay.type' does not leave the NMF package attached.
                     +    .check_assay_present(assay.type, x)
                     +    # Both NMF and DelayedArray have method seed(). When running
                     +    # NMF::nmf() an error occurs due to wrong method. That is why NMF
                     +    # is detached (if it is already attached) and attached again here.
                     +    if( "NMF" %in% (.packages()) ){
                     +        detach("package:NMF", unload = TRUE)
                     +    }
                     +    library("NMF")
                     +    # The NMF package is detached when the function exits. Registering
                     +    # the cleanup with on.exit() ensures that it also happens when the
                     +    # fit or the model selection below raises an error.
                     +    on.exit({
                     +        if( "package:NMF" %in% search() ){
                     +            detach("package:NMF", unload = TRUE)
                     +        }
                     +    }, add = TRUE)
                     +    # Calculate NMF ordination
                     +    mat <- t(assay(x, assay.type))
                     +    res <- NMF::nmf(mat, k, ...)
                     +    # Check if the output includes multiple ordinations with different k
                     +    # values. If it does, get the best fit based on the evaluation
                     +    # metric.
                     +    multiple_k <- is(res, "NMF.rank")
                     +    if( multiple_k ){
                     +        best_fit <- .get_best_nmf_fit(res, eval.metric)
                     +    } else{
                     +        best_fit <- res
                     +    }
                     +    # Get scores and loadings, add loadings and NMF output to attributes of
                     +    # scores
                     +    scores <- best_fit@fit@W
                     +    loadings <- best_fit@fit@H
                     +    attr(scores, "loadings") <- t(loadings)
                     +    attr(scores, "NMF_output") <- res
                     +    # Add best fit if multiple k values
                     +    if( multiple_k ){
                     +        attr(scores, "best_fit") <- best_fit
                     +    }
                     +    # Return scores with loadings, NMF output and best fit as attributes
                     +    return(scores)
                     +}
                     +)
+                    +
                     +#' @export
                     +#' @rdname addNMF
                     +setMethod("addNMF", "SummarizedExperiment",
                     +    function(
                     +        x, k = 2, assay.type = "counts", eval.metric = "evar", name = "NMF",
                     +        ...){
                     +    # Compute the NMF ordination with getNMF() and store the resulting
                     +    # score matrix (with its attributes) in the reducedDims of 'x' under
                     +    # 'name'. Returns the modified object.
                     +    # Input checks
                     +    if( !.is_a_string(name) ){
                     +        stop("'name' must be a non-empty single character value.",
                     +            call. = FALSE)
                     +    }
                     +    # Fit the model. 'eval.metric' must be forwarded explicitly; otherwise
                     +    # getNMF() falls back to its own default and the metric chosen by the
                     +    # caller is silently ignored.
                     +    nmf <- getNMF(
                     +        x, k = k, assay.type = assay.type, eval.metric = eval.metric, ...)
                     +    # Add scores matrix with loadings as attribute to reducedDims
                     +    x <- .add_values_to_reducedDims(x, name = name, values = nmf)
                     +    return(x)
                     +    }
                     +)
+                    +
                     +################################ HELP FUNCTIONS ################################
                     +# Select the best model from an NMF result that was fitted with several
                     +# ranks. 'res' must provide a "measures" table with one column per quality
                     +# measure and a "fit" list of fitted models (assumed to be in the same order
                     +# as the rows of "measures"). 'eval.metric' is the name of the measure used
                     +# for the selection; depending on the measure the largest or the smallest
                     +# value wins. Returns the selected element of "fit". An error is raised if
                     +# 'eval.metric' is not supported or has no usable values.
                     +.get_best_nmf_fit <- function(res, eval.metric){
                     +    # Whether each supported metric is maximized (TRUE) or minimized (FALSE)
                     +    maximize <- c(
                     +        "sparseness.basis" = TRUE,
                     +        "sparseness.coef" = TRUE,
                     +        "rss" = FALSE,
                     +        "evar" = TRUE,
                     +        "silhouette.coef" = TRUE,
                     +        "silhouette.basis" = TRUE,
                     +        "cophenetic" = TRUE,
                     +        "dispersion" = FALSE
                     +        )
                     +    # Fail with an informative message; an unsupported value would
                     +    # otherwise yield NA below and a cryptic error from if()
                     +    is_supported <- is.character(eval.metric) &&
                     +        length(eval.metric) == 1L && eval.metric %in% names(maximize)
                     +    if( !is_supported ){
                     +        stop("'eval.metric' must be one of the following options: '",
                     +            paste(names(maximize), collapse = "', '"), "'",
                     +            call. = FALSE)
                     +    }
                     +    if( maximize[[eval.metric]] ){
                     +        FUN <- which.max
                     +    } else{
                     +        FUN <- which.min
                     +    }
                     +    # Get the index of best fit
                     +    measures <- res[["measures"]]
                     +    values <- measures[[eval.metric]]
                     +    ind <- FUN(values)
                     +    # which.max/which.min return integer(0) when the measure is missing
                     +    # from the table or all of its values are NA
                     +    if( length(ind) != 1L ){
                     +        stop("Evaluation metric '", eval.metric, "' is not available for ",
                     +            "any of the fitted NMF models.", call. = FALSE)
                     +    }
                     +    # Get the model of best fit
                     +    model <- res[["fit"]][[ind]]
                     +    return(model)
                     +}

man/addLDA.Rd

History View file @ 552a952

@@ -16,7 +16,8 @@ addLDA(x, ...)
                      \S4method{addLDA}{SummarizedExperiment}(x, k = 2, assay.type = "counts", name = "LDA", ...)
+                     }
                      \arguments{
                     -\item{x}{a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +\item{x}{a
                     +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                      object.}
                      \item{...}{optional arguments passed to \code{\link[topicmodels:LDA]{LDA}}}
@@ -40,9 +41,11 @@ in the reducedDims of the output. (Default: \code{"LDA"})}
                      For \code{getLDA}, the ordination matrix with feature loadings matrix
                      as attribute \code{"loadings"}.
                     -For \code{addLDA}, a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     -object is returned containing the ordination matrix in reducedDims(..., name)
                     -with feature loadings matrix as attribute \code{"loadings"}.
                     +For \code{addLDA}, a
                     +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +object is returned containing the ordination matrix in
                     +\code{reducedDim(..., name)} with feature loadings matrix as attribute
                     +\code{"loadings"}.
+                     }
                      \description{
                      These functions perform Latent Dirichlet Allocation on data stored in a

man/addNMF.Rd

History View file @ 552a952

                     new file mode 100644
@@ -0,0 +1,103 @@
                     +% Generated by roxygen2: do not edit by hand
                     +% Please edit documentation in R/addNMF.R
                     +\name{addNMF}
                     +\alias{addNMF}
                     +\alias{getNMF}
                     +\alias{getNMF,SummarizedExperiment-method}
                     +\alias{addNMF,SummarizedExperiment-method}
                     +\title{Non-negative Matrix Factorization}
                     +\usage{
                     +getNMF(x, ...)
+                    +
                     +addNMF(x, ...)
+                    +
                     +\S4method{getNMF}{SummarizedExperiment}(x, k = 2, assay.type = "counts", eval.metric = "evar", ...)
+                    +
                     +\S4method{addNMF}{SummarizedExperiment}(
                     +  x,
                     +  k = 2,
                     +  assay.type = "counts",
                     +  eval.metric = "evar",
                     +  name = "NMF",
                     +  ...
                     +)
                     +}
                     +\arguments{
                     +\item{x}{a
                     +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +object.}
+                    +
                     +\item{...}{optional arguments passed to \code{NMF::nmf}.}
+                    +
                     +\item{k}{\code{numeric vector}. A number of latent vectors/topics.
                     +(Default: \code{2})}
+                    +
                     +\item{assay.type}{\code{Character scalar}. Specifies which assay to use for
                     +NMF ordination. (Default: \code{"counts"})}
+                    +
                     +\item{eval.metric}{\code{Character scalar}. Specifies the evaluation metric
                     +that will be used to select the model with the best fit. Must be one of the
                     +following options: \code{"evar"} (explained variance; maximized),
                     +\code{"sparseness.basis"} (degree of sparsity in the basis matrix;
                     +maximized), \code{"sparseness.coef"} (degree of sparsity in the coefficient
                     +matrix; maximized), \code{"rss"} (residual sum of squares; minimized),
                     +\code{"silhouette.coef"} (quality of clustering based on the coefficient
                     +matrix; maximized), \code{"silhouette.basis"} (quality of clustering based
                     +on the basis matrix; maximized), \code{"cophenetic"} (correlation between
                     +cophenetic distances and original distances; maximized), \code{"dispersion"}
                     +(spread of data points within clusters; minimized). (Default: \code{"evar"})}
+                    +
                     +\item{name}{\code{Character scalar}. The name to be used to store the result
                     +in the reducedDims of the output. (Default: \code{"NMF"})}
                     +}
                     +\value{
                     +For \code{getNMF}, the ordination matrix with feature loadings matrix
                     +as attribute \code{"loadings"}.
+                    +
                     +For \code{addNMF}, a
                     +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +object is returned containing the ordination matrix in
                     +\code{reducedDim(x, name)} with the following attributes:
                     +\itemize{
                     +\item "loadings" which is a matrix containing the feature loadings
                     +\item "NMF_output" which is the output of function \code{NMF::nmf}
                     +\item "best_fit" which is the result of the best fit if k is a vector of
                     +integers
                     +}
                     +}
                     +\description{
                     +These functions perform Non-negative Matrix Factorization on data stored in a
                     +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
                     +object.
                     +}
                     +\details{
                     +The functions \code{getNMF} and \code{addNMF} internally use \code{NMF::nmf}
                     +to compute the ordination matrix and
                     +feature loadings.
+                    +
                     +If k is a vector of integers, NMF output is calculated for all the rank
                     +values contained in k, and the best fit is selected based on
                     +\code{eval.metric} value.
                     +}
                     +\examples{
                     +data(GlobalPatterns)
                     +tse <- GlobalPatterns
+                    +
                     +# Reduce the number of features
                     +tse <- agglomerateByPrevalence(tse, rank = "Phylum")
+                    +
                     +# Run NMF and add the result to reducedDim(tse, "NMF").
                     +tse <- addNMF(tse, k = 2, name = "NMF")
+                    +
                     +# Extract feature loadings
                     +loadings_NMF <- attr(reducedDim(tse, "NMF"), "loadings")
                     +head(loadings_NMF)
+                    +
                     +# Estimate models with number of topics from 2 to 4. Perform 2 runs.
                     +tse <- addNMF(tse, k = c(2, 3, 4), name = "NMF_4", nrun = 2)
+                    +
                     +# Extract feature loadings
                     +loadings_NMF_4 <- attr(reducedDim(tse, "NMF_4"), "loadings")
                     +head(loadings_NMF_4)
+                    +
                     +}

pkgdown/_pkgdown.yml

History View file @ 552a952

@@ -69,6 +69,7 @@ reference:
                        - runCCA
                        - runDPCoA
                        - addLDA
                     +  - addNMF
                      - title: Package
                      - contents:

tests/testthat/test-5addNMF.R

History View file @ 552a952

                     new file mode 100644
@@ -0,0 +1,58 @@
                     +context("addNMF")
                     +test_that("addNMF", {
                     +  skip_if_not_installed("NMF")
                     +  data(GlobalPatterns, package="mia")
                     +  # Fix the RNG seed and reduce the number of features before fitting
                     +  set.seed(123)
                     +  tse <- GlobalPatterns
                     +  tse <- agglomerateByPrevalence(tse, rank = "Phylum")
                     +  tse <- addNMF(tse, k = 2, seed = 123)
                     +  expect_named(reducedDims(tse),"NMF")
                     +  expect_true(is.matrix(reducedDim(tse,"NMF")))
                     +  expect_equal(dim(reducedDim(tse,"NMF")),c(26,2)) # samples x k
                     +  red <- reducedDim(tse,"NMF")
                     +  expect_equal(names(attributes(red)),
                     +               c("dim","dimnames","loadings", "NMF_output"))
                     +  expect_equal(dim(attr(red,"loadings")),c(35,2)) # features x k
                     +  # Check that the feature loadings computed directly with NMF::nmf are
                     +  # the same as the ones returned by addNMF and getNMF
                     +  mat <- t(assay(tse, "counts"))
                     +  library("NMF") # getNMF detaches NMF before returning; attach it again
                     +  nmf_model <- NMF::nmf(mat, rank = 2, seed = 123)
                     +  loadings <- t(nmf_model@fit@H)
                     +  # Compare loadings of NMF::nmf and addNMF
                     +  expect_equal(loadings, attr(red, "loadings"), tolerance = 10**-3)
                     +  scores2 <- getNMF(tse, k = 2, seed = 123)
                     +  # Compare loadings of NMF::nmf and getNMF
                     +  expect_equal(loadings, attr(scores2, "loadings"), tolerance = 10**-4)
                     +  # Test that additional parameters (here 'nrun') are passed to NMF::nmf
                     +  scores3 <- getNMF(tse, k = 2, nrun = 2)
                     +  library("NMF") # attach again after the getNMF call above
                     +  nmf_model <- NMF::nmf(mat, rank = 2, nrun = 2)
                     +  expect_equal(attr(scores3, "NMF_output")@nrun, nmf_model@nrun)
                     +  # Invalid 'k', 'assay.type' and 'name' values must raise an error
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = "test", assay.type = "counts", name = "NMF")
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = 1.5, assay.type = "counts", name = "NMF")
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = TRUE, assay.type = "counts", name = "NMF")
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = 2, assay.type = "test", name = "NMF")
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = 2, assay.type = 1, name = "NMF")
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = 2, assay.type = TRUE, name = "NMF")
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = 2, assay.type = "counts", name = 1)
                     +  )
                     +  expect_error(
                     +    addNMF(GlobalPatterns, k = 2, assay.type = "counts", name = TRUE)
                     +  )
                     +})