Browse code

NMF ordination with feature loadings computation (#616)

Co-authored-by: Leo Lahti <leo.lahti@iki.fi>
Co-authored-by: TuomasBorman <tvborm@utu.fi>

Théotime Pralas authored on 09/08/2024 11:25:41 • GitHub committed on 09/08/2024 11:25:41
Showing 9 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: mia
2 2
 Type: Package
3
-Version: 1.13.34
3
+Version: 1.13.35
4 4
 Authors@R:
5 5
     c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
6 6
              email = "felix.gm.ernst@outlook.com",
... ...
@@ -102,7 +102,8 @@ Suggests:
102 102
     rmarkdown,
103 103
     rhdf5,
104 104
     topicmodels,
105
-    topicdoc
105
+    topicdoc,
106
+    NMF
106 107
 URL: https://github.com/microbiome/mia
107 108
 BugReports: https://github.com/microbiome/mia/issues
108 109
 Roxygen: list(markdown = TRUE)
... ...
@@ -12,6 +12,7 @@ export(addDominant)
12 12
 export(addLDA)
13 13
 export(addMediation)
14 14
 export(addNMDS)
15
+export(addNMF)
15 16
 export(addNotContaminantQC)
16 17
 export(addPerSampleDominantFeatures)
17 18
 export(addPerSampleDominantTaxa)
... ...
@@ -55,6 +56,7 @@ export(getExperimentCrossCorrelation)
55 56
 export(getLDA)
56 57
 export(getMediation)
57 58
 export(getNMDS)
59
+export(getNMF)
58 60
 export(getPrevalence)
59 61
 export(getPrevalent)
60 62
 export(getPrevalentAbundance)
... ...
@@ -152,6 +154,7 @@ exportMethods(addDominant)
152 154
 exportMethods(addHierarchyTree)
153 155
 exportMethods(addLDA)
154 156
 exportMethods(addMediation)
157
+exportMethods(addNMF)
155 158
 exportMethods(addNotContaminantQC)
156 159
 exportMethods(addPerSampleDominantFeatures)
157 160
 exportMethods(addPerSampleDominantTaxa)
... ...
@@ -192,6 +195,7 @@ exportMethods(getHierarchyTree)
192 195
 exportMethods(getLDA)
193 196
 exportMethods(getMediation)
194 197
 exportMethods(getNMDS)
198
+exportMethods(getNMF)
195 199
 exportMethods(getPrevalence)
196 200
 exportMethods(getPrevalent)
197 201
 exportMethods(getPrevalentAbundance)
... ...
@@ -148,4 +148,7 @@ convertToPhyloseq
148 148
 + Changes in default taxonomy ranks; more ranks supported
149 149
 + Added Tito2024QMP dataset
150 150
 + Added convertToBIOM
151
-+ new methods getLDA and addLDA for LDA ordination with feature loadings computation
151
++ new methods getLDA and addLDA for LDA ordination with feature loadings
152
+computation
153
++ new methods getNMF and addNMF for NMF ordination with feature loadings
154
+computation
... ...
@@ -1,11 +1,12 @@
1 1
 #' Latent Dirichlet Allocation
2 2
 #'
3 3
 #' These functions perform Latent Dirichlet Allocation on data stored in a 
4
-#'  \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
5
-#'  object.
4
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
5
+#' object.
6 6
 #' 
7
-#' @param x a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
8
-#'  object.
7
+#' @param x a
8
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
9
+#' object.
9 10
 #'   
10 11
 #' @param k \code{Integer vector}. A number of latent vectors/topics. 
11 12
 #'  (Default: \code{2})
... ...
@@ -26,16 +27,18 @@
26 27
 #' 
27 28
 #' @return 
28 29
 #' For \code{getLDA}, the ordination matrix with feature loadings matrix
29
-#'  as attribute \code{"loadings"}.
30
+#' as attribute \code{"loadings"}.
30 31
 #'  
31
-#' For \code{addLDA}, a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
32
-#'  object is returned containing the ordination matrix in reducedDims(..., name)
33
-#'  with feature loadings matrix as attribute \code{"loadings"}.
32
+#' For \code{addLDA}, a
33
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
34
+#' object is returned containing the ordination matrix in
35
+#' \code{reducedDim(..., name)} with feature loadings matrix as attribute
36
+#' \code{"loadings"}.
34 37
 #'  
35 38
 #' @details 
36 39
 #' The functions \code{getLDA} and \code{addLDA} internally use 
37
-#'  \code{\link[topicmodels:LDA]{LDA}} to compute the ordination matrix and 
38
-#'  feature loadings.
40
+#' \code{\link[topicmodels:LDA]{LDA}} to compute the ordination matrix and 
41
+#' feature loadings.
39 42
 #'  
40 43
 #' @name addLDA
41 44
 #' 
... ...
@@ -63,15 +66,13 @@ NULL
63 66
 
64 67
 #' @rdname addLDA
65 68
 #' @export
66
-setGeneric("getLDA", signature = c("x"),
67
-           function(x, ...)
68
-             standardGeneric("getLDA"))
69
+setGeneric(
70
+    "getLDA", signature = c("x"), function(x, ...) standardGeneric("getLDA"))
69 71
 
70 72
 #' @rdname addLDA
71 73
 #' @export
72
-setGeneric("addLDA", signature = c("x"),
73
-           function(x, ...)
74
-             standardGeneric("addLDA"))
74
+setGeneric(
75
+    "addLDA", signature = c("x"), function(x, ...) standardGeneric("addLDA"))
75 76
 
76 77
 #' @export
77 78
 #' @rdname addLDA
78 79
new file mode 100644
... ...
@@ -0,0 +1,181 @@
1
+#' Non-negative Matrix Factorization
2
+#'
3
+#' These functions perform Non-negative Matrix Factorization on data stored in a
4
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
5
+#' object.
6
+#' 
7
+#' @param x a
8
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
9
+#' object.
10
+#'   
11
+#' @param k \code{numeric vector}. A number of latent vectors/topics. 
12
+#' (Default: \code{2})
13
+#' 
14
+#' @param name \code{Character scalar}. The name to be used to store the result 
15
+#' in the reducedDims of the output. (Default: \code{"NMF"})
16
+#'  
17
+#' @param assay.type \code{Character scalar}. Specifies which assay to use for 
18
+#' NMF ordination. (Default: \code{"counts"})
19
+#'  
20
+#' @param eval.metric \code{Character scalar}. Specifies the evaluation metric
21
+#' that will be used to select the model with the best fit. Must be one of the
22
+#' following options: \code{"evar"} (explained variance; maximized),
23
+#' \code{"sparseness.basis"} (degree of sparsity in the basis matrix;
24
+#' maximized), \code{"sparseness.coef"} (degree of sparsity in the coefficient
25
+#' matrix; maximized), \code{"rss"} (residual sum of squares; minimized),
26
+#' \code{"silhouette.coef"} (quality of clustering based on the coefficient
27
+#' matrix; maximized), \code{"silhouette.basis"} (quality of clustering based
28
+#' on the basis matrix; maximized), \code{"cophenetic"} (correlation between
29
+#' cophenetic distances and original distances; maximized), \code{"dispersion"}
30
+#' (spread of data points within clusters; minimized). (Default: \code{"evar"})
31
+#' 
32
+#' @param ... optional arguments passed to \code{nmf::NMF}.
33
+#' 
34
+#' @return 
35
+#' For \code{getNMF}, the ordination matrix with feature loadings matrix
36
+#' as attribute \code{"loadings"}.
37
+#'  
38
+#' For \code{addNMF}, a
39
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
40
+#' object is returned containing the ordination matrix in
41
+#' \code{reducedDims(x, name)} with the following attributes:
42
+#' \itemize{
43
+#'   \item "loadings" which is a matrix containing the feature loadings
44
+#'   \item "NMF_output" which is the output of function \code{nmf::NMF} 
45
+#'   \item "best_fit" which is the result of the best fit if k is a vector of
46
+#'   integers
47
+#' }
48
+#' 
49
+#' @details 
50
+#' The functions \code{getNMF} and \code{addNMF} internally use \code{nmf::NMF} 
51
+#' compute the ordination matrix and 
52
+#' feature loadings.
53
+#'  
54
+#' If k is a vector of integers, NMF output is calculated for all the rank 
55
+#' values contained in k, and the best fit is selected based on 
56
+#' \code{eval.metric} value.
57
+#'  
58
+#' @name addNMF
59
+#' 
60
+#' @examples
61
+#' data(GlobalPatterns)
62
+#' tse <- GlobalPatterns
63
+#' 
64
+#' # Reduce the number of features
65
+#' tse <- agglomerateByPrevalence(tse, rank = "Phylum")
66
+#' 
67
+#' # Run NMF and add the result to reducedDim(tse, "NMF").
68
+#' tse <- addNMF(tse, k = 2, name = "NMF")
69
+#' 
70
+#' # Extract feature loadings
71
+#' loadings_NMF <- attr(reducedDim(tse, "NMF"), "loadings")
72
+#' head(loadings_NMF)
73
+#' 
74
+#' # Estimate models with number of topics from 2 to 4. Perform 2 runs.
75
+#' tse <- addNMF(tse, k = c(2, 3, 4), name = "NMF_4", nrun = 2)
76
+#' 
77
+#' # Extract feature loadings
78
+#' loadings_NMF_4 <- attr(reducedDim(tse, "NMF_4"), "loadings")
79
+#' head(loadings_NMF_4)
80
+#' 
81
+NULL
82
+
83
+#' @rdname addNMF
84
+#' @export
85
+setGeneric(
86
+    "getNMF", signature = c("x"), function(x, ...) standardGeneric("getNMF"))
87
+
88
+#' @rdname addNMF
89
+#' @export
90
+setGeneric(
91
+    "addNMF", signature = c("x"), function(x, ...) standardGeneric("addNMF"))
92
+
93
+#' @export
94
+#' @rdname addNMF
95
+setMethod("getNMF", "SummarizedExperiment",
96
+    function(
97
+        x, k = 2, assay.type = "counts", eval.metric = "evar", ...){
98
+    .require_package("NMF")
99
+    # Both NMF and DelayedArray have method seed(). When running
100
+    # NMF::nmf() an error occurs due to wrong method. That is why NMF
101
+    # is first loaded into the session.
102
+    if( "NMF" %in% (.packages()) ){
103
+        detach("package:NMF", unload = TRUE)
104
+    }
105
+    library("NMF")
106
+    .check_assay_present(assay.type, x)
107
+    # Calculate NMF ordination
108
+    mat <- t(assay(x, assay.type))
109
+    res <- NMF::nmf(mat, k, ...)
110
+    # Check oif the output includes multiple ordination with different k values.
111
+    # If it includes multiple, get the best fit based on certain evaluation
112
+    # metric.
113
+    if( is(res, "NMF.rank") ){
114
+        best_fit <- .get_best_nmf_fit(res, eval.metric)
115
+    } else{
116
+        best_fit <- res
117
+    }
118
+    # Get scores and loadings, add loadings and NMF output to attributes of
119
+    # scores
120
+    scores <- best_fit@fit@W
121
+    loadings <- best_fit@fit@H
122
+    attr(scores, "loadings") <- t(loadings)
123
+    attr(scores, "NMF_output") <- res
124
+    # Add best fit if multiple k values
125
+    if( is(res, "NMF.rank") ){
126
+        attr(scores, "best_fit") <- best_fit
127
+    }
128
+    # The NMF package is unloaded
129
+    detach("package:NMF", unload = TRUE)
130
+    # Return scores with loadings, metrics and model as attribute
131
+    return(scores)
132
+}
133
+)
134
+
135
+#' @export
136
+#' @rdname addNMF
137
+setMethod("addNMF", "SummarizedExperiment",
138
+    function(
139
+        x, k = 2, assay.type = "counts", eval.metric = "evar", name = "NMF",
140
+        ...){
141
+    # Input checks
142
+    if( !.is_a_string(name) ){
143
+        stop("'name' must be a non-empty single character value.",
144
+            call. = FALSE)
145
+    }
146
+    # Fit the model
147
+    nmf <- getNMF(x, k = k, assay.type = assay.type, ...)
148
+    # Add scores matrix with loadings as attribute to reducedDims
149
+    x <- .add_values_to_reducedDims(x, name = name, values = nmf)
150
+    return(x)
151
+    }
152
+)
153
+
154
+################################ HELP FUNCTIONS ################################
155
+# This function is for evaluating a fit of NMF models
156
+.get_best_nmf_fit <- function(res, eval.metric){
157
+    # Get whether the metric is maximized or minimized
158
+    maximize <- c(
159
+        "sparseness.basis" = TRUE,
160
+        "sparseness.coef" = TRUE,
161
+        "rss" = FALSE,
162
+        "evar" = TRUE,
163
+        "silhouette.coef" = TRUE,
164
+        "silhouette.basis" = TRUE,
165
+        "cophenetic" = TRUE,
166
+        "dispersion" = FALSE
167
+        )
168
+    maximize <- maximize[ eval.metric ]
169
+    if( maximize ){
170
+        FUN <- which.max
171
+    } else{
172
+        FUN <- which.min
173
+    }
174
+    # Get the index of best fit
175
+    measures <- res[["measures"]]
176
+    values <- measures[[eval.metric]]
177
+    ind <- FUN(values)
178
+    # Get the model of best fit
179
+    model <- res[["fit"]][[ind]]
180
+    return(model)
181
+}
... ...
@@ -16,7 +16,8 @@ addLDA(x, ...)
16 16
 \S4method{addLDA}{SummarizedExperiment}(x, k = 2, assay.type = "counts", name = "LDA", ...)
17 17
 }
18 18
 \arguments{
19
-\item{x}{a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
19
+\item{x}{a
20
+\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
20 21
 object.}
21 22
 
22 23
 \item{...}{optional arguments passed to \code{\link[topicmodels:LDA]{LDA}}}
... ...
@@ -40,9 +41,11 @@ in the reducedDims of the output. (Default: \code{"LDA"})}
40 41
 For \code{getLDA}, the ordination matrix with feature loadings matrix
41 42
 as attribute \code{"loadings"}.
42 43
 
43
-For \code{addLDA}, a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
44
-object is returned containing the ordination matrix in reducedDims(..., name)
45
-with feature loadings matrix as attribute \code{"loadings"}.
44
+For \code{addLDA}, a
45
+\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
46
+object is returned containing the ordination matrix in
47
+\code{reducedDim(..., name)} with feature loadings matrix as attribute
48
+\code{"loadings"}.
46 49
 }
47 50
 \description{
48 51
 These functions perform Latent Dirichlet Allocation on data stored in a
49 52
new file mode 100644
... ...
@@ -0,0 +1,103 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/addNMF.R
3
+\name{addNMF}
4
+\alias{addNMF}
5
+\alias{getNMF}
6
+\alias{getNMF,SummarizedExperiment-method}
7
+\alias{addNMF,SummarizedExperiment-method}
8
+\title{Non-negative Matrix Factorization}
9
+\usage{
10
+getNMF(x, ...)
11
+
12
+addNMF(x, ...)
13
+
14
+\S4method{getNMF}{SummarizedExperiment}(x, k = 2, assay.type = "counts", eval.metric = "evar", ...)
15
+
16
+\S4method{addNMF}{SummarizedExperiment}(
17
+  x,
18
+  k = 2,
19
+  assay.type = "counts",
20
+  eval.metric = "evar",
21
+  name = "NMF",
22
+  ...
23
+)
24
+}
25
+\arguments{
26
+\item{x}{a
27
+\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
28
+object.}
29
+
30
+\item{...}{optional arguments passed to \code{nmf::NMF}.}
31
+
32
+\item{k}{\code{numeric vector}. A number of latent vectors/topics.
33
+(Default: \code{2})}
34
+
35
+\item{assay.type}{\code{Character scalar}. Specifies which assay to use for
36
+NMF ordination. (Default: \code{"counts"})}
37
+
38
+\item{eval.metric}{\code{Character scalar}. Specifies the evaluation metric
39
+that will be used to select the model with the best fit. Must be one of the
40
+following options: \code{"evar"} (explained variance; maximized),
41
+\code{"sparseness.basis"} (degree of sparsity in the basis matrix;
42
+maximized), \code{"sparseness.coef"} (degree of sparsity in the coefficient
43
+matrix; maximized), \code{"rss"} (residual sum of squares; minimized),
44
+\code{"silhouette.coef"} (quality of clustering based on the coefficient
45
+matrix; maximized), \code{"silhouette.basis"} (quality of clustering based
46
+on the basis matrix; maximized), \code{"cophenetic"} (correlation between
47
+cophenetic distances and original distances; maximized), \code{"dispersion"}
48
+(spread of data points within clusters; minimized). (Default: \code{"evar"})}
49
+
50
+\item{name}{\code{Character scalar}. The name to be used to store the result
51
+in the reducedDims of the output. (Default: \code{"NMF"})}
52
+}
53
+\value{
54
+For \code{getNMF}, the ordination matrix with feature loadings matrix
55
+as attribute \code{"loadings"}.
56
+
57
+For \code{addNMF}, a
58
+\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
59
+object is returned containing the ordination matrix in
60
+\code{reducedDims(x, name)} with the following attributes:
61
+\itemize{
62
+\item "loadings" which is a matrix containing the feature loadings
63
+\item "NMF_output" which is the output of function \code{nmf::NMF}
64
+\item "best_fit" which is the result of the best fit if k is a vector of
65
+integers
66
+}
67
+}
68
+\description{
69
+These functions perform Non-negative Matrix Factorization on data stored in a
70
+\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
71
+object.
72
+}
73
+\details{
74
+The functions \code{getNMF} and \code{addNMF} internally use \code{nmf::NMF}
75
+compute the ordination matrix and
76
+feature loadings.
77
+
78
+If k is a vector of integers, NMF output is calculated for all the rank
79
+values contained in k, and the best fit is selected based on
80
+\code{eval.metric} value.
81
+}
82
+\examples{
83
+data(GlobalPatterns)
84
+tse <- GlobalPatterns
85
+
86
+# Reduce the number of features
87
+tse <- agglomerateByPrevalence(tse, rank = "Phylum")
88
+
89
+# Run NMF and add the result to reducedDim(tse, "NMF").
90
+tse <- addNMF(tse, k = 2, name = "NMF")
91
+
92
+# Extract feature loadings
93
+loadings_NMF <- attr(reducedDim(tse, "NMF"), "loadings")
94
+head(loadings_NMF)
95
+
96
+# Estimate models with number of topics from 2 to 4. Perform 2 runs.
97
+tse <- addNMF(tse, k = c(2, 3, 4), name = "NMF_4", nrun = 2)
98
+
99
+# Extract feature loadings
100
+loadings_NMF_4 <- attr(reducedDim(tse, "NMF_4"), "loadings")
101
+head(loadings_NMF_4)
102
+
103
+}
... ...
@@ -69,6 +69,7 @@ reference:
69 69
   - runCCA
70 70
   - runDPCoA
71 71
   - addLDA
72
+  - addNMF
72 73
 
73 74
 - title: Package
74 75
 - contents:
75 76
new file mode 100644
... ...
@@ -0,0 +1,58 @@
1
+context("addNMF")
2
+test_that("addNMF", {
3
+  skip_if_not_installed("NMF")
4
+  data(GlobalPatterns, package="mia")
5
+  #
6
+  set.seed(123)
7
+  tse <- GlobalPatterns
8
+  tse <- agglomerateByPrevalence(tse, rank = "Phylum")
9
+  tse <- addNMF(tse, k = 2, seed = 123)
10
+  expect_named(reducedDims(tse),"NMF")
11
+  expect_true(is.matrix(reducedDim(tse,"NMF")))
12
+  expect_equal(dim(reducedDim(tse,"NMF")),c(26,2))
13
+  red <- reducedDim(tse,"NMF")
14
+  expect_equal(names(attributes(red)),
15
+               c("dim","dimnames","loadings", "NMF_output"))
16
+  expect_equal(dim(attr(red,"loadings")),c(35,2))
17
+  # Check if ordination matrix returned by NMF::nmf is the same as
18
+  # getNMF and addNMF ones
19
+  mat <- t(assay(tse, "counts"))
20
+  library("NMF")
21
+  nmf_model <- NMF::nmf(mat, rank = 2, seed = 123)
22
+  loadings <- t(nmf_model@fit@H)
23
+  # Compare NMF::nmf and addNMF
24
+  expect_equal(loadings, attr(red, "loadings"), tolerance = 10**-3)
25
+  scores2 <- getNMF(tse, k = 2, seed = 123)
26
+  # Compare NMF::nmf and getNMF
27
+  expect_equal(loadings, attr(scores2, "loadings"), tolerance = 10**-4)
28
+  # Test that additional parameters are passed
29
+  scores3 <- getNMF(tse, k = 2, nrun = 2)
30
+  library("NMF")
31
+  nmf_model <- NMF::nmf(mat, rank = 2, nrun = 2)
32
+  expect_equal(attr(scores3, "NMF_output")@nrun, nmf_model@nrun)
33
+  # ERRORs
34
+  expect_error(
35
+    addNMF(GlobalPatterns, k = "test", assay.type = "counts", name = "NMF")
36
+  )
37
+  expect_error(
38
+    addNMF(GlobalPatterns, k = 1.5, assay.type = "counts", name = "NMF")
39
+  )
40
+  expect_error(
41
+    addNMF(GlobalPatterns, k = TRUE, assay.type = "counts", name = "NMF")
42
+  )
43
+  expect_error(
44
+    addNMF(GlobalPatterns, k = 2, assay.type = "test", name = "NMF")
45
+  )
46
+  expect_error(
47
+    addNMF(GlobalPatterns, k = 2, assay.type = 1, name = "NMF")
48
+  )
49
+  expect_error(
50
+    addNMF(GlobalPatterns, k = 2, assay.type = TRUE, name = "NMF")
51
+  )
52
+  expect_error(
53
+    addNMF(GlobalPatterns, k = 2, assay.type = "counts", name = 1)
54
+  )
55
+  expect_error(
56
+    addNMF(GlobalPatterns, k = 2, assay.type = "counts", name = TRUE)
57
+  )
58
+})