Browse code

- Fixed bugs introduced during today's coding. - selectionMethod should be none and not NULL if to be skipped. Easier to use with naming than NULL. - elasticNetGLM made robust against choosing lambda values which cause zero features to be chosen. - elasticNetFeatures corrected to map indices back from one-hot encoding to original features.

Dario Strbenac authored on 15/02/2023 13:00:02
Showing 5 changed files

... ...
@@ -32,7 +32,7 @@
32 32
 #' Set to NULL or "all" if all features should be used.
33 33
 #' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
34 34
 #' and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
35
-#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no
35
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no
36 36
 #' indepedent feature selection will be performed (but implicit selection might still happen with the classifier).
37 37
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
38 38
 #' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
... ...
@@ -114,7 +114,7 @@ setMethod("crossValidate", "DataFrame",
114 114
 
115 115
           {
116 116
               # Check that data is in the right format, if not already done for MultiAssayExperiment input.
117
-              if(!"assay" %in% S4Vectors::mcols(measurements)) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. 
117
+              if(!"assay" %in% colnames(S4Vectors::mcols(measurements))) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. 
118 118
               {
119 119
                 prepParams <- list(measurements, outcome, clinicalPredictors)
120 120
                 if("prepare" %in% names(extraParams))
... ...
@@ -594,7 +594,6 @@ generateModellingParams <- function(assayIDs,
594 594
     }    
595 595
     
596 596
     selectionMethod <- unlist(selectionMethod)
597
-    if(is.null(selectionMethod)) selectionMethod <- "none"
598 597
 
599 598
     if(selectionMethod != "none")
600 599
     {
... ...
@@ -6,20 +6,26 @@ elasticNetGLMtrainInterface <- function(measurementsTrain, classesTrain, lambda
6 6
     stop("The package 'glmnet' could not be found. Please install it.")
7 7
   if(verbose == 3)
8 8
     message("Fitting elastic net regularised GLM classifier to data.")
9
-    
10
-  measurementsMatrix <- glmnet::makeX(as(measurementsTrain, "data.frame"))
11 9
 
12
-  fitted <- glmnet::glmnet(measurementsMatrix, classesTrain, family = "multinomial", ...)
10
+  # One-hot encoding needed.    
11
+  measurementsTrain <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain)
12
+  fitted <- glmnet::glmnet(measurementsTrain, classesTrain, family = "multinomial", ...)
13
+  
13 14
 
14 15
   if(is.null(lambda)) # fitted has numerous models for automatically chosen lambda values.
15
-  { # Pick one lambda based on resubstitution performance.
16
-    bestLambda <- fitted[["lambda"]][which.min(sapply(fitted[["lambda"]], function(lambda) # Largest Lambda with minimum balanced error rate.
16
+  { # Pick one lambda based on resubstitution performance. But not the one that makes all variables excluded from model.
17
+      lambdaConsider <- colSums(as.matrix(fitted[["beta"]][[1]])) != 0
18
+    bestLambda <- fitted[["lambda"]][lambdaConsider][which.min(sapply(fitted[["lambda"]][lambdaConsider], function(lambda) # Largest Lambda with minimum balanced error rate.
17 19
     {
18
-      classPredictions <- factor(as.character(predict(fitted, measurementsMatrix, s = lambda, type = "class")), levels = fitted[["classnames"]])
20
+      classPredictions <- factor(as.character(predict(fitted, measurementsTrain, s = lambda, type = "class")), levels = fitted[["classnames"]])
19 21
       calcExternalPerformance(classesTrain, classPredictions, "Balanced Error")
20 22
     }))[1]]
21 23
     attr(fitted, "tune") <- list(lambda = bestLambda)
22 24
   }
25
+  
26
+  attr(fitted, "featureNames") <- colnames(measurementsTrain)
27
+  attr(fitted, "featureGroups") <- measurementsTrain@assign
28
+  
23 29
   fitted
24 30
 }
25 31
 attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface"
... ...
@@ -28,6 +34,8 @@ attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface"
28 34
 elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., returnType = c("both", "class", "score"), verbose = 3)
29 35
 { # ... just consumes emitted tuning variables from .doTrain which are unused.
30 36
   returnType <- match.arg(returnType)
37
+  # One-hot encoding needed.    
38
+  measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest)
31 39
   
32 40
   # Ensure that testing data has same columns names in same order as training data.
33 41
   # Remove those annoying backquotes which glmnet adds if variables have spaces in names.
... ...
@@ -41,11 +49,11 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ...,
41 49
   if(missing(lambda)) # Tuning parameters are not passed to prediction functions.
42 50
     lambda <- attr(model, "tune")[["lambda"]] # Sneak it in as an attribute on the model.
43 51
 
44
-  testMatrix <- glmnet::makeX(as(measurementsTest, "data.frame"))
45
-  testMatrix <- testMatrix[, rownames(model[["beta"]][[1]])]
46 52
   
47
-  classPredictions <- factor(as.character(predict(model, testMatrix, s = lambda, type = "class")), levels = model[["classnames"]])
48
-  classScores <- predict(model, testMatrix, s = lambda, type = "response")[, , 1]
53
+  measurementsTest <- measurementsTest[, rownames(model[["beta"]][[1]])]
54
+  
55
+  classPredictions <- factor(as.character(predict(model, measurementsTest, s = lambda, type = "class")), levels = model[["classnames"]])
56
+  classScores <- predict(model, measurementsTest, s = lambda, type = "response")[, , 1]
49 57
   
50 58
   if(is.matrix(classScores))
51 59
     classScores <- classScores[, model[["classnames"]]]
... ...
@@ -61,16 +69,20 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ...,
61 69
 #
62 70
 # Get selected features (i.e. non-zero model coefficients)
63 71
 #
72
+# Note: Need to convert back to actual features when factors were expanded into
73
+# multiple columns using indicator variables.
74
+#
64 75
 ################################################################################
65 76
 
66 77
 elasticNetFeatures <- function(model)
67 78
                       {
68
-                        inputFeatures <- rownames(model[["beta"]][[1]])            
69 79
                         # Floating point numbers test for equality.
70 80
                         whichCoefficientColumn <- which(abs(model[["lambda"]] - attr(model, "tune")[["lambda"]]) < 0.00001)[1]
71 81
                         coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn])
72 82
                         featureScores <- rowSums(abs(coefficientsUsed))
73
-                        rankedFeaturesIndices <- order(featureScores, decreasing = TRUE)
74
-                        selectedFeaturesIndices <- which(featureScores != 0)
83
+                        featureGroups <- attr(model, "featureGroups")[match(names(featureScores), attr(model, "featureNames"))]
84
+                        groupScores <- unname(by(featureScores, featureGroups, max))
85
+                        rankedFeaturesIndices <- order(groupScores, decreasing = TRUE)
86
+                        selectedFeaturesIndices <- which(groupScores != 0)
75 87
                         list(rankedFeaturesIndices, selectedFeaturesIndices)
76 88
                       }
77 89
\ No newline at end of file
... ...
@@ -5,43 +5,28 @@ setGeneric("precisionPathwayTrain", function(measurements, class, ...)
5 5
 
6 6
 #' @rdname precisionPathwayTrain
7 7
 #' @export
8
-setMethod("precisionPathwayTrain", "MultiAssayExperiment", 
9
-          function(measurements, class, clinicalPredictors = NULL, ..., fixedAssays = "clinical",
10
-                   confidenceCutoff = 0.8, minAssaySamples = 10,
11
-                   nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))),
8
+setMethod("precisionPathwayTrain", "MultiAssayExperimentOrList", 
9
+          function(measurements, class, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL,
10
+                   fixedAssays = "clinical", confidenceCutoff = 0.8, minAssaySamples = 10,
11
+                   nFeatures = 20, selectionMethod = setNames(c("none", rep("t-test", length(measurements))), c("clinical", names(measurements))),
12 12
                    classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
13 13
                    nFolds = 5, nRepeats = 20, nCores = 1)
14 14
           {
15
-              prepArgs <- list(measurements, class, clinicalPredictors)              
16
-              extraInputs <- list(...)
17
-              prepExtras <- numeric()
18
-              if(length(extraInputs) > 0)
19
-                prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
20
-              if(length(prepExtras) > 0)
21
-                prepArgs <- append(prepArgs, extraInputs[prepExtras])
22
-              measurementsAndClass <- do.call(prepareData, prepArgs)
23
-              
24
-             .precisionPathwayTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]],
25
-                                    fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff,
26
-                                    minAssaySamples = minAssaySamples, nFeatures = nFeatures,
27
-                                    selectionMethod = selectionMethod, classifier = classifier,
28
-                                    nFolds = nFolds, nRepeats = nRepeats, nCores = nCores)
29
-          })
30
-
31
-#' @rdname precisionPathwayTrain
32
-#' @export
33
-setMethod("precisionPathwayTrain", "list", 
34
-          function(measurements, class, clinicalPredictors = NULL, fixedAssays = "clinical",
35
-                   confidenceCutoff = 0.8, minAssaySamples = 10,
36
-                   nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))),
37
-                   classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
38
-                   nFolds = 5, nRepeats = 20, nCores = 1, ...)
39
-          {
40
-            # One of the tables must be named "clinical".
41
-            if (!any(names(measurements) == "clinical"))
42
-              stop("One of the tables must be named \"clinical\".")
15
+            if(is.list(measurements)) # Ensure plain list has clinical data.
16
+            {
17
+              # One of the tables must be named "clinical".
18
+              if (!any(names(measurements) == "clinical"))
19
+                stop("One of the tables must be named \"clinical\".")
20
+            }
21
+            prepArgs <- list(measurements, outcomeColumns = class, clinicalPredictors = clinicalPredictors,
22
+                             maxMissingProp = maxMissingProp, topNvariance = topNvariance)
23
+            measurementsAndClass <- do.call(prepareData, prepArgs)
43 24
               
44
-            .precisionPathwayTrain(measurements = combined_df, class = class)
25
+            .precisionPathwayTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]],
26
+                                   fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff,
27
+                                   minAssaySamples = minAssaySamples, nFeatures = nFeatures,
28
+                                   selectionMethod = selectionMethod, classifier = classifier,
29
+                                   nFolds = nFolds, nRepeats = nRepeats, nCores = nCores)
45 30
           })
46 31
 
47 32
 # Internal method which carries out all of the processing, obtaining reformatted data from the
... ...
@@ -52,25 +37,13 @@ setMethod("precisionPathwayTrain", "list",
52 37
                    classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
53 38
                    nFolds = 5, nRepeats = 20, nCores = 1, ...)
54 39
           {
55
-            # Step 1: Separate the vector classes from the table of measurements, if not already separate.
56
-            prepArgs <- list(measurements, class, clinicalPredictors)              
57
-            extraInputs <- list(...)
58
-            prepExtras <- numeric()
59
-            if(length(extraInputs) > 0)
60
-              prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
61
-            if(length(prepExtras) > 0)
62
-              prepArgs <- append(prepArgs, extraInputs[prepExtras])
63
-            
64
-            measurementsAndClass <- do.call(prepareData, prepArgs)
65
-            measurements <- measurementsAndClass[["measurements"]]
66
-            class <- measurementsAndClass[["outcome"]]
67 40
             
68
-            # Step 2: Determine all valid permutations of assays, taking into account the
41
+            # Step 1: Determine all valid permutations of assays, taking into account the
69 42
             # assays to be used and which assays, if any, must be included.
70 43
             assayIDs <- unique(S4Vectors::mcols(measurements)[["assay"]])
71 44
             assaysPermutations <- .permutations(assayIDs, fixed = data.frame(seq_along(fixedAssays), fixedAssays))
72 45
             
73
-            # Step 3: Build a classifier for each assay using all of the samples.
46
+            # Step 2: Build a classifier for each assay using all of the samples.
74 47
             modelsList <- crossValidate(measurements, class, nFeatures, selectionMethod,
75 48
                                         classifier = classifier, nFolds = nFolds,
76 49
                                         nRepeats = nRepeats, nCores = nCores)
... ...
@@ -180,10 +180,10 @@ setMethod("prepareData", "MultiAssayExperiment",
180 180
   if(is.null(clinicalPredictors))
181 181
     stop("'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.")      
182 182
 
183
-  if(any(anyReplicated(measurements[, , omicsTargets])))
183
+  if(any(anyReplicated(measurements)))
184 184
     stop("Data set contains replicates. Please remove or average replicate observations and try again.")
185 185
  
186
-  if(is.null(outcomeColums))
186
+  if(is.null(outcomeColumns))
187 187
     stop("'outcomeColumns' is a mandatory parameter but was not specified.")
188 188
       
189 189
   if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements))))
... ...
@@ -195,6 +195,7 @@ setMethod("prepareData", "MultiAssayExperiment",
195 195
   dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalPredictors, outcomeColumns))
196 196
   rownames(dataTable) <- dataTable[, "primary"]
197 197
   S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
198
+  dataTable <- dataTable[, -match("primary", colnames(dataTable))]
198 199
   colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
199 200
             
200 201
   # Sample information variable names not included in column metadata of wide table but only as row names of it.
... ...
@@ -129,7 +129,7 @@ Set to NULL or "all" if all features should be used.}
129 129
 
130 130
 \item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
131 131
 and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
132
-and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no
132
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no
133 133
 indepedent feature selection will be performed (but implicit selection might still happen with the classifier).}
134 134
 
135 135
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}