
Merge pull request #46 from SydneyBioX/master

More Fixes for Prevalidation

Dario Strbenac authored on 20/10/2022 10:20:24 • GitHub committed on 20/10/2022 10:20:24
Showing 9 changed files

... ...
@@ -3,8 +3,8 @@ Type: Package
 Title: A framework for cross-validated classification problems, with
        applications to differential variability and differential
        distribution testing
-Version: 3.1.25
-Date: 2022-10-18
+Version: 3.1.26
+Date: 2022-10-19
 Author: Dario Strbenac, Ellis Patrick, John Ormerod, Graham Mann, Jean Yang
 Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au>
 VignetteBuilder: knitr
... ...
@@ -124,16 +124,8 @@ setMethod("crossValidate", "DataFrame",
                                           measurements = measurements)
               selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod,
                                                       measurements = measurements)
-              if(any(nFeatures == 1) && classifier == "elasticNetGLM")
-              {
-                  options(warn = 1)
-                  warning("Elastic Net GLM requires two or more features as input but there is only one.
-Using an ordinary GLM instead.")
-                  classifier <- "GLM"
-              }
-              
               classifier <- cleanClassifier(classifier = classifier,
-                                            measurements = measurements)
+                                            measurements = measurements, nFeatures = nFeatures)
               
               ##!!!!! Do something with data combinations
 
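The single-feature elastic net fallback removed above used to run once on the top-level `classifier` argument, which breaks when `classifier` is a named per-assay vector; moving it into `cleanClassifier` lets the check see each assay's feature count. A minimal sketch of the problem with the old scalar comparison (assay names and values are illustrative):

```r
# With multi-assay input, classifier is named per assay, so a single
# equality test yields a logical vector rather than one TRUE/FALSE.
classifier <- c(clinical = "elasticNetGLM", RNA = "randomForest")
classifier == "elasticNetGLM"
# clinical      RNA
#     TRUE    FALSE   <- not usable directly as an if() condition
```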
... ...
@@ -201,7 +193,7 @@ Using an ordinary GLM instead.")
                   # The below loops over different combinations of assays and merges them together.
                   # This allows someone to answer which combinations of the assays might be most useful.
 
-                  if(!is.list(assayCombinations) && assayCombinations == "all") assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
+                  if(!is.list(assayCombinations) && assayCombinations[1] == "all") assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
 
                   result <- sapply(assayCombinations, function(assayIndex){
                       CV(measurements = measurements[, mcols(measurements)[["assay"]] %in% assayIndex],
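The `[1]` guards against `assayCombinations` arriving as a character vector of assay names: comparing the whole vector with `==` yields a multi-element logical, which `if()` rejects. A short sketch (vector contents are illustrative):

```r
assayCombinations <- c("clinical", "RNA")   # user-supplied vector
# if(assayCombinations == "all") ...        # error in R >= 4.2 (warning
                                            # before): condition has length > 1
if(assayCombinations[1] == "all")           # scalar comparison, safe in if();
  message("expand to all combinations")     # here it is FALSE, so no message
```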
... ...
@@ -229,7 +221,7 @@ Using an ordinary GLM instead.")
                   # This allows someone to answer which combinations of the assays might be most useful.
 
 
-                  if(!is.list(assayCombinations) && assayCombinations == "all")
+                  if(!is.list(assayCombinations) && assayCombinations[1] == "all")
                   {
                       assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
                       assayCombinations <- assayCombinations[sapply(assayCombinations, function(combination) "clinical" %in% combination, simplify = TRUE)]
... ...
@@ -263,7 +255,7 @@ Using an ordinary GLM instead.")
                   # This allows someone to answer which combinations of the assays might be most useful.
 
 
-                  if(!is.list(assayCombinations) && assayCombinations == "all"){
+                  if(!is.list(assayCombinations) && assayCombinations[1] == "all"){
                       assayCombinations <- do.call("c", sapply(seq_along(assayIDs),function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
                       assayCombinations <- assayCombinations[sapply(assayCombinations, function(combination) "clinical" %in% combination, simplify = TRUE)]
                       if(length(assayCombinations) == 0) stop("No assayCombinations with \"clinical\" data")
... ...
@@ -471,9 +463,9 @@ cleanNFeatures <- function(nFeatures, measurements){
       obsFeatures <- unlist(as.list(table(mcols(measurements)[, "assay"])))
     else obsFeatures <- ncol(measurements)
     if(is.null(nFeatures) || length(nFeatures) == 1 && nFeatures == "all") nFeatures <- as.list(obsFeatures)
-    if(is.null(names(nFeatures)) & length(nFeatures) == 1) nFeatures <- as.list(pmin(obsFeatures, nFeatures))
-    if(is.null(names(nFeatures)) & length(nFeatures) > 1) nFeatures <- sapply(obsFeatures, function(x)pmin(x, nFeatures), simplify = FALSE)
-    #if(is.null(names(nFeatures)) & length(nFeatures) > 1) stop("nFeatures needs to be a named numeric vector or list with the same names as the assays.")
+    if(is.null(names(nFeatures)) && length(nFeatures) == 1) nFeatures <- as.list(pmin(obsFeatures, nFeatures))
+    if(is.null(names(nFeatures)) && length(nFeatures) > 1) nFeatures <- sapply(obsFeatures, function(x)pmin(x, nFeatures), simplify = FALSE)
+    #if(is.null(names(nFeatures)) && length(nFeatures) > 1) stop("nFeatures needs to be a named numeric vector or list with the same names as the assays.")
     if(!is.null(names(obsFeatures)) && !all(names(obsFeatures) %in% names(nFeatures))) stop("nFeatures needs to be a named numeric vector or list with the same names as the assays.")
     if(!is.null(names(obsFeatures)) && all(names(obsFeatures) %in% names(nFeatures)) & is(nFeatures, "numeric")) nFeatures <- as.list(pmin(obsFeatures, nFeatures[names(obsFeatures)]))
     if(!is.null(names(obsFeatures)) && all(names(obsFeatures) %in% names(nFeatures)) & is(nFeatures, "list")) nFeatures <- mapply(pmin, nFeatures[names(obsFeatures)], obsFeatures, SIMPLIFY = FALSE)
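The `&` to `&&` changes here are more than style: `&&` compares single values and short-circuits, which is what an `if()` condition needs, whereas `&` is vectorised and can return a multi-element logical. A small illustration:

```r
flags <- c(TRUE, FALSE)
TRUE & flags      # vectorised: returns c(TRUE, FALSE)
TRUE && flags[1]  # scalar: returns TRUE, safe inside if()
# Since R 4.3, `&&` with a length > 1 operand is itself an error,
# so the scalar intent is now enforced by the language.
```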
... ...
@@ -498,7 +490,7 @@ cleanSelectionMethod <- function(selectionMethod, measurements){
 
 ######################################
 ######################################
-cleanClassifier <- function(classifier, measurements){
+cleanClassifier <- function(classifier, measurements, nFeatures){
     #### Clean up
     if(!is.null(mcols(measurements)$assay))
       obsFeatures <- unlist(as.list(table(mcols(measurements)[, "assay"])))
... ...
@@ -509,10 +501,20 @@ cleanClassifier <- function(classifier, measurements){
     #if(is.null(names(classifier)) & length(classifier) > 1) stop("classifier needs to be a named character vector or list with the same names as the assays.")
     if(!is.null(names(obsFeatures)) && !all(names(obsFeatures) %in% names(classifier))) stop("classifier needs to be a named character vector or list with the same names as the assays.")
     if(!is.null(names(obsFeatures)) && all(names(obsFeatures) %in% names(classifier)) & is(classifier, "character")) classifier <- as.list(classifier[names(obsFeatures)])
+    
+    nFeatures <- nFeatures[names(classifier)]
+    checkENs <- which(classifier == "elasticNetGLM")
+    if(length(checkENs) > 0)
+    {
+      replacements <- sapply(checkENs, function(checkEN) ifelse(any(nFeatures[[checkEN]] == 1), "GLM", "elasticNetGLM"))
+      classifier[checkENs] <- replacements
+      if(any(replacements == "GLM"))
+        warning("Elastic Net GLM requires two or more features as input but there is only one.
+Using an ordinary GLM instead.")
+    }
     classifier
 }
 
-
 ######################################
 ######################################
 #' A function to generate a CrossValParams object
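A minimal standalone sketch of the new per-assay fallback (assay names and feature counts are illustrative):

```r
classifier <- list(clinical = "elasticNetGLM", RNA = "elasticNetGLM")
nFeatures  <- list(clinical = 1, RNA = c(10, 20))
# Find the assays using elastic net and downgrade any with a single feature.
checkENs <- which(classifier == "elasticNetGLM")
replacements <- sapply(checkENs, function(i)
  ifelse(any(nFeatures[[i]] == 1), "GLM", "elasticNetGLM"))
classifier[checkENs] <- replacements
# clinical is downgraded to "GLM"; RNA keeps "elasticNetGLM".
```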
... ...
@@ -808,14 +810,6 @@ CV <- function(measurements = NULL,
                characteristicsLabel = NULL)
 
 {
-    # Check that other variables are in the right format and fix
-    nFeatures <- cleanNFeatures(nFeatures = nFeatures,
-                                measurements = measurements)
-    selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod,
-                                            measurements = measurements)
-    classifier <- cleanClassifier(classifier = classifier,
-                                  measurements = measurements)
-
     # Which data-types or data-views are present?
     if(is.null(characteristicsLabel)) characteristicsLabel <- "none"
 
... ...
@@ -841,7 +835,7 @@ CV <- function(measurements = NULL,
     characteristics <- S4Vectors::DataFrame(characteristic = c(if(!is.null(assayText)) "Assay Name" else NULL, "Classifier Name", "Selection Name", "multiViewMethod", "characteristicsLabel"), value = c(if(!is.null(assayText)) paste(assayText, collapse = ", ") else NULL, paste(classifier, collapse = ", "),  paste(selectionMethod, collapse = ", "), multiViewMethod, characteristicsLabel))
 
     if(!is.null(measurements))
-    { # Cross-validation.        
+    { # Cross-validation.
       classifyResults <- runTests(measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics)
       fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
     } else { # Independent training and testing.
... ...
@@ -14,6 +14,7 @@ classifyInterface <- function(countsTrain, classesTrain, countsTest, ...,
   classPredictions <- predicted[["ytehat"]]
   classScores <- predicted[["discriminant"]]
   colnames(classScores) <- levels(classesTrain)
+  rownames(classScores) <- names(classPredictions) <- rownames(countsTest)
   switch(returnType, class = classPredictions, # Factor vector.
          score = classScores, # Numeric matrix.
          both = data.frame(class = classPredictions, classScores, check.names = FALSE))
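Several predict wrappers in this commit gain the same one-liner, presumably so that prevalidation can join per-assay predictions back to the right samples, which only works if scores and class calls carry sample identifiers. A small sketch of the idea (sample and class names are illustrative):

```r
classScores <- matrix(c(0.8, 0.2, 0.3, 0.7), nrow = 2, byrow = TRUE,
                      dimnames = list(NULL, c("healthy", "disease")))
classPredictions <- factor(c("healthy", "disease"))
testIDs <- c("patient4", "patient9")
rownames(classScores) <- names(classPredictions) <- testIDs
classScores["patient9", ]   # scores are now addressable by sample ID
```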
... ...
@@ -39,6 +39,6 @@ coxnetPredictInterface <- function(model, measurementsTest, survivalTest = NULL,
   model$offset <- TRUE
   
   survScores <- predict(model, testMatrix, s = lambda, type = "response", newoffset = offset)
-  
+  rownames(survScores) <- rownames(measurementsTest)
   survScores[, 1]
 }
\ No newline at end of file
... ...
@@ -31,15 +31,6 @@ extractPrevalidation = function(assayPreval){ #}, startingCol) {
     vec
 }
 
-# Use to pull out the names of features in a ClassifyR model - XYZ: Could Ditch if we really wanted to
-featurePuller = function(classifyObject) {
-    if ("selectResult" %in% slotNames(classifyObject)) {
-        features = classifyObject@selectResult@chosenFeatures
-    } else{
-        features = classifyObject@originalFeatures
-    }
-}
-
 setClass("prevalModel", slots = list(fullModel = "list"))
 
 prevalTrainInterface <- function(measurements, classes, params, ...)
... ...
@@ -23,6 +23,7 @@ randomForestPredictInterface <- function(forest, measurementsTest, ..., returnTy
   classPredictions <- predict(forest, measurementsTest)$predictions
   classScores <- predict(forest, measurementsTest, predict.all = TRUE)[[1]]
   classScores <- t(apply(classScores, 1, function(sampleRow) table(factor(classes[sampleRow], levels = classes)) / forest$forest$num.trees))
+  rownames(classScores) <- names(classPredictions) <- rownames(measurementsTest)
   switch(returnType, class = classPredictions,
          score = classScores,
          both = data.frame(class = classPredictions, classScores, check.names = FALSE))
... ...
@@ -38,7 +38,7 @@ SVMpredictInterface <- function(model, measurementsTest, returnType = c("both",
   # e1071 uses attributes to pass back probabilities. Make them a standalone variable.
   classScores <- attr(classPredictions, "probabilities")[, model[["levels"]], drop = FALSE]
   attr(classPredictions, "probabilities") <- NULL
-  
+  rownames(classScores) <- names(classPredictions) <- rownames(measurementsTest)
   switch(returnType, class = classPredictions, score = classScores,
          both = data.frame(class = classPredictions, classScores, check.names = FALSE))
 }
\ No newline at end of file
... ...
@@ -46,13 +46,17 @@ extremeGradientBoostingPredictInterface <- function(booster, measurementsTest, .
   if(verbose == 3)
     message("Predicting using boosted random forest.")
   measurementsTest <- as(measurementsTest, "data.frame")
+  measurementsTest <- measurementsTest[, attr(booster, "featureNames")]
   # Convert to one-hot encoding as xgboost doesn't understand factors. Need to get rid of intercept afterwards.
   measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest, sparse = TRUE)
+  
   scores <- predict(booster, measurementsTest, reshape = TRUE)
+  colnames(scores) <- attr(booster, "classes")
   if(!is.null(attr(booster, "classes"))) # It is a classification task.
   {
     classPredictions <- attr(booster, "classes")[apply(scores, 1, function(sampleRow) which.max(sampleRow)[1])]
     classPredictions <- factor(classPredictions, levels = attr(booster, "classes"))
+    rownames(scores) <- names(classPredictions) <- rownames(measurementsTest)
     result <- switch(returnType, class = classPredictions,
                      score = scores,
                      both = data.frame(class = classPredictions, scores, check.names = FALSE))
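The added `featureNames` subsetting matters because xgboost matches features by column position rather than by name; if the test data's columns arrive in a different order than at training, predictions would silently use the wrong features. An illustrative sketch of the alignment step (feature names are made up):

```r
trainOrder <- c("gene2", "gene1", "age")   # column order saved at training time
testData <- data.frame(age = c(60, 71),
                       gene1 = c(1.2, 0.4),
                       gene2 = c(3.3, 2.8))
testData <- testData[, trainOrder]   # reorder columns to the training order
```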
... ...
@@ -188,7 +188,7 @@ Function(s) | Description | DM | DV | DD
 <span style="font-family: 'Courier New', monospace;">fisherDiscriminant</span> | Implementation of Fisher's LDA for departures from normality | ✔ | ✔* |
 <span style="font-family: 'Courier New', monospace;">mixModelsTrain</span>, <span style="font-family: 'Courier New', monospace;">mixModelsPredict</span> | Feature-wise mixtures of normals and voting | ✔ | ✔ | ✔
 <span style="font-family: 'Courier New', monospace;">naiveBayesKernel</span> | Feature-wise kernel density estimation and voting | ✔ | ✔ | ✔
-<span style="font-family: 'Courier New', monospace;">randomForestTrainInterface</span>, <span style="font-family: 'Courier New', monospace;">randomForestPredictInterface</span> | Wrapper for randomForest's functions <span style="font-family: 'Courier New', monospace;">randomForest</span> and <span style="font-family: 'Courier New', monospace;">predict</span> | ✔ | ✔ | ✔
+<span style="font-family: 'Courier New', monospace;">randomForestTrainInterface</span>, <span style="font-family: 'Courier New', monospace;">randomForestPredictInterface</span> | Wrapper for ranger's functions <span style="font-family: 'Courier New', monospace;">ranger</span> and <span style="font-family: 'Courier New', monospace;">predict</span> | ✔ | ✔ | ✔
 <span style="font-family: 'Courier New', monospace;">extremeGradientBoostingTrainInterface</span>, <span style="font-family: 'Courier New', monospace;">extremeGradientBoostingPredictInterface</span> | Wrapper for xgboost's functions <span style="font-family: 'Courier New', monospace;">xgboost</span> and <span style="font-family: 'Courier New', monospace;">predict</span> | ✔ | ✔ | ✔
 <span style="font-family: 'Courier New', monospace;">kNNinterface</span> | Wrapper for class's function <span style="font-family: 'Courier New', monospace;">knn</span> | ✔ | ✔ | ✔
 <span style="font-family: 'Courier New', monospace;">SVMtrainInterface</span>, <span style="font-family: 'Courier New', monospace;">SVMpredictInterface</span> | Wrapper for e1071's functions <span style="font-family: 'Courier New', monospace;">svm</span> and <span style="font-family: 'Courier New', monospace;">predict.svm</span> | ✔ | ✔ †| ✔ †