Browse code

- XGBoost scores columns have class labels now. - nFeatures cleaning now only done once in crossValidate and not unnecessarily doubled in CV (it changes vector into list).

Dario Strbenac authored on 19/10/2022 01:30:03
Showing 2 changed files

... ...
@@ -124,16 +124,8 @@ setMethod("crossValidate", "DataFrame",
124 124
                                           measurements = measurements)
125 125
               selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod,
126 126
                                                       measurements = measurements)
127
-              if(any(nFeatures == 1) && classifier == "elasticNetGLM")
128
-              {
129
-                  options(warn = 1)
130
-                  warning("Elastic Net GLM requires two or more features as input but there is only one.
131
-Using an ordinary GLM instead.")
132
-                  classifier <- "GLM"
133
-              }
134
-              
135 127
               classifier <- cleanClassifier(classifier = classifier,
136
-                                            measurements = measurements)
128
+                                            measurements = measurements, nFeatures = nFeatures)
137 129
               
138 130
               ##!!!!! Do something with data combinations
139 131
 
... ...
@@ -236,7 +228,7 @@ Using an ordinary GLM instead.")
236 228
                       if(length(assayCombinations) == 0) stop("No assayCombinations with \"clinical\" data")
237 229
                   }
238 230
 
239
-                  result <- sapply(assayCombinations, function(assayIndex){
231
+                  result <- sapply(assayCombinations[2], function(assayIndex){
240 232
                       CV(measurements = measurements[, mcols(measurements)[["assay"]] %in% assayIndex],
241 233
                          outcome = outcome, assayIDs = assayIndex,
242 234
                          nFeatures = nFeatures[assayIndex],
... ...
@@ -471,9 +463,9 @@ cleanNFeatures <- function(nFeatures, measurements){
471 463
       obsFeatures <- unlist(as.list(table(mcols(measurements)[, "assay"])))
472 464
     else obsFeatures <- ncol(measurements)
473 465
     if(is.null(nFeatures) || length(nFeatures) == 1 && nFeatures == "all") nFeatures <- as.list(obsFeatures)
474
-    if(is.null(names(nFeatures)) & length(nFeatures) == 1) nFeatures <- as.list(pmin(obsFeatures, nFeatures))
475
-    if(is.null(names(nFeatures)) & length(nFeatures) > 1) nFeatures <- sapply(obsFeatures, function(x)pmin(x, nFeatures), simplify = FALSE)
476
-    #if(is.null(names(nFeatures)) & length(nFeatures) > 1) stop("nFeatures needs to be a named numeric vector or list with the same names as the assays.")
466
+    if(is.null(names(nFeatures)) && length(nFeatures) == 1) nFeatures <- as.list(pmin(obsFeatures, nFeatures))
467
+    if(is.null(names(nFeatures)) && length(nFeatures) > 1) nFeatures <- sapply(obsFeatures, function(x)pmin(x, nFeatures), simplify = FALSE)
468
+    #if(is.null(names(nFeatures)) && length(nFeatures) > 1) stop("nFeatures needs to be a named numeric vector or list with the same names as the assays.")
477 469
     if(!is.null(names(obsFeatures)) && !all(names(obsFeatures) %in% names(nFeatures))) stop("nFeatures needs to be a named numeric vector or list with the same names as the assays.")
478 470
     if(!is.null(names(obsFeatures)) && all(names(obsFeatures) %in% names(nFeatures)) & is(nFeatures, "numeric")) nFeatures <- as.list(pmin(obsFeatures, nFeatures[names(obsFeatures)]))
479 471
     if(!is.null(names(obsFeatures)) && all(names(obsFeatures) %in% names(nFeatures)) & is(nFeatures, "list")) nFeatures <- mapply(pmin, nFeatures[names(obsFeatures)], obsFeatures, SIMPLIFY = FALSE)
... ...
@@ -498,7 +490,7 @@ cleanSelectionMethod <- function(selectionMethod, measurements){
498 490
 
499 491
 ######################################
500 492
 ######################################
501
-cleanClassifier <- function(classifier, measurements){
493
+cleanClassifier <- function(classifier, measurements, nFeatures){
502 494
     #### Clean up
503 495
     if(!is.null(mcols(measurements)$assay))
504 496
       obsFeatures <- unlist(as.list(table(mcols(measurements)[, "assay"])))
... ...
@@ -509,10 +501,20 @@ cleanClassifier <- function(classifier, measurements){
509 501
     #if(is.null(names(classifier)) & length(classifier) > 1) stop("classifier needs to be a named character vector or list with the same names as the assays.")
510 502
     if(!is.null(names(obsFeatures)) && !all(names(obsFeatures) %in% names(classifier))) stop("classifier needs to be a named character vector or list with the same names as the assays.")
511 503
     if(!is.null(names(obsFeatures)) && all(names(obsFeatures) %in% names(classifier)) & is(classifier, "character")) classifier <- as.list(classifier[names(obsFeatures)])
504
+    
505
+    nFeatures <- nFeatures[names(classifier)]
506
+    checkENs <- which(classifier == "elasticNetGLM")
507
+    if(length(checkENs) > 0)
508
+    {
509
+      replacements <- sapply(checkENs, function(checkEN) ifelse(any(nFeatures[[checkEN]] == 1), "GLM", "elasticNetGLM"))
510
+      classifier[checkENs] <- replacements
511
+      if(any(replacements) == "GLM")    
512
+        warning("Elastic Net GLM requires two or more features as input but there is only one.
513
+Using an ordinary GLM instead.")
514
+    }
512 515
     classifier
513 516
 }
514 517
 
515
-
516 518
 ######################################
517 519
 ######################################
518 520
 #' A function to generate a CrossValParams object
... ...
@@ -808,14 +810,6 @@ CV <- function(measurements = NULL,
808 810
                characteristicsLabel = NULL)
809 811
 
810 812
 {
811
-    # Check that other variables are in the right format and fix
812
-    nFeatures <- cleanNFeatures(nFeatures = nFeatures,
813
-                                measurements = measurements)
814
-    selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod,
815
-                                            measurements = measurements)
816
-    classifier <- cleanClassifier(classifier = classifier,
817
-                                  measurements = measurements)
818
-
819 813
     # Which data-types or data-views are present?
820 814
     if(is.null(characteristicsLabel)) characteristicsLabel <- "none"
821 815
 
... ...
@@ -841,7 +835,7 @@ CV <- function(measurements = NULL,
841 835
     characteristics <- S4Vectors::DataFrame(characteristic = c(if(!is.null(assayText)) "Assay Name" else NULL, "Classifier Name", "Selection Name", "multiViewMethod", "characteristicsLabel"), value = c(if(!is.null(assayText)) paste(assayText, collapse = ", ") else NULL, paste(classifier, collapse = ", "),  paste(selectionMethod, collapse = ", "), multiViewMethod, characteristicsLabel))
842 836
 
843 837
     if(!is.null(measurements))
844
-    { # Cross-validation.        
838
+    { # Cross-validation.
845 839
       classifyResults <- runTests(measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics)
846 840
       fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
847 841
     } else { # Independent training and testing.
... ...
@@ -49,6 +49,7 @@ extremeGradientBoostingPredictInterface <- function(booster, measurementsTest, .
49 49
   # Convert to one-hot encoding as xgboost doesn't understand factors. Need to get rid of intercept afterwards.
50 50
   measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest, sparse = TRUE)
51 51
   scores <- predict(booster, measurementsTest, reshape = TRUE)
52
+  colnames(scores) <- attr(booster, "classes")
52 53
   if(!is.null(attr(booster, "classes"))) # It is a classification task.
53 54
   {
54 55
     classPredictions <- attr(booster, "classes")[apply(scores, 1, function(sampleRow) which.max(sampleRow)[1])]