Browse code

- selectionMethod and classifer now automatically chosen. Were already documented to be, but overlooked for code implementation.

Dario Strbenac authored on 08/12/2022 09:44:46
Showing 5 changed files

... ...
@@ -3,8 +3,8 @@ Type: Package
3 3
 Title: A framework for cross-validated classification problems, with
4 4
        applications to differential variability and differential
5 5
        distribution testing
6
-Version: 3.2.5
7
-Date: 2022-11-28
6
+Version: 3.2.6
7
+Date: 2022-12-08
8 8
 Author: Dario Strbenac, Ellis Patrick, Sourish Iyengar, Harry Robertson, Andy Tran, John Ormerod, Graham Mann, Jean Yang
9 9
 Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au>
10 10
 VignetteBuilder: knitr
... ...
@@ -22,8 +22,8 @@
22 22
 #' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, 
23 23
 #' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
24 24
 #' Set to NULL or "all" if all features should be used.
25
-#' @param selectionMethod Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
26
-#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
25
+#' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
26
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
27 27
 #' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.
28 28
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
29 29
 #' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
... ...
@@ -91,10 +91,10 @@ setMethod("crossValidate", "DataFrame",
91 91
           function(measurements,
92 92
                    outcome,
93 93
                    nFeatures = 20,
94
-                   selectionMethod = "t-test",
94
+                   selectionMethod = "auto",
95 95
                    selectionOptimisation = "Resubstitution",
96 96
                    performanceType = "auto",
97
-                   classifier = "randomForest",
97
+                   classifier = "auto",
98 98
                    multiViewMethod = "none",
99 99
                    assayCombinations = "all",
100 100
                    nFolds = 5,
... ...
@@ -297,10 +297,10 @@ setMethod("crossValidate", "MultiAssayExperiment",
297 297
           function(measurements,
298 298
                    outcome, 
299 299
                    nFeatures = 20,
300
-                   selectionMethod = "t-test",
300
+                   selectionMethod = "auto",
301 301
                    selectionOptimisation = "Resubstitution",
302 302
                    performanceType = "auto",
303
-                   classifier = "randomForest",
303
+                   classifier = "auto",
304 304
                    multiViewMethod = "none",
305 305
                    assayCombinations = "all",
306 306
                    nFolds = 5,
... ...
@@ -331,10 +331,10 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
331 331
           function(measurements,
332 332
                    outcome, 
333 333
                    nFeatures = 20,
334
-                   selectionMethod = "t-test",
334
+                   selectionMethod = "auto",
335 335
                    selectionOptimisation = "Resubstitution",
336 336
                    performanceType = "auto",
337
-                   classifier = "randomForest",
337
+                   classifier = "auto",
338 338
                    multiViewMethod = "none",
339 339
                    assayCombinations = "all",
340 340
                    nFolds = 5,
... ...
@@ -364,10 +364,10 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
364 364
           function(measurements,
365 365
                    outcome,
366 366
                    nFeatures = 20,
367
-                   selectionMethod = "t-test",
367
+                   selectionMethod = "auto",
368 368
                    selectionOptimisation = "Resubstitution",
369 369
                    performanceType = "auto",
370
-                   classifier = "randomForest",
370
+                   classifier = "auto",
371 371
                    multiViewMethod = "none",
372 372
                    assayCombinations = "all",
373 373
                    nFolds = 5,
... ...
@@ -399,10 +399,10 @@ setMethod("crossValidate", "list",
399 399
           function(measurements,
400 400
                    outcome, 
401 401
                    nFeatures = 20,
402
-                   selectionMethod = "t-test",
402
+                   selectionMethod = "auto",
403 403
                    selectionOptimisation = "Resubstitution",
404 404
                    performanceType = "auto",
405
-                   classifier = "randomForest",
405
+                   classifier = "auto",
406 406
                    multiViewMethod = "none",
407 407
                    assayCombinations = "all",
408 408
                    nFolds = 5,
... ...
@@ -764,19 +764,18 @@ generateMultiviewParams <- function(assayIDs,
764 764
 }
765 765
 
766 766
 # measurements, outcome are mutually exclusive with x, outcomeTrain, measurementsTest, outcomeTest.
767
-CV <- function(measurements = NULL,
768
-               outcome = NULL, x = NULL, outcomeTrain = NULL, measurementsTest = NULL, outcomeTest = NULL,
767
+CV <- function(measurements, outcome, x, outcomeTrain, measurementsTest, outcomeTest,
769 768
                assayIDs,
770
-               nFeatures = NULL,
771
-               selectionMethod = "t-test",
772
-               selectionOptimisation = "Resubstitution",
769
+               nFeatures,
770
+               selectionMethod,
771
+               selectionOptimisation,
773 772
                performanceType,
774
-               classifier = "elasticNetGLM",
775
-               multiViewMethod = "none",
776
-               nFolds = 5,
777
-               nRepeats = 100,
778
-               nCores = 1,
779
-               characteristicsLabel = NULL)
773
+               classifier,
774
+               multiViewMethod,
775
+               nFolds,
776
+               nRepeats,
777
+               nCores,
778
+               characteristicsLabel)
780 779
 
781 780
 {
782 781
     # Which data-types or data-views are present?
... ...
@@ -848,7 +847,7 @@ train.data.frame <- function(x, outcomeTrain, ...)
848 847
 #' @param performanceType Performance metric to optimise if classifier has any tuning parameters.
849 848
 #' @method train DataFrame
850 849
 #' @export
851
-train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", performanceType = "auto",
850
+train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures = 20, classifier = "auto", performanceType = "auto",
852 851
                             multiViewMethod = "none", assayIDs = "all", ...) # ... for prepareData.
853 852
                    {
854 853
               prepArgs <- list(x, outcomeTrain)
... ...
@@ -863,18 +862,20 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
863 862
               # Ensure performance type is one of the ones that can be calculated by the package.
864 863
               if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
865 864
                 stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType))
866
-              
865
+
866
+              isCategorical <- is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(measurements)) || is.factor(outcomeTrain)
867 867
               if(performanceType == "auto")
868
-              {
869
-                if(is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(x)) || is.factor(outcomeTrain))
870
-                  performanceType <- "Balanced Accuracy"
871
-                else performanceType <- "C-index"
872
-              }
868
+                if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index"
869
+              if(length(selectionMethod) == 1 && selectionMethod == "auto")
870
+                if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH"
871
+              if(length(classifier) == 1 && classifier == "auto")
872
+                if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH"
873 873
               
874 874
               measurements <- measurementsAndOutcome[["measurements"]]
875 875
               outcomeTrain <- measurementsAndOutcome[["outcome"]]
876 876
               
877 877
               classifier <- cleanClassifier(classifier = classifier, measurements = measurements)
878
+              selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod, measurements = measurements)
878 879
               if(assayIDs == "all") assayIDs <- unique(S4Vectors::mcols(measurements)[, "assay"])
879 880
               if(is.null(assayIDs)) assayIDs <- 1
880 881
               names(assayIDs) <- assayIDs
... ...
@@ -886,30 +887,52 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
886 887
                           # Loop over assays
887 888
                           sapply(classifier[[assayIndex]], function(classifierForAssay) {
888 889
                               # Loop over classifiers
890
+                                sapply(selectionMethod[[assayIndex]], function(selectionForAssay) {
891
+                                  # Loop over selectors
889 892
                               
890 893
                                   measurementsUse <- measurements
891 894
                                   if(assayIndex != 1) measurementsUse <- measurements[, S4Vectors::mcols(measurements)[, "assay"] == assayIndex, drop = FALSE]
892 895
                                   
896
+                                  modellingParams <- generateModellingParams(assayIDs = assayIDs, measurements = measurements, nFeatures = nFeatures,
897
+                                                     selectionMethod = selectionMethod, selectionOptimisation = "Resubstitution", performanceType = performanceType,
898
+                                                     classifier = classifier, multiViewMethod = "none")
899
+                                  topFeatures <- .doSelection(measurementsUse, outcomeTrain, CrossValParams(), modellingParams, verbose = 0)
900
+                                  selectedFeaturesIndices <- topFeatures[[2]] # Extract for subsetting.
901
+                                  tuneDetailsSelect <- topFeatures[[3]]
902
+                                  measurementsUse <- measurementsUse[, selectedFeaturesIndices]
903
+
893 904
                                   classifierParams <- .classifierKeywordToParams(classifierForAssay)
894
-                                  if(!is.null(classifierParams$trainParams@tuneParams))
895
-                                    classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
896 905
                                   modellingParams <- ModellingParams(balancing = "none", selectParams = NULL,
897
-                                                               trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
906
+                                                                     trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
907
+                                  if(!is.null(tuneDetailsSelect))
908
+                                  {
909
+                                    tuneDetailsSelectUse <- tuneDetailsSelect[["tuneCombinations"]][tuneDetailsSelect[["bestIndex"]], , drop = FALSE]
910
+                                    avoidTune <- match(colnames(tuneDetailsSelectUse), names(modellingParams@trainParams@tuneParams))
911
+                                    if(any(!is.na(avoidTune)))
912
+                                    {
913
+                                      modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, tuneDetailsSelectUse[!is.na(avoidTune)])
914
+                                      modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-na.omit(avoidTune)]
915
+                                      if(length(modellingParams@trainParams@tuneParams) == 0) modellingParams@trainParams@tuneParams <- NULL
916
+                                    }
917
+                                  }
918
+                                  if(!is.null(modellingParams@trainParams@tuneParams))
919
+                                    modellingParams$trainParams@tuneParams <- c(modellingParams$trainParams@tuneParams, performanceType = performanceType)
898 920
                                   
899
-                                  .doTrain(measurementsUse, outcomeTrain, NULL, NULL, CrossValParams(), modellingParams, verbose = 0)[["model"]]
921
+                                  trained <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, CrossValParams(), modellingParams, verbose = 0)[["model"]]
922
+                                  attr(trained, "predictFunction") <- classifierParams$predictParams@predictor
923
+                                  trained
900 924
                                   ## train model
901
-                          },
902
-                          simplify = FALSE)
903
-                      },
904
-                      simplify = FALSE)
925
+                                }, simplify = FALSE)
926
+                          }, simplify = FALSE)
927
+                      }, simplify = FALSE)
905 928
 
906
-                  models <- unlist(resClassifier, recursive = FALSE)
929
+                  models <- unlist(unlist(resClassifier, recursive = FALSE), recursive = FALSE)
907 930
                   if(length(models) == 1) {
908 931
                       model <- models[[1]]
909
-                      class(model) <- c(class(model), "trainedByClassifyR")
932
+                      class(model) <- c("trainedByClassifyR", class(model))
910 933
                       models <- NULL
911 934
                   } else {
912
-                      class(models) <- c(class(models), "listOfModels", "trainedByClassifyR")
935
+                      class(models) <- c("listOfModels", "trainedByClassifyR", class(models))
913 936
                   }
914 937
               }
915 938
 
... ...
@@ -932,6 +955,8 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
932 955
 
933 956
                # Generate params for each assay. This could be extended to have different selectionMethods for each type
934 957
                  paramsAssays <- mapply(generateModellingParams,
958
+                                        nFeatures = nFeatures[assayIDs],
959
+                                        selectionMethod = selectionMethod[assayIDs],
935 960
                                         assayIDs = assayIDs,
936 961
                                         measurements = assayTrain[assayIDs],
937 962
                                         classifier = classifier[assayIDs],
... ...
@@ -1049,9 +1074,11 @@ predict.trainedByClassifyR <- function(object, newData, ...)
1049 1074
               newData <- prepareData(newData, useFeatures = allFeatureNames(object))
1050 1075
               # Some classifiers dangerously use positional matching rather than column name matching.
1051 1076
               # newData columns are sorted so that the right column ordering is guaranteed.
1052
-            }
1053
-
1077
+    }
1078
+    
1079
+    predictFunctionUse <- attr(object, "predictFunction")
1080
+    class(object) <- rev(class(object)) # Now want the predict method of the specific model to be picked, so put model class first.
1054 1081
     if (is(object, "listOfModels")) 
1055
-         mapply(function(model, assay) predict(model, assay), object, newData, SIMPLIFY = FALSE)
1056
-    else predict(object, newData) # Object is itself a trained model and it is assumed that a predict method is defined for it.
1082
+         mapply(function(model, assay) predictFunctionUse(model, assay), object, newData, SIMPLIFY = FALSE)
1083
+    else predictFunctionUse(object, newData) # Object is itself a trained model and it is assumed that a predict method is defined for it.
1057 1084
 }
... ...
@@ -13,7 +13,8 @@
13 13
 #' are features.
14 14
 #' @param outcome Either a factor vector of classes, a \code{\link{Surv}} object, or
15 15
 #' a character string, or vector of such strings, containing column name(s) of column(s)
16
-#' containing either classes or time and event information about survival.
16
+#' containing either classes or time and event information about survival. If column names
17
+#' of survival information, time must be in first column and event status in the second.
17 18
 #' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the
18 19
 #' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
19 20
 #' that contain(s) the each individual's outcome to use for prediction.
... ...
@@ -46,6 +47,14 @@ setMethod("prepareData", "matrix",
46 47
   prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
47 48
 })
48 49
 
50
+#' @rdname prepareData
51
+#' @export
52
+setMethod("prepareData", "data.frame",
53
+  function(measurements, outcome, ...)
54
+{
55
+  prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
56
+})
57
+
49 58
 #' @rdname prepareData
50 59
 #' @export
51 60
 setMethod("prepareData", "DataFrame",
... ...
@@ -22,10 +22,10 @@ crossValidate(measurements, outcome, ...)
22 22
   measurements,
23 23
   outcome,
24 24
   nFeatures = 20,
25
-  selectionMethod = "t-test",
25
+  selectionMethod = "auto",
26 26
   selectionOptimisation = "Resubstitution",
27 27
   performanceType = "auto",
28
-  classifier = "randomForest",
28
+  classifier = "auto",
29 29
   multiViewMethod = "none",
30 30
   assayCombinations = "all",
31 31
   nFolds = 5,
... ...
@@ -39,10 +39,10 @@ crossValidate(measurements, outcome, ...)
39 39
   measurements,
40 40
   outcome,
41 41
   nFeatures = 20,
42
-  selectionMethod = "t-test",
42
+  selectionMethod = "auto",
43 43
   selectionOptimisation = "Resubstitution",
44 44
   performanceType = "auto",
45
-  classifier = "randomForest",
45
+  classifier = "auto",
46 46
   multiViewMethod = "none",
47 47
   assayCombinations = "all",
48 48
   nFolds = 5,
... ...
@@ -56,10 +56,10 @@ crossValidate(measurements, outcome, ...)
56 56
   measurements,
57 57
   outcome,
58 58
   nFeatures = 20,
59
-  selectionMethod = "t-test",
59
+  selectionMethod = "auto",
60 60
   selectionOptimisation = "Resubstitution",
61 61
   performanceType = "auto",
62
-  classifier = "randomForest",
62
+  classifier = "auto",
63 63
   multiViewMethod = "none",
64 64
   assayCombinations = "all",
65 65
   nFolds = 5,
... ...
@@ -73,10 +73,10 @@ crossValidate(measurements, outcome, ...)
73 73
   measurements,
74 74
   outcome,
75 75
   nFeatures = 20,
76
-  selectionMethod = "t-test",
76
+  selectionMethod = "auto",
77 77
   selectionOptimisation = "Resubstitution",
78 78
   performanceType = "auto",
79
-  classifier = "randomForest",
79
+  classifier = "auto",
80 80
   multiViewMethod = "none",
81 81
   assayCombinations = "all",
82 82
   nFolds = 5,
... ...
@@ -90,10 +90,10 @@ crossValidate(measurements, outcome, ...)
90 90
   measurements,
91 91
   outcome,
92 92
   nFeatures = 20,
93
-  selectionMethod = "t-test",
93
+  selectionMethod = "auto",
94 94
   selectionOptimisation = "Resubstitution",
95 95
   performanceType = "auto",
96
-  classifier = "randomForest",
96
+  classifier = "auto",
97 97
   multiViewMethod = "none",
98 98
   assayCombinations = "all",
99 99
   nFolds = 5,
... ...
@@ -110,7 +110,9 @@ crossValidate(measurements, outcome, ...)
110 110
 \method{train}{DataFrame}(
111 111
   x,
112 112
   outcomeTrain,
113
-  classifier = "randomForest",
113
+  selectionMethod = "auto",
114
+  nFeatures = 20,
115
+  classifier = "auto",
114 116
   performanceType = "auto",
115 117
   multiViewMethod = "none",
116 118
   assayIDs = "all",
... ...
@@ -141,8 +143,8 @@ or assays. If a numeric vector these will be optimised over using \code{selectio
141 143
 a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
142 144
 Set to NULL or "all" if all features should be used.}
143 145
 
144
-\item{selectionMethod}{Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
145
-and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
146
+\item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
147
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
146 148
 and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.}
147 149
 
148 150
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
... ...
@@ -5,10 +5,13 @@
5 5
 \alias{prepareData,matrix-method}
6 6
 \alias{prepareData,DataFrame-method}
7 7
 \alias{prepareData,MultiAssayExperiment-method}
8
+\alias{prepareData,data.frame-method}
8 9
 \title{Convert Different Data Classes into DataFrame and Filter Features}
9 10
 \usage{
10 11
 \S4method{prepareData}{matrix}(measurements, outcome, ...)
11 12
 
13
+\S4method{prepareData}{data.frame}(measurements, outcome, ...)
14
+
12 15
 \S4method{prepareData}{DataFrame}(
13 16
   measurements,
14 17
   outcome,
... ...
@@ -31,7 +34,8 @@ are features.}
31 34
 
32 35
 \item{outcome}{Either a factor vector of classes, a \code{\link{Surv}} object, or
33 36
 a character string, or vector of such strings, containing column name(s) of column(s)
34
-containing either classes or time and event information about survival.}
37
+containing either classes or time and event information about survival. If column names
38
+of survival information, time must be in first column and event status in the second.}
35 39
 
36 40
 \item{useFeatures}{If \code{measurements} is a \code{MultiAssayExperiment},
37 41
 a two-column table of features to use. The first column must have assay names