Browse code

Merge pull request #44 from SydneyBioX/master

Synchronise Latest Developments

Dario Strbenac authored on 16/10/2022 10:25:33 • GitHub committed on 16/10/2022 10:25:33
Showing 16 changed files

... ...
@@ -3,8 +3,8 @@ Type: Package
3 3
 Title: A framework for cross-validated classification problems, with
4 4
        applications to differential variability and differential
5 5
        distribution testing
6
-Version: 3.1.19
7
-Date: 2022-09-28
6
+Version: 3.1.23
7
+Date: 2022-10-14
8 8
 Author: Dario Strbenac, Ellis Patrick, John Ormerod, Graham Mann, Jean Yang
9 9
 Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au>
10 10
 VignetteBuilder: knitr
... ...
@@ -14,7 +14,8 @@ Depends: R (>= 4.1.0), generics, methods, S4Vectors, MultiAssayExperiment, BiocP
14 14
 Imports: grid, genefilter, utils, dplyr, tidyr, rlang, randomForest
15 15
 Suggests: limma, edgeR, car, Rmixmod, ggplot2 (>= 3.0.0), gridExtra (>= 2.0.0), cowplot,
16 16
         BiocStyle, pamr, PoiClaClu, parathyroidSE, knitr, htmltools, gtable,
17
-        scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC
17
+        scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC,
18
+        MatrixModels, xgboost
18 19
 Description: The software formalises a framework for classification in R.
19 20
              There are four stages; Data transformation, feature selection, classifier training,
20 21
              and prediction. The requirements of variable types and names are
... ...
@@ -59,6 +60,7 @@ Collate:
59 60
     'interfaceRandomForest.R'
60 61
     'interfaceRandomForestSurvival.R'
61 62
     'interfaceSVM.R'
63
+    'interfaceXGB.R'
62 64
     'performancePlot.R'
63 65
     'plotFeatureClasses.R'
64 66
     'prepareData.R'
... ...
@@ -485,7 +485,7 @@ setClassUnion("SelectParamsOrNULL", c("SelectParams", "NULL"))
485 485
 #' @section Constructor:
486 486
 #' \describe{
487 487
 #' \item{}{\preformatted{SelectParams(featureRanking, characteristics = DataFrame(), minPresence = 1, intermediate = character(0),
488
-#' subsetToSelections = TRUE, tuneParams = list(nFeatures = seq(10, 100, 10), performanceType = "Balanced Error"), ...)} Creates a \code{SelectParams}
488
+#' subsetToSelections = TRUE, tuneParams = list(nFeatures = seq(10, 100, 10), performanceType = "Balanced Accuracy"), ...)} Creates a \code{SelectParams}
489 489
 #' object which stores the function(s) which will do the selection and parameters that the
490 490
 #' function will use.
491 491
 #' \describe{\item{\code{featureRanking}}{A character keyword referring to a registered feature ranking function. See \code{\link{available}}
... ...
@@ -537,7 +537,7 @@ standardGeneric("SelectParams"))
537 537
 #' @export
538 538
 setMethod("SelectParams", c("characterOrList"),
539 539
           function(featureRanking, characteristics = DataFrame(), minPresence = 1, 
540
-                   intermediate = character(0), subsetToSelections = TRUE, tuneParams = list(nFeatures = seq(10, 100, 10), performanceType = "Balanced Error"), ...)
540
+                   intermediate = character(0), subsetToSelections = TRUE, tuneParams = list(nFeatures = seq(10, 100, 10), performanceType = "Balanced Accuracy"), ...)
541 541
           {
542 542
             if(is.character(featureRanking)) featureRanking <- .selectionKeywordToFunction(featureRanking) else featureRanking <- lapply(featureRanking, .selectionKeywordToFunction)
543 543
             if(!is.list(featureRanking) && (ncol(characteristics) == 0 || !"Selection Name" %in% characteristics[, "characteristic"]))
... ...
@@ -655,15 +655,23 @@ setClassUnion("characterOrFunction", c("character", "function"))
655 655
 setMethod("TrainParams", c("characterOrFunction"),
656 656
           function(classifier, balancing = c("downsample", "upsample", "none"), characteristics = DataFrame(), intermediate = character(0), tuneParams = NULL, getFeatures = NULL, ...)
657 657
           {
658
-            if(is.character(classifier))              
659
-              classifier <- .classifierKeywordToParams(classifier)[[1]]@classifier # Training function.              
658
+            extras <- list(...)              
659
+            if(is.character(classifier))
660
+            {
661
+              trainParams <- .classifierKeywordToParams(classifier)[[1]] # Get a default params object.
662
+              if(is.null(getFeatures) && !is.null(trainParams@getFeatures))
663
+                getFeatures <- trainParams@getFeatures
664
+              classifier <- trainParams@classifier # Training function.
665
+            }
660 666
             if(ncol(characteristics) == 0 || !"Classifier Name" %in% characteristics[, "characteristic"])
661 667
             {
662 668
               characteristics <- rbind(characteristics, S4Vectors::DataFrame(characteristic = "Classifier Name", value = .ClassifyRenvir[["functionsTable"]][.ClassifyRenvir[["functionsTable"]][, "character"] == attr(classifier, "name"), "name"]))
663 669
             }
670
+            
671
+            if(length(extras) == 0) extras <- NULL
664 672
             new("TrainParams", classifier = classifier, characteristics = characteristics,
665 673
                 intermediate = intermediate, getFeatures = getFeatures, tuneParams = tuneParams,
666
-                otherParams = list(...))
674
+                otherParams = extras)
667 675
           })
668 676
 
669 677
 #' @usage NULL
... ...
@@ -17,6 +17,8 @@
17 17
                                                     ncol = 2, byrow = TRUE, dimnames = list(NULL, c("type", "better"))
18 18
 ) |> as.data.frame()
19 19
 
20
+.ClassifyRenvir[["performanceTypes"]] <- .ClassifyRenvir[["performanceInfoTable"]][, "type"]
21
+
20 22
 # Nice-looking names for feature selection and classification functions, to automatically use
21 23
 # in a variety of performance plots.
22 24
 .ClassifyRenvir[["functionsTable"]] <- matrix(
... ...
@@ -49,7 +51,8 @@
49 51
     "coxphRanking", "Cox Proportional Hazards",
50 52
     "coxnetTrainInterface", "Penalised Cox Proportional Hazards",
51 53
     #"NEMOEtrainInterface", "Nutrition-Ecotype Mixture of Experts",
52
-    "rfsrcTrainInterface", "Random Survival Forest"),
54
+    "rfsrcTrainInterface", "Random Survival Forest",
55
+    "extremeGradientBoostingTrainInterface", "Extreme Gradient Boosting"),
53 56
   ncol = 2, byrow = TRUE, dimnames = list(NULL, c("character", "name"))
54 57
 ) |> as.data.frame()
55 58
 
... ...
@@ -81,7 +84,8 @@
81 84
     "mixturesNormals", "Mixture of normals feature voting classifier.",
82 85
     "CoxPH", "Cox proportional hazards.",
83 86
     "CoxNet", "Penalised Cox proportional hazards.",
84
-    "randomSurvivalForest", "Random survival forest."
87
+    "randomSurvivalForest", "Random survival forest.",
88
+    "XGB", "Extreme gradient booster."
85 89
     ),
86 90
   ncol = 2, byrow = TRUE, dimnames = list(NULL, c("classifier Keyword", "Description"))
87 91
 ) |> as.data.frame()
... ...
@@ -22,7 +22,9 @@
22 22
 #' Set to NULL or "all" if all features should be used.
23 23
 #' @param selectionMethod A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
24 24
 #' and performing multiview classification, the respective classification methods will be used on each assay.
25
-#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise nFeatures. 
25
+#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
26
+#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Any one of the
27
+#' options described in \code{\link{calcPerformance}} may otherwise be specified.
26 28
 #' @param classifier A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
27 29
 #' and performing multiview classification, the respective classification methods will be used on each assay.
28 30
 #' @param multiViewMethod A character vector specifying the multiview method or data integration approach to use.
... ...
@@ -87,6 +89,7 @@ setMethod("crossValidate", "DataFrame",
87 89
                    nFeatures = 20,
88 90
                    selectionMethod = "t-test",
89 91
                    selectionOptimisation = "Resubstitution",
92
+                   performanceType = "auto",
90 93
                    classifier = "randomForest",
91 94
                    multiViewMethod = "none",
92 95
                    assayCombinations = "all",
... ...
@@ -101,6 +104,17 @@ setMethod("crossValidate", "DataFrame",
101 104
               measurements <- measurementsAndOutcome[["measurements"]]
102 105
               outcome <- measurementsAndOutcome[["outcome"]]
103 106
               
107
+              # Ensure performance type is one of the ones that can be calculated by the package.
108
+              if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
109
+                stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType))
110
+              
111
+              if(performanceType == "auto")
112
+              {
113
+                if(is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome))
114
+                  performanceType <- "Balanced Accuracy"
115
+                else performanceType <- "C-index"
116
+              }
117
+              
104 118
               # Which data-types or data-views are present?
105 119
               assayIDs <- unique(mcols(measurements)$assay)
106 120
               if(is.null(assayIDs)) assayIDs <- 1
... ...
@@ -151,7 +165,7 @@ Using an ordinary GLM instead.")
151 165
                           sapply(classifier[[assayIndex]], function(classifierForAssay) {
152 166
                               # Loop over classifiers
153 167
                               sapply(selectionMethod[[assayIndex]], function(selectionForAssay) {
154
-                                  # Loop over classifiers
168
+                                  # Loop over selectors
155 169
                                   set.seed(seed)
156 170
                                   measurementsUse <- measurements
157 171
                                   if(assayIndex != 1) measurementsUse <- measurements[, mcols(measurements)[, "assay"] == assayIndex, drop = FALSE]
... ...
@@ -161,6 +175,7 @@ Using an ordinary GLM instead.")
161 175
                                       nFeatures = nFeatures[assayIndex],
162 176
                                       selectionMethod = selectionForAssay,
163 177
                                       selectionOptimisation = selectionOptimisation,
178
+                                      performanceType = performanceType,
164 179
                                       classifier = classifierForAssay,
165 180
                                       multiViewMethod = multiViewMethod,
166 181
                                       nFolds = nFolds,
... ...
@@ -202,6 +217,7 @@ Using an ordinary GLM instead.")
202 217
                          nFeatures = nFeatures[assayIndex],
203 218
                          selectionMethod = selectionMethod[assayIndex],
204 219
                          selectionOptimisation = selectionOptimisation,
220
+                         performanceType = performanceType, 
205 221
                          classifier = classifier[assayIndex],
206 222
                          multiViewMethod = ifelse(length(assayIndex) == 1, "none", multiViewMethod),
207 223
                          nFolds = nFolds,
... ...
@@ -235,6 +251,7 @@ Using an ordinary GLM instead.")
235 251
                          nFeatures = nFeatures[assayIndex],
236 252
                          selectionMethod = selectionMethod[assayIndex],
237 253
                          selectionOptimisation = selectionOptimisation,
254
+                         performanceType = performanceType,
238 255
                          classifier = classifier[assayIndex],
239 256
                          multiViewMethod = ifelse(length(assayIndex) == 1, "none", multiViewMethod),
240 257
                          nFolds = nFolds,
... ...
@@ -268,6 +285,7 @@ Using an ordinary GLM instead.")
268 285
                          nFeatures = nFeatures[assayIndex],
269 286
                          selectionMethod = selectionMethod[assayIndex],
270 287
                          selectionOptimisation = selectionOptimisation,
288
+                         performanceType = performanceType,
271 289
                          classifier = classifier[assayIndex],
272 290
                          multiViewMethod = ifelse(length(assayIndex) == 1, "none", multiViewMethod),
273 291
                          nFolds = nFolds,
... ...
@@ -292,6 +310,7 @@ setMethod("crossValidate", "MultiAssayExperiment",
292 310
                    nFeatures = 20,
293 311
                    selectionMethod = "t-test",
294 312
                    selectionOptimisation = "Resubstitution",
313
+                   performanceType = "auto",
295 314
                    classifier = "randomForest",
296 315
                    multiViewMethod = "none",
297 316
                    assayCombinations = "all",
... ...
@@ -307,6 +326,7 @@ setMethod("crossValidate", "MultiAssayExperiment",
307 326
                             nFeatures = nFeatures,
308 327
                             selectionMethod = selectionMethod,
309 328
                             selectionOptimisation = selectionOptimisation,
329
+                            performanceType = performanceType,
310 330
                             classifier = classifier,
311 331
                             multiViewMethod = multiViewMethod,
312 332
                             assayCombinations = assayCombinations,
... ...
@@ -324,6 +344,7 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
324 344
                    nFeatures = 20,
325 345
                    selectionMethod = "t-test",
326 346
                    selectionOptimisation = "Resubstitution",
347
+                   performanceType = "auto",
327 348
                    classifier = "randomForest",
328 349
                    multiViewMethod = "none",
329 350
                    assayCombinations = "all",
... ...
@@ -338,6 +359,7 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
338 359
                             nFeatures = nFeatures,
339 360
                             selectionMethod = selectionMethod,
340 361
                             selectionOptimisation = selectionOptimisation,
362
+                            performanceType = performanceType,
341 363
                             classifier = classifier,
342 364
                             multiViewMethod = multiViewMethod,
343 365
                             assayCombinations = assayCombinations,
... ...
@@ -355,6 +377,7 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
355 377
                    nFeatures = 20,
356 378
                    selectionMethod = "t-test",
357 379
                    selectionOptimisation = "Resubstitution",
380
+                   performanceType = "auto",
358 381
                    classifier = "randomForest",
359 382
                    multiViewMethod = "none",
360 383
                    assayCombinations = "all",
... ...
@@ -369,6 +392,7 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
369 392
                             nFeatures = nFeatures,
370 393
                             selectionMethod = selectionMethod,
371 394
                             selectionOptimisation = selectionOptimisation,
395
+                            performanceType = performanceType,
372 396
                             classifier = classifier,
373 397
                             multiViewMethod = multiViewMethod,
374 398
                             assayCombinations = assayCombinations,
... ...
@@ -388,6 +412,7 @@ setMethod("crossValidate", "list",
388 412
                    nFeatures = 20,
389 413
                    selectionMethod = "t-test",
390 414
                    selectionOptimisation = "Resubstitution",
415
+                   performanceType = "auto",
391 416
                    classifier = "randomForest",
392 417
                    multiViewMethod = "none",
393 418
                    assayCombinations = "all",
... ...
@@ -435,6 +460,7 @@ setMethod("crossValidate", "list",
435 460
                             nFeatures = nFeatures,
436 461
                             selectionMethod = selectionMethod,
437 462
                             selectionOptimisation = selectionOptimisation,
463
+                            performanceType = performanceType,
438 464
                             classifier = classifier,
439 465
                             multiViewMethod = multiViewMethod,
440 466
                             assayCombinations = assayCombinations,
... ...
@@ -527,7 +553,7 @@ generateCrossValParams <- function(nRepeats, nFolds, nCores, selectionOptimisati
527 553
     }
528 554
     tuneMode <- selectionOptimisation
529 555
     if(tuneMode == "CV") tuneMode <- "Nested CV"
530
-    if(!any(tuneMode %in% c("Resubstitution", "Nested CV", "none"))) stop("selectionOptimisation must be CV or Resubstitution or none")
556
+    if(!any(tuneMode %in% c("Resubstitution", "Nested CV", "none"))) stop("selectionOptimisation must be Nested CV or Resubstitution or none")
531 557
     CrossValParams(permutations = nRepeats, folds = nFolds, parallelParams = BPparam, tuneMode = tuneMode)
532 558
 }
533 559
 ######################################
... ...
@@ -579,6 +605,7 @@ generateModellingParams <- function(assayIDs,
579 605
                                     nFeatures,
580 606
                                     selectionMethod,
581 607
                                     selectionOptimisation,
608
+                                    performanceType = "auto",
582 609
                                     classifier,
583 610
                                     multiViewMethod = "none"
584 611
 ){
... ...
@@ -588,6 +615,7 @@ generateModellingParams <- function(assayIDs,
588 615
                                           nFeatures,
589 616
                                           selectionMethod,
590 617
                                           selectionOptimisation,
618
+                                          performanceType = performanceType,
591 619
                                           classifier,
592 620
                                           multiViewMethod)
593 621
         return(params)
... ...
@@ -610,8 +638,6 @@ generateModellingParams <- function(assayIDs,
610 638
     }
611 639
 
612 640
     classifier <- unlist(classifier)
613
-
614
-    performanceType <- ifelse(classifier %in% c("CoxPH", "CoxNet", "randomSurvivalForest"), "C-index", "Balanced Accuracy")
615 641
     
616 642
     # Check classifier
617 643
     knownClassifiers <- .ClassifyRenvir[["classifyKeywords"]][, "classifier Keyword"]
... ...
@@ -619,6 +645,7 @@ generateModellingParams <- function(assayIDs,
619 645
         stop(paste("Classifier must exactly match of these (be careful of case):", paste(knownClassifiers, collapse = ", ")))
620 646
     
621 647
     classifierParams <- .classifierKeywordToParams(classifier)
648
+    classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
622 649
 
623 650
     selectionMethod <- unlist(selectionMethod)
624 651
 
... ...
@@ -658,6 +685,7 @@ generateMultiviewParams <- function(assayIDs,
658 685
                                     nFeatures,
659 686
                                     selectionMethod,
660 687
                                     selectionOptimisation,
688
+                                    performanceType,
661 689
                                     classifier,
662 690
                                     multiViewMethod){
663 691
 
... ...
@@ -676,6 +704,7 @@ generateMultiviewParams <- function(assayIDs,
676 704
                                  measurements = assayTrain[assayIDs],
677 705
                                  MoreArgs = list(
678 706
                                      selectionOptimisation = selectionOptimisation,
707
+                                     performanceType = performanceType,
679 708
                                      classifier = classifier,
680 709
                                      multiViewMethod = "none"),
681 710
                                  SIMPLIFY = FALSE)
... ...
@@ -689,7 +718,6 @@ generateMultiviewParams <- function(assayIDs,
689 718
                                           classifier = classifier,
690 719
                                           multiViewMethod = "none")
691 720
 
692
-        performanceType <- ifelse(classifier %in% c("CoxPH", "CoxNet", "randomSurvivalForest"), "C-index", "Balanced Accuracy")
693 721
         # Update selectParams to use
694 722
         params@selectParams <- SelectParams("selectMulti",
695 723
                                             params = paramsAssays,
... ...
@@ -715,6 +743,7 @@ generateMultiviewParams <- function(assayIDs,
715 743
                                  classifier = classifier[assayIDs],
716 744
                                  MoreArgs = list(
717 745
                                      selectionOptimisation = selectionOptimisation,
746
+                                     performanceType = performanceType,
718 747
                                      multiViewMethod = "none"),
719 748
                                  SIMPLIFY = FALSE)
720 749
 
... ...
@@ -743,6 +772,7 @@ generateMultiviewParams <- function(assayIDs,
743 772
                                  classifier = classifier[assayIDs],
744 773
                                  MoreArgs = list(
745 774
                                      selectionOptimisation = selectionOptimisation,
775
+                                     performanceType = performanceType,
746 776
                                      multiViewMethod = "none"),
747 777
                                  SIMPLIFY = FALSE)
748 778
 
... ...
@@ -771,6 +801,7 @@ generateMultiviewParams <- function(assayIDs,
771 801
                                  measurements = assayTrain[["clinical"]],
772 802
                                  classifier = classifier["clinical"],
773 803
                                  selectionOptimisation = selectionOptimisation,
804
+                                 performanceType = performanceType,
774 805
                                  multiViewMethod = "none"))
775 806
 
776 807
 
... ...
@@ -793,6 +824,7 @@ CV <- function(measurements = NULL,
793 824
                nFeatures = NULL,
794 825
                selectionMethod = "t-test",
795 826
                selectionOptimisation = "Resubstitution",
827
+               performanceType,
796 828
                classifier = "elasticNetGLM",
797 829
                multiViewMethod = "none",
798 830
                nFolds = 5,
... ...
@@ -831,6 +863,7 @@ CV <- function(measurements = NULL,
831 863
                                                nFeatures = nFeatures,
832 864
                                                selectionMethod = selectionMethod,
833 865
                                                selectionOptimisation = selectionOptimisation,
866
+                                               performanceType = performanceType,
834 867
                                                classifier = classifier,
835 868
                                                multiViewMethod = multiViewMethod)
836 869
     
... ...
@@ -879,18 +912,32 @@ train.data.frame <- function(x, outcomeTrain, ...)
879 912
 #' @rdname crossValidate
880 913
 #' @param assayIDs A character vector for assays to train with. Special value \code{"all"}
881 914
 #' uses all assays in the input object.
915
+#' @param performanceType Performance metric to optimise if classifier has any tuning parameters.
882 916
 #' @method train DataFrame
883 917
 #' @export
884
-train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", multiViewMethod = "none", assayIDs = "all", ...) # ... for prepareData.
918
+train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", performanceType = "auto",
919
+                            multiViewMethod = "none", assayIDs = "all", ...) # ... for prepareData.
885 920
                    {
886 921
               prepArgs <- list(x, outcomeTrain)
887 922
               extraInputs <- list(...)
888 923
               prepExtras <- numeric()
889 924
               if(length(extraInputs) > 0)
890
-                prepExtras <- which(names(extrasInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
925
+                prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
891 926
               if(length(prepExtras) > 0)
892 927
                 prepArgs <- append(prepArgs, extraInputs[prepExtras])
893 928
               measurementsAndOutcome <- do.call(prepareData, prepArgs)
929
+              
930
+              # Ensure performance type is one of the ones that can be calculated by the package.
931
+              if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
932
+                stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType))
933
+              
934
+              if(performanceType == "auto")
935
+              {
936
+                if(is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(x)) || is.factor(outcomeTrain))
937
+                  performanceType <- "Balanced Accuracy"
938
+                else performanceType <- "C-index"
939
+              }
940
+              
894 941
               measurements <- measurementsAndOutcome[["measurements"]]
895 942
               outcomeTrain <- measurementsAndOutcome[["outcome"]]
896 943
               
... ...
@@ -898,7 +945,7 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", multiV
898 945
               if(assayIDs == "all") assayIDs <- unique(mcols(measurements)[, "assay"])
899 946
               if(is.null(assayIDs)) assayIDs <- 1
900 947
               names(assayIDs) <- assayIDs
901
-              names(classifier) <- classifier
948
+              names(classifier) <- assayIDs
902 949
 
903 950
               if(multiViewMethod == "none"){
904 951
                   resClassifier <-
... ...
@@ -906,10 +953,13 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", multiV
906 953
                           # Loop over assays
907 954
                           sapply(classifier[[assayIndex]], function(classifierForAssay) {
908 955
                               # Loop over classifiers
956
+                              
909 957
                                   measurementsUse <- measurements
910 958
                                   if(assayIndex != 1) measurementsUse <- measurements[, mcols(measurements)[, "assay"] == assayIndex, drop = FALSE]
911 959
                                   
912 960
                                   classifierParams <- .classifierKeywordToParams(classifierForAssay)
961
+                                  if(!is.null(classifierParams$trainParams@tuneParams))
962
+                                    classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
913 963
                                   modellingParams <- ModellingParams(balancing = "none", selectParams = NULL,
914 964
                                                                trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
915 965
                                   
... ...
@@ -985,7 +1035,6 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", multiV
985 1035
 #' @rdname crossValidate
986 1036
 #' @method train list
987 1037
 #' @export
988
-# Each of the first four variables are named lists with names of assays.
989 1038
 train.list <- function(x, outcomeTrain, ...)
990 1039
               {
991 1040
                 # Check data type is valid
... ...
@@ -1001,7 +1050,7 @@ train.list <- function(x, outcomeTrain, ...)
1001 1050
                   stop("All datasets must have the same samples")
1002 1051
               
1003 1052
                 # Check the number of outcome is the same
1004
-                if (!all(sapply(x, nrow) == length(x)) && !is.character(x))
1053
+                if (!all(sapply(x, nrow) == length(outcomeTrain)) && !is.character(outcomeTrain))
1005 1054
                   stop("outcome must have same number of samples as measurements")
1006 1055
               
1007 1056
               df_list <- sapply(x, S4Vectors::DataFrame)
... ...
@@ -1016,6 +1065,7 @@ train.list <- function(x, outcomeTrain, ...)
1016 1065
               
1017 1066
               # Each list of tabular data has been collapsed into a DataFrame.
1018 1067
               # Will be subset to relevant assayIDs inside the DataFrame method.
1068
+              
1019 1069
               train(combined_df, outcomeTrain, ...)
1020 1070
 }
1021 1071
 
... ...
@@ -1028,13 +1078,13 @@ train.MultiAssayExperiment <- function(x, outcomeColumns, ...)
1028 1078
               extraInputs <- list(...)
1029 1079
               prepExtras <- trainExtras <- numeric()
1030 1080
               if(length(extraInputs) > 0)
1031
-                prepExtras <- which(names(extrasInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
1081
+                prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
1032 1082
               if(length(prepExtras) > 0)
1033 1083
                 prepArgs <- append(prepArgs, extraInputs[prepExtras])
1034 1084
               measurementsAndOutcome <- do.call(prepareData, prepArgs)
1035 1085
               trainArgs <- list(measurementsAndOutcome[["measurements"]], measurementsAndOutcome[["outcome"]])
1036 1086
               if(length(extraInputs) > 0)
1037
-                trainExtras <- which(!names(extrasInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
1087
+                trainExtras <- which(!names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
1038 1088
               if(length(trainExtras) > 0)
1039 1089
                 trainArgs <- append(trainArgs, extraInputs[trainExtras])
1040 1090
               do.call(train, trainArgs)
... ...
@@ -1066,11 +1116,9 @@ predict.trainedByClassifyR <- function(object, newData, ...)
1066 1116
               newData <- prepareData(newData, useFeatures = allFeatureNames(object))
1067 1117
               # Some classifiers dangerously use positional matching rather than column name matching.
1068 1118
               # newData columns are sorted so that the right column ordering is guaranteed.
1069
-            } else {stop("'newData' is not one of the valid data types. It is of type ", class(newData), '.')}
1070
-  if(is(object, "ClassifyResult"))
1071
-  {
1072
-    object@modellingParams@predictParams@predictor(object@finalModel[[1]], newData)
1073
-  } else if (is(object, "listOfModels")) { # Object is itself a trained model and it is assumed that a predict method is defined for it.
1074
-    mapply(function(model, assay) predict(model, assay), object, newData, SIMPLIFY = FALSE)
1075
-  } else predict(object, newData)
1119
+            }
1120
+
1121
+    if (is(object, "listOfModels")) 
1122
+         mapply(function(model, assay) predict(model, assay), object, newData, SIMPLIFY = FALSE)
1123
+    else predict(object, newData) # Object is itself a trained model and it is assumed that a predict method is defined for it.
1076 1124
 }
... ...
@@ -29,7 +29,7 @@ DLDApredictInterface <- function(model, measurementsTest, returnType = c("both",
29 29
     message("Predicting classes using trained DLDA classifier.")
30 30
   
31 31
   #predict(model, as.matrix(test))
32
-  predictions <- .predict(model, as.matrix(measurementsTest)) # Copy located in utilities.R.
32
+  predictions <- predict(model, as.matrix(measurementsTest)) # Copy located in utilities.R.
33 33
 
34 34
   switch(returnType, class = predictions[["class"]], # Factor vector.
35 35
          score = predictions[["posterior"]][, model[["groups"]]], # Numeric matrix.
... ...
@@ -1,27 +1,28 @@
1
-# An Interface for randomForest Package's randomForest Function
2
-randomForestTrainInterface <- function(measurementsTrain, classesTrain, mTryProportion = 0.5, ..., verbose = 3)
1
+# An Interface for ranger Package's randomForest Function
2
+randomForestTrainInterface <- function(measurementsTrain, outcomeTrain, mTryProportion = 0.5, ..., verbose = 3)
3 3
 {
4
-  if(!requireNamespace("randomForest", quietly = TRUE))
5
-    stop("The package 'randomForest' could not be found. Please install it.")
4
+  if(!requireNamespace("ranger", quietly = TRUE))
5
+    stop("The package 'ranger' could not be found. Please install it.")
6 6
   if(verbose == 3)
7
-    message("Fitting random forest classifier to training data and making predictions on test
8
-            data.")
7
+    message("Fitting random forest classifier to training data.")
9 8
   mtry <- round(mTryProportion * ncol(measurementsTrain)) # Number of features to try.
10 9
       
11 10
   # Convert to base data.frame as randomForest doesn't understand DataFrame.
12
-  randomForest::randomForest(as(measurementsTrain, "data.frame"), classesTrain, mtry = mtry, keep.forest = TRUE, ...)
11
+  ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, importance = "impurity_corrected", ...)
13 12
 }
14 13
 attr(randomForestTrainInterface, "name") <- "randomForestTrainInterface"
15 14
     
16
-# forest is of class randomForest
15
+# forest is of class ranger
17 16
 randomForestPredictInterface <- function(forest, measurementsTest, ..., returnType = c("both", "class", "score"), verbose = 3)
18 17
 {
19 18
   returnType <- match.arg(returnType)
19
+  classes <- forest$forest$levels
20 20
   if(verbose == 3)
21 21
     message("Predicting using random forest.")  
22 22
   measurementsTest <- as.data.frame(measurementsTest)
23
-  classPredictions <- predict(forest, measurementsTest)
24
-  classScores <- predict(forest, measurementsTest, type = "vote")[, forest[["classes"]], drop = FALSE]
23
+  classPredictions <- predict(forest, measurementsTest)$predictions
24
+  classScores <- predict(forest, measurementsTest, predict.all = TRUE)[[1]]
25
+  classScores <- t(apply(classScores, 1, function(sampleRow) table(factor(classes[sampleRow], levels = classes)) / forest$forest$num.trees))
25 26
   switch(returnType, class = classPredictions,
26 27
          score = classScores,
27 28
          both = data.frame(class = classPredictions, classScores, check.names = FALSE))
... ...
@@ -35,7 +36,7 @@ randomForestPredictInterface <- function(forest, measurementsTest, ..., returnTy
35 36
 
36 37
 forestFeatures <- function(forest)
37 38
                   {
38
-                    rankedFeaturesIndices <- order(randomForest::importance(forest), decreasing = TRUE)
39
-                    selectedFeaturesIndices <- randomForest::varUsed(forest, count = FALSE)
39
+                    rankedFeaturesIndices <- order(ranger::importance(forest), decreasing = TRUE)
40
+                    selectedFeaturesIndices <- which(ranger::importance(forest) > 0)
40 41
                     list(rankedFeaturesIndices, selectedFeaturesIndices)
41 42
                   }
42 43
\ No newline at end of file
... ...
@@ -6,9 +6,10 @@ rfsrcTrainInterface <- function(measurementsTrain, survivalTrain, mTryProportion
6 6
   if(verbose == 3)
7 7
     message("Fitting rfsrc classifier to training data and making predictions on test data.")
8 8
     
9
-  bindedMeasurements <- cbind(measurementsTrain, event = survivalTrain[,1], time = survivalTrain[,2])
9
+  bindedMeasurements <- cbind(measurementsTrain, event = survivalTrain[, 1], time = survivalTrain[, 2])
10 10
   mtry <- round(mTryProportion * ncol(measurementsTrain)) # Number of features to try.
11
-  randomForestSRC::rfsrc(Surv(event = event, time = time) ~ ., as.data.frame(bindedMeasurements), mtry = mtry, ...)
11
+  randomForestSRC::rfsrc(Surv(time, event) ~ ., data = as.data.frame(bindedMeasurements), mtry = mtry,
12
+                          var.used = "all.trees", importance = TRUE, ...)
12 13
 }
13 14
 attr(rfsrcTrainInterface, "name") <- "rfsrcTrainInterface"
14 15
 
... ...
@@ -18,4 +19,11 @@ rfsrcPredictInterface <- function(model, measurementsTest, ..., verbose = 3)
18 19
   predictedOutcome = predict(model, as.data.frame(measurementsTest), ...)$predicted
19 20
   names(predictedOutcome) = rownames(measurementsTest)
20 21
   predictedOutcome
21
-}
22 22
\ No newline at end of file
23
+}
24
+
25
+rfsrcFeatures <- function(forest)
26
+                  {
27
+                    rankedFeaturesIndices <- order(forest[["importance"]], decreasing = TRUE)
28
+                    selectedFeaturesIndices <- which(forest[["var.used"]] > 0)
29
+                    list(rankedFeaturesIndices, selectedFeaturesIndices)
30
+                  }
23 31
\ No newline at end of file
24 32
new file mode 100644
... ...
@@ -0,0 +1,82 @@
1
+# An Interface for xgboost Package's xgboost Function
2
+extremeGradientBoostingTrainInterface <- function(measurementsTrain, outcomeTrain, mTryProportion = 0.5, nrounds = 10, ..., verbose = 3)
3
+{
4
+  if(!requireNamespace("xgboost", quietly = TRUE))
5
+    stop("The package 'xgboost' could not be found. Please install it.")
6
+  if(verbose == 3)
7
+    message("Fitting extreme gradient boosting classifier to training data and making predictions on test
8
+            data.")
9
+  measurementsTrain <- as(measurementsTrain, "data.frame")
10
+  # Convert to one-hot encoding as xgboost doesn't understand factors. Need to get rid of intercept afterwards.
11
+  measurementsTrain <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain, sparse = TRUE)
12
+  
13
+  isClassification <- FALSE
14
+  numClasses <- NULL
15
+  if(is(outcomeTrain, "Surv")) # xgboost only knows about numeric vectors.
16
+  {
17
+    time <- outcomeTrain[, "time"]
18
+    event <- as.numeric(outcomeTrain[, "status"])
19
+    if(max(event) == 2) event <- event - 1
20
+    outcomeTrain <- time * ifelse(event == 1, 1, -1) # Negative for censoring.
21
+    objective <- "survival:cox"
22
+  } else { # Classification task.
23
+    isClassification <- TRUE
24
+    classes <- levels(outcomeTrain)
25
+    numClasses <- length(classes)
26
+    objective <- "multi:softprob"
27
+    outcomeTrain <- as.numeric(outcomeTrain) - 1 # Classes are represented as 0, 1, 2, ...
28
+  }
29
+  
30
+  trained <- xgboost::xgboost(measurementsTrain, outcomeTrain, objective = objective, nrounds = nrounds,
31
+                              num_class = numClasses, colsample_bynode = mTryProportion, verbose = 0, ...)
32
+  if(isClassification)
33
+  {
34
+    attr(trained, "classes") <- classes # Useful for factor predictions in predict method.
35
+    attr(trained, "featureNames") <- colnames(measurementsTrain)
36
+    attr(trained, "featureGroups") <- measurementsTrain@assign
37
+  }
38
+  trained
39
+}
40
+attr(extremeGradientBoostingTrainInterface, "name") <- "extremeGradientBoostingTrainInterface"
41
+    
42
+# booster is of class xgb.Booster
43
+extremeGradientBoostingPredictInterface <- function(booster, measurementsTest, ..., returnType = c("both", "class", "score"), verbose = 3)
44
+{
45
+  returnType <- match.arg(returnType)
46
+  if(verbose == 3)
47
+    message("Predicting using boosted random forest.")  
48
+  measurementsTest <- as(measurementsTest, "data.frame")
49
+  # Convert to one-hot encoding as xgboost doesn't understand factors. Need to get rid of intercept afterwards.
50
+  measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest, sparse = TRUE)
51
+  scores <- predict(booster, measurementsTest, reshape = TRUE)
52
+  if(!is.null(attr(booster, "classes"))) # It is a classification task.
53
+  {
54
+    classPredictions <- attr(booster, "classes")[apply(scores, 1, function(sampleRow) which.max(sampleRow)[1])]
55
+    classPredictions <- factor(classPredictions, levels = attr(booster, "classes"))
56
+    result <- switch(returnType, class = classPredictions,
57
+                     score = scores,
58
+                     both = data.frame(class = classPredictions, scores, check.names = FALSE))
59
+  } else { # A survival task.
60
+     result <- scores
61
+  }
62
+  result
63
+}
64
+
65
+################################################################################
66
+#
67
+# Get selected features
68
+#
69
+################################################################################
70
+
71
+XGBfeatures <- function(booster)
72
+                  {
73
+                    importanceGains <- xgboost::xgb.importance(model = booster)[["Gain"]]
74
+                    gains <- rep(0, length(unique(attr(booster, "featureGroups"))))
75
+                    featureGroups <- attr(booster, "featureGroups")[match(xgboost::xgb.importance(model = booster)[["Feature"]], attr(booster, "featureNames"))]
76
+                    maxGains <- by(importanceGains, featureGroups, max)
77
+                    indicesUsed <- as.numeric(names(maxGains))
78
+                    gains[indicesUsed]  <- maxGains # Put into particular indexes.
79
+                    rankedFeaturesIndices <- order(gains, decreasing = TRUE)
80
+                    selectedFeaturesIndices <- indicesUsed
81
+                    list(rankedFeaturesIndices, selectedFeaturesIndices)
82
+                  }
... ...
@@ -1,6 +1,6 @@
1 1
 # Random Forest
2 2
 RFparams <- function() {
3
-    trainParams <- TrainParams(randomForestTrainInterface, tuneParams = list(mTryProportion = c(0.25, 0.33, 0.50, 0.66, 0.75, 1.00), ntree = seq(100, 500, 100), performanceType = "Balanced Error"),
3
+    trainParams <- TrainParams(randomForestTrainInterface, tuneParams = list(mTryProportion = c(0.25, 0.33, 0.50, 0.66, 0.75, 1.00), num.trees = c(10, seq(100, 500, 100))),
4 4
                                getFeatures = forestFeatures)
5 5
     predictParams <- PredictParams(randomForestPredictInterface)
6 6
     
... ...
@@ -9,15 +9,24 @@ RFparams <- function() {
9 9
 
10 10
 # Random Survival Forest
11 11
 RSFparams <- function() {
12
-    trainParams <- TrainParams(rfsrcTrainInterface, tuneParams = list(mTryProportion = c(0.25, 0.33, 0.50, 0.66, 0.75, 1.00), ntree = seq(100, 500, 100), performanceType = "Balanced Error"))
12
+    trainParams <- TrainParams(rfsrcTrainInterface, tuneParams = list(mTryProportion = c(0.25, 0.33, 0.50, 0.66, 0.75, 1.00), ntree = c(10, seq(100, 500, 100))),
13
+                               getFeatures = rfsrcFeatures)
13 14
     predictParams <- PredictParams(rfsrcPredictInterface)
14 15
     
15 16
     return(list(trainParams = trainParams, predictParams = predictParams))
16 17
 }
17 18
 
19
+XGBparams <- function() {
20
+    trainParams <- TrainParams(extremeGradientBoostingTrainInterface, tuneParams = list(mTryProportion = c(0.25, 0.33, 0.50, 0.66, 0.75, 1.00), nrounds = c(5, 10, 15)),
21
+                               getFeatures = XGBfeatures)
22
+    predictParams <- PredictParams(extremeGradientBoostingPredictInterface)
23
+    
24
+    return(list(trainParams = trainParams, predictParams = predictParams))  
25
+}
26
+
18 27
 # k Nearest Neighbours
19 28
 kNNparams <- function() {
20
-    trainParams <- TrainParams(kNNinterface, tuneParams = list(k = 1:5, performanceType = "Balanced Error"))
29
+    trainParams <- TrainParams(kNNinterface, tuneParams = list(k = 1:5))
21 30
     predictParams <- NULL
22 31
     return(list(trainParams = trainParams, predictParams = predictParams))
23 32
 }
... ...
@@ -42,7 +51,7 @@ elasticNetGLMparams <- function() {
42 51
 SVMparams = function() {
43 52
     trainParams <- TrainParams(SVMtrainInterface,
44 53
                    tuneParams = list(kernel = c("linear", "polynomial", "radial", "sigmoid"),
45
-                                     cost = 10^(-3:3), performanceType = "Balanced Error"))
54
+                                     cost = 10^(-3:3)))
46 55
     predictParams <- PredictParams(SVMpredictInterface)
47 56
     
48 57
     return(list(trainParams = trainParams, predictParams = predictParams))
... ...
@@ -66,7 +75,7 @@ DLDAparams = function() {
66 75
 
67 76
 # naive Bayes Kernel
68 77
 naiveBayesParams <- function() {
69
-    trainParams <- TrainParams(naiveBayesKernel, tuneParams = list(difference = c("unweighted", "weighted"), performanceType = "Balanced Error"))
78
+    trainParams <- TrainParams(naiveBayesKernel, tuneParams = list(difference = c("unweighted", "weighted")))
70 79
     predictParams <- NULL
71 80
     return(list(trainParams = trainParams, predictParams = predictParams))
72 81
 }
... ...
@@ -142,6 +142,7 @@
142 142
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
143 143
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
144 144
     modellingParams@trainParams@tuneParams <- NULL
145
+    
145 146
     allPerformanceTables <- lapply(rankings, function(rankingsVariety)
146 147
     {
147 148
       # Creates a matrix. Columns are top n features, rows are varieties (one row if None).
... ...
@@ -166,7 +167,7 @@
166 167
           result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
167 168
                             crossValParams = NULL, modellingParams = modellingParams,
168 169
                             verbose = verbose, .iteration = "internal")
169
-          
170
+
170 171
           predictions <- result[["predictions"]]
171 172
           # Classifiers will use a column "class" and survival models will use a column "risk".
172 173
           if(class(predictions) == "data.frame")
... ...
@@ -221,6 +222,7 @@
221 222
                             measurementsTrain, outcomeTrain,
222 223
                             crossValParams = NULL, modellingParams,
223 224
                             verbose = verbose, .iteration = "internal")
225
+
224 226
           predictions <- result[["predictions"]]
225 227
           if(class(predictions) == "data.frame")
226 228
             predictedOutcome <- predictions[, "class"]
... ...
@@ -274,7 +276,7 @@
274 276
         result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
275 277
                           crossValParams = NULL, modellingParams,
276 278
                           verbose = verbose, .iteration = "internal")
277
-        
279
+
278 280
         predictions <- result[["predictions"]]
279 281
         if(class(predictions) == "data.frame")
280 282
           predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
... ...
@@ -511,6 +513,7 @@
511 513
         keyword,
512 514
         "randomForest" = RFparams(),
513 515
         "randomSurvivalForest" = RSFparams(),
516
+        "XGB" = XGBparams(),
514 517
         "GLM" = GLMparams(),
515 518
         "elasticNetGLM" = elasticNetGLMparams(),
516 519
         "SVM" = SVMparams(),
... ...
@@ -578,7 +581,8 @@
578 581
   obj
579 582
 }
580 583
 
581
-.predict <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN.
584
+#' @method predict dlda
585
+predict.dlda <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN.
582 586
   if (!inherits(object, "dlda"))  {
583 587
     stop("object not of class 'dlda'")
584 588
   }
... ...
@@ -2,6 +2,29 @@
2 2
 \title{ClassifyR News}
3 3
 \encoding{UTF-8}
4 4
 
5
+\section{Version 3.2.0}{\itemize{
6
+  \item
7
+  Fast Cox survival analysis.
8
+  \item
9
+  Simple parameter sets, as used by crossVaildate, now come with tuning parameter grid as standard.
10
+  \item
11
+  Wrappers are greatly simplified. Now, there is only one method for a data frame and they are not exported because they are not used directly by the end-user anyway.
12
+  \item
13
+  \code{prepareData} function to filter and subset input data using common ways, such as missingness and variability.
14
+  \item
15
+  Invalid column names of data (e.g. spaces, hyphens) are converted into safe names before modelling but converted back into original names for tracking ranked and selected features.
16
+  \item
17
+  \code{available} function shows the keywords corresponding to transformation, selection, classifier functions.
18
+  \item
19
+  More functions have automatically-selected parameters based on input data, reducing required user-specified parameters.
20
+  \item
21
+  New classifiers added for random survival forests and extreme gradient boosting.
22
+  \item
23
+  Adaptive sampling for modelling with uncertainty of class labels can be enabled with \code{adaptiveResamplingDelta}.
24
+  \item
25
+  Parameter tuning fixed to only use samples from the training set.
26
+}}
27
+
5 28
 \section{Version 3.0.0}{\itemize{
6 29
   \item
7 30
   Now supports survival models and their evaluation, in addition to existing classification functionality.
... ...
@@ -17,7 +17,7 @@ feature selection. The empty constructor is provided for convenience.
17 17
 
18 18
 \describe{
19 19
 \item{}{\preformatted{SelectParams(featureRanking, characteristics = DataFrame(), minPresence = 1, intermediate = character(0),
20
-subsetToSelections = TRUE, tuneParams = list(nFeatures = seq(10, 100, 10), performanceType = "Balanced Error"), ...)} Creates a \code{SelectParams}
20
+subsetToSelections = TRUE, tuneParams = list(nFeatures = seq(10, 100, 10), performanceType = "Balanced Accuracy"), ...)} Creates a \code{SelectParams}
21 21
 object which stores the function(s) which will do the selection and parameters that the
22 22
 function will use.
23 23
 \describe{\item{\code{featureRanking}}{A character keyword referring to a registered feature ranking function. See \code{\link{available}}
... ...
@@ -24,6 +24,7 @@ crossValidate(measurements, outcome, ...)
24 24
   nFeatures = 20,
25 25
   selectionMethod = "t-test",
26 26
   selectionOptimisation = "Resubstitution",
27
+  performanceType = "auto",
27 28
   classifier = "randomForest",
28 29
   multiViewMethod = "none",
29 30
   assayCombinations = "all",
... ...
@@ -40,6 +41,7 @@ crossValidate(measurements, outcome, ...)
40 41
   nFeatures = 20,
41 42
   selectionMethod = "t-test",
42 43
   selectionOptimisation = "Resubstitution",
44
+  performanceType = "auto",
43 45
   classifier = "randomForest",
44 46
   multiViewMethod = "none",
45 47
   assayCombinations = "all",
... ...
@@ -56,6 +58,7 @@ crossValidate(measurements, outcome, ...)
56 58
   nFeatures = 20,
57 59
   selectionMethod = "t-test",
58 60
   selectionOptimisation = "Resubstitution",
61
+  performanceType = "auto",
59 62
   classifier = "randomForest",
60 63
   multiViewMethod = "none",
61 64
   assayCombinations = "all",
... ...
@@ -72,6 +75,7 @@ crossValidate(measurements, outcome, ...)
72 75
   nFeatures = 20,
73 76
   selectionMethod = "t-test",
74 77
   selectionOptimisation = "Resubstitution",
78
+  performanceType = "auto",
75 79
   classifier = "randomForest",
76 80
   multiViewMethod = "none",
77 81
   assayCombinations = "all",
... ...
@@ -88,6 +92,7 @@ crossValidate(measurements, outcome, ...)
88 92
   nFeatures = 20,
89 93
   selectionMethod = "t-test",
90 94
   selectionOptimisation = "Resubstitution",
95
+  performanceType = "auto",
91 96
   classifier = "randomForest",
92 97
   multiViewMethod = "none",
93 98
   assayCombinations = "all",
... ...
@@ -106,6 +111,7 @@ crossValidate(measurements, outcome, ...)
106 111
   x,
107 112
   outcomeTrain,
108 113
   classifier = "randomForest",
114
+  performanceType = "auto",
109 115
   multiViewMethod = "none",
110 116
   assayIDs = "all",
111 117
   ...
... ...
@@ -136,7 +142,9 @@ Set to NULL or "all" if all features should be used.}
136 142
 \item{selectionMethod}{A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
137 143
 and performing multiview classification, the respective classification methods will be used on each assay.}
138 144
 
139
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise nFeatures.}
145
+\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
146
+
147
+\item{performanceType}{Performance metric to optimise if classifier has any tuning parameters.}
140 148
 
141 149
 \item{classifier}{A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
142 150
 and performing multiview classification, the respective classification methods will be used on each assay.}
... ...
@@ -13,7 +13,7 @@ generateCrossValParams(nRepeats, nFolds, nCores, selectionOptimisation)
13 13
 
14 14
 \item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.}
15 15
 
16
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise nFeatures.}
16
+\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
17 17
 }
18 18
 \value{
19 19
 CrossValParams object
... ...
@@ -10,6 +10,7 @@ generateModellingParams(
10 10
   nFeatures,
11 11
   selectionMethod,
12 12
   selectionOptimisation,
13
+  performanceType = "auto",
13 14
   classifier,
14 15
   multiViewMethod = "none"
15 16
 )
... ...
@@ -28,7 +29,9 @@ Set to NULL or "all" if all features should be used.}
28 29
 \item{selectionMethod}{A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
29 30
 and performing multiview classification, the respective classification methods will be used on each assay.}
30 31
 
31
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise nFeatures.}
32
+\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
33
+
34
+\item{performanceType}{Performance metric to optimise if classifier has any tuning parameters.}
32 35
 
33 36
 \item{classifier}{A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
34 37
 and performing multiview classification, the respective classification methods will be used on each assay.}
... ...
@@ -164,7 +164,7 @@ Although being a cross-validation framework, a number of popular feature selecti
164 164
 
165 165
 In the following tables, a function that is used when no function is explicitly specified by the user is shown as <span style="padding:4px; border:2px dashed #e64626;">functionName</span>.
166 166
 
167
-The functions below produce a ranking, of which different size subsets are tried and the classifier performance evaluated, to select a best subset of features, based on a criterion such as balanced error rate, for example.
167
+The functions below produce a ranking, of which different size subsets are tried and the classifier performance evaluated, to select a best subset of features, based on a criterion such as balanced accuracy rate, for example.
168 168
 
169 169
 Function | Description | DM | DV | DD
170 170
 ----|---------------------------|----|----|----
... ...
@@ -189,6 +189,8 @@ Function(s) | Description | DM | DV | DD
189 189
 <span style="font-family: 'Courier New', monospace;">mixModelsTrain</span>, <span style="font-family: 'Courier New', monospace;">mixModelsPredict</span> | Feature-wise mixtures of normals and voting | ✔ | ✔ | ✔ 
190 190
 <span style="font-family: 'Courier New', monospace;">naiveBayesKernel</span> | Feature-wise kernel density estimation and voting | ✔ | ✔ | ✔ 
191 191
 <span style="font-family: 'Courier New', monospace;">randomForestTrainInterface</span>, <span style="font-family: 'Courier New', monospace;">randomForestPredictInterface</span> | Wrapper for randomForest's functions <span style="font-family: 'Courier New', monospace;">randomForest</span> and <span style="font-family: 'Courier New', monospace;">predict</span> | ✔ | ✔ | ✔ 
192
+<span style="font-family: 'Courier New', monospace;">extremeGradientBoostingTrainInterface</span>, <span style="font-family: 'Courier New', monospace;">extremeGradientBoostingPredictInterface</span> | Wrapper for xgboost's functions <span style="font-family: 'Courier New', monospace;">xgboost</span> and <span style="font-family: 'Courier New', monospace;">predict</span> | ✔ | ✔ | ✔ 
193
+<span style="font-family: 'Courier New', monospace;">kNNinterface</span> | Wrapper for class's function <span style="font-family: 'Courier New', monospace;">knn</span> | ✔ | ✔ | ✔ 
192 194
 <span style="font-family: 'Courier New', monospace;">SVMtrainInterface</span>, <span style="font-family: 'Courier New', monospace;">SVMpredictInterface</span> | Wrapper for e1071's functions <span style="font-family: 'Courier New', monospace;">svm</span> and <span style="font-family: 'Courier New', monospace;">predict.svm</span> | ✔ | ✔ †| ✔ †
193 195
 
194 196
 \* If ordinary numeric measurements have been transformed to absolute deviations using <span style="font-family: 'Courier New', monospace;">subtractFromLocation</span>.<br>
... ...
@@ -249,7 +251,7 @@ bestGenePlot <- plotFeatureClasses(measurements, classes, names(mostChosen), dot
249 251
 
250 252
 The means of the abundance levels of `r names(sortedPercentages)[1]` are substantially different between the people with and without asthma. *plotFeatureClasses* can also plot categorical data, such as may be found in a clinical data table, as a bar chart.
251 253
 
252
-Classification error rates, as well as many other prediction performance measures, can be calculated with *calcCVperformance*. Next, the balanced error rate is calculated considering all samples, each of which was in the test set once. The balanced error rate is defined as the average of the classification errors of each class.
254
+Classification error rates, as well as many other prediction performance measures, can be calculated with *calcCVperformance*. Next, the balanced accuracy rate is calculated considering all samples, each of which was in the test set once. The balanced accuracy rate is defined as the average rate of the correct classifications of each class.
253 255
 
254 256
 See the documentation of *calcCVperformance* for a list of performance metrics which may be calculated.
255 257
 
... ...
@@ -278,7 +280,6 @@ The naive Bayes kernel classifier by default uses the vertical distance between
278 280
 Now, the classification error for each sample is also calculated for both the differential means and differential distribution classifiers and both *ClassifyResult* objects generated so far are plotted with *samplesMetricMap*.
279 281
 
280 282
 ```{r, fig.width = 10, fig.height = 7}
281
-library(grid)
282 283
 DMresults <- calcCVperformance(DMresults, "Sample Error")
283 284
 DDresults <- calcCVperformance(DDresults, "Sample Error")
284 285
 resultsList <- list(Abundance = DMresults, Distribution = DDresults)
... ...
@@ -332,7 +333,7 @@ Once a cross-validated classification is complete, the usefulness of the feature
332 333
 
333 334
 ### Parameter Tuning
334 335
 
335
-Some feature ranking methods or classifiers allow the choosing of tuning parameters, which controls some aspect of their model learning. An example of doing parameter tuning with a linear SVM is presented. This particular SVM has a single tuning parameter, the cost. Higher values of this parameter penalise misclassifications more. Moreover, feature selection happens by using a feature ranking function and then trying a range of top-ranked features to see which gives the best performance, the range being specified by a list element named *nFeatures* and the performance type (e.g. Balanced Error) specified by a list elment named *performanceType*. Therefore, some kind of parameter tuning always happens, even if the feature ranking or classifier function does not have any explicit tuning parameters.
336
+Some feature ranking methods or classifiers allow the choosing of tuning parameters, which controls some aspect of their model learning. An example of doing parameter tuning with a linear SVM is presented. This particular SVM has a single tuning parameter, the cost. Higher values of this parameter penalise misclassifications more. Moreover, feature selection happens by using a feature ranking function and then trying a range of top-ranked features to see which gives the best performance, the range being specified by a list element named *nFeatures* and the performance type (e.g. Balanced Accuracy) specified by a list element named *performanceType*. Therefore, some kind of parameter tuning always happens, even if the feature ranking or classifier function does not have any explicit tuning parameters.
336 337
 
337 338
 Tuning is achieved in ClassifyR by providing a variable called *tuneParams* to the SelectParams or TrainParams constructor. *tuneParams* is a named list, with the names being the names of the tuning variables, except for one which is named *"performanceType"* and specifies the performance metric to use for picking the parameter values. Any of the non-sample-specific performance metrics which *calcCVperformance* calculates can be optimised.
338 339
 
... ...
@@ -343,7 +344,7 @@ SVMparams <- ModellingParams(trainParams = TrainParams("SVM", kernel = "linear",
343 344
 SVMresults <- runTests(measurements, classes, crossValParams, SVMparams)
344 345
 ```
345 346
 
346
-The chosen values of the parameters are stored for every validation, and can be accessed with the *tunedParameters* function.
347
+The index of chosen of the parameters, as well as all combinations of parameters and their associated performance metric, are stored for every validation, and can be accessed with the *tunedParameters* function.
347 348
 
348 349
 ```{r}
349 350
 length(tunedParameters(SVMresults))