Browse code

- Roll-up of fixes from development branch over the past two weeks in preparation for workshop and software demonstration.

Dario Strbenac authored on 17/11/2022 10:45:05
Showing 21 changed files

... ...
@@ -3,8 +3,8 @@ Type: Package
3 3
 Title: A framework for cross-validated classification problems, with
4 4
        applications to differential variability and differential
5 5
        distribution testing
6
-Version: 3.2.0
7
-Date: 2022-10-25
6
+Version: 3.2.1
7
+Date: 2022-11-17
8 8
 Author: Dario Strbenac, Ellis Patrick, Sourish Iyengar, Harry Robertson, Andy Tran, John Ormerod, Graham Mann, Jean Yang
9 9
 Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au>
10 10
 VignetteBuilder: knitr
... ...
@@ -27,8 +27,6 @@ export(distribution)
27 27
 export(edgesToHubNetworks)
28 28
 export(featureSetSummary)
29 29
 export(finalModel)
30
-export(generateCrossValParams)
31
-export(generateModellingParams)
32 30
 export(interactorDifferences)
33 31
 export(models)
34 32
 export(performance)
... ...
@@ -87,7 +85,6 @@ exportMethods(selectionPlot)
87 85
 exportMethods(show)
88 86
 exportMethods(totalPredictions)
89 87
 exportMethods(tunedParameters)
90
-import(BiocParallel)
91 88
 import(MultiAssayExperiment)
92 89
 import(grid)
93 90
 import(methods)
... ...
@@ -86,7 +86,7 @@ setMethod("ROCplot", "ClassifyResult", function(results, ...) {
86 86
 
87 87
 #' @rdname ROCplot
88 88
 #' @export
89
-setMethod("ROCplot", "list", 
89
+setMethod("ROCplot", "list",
90 90
           function(results, mode = c("merge", "average"), interval = 95,
91 91
                    comparison = "auto", lineColours = "auto",
92 92
                    lineWidth = 1, fontSizes = c(24, 16, 12, 12, 12), labelPositions = seq(0.0, 1.0, 0.2),
... ...
@@ -101,10 +101,18 @@ setMethod("ROCplot", "list",
101 101
   if(comparison == "auto")
102 102
   {
103 103
     if(max(characteristicsCounts) == length(results))
104
-      comparison <- names(characteristicsCounts)[characteristicsCounts == max(characteristicsCounts)][1]
105
-    else
104
+    { # Choose a characteristic which varies the most across the results.
105
+      candidates <- names(characteristicsCounts)[characteristicsCounts == length(results)]
106
+      allCharacteristics <- do.call(rbind, lapply(results, function(result) result@characteristics))
107
+      distinctValues <- by(allCharacteristics[, "value"], allCharacteristics[, "characteristic"], function(values) length(unique(values)))
108
+      comparison <- names(distinctValues)[which.max(distinctValues)][1]
109
+    } else {
106 110
       stop("No characteristic is present for all results but must be.")
111
+    }
107 112
   }
113
+  resultsWithComparison <- sum(sapply(results, function(result) any(result@characteristics[, "characteristic"] == comparison)))
114
+  if(resultsWithComparison < length(results))
115
+    stop("Not all results have comparison characteristic ", comparison, ' but need to.')
108 116
                
109 117
   ggplot2::theme_set(ggplot2::theme_classic() + ggplot2::theme(panel.border = ggplot2::element_rect(fill = NA)))
110 118
   distinctClasses <- levels(actualOutcome(results[[1]]))
... ...
@@ -113,6 +113,15 @@ setMethod("calcExternalPerformance", c("Surv", "numeric"),
113 113
             .calcPerformance(actualOutcome, predictedOutcome, performanceType = performanceType)[["values"]]
114 114
           })
115 115
 
116
+#' @rdname calcPerformance
117
+#' @exportMethod calcExternalPerformance
118
+setMethod("calcExternalPerformance", c("factor", "tabular"), # table has class probabilities per sample.
119
+          function(actualOutcome, predictedOutcome, performanceType = "AUC")
120
+          {
121
+            performanceType <- match.arg(performanceType)
122
+            .calcPerformance(actualOutcome, predictedOutcome, performanceType = performanceType)[["values"]]
123
+          })
124
+
116 125
 #' @rdname calcPerformance
117 126
 #' @usage NULL
118 127
 #' @export
... ...
@@ -11,22 +11,26 @@
11 11
 #' same length as the number of samples in \code{measurements} or a character vector of length 1 containing the
12 12
 #' column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of
13 13
 #' length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome. If \code{measurements} is a
14
-#' \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.
14
+#' \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.  If column names
15
+#' of survival information, time must be in first column and event status in the second.
15 16
 #' @param outcomeTrain For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or
16 17
 #' a character string, or vector of such strings, containing column name(s) of column(s)
17
-#' containing either classes or time and event information about survival.
18
+#' containing either classes or time and event information about survival. If column names
19
+#' of survival information, time must be in first column and event status in the second.
18 20
 #' @param ... Parameters passed into \code{\link{prepareData}} which control subsetting and filtering of input data.
19 21
 #' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons
20 22
 #' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, 
21 23
 #' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
22 24
 #' Set to NULL or "all" if all features should be used.
23
-#' @param selectionMethod A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
24
-#' and performing multiview classification, the respective classification methods will be used on each assay.
25
+#' @param selectionMethod Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
26
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
27
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.
25 28
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
26
-#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Any one of the
29
+#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
27 30
 #' options described in \code{\link{calcPerformance}} may otherwise be specified.
28
-#' @param classifier A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
29
-#' and performing multiview classification, the respective classification methods will be used on each assay.
31
+#' @param classifier Default: \code{"auto"}. A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
32
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, then a random forest is used for a classification
33
+#' task or Cox proportional hazards model for a survival task.
30 34
 #' @param multiViewMethod A character vector specifying the multiview method or data integration approach to use.
31 35
 #' @param assayCombinations A character vector or list of character vectors proposing the assays or, in the case of a list, combination of assays to use
32 36
 #' with each element being a vector of assays to combine. Special value \code{"all"} means all possible subsets of assays.
... ...
@@ -108,12 +112,14 @@ setMethod("crossValidate", "DataFrame",
108 112
               if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
109 113
                 stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType))
110 114
               
115
+              isCategorical <- is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome)
111 116
               if(performanceType == "auto")
112
-              {
113
-                if(is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome))
114
-                  performanceType <- "Balanced Accuracy"
115
-                else performanceType <- "C-index"
116
-              }
117
+                if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index"
118
+              if(length(selectionMethod) == 1 && selectionMethod == "auto")
119
+                if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH"
120
+              if(length(classifier) == 1 && classifier == "auto")
121
+                if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH"
122
+              
117 123
               
118 124
               # Which data-types or data-views are present?
119 125
               assayIDs <- unique(S4Vectors::mcols(measurements)$assay)
... ...
@@ -515,18 +521,6 @@ Using an ordinary GLM instead.")
515 521
     classifier
516 522
 }
517 523
 
518
-######################################
519
-######################################
520
-#' A function to generate a CrossValParams object
521
-#'
522
-#' @inheritParams crossValidate
523
-#'
524
-#' @return CrossValParams object
525
-#' @export
526
-#'
527
-#' @examples
528
-#' CVparams <- generateCrossValParams(nRepeats = 20, nFolds = 5, nCores = 8, selectionOptimisation = "none")
529
-#' @import BiocParallel
530 524
 generateCrossValParams <- function(nRepeats, nFolds, nCores, selectionOptimisation){
531 525
 
532 526
     seed <- .Random.seed[1]
... ...
@@ -549,32 +543,7 @@ generateCrossValParams <- function(nRepeats, nFolds, nCores, selectionOptimisati
549 543
     if(!any(tuneMode %in% c("Resubstitution", "Nested CV", "none"))) stop("selectionOptimisation must be Nested CV or Resubstitution or none")
550 544
     CrossValParams(permutations = nRepeats, folds = nFolds, parallelParams = BPparam, tuneMode = tuneMode)
551 545
 }
552
-######################################
553 546
 
554
-######################################
555
-#' A function to generate a ModellingParams object
556
-#'
557
-#' @inheritParams crossValidate
558
-#' @param assayIDs A vector of data set identifiers as long at the number of data sets.
559
-#'
560
-#' @return ModellingParams object
561
-#' @export
562
-#'
563
-#' @examples
564
-#' data(asthma)
565
-#' # First make a toy example assay with multiple data types. We'll randomly assign different features to be clinical, gene or protein.
566
-#' set.seed(51773)
567
-#' measurements <- DataFrame(measurements, check.names = FALSE) 
568
-#' mcols(measurements)$assay <- c(rep("clinical",20),sample(c("gene", "protein"), ncol(measurements)-20, replace = TRUE))
569
-#' mcols(measurements)$feature <- colnames(measurements)
570
-#' modellingParams <- generateModellingParams(assayIDs = c("clinical", "gene", "protein"),
571
-#'                                           measurements = measurements, 
572
-#'                                           nFeatures = list(clinical = 10, gene = 10, protein = 10),
573
-#'                                           selectionMethod = list(clinical = "t-test", gene = "t-test", protein = "t-test"),
574
-#'                                           selectionOptimisation = "none",
575
-#'                                           classifier = "randomForest",
576
-#'                                           multiViewMethod = "merge")
577
-#' @import BiocParallel
578 547
 generateModellingParams <- function(assayIDs,
579 548
                                     measurements,
580 549
                                     nFeatures,
... ...
@@ -6,9 +6,11 @@ randomForestTrainInterface <- function(measurementsTrain, outcomeTrain, mTryProp
6 6
   if(verbose == 3)
7 7
     message("Fitting random forest classifier to training data.")
8 8
   mtry <- round(mTryProportion * ncol(measurementsTrain)) # Number of features to try.
9
-      
10 9
   # Convert to base data.frame as randomForest doesn't understand DataFrame.
11
-  ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, importance = "impurity_corrected", ...)
10
+  fittedModel <- ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, ...)
11
+  forImportance <- ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, importance = "impurity_corrected", ...)
12
+  attr(fittedModel, "forImportance") <- forImportance
13
+  fittedModel
12 14
 }
13 15
 attr(randomForestTrainInterface, "name") <- "randomForestTrainInterface"
14 16
     
... ...
@@ -37,7 +39,8 @@ randomForestPredictInterface <- function(forest, measurementsTest, ..., returnTy
37 39
 
38 40
 forestFeatures <- function(forest)
39 41
                   {
40
-                    rankedFeaturesIndices <- order(ranger::importance(forest), decreasing = TRUE)
41
-                    selectedFeaturesIndices <- which(ranger::importance(forest) > 0)
42
+                    forImportance <- attr(forest, "forImportance")
43
+                    rankedFeaturesIndices <- order(ranger::importance(forImportance), decreasing = TRUE)
44
+                    selectedFeaturesIndices <- which(ranger::importance(forImportance) > 0)
42 45
                     list(rankedFeaturesIndices, selectedFeaturesIndices)
43 46
                   }
44 47
\ No newline at end of file
... ...
@@ -5,8 +5,9 @@ rfsrcTrainInterface <- function(measurementsTrain, survivalTrain, mTryProportion
5 5
     stop("The package 'randomForestSRC' could not be found. Please install it.")
6 6
   if(verbose == 3)
7 7
     message("Fitting rfsrc classifier to training data and making predictions on test data.")
8
-    
9
-  bindedMeasurements <- cbind(measurementsTrain, event = survivalTrain[, 1], time = survivalTrain[, 2])
8
+
9
+  # Surv objects store survival information as a two-column table, time and event, in that order.    
10
+  bindedMeasurements <- cbind(measurementsTrain, time = survivalTrain[, 1], event = survivalTrain[, 2])
10 11
   mtry <- round(mTryProportion * ncol(measurementsTrain)) # Number of features to try.
11 12
   randomForestSRC::rfsrc(Surv(time, event) ~ ., data = as.data.frame(bindedMeasurements), mtry = mtry,
12 13
                           var.used = "all.trees", importance = TRUE, ...)
... ...
@@ -4,7 +4,7 @@ coxphRanking <- function(measurementsTrain, survivalTrain, verbose = 3) # Clinic
4 4
   
5 5
   pValues <- rep(NA, ncol(measurementsTrain))
6 6
   names(pValues) <- colnames(measurementsTrain)
7
-  
7
+
8 8
   isCat <- sapply(measurementsTrain, class) %in% c("character", "factor")
9 9
   if(any(isCat))
10 10
   {
... ...
@@ -16,7 +16,8 @@
16 16
 #' \code{matrix} or \code{\link{DataFrame}}, the rows are samples, and the columns are features.
17 17
 #' @param outcomeTrain Either a factor vector of classes, a \code{\link{Surv}} object, or
18 18
 #' a character string, or vector of such strings, containing column name(s) of column(s)
19
-#' containing either classes or time and event information about survival.
19
+#' containing either classes or time and event information about survival. If column names
20
+#' of survival information, time must be in first column and event status in the second.
20 21
 #' @param measurementsTest Same data type as \code{measurementsTrain}, but only the test
21 22
 #' samples.
22 23
 #' @param outcomeTest Same data type as \code{outcomeTrain}, but for only the test
... ...
@@ -256,12 +257,19 @@ input data. Autmomatically reducing to smaller number.")
256 257
     {
257 258
       if(is.null(modellingParams@trainParams@getFeatures))
258 259
       selectedFeatures <- originalFeatures[selectedFeaturesIndices]
259
-      else selectedFeatures <- colnames(measurementsTrain)[rankedFeaturesIndices] 
260
+      else selectedFeatures <- colnames(measurementsTrain)[selectedFeaturesIndices] 
260 261
     } else {
261 262
       featureColumns <- na.omit(match(c("assay", "feature"), colnames(S4Vectors::mcols(measurementsTrain))))
262
-      if(is.null(modellingParams@trainParams@getFeatures))
263
-      selectedFeatures <- originalFeatures[selectedFeaturesIndices, ]
264
-      else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns]
263
+      if(length(featureColumns) == 1)
264
+      {
265
+         if(is.null(modellingParams@trainParams@getFeatures))
266
+            selectedFeatures <- originalFeatures[selectedFeaturesIndices]
267
+         else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns]            
268
+      } else {
269
+                if(is.null(modellingParams@trainParams@getFeatures))
270
+                  selectedFeatures <- originalFeatures[selectedFeaturesIndices, ]
271
+                else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns]            
272
+      }
265 273
     }
266 274
     importanceTable <- S4Vectors::DataFrame(selectedFeatures, performanceChanges)
267 275
     if(ncol(importanceTable) == 2) colnames(importanceTable)[1] <- "feature"
... ...
@@ -272,6 +280,7 @@ input data. Autmomatically reducing to smaller number.")
272 280
   if(!is.null(tuneDetailsSelect)) tuneDetails <- tuneDetailsSelect else tuneDetails <- tuneDetailsTrain
273 281
 
274 282
   # Convert back into original, potentially unsafe feature identifiers unless it is a nested cross-validation.
283
+  
275 284
   if(is.null(.iteration) || .iteration != "internal")
276 285
   {
277 286
     if(!is.null(rankedFeaturesIndices))
... ...
@@ -283,9 +292,16 @@ input data. Autmomatically reducing to smaller number.")
283 292
         else rankedFeatures <- colnames(measurementsTrain)[rankedFeaturesIndices]            
284 293
       } else {
285 294
         featureColumns <- na.omit(match(c("assay", "feature"), colnames(S4Vectors::mcols(measurementsTrain))))          
286
-        if(is.null(modellingParams@trainParams@getFeatures))
287
-          rankedFeatures <- originalFeatures[rankedFeaturesIndices, ]
288
-        else rankedFeatures <- S4Vectors::mcols(measurementsTrain)[rankedFeaturesIndices, featureColumns]
295
+        if(length(featureColumns) == 1)
296
+        {
297
+          if(is.null(modellingParams@trainParams@getFeatures))
298
+            rankedFeatures <- originalFeatures[rankedFeaturesIndices]
299
+          else rankedFeatures <- S4Vectors::mcols(measurementsTrain)[rankedFeaturesIndices, featureColumns]
300
+        } else {
301
+          if(is.null(modellingParams@trainParams@getFeatures))
302
+            rankedFeatures <- originalFeatures[rankedFeaturesIndices, ]
303
+          else rankedFeatures <- S4Vectors::mcols(measurementsTrain)[rankedFeaturesIndices, featureColumns] 
304
+        }
289 305
       }
290 306
     } else { rankedFeatures <- NULL}
291 307
     if(!is.null(selectedFeaturesIndices))
... ...
@@ -297,9 +313,16 @@ input data. Autmomatically reducing to smaller number.")
297 313
         else selectedFeatures <- colnames(measurementsTrain)[selectedFeaturesIndices]
298 314
       } else {
299 315
         featureColumns <- na.omit(match(c("assay", "feature"), colnames(S4Vectors::mcols(measurementsTrain))))  
300
-        if(is.null(modellingParams@trainParams@getFeatures))
301
-          selectedFeatures <- originalFeatures[selectedFeaturesIndices, ]
302
-        else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns]
316
+        if(length(featureColumns) == 1)
317
+        {
318
+          if(is.null(modellingParams@trainParams@getFeatures))
319
+            selectedFeatures <- originalFeatures[selectedFeaturesIndices]
320
+          else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns]            
321
+        } else {
322
+                if(is.null(modellingParams@trainParams@getFeatures))
323
+                  selectedFeatures <- originalFeatures[selectedFeaturesIndices, ]
324
+                else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns]            
325
+        }
303 326
       }
304 327
     } else { selectedFeatures <- NULL}
305 328
   } else { # Nested use in feature selection. No feature selection in inner execution, so ignore features. 
... ...
@@ -16,7 +16,8 @@
16 16
 #' containing either classes or time and event information about survival. If
17 17
 #' \code{measurements} is a \code{MultiAssayExperiment}, the names of the column (class) or
18 18
 #' columns (survival) in the table extracted by \code{colData(data)} that contain(s) the samples'
19
-#' outcome to use for prediction.
19
+#' outcome to use for prediction. If column names of survival information, time must be in first
20
+#' column and event status in the second.
20 21
 #' @param crossValParams An object of class \code{\link{CrossValParams}},
21 22
 #' specifying the kind of cross-validation to be done.
22 23
 #' @param modellingParams An object of class \code{\link{ModellingParams}},
... ...
@@ -13,7 +13,7 @@
13 13
 #' a matrix of pre-calculated metrics, for backwards compatibility.
14 14
 #' @param classes If \code{results} is a matrix, this is a factor vector of the
15 15
 #' same length as the number of columns that \code{results} has.
16
-#' @param comparison Default: "Classifier Name". The aspect of the experimental
16
+#' @param comparison Default: "auto". The aspect of the experimental
17 17
 #' design to compare. Can be any characteristic that all results share.
18 18
 #' @param metric Default: "Sample Error". The sample-wise metric to plot.
19 19
 #' @param featureValues If not NULL, can be a named factor or named numeric
... ...
@@ -44,6 +44,8 @@
44 44
 #' @param legendSize The size of the boxes in the legends.
45 45
 #' @param plot Logical. IF \code{TRUE}, a plot is produced on the current
46 46
 #' graphics device.
47
+#' @param ... Parameters not used by the \code{ClassifyResult} method that does
48
+#' list-packaging but used by the main \code{list} method.
47 49
 #' @return A plot is produced and a grob is returned that can be saved to a
48 50
 #' graphics device.
49 51
 #' @author Dario Strbenac
... ...
@@ -82,11 +84,17 @@
82 84
 setGeneric("samplesMetricMap", function(results, ...)
83 85
 standardGeneric("samplesMetricMap"))
84 86
 
87
+#' @rdname samplesMetricMap
88
+#' @export
89
+setMethod("samplesMetricMap", "ClassifyResult", function(results, ...) {
90
+    samplesMetricMap(list(assay = results), ...)
91
+})
92
+
85 93
 #' @rdname samplesMetricMap
86 94
 #' @export
87 95
 setMethod("samplesMetricMap", "list", 
88 96
           function(results,
89
-                   comparison = "Classifier Name",
97
+                   comparison = "auto",
90 98
                    metric = c("Sample Error", "Sample Accuracy", "Sample C-index"),
91 99
                    featureValues = NULL, featureName = NULL,
92 100
                    metricColours = list(c("#3F48CC", "#6F75D8", "#9FA3E5", "#CFD1F2", "#FFFFFF"),
... ...
@@ -103,6 +111,20 @@ setMethod("samplesMetricMap", "list",
103 111
     stop("The package 'gridExtra' could not be found. Please install it.")       
104 112
   if(!requireNamespace("gtable", quietly = TRUE))
105 113
     stop("The package 'gtable' could not be found. Please install it.")
114
+  
115
+  characteristicsCounts <- table(unlist(lapply(results, function(result) result@characteristics[["characteristic"]])))
116
+  if(comparison == "auto")
117
+  {
118
+    if(max(characteristicsCounts) == length(results))
119
+    { # Choose a characteristic which varies the most across the results.
120
+      candidates <- names(characteristicsCounts)[characteristicsCounts == length(results)]
121
+      allCharacteristics <- do.call(rbind, lapply(results, function(result) result@characteristics))
122
+      distinctValues <- by(allCharacteristics[, "value"], allCharacteristics[, "characteristic"], function(values) length(unique(values)))
123
+      comparison <- names(distinctValues)[which.max(distinctValues)][1]
124
+    } else {
125
+      stop("No characteristic is present for all results but must be.")
126
+    }
127
+  }
106 128
   resultsWithComparison <- sum(sapply(results, function(result) any(result@characteristics[, "characteristic"] == comparison)))
107 129
   if(resultsWithComparison < length(results))
108 130
     stop("Not all results have comparison characteristic ", comparison, ' but need to.')
... ...
@@ -7,6 +7,7 @@
7 7
 \alias{calcExternalPerformance,factor,factor-method}
8 8
 \alias{calcExternalPerformance,Surv,numeric-method}
9 9
 \alias{calcCVperformance,ClassifyResult-method}
10
+\alias{calcExternalPerformance,factor,tabular-method}
10 11
 \title{Add Performance Calculations to a ClassifyResult Object or Calculate for a
11 12
 Pair of Factor Vectors}
12 13
 \usage{
... ...
@@ -24,6 +25,12 @@ Pair of Factor Vectors}
24 25
   performanceType = "C-index"
25 26
 )
26 27
 
28
+\S4method{calcExternalPerformance}{factor,tabular}(
29
+  actualOutcome,
30
+  predictedOutcome,
31
+  performanceType = "AUC"
32
+)
33
+
27 34
 \S4method{calcCVperformance}{ClassifyResult}(
28 35
   result,
29 36
   performanceType = c("Balanced Accuracy", "Balanced Error", "Error", "Accuracy",
... ...
@@ -131,7 +131,8 @@ or a list of these objects containing the data.}
131 131
 same length as the number of samples in \code{measurements} or a character vector of length 1 containing the
132 132
 column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of
133 133
 length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome. If \code{measurements} is a
134
-\code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.}
134
+\code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.  If column names
135
+of survival information, time must be in first column and event status in the second.}
135 136
 
136 137
 \item{...}{Parameters passed into \code{\link{prepareData}} which control subsetting and filtering of input data.}
137 138
 
... ...
@@ -140,15 +141,17 @@ or assays. If a numeric vector these will be optimised over using \code{selectio
140 141
 a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
141 142
 Set to NULL or "all" if all features should be used.}
142 143
 
143
-\item{selectionMethod}{A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
144
-and performing multiview classification, the respective classification methods will be used on each assay.}
144
+\item{selectionMethod}{Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
145
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
146
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.}
145 147
 
146 148
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
147 149
 
148 150
 \item{performanceType}{Performance metric to optimise if classifier has any tuning parameters.}
149 151
 
150
-\item{classifier}{A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
151
-and performing multiview classification, the respective classification methods will be used on each assay.}
152
+\item{classifier}{Default: \code{"auto"}. A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
153
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, then a random forest is used for a classification
154
+task or Cox proportional hazards model for a survival task.}
152 155
 
153 156
 \item{multiViewMethod}{A character vector specifying the multiview method or data integration approach to use.}
154 157
 
... ...
@@ -167,7 +170,8 @@ with each element being a vector of assays to combine. Special value \code{"all"
167 170
 
168 171
 \item{outcomeTrain}{For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or
169 172
 a character string, or vector of such strings, containing column name(s) of column(s)
170
-containing either classes or time and event information about survival.}
173
+containing either classes or time and event information about survival. If column names
174
+of survival information, time must be in first column and event status in the second.}
171 175
 
172 176
 \item{assayIDs}{A character vector for assays to train with. Special value \code{"all"}
173 177
 uses all assays in the input object.}
174 178
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/crossValidate.R
3
-\name{generateCrossValParams}
4
-\alias{generateCrossValParams}
5
-\title{A function to generate a CrossValParams object}
6
-\usage{
7
-generateCrossValParams(nRepeats, nFolds, nCores, selectionOptimisation)
8
-}
9
-\arguments{
10
-\item{nRepeats}{A numeric specifying the the number of repeats or permutations to use for cross-validation.}
11
-
12
-\item{nFolds}{A numeric specifying the number of folds to use for cross-validation.}
13
-
14
-\item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.}
15
-
16
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
17
-}
18
-\value{
19
-CrossValParams object
20
-}
21
-\description{
22
-A function to generate a CrossValParams object
23
-}
24
-\examples{
25
-CVparams <- generateCrossValParams(nRepeats = 20, nFolds = 5, nCores = 8, selectionOptimisation = "none")
26
-}
27 0
deleted file mode 100644
... ...
@@ -1,61 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/crossValidate.R
3
-\name{generateModellingParams}
4
-\alias{generateModellingParams}
5
-\title{A function to generate a ModellingParams object}
6
-\usage{
7
-generateModellingParams(
8
-  assayIDs,
9
-  measurements,
10
-  nFeatures,
11
-  selectionMethod,
12
-  selectionOptimisation,
13
-  performanceType = "auto",
14
-  classifier,
15
-  multiViewMethod = "none"
16
-)
17
-}
18
-\arguments{
19
-\item{assayIDs}{A vector of data set identifiers as long at the number of data sets.}
20
-
21
-\item{measurements}{Either a \code{\link{DataFrame}}, \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link{MultiAssayExperiment}} 
22
-or a list of these objects containing the data.}
23
-
24
-\item{nFeatures}{The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons
25
-or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, 
26
-a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
27
-Set to NULL or "all" if all features should be used.}
28
-
29
-\item{selectionMethod}{A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
30
-and performing multiview classification, the respective classification methods will be used on each assay.}
31
-
32
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
33
-
34
-\item{performanceType}{Performance metric to optimise if classifier has any tuning parameters.}
35
-
36
-\item{classifier}{A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, 
37
-and performing multiview classification, the respective classification methods will be used on each assay.}
38
-
39
-\item{multiViewMethod}{A character vector specifying the multiview method or data integration approach to use.}
40
-}
41
-\value{
42
-ModellingParams object
43
-}
44
-\description{
45
-A function to generate a ModellingParams object
46
-}
47
-\examples{
48
-data(asthma)
49
-# First make a toy example assay with multiple data types. We'll randomly assign different features to be clinical, gene or protein.
50
-set.seed(51773)
51
-measurements <- DataFrame(measurements, check.names = FALSE) 
52
-mcols(measurements)$assay <- c(rep("clinical",20),sample(c("gene", "protein"), ncol(measurements)-20, replace = TRUE))
53
-mcols(measurements)$feature <- colnames(measurements)
54
-modellingParams <- generateModellingParams(assayIDs = c("clinical", "gene", "protein"),
55
-                                          measurements = measurements, 
56
-                                          nFeatures = list(clinical = 10, gene = 10, protein = 10),
57
-                                          selectionMethod = list(clinical = "t-test", gene = "t-test", protein = "t-test"),
58
-                                          selectionOptimisation = "none",
59
-                                          classifier = "randomForest",
60
-                                          multiViewMethod = "merge")
61
-}
... ...
@@ -34,7 +34,8 @@ are passed into and used by the \code{DataFrame} method or passed onwards to \co
34 34
 
35 35
 \item{outcomeTrain}{Either a factor vector of classes, a \code{\link{Surv}} object, or
36 36
 a character string, or vector of such strings, containing column name(s) of column(s)
37
-containing either classes or time and event information about survival.}
37
+containing either classes or time and event information about survival. If column names
38
+of survival information, time must be in first column and event status in the second.}
38 39
 
39 40
 \item{measurementsTest}{Same data type as \code{measurementsTrain}, but only the test
40 41
 samples.}
... ...
@@ -35,7 +35,8 @@ a character string, or vector of such strings, containing column name(s) of colu
35 35
 containing either classes or time and event information about survival. If
36 36
 \code{measurements} is a \code{MultiAssayExperiment}, the names of the column (class) or
37 37
 columns (survival) in the table extracted by \code{colData(data)} that contain(s) the samples'
38
-outcome to use for prediction.}
38
+outcome to use for prediction. If column names of survival information, time must be in first
39
+column and event status in the second.}
39 40
 
40 41
 \item{crossValParams}{An object of class \code{\link{CrossValParams}},
41 42
 specifying the kind of cross-validation to be done.}
... ...
@@ -4,11 +4,14 @@
4 4
 \alias{samplesMetricMap}
5 5
 \alias{samplesMetricMap,list-method}
6 6
 \alias{samplesMetricMap,matrix-method}
7
+\alias{samplesMetricMap,ClassifyResult-method}
7 8
 \title{Plot a Grid of Sample Error Rates or Accuracies}
8 9
 \usage{
10
+\S4method{samplesMetricMap}{ClassifyResult}(results, ...)
11
+
9 12
 \S4method{samplesMetricMap}{list}(
10 13
   results,
11
-  comparison = "Classifier Name",
14
+  comparison = "auto",
12 15
   metric = c("Sample Error", "Sample Accuracy", "Sample C-index"),
13 16
   featureValues = NULL,
14 17
   featureName = NULL,
... ...
@@ -55,7 +58,10 @@
55 58
 \item{results}{A list of \code{\link{ClassifyResult}} objects. Could also be
56 59
 a matrix of pre-calculated metrics, for backwards compatibility.}
57 60
 
58
-\item{comparison}{Default: "Classifier Name". The aspect of the experimental
61
+\item{...}{Parameters not used by the \code{ClassifyResult} method that does
62
+list-packaging but used by the main \code{list} method.}
63
+
64
+\item{comparison}{Default: "auto". The aspect of the experimental
59 65
 design to compare. Can be any characteristic that all results share.}
60 66
 
61 67
 \item{metric}{Default: "Sample Error". The sample-wise metric to plot.}
... ...
@@ -1,35 +1,16 @@
1 1
 ---
2
-title: "An Introduction to **ClassifyR**"
2
+title: "Getting Started with ClassifyR"
3 3
 author: Dario Strbenac, Ellis Patrick, Graham Mann, Jean Yang, John Ormerod <br>
4 4
         The University of Sydney, Australia.
5 5
 output: 
6 6
   BiocStyle::html_document:
7
+    css: style.css
7 8
     toc: true
8 9
 vignette: >
9 10
   %\VignetteEngine{knitr::rmarkdown}
10 11
   %\VignetteIndexEntry{An Introduction to the ClassifyR Package}
11 12
 ---
12 13
 
13
-<style>
14
-    body .main-container {
15
-        max-width: 1600px;
16
-    }
17
-    p {
18
-      padding: 20px;
19
-    }
20
-    .table {
21
-      border: 2px solid #e64626;
22
-    }
23
-    .table>thead>tr>th
24
-    {
25
-     border-bottom: 2px solid #e64626;
26
-     border-right: 1px solid black;
27
-    }
28
-    .table>tbody>tr>td {
29
-     border-right: 1px solid black;
30
-    }    
31
-</style>
32
-
33 14
 ```{r, echo = FALSE, results = "asis"}
34 15
 options(width = 130)
35 16
 ```
... ...
@@ -50,7 +31,7 @@ Driver functions can use parallel processing capabilities in R to speed up cross
50 31
 
51 32
 
52 33
 ```{r, echo = FALSE}
53
-htmltools::img(src = knitr::image_uri("ClassifyRprocedure.png"), 
34
+htmltools::img(src = knitr::image_uri("images/ClassifyRprocedure.png"), 
54 35
                style = 'margin-left: auto;margin-right: auto')
55 36
 ```
56 37
 
... ...
@@ -90,7 +71,7 @@ Data Type | $n \times p$ | $p \times n$
90 71
 <span style="font-family: 'Courier New', monospace;">MultiAssayExperiment</span> |  | ✔
91 72
 <span style="font-family: 'Courier New', monospace;">list</span> of <span style="font-family: 'Courier New', monospace;">data.frame</span>s | ✔ | 
92 73
 
93
-*crossValidate* must also be supplied with *outcomes*, which represents the prediction to be made in a variety of possible ways.
74
+*crossValidate* must also be supplied with *outcome*, which represents the prediction to be made in a variety of possible ways.
94 75
 
95 76
 * A *factor* that contains the class label for each observation. *classes* must be of length $n$.
96 77
 * A *character* of length 1 that matches a column name in a data frame which holds the classes. The classes will automatically be removed before training is done.
97 78
new file mode 100755
98 79
Binary files /dev/null and b/vignettes/images/ClassifyRprocedure.png differ
99 80
new file mode 100644
... ...
@@ -0,0 +1,9 @@
1
+body .main-container {max-width: 1600px;}
2
+p {padding: 20px;}
3
+.table {border: 2px solid #e64626;}
4
+.table>thead>tr>th
5
+{
6
+  border-bottom: 2px solid #e64626;
7
+  border-right: 1px solid black;
8
+}
9
+.table>tbody>tr>td {border-right: 1px solid black;}    
0 10
\ No newline at end of file