Browse code

- randomSelection function added. - crissCrossValidate and crissCrossPlot contributions by Harry Robertson added and harmonised to ClassifyR code style. - selectionMethod and classifier defaults become "auto" for crossValidate. Previously only documented but not implemented in code. - predict method for standalone use now finds the correct prediction method for each trained model.

Dario Strbenac authored on 08/12/2022 06:15:07
Showing1 changed files
... ...
@@ -129,10 +129,12 @@
129 129
       tuneCombo <- tuneCombosSelect[rowIndex, , drop = FALSE]
130 130
       if(tuneCombo != "none") # Add real parameters before function call.
131 131
         paramList <- append(paramList, tuneCombo)
132
+      if(attr(featureRanking, "name") == "randomSelection")
133
+        paramList <- append(paramList, nFeatures = topNfeatures)
132 134
       do.call(featureRanking, paramList)
133 135
     })
134 136
 
135
-    if(attr(featureRanking, "name") %in% c("previousSelection", "Union Selection")) # Actually selection not ranking.
137
+    if(attr(featureRanking, "name") %in% c("randomSelection", "previousSelection", "Union Selection")) # Actually selection not ranking.
136 138
       return(list(NULL, rankings[[1]], NULL))
137 139
     
138 140
     if(crossValParams@tuneMode == "none") # No parameters to choose between.
... ...
@@ -509,6 +511,8 @@
509 511
         "KS" = KolmogorovSmirnovRanking,
510 512
         "KL" = KullbackLeiblerRanking,
511 513
         "CoxPH" = coxphRanking,
514
+        "previousSelection" = previousSelection,
515
+        "randomSelection" = randomSelection,
512 516
         "selectMulti" = selectMulti
513 517
     )
514 518
 }
Browse code

- .doSelection modified so that feature selection doesn't happen twice for selectMulti function. - selectMulti fixed so correct indices are returned relative to the whole training data table, not to an individual assay table.

Dario Strbenac authored on 22/11/2022 04:52:30
Showing1 changed files
... ...
@@ -131,8 +131,8 @@
131 131
         paramList <- append(paramList, tuneCombo)
132 132
       do.call(featureRanking, paramList)
133 133
     })
134
-    
135
-    if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
134
+
135
+    if(attr(featureRanking, "name") %in% c("previousSelection", "Union Selection")) # Actually selection not ranking.
136 136
       return(list(NULL, rankings[[1]], NULL))
137 137
     
138 138
     if(crossValParams@tuneMode == "none") # No parameters to choose between.
Browse code

- Minor fixes.

Dario Strbenac authored on 17/10/2022 03:30:05
Showing1 changed files
... ...
@@ -139,6 +139,12 @@
139 139
         return(list(NULL, rankings[[1]], NULL))
140 140
     
141 141
     tuneParamsTrain <- list(topN = topNfeatures)
142
+    performanceIndex <- match("performanceType", names(modellingParams@trainParams@tuneParams))
143
+    if(!is.na(performanceIndex))
144
+    {
145
+      performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
146
+      modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-performanceIndex]
147
+    }
142 148
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
143 149
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
144 150
     modellingParams@trainParams@tuneParams <- NULL
Browse code

- Minor fixes to standalone train and predict functions for list of inputs.

Dario Strbenac authored on 14/10/2022 06:45:03
Showing1 changed files
... ...
@@ -140,9 +140,9 @@
140 140
     
141 141
     tuneParamsTrain <- list(topN = topNfeatures)
142 142
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
143
-    tuneParamsTrain <- tuneParamsTrain[-match("performanceType", names(tuneParamsTrain))]
144 143
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
145 144
     modellingParams@trainParams@tuneParams <- NULL
145
+    
146 146
     allPerformanceTables <- lapply(rankings, function(rankingsVariety)
147 147
     {
148 148
       # Creates a matrix. Columns are top n features, rows are varieties (one row if None).
Browse code

- Typo in train.DataFrame variable fixed. - .predict for DLDA made renamed to DLDA to enable easy dispatch by predict method. - randomForest wrapper now uses ranger as the underlying package instead of randomForest.

Dario Strbenac authored on 14/10/2022 00:30:12
Showing1 changed files
... ...
@@ -167,7 +167,7 @@
167 167
           result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
168 168
                             crossValParams = NULL, modellingParams = modellingParams,
169 169
                             verbose = verbose, .iteration = "internal")
170
-          
170
+
171 171
           predictions <- result[["predictions"]]
172 172
           # Classifiers will use a column "class" and survival models will use a column "risk".
173 173
           if(class(predictions) == "data.frame")
... ...
@@ -222,6 +222,7 @@
222 222
                             measurementsTrain, outcomeTrain,
223 223
                             crossValParams = NULL, modellingParams,
224 224
                             verbose = verbose, .iteration = "internal")
225
+
225 226
           predictions <- result[["predictions"]]
226 227
           if(class(predictions) == "data.frame")
227 228
             predictedOutcome <- predictions[, "class"]
... ...
@@ -275,7 +276,7 @@
275 276
         result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
276 277
                           crossValParams = NULL, modellingParams,
277 278
                           verbose = verbose, .iteration = "internal")
278
-        
279
+
279 280
         predictions <- result[["predictions"]]
280 281
         if(class(predictions) == "data.frame")
281 282
           predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
... ...
@@ -580,7 +581,8 @@
580 581
   obj
581 582
 }
582 583
 
583
-.predict <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN.
584
+#' @method predict dlda
585
+predict.dlda <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN.
584 586
   if (!inherits(object, "dlda"))  {
585 587
     stop("object not of class 'dlda'")
586 588
   }
Browse code

- Extreme gradient boosting wrapper added in interfaceXGB.R. - crossValidate gains performanceType parameter that defaults to "auto" which chooses depending on classification or survival task or can be set to any user-specified performance metric. - SelectParams default performanceType is now balanced accuracy, for consistency with other functions. - Vignette text updated to refer to balanced accuracy and to add k-NN and XGB classifiers to classifier table. - NEWS file updated for upcoming Bioconductor release.

Dario Strbenac authored on 12/10/2022 13:05:23
Showing1 changed files
... ...
@@ -140,6 +140,7 @@
140 140
     
141 141
     tuneParamsTrain <- list(topN = topNfeatures)
142 142
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
143
+    tuneParamsTrain <- tuneParamsTrain[-match("performanceType", names(tuneParamsTrain))]
143 144
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
144 145
     modellingParams@trainParams@tuneParams <- NULL
145 146
     allPerformanceTables <- lapply(rankings, function(rankingsVariety)
... ...
@@ -511,6 +512,7 @@
511 512
         keyword,
512 513
         "randomForest" = RFparams(),
513 514
         "randomSurvivalForest" = RSFparams(),
515
+        "XGB" = XGBparams(),
514 516
         "GLM" = GLMparams(),
515 517
         "elasticNetGLM" = elasticNetGLMparams(),
516 518
         "SVM" = SVMparams(),
Browse code

- Restored utilities.R.

Dario Strbenac authored on 07/09/2022 00:48:49
Showing1 changed files
... ...
@@ -1,152 +1,3 @@
1
-# Operates on an input data frame, to extract the outcome column(s) and return
2
-# a list with the table of covariates in one element and the outcome in another.
3
-# The outcome need to be removed from the data table before predictor training!
4
-.splitDataAndOutcome <- function(measurements, outcome, restrict = NULL)
5
-{ # DataFrame's outcome variable can be character or factor, so it's a bit involved.
6
-  if(is.character(outcome) && length(outcome) > 3 && length(outcome) != nrow(measurements))
7
-    stop("'outcome' is a character variable but has more than one element. Either provide a\n",
8
-         "       one to three column names or a factor of the same length as the number of samples.")
9
-
10
-  ## String specifies the name of a single outcome column, typically a class.
11
-  if(is.character(outcome) && length(outcome) == 1)
12
-  {
13
-    outcomeColumn <- match(outcome, colnames(measurements))
14
-    if(is.na(outcomeColumn))
15
-      stop("Specified column name of outcome is not present in the data table.")
16
-    outcome <- measurements[, outcomeColumn]
17
-    measurements <- measurements[, -outcomeColumn, drop = FALSE]
18
-    # R version 4 and greater no longer automatically casts character columns to factors because stringsAsFactors
19
-    # is FALSE by default, so it is more likely to be character format these days. Handle it.
20
-    if(class(outcome) != "factor") # Assume there will be no ordinary regression prediction tasks ... for now.
21
-      outcome <- factor(outcome)
22
-  }
23
-  
24
-  # survival's Surv constructor has two inputs for the popular right-censored data and
25
-  # three inputs for less-common interval data.
26
-  if(is.character(outcome) && length(outcome) %in% 2:3)
27
-  {
28
-    outcomeColumns <- match(outcome, colnames(measurements))
29
-    if(any(is.na(outcomeColumns)))
30
-      stop("Specified column names of outcome is not present in the data table.")
31
-    outcome <- measurements[, outcomeColumns]
32
-    measurements <- measurements[, -outcomeColumns, drop = FALSE]
33
-  }
34
-  
35
-  if(is(outcome, "factor") && length(outcome) > 3 & length(outcome) < nrow(measurements))
36
-    stop("The length of outcome is not equal to the number of samples.")
37
-  
38
-  ## A vector of characters was input by the user. Ensure that it is a factor.
39
-  if(is.character(outcome) & length(outcome) == nrow(measurements))
40
-    outcome <- factor(outcome)
41
-  
42
-  # Outcome has columns, so it is tabular. It is inferred to represent survival data.
43
-  if(!is.null(ncol(outcome)) && ncol(outcome) %in% 2:3)
44
-  {
45
-    # Assume that event status is in the last column (second for two columns, third for three columns)
46
-    numberEventTypes <- length(unique(outcome[, ncol(outcome)]))
47
-    # Could be one or two kinds of events. All events might be uncensored or censored
48
-    # in a rare but not impossible scenario.
49
-    if(numberEventTypes > 2)
50
-      stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.")
51
-      
52
-    if(ncol(outcome) == 2) # Typical, right-censored survival data.
53
-      outcome <- survival::Surv(outcome[, 1], outcome[, 2])
54
-    else # Three columns. Therefore, counting process data.
55
-      outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
56
-  }
57
-  
58
-  if(!is.null(restrict))
59
-  {
60
-    isDesiredClass <- sapply(measurements, function(column) is(column, restrict))
61
-    measurements <- measurements[, isDesiredClass, drop = FALSE]
62
-    if(ncol(measurements) == 0)
63
-      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
64
-  }
65
-
66
-  list(measurements = measurements, outcome = outcome)
67
-}
68
-
69
-# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it
70
-# to be used in typical model building functions.
71
-# Returns a list with a covariate table and and outcome vector/table, or just a covariate table
72
-# in the case the input is a test data set.
73
-.MAEtoWideTable <- function(measurements, targets = NULL, outcomeColumns = NULL, restrict = "numeric")
74
-{
75
-  if(is.null(targets))
76
-    stop("'targets' is not specified but must be.")
77
-  if(is.null(outcomeColumns))
78
-    stop("'outcomeColumns' is not specified but must be.")    
79
-  if(!all(targets %in% c(names(measurements), "sampleInfo")))
80
-    stop("Some table names in 'targets' are not assay names in 'measurements' or \"sampleInfo\".")
81
-  sampleInfoColumns <- colnames(MultiAssayExperiment::colData(measurements))
82
-  if(!missing(outcomeColumns) & !all(outcomeColumns %in% sampleInfoColumns))
83
-    stop("Not all column names specified by 'outcomeColumns' found in sample information table.")  
84
-
85
-  if("sampleInfo" %in% targets)
86
-  {
87
-    targets <- targets[targets != "sampleInfo"]
88
-    sampleInfoColumnsTrain <- sampleInfoColumns
89
-  } else {
90
-    sampleInfoColumnsTrain <- NULL
91
-  }
92
-  
93
-  if(length(targets) > 0)
94
-  {
95
-    measurements <- measurements[, , targets]
96
-  
97
-    # Get all desired measurements tables and sample information columns (other than the columns representing outcome).
98
-    # These form the independent variables to be used for making predictions with.
99
-    # Variable names will have names like RNA:BRAF for traceability.
100
-    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(sampleInfoColumnsTrain, outcomeColumns), check.names = FALSE, collapse = ':')
101
-    rownames(dataTable) <- dataTable[, "primary"]
102
-    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "sampleInfo", S4Vectors::mcols(dataTable)[, "sourceName"])
103
-    colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
104
-  
105
-    # Sample information variable names not included in column metadata of wide table but only as row names of it.
106
-    # Create a combined column named "feature" which has feature names of the assays as well as the sample information.
107
-    S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
108
-    missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
109
-    S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
110
-    
111
-    # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
112
-    S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
113
-  } else { # Must have only been sample information data.
114
-    dataTable <- MultiAssayExperiment::colData(measurements)
115
-  }
116
-  if(!is.null(outcomeColumns)) outcome <- dataTable[, outcomeColumns]
117
-  
118
-  if(!is.null(restrict))
119
-  {
120
-    isDesiredClass <- sapply(dataTable, function(column) is(column, restrict))
121
-    dataTable <- dataTable[, isDesiredClass, drop = FALSE]
122
-    if(ncol(dataTable) == 0)
123
-      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
124
-  }
125
-
126
-  # Only return independent variables in dataTable for making classifications with.
127
-  # "primary" column is auto-generated by sample information table row names and a duplicate.
128
-  dropColumns <- na.omit(match(c("primary", outcomeColumns), colnames(dataTable)))
129
-  if(length(dropColumns) > 0) dataTable <- dataTable[, -dropColumns]
130
-  
131
-  # Training data table and outcome for training data.
132
-  if(!is.null(outcomeColumns))
133
-      list(dataTable = dataTable, outcome = outcome)
134
-  else # Only test data table for test data input.
135
-    dataTable
136
-}
137
-
138
-# For classifiers which use one single function for inputting a training and a testing table,
139
-# and work only for the numeric data type, this checks whether the training and testing tables 
140
-# both have the same set of features and there are at least some numeric features to use,
141
-# after they have been filtered by another function which splits the covariates and the outcome from the input.
142
-.checkVariablesAndSame <- function(trainingMatrix, testingMatrix)
143
-{
144
-  if(ncol(trainingMatrix) == 0) # Filtering of table removed all columns, leaving nothing to classify with.
145
-    stop("No variables in data tables specified by \'targets\' are numeric.")
146
-  else if(ncol(trainingMatrix) != ncol(testingMatrix))
147
-    stop("Training data set and testing data set contain differing numbers of features.")  
148
-}
149
-
150 1
 # Creates two lists of lists. First has training samples, second has test samples for a range
151 2
 # of different cross-validation schemes.
152 3
 #' @import utils
... ...
@@ -255,7 +106,6 @@
255 106
   tuneParams <- modellingParams@selectParams@tuneParams
256 107
   performanceType <- tuneParams[["performanceType"]]
257 108
   topNfeatures <- tuneParams[["nFeatures"]]
258
-  tuneMode <- ifelse("tuneMode" %in% names(tuneParams), tuneParams[["tuneMode"]], crossValParams@tuneMode)
259 109
   tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))] # Only used as evaluation metric.
260 110
   
261 111
   # Make selectParams NULL, since we are currently doing selection and it shouldn't call
... ...
@@ -282,17 +132,17 @@
282 132
       do.call(featureRanking, paramList)
283 133
     })
284 134
     
285
-    if(featureRanking@generic == "previousSelection") # Actually selection not ranking.
135
+    if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
286 136
       return(list(NULL, rankings[[1]], NULL))
287 137
     
288
-    if(tuneMode == "none") # Actually selection not ranking.
138
+    if(crossValParams@tuneMode == "none") # No parameters to choose between.
289 139
         return(list(NULL, rankings[[1]], NULL))
290 140
     
291 141
     tuneParamsTrain <- list(topN = topNfeatures)
292 142
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
293 143
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
294 144
     modellingParams@trainParams@tuneParams <- NULL
295
-    bestPerformers <- sapply(rankings, function(rankingsVariety)
145
+    allPerformanceTables <- lapply(rankings, function(rankingsVariety)
296 146
     {
297 147
       # Creates a matrix. Columns are top n features, rows are varieties (one row if None).
298 148
       performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex)
... ...
@@ -332,20 +182,22 @@
332 182
        })
333 183
 
334 184
         bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
335
-        c(bestOne, performances[bestOne])
185
+        list(data.frame(tuneCombosTrain, performance = performances), bestOne)
336 186
       })
337 187
 
338
-      tunePick <- ifelse(betterValues == "lower", which.min(bestPerformers[2, ])[1], which.max(bestPerformers[2, ])[1])
188
+      tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"])
189
+      tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1])
339 190
       
340 191
       if(verbose == 3)
341 192
          message("Features selected.")
342 193
       
343
-      tuneRow <- tuneCombosTrain[bestPerformers[1, tunePick], , drop  = FALSE]
344
-      if(ncol(tuneRow) > 1) tuneDetails <- tuneRow[, -1, drop = FALSE] else tuneDetails <- NULL
194
+      tuneDetails <- allPerformanceTables[[tunePick]] # List of length 2.
345 195
       
346 196
       rankingUse <- rankings[[tunePick]]
347
-      selectionIndices <- rankingUse[1:tuneRow[, "topN"]]
197
+      selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])]
348 198
       
199
+      names(tuneDetails) <- c("tuneCombinations", "bestIndex")
200
+      colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType
349 201
       list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails)
350 202
     } else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection.
351 203
       featuresIndiciesLists <- mapply(function(selector, selParams)
... ...
@@ -388,8 +240,7 @@
388 240
       
389 241
       list(NULL, selectionIndices, NULL)
390 242
     } else { # Previous selection
391
-      selectedFeatures <- 
392
-      list(NULL, selectionIndices, NULL)
243
+      selectedFeatures <- list(NULL, selectionIndices, NULL)
393 244
     }
394 245
 }
395 246
 
... ...
@@ -405,10 +256,10 @@
405 256
 
406 257
 # Code to create a function call to a training function. Might also do training and testing
407 258
 # within the same function, so test samples are also passed in case they are needed.
408
-.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose)
259
+.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose)
409 260
 {
410
-  tuneChosen <- NULL
411
-  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams@tuneParams))
261
+  tuneDetails <- NULL
262
+  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
412 263
   {
413 264
     performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
414 265
     modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))]
... ...
@@ -420,16 +271,16 @@
420 271
       modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ]))
421 272
       if(crossValParams@tuneMode == "Resubstitution")
422 273
       {
423
-        result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest,
274
+        result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
424 275
                           crossValParams = NULL, modellingParams,
425 276
                           verbose = verbose, .iteration = "internal")
426 277
         
427 278
         predictions <- result[["predictions"]]
428 279
         if(class(predictions) == "data.frame")
429
-          predictedOutcome <- predictions[, "outcome"]
280
+          predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
430 281
         else
431 282
           predictedOutcome <- predictions
432
-        calcExternalPerformance(outcomeTest, predictedOutcome, performanceType)
283
+        calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
433 284
       } else {
434 285
         result <- runTests(measurementsTrain, outcomeTrain,
435 286
                            crossValParams, modellingParams,
... ...
@@ -438,13 +289,18 @@
438 289
         median(performances(result)[[performanceType]])
439 290
       }
440 291
     })
292
+    allPerformanceTable <- data.frame(tuneCombos, performances)
293
+    colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType
294
+    
441 295
     betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"]
442 296
     bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
443 297
     tuneChosen <- tuneCombos[bestOne, , drop = FALSE]
298
+    tuneDetails <- list(tuneCombos, bestOne)
299
+    names(tuneDetails) <- c("tuneCombinations", "bestIndex")
444 300
     modellingParams@trainParams@otherParams <- tuneChosen
445 301
   }
446 302
 
447
-  if(modellingParams@trainParams@classifier@generic != "previousTrained")
303
+    if (!"previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")) 
448 304
     # Don't name these first two variables. Some classifier functions might use classesTrain and others use outcomeTrain.
449 305
     paramList <- list(measurementsTrain, outcomeTrain)
450 306
   else # Don't pass the measurements and classes, because a pre-existing classifier is used.
... ...
@@ -459,7 +315,7 @@
459 315
   if(verbose >= 2)
460 316
     message("Training completed.")  
461 317
   
462
-  list(model = trained, tune = tuneChosen)
318
+  list(model = trained, tune = tuneDetails)
463 319
 }
464 320
 
465 321
 # Creates a function call to a prediction function.
... ...
@@ -528,15 +384,6 @@
528 384
 # by user-specified values.
529 385
 .filterCharacteristics <- function(characteristics, autoCharacteristics)
530 386
 {
531
-  # Remove duplication of values for classifiers that have one function for training and 
532
-  # one function for prediction.
533
-  if("Classifier Name" %in% autoCharacteristics[, "characteristic"] && "Predictor Name" %in% autoCharacteristics[, "characteristic"])
534
-  {
535
-    classRow <- which(autoCharacteristics[, "characteristic"] == "Classifier Name")
536
-    predRow <- which(autoCharacteristics[, "characteristic"] == "Predictor Name")
537
-    if(autoCharacteristics[classRow, "value"] == autoCharacteristics[predRow, "value"])
538
-      autoCharacteristics <- autoCharacteristics[-predRow, ]
539
-  }
540 387
   # Overwrite automatically-chosen names with user's names.
541 388
   if(nrow(autoCharacteristics) > 0 && nrow(characteristics) > 0)
542 389
   {
... ...
@@ -560,36 +407,6 @@
560 407
   plotData
561 408
 }
562 409
 
563
-# Summary of the features used and the total number of them, no matter if they are a simple type
564
-# or something more complex like Pairs or feature sets.
565
-.summaryFeatures <- function(measurements)
566
-{
567
-  # MultiAssayExperiment has feature details in mcols.
568
-  if(!is.null(S4Vectors::mcols(measurements)))
569
-  {
570
-    originalInfo <- S4Vectors::mcols(measurements)
571
-    featureNames <- S4Vectors::mcols(measurements)[, "feature"]
572
-    assays <- unique(S4Vectors::mcols(measurements)[, "assay"])
573
-    renamedInfo <- S4Vectors::mcols(measurements)
574
-    renamedAssays <- paste("Assay", seq_along(assays), sep = '')
575
-    for(assay in assays)
576
-    {
577
-      rowsAssay <- which(renamedInfo[, "assay"] == assay)
578
-      renamedInfo[rowsAssay, "feature"] <- paste("Feature", seq_along(rowsAssay), sep = '')
579
-      renamedInfo[rowsAssay, "assay"] <- renamedAssays[match(assay, assays)]
580
-    }
581
-    featuresInfo <- S4Vectors::DataFrame(originalInfo, renamedInfo)
582
-    colnames(featuresInfo) <- c("Original Assay", "Original Feature", "Renamed Assay", "Renamed Feature")
583
-    featuresInfo <- cbind(originalInfo, featuresInfo)
584
-  } else {
585
-    originalFeatures <- colnames(measurements)
586
-    renamedInfo <- paste("Feature", seq_along(measurements), sep = '')
587
-    featuresInfo <- S4Vectors::DataFrame(originalFeatures, renamedInfo)
588
-    colnames(featuresInfo) <- c("Original Feature", "Renamed Feature")
589
-  }
590
-  featuresInfo
591
-}
592
-
593 410
 # Function to identify the parameters of an S4 method.
594 411
 .methodFormals <- function(f, signature) {
595 412
   tryCatch({
... ...
@@ -660,6 +477,53 @@
660 477
   list(measurementsTrain = measurementsTrain, classesTrain = classesTrain)
661 478
 }
662 479
 
480
+.transformKeywordToFunction <- function(keyword)
481
+{
482
+  switch(
483
+        keyword,
484
+        "none" = NULL,
485
+        "diffLoc" = subtractFromLocation
486
+    )
487
+}
488
+
489
+.selectionKeywordToFunction <- function(keyword)
490
+{
491
+  switch(
492
+        keyword,
493
+        "none" = NULL,
494
+        "t-test" = differentMeansRanking,
495
+        "limma" = limmaRanking,
496
+        "edgeR" = edgeRranking,
497
+        "Bartlett" = bartlettRanking,
498
+        "Levene" = leveneRanking,
499
+        "DMD" = DMDranking,
500
+        "likelihoodRatio" = likelihoodRatioRanking,
501
+        "KS" = KolmogorovSmirnovRanking,
502
+        "KL" = KullbackLeiblerRanking,
503
+        "CoxPH" = coxphRanking,
504
+        "selectMulti" = selectMulti
505
+    )
506
+}
507
+
508
+.classifierKeywordToParams <- function(keyword)
509
+{
510
+    switch(
511
+        keyword,
512
+        "randomForest" = RFparams(),
513
+        "randomSurvivalForest" = RSFparams(),
514
+        "GLM" = GLMparams(),
515
+        "elasticNetGLM" = elasticNetGLMparams(),
516
+        "SVM" = SVMparams(),
517
+        "NSC" = NSCparams(),
518
+        "DLDA" = DLDAparams(),
519
+        "naiveBayes" = naiveBayesParams(),
520
+        "mixturesNormals" = mixModelsParams(),
521
+        "kNN" = kNNparams(),
522
+        "CoxPH" = coxphParams(),
523
+        "CoxNet" = coxnetParams()
524
+    )    
525
+}
526
+
663 527
 .dlda <- function(x, y, prior = NULL){ # Remove this once sparsediscrim is reinstated to CRAN.
664 528
   obj <- list()
665 529
   obj$labels <- y
Browse code

export colCoxTests

Ellis Patrick authored on 06/09/2022 22:57:36
Showing1 changed files
... ...
@@ -1,3 +1,152 @@
1
+# Operates on an input data frame, to extract the outcome column(s) and return
2
+# a list with the table of covariates in one element and the outcome in another.
3
+# The outcome need to be removed from the data table before predictor training!
4
+.splitDataAndOutcome <- function(measurements, outcome, restrict = NULL)
5
+{ # DataFrame's outcome variable can be character or factor, so it's a bit involved.
6
+  if(is.character(outcome) && length(outcome) > 3 && length(outcome) != nrow(measurements))
7
+    stop("'outcome' is a character variable but has more than one element. Either provide a\n",
8
+         "       one to three column names or a factor of the same length as the number of samples.")
9
+
10
+  ## String specifies the name of a single outcome column, typically a class.
11
+  if(is.character(outcome) && length(outcome) == 1)
12
+  {
13
+    outcomeColumn <- match(outcome, colnames(measurements))
14
+    if(is.na(outcomeColumn))
15
+      stop("Specified column name of outcome is not present in the data table.")
16
+    outcome <- measurements[, outcomeColumn]
17
+    measurements <- measurements[, -outcomeColumn, drop = FALSE]
18
+    # R version 4 and greater no longer automatically casts character columns to factors because stringsAsFactors
19
+    # is FALSE by default, so it is more likely to be character format these days. Handle it.
20
+    if(class(outcome) != "factor") # Assume there will be no ordinary regression prediction tasks ... for now.
21
+      outcome <- factor(outcome)
22
+  }
23
+  
24
+  # survival's Surv constructor has two inputs for the popular right-censored data and
25
+  # three inputs for less-common interval data.
26
+  if(is.character(outcome) && length(outcome) %in% 2:3)
27
+  {
28
+    outcomeColumns <- match(outcome, colnames(measurements))
29
+    if(any(is.na(outcomeColumns)))
30
+      stop("Specified column names of outcome is not present in the data table.")
31
+    outcome <- measurements[, outcomeColumns]
32
+    measurements <- measurements[, -outcomeColumns, drop = FALSE]
33
+  }
34
+  
35
+  if(is(outcome, "factor") && length(outcome) > 3 & length(outcome) < nrow(measurements))
36
+    stop("The length of outcome is not equal to the number of samples.")
37
+  
38
+  ## A vector of characters was input by the user. Ensure that it is a factor.
39
+  if(is.character(outcome) & length(outcome) == nrow(measurements))
40
+    outcome <- factor(outcome)
41
+  
42
+  # Outcome has columns, so it is tabular. It is inferred to represent survival data.
43
+  if(!is.null(ncol(outcome)) && ncol(outcome) %in% 2:3)
44
+  {
45
+    # Assume that event status is in the last column (second for two columns, third for three columns)
46
+    numberEventTypes <- length(unique(outcome[, ncol(outcome)]))
47
+    # Could be one or two kinds of events. All events might be uncensored or censored
48
+    # in a rare but not impossible scenario.
49
+    if(numberEventTypes > 2)
50
+      stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.")
51
+      
52
+    if(ncol(outcome) == 2) # Typical, right-censored survival data.
53
+      outcome <- survival::Surv(outcome[, 1], outcome[, 2])
54
+    else # Three columns. Therefore, counting process data.
55
+      outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
56
+  }
57
+  
58
+  if(!is.null(restrict))
59
+  {
60
+    isDesiredClass <- sapply(measurements, function(column) is(column, restrict))
61
+    measurements <- measurements[, isDesiredClass, drop = FALSE]
62
+    if(ncol(measurements) == 0)
63
+      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
64
+  }
65
+
66
+  list(measurements = measurements, outcome = outcome)
67
+}
68
+
69
+# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it
70
+# to be used in typical model building functions.
71
+# Returns a list with a covariate table and and outcome vector/table, or just a covariate table
72
+# in the case the input is a test data set.
73
+.MAEtoWideTable <- function(measurements, targets = NULL, outcomeColumns = NULL, restrict = "numeric")
74
+{
75
+  if(is.null(targets))
76
+    stop("'targets' is not specified but must be.")
77
+  if(is.null(outcomeColumns))
78
+    stop("'outcomeColumns' is not specified but must be.")    
79
+  if(!all(targets %in% c(names(measurements), "sampleInfo")))
80
+    stop("Some table names in 'targets' are not assay names in 'measurements' or \"sampleInfo\".")
81
+  sampleInfoColumns <- colnames(MultiAssayExperiment::colData(measurements))
82
+  if(!missing(outcomeColumns) & !all(outcomeColumns %in% sampleInfoColumns))
83
+    stop("Not all column names specified by 'outcomeColumns' found in sample information table.")  
84
+
85
+  if("sampleInfo" %in% targets)
86
+  {
87
+    targets <- targets[targets != "sampleInfo"]
88
+    sampleInfoColumnsTrain <- sampleInfoColumns
89
+  } else {
90
+    sampleInfoColumnsTrain <- NULL
91
+  }
92
+  
93
+  if(length(targets) > 0)
94
+  {
95
+    measurements <- measurements[, , targets]
96
+  
97
+    # Get all desired measurements tables and sample information columns (other than the columns representing outcome).
98
+    # These form the independent variables to be used for making predictions with.
99
+    # Variable names will have names like RNA:BRAF for traceability.
100
+    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(sampleInfoColumnsTrain, outcomeColumns), check.names = FALSE, collapse = ':')
101
+    rownames(dataTable) <- dataTable[, "primary"]
102
+    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "sampleInfo", S4Vectors::mcols(dataTable)[, "sourceName"])
103
+    colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
104
+  
105
+    # Sample information variable names not included in column metadata of wide table but only as row names of it.
106
+    # Create a combined column named "feature" which has feature names of the assays as well as the sample information.
107
+    S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
108
+    missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
109
+    S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
110
+    
111
+    # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
112
+    S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
113
+  } else { # Must have only been sample information data.
114
+    dataTable <- MultiAssayExperiment::colData(measurements)
115
+  }
116
+  if(!is.null(outcomeColumns)) outcome <- dataTable[, outcomeColumns]
117
+  
118
+  if(!is.null(restrict))
119
+  {
120
+    isDesiredClass <- sapply(dataTable, function(column) is(column, restrict))
121
+    dataTable <- dataTable[, isDesiredClass, drop = FALSE]
122
+    if(ncol(dataTable) == 0)
123
+      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
124
+  }
125
+
126
+  # Only return independent variables in dataTable for making classifications with.
127
+  # "primary" column is auto-generated by sample information table row names and a duplicate.
128
+  dropColumns <- na.omit(match(c("primary", outcomeColumns), colnames(dataTable)))
129
+  if(length(dropColumns) > 0) dataTable <- dataTable[, -dropColumns]
130
+  
131
+  # Training data table and outcome for training data.
132
+  if(!is.null(outcomeColumns))
133
+      list(dataTable = dataTable, outcome = outcome)
134
+  else # Only test data table for test data input.
135
+    dataTable
136
+}
137
+
138
+# For classifiers which use one single function for inputting a training and a testing table,
139
+# and work only for the numeric data type, this checks whether the training and testing tables 
140
+# both have the same set of features and there are at least some numeric features to use,
141
+# after they have been filtered by another function which splits the covariates and the outcome from the input.
142
+.checkVariablesAndSame <- function(trainingMatrix, testingMatrix)
143
+{
144
+  if(ncol(trainingMatrix) == 0) # Filtering of table removed all columns, leaving nothing to classify with.
145
+    stop("No variables in data tables specified by \'targets\' are numeric.")
146
+  else if(ncol(trainingMatrix) != ncol(testingMatrix))
147
+    stop("Training data set and testing data set contain differing numbers of features.")  
148
+}
149
+
1 150
 # Creates two lists of lists. First has training samples, second has test samples for a range
2 151
 # of different cross-validation schemes.
3 152
 #' @import utils
... ...
@@ -106,6 +255,7 @@
106 255
   tuneParams <- modellingParams@selectParams@tuneParams
107 256
   performanceType <- tuneParams[["performanceType"]]
108 257
   topNfeatures <- tuneParams[["nFeatures"]]
258
+  tuneMode <- ifelse("tuneMode" %in% names(tuneParams), tuneParams[["tuneMode"]], crossValParams@tuneMode)
109 259
   tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))] # Only used as evaluation metric.
110 260
   
111 261
   # Make selectParams NULL, since we are currently doing selection and it shouldn't call
... ...
@@ -132,17 +282,17 @@
132 282
       do.call(featureRanking, paramList)
133 283
     })
134 284
     
135
-    if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
285
+    if(featureRanking@generic == "previousSelection") # Actually selection not ranking.
136 286
       return(list(NULL, rankings[[1]], NULL))
137 287
     
138
-    if(crossValParams@tuneMode == "none") # No parameters to choose between.
288
+    if(tuneMode == "none") # Actually selection not ranking.
139 289
         return(list(NULL, rankings[[1]], NULL))
140 290
     
141 291
     tuneParamsTrain <- list(topN = topNfeatures)
142 292
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
143 293
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
144 294
     modellingParams@trainParams@tuneParams <- NULL
145
-    allPerformanceTables <- lapply(rankings, function(rankingsVariety)
295
+    bestPerformers <- sapply(rankings, function(rankingsVariety)
146 296
     {
147 297
       # Creates a matrix. Columns are top n features, rows are varieties (one row if None).
148 298
       performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex)
... ...
@@ -182,22 +332,20 @@
182 332
        })
183 333
 
184 334
         bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
185
-        list(data.frame(tuneCombosTrain, performance = performances), bestOne)
335
+        c(bestOne, performances[bestOne])
186 336
       })
187 337
 
188
-      tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"])
189
-      tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1])
338
+      tunePick <- ifelse(betterValues == "lower", which.min(bestPerformers[2, ])[1], which.max(bestPerformers[2, ])[1])
190 339
       
191 340
       if(verbose == 3)
192 341
          message("Features selected.")
193 342
       
194
-      tuneDetails <- allPerformanceTables[[tunePick]] # List of length 2.
343
+      tuneRow <- tuneCombosTrain[bestPerformers[1, tunePick], , drop  = FALSE]
344
+      if(ncol(tuneRow) > 1) tuneDetails <- tuneRow[, -1, drop = FALSE] else tuneDetails <- NULL
195 345
       
196 346
       rankingUse <- rankings[[tunePick]]
197
-      selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])]
347
+      selectionIndices <- rankingUse[1:tuneRow[, "topN"]]
198 348
       
199
-      names(tuneDetails) <- c("tuneCombinations", "bestIndex")
200
-      colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType
201 349
       list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails)
202 350
     } else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection.
203 351
       featuresIndiciesLists <- mapply(function(selector, selParams)
... ...
@@ -240,7 +388,8 @@
240 388
       
241 389
       list(NULL, selectionIndices, NULL)
242 390
     } else { # Previous selection
243
-      selectedFeatures <- list(NULL, selectionIndices, NULL)
391
+      selectedFeatures <- 
392
+      list(NULL, selectionIndices, NULL)
244 393
     }
245 394
 }
246 395
 
... ...
@@ -256,10 +405,10 @@
256 405
 
257 406
 # Code to create a function call to a training function. Might also do training and testing
258 407
 # within the same function, so test samples are also passed in case they are needed.
259
-.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose)
408
+.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose)
260 409
 {
261
-  tuneDetails <- NULL
262
-  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
410
+  tuneChosen <- NULL
411
+  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams@tuneParams))
263 412
   {
264 413
     performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
265 414
     modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))]
... ...
@@ -271,16 +420,16 @@
271 420
       modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ]))
272 421
       if(crossValParams@tuneMode == "Resubstitution")
273 422
       {
274
-        result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
423
+        result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest,
275 424
                           crossValParams = NULL, modellingParams,
276 425
                           verbose = verbose, .iteration = "internal")
277 426
         
278 427
         predictions <- result[["predictions"]]
279 428
         if(class(predictions) == "data.frame")
280
-          predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
429
+          predictedOutcome <- predictions[, "outcome"]
281 430
         else
282 431
           predictedOutcome <- predictions
283
-        calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
432
+        calcExternalPerformance(outcomeTest, predictedOutcome, performanceType)
284 433
       } else {
285 434
         result <- runTests(measurementsTrain, outcomeTrain,
286 435
                            crossValParams, modellingParams,
... ...
@@ -289,18 +438,13 @@
289 438
         median(performances(result)[[performanceType]])
290 439
       }
291 440
     })
292
-    allPerformanceTable <- data.frame(tuneCombos, performances)
293
-    colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType
294
-    
295 441
     betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"]
296 442
     bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
297 443
     tuneChosen <- tuneCombos[bestOne, , drop = FALSE]
298
-    tuneDetails <- list(tuneCombos, bestOne)
299
-    names(tuneDetails) <- c("tuneCombinations", "bestIndex")
300 444
     modellingParams@trainParams@otherParams <- tuneChosen
301 445
   }
302 446
 
303
-    if (!"previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")) 
447
+  if(modellingParams@trainParams@classifier@generic != "previousTrained")
304 448
     # Don't name these first two variables. Some classifier functions might use classesTrain and others use outcomeTrain.
305 449
     paramList <- list(measurementsTrain, outcomeTrain)
306 450
   else # Don't pass the measurements and classes, because a pre-existing classifier is used.
... ...
@@ -315,7 +459,7 @@
315 459
   if(verbose >= 2)
316 460
     message("Training completed.")  
317 461
   
318
-  list(model = trained, tune = tuneDetails)
462
+  list(model = trained, tune = tuneChosen)
319 463
 }
320 464
 
321 465
 # Creates a function call to a prediction function.
... ...
@@ -384,6 +528,15 @@
384 528
 # by user-specified values.
385 529
 .filterCharacteristics <- function(characteristics, autoCharacteristics)
386 530
 {
531
+  # Remove duplication of values for classifiers that have one function for training and 
532
+  # one function for prediction.
533
+  if("Classifier Name" %in% autoCharacteristics[, "characteristic"] && "Predictor Name" %in% autoCharacteristics[, "characteristic"])
534
+  {
535
+    classRow <- which(autoCharacteristics[, "characteristic"] == "Classifier Name")
536
+    predRow <- which(autoCharacteristics[, "characteristic"] == "Predictor Name")
537
+    if(autoCharacteristics[classRow, "value"] == autoCharacteristics[predRow, "value"])
538
+      autoCharacteristics <- autoCharacteristics[-predRow, ]
539
+  }
387 540
   # Overwrite automatically-chosen names with user's names.
388 541
   if(nrow(autoCharacteristics) > 0 && nrow(characteristics) > 0)
389 542
   {
... ...
@@ -407,6 +560,36 @@
407 560
   plotData
408 561
 }
409 562
 
563
+# Summary of the features used and the total number of them, no matter if they are a simple type
564
+# or something more complex like Pairs or feature sets.
565
+.summaryFeatures <- function(measurements)
566
+{
567
+  # MultiAssayExperiment has feature details in mcols.
568
+  if(!is.null(S4Vectors::mcols(measurements)))
569
+  {
570
+    originalInfo <- S4Vectors::mcols(measurements)
571
+    featureNames <- S4Vectors::mcols(measurements)[, "feature"]
572
+    assays <- unique(S4Vectors::mcols(measurements)[, "assay"])
573
+    renamedInfo <- S4Vectors::mcols(measurements)
574
+    renamedAssays <- paste("Assay", seq_along(assays), sep = '')
575
+    for(assay in assays)
576
+    {
577
+      rowsAssay <- which(renamedInfo[, "assay"] == assay)
578
+      renamedInfo[rowsAssay, "feature"] <- paste("Feature", seq_along(rowsAssay), sep = '')
579
+      renamedInfo[rowsAssay, "assay"] <- renamedAssays[match(assay, assays)]
580
+    }
581
+    featuresInfo <- S4Vectors::DataFrame(originalInfo, renamedInfo)
582
+    colnames(featuresInfo) <- c("Original Assay", "Original Feature", "Renamed Assay", "Renamed Feature")
583
+    featuresInfo <- cbind(originalInfo, featuresInfo)
584
+  } else {
585
+    originalFeatures <- colnames(measurements)
586
+    renamedInfo <- paste("Feature", seq_along(measurements), sep = '')
587
+    featuresInfo <- S4Vectors::DataFrame(originalFeatures, renamedInfo)
588
+    colnames(featuresInfo) <- c("Original Feature", "Renamed Feature")
589
+  }
590
+  featuresInfo
591
+}
592
+
410 593
 # Function to identify the parameters of an S4 method.
411 594
 .methodFormals <- function(f, signature) {
412 595
   tryCatch({
... ...
@@ -477,53 +660,6 @@
477 660
   list(measurementsTrain = measurementsTrain, classesTrain = classesTrain)
478 661
 }
479 662
 
480
-.transformKeywordToFunction <- function(keyword)
481
-{
482
-  switch(
483
-        keyword,
484
-        "none" = NULL,
485
-        "diffLoc" = subtractFromLocation
486
-    )
487
-}
488
-
489
-.selectionKeywordToFunction <- function(keyword)
490
-{
491
-  switch(
492
-        keyword,
493
-        "none" = NULL,
494
-        "t-test" = differentMeansRanking,
495
-        "limma" = limmaRanking,
496
-        "edgeR" = edgeRranking,
497
-        "Bartlett" = bartlettRanking,
498
-        "Levene" = leveneRanking,
499
-        "DMD" = DMDranking,
500
-        "likelihoodRatio" = likelihoodRatioRanking,
501
-        "KS" = KolmogorovSmirnovRanking,
502
-        "KL" = KullbackLeiblerRanking,
503
-        "CoxPH" = coxphRanking,
504
-        "selectMulti" = selectMulti
505
-    )
506
-}
507
-
508
-.classifierKeywordToParams <- function(keyword)
509
-{
510
-    switch(
511
-        keyword,
512
-        "randomForest" = RFparams(),
513
-        "randomSurvivalForest" = RSFparams(),
514
-        "GLM" = GLMparams(),
515
-        "elasticNetGLM" = elasticNetGLMparams(),
516
-        "SVM" = SVMparams(),
517
-        "NSC" = NSCparams(),
518
-        "DLDA" = DLDAparams(),
519
-        "naiveBayes" = naiveBayesParams(),
520
-        "mixturesNormals" = mixModelsParams(),
521
-        "kNN" = kNNparams(),
522
-        "CoxPH" = coxphParams(),
523
-        "CoxNet" = coxnetParams()
524
-    )    
525
-}
526
-
527 663
 .dlda <- function(x, y, prior = NULL){ # Remove this once sparsediscrim is reinstated to CRAN.
528 664
   obj <- list()
529 665
   obj$labels <- y
Browse code

Update utilities.R

Ellis Patrick authored on 05/09/2022 06:03:37 • GitHub committed on 05/09/2022 06:03:37
Showing1 changed files
... ...
@@ -300,7 +300,7 @@
300 300
     modellingParams@trainParams@otherParams <- tuneChosen
301 301
   }
302 302
 
303
-  if(attr(modellingParams@trainParams@classifier, "name") != "previousTrained")
303
+    if (!"previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")) 
304 304
     # Don't name these first two variables. Some classifier functions might use classesTrain and others use outcomeTrain.
305 305
     paramList <- list(measurementsTrain, outcomeTrain)
306 306
   else # Don't pass the measurements and classes, because a pre-existing classifier is used.
... ...
@@ -641,4 +641,4 @@
641 641
 
642 642
 .dmvnorm_diag <- function(x, mean, sigma) { # Remove once sparsediscrim is reinstated to CRAN.
643 643
   exp(sum(dnorm(x, mean=mean, sd=sqrt(sigma), log=TRUE)))
644
-}
645 644
\ No newline at end of file
645
+}
Browse code

- train methods now exported and can be used. train generic imported from generics package. - simple params now have performanceType specified. - Fixes to training with multiple views but no aggregation. - Fix to parameter tuning in .doTrain.

Dario Strbenac authored on 02/09/2022 06:10:10
Showing1 changed files
... ...
@@ -106,7 +106,6 @@
106 106
   tuneParams <- modellingParams@selectParams@tuneParams
107 107
   performanceType <- tuneParams[["performanceType"]]
108 108
   topNfeatures <- tuneParams[["nFeatures"]]
109
-  tuneMode <- ifelse("tuneMode" %in% names(tuneParams), tuneParams[["tuneMode"]], crossValParams@tuneMode)
110 109
   tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))] # Only used as evaluation metric.
111 110
   
112 111
   # Make selectParams NULL, since we are currently doing selection and it shouldn't call
... ...
@@ -136,7 +135,7 @@
136 135
     if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
137 136
       return(list(NULL, rankings[[1]], NULL))
138 137
     
139
-    if(tuneMode == "none") # No parameters to choose between.
138
+    if(crossValParams@tuneMode == "none") # No parameters to choose between.
140 139
         return(list(NULL, rankings[[1]], NULL))
141 140
     
142 141
     tuneParamsTrain <- list(topN = topNfeatures)
... ...
@@ -257,7 +256,7 @@
257 256
 
258 257
 # Code to create a function call to a training function. Might also do training and testing
259 258
 # within the same function, so test samples are also passed in case they are needed.
260
-.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose)
259
+.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose)
261 260
 {
262 261
   tuneDetails <- NULL
263 262
   if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
... ...
@@ -278,10 +277,10 @@
278 277
         
279 278
         predictions <- result[["predictions"]]
280 279
         if(class(predictions) == "data.frame")
281
-          predictedOutcome <- predictions[, "outcome"]
280
+          predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
282 281
         else
283 282
           predictedOutcome <- predictions
284
-        calcExternalPerformance(outcomeTest, predictedOutcome, performanceType)
283
+        calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
285 284
       } else {
286 285
         result <- runTests(measurementsTrain, outcomeTrain,
287 286
                            crossValParams, modellingParams,
Browse code

- train and predict functions created to allow training and prediction to be done independently. But they have the capabilities of crossValidate, such as multi-view methods. Implemented as S3 methods to work well with existing generics defined by R. - runTest, runTests and crossValidate now all utilise prepareData and allow passing in extra parameters for it. - getLocationsAndScales becomes private function. - All reference to targets parameter converted to useFeatures, which allows both the assay and the feature to be specified, rather than only the assay. - .MAEtoWideTable data flattening function is gone. Its functionality is incorporated into prepareData. - Parameter tuning in .doSelection of resubstitution mode now correctly uses training data as the test data instead of accidentally using the testing data.

Dario Strbenac authored on 31/08/2022 11:45:02
Showing1 changed files
... ...
@@ -1,60 +1,3 @@
1
-# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it
2
-# to be used in typical model building functions.
3
-# Returns a list with a covariate table and and outcome vector/table, or just a covariate table
4
-# in the case the input is a test data set.
5
-.MAEtoWideTable <- function(measurements, outcomeColumns, useFeatures)
6
-{
7
-  clinicalColumns <- colnames(MultiAssayExperiment::colData(measurements))    
8
-  if("clinical" %in% useFeatures[, 1])
9
-  {
10
-    clinicalRows <- useFeatures[, 1] == "clinical"      
11
-    clinicalColumns <- useFeatures[clinicalRows, 2]
12
-    useFeatures <- useFeatures[!clinicalRows, ]
13
-  } else {
14
-    clinicalColumns <- NULL
15
-  }
16
-  
17
-  if(nrow(useFeatures) > 0)
18
-  {
19
-    measurements <- measurements[, , unique(useFeatures[, 1])]
20
-  
21
-    # Get all desired measurements tables and clinical columns (other than the columns representing outcome).
22
-    # These form the independent variables to be used for making predictions with.
23
-    # Variable names will have names like RNA_BRAF for traceability.
24
-    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns))
25
-    rownames(dataTable) <- dataTable[, "primary"]
26
-    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
27
-    colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
28
-            
29
-    # Sample information variable names not included in column metadata of wide table but only as row names of it.
30
-    # Create a combined column named "feature" which has feature names of the assays as well as the clinical.
31
-    S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
32
-    missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
33
-    S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
34
-    
35
-    # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
36
-    S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
37
-    
38
-    # Subset to only the desired features.
39
-    useFeaturesSubset <- useFeatures[useFeatures[, 2] != "all", ]
40
-    if(nrow(useFeaturesSubset) > 0)
41
-    {
42
-      uniqueAssays <- unique(useFeatures[, 1])
43
-      for(filterAssay in uniqueAssays)
44
-      {
45
-        dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay &
46
-                        !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2]
47
-        dataTable <- dataTable[, !dropFeatures]
48
-      }
49
-    }
50
-    dataTable <- dataTable[, -match("primary", colnames(dataTable))]
51
-  } else { # Must have only been clinical data.
52
-    dataTable <- MultiAssayExperiment::colData(measurements)
53
-    S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable))
54
-  }
55
-  dataTable
56
-}
57
-
58 1
 # Creates two lists of lists. First has training samples, second has test samples for a range
59 2
 # of different cross-validation schemes.
60 3
 #' @import utils
... ...
@@ -329,7 +272,7 @@
329 272
       modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ]))
330 273
       if(crossValParams@tuneMode == "Resubstitution")
331 274
       {
332
-        result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest,
275
+        result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
333 276
                           crossValParams = NULL, modellingParams,
334 277
                           verbose = verbose, .iteration = "internal")
335 278
         
Browse code

- getFeatures functions added to simple params settings to extract selected features from within trained model where relevant. - Nearest Shrunken Centroid added as a simple params function and a classifier keyword option.

Dario Strbenac authored on 25/08/2022 05:15:03
Showing1 changed files
... ...
@@ -193,14 +193,14 @@
193 193
     if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
194 194
       return(list(NULL, rankings[[1]], NULL))
195 195
     
196
-    if(tuneMode == "none") # Actually selection not ranking.
196
+    if(tuneMode == "none") # No parameters to choose between.
197 197
         return(list(NULL, rankings[[1]], NULL))
198 198
     
199 199
     tuneParamsTrain <- list(topN = topNfeatures)
200 200
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
201 201
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
202 202
     modellingParams@trainParams@tuneParams <- NULL
203
-    bestPerformers <- sapply(rankings, function(rankingsVariety)
203
+    allPerformanceTables <- lapply(rankings, function(rankingsVariety)
204 204
     {
205 205
       # Creates a matrix. Columns are top n features, rows are varieties (one row if None).
206 206
       performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex)
... ...
@@ -240,20 +240,22 @@
240 240
        })
241 241
 
242 242
         bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
243
-        c(bestOne, performances[bestOne])
243
+        list(data.frame(tuneCombosTrain, performance = performances), bestOne)
244 244
       })
245 245
 
246
-      tunePick <- ifelse(betterValues == "lower", which.min(bestPerformers[2, ])[1], which.max(bestPerformers[2, ])[1])
246
+      tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"])
247
+      tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1])
247 248
       
248 249
       if(verbose == 3)
249 250
          message("Features selected.")
250 251
       
251
-      tuneRow <- tuneCombosTrain[bestPerformers[1, tunePick], , drop  = FALSE]
252
-      if(ncol(tuneRow) > 1) tuneDetails <- tuneRow[, -1, drop = FALSE] else tuneDetails <- NULL
252
+      tuneDetails <- allPerformanceTables[[tunePick]] # List of length 2.
253 253
       
254 254
       rankingUse <- rankings[[tunePick]]
255
-      selectionIndices <- rankingUse[1:tuneRow[, "topN"]]
255
+      selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])]
256 256
       
257
+      names(tuneDetails) <- c("tuneCombinations", "bestIndex")
258
+      colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType
257 259
       list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails)
258 260
     } else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection.
259 261
       featuresIndiciesLists <- mapply(function(selector, selParams)
... ...
@@ -296,8 +298,7 @@
296 298
       
297 299
       list(NULL, selectionIndices, NULL)
298 300
     } else { # Previous selection
299
-      selectedFeatures <- 
300
-      list(NULL, selectionIndices, NULL)
301
+      selectedFeatures <- list(NULL, selectionIndices, NULL)
301 302
     }
302 303
 }
303 304
 
... ...
@@ -315,7 +316,7 @@
315 316
 # within the same function, so test samples are also passed in case they are needed.
316 317
 .doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose)
317 318
 {
318
-  tuneChosen <- NULL
319
+  tuneDetails <- NULL
319 320
   if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
320 321
   {
321 322
     performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
... ...
@@ -346,9 +347,14 @@
346 347
         median(performances(result)[[performanceType]])
347 348
       }
348 349
     })
350
+    allPerformanceTable <- data.frame(tuneCombos, performances)
351
+    colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType
352
+    
349 353
     betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"]
350 354
     bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
351 355
     tuneChosen <- tuneCombos[bestOne, , drop = FALSE]
356
+    tuneDetails <- list(tuneCombos, bestOne)
357
+    names(tuneDetails) <- c("tuneCombinations", "bestIndex")
352 358
     modellingParams@trainParams@otherParams <- tuneChosen
353 359
   }
354 360
 
... ...
@@ -367,7 +373,7 @@
367 373
   if(verbose >= 2)
368 374
     message("Training completed.")  
369 375
   
370
-  list(model = trained, tune = tuneChosen)
376
+  list(model = trained, tune = tuneDetails)
371 377
 }
372 378
 
373 379
 # Creates a function call to a prediction function.
... ...
@@ -566,6 +572,7 @@
566 572
         "GLM" = GLMparams(),
567 573
         "elasticNetGLM" = elasticNetGLMparams(),
568 574
         "SVM" = SVMparams(),
575
+        "NSC" = NSCparams(),
569 576
         "DLDA" = DLDAparams(),
570 577
         "naiveBayes" = naiveBayesParams(),
571 578
         "mixturesNormals" = mixModelsParams(),
Browse code

- Restored runTest, runTests, ModellingParams, CrossValParams as public documented functions. Vignette also restored to explain them. - Constructors for params now expect a character keyword which is then converted into a function internally.

Dario Strbenac authored on 21/08/2022 16:07:38
Showing1 changed files
... ...
@@ -529,6 +529,52 @@
529 529
   list(measurementsTrain = measurementsTrain, classesTrain = classesTrain)
530 530
 }
531 531
 
532
+.transformKeywordToFunction <- function(keyword)
533
+{
534
+  switch(
535
+        keyword,
536
+        "none" = NULL,
537
+        "diffLoc" = subtractFromLocation
538
+    )
539
+}
540
+
541
+.selectionKeywordToFunction <- function(keyword)
542
+{
543
+  switch(
544
+        keyword,
545
+        "none" = NULL,
546
+        "t-test" = differentMeansRanking,
547
+        "limma" = limmaRanking,
548
+        "edgeR" = edgeRranking,
549
+        "Bartlett" = bartlettRanking,
550
+        "Levene" = leveneRanking,
551
+        "DMD" = DMDranking,
552
+        "likelihoodRatio" = likelihoodRatioRanking,
553
+        "KS" = KolmogorovSmirnovRanking,
554
+        "KL" = KullbackLeiblerRanking,
555
+        "CoxPH" = coxphRanking,
556
+        "selectMulti" = selectMulti
557
+    )
558
+}
559
+
560
+.classifierKeywordToParams <- function(keyword)
561
+{
562
+    switch(
563
+        keyword,
564
+        "randomForest" = RFparams(),
565
+        "randomSurvivalForest" = RSFparams(),
566
+        "GLM" = GLMparams(),
567
+        "elasticNetGLM" = elasticNetGLMparams(),
568
+        "SVM" = SVMparams(),
569
+        "DLDA" = DLDAparams(),
570
+        "naiveBayes" = naiveBayesParams(),
571
+        "mixturesNormals" = mixModelsParams(),
572
+        "kNN" = kNNparams(),
573
+        "CoxPH" = coxphParams(),
574
+        "CoxNet" = coxnetParams()
575
+    )    
576
+}
577
+
532 578
 .dlda <- function(x, y, prior = NULL){ # Remove this once sparsediscrim is reinstated to CRAN.
533 579
   obj <- list()
534 580
   obj$labels <- y
Browse code

- All references to runTest and runTests in examples and vignette converted to crossValidate. End users should always use crossValidate from now on. - Minor fixes to code mistakes. - Performance tuning of training method parameters chosen within feature selection is now faithfully used in the model training.

Dario Strbenac authored on 17/08/2022 13:55:15
Showing1 changed files
... ...
@@ -316,7 +316,7 @@
316 316
 .doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose)
317 317
 {
318 318
   tuneChosen <- NULL
319
-  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams@tuneParams))
319
+  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
320 320
   {
321 321
     performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
322 322
     modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))]
Browse code

- Classifiers and feature selection functions no longer have multiple signaures and are private. - prepareData function to filter and subset input data using common ways, such as missingness and variability. - The variable renaming and storage in Original Feature and Renamed Feature reverted back to column metadata and assay / feature colums. - sampleInfo now reverted back to clinical.

Dario Strbenac authored on 14/08/2022 23:45:28
Showing1 changed files
... ...
@@ -1,150 +1,58 @@
1
-# Operates on an input data frame, to extract the outcome column(s) and return
2
-# a list with the table of covariates in one element and the outcome in another.
3
-# The outcome need to be removed from the data table before predictor training!
4
-.splitDataAndOutcome <- function(measurements, outcome, restrict = NULL)
5
-{ # DataFrame's outcome variable can be character or factor, so it's a bit involved.
6
-  if(is.character(outcome) && length(outcome) > 3 && length(outcome) != nrow(measurements))
7
-    stop("'outcome' is a character variable but has more than one element. Either provide a\n",
8
-         "       one to three column names or a factor of the same length as the number of samples.")
9
-
10
-  ## String specifies the name of a single outcome column, typically a class.
11
-  if(is.character(outcome) && length(outcome) == 1)
12
-  {
13
-    outcomeColumn <- match(outcome, colnames(measurements))
14
-    if(is.na(outcomeColumn))
15
-      stop("Specified column name of outcome is not present in the data table.")
16
-    outcome <- measurements[, outcomeColumn]
17
-    measurements <- measurements[, -outcomeColumn, drop = FALSE]
18
-    # R version 4 and greater no longer automatically casts character columns to factors because stringsAsFactors
19
-    # is FALSE by default, so it is more likely to be character format these days. Handle it.
20
-    if(class(outcome) != "factor") # Assume there will be no ordinary regression prediction tasks ... for now.
21
-      outcome <- factor(outcome)
22
-  }
23
-  
24
-  # survival's Surv constructor has two inputs for the popular right-censored data and
25
-  # three inputs for less-common interval data.
26
-  if(is.character(outcome) && length(outcome) %in% 2:3)
27
-  {
28
-    outcomeColumns <- match(outcome, colnames(measurements))
29
-    if(any(is.na(outcomeColumns)))
30
-      stop("Specified column names of outcome is not present in the data table.")
31
-    outcome <- measurements[, outcomeColumns]
32
-    measurements <- measurements[, -outcomeColumns, drop = FALSE]
33
-  }
34
-  
35
-  if(is(outcome, "factor") && length(outcome) > 3 & length(outcome) < nrow(measurements))
36
-    stop("The length of outcome is not equal to the number of samples.")
37
-  
38
-  ## A vector of characters was input by the user. Ensure that it is a factor.
39
-  if(is.character(outcome) & length(outcome) == nrow(measurements))
40
-    outcome <- factor(outcome)
41
-  
42
-  # Outcome has columns, so it is tabular. It is inferred to represent survival data.
43
-  if(!is.null(ncol(outcome)) && ncol(outcome) %in% 2:3)
44
-  {
45
-    # Assume that event status is in the last column (second for two columns, third for three columns)
46
-    numberEventTypes <- length(unique(outcome[, ncol(outcome)]))
47
-    # Could be one or two kinds of events. All events might be uncensored or censored
48
-    # in a rare but not impossible scenario.
49
-    if(numberEventTypes > 2)
50
-      stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.")
51
-      
52
-    if(ncol(outcome) == 2) # Typical, right-censored survival data.
53
-      outcome <- survival::Surv(outcome[, 1], outcome[, 2])
54
-    else # Three columns. Therefore, counting process data.
55
-      outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
56
-  }
57
-  
58
-  if(!is.null(restrict))
59
-  {
60
-    isDesiredClass <- sapply(measurements, function(column) is(column, restrict))
61
-    measurements <- measurements[, isDesiredClass, drop = FALSE]
62
-    if(ncol(measurements) == 0)
63
-      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
64
-  }
65
-
66
-  list(measurements = measurements, outcome = outcome)
67
-}
68
-
69 1
 # Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it
70 2
 # to be used in typical model building functions.
71 3
 # Returns a list with a covariate table and and outcome vector/table, or just a covariate table
72 4
 # in the case the input is a test data set.
73
-.MAEtoWideTable <- function(measurements, targets = NULL, outcomeColumns = NULL, restrict = "numeric")
5
+.MAEtoWideTable <- function(measurements, outcomeColumns, useFeatures)
74 6
 {
75
-  if(is.null(targets))
76
-    stop("'targets' is not specified but must be.")
77
-  if(is.null(outcomeColumns))
78
-    stop("'outcomeColumns' is not specified but must be.")    
79
-  if(!all(targets %in% c(names(measurements), "sampleInfo")))
80
-    stop("Some table names in 'targets' are not assay names in 'measurements' or \"sampleInfo\".")
81
-  sampleInfoColumns <- colnames(MultiAssayExperiment::colData(measurements))
82
-  if(!missing(outcomeColumns) & !all(outcomeColumns %in% sampleInfoColumns))
83
-    stop("Not all column names specified by 'outcomeColumns' found in sample information table.")  
84
-
85
-  if("sampleInfo" %in% targets)
7
+  clinicalColumns <- colnames(MultiAssayExperiment::colData(measurements))    
8
+  if("clinical" %in% useFeatures[, 1])
86 9
   {
87
-    targets <- targets[targets != "sampleInfo"]
88
-    sampleInfoColumnsTrain <- sampleInfoColumns
10
+    clinicalRows <- useFeatures[, 1] == "clinical"      
11
+    clinicalColumns <- useFeatures[clinicalRows, 2]
12
+    useFeatures <- useFeatures[!clinicalRows, ]
89 13
   } else {
90
-    sampleInfoColumnsTrain <- NULL
14
+    clinicalColumns <- NULL
91 15
   }
92 16
   
93
-  if(length(targets) > 0)
17
+  if(nrow(useFeatures) > 0)
94 18
   {
95
-    measurements <- measurements[, , targets]
19
+    measurements <- measurements[, , unique(useFeatures[, 1])]
96 20
   
97
-    # Get all desired measurements tables and sample information columns (other than the columns representing outcome).
21
+    # Get all desired measurements tables and clinical columns (other than the columns representing outcome).
98 22
     # These form the independent variables to be used for making predictions with.
99
-    # Variable names will have names like RNA:BRAF for traceability.
100
-    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(sampleInfoColumnsTrain, outcomeColumns), check.names = FALSE, collapse = ':')
23
+    # Variable names will have names like RNA_BRAF for traceability.
24
+    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns))
101 25
     rownames(dataTable) <- dataTable[, "primary"]
102
-    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "sampleInfo", S4Vectors::mcols(dataTable)[, "sourceName"])
26
+    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
103 27
     colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
104
-  
28
+            
105 29
     # Sample information variable names not included in column metadata of wide table but only as row names of it.
106
-    # Create a combined column named "feature" which has feature names of the assays as well as the sample information.
30
+    # Create a combined column named "feature" which has feature names of the assays as well as the clinical.
107 31
     S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
108 32
     missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
109 33
     S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
110 34
     
111 35
     # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
112 36
     S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
113
-  } else { # Must have only been sample information data.
37
+    
38
+    # Subset to only the desired features.
39
+    useFeaturesSubset <- useFeatures[useFeatures[, 2] != "all", ]
40
+    if(nrow(useFeaturesSubset) > 0)
41
+    {
42
+      uniqueAssays <- unique(useFeatures[, 1])
43
+      for(filterAssay in uniqueAssays)
44
+      {
45
+        dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay &
46
+                        !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2]
47
+        dataTable <- dataTable[, !dropFeatures]
48
+      }
49
+    }
50
+    dataTable <- dataTable[, -match("primary", colnames(dataTable))]
51
+  } else { # Must have only been clinical data.
114 52
     dataTable <- MultiAssayExperiment::colData(measurements)
53
+    S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable))
115 54
   }
116
-  if(!is.null(outcomeColumns)) outcome <- dataTable[, outcomeColumns]
117
-  
118
-  if(!is.null(restrict))
119
-  {
120
-    isDesiredClass <- sapply(dataTable, function(column) is(column, restrict))
121
-    dataTable <- dataTable[, isDesiredClass, drop = FALSE]
122
-    if(ncol(dataTable) == 0)
123
-      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
124
-  }
125
-
126
-  # Only return independent variables in dataTable for making classifications with.
127
-  # "primary" column is auto-generated by sample information table row names and a duplicate.
128
-  dropColumns <- na.omit(match(c("primary", outcomeColumns), colnames(dataTable)))
129
-  if(length(dropColumns) > 0) dataTable <- dataTable[, -dropColumns]
130
-  
131
-  # Training data table and outcome for training data.
132
-  if(!is.null(outcomeColumns))
133
-      list(dataTable = dataTable, outcome = outcome)
134
-  else # Only test data table for test data input.
135
-    dataTable
136
-}
137
-
138
-# For classifiers which use one single function for inputting a training and a testing table,
139
-# and work only for the numeric data type, this checks whether the training and testing tables 
140
-# both have the same set of features and there are at least some numeric features to use,
141
-# after they have been filtered by another function which splits the covariates and the outcome from the input.
142
-.checkVariablesAndSame <- function(trainingMatrix, testingMatrix)
143
-{
144
-  if(ncol(trainingMatrix) == 0) # Filtering of table removed all columns, leaving nothing to classify with.
145
-    stop("No variables in data tables specified by \'targets\' are numeric.")
146
-  else if(ncol(trainingMatrix) != ncol(testingMatrix))
147
-    stop("Training data set and testing data set contain differing numbers of features.")  
55
+  dataTable
148 56
 }
149 57
 
150 58
 # Creates two lists of lists. First has training samples, second has test samples for a range
... ...
@@ -282,7 +190,7 @@
282 190
       do.call(featureRanking, paramList)
283 191
     })
284 192
     
285
-    if(featureRanking@generic == "previousSelection") # Actually selection not ranking.
193
+    if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
286 194
       return(list(NULL, rankings[[1]], NULL))
287 195
     
288 196 <