
- Restored utilities.R.

Dario Strbenac authored on 07/09/2022 00:48:49
Showing 1 changed file

... ...
@@ -1,152 +1,3 @@
-# Operates on an input data frame, to extract the outcome column(s) and return
-# a list with the table of covariates in one element and the outcome in another.
-# The outcome need to be removed from the data table before predictor training!
-.splitDataAndOutcome <- function(measurements, outcome, restrict = NULL)
-{ # DataFrame's outcome variable can be character or factor, so it's a bit involved.
-  if(is.character(outcome) && length(outcome) > 3 && length(outcome) != nrow(measurements))
-    stop("'outcome' is a character variable but has more than one element. Either provide a\n",
-         "       one to three column names or a factor of the same length as the number of samples.")
-
-  ## String specifies the name of a single outcome column, typically a class.
-  if(is.character(outcome) && length(outcome) == 1)
-  {
-    outcomeColumn <- match(outcome, colnames(measurements))
-    if(is.na(outcomeColumn))
-      stop("Specified column name of outcome is not present in the data table.")
-    outcome <- measurements[, outcomeColumn]
-    measurements <- measurements[, -outcomeColumn, drop = FALSE]
-    # R version 4 and greater no longer automatically casts character columns to factors because stringsAsFactors
-    # is FALSE by default, so it is more likely to be character format these days. Handle it.
-    if(class(outcome) != "factor") # Assume there will be no ordinary regression prediction tasks ... for now.
-      outcome <- factor(outcome)
-  }
-
-  # survival's Surv constructor has two inputs for the popular right-censored data and
-  # three inputs for less-common interval data.
-  if(is.character(outcome) && length(outcome) %in% 2:3)
-  {
-    outcomeColumns <- match(outcome, colnames(measurements))
-    if(any(is.na(outcomeColumns)))
-      stop("Specified column names of outcome is not present in the data table.")
-    outcome <- measurements[, outcomeColumns]
-    measurements <- measurements[, -outcomeColumns, drop = FALSE]
-  }
-
-  if(is(outcome, "factor") && length(outcome) > 3 & length(outcome) < nrow(measurements))
-    stop("The length of outcome is not equal to the number of samples.")
-
-  ## A vector of characters was input by the user. Ensure that it is a factor.
-  if(is.character(outcome) & length(outcome) == nrow(measurements))
-    outcome <- factor(outcome)
-
-  # Outcome has columns, so it is tabular. It is inferred to represent survival data.
-  if(!is.null(ncol(outcome)) && ncol(outcome) %in% 2:3)
-  {
-    # Assume that event status is in the last column (second for two columns, third for three columns)
-    numberEventTypes <- length(unique(outcome[, ncol(outcome)]))
-    # Could be one or two kinds of events. All events might be uncensored or censored
-    # in a rare but not impossible scenario.
-    if(numberEventTypes > 2)
-      stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.")
-
-    if(ncol(outcome) == 2) # Typical, right-censored survival data.
-      outcome <- survival::Surv(outcome[, 1], outcome[, 2])
-    else # Three columns. Therefore, counting process data.
-      outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
-  }
-
-  if(!is.null(restrict))
-  {
-    isDesiredClass <- sapply(measurements, function(column) is(column, restrict))
-    measurements <- measurements[, isDesiredClass, drop = FALSE]
-    if(ncol(measurements) == 0)
-      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
-  }
-
-  list(measurements = measurements, outcome = outcome)
-}
-
-# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it
-# to be used in typical model building functions.
-# Returns a list with a covariate table and and outcome vector/table, or just a covariate table
-# in the case the input is a test data set.
-.MAEtoWideTable <- function(measurements, targets = NULL, outcomeColumns = NULL, restrict = "numeric")
-{
-  if(is.null(targets))
-    stop("'targets' is not specified but must be.")
-  if(is.null(outcomeColumns))
-    stop("'outcomeColumns' is not specified but must be.")
-  if(!all(targets %in% c(names(measurements), "sampleInfo")))
-    stop("Some table names in 'targets' are not assay names in 'measurements' or \"sampleInfo\".")
-  sampleInfoColumns <- colnames(MultiAssayExperiment::colData(measurements))
-  if(!missing(outcomeColumns) & !all(outcomeColumns %in% sampleInfoColumns))
-    stop("Not all column names specified by 'outcomeColumns' found in sample information table.")
-
-  if("sampleInfo" %in% targets)
-  {
-    targets <- targets[targets != "sampleInfo"]
-    sampleInfoColumnsTrain <- sampleInfoColumns
-  } else {
-    sampleInfoColumnsTrain <- NULL
-  }
-
-  if(length(targets) > 0)
-  {
-    measurements <- measurements[, , targets]
-
-    # Get all desired measurements tables and sample information columns (other than the columns representing outcome).
-    # These form the independent variables to be used for making predictions with.
-    # Variable names will have names like RNA:BRAF for traceability.
-    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(sampleInfoColumnsTrain, outcomeColumns), check.names = FALSE, collapse = ':')
-    rownames(dataTable) <- dataTable[, "primary"]
-    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "sampleInfo", S4Vectors::mcols(dataTable)[, "sourceName"])
-    colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
-
-    # Sample information variable names not included in column metadata of wide table but only as row names of it.
-    # Create a combined column named "feature" which has feature names of the assays as well as the sample information.
-    S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
-    missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
-    S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
-
-    # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
-    S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
-  } else { # Must have only been sample information data.
-    dataTable <- MultiAssayExperiment::colData(measurements)
-  }
-  if(!is.null(outcomeColumns)) outcome <- dataTable[, outcomeColumns]
-
-  if(!is.null(restrict))
-  {
-    isDesiredClass <- sapply(dataTable, function(column) is(column, restrict))
-    dataTable <- dataTable[, isDesiredClass, drop = FALSE]
-    if(ncol(dataTable) == 0)
-      stop(paste("No features are left after restricting to", restrict, "but at least one must be."))
-  }
-
-  # Only return independent variables in dataTable for making classifications with.
-  # "primary" column is auto-generated by sample information table row names and a duplicate.
-  dropColumns <- na.omit(match(c("primary", outcomeColumns), colnames(dataTable)))
-  if(length(dropColumns) > 0) dataTable <- dataTable[, -dropColumns]
-
-  # Training data table and outcome for training data.
-  if(!is.null(outcomeColumns))
-      list(dataTable = dataTable, outcome = outcome)
-  else # Only test data table for test data input.
-    dataTable
-}
-
-# For classifiers which use one single function for inputting a training and a testing table,
-# and work only for the numeric data type, this checks whether the training and testing tables
-# both have the same set of features and there are at least some numeric features to use,
-# after they have been filtered by another function which splits the covariates and the outcome from the input.
-.checkVariablesAndSame <- function(trainingMatrix, testingMatrix)
-{
-  if(ncol(trainingMatrix) == 0) # Filtering of table removed all columns, leaving nothing to classify with.
-    stop("No variables in data tables specified by \'targets\' are numeric.")
-  else if(ncol(trainingMatrix) != ncol(testingMatrix))
-    stop("Training data set and testing data set contain differing numbers of features.")
-}
-
 # Creates two lists of lists. First has training samples, second has test samples for a range
 # of different cross-validation schemes.
 #' @import utils
... ...
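
The removed .splitDataAndOutcome above coerced a single outcome column to a factor and wrapped two or three outcome columns in a survival object before dropping them from the covariate table. A minimal standalone sketch of that behaviour, using an invented clinical table rather than the package's internal code:

    library(survival)
    clinical <- data.frame(gene1 = rnorm(4), time = c(5, 10, 15, 20), event = c(1, 0, 1, 1))
    outcomeColumns <- match(c("time", "event"), colnames(clinical))
    outcome <- Surv(clinical[, "time"], clinical[, "event"])  # right-censored outcome, as in the two-column case
    covariates <- clinical[, -outcomeColumns, drop = FALSE]   # predictors only, outcome removed
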
@@ -255,7 +106,6 @@
   tuneParams <- modellingParams@selectParams@tuneParams
   performanceType <- tuneParams[["performanceType"]]
   topNfeatures <- tuneParams[["nFeatures"]]
-  tuneMode <- ifelse("tuneMode" %in% names(tuneParams), tuneParams[["tuneMode"]], crossValParams@tuneMode)
   tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))] # Only used as evaluation metric.
   
   # Make selectParams NULL, since we are currently doing selection and it shouldn't call
... ...
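
The hunk above strips the evaluation-only entries out of the tuning list before the remaining entries are expanded into a grid of combinations. A small sketch of that list manipulation; the parameter names and values here are illustrative, not package defaults:

    tuneParams <- list(performanceType = "Balanced Error", nFeatures = c(10, 20, 50), cost = c(1, 10))
    performanceType <- tuneParams[["performanceType"]]
    topNfeatures <- tuneParams[["nFeatures"]]
    tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))]
    names(tuneParams)  # only "cost" remains to be tuned
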
@@ -282,17 +132,17 @@
       do.call(featureRanking, paramList)
     })
     
-    if(featureRanking@generic == "previousSelection") # Actually selection not ranking.
+    if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking.
       return(list(NULL, rankings[[1]], NULL))
     
-    if(tuneMode == "none") # Actually selection not ranking.
+    if(crossValParams@tuneMode == "none") # No parameters to choose between.
         return(list(NULL, rankings[[1]], NULL))
     
     tuneParamsTrain <- list(topN = topNfeatures)
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)
     modellingParams@trainParams@tuneParams <- NULL
-    bestPerformers <- sapply(rankings, function(rankingsVariety)
+    allPerformanceTables <- lapply(rankings, function(rankingsVariety)
     {
       # Creates a matrix. Columns are top n features, rows are varieties (one row if None).
       performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex)
... ...
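
expand.grid(), used above to build tuneCombosTrain, creates one row for every combination of the number of top-ranked features and any classifier tuning parameters. An illustrative grid with assumed parameter values:

    tuneParamsTrain <- list(topN = c(10, 20, 50))
    tuneParamsTrain <- append(tuneParamsTrain, list(cost = c(0.1, 1, 10)))  # hypothetical classifier parameter
    tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)
    nrow(tuneCombosTrain)  # 9 rows: every pairing of topN and cost
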
@@ -332,20 +182,22 @@
        })
 
         bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
-        c(bestOne, performances[bestOne])
+        list(data.frame(tuneCombosTrain, performance = performances), bestOne)
       })
 
-      tunePick <- ifelse(betterValues == "lower", which.min(bestPerformers[2, ])[1], which.max(bestPerformers[2, ])[1])
+      tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"])
+      tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1])
      
      if(verbose == 3)
         message("Features selected.")
      
-      tuneRow <- tuneCombosTrain[bestPerformers[1, tunePick], , drop  = FALSE]
-      if(ncol(tuneRow) > 1) tuneDetails <- tuneRow[, -1, drop = FALSE] else tuneDetails <- NULL
+      tuneDetails <- allPerformanceTables[[tunePick]] # List of length 2.
      
      rankingUse <- rankings[[tunePick]]
-      selectionIndices <- rankingUse[1:tuneRow[, "topN"]]
+      selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])]
      
+      names(tuneDetails) <- c("tuneCombinations", "bestIndex")
+      colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType
      list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails)
     } else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection.
      featuresIndiciesLists <- mapply(function(selector, selParams)
... ...
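
The reworked code above keeps the full performance table for each ranking and records only the index of the winning row, rather than just the best value. A toy version of the which.min / which.max choice, with made-up performance values:

    performanceTable <- data.frame(topN = c(10, 20, 50), performance = c(0.30, 0.25, 0.28))
    betterValues <- "lower"  # e.g. an error rate; "higher" would suit accuracy
    bestOne <- ifelse(betterValues == "lower",
                      which.min(performanceTable[, "performance"])[1],
                      which.max(performanceTable[, "performance"])[1])
    tuneDetails <- list(tuneCombinations = performanceTable, bestIndex = bestOne)  # same shape as above
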
@@ -388,8 +240,7 @@
      
       list(NULL, selectionIndices, NULL)
     } else { # Previous selection
-      selectedFeatures <- 
-      list(NULL, selectionIndices, NULL)
+      selectedFeatures <- list(NULL, selectionIndices, NULL)
     }
 }
 
... ...
@@ -405,10 +256,10 @@
 
 # Code to create a function call to a training function. Might also do training and testing
 # within the same function, so test samples are also passed in case they are needed.
-.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose)
+.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose)
 {
-  tuneChosen <- NULL
-  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams@tuneParams))
+  tuneDetails <- NULL
+  if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
   {
     performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
     modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))]
... ...
@@ -420,16 +271,16 @@
       modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ]))
       if(crossValParams@tuneMode == "Resubstitution")
       {
-        result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest,
+        result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
                           crossValParams = NULL, modellingParams,
                           verbose = verbose, .iteration = "internal")
         
         predictions <- result[["predictions"]]
         if(class(predictions) == "data.frame")
-          predictedOutcome <- predictions[, "outcome"]
+          predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
         else
           predictedOutcome <- predictions
-        calcExternalPerformance(outcomeTest, predictedOutcome, performanceType)
+        calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
       } else {
         result <- runTests(measurementsTrain, outcomeTrain,
                            crossValParams, modellingParams,
... ...
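
With the change above, resubstitution tuning predicts on the training samples and the predicted outcome is pulled out of the predictions data frame by column name. A self-contained sketch with invented predictions; the column names "class" and "risk" follow the diff:

    predictions <- data.frame(sample = c("A", "B", "C"), class = factor(c("Good", "Poor", "Good")))
    if(class(predictions) == "data.frame")
      predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
    predictedOutcome  # the factor of predicted classes
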
@@ -438,13 +289,18 @@
         median(performances(result)[[performanceType]])
       }
     })
+    allPerformanceTable <- data.frame(tuneCombos, performances)
+    colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType
+    
     betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"]
     bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
     tuneChosen <- tuneCombos[bestOne, , drop = FALSE]
+    tuneDetails <- list(tuneCombos, bestOne)
+    names(tuneDetails) <- c("tuneCombinations", "bestIndex")
     modellingParams@trainParams@otherParams <- tuneChosen
   }
 
-  if(modellingParams@trainParams@classifier@generic != "previousTrained")
+    if (!"previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")) 
     # Don't name these first two variables. Some classifier functions might use classesTrain and others use outcomeTrain.
     paramList <- list(measurementsTrain, outcomeTrain)
   else # Don't pass the measurements and classes, because a pre-existing classifier is used.
... ...
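
The added lines above collect every tuning combination and its performance into one table and label the metric column with the performance type. A sketch with invented numbers and an assumed metric name:

    tuneCombos <- expand.grid(cost = c(1, 10), gamma = c(0.1, 1))
    performances <- c(0.30, 0.22, 0.27, 0.25)
    performanceType <- "Balanced Error"  # assumed metric name
    allPerformanceTable <- data.frame(tuneCombos, performances)
    colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType
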
@@ -459,7 +315,7 @@
   if(verbose >= 2)
     message("Training completed.")
   
-  list(model = trained, tune = tuneChosen)
+  list(model = trained, tune = tuneDetails)
 }
 
 # Creates a function call to a prediction function.
... ...
@@ -528,15 +384,6 @@
 # by user-specified values.
 .filterCharacteristics <- function(characteristics, autoCharacteristics)
 {
-  # Remove duplication of values for classifiers that have one function for training and 
-  # one function for prediction.
-  if("Classifier Name" %in% autoCharacteristics[, "characteristic"] && "Predictor Name" %in% autoCharacteristics[, "characteristic"])
-  {
-    classRow <- which(autoCharacteristics[, "characteristic"] == "Classifier Name")
-    predRow <- which(autoCharacteristics[, "characteristic"] == "Predictor Name")
-    if(autoCharacteristics[classRow, "value"] == autoCharacteristics[predRow, "value"])
-      autoCharacteristics <- autoCharacteristics[-predRow, ]
-  }
   # Overwrite automatically-chosen names with user's names.
   if(nrow(autoCharacteristics) > 0 && nrow(characteristics) > 0)
   {
... ...
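
The retained comment above says that user-specified characteristics overwrite the automatically chosen ones. One way such an overwrite could look, as a hypothetical sketch (the real matching code sits outside this hunk):

    autoCharacteristics <- data.frame(characteristic = c("Classifier Name", "Selection Name"),
                                      value = c("Random Forest", "t-test"))
    characteristics <- data.frame(characteristic = "Classifier Name", value = "My Forest")
    rowsMatched <- match(characteristics[, "characteristic"], autoCharacteristics[, "characteristic"])
    autoCharacteristics[na.omit(rowsMatched), "value"] <- characteristics[!is.na(rowsMatched), "value"]
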
@@ -560,36 +407,6 @@
   plotData
 }
 
-# Summary of the features used and the total number of them, no matter if they are a simple type
-# or something more complex like Pairs or feature sets.
-.summaryFeatures <- function(measurements)
-{
-  # MultiAssayExperiment has feature details in mcols.
-  if(!is.null(S4Vectors::mcols(measurements)))
-  {
-    originalInfo <- S4Vectors::mcols(measurements)
-    featureNames <- S4Vectors::mcols(measurements)[, "feature"]
-    assays <- unique(S4Vectors::mcols(measurements)[, "assay"])
-    renamedInfo <- S4Vectors::mcols(measurements)
-    renamedAssays <- paste("Assay", seq_along(assays), sep = '')
-    for(assay in assays)
-    {
-      rowsAssay <- which(renamedInfo[, "assay"] == assay)
-      renamedInfo[rowsAssay, "feature"] <- paste("Feature", seq_along(rowsAssay), sep = '')
-      renamedInfo[rowsAssay, "assay"] <- renamedAssays[match(assay, assays)]
-    }
-    featuresInfo <- S4Vectors::DataFrame(originalInfo, renamedInfo)
-    colnames(featuresInfo) <- c("Original Assay", "Original Feature", "Renamed Assay", "Renamed Feature")
-    featuresInfo <- cbind(originalInfo, featuresInfo)
-  } else {
-    originalFeatures <- colnames(measurements)
-    renamedInfo <- paste("Feature", seq_along(measurements), sep = '')
-    featuresInfo <- S4Vectors::DataFrame(originalFeatures, renamedInfo)
-    colnames(featuresInfo) <- c("Original Feature", "Renamed Feature")
-  }
-  featuresInfo
-}
-
 # Function to identify the parameters of an S4 method.
 .methodFormals <- function(f, signature) {
   tryCatch({
... ...
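
The removed .summaryFeatures above renamed features to Feature1, Feature2, ... within each assay so that assays and features can be reported anonymously. A standalone sketch of that renaming on a toy feature table, without the MultiAssayExperiment and S4Vectors machinery:

    featureInfo <- data.frame(assay = c("RNA", "RNA", "protein"), feature = c("BRAF", "TP53", "AKT1"))
    assays <- unique(featureInfo[, "assay"])
    renamedAssays <- paste("Assay", seq_along(assays), sep = '')
    renamedInfo <- featureInfo
    for(assay in assays)
    {
      rowsAssay <- which(renamedInfo[, "assay"] == assay)
      renamedInfo[rowsAssay, "feature"] <- paste("Feature", seq_along(rowsAssay), sep = '')
      renamedInfo[rowsAssay, "assay"] <- renamedAssays[match(assay, assays)]
    }
    renamedInfo  # Assay1/Feature1, Assay1/Feature2, Assay2/Feature1
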
@@ -660,6 +477,53 @@
   list(measurementsTrain = measurementsTrain, classesTrain = classesTrain)
 }
 
+.transformKeywordToFunction <- function(keyword)
+{
+  switch(
+        keyword,
+        "none" = NULL,
+        "diffLoc" = subtractFromLocation
+    )
+}
+
+.selectionKeywordToFunction <- function(keyword)
+{
+  switch(
+        keyword,
+        "none" = NULL,
+        "t-test" = differentMeansRanking,
+        "limma" = limmaRanking,
+        "edgeR" = edgeRranking,
+        "Bartlett" = bartlettRanking,
+        "Levene" = leveneRanking,
+        "DMD" = DMDranking,
+        "likelihoodRatio" = likelihoodRatioRanking,
+        "KS" = KolmogorovSmirnovRanking,
+        "KL" = KullbackLeiblerRanking,
+        "CoxPH" = coxphRanking,
+        "selectMulti" = selectMulti
+    )
+}
+
+.classifierKeywordToParams <- function(keyword)
+{
+    switch(
+        keyword,
+        "randomForest" = RFparams(),
+        "randomSurvivalForest" = RSFparams(),
+        "GLM" = GLMparams(),
+        "elasticNetGLM" = elasticNetGLMparams(),
+        "SVM" = SVMparams(),
+        "NSC" = NSCparams(),
+        "DLDA" = DLDAparams(),
+        "naiveBayes" = naiveBayesParams(),
+        "mixturesNormals" = mixModelsParams(),
+        "kNN" = kNNparams(),
+        "CoxPH" = coxphParams(),
+        "CoxNet" = coxnetParams()
+    )
+}
+
 .dlda <- function(x, y, prior = NULL){ # Remove this once sparsediscrim is reinstated to CRAN.
 obj <- list()
   obj$labels <- y
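
The helpers added above map plain keywords to the package's ranking and parameter functions with switch(). A minimal self-contained version of the same dispatch pattern, using base R functions in place of the package's own so that it runs on its own:

    keywordToFunction <- function(keyword)
    {
      switch(keyword,
             "none" = NULL,
             "mean" = mean,
             "median" = median)
    }
    centreFunction <- keywordToFunction("median")
    centreFunction(c(1, 5, 100))  # 5
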