Browse code

Saving in-progress and untested changes. - useFeatures replaced by clinicalPredictors. Filtering of variables is only really needed for clinical data, so this simplifies the user experience. - precisionPathwayTrain function added and partially implemented. - prepareData topNvariance is now applied per-assay and has safeguards for assays with fewer than topNvariance features. - crossValidate S4 method signature simplification (i.e. MultiAssayExperimentOrList).

Dario Strbenac authored on 15/02/2023 06:25:05
Showing 8 changed files

... ...
@@ -72,6 +72,7 @@ Collate:
72 72
     'interfaceXGB.R'
73 73
     'performancePlot.R'
74 74
     'plotFeatureClasses.R'
75
+    'precisionPathway.R'
75 76
     'prepareData.R'
76 77
     'previousSelection.R'
77 78
     'previousTrained.R'
... ...
@@ -76,6 +76,7 @@ exportMethods(models)
76 76
 exportMethods(performance)
77 77
 exportMethods(performancePlot)
78 78
 exportMethods(plotFeatureClasses)
79
+exportMethods(precisionPathwayTrain)
79 80
 exportMethods(predictions)
80 81
 exportMethods(prepareData)
81 82
 exportMethods(rankingPlot)
... ...
@@ -35,6 +35,9 @@ setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL"))
35 35
 setClassUnion("tabular", c("data.frame", "DataFrame", "matrix"))
36 36
 setClassUnion("tabularOrList", c("tabular", "list"))
37 37
 
38
+# List-like assay data
39
+setClassUnion("MultiAssayExperimentOrList", c("MultiAssayExperiment", "list"))
40
+
38 41
 ################################################################################
39 42
 #
40 43
 # Params
... ...
@@ -22,14 +22,18 @@
22 22
 #' @param extraParams A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or
23 23
 #' parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"},
24 24
 #' \code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on
25
-#' the website), specify the list element to be \code{NULL}.
25
+#' the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}.
26
+#' @param clinicalPredictors If \code{measurements} is a \code{MultiAssayExperiment},
27
+#' a character vector of features to use in modelling. This allows avoidance of things like sample IDs,
28
+#' sample acquisition dates, etc. which are not relevant for outcome prediction.
26 29
 #' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons
27 30
 #' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, 
28 31
 #' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
29 32
 #' Set to NULL or "all" if all features should be used.
30 33
 #' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
31 34
 #' and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
32
-#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.
35
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no
36
+#' independent feature selection will be performed (but implicit selection might still happen with the classifier).
33 37
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
34 38
 #' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
35 39
 #' options described in \code{\link{calcPerformance}} may otherwise be specified.
... ...
@@ -93,7 +97,7 @@ setGeneric("crossValidate", function(measurements, outcome, ...)
93 97
 
94 98
 #' @rdname crossValidate
95 99
 #' @export
96
-setMethod("crossValidate", "DataFrame", 
100
+setMethod("crossValidate", "DataFrame",
97 101
           function(measurements,
98 102
                    outcome,
99 103
                    nFeatures = 20,
... ...
@@ -110,12 +114,15 @@ setMethod("crossValidate", "DataFrame",
110 114
 
111 115
           {
112 116
               # Check that data is in the right format, if not already done for MultiAssayExperiment input.
113
-              prepParams <- list(measurements, outcome)
114
-              if("prepare" %in% names(extraParams))
115
-                prepParams <- c(prepParams, extraParams[["prepare"]])
116
-              measurementsAndOutcome <- do.call(prepareData, prepParams)
117
-              measurements <- measurementsAndOutcome[["measurements"]]
118
-              outcome <- measurementsAndOutcome[["outcome"]]
117
+              if(!"assay" %in% S4Vectors::mcols(measurements)) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. 
118
+              {
119
+                prepParams <- list(measurements, outcome, clinicalPredictors)
120
+                if("prepare" %in% names(extraParams))
121
+                  prepParams <- c(prepParams, extraParams[["prepare"]])
122
+                measurementsAndOutcome <- do.call(prepareData, prepParams)
123
+                measurements <- measurementsAndOutcome[["measurements"]]
124
+                outcome <- measurementsAndOutcome[["outcome"]]
125
+              }
119 126
               
120 127
               # Ensure performance type is one of the ones that can be calculated by the package.
121 128
               if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
... ...
@@ -305,9 +312,10 @@ setMethod("crossValidate", "DataFrame",
305 312
 #' @rdname crossValidate
306 313
 #' @export
307 314
 # One or more omics data sets, possibly with clinical data.
308
-setMethod("crossValidate", "MultiAssayExperiment",
315
+setMethod("crossValidate", "MultiAssayExperimentOrList",
309 316
           function(measurements,
310
-                   outcome, 
317
+                   outcome,
318
+                   clinicalPredictors = NULL,
311 319
                    nFeatures = 20,
312 320
                    selectionMethod = "auto",
313 321
                    selectionOptimisation = "Resubstitution",
... ...
@@ -321,7 +329,7 @@ setMethod("crossValidate", "MultiAssayExperiment",
321 329
                    characteristicsLabel = NULL, extraParams = NULL)
322 330
           {
323 331
               # Check that data is in the right format, if not already done for MultiAssayExperiment input.
324
-              prepParams <- list(measurements, outcome)
332
+              prepParams <- list(measurements, outcome, clinicalPredictors)
325 333
               if("prepare" %in% names(extraParams))
326 334
                 prepParams <- c(prepParams, extraParams[["prepare"]])
327 335
               measurementsAndOutcome <- do.call(prepareData, prepParams)
... ...
@@ -408,76 +416,6 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
408 416
                             characteristicsLabel = characteristicsLabel, extraParams = extraParams)
409 417
           })
410 418
 
411
-# This expects that each table is about the same set of samples and thus
412
-# has the same number of rows as every other table.
413
-#' @rdname crossValidate                                                            
414
-#' @export
415
-setMethod("crossValidate", "list",
416
-          function(measurements,
417
-                   outcome, 
418
-                   nFeatures = 20,
419
-                   selectionMethod = "auto",
420
-                   selectionOptimisation = "Resubstitution",
421
-                   performanceType = "auto",
422
-                   classifier = "auto",
423
-                   multiViewMethod = "none",
424
-                   assayCombinations = "all",
425
-                   nFolds = 5,
426
-                   nRepeats = 20,
427
-                   nCores = 1,
428
-                   characteristicsLabel = NULL, extraParams = NULL)
429
-          {
430
-              # Check data type is valid
431
-              if (!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix")))) {
432
-                  stop("assays must be of type data.frame, DataFrame or matrix")
433
-              }
434
-              
435
-              # Check the list is named
436
-              if (is.null(names(measurements))) {
437
-                  stop("Measurements must be a named list.")
438
-              }
439
-              
440
-              # Check same number of samples for all datasets
441
-              if (!length(unique(sapply(measurements, nrow))) == 1) {
442
-                  stop("All datasets must have the same samples.")
443
-              }
444
-              
445
-              # Check the number of outcome is the same
446
-              if (!all(sapply(measurements, nrow) == length(outcome)) && !is.character(outcome)) {
447
-                  stop("outcome must have same number of samples as measurements.")
448
-              }
449
-              
450
-              df_list <- sapply(measurements, S4Vectors::DataFrame, check.names = FALSE)
451
-              
452
-              df_list <- mapply(function(meas, nam){
453
-                  S4Vectors::mcols(meas)$assay <- nam
454
-                  S4Vectors::mcols(meas)$feature <- colnames(meas)
455
-                  meas
456
-              }, df_list, names(df_list))
457
-              
458
-              
459
-              combined_df <- do.call("cbind", df_list) 
460
-              colnames(combined_df) <- S4Vectors::mcols(combined_df)$feature
461
-
462
-
463
-              
464
-              crossValidate(measurements = combined_df,
465
-                            outcome = outcome, 
466
-                            nFeatures = nFeatures,
467
-                            selectionMethod = selectionMethod,
468
-                            selectionOptimisation = selectionOptimisation,
469
-                            performanceType = performanceType,
470
-                            classifier = classifier,
471
-                            multiViewMethod = multiViewMethod,
472
-                            assayCombinations = assayCombinations,
473
-                            nFolds = nFolds,
474
-                            nRepeats = nRepeats,
475
-                            nCores = nCores,
476
-                            characteristicsLabel = characteristicsLabel, extraParams = extraParams)
477
-          })
478
-
479
-
480
-
481 419
 ######################################
482 420
 ######################################
483 421
 cleanNFeatures <- function(nFeatures, measurements){
... ...
@@ -1126,9 +1064,9 @@ train.list <- function(x, outcomeTrain, ...)
1126 1064
 #' @rdname crossValidate
1127 1065
 #' @method train MultiAssayExperiment
1128 1066
 #' @export
1129
-train.MultiAssayExperiment <- function(x, outcome, ...)
1067
+train.MultiAssayExperiment <- function(x, outcome, clinicalPredictors = NULL, ...)
1130 1068
           {
1131
-              prepArgs <- list(x, outcome)
1069
+              prepArgs <- list(x, outcome, clinicalPredictors)
1132 1070
               extraInputs <- list(...)
1133 1071
               prepExtras <- trainExtras <- numeric()
1134 1072
               if(length(extraInputs) > 0)
... ...
@@ -1167,7 +1105,7 @@ predict.trainedByClassifyR <- function(object, newData, ...)
1167 1105
     newData <- do.call(cbind, newData)
1168 1106
     } else if(is(newData, "MultiAssayExperiment"))
1169 1107
             {
1170
-              newData <- prepareData(newData, useFeatures = allFeatureNames(object))
1108
+              newData <- prepareData(newData, clinicalPredictors = subset(allFeatureNames(object), assay == "clinical")[, "feature"])
1171 1109
               # Some classifiers dangerously use positional matching rather than column name matching.
1172 1110
               # newData columns are sorted so that the right column ordering is guaranteed.
1173 1111
     }
1174 1112
new file mode 100644
... ...
@@ -0,0 +1,77 @@
1
+# Basically, an ordered list of cross-validations.
2
+
3
+setGeneric("precisionPathwayTrain", function(measurements, class, ...)
4
+    standardGeneric("precisionPathwayTrain"))
5
+
6
+#' @rdname precisionPathwayTrain
7
+#' @export
8
+setMethod("precisionPathwayTrain", "MultiAssayExperiment", 
9
+          function(measurements, class, clinicalPredictors = NULL, ..., fixedAssays = "clinical",
10
+                   confidenceCutoff = 0.8, minAssaySamples = 10,
11
+                   nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))),
12
+                   classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
13
+                   nFolds = 5, nRepeats = 20, nCores = 1)
14
+          {
15
+              prepArgs <- list(measurements, class, clinicalPredictors)              
16
+              extraInputs <- list(...)
17
+              prepExtras <- numeric()
18
+              if(length(extraInputs) > 0)
19
+                prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
20
+              if(length(prepExtras) > 0)
21
+                prepArgs <- append(prepArgs, extraInputs[prepExtras])
22
+              measurementsAndClass <- do.call(prepareData, prepArgs)
23
+              
24
+             .precisionPathwayTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]],
25
+                                    fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff,
26
+                                    minAssaySamples = minAssaySamples, nFeatures = nFeatures,
27
+                                    selectionMethod = selectionMethod, classifier = classifier,
28
+                                    nFolds = nFolds, nRepeats = nRepeats, nCores = nCores)
29
+          })
30
+
31
+#' @rdname precisionPathwayTrain
32
+#' @export
33
+setMethod("precisionPathwayTrain", "list", 
34
+          function(measurements, class, clinicalPredictors = NULL, fixedAssays = "clinical",
35
+                   confidenceCutoff = 0.8, minAssaySamples = 10,
36
+                   nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))),
37
+                   classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
38
+                   nFolds = 5, nRepeats = 20, nCores = 1, ...)
39
+          {
40
+            # One of the tables must be named "clinical".
41
+            if (!any(names(measurements) == "clinical"))
42
+              stop("One of the tables must be named \"clinical\".")
43
+              
44
+            .precisionPathwayTrain(measurements = combined_df, class = class)
45
+          })
46
+
47
+# Internal method which carries out all of the processing, obtaining reformatted data from the
48
+# MultiAssayExperiment and list (of basic rectangular tables) S4 methods.
49
+.precisionPathwayTrain <- function(measurements, class, fixedAssays = "clinical",
50
+                   confidenceCutoff = 0.8, minAssaySamples = 10,
51
+                   nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))),
52
+                   classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
53
+                   nFolds = 5, nRepeats = 20, nCores = 1, ...)
54
+          {
55
+            # Step 1: Separate the vector classes from the table of measurements, if not already separate.
56
+            prepArgs <- list(measurements, class, clinicalPredictors)              
57
+            extraInputs <- list(...)
58
+            prepExtras <- numeric()
59
+            if(length(extraInputs) > 0)
60
+              prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
61
+            if(length(prepExtras) > 0)
62
+              prepArgs <- append(prepArgs, extraInputs[prepExtras])
63
+            
64
+            measurementsAndClass <- do.call(prepareData, prepArgs)
65
+            measurements <- measurementsAndClass[["measurements"]]
66
+            class <- measurementsAndClass[["outcome"]]
67
+            
68
+            # Step 2: Determine all valid permutations of assays, taking into account the
69
+            # assays to be used and which assays, if any, must be included.
70
+            assayIDs <- unique(S4Vectors::mcols(measurements)[["assay"]])
71
+            assaysPermutations <- .permutations(assayIDs, fixed = data.frame(seq_along(fixedAssays), fixedAssays))
72
+            
73
+            # Step 3: Build a classifier for each assay using all of the samples.
74
+            modelsList <- crossValidate(measurements, class, nFeatures, selectionMethod,
75
+                                        classifier = classifier, nFolds = nFolds,
76
+                                        nRepeats = nRepeats, nCores = nCores)
77
+        }
0 78
\ No newline at end of file
... ...
@@ -18,15 +18,13 @@
18 18
 #' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the
19 19
 #' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
20 20
 #' that contain(s) each individual's outcome to use for prediction.
21
-#' @param useFeatures If \code{measurements} is a \code{MultiAssayExperiment},
22
-#' a two-column table of features to use. The first column must have assay names
23
-#' and the second column must have feature names found for that assay. \code{"clinical"} is
24
-#' also a valid assay name and refers to the clinical data table. \code{"all"} is a special
25
-#' keyword that means all features (passing any other filters) of that assay will be used 
26
-#' for modelling. Otherwise, a character vector of feature names to use suffices.
21
+#' @param clinicalPredictors If \code{measurements} is a \code{MultiAssayExperiment},
22
+#' a character vector of features to use in modelling. This allows avoidance of things like sample IDs,
23
+#' sample acquisition dates, etc. which are not relevant for outcome prediction.
27 24
 #' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum
28 25
 #' tolerated proportion of missingness for a feature to be retained for modelling.
29
-#' @param topNvariance Default: NULL. An integer number of most variable features to subset to.
26
+#' @param topNvariance Default: NULL. An integer number of most variable features per assay to subset to.
27
+#' Assays with fewer features than this won't be reduced in size.
30 28
 #' @param ... Variables not used by the \code{matrix} nor the
31 29
 #' \code{MultiAssayExperiment} method which are passed into and used by the
32 30
 #' \code{DataFrame} method.
... ...
@@ -58,18 +56,16 @@ setMethod("prepareData", "data.frame",
58 56
 #' @rdname prepareData
59 57
 #' @export
60 58
 setMethod("prepareData", "DataFrame",
61
-  function(measurements, outcome, useFeatures = "all", maxMissingProp = 0.0, topNvariance = NULL)
59
+  function(measurements, outcome, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL)
62 60
 {
63 61
   if(is.null(rownames(measurements)))
64 62
   {
65 63
     warning("'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.")
66 64
     rownames(measurements) <- paste("Sample", seq_len(nrow(measurements)))
67
-  }      
68
-            
69
-  if(useFeatures != "all") # Subset to only the desired ones.
70
-    measurements <- measurements[, useFeatures]
71
-
65
+  }
66
+      
72 67
   # Won't ever be true if input data was MultiAssayExperiment because wideFormat already produces valid names.  
68
+  # Need to check if input data was DataFrame because names might not be valid from user.
73 69
   if(!all(colnames(measurements) == make.names(colnames(measurements))))
74 70
   {
75 71
     warning("Unsafe feature names in input data. Converted into safe names.")
... ...
@@ -129,6 +125,19 @@ setMethod("prepareData", "DataFrame",
129 125
     else # Three columns. Therefore, counting process data.
130 126
       outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
131 127
   }
128
+  
129
+  if(!is.null(clinicalPredictors))
130
+  {
131
+    if(!is.null(mcols(measurements)$assay))
132
+    {
133
+      clinicalIndices <- which(mcols(measurements)$assay == "clinical")
134
+      usePredictors <- intersect(clinicalIndices, which(mcols(measurements)$feature %in% clinicalPredictors))
135
+      dropIndices <- setdiff(clinicalIndices, usePredictors)
136
+      if(length(dropIndices) > 0) measurements <- measurements[, -dropIndices]
137
+    } else { # The DataFrame is entirely clinical data.
138
+      measurements <- measurements[, clinicalPredictors]
139
+    }
140
+  }  
132 141
 
133 142
   # Remove samples with indeterminate outcome.
134 143
   dropSamples <- which(is.na(outcome) | is.null(outcome))
... ...
@@ -146,11 +155,18 @@ setMethod("prepareData", "DataFrame",
146 155
   if(length(dropFeatures) > 0)
147 156
     measurements <- measurements[, -dropFeatures]
148 157
   
149
-  # Use only the most variable features.
158
+  # Use only the N most variable features per assay.
150 159
   if(!is.null(topNvariance))
151 160
   {
152
-    mostVariance <- order(apply(measurements, 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance]
153
-    measurements <- measurements[, mostVariance]
161
+    if(is.null(mcols(measurements)$assay)) assays <- rep(1, ncol(measurements)) else assays <- mcols(measurements)$assay
162
+    do.call(cbind, lapply(unqiue(assays), function(assay)
163
+    {
164
+      assayColumns <- which(assays == assay)    
165
+      if(length(assayColumns) < topNvariance)
166
+        measurements[, assayColumns]
167
+      else
168
+        measurements[, assayColumns][order(apply(measurements[, assayColumns], 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance]]  
169
+    }))
154 170
   }
155 171
   
156 172
   list(measurements = measurements, outcome = outcome)
... ...
@@ -159,71 +175,74 @@ setMethod("prepareData", "DataFrame",
159 175
 #' @rdname prepareData
160 176
 #' @export
161 177
 setMethod("prepareData", "MultiAssayExperiment",
162
-  function(measurements, outcomeColumns = NULL, useFeatures = "all", ...)
178
+  function(measurements, outcomeColumns = NULL, clinicalPredictors = NULL, ...)
163 179
 {
164
-  if(is.character(useFeatures)) useFeatures <- data.frame(assay = names(measurements), feature = "all")
165
-  omicsTargets <- setdiff(useFeatures[, "assay"], "clinical")
166
-  if(length(omicsTargets) > 0)
167
-  {
168
-    if(any(anyReplicated(measurements[, , omicsTargets])))
169
-      stop("Data set contains replicates. Please remove or average replicate observations and try again.")
170
-  }
171
-  
172
-  if(!is.null(outcomeColumns) && !all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements))))
180
+  if(is.null(clinicalPredictors))
181
+    stop("'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.")      
182
+
183
+  if(any(anyReplicated(measurements[, , omicsTargets])))
184
+    stop("Data set contains replicates. Please remove or average replicate observations and try again.")
185
+ 
186
+  if(is.null(outcomeColums))
187
+    stop("'outcomeColumns' is a mandatory parameter but was not specified.")
188
+      
189
+  if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements))))
173 190
     stop("Not all column names specified by 'outcomeColumns' found in clinical table.")  
174
-  if(!all(useFeatures[, "assay"] %in% c(names(measurements), "clinical")))
175
-    stop("Some assay names in first column of 'useFeatures' are not assay names in 'measurements' or \"clinical\".")
176 191
 
177
-  clinicalColumnsDataset <- colnames(MultiAssayExperiment::colData(measurements))
178
-  if("clinical" %in% useFeatures[, "assay"])
179
-  {
180
-    clinicalRows <- useFeatures[, "assay"] == "clinical"      
181
-    clinicalColumns <- useFeatures[clinicalRows, "feature"]
182
-    if(length(clinicalColumns) == 1 && clinicalColumns == "all")
183
-      clinicalColumns <- setdiff(clinicalColumnsDataset, outcomeColumns)
184
-    useFeatures <- useFeatures[!clinicalRows, ]
185
-  } else {
186
-    clinicalColumns <- NULL
187
-  }
188
-  
189
-  if(nrow(useFeatures) > 0)
190
-  {
191
-    measurements <- measurements[, , unique(useFeatures[, "assay"])]
192
-    # Get all desired measurements tables and clinical columns (other than the columns representing outcome).
193
-    # These form the independent variables to be used for making predictions with.
194
-    # Variable names will have names like RNA_BRAF for traceability.
195
-    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns))
196
-    rownames(dataTable) <- dataTable[, "primary"]
197
-    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
198
-    colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
192
+  # Get all desired measurements tables and clinical columns.
193
+  # These form the independent variables to be used for making predictions with.
194
+  # Variable names will have names like RNA_BRAF for traceability.
195
+  dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalPredictors, outcomeColumns))
196
+  rownames(dataTable) <- dataTable[, "primary"]
197
+  S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
198
+  colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
199 199
             
200
-    # Sample information variable names not included in column metadata of wide table but only as row names of it.
201
-    # Create a combined column named "feature" which has feature names of the assays as well as the clinical.
202
-    S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
203
-    missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
204
-    S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
200
+  # Sample information variable names not included in column metadata of wide table but only as row names of it.
201
+  # Create a combined column named "feature" which has feature names of the assays as well as the clinical.
202
+  S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
203
+  missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
204
+  S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
205 205
     
206
-    # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
207
-    S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
206
+  # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
207
+  S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
208 208
     
209
-    # Subset to only the desired features.
210
-    useFeaturesSubset <- useFeatures[useFeatures[, "feature"] != "all", ]
211
-    if(nrow(useFeaturesSubset) > 0)
212
-    {
213
-      uniqueAssays <- unique(useFeatures[, "assay"])
214
-      for(filterAssay in uniqueAssays)
215
-      {
216
-        dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay &
217
-                        !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2]
218
-        dataTable <- dataTable[, !dropFeatures]
219
-      }
220
-    }
221
-    dataTable <- dataTable[, -match("primary", colnames(dataTable))]
222
-  } else { # Must have only been clinical data.
223
-    dataTable <- MultiAssayExperiment::colData(measurements)
224
-    S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable))
225
-  }
209
+  # Do other filtering and preparation in DataFrame function.
210
+  prepareData(dataTable, outcomeColumns, clinicalPredictors = NULL, ...)
211
+})
212
+
213
+#' @rdname prepareData
214
+#' @export
215
+setMethod("prepareData", "list",
216
+  function(measurements, outcome = NULL, clinicalPredictors = NULL, ...)
217
+{
218
+  # Check the list is named.
219
+  if(is.null(names(measurements)))
220
+    stop("'measurements' must be a named list.")
221
+
222
+  # If clinical table is present, features to use must be user-specified.            
223
+  if("clinical" %in% names(measurements) && is.null(clinicalPredictors))
224
+    stop("Because one provided table in the list is named \"clinical\", 'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.")
225
+
226
+  # Check data type is valid.
227
+  if(!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix"))))
228
+    stop("assays in the list must be of type data.frame, DataFrame or matrix")
229
+              
230
+  # Check same number of samples for all datasets
231
+  if (!length(unique(sapply(measurements, nrow))) == 1)
232
+    stop("All datasets must have the same samples.")
233
+      
234
+  if("clinical" %in% names(measurements))
235
+    measurements[["clinical"]] <- measurements[["clinical"]][, clinicalPredictors]
236
+             
237
+  allMetadata <- mapply(function(measurementsOne, assayID) {
238
+                        data.frame(assay = assayID, feature = colnames(measurementsOne))
239
+                        }, measurements, names(measurements))
240
+  allMeasurements <- do.call("cbind", measurements)
241
+  # Different assays e.g. mRNA, protein could have same feature name e.g. BRAF.
242
+  colnames(allMeasurements) <- paste(allMetadata[, "assay"], allMetadata[, "feature"], sep = '_')
243
+  allDataFrame <- DataFrame(allMeasurements)
244
+  S4Vectors::mcols(allMeasurements) <- allMetadata
226 245
     
227 246
   # Do other filtering and preparation in DataFrame function.
228
-  prepareData(dataTable, outcomeColumns, useFeatures = "all", ...)
247
+  prepareData(dataTable, outcome, clinicalPredictors = NULL, ...)
229 248
 })
230 249
\ No newline at end of file
... ...
@@ -6,8 +6,7 @@
6 6
 \alias{crossValidate,DataFrame-method}
7 7
 \alias{crossValidate,MultiAssayExperiment-method,}
8 8
 \alias{crossValidate,data.frame-method}
9
-\alias{crossValidate,MultiAssayExperiment-method}
10
-\alias{crossValidate,list-method}
9
+\alias{crossValidate,MultiAssayExperimentOrList-method}
11 10
 \alias{train.matrix}
12 11
 \alias{train.data.frame}
13 12
 \alias{train.DataFrame}
... ...
@@ -35,9 +34,10 @@ crossValidate(measurements, outcome, ...)
35 34
   extraParams = NULL
36 35
 )
37 36
 
38
-\S4method{crossValidate}{MultiAssayExperiment}(
37
+\S4method{crossValidate}{MultiAssayExperimentOrList}(
39 38
   measurements,
40 39
   outcome,
40
+  clinicalPredictors = NULL,
41 41
   nFeatures = 20,
42 42
   selectionMethod = "auto",
43 43
   selectionOptimisation = "Resubstitution",
... ...
@@ -86,23 +86,6 @@ crossValidate(measurements, outcome, ...)
86 86
   extraParams = NULL
87 87
 )
88 88
 
89
-\S4method{crossValidate}{list}(
90
-  measurements,
91
-  outcome,
92
-  nFeatures = 20,
93
-  selectionMethod = "auto",
94
-  selectionOptimisation = "Resubstitution",
95
-  performanceType = "auto",
96
-  classifier = "auto",
97
-  multiViewMethod = "none",
98
-  assayCombinations = "all",
99
-  nFolds = 5,
100
-  nRepeats = 20,
101
-  nCores = 1,
102
-  characteristicsLabel = NULL,
103
-  extraParams = NULL
104
-)
105
-
106 89
 \method{train}{matrix}(x, outcomeTrain, ...)
107 90
 
108 91
 \method{train}{data.frame}(x, outcomeTrain, ...)
... ...
@@ -122,7 +105,7 @@ crossValidate(measurements, outcome, ...)
122 105
 
123 106
 \method{train}{list}(x, outcomeTrain, ...)
124 107
 
125
-\method{train}{MultiAssayExperiment}(x, outcome, ...)
108
+\method{train}{MultiAssayExperiment}(x, outcome, clinicalPredictors = NULL, ...)
126 109
 
127 110
 \method{predict}{trainedByClassifyR}(object, newData, ...)
128 111
 }
... ...
@@ -146,7 +129,8 @@ Set to NULL or "all" if all features should be used.}
146 129
 
147 130
 \item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
148 131
 and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
149
-and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.}
132
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no
133
+independent feature selection will be performed (but implicit selection might still happen with the classifier).}
150 134
 
151 135
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
152 136
 
... ...
@@ -172,7 +156,11 @@ with each element being a vector of assays to combine. Special value \code{"all"
172 156
 \item{extraParams}{A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or
173 157
 parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"},
174 158
 \code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on
175
-the website), specify the list element to be \code{NULL}.}
159
+the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}.}
160
+
161
+\item{clinicalPredictors}{If \code{measurements} is a \code{MultiAssayExperiment},
162
+a character vector of features to use in modelling. This allows avoidance of things like sample IDs,
163
+sample acquisition dates, etc. which are not relevant for outcome prediction.}
176 164
 
177 165
 \item{x}{Same as \code{measurements} but only training samples.}
178 166
 
... ...
@@ -6,6 +6,7 @@
6 6
 \alias{prepareData,DataFrame-method}
7 7
 \alias{prepareData,MultiAssayExperiment-method}
8 8
 \alias{prepareData,data.frame-method}
9
+\alias{prepareData,list-method}
9 10
 \title{Convert Different Data Classes into DataFrame and Filter Features}
10 11
 \usage{
11 12
 \S4method{prepareData}{matrix}(measurements, outcome, ...)
... ...
@@ -15,12 +16,19 @@
15 16
 \S4method{prepareData}{DataFrame}(
16 17
   measurements,
17 18
   outcome,
18
-  useFeatures = "all",
19
+  clinicalPredictors = NULL,
19 20
   maxMissingProp = 0,
20 21
   topNvariance = NULL
21 22
 )
22 23
 
23
-\S4method{prepareData}{MultiAssayExperiment}(measurements, outcomeColumns = NULL, useFeatures = "all", ...)
24
+\S4method{prepareData}{MultiAssayExperiment}(
25
+  measurements,
26
+  outcomeColumns = NULL,
27
+  clinicalPredictors = NULL,
28
+  ...
29
+)
30
+
31
+\S4method{prepareData}{list}(measurements, outcome = NULL, clinicalPredictors = NULL, ...)
24 32
 }
25 33
 \arguments{
26 34
 \item{measurements}{Either a \code{\link{matrix}}, \code{\link{DataFrame}}
... ...
@@ -37,17 +45,15 @@ a character string, or vector of such strings, containing column name(s) of colu
37 45
 containing either classes or time and event information about survival. If column names
38 46
 of survival information, time must be in first column and event status in the second.}
39 47
 
40
-\item{useFeatures}{If \code{measurements} is a \code{MultiAssayExperiment},
41
-a two-column table of features to use. The first column must have assay names
42
-and the second column must have feature names found for that assay. \code{"clinical"} is
43
-also a valid assay name and refers to the clinical data table. \code{"all"} is a special
44
-keyword that means all features (passing any other filters) of that assay will be used 
45
-for modelling. Otherwise, a character vector of feature names to use suffices.}
48
+\item{clinicalPredictors}{If \code{measurements} is a \code{MultiAssayExperiment},
49
+a character vector of features to use in modelling. This allows avoidance of things like sample IDs,
50
+sample acquisition dates, etc. which are not relevant for outcome prediction.}
46 51
 
47 52
 \item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum
48 53
 tolerated proportion of missingness for a feature to be retained for modelling.}
49 54
 
50
-\item{topNvariance}{Default: NULL. An integer number of most variable features to subset to.}
55
+\item{topNvariance}{Default: NULL. An integer number of most variable features per assay to subset to.
56
+Assays with fewer features won't be reduced in size.}
51 57
 
52 58
 \item{outcomeColumns}{If \code{measurements} is a \code{MultiAssayExperiment}, the
53 59
 names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}