Merge pull request #69 from SydneyBioX/master

Implemented precisionPathwaysTrain and Added Default Weights for GLMs

Dario Strbenac authored on 16/02/2023 03:22:53 • GitHub committed on 16/02/2023 03:22:53
Showing 11 changed files

... ...
@@ -3,8 +3,8 @@ Type: Package
3 3
 Title: A framework for cross-validated classification problems, with
4 4
        applications to differential variability and differential
5 5
        distribution testing
6
-Version: 3.3.13
7
-Date: 2023-02-13
6
+Version: 3.3.14
7
+Date: 2023-02-16
8 8
 Authors@R:
9 9
     c(
10 10
     person(given = "Dario", family = "Strbenac", email = "dario.strbenac@sydney.edu.au", role = c("aut", "cre")),
... ...
@@ -72,6 +72,7 @@ Collate:
72 72
     'interfaceXGB.R'
73 73
     'performancePlot.R'
74 74
     'plotFeatureClasses.R'
75
+    'precisionPathways.R'
75 76
     'prepareData.R'
76 77
     'previousSelection.R'
77 78
     'previousTrained.R'
... ...
@@ -76,6 +76,8 @@ exportMethods(models)
76 76
 exportMethods(performance)
77 77
 exportMethods(performancePlot)
78 78
 exportMethods(plotFeatureClasses)
79
+exportMethods(precisionPathwaysPredict)
80
+exportMethods(precisionPathwaysTrain)
79 81
 exportMethods(predictions)
80 82
 exportMethods(prepareData)
81 83
 exportMethods(rankingPlot)
... ...
@@ -35,6 +35,9 @@ setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL"))
35 35
 setClassUnion("tabular", c("data.frame", "DataFrame", "matrix"))
36 36
 setClassUnion("tabularOrList", c("tabular", "list"))
37 37
 
38
+# List-like assay data
39
+setClassUnion("MultiAssayExperimentOrList", c("MultiAssayExperiment", "list"))
40
+
38 41
 ################################################################################
39 42
 #
40 43
 # Params
... ...
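The class union added above lets one S4 method serve both input types. A minimal sketch of that dispatch pattern, using a hypothetical generic named describeInput (not part of this commit):

library(MultiAssayExperiment)
setClassUnion("MultiAssayExperimentOrList", c("MultiAssayExperiment", "list"))
setGeneric("describeInput", function(x) standardGeneric("describeInput"))
setMethod("describeInput", "MultiAssayExperimentOrList", function(x) {
  # One method body now handles both kinds of input.
  if(is(x, "MultiAssayExperiment")) "MultiAssayExperiment input" else "list input"
})
describeInput(list(clinical = data.frame(age = c(60, 72))))  # "list input"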
@@ -22,14 +22,18 @@
22 22
 #' @param extraParams A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or
23 23
 #' parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"},
24 24
 #' \code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on
25
-#' the website), specify the list element to be \code{NULL}.
25
+#' the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}.
26
+#' @param clinicalPredictors Default: \code{NULL}. If \code{measurements} is a \code{MultiAssayExperiment},
27
+#' a character vector of features to use in modelling. This allows the avoidance of variables such as sample IDs,
28
+#' sample acquisition dates, etc., which are not relevant for outcome prediction.
26 29
 #' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons
27 30
 #' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, 
28 31
 #' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
29 32
 #' Set to NULL or "all" if all features should be used.
30 33
 #' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
31 34
 #' and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
32
-#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.
35
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no
36
+#' independent feature selection will be performed (but implicit selection may still occur within the classifier).
33 37
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
34 38
 #' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
35 39
 #' options described in \code{\link{calcPerformance}} may otherwise be specified.
... ...
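A hypothetical call illustrating how the new clinicalPredictors argument and the extraParams list documented above combine; mae and the column names "class", "age" and "stage" are assumed example values, not from this commit:

# Assuming mae is a MultiAssayExperiment with a "class" column in colData(mae).
result <- crossValidate(mae, outcome = "class",
                        clinicalPredictors = c("age", "stage"),  # Avoid IDs, dates, etc.
                        extraParams = list(prepare = list(maxMissingProp = 0.1)))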
@@ -88,12 +92,13 @@
88 92
 #' # performancePlot(c(result, resultMerge))
89 93
 #' 
90 94
 #' @importFrom survival Surv
95
+#' @usage NULL
91 96
 setGeneric("crossValidate", function(measurements, outcome, ...)
92 97
     standardGeneric("crossValidate"))
93 98
 
94 99
 #' @rdname crossValidate
95 100
 #' @export
96
-setMethod("crossValidate", "DataFrame", 
101
+setMethod("crossValidate", "DataFrame",
97 102
           function(measurements,
98 103
                    outcome,
99 104
                    nFeatures = 20,
... ...
@@ -110,12 +115,15 @@ setMethod("crossValidate", "DataFrame",
110 115
 
111 116
           {
112 117
               # Check that data is in the right format, if not already done for MultiAssayExperiment input.
113
-              prepParams <- list(measurements, outcome)
114
-              if("prepare" %in% names(extraParams))
115
-                prepParams <- c(prepParams, extraParams[["prepare"]])
116
-              measurementsAndOutcome <- do.call(prepareData, prepParams)
117
-              measurements <- measurementsAndOutcome[["measurements"]]
118
-              outcome <- measurementsAndOutcome[["outcome"]]
118
+              if(!"assay" %in% colnames(S4Vectors::mcols(measurements))) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. 
119
+              {
120
+                prepParams <- list(measurements, outcome)
121
+                if("prepare" %in% names(extraParams))
122
+                  prepParams <- c(prepParams, extraParams[["prepare"]])
123
+                measurementsAndOutcome <- do.call(prepareData, prepParams)
124
+                measurements <- measurementsAndOutcome[["measurements"]]
125
+                outcome <- measurementsAndOutcome[["outcome"]]
126
+              }
119 127
               
120 128
               # Ensure performance type is one of the ones that can be calculated by the package.
121 129
               if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
... ...
@@ -305,9 +313,10 @@ setMethod("crossValidate", "DataFrame",
305 313
 #' @rdname crossValidate
306 314
 #' @export
307 315
 # One or more omics data sets, possibly with clinical data.
308
-setMethod("crossValidate", "MultiAssayExperiment",
316
+setMethod("crossValidate", "MultiAssayExperimentOrList",
309 317
           function(measurements,
310
-                   outcome, 
318
+                   outcome,
319
+                   clinicalPredictors = NULL,
311 320
                    nFeatures = 20,
312 321
                    selectionMethod = "auto",
313 322
                    selectionOptimisation = "Resubstitution",
... ...
@@ -321,7 +330,7 @@ setMethod("crossValidate", "MultiAssayExperiment",
321 330
                    characteristicsLabel = NULL, extraParams = NULL)
322 331
           {
323 332
               # Check that data is in the right format, if not already done for MultiAssayExperiment input.
324
-              prepParams <- list(measurements, outcome)
333
+              prepParams <- list(measurements, outcome, clinicalPredictors)
325 334
               if("prepare" %in% names(extraParams))
326 335
                 prepParams <- c(prepParams, extraParams[["prepare"]])
327 336
               measurementsAndOutcome <- do.call(prepareData, prepParams)
... ...
@@ -408,76 +417,6 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
408 417
                             characteristicsLabel = characteristicsLabel, extraParams = extraParams)
409 418
           })
410 419
 
411
-# This expects that each table is about the same set of samples and thus
412
-# has the same number of rows as every other table.
413
-#' @rdname crossValidate                                                            
414
-#' @export
415
-setMethod("crossValidate", "list",
416
-          function(measurements,
417
-                   outcome, 
418
-                   nFeatures = 20,
419
-                   selectionMethod = "auto",
420
-                   selectionOptimisation = "Resubstitution",
421
-                   performanceType = "auto",
422
-                   classifier = "auto",
423
-                   multiViewMethod = "none",
424
-                   assayCombinations = "all",
425
-                   nFolds = 5,
426
-                   nRepeats = 20,
427
-                   nCores = 1,
428
-                   characteristicsLabel = NULL, extraParams = NULL)
429
-          {
430
-              # Check data type is valid
431
-              if (!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix")))) {
432
-                  stop("assays must be of type data.frame, DataFrame or matrix")
433
-              }
434
-              
435
-              # Check the list is named
436
-              if (is.null(names(measurements))) {
437
-                  stop("Measurements must be a named list.")
438
-              }
439
-              
440
-              # Check same number of samples for all datasets
441
-              if (!length(unique(sapply(measurements, nrow))) == 1) {
442
-                  stop("All datasets must have the same samples.")
443
-              }
444
-              
445
-              # Check the number of outcome is the same
446
-              if (!all(sapply(measurements, nrow) == length(outcome)) && !is.character(outcome)) {
447
-                  stop("outcome must have same number of samples as measurements.")
448
-              }
449
-              
450
-              df_list <- sapply(measurements, S4Vectors::DataFrame, check.names = FALSE)
451
-              
452
-              df_list <- mapply(function(meas, nam){
453
-                  S4Vectors::mcols(meas)$assay <- nam
454
-                  S4Vectors::mcols(meas)$feature <- colnames(meas)
455
-                  meas
456
-              }, df_list, names(df_list))
457
-              
458
-              
459
-              combined_df <- do.call("cbind", df_list) 
460
-              colnames(combined_df) <- S4Vectors::mcols(combined_df)$feature
461
-
462
-
463
-              
464
-              crossValidate(measurements = combined_df,
465
-                            outcome = outcome, 
466
-                            nFeatures = nFeatures,
467
-                            selectionMethod = selectionMethod,
468
-                            selectionOptimisation = selectionOptimisation,
469
-                            performanceType = performanceType,
470
-                            classifier = classifier,
471
-                            multiViewMethod = multiViewMethod,
472
-                            assayCombinations = assayCombinations,
473
-                            nFolds = nFolds,
474
-                            nRepeats = nRepeats,
475
-                            nCores = nCores,
476
-                            characteristicsLabel = characteristicsLabel, extraParams = extraParams)
477
-          })
478
-
479
-
480
-
481 420
 ######################################
482 421
 ######################################
483 422
 cleanNFeatures <- function(nFeatures, measurements){
... ...
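With the dedicated list method removed, a named list of tables now dispatches to the MultiAssayExperimentOrList method, and the merging and checking previously done here moves into prepareData's new list method. A sketch with invented data; whether it runs end-to-end depends on the rest of the package:

clinical <- data.frame(age = c(60, 72, 55), stage = factor(c("I", "II", "I")),
                       row.names = paste("Sample", 1:3))
rna <- matrix(rnorm(30), nrow = 3,
              dimnames = list(paste("Sample", 1:3), paste0("Gene", 1:10)))
classes <- factor(c("Good", "Poor", "Good"))
result <- crossValidate(list(clinical = clinical, RNA = rna), classes,
                        clinicalPredictors = c("age", "stage"))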
@@ -656,7 +595,6 @@ generateModellingParams <- function(assayIDs,
656 595
     }    
657 596
     
658 597
     selectionMethod <- unlist(selectionMethod)
659
-    if(is.null(selectionMethod)) selectionMethod <- "none"
660 598
 
661 599
     if(selectionMethod != "none")
662 600
     {
... ...
@@ -1126,9 +1064,9 @@ train.list <- function(x, outcomeTrain, ...)
1126 1064
 #' @rdname crossValidate
1127 1065
 #' @method train MultiAssayExperiment
1128 1066
 #' @export
1129
-train.MultiAssayExperiment <- function(x, outcome, ...)
1067
+train.MultiAssayExperiment <- function(x, outcome, clinicalPredictors = NULL, ...)
1130 1068
           {
1131
-              prepArgs <- list(x, outcome)
1069
+              prepArgs <- list(x, outcome, clinicalPredictors)
1132 1070
               extraInputs <- list(...)
1133 1071
               prepExtras <- trainExtras <- numeric()
1134 1072
               if(length(extraInputs) > 0)
... ...
@@ -1167,7 +1105,7 @@ predict.trainedByClassifyR <- function(object, newData, ...)
1167 1105
     newData <- do.call(cbind, newData)
1168 1106
     } else if(is(newData, "MultiAssayExperiment"))
1169 1107
             {
1170
-              newData <- prepareData(newData, useFeatures = allFeatureNames(object))
1108
+              newData <- prepareData(newData, clinicalPredictors = subset(allFeatureNames(object), assay == "clinical")[, "feature"])
1171 1109
               # Some classifiers dangerously use positional matching rather than column name matching.
1172 1110
               # newData columns are sorted so that the right column ordering is guaranteed.
1173 1111
     }
... ...
@@ -6,20 +6,26 @@ elasticNetGLMtrainInterface <- function(measurementsTrain, classesTrain, lambda
6 6
     stop("The package 'glmnet' could not be found. Please install it.")
7 7
   if(verbose == 3)
8 8
     message("Fitting elastic net regularised GLM classifier to data.")
9
-    
10
-  measurementsMatrix <- glmnet::makeX(as(measurementsTrain, "data.frame"))
11
-
12
-  fitted <- glmnet::glmnet(measurementsMatrix, classesTrain, family = "multinomial", ...)
13 9
 
10
+  # One-hot encoding needed.    
11
+  measurementsTrain <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain)
12
+  fitted <- glmnet::glmnet(measurementsTrain, classesTrain, family = "multinomial", weights = as.numeric(1 / (table(classesTrain)[classesTrain] / length(classesTrain))), ...)
13
+  # Inverse class-size weighting is needed to give decent predictions when classes are imbalanced.
14
+  
14 15
   if(is.null(lambda)) # fitted has numerous models for automatically chosen lambda values.
15
-  { # Pick one lambda based on resubstitution performance.
16
-    bestLambda <- fitted[["lambda"]][which.min(sapply(fitted[["lambda"]], function(lambda) # Largest Lambda with minimum balanced error rate.
16
+  { # Pick one lambda based on resubstitution performance, but not one which excludes all variables from the model.
17
+    lambdaConsider <- colSums(as.matrix(fitted[["beta"]][[1]])) != 0
18
+    bestLambda <- fitted[["lambda"]][lambdaConsider][which.min(sapply(fitted[["lambda"]][lambdaConsider], function(lambda) # Largest Lambda with minimum balanced error rate.
17 19
     {
18
-      classPredictions <- factor(as.character(predict(fitted, measurementsMatrix, s = lambda, type = "class")), levels = fitted[["classnames"]])
20
+      classPredictions <- factor(as.character(predict(fitted, measurementsTrain, s = lambda, type = "class")), levels = fitted[["classnames"]])
19 21
       calcExternalPerformance(classesTrain, classPredictions, "Balanced Error")
20 22
     }))[1]]
21 23
     attr(fitted, "tune") <- list(lambda = bestLambda)
22 24
   }
25
+  
26
+  attr(fitted, "featureNames") <- colnames(measurementsTrain)
27
+  attr(fitted, "featureGroups") <- measurementsTrain@assign
28
+  
23 29
   fitted
24 30
 }
25 31
 attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface"
... ...
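A worked example of the inverse class-size weights introduced above, for an imbalanced two-class factor; each class ends up contributing the same total weight:

classesTrain <- factor(c("A", "A", "A", "B"))
as.numeric(1 / (table(classesTrain)[classesTrain] / length(classesTrain)))
# 1.333333 1.333333 1.333333 4.000000 : each class's weights sum to 4.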
@@ -28,6 +34,8 @@ attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface"
28 34
 elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., returnType = c("both", "class", "score"), verbose = 3)
29 35
 { # ... just consumes emitted tuning variables from .doTrain which are unused.
30 36
   returnType <- match.arg(returnType)
37
+  # One-hot encoding needed.    
38
+  measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest)
31 39
   
32 40
   # Ensure that testing data has same columns names in same order as training data.
33 41
   # Remove those annoying backquotes which glmnet adds if variables have spaces in names.
... ...
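A sketch of what the one-hot encoding step produces: MatrixModels::model.Matrix expands each factor into indicator columns, and its assign slot records which original variable each column came from (used later by elasticNetFeatures). The data frame is an invented example:

library(MatrixModels)
exampleData <- data.frame(age = c(60, 72, 55), stage = factor(c("I", "II", "III")))
designMatrix <- MatrixModels::model.Matrix(~ 0 + ., data = exampleData)
colnames(designMatrix)  # "age" "stageI" "stageII" "stageIII"
designMatrix@assign     # 1 2 2 2 : the three stage columns map back to variable 2.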
@@ -41,11 +49,11 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ...,
41 49
   if(missing(lambda)) # Tuning parameters are not passed to prediction functions.
42 50
     lambda <- attr(model, "tune")[["lambda"]] # Sneak it in as an attribute on the model.
43 51
 
44
-  testMatrix <- glmnet::makeX(as(measurementsTest, "data.frame"))
45
-  testMatrix <- testMatrix[, rownames(model[["beta"]][[1]])]
46 52
   
47
-  classPredictions <- factor(as.character(predict(model, testMatrix, s = lambda, type = "class")), levels = model[["classnames"]])
48
-  classScores <- predict(model, testMatrix, s = lambda, type = "response")[, , 1]
53
+  measurementsTest <- measurementsTest[, rownames(model[["beta"]][[1]])]
54
+  
55
+  classPredictions <- factor(as.character(predict(model, measurementsTest, s = lambda, type = "class")), levels = model[["classnames"]])
56
+  classScores <- predict(model, measurementsTest, s = lambda, type = "response")[, , 1]
49 57
   
50 58
   if(is.matrix(classScores))
51 59
     classScores <- classScores[, model[["classnames"]]]
... ...
@@ -61,16 +69,20 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ...,
61 69
 #
62 70
 # Get selected features (i.e. non-zero model coefficients)
63 71
 #
72
+# Note: Need to convert back to actual features when factors were expanded into
73
+# multiple columns using indicator variables.
74
+#
64 75
 ################################################################################
65 76
 
66 77
 elasticNetFeatures <- function(model)
67 78
                       {
68
-                        inputFeatures <- rownames(model[["beta"]][[1]])            
69 79
                         # Floating point numbers test for equality.
70 80
                         whichCoefficientColumn <- which(abs(model[["lambda"]] - attr(model, "tune")[["lambda"]]) < 0.00001)[1]
71 81
                         coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn])
72 82
                         featureScores <- rowSums(abs(coefficientsUsed))
73
-                        rankedFeaturesIndices <- order(featureScores, decreasing = TRUE)
74
-                        selectedFeaturesIndices <- which(featureScores != 0)
83
+                        featureGroups <- attr(model, "featureGroups")[match(names(featureScores), attr(model, "featureNames"))]
84
+                        groupScores <- unname(by(featureScores, featureGroups, max))
85
+                        rankedFeaturesIndices <- order(groupScores, decreasing = TRUE)
86
+                        selectedFeaturesIndices <- which(groupScores != 0)
75 87
                         list(rankedFeaturesIndices, selectedFeaturesIndices)
76 88
                       }
77 89
\ No newline at end of file
... ...
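A small demonstration of the group-wise collapse performed by elasticNetFeatures above: indicator columns expanded from the same factor share a group identifier, and the largest score within a group represents the original feature. The scores are invented:

featureScores <- c(age = 0.2, stageI = 0, stageII = 0.7, stageIII = 0.1)
featureGroups <- c(1, 2, 2, 2)  # age is variable 1, the stage indicators are variable 2.
groupScores <- unname(by(featureScores, featureGroups, max))
groupScores                            # 0.2 0.7
order(groupScores, decreasing = TRUE)  # 2 1 : stage ranks ahead of age.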
@@ -9,7 +9,7 @@ GLMtrainInterface <- function(measurementsTrain, classesTrain, ..., verbose = 3)
9 9
         fitData <- cbind(measurementsTrain, class = classesTrain)
10 10
         classesTrain <- "class" # Column name for glm fit.
11 11
     } else {fitData <- measurementsTrain}
12
-  glm(class ~ . + 0, family = binomial, data = fitData, ...)
12
+  glm(class ~ . + 0, family = binomial, data = fitData, weights = as.numeric(1 / (table(fitData[, "class"])[fitData[, "class"]] / nrow(fitData))), ...)
13 13
 }
14 14
 attr(GLMtrainInterface, "name") <- "GLMtrainInterface"
15 15
 
16 16
new file mode 100644
... ...
@@ -0,0 +1,180 @@
1
+#' Precision Pathways for Sample Prediction Based on Prediction Confidence.
2
+#' 
3
+#' Precision pathways allow the evaluation of various permutations of multiomics or multiview data.
4
+#' Samples are predicted by a particular assay if they were consistently predicted as a particular class
5
+#' during cross-validation. Otherwise, they are passed on to subsequent assays/tiers for prediction. Balanced accuracy
6
+#' is used to evaluate overall prediction performance, and sample-specific accuracy is used for individual-level evaluation.
7
+#'
8
+#' @param measurements Either a \code{\link{MultiAssayExperiment}} or a list of the basic tabular objects containing the data.
9
+#' @param class If \code{measurements} is a \code{MultiAssayExperiment}, the name of the class column in its \code{colData}. If \code{measurements} is a \code{list}, may also be
10
+#' a vector of classes.
11
+#' @param clinicalPredictors Default: \code{NULL}. Must be a character vector of clinical features to use in modelling. This allows the avoidance of variables such as sample IDs,
12
+#' sample acquisition dates, etc., which are not relevant for outcome prediction.
13
+#' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum
14
+#' tolerated proportion of missingness for a feature to be retained for modelling.
15
+#' @param topNvariance Default: NULL. An integer number of most variable features per assay to subset to.
16
+#' Assays with fewer features won't be reduced in size.
17
+#' @param fixedAssays A character vector of assay names specifying any assays which must be at the
18
+#' beginning of the pathway.
19
+#' @param confidenceCutoff The minimum confidence of predictions for a sample to be predicted by a particular assay (tier).
20
+#' If a sample was predicted to belong to a particular class a proportion \eqn{p} of the time, then the confidence is \eqn{2 \times |p - 0.5|}.
21
+#' @param minAssaySamples An integer specifying the minimum number of samples a tier may have. If a subsequent tier
22
+#' would have less than this number of samples, the samples are incorporated into the current tier.
23
+#' @param nFeatures Default: 20. The number of features to consider during feature selection, if feature selection is done.
24
+#' @param selectionMethod A named character vector of feature selection methods to use for the assays, one for each. The names must correspond to names of \code{measurements}.
25
+#' @param classifier A named character vector of modelling methods to use for the assays, one for each. The names must correspond to names of \code{measurements}.
26
+#' @param nFolds A numeric specifying the number of folds to use for cross-validation.
27
+#' @param nRepeats A numeric specifying the number of repeats or permutations to use for cross-validation.
28
+#' @param nCores A numeric specifying the number of cores used if the user wants to use parallelisation.
29
+#' @param pathways A set of pathways created by \code{precisionPathwaysTrain} to be used for predicting on a new data set.
30
+#' @rdname precisionPathways
31
+#' @return An object of class \code{PrecisionPathways} which is basically a named list that other plotting and
32
+#' tabulating functions can use.
33
+#' @examples
34
+#' # To be determined.
35
+
36
+#' @usage NULL
37
+setGeneric("precisionPathwaysTrain", function(measurements, class, ...)
38
+    standardGeneric("precisionPathwaysTrain"))
39
+
40
+#' @rdname precisionPathways
41
+#' @export
42
+setMethod("precisionPathwaysTrain", "MultiAssayExperimentOrList", 
43
+          function(measurements, class, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL,
44
+                   fixedAssays = "clinical", confidenceCutoff = 0.8, minAssaySamples = 10,
45
+                   nFeatures = 20, selectionMethod = setNames(c("none", rep("t-test", length(measurements))), c("clinical", names(measurements))),
46
+                   classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
47
+                   nFolds = 5, nRepeats = 20, nCores = 1)
48
+          {
49
+            if(is.list(measurements)) # Ensure plain list has clinical data.
50
+            {
51
+              # One of the tables must be named "clinical".
52
+              if (!any(names(measurements) == "clinical"))
53
+                stop("One of the tables must be named \"clinical\".")
54
+            }
55
+            prepArgs <- list(measurements, outcomeColumns = class, clinicalPredictors = clinicalPredictors,
56
+                             maxMissingProp = maxMissingProp, topNvariance = topNvariance)
57
+            measurementsAndClass <- do.call(prepareData, prepArgs)
58
+              
59
+            .precisionPathwaysTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]],
60
+                                   fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff,
61
+                                   minAssaySamples = minAssaySamples, nFeatures = nFeatures,
62
+                                   selectionMethod = selectionMethod, classifier = classifier,
63
+                                   nFolds = nFolds, nRepeats = nRepeats, nCores = nCores)
64
+          })
65
+
66
+# Internal method which carries out all of the processing, obtaining reformatted data from the
67
+# MultiAssayExperiment and list (of basic rectangular tables) S4 methods.
68
+.precisionPathwaysTrain <- function(measurements, class, fixedAssays = "clinical",
69
+                   confidenceCutoff = 0.8, minAssaySamples = 10,
70
+                   nFeatures = 20, selectionMethod = setNames(c("none", rep("t-test", length(measurements))), c("clinical", names(measurements))),
71
+                   classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))),
72
+                   nFolds = 5, nRepeats = 20, nCores = 1)
73
+          {
74
+            # Step 1: Determine all valid permutations of assays, taking into account the
75
+            # assays to be used and which assays, if any, must be included.
76
+            assayIDs <- unique(S4Vectors::mcols(measurements)[["assay"]])
77
+            assaysPermutations <- .permutations(assayIDs, fixed = data.frame(seq_along(fixedAssays), fixedAssays))
78
+            permutationIDs <- apply(assaysPermutations, 2, function(permutation) paste(permutation, collapse = '-'))
79
+            
80
+            # Step 2: Build a classifier for each assay using all of the samples.
81
+            modelsList <- crossValidate(measurements, class, nFeatures, selectionMethod,
82
+                                        classifier = classifier, nFolds = nFolds,
83
+                                        nRepeats = nRepeats, nCores = nCores)
84
+            modelsList <- lapply(modelsList, calcCVperformance, "Sample Accuracy") # Add sample accuracy, which can be subset later.
85
+
86
+            # Step 3: Loop over each pathway and each assay in order to determine which samples are used at that level
87
+            # and which are passed onwards.
88
+            precisionPathways <- lapply(as.data.frame(assaysPermutations), function(permutation)
89
+            {
90
+              assaysProcessed <- character()
91
+              samplesUsed <- character()
92
+              individualsTableAll <- S4Vectors::DataFrame()
93
+              tierTableAll <- S4Vectors::DataFrame()
94
+              breakEarly <- FALSE
95
+              for(assay in permutation)
96
+              {
97
+                # Step 3a: Identify all samples which are consistently predicted.
98
+                modelIndex <- match(assay, assayIDs)
99
+                allPredictions <- predictions(modelsList[[modelIndex]])
100
+                allSampleIDs <- sampleNames(modelsList[[modelIndex]])
101
+                predictionsSamplesCounts <- table(allPredictions[, "sample"], allPredictions[, "class"])
102
+                confidences <- 2 * abs(predictionsSamplesCounts[, 1] / rowSums(predictionsSamplesCounts) - 0.5)
103
+                sampleIDsUse <- names(confidences)[confidences > confidenceCutoff]
104
+                
105
+                # Check if too few samples left for next round. Include them in this round, if so.
106
+                remainingIDs <- setdiff(allSampleIDs, c(samplesUsed, sampleIDsUse))
107
+                if(length(remainingIDs) < minAssaySamples)
108
+                {
109
+                  sampleIDsUse <- c(sampleIDsUse, remainingIDs)
110
+                  breakEarly <- TRUE
111
+                }
112
+                
113
+                predictionsSamplesCounts <- predictionsSamplesCounts[sampleIDsUse, ]
114
+                
115
+                # Step 3b: Individuals predictions and sample-wise accuracy, tier-wise error.
116
+                maxVotes <- apply(predictionsSamplesCounts, 1, function(sample) which.max(sample))
117
+                predictedClasses <- factor(colnames(predictionsSamplesCounts)[maxVotes],
118
+                                           levels = colnames(predictionsSamplesCounts))    
119
+                individualsTable <- S4Vectors::DataFrame(Tier = assay,
120
+                                                         `Sample ID` = sampleIDsUse,
121
+                                                         `Predicted` = predictedClasses,
122
+                                                         `Accuracy` = performance(modelsList[[modelIndex]])[["Sample Accuracy"]][sampleIDsUse],
123
+                                                         check.names = FALSE)
124
+                knownClasses <- actualOutcome(modelsList[[modelIndex]])[match(sampleIDsUse, allSampleIDs)]
125
+                balancedAccuracy <- calcExternalPerformance(knownClasses, predictedClasses)
126
+                tierTable <- S4Vectors::DataFrame(Tier = assay,
127
+                                                  `Balanced Accuracy` = balancedAccuracy, check.names = FALSE)
128
+                
129
+                assaysProcessed <- c(assaysProcessed, assay)
130
+                individualsTableAll <- rbind(individualsTableAll, individualsTable)
131
+                tierTableAll <- rbind(tierTableAll, tierTable)
132
+                samplesUsed <- c(samplesUsed, sampleIDsUse)
133
+                
134
+                if(breakEarly) break
135
+              }
136
+              pathwayString <- paste(assaysProcessed, collapse = '-')
137
+              parameters <- list(confidenceCutoff = confidenceCutoff, minAssaySamples = minAssaySamples)
138
+              list(models = modelsList, parameters = parameters, pathway = pathwayString,
139
+                  individuals = individualsTableAll, tiers = tierTableAll)
140
+            })
141
+            
142
+            class(precisionPathways) <- "PrecisionPathways"
143
+            names(precisionPathways) <- sapply(precisionPathways, "[[", "pathway")
144
+            precisionPathways
145
+}
146
+
147
+# A nice print method to avoid flooding the screen with lots of tables
148
+# when result is shown in console.
149
+print.PrecisionPathways <- function(x, ...)
150
+{
151
+  cat("An object of class 'PrecisionPathways'.\n")
152
+  cat("Pathways:\n")
153
+  cat(paste(names(x), collapse = '\n'), "\n")
154
+}
155
+
156
+#' @usage NULL
157
+setGeneric("precisionPathwaysPredict", function(pathways, measurements, class, ...)
158
+    standardGeneric("precisionPathwaysPredict"))
159
+
160
+#' @rdname precisionPathways
161
+#' @export
162
+setMethod("precisionPathwaysPredict", "MultiAssayExperimentOrList", 
163
+          function(pathways, measurements, class)
164
+          {
165
+            if(is.list(measurements)) # Ensure plain list has clinical data.
166
+            {
167
+              # One of the tables must be named "clinical".
168
+              if (!any(names(measurements) == "clinical"))
169
+                stop("One of the tables must be named \"clinical\".")
170
+            }
171
+            prepArgs <- list(measurements, outcomeColumns = class)
172
+            measurementsAndClass <- do.call(prepareData, prepArgs)
173
+              
174
+            .precisionPathwaysPredict(pathways, measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]])
175
+          })
176
+
177
+.precisionPathwaysPredict <- function(pathways, measurements, class)
178
+{
179
+  # To do.
180
+}
0 181
\ No newline at end of file
... ...
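A worked example of the confidence formula used by the tier loop above: a sample predicted as one class in 45 of 50 cross-validation repeats has p = 0.9, so its confidence is 2 * |0.9 - 0.5| = 0.8, which does not pass the default confidenceCutoff of 0.8 because the comparison is strict. The counts are invented:

predictionsSamplesCounts <- matrix(c(45, 5), nrow = 1,
                                   dimnames = list("Sample 1", c("Poor", "Good")))
confidences <- 2 * abs(predictionsSamplesCounts[, 1] / rowSums(predictionsSamplesCounts) - 0.5)
confidences          # 0.8
confidences > 0.8    # FALSE : the sample would be passed on to the next tier.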
@@ -18,15 +18,13 @@
18 18
 #' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the
19 19
 #' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
20 20
 #' that contain(s) the each individual's outcome to use for prediction.
21
-#' @param useFeatures If \code{measurements} is a \code{MultiAssayExperiment},
22
-#' a two-column table of features to use. The first column must have assay names
23
-#' and the second column must have feature names found for that assay. \code{"clinical"} is
24
-#' also a valid assay name and refers to the clinical data table. \code{"all"} is a special
25
-#' keyword that means all features (passing any other filters) of that assay will be used 
26
-#' for modelling. Otherwise, a character vector of feature names to use suffices.
21
+#' @param clinicalPredictors If \code{measurements} is a \code{MultiAssayExperiment},
22
+#' a character vector of features to use in modelling. This allows the avoidance of variables such as sample IDs,
23
+#' sample acquisition dates, etc., which are not relevant for outcome prediction.
27 24
 #' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum
28 25
 #' tolerated proportion of missingness for a feature to be retained for modelling.
29
-#' @param topNvariance Default: NULL. An integer number of most variable features to subset to.
26
+#' @param topNvariance Default: NULL. An integer number of most variable features per assay to subset to.
27
+#' Assays with fewer features won't be reduced in size.
30 28
 #' @param ... Variables not used by the \code{matrix} nor the
31 29
 #' \code{MultiAssayExperiment} method which are passed into and used by the
32 30
 #' \code{DataFrame} method.
... ...
@@ -58,18 +56,16 @@ setMethod("prepareData", "data.frame",
58 56
 #' @rdname prepareData
59 57
 #' @export
60 58
 setMethod("prepareData", "DataFrame",
61
-  function(measurements, outcome, useFeatures = "all", maxMissingProp = 0.0, topNvariance = NULL)
59
+  function(measurements, outcome, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL)
62 60
 {
63 61
   if(is.null(rownames(measurements)))
64 62
   {
65 63
     warning("'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.")
66 64
     rownames(measurements) <- paste("Sample", seq_len(nrow(measurements)))
67
-  }      
68
-            
69
-  if(useFeatures != "all") # Subset to only the desired ones.
70
-    measurements <- measurements[, useFeatures]
71
-
65
+  }
66
+      
72 67
   # Won't ever be true if input data was MultiAssayExperiment because wideFormat already produces valid names.  
68
+  # Need to check if input data was DataFrame because names might not be valid from user.
73 69
   if(!all(colnames(measurements) == make.names(colnames(measurements))))
74 70
   {
75 71
     warning("Unsafe feature names in input data. Converted into safe names.")
... ...
@@ -129,6 +125,19 @@ setMethod("prepareData", "DataFrame",
129 125
     else # Three columns. Therefore, counting process data.
130 126
       outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
131 127
   }
128
+  
129
+  if(!is.null(clinicalPredictors))
130
+  {
131
+    if(!is.null(mcols(measurements)$assay))
132
+    {
133
+      clinicalIndices <- which(mcols(measurements)$assay == "clinical")
134
+      usePredictors <- intersect(clinicalIndices, which(mcols(measurements)$feature %in% clinicalPredictors))
135
+      dropIndices <- setdiff(clinicalIndices, usePredictors)
136
+      if(length(dropIndices) > 0) measurements <- measurements[, -dropIndices]
137
+    } else { # The DataFrame is entirely clinical data.
138
+      measurements <- measurements[, clinicalPredictors]
139
+    }
140
+  }  
132 141
 
133 142
   # Remove samples with indeterminate outcome.
134 143
   dropSamples <- which(is.na(outcome) | is.null(outcome))
... ...
@@ -146,11 +155,18 @@ setMethod("prepareData", "DataFrame",
146 155
   if(length(dropFeatures) > 0)
147 156
     measurements <- measurements[, -dropFeatures]
148 157
   
149
-  # Use only the most variable features.
158
+  # Use only the top N most variable features per assay.
150 159
   if(!is.null(topNvariance))
151 160
   {
152
-    mostVariance <- order(apply(measurements, 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance]
153
-    measurements <- measurements[, mostVariance]
161
+    if(is.null(mcols(measurements)$assay)) assays <- rep(1, ncol(measurements)) else assays <- mcols(measurements)$assay
162
+    measurements <- do.call(cbind, lapply(unique(assays), function(assay)
163
+    {
164
+      assayColumns <- which(assays == assay)    
165
+      if(length(assayColumns) < topNvariance)
166
+        measurements[, assayColumns]
167
+      else
168
+        measurements[, assayColumns][order(apply(measurements[, assayColumns], 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance]]  
169
+    }))
154 170
   }
155 171
   
156 172
   list(measurements = measurements, outcome = outcome)
... ...
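A condensed, self-contained variant of the per-assay variance filtering added above, with invented data whose columns are tagged by assay in the metadata columns:

library(S4Vectors)
measurements <- DataFrame(matrix(rnorm(30), nrow = 5))
mcols(measurements)$assay <- rep(c("RNA", "protein"), each = 3)
topNvariance <- 2
assays <- mcols(measurements)$assay
measurements <- do.call(cbind, lapply(unique(assays), function(assay)
{
  assayColumns <- which(assays == assay)
  variances <- apply(as.matrix(measurements[, assayColumns]), 2, var, na.rm = TRUE)
  measurements[, assayColumns][order(variances, decreasing = TRUE)[1:min(topNvariance, length(assayColumns))]]
}))
ncol(measurements)  # 4 : the two most variable columns of each assay remain.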
@@ -159,71 +175,75 @@ setMethod("prepareData", "DataFrame",
159 175
 #' @rdname prepareData
160 176
 #' @export
161 177
 setMethod("prepareData", "MultiAssayExperiment",
162
-  function(measurements, outcomeColumns = NULL, useFeatures = "all", ...)
178
+  function(measurements, outcomeColumns = NULL, clinicalPredictors = NULL, ...)
163 179
 {
164
-  if(is.character(useFeatures)) useFeatures <- data.frame(assay = names(measurements), feature = "all")
165
-  omicsTargets <- setdiff(useFeatures[, "assay"], "clinical")
166
-  if(length(omicsTargets) > 0)
167
-  {
168
-    if(any(anyReplicated(measurements[, , omicsTargets])))
169
-      stop("Data set contains replicates. Please remove or average replicate observations and try again.")
170
-  }
171
-  
172
-  if(!is.null(outcomeColumns) && !all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements))))
180
+  if(is.null(clinicalPredictors))
181
+    stop("'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.")      
182
+
183
+  if(any(anyReplicated(measurements)))
184
+    stop("Data set contains replicates. Please remove or average replicate observations and try again.")
185
+ 
186
+  if(is.null(outcomeColumns))
187
+    stop("'outcomeColumns' is a mandatory parameter but was not specified.")
188
+      
189
+  if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements))))
173 190
     stop("Not all column names specified by 'outcomeColumns' found in clinical table.")  
174
-  if(!all(useFeatures[, "assay"] %in% c(names(measurements), "clinical")))
175
-    stop("Some assay names in first column of 'useFeatures' are not assay names in 'measurements' or \"clinical\".")
176 191
 
177
-  clinicalColumnsDataset <- colnames(MultiAssayExperiment::colData(measurements))
178
-  if("clinical" %in% useFeatures[, "assay"])
179
-  {
180
-    clinicalRows <- useFeatures[, "assay"] == "clinical"      
181
-    clinicalColumns <- useFeatures[clinicalRows, "feature"]
182
-    if(length(clinicalColumns) == 1 && clinicalColumns == "all")
183
-      clinicalColumns <- setdiff(clinicalColumnsDataset, outcomeColumns)
184
-    useFeatures <- useFeatures[!clinicalRows, ]
185
-  } else {
186
-    clinicalColumns <- NULL
187
-  }
188
-  
189
-  if(nrow(useFeatures) > 0)
190
-  {
191
-    measurements <- measurements[, , unique(useFeatures[, "assay"])]
192
-    # Get all desired measurements tables and clinical columns (other than the columns representing outcome).
193
-    # These form the independent variables to be used for making predictions with.
194
-    # Variable names will have names like RNA_BRAF for traceability.
195
-    dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns))
196
-    rownames(dataTable) <- dataTable[, "primary"]
197
-    S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
198
-    colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
192
+  # Get all desired measurements tables and clinical columns.
193
+  # These form the independent variables to be used for making predictions with.
194
+  # Variable names will have names like RNA_BRAF for traceability.
195
+  dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalPredictors, outcomeColumns))
196
+  rownames(dataTable) <- dataTable[, "primary"]
197
+  S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
198
+  dataTable <- dataTable[, -match("primary", colnames(dataTable))]
199
+  colnames(S4Vectors::mcols(dataTable))[1] <- "assay"
199 200
             
200
-    # Sample information variable names not included in column metadata of wide table but only as row names of it.
201
-    # Create a combined column named "feature" which has feature names of the assays as well as the clinical.
202
-    S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
203
-    missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
204
-    S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
201
+  # Sample information variable names not included in column metadata of wide table but only as row names of it.
202
+  # Create a combined column named "feature" which has feature names of the assays as well as the clinical.
203
+  S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"])
204
+  missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"])
205
+  S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices]
205 206
     
206
-    # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
207
-    S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
207
+  # Finally, a column annotation recording variable name and which table it originated from for all of the source tables.
208
+  S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
208 209
     
209
-    # Subset to only the desired features.
210
-    useFeaturesSubset <- useFeatures[useFeatures[, "feature"] != "all", ]
211
-    if(nrow(useFeaturesSubset) > 0)
212
-    {
213
-      uniqueAssays <- unique(useFeatures[, "assay"])
214
-      for(filterAssay in uniqueAssays)
215
-      {
216
-        dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay &
217
-                        !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2]
218
-        dataTable <- dataTable[, !dropFeatures]
219
-      }
220
-    }
221
-    dataTable <- dataTable[, -match("primary", colnames(dataTable))]
222
-  } else { # Must have only been clinical data.
223
-    dataTable <- MultiAssayExperiment::colData(measurements)
224
-    S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable))
225
-  }
210
+  # Do other filtering and preparation in DataFrame function.
211
+  prepareData(dataTable, outcomeColumns, clinicalPredictors = NULL, ...)
212
+})
213
+
214
+#' @rdname prepareData
215
+#' @export
216
+setMethod("prepareData", "list",
217
+  function(measurements, outcome = NULL, clinicalPredictors = NULL, ...)
218
+{
219
+  # Check the list is named.
220
+  if(is.null(names(measurements)))
221
+    stop("'measurements' must be a named list.")
222
+
223
+  # If clinical table is present, features to use must be user-specified.            
224
+  if("clinical" %in% names(measurements) && is.null(clinicalPredictors))
225
+    stop("Because one provided table in the list is named \"clinical\", 'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.")
226
+
227
+  # Check data type is valid.
228
+  if(!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix"))))
229
+    stop("assays in the list must be of type data.frame, DataFrame or matrix")
230
+              
231
+  # Check same number of samples for all datasets
232
+  if (!length(unique(sapply(measurements, nrow))) == 1)
233
+    stop("All datasets must have the same samples.")
234
+      
235
+  if("clinical" %in% names(measurements))
236
+    measurements[["clinical"]] <- measurements[["clinical"]][, clinicalPredictors]
237
+             
238
+  allMetadata <- do.call(rbind, mapply(function(measurementsOne, assayID) {
239
+                        data.frame(assay = assayID, feature = colnames(measurementsOne))
240
+                        }, measurements, names(measurements), SIMPLIFY = FALSE))
241
+  allMeasurements <- do.call("cbind", measurements)
242
+  # Different assays e.g. mRNA, protein could have same feature name e.g. BRAF.
243
+  colnames(allMeasurements) <- paste(allMetadata[, "assay"], allMetadata[, "feature"], sep = '_')
244
+  allDataFrame <- DataFrame(allMeasurements)
245
+  S4Vectors::mcols(allDataFrame) <- allMetadata
226 246
     
227 247
   # Do other filtering and preparation in DataFrame function.
228
-  prepareData(dataTable, outcomeColumns, useFeatures = "all", ...)
248
+  prepareData(allDataFrame, outcome, clinicalPredictors = NULL, ...)
229 249
 })
230 250
\ No newline at end of file
... ...
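A hypothetical call to the new list method, showing the merged result: one DataFrame whose metadata columns record the source assay and feature of every column. Because a table named "clinical" is present, clinicalPredictors is mandatory; the column names are invented:

clinical <- data.frame(age = c(60, 72, 55), stage = factor(c("I", "II", "I")),
                       sampleID = c("a1", "b2", "c3"))
rna <- matrix(rnorm(15), nrow = 3, dimnames = list(NULL, paste0("Gene", 1:5)))
prepared <- prepareData(list(clinical = clinical, RNA = rna),
                        outcome = factor(c("Good", "Poor", "Good")),
                        clinicalPredictors = c("age", "stage"))  # sampleID is dropped.
S4Vectors::mcols(prepared[["measurements"]])[, "assay"]  # "clinical" "clinical" "RNA" ...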
@@ -6,8 +6,7 @@
6 6
 \alias{crossValidate,DataFrame-method}
7 7
 \alias{crossValidate,MultiAssayExperiment-method,}
8 8
 \alias{crossValidate,data.frame-method}
9
-\alias{crossValidate,MultiAssayExperiment-method}
10
-\alias{crossValidate,list-method}
9
+\alias{crossValidate,MultiAssayExperimentOrList-method}
11 10
 \alias{train.matrix}
12 11
 \alias{train.data.frame}
13 12
 \alias{train.DataFrame}
... ...
@@ -16,8 +15,6 @@
16 15
 \alias{predict.trainedByClassifyR}
17 16
 \title{Cross-validation to evaluate classification performance.}
18 17
 \usage{
19
-crossValidate(measurements, outcome, ...)
20
-
21 18
 \S4method{crossValidate}{DataFrame}(
22 19
   measurements,
23 20
   outcome,
... ...
@@ -35,9 +32,10 @@ crossValidate(measurements, outcome, ...)
35 32
   extraParams = NULL
36 33
 )
37 34
 
38
-\S4method{crossValidate}{MultiAssayExperiment}(
35
+\S4method{crossValidate}{MultiAssayExperimentOrList}(
39 36
   measurements,
40 37
   outcome,
38
+  clinicalPredictors = NULL,
41 39
   nFeatures = 20,
42 40
   selectionMethod = "auto",
43 41
   selectionOptimisation = "Resubstitution",
... ...
@@ -86,23 +84,6 @@ crossValidate(measurements, outcome, ...)
86 84
   extraParams = NULL
87 85
 )
88 86
 
89
-\S4method{crossValidate}{list}(
90
-  measurements,
91
-  outcome,
92
-  nFeatures = 20,
93
-  selectionMethod = "auto",
94
-  selectionOptimisation = "Resubstitution",
95
-  performanceType = "auto",
96
-  classifier = "auto",
97
-  multiViewMethod = "none",
98
-  assayCombinations = "all",
99
-  nFolds = 5,
100
-  nRepeats = 20,
101
-  nCores = 1,
102
-  characteristicsLabel = NULL,
103
-  extraParams = NULL
104
-)
105
-
106 87
 \method{train}{matrix}(x, outcomeTrain, ...)
107 88
 
108 89
 \method{train}{data.frame}(x, outcomeTrain, ...)
... ...
@@ -122,7 +103,7 @@ crossValidate(measurements, outcome, ...)
122 103
 
123 104
 \method{train}{list}(x, outcomeTrain, ...)
124 105
 
125
-\method{train}{MultiAssayExperiment}(x, outcome, ...)
106
+\method{train}{MultiAssayExperiment}(x, outcome, clinicalPredictors = NULL, ...)
126 107
 
127 108
 \method{predict}{trainedByClassifyR}(object, newData, ...)
128 109
 }
... ...
@@ -146,7 +127,8 @@ Set to NULL or "all" if all features should be used.}
146 127
 
147 128
 \item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
148 129
 and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
149
-and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.}
130
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no
131
+independent feature selection will be performed (but implicit selection may still occur within the classifier).}
150 132
 
151 133
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
152 134
 
... ...
@@ -172,7 +154,11 @@ with each element being a vector of assays to combine. Special value \code{"all"
172 154
 \item{extraParams}{A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or
173 155
 parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"},
174 156
 \code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on
175
-the website), specify the list element to be \code{NULL}.}
157
+the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}.}
158
+
159
+\item{clinicalPredictors}{Default: \code{NULL}. If \code{measurements} is a \code{MultiAssayExperiment},
160
+a character vector of features to use in modelling. This allows the avoidance of variables such as sample IDs,
161
+sample acquisition dates, etc., which are not relevant for outcome prediction.}
176 162
 
177 163
 \item{x}{Same as \code{measurements} but only training samples.}
178 164
 
179 165
new file mode 100644
... ...
@@ -0,0 +1,80 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/precisionPathways.R
3
+\name{precisionPathwaysTrain}
4
+\alias{precisionPathwaysTrain}
5
+\alias{precisionPathwaysTrain,MultiAssayExperimentOrList-method}
6
+\alias{precisionPathwaysPredict,MultiAssayExperimentOrList-method}
7
+\title{Precision Pathways for Sample Prediction Based on Prediction Confidence.}
8
+\usage{
9
+\S4method{precisionPathwaysTrain}{MultiAssayExperimentOrList}(
10
+  measurements,
11
+  class,
12
+  clinicalPredictors = NULL,
13
+  maxMissingProp = 0,
14
+  topNvariance = NULL,
15
+  fixedAssays = "clinical",
16
+  confidenceCutoff = 0.8,
17
+  minAssaySamples = 10,
18
+  nFeatures = 20,
19
+  selectionMethod = setNames(c("none", rep("t-test", length(measurements))),
20
+    c("clinical", names(measurements))),
21
+  classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))),
22
+    c("clinical", names(measurements))),
23
+  nFolds = 5,
24
+  nRepeats = 20,
25
+  nCores = 1
26
+)
27
+
28
+\S4method{precisionPathwaysPredict}{MultiAssayExperimentOrList}(pathways, measurements, class)
29
+}
30
+\arguments{
31
+\item{measurements}{Either a \code{\link{MultiAssayExperiment}} or a list of the basic tabular objects containing the data.}
32
+
33
+\item{class}{If \code{measurements} is a \code{MultiAssayExperiment}, the name of the class column in its \code{colData}. If \code{measurements} is a \code{list}, may also be
34
+a vector of classes.}
35
+
36
+\item{clinicalPredictors}{Default: \code{NULL}. Must be a character vector of clinical features to use in modelling. This allows the avoidance of variables such as sample IDs,
37
+sample acquisition dates, etc., which are not relevant for outcome prediction.}
38
+
39
+\item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum
40
+tolerated proportion of missingness for a feature to be retained for modelling.}
41
+
42
+\item{topNvariance}{Default: NULL. An integer number of most variable features per assay to subset to.
43
+Assays with fewer features won't be reduced in size.}
44
+
45
+\item{fixedAssays}{A character vector of assay names specifying any assays which must be at the
46
+beginning of the pathway.}
47
+
48
+\item{confidenceCutoff}{The minimum confidence of predictions for a sample to be predicted by a particular assay (tier).
49
+If a sample was predicted to belong to a particular class a proportion \eqn{p} of the time, then the confidence is \eqn{2 \times |p - 0.5|}.}
50
+
51
+\item{minAssaySamples}{An integer specifying the minimum number of samples a tier may have. If a subsequent tier
52
+would have less than this number of samples, the samples are incorporated into the current tier.}
53
+
54
+\item{nFeatures}{Default: 20. The number of features to consider during feature selection, if feature selection is done.}
55
+
56
+\item{selectionMethod}{A named character vector of feature selection methods to use for the assays, one for each. The names must correspond to names of \code{measurements}.}
57
+
58
+\item{classifier}{A named character vector of modelling methods to use for the assays, one for each. The names must correspond to names of \code{measurements}.}
59
+
60
+\item{nFolds}{A numeric specifying the number of folds to use for cross-validation.}
61
+
62
+\item{nRepeats}{A numeric specifying the number of repeats or permutations to use for cross-validation.}
63
+
64
+\item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.}
65
+
66
+\item{pathways}{A set of pathways created by \code{precisionPathwaysTrain} to be used for predicting on a new data set.}
67
+}
68
+\value{
69
+An object of class \code{PrecisionPathways} which is basically a named list that other plotting and
70
+tabulating functions can use.
71
+}
72
+\description{
73
+Precision pathways allow the evaluation of various permutations of multiomics or multiview data.
74
+Samples are predicted by a particular assay if they were consistently predicted as a particular class
75
+during cross-validation. Otherwise, they are passed on to subsequent assays/tiers for prediction. Balanced accuracy
76
+is used to evaluate overall prediction performance, and sample-specific accuracy is used for individual-level evaluation.
77
+}
78
+\examples{
79
+# To be determined.
80
+}
... ...
@@ -6,6 +6,7 @@
6 6
 \alias{prepareData,DataFrame-method}
7 7
 \alias{prepareData,MultiAssayExperiment-method}
8 8
 \alias{prepareData,data.frame-method}
9
+\alias{prepareData,list-method}
9 10
 \title{Convert Different Data Classes into DataFrame and Filter Features}
10 11
 \usage{
11 12
 \S4method{prepareData}{matrix}(measurements, outcome, ...)
... ...
@@ -15,12 +16,19 @@
15 16
 \S4method{prepareData}{DataFrame}(
16 17
   measurements,
17 18
   outcome,
18
-  useFeatures = "all",
19
+  clinicalPredictors = NULL,
19 20
   maxMissingProp = 0,
20 21
   topNvariance = NULL
21 22
 )
22 23
 
23
-\S4method{prepareData}{MultiAssayExperiment}(measurements, outcomeColumns = NULL, useFeatures = "all", ...)
24
+\S4method{prepareData}{MultiAssayExperiment}(
25
+  measurements,
26
+  outcomeColumns = NULL,
27
+  clinicalPredictors = NULL,
28
+  ...
29
+)
30
+
31
+\S4method{prepareData}{list}(measurements, outcome = NULL, clinicalPredictors = NULL, ...)
24 32
 }
25 33
 \arguments{
26 34
 \item{measurements}{Either a \code{\link{matrix}}, \code{\link{DataFrame}}
... ...
@@ -37,17 +45,15 @@ a character string, or vector of such strings, containing column name(s) of colu
37 45
 containing either classes or time and event information about survival. If column names
38 46
 of survival information, time must be in first column and event status in the second.}
39 47
 
40
-\item{useFeatures}{If \code{measurements} is a \code{MultiAssayExperiment},
41
-a two-column table of features to use. The first column must have assay names
42
-and the second column must have feature names found for that assay. \code{"clinical"} is
43
-also a valid assay name and refers to the clinical data table. \code{"all"} is a special
44
-keyword that means all features (passing any other filters) of that assay will be used 
45
-for modelling. Otherwise, a character vector of feature names to use suffices.}
48
+\item{clinicalPredictors}{If \code{measurements} is a \code{MultiAssayExperiment},
49
+a character vector of features to use in modelling. This allows the avoidance of variables such as sample IDs,
50
+sample acquisition dates, etc., which are not relevant for outcome prediction.}
46 51
 
47 52
 \item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum
48 53
 tolerated proportion of missingness for a feature to be retained for modelling.}
49 54
 
50
-\item{topNvariance}{Default: NULL. An integer number of most variable features to subset to.}
55
+\item{topNvariance}{Default: NULL. An integer number of most variable features per assay to subset to.
56
+Assays with fewer features won't be reduced in size.}
51 57
 
52 58
 \item{outcomeColumns}{If \code{measurements} is a \code{MultiAssayExperiment}, the
53 59
 names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}