Implemented precisionPathwaysTrain and Added Default Weights for GLMs
... | ... |
@@ -3,8 +3,8 @@ Type: Package |
3 | 3 |
Title: A framework for cross-validated classification problems, with |
4 | 4 |
applications to differential variability and differential |
5 | 5 |
distribution testing |
6 |
-Version: 3.3.13 |
|
7 |
-Date: 2023-02-13 |
|
6 |
+Version: 3.3.14 |
|
7 |
+Date: 2023-02-16 |
|
8 | 8 |
Authors@R: |
9 | 9 |
c( |
10 | 10 |
person(given = "Dario", family = "Strbenac", email = "dario.strbenac@sydney.edu.au", role = c("aut", "cre")), |
... | ... |
@@ -72,6 +72,7 @@ Collate: |
72 | 72 |
'interfaceXGB.R' |
73 | 73 |
'performancePlot.R' |
74 | 74 |
'plotFeatureClasses.R' |
75 |
+ 'precisionPathways.R' |
|
75 | 76 |
'prepareData.R' |
76 | 77 |
'previousSelection.R' |
77 | 78 |
'previousTrained.R' |
... | ... |
@@ -76,6 +76,8 @@ exportMethods(models) |
76 | 76 |
exportMethods(performance) |
77 | 77 |
exportMethods(performancePlot) |
78 | 78 |
exportMethods(plotFeatureClasses) |
79 |
+exportMethods(precisionPathwaysPredict) |
|
80 |
+exportMethods(precisionPathwaysTrain) |
|
79 | 81 |
exportMethods(predictions) |
80 | 82 |
exportMethods(prepareData) |
81 | 83 |
exportMethods(rankingPlot) |
... | ... |
@@ -35,6 +35,9 @@ setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL")) |
35 | 35 |
setClassUnion("tabular", c("data.frame", "DataFrame", "matrix")) |
36 | 36 |
setClassUnion("tabularOrList", c("tabular", "list")) |
37 | 37 |
|
38 |
+# List-like assay data |
|
39 |
+setClassUnion("MultiAssayExperimentOrList", c("MultiAssayExperiment", "list")) |
|
40 |
+ |
|
38 | 41 |
################################################################################ |
39 | 42 |
# |
40 | 43 |
# Params |
... | ... |
@@ -22,14 +22,18 @@ |
22 | 22 |
#' @param extraParams A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or |
23 | 23 |
#' parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"}, |
24 | 24 |
#' \code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on |
25 |
-#' the website), specify the list element to be \code{NULL}. |
|
25 |
+#' the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}. |
|
26 |
+#' @param clinicalPredictors Default: \code{NULL}. If \code{measurements} is a \code{MultiAssayExperiment}, |
|
27 |
+#' a character vector of features to use in modelling. This allows avoidance of things like sample IDs, |
|
28 |
+#' sample acquisition dates, etc. which are not relevant for outcome prediction. |
|
26 | 29 |
#' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons |
27 | 30 |
#' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, |
28 | 31 |
#' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. |
29 | 32 |
#' Set to NULL or "all" if all features should be used. |
30 | 33 |
#' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
31 | 34 |
#' and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking |
32 |
-#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. |
|
35 |
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no |
|
36 |
+#' indepedent feature selection will be performed (but implicit selection might still happen with the classifier). |
|
33 | 37 |
#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}. |
34 | 38 |
#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the |
35 | 39 |
#' options described in \code{\link{calcPerformance}} may otherwise be specified. |
... | ... |
@@ -88,12 +92,13 @@ |
88 | 92 |
#' # performancePlot(c(result, resultMerge)) |
89 | 93 |
#' |
90 | 94 |
#' @importFrom survival Surv |
95 |
+#' @usage NULL |
|
91 | 96 |
setGeneric("crossValidate", function(measurements, outcome, ...) |
92 | 97 |
standardGeneric("crossValidate")) |
93 | 98 |
|
94 | 99 |
#' @rdname crossValidate |
95 | 100 |
#' @export |
96 |
-setMethod("crossValidate", "DataFrame", |
|
101 |
+setMethod("crossValidate", "DataFrame", |
|
97 | 102 |
function(measurements, |
98 | 103 |
outcome, |
99 | 104 |
nFeatures = 20, |
... | ... |
@@ -110,12 +115,15 @@ setMethod("crossValidate", "DataFrame", |
110 | 115 |
|
111 | 116 |
{ |
112 | 117 |
# Check that data is in the right format, if not already done for MultiAssayExperiment input. |
113 |
- prepParams <- list(measurements, outcome) |
|
114 |
- if("prepare" %in% names(extraParams)) |
|
115 |
- prepParams <- c(prepParams, extraParams[["prepare"]]) |
|
116 |
- measurementsAndOutcome <- do.call(prepareData, prepParams) |
|
117 |
- measurements <- measurementsAndOutcome[["measurements"]] |
|
118 |
- outcome <- measurementsAndOutcome[["outcome"]] |
|
118 |
+ if(!"assay" %in% colnames(S4Vectors::mcols(measurements))) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. |
|
119 |
+ { |
|
120 |
+ prepParams <- list(measurements, outcome) |
|
121 |
+ if("prepare" %in% names(extraParams)) |
|
122 |
+ prepParams <- c(prepParams, extraParams[["prepare"]]) |
|
123 |
+ measurementsAndOutcome <- do.call(prepareData, prepParams) |
|
124 |
+ measurements <- measurementsAndOutcome[["measurements"]] |
|
125 |
+ outcome <- measurementsAndOutcome[["outcome"]] |
|
126 |
+ } |
|
119 | 127 |
|
120 | 128 |
# Ensure performance type is one of the ones that can be calculated by the package. |
121 | 129 |
if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]])) |
... | ... |
@@ -305,9 +313,10 @@ setMethod("crossValidate", "DataFrame", |
305 | 313 |
#' @rdname crossValidate |
306 | 314 |
#' @export |
307 | 315 |
# One or more omics data sets, possibly with clinical data. |
308 |
-setMethod("crossValidate", "MultiAssayExperiment", |
|
316 |
+setMethod("crossValidate", "MultiAssayExperimentOrList", |
|
309 | 317 |
function(measurements, |
310 |
- outcome, |
|
318 |
+ outcome, |
|
319 |
+ clinicalPredictors = NULL, |
|
311 | 320 |
nFeatures = 20, |
312 | 321 |
selectionMethod = "auto", |
313 | 322 |
selectionOptimisation = "Resubstitution", |
... | ... |
@@ -321,7 +330,7 @@ setMethod("crossValidate", "MultiAssayExperiment", |
321 | 330 |
characteristicsLabel = NULL, extraParams = NULL) |
322 | 331 |
{ |
323 | 332 |
# Check that data is in the right format, if not already done for MultiAssayExperiment input. |
324 |
- prepParams <- list(measurements, outcome) |
|
333 |
+ prepParams <- list(measurements, outcome, clinicalPredictors) |
|
325 | 334 |
if("prepare" %in% names(extraParams)) |
326 | 335 |
prepParams <- c(prepParams, extraParams[["prepare"]]) |
327 | 336 |
measurementsAndOutcome <- do.call(prepareData, prepParams) |
... | ... |
@@ -408,76 +417,6 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements. |
408 | 417 |
characteristicsLabel = characteristicsLabel, extraParams = extraParams) |
409 | 418 |
}) |
410 | 419 |
|
411 |
-# This expects that each table is about the same set of samples and thus |
|
412 |
-# has the same number of rows as every other table. |
|
413 |
-#' @rdname crossValidate |
|
414 |
-#' @export |
|
415 |
-setMethod("crossValidate", "list", |
|
416 |
- function(measurements, |
|
417 |
- outcome, |
|
418 |
- nFeatures = 20, |
|
419 |
- selectionMethod = "auto", |
|
420 |
- selectionOptimisation = "Resubstitution", |
|
421 |
- performanceType = "auto", |
|
422 |
- classifier = "auto", |
|
423 |
- multiViewMethod = "none", |
|
424 |
- assayCombinations = "all", |
|
425 |
- nFolds = 5, |
|
426 |
- nRepeats = 20, |
|
427 |
- nCores = 1, |
|
428 |
- characteristicsLabel = NULL, extraParams = NULL) |
|
429 |
- { |
|
430 |
- # Check data type is valid |
|
431 |
- if (!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix")))) { |
|
432 |
- stop("assays must be of type data.frame, DataFrame or matrix") |
|
433 |
- } |
|
434 |
- |
|
435 |
- # Check the list is named |
|
436 |
- if (is.null(names(measurements))) { |
|
437 |
- stop("Measurements must be a named list.") |
|
438 |
- } |
|
439 |
- |
|
440 |
- # Check same number of samples for all datasets |
|
441 |
- if (!length(unique(sapply(measurements, nrow))) == 1) { |
|
442 |
- stop("All datasets must have the same samples.") |
|
443 |
- } |
|
444 |
- |
|
445 |
- # Check the number of outcome is the same |
|
446 |
- if (!all(sapply(measurements, nrow) == length(outcome)) && !is.character(outcome)) { |
|
447 |
- stop("outcome must have same number of samples as measurements.") |
|
448 |
- } |
|
449 |
- |
|
450 |
- df_list <- sapply(measurements, S4Vectors::DataFrame, check.names = FALSE) |
|
451 |
- |
|
452 |
- df_list <- mapply(function(meas, nam){ |
|
453 |
- S4Vectors::mcols(meas)$assay <- nam |
|
454 |
- S4Vectors::mcols(meas)$feature <- colnames(meas) |
|
455 |
- meas |
|
456 |
- }, df_list, names(df_list)) |
|
457 |
- |
|
458 |
- |
|
459 |
- combined_df <- do.call("cbind", df_list) |
|
460 |
- colnames(combined_df) <- S4Vectors::mcols(combined_df)$feature |
|
461 |
- |
|
462 |
- |
|
463 |
- |
|
464 |
- crossValidate(measurements = combined_df, |
|
465 |
- outcome = outcome, |
|
466 |
- nFeatures = nFeatures, |
|
467 |
- selectionMethod = selectionMethod, |
|
468 |
- selectionOptimisation = selectionOptimisation, |
|
469 |
- performanceType = performanceType, |
|
470 |
- classifier = classifier, |
|
471 |
- multiViewMethod = multiViewMethod, |
|
472 |
- assayCombinations = assayCombinations, |
|
473 |
- nFolds = nFolds, |
|
474 |
- nRepeats = nRepeats, |
|
475 |
- nCores = nCores, |
|
476 |
- characteristicsLabel = characteristicsLabel, extraParams = extraParams) |
|
477 |
- }) |
|
478 |
- |
|
479 |
- |
|
480 |
- |
|
481 | 420 |
###################################### |
482 | 421 |
###################################### |
483 | 422 |
cleanNFeatures <- function(nFeatures, measurements){ |
... | ... |
@@ -656,7 +595,6 @@ generateModellingParams <- function(assayIDs, |
656 | 595 |
} |
657 | 596 |
|
658 | 597 |
selectionMethod <- unlist(selectionMethod) |
659 |
- if(is.null(selectionMethod)) selectionMethod <- "none" |
|
660 | 598 |
|
661 | 599 |
if(selectionMethod != "none") |
662 | 600 |
{ |
... | ... |
@@ -1126,9 +1064,9 @@ train.list <- function(x, outcomeTrain, ...) |
1126 | 1064 |
#' @rdname crossValidate |
1127 | 1065 |
#' @method train MultiAssayExperiment |
1128 | 1066 |
#' @export |
1129 |
-train.MultiAssayExperiment <- function(x, outcome, ...) |
|
1067 |
+train.MultiAssayExperiment <- function(x, outcome, clinicalPredictors = NULL, ...) |
|
1130 | 1068 |
{ |
1131 |
- prepArgs <- list(x, outcome) |
|
1069 |
+ prepArgs <- list(x, outcome, clinicalPredictors) |
|
1132 | 1070 |
extraInputs <- list(...) |
1133 | 1071 |
prepExtras <- trainExtras <- numeric() |
1134 | 1072 |
if(length(extraInputs) > 0) |
... | ... |
@@ -1167,7 +1105,7 @@ predict.trainedByClassifyR <- function(object, newData, ...) |
1167 | 1105 |
newData <- do.call(cbind, newData) |
1168 | 1106 |
} else if(is(newData, "MultiAssayExperiment")) |
1169 | 1107 |
{ |
1170 |
- newData <- prepareData(newData, useFeatures = allFeatureNames(object)) |
|
1108 |
+ newData <- prepareData(newData, clinicalPredictors = subset(allFeatureNames(object), assay == "clinical")[, "feature"]) |
|
1171 | 1109 |
# Some classifiers dangerously use positional matching rather than column name matching. |
1172 | 1110 |
# newData columns are sorted so that the right column ordering is guaranteed. |
1173 | 1111 |
} |
... | ... |
@@ -6,20 +6,26 @@ elasticNetGLMtrainInterface <- function(measurementsTrain, classesTrain, lambda |
6 | 6 |
stop("The package 'glmnet' could not be found. Please install it.") |
7 | 7 |
if(verbose == 3) |
8 | 8 |
message("Fitting elastic net regularised GLM classifier to data.") |
9 |
- |
|
10 |
- measurementsMatrix <- glmnet::makeX(as(measurementsTrain, "data.frame")) |
|
11 |
- |
|
12 |
- fitted <- glmnet::glmnet(measurementsMatrix, classesTrain, family = "multinomial", ...) |
|
13 | 9 |
|
10 |
+ # One-hot encoding needed. |
|
11 |
+ measurementsTrain <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain) |
|
12 |
+ fitted <- glmnet::glmnet(measurementsTrain, classesTrain, family = "multinomial", weights = as.numeric(1 / (table(classesTrain)[classesTrain] / length(classesTrain))), ...) |
|
13 |
+ # Inverse class size weighting needed to give decent predictions when class imbalance. |
|
14 |
+ |
|
14 | 15 |
if(is.null(lambda)) # fitted has numerous models for automatically chosen lambda values. |
15 |
- { # Pick one lambda based on resubstitution performance. |
|
16 |
- bestLambda <- fitted[["lambda"]][which.min(sapply(fitted[["lambda"]], function(lambda) # Largest Lambda with minimum balanced error rate. |
|
16 |
+ { # Pick one lambda based on resubstitution performance. But not the one that makes all variables excluded from model. |
|
17 |
+ lambdaConsider <- colSums(as.matrix(fitted[["beta"]][[1]])) != 0 |
|
18 |
+ bestLambda <- fitted[["lambda"]][lambdaConsider][which.min(sapply(fitted[["lambda"]][lambdaConsider], function(lambda) # Largest Lambda with minimum balanced error rate. |
|
17 | 19 |
{ |
18 |
- classPredictions <- factor(as.character(predict(fitted, measurementsMatrix, s = lambda, type = "class")), levels = fitted[["classnames"]]) |
|
20 |
+ classPredictions <- factor(as.character(predict(fitted, measurementsTrain, s = lambda, type = "class")), levels = fitted[["classnames"]]) |
|
19 | 21 |
calcExternalPerformance(classesTrain, classPredictions, "Balanced Error") |
20 | 22 |
}))[1]] |
21 | 23 |
attr(fitted, "tune") <- list(lambda = bestLambda) |
22 | 24 |
} |
25 |
+ |
|
26 |
+ attr(fitted, "featureNames") <- colnames(measurementsTrain) |
|
27 |
+ attr(fitted, "featureGroups") <- measurementsTrain@assign |
|
28 |
+ |
|
23 | 29 |
fitted |
24 | 30 |
} |
25 | 31 |
attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface" |
... | ... |
@@ -28,6 +34,8 @@ attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface" |
28 | 34 |
elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., returnType = c("both", "class", "score"), verbose = 3) |
29 | 35 |
{ # ... just consumes emitted tuning variables from .doTrain which are unused. |
30 | 36 |
returnType <- match.arg(returnType) |
37 |
+ # One-hot encoding needed. |
|
38 |
+ measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest) |
|
31 | 39 |
|
32 | 40 |
# Ensure that testing data has same columns names in same order as training data. |
33 | 41 |
# Remove those annoying backquotes which glmnet adds if variables have spaces in names. |
... | ... |
@@ -41,11 +49,11 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., |
41 | 49 |
if(missing(lambda)) # Tuning parameters are not passed to prediction functions. |
42 | 50 |
lambda <- attr(model, "tune")[["lambda"]] # Sneak it in as an attribute on the model. |
43 | 51 |
|
44 |
- testMatrix <- glmnet::makeX(as(measurementsTest, "data.frame")) |
|
45 |
- testMatrix <- testMatrix[, rownames(model[["beta"]][[1]])] |
|
46 | 52 |
|
47 |
- classPredictions <- factor(as.character(predict(model, testMatrix, s = lambda, type = "class")), levels = model[["classnames"]]) |
|
48 |
- classScores <- predict(model, testMatrix, s = lambda, type = "response")[, , 1] |
|
53 |
+ measurementsTest <- measurementsTest[, rownames(model[["beta"]][[1]])] |
|
54 |
+ |
|
55 |
+ classPredictions <- factor(as.character(predict(model, measurementsTest, s = lambda, type = "class")), levels = model[["classnames"]]) |
|
56 |
+ classScores <- predict(model, measurementsTest, s = lambda, type = "response")[, , 1] |
|
49 | 57 |
|
50 | 58 |
if(is.matrix(classScores)) |
51 | 59 |
classScores <- classScores[, model[["classnames"]]] |
... | ... |
@@ -61,16 +69,20 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., |
61 | 69 |
# |
62 | 70 |
# Get selected features (i.e. non-zero model coefficients) |
63 | 71 |
# |
72 |
+# Note: Need to convert back to actual features when factors were expanded into |
|
73 |
+# multiple columns using indicator variables. |
|
74 |
+# |
|
64 | 75 |
################################################################################ |
65 | 76 |
|
66 | 77 |
elasticNetFeatures <- function(model) |
67 | 78 |
{ |
68 |
- inputFeatures <- rownames(model[["beta"]][[1]]) |
|
69 | 79 |
# Floating point numbers test for equality. |
70 | 80 |
whichCoefficientColumn <- which(abs(model[["lambda"]] - attr(model, "tune")[["lambda"]]) < 0.00001)[1] |
71 | 81 |
coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn]) |
72 | 82 |
featureScores <- rowSums(abs(coefficientsUsed)) |
73 |
- rankedFeaturesIndices <- order(featureScores, decreasing = TRUE) |
|
74 |
- selectedFeaturesIndices <- which(featureScores != 0) |
|
83 |
+ featureGroups <- attr(model, "featureGroups")[match(names(featureScores), attr(model, "featureNames"))] |
|
84 |
+ groupScores <- unname(by(featureScores, featureGroups, max)) |
|
85 |
+ rankedFeaturesIndices <- order(groupScores, decreasing = TRUE) |
|
86 |
+ selectedFeaturesIndices <- which(groupScores != 0) |
|
75 | 87 |
list(rankedFeaturesIndices, selectedFeaturesIndices) |
76 | 88 |
} |
77 | 89 |
\ No newline at end of file |
... | ... |
@@ -9,7 +9,7 @@ GLMtrainInterface <- function(measurementsTrain, classesTrain, ..., verbose = 3) |
9 | 9 |
fitData <- cbind(measurementsTrain, class = classesTrain) |
10 | 10 |
classesTrain <- "class" # Column name for glm fit. |
11 | 11 |
} else {fitData <- measurementsTrain} |
12 |
- glm(class ~ . + 0, family = binomial, data = fitData, ...) |
|
12 |
+ glm(class ~ . + 0, family = binomial, data = fitData, weights = as.numeric(1 / (table(classesTrain)[classesTrain] / length(classesTrain))), ...) |
|
13 | 13 |
} |
14 | 14 |
attr(GLMtrainInterface, "name") <- "GLMtrainInterface" |
15 | 15 |
|
16 | 16 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,180 @@ |
1 |
+#' Precision Pathways for Sample Prediction Based on Prediction Confidence. |
|
2 |
+#' |
|
3 |
+#' Precision pathways allows the evaluation of various permutations of multiomics or multiview data. |
|
4 |
+#' Samples are predicted by a particular assay if they were consistently predicted as a particular class |
|
5 |
+#' during cross-validation. Otherwise, they are passed onto subsequent assays/tiers for prediction. Balanced accuracy |
|
6 |
+#' is used to evaluate overall prediction performance and sample-specific accuracy for individual-level evaluation. |
|
7 |
+#' |
|
8 |
+#' @param measurements Either a \code{\link{MultiAssayExperiment}} or a list of the basic tabular objects containing the data. |
|
9 |
+#' @param class Same as \code{measurements} but only training samples. IF \code{measurements} is a \code{list}, may also be |
|
10 |
+#' a vector of classes. |
|
11 |
+#' @param clinicalPredictors Default: \code{NULL}. Must be a character vector of clinical features to use in modelling. This allows avoidance of things like sample IDs, |
|
12 |
+#' sample acquisition dates, etc. which are not relevant for outcome prediction. |
|
13 |
+#' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum |
|
14 |
+#' tolerated proportion of missingness for a feature to be retained for modelling. |
|
15 |
+#' @param topNvariance Default: NULL. An integer number of most variable features per assay to subset to. |
|
16 |
+#' Assays with less features won't be reduced in size. |
|
17 |
+#' @param fixedAssays A character vector of assay names specifying any assays which must be at the |
|
18 |
+#' beginning of the pathway. |
|
19 |
+#' @param confidenceCutoff The minimum confidence of predictions for a sample to be predicted by a particular issue |
|
20 |
+#' . If a sample was predicted to belong to a particular class a proportion \eqn{p} times, then the confidence is \eqn{2 \times |p - 0.5|}. |
|
21 |
+#' @param minAssaySamples An integer specifying the minimum number of samples a tier may have. If a subsequent tier |
|
22 |
+#' would have less than this number of samples, the samples are incorporated into the current tier. |
|
23 |
+#' @param nFeatures Default: 20. The number of features to consider during feature selection, if feature selection is done. |
|
24 |
+#' @param selectionMethod A named character vector of feature selection methods to use for the assays, one for each. The names must correspond to names of \code{measurements}. |
|
25 |
+#' @param classifier A named character vector of modelling methods to use for the assays, one for each. The names must correspond to names of \code{measurements}. |
|
26 |
+#' @param nFolds A numeric specifying the number of folds to use for cross-validation. |
|
27 |
+#' @param nRepeats A numeric specifying the the number of repeats or permutations to use for cross-validation. |
|
28 |
+#' @param nCores A numeric specifying the number of cores used if the user wants to use parallelisation. |
|
29 |
+#' @param pathways A set of pathways created by \code{precisionPathwaysTrain} to be used for predicting on a new data set. |
|
30 |
+#' @rdname precisionPathways |
|
31 |
+#' @return An object of class \code{PrecisionPathways} which is basically a named list that other plotting and |
|
32 |
+#' tabulating functions can use. |
|
33 |
+#' @examples |
|
34 |
+#' # To be determined. |
|
35 |
+ |
|
36 |
+#' @usage NULL |
|
37 |
+setGeneric("precisionPathwaysTrain", function(measurements, class, ...) |
|
38 |
+ standardGeneric("precisionPathwaysTrain")) |
|
39 |
+ |
|
40 |
+#' @rdname precisionPathways |
|
41 |
+#' @export |
|
42 |
+setMethod("precisionPathwaysTrain", "MultiAssayExperimentOrList", |
|
43 |
+ function(measurements, class, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL, |
|
44 |
+ fixedAssays = "clinical", confidenceCutoff = 0.8, minAssaySamples = 10, |
|
45 |
+ nFeatures = 20, selectionMethod = setNames(c("none", rep("t-test", length(measurements))), c("clinical", names(measurements))), |
|
46 |
+ classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |
|
47 |
+ nFolds = 5, nRepeats = 20, nCores = 1) |
|
48 |
+ { |
|
49 |
+ if(is.list(measurements)) # Ensure plain list has clinical data. |
|
50 |
+ { |
|
51 |
+ # One of the tables must be named "clinical". |
|
52 |
+ if (!any(names(measurements) == "clinical")) |
|
53 |
+ stop("One of the tables must be named \"clinical\".") |
|
54 |
+ } |
|
55 |
+ prepArgs <- list(measurements, outcomeColumns = class, clinicalPredictors = clinicalPredictors, |
|
56 |
+ maxMissingProp = maxMissingProp, topNvariance = topNvariance) |
|
57 |
+ measurementsAndClass <- do.call(prepareData, prepArgs) |
|
58 |
+ |
|
59 |
+ .precisionPathwaysTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]], |
|
60 |
+ fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff, |
|
61 |
+ minAssaySamples = minAssaySamples, nFeatures = nFeatures, |
|
62 |
+ selectionMethod = selectionMethod, classifier = classifier, |
|
63 |
+ nFolds = nFolds, nRepeats = nRepeats, nCores = nCores) |
|
64 |
+ }) |
|
65 |
+ |
|
66 |
+# Internal method which carries out all of the processing, obtaining reformatted data from the |
|
67 |
+# MultiAssayExperiment and list (of basic rectangular tables) S4 methods. |
|
68 |
+.precisionPathwaysTrain <- function(measurements, class, fixedAssays = "clinical", |
|
69 |
+ confidenceCutoff = 0.8, minAssaySamples = 10, |
|
70 |
+ nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))), |
|
71 |
+ classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |
|
72 |
+ nFolds = 5, nRepeats = 20, nCores = 1) |
|
73 |
+ { |
|
74 |
+ # Step 1: Determine all valid permutations of assays, taking into account the |
|
75 |
+ # assays to be used and which assays, if any, must be included. |
|
76 |
+ assayIDs <- unique(S4Vectors::mcols(measurements)[["assay"]]) |
|
77 |
+ assaysPermutations <- .permutations(assayIDs, fixed = data.frame(seq_along(fixedAssays), fixedAssays)) |
|
78 |
+ permutationIDs <- apply(assaysPermutations, 2, function(permutation) paste(permutation, collapse = '-')) |
|
79 |
+ |
|
80 |
+ # Step 2: Build a classifier for each assay using all of the samples. |
|
81 |
+ modelsList <- crossValidate(measurements, class, nFeatures, selectionMethod, |
|
82 |
+ classifier = classifier, nFolds = nFolds, |
|
83 |
+ nRepeats = nRepeats, nCores = nCores) |
|
84 |
+ modelsList <- lapply(modelsList, calcCVperformance, "Sample Accuracy") # Add sample accuracy, which can be subset later. |
|
85 |
+ |
|
86 |
+ # Step 3: Loop over each pathway and each assay in order to determine which samples are used at that level |
|
87 |
+ # and which are passed onwards. |
|
88 |
+ precisionPathways <- lapply(as.data.frame(assaysPermutations), function(permutation) |
|
89 |
+ { |
|
90 |
+ assaysProcessed <- character() |
|
91 |
+ samplesUsed <- character() |
|
92 |
+ individualsTableAll <- S4Vectors::DataFrame() |
|
93 |
+ tierTableAll <- S4Vectors::DataFrame() |
|
94 |
+ breakEarly = FALSE |
|
95 |
+ for(assay in permutation) |
|
96 |
+ { |
|
97 |
+ # Step 3a: Identify all samples which are consistently predicted. |
|
98 |
+ modelIndex <- match(assay, assayIDs) |
|
99 |
+ allPredictions <- predictions(modelsList[[modelIndex]]) |
|
100 |
+ allSampleIDs <- sampleNames(modelsList[[modelIndex]]) |
|
101 |
+ predictionsSamplesCounts <- table(allPredictions[, "sample"], allPredictions[, "class"]) |
|
102 |
+ confidences <- 2 * abs(predictionsSamplesCounts[, 1] / rowSums(predictionsSamplesCounts) - 0.5) |
|
103 |
+ sampleIDsUse <- names(confidences)[confidences > confidenceCutoff] |
|
104 |
+ |
|
105 |
+ # Check if too few samples left for next round. Include them in this round, if so. |
|
106 |
+ remainingIDs <- setdiff(allSampleIDs, c(samplesUsed, sampleIDsUse)) |
|
107 |
+ if(length(remainingIDs) < minAssaySamples) |
|
108 |
+ { |
|
109 |
+ sampleIDsUse <- c(sampleIDsUse, remainingIDs) |
|
110 |
+ breakEarly = TRUE |
|
111 |
+ } |
|
112 |
+ |
|
113 |
+ predictionsSamplesCounts <- predictionsSamplesCounts[sampleIDsUse, ] |
|
114 |
+ |
|
115 |
+ # Step 3b: Individuals predictions and sample-wise accuracy, tier-wise error. |
|
116 |
+ maxVotes <- apply(predictionsSamplesCounts, 1, function(sample) which.max(sample)) |
|
117 |
+ predictedClasses <- factor(colnames(predictionsSamplesCounts)[maxVotes], |
|
118 |
+ levels = colnames(predictionsSamplesCounts)) |
|
119 |
+ individualsTable <- S4Vectors::DataFrame(Tier = assay, |
|
120 |
+ `Sample ID` = sampleIDsUse, |
|
121 |
+ `Predicted` = predictedClasses, |
|
122 |
+ `Accuracy` = performance(modelsList[[modelIndex]])[["Sample Accuracy"]][sampleIDsUse], |
|
123 |
+ check.names = FALSE) |
|
124 |
+ knownClasses <- actualOutcome(modelsList[[modelIndex]])[match(sampleIDsUse, allSampleIDs)] |
|
125 |
+ balancedAccuracy <- calcExternalPerformance(knownClasses, predictedClasses) |
|
126 |
+ tierTable <- S4Vectors::DataFrame(Tier = assay, |
|
127 |
+ `Balanced Accuracy` = balancedAccuracy, check.names = FALSE) |
|
128 |
+ |
|
129 |
+ assaysProcessed <- c(assaysProcessed, assay) |
|
130 |
+ individualsTableAll <- rbind(individualsTableAll, individualsTable) |
|
131 |
+ tierTableAll <- rbind(tierTableAll, tierTable) |
|
132 |
+ samplesUsed <- c(samplesUsed, sampleIDsUse) |
|
133 |
+ |
|
134 |
+ if(breakEarly == TRUE) break |
|
135 |
+ } |
|
136 |
+ pathwayString <- paste(assaysProcessed, collapse = '-') |
|
137 |
+ parameters = list(confidenceCutoff = confidenceCutoff, minAssaySamples = minAssaySamples) |
|
138 |
+ list(models = modelsList, parameters = parameters, pathway = pathwayString, |
|
139 |
+ individuals = individualsTableAll, tiers = tierTableAll) |
|
140 |
+ }) |
|
141 |
+ |
|
142 |
+ class(precisionPathways) <- "PrecisionPathways" |
|
143 |
+ names(precisionPathways) <- sapply(precisionPathways, "[[", "pathway") |
|
144 |
+ precisionPathways |
|
145 |
+} |
|
146 |
+ |
|
147 |
+# A nice print method to avoid flooding the screen with lots of tables |
|
148 |
+# when result is shown in console. |
|
149 |
+print.PrecisionPathways <- function(x) |
|
150 |
+{ |
|
151 |
+ cat("An object of class 'PrecisionPathways'.\n") |
|
152 |
+ cat("Pathways:\n") |
|
153 |
+ cat(paste(names(x), collapse = '\n')) |
|
154 |
+} |
|
155 |
+ |
|
156 |
+#' @usage NULL |
|
157 |
+setGeneric("precisionPathwaysPredict", function(pathways, measurements, class, ...) |
|
158 |
+ standardGeneric("precisionPathwaysPredict")) |
|
159 |
+ |
|
160 |
+#' @rdname precisionPathways |
|
161 |
+#' @export |
|
162 |
+setMethod("precisionPathwaysPredict", "MultiAssayExperimentOrList", |
|
163 |
+ function(pathways, measurements, class) |
|
164 |
+ { |
|
165 |
+ if(is.list(measurements)) # Ensure plain list has clinical data. |
|
166 |
+ { |
|
167 |
+ # One of the tables must be named "clinical". |
|
168 |
+ if (!any(names(measurements) == "clinical")) |
|
169 |
+ stop("One of the tables must be named \"clinical\".") |
|
170 |
+ } |
|
171 |
+ prepArgs <- list(measurements, outcomeColumns = class) |
|
172 |
+ measurementsAndClass <- do.call(prepareData, prepArgs) |
|
173 |
+ |
|
174 |
+ .precisionPathwaysPredict(pathways, measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]]) |
|
175 |
+ }) |
|
176 |
+ |
|
177 |
+.precisionPathwaysPredict <- function(pathways, measurements, class) |
|
178 |
+{ |
|
179 |
+ # To do. |
|
180 |
+} |
|
0 | 181 |
\ No newline at end of file |
... | ... |
@@ -18,15 +18,13 @@ |
18 | 18 |
#' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the |
19 | 19 |
#' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)} |
20 | 20 |
#' that contain(s) the each individual's outcome to use for prediction. |
21 |
-#' @param useFeatures If \code{measurements} is a \code{MultiAssayExperiment}, |
|
22 |
-#' a two-column table of features to use. The first column must have assay names |
|
23 |
-#' and the second column must have feature names found for that assay. \code{"clinical"} is |
|
24 |
-#' also a valid assay name and refers to the clinical data table. \code{"all"} is a special |
|
25 |
-#' keyword that means all features (passing any other filters) of that assay will be used |
|
26 |
-#' for modelling. Otherwise, a character vector of feature names to use suffices. |
|
21 |
+#' @param clinicalPredictors If \code{measurements} is a \code{MultiAssayExperiment}, |
|
22 |
+#' a character vector of features to use in modelling. This allows avoidance of things like sample IDs, |
|
23 |
+#' sample acquisition dates, etc. which are not relevant for outcome prediction. |
|
27 | 24 |
#' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum |
28 | 25 |
#' tolerated proportion of missingness for a feature to be retained for modelling. |
29 |
-#' @param topNvariance Default: NULL. An integer number of most variable features to subset to. |
|
26 |
+#' @param topNvariance Default: NULL. An integer number of most variable features per assay to subset to. |
|
27 |
+#' Assays with less features won't be reduced in size. |
|
30 | 28 |
#' @param ... Variables not used by the \code{matrix} nor the |
31 | 29 |
#' \code{MultiAssayExperiment} method which are passed into and used by the |
32 | 30 |
#' \code{DataFrame} method. |
... | ... |
@@ -58,18 +56,16 @@ setMethod("prepareData", "data.frame", |
58 | 56 |
#' @rdname prepareData |
59 | 57 |
#' @export |
60 | 58 |
setMethod("prepareData", "DataFrame", |
61 |
- function(measurements, outcome, useFeatures = "all", maxMissingProp = 0.0, topNvariance = NULL) |
|
59 |
+ function(measurements, outcome, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL) |
|
62 | 60 |
{ |
63 | 61 |
if(is.null(rownames(measurements))) |
64 | 62 |
{ |
65 | 63 |
warning("'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.") |
66 | 64 |
rownames(measurements) <- paste("Sample", seq_len(nrow(measurements))) |
67 |
- } |
|
68 |
- |
|
69 |
- if(useFeatures != "all") # Subset to only the desired ones. |
|
70 |
- measurements <- measurements[, useFeatures] |
|
71 |
- |
|
65 |
+ } |
|
66 |
+ |
|
72 | 67 |
# Won't ever be true if input data was MultiAssayExperiment because wideFormat already produces valid names. |
68 |
+ # Need to check if input data was DataFrame because names might not be valid from user. |
|
73 | 69 |
if(!all(colnames(measurements) == make.names(colnames(measurements)))) |
74 | 70 |
{ |
75 | 71 |
warning("Unsafe feature names in input data. Converted into safe names.") |
... | ... |
@@ -129,6 +125,19 @@ setMethod("prepareData", "DataFrame", |
129 | 125 |
else # Three columns. Therefore, counting process data. |
130 | 126 |
outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3]) |
131 | 127 |
} |
128 |
+ |
|
129 |
+ if(!is.null(clinicalPredictors)) |
|
130 |
+ { |
|
131 |
+ if(!is.null(mcols(measurements)$assay)) |
|
132 |
+ { |
|
133 |
+ clinicalIndices <- which(mcols(measurements)$assay == "clinical") |
|
134 |
+ usePredictors <- intersect(clinicalIndices, which(mcols(measurements)$feature %in% clinicalPredictors)) |
|
135 |
+ dropIndices <- setdiff(clinicalIndices, usePredictors) |
|
136 |
+ if(length(dropIndices) > 0) measurements <- measurements[, -dropIndices] |
|
137 |
+ } else { # The DataFrame is entirely clinical data. |
|
138 |
+ measurements <- measurements[, clinicalPredictors] |
|
139 |
+ } |
|
140 |
+ } |
|
132 | 141 |
|
133 | 142 |
# Remove samples with indeterminate outcome. |
134 | 143 |
dropSamples <- which(is.na(outcome) | is.null(outcome)) |
... | ... |
@@ -146,11 +155,18 @@ setMethod("prepareData", "DataFrame", |
146 | 155 |
if(length(dropFeatures) > 0) |
147 | 156 |
measurements <- measurements[, -dropFeatures] |
148 | 157 |
|
149 |
- # Use only the most variable features. |
|
158 |
+ # Use only the most N variable features per assay. |
|
150 | 159 |
if(!is.null(topNvariance)) |
151 | 160 |
{ |
152 |
- mostVariance <- order(apply(measurements, 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance] |
|
153 |
- measurements <- measurements[, mostVariance] |
|
161 |
+ if(is.null(mcols(measurements)$assay)) assays <- rep(1, ncol(measurements)) else assays <- mcols(measurements)$assay |
|
162 |
+ do.call(cbind, lapply(unqiue(assays), function(assay) |
|
163 |
+ { |
|
164 |
+ assayColumns <- which(assays == assay) |
|
165 |
+ if(length(assayColumns) < topNvariance) |
|
166 |
+ measurements[, assayColumns] |
|
167 |
+ else |
|
168 |
+ measurements[, assayColumns][order(apply(measurements[, assayColumns], 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance]] |
|
169 |
+ })) |
|
154 | 170 |
} |
155 | 171 |
|
156 | 172 |
list(measurements = measurements, outcome = outcome) |
... | ... |
@@ -159,71 +175,75 @@ setMethod("prepareData", "DataFrame", |
159 | 175 |
#' @rdname prepareData |
160 | 176 |
#' @export |
161 | 177 |
setMethod("prepareData", "MultiAssayExperiment", |
162 |
- function(measurements, outcomeColumns = NULL, useFeatures = "all", ...) |
|
178 |
+ function(measurements, outcomeColumns = NULL, clinicalPredictors = NULL, ...) |
|
163 | 179 |
{ |
164 |
- if(is.character(useFeatures)) useFeatures <- data.frame(assay = names(measurements), feature = "all") |
|
165 |
- omicsTargets <- setdiff(useFeatures[, "assay"], "clinical") |
|
166 |
- if(length(omicsTargets) > 0) |
|
167 |
- { |
|
168 |
- if(any(anyReplicated(measurements[, , omicsTargets]))) |
|
169 |
- stop("Data set contains replicates. Please remove or average replicate observations and try again.") |
|
170 |
- } |
|
171 |
- |
|
172 |
- if(!is.null(outcomeColumns) && !all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
|
180 |
+ if(is.null(clinicalPredictors)) |
|
181 |
+ stop("'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.") |
|
182 |
+ |
|
183 |
+ if(any(anyReplicated(measurements))) |
|
184 |
+ stop("Data set contains replicates. Please remove or average replicate observations and try again.") |
|
185 |
+ |
|
186 |
+ if(is.null(outcomeColumns)) |
|
187 |
+ stop("'outcomeColumns' is a mandatory parameter but was not specified.") |
|
188 |
+ |
|
189 |
+ if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
|
173 | 190 |
stop("Not all column names specified by 'outcomeColumns' found in clinical table.") |
174 |
- if(!all(useFeatures[, "assay"] %in% c(names(measurements), "clinical"))) |
|
175 |
- stop("Some assay names in first column of 'useFeatures' are not assay names in 'measurements' or \"clinical\".") |
|
176 | 191 |
|
177 |
- clinicalColumnsDataset <- colnames(MultiAssayExperiment::colData(measurements)) |
|
178 |
- if("clinical" %in% useFeatures[, "assay"]) |
|
179 |
- { |
|
180 |
- clinicalRows <- useFeatures[, "assay"] == "clinical" |
|
181 |
- clinicalColumns <- useFeatures[clinicalRows, "feature"] |
|
182 |
- if(length(clinicalColumns) == 1 && clinicalColumns == "all") |
|
183 |
- clinicalColumns <- setdiff(clinicalColumnsDataset, outcomeColumns) |
|
184 |
- useFeatures <- useFeatures[!clinicalRows, ] |
|
185 |
- } else { |
|
186 |
- clinicalColumns <- NULL |
|
187 |
- } |
|
188 |
- |
|
189 |
- if(nrow(useFeatures) > 0) |
|
190 |
- { |
|
191 |
- measurements <- measurements[, , unique(useFeatures[, "assay"])] |
|
192 |
- # Get all desired measurements tables and clinical columns (other than the columns representing outcome). |
|
193 |
- # These form the independent variables to be used for making predictions with. |
|
194 |
- # Variable names will have names like RNA_BRAF for traceability. |
|
195 |
- dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns)) |
|
196 |
- rownames(dataTable) <- dataTable[, "primary"] |
|
197 |
- S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
198 |
- colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
192 |
+ # Get all desired measurements tables and clinical columns. |
|
193 |
+ # These form the independent variables to be used for making predictions with. |
|
194 |
+ # Variable names will have names like RNA_BRAF for traceability. |
|
195 |
+ dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalPredictors, outcomeColumns)) |
|
196 |
+ rownames(dataTable) <- dataTable[, "primary"] |
|
197 |
+ S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
198 |
+ dataTable <- dataTable[, -match("primary", colnames(dataTable))] |
|
199 |
+ colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
199 | 200 |
|
200 |
- # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
201 |
- # Create a combined column named "feature" which has feature names of the assays as well as the clinical. |
|
202 |
- S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
203 |
- missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
204 |
- S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
201 |
+ # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
202 |
+ # Create a combined column named "feature" which has feature names of the assays as well as the clinical. |
|
203 |
+ S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
204 |
+ missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
205 |
+ S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
205 | 206 |
|
206 |
- # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
207 |
- S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
207 |
+ # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
208 |
+ S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
208 | 209 |
|
209 |
- # Subset to only the desired features. |
|
210 |
- useFeaturesSubset <- useFeatures[useFeatures[, "feature"] != "all", ] |
|
211 |
- if(nrow(useFeaturesSubset) > 0) |
|
212 |
- { |
|
213 |
- uniqueAssays <- unique(useFeatures[, "assay"]) |
|
214 |
- for(filterAssay in uniqueAssays) |
|
215 |
- { |
|
216 |
- dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay & |
|
217 |
- !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2] |
|
218 |
- dataTable <- dataTable[, !dropFeatures] |
|
219 |
- } |
|
220 |
- } |
|
221 |
- dataTable <- dataTable[, -match("primary", colnames(dataTable))] |
|
222 |
- } else { # Must have only been clinical data. |
|
223 |
- dataTable <- MultiAssayExperiment::colData(measurements) |
|
224 |
- S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable)) |
|
225 |
- } |
|
210 |
+ # Do other filtering and preparation in DataFrame function. |
|
211 |
+ prepareData(dataTable, outcomeColumns, clinicalPredictors = NULL, ...) |
|
212 |
+}) |
|
213 |
+ |
|
214 |
+#' @rdname prepareData |
|
215 |
+#' @export |
|
216 |
+setMethod("prepareData", "list", |
|
217 |
+ function(measurements, outcome = NULL, clinicalPredictors = NULL, ...) |
|
218 |
+{ |
|
219 |
+ # Check the list is named. |
|
220 |
+ if(is.null(names(measurements))) |
|
221 |
+ stop("'measurements' must be a named list.") |
|
222 |
+ |
|
223 |
+ # If clinical table is present, features to use must be user-specified. |
|
224 |
+ if("clinical" %in% names(measurements) && is.null(clinicalPredictors)) |
|
225 |
+ stop("Because one provided table in the list is named \"clinical\", 'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.") |
|
226 |
+ |
|
227 |
+ # Check data type is valid. |
|
228 |
+ if(!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix")))) |
|
229 |
+ stop("assays in the list must be of type data.frame, DataFrame or matrix") |
|
230 |
+ |
|
231 |
+ # Check same number of samples for all datasets |
|
232 |
+ if (!length(unique(sapply(measurements, nrow))) == 1) |
|
233 |
+ stop("All datasets must have the same samples.") |
|
234 |
+ |
|
235 |
+ if("clinical" %in% names(measurements)) |
|
236 |
+ measurements[["clinical"]] <- measurements[["clinical"]][, clinicalPredictors] |
|
237 |
+ |
|
238 |
+ allMetadata <- mapply(function(measurementsOne, assayID) { |
|
239 |
+ data.frame(assay = assayID, feature = colnames(measurementsOne)) |
|
240 |
+ }, measurements, names(measurements)) |
|
241 |
+ allMeasurements <- do.call("cbind", measurements) |
|
242 |
+ # Different assays e.g. mRNA, protein could have same feature name e.g. BRAF. |
|
243 |
+ colnames(allMeasurements) <- paste(allMetadata[, "assay"], allMetadata[, "feature"], sep = '_') |
|
244 |
+ allDataFrame <- DataFrame(allMeasurements) |
|
245 |
+ S4Vectors::mcols(allMeasurements) <- allMetadata |
|
226 | 246 |
|
227 | 247 |
# Do other filtering and preparation in DataFrame function. |
228 |
- prepareData(dataTable, outcomeColumns, useFeatures = "all", ...) |
|
248 |
+ prepareData(dataTable, outcome, clinicalPredictors = NULL, ...) |
|
229 | 249 |
}) |
230 | 250 |
\ No newline at end of file |
... | ... |
@@ -6,8 +6,7 @@ |
6 | 6 |
\alias{crossValidate,DataFrame-method} |
7 | 7 |
\alias{crossValidate,MultiAssayExperiment-method,} |
8 | 8 |
\alias{crossValidate,data.frame-method} |
9 |
-\alias{crossValidate,MultiAssayExperiment-method} |
|
10 |
-\alias{crossValidate,list-method} |
|
9 |
+\alias{crossValidate,MultiAssayExperimentOrList-method} |
|
11 | 10 |
\alias{train.matrix} |
12 | 11 |
\alias{train.data.frame} |
13 | 12 |
\alias{train.DataFrame} |
... | ... |
@@ -16,8 +15,6 @@ |
16 | 15 |
\alias{predict.trainedByClassifyR} |
17 | 16 |
\title{Cross-validation to evaluate classification performance.} |
18 | 17 |
\usage{ |
19 |
-crossValidate(measurements, outcome, ...) |
|
20 |
- |
|
21 | 18 |
\S4method{crossValidate}{DataFrame}( |
22 | 19 |
measurements, |
23 | 20 |
outcome, |
... | ... |
@@ -35,9 +32,10 @@ crossValidate(measurements, outcome, ...) |
35 | 32 |
extraParams = NULL |
36 | 33 |
) |
37 | 34 |
|
38 |
-\S4method{crossValidate}{MultiAssayExperiment}( |
|
35 |
+\S4method{crossValidate}{MultiAssayExperimentOrList}( |
|
39 | 36 |
measurements, |
40 | 37 |
outcome, |
38 |
+ clinicalPredictors = NULL, |
|
41 | 39 |
nFeatures = 20, |
42 | 40 |
selectionMethod = "auto", |
43 | 41 |
selectionOptimisation = "Resubstitution", |
... | ... |
@@ -86,23 +84,6 @@ crossValidate(measurements, outcome, ...) |
86 | 84 |
extraParams = NULL |
87 | 85 |
) |
88 | 86 |
|
89 |
-\S4method{crossValidate}{list}( |
|
90 |
- measurements, |
|
91 |
- outcome, |
|
92 |
- nFeatures = 20, |
|
93 |
- selectionMethod = "auto", |
|
94 |
- selectionOptimisation = "Resubstitution", |
|
95 |
- performanceType = "auto", |
|
96 |
- classifier = "auto", |
|
97 |
- multiViewMethod = "none", |
|
98 |
- assayCombinations = "all", |
|
99 |
- nFolds = 5, |
|
100 |
- nRepeats = 20, |
|
101 |
- nCores = 1, |
|
102 |
- characteristicsLabel = NULL, |
|
103 |
- extraParams = NULL |
|
104 |
-) |
|
105 |
- |
|
106 | 87 |
\method{train}{matrix}(x, outcomeTrain, ...) |
107 | 88 |
|
108 | 89 |
\method{train}{data.frame}(x, outcomeTrain, ...) |
... | ... |
@@ -122,7 +103,7 @@ crossValidate(measurements, outcome, ...) |
122 | 103 |
|
123 | 104 |
\method{train}{list}(x, outcomeTrain, ...) |
124 | 105 |
|
125 |
-\method{train}{MultiAssayExperiment}(x, outcome, ...) |
|
106 |
+\method{train}{MultiAssayExperiment}(x, outcome, clinicalPredictors = NULL, ...) |
|
126 | 107 |
|
127 | 108 |
\method{predict}{trainedByClassifyR}(object, newData, ...) |
128 | 109 |
} |
... | ... |
@@ -146,7 +127,8 @@ Set to NULL or "all" if all features should be used.} |
146 | 127 |
|
147 | 128 |
\item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
148 | 129 |
and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking |
149 |
-and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.} |
|
130 |
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no |
|
131 |
+indepedent feature selection will be performed (but implicit selection might still happen with the classifier).} |
|
150 | 132 |
|
151 | 133 |
\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.} |
152 | 134 |
|
... | ... |
@@ -172,7 +154,11 @@ with each element being a vector of assays to combine. Special value \code{"all" |
172 | 154 |
\item{extraParams}{A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or |
173 | 155 |
parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"}, |
174 | 156 |
\code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on |
175 |
-the website), specify the list element to be \code{NULL}.} |
|
157 |
+the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}.} |
|
158 |
+ |
|
159 |
+\item{clinicalPredictors}{Default: \code{NULL}. If \code{measurements} is a \code{MultiAssayExperiment}, |
|
160 |
+a character vector of features to use in modelling. This allows avoidance of things like sample IDs, |
|
161 |
+sample acquisition dates, etc. which are not relevant for outcome prediction.} |
|
176 | 162 |
|
177 | 163 |
\item{x}{Same as \code{measurements} but only training samples.} |
178 | 164 |
|
179 | 165 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,80 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/precisionPathways.R |
|
3 |
+\name{precisionPathwaysTrain} |
|
4 |
+\alias{precisionPathwaysTrain} |
|
5 |
+\alias{precisionPathwaysTrain,MultiAssayExperimentOrList-method} |
|
6 |
+\alias{precisionPathwaysPredict,MultiAssayExperimentOrList-method} |
|
7 |
+\title{Precision Pathways for Sample Prediction Based on Prediction Confidence.} |
|
8 |
+\usage{ |
|
9 |
+\S4method{precisionPathwaysTrain}{MultiAssayExperimentOrList}( |
|
10 |
+ measurements, |
|
11 |
+ class, |
|
12 |
+ clinicalPredictors = NULL, |
|
13 |
+ maxMissingProp = 0, |
|
14 |
+ topNvariance = NULL, |
|
15 |
+ fixedAssays = "clinical", |
|
16 |
+ confidenceCutoff = 0.8, |
|
17 |
+ minAssaySamples = 10, |
|
18 |
+ nFeatures = 20, |
|
19 |
+ selectionMethod = setNames(c("none", rep("t-test", length(measurements))), |
|
20 |
+ c("clinical", names(measurements))), |
|
21 |
+ classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), |
|
22 |
+ c("clinical", names(measurements))), |
|
23 |
+ nFolds = 5, |
|
24 |
+ nRepeats = 20, |
|
25 |
+ nCores = 1 |
|
26 |
+) |
|
27 |
+ |
|
28 |
+\S4method{precisionPathwaysPredict}{MultiAssayExperimentOrList}(pathways, measurements, class) |
|
29 |
+} |
|
30 |
+\arguments{ |
|
31 |
+\item{measurements}{Either a \code{\link{MultiAssayExperiment}} or a list of the basic tabular objects containing the data.} |
|
32 |
+ |
|
33 |
+\item{class}{Same as \code{measurements} but only training samples. IF \code{measurements} is a \code{list}, may also be |
|
34 |
+a vector of classes.} |
|
35 |
+ |
|
36 |
+\item{clinicalPredictors}{Default: \code{NULL}. Must be a character vector of clinical features to use in modelling. This allows avoidance of things like sample IDs, |
|
37 |
+sample acquisition dates, etc. which are not relevant for outcome prediction.} |
|
38 |
+ |
|
39 |
+\item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum |
|
40 |
+tolerated proportion of missingness for a feature to be retained for modelling.} |
|
41 |
+ |
|
42 |
+\item{topNvariance}{Default: NULL. An integer number of most variable features per assay to subset to. |
|
43 |
+Assays with less features won't be reduced in size.} |
|
44 |
+ |
|
45 |
+\item{fixedAssays}{A character vector of assay names specifying any assays which must be at the |
|
46 |
+beginning of the pathway.} |
|
47 |
+ |
|
48 |
+\item{confidenceCutoff}{The minimum confidence of predictions for a sample to be predicted by a particular issue |
|
49 |
+. If a sample was predicted to belong to a particular class a proportion \eqn{p} times, then the confidence is \eqn{2 \times |p - 0.5|}.} |
|
50 |
+ |
|
51 |
+\item{minAssaySamples}{An integer specifying the minimum number of samples a tier may have. If a subsequent tier |
|
52 |
+would have less than this number of samples, the samples are incorporated into the current tier.} |
|
53 |
+ |
|
54 |
+\item{nFeatures}{Default: 20. The number of features to consider during feature selection, if feature selection is done.} |
|
55 |
+ |
|
56 |
+\item{selectionMethod}{A named character vector of feature selection methods to use for the assays, one for each. The names must correspond to names of \code{measurements}.} |
|
57 |
+ |
|
58 |
+\item{classifier}{A named character vector of modelling methods to use for the assays, one for each. The names must correspond to names of \code{measurements}.} |
|
59 |
+ |
|
60 |
+\item{nFolds}{A numeric specifying the number of folds to use for cross-validation.} |
|
61 |
+ |
|
62 |
+\item{nRepeats}{A numeric specifying the the number of repeats or permutations to use for cross-validation.} |
|
63 |
+ |
|
64 |
+\item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.} |
|
65 |
+ |
|
66 |
+\item{pathways}{A set of pathways created by \code{precisionPathwaysTrain} to be used for predicting on a new data set.} |
|
67 |
+} |
|
68 |
+\value{ |
|
69 |
+An object of class \code{PrecisionPathways} which is basically a named list that other plotting and |
|
70 |
+tabulating functions can use. |
|
71 |
+} |
|
72 |
+\description{ |
|
73 |
+Precision pathways allows the evaluation of various permutations of multiomics or multiview data. |
|
74 |
+Samples are predicted by a particular assay if they were consistently predicted as a particular class |
|
75 |
+during cross-validation. Otherwise, they are passed onto subsequent assays/tiers for prediction. Balanced accuracy |
|
76 |
+is used to evaluate overall prediction performance and sample-specific accuracy for individual-level evaluation. |
|
77 |
+} |
|
78 |
+\examples{ |
|
79 |
+# To be determined. |
|
80 |
+} |
... | ... |
@@ -6,6 +6,7 @@ |
6 | 6 |
\alias{prepareData,DataFrame-method} |
7 | 7 |
\alias{prepareData,MultiAssayExperiment-method} |
8 | 8 |
\alias{prepareData,data.frame-method} |
9 |
+\alias{prepareData,list-method} |
|
9 | 10 |
\title{Convert Different Data Classes into DataFrame and Filter Features} |
10 | 11 |
\usage{ |
11 | 12 |
\S4method{prepareData}{matrix}(measurements, outcome, ...) |
... | ... |
@@ -15,12 +16,19 @@ |
15 | 16 |
\S4method{prepareData}{DataFrame}( |
16 | 17 |
measurements, |
17 | 18 |
outcome, |
18 |
- useFeatures = "all", |
|
19 |
+ clinicalPredictors = NULL, |
|
19 | 20 |
maxMissingProp = 0, |
20 | 21 |
topNvariance = NULL |
21 | 22 |
) |
22 | 23 |
|
23 |
-\S4method{prepareData}{MultiAssayExperiment}(measurements, outcomeColumns = NULL, useFeatures = "all", ...) |
|
24 |
+\S4method{prepareData}{MultiAssayExperiment}( |
|
25 |
+ measurements, |
|
26 |
+ outcomeColumns = NULL, |
|
27 |
+ clinicalPredictors = NULL, |
|
28 |
+ ... |
|
29 |
+) |
|
30 |
+ |
|
31 |
+\S4method{prepareData}{list}(measurements, outcome = NULL, clinicalPredictors = NULL, ...) |
|
24 | 32 |
} |
25 | 33 |
\arguments{ |
26 | 34 |
\item{measurements}{Either a \code{\link{matrix}}, \code{\link{DataFrame}} |
... | ... |
@@ -37,17 +45,15 @@ a character string, or vector of such strings, containing column name(s) of colu |
37 | 45 |
containing either classes or time and event information about survival. If column names |
38 | 46 |
of survival information, time must be in first column and event status in the second.} |
39 | 47 |
|
40 |
-\item{useFeatures}{If \code{measurements} is a \code{MultiAssayExperiment}, |
|
41 |
-a two-column table of features to use. The first column must have assay names |
|
42 |
-and the second column must have feature names found for that assay. \code{"clinical"} is |
|
43 |
-also a valid assay name and refers to the clinical data table. \code{"all"} is a special |
|
44 |
-keyword that means all features (passing any other filters) of that assay will be used |
|
45 |
-for modelling. Otherwise, a character vector of feature names to use suffices.} |
|
48 |
+\item{clinicalPredictors}{If \code{measurements} is a \code{MultiAssayExperiment}, |
|
49 |
+a character vector of features to use in modelling. This allows avoidance of things like sample IDs, |
|
50 |
+sample acquisition dates, etc. which are not relevant for outcome prediction.} |
|
46 | 51 |
|
47 | 52 |
\item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum |
48 | 53 |
tolerated proportion of missingness for a feature to be retained for modelling.} |
49 | 54 |
|
50 |
-\item{topNvariance}{Default: NULL. An integer number of most variable features to subset to.} |
|
55 |
+\item{topNvariance}{Default: NULL. An integer number of most variable features per assay to subset to. |
|
56 |
+Assays with less features won't be reduced in size.} |
|
51 | 57 |
|
52 | 58 |
\item{outcomeColumns}{If \code{measurements} is a \code{MultiAssayExperiment}, the |
53 | 59 |
names of the column (class) or columns (survival) in the table extracted by \code{colData(data)} |