... | ... |
@@ -35,6 +35,9 @@ setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL")) |
35 | 35 |
setClassUnion("tabular", c("data.frame", "DataFrame", "matrix")) |
36 | 36 |
setClassUnion("tabularOrList", c("tabular", "list")) |
37 | 37 |
|
38 |
+# List-like assay data |
|
39 |
+setClassUnion("MultiAssayExperimentOrList", c("MultiAssayExperiment", "list")) |
|
40 |
+ |
|
38 | 41 |
################################################################################ |
39 | 42 |
# |
40 | 43 |
# Params |
... | ... |
@@ -22,14 +22,18 @@ |
22 | 22 |
#' @param extraParams A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or |
23 | 23 |
#' parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"}, |
24 | 24 |
#' \code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on |
25 |
-#' the website), specify the list element to be \code{NULL}. |
|
25 |
+#' the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}. |
|
26 |
+#' @param clinicalPredictors If \code{measurements} is a \code{MultiAssayExperiment}, |
|
27 |
+#' a character vector of clinical features (columns of the sample information table) to use in modelling. This allows avoidance of things like sample IDs, |
|
28 |
+#' sample acquisition dates, etc. which are not relevant for outcome prediction. |
|
26 | 29 |
#' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons |
27 | 30 |
#' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, |
28 | 31 |
#' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. |
29 | 32 |
#' Set to NULL or "all" if all features should be used. |
30 | 33 |
#' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
31 | 34 |
#' and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking |
32 |
-#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. |
|
35 |
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no |
|
36 |
+#' independent feature selection will be performed (but implicit selection might still happen with the classifier). |
|
33 | 37 |
#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}. |
34 | 38 |
#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the |
35 | 39 |
#' options described in \code{\link{calcPerformance}} may otherwise be specified. |
... | ... |
@@ -93,7 +97,7 @@ setGeneric("crossValidate", function(measurements, outcome, ...) |
93 | 97 |
|
94 | 98 |
#' @rdname crossValidate |
95 | 99 |
#' @export |
96 |
-setMethod("crossValidate", "DataFrame", |
|
100 |
+setMethod("crossValidate", "DataFrame", |
|
97 | 101 |
function(measurements, |
98 | 102 |
outcome, |
99 | 103 |
nFeatures = 20, |
... | ... |
@@ -110,12 +114,15 @@ setMethod("crossValidate", "DataFrame", |
110 | 114 |
|
111 | 115 |
{ |
112 | 116 |
# Check that data is in the right format, if not already done for MultiAssayExperiment input. |
113 |
- prepParams <- list(measurements, outcome) |
|
114 |
- if("prepare" %in% names(extraParams)) |
|
115 |
- prepParams <- c(prepParams, extraParams[["prepare"]]) |
|
116 |
- measurementsAndOutcome <- do.call(prepareData, prepParams) |
|
117 |
- measurements <- measurementsAndOutcome[["measurements"]] |
|
118 |
- outcome <- measurementsAndOutcome[["outcome"]] |
|
117 |
+ if(!"assay" %in% colnames(S4Vectors::mcols(measurements))) # An "assay" metadata column is put there by prepareData for MultiAssayExperiment or list input; skip preparation if present. colnames(NULL) is NULL, so a table without column metadata is still prepared. |

118 |
+ { |

119 |
+ prepParams <- list(measurements, outcome, clinicalPredictors) # NOTE(review): assumes clinicalPredictors is a formal of this method; the full signature is not visible in this hunk - confirm. |

120 |
+ if("prepare" %in% names(extraParams)) |

121 |
+ prepParams <- c(prepParams, extraParams[["prepare"]]) # User-specified data preparation options are appended to the positional arguments. |

122 |
+ measurementsAndOutcome <- do.call(prepareData, prepParams) |

123 |
+ measurements <- measurementsAndOutcome[["measurements"]] |

124 |
+ outcome <- measurementsAndOutcome[["outcome"]] |

125 |
+ } |
|
119 | 126 |
|
120 | 127 |
# Ensure performance type is one of the ones that can be calculated by the package. |
121 | 128 |
if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]])) |
... | ... |
@@ -305,9 +312,10 @@ setMethod("crossValidate", "DataFrame", |
305 | 312 |
#' @rdname crossValidate |
306 | 313 |
#' @export |
307 | 314 |
# One or more omics data sets, possibly with clinical data. |
308 |
-setMethod("crossValidate", "MultiAssayExperiment", |
|
315 |
+setMethod("crossValidate", "MultiAssayExperimentOrList", |
|
309 | 316 |
function(measurements, |
310 |
- outcome, |
|
317 |
+ outcome, |
|
318 |
+ clinicalPredictors = NULL, |
|
311 | 319 |
nFeatures = 20, |
312 | 320 |
selectionMethod = "auto", |
313 | 321 |
selectionOptimisation = "Resubstitution", |
... | ... |
@@ -321,7 +329,7 @@ setMethod("crossValidate", "MultiAssayExperiment", |
321 | 329 |
characteristicsLabel = NULL, extraParams = NULL) |
322 | 330 |
{ |
323 | 331 |
# Check that data is in the right format, if not already done for MultiAssayExperiment input. |
324 |
- prepParams <- list(measurements, outcome) |
|
332 |
+ prepParams <- list(measurements, outcome, clinicalPredictors) |
|
325 | 333 |
if("prepare" %in% names(extraParams)) |
326 | 334 |
prepParams <- c(prepParams, extraParams[["prepare"]]) |
327 | 335 |
measurementsAndOutcome <- do.call(prepareData, prepParams) |
... | ... |
@@ -408,76 +416,6 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements. |
408 | 416 |
characteristicsLabel = characteristicsLabel, extraParams = extraParams) |
409 | 417 |
}) |
410 | 418 |
|
411 |
-# This expects that each table is about the same set of samples and thus |
|
412 |
-# has the same number of rows as every other table. |
|
413 |
-#' @rdname crossValidate |
|
414 |
-#' @export |
|
415 |
-setMethod("crossValidate", "list", |
|
416 |
- function(measurements, |
|
417 |
- outcome, |
|
418 |
- nFeatures = 20, |
|
419 |
- selectionMethod = "auto", |
|
420 |
- selectionOptimisation = "Resubstitution", |
|
421 |
- performanceType = "auto", |
|
422 |
- classifier = "auto", |
|
423 |
- multiViewMethod = "none", |
|
424 |
- assayCombinations = "all", |
|
425 |
- nFolds = 5, |
|
426 |
- nRepeats = 20, |
|
427 |
- nCores = 1, |
|
428 |
- characteristicsLabel = NULL, extraParams = NULL) |
|
429 |
- { |
|
430 |
- # Check data type is valid |
|
431 |
- if (!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix")))) { |
|
432 |
- stop("assays must be of type data.frame, DataFrame or matrix") |
|
433 |
- } |
|
434 |
- |
|
435 |
- # Check the list is named |
|
436 |
- if (is.null(names(measurements))) { |
|
437 |
- stop("Measurements must be a named list.") |
|
438 |
- } |
|
439 |
- |
|
440 |
- # Check same number of samples for all datasets |
|
441 |
- if (!length(unique(sapply(measurements, nrow))) == 1) { |
|
442 |
- stop("All datasets must have the same samples.") |
|
443 |
- } |
|
444 |
- |
|
445 |
- # Check the number of outcome is the same |
|
446 |
- if (!all(sapply(measurements, nrow) == length(outcome)) && !is.character(outcome)) { |
|
447 |
- stop("outcome must have same number of samples as measurements.") |
|
448 |
- } |
|
449 |
- |
|
450 |
- df_list <- sapply(measurements, S4Vectors::DataFrame, check.names = FALSE) |
|
451 |
- |
|
452 |
- df_list <- mapply(function(meas, nam){ |
|
453 |
- S4Vectors::mcols(meas)$assay <- nam |
|
454 |
- S4Vectors::mcols(meas)$feature <- colnames(meas) |
|
455 |
- meas |
|
456 |
- }, df_list, names(df_list)) |
|
457 |
- |
|
458 |
- |
|
459 |
- combined_df <- do.call("cbind", df_list) |
|
460 |
- colnames(combined_df) <- S4Vectors::mcols(combined_df)$feature |
|
461 |
- |
|
462 |
- |
|
463 |
- |
|
464 |
- crossValidate(measurements = combined_df, |
|
465 |
- outcome = outcome, |
|
466 |
- nFeatures = nFeatures, |
|
467 |
- selectionMethod = selectionMethod, |
|
468 |
- selectionOptimisation = selectionOptimisation, |
|
469 |
- performanceType = performanceType, |
|
470 |
- classifier = classifier, |
|
471 |
- multiViewMethod = multiViewMethod, |
|
472 |
- assayCombinations = assayCombinations, |
|
473 |
- nFolds = nFolds, |
|
474 |
- nRepeats = nRepeats, |
|
475 |
- nCores = nCores, |
|
476 |
- characteristicsLabel = characteristicsLabel, extraParams = extraParams) |
|
477 |
- }) |
|
478 |
- |
|
479 |
- |
|
480 |
- |
|
481 | 419 |
###################################### |
482 | 420 |
###################################### |
483 | 421 |
cleanNFeatures <- function(nFeatures, measurements){ |
... | ... |
@@ -1126,9 +1064,9 @@ train.list <- function(x, outcomeTrain, ...) |
1126 | 1064 |
#' @rdname crossValidate |
1127 | 1065 |
#' @method train MultiAssayExperiment |
1128 | 1066 |
#' @export |
1129 |
-train.MultiAssayExperiment <- function(x, outcome, ...) |
|
1067 |
+train.MultiAssayExperiment <- function(x, outcome, clinicalPredictors = NULL, ...) |
|
1130 | 1068 |
{ |
1131 |
- prepArgs <- list(x, outcome) |
|
1069 |
+ prepArgs <- list(x, outcome, clinicalPredictors) |
|
1132 | 1070 |
extraInputs <- list(...) |
1133 | 1071 |
prepExtras <- trainExtras <- numeric() |
1134 | 1072 |
if(length(extraInputs) > 0) |
... | ... |
@@ -1167,7 +1105,7 @@ predict.trainedByClassifyR <- function(object, newData, ...) |
1167 | 1105 |
newData <- do.call(cbind, newData) |
1168 | 1106 |
} else if(is(newData, "MultiAssayExperiment")) |
1169 | 1107 |
{ |
1170 |
- newData <- prepareData(newData, useFeatures = allFeatureNames(object)) |
|
1108 |
+ newData <- prepareData(newData, clinicalPredictors = subset(allFeatureNames(object), assay == "clinical")[, "feature"]) |
|
1171 | 1109 |
# Some classifiers dangerously use positional matching rather than column name matching. |
1172 | 1110 |
# newData columns are sorted so that the right column ordering is guaranteed. |
1173 | 1111 |
} |
1174 | 1112 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,77 @@ |
1 |
+# Basically, an ordered list of cross-validations. |
|
2 |
+ |
|
3 |
+setGeneric("precisionPathwayTrain", function(measurements, class, ...) { |

4 |
+ standardGeneric("precisionPathwayTrain") }) |
|
5 |
+ |
|
6 |
+#' @rdname precisionPathwayTrain |
|
7 |
+#' @export |
|
8 |
+setMethod("precisionPathwayTrain", "MultiAssayExperiment", |

9 |
+ function(measurements, class, clinicalPredictors = NULL, ..., fixedAssays = "clinical", |

10 |
+ confidenceCutoff = 0.8, minAssaySamples = 10, |

11 |
+ nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))), |

12 |
+ classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |

13 |
+ nFolds = 5, nRepeats = 20, nCores = 1) |

14 |
+ { |

15 |
+ preparationArguments <- list(measurements, class, clinicalPredictors) |

16 |
+ dotArguments <- list(...) |

17 |
+ # Only those extra arguments whose names are formal arguments of prepareData are forwarded to it. |

18 |
+ forPreparation <- names(dotArguments) %in% .ClassifyRenvir[["prepareDataFormals"]] |

19 |
+ if(any(forPreparation)) |

20 |
+ preparationArguments <- append(preparationArguments, dotArguments[forPreparation]) |

21 |
+ prepared <- do.call(prepareData, preparationArguments) |

22 |
+ |

23 |
+ # Hand the prepared table and outcome to the internal function which does the modelling. |

24 |
+ .precisionPathwayTrain(prepared[["measurements"]], prepared[["outcome"]], fixedAssays = fixedAssays, |

25 |
+ confidenceCutoff = confidenceCutoff, minAssaySamples = minAssaySamples, |

26 |
+ nFeatures = nFeatures, selectionMethod = selectionMethod, |

27 |
+ classifier = classifier, nFolds = nFolds, |

28 |
+ nRepeats = nRepeats, nCores = nCores) |

29 |
+ }) |
|
30 |
+ |
|
31 |
+#' @rdname precisionPathwayTrain |
|
32 |
+#' @export |
|
33 |
+setMethod("precisionPathwayTrain", "list", |

34 |
+ function(measurements, class, clinicalPredictors = NULL, fixedAssays = "clinical", |

35 |
+ confidenceCutoff = 0.8, minAssaySamples = 10, |

36 |
+ nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))), |

37 |
+ classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |

38 |
+ nFolds = 5, nRepeats = 20, nCores = 1, ...) |

39 |
+ { |

40 |
+ # One of the tables must be named "clinical". An unnamed list also fails this check. |

41 |
+ if (!any(names(measurements) == "clinical")) |

42 |
+ stop("One of the tables must be named \"clinical\".") |

43 |
+ prepared <- prepareData(measurements, class, clinicalPredictors, ...) # Combine the tables into one annotated table; extra arguments are treated as data preparation options. |

44 |
+ .precisionPathwayTrain(prepared[["measurements"]], prepared[["outcome"]], fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff, minAssaySamples = minAssaySamples, nFeatures = nFeatures, selectionMethod = selectionMethod, classifier = classifier, nFolds = nFolds, nRepeats = nRepeats, nCores = nCores) |

45 |
+ }) |
|
46 |
+ |
|
47 |
+# Internal method which carries out all of the processing, obtaining reformatted data from the |
|
48 |
+# MultiAssayExperiment and list (of basic rectangular tables) S4 methods. |
|
49 |
+.precisionPathwayTrain <- function(measurements, class, fixedAssays = "clinical", |

50 |
+ confidenceCutoff = 0.8, minAssaySamples = 10, |

51 |
+ nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))), # NOTE(review): c(NULL, ...) drops the NULL, so there is one fewer value than names and this default errors if it is ever evaluated. |

52 |
+ classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |

53 |
+ nFolds = 5, nRepeats = 20, nCores = 1, ...) |

54 |
+ { |

55 |
+ # Step 1: Separate the vector classes from the table of measurements, if not already separate. No clinical predictors are passed because this function has no such parameter; the S4 methods calling it have already used them. |

56 |
+ prepArgs <- list(measurements, class) |

57 |
+ extraInputs <- list(...) |

58 |
+ prepExtras <- numeric() |

59 |
+ if(length(extraInputs) > 0) |

60 |
+ prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]]) # Only arguments which prepareData understands are forwarded to it. |

61 |
+ if(length(prepExtras) > 0) |

62 |
+ prepArgs <- append(prepArgs, extraInputs[prepExtras]) |

63 |
+ |

64 |
+ measurementsAndClass <- do.call(prepareData, prepArgs) |

65 |
+ measurements <- measurementsAndClass[["measurements"]] |

66 |
+ class <- measurementsAndClass[["outcome"]] |

67 |
+ |

68 |
+ # Step 2: Determine all valid permutations of assays, taking into account the |

69 |
+ # assays to be used and which assays, if any, must be included. |

70 |
+ assayIDs <- unique(S4Vectors::mcols(measurements)[["assay"]]) |

71 |
+ assaysPermutations <- .permutations(assayIDs, fixed = data.frame(seq_along(fixedAssays), fixedAssays)) |

72 |
+ |

73 |
+ # Step 3: Build a classifier for each assay using all of the samples. Arguments are named so that they can't be matched to the wrong formal argument of a crossValidate method. |

74 |
+ modelsList <- crossValidate(measurements, class, nFeatures = nFeatures, selectionMethod = selectionMethod, |

75 |
+ classifier = classifier, nFolds = nFolds, |

76 |
+ nRepeats = nRepeats, nCores = nCores) |

77 |
+ } |
|
0 | 78 |
\ No newline at end of file |
... | ... |
@@ -18,15 +18,13 @@ |
18 | 18 |
#' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the |
19 | 19 |
#' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)} |
20 | 20 |
#' that contain(s) each individual's outcome to use for prediction. |
21 |
-#' @param useFeatures If \code{measurements} is a \code{MultiAssayExperiment}, |
|
22 |
-#' a two-column table of features to use. The first column must have assay names |
|
23 |
-#' and the second column must have feature names found for that assay. \code{"clinical"} is |
|
24 |
-#' also a valid assay name and refers to the clinical data table. \code{"all"} is a special |
|
25 |
-#' keyword that means all features (passing any other filters) of that assay will be used |
|
26 |
-#' for modelling. Otherwise, a character vector of feature names to use suffices. |
|
21 |
+#' @param clinicalPredictors If \code{measurements} is a \code{MultiAssayExperiment}, |
|
22 |
+#' a character vector of clinical features (columns of the sample information table) to use in modelling. This allows avoidance of things like sample IDs, |
|
23 |
+#' sample acquisition dates, etc. which are not relevant for outcome prediction. |
|
27 | 24 |
#' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum |
28 | 25 |
#' tolerated proportion of missingness for a feature to be retained for modelling. |
29 |
-#' @param topNvariance Default: NULL. An integer number of most variable features to subset to. |
|
26 |
+#' @param topNvariance Default: NULL. An integer number of most variable features per assay to subset to. |
|
27 |
+#' Assays with fewer features than this won't be reduced in size. |
|
30 | 28 |
#' @param ... Variables not used by the \code{matrix} nor the |
31 | 29 |
#' \code{MultiAssayExperiment} method which are passed into and used by the |
32 | 30 |
#' \code{DataFrame} method. |
... | ... |
@@ -58,18 +56,16 @@ setMethod("prepareData", "data.frame", |
58 | 56 |
#' @rdname prepareData |
59 | 57 |
#' @export |
60 | 58 |
setMethod("prepareData", "DataFrame", |
61 |
- function(measurements, outcome, useFeatures = "all", maxMissingProp = 0.0, topNvariance = NULL) |
|
59 |
+ function(measurements, outcome, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL) |
|
62 | 60 |
{ |
63 | 61 |
if(is.null(rownames(measurements))) |
64 | 62 |
{ |
65 | 63 |
warning("'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.") |
66 | 64 |
rownames(measurements) <- paste("Sample", seq_len(nrow(measurements))) |
67 |
- } |
|
68 |
- |
|
69 |
- if(useFeatures != "all") # Subset to only the desired ones. |
|
70 |
- measurements <- measurements[, useFeatures] |
|
71 |
- |
|
65 |
+ } |
|
66 |
+ |
|
72 | 67 |
# Won't ever be true if input data was MultiAssayExperiment because wideFormat already produces valid names. |
68 |
+ # Need to check if input data was DataFrame because names might not be valid from user. |
|
73 | 69 |
if(!all(colnames(measurements) == make.names(colnames(measurements)))) |
74 | 70 |
{ |
75 | 71 |
warning("Unsafe feature names in input data. Converted into safe names.") |
... | ... |
@@ -129,6 +125,19 @@ setMethod("prepareData", "DataFrame", |
129 | 125 |
else # Three columns. Therefore, counting process data. |
130 | 126 |
outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3]) |
131 | 127 |
} |
128 |
+ |
|
129 |
+ if(!is.null(clinicalPredictors)) |
|
130 |
+ { |
|
131 |
+ if(!is.null(mcols(measurements)$assay)) |
|
132 |
+ { |
|
133 |
+ clinicalIndices <- which(mcols(measurements)$assay == "clinical") |
|
134 |
+ usePredictors <- intersect(clinicalIndices, which(mcols(measurements)$feature %in% clinicalPredictors)) |
|
135 |
+ dropIndices <- setdiff(clinicalIndices, usePredictors) |
|
136 |
+ if(length(dropIndices) > 0) measurements <- measurements[, -dropIndices] |
|
137 |
+ } else { # The DataFrame is entirely clinical data. |
|
138 |
+ measurements <- measurements[, clinicalPredictors] |
|
139 |
+ } |
|
140 |
+ } |
|
132 | 141 |
|
133 | 142 |
# Remove samples with indeterminate outcome. |
134 | 143 |
dropSamples <- which(is.na(outcome) | is.null(outcome)) |
... | ... |
@@ -146,11 +155,18 @@ setMethod("prepareData", "DataFrame", |
146 | 155 |
if(length(dropFeatures) > 0) |
147 | 156 |
measurements <- measurements[, -dropFeatures] |
148 | 157 |
|
149 |
- # Use only the most variable features. |
|
158 |
+ # Use only the most N variable features per assay. |
|
150 | 159 |
if(!is.null(topNvariance)) |
151 | 160 |
{ |
152 |
- mostVariance <- order(apply(measurements, 2, var, na.rm = TRUE), decreasing = TRUE)[1:topNvariance] |
|
153 |
- measurements <- measurements[, mostVariance] |
|
161 |
+ if(is.null(S4Vectors::mcols(measurements)$assay)) assays <- rep(1, ncol(measurements)) else assays <- S4Vectors::mcols(measurements)$assay # A table without assay annotation is treated as one assay. |

162 |
+ measurements <- do.call(cbind, lapply(unique(assays), function(assay) # The filtered tables are joined and replace the input table. NOTE(review): confirm that cbind keeps the column metadata. |

163 |
+ { |

164 |
+ assayTable <- measurements[, which(assays == assay), drop = FALSE] # drop = FALSE so that an assay with a single feature remains a table. |

165 |
+ if(ncol(assayTable) < topNvariance) |

166 |
+ assayTable |

167 |
+ else |

168 |
+ assayTable[, order(apply(assayTable, 2, var, na.rm = TRUE), decreasing = TRUE)[seq_len(topNvariance)], drop = FALSE] |

169 |
+ })) |
|
154 | 170 |
} |
155 | 171 |
|
156 | 172 |
list(measurements = measurements, outcome = outcome) |
... | ... |
@@ -159,71 +175,74 @@ setMethod("prepareData", "DataFrame", |
159 | 175 |
#' @rdname prepareData |
160 | 176 |
#' @export |
161 | 177 |
setMethod("prepareData", "MultiAssayExperiment", |
162 |
- function(measurements, outcomeColumns = NULL, useFeatures = "all", ...) |
|
178 |
+ function(measurements, outcomeColumns = NULL, clinicalPredictors = NULL, ...) |
|
163 | 179 |
{ |
164 |
- if(is.character(useFeatures)) useFeatures <- data.frame(assay = names(measurements), feature = "all") |
|
165 |
- omicsTargets <- setdiff(useFeatures[, "assay"], "clinical") |
|
166 |
- if(length(omicsTargets) > 0) |
|
167 |
- { |
|
168 |
- if(any(anyReplicated(measurements[, , omicsTargets]))) |
|
169 |
- stop("Data set contains replicates. Please remove or average replicate observations and try again.") |
|
170 |
- } |
|
171 |
- |
|
172 |
- if(!is.null(outcomeColumns) && !all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
|
180 |
+ if(is.null(clinicalPredictors)) |
|
181 |
+ stop("'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.") |
|
182 |
+ |
|
183 |
+ if(any(anyReplicated(measurements))) # Every assay is checked for replicates now that assays can no longer be chosen by the user. |

184 |
+ stop("Data set contains replicates. Please remove or average replicate observations and try again.") |

185 |
+ |

186 |
+ if(is.null(outcomeColumns)) |
|
187 |
+ stop("'outcomeColumns' is a mandatory parameter but was not specified.") |
|
188 |
+ |
|
189 |
+ if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
|
173 | 190 |
stop("Not all column names specified by 'outcomeColumns' found in clinical table.") |
174 |
- if(!all(useFeatures[, "assay"] %in% c(names(measurements), "clinical"))) |
|
175 |
- stop("Some assay names in first column of 'useFeatures' are not assay names in 'measurements' or \"clinical\".") |
|
176 | 191 |
|
177 |
- clinicalColumnsDataset <- colnames(MultiAssayExperiment::colData(measurements)) |
|
178 |
- if("clinical" %in% useFeatures[, "assay"]) |
|
179 |
- { |
|
180 |
- clinicalRows <- useFeatures[, "assay"] == "clinical" |
|
181 |
- clinicalColumns <- useFeatures[clinicalRows, "feature"] |
|
182 |
- if(length(clinicalColumns) == 1 && clinicalColumns == "all") |
|
183 |
- clinicalColumns <- setdiff(clinicalColumnsDataset, outcomeColumns) |
|
184 |
- useFeatures <- useFeatures[!clinicalRows, ] |
|
185 |
- } else { |
|
186 |
- clinicalColumns <- NULL |
|
187 |
- } |
|
188 |
- |
|
189 |
- if(nrow(useFeatures) > 0) |
|
190 |
- { |
|
191 |
- measurements <- measurements[, , unique(useFeatures[, "assay"])] |
|
192 |
- # Get all desired measurements tables and clinical columns (other than the columns representing outcome). |
|
193 |
- # These form the independent variables to be used for making predictions with. |
|
194 |
- # Variable names will have names like RNA_BRAF for traceability. |
|
195 |
- dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns)) |
|
196 |
- rownames(dataTable) <- dataTable[, "primary"] |
|
197 |
- S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
198 |
- colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
192 |
+ # Get all desired measurements tables and clinical columns. |
|
193 |
+ # These form the independent variables to be used for making predictions with. |
|
194 |
+ # Variable names will have names like RNA_BRAF for traceability. |
|
195 |
+ dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalPredictors, outcomeColumns)) |
|
196 |
+ rownames(dataTable) <- dataTable[, "primary"] |
|
197 |
+ S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
198 |
+ colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
199 | 199 |
|
200 |
- # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
201 |
- # Create a combined column named "feature" which has feature names of the assays as well as the clinical. |
|
202 |
- S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
203 |
- missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
204 |
- S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
200 |
+ # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
201 |
+ # Create a combined column named "feature" which has feature names of the assays as well as the clinical. |
|
202 |
+ S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
203 |
+ missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
204 |
+ S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
205 | 205 |
|
206 |
- # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
207 |
- S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
206 |
+ # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
207 |
+ S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
208 | 208 |
|
209 |
- # Subset to only the desired features. |
|
210 |
- useFeaturesSubset <- useFeatures[useFeatures[, "feature"] != "all", ] |
|
211 |
- if(nrow(useFeaturesSubset) > 0) |
|
212 |
- { |
|
213 |
- uniqueAssays <- unique(useFeatures[, "assay"]) |
|
214 |
- for(filterAssay in uniqueAssays) |
|
215 |
- { |
|
216 |
- dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay & |
|
217 |
- !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2] |
|
218 |
- dataTable <- dataTable[, !dropFeatures] |
|
219 |
- } |
|
220 |
- } |
|
221 |
- dataTable <- dataTable[, -match("primary", colnames(dataTable))] |
|
222 |
- } else { # Must have only been clinical data. |
|
223 |
- dataTable <- MultiAssayExperiment::colData(measurements) |
|
224 |
- S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable)) |
|
225 |
- } |
|
209 |
+ # Do other filtering and preparation in DataFrame function. The "primary" sample identifier column is removed first (sample IDs are already the row names) so that it is not treated as a predictor. |

210 |
+ prepareData(dataTable[, colnames(dataTable) != "primary", drop = FALSE], outcomeColumns, clinicalPredictors = NULL, ...) |
|
211 |
+}) |
|
212 |
+ |
|
213 |
+#' @rdname prepareData |
|
214 |
+#' @export |
|
215 |
+setMethod("prepareData", "list", |

216 |
+ function(measurements, outcome = NULL, clinicalPredictors = NULL, ...) |

217 |
+{ |

218 |
+ # Check the list is named. The names are used as the assay identifiers. |

219 |
+ if(is.null(names(measurements))) |

220 |
+ stop("'measurements' must be a named list.") |

221 |
+ |

222 |
+ # If clinical table is present, features to use must be user-specified. |

223 |
+ if("clinical" %in% names(measurements) && is.null(clinicalPredictors)) |

224 |
+ stop("Because one provided table in the list is named \"clinical\", 'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.") |

225 |
+ |

226 |
+ # Check data type is valid. Comparing class names fails for a matrix because its class is c("matrix", "array") in R 4.0 or newer. |

227 |
+ if(!all(vapply(measurements, function(assayTable) is.data.frame(assayTable) || is.matrix(assayTable) || is(assayTable, "DataFrame"), logical(1)))) |

228 |
+ stop("assays in the list must be of type data.frame, DataFrame or matrix") |

229 |
+ |

230 |
+ # Check same number of samples for all datasets |

231 |
+ if(length(unique(vapply(measurements, nrow, integer(1)))) != 1) |

232 |
+ stop("All datasets must have the same samples.") |

233 |
+ |

234 |
+ if("clinical" %in% names(measurements)) |

235 |
+ measurements[["clinical"]] <- measurements[["clinical"]][, clinicalPredictors, drop = FALSE] |

236 |
+ |

237 |
+ featureNames <- lapply(measurements, colnames) # NOTE(review): assumes every table has column names - confirm. |

238 |
+ allMetadata <- S4Vectors::DataFrame(assay = rep(names(measurements), lengths(featureNames)), |

239 |
+ feature = unlist(featureNames, use.names = FALSE)) |

240 |
+ allDataFrame <- do.call(cbind, unname(lapply(measurements, S4Vectors::DataFrame, check.names = FALSE))) |

241 |
+ # Different assays e.g. mRNA, protein could have same feature name e.g. BRAF. |

242 |
+ colnames(allDataFrame) <- paste(allMetadata[, "assay"], allMetadata[, "feature"], sep = '_') |

243 |
+ # Record the assay of origin and the original feature name of every column as column metadata. |

244 |
+ S4Vectors::mcols(allDataFrame) <- allMetadata |

226 | 245 |

227 | 246 |
# Do other filtering and preparation in DataFrame function. |
228 |
-prepareData(dataTable, outcomeColumns, useFeatures = "all", ...) |

247 |
+ prepareData(allDataFrame, outcome, clinicalPredictors = NULL, ...) |

229 | 248 |
}) |
230 | 249 |
\ No newline at end of file |
... | ... |
@@ -6,8 +6,7 @@ |
6 | 6 |
\alias{crossValidate,DataFrame-method} |
7 | 7 |
\alias{crossValidate,MultiAssayExperiment-method,} |
8 | 8 |
\alias{crossValidate,data.frame-method} |
9 |
-\alias{crossValidate,MultiAssayExperiment-method} |
|
10 |
-\alias{crossValidate,list-method} |
|
9 |
+\alias{crossValidate,MultiAssayExperimentOrList-method} |
|
11 | 10 |
\alias{train.matrix} |
12 | 11 |
\alias{train.data.frame} |
13 | 12 |
\alias{train.DataFrame} |
... | ... |
@@ -35,9 +34,10 @@ crossValidate(measurements, outcome, ...) |
35 | 34 |
extraParams = NULL |
36 | 35 |
) |
37 | 36 |
|
38 |
-\S4method{crossValidate}{MultiAssayExperiment}( |
|
37 |
+\S4method{crossValidate}{MultiAssayExperimentOrList}( |
|
39 | 38 |
measurements, |
40 | 39 |
outcome, |
40 |
+ clinicalPredictors = NULL, |
|
41 | 41 |
nFeatures = 20, |
42 | 42 |
selectionMethod = "auto", |
43 | 43 |
selectionOptimisation = "Resubstitution", |
... | ... |
@@ -86,23 +86,6 @@ crossValidate(measurements, outcome, ...) |
86 | 86 |
extraParams = NULL |
87 | 87 |
) |
88 | 88 |
|
89 |
-\S4method{crossValidate}{list}( |
|
90 |
- measurements, |
|
91 |
- outcome, |
|
92 |
- nFeatures = 20, |
|
93 |
- selectionMethod = "auto", |
|
94 |
- selectionOptimisation = "Resubstitution", |
|
95 |
- performanceType = "auto", |
|
96 |
- classifier = "auto", |
|
97 |
- multiViewMethod = "none", |
|
98 |
- assayCombinations = "all", |
|
99 |
- nFolds = 5, |
|
100 |
- nRepeats = 20, |
|
101 |
- nCores = 1, |
|
102 |
- characteristicsLabel = NULL, |
|
103 |
- extraParams = NULL |
|
104 |
-) |
|
105 |
- |
|
106 | 89 |
\method{train}{matrix}(x, outcomeTrain, ...) |
107 | 90 |
|
108 | 91 |
\method{train}{data.frame}(x, outcomeTrain, ...) |
... | ... |
@@ -122,7 +105,7 @@ crossValidate(measurements, outcome, ...) |
122 | 105 |
|
123 | 106 |
\method{train}{list}(x, outcomeTrain, ...) |
124 | 107 |
|
125 |
-\method{train}{MultiAssayExperiment}(x, outcome, ...) |
|
108 |
+\method{train}{MultiAssayExperiment}(x, outcome, clinicalPredictors = NULL, ...) |
|
126 | 109 |
|
127 | 110 |
\method{predict}{trainedByClassifyR}(object, newData, ...) |
128 | 111 |
} |
... | ... |
@@ -146,7 +129,8 @@ Set to NULL or "all" if all features should be used.} |
146 | 129 |
|
147 | 130 |
\item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
148 | 131 |
and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking |
149 |
-and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.} |
|
132 |
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no |
|
133 |
+independent feature selection will be performed (but implicit selection might still happen with the classifier).} |
|
150 | 134 |
|
151 | 135 |
\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.} |
152 | 136 |
|
... | ... |
@@ -172,7 +156,11 @@ with each element being a vector of assays to combine. Special value \code{"all" |
172 | 156 |
\item{extraParams}{A list of parameters that will be used to overwrite default settings of transformation, selection, or model-building functions or |
173 | 157 |
parameters which will be passed into the data cleaning function. The names of the list must be one of \code{"prepare"}, |
174 | 158 |
\code{"select"}, \code{"train"}, \code{"predict"}. To remove one of the defaults (see the article titled Parameter Tuning Presets for crossValidate and Their Customisation on |
175 |
-the website), specify the list element to be \code{NULL}.} |
|
159 |
+the website), specify the list element to be \code{NULL}. For the valid element names in the \code{"prepare"} list, see \code{?prepareData}.} |
|
160 |
+ |
|
161 |
+\item{clinicalPredictors}{If \code{measurements} is a \code{MultiAssayExperiment}, |
|
162 |
+a character vector of features to use in modelling. This allows avoidance of things like sample IDs, |
|
163 |
+sample acquisition dates, etc. which are not relevant for outcome prediction.} |
|
176 | 164 |
|
177 | 165 |
\item{x}{Same as \code{measurements} but only training samples.} |
178 | 166 |
|
... | ... |
@@ -6,6 +6,7 @@ |
6 | 6 |
\alias{prepareData,DataFrame-method} |
7 | 7 |
\alias{prepareData,MultiAssayExperiment-method} |
8 | 8 |
\alias{prepareData,data.frame-method} |
9 |
+\alias{prepareData,list-method} |
|
9 | 10 |
\title{Convert Different Data Classes into DataFrame and Filter Features} |
10 | 11 |
\usage{ |
11 | 12 |
\S4method{prepareData}{matrix}(measurements, outcome, ...) |
... | ... |
@@ -15,12 +16,19 @@ |
15 | 16 |
\S4method{prepareData}{DataFrame}( |
16 | 17 |
measurements, |
17 | 18 |
outcome, |
18 |
- useFeatures = "all", |
|
19 |
+ clinicalPredictors = NULL, |
|
19 | 20 |
maxMissingProp = 0, |
20 | 21 |
topNvariance = NULL |
21 | 22 |
) |
22 | 23 |
|
23 |
-\S4method{prepareData}{MultiAssayExperiment}(measurements, outcomeColumns = NULL, useFeatures = "all", ...) |
|
24 |
+\S4method{prepareData}{MultiAssayExperiment}( |
|
25 |
+ measurements, |
|
26 |
+ outcomeColumns = NULL, |
|
27 |
+ clinicalPredictors = NULL, |
|
28 |
+ ... |
|
29 |
+) |
|
30 |
+ |
|
31 |
+\S4method{prepareData}{list}(measurements, outcome = NULL, clinicalPredictors = NULL, ...) |
|
24 | 32 |
} |
25 | 33 |
\arguments{ |
26 | 34 |
\item{measurements}{Either a \code{\link{matrix}}, \code{\link{DataFrame}} |
... | ... |
@@ -37,17 +45,15 @@ a character string, or vector of such strings, containing column name(s) of colu |
37 | 45 |
containing either classes or time and event information about survival. If column names |
38 | 46 |
of survival information, time must be in first column and event status in the second.} |
39 | 47 |
|
40 |
-\item{useFeatures}{If \code{measurements} is a \code{MultiAssayExperiment}, |
|
41 |
-a two-column table of features to use. The first column must have assay names |
|
42 |
-and the second column must have feature names found for that assay. \code{"clinical"} is |
|
43 |
-also a valid assay name and refers to the clinical data table. \code{"all"} is a special |
|
44 |
-keyword that means all features (passing any other filters) of that assay will be used |
|
45 |
-for modelling. Otherwise, a character vector of feature names to use suffices.} |
|
48 |
+\item{clinicalPredictors}{If \code{measurements} is a \code{MultiAssayExperiment}, |
|
49 |
+a character vector of features to use in modelling. This allows avoidance of things like sample IDs, |
|
50 |
+sample acquisition dates, etc. which are not relevant for outcome prediction.} |
|
46 | 51 |
|
47 | 52 |
\item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum |
48 | 53 |
tolerated proportion of missingness for a feature to be retained for modelling.} |
49 | 54 |
|
50 |
-\item{topNvariance}{Default: NULL. An integer number of most variable features to subset to.} |
|
55 |
+\item{topNvariance}{Default: NULL. An integer number of most variable features per assay to subset to. |
|
56 |
+Assays with fewer features won't be reduced in size.} |
|
51 | 57 |
|
52 | 58 |
\item{outcomeColumns}{If \code{measurements} is a \code{MultiAssayExperiment}, the |
53 | 59 |
names of the column (class) or columns (survival) in the table extracted by \code{colData(data)} |