@@ -3,8 +3,8 @@ Type: Package
Title: A framework for cross-validated classification problems, with
applications to differential variability and differential
distribution testing
-Version: 3.1.14
-Date: 2022-08-25
+Version: 3.1.15
+Date: 2022-08-31
Author: Dario Strbenac, Ellis Patrick, John Ormerod, Graham Mann, Jean Yang
Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au>
VignetteBuilder: knitr
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

+S3method(predict,trainedByClassifyR)
export(ClassifyResult)
export(CrossValParams)
export(FeatureSetCollection)
@@ -21,7 +22,6 @@ export(edgesToHubNetworks)
export(featureSetSummary)
export(generateCrossValParams)
export(generateModellingParams)
-export(getLocationsAndScales)
export(interactorDifferences)
export(models)
export(performance)
@@ -36,6 +36,11 @@ export(sampleNames)
export(samplesMetricMap)
export(selectionPlot)
export(totalPredictions)
+export(train.DataFrame)
+export(train.MultiAssayExperiment)
+export(train.data.frame)
+export(train.list)
+export(train.matrix)
export(tunedParameters)
exportClasses(ClassifyResult)
exportClasses(CrossValParams)
@@ -62,14 +67,12 @@ exportMethods(chosenFeatureNames)
exportMethods(crossValidate)
exportMethods(distribution)
exportMethods(featureSetSummary)
-exportMethods(getLocationsAndScales)
exportMethods(interactorDifferences)
exportMethods(length)
exportMethods(models)
exportMethods(performance)
exportMethods(performancePlot)
exportMethods(plotFeatureClasses)
-exportMethods(predict)
exportMethods(predictions)
exportMethods(prepareData)
exportMethods(rankingPlot)
@@ -82,7 +85,9 @@ exportMethods(show)
exportMethods(totalPredictions)
exportMethods(tunedParameters)
import(BiocParallel)
+import(MultiAssayExperiment)
import(grid)
+import(methods)
import(utils)
importFrom(S4Vectors,as.data.frame)
importFrom(S4Vectors,do.call)
@@ -1,48 +1,7 @@
##### Table of contents #####
-# Set old classes
# Create union of classes
# Set generic accessors

-################################################################################
-#
-# Set old classes
-#
-################################################################################
-
-
-
-# Delete when sparsediscrim is restored to CRAN.
-# Trained dlda Object
-dlda <- function(x, ...) {
- UseMethod("dlda")
-}
-setOldClass("dlda")
-
-# Trained pamr Object
-setOldClass("pamrtrained")
-
-
-# Trained svm Object
-setOldClass("svm")
-
-# Trained multnet Object
-setOldClass("multnet")
-
-# Trained coxnet Object
-setOldClass("coxnet")
-
-# Trained randomForest Object
-setOldClass("randomForest")
-
-# Trained coxph Object
-setOldClass("coxph")
-
-# Survival Data Container
-setOldClass("Surv")
-
-# Survival Forest Data Container
-setOldClass("rfsrc")
-
################################################################################
#
# Create union of classes
@@ -63,6 +22,7 @@ setClassUnion("numericOrNULL", c("numeric", "NULL"))
setClassUnion("characterOrDataFrame", c("character", "DataFrame"))

# Union of a Surv class and a factor for flexibility with sample outcome
+setOldClass("Surv")
setClassUnion("factorOrSurv", c("factor", "Surv"))

# Union of a List and NULL
@@ -71,6 +31,9 @@ setClassUnion("listOrNULL", c("list", "NULL"))
# Union of NULL and DataFrame Class
setClassUnion("DataFrameOrNULL", c("DataFrame", "NULL"))

+# Tabular data
+setClassUnion("tabular", c("data.frame", "DataFrame", "matrix"))
+setClassUnion("tabularOrList", c("tabular", "list"))

################################################################################
#
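The two class unions added above are what later code dispatches on; for example, the new predict method further below tests is(newData, "tabular") instead of enumerating the three container types. A minimal sketch of the idea outside the package, assuming only that S4Vectors is attached for DataFrame:

    library(S4Vectors)
    setClassUnion("tabular", c("data.frame", "DataFrame", "matrix"))
    setClassUnion("tabularOrList", c("tabular", "list"))
    is(data.frame(x = 1:3), "tabular")       # TRUE
    is(matrix(0, 2, 2), "tabularOrList")     # TRUE, via the nested union
    is(list(a = 1), "tabularOrList")         # TRUE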
@@ -93,4 +93,6 @@
"PCA", "Reduce each assay into a lower dimensional representation and concatenate the principal components to the clinical data before modelling."
),
ncol = 2, byrow = TRUE, dimnames = list(NULL, c("multiViewMethod Keyword", "Description"))
-) |> as.data.frame()
\ No newline at end of file
+) |> as.data.frame()
+
+.ClassifyRenvir[["prepareDataFormals"]] <- c("useFeatures", "maxMissingProp", "topNvariance")
\ No newline at end of file
@@ -1,19 +1,21 @@
#' Cross-validation to evaluate classification performance.
#'
#' This function has been designed to facilitate the comparison of classification
-#' methods using cross-validation. A selection of typical comparisons are implemented.
+#' methods using cross-validation. A selection of typical comparisons is implemented. The \code{train} function
+#' is a convenience method for training on one data set and predicting on an independent validation data set.
#'
#' @param measurements Either a \code{\link{DataFrame}}, \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link{MultiAssayExperiment}}
-#' or a list of these objects containing the training data. For a
-#' \code{matrix} and \code{data.frame}, the rows are samples and the columns are features. For a \code{data.frame} or \code{\link{MultiAssayExperiment}} assay
-#' the rows are features and the columns are samples, as is typical in Bioconductor.
+#' or a list of these objects containing the data.
+#' @param x Same as \code{measurements} but containing only the training samples.
#' @param outcome A vector of class labels of class \code{\link{factor}} of the
#' same length as the number of samples in \code{measurements} or a character vector of length 1 containing the
-#' column name in \code{measurements} if it is a \code{\link{DataFrame}} or the
-#' column name in \code{colData(measurements)} if \code{measurements} is a \code{\link{MultiAssayExperiment}}. If a column name, that column will be
-#' removed before training. Or a \code{\link{Surv}} object or a character vector of length 2 or 3 specifying the time and event columns in
-#' \code{measurements} for survival outcome.
-#' @param ... Parameters passed into \code{\link{prepareData}}.
+#' column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of
+#' length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome.
+#' @param outcomeColumns If \code{measurements} is a \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.
+#' @param outcomeTrain For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or
+#' a character vector of column name(s) identifying the column(s) of \code{x}
+#' that contain either the classes or the time and event information for survival.
+#' @param ... Parameters passed into \code{\link{prepareData}} which control subsetting and filtering of the input data.
#' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons
#' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays,
#' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over.
@@ -25,7 +27,7 @@
#' and performing multiview classification, the respective classification methods will be used on each assay.
#' @param multiViewMethod A character vector specifying the multiview method or data integration approach to use.
#' @param assayCombinations A character vector or list of character vectors proposing the assays or, in the case of a list, combination of assays to use
-#' with each element being a vector of assays to combine.
+#' with each element being a vector of assays to combine. The special value \code{"all"} means all possible subsets of the assays.
#' @param nFolds A numeric specifying the number of folds to use for cross-validation.
#' @param nRepeats A numeric specifying the number of repeats or permutations to use for cross-validation.
#' @param nCores A numeric specifying the number of cores used if the user wants to use parallelisation.
@@ -34,13 +36,9 @@
#' @param newData The data to use to make predictions with.
#'
#' @details
-#' \code{classifier} can be any of the following implemented approaches - randomForest, GLM, elasticNetGLM, logistic, SVM, DLDA, kNN, naiveBayes, mixturesNormals.
-#'
-#' \code{selectionMethod} can be any of the following implemented approaches - none, t-test, limma, edgeR, NSC, Bartlett, Levene, DMD, likelihoodRatio, KS or KL.
-#'
-#' \code{multiViewMethod} can take a few different values. Using \code{merge} will merge or bind the assays after feature selection.
-#' Using \code{prevalidation} will build prevalidated vectors on all the assays except the clinical data. There must be a assay called clinical.
-#' Using \code{PCA} will perform Principal Components Analysis on each assay and then merge the top few components with the clinical data. There must be a assay called clinical.
+#' \code{classifier} can be a keyword for any of the implemented approaches as shown by \code{available()}.
+#' \code{selectionMethod} can be a keyword for any of the implemented approaches as shown by \code{available("selectionMethod")}.
+#' \code{multiViewMethod} can be a keyword for any of the implemented approaches as shown by \code{available("multiViewMethod")}.
#'
#' @return An object of class \code{\link{ClassifyResult}}
#' @export
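A brief usage sketch of the interface documented above (illustrative only; the data objects are invented, and the keywords are whichever ones the package's available() helper reports):

    # Keywords referred to in the details section.
    available()                       # classifier keywords
    available("selectionMethod")      # feature selection keywords
    available("multiViewMethod")      # multiview integration keywords

    # geneMatrix: numeric matrix with samples as rows; classes: factor of sample classes.
    result <- crossValidate(geneMatrix, classes,
                            classifier = "randomForest", selectionMethod = "t-test",
                            nFeatures = 20, nFolds = 5, nRepeats = 20)

    # With a named list of assays, specific combinations can be requested instead of
    # the default assayCombinations = "all".
    result <- crossValidate(list(RNA = rnaMatrix, clinical = clinicalTable), classes,
                            assayCombinations = list(c("RNA", "clinical")),
                            multiViewMethod = "merge")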
@@ -91,7 +89,7 @@ setMethod("crossValidate", "DataFrame",
selectionOptimisation = "Resubstitution",
classifier = "randomForest",
multiViewMethod = "none",
- assayCombinations = NULL,
+ assayCombinations = "all",
nFolds = 5,
nRepeats = 20,
nCores = 1,
@@ -99,12 +97,9 @@ setMethod("crossValidate", "DataFrame",

{
# Check that data is in the right format, if not already done for MultiAssayExperiment input.
- #if(is.null(mcols(measurements)$assay))
- #{
- splitAssay <- prepareData(measurements, outcome, ...)
- measurements <- splitAssay[["measurements"]]
- outcome <- splitAssay[["outcome"]]
- #}
+ measurementsAndOutcome <- prepareData(measurements, outcome, ...)
+ measurements <- measurementsAndOutcome[["measurements"]]
+ outcome <- measurementsAndOutcome[["outcome"]]

# Which data-types or data-views are present?
assayIDs <- unique(mcols(measurements)[, "assay"])
@@ -121,7 +116,7 @@ setMethod("crossValidate", "DataFrame",
{
options(warn = 1)
warning("Elastic Net GLM requires two or more features as input but there is only one.
- Using an ordinary GLM instead.")
+Using an ordinary GLM instead.")
classifier <- "GLM"
}

@@ -153,9 +148,9 @@ setMethod("crossValidate", "DataFrame",
resClassifier <-
sapply(assayIDs, function(assayIndex) {
# Loop over assays
- sapply(classifier[[assayIndex]], function(classifierIndex) {
+ sapply(classifier[[assayIndex]], function(classifierForAssay) {
# Loop over classifiers
- sapply(selectionMethod[[assayIndex]], function(selectionIndex) {
+ sapply(selectionMethod[[assayIndex]], function(selectionForAssay) {
# Loop over classifiers
set.seed(seed)
measurementsUse <- measurements
@@ -164,9 +159,9 @@ setMethod("crossValidate", "DataFrame",
measurements = measurementsUse, outcome = outcome,
assayIDs = assayIndex,
nFeatures = nFeatures[assayIndex],
- selectionMethod = selectionIndex,
+ selectionMethod = selectionForAssay,
selectionOptimisation = selectionOptimisation,
- classifier = classifierIndex,
+ classifier = classifierForAssay,
multiViewMethod = multiViewMethod,
nFolds = nFolds,
nRepeats = nRepeats,
@@ -199,7 +194,7 @@ setMethod("crossValidate", "DataFrame",
# This allows someone to answer which combinations of the assays might be most useful.


- if(is.null(assayCombinations)) assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
+ if(!is.list(assayCombinations) && assayCombinations == "all") assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))

result <- sapply(assayCombinations, function(assayIndex){
CV(measurements = measurements[, mcols(measurements)[["assay"]] %in% assayIndex],
@@ -226,7 +221,7 @@ setMethod("crossValidate", "DataFrame",
# This allows someone to answer which combinations of the assays might be most useful.


- if(is.null(assayCombinations))
+ if(!is.list(assayCombinations) && assayCombinations == "all")
{
assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
assayCombinations <- assayCombinations[sapply(assayCombinations, function(combination) "clinical" %in% combination, simplify = TRUE)]
@@ -260,7 +255,7 @@ setMethod("crossValidate", "DataFrame",
# This allows someone to answer which combinations of the assays might be most useful.


- if(is.null(assayCombinations)){
+ if(!is.list(assayCombinations) && assayCombinations == "all"){
assayCombinations <- do.call("c", sapply(seq_along(assayIDs),function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
assayCombinations <- assayCombinations[sapply(assayCombinations, function(combination) "clinical" %in% combination, simplify = TRUE)]
if(length(assayCombinations) == 0) stop("No assayCombinations with \"clinical\" data")
@@ -293,32 +288,22 @@ setMethod("crossValidate", "DataFrame",
# One or more omics data sets, possibly with clinical data.
setMethod("crossValidate", "MultiAssayExperiment",
function(measurements,
- outcome,
+ outcomeColumns,
nFeatures = 20,
selectionMethod = "t-test",
selectionOptimisation = "Resubstitution",
classifier = "randomForest",
multiViewMethod = "none",
- assayCombinations = NULL,
+ assayCombinations = "all",
nFolds = 5,
nRepeats = 20,
nCores = 1,
- characteristicsLabel = NULL)
+ characteristicsLabel = NULL, ...)
{
- targets <- c(names(measurements), "clinical")
- omicsTargets <- setdiff(targets, "clinical")
- if(length(omicsTargets) > 0)
- {
- if(any(anyReplicated(measurements[, , omicsTargets])))
- stop("Data set contains replicates. Please provide remove or average replicate observations and try again.")
- }
-
- tablesAndoutcome <- .MAEtoWideTable(measurements, targets, outcome, restrict = NULL)
- measurements <- tablesAndoutcome[["dataTable"]]
- outcome <- tablesAndoutcome[["outcome"]]
+ measurementsAndOutcome <- prepareData(measurements, outcomeColumns, ...)

- crossValidate(measurements = measurements,
- outcome = outcome,
+ crossValidate(measurements = measurementsAndOutcome[["measurements"]],
+ outcome = measurementsAndOutcome[["outcome"]],
nFeatures = nFeatures,
selectionMethod = selectionMethod,
selectionOptimisation = selectionOptimisation,
@@ -341,13 +326,13 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
selectionOptimisation = "Resubstitution",
classifier = "randomForest",
multiViewMethod = "none",
- assayCombinations = NULL,
+ assayCombinations = "all",
nFolds = 5,
nRepeats = 20,
nCores = 1,
- characteristicsLabel = NULL)
+ characteristicsLabel = NULL, ...)
{
- measurements <- S4Vectors::DataFrame(measurements)
+ measurements <- S4Vectors::DataFrame(measurements, check.names = FALSE)
crossValidate(measurements = measurements,
outcome = outcome,
nFeatures = nFeatures,
@@ -359,7 +344,7 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
nFolds = nFolds,
nRepeats = nRepeats,
nCores = nCores,
- characteristicsLabel = characteristicsLabel)
+ characteristicsLabel = characteristicsLabel, ...) # ... for prepareData.
})

#' @rdname crossValidate
@@ -372,11 +357,11 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
selectionOptimisation = "Resubstitution",
classifier = "randomForest",
multiViewMethod = "none",
- assayCombinations = NULL,
+ assayCombinations = "all",
nFolds = 5,
nRepeats = 20,
nCores = 1,
- characteristicsLabel = NULL)
+ characteristicsLabel = NULL, ...)
{
measurements <- S4Vectors::DataFrame(measurements, check.names = FALSE)
crossValidate(measurements = measurements,
@@ -390,12 +375,11 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
nFolds = nFolds,
nRepeats = nRepeats,
nCores = nCores,
- characteristicsLabel = characteristicsLabel)
-
+ characteristicsLabel = characteristicsLabel, ...) # ... for prepareData.
})

-
-###!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# This expects that each table is about the same set of samples and thus
+# has the same number of rows as every other table.
#' @rdname crossValidate
#' @export
setMethod("crossValidate", "list",
@@ -406,39 +390,33 @@ setMethod("crossValidate", "list",
selectionOptimisation = "Resubstitution",
classifier = "randomForest",
multiViewMethod = "none",
- assayCombinations = NULL,
+ assayCombinations = "all",
nFolds = 5,
nRepeats = 20,
nCores = 1,
- characteristicsLabel = NULL)
+ characteristicsLabel = NULL, ...)
{
- # Check if the list only contains one data type
- if (measurements |> sapply(class) |> unique() |> length() != 1) {
- stop("All assays must be of the same type (e.g. data.frame, matrix)")
- }
-
# Check data type is valid
- if (!(measurements[[1]] |> class() %in% c("data.frame", "DataFrame", "matrix"))) {
+ if (!(all(sapply(measurements, class) %in% c("data.frame", "DataFrame", "matrix")))) {
stop("assays must be of type data.frame, DataFrame or matrix")
}

# Check the list is named
- if (names(measurements) |> is.null()) {
- stop("Measurements must be a named list")
+ if (is.null(names(measurements))) {
+ stop("Measurements must be a named list.")
}

# Check same number of samples for all datasets
- if ((measurements |> sapply(dim))[1,] |> unique() |> length() != 1) {
- stop("All datasets must have the same number of samples")
+ if (!length(unique(sapply(measurements, nrow))) == 1) {
+ stop("All datasets must have the same samples.")
}

# Check the number of outcome is the same
- if (((measurements[[1]] |> dim())[1] != length(outcome)) & length(outcome)>2) {
- stop("outcome must have same number of samples as measurements")
+ if (!all(sapply(measurements, nrow) == length(outcome)) && !is.character(outcome)) {
+ stop("outcome must have same number of samples as measurements.")
}

- df_list <- sapply(measurements, t, simplify = FALSE)
- df_list <- sapply(measurements , S4Vectors::DataFrame)
+ df_list <- sapply(measurements, S4Vectors::DataFrame, check.names = FALSE)

df_list <- mapply(function(meas, nam){
mcols(meas)$assay <- nam
@@ -448,7 +426,6 @@ setMethod("crossValidate", "list",


combined_df <- do.call(cbind, df_list)
- colnames(combined_df) <- mcols(combined_df)$feature

crossValidate(measurements = combined_df,
outcome = outcome,
@@ -461,7 +438,7 @@ setMethod("crossValidate", "list",
nFolds = nFolds,
nRepeats = nRepeats,
nCores = nCores,
- characteristicsLabel = characteristicsLabel)
+ characteristicsLabel = characteristicsLabel, ...)
})


@@ -633,13 +610,10 @@ generateModellingParams <- function(assayIDs,

performanceType <- ifelse(classifier %in% c("CoxPH", "CoxNet", "randomSurvivalForest"), "C-index", "Balanced Accuracy")

-
- classifiers <- c("randomForest", "randomSurvivalForest", "GLM", "elasticNetGLM", "SVM", "DLDA",
- "naiveBayes", "mixturesNormals", "kNN",
- "CoxPH", "CoxNet")
# Check classifier
- if(!classifier %in% classifiers)
- stop(paste("Classifier must exactly match of these (be careful of case):", paste(classifiers, collapse = ", ")))
+ knownClassifiers <- .ClassifyRenvir[["classifyKeywords"]][, "classifier Keyword"]
+ if(!classifier %in% knownClassifiers)
+ stop(paste("Classifier must exactly match one of these (be careful of case):", paste(knownClassifiers, collapse = ", ")))

classifierParams <- .classifierKeywordToParams(classifier)

@@ -692,7 +666,7 @@ generateMultiviewParams <- function(assayIDs,
assayTrain <- sapply(assayIDs, function(assayID) if(assayID == 1) measurements else measurements[, mcols(measurements)[["assay"]] %in% assayID], simplify = FALSE)

# Generate params for each assay. This could be extended to have different selectionMethods for each type
- paramsassays <- mapply(generateModellingParams,
+ paramsAssays <- mapply(generateModellingParams,
nFeatures = nFeatures[assayIDs],
selectionMethod = selectionMethod[assayIDs],
assayIDs = assayIDs,
@@ -714,7 +688,7 @@ generateMultiviewParams <- function(assayIDs,

# Update selectParams to use
params@selectParams <- SelectParams("selectMulti",
- params = paramsassays,
+ params = paramsAssays,
characteristics = S4Vectors::DataFrame(characteristic = "Selection Name", value = "merge"),
tuneParams = list(nFeatures = nFeatures[[1]],
performanceType = "Balanced Error",
@@ -729,7 +703,7 @@ generateMultiviewParams <- function(assayIDs,
assayTrain <- sapply(assayIDs, function(assayID) measurements[, mcols(measurements)[["assay"]] %in% assayID], simplify = FALSE)

# Generate params for each assay. This could be extended to have different selectionMethods for each type
- paramsassays <- mapply(generateModellingParams,
+ paramsAssays <- mapply(generateModellingParams,
nFeatures = nFeatures[assayIDs],
selectionMethod = selectionMethod[assayIDs],
assayIDs = assayIDs,
@@ -744,8 +718,8 @@ generateMultiviewParams <- function(assayIDs,
params <- ModellingParams(
balancing = "none",
selectParams = NULL,
- trainParams = TrainParams(prevalTrainInterface, params = paramsassays, characteristics = paramsassays$clinical@trainParams@characteristics),
- predictParams = PredictParams(prevalPredictInterface, characteristics = paramsassays$clinical@predictParams@characteristics)
+ trainParams = TrainParams(prevalTrainInterface, params = paramsAssays, characteristics = paramsAssays$clinical@trainParams@characteristics),
+ predictParams = PredictParams(prevalPredictInterface, characteristics = paramsAssays$clinical@predictParams@characteristics)
)

return(params)
@@ -757,7 +731,7 @@ generateMultiviewParams <- function(assayIDs,
assayTrain <- sapply(assayIDs, function(assayID) measurements[, mcols(measurements)[["assay"]] %in% assayID], simplify = FALSE)

# Generate params for each assay. This could be extended to have different selectionMethods for each type
- paramsassays <- mapply(generateModellingParams,
+ paramsAssays <- mapply(generateModellingParams,
nFeatures = nFeatures[assayIDs],
selectionMethod = selectionMethod[assayIDs],
assayIDs = assayIDs,
@@ -772,8 +746,8 @@ generateMultiviewParams <- function(assayIDs,
params <- ModellingParams(
balancing = "none",
selectParams = NULL,
- trainParams = TrainParams(prevalTrainInterface, params = paramsassays, characteristics = paramsassays$clinical@trainParams@characteristics),
- predictParams = PredictParams(prevalPredictInterface, characteristics = paramsassays$clinical@predictParams@characteristics)
+ trainParams = TrainParams(prevalTrainInterface, params = paramsAssays, characteristics = paramsAssays$clinical@trainParams@characteristics),
+ predictParams = PredictParams(prevalPredictInterface, characteristics = paramsAssays$clinical@predictParams@characteristics)
)

return(params)
@@ -783,7 +757,7 @@ generateMultiviewParams <- function(assayIDs,
if(multiViewMethod == "PCA"){

# Split measurements up by assay.
- assayTrain <- sapply(assayIDs, function(assayID) measurements[,mcols(measurements)[["assay"]] %in% assayID], simplify = FALSE)
+ assayTrain <- sapply(assayIDs, function(assayID) measurements[, mcols(measurements)[["assay"]] %in% assayID], simplify = FALSE)

# Generate params for each assay. This could be extended to have different selectionMethods for each type
paramsClinical <- list(clinical = generateModellingParams(
@@ -808,16 +782,15 @@ generateMultiviewParams <- function(assayIDs,

}

-
-CV <- function(measurements,
- outcome,
+# measurements, outcome are mutually exclusive with x, outcomeTrain, measurementsTest, outcomeTest.
+CV <- function(measurements = NULL,
+ outcome = NULL, x = NULL, outcomeTrain = NULL, measurementsTest = NULL, outcomeTest = NULL,
assayIDs,
nFeatures = NULL,
selectionMethod = "t-test",
selectionOptimisation = "Resubstitution",
classifier = "elasticNetGLM",
multiViewMethod = "none",
- assayCombinations = NULL,
nFolds = 5,
nRepeats = 100,
nCores = 1,
@@ -825,8 +798,10 @@ CV <- function(measurements,

{
# Check that data is in the right format
- checkData(measurements, outcome)
-
+ if(!is.null(measurements))
+ checkData(measurements, outcome)
+ else
+ checkData(x, x)
# Check that other variables are in the right format and fix
nFeatures <- cleanNFeatures(nFeatures = nFeatures,
measurements = measurements)
@@ -838,49 +813,239 @@ CV <- function(measurements,
# Which data-types or data-views are present?
if(is.null(characteristicsLabel)) characteristicsLabel <- "none"

- # Setup cross-validation parameters including
+ # Setup cross-validation parameters. Could be needed for independent train/test if parameter tuning
+ # is specified to be done by nested cross-validation.
crossValParams <- generateCrossValParams(nRepeats = nRepeats,
nFolds = nFolds,
nCores = nCores,
- selectionOptimisation = selectionOptimisation
- )
+ selectionOptimisation = selectionOptimisation)
+

# Turn text into TrainParams and TestParams objects
modellingParams <- generateModellingParams(assayIDs = assayIDs,
- measurements = measurements,
+ measurements = if(!is.null(measurements)) measurements else x,
nFeatures = nFeatures,
selectionMethod = selectionMethod,
selectionOptimisation = selectionOptimisation,
classifier = classifier,
- multiViewMethod = multiViewMethod
- )
+ multiViewMethod = multiViewMethod)
+
if(length(assayIDs) > 1 || length(assayIDs) == 1 && assayIDs != 1) assayText <- assayIDs else assayText <- NULL
characteristics <- S4Vectors::DataFrame(characteristic = c(if(!is.null(assayText)) "Assay Name" else NULL, "Classifier Name", "Selection Name", "multiViewMethod", "characteristicsLabel"), value = c(if(!is.null(assayText)) paste(assayText, collapse = ", ") else NULL, paste(classifier, collapse = ", "), paste(selectionMethod, collapse = ", "), multiViewMethod, characteristicsLabel))

- classifyResults <- runTests(measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics)
-
- fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
+ if(!is.null(measurements))
+ { # Cross-validation.
+ classifyResults <- runTests(measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics)
+ fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
+ } else { # Independent training and testing.
+ classifyResults <- runTest(x, outcomeTrain, measurementsTest, outcomeTest, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics)
+
+ fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
+ }

classifyResults@finalModel <- list(fullResult$models)
classifyResults
+}

+simplifyResults <- function(results, values = c("assay", "classifier", "selectionMethod", "multiViewMethod")){
+ ch <- sapply(results, function(x) x@characteristics[x@characteristics$characteristic %in% values, "value"], simplify = TRUE)
+ ch <- data.frame(t(ch))
+ results[!duplicated(ch)]
}

+#' @rdname crossValidate
+#' @export
+train.matrix <- function(x, outcomeTrain, ...)
+ {
+ x <- DataFrame(x, check.names = FALSE)
+ train(x, outcomeTrain, ...)
+ }

+#' @rdname crossValidate
+#' @export
+train.data.frame <- function(x, outcomeTrain, ...)
+ {
+ x <- DataFrame(x, check.names = FALSE)
+ train(x, outcomeTrain, ...)
+ }

+#' @rdname crossValidate
+#' @param assayIDs A character vector for assays to train with. Special value \code{"all"}
+#' uses all assays in the input object.
+#' @export
+train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", multiViewMethod = "none", assayIDs = "all", ...) # ... for prepareData.
+ {
+ prepArgs <- list(x, outcomeTrain)
+ extraInputs <- list(...)
+ prepExtras <- integer(0)
+ if(length(extraInputs) > 0)
+ prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
+ if(length(prepExtras) > 0)
+ prepArgs <- append(prepArgs, extraInputs[prepExtras])
+ measurementsAndOutcome <- do.call(prepareData, prepArgs)
+ measurements <- measurementsAndOutcome[["measurements"]]
+ outcomeTrain <- measurementsAndOutcome[["outcome"]]
+
+ classifier <- cleanClassifier(classifier = classifier, measurements = measurements)
+ if(assayIDs == "all") assayIDs <- unique(mcols(measurements)[, "assay"])
+ if(is.null(assayIDs)) assayIDs <- 1

+ if(multiViewMethod == "none"){
+ resClassifier <-
+ sapply(assayIDs, function(assayIndex) {
+ # Loop over assays
+ sapply(classifier[[assayIndex]], function(classifierForAssay) {
+ # Loop over classifiers
+ measurementsUse <- measurements
+ if(assayIndex != 1) measurementsUse <- measurements[, mcols(measurements)[, "assay"] == assayIndex, drop = FALSE]
+
+ classifierParams <- .classifierKeywordToParams(classifierForAssay)
+ modellingParams <- ModellingParams(balancing = "none", selectParams = "none",
+ trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
+
+ .doTrain(measurementsUse, outcomeTrain, NULL, NULL, modellingParams, verbose = 0)[["model"]]
+ ## train model
+ },
+ simplify = FALSE)
+ },
+ simplify = FALSE)

+ models <- unlist(resClassifier, recursive = FALSE)
+ names(models) <- assayIDs
+ class(models) <- c(class(models), "listOfModels")
+ }

-simplifyResults <- function(results, values = c("assay", "classifier", "selectionMethod", "multiViewMethod")){
- ch <- sapply(results, function(x) x@characteristics[x@characteristics$characteristic %in% values, "value"], simplify = TRUE)
- ch <- data.frame(t(ch))
- results[!duplicated(ch)]
+ ################################
+ #### Yes multiview
+ ################################
+
+ ### Merging or binding to combine data
+ if(multiViewMethod == "merge"){
+ measurementsUse <- measurements[, mcols(measurements)[["assay"]] %in% assayIDs]
+ .doTrain(measurementsUse, outcomeTrain, NULL, NULL, modellingParams, verbose = 0)[["model"]]
+ }
+
+
+ ### Prevalidation to combine data
+ if(multiViewMethod == "prevalidation"){
+ # Split measurements up by assay.
+ assayTrain <- sapply(assayIDs, function(assayID) measurements[, mcols(measurements)[["assay"]] %in% assayID], simplify = FALSE)
+
+ # Generate params for each assay. This could be extended to have different selectionMethods for each type
+ paramsAssays <- mapply(generateModellingParams,
+ assayIDs = assayIDs,
+ measurements = assayTrain[assayIDs],
+ classifier = classifier[assayIDs],
+ MoreArgs = list(multiViewMethod = "none"),
+ SIMPLIFY = FALSE)
+
+ modellingParams <- ModellingParams(
+ balancing = "none",
+ selectParams = NULL,
+ trainParams = TrainParams(prevalTrainInterface, params = paramsAssays, characteristics = paramsAssays$clinical@trainParams@characteristics),
+ predictParams = PredictParams(prevalPredictInterface, characteristics = paramsAssays$clinical@predictParams@characteristics))
+ .doTrain(measurementsUse, outcomeTrain, NULL, NULL, modellingParams, verbose = 0)[["model"]]
+ }
+
+ ### Principal Components Analysis to combine data
+ if(multiViewMethod == "PCA"){
+ measurementsUse <- measurements[, mcols(measurements)[["assay"]] %in% assayIDs]
+ paramsClinical <- list(clinical = generateModellingParams(
+ assayIDs = "clinical",
+ measurements = measurements[, mcols(measurements)[["assay"]] == "clinical"],
+ classifier = classifier["clinical"],
+ multiViewMethod = "none"))
+
+ modellingParams <- ModellingParams(balancing = "none", selectParams = NULL,
+ trainParams = TrainParams(pcaTrainInterface, params = paramsClinical, nFeatures = nFeatures, characteristics = paramsClinical$clinical@trainParams@characteristics),
+ predictParams = PredictParams(pcaPredictInterface, characteristics = paramsClinical$clinical@predictParams@characteristics))
+ .doTrain(measurementsUse, outcomeTrain, NULL, NULL, modellingParams, verbose = 0)[["model"]]
+ }
+ }
+
+#' @rdname crossValidate
+#' @export
+# Each of the first four variables is a named list with names of assays.
+train.list <- function(x, outcomeTrain, ...)
+ {
+ # Check data type is valid
+ if (!(all(sapply(x, function(element) is(element, "tabular")))))
+ stop("assays must be of type data.frame, DataFrame or matrix")
+
+ # Check the list is named
+ if (is.null(names(x)))
+ stop("Measurements must be a named list")
+
+ # Check same number of samples for all datasets
+ if (!length(unique(sapply(x, nrow))) == 1)
+ stop("All datasets must have the same samples")
+
+ # Check that the number of outcomes is the same
+ if (!all(sapply(x, nrow) == length(outcomeTrain)) && !is.character(outcomeTrain))
+ stop("outcome must have same number of samples as measurements")
+
+ df_list <- sapply(x, S4Vectors::DataFrame)
+
+ df_list <- mapply(function(meas, nam){
+ mcols(meas)$assay <- nam
+ mcols(meas)$feature <- colnames(meas)
+ meas
+ }, df_list, names(df_list))
+
+ combined_df <- do.call(cbind, df_list)
+
+ # Each list of tabular data has been collapsed into a DataFrame.
+ # Will be subset to relevant assayIDs inside the DataFrame method.
+ train(combined_df, outcomeTrain, ...)
}

#' @rdname crossValidate
#' @export
-setMethod("predict", "ClassifyResult",
- function(object, newData)
+train.MultiAssayExperiment <- function(x, outcomeColumns, ...)
{
- object@modellingParams@predictParams@predictor(object@finalModel[[1]], newData)
- })
+ prepArgs <- list(x, outcomeColumns)
+ extraInputs <- list(...)
+ prepExtras <- trainExtras <- integer(0)
+ if(length(extraInputs) > 0)
+ prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
+ if(length(prepExtras) > 0)
+ prepArgs <- append(prepArgs, extraInputs[prepExtras])
+ measurementsAndOutcome <- do.call(prepareData, prepArgs)
+ trainArgs <- list(measurementsAndOutcome[["measurements"]], measurementsAndOutcome[["outcome"]])
+ if(length(extraInputs) > 0)
+ trainExtras <- which(!names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]])
+ if(length(trainExtras) > 0)
+ trainArgs <- append(trainArgs, extraInputs[trainExtras])
+ do.call(train, trainArgs)
+ }
+
+#' @rdname crossValidate
+#' @param object A fitted model or a list of such models.
+#' @param newData For the \code{predict} function, an object of type \code{matrix}, \code{data.frame},
+#' \code{DataFrame}, \code{list} (of matrices or data frames) or \code{MultiAssayExperiment} containing
+#' the data to make predictions with, using either a fitted model created by \code{train} or the final model
+#' stored in a \code{\link{ClassifyResult}} object.
+
+#' @export
+predict.trainedByClassifyR <- function(object, newData, ...)
+{
+ if(is(newData, "tabular")) # Simply tabular data.
+ {
+ colnames(newData) <- make.names(colnames(newData)) # Ensure that feature names are syntactically valid, like during model fitting.
+ } else if(is.list(newData) && !is(object, "listOfModels")) # Don't check all those conditions that train function does.
+ { # Merge the list of data tables and keep track of assay names in columns' metadata.
+ newData <- mapply(function(meas, nam){
+ mcols(meas)$assay <- nam
+ mcols(meas)$feature <- colnames(meas)
+ meas
+ }, newData, names(newData))
+ newData <- do.call(cbind, newData)
+ } else if(is(newData, "MultiAssayExperiment"))
+ {
+ newData <- prepareData(newData, useFeatures = allFeatureNames(object))[["measurements"]]
+ # Some classifiers dangerously use positional matching rather than column name matching.
+ # newData columns are sorted so that the right column ordering is guaranteed.
+ } else {stop("'newData' is not one of the valid data types. It is of type ", class(newData), '.')}
+ if(is(object, "ClassifyResult"))
+ {
+ object@modellingParams@predictParams@predictor(object@finalModel[[1]], newData)
+ } else if (is(object, "listOfModels")) { # Object is itself a trained model and it is assumed that a predict method is defined for it.
+ mapply(function(model, assay) predict(model, assay), object, newData, SIMPLIFY = FALSE)
+ } else predict(object, newData)
+}
\ No newline at end of file
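An illustrative sketch of the train/predict workflow that the additions above provide (object names are hypothetical; train() fits on one data set and predict() applies the model to an independent one, as the documentation describes, although the exact return value depends on the underlying classifier):

    # trainMatrix, testMatrix: numeric matrices with samples as rows;
    # classesTrain: factor of outcomes for the training samples.
    model <- train(trainMatrix, classesTrain, classifier = "randomForest")
    predicted <- predict(model, testMatrix)

    # The final model kept inside a ClassifyResult returned by crossValidate()
    # can be reused the same way.
    result <- crossValidate(trainMatrix, classesTrain, classifier = "randomForest")
    predict(result, testMatrix)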
@@ -1,63 +1,7 @@
-#' Calculate Location and Scale
-#'
-#' Calculates the location and scale for each feature.
-#'
-#' \code{"SD"} is used to represent standard deviation and \code{"MAD"} is used
-#' to represent median absolute deviation.
-#'
-#' @aliases getLocationsAndScales getLocationsAndScales,matrix-method
-#' getLocationsAndScales,DataFrame-method
-#' getLocationsAndScales,MultiAssayExperiment-method
-#' @param measurements Either a \code{\link{matrix}}, \code{\link{DataFrame}}
-#' or \code{\link{MultiAssayExperiment}} containing the training data. For a
-#' \code{matrix}, the rows are samples, and the columns are features.
-#' If of type \code{\link{DataFrame}} or \code{\link{MultiAssayExperiment}}, the data set is subset
-#' to only those features of type \code{numeric}.
-#' @param targets If \code{measurements} is a \code{MultiAssayExperiment}, the
-#' names of the data tables to be used. \code{"clinical"} is also a valid value
-#' and specifies that numeric variables from the clinical data table will be
-#' used.
-#' @param ... Variables not used by the \code{matrix} nor the
-#' \code{MultiAssayExperiment} method which are passed into and used by the
-#' \code{DataFrame} method.
-#' @param location The type of location to be calculated.
-#' @param scale The type of scale to be calculated.
-#' @return A \code{\link{list}} of length 2. The first element contains the
-#' location for every feature. The second element contains the scale for every
-#' feature.
-#' @author Dario Strbenac
-#' @references Qn:
-#' \url{http://www.tandfonline.com/doi/pdf/10.1080/01621459.1993.10476408}
-#' @examples
-#'
-#' genesMatrix <- matrix(rnorm(1000, 8, 4), nrow = 10)
-#' distributionInfo <- getLocationsAndScales(genesMatrix, "median", "MAD")
-#'
-#' mean(distributionInfo[["median"]]) # Typical median.
-#' mean(distributionInfo[["MAD"]]) # Typical MAD.
-#'
-#' @usage NULL
-#' @export
-setGeneric("getLocationsAndScales", function(measurements, ...)
- standardGeneric("getLocationsAndScales"))
+# Calculate Location and Scale

-#' @rdname getLocationsAndScales
-#' @export
-setMethod("getLocationsAndScales", "matrix", # Matrix of numeric measurements.
- function(measurements, ...)
+getLocationsAndScales <- function(measurements, location = c("mean", "median"), scale = c("SD", "MAD", "Qn"))
{
- getLocationsAndScales(S4Vectors::DataFrame(measurements, check.names = FALSE), ...)
-})
-
-#' @rdname getLocationsAndScales
-#' @export
-setMethod("getLocationsAndScales", "DataFrame", # Sample information data or one of the other inputs, transformed.
- function(measurements, location = c("mean", "median"), scale = c("SD", "MAD", "Qn"))
-{
- isNumeric <- sapply(measurements, is.numeric)
- measurements <- measurements[, isNumeric, drop = FALSE]
- if(sum(isNumeric) == 0)
- stop("No features are numeric but at least one must be.")
location <- match.arg(location)
scale <- match.arg(scale)

@@ -72,19 +16,4 @@ setMethod("getLocationsAndScales", "DataFrame", # Sample information data or one
MAD = apply(measurements, 2, mad),
Qn = apply(measurements, 2, robustbase::Qn))),
c(location, scale))
-})
-
-# One or more omics data sets, possibly with clinical data.
-#' @rdname getLocationsAndScales
-#' @export
-setMethod("getLocationsAndScales", "MultiAssayExperiment",
- function(measurements, targets = names(measurements), ...)
-{
- if(!all(targets %in% c(names(measurements), "clinical")))
- stop("Some table names in 'targets' are not assay names in 'measurements' or \"clinical\".")
-
- combinedData <- .MAEtoWideTable(measurements, targets, NULL)
- if(class(combinedData) == "list")
- combinedData <- combinedData[["dataTable"]]
- getLocationsAndScales(combinedData, ...)
-})
+}
\ No newline at end of file
@@ -17,11 +17,10 @@
#' @param featurePairs An object of type \code{\link{Pairs}}.
#' @param absolute If TRUE, then the absolute values of the differences are
#' returned.
-#' @param target If \code{measurements} is a \code{MultiAssayExperiment}, the
-#' name of the data table to be used.
-#' @param classesColumn If \code{measurementsTrain} is a \code{MultiAssayExperiment}, the
-#' names of the class column in the table extracted by \code{colData(multiAssayExperiment)}
-#' that contains each sample's outcome to use for prediction.
+#' @param useFeatures If \code{measurements} is a \code{MultiAssayExperiment},
+#' \code{"all"} or a two-column table of features to use. If a table, the first column must have
+#' assay names and the second column must have feature names found for that assay.
+#' \code{"clinical"} is also a valid assay name and refers to the clinical data table.
#' @param ... Variables not used by the \code{matrix} nor the
#' \code{MultiAssayExperiment} method which are passed into and used by the
#' \code{DataFrame} method.
@@ -94,8 +93,8 @@ setMethod("interactorDifferences", "DataFrame", # Possibly mixed data types.
#' @rdname interactorDifferences
#' @export
setMethod("interactorDifferences", "MultiAssayExperiment", # Pick one numeric table from the data set.
- function(measurements, target = NULL, classesColumn, ...)
+ function(measurements, useFeatures = "all", ...)
{
- tablesAndClasses <- .MAEtoWideTable(measurements, target, classesColumn)
- interactorDifferences(tablesAndClasses[["dataTable"]], ...)
+ measurementsDF <- prepareData(measurements, useFeatures = useFeatures)[["measurements"]]
+ interactorDifferences(measurementsDF, ...)
})
\ No newline at end of file
@@ -3,7 +3,7 @@
#' Input data could be of matrix, MultiAssayExperiment, or DataFrame format and this
#' function will prepare a DataFrame of features and a vector of outcomes and help
#' to exclude nuisance features such as dates or unique sample identifiers from
-#' future modelling.
+#' subsequent modelling.
#'
#' @aliases prepareData prepareData,matrix-method prepareData,DataFrame-method
#' prepareData,MultiAssayExperiment-method
@@ -136,28 +136,76 @@ setMethod("prepareData", "DataFrame", |
136 | 136 |
measurements <- measurements[, mostVariance] |
137 | 137 |
} |
138 | 138 |
|
139 |
- list(measurements = measurements, outcome = outcome) |
|
139 |
+ list(measurements = measurements, outcome = outcome) |
|
140 | 140 |
}) |
141 | 141 |
|
142 | 142 |
#' @rdname prepareData |
143 | 143 |
#' @export |
144 | 144 |
setMethod("prepareData", "MultiAssayExperiment", |
145 |
- function(measurements, outcomeColumns = NULL, useFeatures = data.frame(assay = names(measurements), feature = rep("all", length(measurements))), ...) |
|
145 |
+ function(measurements, outcomeColumns = NULL, useFeatures = "all", ...) |
|
146 | 146 |
{ |
147 |
- omicsTargets <- setdiff(useFeatures[, 1], "clinical") |
|
147 |
+ if(is.character(useFeatures)) useFeatures <- data.frame(assay = names(measurements), feature = "all") |
|
148 |
+ omicsTargets <- setdiff(useFeatures[, "assay"], "clinical") |
|
148 | 149 |
if(length(omicsTargets) > 0) |
149 | 150 |
{ |
150 | 151 |
if(any(anyReplicated(measurements[, , omicsTargets]))) |
151 | 152 |
stop("Data set contains replicates. Please remove or average replicate observations and try again.") |
152 | 153 |
} |
153 | 154 |
|
154 |
- if(is.null(outcomeColumns)) |
|
155 |
- stop("'outcomeColumns' is NULL. One or more outcome columns must be specified.") |
|
156 |
- if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
|
155 |
+ if(!is.null(outcomeColumns) && !all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
|
157 | 156 |
stop("Not all column names specified by 'outcomeColumns' found in clinical table.") |
158 |
- if(!all(useFeatures[, 1] %in% c(names(measurements), "clinical"))) |
|
157 |
+ if(!all(useFeatures[, "assay"] %in% c(names(measurements), "clinical"))) |
|
159 | 158 |
stop("Some assay names in first column of 'useFeatures' are not assay names in 'measurements' or \"clinical\".") |
159 |
+ |
|
160 |
+ clinicalColumns <- colnames(MultiAssayExperiment::colData(measurements)) |
|
161 |
+ if("clinical" %in% useFeatures[, "assay"]) |
|
162 |
+ { |
|
163 |
+ clinicalRows <- useFeatures[, "assay"] == "clinical" |
|
164 |
+ clinicalColumns <- useFeatures[clinicalRows, "feature"] |
|
165 |
+ useFeatures <- useFeatures[!clinicalRows, ] |
|
166 |
+ } else { |
|
167 |
+ clinicalColumns <- NULL |
|
168 |
+ } |
|
160 | 169 |
|
170 |
+ if(nrow(useFeatures) > 0) |
|
171 |
+ { |
|
172 |
+ measurements <- measurements[, , unique(useFeatures[, "assay"])] |
|
173 |
+ |
|
174 |
+ # Get all desired measurements tables and clinical columns (other than the columns representing outcome). |
|
175 |
+ # These form the independent variables to be used for making predictions with. |
|
176 |
+ # Variable names will have names like RNA_BRAF for traceability. |
|
177 |
+ dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns)) |
|
178 |
+ rownames(dataTable) <- dataTable[, "primary"] |
|
179 |
+ S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
180 |
+ colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
181 |
+ |
|
182 |
+ # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
183 |
+ # Create a combined column named "feature" which has feature names of the assays as well as the clinical. |
|
184 |
+ S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
185 |
+ missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
186 |
+ S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
187 |
+ |
|
188 |
+ # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
189 |
+ S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
190 |
+ |
|
191 |
+ # Subset to only the desired features. |
|
192 |
+ useFeaturesSubset <- useFeatures[useFeatures[, "feature"] != "all", ] |
|
193 |
+ if(nrow(useFeaturesSubset) > 0) |
|
194 |
+ { |
|
195 |
+ uniqueAssays <- unique(useFeatures[, "assay"]) |
|
196 |
+ for(filterAssay in uniqueAssays) |
|
197 |
+ { |
|
198 |
+ dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay & |
|
199 |
+ !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2] |
|
200 |
+ dataTable <- dataTable[, !dropFeatures] |
|
201 |
+ } |
|
202 |
+ } |
|
203 |
+ dataTable <- dataTable[, -match("primary", colnames(dataTable))] |
|
204 |
+ } else { # Must have only been clinical data. |
|
205 |
+ dataTable <- MultiAssayExperiment::colData(measurements) |
|
206 |
+ S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable)) |
|
207 |
+ } |
|
208 |
+ |
|
161 | 209 |
# Do other filtering and preparation in DataFrame function. |
162 |
- prepareData(.MAEtoWideTable(measurements, outcomeColumns, useFeatures), make.names(outcomeColumns), "all") |
|
210 |
+ prepareData(dataTable, outcomeColumns, useFeatures = "all", ...) |
|
163 | 211 |
}) |
164 | 212 |
\ No newline at end of file |
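For context, the flattening that this hunk now performs inline (previously done by the .MAEtoWideTable helper removed later in this changeset) is built on MultiAssayExperiment::wideFormat(). Below is a minimal sketch of the wide table it returns; the toy assay, feature and sample names are invented for illustration.

library(MultiAssayExperiment)

# Toy objects only, for illustration of the column naming.
rnaMeasurements <- matrix(rnorm(6), nrow = 2,
                          dimnames = list(c("BRAF", "TP53"),
                                          c("patientA", "patientB", "patientC")))
clinicalInfo <- DataFrame(age = c(55, 60, 71), class = c("Poor", "Good", "Good"),
                          row.names = c("patientA", "patientB", "patientC"))
toyMAE <- MultiAssayExperiment(experiments = ExperimentList(RNA = rnaMeasurements),
                               colData = clinicalInfo)

# Assay features become columns named assay_feature, e.g. RNA_BRAF, so their origin
# stays traceable after flattening; clinical columns keep their own names.
wideTable <- wideFormat(toyMAE, colDataCols = "class")
colnames(wideTable)   # e.g. "primary" "class" "RNA_BRAF" "RNA_TP53"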
... | ... |
@@ -64,8 +64,6 @@ |
64 | 64 |
#' @param margin The margin to have around the plot. |
65 | 65 |
#' @param showLegend If \code{TRUE}, a legend is plotted next to the plot. If |
66 | 66 |
#' FALSE, it is hidden. |
67 |
-#' @param plot Logical. If \code{TRUE}, a plot is produced on the current |
|
68 |
-#' graphics device. |
|
69 | 67 |
#' @param parallelParams An object of class \code{\link{MulticoreParam}} or |
70 | 68 |
#' \code{\link{SnowParam}}. |
71 | 69 |
#' @param ... Not used by end user. |
... | ... |
@@ -126,7 +124,7 @@ setMethod("rankingPlot", "list", |
126 | 124 |
title = if(comparison[1] == "within") "Feature Ranking Stability" else "Feature Ranking Commonality", |
127 | 125 |
yLabel = if(is.null(referenceLevel)) "Average Common Features (%)" else paste("Average Common Features with", referenceLevel, "(%)"), |
128 | 126 |
margin = grid::unit(c(1, 1, 1, 1), "lines"), |
129 |
- showLegend = TRUE, plot = TRUE, parallelParams = bpparam()) |
|
127 |
+ showLegend = TRUE, parallelParams = bpparam()) |
|
130 | 128 |
{ |
131 | 129 |
if(!requireNamespace("ggplot2", quietly = TRUE)) |
132 | 130 |
stop("The package 'ggplot2' could not be found. Please install it.") |
... | ... |
@@ -291,8 +289,5 @@ setMethod("rankingPlot", "list", |
291 | 289 |
|
292 | 290 |
overlapPlot <- overlapPlot + ggplot2::facet_grid(ggplot2::vars(!!rowVariable), ggplot2::vars(!!columnVariable)) + ggplot2::theme(strip.text = ggplot2::element_text(size = sizesList[["fontSizes"]][6])) |
293 | 291 |
|
294 |
- if(plot == TRUE) |
|
295 |
- print(overlapPlot) |
|
296 |
- |
|
297 | 292 |
overlapPlot |
298 | 293 |
}) |
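With the plot argument removed, rankingPlot() now simply returns the ggplot object and no longer prints it as a side effect. A usage sketch follows; results stands for a list of ClassifyResult objects and is a placeholder, not an object defined in this changeset.

rankedOverlaps <- rankingPlot(results)   # returns a ggplot object, nothing is drawn yet
print(rankedOverlaps)                    # display is now an explicit step, e.g. in a loop or vignette chunk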
... | ... |
@@ -19,7 +19,7 @@ |
19 | 19 |
#' containing either classes or time and event information about survival. |
20 | 20 |
#' @param measurementsTest Same data type as \code{measurementsTrain}, but only the test |
21 | 21 |
#' samples. |
22 |
-#' @param outcomeTest Same data type as \code{outcomeTrain}, but only the test |
|
22 |
+#' @param outcomeTest Same data type as \code{outcomeTrain}, but for only the test |
|
23 | 23 |
#' samples. |
24 | 24 |
#' @param crossValParams An object of class \code{\link{CrossValParams}}, |
25 | 25 |
#' specifying the kind of cross-validation to be done, if nested |
... | ... |
@@ -27,16 +27,11 @@ |
27 | 27 |
#' @param modellingParams An object of class \code{\link{ModellingParams}}, |
28 | 28 |
#' specifying the class rebalancing, transformation (if any), feature selection |
29 | 29 |
#' (if any), training and prediction to be done on the data set. |
30 |
-#' @param targets If \code{measurementsTrain} is a \code{MultiAssayExperiment}, the |
|
31 |
-#' names of the data tables to be used. \code{"clinical"} is also a valid value |
|
32 |
-#' and specifies that numeric variables from the clinical data table will be |
|
33 |
-#' used. |
|
34 | 30 |
#' @param outcomeColumns If \code{measurementsTrain} is a \code{MultiAssayExperiment}, the |
35 | 31 |
#' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)} |
36 | 32 |
#' that contain(s) the samples' outcome to use for prediction. |
37 |
-#' @param ... Variables not used by the \code{matrix} nor the |
|
38 |
-#' \code{MultiAssayExperiment} method which are passed into and used by the |
|
39 |
-#' \code{DataFrame} method. |
|
33 |
+#' @param ... Variables not used by the \code{matrix} or \code{MultiAssayExperiment} methods, which |
|
34 |
+#' are passed into and used by the \code{DataFrame} method or passed onwards to \code{\link{prepareData}}. |
|
40 | 35 |
#' @param characteristics A \code{\link{DataFrame}} describing the |
41 | 36 |
#' characteristics of the classification used. First column must be named |
42 | 37 |
#' \code{"charateristic"} and second column must be named \code{"value"}. |
... | ... |
@@ -71,8 +66,7 @@ |
71 | 66 |
#' @importFrom S4Vectors do.call mcols |
72 | 67 |
#' @usage NULL |
73 | 68 |
#' @export |
74 |
-setGeneric("runTest", function(measurementsTrain, ...) |
|
75 |
- standardGeneric("runTest")) |
|
69 |
+setGeneric("runTest", function(measurementsTrain, ...) standardGeneric("runTest")) |
|
76 | 70 |
|
77 | 71 |
#' @rdname runTest |
78 | 72 |
#' @export |
... | ... |
@@ -91,7 +85,7 @@ setMethod("runTest", "matrix", # Matrix of numeric measurements. |
91 | 85 |
setMethod("runTest", "DataFrame", # Sample information data or one of the other inputs, transformed. |
92 | 86 |
function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, |
93 | 87 |
crossValParams = CrossValParams(), # crossValParams might be used for tuning optimisation. |
94 |
- modellingParams = ModellingParams(), characteristics = S4Vectors::DataFrame(), verbose = 1, .iteration = NULL) |
|
88 |
+ modellingParams = ModellingParams(), characteristics = S4Vectors::DataFrame(), ..., verbose = 1, .iteration = NULL) |
|
95 | 89 |
{ |
96 | 90 |
if(is.null(.iteration)) # Not being called by runTests but by user. So, check the user input. |
97 | 91 |
{ |
... | ... |
@@ -100,7 +94,7 @@ function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, |
100 | 94 |
if(any(is.na(measurementsTrain))) |
101 | 95 |
stop("Some data elements are missing and classifiers don't work with missing data. Consider imputation or filtering.") |
102 | 96 |
|
103 |
- splitDatasetTrain <- prepareData(measurementsTrain, outcomeTrain) |
|
97 |
+ splitDatasetTrain <- prepareData(measurementsTrain, outcomeTrain, ...) |
|
104 | 98 |
|
105 | 99 |
# Rebalance the class sizes of the training samples by either downsampling or upsampling |
106 | 100 |
# or leave untouched if balancing is none. |
... | ... |
@@ -223,6 +217,6 @@ input data. Autmomatically reducing to smaller number.") |
223 | 217 |
predictedOutcome <- tryCatch(.doTest(trained[["model"]], measurementsTest, modellingParams@predictParams, verbose), |
224 | 218 |
error = function(error) error[["message"]] |
225 | 219 |
) |
226 | 222 |
|
227 | 223 |
if(is.character(predictedOutcome)) # An error occurred. |
228 | 224 |
return(predictedOutcome) # Return early. |
... | ... |
@@ -328,24 +324,34 @@ input data. Autmomatically reducing to smaller number.") |
328 | 324 |
} |
329 | 325 |
|
330 | 326 |
ClassifyResult(characteristics, allSamples, originalFeatures, list(rankedFeatures), list(selectedFeatures), |
331 |
- list(models), tuneDetails, S4Vectors::DataFrame(sample = rownames(measurementsTest), predictedOutcome, check.names = FALSE), allOutcome, importanceTable) |
|
327 |
+ list(models), tuneDetails, S4Vectors::DataFrame(sample = rownames(measurementsTest), predictedOutcome, check.names = FALSE), allOutcome, importanceTable, modellingParams, list(models)) |
|
332 | 328 |
} |
333 | 329 |
}) |
334 | 330 |
|
335 | 331 |
#' @rdname runTest |
336 | 332 |
#' @export |
337 | 333 |
setMethod("runTest", c("MultiAssayExperiment"), |
338 |
- function(measurementsTrain, measurementsTest, targets = names(measurements), outcomeColumns, ...) |
|
334 |
+ function(measurementsTrain, measurementsTest, outcomeColumns, ...) |
|
339 | 335 |
{ |
340 |
- omicsTargets <- setdiff(targets, "clinical") |
|
341 |
- if(length(omicsTargets) > 0) |
|
342 |
- { |
|
343 |
- if(any(anyReplicated(measurements[, , omicsTargets]))) |
|
344 |
- stop("Data set contains replicates. Please provide remove or average replicate observations and try again.") |
|
336 |
+ prepArgsTrain <- list(measurementsTrain, outcomeColumns) |
|
337 |
+ prepArgsTest <- list(measurementsTest, outcomeColumns) |
|
338 |
+ extraInputs <- list(...) |
|
339 |
+ prepExtras <- integer(0) |
|
340 |
+ if(length(extraInputs) > 0) prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]]) |
|
341 |
+ if(length(prepExtras) > 0) |
|
342 |
+ { |
|
343 |
+ prepArgsTrain <- append(prepArgsTrain, extraInputs[prepExtras]) |
|
344 |
+ prepArgsTest <- append(prepArgsTest, extraInputs[prepExtras]) |
|
345 | 345 |
} |
346 |
+ measurementsAndOutcomeTrain <- do.call(prepareData, prepArgsTrain) |
|
347 |
+ measurementsAndOutcomeTest <- do.call(prepareData, prepArgsTest) |
|
346 | 348 |
|
347 |
- tablesAndClassesTrain <- .MAEtoWideTable(measurementsTrain, targets, outcomeColumns, restrict = NULL) |
|
348 |
- tablesAndClassesTest <- .MAEtoWideTable(measurementsTest, targets, outcomeColumns, restrict = NULL) |
|
349 |
- runTest(tablesAndClassesTrain[["dataTable"]], tablesAndClassesTrain[["outcome"]], |
|
350 |
- tablesAndClassesTest[["dataTable"]], tablesAndClassesTest[["outcome"]], ...) |
|
349 |
+ runTestArgs <- list(measurementsAndOutcomeTrain[["measurements"]], measurementsAndOutcomeTrain[["outcome"]], |
|
350 |
+ measurementsAndOutcomeTest[["measurements"]], measurementsAndOutcomeTest[["outcome"]]) |
|
351 |
+ if(length(extraInputs) > 0 && (length(prepExtras) == 0 || length(extraInputs[-prepExtras]) > 0)) |
|
352 |
+ { |
|
353 |
+ if(length(prepExtras) == 0) runTestArgs <- append(runTestArgs, extraInputs) else |
|
354 |
+ runTestArgs <- append(runTestArgs, extraInputs[-prepExtras]) |
|
355 |
+ } |
|
356 |
+ do.call(runTest, runTestArgs) |
|
351 | 357 |
}) |
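The rewritten MultiAssayExperiment method above splits the arguments captured by ... into those recognised by prepareData() (looked up in .ClassifyRenvir[["prepareDataFormals"]]) and those forwarded to the DataFrame method. A standalone sketch of that splitting idea is below; the helper function and its names are invented for illustration, and only useFeatures appears in this changeset as a prepareData() argument.

splitExtraArgs <- function(extraInputs, prepareDataFormals)
{
  # Positions of the ... arguments that data preparation understands.
  prepExtras <- which(names(extraInputs) %in% prepareDataFormals)
  list(forPrepareData = extraInputs[prepExtras],
       forModelling = if(length(prepExtras) > 0) extraInputs[-prepExtras] else extraInputs)
}

splitExtraArgs(list(useFeatures = "all", verbose = 0), c("useFeatures"))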
... | ... |
@@ -26,15 +26,11 @@ |
26 | 26 |
#' package. Transformation, selection and prediction functions provided by |
27 | 27 |
#' this package will cause the characteristics to be automatically determined |
28 | 28 |
#' and this can be left blank. |
29 |
-#' @param targets If \code{measurements} is a \code{MultiAssayExperiment}, the |
|
30 |
-#' names of the data tables to be used. \code{"clinical"} is also a valid value |
|
31 |
-#' and specifies that the clinical data table will be used. |
|
32 | 29 |
#' @param outcomeColumns If \code{measurementsTrain} is a \code{MultiAssayExperiment}, the |
33 | 30 |
#' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)} |
34 | 31 |
#' that contain(s) the samples' outcome to use for prediction. |
35 |
-#' @param ... Variables not used by the \code{matrix} nor the |
|
36 |
-#' \code{MultiAssayExperiment} method which are passed into and used by the |
|
37 |
-#' \code{DataFrame} method. |
|
32 |
+#' @param ... Variables not used by the \code{matrix} or \code{MultiAssayExperiment} methods, which |
|
33 |
+#' are passed into and used by the \code{DataFrame} method or passed onwards to \code{\link{prepareData}}. |
|
38 | 34 |
#' @param verbose Default: 1. A number between 0 and 3 for the amount of |
39 | 35 |
#' progress messages to give. A higher number will produce more messages as |
40 | 36 |
#' more lower-level functions print messages. |
... | ... |
@@ -72,7 +68,7 @@ setMethod("runTests", c("matrix"), function(measurements, outcome, ...) # Matrix |
72 | 68 |
#' @rdname runTests |
73 | 69 |
#' @export |
74 | 70 |
setMethod("runTests", "DataFrame", function(measurements, outcome, crossValParams = CrossValParams(), modellingParams = ModellingParams(), |
75 |
- characteristics = S4Vectors::DataFrame(), verbose = 1) |
|
71 |
+ characteristics = S4Vectors::DataFrame(), ..., verbose = 1) |
|
76 | 72 |
{ |
77 | 73 |
# Get out the outcome if inside of data table. |
78 | 74 |
if(is.null(rownames(measurements))) |
... | ... |
@@ -83,7 +79,7 @@ setMethod("runTests", "DataFrame", function(measurements, outcome, crossValParam |
83 | 79 |
|
84 | 80 |
originalFeatures <- colnames(measurements) |
85 | 81 |
if("feature" %in% colnames(S4Vectors::mcols(measurements))) originalFeatures <- S4Vectors::mcols(measurements)[, c("assay", "feature")] |
86 |
- splitDataset <- prepareData(measurements, outcome) |
|
82 |
+ splitDataset <- prepareData(measurements, outcome, ...) |
|
87 | 83 |
measurements <- splitDataset[["measurements"]] |
88 | 84 |
outcome <- splitDataset[["outcome"]] |
89 | 85 |
|
... | ... |
@@ -172,23 +168,32 @@ input data. Autmomatically reducing to smaller number.") |
172 | 168 |
if(!is.null(results[[1]][["importance"]])) |
173 | 169 |
importance <- do.call(rbind, lapply(results, "[[", "importance")) |
174 | 170 |
|
171 |
+ fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1) |
|
172 |
+ |
|
175 | 173 |
ClassifyResult(characteristics, rownames(measurements), originalFeatures, |
176 | 174 |
lapply(results, "[[", "ranked"), lapply(results, "[[", "selected"), |
177 |
- lapply(results, "[[", "models"), tuneList, predictionsTable, outcome, importance, modellingParams) |
|
175 |
+ lapply(results, "[[", "models"), tuneList, predictionsTable, outcome, importance, modellingParams, list(fullResult$models)) |
|
178 | 176 |
}) |
179 | 177 |
|
180 | 178 |
#' @rdname runTests |
179 |
+#' @import MultiAssayExperiment methods |
|
181 | 180 |
#' @export |
182 | 181 |
setMethod("runTests", c("MultiAssayExperiment"), |
183 |
- function(measurements, targets = names(measurements), outcomeColumns, ...) |
|
182 |
+ function(measurements, outcomeColumns, ...) |
|
184 | 183 |
{ |
185 |
- omicsTargets <- setdiff(targets, "clinical") |
|
186 |
- if(length(omicsTargets) > 0) |
|
184 |
+ prepArgs <- list(measurements, outcomeColumns) |
|
185 |
+ extraInputs <- list(...) |
|
186 |
+ prepExtras <- integer(0) |
|
187 |
+ if(length(extraInputs) > 0) prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]]) |
|
188 |
+ if(length(prepExtras) > 0) |
|
189 |
+ prepArgs <- append(prepArgs, extraInputs[prepExtras]) |
|
190 |
+ measurementsAndOutcome <- do.call(prepareData, prepArgs) |
|
191 |
+ |
|
192 |
+ runTestsArgs <- list(measurementsAndOutcome[["measurements"]], measurementsAndOutcome[["outcome"]]) |
|
193 |
+ if(length(extraInputs) > 0 && (length(prepExtras) == 0 || length(extraInputs[-prepExtras]) > 0)) |
|
187 | 194 |
{ |
188 |
- if(any(anyReplicated(measurements[, , omicsTargets]))) |
|
189 |
- stop("Data set contains replicates. Please provide remove or average replicate observations and try again.") |
|
195 |
+ if(length(prepExtras) == 0) runTestsArgs <- append(runTestsArgs, extraInputs) else |
|
196 |
+ runTestsArgs <- append(runTestsArgs, extraInputs[-prepExtras]) |
|
190 | 197 |
} |
191 |
- |
|
192 |
- tablesAndOutcome <- .MAEtoWideTable(measurements, targets, outcomeColumns, restrict = NULL) |
|
193 |
- runTests(tablesAndOutcome[["dataTable"]], tablesAndOutcome[["outcome"]], ...) |
|
198 |
+ do.call(runTests, runTestsArgs) |
|
194 | 199 |
}) |
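After this change, the MultiAssayExperiment method of runTests() no longer has a targets argument; the assays and clinical variables to use are chosen through prepareData()-style arguments forwarded via ..., and the flattening happens inside prepareData(). A hedged call sketch follows; myMultiAssay and the outcome column name are placeholders.

result <- runTests(myMultiAssay, outcomeColumns = "class",
                   crossValParams = CrossValParams(), modellingParams = ModellingParams())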
... | ... |
@@ -1,60 +1,3 @@ |
1 |
-# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it |
|
2 |
-# to be used in typical model building functions. |
|
3 |
-# Returns a list with a covariate table and and outcome vector/table, or just a covariate table |
|
4 |
-# in the case the input is a test data set. |
|
5 |
-.MAEtoWideTable <- function(measurements, outcomeColumns, useFeatures) |
|
6 |
-{ |
|
7 |
- clinicalColumns <- colnames(MultiAssayExperiment::colData(measurements)) |
|
8 |
- if("clinical" %in% useFeatures[, 1]) |
|
9 |
- { |
|
10 |
- clinicalRows <- useFeatures[, 1] == "clinical" |
|
11 |
- clinicalColumns <- useFeatures[clinicalRows, 2] |
|
12 |
- useFeatures <- useFeatures[!clinicalRows, ] |
|
13 |
- } else { |
|
14 |
- clinicalColumns <- NULL |
|
15 |
- } |
|
16 |
- |
|
17 |
- if(nrow(useFeatures) > 0) |
|
18 |
- { |
|
19 |
- measurements <- measurements[, , unique(useFeatures[, 1])] |
|
20 |
- |
|
21 |
- # Get all desired measurements tables and clinical columns (other than the columns representing outcome). |
|
22 |
- # These form the independent variables to be used for making predictions with. |
|
23 |
- # Variable names will have names like RNA_BRAF for traceability. |
|
24 |
- dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalColumns, outcomeColumns)) |
|
25 |
- rownames(dataTable) <- dataTable[, "primary"] |
|
26 |
- S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
27 |
- colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
28 |
- |
|
29 |
- # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
30 |
- # Create a combined column named "feature" which has feature names of the assays as well as the clinical. |
|
31 |
- S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
32 |
- missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
33 |
- S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
34 |
- |
|
35 |
- # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
36 |
- S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
37 |
- |
|
38 |
- # Subset to only the desired features. |
|
39 |
- useFeaturesSubset <- useFeatures[useFeatures[, 2] != "all", ] |
|
40 |
- if(nrow(useFeaturesSubset) > 0) |
|
41 |
- { |
|
42 |
- uniqueAssays <- unique(useFeatures[, 1]) |
|
43 |
- for(filterAssay in uniqueAssays) |
|
44 |
- { |
|
45 |
- dropFeatures <- S4Vectors::mcols(dataTable)[, "assay"] == filterAssay & |
|
46 |
- !S4Vectors::mcols(dataTable)[, "feature"] %in% useFeatures[useFeatures[, 1] == filterAssay, 2] |
|
47 |
- dataTable <- dataTable[, !dropFeatures] |
|
48 |
- } |
|
49 |
- } |
|
50 |
- dataTable <- dataTable[, -match("primary", colnames(dataTable))] |
|
51 |
- } else { # Must have only been clinical data. |
|
52 |
- dataTable <- MultiAssayExperiment::colData(measurements) |
|
53 |
- S4Vectors::mcols(dataTable) <- DataFrame(assay = "clinical", feature = colnames(dataTable)) |
|
54 |
- } |
|
55 |
- dataTable |
|
56 |
-} |
|
57 |
- |
|
58 | 1 |
# Creates two lists of lists. First has training samples, second has test samples for a range |
59 | 2 |
# of different cross-validation schemes. |
60 | 3 |
#' @import utils |
... | ... |
@@ -329,7 +272,7 @@ |
329 | 272 |
modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ])) |
330 | 273 |
if(crossValParams@tuneMode == "Resubstitution") |
331 | 274 |
{ |
332 |
- result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, |
|
275 |
+ result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain, |
|
333 | 276 |
crossValParams = NULL, modellingParams, |
334 | 277 |
verbose = verbose, .iteration = "internal") |
335 | 278 |
|
... | ... |
@@ -8,7 +8,12 @@ |
8 | 8 |
\alias{crossValidate,data.frame-method} |
9 | 9 |
\alias{crossValidate,MultiAssayExperiment-method} |
10 | 10 |
\alias{crossValidate,list-method} |
11 |
-\alias{predict,ClassifyResult-method} |
|
11 |
+\alias{train.matrix} |
|
12 |
+\alias{train.data.frame} |
|
13 |
+\alias{train.DataFrame} |
|
14 |
+\alias{train.list} |
|
15 |
+\alias{train.MultiAssayExperiment} |
|
16 |
+\alias{predict.trainedByClassifyR} |
|
12 | 17 |
\title{Cross-validation to evaluate classification performance.} |
13 | 18 |
\usage{ |
14 | 19 |
crossValidate(measurements, outcome, ...) |
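The new aliases document an S3-style entry point alongside crossValidate(): train() methods for several input types and a matching predict() method for the trained object. A sketch of the implied workflow, assuming a train() generic dispatches to these methods; the data objects are placeholders.

classifier <- train(trainingMeasurements, trainingClasses)   # e.g. dispatches to train.matrix
predicted <- predict(classifier, testMeasurements)           # handled by predict.trainedByClassifyR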
... | ... |
@@ -21,7 +26,7 @@ crossValidate(measurements, outcome, ...) |
21 | 26 |
selectionOptimisation = "Resubstitution", |
22 | 27 |
classifier = "randomForest", |
23 | 28 |
multiViewMethod = "none", |
24 |
- assayCombinations = NULL, |
|
29 |
+ assayCombinations = "all", |
|
25 | 30 |
nFolds = 5, |
26 | 31 |
nRepeats = 20, |
27 | 32 |
nCores = 1, |
... | ... |
@@ -31,17 +36,18 @@ crossValidate(measurements, outcome, ...) |
31 | 36 |
|
32 | 37 |
\S4method{crossValidate}{MultiAssayExperiment}( |
33 | 38 |
measurements, |
34 |
- outcome, |
|
39 |
+ outcomeColumns, |
|
35 | 40 |
nFeatures = 20, |
36 | 41 |
selectionMethod = "t-test", |
37 | 42 |
selectionOptimisation = "Resubstitution", |
38 | 43 |
classifier = "randomForest", |
39 | 44 |
multiViewMethod = "none", |
40 |
- assayCombinations = NULL, |
|
45 |
+ assayCombinations = "all", |
|
41 | 46 |
nFolds = 5, |
42 | 47 |
nRepeats = 20, |
43 | 48 |
nCores = 1, |
44 |
- characteristicsLabel = NULL |
|
49 |
+ characteristicsLabel = NULL, |
|
50 |
+ ... |
|
45 | 51 |
) |
46 | 52 |
|
47 | 53 |
\S4method{crossValidate}{data.frame}( |
... | ... |
@@ -52,11 +58,12 @@ crossValidate(measurements, outcome, ...) |
52 | 58 |
selectionOptimisation = "Resubstitution", |
53 | 59 |
classifier = "randomForest", |
54 | 60 |
multiViewMethod = "none", |
55 |
- assayCombinations = NULL, |
|
61 |
+ assayCombinations = "all", |
|
56 | 62 |
nFolds = 5, |
57 | 63 |
nRepeats = 20, |
58 | 64 |
nCores = 1, |
59 |
- characteristicsLabel = NULL |
|
65 |
+ characteristicsLabel = NULL, |
|
66 |
+ ... |
|
60 | 67 |
) |
61 | 68 |
|
62 | 69 |
\S4method{crossValidate}{matrix}( |
... | ... |
@@ -67,11 +74,12 @@ crossValidate(measurements, outcome, ...) |
67 | 74 |
selectionOptimisation = "Resubstitution", |
68 |