... | ... |
@@ -3,8 +3,8 @@ Type: Package |
3 | 3 |
Title: A framework for cross-validated classification problems, with |
4 | 4 |
applications to differential variability and differential |
5 | 5 |
distribution testing |
6 |
-Version: 3.2.0 |
|
7 |
-Date: 2022-10-25 |
|
6 |
+Version: 3.2.1 |
|
7 |
+Date: 2022-11-17 |
|
8 | 8 |
Author: Dario Strbenac, Ellis Patrick, Sourish Iyengar, Harry Robertson, Andy Tran, John Ormerod, Graham Mann, Jean Yang |
9 | 9 |
Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au> |
10 | 10 |
VignetteBuilder: knitr |
... | ... |
@@ -27,8 +27,6 @@ export(distribution) |
27 | 27 |
export(edgesToHubNetworks) |
28 | 28 |
export(featureSetSummary) |
29 | 29 |
export(finalModel) |
30 |
-export(generateCrossValParams) |
|
31 |
-export(generateModellingParams) |
|
32 | 30 |
export(interactorDifferences) |
33 | 31 |
export(models) |
34 | 32 |
export(performance) |
... | ... |
@@ -87,7 +85,6 @@ exportMethods(selectionPlot) |
87 | 85 |
exportMethods(show) |
88 | 86 |
exportMethods(totalPredictions) |
89 | 87 |
exportMethods(tunedParameters) |
90 |
-import(BiocParallel) |
|
91 | 88 |
import(MultiAssayExperiment) |
92 | 89 |
import(grid) |
93 | 90 |
import(methods) |
... | ... |
@@ -86,7 +86,7 @@ setMethod("ROCplot", "ClassifyResult", function(results, ...) { |
86 | 86 |
|
87 | 87 |
#' @rdname ROCplot |
88 | 88 |
#' @export |
89 |
-setMethod("ROCplot", "list", |
|
89 |
+setMethod("ROCplot", "list", |
|
90 | 90 |
function(results, mode = c("merge", "average"), interval = 95, |
91 | 91 |
comparison = "auto", lineColours = "auto", |
92 | 92 |
lineWidth = 1, fontSizes = c(24, 16, 12, 12, 12), labelPositions = seq(0.0, 1.0, 0.2), |
... | ... |
@@ -101,10 +101,18 @@ setMethod("ROCplot", "list", |
101 | 101 |
if(comparison == "auto") |
102 | 102 |
{ |
103 | 103 |
if(max(characteristicsCounts) == length(results)) |
104 |
- comparison <- names(characteristicsCounts)[characteristicsCounts == max(characteristicsCounts)][1] |
|
105 |
- else |
|
104 |
+ { # Choose a characteristic which varies the most across the results. |
|
105 |
+ candidates <- names(characteristicsCounts)[characteristicsCounts == length(results)] |
|
106 |
+ allCharacteristics <- do.call(rbind, lapply(results, function(result) result@characteristics)) |
|
107 |
+ distinctValues <- by(allCharacteristics[, "value"], allCharacteristics[, "characteristic"], function(values) length(unique(values))) |
|
108 |
+ comparison <- names(distinctValues)[which.max(distinctValues)][1] |
|
109 |
+ } else { |
|
106 | 110 |
stop("No characteristic is present for all results but must be.") |
111 |
+ } |
|
107 | 112 |
} |
113 |
+ resultsWithComparison <- sum(sapply(results, function(result) any(result@characteristics[, "characteristic"] == comparison))) |
|
114 |
+ if(resultsWithComparison < length(results)) |
|
115 |
+ stop("Not all results have comparison characteristic ", comparison, ' but need to.') |
|
108 | 116 |
|
109 | 117 |
ggplot2::theme_set(ggplot2::theme_classic() + ggplot2::theme(panel.border = ggplot2::element_rect(fill = NA))) |
110 | 118 |
distinctClasses <- levels(actualOutcome(results[[1]])) |
... | ... |
@@ -113,6 +113,15 @@ setMethod("calcExternalPerformance", c("Surv", "numeric"), |
113 | 113 |
.calcPerformance(actualOutcome, predictedOutcome, performanceType = performanceType)[["values"]] |
114 | 114 |
}) |
115 | 115 |
|
116 |
+#' @rdname calcPerformance |
|
117 |
+#' @exportMethod calcExternalPerformance |
|
118 |
+setMethod("calcExternalPerformance", c("factor", "tabular"), # table has class probabilities per sample. |
|
119 |
+ function(actualOutcome, predictedOutcome, performanceType = "AUC") |
|
120 |
+ { |
|
121 |
+ performanceType <- match.arg(performanceType) |
|
122 |
+ .calcPerformance(actualOutcome, predictedOutcome, performanceType = performanceType)[["values"]] |
|
123 |
+ }) |
|
124 |
+ |
|
116 | 125 |
#' @rdname calcPerformance |
117 | 126 |
#' @usage NULL |
118 | 127 |
#' @export |
... | ... |
@@ -11,22 +11,26 @@ |
11 | 11 |
#' same length as the number of samples in \code{measurements} or a character vector of length 1 containing the |
12 | 12 |
#' column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of |
13 | 13 |
#' length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome. If \code{measurements} is a |
14 |
-#' \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome. |
|
14 |
+#' \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome. If column names |
|
15 |
+#' of survival information, time must be in first column and event status in the second. |
|
15 | 16 |
#' @param outcomeTrain For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or |
16 | 17 |
#' a character string, or vector of such strings, containing column name(s) of column(s) |
17 |
-#' containing either classes or time and event information about survival. |
|
18 |
+#' containing either classes or time and event information about survival. If column names |
|
19 |
+#' of survival information, time must be in first column and event status in the second. |
|
18 | 20 |
#' @param ... Parameters passed into \code{\link{prepareData}} which control subsetting and filtering of input data. |
19 | 21 |
#' @param nFeatures The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons |
20 | 22 |
#' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, |
21 | 23 |
#' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. |
22 | 24 |
#' Set to NULL or "all" if all features should be used. |
23 |
-#' @param selectionMethod A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
|
24 |
-#' and performing multiview classification, the respective classification methods will be used on each assay. |
|
25 |
+#' @param selectionMethod Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
|
26 |
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking |
|
27 |
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. |
|
25 | 28 |
#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}. |
26 |
-#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Any one of the |
|
29 |
+#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the |
|
27 | 30 |
#' options described in \code{\link{calcPerformance}} may otherwise be specified. |
28 |
-#' @param classifier A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, |
|
29 |
-#' and performing multiview classification, the respective classification methods will be used on each assay. |
|
31 |
+#' @param classifier Default: \code{"auto"}. A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, |
|
32 |
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, then a random forest is used for a classification |
|
33 |
+#' task or Cox proportional hazards model for a survival task. |
|
30 | 34 |
#' @param multiViewMethod A character vector specifying the multiview method or data integration approach to use. |
31 | 35 |
#' @param assayCombinations A character vector or list of character vectors proposing the assays or, in the case of a list, combination of assays to use |
32 | 36 |
#' with each element being a vector of assays to combine. Special value \code{"all"} means all possible subsets of assays. |
... | ... |
@@ -108,12 +112,14 @@ setMethod("crossValidate", "DataFrame", |
108 | 112 |
if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]])) |
109 | 113 |
stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType)) |
110 | 114 |
|
115 |
+ isCategorical <- is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome) |
|
111 | 116 |
if(performanceType == "auto") |
112 |
- { |
|
113 |
- if(is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome)) |
|
114 |
- performanceType <- "Balanced Accuracy" |
|
115 |
- else performanceType <- "C-index" |
|
116 |
- } |
|
117 |
+ if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index" |
|
118 |
+ if(length(selectionMethod) == 1 && selectionMethod == "auto") |
|
119 |
+ if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH" |
|
120 |
+ if(length(classifier) == 1 && classifier == "auto") |
|
121 |
+ if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH" |
|
122 |
+ |
|
117 | 123 |
|
118 | 124 |
# Which data-types or data-views are present? |
119 | 125 |
assayIDs <- unique(S4Vectors::mcols(measurements)$assay) |
... | ... |
@@ -515,18 +521,6 @@ Using an ordinary GLM instead.") |
515 | 521 |
classifier |
516 | 522 |
} |
517 | 523 |
|
518 |
-###################################### |
|
519 |
-###################################### |
|
520 |
-#' A function to generate a CrossValParams object |
|
521 |
-#' |
|
522 |
-#' @inheritParams crossValidate |
|
523 |
-#' |
|
524 |
-#' @return CrossValParams object |
|
525 |
-#' @export |
|
526 |
-#' |
|
527 |
-#' @examples |
|
528 |
-#' CVparams <- generateCrossValParams(nRepeats = 20, nFolds = 5, nCores = 8, selectionOptimisation = "none") |
|
529 |
-#' @import BiocParallel |
|
530 | 524 |
generateCrossValParams <- function(nRepeats, nFolds, nCores, selectionOptimisation){ |
531 | 525 |
|
532 | 526 |
seed <- .Random.seed[1] |
... | ... |
@@ -549,32 +543,7 @@ generateCrossValParams <- function(nRepeats, nFolds, nCores, selectionOptimisati |
549 | 543 |
if(!any(tuneMode %in% c("Resubstitution", "Nested CV", "none"))) stop("selectionOptimisation must be Nested CV or Resubstitution or none") |
550 | 544 |
CrossValParams(permutations = nRepeats, folds = nFolds, parallelParams = BPparam, tuneMode = tuneMode) |
551 | 545 |
} |
552 |
-###################################### |
|
553 | 546 |
|
554 |
-###################################### |
|
555 |
-#' A function to generate a ModellingParams object |
|
556 |
-#' |
|
557 |
-#' @inheritParams crossValidate |
|
558 |
-#' @param assayIDs A vector of data set identifiers as long at the number of data sets. |
|
559 |
-#' |
|
560 |
-#' @return ModellingParams object |
|
561 |
-#' @export |
|
562 |
-#' |
|
563 |
-#' @examples |
|
564 |
-#' data(asthma) |
|
565 |
-#' # First make a toy example assay with multiple data types. We'll randomly assign different features to be clinical, gene or protein. |
|
566 |
-#' set.seed(51773) |
|
567 |
-#' measurements <- DataFrame(measurements, check.names = FALSE) |
|
568 |
-#' mcols(measurements)$assay <- c(rep("clinical",20),sample(c("gene", "protein"), ncol(measurements)-20, replace = TRUE)) |
|
569 |
-#' mcols(measurements)$feature <- colnames(measurements) |
|
570 |
-#' modellingParams <- generateModellingParams(assayIDs = c("clinical", "gene", "protein"), |
|
571 |
-#' measurements = measurements, |
|
572 |
-#' nFeatures = list(clinical = 10, gene = 10, protein = 10), |
|
573 |
-#' selectionMethod = list(clinical = "t-test", gene = "t-test", protein = "t-test"), |
|
574 |
-#' selectionOptimisation = "none", |
|
575 |
-#' classifier = "randomForest", |
|
576 |
-#' multiViewMethod = "merge") |
|
577 |
-#' @import BiocParallel |
|
578 | 547 |
generateModellingParams <- function(assayIDs, |
579 | 548 |
measurements, |
580 | 549 |
nFeatures, |
... | ... |
@@ -6,9 +6,11 @@ randomForestTrainInterface <- function(measurementsTrain, outcomeTrain, mTryProp |
6 | 6 |
if(verbose == 3) |
7 | 7 |
message("Fitting random forest classifier to training data.") |
8 | 8 |
mtry <- round(mTryProportion * ncol(measurementsTrain)) # Number of features to try. |
9 |
- |
|
10 | 9 |
# Convert to base data.frame as randomForest doesn't understand DataFrame. |
11 |
- ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, importance = "impurity_corrected", ...) |
|
10 |
+ fittedModel <- ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, ...) |
|
11 |
+ forImportance <- ranger::ranger(x = as(measurementsTrain, "data.frame"), y = outcomeTrain, mtry = mtry, importance = "impurity_corrected", ...) |
|
12 |
+ attr(fittedModel, "forImportance") <- forImportance |
|
13 |
+ fittedModel |
|
12 | 14 |
} |
13 | 15 |
attr(randomForestTrainInterface, "name") <- "randomForestTrainInterface" |
14 | 16 |
|
... | ... |
@@ -37,7 +39,8 @@ randomForestPredictInterface <- function(forest, measurementsTest, ..., returnTy |
37 | 39 |
|
38 | 40 |
forestFeatures <- function(forest) |
39 | 41 |
{ |
40 |
- rankedFeaturesIndices <- order(ranger::importance(forest), decreasing = TRUE) |
|
41 |
- selectedFeaturesIndices <- which(ranger::importance(forest) > 0) |
|
42 |
+ forImportance <- attr(forest, "forImportance") |
|
43 |
+ rankedFeaturesIndices <- order(ranger::importance(forImportance), decreasing = TRUE) |
|
44 |
+ selectedFeaturesIndices <- which(ranger::importance(forImportance) > 0) |
|
42 | 45 |
list(rankedFeaturesIndices, selectedFeaturesIndices) |
43 | 46 |
} |
44 | 47 |
\ No newline at end of file |
... | ... |
@@ -5,8 +5,9 @@ rfsrcTrainInterface <- function(measurementsTrain, survivalTrain, mTryProportion |
5 | 5 |
stop("The package 'randomForestSRC' could not be found. Please install it.") |
6 | 6 |
if(verbose == 3) |
7 | 7 |
message("Fitting rfsrc classifier to training data and making predictions on test data.") |
8 |
- |
|
9 |
- bindedMeasurements <- cbind(measurementsTrain, event = survivalTrain[, 1], time = survivalTrain[, 2]) |
|
8 |
+ |
|
9 |
+ # Surv objects store survival information as a two-column table, time and event, in that order. |
|
10 |
+ bindedMeasurements <- cbind(measurementsTrain, time = survivalTrain[, 1], event = survivalTrain[, 2]) |
|
10 | 11 |
mtry <- round(mTryProportion * ncol(measurementsTrain)) # Number of features to try. |
11 | 12 |
randomForestSRC::rfsrc(Surv(time, event) ~ ., data = as.data.frame(bindedMeasurements), mtry = mtry, |
12 | 13 |
var.used = "all.trees", importance = TRUE, ...) |
... | ... |
@@ -4,7 +4,7 @@ coxphRanking <- function(measurementsTrain, survivalTrain, verbose = 3) # Clinic |
4 | 4 |
|
5 | 5 |
pValues <- rep(NA, ncol(measurementsTrain)) |
6 | 6 |
names(pValues) <- colnames(measurementsTrain) |
7 |
- |
|
7 |
+ |
|
8 | 8 |
isCat <- sapply(measurementsTrain, class) %in% c("character", "factor") |
9 | 9 |
if(any(isCat)) |
10 | 10 |
{ |
... | ... |
@@ -16,7 +16,8 @@ |
16 | 16 |
#' \code{matrix} or \code{\link{DataFrame}}, the rows are samples, and the columns are features. |
17 | 17 |
#' @param outcomeTrain Either a factor vector of classes, a \code{\link{Surv}} object, or |
18 | 18 |
#' a character string, or vector of such strings, containing column name(s) of column(s) |
19 |
-#' containing either classes or time and event information about survival. |
|
19 |
+#' containing either classes or time and event information about survival. If column names |
|
20 |
+#' of survival information, time must be in first column and event status in the second. |
|
20 | 21 |
#' @param measurementsTest Same data type as \code{measurementsTrain}, but only the test |
21 | 22 |
#' samples. |
22 | 23 |
#' @param outcomeTest Same data type as \code{outcomeTrain}, but for only the test |
... | ... |
@@ -256,12 +257,19 @@ input data. Autmomatically reducing to smaller number.") |
256 | 257 |
{ |
257 | 258 |
if(is.null(modellingParams@trainParams@getFeatures)) |
258 | 259 |
selectedFeatures <- originalFeatures[selectedFeaturesIndices] |
259 |
- else selectedFeatures <- colnames(measurementsTrain)[rankedFeaturesIndices] |
|
260 |
+ else selectedFeatures <- colnames(measurementsTrain)[selectedFeaturesIndices] |
|
260 | 261 |
} else { |
261 | 262 |
featureColumns <- na.omit(match(c("assay", "feature"), colnames(S4Vectors::mcols(measurementsTrain)))) |
262 |
- if(is.null(modellingParams@trainParams@getFeatures)) |
|
263 |
- selectedFeatures <- originalFeatures[selectedFeaturesIndices, ] |
|
264 |
- else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns] |
|
263 |
+ if(length(featureColumns) == 1) |
|
264 |
+ { |
|
265 |
+ if(is.null(modellingParams@trainParams@getFeatures)) |
|
266 |
+ selectedFeatures <- originalFeatures[selectedFeaturesIndices] |
|
267 |
+ else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns] |
|
268 |
+ } else { |
|
269 |
+ if(is.null(modellingParams@trainParams@getFeatures)) |
|
270 |
+ selectedFeatures <- originalFeatures[selectedFeaturesIndices, ] |
|
271 |
+ else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns] |
|
272 |
+ } |
|
265 | 273 |
} |
266 | 274 |
importanceTable <- S4Vectors::DataFrame(selectedFeatures, performanceChanges) |
267 | 275 |
if(ncol(importanceTable) == 2) colnames(importanceTable)[1] <- "feature" |
... | ... |
@@ -272,6 +280,7 @@ input data. Autmomatically reducing to smaller number.") |
272 | 280 |
if(!is.null(tuneDetailsSelect)) tuneDetails <- tuneDetailsSelect else tuneDetails <- tuneDetailsTrain |
273 | 281 |
|
274 | 282 |
# Convert back into original, potentially unsafe feature identifiers unless it is a nested cross-validation. |
283 |
+ |
|
275 | 284 |
if(is.null(.iteration) || .iteration != "internal") |
276 | 285 |
{ |
277 | 286 |
if(!is.null(rankedFeaturesIndices)) |
... | ... |
@@ -283,9 +292,16 @@ input data. Autmomatically reducing to smaller number.") |
283 | 292 |
else rankedFeatures <- colnames(measurementsTrain)[rankedFeaturesIndices] |
284 | 293 |
} else { |
285 | 294 |
featureColumns <- na.omit(match(c("assay", "feature"), colnames(S4Vectors::mcols(measurementsTrain)))) |
286 |
- if(is.null(modellingParams@trainParams@getFeatures)) |
|
287 |
- rankedFeatures <- originalFeatures[rankedFeaturesIndices, ] |
|
288 |
- else rankedFeatures <- S4Vectors::mcols(measurementsTrain)[rankedFeaturesIndices, featureColumns] |
|
295 |
+ if(length(featureColumns) == 1) |
|
296 |
+ { |
|
297 |
+ if(is.null(modellingParams@trainParams@getFeatures)) |
|
298 |
+ rankedFeatures <- originalFeatures[rankedFeaturesIndices] |
|
299 |
+ else rankedFeatures <- S4Vectors::mcols(measurementsTrain)[rankedFeaturesIndices, featureColumns] |
|
300 |
+ } else { |
|
301 |
+ if(is.null(modellingParams@trainParams@getFeatures)) |
|
302 |
+ rankedFeatures <- originalFeatures[rankedFeaturesIndices, ] |
|
303 |
+ else rankedFeatures <- S4Vectors::mcols(measurementsTrain)[rankedFeaturesIndices, featureColumns] |
|
304 |
+ } |
|
289 | 305 |
} |
290 | 306 |
} else { rankedFeatures <- NULL} |
291 | 307 |
if(!is.null(selectedFeaturesIndices)) |
... | ... |
@@ -297,9 +313,16 @@ input data. Autmomatically reducing to smaller number.") |
297 | 313 |
else selectedFeatures <- colnames(measurementsTrain)[selectedFeaturesIndices] |
298 | 314 |
} else { |
299 | 315 |
featureColumns <- na.omit(match(c("assay", "feature"), colnames(S4Vectors::mcols(measurementsTrain)))) |
300 |
- if(is.null(modellingParams@trainParams@getFeatures)) |
|
301 |
- selectedFeatures <- originalFeatures[selectedFeaturesIndices, ] |
|
302 |
- else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns] |
|
316 |
+ if(length(featureColumns) == 1) |
|
317 |
+ { |
|
318 |
+ if(is.null(modellingParams@trainParams@getFeatures)) |
|
319 |
+ selectedFeatures <- originalFeatures[selectedFeaturesIndices] |
|
320 |
+ else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns] |
|
321 |
+ } else { |
|
322 |
+ if(is.null(modellingParams@trainParams@getFeatures)) |
|
323 |
+ selectedFeatures <- originalFeatures[selectedFeaturesIndices, ] |
|
324 |
+ else selectedFeatures <- S4Vectors::mcols(measurementsTrain)[selectedFeaturesIndices, featureColumns] |
|
325 |
+ } |
|
303 | 326 |
} |
304 | 327 |
} else { selectedFeatures <- NULL} |
305 | 328 |
} else { # Nested use in feature selection. No feature selection in inner execution, so ignore features. |
... | ... |
@@ -16,7 +16,8 @@ |
16 | 16 |
#' containing either classes or time and event information about survival. If |
17 | 17 |
#' \code{measurements} is a \code{MultiAssayExperiment}, the names of the column (class) or |
18 | 18 |
#' columns (survival) in the table extracted by \code{colData(data)} that contain(s) the samples' |
19 |
-#' outcome to use for prediction. |
|
19 |
+#' outcome to use for prediction. If column names of survival information, time must be in first |
|
20 |
+#' column and event status in the second. |
|
20 | 21 |
#' @param crossValParams An object of class \code{\link{CrossValParams}}, |
21 | 22 |
#' specifying the kind of cross-validation to be done. |
22 | 23 |
#' @param modellingParams An object of class \code{\link{ModellingParams}}, |
... | ... |
@@ -13,7 +13,7 @@ |
13 | 13 |
#' a matrix of pre-calculated metrics, for backwards compatibility. |
14 | 14 |
#' @param classes If \code{results} is a matrix, this is a factor vector of the |
15 | 15 |
#' same length as the number of columns that \code{results} has. |
16 |
-#' @param comparison Default: "Classifier Name". The aspect of the experimental |
|
16 |
+#' @param comparison Default: "auto". The aspect of the experimental |
|
17 | 17 |
#' design to compare. Can be any characteristic that all results share. |
18 | 18 |
#' @param metric Default: "Sample Error". The sample-wise metric to plot. |
19 | 19 |
#' @param featureValues If not NULL, can be a named factor or named numeric |
... | ... |
@@ -44,6 +44,8 @@ |
44 | 44 |
#' @param legendSize The size of the boxes in the legends. |
45 | 45 |
#' @param plot Logical. IF \code{TRUE}, a plot is produced on the current |
46 | 46 |
#' graphics device. |
47 |
+#' @param ... Parameters not used by the \code{ClassifyResult} method that does |
|
48 |
+#' list-packaging but used by the main \code{list} method. |
|
47 | 49 |
#' @return A plot is produced and a grob is returned that can be saved to a |
48 | 50 |
#' graphics device. |
49 | 51 |
#' @author Dario Strbenac |
... | ... |
@@ -82,11 +84,17 @@ |
82 | 84 |
setGeneric("samplesMetricMap", function(results, ...) |
83 | 85 |
standardGeneric("samplesMetricMap")) |
84 | 86 |
|
87 |
+#' @rdname samplesMetricMap |
|
88 |
+#' @export |
|
89 |
+setMethod("samplesMetricMap", "ClassifyResult", function(results, ...) { |
|
90 |
+ samplesMetricMap(list(assay = results), ...) |
|
91 |
+}) |
|
92 |
+ |
|
85 | 93 |
#' @rdname samplesMetricMap |
86 | 94 |
#' @export |
87 | 95 |
setMethod("samplesMetricMap", "list", |
88 | 96 |
function(results, |
89 |
- comparison = "Classifier Name", |
|
97 |
+ comparison = "auto", |
|
90 | 98 |
metric = c("Sample Error", "Sample Accuracy", "Sample C-index"), |
91 | 99 |
featureValues = NULL, featureName = NULL, |
92 | 100 |
metricColours = list(c("#3F48CC", "#6F75D8", "#9FA3E5", "#CFD1F2", "#FFFFFF"), |
... | ... |
@@ -103,6 +111,20 @@ setMethod("samplesMetricMap", "list", |
103 | 111 |
stop("The package 'gridExtra' could not be found. Please install it.") |
104 | 112 |
if(!requireNamespace("gtable", quietly = TRUE)) |
105 | 113 |
stop("The package 'gtable' could not be found. Please install it.") |
114 |
+ |
|
115 |
+ characteristicsCounts <- table(unlist(lapply(results, function(result) result@characteristics[["characteristic"]]))) |
|
116 |
+ if(comparison == "auto") |
|
117 |
+ { |
|
118 |
+ if(max(characteristicsCounts) == length(results)) |
|
119 |
+ { # Choose a characteristic which varies the most across the results. |
|
120 |
+ candidates <- names(characteristicsCounts)[characteristicsCounts == length(results)] |
|
121 |
+ allCharacteristics <- do.call(rbind, lapply(results, function(result) result@characteristics)) |
|
122 |
+ distinctValues <- by(allCharacteristics[, "value"], allCharacteristics[, "characteristic"], function(values) length(unique(values))) |
|
123 |
+ comparison <- names(distinctValues)[which.max(distinctValues)][1] |
|
124 |
+ } else { |
|
125 |
+ stop("No characteristic is present for all results but must be.") |
|
126 |
+ } |
|
127 |
+ } |
|
106 | 128 |
resultsWithComparison <- sum(sapply(results, function(result) any(result@characteristics[, "characteristic"] == comparison))) |
107 | 129 |
if(resultsWithComparison < length(results)) |
108 | 130 |
stop("Not all results have comparison characteristic ", comparison, ' but need to.') |
... | ... |
@@ -7,6 +7,7 @@ |
7 | 7 |
\alias{calcExternalPerformance,factor,factor-method} |
8 | 8 |
\alias{calcExternalPerformance,Surv,numeric-method} |
9 | 9 |
\alias{calcCVperformance,ClassifyResult-method} |
10 |
+\alias{calcExternalPerformance,factor,tabular-method} |
|
10 | 11 |
\title{Add Performance Calculations to a ClassifyResult Object or Calculate for a |
11 | 12 |
Pair of Factor Vectors} |
12 | 13 |
\usage{ |
... | ... |
@@ -24,6 +25,12 @@ Pair of Factor Vectors} |
24 | 25 |
performanceType = "C-index" |
25 | 26 |
) |
26 | 27 |
|
28 |
+\S4method{calcExternalPerformance}{factor,tabular}( |
|
29 |
+ actualOutcome, |
|
30 |
+ predictedOutcome, |
|
31 |
+ performanceType = "AUC" |
|
32 |
+) |
|
33 |
+ |
|
27 | 34 |
\S4method{calcCVperformance}{ClassifyResult}( |
28 | 35 |
result, |
29 | 36 |
performanceType = c("Balanced Accuracy", "Balanced Error", "Error", "Accuracy", |
... | ... |
@@ -131,7 +131,8 @@ or a list of these objects containing the data.} |
131 | 131 |
same length as the number of samples in \code{measurements} or a character vector of length 1 containing the |
132 | 132 |
column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of |
133 | 133 |
length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome. If \code{measurements} is a |
134 |
-\code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.} |
|
134 |
+\code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome. If column names |
|
135 |
+of survival information, time must be in first column and event status in the second.} |
|
135 | 136 |
|
136 | 137 |
\item{...}{Parameters passed into \code{\link{prepareData}} which control subsetting and filtering of input data.} |
137 | 138 |
|
... | ... |
@@ -140,15 +141,17 @@ or assays. If a numeric vector these will be optimised over using \code{selectio |
140 | 141 |
a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. |
141 | 142 |
Set to NULL or "all" if all features should be used.} |
142 | 143 |
|
143 |
-\item{selectionMethod}{A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
|
144 |
-and performing multiview classification, the respective classification methods will be used on each assay.} |
|
144 |
+\item{selectionMethod}{Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
|
145 |
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking |
|
146 |
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.} |
|
145 | 147 |
|
146 | 148 |
\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.} |
147 | 149 |
|
148 | 150 |
\item{performanceType}{Performance metric to optimise if classifier has any tuning parameters.} |
149 | 151 |
|
150 |
-\item{classifier}{A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, |
|
151 |
-and performing multiview classification, the respective classification methods will be used on each assay.} |
|
152 |
+\item{classifier}{Default: \code{"auto"}. A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, |
|
153 |
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, then a random forest is used for a classification |
|
154 |
+task or Cox proportional hazards model for a survival task.} |
|
152 | 155 |
|
153 | 156 |
\item{multiViewMethod}{A character vector specifying the multiview method or data integration approach to use.} |
154 | 157 |
|
... | ... |
@@ -167,7 +170,8 @@ with each element being a vector of assays to combine. Special value \code{"all" |
167 | 170 |
|
168 | 171 |
\item{outcomeTrain}{For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or |
169 | 172 |
a character string, or vector of such strings, containing column name(s) of column(s) |
170 |
-containing either classes or time and event information about survival.} |
|
173 |
+containing either classes or time and event information about survival. If column names |
|
174 |
+of survival information, time must be in first column and event status in the second.} |
|
171 | 175 |
|
172 | 176 |
\item{assayIDs}{A character vector for assays to train with. Special value \code{"all"} |
173 | 177 |
uses all assays in the input object.} |
174 | 178 |
deleted file mode 100644 |
... | ... |
@@ -1,26 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/crossValidate.R |
|
3 |
-\name{generateCrossValParams} |
|
4 |
-\alias{generateCrossValParams} |
|
5 |
-\title{A function to generate a CrossValParams object} |
|
6 |
-\usage{ |
|
7 |
-generateCrossValParams(nRepeats, nFolds, nCores, selectionOptimisation) |
|
8 |
-} |
|
9 |
-\arguments{ |
|
10 |
-\item{nRepeats}{A numeric specifying the the number of repeats or permutations to use for cross-validation.} |
|
11 |
- |
|
12 |
-\item{nFolds}{A numeric specifying the number of folds to use for cross-validation.} |
|
13 |
- |
|
14 |
-\item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.} |
|
15 |
- |
|
16 |
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.} |
|
17 |
-} |
|
18 |
-\value{ |
|
19 |
-CrossValParams object |
|
20 |
-} |
|
21 |
-\description{ |
|
22 |
-A function to generate a CrossValParams object |
|
23 |
-} |
|
24 |
-\examples{ |
|
25 |
-CVparams <- generateCrossValParams(nRepeats = 20, nFolds = 5, nCores = 8, selectionOptimisation = "none") |
|
26 |
-} |
27 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,61 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/crossValidate.R |
|
3 |
-\name{generateModellingParams} |
|
4 |
-\alias{generateModellingParams} |
|
5 |
-\title{A function to generate a ModellingParams object} |
|
6 |
-\usage{ |
|
7 |
-generateModellingParams( |
|
8 |
- assayIDs, |
|
9 |
- measurements, |
|
10 |
- nFeatures, |
|
11 |
- selectionMethod, |
|
12 |
- selectionOptimisation, |
|
13 |
- performanceType = "auto", |
|
14 |
- classifier, |
|
15 |
- multiViewMethod = "none" |
|
16 |
-) |
|
17 |
-} |
|
18 |
-\arguments{ |
|
19 |
-\item{assayIDs}{A vector of data set identifiers as long at the number of data sets.} |
|
20 |
- |
|
21 |
-\item{measurements}{Either a \code{\link{DataFrame}}, \code{\link{data.frame}}, \code{\link{matrix}}, \code{\link{MultiAssayExperiment}} |
|
22 |
-or a list of these objects containing the data.} |
|
23 |
- |
|
24 |
-\item{nFeatures}{The number of features to be used for classification. If this is a single number, the same number of features will be used for all comparisons |
|
25 |
-or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, |
|
26 |
-a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. |
|
27 |
-Set to NULL or "all" if all features should be used.} |
|
28 |
- |
|
29 |
-\item{selectionMethod}{A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
|
30 |
-and performing multiview classification, the respective classification methods will be used on each assay.} |
|
31 |
- |
|
32 |
-\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.} |
|
33 |
- |
|
34 |
-\item{performanceType}{Performance metric to optimise if classifier has any tuning parameters.} |
|
35 |
- |
|
36 |
-\item{classifier}{A character vector of classification methods to compare. If a named character vector with names corresponding to different assays, |
|
37 |
-and performing multiview classification, the respective classification methods will be used on each assay.} |
|
38 |
- |
|
39 |
-\item{multiViewMethod}{A character vector specifying the multiview method or data integration approach to use.} |
|
40 |
-} |
|
41 |
-\value{ |
|
42 |
-ModellingParams object |
|
43 |
-} |
|
44 |
-\description{ |
|
45 |
-A function to generate a ModellingParams object |
|
46 |
-} |
|
47 |
-\examples{ |
|
48 |
-data(asthma) |
|
49 |
-# First make a toy example assay with multiple data types. We'll randomly assign different features to be clinical, gene or protein. |
|
50 |
-set.seed(51773) |
|
51 |
-measurements <- DataFrame(measurements, check.names = FALSE) |
|
52 |
-mcols(measurements)$assay <- c(rep("clinical",20),sample(c("gene", "protein"), ncol(measurements)-20, replace = TRUE)) |
|
53 |
-mcols(measurements)$feature <- colnames(measurements) |
|
54 |
-modellingParams <- generateModellingParams(assayIDs = c("clinical", "gene", "protein"), |
|
55 |
- measurements = measurements, |
|
56 |
- nFeatures = list(clinical = 10, gene = 10, protein = 10), |
|
57 |
- selectionMethod = list(clinical = "t-test", gene = "t-test", protein = "t-test"), |
|
58 |
- selectionOptimisation = "none", |
|
59 |
- classifier = "randomForest", |
|
60 |
- multiViewMethod = "merge") |
|
61 |
-} |
... | ... |
@@ -34,7 +34,8 @@ are passed into and used by the \code{DataFrame} method or passed onwards to \co |
34 | 34 |
|
35 | 35 |
\item{outcomeTrain}{Either a factor vector of classes, a \code{\link{Surv}} object, or |
36 | 36 |
a character string, or vector of such strings, containing column name(s) of column(s) |
37 |
-containing either classes or time and event information about survival.} |
|
37 |
+containing either classes or time and event information about survival. If column names |
|
38 |
+of survival information, time must be in first column and event status in the second.} |
|
38 | 39 |
|
39 | 40 |
\item{measurementsTest}{Same data type as \code{measurementsTrain}, but only the test |
40 | 41 |
samples.} |
... | ... |
@@ -35,7 +35,8 @@ a character string, or vector of such strings, containing column name(s) of colu |
35 | 35 |
containing either classes or time and event information about survival. If |
36 | 36 |
\code{measurements} is a \code{MultiAssayExperiment}, the names of the column (class) or |
37 | 37 |
columns (survival) in the table extracted by \code{colData(data)} that contain(s) the samples' |
38 |
-outcome to use for prediction.} |
|
38 |
+outcome to use for prediction. If column names of survival information, time must be in first |
|
39 |
+column and event status in the second.} |
|
39 | 40 |
|
40 | 41 |
\item{crossValParams}{An object of class \code{\link{CrossValParams}}, |
41 | 42 |
specifying the kind of cross-validation to be done.} |
... | ... |
@@ -4,11 +4,14 @@ |
4 | 4 |
\alias{samplesMetricMap} |
5 | 5 |
\alias{samplesMetricMap,list-method} |
6 | 6 |
\alias{samplesMetricMap,matrix-method} |
7 |
+\alias{samplesMetricMap,ClassifyResult-method} |
|
7 | 8 |
\title{Plot a Grid of Sample Error Rates or Accuracies} |
8 | 9 |
\usage{ |
10 |
+\S4method{samplesMetricMap}{ClassifyResult}(results, ...) |
|
11 |
+ |
|
9 | 12 |
\S4method{samplesMetricMap}{list}( |
10 | 13 |
results, |
11 |
- comparison = "Classifier Name", |
|
14 |
+ comparison = "auto", |
|
12 | 15 |
metric = c("Sample Error", "Sample Accuracy", "Sample C-index"), |
13 | 16 |
featureValues = NULL, |
14 | 17 |
featureName = NULL, |
... | ... |
@@ -55,7 +58,10 @@ |
55 | 58 |
\item{results}{A list of \code{\link{ClassifyResult}} objects. Could also be |
56 | 59 |
a matrix of pre-calculated metrics, for backwards compatibility.} |
57 | 60 |
|
58 |
-\item{comparison}{Default: "Classifier Name". The aspect of the experimental |
|
61 |
+\item{...}{Parameters not used by the \code{ClassifyResult} method that does |
|
62 |
+list-packaging but used by the main \code{list} method.} |
|
63 |
+ |
|
64 |
+\item{comparison}{Default: "auto". The aspect of the experimental |
|
59 | 65 |
design to compare. Can be any characteristic that all results share.} |
60 | 66 |
|
61 | 67 |
\item{metric}{Default: "Sample Error". The sample-wise metric to plot.} |
... | ... |
@@ -1,35 +1,16 @@ |
1 | 1 |
--- |
2 |
-title: "An Introduction to **ClassifyR**" |
|
2 |
+title: "Getting Started with ClassifyR" |
|
3 | 3 |
author: Dario Strbenac, Ellis Patrick, Graham Mann, Jean Yang, John Ormerod <br> |
4 | 4 |
The University of Sydney, Australia. |
5 | 5 |
output: |
6 | 6 |
BiocStyle::html_document: |
7 |
+ css: style.css |
|
7 | 8 |
toc: true |
8 | 9 |
vignette: > |
9 | 10 |
%\VignetteEngine{knitr::rmarkdown} |
10 | 11 |
%\VignetteIndexEntry{An Introduction to the ClassifyR Package} |
11 | 12 |
--- |
12 | 13 |
|
13 |
-<style> |
|
14 |
- body .main-container { |
|
15 |
- max-width: 1600px; |
|
16 |
- } |
|
17 |
- p { |
|
18 |
- padding: 20px; |
|
19 |
- } |
|
20 |
- .table { |
|
21 |
- border: 2px solid #e64626; |
|
22 |
- } |
|
23 |
- .table>thead>tr>th |
|
24 |
- { |
|
25 |
- border-bottom: 2px solid #e64626; |
|
26 |
- border-right: 1px solid black; |
|
27 |
- } |
|
28 |
- .table>tbody>tr>td { |
|
29 |
- border-right: 1px solid black; |
|
30 |
- } |
|
31 |
-</style> |
|
32 |
- |
|
33 | 14 |
```{r, echo = FALSE, results = "asis"} |
34 | 15 |
options(width = 130) |
35 | 16 |
``` |
... | ... |
@@ -50,7 +31,7 @@ Driver functions can use parallel processing capabilities in R to speed up cross |
50 | 31 |
|
51 | 32 |
|
52 | 33 |
```{r, echo = FALSE} |
53 |
-htmltools::img(src = knitr::image_uri("ClassifyRprocedure.png"), |
|
34 |
+htmltools::img(src = knitr::image_uri("images/ClassifyRprocedure.png"), |
|
54 | 35 |
style = 'margin-left: auto;margin-right: auto') |
55 | 36 |
``` |
56 | 37 |
|
... | ... |
@@ -90,7 +71,7 @@ Data Type | $n \times p$ | $p \times n$ |
90 | 71 |
<span style="font-family: 'Courier New', monospace;">MultiAssayExperiment</span> | | ✔ |
91 | 72 |
<span style="font-family: 'Courier New', monospace;">list</span> of <span style="font-family: 'Courier New', monospace;">data.frame</span>s | ✔ | |
92 | 73 |
|
93 |
-*crossValidate* must also be supplied with *outcomes*, which represents the prediction to be made in a variety of possible ways. |
|
74 |
+*crossValidate* must also be supplied with *outcome*, which represents the prediction to be made in a variety of possible ways. |
|
94 | 75 |
|
95 | 76 |
* A *factor* that contains the class label for each observation. *classes* must be of length $n$. |
96 | 77 |
* A *character* of length 1 that matches a column name in a data frame which holds the classes. The classes will automatically be removed before training is done. |
99 | 80 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,9 @@ |
1 |
+body .main-container {max-width: 1600px;} |
|
2 |
+p {padding: 20px;} |
|
3 |
+.table {border: 2px solid #e64626;} |
|
4 |
+.table>thead>tr>th |
|
5 |
+{ |
|
6 |
+ border-bottom: 2px solid #e64626; |
|
7 |
+ border-right: 1px solid black; |
|
8 |
+} |
|
9 |
+.table>tbody>tr>td {border-right: 1px solid black;} |
|
0 | 10 |
\ No newline at end of file |