crissCrossValidate Addition and predict Dispatch Fix

@@ -3,8 +3,8 @@ Type: Package
 Title: A framework for cross-validated classification problems, with
        applications to differential variability and differential
        distribution testing
-Version: 3.3.8
-Date: 2022-11-28
+Version: 3.3.9
+Date: 2022-12-08
 Authors@R:
     c(
     person(given = "Dario", family = "Strbenac", email = "dario.strbenac@sydney.edu.au", role = c("aut", "cre")),

@@ -20,8 +20,8 @@ VignetteBuilder: knitr
 Encoding: UTF-8
 biocViews: Classification, Survival
 Depends: R (>= 4.1.0), generics, methods, S4Vectors, MultiAssayExperiment, BiocParallel, survival
-Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger
-Suggests: limma, edgeR, car, Rmixmod, ggplot2 (>= 3.0.0), gridExtra (>= 2.0.0), cowplot,
+Imports: grid, genefilter, utils, dplyr, plyr, tidyr, rlang, ranger, ggplot2 (>= 3.0.0), ggpubr, reshape2
+Suggests: limma, edgeR, car, Rmixmod, gridExtra (>= 2.0.0), cowplot,
          BiocStyle, pamr, PoiClaClu, parathyroidSE, knitr, htmltools, gtable,
          scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC,
          MatrixModels, xgboost

@@ -44,6 +44,7 @@ Collate:
     'classes.R'
     'calcPerformance.R'
     'constants.R'
+    'crissCrossValidate.R'
     'crossValidate.R'
     'data.R'
     'distribution.R'

@@ -75,6 +76,7 @@ Collate:
     'prepareData.R'
     'previousSelection.R'
     'previousTrained.R'
+    'randomSelection.R'
     'rankingBartlett.R'
     'rankingCoxPH.R'
     'rankingDMD.R'

@@ -22,6 +22,8 @@ export(calcCVperformance)
 export(calcExternalPerformance)
 export(chosenFeatureNames)
 export(colCoxTests)
+export(crissCrossPlot)
+export(crissCrossValidate)
 export(crossValidate)
 export(distribution)
 export(edgesToHubNetworks)

@@ -86,8 +88,13 @@ exportMethods(show)
 exportMethods(totalPredictions)
 exportMethods(tunedParameters)
 import(MultiAssayExperiment)
+import(dplyr)
+import(ggplot2)
+import(ggpubr)
 import(grid)
 import(methods)
+import(plyr)
+import(reshape2)
 import(utils)
 importFrom(S4Vectors,as.data.frame)
 importFrom(S4Vectors,do.call)

@@ -14,7 +14,7 @@
 #'
 #' @aliases ROCplot ROCplot,list-method
 #' @param results A list of \code{\link{ClassifyResult}} objects.
-#' @param mode Default: "merge". Whether to merge all predictions of all
+#' @param mode Default: \code{"merge"}. Whether to merge all predictions of all
 #' iterations of cross-validation into one set or keep them separate. Keeping
 #' them separate will cause separate ROC curves to be computed for each
 #' iteration and confidence intervals to be drawn with the solid line being the

@@ -815,7 +815,7 @@ setClass("ModellingParams", representation(
 #' @rdname ModellingParams-class
 #' @aliases ModellingParams ModellingParams-class
 #' @docType class
-#' @param balancing Default: "downsample". A character value specifying what kind
+#' @param balancing Default: \code{"downsample"}. A character value specifying what kind
 #' of class balancing to do, if any.
 #' @param transformParams Parameters used for feature transformation inside of C.V.
 #' specified by a \code{\link{TransformParams}} instance. Optional, can be \code{NULL}.

@@ -45,6 +45,7 @@
   "pairsDifferencesRanking", "Pairs Differences",
   "previousSelection", "Previous Selection",
   "previousTrained", "Previous Trained",
+  "randomSelection", "Random Selection",
   "randomForestTrainInterface", "Random Forest",
   "SVMtrainInterface", "Support Vector Machine",
   "coxphTrainInterface", "Cox Proportional Hazards",

@@ -67,7 +68,8 @@
   "likelihoodRatio", "Likelihood ratio test (normal distribution).",
   "KS", "Kolmogorov-Smirnov test for differences in distributions.",
   "KL", "Kullback-Leibler divergence between distributions.",
-  "CoxPH", "Cox proportional hazards Wald test per-feature."
+  "CoxPH", "Cox proportional hazards Wald test per-feature.",
+  "randomSelection", "Randomly selects a specified number of features."
   ),
   ncol = 2, byrow = TRUE, dimnames = list(NULL, c("selectionMethod Keyword", "Description"))
   ) |> as.data.frame()

new file mode 100644
@@ -0,0 +1,234 @@
+#' A function to perform pairwise cross validation
+#'
+#' This function has been designed to perform cross-validation and model prediction on datasets in a pairwise manner.
+#'
+#' @param measurements A \code{list} of either \code{\link{DataFrame}}, \code{\link{data.frame}} or \code{\link{matrix}} class measurements.
+#' @param outcomes A \code{list} of vectors that respectively correspond to outcomes of the samples in the \code{measurements} list.
+#' @param nFeatures The number of features to be used for modelling.
+#' @param selectionMethod Default: \code{"auto"}. A character keyword of the feature selection algorithm to be used. If \code{"auto"}, t-test (two categories) /
+#' F-test (three or more categories) ranking and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional
+#' hazards p-value.
+#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
+#' @param trainType Default: \code{"modelTrain"}. A keyword specifying whether a fully trained model is used to make predictions on the test
+#' set or if only the feature identifiers are chosen using the training data set and a number of training-predictions are made by cross-validation
+#' in the test set.
+#' @param classifier Default: \code{"auto"}. A character keyword of the modelling algorithm to be used. If \code{"auto"}, then a random forest is used
+#' for a classification task or a Cox proportional hazards model for a survival task.
+#' @param nFolds A numeric specifying the number of folds to use for cross-validation.
+#' @param nRepeats A numeric specifying the number of repeats or permutations to use for cross-validation.
+#' @param nCores A numeric specifying the number of cores used if the user wants to use parallelisation.
+#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
+#' options described in \code{\link{calcPerformance}} may be specified.
+#' @param doRandomFeatures Default: \code{FALSE}. Whether to perform random feature selection to establish a baseline performance. Either \code{FALSE} or \code{TRUE}
+#' are permitted values.
+#' @return A list with elements \code{"real"} for the matrix of pairwise performance metrics using real
+#' feature selection, \code{"random"} if \code{doRandomFeatures} is \code{TRUE} for metrics of random selection and
+#' \code{"params"} for a list of parameters used during the execution of this function.
+#' @author Harry Robertson
+
+#' @import plyr
+#' @import dplyr
+#'
+#' @export
+#'
+
+crissCrossValidate <- function(measurements, outcomes,
+                               nFeatures = 20, selectionMethod = "auto",
+                               selectionOptimisation = "Resubstitution",
+                               trainType = c("modelTrain", "modelTest"),
+                               performanceType = "auto",
+                               doRandomFeatures = FALSE,
+                               classifier = "auto",
+                               nFolds = 5, nRepeats = 20, nCores = 1)
+{
+  trainType <- match.arg(trainType)
+  if(!is.list(measurements)) stop("'measurements' is not of type list but is of type ", class(measurements))
+  if(!is.list(outcomes)) stop("'outcomes' is not of type list but is of type ", class(outcomes))
+  isCategorical <- is.character(outcomes[[1]]) && (length(outcomes[[1]]) == 1 || length(outcomes[[1]]) == nrow(measurements[[1]])) || is.factor(outcomes[[1]])
+  if(performanceType == "auto")
+    if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index"
+  if(length(selectionMethod) == 1 && selectionMethod == "auto")
+    if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH"
+  if(length(classifier) == 1 && classifier == "auto")
+    if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH"
+
+  dataCleaned <- mapply(function(measurementsOne, outcomesOne)
+  {
+    prepareData(measurementsOne, outcomesOne)
+  }, measurements, outcomes, SIMPLIFY = FALSE)
+  measurements <- lapply(dataCleaned, "[[", 1)
+  outcomes <- lapply(dataCleaned, "[[", 2)
+
+  # If trainType is modelTrain, then build a model on a data set and test it on every data set.
+  if(trainType == "modelTrain")
+  {
+    # Build a model for each dataset.
+    trainedModels <- mapply(function(measurementsOne, outcomesOne)
+    {
+      train(measurementsOne, outcomesOne,
+            nFeatures = nFeatures,
+            selectionMethod = selectionMethod, selectionOptimisation = selectionOptimisation,
+            classifier = classifier, multiViewMethod = "none")
+    }, measurements, outcomes, SIMPLIFY = FALSE)
+
+    # Perform pair-wise model assessment.
+    performanceAllPairs <- lapply(trainedModels, function(trainedModel)
+    {
+      mapply(function(testData, testOutcomes)
+      {
+        predictions <- predict(trainedModel, testData)
+        if(is(predictions, "tabular")) predictions <- predictions[, na.omit(match(c("class", "risk"), colnames(predictions)))]
+        calcExternalPerformance(predictions, testOutcomes, performanceType)
+      }, measurements, outcomes)
+    })
+
+    realPerformance <- matrix(unlist(performanceAllPairs), ncol = length(measurements), byrow = TRUE,
+                              dimnames = list(paste("Select and Train", names(measurements)), paste("Predict", names(measurements))))
+    realPerformance <- round(realPerformance, 2)
+  } else { # trainType is "modelTest".
+    trainedModels <- mapply(function(measurementsOne, outcomesOne)
+    {
+      crossValidate(measurementsOne, outcomesOne,
+                    nFeatures = nFeatures,
+                    selectionMethod = selectionMethod,
+                    selectionOptimisation = selectionOptimisation,
+                    classifier = classifier,
+                    multiViewMethod = "none",
+                    nFolds = nFolds,
+                    nCores = nCores,
+                    nRepeats = nRepeats)
+    }, measurements, outcomes, SIMPLIFY = FALSE)
+
+    # Make it for runTests, which allows existing results to be passed into the selection process.
+    crossValParams <- generateCrossValParams(nRepeats, nFolds, nCores, selectionOptimisation)
+
+    performanceAllPairs <- lapply(trainedModels, function(trainedModel)
+    {
+      mapply(function(measurementsOne, outcomesOne)
+      {
+        classifierParams <- .classifierKeywordToParams(classifier)
+        modellingParams <- ModellingParams(selectParams = SelectParams("previousSelection", intermediate = ".iteration", classifyResult = trainedModel),
+                                           trainParams = classifierParams$trainParams,
+                                           predictParams = classifierParams$predictParams)
+
+        result <- runTests(measurementsOne, outcomesOne, crossValParams, modellingParams)
+        mean(performance(calcCVperformance(result, performanceType))[[performanceType]])
+      }, measurements, outcomes, SIMPLIFY = FALSE)
+    })
+
+    realPerformance <- matrix(unlist(performanceAllPairs), ncol = length(measurements), byrow = TRUE,
+                              dimnames = list(paste("Select", names(measurements)), paste("Cross-validate", names(measurements))))
+    realPerformance <- round(realPerformance, 2)
+  }
+
+  # Return matrix of pair-wise model accuracy.
+  # This is a list so that more elements can be added to it later on.
+  result <- list(real = realPerformance)
+  # We want to include a set of nFeatures to compare between our feature selection method.
+  if(doRandomFeatures == TRUE){
+    message("Starting random feature selection procedure.")
+    # Sample nFeatures randomly from each dataset.
+    randomFeatures <- lapply(measurements, function(dataset) sample(colnames(dataset), nFeatures))
+    performanceAllPairs <- lapply(randomFeatures, function(randomFeaturesSet)
+    {
+      mapply(function(testData, testOutcomes)
+      {
+        result <- crossValidate(testData[, randomFeaturesSet], testOutcomes,
+                                nFeatures = nFeatures,
+                                selectionMethod = "none",
+                                classifier = classifier,
+                                multiViewMethod = "none",
+                                nFolds = nFolds,
+                                nCores = nCores,
+                                nRepeats = nRepeats)
+        mean(performance(calcCVperformance(result, performanceType))[[performanceType]])
+      }, measurements, outcomes)
+    })
+
+    randomPerformance <- matrix(unlist(performanceAllPairs), ncol = length(measurements), byrow = TRUE,
+                                dimnames = list(paste("Random Select", names(measurements)), paste("Cross-validate", names(measurements))))
+    randomPerformance <- round(randomPerformance, 2)
+
+    result$random <- randomPerformance
+  }
+
+  # Add information about the params to the output.
+  result$params <- list(nFeatures = nFeatures, selectionMethod = selectionMethod,
+                        selectionOptimisation = selectionOptimisation,
+                        classifier = classifier, nFolds = nFolds, nRepeats = nRepeats, nCores = nCores,
+                        trainType = trainType, performanceType = performanceType,
+                        doRandomFeatures = doRandomFeatures)
+
+  result
+}
+
+#' A function to plot the output of the crissCrossValidate function.
+#'
+#' This function has been designed to give a heatmap output of the crissCrossValidate function.
+#'
+#' @param crissCrossResult The output of the crissCrossValidate function.
+#' @param includeValues If TRUE, then the values of the matrix will be included in the plot.
+#' @author Harry Robertson
+#'
+#' @import ggplot2
+#' @import reshape2
+#' @import ggpubr
+#'
+#' @export
+
+crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
+
+  attach(crissCrossResult)
+  scalebar_title <- params$performanceType
+
+  # If the user does not want to compare features.
+  if(params$trainType == "modelTrain"){
+    melted_cormat <- reshape2::melt(real, na.rm = TRUE)
+
+    ggheatmap <- ggplot(melted_cormat, aes(Var1, Var2, fill = value)) +
+      geom_tile(color = "white") +
+      scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+                           midpoint = 0.5, limit = c(0,1), space = "Lab",
+                           name = as.character(scalebar_title)) +
+      theme_bw() + xlab("Training Dataset") + ylab("Testing Dataset") +
+      theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 8, hjust = 1)) +
+      theme(axis.text.y = element_text(vjust = 1, size = 8, hjust = 1)) +
+      coord_fixed()
+
+    if(includeValues == TRUE) ggheatmap <- ggheatmap + geom_text(aes(label = value), color = "black", size = 3)
+  }
+
+  else if(params$trainType == "modelTest"){
+    melted_cormat_1 <- melt(real, na.rm = TRUE)
+    ggheatmap_1 <- ggplot(melted_cormat_1, aes(Var1, Var2, fill = value)) +
+      geom_tile(color = "white") +
+      scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+                           midpoint = 0.5, limit = c(0,1), space = "Lab",
+                           name = as.character(scalebar_title)) +
+      theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
+      theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 8, hjust = 1)) +
+      theme(axis.text.y = element_text(vjust = 1, size = 8, hjust = 1)) +
+      coord_fixed()
+    if(includeValues == TRUE) ggheatmap_1 <- ggheatmap_1 + geom_text(aes(label = value), color = "black", size = 3)
+
+    if(params$doRandomFeatures == TRUE){
+      melted_cormat_2 <- melt(random, na.rm = TRUE)
+      ggheatmap_2 <- ggplot(melted_cormat_2, aes(Var1, Var2, fill = value)) +
+        geom_tile(color = "white") +
+        scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+                             midpoint = 0.5, limit = c(0,1), space = "Lab",
+                             name = as.character(scalebar_title)) +
+        theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
+        theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 8, hjust = 1)) +
+        theme(axis.text.y = element_text(vjust = 1, size = 8, hjust = 1)) +
+        coord_fixed()
+      if(includeValues == TRUE) ggheatmap_2 <- ggheatmap_2 + geom_text(aes(label = value), color = "black", size = 3)
+
+      ggheatmap <- ggarrange(ggheatmap_1, ggheatmap_2, labels = c("A - Feature Selection", "B - Random Features"),
+                             ncol = 2, common.legend = TRUE, legend = "right")
+    } else {
+      ggheatmap <- ggheatmap_1
+    }
+  }
+  print(ggheatmap)
+}
\ No newline at end of file
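
The new pair of functions can be exercised end-to-end; a minimal usage sketch, assuming two hypothetical cohorts that measure the same features (cohort1, cohort2 and their class factors are placeholders, not objects from this changeset):

    measurementsList <- list(Cohort1 = cohort1, Cohort2 = cohort2)
    classesList <- list(Cohort1 = classes1, Cohort2 = classes2)
    performances <- crissCrossValidate(measurementsList, classesList,
                                       nFeatures = 20, trainType = "modelTrain")
    performances[["real"]]  # Pairwise matrix of, e.g., balanced accuracies.
    crissCrossPlot(performances, includeValues = TRUE)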

@@ -22,8 +22,8 @@
 #' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays,
 #' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over.
 #' Set to NULL or "all" if all features should be used.
-#' @param selectionMethod Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays,
-#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
+#' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays,
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
 #' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
 #' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the

@@ -91,10 +91,10 @@ setMethod("crossValidate", "DataFrame",
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,

@@ -297,10 +297,10 @@ setMethod("crossValidate", "MultiAssayExperiment",
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,

@@ -331,10 +331,10 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,

@@ -364,10 +364,10 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,

@@ -399,10 +399,10 @@ setMethod("crossValidate", "list",
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,
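
Because every crossValidate method now defaults both keywords to "auto", a bare call adapts to the outcome type; a sketch with assumed objects (measurements, a factor classes vector and a survival outcome survData are hypothetical):

    result <- crossValidate(measurements, classes)        # "auto": t-test ranking, random forest.
    survResult <- crossValidate(measurements, survData)   # "auto": CoxPH ranking, CoxPH model.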

@@ -764,19 +764,18 @@ generateMultiviewParams <- function(assayIDs,
 }
 
 # measurements, outcome are mutually exclusive with x, outcomeTrain, measurementsTest, outcomeTest.
-CV <- function(measurements = NULL,
-               outcome = NULL, x = NULL, outcomeTrain = NULL, measurementsTest = NULL, outcomeTest = NULL,
+CV <- function(measurements, outcome, x, outcomeTrain, measurementsTest, outcomeTest,
                assayIDs,
-               nFeatures = NULL,
-               selectionMethod = "t-test",
-               selectionOptimisation = "Resubstitution",
+               nFeatures,
+               selectionMethod,
+               selectionOptimisation,
                performanceType,
-               classifier = "elasticNetGLM",
-               multiViewMethod = "none",
-               nFolds = 5,
-               nRepeats = 100,
-               nCores = 1,
-               characteristicsLabel = NULL)
+               classifier,
+               multiViewMethod,
+               nFolds,
+               nRepeats,
+               nCores,
+               characteristicsLabel)
 
 {
   # Which data-types or data-views are present?

@@ -848,7 +847,7 @@ train.data.frame <- function(x, outcomeTrain, ...)
 #' @param performanceType Performance metric to optimise if classifier has any tuning parameters.
 #' @method train DataFrame
 #' @export
-train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", performanceType = "auto",
+train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures = 20, classifier = "auto", performanceType = "auto",
                             multiViewMethod = "none", assayIDs = "all", ...) # ... for prepareData.
 {
   prepArgs <- list(x, outcomeTrain)

@@ -863,18 +862,20 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
   # Ensure performance type is one of the ones that can be calculated by the package.
   if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
     stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType))
-  
+
+  isCategorical <- is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(measurements)) || is.factor(outcomeTrain)
   if(performanceType == "auto")
-  {
-    if(is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(x)) || is.factor(outcomeTrain))
-      performanceType <- "Balanced Accuracy"
-    else performanceType <- "C-index"
-  }
+    if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index"
+  if(length(selectionMethod) == 1 && selectionMethod == "auto")
+    if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH"
+  if(length(classifier) == 1 && classifier == "auto")
+    if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH"
 
   measurements <- measurementsAndOutcome[["measurements"]]
   outcomeTrain <- measurementsAndOutcome[["outcome"]]
 
   classifier <- cleanClassifier(classifier = classifier, measurements = measurements)
+  selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod, measurements = measurements)
   if(assayIDs == "all") assayIDs <- unique(S4Vectors::mcols(measurements)[, "assay"])
   if(is.null(assayIDs)) assayIDs <- 1
   names(assayIDs) <- assayIDs
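
The "auto" resolution above hinges on the isCategorical test; a self-contained sketch with invented data showing how the keywords resolve:

    measurements <- data.frame(geneA = rnorm(3), geneB = rnorm(3)) # Invented data.
    outcomeTrain <- factor(c("healthy", "disease", "healthy"))
    isCategorical <- is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(measurements)) || is.factor(outcomeTrain)
    # TRUE here, so "auto" resolves to "Balanced Accuracy", "t-test" and "randomForest";
    # for a survival outcome it would resolve to "C-index" and "CoxPH" instead.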

@@ -886,30 +887,52 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
       # Loop over assays
       sapply(classifier[[assayIndex]], function(classifierForAssay) {
       # Loop over classifiers
+      sapply(selectionMethod[[assayIndex]], function(selectionForAssay) {
+      # Loop over selectors
 
         measurementsUse <- measurements
         if(assayIndex != 1) measurementsUse <- measurements[, S4Vectors::mcols(measurements)[, "assay"] == assayIndex, drop = FALSE]
 
+        modellingParams <- generateModellingParams(assayIDs = assayIDs, measurements = measurements, nFeatures = nFeatures,
+                                                   selectionMethod = selectionMethod, selectionOptimisation = "Resubstitution", performanceType = performanceType,
+                                                   classifier = classifier, multiViewMethod = "none")
+        topFeatures <- .doSelection(measurementsUse, outcomeTrain, CrossValParams(), modellingParams, verbose = 0)
+        selectedFeaturesIndices <- topFeatures[[2]] # Extract for subsetting.
+        tuneDetailsSelect <- topFeatures[[3]]
+        measurementsUse <- measurementsUse[, selectedFeaturesIndices]
+
         classifierParams <- .classifierKeywordToParams(classifierForAssay)
-        if(!is.null(classifierParams$trainParams@tuneParams))
-          classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
         modellingParams <- ModellingParams(balancing = "none", selectParams = NULL,
-                                           trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
+                                           trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
+        if(!is.null(tuneDetailsSelect))
+        {
+          tuneDetailsSelectUse <- tuneDetailsSelect[["tuneCombinations"]][tuneDetailsSelect[["bestIndex"]], , drop = FALSE]
+          avoidTune <- match(colnames(tuneDetailsSelectUse), names(modellingParams@trainParams@tuneParams))
+          if(any(!is.na(avoidTune)))
+          {
+            modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, tuneDetailsSelectUse[!is.na(avoidTune)])
+            modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-na.omit(avoidTune)]
+            if(length(modellingParams@trainParams@tuneParams) == 0) modellingParams@trainParams@tuneParams <- NULL
+          }
+        }
+        if(!is.null(modellingParams@trainParams@tuneParams))
+          modellingParams$trainParams@tuneParams <- c(modellingParams$trainParams@tuneParams, performanceType = performanceType)
 
-        .doTrain(measurementsUse, outcomeTrain, NULL, NULL, CrossValParams(), modellingParams, verbose = 0)[["model"]]
+        trained <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, CrossValParams(), modellingParams, verbose = 0)[["model"]]
+        attr(trained, "predictFunction") <- classifierParams$predictParams@predictor
+        trained
         ## train model
-      },
-      simplify = FALSE)
-      },
-      simplify = FALSE)
+      }, simplify = FALSE)
+      }, simplify = FALSE)
+      }, simplify = FALSE)
 
-      models <- unlist(resClassifier, recursive = FALSE)
+      models <- unlist(unlist(resClassifier, recursive = FALSE), recursive = FALSE)
       if(length(models) == 1) {
        model <- models[[1]]
-       class(model) <- c(class(model), "trainedByClassifyR")
+       class(model) <- c("trainedByClassifyR", class(model))
        models <- NULL
       } else {
-       class(models) <- c(class(models), "listOfModels", "trainedByClassifyR")
+       class(models) <- c("listOfModels", "trainedByClassifyR", class(models))
       }
   }
 
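
With the selection loop in place, train() performs feature selection before model fitting, mirroring crossValidate()'s keywords; a sketch with hypothetical x, classes and xNew objects (prediction is handled by the dispatch fix two hunks below):

    model <- train(x, classes, selectionMethod = "t-test", nFeatures = 10,
                   classifier = "randomForest")
    predictions <- predict(model, xNew)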

@@ -932,6 +955,8 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
 
       # Generate params for each assay. This could be extended to have different selectionMethods for each type
       paramsAssays <- mapply(generateModellingParams,
+                             nFeatures = nFeatures[assayIDs],
+                             selectionMethod = selectionMethod[assayIDs],
                              assayIDs = assayIDs,
                              measurements = assayTrain[assayIDs],
                              classifier = classifier[assayIDs],

@@ -1049,9 +1074,11 @@ predict.trainedByClassifyR <- function(object, newData, ...)
     newData <- prepareData(newData, useFeatures = allFeatureNames(object))
     # Some classifiers dangerously use positional matching rather than column name matching.
     # newData columns are sorted so that the right column ordering is guaranteed.
-  }
-
+  }
+
+  predictFunctionUse <- attr(object, "predictFunction")
+  class(object) <- rev(class(object)) # Now want the predict method of the specific model to be picked, so put model class first.
   if (is(object, "listOfModels"))
-    mapply(function(model, assay) predict(model, assay), object, newData, SIMPLIFY = FALSE)
-  else predict(object, newData) # Object is itself a trained model and it is assumed that a predict method is defined for it.
+    mapply(function(model, assay) predictFunctionUse(model, assay), object, newData, SIMPLIFY = FALSE)
+  else predictFunctionUse(object, newData) # Object is itself a trained model and it is assumed that a predict method is defined for it.
 }
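
The reordering matters because S3 dispatch uses the first matching class. A toy sketch (illustrative only, not package code) of why "trainedByClassifyR" must now lead the class vector, and why the stored predictor is called directly instead of predict() (which would re-dispatch to this wrapper):

    predict.toyModel <- function(object, ...) "model-specific method"
    predict.trainedByClassifyR <- function(object, ...) "ClassifyR wrapper"
    fit <- structure(list(), class = c("toyModel", "trainedByClassifyR"))
    predict(fit)                  # "model-specific method": the first class wins,
                                  # so the wrapper was skipped under the old ordering.
    class(fit) <- rev(class(fit))
    predict(fit)                  # "ClassifyR wrapper": the wrapper now runs first.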

@@ -47,6 +47,14 @@ setMethod("prepareData", "matrix",
   prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
 })
 
+#' @rdname prepareData
+#' @export
+setMethod("prepareData", "data.frame",
+          function(measurements, outcome, ...)
+{
+  prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
+})
+
 #' @rdname prepareData
 #' @export
 setMethod("prepareData", "DataFrame",
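
A sketch of the new dispatch (invented data): data.frame input is now converted to a DataFrame internally, exactly as matrix input already was:

    df <- data.frame(geneA = rnorm(10), geneB = rnorm(10))
    classes <- factor(rep(c("healthy", "disease"), each = 5))
    prepared <- prepareData(df, classes)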

new file mode 100644
@@ -0,0 +1,9 @@
+# Random Selection of a Specified Number of Features
+randomSelection <- function(measurementsTrain, classesTrain, nFeatures, verbose = 3)
+{
+  if(verbose == 3)
+    message("Choosing random features.")
+
+  sample(ncol(measurementsTrain), nFeatures) # Return indices, not identifiers.
+}
+attr(randomSelection, "name") <- "randomSelection"
\ No newline at end of file
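
Since the keyword is also registered in the constants table above and the selection dispatcher below, "randomSelection" should be usable wherever a selectionMethod keyword is accepted, e.g. for a random-baseline comparison (sketch; measurements and classes are assumed objects):

    baseline <- crossValidate(measurements, classes,
                              selectionMethod = "randomSelection",
                              nFeatures = 10, selectionOptimisation = "none")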

@@ -26,7 +26,7 @@
 #' @param results A list of \code{\link{ClassifyResult}} objects.
 #' @param topRanked A sequence of thresholds of number of the best features to
 #' use for overlapping.
-#' @param comparison Default: within. The aspect of the experimental design to
+#' @param comparison Default: \code{"within"}. The aspect of the experimental design to
 #' compare. Can be any characteristic that all results share or special value
 #' "within" to compared between all pairwise iterations of cross-validation.
 #' @param referenceLevel The level of the comparison factor to use as the

@@ -13,7 +13,7 @@
 #' a matrix of pre-calculated metrics, for backwards compatibility.
 #' @param classes If \code{results} is a matrix, this is a factor vector of the
 #' same length as the number of columns that \code{results} has.
-#' @param comparison Default: "auto". The aspect of the experimental
+#' @param comparison Default: \code{"auto"}. The aspect of the experimental
 #' design to compare. Can be any characteristic that all results share.
 #' @param metric Default: \code{"auto"}. The name of the
 #' performance measure or "auto". If the results are classification then

@@ -36,7 +36,7 @@
 #'
 #' @aliases selectionPlot selectionPlot,list-method
 #' @param results A list of \code{\link{ClassifyResult}} objects.
-#' @param comparison Default: within. The aspect of the experimental design to
+#' @param comparison Default: \code{"within"}. The aspect of the experimental design to
 #' compare. Can be any characteristic that all results share or either one of
 #' the special values \code{"within"} to compare between all pairwise
 #' iterations of cross-validation. or \code{"size"}, to draw a bar chart of the

@@ -129,10 +129,12 @@
         tuneCombo <- tuneCombosSelect[rowIndex, , drop = FALSE]
         if(tuneCombo != "none") # Add real parameters before function call.
           paramList <- append(paramList, tuneCombo)
+        if(attr(featureRanking, "name") == "randomSelection")
+          paramList <- append(paramList, nFeatures = topNfeatures)
         do.call(featureRanking, paramList)
       })
 
-      if(attr(featureRanking, "name") %in% c("previousSelection", "Union Selection")) # Actually selection not ranking.
+      if(attr(featureRanking, "name") %in% c("randomSelection", "previousSelection", "Union Selection")) # Actually selection not ranking.
         return(list(NULL, rankings[[1]], NULL))
 
       if(crossValParams@tuneMode == "none") # No parameters to choose between.

@@ -509,6 +511,8 @@
              "KS" = KolmogorovSmirnovRanking,
              "KL" = KullbackLeiblerRanking,
              "CoxPH" = coxphRanking,
+             "previousSelection" = previousSelection,
+             "randomSelection" = randomSelection,
              "selectMulti" = selectMulti
   )
 }

@@ -16,7 +16,7 @@ ModellingParams(
 )
 }
 \arguments{
-\item{balancing}{Default: "downsample". A character value specifying what kind
+\item{balancing}{Default: \code{"downsample"}. A character value specifying what kind
 of class balancing to do, if any.}
 
 \item{transformParams}{Parameters used for feature transformation inside of C.V.

@@ -30,7 +30,7 @@
 \item{...}{Parameters not used by the \code{ClassifyResult} method but passed to
 the \code{list} method.}
 
-\item{mode}{Default: "merge". Whether to merge all predictions of all
+\item{mode}{Default: \code{"merge"}. Whether to merge all predictions of all
 iterations of cross-validation into one set or keep them separate. Keeping
 them separate will cause separate ROC curves to be computed for each
 iteration and confidence intervals to be drawn with the solid line being the

new file mode 100644
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/crissCrossValidate.R
+\name{crissCrossPlot}
+\alias{crissCrossPlot}
+\title{A function to plot the output of the crissCrossValidate function.}
+\usage{
+crissCrossPlot(crissCrossResult, includeValues = FALSE)
+}
+\arguments{
+\item{crissCrossResult}{The output of the crissCrossValidate function.}
+
+\item{includeValues}{If TRUE, then the values of the matrix will be included in the plot.}
+}
+\description{
+This function has been designed to give a heatmap output of the crissCrossValidate function.
+}
+\author{
+Harry Robertson
+}

new file mode 100644
@@ -0,0 +1,64 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/crissCrossValidate.R
+\name{crissCrossValidate}
+\alias{crissCrossValidate}
+\title{A function to perform pairwise cross validation}
+\usage{
+crissCrossValidate(
+  measurements,
+  outcomes,
+  nFeatures = 20,
+  selectionMethod = "auto",
+  selectionOptimisation = "Resubstitution",
+  trainType = c("modelTrain", "modelTest"),
+  performanceType = "auto",
+  doRandomFeatures = FALSE,
+  classifier = "auto",
+  nFolds = 5,
+  nRepeats = 20,
+  nCores = 1
+)
+}
+\arguments{
+\item{measurements}{A \code{list} of either \code{\link{DataFrame}}, \code{\link{data.frame}} or \code{\link{matrix}} class measurements.}
+
+\item{outcomes}{A \code{list} of vectors that respectively correspond to outcomes of the samples in the \code{measurements} list.}
+
+\item{nFeatures}{The number of features to be used for modelling.}
+
+\item{selectionMethod}{Default: \code{"auto"}. A character keyword of the feature selection algorithm to be used. If \code{"auto"}, t-test (two categories) /
+F-test (three or more categories) ranking and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional
+hazards p-value.}
+
+\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
+
+\item{trainType}{Default: \code{"modelTrain"}. A keyword specifying whether a fully trained model is used to make predictions on the test
+set or if only the feature identifiers are chosen using the training data set and a number of training-predictions are made by cross-validation
+in the test set.}
+
+\item{performanceType}{Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
+options described in \code{\link{calcPerformance}} may be specified.}
+
+\item{doRandomFeatures}{Default: \code{FALSE}. Whether to perform random feature selection to establish a baseline performance. Either \code{FALSE} or \code{TRUE}
+are permitted values.}
+
+\item{classifier}{Default: \code{"auto"}. A character keyword of the modelling algorithm to be used. If \code{"auto"}, then a random forest is used
+for a classification task or a Cox proportional hazards model for a survival task.}
+
+\item{nFolds}{A numeric specifying the number of folds to use for cross-validation.}
+
+\item{nRepeats}{A numeric specifying the number of repeats or permutations to use for cross-validation.}
+
+\item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.}
+}
+\value{
+A list with elements \code{"real"} for the matrix of pairwise performance metrics using real
+feature selection, \code{"random"} if \code{doRandomFeatures} is \code{TRUE} for metrics of random selection and
+\code{"params"} for a list of parameters used during the execution of this function.
+}
+\description{
+This function has been designed to perform cross-validation and model prediction on datasets in a pairwise manner.
+}
+\author{
+Harry Robertson
+}

@@ -22,10 +22,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,

@@ -39,10 +39,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,

@@ -56,10 +56,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,

@@ -73,10 +73,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,

@@ -90,10 +90,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,

@@ -110,7 +110,9 @@ crossValidate(measurements, outcome, ...)
 \method{train}{DataFrame}(
   x,
   outcomeTrain,
-  classifier = "randomForest",
+  selectionMethod = "auto",
+  nFeatures = 20,
+  classifier = "auto",
   performanceType = "auto",
   multiViewMethod = "none",
   assayIDs = "all",

@@ -141,8 +143,8 @@ or assays. If a numeric vector these will be optimised over using \code{selectio
 a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over.
 Set to NULL or "all" if all features should be used.}
 
-\item{selectionMethod}{Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays,
-and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
+\item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays,
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
 and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.}
 
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}

@@ -5,10 +5,13 @@
 \alias{prepareData,matrix-method}
 \alias{prepareData,DataFrame-method}
 \alias{prepareData,MultiAssayExperiment-method}
+\alias{prepareData,data.frame-method}
 \title{Convert Different Data Classes into DataFrame and Filter Features}
 \usage{
 \S4method{prepareData}{matrix}(measurements, outcome, ...)
 
+\S4method{prepareData}{data.frame}(measurements, outcome, ...)
+
 \S4method{prepareData}{DataFrame}(
   measurements,
   outcome,

@@ -37,7 +37,7 @@
 \item{topRanked}{A sequence of thresholds of number of the best features to
 use for overlapping.}
 
-\item{comparison}{Default: within. The aspect of the experimental design to
+\item{comparison}{Default: \code{"within"}. The aspect of the experimental design to
 compare. Can be any characteristic that all results share or special value
 "within" to compared between all pairwise iterations of cross-validation.}
 

@@ -61,7 +61,7 @@ a matrix of pre-calculated metrics, for backwards compatibility.}
 \item{...}{Parameters not used by the \code{ClassifyResult} method that does
 list-packaging but used by the main \code{list} method.}
 
-\item{comparison}{Default: "auto". The aspect of the experimental
+\item{comparison}{Default: \code{"auto"}. The aspect of the experimental
 design to compare. Can be any characteristic that all results share.}
 
 \item{metric}{Default: \code{"auto"}. The name of the

@@ -37,7 +37,7 @@
 
 \item{...}{Not used by end user.}
 
-\item{comparison}{Default: within. The aspect of the experimental design to
+\item{comparison}{Default: \code{"within"}. The aspect of the experimental design to
 compare. Can be any characteristic that all results share or either one of
 the special values \code{"within"} to compare between all pairwise
 iterations of cross-validation. or \code{"size"}, to draw a bar chart of the