... | ... |
@@ -656,3 +656,26 @@ predict.dlda <- function(object, newdata, ...) { # Remove once sparsediscrim is |
656 | 656 |
.dmvnorm_diag <- function(x, mean, sigma) { # Remove once sparsediscrim is reinstated to CRAN. |
657 | 657 |
exp(sum(dnorm(x, mean=mean, sd=sqrt(sigma), log=TRUE))) |
658 | 658 |
} |
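A quick sanity check of the context function above (illustrative numbers only; sigma is a vector of variances): with a diagonal covariance matrix the multivariate normal density is just the product of the univariate densities, which is what the exp(sum(..., log = TRUE)) trick computes.

x <- c(1, 2); mu <- c(0, 0); variances <- c(1, 4)
.dmvnorm_diag(x, mu, variances)                   # density under the diagonal-covariance normal
prod(dnorm(x, mean = mu, sd = sqrt(variances)))   # same value, computed directly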
659 |
+ |
|
660 |
+# Function to create permutations of a vector, with the possibility to restrict values at certain positions. |
|
661 |
+# The 'fixed' parameter is a data frame whose first column gives a position and whose second column gives the value required at that position. |
|
662 |
+.permutations <- function(data, fixed = NULL) |
|
663 |
+{ |
|
664 |
+ items <- length(data) |
|
665 |
+ multipliedTo1 <- factorial(items) |
|
666 |
+ if(items > 1) |
|
667 |
+ permutations <- structure(vapply(seq_along(data), function(index) |
|
668 |
+ rbind(data[index], .permutations(data[-index])), |
|
669 |
+ data[rep(1L, multipliedTo1)]), dim = c(items, multipliedTo1)) |
|
670 |
+ else permutations <- data |
|
671 |
+ |
|
672 |
+ if(!is.null(fixed)) |
|
673 |
+ { |
|
674 |
+ for(rowIndex in seq_len(nrow(fixed))) |
|
675 |
+ { |
|
676 |
+ keepColumns <- permutations[fixed[rowIndex, 1], ] == fixed[rowIndex, 2] |
|
677 |
+ permutations <- permutations[, keepColumns] |
|
678 |
+ } |
|
679 |
+ } |
|
680 |
+ permutations |
|
681 |
+} |
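A hypothetical illustration of the new helper (values chosen only for this example): permuting 1:3 yields a 3 x 6 matrix with one permutation per column, and the fixed argument keeps only the columns satisfying the position/value constraint.

.permutations(1:3)   # columns: 123, 132, 213, 231, 312, 321
.permutations(1:3, fixed = data.frame(position = 1, value = 2))
# Only the permutations with 2 in the first position remain: 213 and 231.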
... | ... |
@@ -130,7 +130,7 @@ |
130 | 130 |
if(tuneCombo != "none") # Add real parameters before function call. |
131 | 131 |
paramList <- append(paramList, tuneCombo) |
132 | 132 |
if(attr(featureRanking, "name") == "randomSelection") |
133 |
- paramList <- append(paramList, nFeatures = topNfeatures) |
|
133 |
+ paramList <- append(paramList, list(nFeatures = topNfeatures)) |
|
134 | 134 |
do.call(featureRanking, paramList) |
135 | 135 |
}) |
136 | 136 |
|
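Why the list() wrapper matters (a minimal illustration, not part of the patch): append() only has the arguments x, values and after, so a bare nFeatures = ... is rejected as an unused argument, whereas wrapping it in list() appends a named element that do.call() later passes on by name.

paramList <- list(setting = "none")
append(paramList, list(nFeatures = 100))   # list with elements $setting and $nFeatures
try(append(paramList, nFeatures = 100))    # error: unused argument (nFeatures = 100)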
... | ... |
@@ -129,10 +129,12 @@ |
129 | 129 |
tuneCombo <- tuneCombosSelect[rowIndex, , drop = FALSE] |
130 | 130 |
if(tuneCombo != "none") # Add real parameters before function call. |
131 | 131 |
paramList <- append(paramList, tuneCombo) |
132 |
+ if(attr(featureRanking, "name") == "randomSelection") |
|
133 |
+ paramList <- append(paramList, nFeatures = topNfeatures) |
|
132 | 134 |
do.call(featureRanking, paramList) |
133 | 135 |
}) |
134 | 136 |
|
135 |
- if(attr(featureRanking, "name") %in% c("previousSelection", "Union Selection")) # Actually selection not ranking. |
|
137 |
+ if(attr(featureRanking, "name") %in% c("randomSelection", "previousSelection", "Union Selection")) # Actually selection not ranking. |
|
136 | 138 |
return(list(NULL, rankings[[1]], NULL)) |
137 | 139 |
|
138 | 140 |
if(crossValParams@tuneMode == "none") # No parameters to choose between. |
... | ... |
@@ -509,6 +511,8 @@ |
509 | 511 |
"KS" = KolmogorovSmirnovRanking, |
510 | 512 |
"KL" = KullbackLeiblerRanking, |
511 | 513 |
"CoxPH" = coxphRanking, |
514 |
+ "previousSelection" = previousSelection, |
|
515 |
+ "randomSelection" = randomSelection, |
|
512 | 516 |
"selectMulti" = selectMulti |
513 | 517 |
) |
514 | 518 |
} |
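Sketch of how the extended keyword lookup now behaves (keywords taken from the switch above):

.selectionKeywordToFunction("randomSelection")   # resolves to the randomSelection function
.selectionKeywordToFunction("none")              # NULL, so no feature selection is done
.selectionKeywordToFunction("notAKeyword")       # also NULL, because switch() has no default branch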
... | ... |
@@ -131,8 +131,8 @@ |
131 | 131 |
paramList <- append(paramList, tuneCombo) |
132 | 132 |
do.call(featureRanking, paramList) |
133 | 133 |
}) |
134 |
- |
|
135 |
- if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking. |
|
134 |
+ |
|
135 |
+ if(attr(featureRanking, "name") %in% c("previousSelection", "Union Selection")) # Actually selection not ranking. |
|
136 | 136 |
return(list(NULL, rankings[[1]], NULL)) |
137 | 137 |
|
138 | 138 |
if(crossValParams@tuneMode == "none") # No parameters to choose between. |
... | ... |
@@ -139,6 +139,12 @@ |
139 | 139 |
return(list(NULL, rankings[[1]], NULL)) |
140 | 140 |
|
141 | 141 |
tuneParamsTrain <- list(topN = topNfeatures) |
142 |
+ performanceIndex <- match("performanceType", names(modellingParams@trainParams@tuneParams)) |
|
143 |
+ if(!is.na(performanceIndex)) |
|
144 |
+ { |
|
145 |
+ performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]] |
|
146 |
+ modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-performanceIndex] |
|
147 |
+ } |
|
142 | 148 |
tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams) |
143 | 149 |
tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE) |
144 | 150 |
modellingParams@trainParams@tuneParams <- NULL |
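The reason performanceType is pulled out before the grid is built (illustration only): expand.grid() turns every remaining list element into a tuning dimension, so leaving the metric name in would add a spurious performanceType column to every row of tuneCombosTrain even though it is only an evaluation metric.

tuneParamsTrain <- list(topN = c(10, 50), cost = c(1, 10))
expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)   # 4 rows, only genuine tuning combinations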
... | ... |
@@ -140,9 +140,9 @@ |
140 | 140 |
|
141 | 141 |
tuneParamsTrain <- list(topN = topNfeatures) |
142 | 142 |
tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams) |
143 |
- tuneParamsTrain <- tuneParamsTrain[-match("performanceType", names(tuneParamsTrain))] |
|
144 | 143 |
tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE) |
145 | 144 |
modellingParams@trainParams@tuneParams <- NULL |
145 |
+ |
|
146 | 146 |
allPerformanceTables <- lapply(rankings, function(rankingsVariety) |
147 | 147 |
{ |
148 | 148 |
# Creates a matrix. Columns are top n features, rows are varieties (one row if None). |
... | ... |
@@ -167,7 +167,7 @@ |
167 | 167 |
result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain, |
168 | 168 |
crossValParams = NULL, modellingParams = modellingParams, |
169 | 169 |
verbose = verbose, .iteration = "internal") |
170 |
- |
|
170 |
+ |
|
171 | 171 |
predictions <- result[["predictions"]] |
172 | 172 |
# Classifiers will use a column "class" and survival models will use a column "risk". |
173 | 173 |
if(class(predictions) == "data.frame") |
... | ... |
@@ -222,6 +222,7 @@ |
222 | 222 |
measurementsTrain, outcomeTrain, |
223 | 223 |
crossValParams = NULL, modellingParams, |
224 | 224 |
verbose = verbose, .iteration = "internal") |
225 |
+ |
|
225 | 226 |
predictions <- result[["predictions"]] |
226 | 227 |
if(class(predictions) == "data.frame") |
227 | 228 |
predictedOutcome <- predictions[, "class"] |
... | ... |
@@ -275,7 +276,7 @@ |
275 | 276 |
result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain, |
276 | 277 |
crossValParams = NULL, modellingParams, |
277 | 278 |
verbose = verbose, .iteration = "internal") |
278 |
- |
|
279 |
+ |
|
279 | 280 |
predictions <- result[["predictions"]] |
280 | 281 |
if(class(predictions) == "data.frame") |
281 | 282 |
predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")] |
... | ... |
@@ -580,7 +581,8 @@ |
580 | 581 |
obj |
581 | 582 |
} |
582 | 583 |
|
583 |
-.predict <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN. |
|
584 |
+#' @method predict dlda |
|
585 |
+predict.dlda <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN. |
|
584 | 586 |
if (!inherits(object, "dlda")) { |
585 | 587 |
stop("object not of class 'dlda'") |
586 | 588 |
} |
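A note on the registration, assuming an @export tag accompanies @method in the same roxygen block: roxygen2 then emits the NAMESPACE directive

S3method(predict, dlda)

so that predict() dispatches to predict.dlda for objects of class "dlda", even though the function itself stays internal to the package.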
... | ... |
@@ -140,6 +140,7 @@ |
140 | 140 |
|
141 | 141 |
tuneParamsTrain <- list(topN = topNfeatures) |
142 | 142 |
tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams) |
143 |
+ tuneParamsTrain <- tuneParamsTrain[-match("performanceType", names(tuneParamsTrain))] |
|
143 | 144 |
tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE) |
144 | 145 |
modellingParams@trainParams@tuneParams <- NULL |
145 | 146 |
allPerformanceTables <- lapply(rankings, function(rankingsVariety) |
... | ... |
@@ -511,6 +512,7 @@ |
511 | 512 |
keyword, |
512 | 513 |
"randomForest" = RFparams(), |
513 | 514 |
"randomSurvivalForest" = RSFparams(), |
515 |
+ "XGB" = XGBparams(), |
|
514 | 516 |
"GLM" = GLMparams(), |
515 | 517 |
"elasticNetGLM" = elasticNetGLMparams(), |
516 | 518 |
"SVM" = SVMparams(), |
... | ... |
@@ -1,152 +1,3 @@ |
1 |
-# Operates on an input data frame, to extract the outcome column(s) and return |
|
2 |
-# a list with the table of covariates in one element and the outcome in another. |
|
3 |
-# The outcome need to be removed from the data table before predictor training! |
|
4 |
-.splitDataAndOutcome <- function(measurements, outcome, restrict = NULL) |
|
5 |
-{ # DataFrame's outcome variable can be character or factor, so it's a bit involved. |
|
6 |
- if(is.character(outcome) && length(outcome) > 3 && length(outcome) != nrow(measurements)) |
|
7 |
- stop("'outcome' is a character variable but has more than one element. Either provide a\n", |
|
8 |
- " one to three column names or a factor of the same length as the number of samples.") |
|
9 |
- |
|
10 |
- ## String specifies the name of a single outcome column, typically a class. |
|
11 |
- if(is.character(outcome) && length(outcome) == 1) |
|
12 |
- { |
|
13 |
- outcomeColumn <- match(outcome, colnames(measurements)) |
|
14 |
- if(is.na(outcomeColumn)) |
|
15 |
- stop("Specified column name of outcome is not present in the data table.") |
|
16 |
- outcome <- measurements[, outcomeColumn] |
|
17 |
- measurements <- measurements[, -outcomeColumn, drop = FALSE] |
|
18 |
- # R version 4 and greater no longer automatically casts character columns to factors because stringsAsFactors |
|
19 |
- # is FALSE by default, so it is more likely to be character format these days. Handle it. |
|
20 |
- if(class(outcome) != "factor") # Assume there will be no ordinary regression prediction tasks ... for now. |
|
21 |
- outcome <- factor(outcome) |
|
22 |
- } |
|
23 |
- |
|
24 |
- # survival's Surv constructor has two inputs for the popular right-censored data and |
|
25 |
- # three inputs for less-common interval data. |
|
26 |
- if(is.character(outcome) && length(outcome) %in% 2:3) |
|
27 |
- { |
|
28 |
- outcomeColumns <- match(outcome, colnames(measurements)) |
|
29 |
- if(any(is.na(outcomeColumns))) |
|
30 |
- stop("Specified column names of outcome is not present in the data table.") |
|
31 |
- outcome <- measurements[, outcomeColumns] |
|
32 |
- measurements <- measurements[, -outcomeColumns, drop = FALSE] |
|
33 |
- } |
|
34 |
- |
|
35 |
- if(is(outcome, "factor") && length(outcome) > 3 & length(outcome) < nrow(measurements)) |
|
36 |
- stop("The length of outcome is not equal to the number of samples.") |
|
37 |
- |
|
38 |
- ## A vector of characters was input by the user. Ensure that it is a factor. |
|
39 |
- if(is.character(outcome) & length(outcome) == nrow(measurements)) |
|
40 |
- outcome <- factor(outcome) |
|
41 |
- |
|
42 |
- # Outcome has columns, so it is tabular. It is inferred to represent survival data. |
|
43 |
- if(!is.null(ncol(outcome)) && ncol(outcome) %in% 2:3) |
|
44 |
- { |
|
45 |
- # Assume that event status is in the last column (second for two columns, third for three columns) |
|
46 |
- numberEventTypes <- length(unique(outcome[, ncol(outcome)])) |
|
47 |
- # Could be one or two kinds of events. All events might be uncensored or censored |
|
48 |
- # in a rare but not impossible scenario. |
|
49 |
- if(numberEventTypes > 2) |
|
50 |
- stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.") |
|
51 |
- |
|
52 |
- if(ncol(outcome) == 2) # Typical, right-censored survival data. |
|
53 |
- outcome <- survival::Surv(outcome[, 1], outcome[, 2]) |
|
54 |
- else # Three columns. Therefore, counting process data. |
|
55 |
- outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3]) |
|
56 |
- } |
|
57 |
- |
|
58 |
- if(!is.null(restrict)) |
|
59 |
- { |
|
60 |
- isDesiredClass <- sapply(measurements, function(column) is(column, restrict)) |
|
61 |
- measurements <- measurements[, isDesiredClass, drop = FALSE] |
|
62 |
- if(ncol(measurements) == 0) |
|
63 |
- stop(paste("No features are left after restricting to", restrict, "but at least one must be.")) |
|
64 |
- } |
|
65 |
- |
|
66 |
- list(measurements = measurements, outcome = outcome) |
|
67 |
-} |
|
68 |
- |
|
69 |
-# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it |
|
70 |
-# to be used in typical model building functions. |
|
71 |
-# Returns a list with a covariate table and and outcome vector/table, or just a covariate table |
|
72 |
-# in the case the input is a test data set. |
|
73 |
-.MAEtoWideTable <- function(measurements, targets = NULL, outcomeColumns = NULL, restrict = "numeric") |
|
74 |
-{ |
|
75 |
- if(is.null(targets)) |
|
76 |
- stop("'targets' is not specified but must be.") |
|
77 |
- if(is.null(outcomeColumns)) |
|
78 |
- stop("'outcomeColumns' is not specified but must be.") |
|
79 |
- if(!all(targets %in% c(names(measurements), "sampleInfo"))) |
|
80 |
- stop("Some table names in 'targets' are not assay names in 'measurements' or \"sampleInfo\".") |
|
81 |
- sampleInfoColumns <- colnames(MultiAssayExperiment::colData(measurements)) |
|
82 |
- if(!missing(outcomeColumns) & !all(outcomeColumns %in% sampleInfoColumns)) |
|
83 |
- stop("Not all column names specified by 'outcomeColumns' found in sample information table.") |
|
84 |
- |
|
85 |
- if("sampleInfo" %in% targets) |
|
86 |
- { |
|
87 |
- targets <- targets[targets != "sampleInfo"] |
|
88 |
- sampleInfoColumnsTrain <- sampleInfoColumns |
|
89 |
- } else { |
|
90 |
- sampleInfoColumnsTrain <- NULL |
|
91 |
- } |
|
92 |
- |
|
93 |
- if(length(targets) > 0) |
|
94 |
- { |
|
95 |
- measurements <- measurements[, , targets] |
|
96 |
- |
|
97 |
- # Get all desired measurements tables and sample information columns (other than the columns representing outcome). |
|
98 |
- # These form the independent variables to be used for making predictions with. |
|
99 |
- # Variable names will have names like RNA:BRAF for traceability. |
|
100 |
- dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(sampleInfoColumnsTrain, outcomeColumns), check.names = FALSE, collapse = ':') |
|
101 |
- rownames(dataTable) <- dataTable[, "primary"] |
|
102 |
- S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "sampleInfo", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
103 |
- colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
104 |
- |
|
105 |
- # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
106 |
- # Create a combined column named "feature" which has feature names of the assays as well as the sample information. |
|
107 |
- S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
108 |
- missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
109 |
- S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
110 |
- |
|
111 |
- # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
112 |
- S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
113 |
- } else { # Must have only been sample information data. |
|
114 |
- dataTable <- MultiAssayExperiment::colData(measurements) |
|
115 |
- } |
|
116 |
- if(!is.null(outcomeColumns)) outcome <- dataTable[, outcomeColumns] |
|
117 |
- |
|
118 |
- if(!is.null(restrict)) |
|
119 |
- { |
|
120 |
- isDesiredClass <- sapply(dataTable, function(column) is(column, restrict)) |
|
121 |
- dataTable <- dataTable[, isDesiredClass, drop = FALSE] |
|
122 |
- if(ncol(dataTable) == 0) |
|
123 |
- stop(paste("No features are left after restricting to", restrict, "but at least one must be.")) |
|
124 |
- } |
|
125 |
- |
|
126 |
- # Only return independent variables in dataTable for making classifications with. |
|
127 |
- # "primary" column is auto-generated by sample information table row names and a duplicate. |
|
128 |
- dropColumns <- na.omit(match(c("primary", outcomeColumns), colnames(dataTable))) |
|
129 |
- if(length(dropColumns) > 0) dataTable <- dataTable[, -dropColumns] |
|
130 |
- |
|
131 |
- # Training data table and outcome for training data. |
|
132 |
- if(!is.null(outcomeColumns)) |
|
133 |
- list(dataTable = dataTable, outcome = outcome) |
|
134 |
- else # Only test data table for test data input. |
|
135 |
- dataTable |
|
136 |
-} |
|
137 |
- |
|
138 |
-# For classifiers which use one single function for inputting a training and a testing table, |
|
139 |
-# and work only for the numeric data type, this checks whether the training and testing tables |
|
140 |
-# both have the same set of features and there are at least some numeric features to use, |
|
141 |
-# after they have been filtered by another function which splits the covariates and the outcome from the input. |
|
142 |
-.checkVariablesAndSame <- function(trainingMatrix, testingMatrix) |
|
143 |
-{ |
|
144 |
- if(ncol(trainingMatrix) == 0) # Filtering of table removed all columns, leaving nothing to classify with. |
|
145 |
- stop("No variables in data tables specified by \'targets\' are numeric.") |
|
146 |
- else if(ncol(trainingMatrix) != ncol(testingMatrix)) |
|
147 |
- stop("Training data set and testing data set contain differing numbers of features.") |
|
148 |
-} |
|
149 |
- |
|
150 | 1 |
# Creates two lists of lists. First has training samples, second has test samples for a range |
151 | 2 |
# of different cross-validation schemes. |
152 | 3 |
#' @import utils |
... | ... |
@@ -255,7 +106,6 @@ |
255 | 106 |
tuneParams <- modellingParams@selectParams@tuneParams |
256 | 107 |
performanceType <- tuneParams[["performanceType"]] |
257 | 108 |
topNfeatures <- tuneParams[["nFeatures"]] |
258 |
- tuneMode <- ifelse("tuneMode" %in% names(tuneParams), tuneParams[["tuneMode"]], crossValParams@tuneMode) |
|
259 | 109 |
tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))] # Only used as evaluation metric. |
260 | 110 |
|
261 | 111 |
# Make selectParams NULL, since we are currently doing selection and it shouldn't call |
... | ... |
@@ -282,17 +132,17 @@ |
282 | 132 |
do.call(featureRanking, paramList) |
283 | 133 |
}) |
284 | 134 |
|
285 |
- if(featureRanking@generic == "previousSelection") # Actually selection not ranking. |
|
135 |
+ if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking. |
|
286 | 136 |
return(list(NULL, rankings[[1]], NULL)) |
287 | 137 |
|
288 |
- if(tuneMode == "none") # Actually selection not ranking. |
|
138 |
+ if(crossValParams@tuneMode == "none") # No parameters to choose between. |
|
289 | 139 |
return(list(NULL, rankings[[1]], NULL)) |
290 | 140 |
|
291 | 141 |
tuneParamsTrain <- list(topN = topNfeatures) |
292 | 142 |
tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams) |
293 | 143 |
tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE) |
294 | 144 |
modellingParams@trainParams@tuneParams <- NULL |
295 |
- bestPerformers <- sapply(rankings, function(rankingsVariety) |
|
145 |
+ allPerformanceTables <- lapply(rankings, function(rankingsVariety) |
|
296 | 146 |
{ |
297 | 147 |
# Creates a matrix. Columns are top n features, rows are varieties (one row if None). |
298 | 148 |
performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex) |
... | ... |
@@ -332,20 +182,22 @@ |
332 | 182 |
}) |
333 | 183 |
|
334 | 184 |
bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1]) |
335 |
- c(bestOne, performances[bestOne]) |
|
185 |
+ list(data.frame(tuneCombosTrain, performance = performances), bestOne) |
|
336 | 186 |
}) |
337 | 187 |
|
338 |
- tunePick <- ifelse(betterValues == "lower", which.min(bestPerformers[2, ])[1], which.max(bestPerformers[2, ])[1]) |
|
188 |
+ tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"]) |
|
189 |
+ tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1]) |
|
339 | 190 |
|
340 | 191 |
if(verbose == 3) |
341 | 192 |
message("Features selected.") |
342 | 193 |
|
343 |
- tuneRow <- tuneCombosTrain[bestPerformers[1, tunePick], , drop = FALSE] |
|
344 |
- if(ncol(tuneRow) > 1) tuneDetails <- tuneRow[, -1, drop = FALSE] else tuneDetails <- NULL |
|
194 |
+ tuneDetails <- allPerformanceTables[[tunePick]] # List of length 2. |
|
345 | 195 |
|
346 | 196 |
rankingUse <- rankings[[tunePick]] |
347 |
- selectionIndices <- rankingUse[1:tuneRow[, "topN"]] |
|
197 |
+ selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])] |
|
348 | 198 |
|
199 |
+ names(tuneDetails) <- c("tuneCombinations", "bestIndex") |
|
200 |
+ colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType |
|
349 | 201 |
list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails) |
350 | 202 |
} else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection. |
351 | 203 |
featuresIndiciesLists <- mapply(function(selector, selParams) |
... | ... |
@@ -388,8 +240,7 @@ |
388 | 240 |
|
389 | 241 |
list(NULL, selectionIndices, NULL) |
390 | 242 |
} else { # Previous selection |
391 |
- selectedFeatures <- |
|
392 |
- list(NULL, selectionIndices, NULL) |
|
243 |
+ selectedFeatures <- list(NULL, selectionIndices, NULL) |
|
393 | 244 |
} |
394 | 245 |
} |
395 | 246 |
|
... | ... |
@@ -405,10 +256,10 @@ |
405 | 256 |
|
406 | 257 |
# Code to create a function call to a training function. Might also do training and testing |
407 | 258 |
# within the same function, so test samples are also passed in case they are needed. |
408 |
-.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose) |
|
259 |
+.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose) |
|
409 | 260 |
{ |
410 |
- tuneChosen <- NULL |
|
411 |
- if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams@tuneParams)) |
|
261 |
+ tuneDetails <- NULL |
|
262 |
+ if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams)) |
|
412 | 263 |
{ |
413 | 264 |
performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]] |
414 | 265 |
modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))] |
... | ... |
@@ -420,16 +271,16 @@ |
420 | 271 |
modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ])) |
421 | 272 |
if(crossValParams@tuneMode == "Resubstitution") |
422 | 273 |
{ |
423 |
- result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, |
|
274 |
+ result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain, |
|
424 | 275 |
crossValParams = NULL, modellingParams, |
425 | 276 |
verbose = verbose, .iteration = "internal") |
426 | 277 |
|
427 | 278 |
predictions <- result[["predictions"]] |
428 | 279 |
if(class(predictions) == "data.frame") |
429 |
- predictedOutcome <- predictions[, "outcome"] |
|
280 |
+ predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")] |
|
430 | 281 |
else |
431 | 282 |
predictedOutcome <- predictions |
432 |
- calcExternalPerformance(outcomeTest, predictedOutcome, performanceType) |
|
283 |
+ calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType) |
|
433 | 284 |
} else { |
434 | 285 |
result <- runTests(measurementsTrain, outcomeTrain, |
435 | 286 |
crossValParams, modellingParams, |
... | ... |
@@ -438,13 +289,18 @@ |
438 | 289 |
median(performances(result)[[performanceType]]) |
439 | 290 |
} |
440 | 291 |
}) |
292 |
+ allPerformanceTable <- data.frame(tuneCombos, performances) |
|
293 |
+ colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType |
|
294 |
+ |
|
441 | 295 |
betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"] |
442 | 296 |
bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1]) |
443 | 297 |
tuneChosen <- tuneCombos[bestOne, , drop = FALSE] |
298 |
+ tuneDetails <- list(tuneCombos, bestOne) |
|
299 |
+ names(tuneDetails) <- c("tuneCombinations", "bestIndex") |
|
444 | 300 |
modellingParams@trainParams@otherParams <- tuneChosen |
445 | 301 |
} |
446 | 302 |
|
447 |
- if(modellingParams@trainParams@classifier@generic != "previousTrained") |
|
303 |
+ if (!"previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")) |
|
448 | 304 |
# Don't name these first two variables. Some classifier functions might use classesTrain and others use outcomeTrain. |
449 | 305 |
paramList <- list(measurementsTrain, outcomeTrain) |
450 | 306 |
else # Don't pass the measurements and classes, because a pre-existing classifier is used. |
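The tuning record that .doTrain now hands back has the same two-element shape as the one produced during feature selection (illustrative values only; no hyper-parameter named cost is implied by the patch):

tuneDetails <- list(tuneCombinations = data.frame(cost = c(1, 10, 100)), bestIndex = 2)
tuneDetails$tuneCombinations[tuneDetails$bestIndex, , drop = FALSE]   # the combination used for the final model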
... | ... |
@@ -459,7 +315,7 @@ |
459 | 315 |
if(verbose >= 2) |
460 | 316 |
message("Training completed.") |
461 | 317 |
|
462 |
- list(model = trained, tune = tuneChosen) |
|
318 |
+ list(model = trained, tune = tuneDetails) |
|
463 | 319 |
} |
464 | 320 |
|
465 | 321 |
# Creates a function call to a prediction function. |
... | ... |
@@ -528,15 +384,6 @@ |
528 | 384 |
# by user-specified values. |
529 | 385 |
.filterCharacteristics <- function(characteristics, autoCharacteristics) |
530 | 386 |
{ |
531 |
- # Remove duplication of values for classifiers that have one function for training and |
|
532 |
- # one function for prediction. |
|
533 |
- if("Classifier Name" %in% autoCharacteristics[, "characteristic"] && "Predictor Name" %in% autoCharacteristics[, "characteristic"]) |
|
534 |
- { |
|
535 |
- classRow <- which(autoCharacteristics[, "characteristic"] == "Classifier Name") |
|
536 |
- predRow <- which(autoCharacteristics[, "characteristic"] == "Predictor Name") |
|
537 |
- if(autoCharacteristics[classRow, "value"] == autoCharacteristics[predRow, "value"]) |
|
538 |
- autoCharacteristics <- autoCharacteristics[-predRow, ] |
|
539 |
- } |
|
540 | 387 |
# Overwrite automatically-chosen names with user's names. |
541 | 388 |
if(nrow(autoCharacteristics) > 0 && nrow(characteristics) > 0) |
542 | 389 |
{ |
... | ... |
@@ -560,36 +407,6 @@ |
560 | 407 |
plotData |
561 | 408 |
} |
562 | 409 |
|
563 |
-# Summary of the features used and the total number of them, no matter if they are a simple type |
|
564 |
-# or something more complex like Pairs or feature sets. |
|
565 |
-.summaryFeatures <- function(measurements) |
|
566 |
-{ |
|
567 |
- # MultiAssayExperiment has feature details in mcols. |
|
568 |
- if(!is.null(S4Vectors::mcols(measurements))) |
|
569 |
- { |
|
570 |
- originalInfo <- S4Vectors::mcols(measurements) |
|
571 |
- featureNames <- S4Vectors::mcols(measurements)[, "feature"] |
|
572 |
- assays <- unique(S4Vectors::mcols(measurements)[, "assay"]) |
|
573 |
- renamedInfo <- S4Vectors::mcols(measurements) |
|
574 |
- renamedAssays <- paste("Assay", seq_along(assays), sep = '') |
|
575 |
- for(assay in assays) |
|
576 |
- { |
|
577 |
- rowsAssay <- which(renamedInfo[, "assay"] == assay) |
|
578 |
- renamedInfo[rowsAssay, "feature"] <- paste("Feature", seq_along(rowsAssay), sep = '') |
|
579 |
- renamedInfo[rowsAssay, "assay"] <- renamedAssays[match(assay, assays)] |
|
580 |
- } |
|
581 |
- featuresInfo <- S4Vectors::DataFrame(originalInfo, renamedInfo) |
|
582 |
- colnames(featuresInfo) <- c("Original Assay", "Original Feature", "Renamed Assay", "Renamed Feature") |
|
583 |
- featuresInfo <- cbind(originalInfo, featuresInfo) |
|
584 |
- } else { |
|
585 |
- originalFeatures <- colnames(measurements) |
|
586 |
- renamedInfo <- paste("Feature", seq_along(measurements), sep = '') |
|
587 |
- featuresInfo <- S4Vectors::DataFrame(originalFeatures, renamedInfo) |
|
588 |
- colnames(featuresInfo) <- c("Original Feature", "Renamed Feature") |
|
589 |
- } |
|
590 |
- featuresInfo |
|
591 |
-} |
|
592 |
- |
|
593 | 410 |
# Function to identify the parameters of an S4 method. |
594 | 411 |
.methodFormals <- function(f, signature) { |
595 | 412 |
tryCatch({ |
... | ... |
@@ -660,6 +477,53 @@ |
660 | 477 |
list(measurementsTrain = measurementsTrain, classesTrain = classesTrain) |
661 | 478 |
} |
662 | 479 |
|
480 |
+.transformKeywordToFunction <- function(keyword) |
|
481 |
+{ |
|
482 |
+ switch( |
|
483 |
+ keyword, |
|
484 |
+ "none" = NULL, |
|
485 |
+ "diffLoc" = subtractFromLocation |
|
486 |
+ ) |
|
487 |
+} |
|
488 |
+ |
|
489 |
+.selectionKeywordToFunction <- function(keyword) |
|
490 |
+{ |
|
491 |
+ switch( |
|
492 |
+ keyword, |
|
493 |
+ "none" = NULL, |
|
494 |
+ "t-test" = differentMeansRanking, |
|
495 |
+ "limma" = limmaRanking, |
|
496 |
+ "edgeR" = edgeRranking, |
|
497 |
+ "Bartlett" = bartlettRanking, |
|
498 |
+ "Levene" = leveneRanking, |
|
499 |
+ "DMD" = DMDranking, |
|
500 |
+ "likelihoodRatio" = likelihoodRatioRanking, |
|
501 |
+ "KS" = KolmogorovSmirnovRanking, |
|
502 |
+ "KL" = KullbackLeiblerRanking, |
|
503 |
+ "CoxPH" = coxphRanking, |
|
504 |
+ "selectMulti" = selectMulti |
|
505 |
+ ) |
|
506 |
+} |
|
507 |
+ |
|
508 |
+.classifierKeywordToParams <- function(keyword) |
|
509 |
+{ |
|
510 |
+ switch( |
|
511 |
+ keyword, |
|
512 |
+ "randomForest" = RFparams(), |
|
513 |
+ "randomSurvivalForest" = RSFparams(), |
|
514 |
+ "GLM" = GLMparams(), |
|
515 |
+ "elasticNetGLM" = elasticNetGLMparams(), |
|
516 |
+ "SVM" = SVMparams(), |
|
517 |
+ "NSC" = NSCparams(), |
|
518 |
+ "DLDA" = DLDAparams(), |
|
519 |
+ "naiveBayes" = naiveBayesParams(), |
|
520 |
+ "mixturesNormals" = mixModelsParams(), |
|
521 |
+ "kNN" = kNNparams(), |
|
522 |
+ "CoxPH" = coxphParams(), |
|
523 |
+ "CoxNet" = coxnetParams() |
|
524 |
+ ) |
|
525 |
+} |
|
526 |
+ |
|
663 | 527 |
.dlda <- function(x, y, prior = NULL){ # Remove this once sparsediscrim is reinstated to CRAN. |
664 | 528 |
obj <- list() |
665 | 529 |
obj$labels <- y |
... | ... |
@@ -1,3 +1,152 @@ |
1 |
+# Operates on an input data frame, to extract the outcome column(s) and return |
|
2 |
+# a list with the table of covariates in one element and the outcome in another. |
|
3 |
+# The outcome needs to be removed from the data table before predictor training! |
|
4 |
+.splitDataAndOutcome <- function(measurements, outcome, restrict = NULL) |
|
5 |
+{ # DataFrame's outcome variable can be character or factor, so it's a bit involved. |
|
6 |
+ if(is.character(outcome) && length(outcome) > 3 && length(outcome) != nrow(measurements)) |
|
7 |
+ stop("'outcome' is a character variable but has more than one element. Either provide a\n", |
|
8 |
+ " one to three column names or a factor of the same length as the number of samples.") |
|
9 |
+ |
|
10 |
+ ## String specifies the name of a single outcome column, typically a class. |
|
11 |
+ if(is.character(outcome) && length(outcome) == 1) |
|
12 |
+ { |
|
13 |
+ outcomeColumn <- match(outcome, colnames(measurements)) |
|
14 |
+ if(is.na(outcomeColumn)) |
|
15 |
+ stop("Specified column name of outcome is not present in the data table.") |
|
16 |
+ outcome <- measurements[, outcomeColumn] |
|
17 |
+ measurements <- measurements[, -outcomeColumn, drop = FALSE] |
|
18 |
+ # R version 4 and greater no longer automatically casts character columns to factors because stringsAsFactors |
|
19 |
+ # is FALSE by default, so it is more likely to be character format these days. Handle it. |
|
20 |
+ if(class(outcome) != "factor") # Assume there will be no ordinary regression prediction tasks ... for now. |
|
21 |
+ outcome <- factor(outcome) |
|
22 |
+ } |
|
23 |
+ |
|
24 |
+ # survival's Surv constructor has two inputs for the popular right-censored data and |
|
25 |
+ # three inputs for less-common interval data. |
|
26 |
+ if(is.character(outcome) && length(outcome) %in% 2:3) |
|
27 |
+ { |
|
28 |
+ outcomeColumns <- match(outcome, colnames(measurements)) |
|
29 |
+ if(any(is.na(outcomeColumns))) |
|
30 |
+ stop("Specified column names of outcome is not present in the data table.") |
|
31 |
+ outcome <- measurements[, outcomeColumns] |
|
32 |
+ measurements <- measurements[, -outcomeColumns, drop = FALSE] |
|
33 |
+ } |
|
34 |
+ |
|
35 |
+ if(is(outcome, "factor") && length(outcome) > 3 & length(outcome) < nrow(measurements)) |
|
36 |
+ stop("The length of outcome is not equal to the number of samples.") |
|
37 |
+ |
|
38 |
+ ## A vector of characters was input by the user. Ensure that it is a factor. |
|
39 |
+ if(is.character(outcome) & length(outcome) == nrow(measurements)) |
|
40 |
+ outcome <- factor(outcome) |
|
41 |
+ |
|
42 |
+ # Outcome has columns, so it is tabular. It is inferred to represent survival data. |
|
43 |
+ if(!is.null(ncol(outcome)) && ncol(outcome) %in% 2:3) |
|
44 |
+ { |
|
45 |
+ # Assume that event status is in the last column (second for two columns, third for three columns) |
|
46 |
+ numberEventTypes <- length(unique(outcome[, ncol(outcome)])) |
|
47 |
+ # Could be one or two kinds of events. All events might be uncensored or censored |
|
48 |
+ # in a rare but not impossible scenario. |
|
49 |
+ if(numberEventTypes > 2) |
|
50 |
+ stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.") |
|
51 |
+ |
|
52 |
+ if(ncol(outcome) == 2) # Typical, right-censored survival data. |
|
53 |
+ outcome <- survival::Surv(outcome[, 1], outcome[, 2]) |
|
54 |
+ else # Three columns. Therefore, counting process data. |
|
55 |
+ outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3]) |
|
56 |
+ } |
|
57 |
+ |
|
58 |
+ if(!is.null(restrict)) |
|
59 |
+ { |
|
60 |
+ isDesiredClass <- sapply(measurements, function(column) is(column, restrict)) |
|
61 |
+ measurements <- measurements[, isDesiredClass, drop = FALSE] |
|
62 |
+ if(ncol(measurements) == 0) |
|
63 |
+ stop(paste("No features are left after restricting to", restrict, "but at least one must be.")) |
|
64 |
+ } |
|
65 |
+ |
|
66 |
+ list(measurements = measurements, outcome = outcome) |
|
67 |
+} |
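A hypothetical walk-through of .splitDataAndOutcome on a small table (the column names are invented for this example):

df <- data.frame(gene1 = rnorm(10), gene2 = rnorm(10),
                 class = rep(c("Healthy", "Disease"), 5))
split <- .splitDataAndOutcome(df, "class")
dim(split$measurements)   # 10 x 2: the outcome column has been removed from the covariates
levels(split$outcome)     # "Disease" "Healthy": the character outcome was coerced to a factor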
|
68 |
+ |
|
69 |
+# Function to convert a MultiAssayExperiment object into a flat DataFrame table, to enable it |
|
70 |
+# to be used in typical model building functions. |
|
71 |
+# Returns a list with a covariate table and an outcome vector/table, or just a covariate table |
|
72 |
+# in the case the input is a test data set. |
|
73 |
+.MAEtoWideTable <- function(measurements, targets = NULL, outcomeColumns = NULL, restrict = "numeric") |
|
74 |
+{ |
|
75 |
+ if(is.null(targets)) |
|
76 |
+ stop("'targets' is not specified but must be.") |
|
77 |
+ if(is.null(outcomeColumns)) |
|
78 |
+ stop("'outcomeColumns' is not specified but must be.") |
|
79 |
+ if(!all(targets %in% c(names(measurements), "sampleInfo"))) |
|
80 |
+ stop("Some table names in 'targets' are not assay names in 'measurements' or \"sampleInfo\".") |
|
81 |
+ sampleInfoColumns <- colnames(MultiAssayExperiment::colData(measurements)) |
|
82 |
+ if(!missing(outcomeColumns) & !all(outcomeColumns %in% sampleInfoColumns)) |
|
83 |
+ stop("Not all column names specified by 'outcomeColumns' found in sample information table.") |
|
84 |
+ |
|
85 |
+ if("sampleInfo" %in% targets) |
|
86 |
+ { |
|
87 |
+ targets <- targets[targets != "sampleInfo"] |
|
88 |
+ sampleInfoColumnsTrain <- sampleInfoColumns |
|
89 |
+ } else { |
|
90 |
+ sampleInfoColumnsTrain <- NULL |
|
91 |
+ } |
|
92 |
+ |
|
93 |
+ if(length(targets) > 0) |
|
94 |
+ { |
|
95 |
+ measurements <- measurements[, , targets] |
|
96 |
+ |
|
97 |
+ # Get all desired measurements tables and sample information columns (other than the columns representing outcome). |
|
98 |
+ # These form the independent variables to be used for making predictions with. |
|
99 |
+ # Variable names will have names like RNA:BRAF for traceability. |
|
100 |
+ dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(sampleInfoColumnsTrain, outcomeColumns), check.names = FALSE, collapse = ':') |
|
101 |
+ rownames(dataTable) <- dataTable[, "primary"] |
|
102 |
+ S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "sampleInfo", S4Vectors::mcols(dataTable)[, "sourceName"]) |
|
103 |
+ colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
|
104 |
+ |
|
105 |
+ # Sample information variable names not included in column metadata of wide table but only as row names of it. |
|
106 |
+ # Create a combined column named "feature" which has feature names of the assays as well as the sample information. |
|
107 |
+ S4Vectors::mcols(dataTable)[, "feature"] <- as.character(S4Vectors::mcols(dataTable)[, "rowname"]) |
|
108 |
+ missingIndices <- is.na(S4Vectors::mcols(dataTable)[, "feature"]) |
|
109 |
+ S4Vectors::mcols(dataTable)[missingIndices, "feature"] <- colnames(dataTable)[missingIndices] |
|
110 |
+ |
|
111 |
+ # Finally, a column annotation recording variable name and which table it originated from for all of the source tables. |
|
112 |
+ S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")] |
|
113 |
+ } else { # Must have only been sample information data. |
|
114 |
+ dataTable <- MultiAssayExperiment::colData(measurements) |
|
115 |
+ } |
|
116 |
+ if(!is.null(outcomeColumns)) outcome <- dataTable[, outcomeColumns] |
|
117 |
+ |
|
118 |
+ if(!is.null(restrict)) |
|
119 |
+ { |
|
120 |
+ isDesiredClass <- sapply(dataTable, function(column) is(column, restrict)) |
|
121 |
+ dataTable <- dataTable[, isDesiredClass, drop = FALSE] |
|
122 |
+ if(ncol(dataTable) == 0) |
|
123 |
+ stop(paste("No features are left after restricting to", restrict, "but at least one must be.")) |
|
124 |
+ } |
|
125 |
+ |
|
126 |
+ # Only return independent variables in dataTable for making classifications with. |
|
127 |
+ # "primary" column is auto-generated by sample information table row names and a duplicate. |
|
128 |
+ dropColumns <- na.omit(match(c("primary", outcomeColumns), colnames(dataTable))) |
|
129 |
+ if(length(dropColumns) > 0) dataTable <- dataTable[, -dropColumns] |
|
130 |
+ |
|
131 |
+ # Training data table and outcome for training data. |
|
132 |
+ if(!is.null(outcomeColumns)) |
|
133 |
+ list(dataTable = dataTable, outcome = outcome) |
|
134 |
+ else # Only test data table for test data input. |
|
135 |
+ dataTable |
|
136 |
+} |
|
137 |
+ |
|
138 |
+# For classifiers which use one single function for inputting a training and a testing table, |
|
139 |
+# and work only for the numeric data type, this checks whether the training and testing tables |
|
140 |
+# both have the same set of features and there are at least some numeric features to use, |
|
141 |
+# after they have been filtered by another function which splits the covariates and the outcome from the input. |
|
142 |
+.checkVariablesAndSame <- function(trainingMatrix, testingMatrix) |
|
143 |
+{ |
|
144 |
+ if(ncol(trainingMatrix) == 0) # Filtering of table removed all columns, leaving nothing to classify with. |
|
145 |
+ stop("No variables in data tables specified by \'targets\' are numeric.") |
|
146 |
+ else if(ncol(trainingMatrix) != ncol(testingMatrix)) |
|
147 |
+ stop("Training data set and testing data set contain differing numbers of features.") |
|
148 |
+} |
|
149 |
+ |
|
1 | 150 |
# Creates two lists of lists. First has training samples, second has test samples for a range |
2 | 151 |
# of different cross-validation schemes. |
3 | 152 |
#' @import utils |
... | ... |
@@ -106,6 +255,7 @@ |
106 | 255 |
tuneParams <- modellingParams@selectParams@tuneParams |
107 | 256 |
performanceType <- tuneParams[["performanceType"]] |
108 | 257 |
topNfeatures <- tuneParams[["nFeatures"]] |
258 |
+ tuneMode <- ifelse("tuneMode" %in% names(tuneParams), tuneParams[["tuneMode"]], crossValParams@tuneMode) |
|
109 | 259 |
tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))] # Only used as evaluation metric. |
110 | 260 |
|
111 | 261 |
# Make selectParams NULL, since we are currently doing selection and it shouldn't call |
... | ... |
@@ -132,17 +282,17 @@ |
132 | 282 |
do.call(featureRanking, paramList) |
133 | 283 |
}) |
134 | 284 |
|
135 |
- if(attr(featureRanking, "name") == "previousSelection") # Actually selection not ranking. |
|
285 |
+ if(featureRanking@generic == "previousSelection") # Actually selection not ranking. |
|
136 | 286 |
return(list(NULL, rankings[[1]], NULL)) |
137 | 287 |
|
138 |
- if(crossValParams@tuneMode == "none") # No parameters to choose between. |
|
288 |
+ if(tuneMode == "none") # Actually selection not ranking. |
|
139 | 289 |
return(list(NULL, rankings[[1]], NULL)) |
140 | 290 |
|
141 | 291 |
tuneParamsTrain <- list(topN = topNfeatures) |
142 | 292 |
tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams) |
143 | 293 |
tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE) |
144 | 294 |
modellingParams@trainParams@tuneParams <- NULL |
145 |
- allPerformanceTables <- lapply(rankings, function(rankingsVariety) |
|
295 |
+ bestPerformers <- sapply(rankings, function(rankingsVariety) |
|
146 | 296 |
{ |
147 | 297 |
# Creates a matrix. Columns are top n features, rows are varieties (one row if None). |
148 | 298 |
performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex) |
... | ... |
@@ -182,22 +332,20 @@ |
182 | 332 |
}) |
183 | 333 |
|
184 | 334 |
bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1]) |
185 |
- list(data.frame(tuneCombosTrain, performance = performances), bestOne) |
|
335 |
+ c(bestOne, performances[bestOne]) |
|
186 | 336 |
}) |
187 | 337 |
|
188 |
- tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"]) |
|
189 |
- tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1]) |
|
338 |
+ tunePick <- ifelse(betterValues == "lower", which.min(bestPerformers[2, ])[1], which.max(bestPerformers[2, ])[1]) |
|
190 | 339 |
|
191 | 340 |
if(verbose == 3) |
192 | 341 |
message("Features selected.") |
193 | 342 |
|
194 |
- tuneDetails <- allPerformanceTables[[tunePick]] # List of length 2. |
|
343 |
+ tuneRow <- tuneCombosTrain[bestPerformers[1, tunePick], , drop = FALSE] |
|
344 |
+ if(ncol(tuneRow) > 1) tuneDetails <- tuneRow[, -1, drop = FALSE] else tuneDetails <- NULL |
|
195 | 345 |
|
196 | 346 |
rankingUse <- rankings[[tunePick]] |
197 |
- selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])] |
|
347 |
+ selectionIndices <- rankingUse[1:tuneRow[, "topN"]] |
|
198 | 348 |
|
199 |
- names(tuneDetails) <- c("tuneCombinations", "bestIndex") |
|
200 |
- colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType |
|
201 | 349 |
list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails) |
202 | 350 |
} else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection. |
203 | 351 |
featuresIndiciesLists <- mapply(function(selector, selParams) |
... | ... |
@@ -240,7 +388,8 @@ |
240 | 388 |
|
241 | 389 |
list(NULL, selectionIndices, NULL) |
242 | 390 |
} else { # Previous selection |
243 |
- selectedFeatures <- list(NULL, selectionIndices, NULL) |
|
391 |
+ selectedFeatures <- |
|
392 |
+ list(NULL, selectionIndices, NULL) |
|
244 | 393 |
} |
245 | 394 |
} |
246 | 395 |
|
... | ... |
@@ -256,10 +405,10 @@ |
256 | 405 |
|
257 | 406 |
# Code to create a function call to a training function. Might also do training and testing |
258 | 407 |
# within the same function, so test samples are also passed in case they are needed. |
259 |
-.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose) |
|
408 |
+.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, modellingParams, verbose) |
|
260 | 409 |
{ |
261 |
- tuneDetails <- NULL |
|
262 |
- if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams)) |
|
410 |
+ tuneChosen <- NULL |
|
411 |
+ if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams@tuneParams)) |
|
263 | 412 |
{ |
264 | 413 |
performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]] |
265 | 414 |
modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))] |
... | ... |
@@ -271,16 +420,16 @@ |
271 | 420 |
modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ])) |
272 | 421 |
if(crossValParams@tuneMode == "Resubstitution") |
273 | 422 |
{ |
274 |
- result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain, |
|
423 |
+ result <- runTest(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, |
|
275 | 424 |
crossValParams = NULL, modellingParams, |
276 | 425 |
verbose = verbose, .iteration = "internal") |
277 | 426 |
|
278 | 427 |
predictions <- result[["predictions"]] |
279 | 428 |
if(class(predictions) == "data.frame") |
280 |
- predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")] |
|
429 |
+ predictedOutcome <- predictions[, "outcome"] |
|
281 | 430 |
else |
282 | 431 |
predictedOutcome <- predictions |
283 |
- calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType) |
|
432 |
+ calcExternalPerformance(outcomeTest, predictedOutcome, performanceType) |
|
284 | 433 |
} else { |
285 | 434 |
result <- runTests(measurementsTrain, outcomeTrain, |
286 | 435 |
crossValParams, modellingParams, |
... | ... |
@@ -289,18 +438,13 @@ |
289 | 438 |
median(performances(result)[[performanceType]]) |
290 | 439 |
} |
291 | 440 |
}) |
292 |
- allPerformanceTable <- data.frame(tuneCombos, performances) |
|
293 |
- colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType |
|
294 |
- |
|
295 | 441 |
betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"] |
296 | 442 |
bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1]) |
297 | 443 |
tuneChosen <- tuneCombos[bestOne, , drop = FALSE] |
298 |
- tuneDetails <- list(tuneCombos, bestOne) |
|
299 |
- names(tuneDetails) <- c("tuneCombinations", "bestIndex") |
|
300 | 444 |
modellingParams@trainParams@otherParams <- tuneChosen |
301 | 445 |
} |
302 | 446 |
|
303 |
- if (!"previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")) |
|
447 |
+ if(modellingParams@trainParams@classifier@generic != "previousTrained") |
|
304 | 448 |
# Don't name these first two variables. Some classifier functions might use classesTrain and others use outcomeTrain. |
305 | 449 |
paramList <- list(measurementsTrain, outcomeTrain) |
306 | 450 |
else # Don't pass the measurements and classes, because a pre-existing classifier is used. |
... | ... |
@@ -315,7 +459,7 @@ |
315 | 459 |
if(verbose >= 2) |
316 | 460 |
message("Training completed.") |
317 | 461 |
|
318 |
- list(model = trained, tune = tuneDetails) |
|
462 |
+ list(model = trained, tune = tuneChosen) |
|
319 | 463 |
} |
320 | 464 |
|
321 | 465 |
# Creates a function call to a prediction function. |
... | ... |
@@ -384,6 +528,15 @@ |
384 | 528 |
# by user-specified values. |
385 | 529 |
.filterCharacteristics <- function(characteristics, autoCharacteristics) |
386 | 530 |
{ |
531 |
+ # Remove duplication of values for classifiers that have one function for training and |
|
532 |
+ # one function for prediction. |
|
533 |
+ if("Classifier Name" %in% autoCharacteristics[, "characteristic"] && "Predictor Name" %in% autoCharacteristics[, "characteristic"]) |
|
534 |
+ { |
|
535 |
+ classRow <- which(autoCharacteristics[, "characteristic"] == "Classifier Name") |
|
536 |
+ predRow <- which(autoCharacteristics[, "characteristic"] == "Predictor Name") |
|
537 |
+ if(autoCharacteristics[classRow, "value"] == autoCharacteristics[predRow, "value"]) |
|
538 |
+ autoCharacteristics <- autoCharacteristics[-predRow, ] |
|
539 |
+ } |
|
387 | 540 |
# Overwrite automatically-chosen names with user's names. |
388 | 541 |
if(nrow(autoCharacteristics) > 0 && nrow(characteristics) > 0) |
389 | 542 |
{ |
... | ... |
@@ -407,6 +560,36 @@ |
407 | 560 |
plotData |
408 | 561 |
} |
409 | 562 |
|
563 |
+# Summary of the features used and the total number of them, no matter if they are a simple type |
|
564 |
+# or something more complex like Pairs or feature sets. |
|
565 |
+.summaryFeatures <- function(measurements) |
|
566 |
+{ |
|
567 |
+ # MultiAssayExperiment has feature details in mcols. |
|
568 |
+ if(!is.null(S4Vectors::mcols(measurements))) |
|
569 |
+ { |
|
570 |
+ originalInfo <- S4Vectors::mcols(measurements) |
|
571 |
+ featureNames <- S4Vectors::mcols(measurements)[, "feature"] |
|
572 |
+ assays <- unique(S4Vectors::mcols(measurements)[, "assay"]) |
|
573 |
+ renamedInfo <- S4Vectors::mcols(measurements) |
|
574 |
+ renamedAssays <- paste("Assay", seq_along(assays), sep = '') |
|
575 |
+ for(assay in assays) |
|
576 |
+ { |
|
577 |
+ rowsAssay <- which(renamedInfo[, "assay"] == assay) |
|
578 |
+ renamedInfo[rowsAssay, "feature"] <- paste("Feature", seq_along(rowsAssay), sep = '') |
|
579 |
+ renamedInfo[rowsAssay, "assay"] <- renamedAssays[match(assay, assays)] |
|
580 |
+ } |
|
581 |
+ featuresInfo <- S4Vectors::DataFrame(originalInfo, renamedInfo) |
|
582 |
+ colnames(featuresInfo) <- c("Original Assay", "Original Feature", "Renamed Assay", "Renamed Feature") |
|
583 |
+ featuresInfo <- cbind(originalInfo, featuresInfo) |
|
584 |
+ } else { |
|
585 |
+ originalFeatures <- colnames(measurements) |
|
586 |
+ renamedInfo <- paste("Feature", seq_along(measurements), sep = '') |
|
587 |
+ featuresInfo <- S4Vectors::DataFrame(originalFeatures, renamedInfo) |
|
588 |
+ colnames(featuresInfo) <- c("Original Feature", "Renamed Feature") |
|
589 |
+ } |
|
590 |
+ featuresInfo |
|
591 |
+} |
|
592 |
+ |
|
410 | 593 |
# Function to identify the parameters of an S4 method. |
411 | 594 |
.methodFormals <- function(f, signature) { |
412 | 595 |
tryCatch({ |
... | ... |
@@ -477,53 +660,6 @@ |
477 | 660 |
list(measurementsTrain = measurementsTrain, classesTrain = classesTrain) |
478 | 661 |
} |
479 | 662 |
|
480 |
-.transformKeywordToFunction <- function(keyword) |
|
481 |
-{ |
|
482 |
- switch( |
|
483 |
- keyword, |
|
484 |
- "none" = NULL, |
|
485 |
- "diffLoc" = subtractFromLocation |
|
486 |
- ) |
|
487 |
-} |
|
488 |
- |
|
489 |
-.selectionKeywordToFunction <- function(keyword) |
|
490 |