... | ... |
@@ -32,7 +32,7 @@ |
32 | 32 |
#' Set to NULL or "all" if all features should be used. |
33 | 33 |
#' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
34 | 34 |
#' and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking |
35 |
-#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no |
|
35 |
+#' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no |
|
36 | 36 |
#' indepedent feature selection will be performed (but implicit selection might still happen with the classifier). |
37 | 37 |
#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}. |
38 | 38 |
#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the |
... | ... |
@@ -114,7 +114,7 @@ setMethod("crossValidate", "DataFrame", |
114 | 114 |
|
115 | 115 |
{ |
116 | 116 |
# Check that data is in the right format, if not already done for MultiAssayExperiment input. |
117 |
- if(!"assay" %in% S4Vectors::mcols(measurements)) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. |
|
117 |
+ if(!"assay" %in% colnames(S4Vectors::mcols(measurements))) # Assay is put there by prepareData for MultiAssayExperiment, skip if present. |
|
118 | 118 |
{ |
119 | 119 |
prepParams <- list(measurements, outcome, clinicalPredictors) |
120 | 120 |
if("prepare" %in% names(extraParams)) |
... | ... |
@@ -594,7 +594,6 @@ generateModellingParams <- function(assayIDs, |
594 | 594 |
} |
595 | 595 |
|
596 | 596 |
selectionMethod <- unlist(selectionMethod) |
597 |
- if(is.null(selectionMethod)) selectionMethod <- "none" |
|
598 | 597 |
|
599 | 598 |
if(selectionMethod != "none") |
600 | 599 |
{ |
... | ... |
@@ -6,20 +6,26 @@ elasticNetGLMtrainInterface <- function(measurementsTrain, classesTrain, lambda |
6 | 6 |
stop("The package 'glmnet' could not be found. Please install it.") |
7 | 7 |
if(verbose == 3) |
8 | 8 |
message("Fitting elastic net regularised GLM classifier to data.") |
9 |
- |
|
10 |
- measurementsMatrix <- glmnet::makeX(as(measurementsTrain, "data.frame")) |
|
11 | 9 |
|
12 |
- fitted <- glmnet::glmnet(measurementsMatrix, classesTrain, family = "multinomial", ...) |
|
10 |
+ # One-hot encoding needed. |
|
11 |
+ measurementsTrain <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain) |
|
12 |
+ fitted <- glmnet::glmnet(measurementsTrain, classesTrain, family = "multinomial", ...) |
|
13 |
+ |
|
13 | 14 |
|
14 | 15 |
if(is.null(lambda)) # fitted has numerous models for automatically chosen lambda values. |
15 |
- { # Pick one lambda based on resubstitution performance. |
|
16 |
- bestLambda <- fitted[["lambda"]][which.min(sapply(fitted[["lambda"]], function(lambda) # Largest Lambda with minimum balanced error rate. |
|
16 |
+ { # Pick one lambda based on resubstitution performance. But not the one that makes all variables excluded from model. |
|
17 |
+ lambdaConsider <- colSums(as.matrix(fitted[["beta"]][[1]])) != 0 |
|
18 |
+ bestLambda <- fitted[["lambda"]][lambdaConsider][which.min(sapply(fitted[["lambda"]][lambdaConsider], function(lambda) # Largest Lambda with minimum balanced error rate. |
|
17 | 19 |
{ |
18 |
- classPredictions <- factor(as.character(predict(fitted, measurementsMatrix, s = lambda, type = "class")), levels = fitted[["classnames"]]) |
|
20 |
+ classPredictions <- factor(as.character(predict(fitted, measurementsTrain, s = lambda, type = "class")), levels = fitted[["classnames"]]) |
|
19 | 21 |
calcExternalPerformance(classesTrain, classPredictions, "Balanced Error") |
20 | 22 |
}))[1]] |
21 | 23 |
attr(fitted, "tune") <- list(lambda = bestLambda) |
22 | 24 |
} |
25 |
+ |
|
26 |
+ attr(fitted, "featureNames") <- colnames(measurementsTrain) |
|
27 |
+ attr(fitted, "featureGroups") <- measurementsTrain@assign |
|
28 |
+ |
|
23 | 29 |
fitted |
24 | 30 |
} |
25 | 31 |
attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface" |
... | ... |
@@ -28,6 +34,8 @@ attr(elasticNetGLMtrainInterface, "name") <- "elasticNetGLMtrainInterface" |
28 | 34 |
elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., returnType = c("both", "class", "score"), verbose = 3) |
29 | 35 |
{ # ... just consumes emitted tuning variables from .doTrain which are unused. |
30 | 36 |
returnType <- match.arg(returnType) |
37 |
+ # One-hot encoding needed. |
|
38 |
+ measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest) |
|
31 | 39 |
|
32 | 40 |
# Ensure that testing data has same columns names in same order as training data. |
33 | 41 |
# Remove those annoying backquotes which glmnet adds if variables have spaces in names. |
... | ... |
@@ -41,11 +49,11 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., |
41 | 49 |
if(missing(lambda)) # Tuning parameters are not passed to prediction functions. |
42 | 50 |
lambda <- attr(model, "tune")[["lambda"]] # Sneak it in as an attribute on the model. |
43 | 51 |
|
44 |
- testMatrix <- glmnet::makeX(as(measurementsTest, "data.frame")) |
|
45 |
- testMatrix <- testMatrix[, rownames(model[["beta"]][[1]])] |
|
46 | 52 |
|
47 |
- classPredictions <- factor(as.character(predict(model, testMatrix, s = lambda, type = "class")), levels = model[["classnames"]]) |
|
48 |
- classScores <- predict(model, testMatrix, s = lambda, type = "response")[, , 1] |
|
53 |
+ measurementsTest <- measurementsTest[, rownames(model[["beta"]][[1]])] |
|
54 |
+ |
|
55 |
+ classPredictions <- factor(as.character(predict(model, measurementsTest, s = lambda, type = "class")), levels = model[["classnames"]]) |
|
56 |
+ classScores <- predict(model, measurementsTest, s = lambda, type = "response")[, , 1] |
|
49 | 57 |
|
50 | 58 |
if(is.matrix(classScores)) |
51 | 59 |
classScores <- classScores[, model[["classnames"]]] |
... | ... |
@@ -61,16 +69,20 @@ elasticNetGLMpredictInterface <- function(model, measurementsTest, lambda, ..., |
61 | 69 |
# |
62 | 70 |
# Get selected features (i.e. non-zero model coefficients) |
63 | 71 |
# |
72 |
+# Note: Need to convert back to actual features when factors were expanded into |
|
73 |
+# multiple columns using indicator variables. |
|
74 |
+# |
|
64 | 75 |
################################################################################ |
65 | 76 |
|
66 | 77 |
elasticNetFeatures <- function(model) |
67 | 78 |
{ |
68 |
- inputFeatures <- rownames(model[["beta"]][[1]]) |
|
69 | 79 |
# Floating point numbers test for equality. |
70 | 80 |
whichCoefficientColumn <- which(abs(model[["lambda"]] - attr(model, "tune")[["lambda"]]) < 0.00001)[1] |
71 | 81 |
coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn]) |
72 | 82 |
featureScores <- rowSums(abs(coefficientsUsed)) |
73 |
- rankedFeaturesIndices <- order(featureScores, decreasing = TRUE) |
|
74 |
- selectedFeaturesIndices <- which(featureScores != 0) |
|
83 |
+ featureGroups <- attr(model, "featureGroups")[match(names(featureScores), attr(model, "featureNames"))] |
|
84 |
+ groupScores <- unname(by(featureScores, featureGroups, max)) |
|
85 |
+ rankedFeaturesIndices <- order(groupScores, decreasing = TRUE) |
|
86 |
+ selectedFeaturesIndices <- which(groupScores != 0) |
|
75 | 87 |
list(rankedFeaturesIndices, selectedFeaturesIndices) |
76 | 88 |
} |
77 | 89 |
\ No newline at end of file |
... | ... |
@@ -5,43 +5,28 @@ setGeneric("precisionPathwayTrain", function(measurements, class, ...) |
5 | 5 |
|
6 | 6 |
#' @rdname precisionPathwayTrain |
7 | 7 |
#' @export |
8 |
-setMethod("precisionPathwayTrain", "MultiAssayExperiment", |
|
9 |
- function(measurements, class, clinicalPredictors = NULL, ..., fixedAssays = "clinical", |
|
10 |
- confidenceCutoff = 0.8, minAssaySamples = 10, |
|
11 |
- nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))), |
|
8 |
+setMethod("precisionPathwayTrain", "MultiAssayExperimentOrList", |
|
9 |
+ function(measurements, class, clinicalPredictors = NULL, maxMissingProp = 0.0, topNvariance = NULL, |
|
10 |
+ fixedAssays = "clinical", confidenceCutoff = 0.8, minAssaySamples = 10, |
|
11 |
+ nFeatures = 20, selectionMethod = setNames(c("none", rep("t-test", length(measurements))), c("clinical", names(measurements))), |
|
12 | 12 |
classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |
13 | 13 |
nFolds = 5, nRepeats = 20, nCores = 1) |
14 | 14 |
{ |
15 |
- prepArgs <- list(measurements, class, clinicalPredictors) |
|
16 |
- extraInputs <- list(...) |
|
17 |
- prepExtras <- numeric() |
|
18 |
- if(length(extraInputs) > 0) |
|
19 |
- prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]]) |
|
20 |
- if(length(prepExtras) > 0) |
|
21 |
- prepArgs <- append(prepArgs, extraInputs[prepExtras]) |
|
22 |
- measurementsAndClass <- do.call(prepareData, prepArgs) |
|
23 |
- |
|
24 |
- .precisionPathwayTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]], |
|
25 |
- fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff, |
|
26 |
- minAssaySamples = minAssaySamples, nFeatures = nFeatures, |
|
27 |
- selectionMethod = selectionMethod, classifier = classifier, |
|
28 |
- nFolds = nFolds, nRepeats = nRepeats, nCores = nCores) |
|
29 |
- }) |
|
30 |
- |
|
31 |
-#' @rdname precisionPathwayTrain |
|
32 |
-#' @export |
|
33 |
-setMethod("precisionPathwayTrain", "list", |
|
34 |
- function(measurements, class, clinicalPredictors = NULL, fixedAssays = "clinical", |
|
35 |
- confidenceCutoff = 0.8, minAssaySamples = 10, |
|
36 |
- nFeatures = 20, selectionMethod = setNames(c(NULL, rep("t-test", length(measurements))), c("clinical", names(measurements))), |
|
37 |
- classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |
|
38 |
- nFolds = 5, nRepeats = 20, nCores = 1, ...) |
|
39 |
- { |
|
40 |
- # One of the tables must be named "clinical". |
|
41 |
- if (!any(names(measurements) == "clinical")) |
|
42 |
- stop("One of the tables must be named \"clinical\".") |
|
15 |
+ if(is.list(measurements)) # Ensure plain list has clinical data. |
|
16 |
+ { |
|
17 |
+ # One of the tables must be named "clinical". |
|
18 |
+ if (!any(names(measurements) == "clinical")) |
|
19 |
+ stop("One of the tables must be named \"clinical\".") |
|
20 |
+ } |
|
21 |
+ prepArgs <- list(measurements, outcomeColumns = class, clinicalPredictors = clinicalPredictors, |
|
22 |
+ maxMissingProp = maxMissingProp, topNvariance = topNvariance) |
|
23 |
+ measurementsAndClass <- do.call(prepareData, prepArgs) |
|
43 | 24 |
|
44 |
- .precisionPathwayTrain(measurements = combined_df, class = class) |
|
25 |
+ .precisionPathwayTrain(measurementsAndClass[["measurements"]], measurementsAndClass[["outcome"]], |
|
26 |
+ fixedAssays = fixedAssays, confidenceCutoff = confidenceCutoff, |
|
27 |
+ minAssaySamples = minAssaySamples, nFeatures = nFeatures, |
|
28 |
+ selectionMethod = selectionMethod, classifier = classifier, |
|
29 |
+ nFolds = nFolds, nRepeats = nRepeats, nCores = nCores) |
|
45 | 30 |
}) |
46 | 31 |
|
47 | 32 |
# Internal method which carries out all of the processing, obtaining reformatted data from the |
... | ... |
@@ -52,25 +37,13 @@ setMethod("precisionPathwayTrain", "list", |
52 | 37 |
classifier = setNames(c("elasticNetGLM", rep("randomForest", length(measurements))), c("clinical", names(measurements))), |
53 | 38 |
nFolds = 5, nRepeats = 20, nCores = 1, ...) |
54 | 39 |
{ |
55 |
- # Step 1: Separate the vector classes from the table of measurements, if not already separate. |
|
56 |
- prepArgs <- list(measurements, class, clinicalPredictors) |
|
57 |
- extraInputs <- list(...) |
|
58 |
- prepExtras <- numeric() |
|
59 |
- if(length(extraInputs) > 0) |
|
60 |
- prepExtras <- which(names(extraInputs) %in% .ClassifyRenvir[["prepareDataFormals"]]) |
|
61 |
- if(length(prepExtras) > 0) |
|
62 |
- prepArgs <- append(prepArgs, extraInputs[prepExtras]) |
|
63 |
- |
|
64 |
- measurementsAndClass <- do.call(prepareData, prepArgs) |
|
65 |
- measurements <- measurementsAndClass[["measurements"]] |
|
66 |
- class <- measurementsAndClass[["outcome"]] |
|
67 | 40 |
|
68 |
- # Step 2: Determine all valid permutations of assays, taking into account the |
|
41 |
+ # Step 1: Determine all valid permutations of assays, taking into account the |
|
69 | 42 |
# assays to be used and which assays, if any, must be included. |
70 | 43 |
assayIDs <- unique(S4Vectors::mcols(measurements)[["assay"]]) |
71 | 44 |
assaysPermutations <- .permutations(assayIDs, fixed = data.frame(seq_along(fixedAssays), fixedAssays)) |
72 | 45 |
|
73 |
- # Step 3: Build a classifier for each assay using all of the samples. |
|
46 |
+ # Step 2: Build a classifier for each assay using all of the samples. |
|
74 | 47 |
modelsList <- crossValidate(measurements, class, nFeatures, selectionMethod, |
75 | 48 |
classifier = classifier, nFolds = nFolds, |
76 | 49 |
nRepeats = nRepeats, nCores = nCores) |
... | ... |
@@ -180,10 +180,10 @@ setMethod("prepareData", "MultiAssayExperiment", |
180 | 180 |
if(is.null(clinicalPredictors)) |
181 | 181 |
stop("'clinicalPredictors' must be a vector of informative clinical features (i.e. not sample IDs, sampling dates, etc.) to consider for classification.") |
182 | 182 |
|
183 |
- if(any(anyReplicated(measurements[, , omicsTargets]))) |
|
183 |
+ if(any(anyReplicated(measurements))) |
|
184 | 184 |
stop("Data set contains replicates. Please remove or average replicate observations and try again.") |
185 | 185 |
|
186 |
- if(is.null(outcomeColums)) |
|
186 |
+ if(is.null(outcomeColumns)) |
|
187 | 187 |
stop("'outcomeColumns' is a mandatory parameter but was not specified.") |
188 | 188 |
|
189 | 189 |
if(!all(outcomeColumns %in% colnames(MultiAssayExperiment::colData(measurements)))) |
... | ... |
@@ -195,6 +195,7 @@ setMethod("prepareData", "MultiAssayExperiment", |
195 | 195 |
dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(clinicalPredictors, outcomeColumns)) |
196 | 196 |
rownames(dataTable) <- dataTable[, "primary"] |
197 | 197 |
S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"]) |
198 |
+ dataTable <- dataTable[, -match("primary", colnames(dataTable))] |
|
198 | 199 |
colnames(S4Vectors::mcols(dataTable))[1] <- "assay" |
199 | 200 |
|
200 | 201 |
# Sample information variable names not included in column metadata of wide table but only as row names of it. |
... | ... |
@@ -129,7 +129,7 @@ Set to NULL or "all" if all features should be used.} |
129 | 129 |
|
130 | 130 |
\item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, |
131 | 131 |
and performing multiview classification, the respective selection methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking |
132 |
-and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{NULL} is also a valid value, meaning that no |
|
132 |
+and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value. \code{"none"} is also a valid value, meaning that no |
|
133 | 133 |
indepedent feature selection will be performed (but implicit selection might still happen with the classifier).} |
134 | 134 |
|
135 | 135 |
\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.} |