# Creates two lists of lists. The first holds the training samples and the second holds the
# test samples for a range of different cross-validation schemes.
#' @import utils
.samplesSplits <- function(crossValParams, outcome)
{
    if(crossValParams@samplesSplits %in% c("k-Fold", "Permute k-Fold"))
    {
        nPermutations <- ifelse(crossValParams@samplesSplits == "k-Fold", 1, crossValParams@permutations)
        nFolds <- crossValParams@folds
        samplesFolds <- lapply(1:nPermutations, function(permutation)
        {
            # Create maximally-balanced folds, so the class balance is about the same in all of them.
            allFolds <- vector(mode = "list", length = nFolds)
            foldsIndexes <- rep(1:nFolds, length.out = length(outcome))
            foldsIndex <- 1
            # Dummy encoding for when the outcome is not a class.
            if(is(outcome, "Surv")) outcome <- factor(rep("a", length(outcome)))
            for(outcomeName in levels(outcome))
            {
                # Permute the indexes of the samples in the class.
                whichSamples <- sample(which(outcome == outcomeName))
                whichFolds <- foldsIndexes[foldsIndex:(foldsIndex + length(whichSamples) - 1)]
                # Put each sample into its fold.
                for(sampleIndex in seq_along(whichSamples))
                {
                    allFolds[[whichFolds[sampleIndex]]] <- c(allFolds[[whichFolds[sampleIndex]]], whichSamples[sampleIndex])
                }
                # Move the beginning index to the first unused index.
                foldsIndex <- foldsIndex + length(whichSamples)
            }
            list(train = lapply(1:nFolds, function(index) unlist(allFolds[setdiff(1:nFolds, index)])),
                 test = allFolds)
        })
        # Reorganise into two separate lists, with no more nesting.
        list(train = unlist(lapply(samplesFolds, '[[', 1), recursive = FALSE),
             test = unlist(lapply(samplesFolds, '[[', 2), recursive = FALSE))
    } else if(crossValParams@samplesSplits == "Permute Percentage Split") {
        # Take the same percentage of samples from each class for the training set.
        percent <- crossValParams@percentTest
        samplesTrain <- round((100 - percent) / 100 * table(outcome))
        samplesLists <- lapply(1:crossValParams@permutations, function(permutation)
        {
            trainSet <- unlist(mapply(function(outcomeName, number)
            {
                sample(which(outcome == outcomeName), number)
            }, levels(outcome), samplesTrain))
            testSet <- setdiff(1:length(outcome), trainSet)
            list(trainSet, testSet)
        })
        # Reorganise into two lists: training, testing.
        list(train = lapply(samplesLists, "[[", 1), test = lapply(samplesLists, "[[", 2))
    } else if(crossValParams@samplesSplits == "Leave-k-Out") {
        testSamples <- as.data.frame(utils::combn(length(outcome), crossValParams@leave))
        trainingSamples <- lapply(testSamples, function(sample) setdiff(1:length(outcome), sample))
        list(train = as.list(trainingSamples), test = as.list(testSamples))
    }
}
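
# Illustrative sketch (hypothetical toy data, not executed on load) of the balanced fold
# assignment above: each class's shuffled samples are dealt across the folds in turn, so
# every fold ends up with a near-identical class ratio.
if(FALSE) {
    outcome <- factor(c(rep("healthy", 7), rep("disease", 5)))
    nFolds <- 3
    foldsIndexes <- rep(1:nFolds, length.out = length(outcome))
    allFolds <- vector(mode = "list", length = nFolds)
    foldsIndex <- 1
    for(outcomeName in levels(outcome))
    {
        whichSamples <- sample(which(outcome == outcomeName))
        whichFolds <- foldsIndexes[foldsIndex:(foldsIndex + length(whichSamples) - 1)]
        for(sampleIndex in seq_along(whichSamples))
            allFolds[[whichFolds[sampleIndex]]] <- c(allFolds[[whichFolds[sampleIndex]]], whichSamples[sampleIndex])
        foldsIndex <- foldsIndex + length(whichSamples)
    }
    lapply(allFolds, function(fold) table(outcome[fold])) # About 4 samples per fold, both classes in each.
}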

# Creates a table with columns for tracking the permutation, fold number, or subset of each
# set of test samples (i.e. for the leave-k-out scheme).
.splitsTestInfo <- function(crossValParams, samplesSplits)
{
    permutationIDs <- NULL
    foldIDs <- NULL
    subsetIDs <- NULL
    if(crossValParams@samplesSplits %in% c("k-Fold", "Permute k-Fold"))
    {
        foldsSamples <- lengths(samplesSplits[[2]][1:crossValParams@folds])
        totalSamples <- sum(foldsSamples)
        if(crossValParams@samplesSplits == "Permute k-Fold")
            permutationIDs <- rep(1:crossValParams@permutations, each = totalSamples)
        times <- ifelse(is.null(crossValParams@permutations), 1, crossValParams@permutations)
        foldIDs <- rep(rep(1:crossValParams@folds, foldsSamples), times = times)
    } else if(crossValParams@samplesSplits == "Permute Percentage Split") {
        permutationIDs <- rep(1:crossValParams@permutations, each = length(samplesSplits[[2]][[1]]))
    } else { # Leave-k-out
        totalSamples <- length(unique(unlist(samplesSplits[[2]])))
        subsetIDs <- rep(1:choose(totalSamples, crossValParams@leave), each = crossValParams@leave)
    }
    cbind(permutation = permutationIDs, fold = foldIDs, subset = subsetIDs)
}
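
# Sketch of the tracking table shape for 2 permutations of 2-fold cross-validation on
# 4 samples (values written out by hand; assumes balanced folds of 2 samples each).
if(FALSE) {
    cbind(permutation = rep(1:2, each = 4), fold = rep(rep(1:2, each = 2), times = 2))
}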

# Add extra variables from within runTest functions to the function specified by a params object.
.addIntermediates <- function(params)
{
    intermediateName <- params@intermediate
    intermediates <- list(dynGet(intermediateName, inherits = TRUE))
    if(is.null(names(params@intermediate)))
        names(intermediates) <- intermediateName
    else
        names(intermediates) <- names(params@intermediate)
    params@otherParams <- c(params@otherParams, intermediates)
    params
}
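
# Minimal sketch of the dynGet() lookup that .addIntermediates relies on: the variable is
# found by name in a calling function's frame. outer and inner are hypothetical names.
if(FALSE) {
    inner <- function(varName) dynGet(varName, inherits = TRUE)
    outer <- function()
    {
        previousSelected <- c(3, 1, 4)
        inner("previousSelected")
    }
    outer() # Returns c(3, 1, 4), found in outer()'s frame rather than globally.
}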

# Carries out one iteration of feature selection. Basically, a ranking function is used to
# rank the features in the training set from best to worst, and different top sets are used
# either for predicting on the training set (resubstitution) or nested cross-validation of
# the training set, to find the set of top features which gives the best (user-specified)
# performance measure.
.doSelection <- function(measurementsTrain, outcomeTrain, crossValParams, modellingParams, verbose)
{
    tuneParams <- modellingParams@selectParams@tuneParams
    performanceType <- tuneParams[["performanceType"]]
    topNfeatures <- tuneParams[["nFeatures"]]
    # performanceType is only used as an evaluation metric, so remove it from the tuning grid.
    tuneParams <- tuneParams[-match(c("performanceType", "nFeatures"), names(tuneParams))]
    # Make selectParams NULL, since we are currently doing selection and it shouldn't call
    # itself infinitely, but save the parameters of the ranking function for calling the
    # ranking function directly using do.call below.
    featureRanking <- modellingParams@selectParams@featureRanking
    otherParams <- modellingParams@selectParams@otherParams
    doSubset <- modellingParams@selectParams@subsetToSelections
    minPresence <- modellingParams@selectParams@minPresence
    modellingParams@selectParams <- NULL
    betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"]
    if(is.function(featureRanking)) # Not a list, as it would be for ensemble selection.
    {
        paramList <- list(measurementsTrain, outcomeTrain, verbose = verbose)
        paramList <- append(paramList, otherParams) # Used directly by a feature ranking function for rankings of features.
        if(length(tuneParams) == 0) tuneParams <- list(None = "none")
        tuneCombosSelect <- expand.grid(tuneParams, stringsAsFactors = FALSE)
        # Generate feature rankings for each one of the tuning parameter combinations.
        rankings <- lapply(1:nrow(tuneCombosSelect), function(rowIndex)
        {
            tuneCombo <- tuneCombosSelect[rowIndex, , drop = FALSE]
            if(tuneCombo[, 1] != "none") # Add real parameters before the function call.
                paramList <- append(paramList, tuneCombo)
            if(attr(featureRanking, "name") == "randomSelection")
                paramList <- append(paramList, list(nFeatures = topNfeatures))
            do.call(featureRanking, paramList)
        })
        if(attr(featureRanking, "name") %in% c("randomSelection", "previousSelection", "Union Selection")) # Actually selection, not ranking.
            return(list(NULL, rankings[[1]], NULL))
        if(crossValParams@tuneMode == "none") # No parameters to choose between.
            return(list(NULL, rankings[[1]], NULL))
        tuneParamsTrain <- list(topN = topNfeatures)
        performanceIndex <- match("performanceType", names(modellingParams@trainParams@tuneParams))
        if(!is.na(performanceIndex))
        {
            performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
            modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-performanceIndex]
        }
        tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
        tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)
        modellingParams@trainParams@tuneParams <- NULL
        allPerformanceTables <- lapply(rankings, function(rankingsVariety)
        {
            # Calculate the performance metric for every combination of top n features and
            # any training parameters (one row of tuneCombosTrain per combination).
            performances <- sapply(1:nrow(tuneCombosTrain), function(rowIndex)
            {
                whichTry <- 1:tuneCombosTrain[rowIndex, "topN"]
                if(doSubset)
                {
                    topFeatures <- rankingsVariety[whichTry]
                    measurementsTrain <- measurementsTrain[, topFeatures, drop = FALSE] # Features in columns.
                } else { # Pass along the features to use.
                    modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, setNames(list(rankingsVariety[whichTry]), names(modellingParams@trainParams@intermediate)))
                }
                if(ncol(tuneCombosTrain) > 1) # There are some parameters for training.
                    modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, tuneCombosTrain[rowIndex, 2:ncol(tuneCombosTrain), drop = FALSE])
                modellingParams@trainParams@intermediate <- character(0)
                # Do either resubstitution classification or nested-CV classification and
                # calculate the resulting performance metric.
                if(crossValParams@tuneMode == "Resubstitution")
                {
                    # Specify measurementsTrain and outcomeTrain for testing, too.
                    result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
                                      crossValParams = NULL, modellingParams = modellingParams,
                                      verbose = verbose, .iteration = "internal")
                    predictions <- result[["predictions"]]
                    # Classifiers will use a column "class" and survival models will use a column "risk".
                    if(is.data.frame(predictions))
                        predictedOutcome <- predictions[, na.omit(match(c("class", "risk"), colnames(predictions)))]
                    else
                        predictedOutcome <- predictions
                    calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
                } else {
                    result <- runTests(measurementsTrain, outcomeTrain, crossValParams, modellingParams, verbose = verbose)
                    result <- calcCVperformance(result, performanceType)
                    median(performance(result)[[performanceType]])
                }
            })
            bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
            list(data.frame(tuneCombosTrain, performance = performances), bestOne)
        })
        tablesBestMetrics <- sapply(allPerformanceTables, function(tableIndexPair) tableIndexPair[[1]][tableIndexPair[[2]], "performance"])
        tunePick <- ifelse(betterValues == "lower", which.min(tablesBestMetrics)[1], which.max(tablesBestMetrics)[1])
        if(verbose == 3)
            message("Features selected.")
        tuneDetails <- allPerformanceTables[[tunePick]] # A list of length 2.
        rankingUse <- rankings[[tunePick]]
        selectionIndices <- rankingUse[1:(tuneDetails[[1]][tuneDetails[[2]], "topN"])]
        names(tuneDetails) <- c("tuneCombinations", "bestIndex")
        colnames(tuneDetails[[1]])[ncol(tuneDetails[[1]])] <- performanceType
        list(ranked = rankingUse, selected = selectionIndices, tune = tuneDetails)
    } else if(is.list(featureRanking)) { # It is a list of functions for ensemble selection.
        featuresIndicesLists <- mapply(function(selector, selParams)
        {
            paramList <- list(measurementsTrain, outcomeTrain, trainParams = modellingParams@trainParams,
                              predictParams = modellingParams@predictParams, verbose = verbose)
            paramList <- append(paramList, selParams)
            do.call(selector, paramList)
        }, featureRanking, otherParams, SIMPLIFY = FALSE)
        performances <- sapply(topNfeatures, function(topN)
        {
            topIndices <- unlist(lapply(featuresIndicesLists, function(featuresIndices) featuresIndices[1:topN]))
            topIndicesCounts <- table(topIndices)
            keep <- names(topIndicesCounts)[topIndicesCounts >= minPresence]
            measurementsTrain <- measurementsTrain[, as.numeric(keep), drop = FALSE] # Features in columns.
            if(crossValParams@tuneMode == "Resubstitution")
            {
                result <- runTest(measurementsTrain, outcomeTrain,
                                  measurementsTrain, outcomeTrain,
                                  crossValParams = NULL, modellingParams,
                                  verbose = verbose, .iteration = "internal")
                predictions <- result[["predictions"]]
                if(is.data.frame(predictions))
                    predictedOutcome <- predictions[, "class"]
                else
                    predictedOutcome <- predictions
                calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
            } else {
                result <- runTests(measurementsTrain, outcomeTrain, crossValParams, modellingParams, verbose = verbose)
                result <- calcCVperformance(result, performanceType)
                median(performance(result)[[performanceType]])
            }
        })
        bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
        selectionIndices <- unlist(lapply(featuresIndicesLists, function(featuresIndices) featuresIndices[1:topNfeatures[bestOne]]))
        selectionCounts <- table(selectionIndices)
        selectionIndices <- as.numeric(names(selectionCounts)[selectionCounts >= minPresence])
        list(NULL, selectionIndices, NULL)
    } else { # Previous selection.
        list(NULL, selectionIndices, NULL)
    }
}
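
# Self-contained sketch (hypothetical toy data, not executed on load) of the tuning idea in
# .doSelection: rank the features once, then score a few top-n sizes by resubstitution
# accuracy, with a trivial nearest-centroid classifier standing in for runTest().
if(FALSE) {
    measurements <- matrix(rnorm(100 * 20), ncol = 20)
    measurements[1:50, 1:3] <- measurements[1:50, 1:3] + 2 # Three informative features.
    classes <- factor(rep(c("A", "B"), each = 50))
    tStats <- apply(measurements, 2, function(feature) abs(t.test(feature ~ classes)[["statistic"]]))
    ranking <- order(tStats, decreasing = TRUE)
    topNtry <- c(1, 2, 5, 10)
    accuracies <- sapply(topNtry, function(topN)
    {
        useFeatures <- ranking[1:topN]
        centroidA <- colMeans(measurements[classes == "A", useFeatures, drop = FALSE])
        centroidB <- colMeans(measurements[classes == "B", useFeatures, drop = FALSE])
        nearerA <- apply(measurements[, useFeatures, drop = FALSE], 1,
                         function(sample) sum((sample - centroidA)^2) < sum((sample - centroidB)^2))
        mean(ifelse(nearerA, "A", "B") == classes)
    })
    topNtry[which.max(accuracies)] # The analogue of bestOne, for a metric where higher is better.
}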

# Only for transformations that need to be done within cross-validation.
.doTransform <- function(measurementsTrain, measurementsTest, transformParams, verbose)
{
    paramList <- list(measurementsTrain, measurementsTest)
    if(length(transformParams@otherParams) > 0)
        paramList <- c(paramList, transformParams@otherParams)
    paramList <- c(paramList, verbose = verbose)
    do.call(transformParams@transform, paramList)
}
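
# Sketch of the contract .doTransform assumes: the transform function takes the training and
# test measurements as its first two arguments, then any other parameters, then verbose.
# scaleByTrain is a hypothetical example and its return shape is illustrative only.
if(FALSE) {
    scaleByTrain <- function(measurementsTrain, measurementsTest, verbose = 0)
    {
        trainMeans <- colMeans(measurementsTrain)
        list(train = sweep(measurementsTrain, 2, trainMeans),
             test = sweep(measurementsTest, 2, trainMeans)) # The test set only uses training-set means.
    }
    do.call(scaleByTrain, list(matrix(1:6, 2), matrix(6:1, 2), verbose = 0))
}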

# Code to create a function call to a training function. Might also do training and testing
# within the same function, so the test samples are also passed in case they are needed.
.doTrain <- function(measurementsTrain, outcomeTrain, measurementsTest, outcomeTest, crossValParams, modellingParams, verbose)
{
    tuneDetails <- NULL
    if(!is.null(modellingParams@trainParams@tuneParams) && is.null(modellingParams@selectParams))
    {
        performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
        modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-match("performanceType", names(modellingParams@trainParams@tuneParams))]
        tuneCombos <- expand.grid(modellingParams@trainParams@tuneParams, stringsAsFactors = FALSE)
        modellingParams@trainParams@tuneParams <- NULL
        performances <- sapply(1:nrow(tuneCombos), function(rowIndex)
        {
            modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, as.list(tuneCombos[rowIndex, ]))
            if(crossValParams@tuneMode == "Resubstitution")
            {
                result <- runTest(measurementsTrain, outcomeTrain, measurementsTrain, outcomeTrain,
                                  crossValParams = NULL, modellingParams,
                                  verbose = verbose, .iteration = "internal")
                predictions <- result[["predictions"]]
                if(is.data.frame(predictions))
                    predictedOutcome <- predictions[, colnames(predictions) %in% c("class", "risk")]
                else
                    predictedOutcome <- predictions
                calcExternalPerformance(outcomeTrain, predictedOutcome, performanceType)
            } else {
                result <- runTests(measurementsTrain, outcomeTrain,
                                   crossValParams, modellingParams,
                                   verbose = verbose, .iteration = "internal")
                result <- calcCVperformance(result, performanceType)
                median(performance(result)[[performanceType]])
            }
        })
        allPerformanceTable <- data.frame(tuneCombos, performances)
        colnames(allPerformanceTable)[ncol(allPerformanceTable)] <- performanceType
        betterValues <- .ClassifyRenvir[["performanceInfoTable"]][.ClassifyRenvir[["performanceInfoTable"]][, "type"] == performanceType, "better"]
        bestOne <- ifelse(betterValues == "lower", which.min(performances)[1], which.max(performances)[1])
        tuneChosen <- tuneCombos[bestOne, , drop = FALSE]
        tuneDetails <- list(tuneCombos, bestOne)
        names(tuneDetails) <- c("tuneCombinations", "bestIndex")
        modellingParams@trainParams@otherParams <- tuneChosen
    }
    if(!("previousTrained" %in% attr(modellingParams@trainParams@classifier, "name")))
    {
        # Don't name these first two variables. Some classifier functions might call them
        # classesTrain and others outcomeTrain, so positional matching is needed.
        paramList <- list(measurementsTrain, outcomeTrain)
    } else { # Don't pass the measurements and classes, because a pre-existing classifier is used.
        paramList <- list()
    }
    if(is.null(modellingParams@predictParams)) # One function does both training and testing.
        paramList <- c(paramList, list(measurementsTest))
    if(length(modellingParams@trainParams@otherParams) > 0)
        paramList <- c(paramList, modellingParams@trainParams@otherParams)
    paramList <- c(paramList, verbose = verbose)
    trained <- do.call(modellingParams@trainParams@classifier, paramList)
    if(verbose >= 2)
        message("Training completed.")
    list(model = trained, tune = tuneDetails)
}
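
# Why the first two paramList entries above are unnamed: do.call() then matches them
# positionally, so a classifier may call its arguments either classesTrain or outcomeTrain.
# classifierA and classifierB are hypothetical sketches, not package functions.
if(FALSE) {
    classifierA <- function(measurementsTrain, classesTrain, verbose = 0) nlevels(classesTrain)
    classifierB <- function(measurementsTrain, outcomeTrain, verbose = 0) length(outcomeTrain)
    paramList <- list(data.frame(gene1 = 1:4), factor(c("a", "a", "b", "b")))
    do.call(classifierA, c(paramList, verbose = 0)) # Works unchanged for either signature.
    do.call(classifierB, c(paramList, verbose = 0))
}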

# Creates a function call to a prediction function.
.doTest <- function(trained, measurementsTest, predictParams, verbose)
{
    if(!is.null(predictParams@predictor))
    {
        paramList <- list(trained, measurementsTest)
        if(length(predictParams@otherParams) > 0)
            paramList <- c(paramList, predictParams@otherParams)
        paramList <- c(paramList, verbose = verbose)
        prediction <- do.call(predictParams@predictor, paramList)
    } else { # trained is actually the predictions, because one function did training and testing.
        prediction <- trained
    }
    if(verbose >= 2)
        message("Prediction completed.")
    prediction
}

# Converts the characteristics of cross-validation into a pretty character string.
.validationText <- function(crossValParams)
{
    switch(crossValParams@samplesSplits,
           `Permute k-Fold` = paste(crossValParams@permutations, "Permutations,", crossValParams@folds, "Folds"),
           `k-Fold` = paste(crossValParams@folds, "-fold cross-validation", sep = ''),
           `Leave-k-Out` = paste("Leave", crossValParams@leave, "Out"),
           `Permute Percentage Split` = paste(crossValParams@permutations, "Permutations,", crossValParams@percentTest, "% Test"),
           independent = "Independent Set")
}
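
# Worked examples of the strings produced above (derived directly from the paste() calls):
if(FALSE) {
    paste(10, "Permutations,", 5, "Folds") # "10 Permutations, 5 Folds"
    paste(5, "-fold cross-validation", sep = '') # "5-fold cross-validation"
    paste("Leave", 2, "Out") # "Leave 2 Out"
}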

# Used for ROC area under the curve calculation.
# PRtable is a data frame with columns FPR, TPR and class.
# distinctClasses is a vector of all of the class names.
.calcArea <- function(PRtable, distinctClasses)
{
    do.call(rbind, lapply(distinctClasses, function(aClass)
    {
        classTable <- subset(PRtable, class == aClass)
        areaSum <- 0
        for(index in 2:nrow(classTable))
        {
            if(classTable[index, "FPR"] != classTable[index - 1, "FPR"] && classTable[index, "TPR"] != classTable[index - 1, "TPR"])
            { # Some samples had identical predicted scores but belong to different classes, so the line moved diagonally.
                newArea <- (classTable[index, "FPR"] - classTable[index - 1, "FPR"]) * classTable[index - 1, "TPR"] + # Rectangle part.
                           0.5 * (classTable[index, "FPR"] - classTable[index - 1, "FPR"]) * (classTable[index, "TPR"] - classTable[index - 1, "TPR"]) # Triangle part on top.
            } else { # Only one sample with that predicted score. The line went either up or right, but not both.
                newArea <- (classTable[index, "FPR"] - classTable[index - 1, "FPR"]) * classTable[index, "TPR"]
            }
            areaSum <- areaSum + newArea
        }
        data.frame(classTable, AUC = round(areaSum, 2), check.names = FALSE)
    }))
}
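
# Worked example of the area decomposition above: moving from (FPR, TPR) = (0.2, 0.4) to
# (0.6, 0.8) adds a rectangle below the previous TPR plus a triangle up to the new TPR.
if(FALSE) {
    (0.6 - 0.2) * 0.4 + 0.5 * (0.6 - 0.2) * (0.8 - 0.4) # 0.16 + 0.08 = 0.24
}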

# Converts features into strings to be displayed in plots.
.getFeaturesStrings <- function(importantFeatures)
{
    # Do nothing if only a simple vector of feature IDs.
    if(!is.null(ncol(importantFeatures[[1]]))) # Data set and feature ID columns.
        importantFeatures <- lapply(importantFeatures, function(features) paste(features[, 1], features[, 2]))
    else if("Pairs" %in% class(importantFeatures[[1]]))
        importantFeatures <- lapply(importantFeatures, function(features) paste(first(features), second(features), sep = '-'))
    importantFeatures
}

# Function to overwrite characteristics which are automatically derived from function names
# with user-specified values.
.filterCharacteristics <- function(characteristics, autoCharacteristics)
{
    # Overwrite automatically-chosen names with the user's names.
    if(nrow(autoCharacteristics) > 0 && nrow(characteristics) > 0)
    {
        overwrite <- na.omit(match(characteristics[, "characteristic"], autoCharacteristics[, "characteristic"]))
        if(length(overwrite) > 0)
            autoCharacteristics <- autoCharacteristics[-overwrite, ]
    }
    # Merge the characteristics tables and return.
    rbind(characteristics, autoCharacteristics)
}

# Don't just plot the groups alphabetically; plot them in a meaningful, user-specified order.
.addUserLevels <- function(plotData, orderingList)
{
    for(orderingIndex in seq_along(orderingList))
    {
        plotData[, names(orderingList)[orderingIndex]] <- factor(plotData[, names(orderingList)[orderingIndex]],
                                                                 levels = orderingList[[orderingIndex]])
    }
    plotData
}

# Function to identify the parameters of an S4 method.
.methodFormals <- function(f, signature) {
    tryCatch({
        fdef <- getGeneric(f)
        method <- selectMethod(fdef, signature)
        genFormals <- base::formals(fdef)
        b <- body(method)
        if(is(b, "{") && is(b[[2]], "<-") && identical(b[[2]][[2]], as.name(".local"))) {
            local <- eval(b[[2]][[3]])
            if(is.function(local))
                return(formals(local))
            warning("Expected a .local assignment to be a function. Corrupted method?")
        }
        genFormals
    },
    error = function(error) {
        formals(f)
    })
}

# Find the x-axis positions where a set of density functions cross over.
# The trivial cross-overs at the beginning and end of the data range are removed.
# Used by the mixtures of normals and naive Bayes classifiers.
.densitiesCrossover <- function(densities) # A list of densities, each with elements 'x' and 'y'.
{
    if(!all(table(unlist(lapply(densities, function(density) density[['x']]))) == length(densities)))
        stop("x positions are not the same for all of the densities.")
    lapply(seq_along(densities), function(densityIndex) # All crossing points with other class densities.
    {
        unlist(lapply(setdiff(seq_along(densities), densityIndex), function(otherIndex)
        {
            allDifferences <- densities[[densityIndex]][['y']] - densities[[otherIndex]][['y']]
            crosses <- which(diff(sign(allDifferences)) != 0)
            crosses <- sapply(crosses, function(cross) # Refine the location for near-zero plateaus.
            {
                isSmall <- rle(abs(allDifferences[(cross + 1):length(allDifferences)]) < 0.000001)
                if(isSmall[["values"]][1])
                    cross <- cross + isSmall[["lengths"]][1] / 2
                cross
            })
            if(length(crosses) > 1 && densities[[densityIndex]][['y']][crosses[1]] < 0.000001 && densities[[densityIndex]][['y']][crosses[length(crosses)]] < 0.000001)
                crosses <- crosses[-c(1, length(crosses))] # Remove crossings at the ends of the densities.
            densities[[densityIndex]][['x']][crosses]
        }))
    })
}
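
# Sketch of the expected input (hypothetical, not executed on load): each density is a list
# with a common 'x' grid and a 'y' vector, like the x and y pieces of stats::density output.
# Two unit-variance normals centred at 0 and 3 cross once, near the midpoint x = 1.5.
if(FALSE) {
    xGrid <- seq(-5, 8, length.out = 500)
    densities <- list(list(x = xGrid, y = dnorm(xGrid, mean = 0)),
                      list(x = xGrid, y = dnorm(xGrid, mean = 3)))
    .densitiesCrossover(densities) # Both elements are approximately 1.5.
}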

# Samples in the training set are upsampled or downsampled so that the class imbalance is
# removed.
.rebalanceTrainingClasses <- function(measurementsTrain, classesTrain, balancing)
{
    samplesPerClassTrain <- table(classesTrain)
    downsampleTo <- min(samplesPerClassTrain)
    upsampleTo <- max(samplesPerClassTrain)
    trainBalanced <- unlist(mapply(function(classSize, className)
    {
        if(balancing == "downsample" && classSize > downsampleTo)
            sample(which(classesTrain == className), downsampleTo)
        else if(balancing == "upsample" && classSize < upsampleTo)
            sample(which(classesTrain == className), upsampleTo, replace = TRUE)
        else
            which(classesTrain == className)
    }, samplesPerClassTrain, names(samplesPerClassTrain), SIMPLIFY = FALSE))
    measurementsTrain <- measurementsTrain[trainBalanced, ]
    classesTrain <- classesTrain[trainBalanced]
    list(measurementsTrain = measurementsTrain, classesTrain = classesTrain)
}
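
# Sketch of the effect (hypothetical toy data, not executed on load): downsampling a
# 10-versus-4 outcome keeps 4 randomly-chosen samples of the majority class.
if(FALSE) {
    classesTrain <- factor(rep(c("A", "B"), c(10, 4)))
    measurementsTrain <- data.frame(gene1 = rnorm(14), gene2 = rnorm(14))
    rebalanced <- .rebalanceTrainingClasses(measurementsTrain, classesTrain, "downsample")
    table(rebalanced[["classesTrain"]]) # 4 samples of class A and 4 of class B.
}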

# Map user-facing keywords to the corresponding transformation, selection and classifier
# functions or parameter presets.
.transformKeywordToFunction <- function(keyword)
{
    switch(
        keyword,
        "none" = NULL,
        "diffLoc" = subtractFromLocation
    )
}

.selectionKeywordToFunction <- function(keyword)
{
    switch(
        keyword,
        "none" = NULL,
        "t-test" = differentMeansRanking,
        "limma" = limmaRanking,
        "edgeR" = edgeRranking,
        "Bartlett" = bartlettRanking,
        "Levene" = leveneRanking,
        "DMD" = DMDranking,
        "likelihoodRatio" = likelihoodRatioRanking,
        "KS" = KolmogorovSmirnovRanking,
        "KL" = KullbackLeiblerRanking,
        "CoxPH" = coxphRanking,
        "previousSelection" = previousSelection,
        "randomSelection" = randomSelection,
        "selectMulti" = selectMulti
    )
}

.classifierKeywordToParams <- function(keyword)
{
    switch(
        keyword,
        "randomForest" = RFparams(),
        "randomSurvivalForest" = RSFparams(),
        "XGB" = XGBparams(),
        "GLM" = GLMparams(),
        "elasticNetGLM" = elasticNetGLMparams(),
        "SVM" = SVMparams(),
        "NSC" = NSCparams(),
        "DLDA" = DLDAparams(),
        "naiveBayes" = naiveBayesParams(),
        "mixturesNormals" = mixModelsParams(),
        "kNN" = kNNparams(),
        "CoxPH" = coxphParams(),
        "CoxNet" = coxnetParams()
    )
}

.dlda <- function(x, y, prior = NULL){ # Remove this once sparsediscrim is reinstated to CRAN.
    obj <- list()
    obj$labels <- y
    obj$N <- length(y)
    obj$p <- ncol(x)
    obj$groups <- levels(y)
    obj$num_groups <- nlevels(y)
    est_mean <- "mle"
    # Error checking.
    if (!is.null(prior)) {
        if (length(prior) != obj$num_groups) {
            stop("The number of 'prior' probabilities must match the number of classes in 'y'.")
        }
        if (any(prior <= 0)) {
            stop("The 'prior' probabilities must be positive.")
        }
        if (!isTRUE(all.equal(sum(prior), 1))) {
            stop("The 'prior' probabilities must sum to one.")
        }
    }
    if (any(table(y) < 2)) {
        stop("There must be at least 2 observations in each class.")
    }
    # By default, we estimate the 'a priori' probabilities of class membership with
    # the MLEs (the sample proportions).
    if (is.null(prior)) {
        prior <- as.vector(table(y) / length(y))
    }
    # For each class, we calculate the MLEs (or specified alternative estimators)
    # for each parameter used in the DLDA classifier. The 'est' list contains the
    # estimators for each class.
    obj$est <- tapply(seq_along(y), y, function(i) {
        stats <- list()
        stats$n <- length(i)
        stats$xbar <- colMeans(x[i, , drop = FALSE])
        stats$var <- with(stats, (n - 1) / n * apply(x[i, , drop = FALSE], 2, var))
        stats
    })
    # Calculate the variance pooled across all of the classes.
    obj$var_pool <- Reduce('+', lapply(obj$est, function(x) x$n * x$var)) / obj$N
    # Add each element in 'prior' to the corresponding obj$est$prior.
    for(k in seq_len(obj$num_groups)) {
        obj$est[[k]]$prior <- prior[k]
    }
    class(obj) <- "dlda"
    obj
}

#' @method predict dlda
predict.dlda <- function(object, newdata, ...) { # Remove once sparsediscrim is reinstated to CRAN.
    if (!inherits(object, "dlda")) {
        stop("object not of class 'dlda'")
    }
    if (is.vector(newdata)) {
        newdata <- as.matrix(newdata)
    }
    # Discriminant score of each observation for each class.
    scores <- apply(newdata, 1, function(obs) {
        sapply(object$est, function(class_est) {
            with(class_est, sum((obs - xbar)^2 / object$var_pool) + log(prior))
        })
    })
    if (is.vector(scores)) {
        min_scores <- which.min(scores)
    } else {
        min_scores <- apply(scores, 2, which.min)
    }
    # Posterior probabilities via Bayes' theorem.
    means <- lapply(object$est, "[[", "xbar")
    covs <- replicate(n = object$num_groups, object$var_pool, simplify = FALSE)
    priors <- lapply(object$est, "[[", "prior")
    posterior <- .posterior_probs(x = newdata, means = means, covs = covs, priors = priors)
    class <- factor(object$groups[min_scores], levels = object$groups)
    list(class = class, scores = scores, posterior = posterior)
}
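
# Hedged usage sketch of the stand-in DLDA above (iris is built into R; not executed on load).
if(FALSE) {
    model <- .dlda(as.matrix(iris[, 1:4]), iris[, "Species"])
    predicted <- predict(model, as.matrix(iris[1:5, 1:4]))
    predicted[["class"]] # Predicted species of the first five samples.
    predicted[["posterior"]] # One column of class probabilities per species.
}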

.posterior_probs <- function(x, means, covs, priors) { # Remove once sparsediscrim is reinstated to CRAN.
    if (is.vector(x)) {
        x <- matrix(x, nrow = 1)
    }
    x <- as.matrix(x)
    posterior <- mapply(function(xbar_k, cov_k, prior_k) {
        if (is.vector(cov_k)) { # Diagonal covariance, stored as a vector of variances.
            post_k <- apply(x, 1, function(obs) {
                .dmvnorm_diag(x = obs, mean = xbar_k, sigma = cov_k)
            })
        } else { # Full covariance matrix; not reached by predict.dlda above, which always supplies variance vectors.
            post_k <- dmvnorm(x = x, mean = xbar_k, sigma = cov_k)
        }
        prior_k * post_k
    }, means, covs, priors)
    if (is.vector(posterior)) {
        posterior <- posterior / sum(posterior)
        posterior <- matrix(posterior, nrow = 1) # Ensure that it is always a matrix, like just below.
        colnames(posterior) <- names(priors)
    } else {
        posterior <- posterior / rowSums(posterior)
    }
    posterior
}

.dmvnorm_diag <- function(x, mean, sigma) { # Remove once sparsediscrim is reinstated to CRAN.
    # Density of a multivariate normal with diagonal covariance: the product of univariate densities.
    exp(sum(dnorm(x, mean = mean, sd = sqrt(sigma), log = TRUE)))
}