Merge pull request #60 from SydneyBioX/master

crissCrossValidate Addition and predict Dispatch Fix

Dario Strbenac authored on 08/12/2022 09:35:26 • GitHub committed on 08/12/2022 09:35:26
Showing 22 changed files

... ...
@@ -3,8 +3,8 @@ Type: Package
 Title: A framework for cross-validated classification problems, with
        applications to differential variability and differential
        distribution testing
-Version: 3.3.8
-Date: 2022-11-28
+Version: 3.3.9
+Date: 2022-12-08
 Authors@R:
     c(
     person(given = "Dario", family = "Strbenac", email = "dario.strbenac@sydney.edu.au", role = c("aut", "cre")),
... ...
@@ -20,8 +20,8 @@ VignetteBuilder: knitr
 Encoding: UTF-8
 biocViews: Classification, Survival
 Depends: R (>= 4.1.0), generics, methods, S4Vectors, MultiAssayExperiment, BiocParallel, survival
-Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger
-Suggests: limma, edgeR, car, Rmixmod, ggplot2 (>= 3.0.0), gridExtra (>= 2.0.0), cowplot,
+Imports: grid, genefilter, utils, dplyr, plyr, tidyr, rlang, ranger, ggplot2 (>= 3.0.0), ggpubr, reshape2
+Suggests: limma, edgeR, car, Rmixmod, gridExtra (>= 2.0.0), cowplot,
         BiocStyle, pamr, PoiClaClu, parathyroidSE, knitr, htmltools, gtable,
         scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC,
         MatrixModels, xgboost
... ...
@@ -44,6 +44,7 @@ Collate:
     'classes.R'
     'calcPerformance.R'
     'constants.R'
+    'crissCrossValidate.R'
     'crossValidate.R'
     'data.R'
     'distribution.R'
... ...
@@ -75,6 +76,7 @@ Collate:
     'prepareData.R'
     'previousSelection.R'
     'previousTrained.R'
+    'randomSelection.R'
     'rankingBartlett.R'
     'rankingCoxPH.R'
     'rankingDMD.R'
... ...
@@ -22,6 +22,8 @@ export(calcCVperformance)
 export(calcExternalPerformance)
 export(chosenFeatureNames)
 export(colCoxTests)
+export(crissCrossPlot)
+export(crissCrossValidate)
 export(crossValidate)
 export(distribution)
 export(edgesToHubNetworks)
... ...
@@ -86,8 +88,13 @@ exportMethods(show)
 exportMethods(totalPredictions)
 exportMethods(tunedParameters)
 import(MultiAssayExperiment)
+import(dplyr)
+import(ggplot2)
+import(ggpubr)
 import(grid)
 import(methods)
+import(plyr)
+import(reshape2)
 import(utils)
 importFrom(S4Vectors,as.data.frame)
 importFrom(S4Vectors,do.call)
... ...
@@ -14,7 +14,7 @@
 #' 
 #' @aliases ROCplot ROCplot,list-method
 #' @param results A list of \code{\link{ClassifyResult}} objects.
-#' @param mode Default: "merge". Whether to merge all predictions of all
+#' @param mode Default: \code{"merge"}. Whether to merge all predictions of all
 #' iterations of cross-validation into one set or keep them separate. Keeping
 #' them separate will cause separate ROC curves to be computed for each
 #' iteration and confidence intervals to be drawn with the solid line being the
... ...
@@ -815,7 +815,7 @@ setClass("ModellingParams", representation(
 #' @rdname ModellingParams-class
 #' @aliases ModellingParams ModellingParams-class
 #' @docType class
-#' @param balancing Default: "downsample". A character value specifying what kind
+#' @param balancing Default: \code{"downsample"}. A character value specifying what kind
 #' of class balancing to do, if any.
 #' @param transformParams Parameters used for feature transformation inside of C.V.
 #' specified by a \code{\link{TransformParams}} instance. Optional, can be \code{NULL}.
... ...
@@ -45,6 +45,7 @@
     "pairsDifferencesRanking", "Pairs Differences",
     "previousSelection", "Previous Selection", 
     "previousTrained", "Previous Trained",
+    "randomSelection", "Random Selection",
     "randomForestTrainInterface", "Random Forest",
     "SVMtrainInterface", "Support Vector Machine",
     "coxphTrainInterface", "Cox Proportional Hazards",
... ...
@@ -67,7 +68,8 @@
     "likelihoodRatio", "Likelihood ratio test (normal distribution).",
     "KS", "Kolmogorov-Smirnov test for differences in distributions.",
     "KL", "Kullback-Leibler divergence between distributions.",
-    "CoxPH", "Cox proportional hazards Wald test per-feature."
+    "CoxPH", "Cox proportional hazards Wald test per-feature.",
+    "randomSelection", "Randomly selects a specified number of features."
     ),
   ncol = 2, byrow = TRUE, dimnames = list(NULL, c("selectionMethod Keyword", "Description"))
 ) |> as.data.frame()
new file mode 100644
... ...
@@ -0,0 +1,234 @@
+#' A function to perform pairwise cross-validation
+#'
+#' This function has been designed to perform cross-validation and model prediction on datasets in a pairwise manner.
+#'
+#' @param measurements A \code{list} of either \code{\link{DataFrame}}, \code{\link{data.frame}} or \code{\link{matrix}} class measurements.
+#' @param outcomes A \code{list} of vectors that respectively correspond to outcomes of the samples in \code{measurements} list.
+#' @param nFeatures The number of features to be used for modelling.
+#' @param selectionMethod Default: \code{"auto"}. A character keyword of the feature selection algorithm to be used. If \code{"auto"}, t-test (two categories) /
+#' F-test (three or more categories) ranking and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional
+#' hazards p-value.
+#' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
+#' @param trainType Default: \code{"modelTrain"}. A keyword specifying whether a fully trained model is used to make predictions on the test
+#' set or if only the feature identifiers are chosen using the training data set and a number of training-predictions are made by cross-validation
+#' in the test set.
+#' @param classifier Default: \code{"auto"}. A character keyword of the modelling algorithm to be used. If \code{"auto"}, then a random forest is used
+#' for a classification task or Cox proportional hazards model for a survival task.
+#' @param nFolds A numeric specifying the number of folds to use for cross-validation.
+#' @param nRepeats A numeric specifying the number of repeats or permutations to use for cross-validation.
+#' @param nCores A numeric specifying the number of cores used if the user wants to use parallelisation. 
+#' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
+#' options described in \code{\link{calcPerformance}} may be specified.
+#' @param doRandomFeatures Default: \code{FALSE}. Whether to perform random feature selection to establish a baseline performance. Either \code{FALSE} or \code{TRUE}
+#' are permitted values.
+#' @return A list with elements \code{"real"} for the matrix of pairwise performance metrics using real
+#' feature selection, \code{"random"} if \code{doRandomFeatures} is \code{TRUE} for metrics of random selection and
+#' \code{"params"} for a list of parameters used during the execution of this function.
+#' @author Harry Robertson
+
+#' @import plyr
+#' @import dplyr
+#'
+#' @export
+#' 
+
+crissCrossValidate <- function(measurements, outcomes, 
+                               nFeatures = 20, selectionMethod = "auto",
+                               selectionOptimisation = "Resubstitution",
+                               trainType = c("modelTrain", "modelTest"),
+                               performanceType = "auto",
+                               doRandomFeatures = FALSE,
+                               classifier = "auto",
+                               nFolds = 5, nRepeats = 20, nCores = 1)
+{
+  trainType <- match.arg(trainType)
+  if(!is.list(measurements)) stop("'measurements' is not of type list but is of type ", class(measurements))
+  if(!is.list(outcomes)) stop("'outcomes' is not of type list but is of type ", class(outcomes))
+  isCategorical <- is.character(outcomes[[1]]) && (length(outcomes[[1]]) == 1 || length(outcomes[[1]]) == nrow(measurements[[1]])) || is.factor(outcomes[[1]])
+  if(performanceType == "auto")
+    if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index"
+  if(length(selectionMethod) == 1 && selectionMethod == "auto")
+    if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH"
+  if(length(classifier) == 1 && classifier == "auto")
+    if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH"
+  
+  dataCleaned <- mapply(function(measurementsOne, outcomesOne)
+  {
+    prepareData(measurementsOne, outcomesOne)
+  }, measurements, outcomes, SIMPLIFY = FALSE)
+  measurements <- lapply(dataCleaned, "[[", 1)
+  outcomes <- lapply(dataCleaned, "[[", 2)
+  
+  # If trainType is modelTrain, then build a model on each data set and test it on every data set.
+  if(trainType == "modelTrain")
+  {
+    # Build a model for each dataset.
+    trainedModels <- mapply(function(measurementsOne, outcomesOne)
+    {
+      train(measurementsOne, outcomesOne,
+            nFeatures = nFeatures,
+            selectionMethod = selectionMethod, selectionOptimisation = selectionOptimisation,
+            classifier = classifier, multiViewMethod = "none")
+     }, measurements, outcomes, SIMPLIFY = FALSE)
+
+    # Perform pair-wise model assessment.
+    performanceAllPairs <- lapply(trainedModels, function(trainedModel)
+    {
+      mapply(function(testData, testOutcomes)
+      {
+        predictions <- predict(trainedModel, testData)
+        if(is(predictions, "tabular")) predictions <- predictions[, na.omit(match(c("class", "risk"), colnames(predictions)))]
+        calcExternalPerformance(predictions, testOutcomes, performanceType)
+      }, measurements, outcomes)
+    })
+
+    realPerformance <- matrix(unlist(performanceAllPairs), ncol = length(measurements), byrow = TRUE,
+                              dimnames = list(paste("Select and Train", names(measurements)), paste("Predict", names(measurements))))
+    realPerformance <- round(realPerformance, 2)
+  } else { # trainType is "modelTest".
+    trainedModels <- mapply(function(measurementsOne, outcomesOne)
+    {
+      crossValidate(measurementsOne, outcomesOne,
+                    nFeatures = nFeatures,
+                    selectionMethod = selectionMethod,
+                    selectionOptimisation = selectionOptimisation,
+                    classifier = classifier,
+                    multiViewMethod = "none",
+                    nFolds = nFolds,
+                    nCores = nCores,
+                    nRepeats = nRepeats)
+     }, measurements, outcomes, SIMPLIFY = FALSE)
+    
+    # Create cross-validation parameters for runTests, which allows existing results to be passed into the selection process.
+    crossValParams <- generateCrossValParams(nRepeats, nFolds, nCores, selectionOptimisation)
+
+    performanceAllPairs <- lapply(trainedModels, function(trainedModel)
+    {
+      mapply(function(measurementsOne, outcomesOne)
+      {
+        classifierParams <- .classifierKeywordToParams(classifier)
+        modellingParams <- ModellingParams(selectParams = SelectParams("previousSelection", intermediate = ".iteration", classifyResult = trainedModel),
+                                           trainParams = classifierParams$trainParams,
+                                           predictParams = classifierParams$predictParams)
+        
+        result <- runTests(measurementsOne, outcomesOne, crossValParams, modellingParams)
+        mean(performance(calcCVperformance(result, performanceType))[[performanceType]])
+      }, measurements, outcomes, SIMPLIFY = FALSE)
+     })
+    
+    realPerformance <- matrix(unlist(performanceAllPairs), ncol = length(measurements), byrow = TRUE,
+                              dimnames = list(paste("Select", names(measurements)), paste("Cross-validate", names(measurements))))
+    realPerformance <- round(realPerformance, 2)
+  }
+
+  # Return the matrix of pair-wise model performance inside a list,
+  # so that further elements can be appended to it below.
+  result <- list(real = realPerformance)
+  # Optionally, benchmark randomly chosen feature sets against the real feature selection.
+  if(doRandomFeatures){
+    message("Starting random feature selection procedure.")
+    # Sample nFeatures randomly from each dataset.
+    randomFeatures <- lapply(measurements, function(dataset) sample(colnames(dataset), nFeatures))
+    performanceAllPairs <- lapply(randomFeatures, function(randomFeaturesSet)
+    {
+      mapply(function(testData, testOutcomes)
+      {
+        result <- crossValidate(testData[, randomFeaturesSet], testOutcomes,
+                    nFeatures = nFeatures,
+                    selectionMethod = "none",
+                    classifier = classifier,
+                    multiViewMethod = "none",
+                    nFolds = nFolds,
+                    nCores = nCores,
+                    nRepeats = nRepeats)
+        mean(performance(calcCVperformance(result, performanceType))[[performanceType]])
+      }, measurements, outcomes)
+    })
+
+    randomPerformance <- matrix(unlist(performanceAllPairs), ncol = length(measurements), byrow = TRUE,
+                              dimnames = list(paste("Random Select", names(measurements)), paste("Cross-validate", names(measurements))))
+    randomPerformance <- round(randomPerformance, 2)
+    
+    result$random <- randomPerformance
+  }
+  
+  # Add information about the parameters to the output.
+  result$params <- list(nFeatures = nFeatures, selectionMethod = selectionMethod,
+                     selectionOptimisation = selectionOptimisation,
+                     classifier = classifier, nFolds = nFolds, nRepeats = nRepeats, nCores = nCores,
+                     trainType = trainType, performanceType = performanceType,
+                     doRandomFeatures = doRandomFeatures)
+  
+  result
+}
+
+#' A function to plot the output of the crissCrossValidate function.
+#'
+#' This function has been designed to give a heatmap output of the crissCrossValidate function.
+#'
+#' @param crissCrossResult The output of the crissCrossValidate function.
+#' @param includeValues Default: \code{FALSE}. If \code{TRUE}, then the values of the matrix will be included in the plot.
+#' @author Harry Robertson
+#' 
+#' @import ggplot2
+#' @import reshape2
+#' @import ggpubr
+#' 
+#' @export
+
+crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
+
+  real <- crissCrossResult$real      # Extract elements explicitly rather than attach()ing
+  random <- crissCrossResult$random  # the list, which would linger on the search path.
+  params <- crissCrossResult$params
+  scalebar_title <- params$performanceType
+
+  # Heatmap of models trained on one dataset and used to predict each of the others.
+  if(params$trainType == "modelTrain"){
+    melted_cormat <- reshape2::melt(real, na.rm = TRUE)
+  
+    ggheatmap <- ggplot(melted_cormat, aes(Var1, Var2, fill = value)) +
+      geom_tile(color = "white") +
+      scale_fill_gradient2(high = "red", mid = "white", low = "blue", 
+                          midpoint = 0.5, limit = c(0,1), space = "Lab", 
+                          name = as.character(scalebar_title)) +
+      theme_bw() + xlab("Training Dataset") + ylab("Testing Dataset") +
+      theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 8, hjust = 1)) + 
+      theme(axis.text.y = element_text(vjust = 1, size = 8, hjust = 1)) +
+      coord_fixed() 
+    
+    if(includeValues) ggheatmap <- ggheatmap + geom_text(aes(label = value), color = "black", size = 3)
+  } else if(params$trainType == "modelTest"){
+    melted_cormat_1 <- reshape2::melt(real, na.rm = TRUE)
+    ggheatmap_1 <- ggplot(melted_cormat_1, aes(Var1, Var2, fill = value)) +
+      geom_tile(color = "white") +
+      scale_fill_gradient2(high = "red", mid = "white", low = "blue", 
+                          midpoint = 0.5, limit = c(0,1), space = "Lab", 
+                          name = as.character(scalebar_title)) +
+      theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
+      theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 8, hjust = 1)) + 
+      theme(axis.text.y = element_text(vjust = 1, size = 8, hjust = 1)) +
+      coord_fixed()
+    if(includeValues) ggheatmap_1 <- ggheatmap_1 + geom_text(aes(label = value), color = "black", size = 3)
+    
+    if(params$doRandomFeatures){
+      melted_cormat_2 <- reshape2::melt(random, na.rm = TRUE)
+      ggheatmap_2 <- ggplot(melted_cormat_2, aes(Var1, Var2, fill = value)) +
+        geom_tile(color = "white") +
+        scale_fill_gradient2(high = "red", mid = "white", low = "blue", 
+                            midpoint = 0.5, limit = c(0,1), space = "Lab", 
+                            name = as.character(scalebar_title)) +
+        theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
+        theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 8, hjust = 1)) + 
+        theme(axis.text.y = element_text(vjust = 1, size = 8, hjust = 1)) +
+        coord_fixed()
+      if(includeValues) ggheatmap_2 <- ggheatmap_2 + geom_text(aes(label = value), color = "black", size = 3)
+
+        ggheatmap <- ggarrange(ggheatmap_1, ggheatmap_2, labels = c("A - Feature Selection", "B - Random Features"), 
+                           ncol = 2, common.legend = TRUE, legend = "right")
+    } else {
+      ggheatmap <- ggheatmap_1
+    }
+  }
+  print(ggheatmap)
+}
\ No newline at end of file
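Taken together, the new functions support a workflow like the following sketch. It is illustrative only, not part of the commit: the cohort measurement tables and class-label vectors are hypothetical stand-ins for datasets that share the same features, and the arguments shown simply spell out the documented defaults.

library(ClassifyR)
# cohort1Measurements, cohort2Measurements and the matching class factors are
# hypothetical; each measurements table has samples as rows and features as columns.
measurementsList <- list(cohortOne = cohort1Measurements, cohortTwo = cohort2Measurements)
outcomesList <- list(cohortOne = cohort1Classes, cohortTwo = cohort2Classes)
# Train a model on each dataset and predict every dataset with each model.
pairwise <- crissCrossValidate(measurementsList, outcomesList,
                               nFeatures = 20, trainType = "modelTrain")
pairwise$real                                   # Matrix of pairwise performance values.
crissCrossPlot(pairwise, includeValues = TRUE)  # Heatmap of that matrix.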
... ...
@@ -22,8 +22,8 @@
 #' or assays. If a numeric vector these will be optimised over using \code{selectionOptimisation}. If a named vector with the same names of multiple assays, 
 #' a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
 #' Set to NULL or "all" if all features should be used.
-#' @param selectionMethod Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
-#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
+#' @param selectionMethod Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
+#' and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
 #' and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.
 #' @param selectionOptimisation A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.
 #' @param performanceType Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
... ...
@@ -91,10 +91,10 @@ setMethod("crossValidate", "DataFrame",
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,
... ...
@@ -297,10 +297,10 @@ setMethod("crossValidate", "MultiAssayExperiment",
           function(measurements,
                    outcome, 
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,
... ...
@@ -331,10 +331,10 @@ setMethod("crossValidate", "data.frame", # data.frame of numeric measurements.
           function(measurements,
                    outcome, 
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,
... ...
@@ -364,10 +364,10 @@ setMethod("crossValidate", "matrix", # Matrix of numeric measurements.
           function(measurements,
                    outcome,
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,
... ...
@@ -399,10 +399,10 @@ setMethod("crossValidate", "list",
           function(measurements,
                    outcome, 
                    nFeatures = 20,
-                   selectionMethod = "t-test",
+                   selectionMethod = "auto",
                    selectionOptimisation = "Resubstitution",
                    performanceType = "auto",
-                   classifier = "randomForest",
+                   classifier = "auto",
                    multiViewMethod = "none",
                    assayCombinations = "all",
                    nFolds = 5,
... ...
@@ -764,19 +764,18 @@ generateMultiviewParams <- function(assayIDs,
 }
 
 # measurements, outcome are mutually exclusive with x, outcomeTrain, measurementsTest, outcomeTest.
-CV <- function(measurements = NULL,
-               outcome = NULL, x = NULL, outcomeTrain = NULL, measurementsTest = NULL, outcomeTest = NULL,
+CV <- function(measurements, outcome, x, outcomeTrain, measurementsTest, outcomeTest,
                assayIDs,
-               nFeatures = NULL,
-               selectionMethod = "t-test",
-               selectionOptimisation = "Resubstitution",
+               nFeatures,
+               selectionMethod,
+               selectionOptimisation,
                performanceType,
-               classifier = "elasticNetGLM",
-               multiViewMethod = "none",
-               nFolds = 5,
-               nRepeats = 100,
-               nCores = 1,
-               characteristicsLabel = NULL)
+               classifier,
+               multiViewMethod,
+               nFolds,
+               nRepeats,
+               nCores,
+               characteristicsLabel)
 
 {
     # Which data-types or data-views are present?
... ...
@@ -848,7 +847,7 @@ train.data.frame <- function(x, outcomeTrain, ...)
 #' @param performanceType Performance metric to optimise if classifier has any tuning parameters.
 #' @method train DataFrame
 #' @export
-train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", performanceType = "auto",
+train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures = 20, classifier = "auto", performanceType = "auto",
                             multiViewMethod = "none", assayIDs = "all", ...) # ... for prepareData.
                    {
              prepArgs <- list(x, outcomeTrain)
... ...
@@ -863,18 +862,20 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
               # Ensure performance type is one of the ones that can be calculated by the package.
               if(!performanceType %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
                 stop(paste("performanceType must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", performanceType))
-              
+
+              isCategorical <- is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(x)) || is.factor(outcomeTrain)
               if(performanceType == "auto")
-              {
-                if(is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(x)) || is.factor(outcomeTrain))
-                  performanceType <- "Balanced Accuracy"
-                else performanceType <- "C-index"
-              }
+                if(isCategorical) performanceType <- "Balanced Accuracy" else performanceType <- "C-index"
+              if(length(selectionMethod) == 1 && selectionMethod == "auto")
+                if(isCategorical) selectionMethod <- "t-test" else selectionMethod <- "CoxPH"
+              if(length(classifier) == 1 && classifier == "auto")
+                if(isCategorical) classifier <- "randomForest" else classifier <- "CoxPH"
               
               measurements <- measurementsAndOutcome[["measurements"]]
               outcomeTrain <- measurementsAndOutcome[["outcome"]]
               
               classifier <- cleanClassifier(classifier = classifier, measurements = measurements)
+              selectionMethod <- cleanSelectionMethod(selectionMethod = selectionMethod, measurements = measurements)
               if(assayIDs == "all") assayIDs <- unique(S4Vectors::mcols(measurements)[, "assay"])
               if(is.null(assayIDs)) assayIDs <- 1
               names(assayIDs) <- assayIDs
... ...
@@ -886,30 +887,52 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
                           # Loop over assays
                           sapply(classifier[[assayIndex]], function(classifierForAssay) {
                               # Loop over classifiers
+                                sapply(selectionMethod[[assayIndex]], function(selectionForAssay) {
+                                  # Loop over selectors
                               
                                   measurementsUse <- measurements
                                   if(assayIndex != 1) measurementsUse <- measurements[, S4Vectors::mcols(measurements)[, "assay"] == assayIndex, drop = FALSE]
                                   
+                                  modellingParams <- generateModellingParams(assayIDs = assayIDs, measurements = measurements, nFeatures = nFeatures,
+                                                     selectionMethod = selectionMethod, selectionOptimisation = "Resubstitution", performanceType = performanceType,
+                                                     classifier = classifier, multiViewMethod = "none")
+                                  topFeatures <- .doSelection(measurementsUse, outcomeTrain, CrossValParams(), modellingParams, verbose = 0)
+                                  selectedFeaturesIndices <- topFeatures[[2]] # Extract for subsetting.
+                                  tuneDetailsSelect <- topFeatures[[3]]
+                                  measurementsUse <- measurementsUse[, selectedFeaturesIndices]
+
                                   classifierParams <- .classifierKeywordToParams(classifierForAssay)
-                                  if(!is.null(classifierParams$trainParams@tuneParams))
-                                    classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
                                   modellingParams <- ModellingParams(balancing = "none", selectParams = NULL,
-                                                               trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
+                                                                     trainParams = classifierParams$trainParams, predictParams = classifierParams$predictParams)
+                                  if(!is.null(tuneDetailsSelect))
+                                  {
+                                    tuneDetailsSelectUse <- tuneDetailsSelect[["tuneCombinations"]][tuneDetailsSelect[["bestIndex"]], , drop = FALSE]
+                                    avoidTune <- match(colnames(tuneDetailsSelectUse), names(modellingParams@trainParams@tuneParams))
+                                    if(any(!is.na(avoidTune)))
+                                    {
+                                      modellingParams@trainParams@otherParams <- c(modellingParams@trainParams@otherParams, tuneDetailsSelectUse[!is.na(avoidTune)])
+                                      modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-na.omit(avoidTune)]
+                                      if(length(modellingParams@trainParams@tuneParams) == 0) modellingParams@trainParams@tuneParams <- NULL
+                                    }
+                                  }
+                                  if(!is.null(modellingParams@trainParams@tuneParams))
+                                    modellingParams@trainParams@tuneParams <- c(modellingParams@trainParams@tuneParams, performanceType = performanceType)
                                   
-                                  .doTrain(measurementsUse, outcomeTrain, NULL, NULL, CrossValParams(), modellingParams, verbose = 0)[["model"]]
+                                  trained <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, CrossValParams(), modellingParams, verbose = 0)[["model"]]
+                                  attr(trained, "predictFunction") <- classifierParams$predictParams@predictor
+                                  trained
                                   ## train model
-                          },
-                          simplify = FALSE)
-                      },
-                      simplify = FALSE)
+                                }, simplify = FALSE)
+                          }, simplify = FALSE)
+                      }, simplify = FALSE)
 
-                  models <- unlist(resClassifier, recursive = FALSE)
+                  models <- unlist(unlist(resClassifier, recursive = FALSE), recursive = FALSE)
                   if(length(models) == 1) {
                       model <- models[[1]]
-                      class(model) <- c(class(model), "trainedByClassifyR")
+                      class(model) <- c("trainedByClassifyR", class(model))
                       models <- NULL
                   } else {
-                      class(models) <- c(class(models), "listOfModels", "trainedByClassifyR")
+                      class(models) <- c("listOfModels", "trainedByClassifyR", class(models))
                   }
               }
 
 
... ...
@@ -932,6 +955,8 @@ train.DataFrame <- function(x, outcomeTrain, classifier = "randomForest", perfor
 
               # Generate params for each assay. This could be extended to have different selectionMethods for each type
                 paramsAssays <- mapply(generateModellingParams,
+                                        nFeatures = nFeatures[assayIDs],
+                                        selectionMethod = selectionMethod[assayIDs],
                                         assayIDs = assayIDs,
                                         measurements = assayTrain[assayIDs],
                                         classifier = classifier[assayIDs],
... ...
@@ -1049,9 +1074,11 @@ predict.trainedByClassifyR <- function(object, newData, ...)
               newData <- prepareData(newData, useFeatures = allFeatureNames(object))
               # Some classifiers dangerously use positional matching rather than column name matching.
               # newData columns are sorted so that the right column ordering is guaranteed.
-            }
-
+    }
+    
+    predictFunctionUse <- attr(object, "predictFunction")
+    class(object) <- rev(class(object)) # Now want the predict method of the specific model to be picked, so put model class first.
     if (is(object, "listOfModels")) 
-         mapply(function(model, assay) predict(model, assay), object, newData, SIMPLIFY = FALSE)
-    else predict(object, newData) # Object is itself a trained model and it is assumed that a predict method is defined for it.
+         mapply(function(model, assay) predictFunctionUse(model, assay), object, newData, SIMPLIFY = FALSE)
+    else predictFunctionUse(object, newData) # Object is itself a trained model, so the prediction function stored at training time is used on it directly.
 }
... ...
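With the stored prediction function, predict.trainedByClassifyR no longer depends on a generic predict() method existing for the wrapped model; it directly calls the predictor captured at training time. A hedged end-to-end sketch, in which the data objects are hypothetical:

library(ClassifyR)
# trainingTable and unseenTable are hypothetical data.frames of numeric features
# (samples in rows); trainingClasses is a factor of per-sample class labels.
model <- train(trainingTable, trainingClasses)  # "auto" resolves to t-test selection and a random forest.
predictions <- predict(model, unseenTable)      # Runs the stored predictFunction on the new samples.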
@@ -47,6 +47,14 @@ setMethod("prepareData", "matrix",
   prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
 })
 
+#' @rdname prepareData
+#' @export
+setMethod("prepareData", "data.frame",
+  function(measurements, outcome, ...)
+{
+  prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
+})
+
 #' @rdname prepareData
 #' @export
 setMethod("prepareData", "DataFrame",
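The new method means a plain data.frame no longer needs manual conversion before use. A small sketch with made-up data:

library(ClassifyR)
# A hypothetical table of 20 samples by 50 features.
measurementsFrame <- as.data.frame(matrix(rnorm(20 * 50), nrow = 20,
                                          dimnames = list(NULL, paste0("gene", 1:50))))
classes <- factor(rep(c("Healthy", "Disease"), each = 10))
prepared <- prepareData(measurementsFrame, classes)  # Coerced to a DataFrame internally, then cleaned.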
new file mode 100644
... ...
@@ -0,0 +1,9 @@
+# Random Selection of a Specified Number of Features
+randomSelection <- function(measurementsTrain, classesTrain, nFeatures, verbose = 3)
+{
+  if(verbose == 3)
+    message("Choosing random features.")
+
+  sample(ncol(measurementsTrain), nFeatures) # Return indices, not identifiers.
+}
+attr(randomSelection, "name") <- "randomSelection"
\ No newline at end of file
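Because the keyword is registered in the constants table and mapped to this function in the selection-dispatch switch shown further below, random selection should be reachable from the top-level API like any other selection method. A sketch, reusing the hypothetical measurementsFrame and classes from the prepareData example above:

result <- crossValidate(measurementsFrame, classes,
                        selectionMethod = "randomSelection",  # Baseline of randomly chosen features.
                        nFeatures = 10, nFolds = 5, nRepeats = 10)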
... ...
@@ -26,7 +26,7 @@
 #' @param results A list of \code{\link{ClassifyResult}} objects.
 #' @param topRanked A sequence of thresholds of number of the best features to
 #' use for overlapping.
-#' @param comparison Default: within. The aspect of the experimental design to
+#' @param comparison Default: \code{"within"}. The aspect of the experimental design to
 #' compare. Can be any characteristic that all results share or special value
 #' "within" to compare between all pairwise iterations of cross-validation.
 #' @param referenceLevel The level of the comparison factor to use as the
... ...
@@ -13,7 +13,7 @@
 #' a matrix of pre-calculated metrics, for backwards compatibility.
 #' @param classes If \code{results} is a matrix, this is a factor vector of the
 #' same length as the number of columns that \code{results} has.
-#' @param comparison Default: "auto". The aspect of the experimental
+#' @param comparison Default: \code{"auto"}. The aspect of the experimental
 #' design to compare. Can be any characteristic that all results share.
 #' @param metric Default: \code{"auto"}. The name of the
 #' performance measure or "auto". If the results are classification then
... ...
@@ -36,7 +36,7 @@
 #' 
 #' @aliases selectionPlot selectionPlot,list-method
 #' @param results A list of \code{\link{ClassifyResult}} objects.
-#' @param comparison Default: within. The aspect of the experimental design to
+#' @param comparison Default: \code{"within"}. The aspect of the experimental design to
 #' compare. Can be any characteristic that all results share or either one of
 #' the special values \code{"within"} to compare between all pairwise
 #' iterations of cross-validation, or \code{"size"}, to draw a bar chart of the
... ...
@@ -129,10 +129,12 @@
       tuneCombo <- tuneCombosSelect[rowIndex, , drop = FALSE]
       if(tuneCombo != "none") # Add real parameters before function call.
         paramList <- append(paramList, tuneCombo)
+      if(attr(featureRanking, "name") == "randomSelection")
+        paramList <- append(paramList, list(nFeatures = topNfeatures))
       do.call(featureRanking, paramList)
     })
 
-    if(attr(featureRanking, "name") %in% c("previousSelection", "Union Selection")) # Actually selection not ranking.
+    if(attr(featureRanking, "name") %in% c("randomSelection", "previousSelection", "Union Selection")) # Actually selection not ranking.
       return(list(NULL, rankings[[1]], NULL))
     
     if(crossValParams@tuneMode == "none") # No parameters to choose between.
... ...
@@ -509,6 +511,8 @@
         "KS" = KolmogorovSmirnovRanking,
         "KL" = KullbackLeiblerRanking,
         "CoxPH" = coxphRanking,
+        "previousSelection" = previousSelection,
+        "randomSelection" = randomSelection,
         "selectMulti" = selectMulti
     )
 }
... ...
@@ -16,7 +16,7 @@ ModellingParams(
 )
 }
 \arguments{
-\item{balancing}{Default: "downsample". A character value specifying what kind
+\item{balancing}{Default: \code{"downsample"}. A character value specifying what kind
 of class balancing to do, if any.}
 
 \item{transformParams}{Parameters used for feature transformation inside of C.V.
... ...
@@ -30,7 +30,7 @@
 \item{...}{Parameters not used by the \code{ClassifyResult} method but passed to
 the \code{list} method.}
 
-\item{mode}{Default: "merge". Whether to merge all predictions of all
+\item{mode}{Default: \code{"merge"}. Whether to merge all predictions of all
 iterations of cross-validation into one set or keep them separate. Keeping
 them separate will cause separate ROC curves to be computed for each
 iteration and confidence intervals to be drawn with the solid line being the
new file mode 100644
... ...
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/crissCrossValidate.R
+\name{crissCrossPlot}
+\alias{crissCrossPlot}
+\title{A function to plot the output of the crissCrossValidate function.}
+\usage{
+crissCrossPlot(crissCrossResult, includeValues = FALSE)
+}
+\arguments{
+\item{crissCrossResult}{The output of the crissCrossValidate function.}
+
+\item{includeValues}{Default: \code{FALSE}. If \code{TRUE}, then the values of the matrix will be included in the plot.}
+}
+\description{
+This function has been designed to give a heatmap output of the crissCrossValidate function.
+}
+\author{
+Harry Robertson
+}
new file mode 100644
... ...
@@ -0,0 +1,64 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/crissCrossValidate.R
+\name{crissCrossValidate}
+\alias{crissCrossValidate}
+\title{A function to perform pairwise cross-validation}
+\usage{
+crissCrossValidate(
+  measurements,
+  outcomes,
+  nFeatures = 20,
+  selectionMethod = "auto",
+  selectionOptimisation = "Resubstitution",
+  trainType = c("modelTrain", "modelTest"),
+  performanceType = "auto",
+  doRandomFeatures = FALSE,
+  classifier = "auto",
+  nFolds = 5,
+  nRepeats = 20,
+  nCores = 1
+)
+}
+\arguments{
+\item{measurements}{A \code{list} of either \code{\link{DataFrame}}, \code{\link{data.frame}} or \code{\link{matrix}} class measurements.}
+
+\item{outcomes}{A \code{list} of vectors that respectively correspond to outcomes of the samples in \code{measurements} list.}
+
+\item{nFeatures}{The number of features to be used for modelling.}
+
+\item{selectionMethod}{Default: \code{"auto"}. A character keyword of the feature selection algorithm to be used. If \code{"auto"}, t-test (two categories) /
+F-test (three or more categories) ranking and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional
+hazards p-value.}
+
+\item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
+
+\item{trainType}{Default: \code{"modelTrain"}. A keyword specifying whether a fully trained model is used to make predictions on the test
+set or if only the feature identifiers are chosen using the training data set and a number of training-predictions are made by cross-validation
+in the test set.}
+
+\item{performanceType}{Default: \code{"auto"}. If \code{"auto"}, then balanced accuracy for classification or C-index for survival. Otherwise, any one of the
+options described in \code{\link{calcPerformance}} may be specified.}
+
+\item{doRandomFeatures}{Default: \code{FALSE}. Whether to perform random feature selection to establish a baseline performance. Either \code{FALSE} or \code{TRUE}
+are permitted values.}
+
+\item{classifier}{Default: \code{"auto"}. A character keyword of the modelling algorithm to be used. If \code{"auto"}, then a random forest is used
+for a classification task or Cox proportional hazards model for a survival task.}
+
+\item{nFolds}{A numeric specifying the number of folds to use for cross-validation.}
+
+\item{nRepeats}{A numeric specifying the number of repeats or permutations to use for cross-validation.}
+
+\item{nCores}{A numeric specifying the number of cores used if the user wants to use parallelisation.}
+}
+\value{
+A list with elements \code{"real"} for the matrix of pairwise performance metrics using real
+feature selection, \code{"random"} if \code{doRandomFeatures} is \code{TRUE} for metrics of random selection and
+\code{"params"} for a list of parameters used during the execution of this function.
+}
+\description{
+This function has been designed to perform cross-validation and model prediction on datasets in a pairwise manner.
+}
+\author{
+Harry Robertson
+}
... ...
@@ -22,10 +22,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,
... ...
@@ -39,10 +39,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,
... ...
@@ -56,10 +56,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,
... ...
@@ -73,10 +73,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,
... ...
@@ -90,10 +90,10 @@ crossValidate(measurements, outcome, ...)
   measurements,
   outcome,
   nFeatures = 20,
-  selectionMethod = "t-test",
+  selectionMethod = "auto",
   selectionOptimisation = "Resubstitution",
   performanceType = "auto",
-  classifier = "randomForest",
+  classifier = "auto",
   multiViewMethod = "none",
   assayCombinations = "all",
   nFolds = 5,
... ...
@@ -110,7 +110,9 @@ crossValidate(measurements, outcome, ...)
 \method{train}{DataFrame}(
   x,
   outcomeTrain,
-  classifier = "randomForest",
+  selectionMethod = "auto",
+  nFeatures = 20,
+  classifier = "auto",
   performanceType = "auto",
   multiViewMethod = "none",
   assayIDs = "all",
... ...
@@ -141,8 +143,8 @@ or assays. If a numeric vector these will be optimised over using \code{selectio
 a different number of features will be used for each assay. If a named list of vectors, the respective number of features will be optimised over. 
 Set to NULL or "all" if all features should be used.}
 
-\item{selectionMethod}{Default: "auto". A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
-and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"} t-test (two categories) / F-test (three or more categories) ranking
+\item{selectionMethod}{Default: \code{"auto"}. A character vector of feature selection methods to compare. If a named character vector with names corresponding to different assays, 
+and performing multiview classification, the respective classification methods will be used on each assay. If \code{"auto"}, t-test (two categories) / F-test (three or more categories) ranking
 and top \code{nFeatures} optimisation is done. Otherwise, the ranking method is per-feature Cox proportional hazards p-value.}
 
 \item{selectionOptimisation}{A character of "Resubstitution", "Nested CV" or "none" specifying the approach used to optimise \code{nFeatures}.}
... ...
@@ -5,10 +5,13 @@
 \alias{prepareData,matrix-method}
 \alias{prepareData,DataFrame-method}
 \alias{prepareData,MultiAssayExperiment-method}
+\alias{prepareData,data.frame-method}
 \title{Convert Different Data Classes into DataFrame and Filter Features}
 \usage{
 \S4method{prepareData}{matrix}(measurements, outcome, ...)
 
+\S4method{prepareData}{data.frame}(measurements, outcome, ...)
+
 \S4method{prepareData}{DataFrame}(
   measurements,
   outcome,
... ...
@@ -37,7 +37,7 @@
 \item{topRanked}{A sequence of thresholds of number of the best features to
 use for overlapping.}
 
-\item{comparison}{Default: within. The aspect of the experimental design to
+\item{comparison}{Default: \code{"within"}. The aspect of the experimental design to
 compare. Can be any characteristic that all results share or special value
 "within" to compare between all pairwise iterations of cross-validation.}
 
... ...
@@ -61,7 +61,7 @@ a matrix of pre-calculated metrics, for backwards compatibility.}
 \item{...}{Parameters not used by the \code{ClassifyResult} method that does
 list-packaging but used by the main \code{list} method.}
 
-\item{comparison}{Default: "auto". The aspect of the experimental
+\item{comparison}{Default: \code{"auto"}. The aspect of the experimental
 design to compare. Can be any characteristic that all results share.}
 
 \item{metric}{Default: \code{"auto"}. The name of the
... ...
@@ -37,7 +37,7 @@
 
 \item{...}{Not used by end user.}
 
-\item{comparison}{Default: within. The aspect of the experimental design to
+\item{comparison}{Default: \code{"within"}. The aspect of the experimental design to
 compare. Can be any characteristic that all results share or either one of
 the special values \code{"within"} to compare between all pairwise
 iterations of cross-validation, or \code{"size"}, to draw a bar chart of the