Browse code

Merge branch 'master' of https://github.com/DarioS/ClassifyR

Dario Strbenac authored on 18/10/2022 02:08:33
Showing 8 changed files

... ...
@@ -3,15 +3,15 @@ Type: Package
3 3
 Title: A framework for cross-validated classification problems, with
4 4
        applications to differential variability and differential
5 5
        distribution testing
6
-Version: 3.1.23
7
-Date: 2022-10-14
6
+Version: 3.1.25
7
+Date: 2022-10-18
8 8
 Author: Dario Strbenac, Ellis Patrick, John Ormerod, Graham Mann, Jean Yang
9 9
 Maintainer: Dario Strbenac <dario.strbenac@sydney.edu.au>
10 10
 VignetteBuilder: knitr
11 11
 Encoding: UTF-8
12 12
 biocViews: Classification, Survival
13 13
 Depends: R (>= 4.1.0), generics, methods, S4Vectors, MultiAssayExperiment, BiocParallel, survival
14
-Imports: grid, genefilter, utils, dplyr, tidyr, rlang, randomForest
14
+Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger
15 15
 Suggests: limma, edgeR, car, Rmixmod, ggplot2 (>= 3.0.0), gridExtra (>= 2.0.0), cowplot,
16 16
         BiocStyle, pamr, PoiClaClu, parathyroidSE, knitr, htmltools, gtable,
17 17
         scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC,
... ...
@@ -10,8 +10,8 @@
10 10
 #' @param outcome A vector of class labels of class \code{\link{factor}} of the
11 11
 #' same length as the number of samples in \code{measurements} or a character vector of length 1 containing the
12 12
 #' column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of
13
-#' length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome.
14
-#' @param outcomeColumns If \code{measurements} is a \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.
13
+#' length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome. If \code{measurements} is a
14
+#' \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.
15 15
 #' @param outcomeTrain For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or
16 16
 #' a character string, or vector of such strings, containing column name(s) of column(s)
17 17
 #' containing either classes or time and event information about survival.
... ...
@@ -118,8 +118,6 @@ setMethod("crossValidate", "DataFrame",
118 118
               # Which data-types or data-views are present?
119 119
               assayIDs <- unique(mcols(measurements)$assay)
120 120
               if(is.null(assayIDs)) assayIDs <- 1
121
-              
122
-              checkData(measurements, outcome)
123 121
 
124 122
               # Check that other variables are in the right format and fix
125 123
               nFeatures <- cleanNFeatures(nFeatures = nFeatures,
... ...
@@ -184,17 +182,12 @@ Using an ordinary GLM instead.")
184 182
                                       characteristicsLabel = characteristicsLabel
185 183
                                   )
186 184
                               },
187
-
188 185
                               simplify = FALSE)
189 186
                           },
190
-
191 187
                           simplify = FALSE)
192 188
                       },
193
-
194 189
                       simplify = FALSE)
195
-
196 190
                   result <- unlist(unlist(resClassifier))
197
-
198 191
               }
199 192
 
200 193
               ################################
... ...
@@ -208,7 +201,6 @@ Using an ordinary GLM instead.")
208 201
                   # The below loops over different combinations of assays and merges them together.
209 202
                   # This allows someone to answer which combinations of the assays might be most useful.
210 203
 
211
-
212 204
                   if(!is.list(assayCombinations) && assayCombinations == "all") assayCombinations <- do.call("c", sapply(seq_along(assayIDs), function(nChoose) combn(assayIDs, nChoose, simplify = FALSE)))
213 205
 
214 206
                   result <- sapply(assayCombinations, function(assayIndex){
... ...
@@ -244,7 +236,6 @@ Using an ordinary GLM instead.")
244 236
                       if(length(assayCombinations) == 0) stop("No assayCombinations with \"clinical\" data")
245 237
                   }
246 238
 
247
-
248 239
                   result <- sapply(assayCombinations, function(assayIndex){
249 240
                       CV(measurements = measurements[, mcols(measurements)[["assay"]] %in% assayIndex],
250 241
                          outcome = outcome, assayIDs = assayIndex,
... ...
@@ -306,7 +297,7 @@ Using an ordinary GLM instead.")
306 297
 # One or more omics data sets, possibly with clinical data.
307 298
 setMethod("crossValidate", "MultiAssayExperiment",
308 299
           function(measurements,
309
-                   outcomeColumns, 
300
+                   outcome, 
310 301
                    nFeatures = 20,
311 302
                    selectionMethod = "t-test",
312 303
                    selectionOptimisation = "Resubstitution",
... ...
@@ -319,7 +310,7 @@ setMethod("crossValidate", "MultiAssayExperiment",
319 310
                    nCores = 1,
320 311
                    characteristicsLabel = NULL, ...)
321 312
           {
322
-              measurementsAndOutcome <- prepareData(measurements, outcomeColumns, ...)
313
+              measurementsAndOutcome <- prepareData(measurements, outcome, ...)
323 314
 
324 315
               crossValidate(measurements = measurementsAndOutcome[["measurements"]],
325 316
                             outcome = measurementsAndOutcome[["outcome"]], 
... ...
@@ -558,24 +549,6 @@ generateCrossValParams <- function(nRepeats, nFolds, nCores, selectionOptimisati
558 549
 }
559 550
 ######################################
560 551
 
561
-
562
-
563
-######################################
564
-######################################
565
-checkData <- function(measurements, outcome){
566
-    if(is.null(rownames(measurements)))
567
-        stop("'measurements' DataFrame must have sample identifiers as its row names.")
568
-    if(any(is.na(measurements)))
569
-        stop("Some data elements are missing and classifiers don't work with missing data. Consider imputation or filtering.")
570
-
571
-    # !!!  Need to check mcols has assay NUm
572
-
573
-}
574
-######################################
575
-
576
-
577
-
578
-######################################
579 552
 ######################################
580 553
 #' A function to generate a ModellingParams object
581 554
 #'
... ...
@@ -643,9 +616,10 @@ generateModellingParams <- function(assayIDs,
643 616
     knownClassifiers <- .ClassifyRenvir[["classifyKeywords"]][, "classifier Keyword"]
644 617
     if(!classifier %in% knownClassifiers)
645 618
         stop(paste("Classifier must exactly match of these (be careful of case):", paste(knownClassifiers, collapse = ", ")))
646
-    
619
+
647 620
     classifierParams <- .classifierKeywordToParams(classifier)
648
-    classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
621
+    if(!is.null(classifierParams$trainParams@tuneParams))
622
+      classifierParams$trainParams@tuneParams <- c(classifierParams$trainParams@tuneParams, performanceType = performanceType)
649 623
 
650 624
     selectionMethod <- unlist(selectionMethod)
651 625
 
... ...
@@ -715,6 +689,7 @@ generateMultiviewParams <- function(assayIDs,
715 689
                                           nFeatures = nFeatures,
716 690
                                           selectionMethod = selectionMethod,
717 691
                                           selectionOptimisation = "none",
692
+                                          performanceType = performanceType,
718 693
                                           classifier = classifier,
719 694
                                           multiViewMethod = "none")
720 695
 
... ...
@@ -833,11 +808,6 @@ CV <- function(measurements = NULL,
833 808
                characteristicsLabel = NULL)
834 809
 
835 810
 {
836
-    # Check that data is in the right format
837
-    if(!is.null(measurements))
838
-      checkData(measurements, outcome)
839
-    else
840
-      checkData(x, x)
841 811
     # Check that other variables are in the right format and fix
842 812
     nFeatures <- cleanNFeatures(nFeatures = nFeatures,
843 813
                                 measurements = measurements)
... ...
@@ -1072,9 +1042,9 @@ train.list <- function(x, outcomeTrain, ...)
1072 1042
 #' @rdname crossValidate
1073 1043
 #' @method train MultiAssayExperiment
1074 1044
 #' @export
1075
-train.MultiAssayExperiment <- function(x, outcomeColumns, ...)
1045
+train.MultiAssayExperiment <- function(x, outcome, ...)
1076 1046
           {
1077
-              prepArgs <- list(x, outcomeColumns)
1047
+              prepArgs <- list(x, outcome)
1078 1048
               extraInputs <- list(...)
1079 1049
               prepExtras <- trainExtras <- numeric()
1080 1050
               if(length(extraInputs) > 0)
... ...
@@ -11,6 +11,7 @@ GLMtrainInterface <- function(measurementsTrain, classesTrain, ..., verbose = 3)
11 11
     } else {fitData <- measurementsTrain}
12 12
   glm(class ~ . + 0, family = binomial, data = fitData, ...)
13 13
 }
14
+attr(GLMtrainInterface, "name") <- "GLMtrainInterface"
14 15
 
15 16
 # model is of class glm.
16 17
 GLMpredictInterface <- function(model, measurementsTest, returnType = c("both", "class", "score"),
... ...
@@ -51,11 +51,17 @@ setMethod("prepareData", "matrix",
51 51
 setMethod("prepareData", "DataFrame",
52 52
   function(measurements, outcome, useFeatures = "all", maxMissingProp = 0.0, topNvariance = NULL)
53 53
 {
54
+  if(is.null(rownames(measurements)))
55
+  {
56
+    warning("'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.")
57
+    rownames(measurements) <- paste("Sample", seq_len(nrow(measurements)))
58
+  }      
59
+            
54 60
   if(useFeatures != "all") # Subset to only the desired ones.
55 61
     measurements <- measurements[, useFeatures]
56 62
 
57 63
   # Won't ever be true if input data was MultiAssayExperiment because wideFormat already produces valid names.  
58
-  if(all.equal(colnames(measurements), make.names(colnames(measurements))) != TRUE)
64
+  if(!all(colnames(measurements) == make.names(colnames(measurements))))
59 65
   {
60 66
     warning("Unsafe feature names in input data. Converted into safe names.")
61 67
     mcols(measurements)$feature <- colnames(measurements) # Save the originals.
... ...
@@ -114,7 +120,7 @@ setMethod("prepareData", "DataFrame",
114 120
     else # Three columns. Therefore, counting process data.
115 121
       outcome <- survival::Surv(outcome[, 1], outcome[, 2], outcome[, 3])
116 122
   }
117
-  
123
+
118 124
   # Remove samples with indeterminate outcome.
119 125
   dropSamples <- which(is.na(outcome) | is.null(outcome))
120 126
   if(length(dropSamples) > 0)
... ...
@@ -125,8 +131,9 @@ setMethod("prepareData", "DataFrame",
125 131
   
126 132
   # Remove features with more missingness than allowed.
127 133
   nSamples <- nrow(measurements)
128
-  dropFeatures <- which(apply(measurements, 2, function(featureMeasurements) sum(is.na(featureMeasurements)))
129
-                        / nrow(measurements) > maxMissingProp)
134
+  measurementsMatrix <- as.matrix(measurements) # For speed of calculation.
135
+  dropFeatures <- which(apply(measurementsMatrix, 2, function(featureMeasurements) sum(is.na(featureMeasurements)))
136
+                        / nrow(measurementsMatrix) > maxMissingProp)
130 137
   if(length(dropFeatures) > 0)
131 138
     measurements <- measurements[, -dropFeatures]
132 139
   
... ...
@@ -158,11 +165,13 @@ setMethod("prepareData", "MultiAssayExperiment",
158 165
   if(!all(useFeatures[, "assay"] %in% c(names(measurements), "clinical")))
159 166
     stop("Some assay names in first column of 'useFeatures' are not assay names in 'measurements' or \"clinical\".")
160 167
 
161
-  clinicalColumns <- colnames(MultiAssayExperiment::colData(measurements))
168
+  clinicalColumnsDataset <- colnames(MultiAssayExperiment::colData(measurements))
162 169
   if("clinical" %in% useFeatures[, "assay"])
163 170
   {
164 171
     clinicalRows <- useFeatures[, "assay"] == "clinical"      
165 172
     clinicalColumns <- useFeatures[clinicalRows, "feature"]
173
+    if(length(clinicalColumns) == 1 && clinicalColumns == "all")
174
+      clinicalColumns <- setdiff(clinicalColumnsDataset, outcomeColumns)
166 175
     useFeatures <- useFeatures[!clinicalRows, ]
167 176
   } else {
168 177
     clinicalColumns <- NULL
... ...
@@ -171,7 +180,6 @@ setMethod("prepareData", "MultiAssayExperiment",
171 180
   if(nrow(useFeatures) > 0)
172 181
   {
173 182
     measurements <- measurements[, , unique(useFeatures[, "assay"])]
174
-  
175 183
     # Get all desired measurements tables and clinical columns (other than the columns representing outcome).
176 184
     # These form the independent variables to be used for making predictions with.
177 185
     # Variable names will have names like RNA_BRAF for traceability.
... ...
@@ -13,7 +13,10 @@
13 13
 #' are features.
14 14
 #' @param outcome Either a factor vector of classes, a \code{\link{Surv}} object, or
15 15
 #' a character string, or vector of such strings, containing column name(s) of column(s)
16
-#' containing either classes or time and event information about survival.
16
+#' containing either classes or time and event information about survival. If
17
+#' \code{measurements} is a \code{MultiAssayExperiment}, the names of the column (class) or
18
+#' columns (survival) in the table extracted by \code{colData(data)} that contain(s) the samples'
19
+#' outcome to use for prediction.
17 20
 #' @param crossValParams An object of class \code{\link{CrossValParams}},
18 21
 #' specifying the kind of cross-validation to be done.
19 22
 #' @param modellingParams An object of class \code{\link{ModellingParams}},
... ...
@@ -26,9 +29,6 @@
26 29
 #' package.  Transformation, selection and prediction functions provided by
27 30
 #' this package will cause the characteristics to be automatically determined
28 31
 #' and this can be left blank.
29
-#' @param outcomeColumns If \code{measurementsTrain} is a \code{MultiAssayExperiment}, the
30
-#' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
31
-#' that contain(s)s the samples' outcome to use for prediction.
32 32
 #' @param ... Variables not used by the \code{matrix} nor the \code{MultiAssayExperiment} method which
33 33
 #' are passed into and used by the \code{DataFrame} method or passed onwards to \code{\link{prepareData}}.
34 34
 #' @param verbose Default: 1. A number between 0 and 3 for the amount of
... ...
@@ -70,9 +70,11 @@ setMethod("runTests", c("matrix"), function(measurements, outcome, ...) # Matrix
70 70
 setMethod("runTests", "DataFrame", function(measurements, outcome, crossValParams = CrossValParams(), modellingParams = ModellingParams(),
71 71
            characteristics = S4Vectors::DataFrame(), ..., verbose = 1)
72 72
 {
73
-  # Get out the outcome if inside of data table.           
74 73
   if(is.null(rownames(measurements)))
75
-    stop("'measurements' DataFrame must have sample identifiers as its row names.")
74
+  {
75
+    warning("'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.")
76
+    rownames(measurements) <- paste("Sample", seq_len(nrow(measurements)))
77
+  }
76 78
   
77 79
   if(any(is.na(measurements)))
78 80
     stop("Some data elements are missing and classifiers don't work with missing data. Consider imputation or filtering.")            
... ...
@@ -93,7 +95,7 @@ input data. Autmomatically reducing to smaller number.")
93 95
   
94 96
   # Element names of the list returned by runTest, in order.
95 97
   resultTypes <- c("ranked", "selected", "models", "testSet", "predictions", "tune", "importance")
96
-  
98
+
97 99
   # Create all partitions of training and testing sets.
98 100
   samplesSplits <- .samplesSplits(crossValParams, outcome)
99 101
   splitsTestInfo <- .splitsTestInfo(crossValParams, samplesSplits)
... ...
@@ -112,6 +114,7 @@ input data. Autmomatically reducing to smaller number.")
112 114
       message("Processing sample set ", setNumber, '.')
113 115
     
114 116
     # crossValParams is needed at least for nested feature tuning.
117
+    
115 118
     runTest(measurements[trainingSamples, , drop = FALSE], outcome[trainingSamples],
116 119
             measurements[testSamples, , drop = FALSE], outcome[testSamples],
117 120
             crossValParams, modellingParams, characteristics, verbose,
... ...
@@ -180,9 +183,9 @@ input data. Autmomatically reducing to smaller number.")
180 183
 #' @import MultiAssayExperiment methods
181 184
 #' @export
182 185
 setMethod("runTests", c("MultiAssayExperiment"),
183
-          function(measurements, outcomeColumns, ...)
186
+          function(measurements, outcome, ...)
184 187
 {
185
-  prepArgs <- list(measurements, outcomeColumns)              
188
+  prepArgs <- list(measurements, outcome)              
186 189
   extraInputs <- list(...)
187 190
   prepExtras <- numeric()
188 191
   if(length(extraInputs) > 0)
... ...
@@ -139,6 +139,12 @@
139 139
         return(list(NULL, rankings[[1]], NULL))
140 140
     
141 141
     tuneParamsTrain <- list(topN = topNfeatures)
142
+    performanceIndex <- match("performanceType", names(modellingParams@trainParams@tuneParams))
143
+    if(!is.na(performanceIndex))
144
+    {
145
+      performanceType <- modellingParams@trainParams@tuneParams[["performanceType"]]
146
+      modellingParams@trainParams@tuneParams <- modellingParams@trainParams@tuneParams[-performanceIndex]
147
+    }
142 148
     tuneParamsTrain <- append(tuneParamsTrain, modellingParams@trainParams@tuneParams)
143 149
     tuneCombosTrain <- expand.grid(tuneParamsTrain, stringsAsFactors = FALSE)  
144 150
     modellingParams@trainParams@tuneParams <- NULL
... ...
@@ -37,7 +37,7 @@ crossValidate(measurements, outcome, ...)
37 37
 
38 38
 \S4method{crossValidate}{MultiAssayExperiment}(
39 39
   measurements,
40
-  outcomeColumns,
40
+  outcome,
41 41
   nFeatures = 20,
42 42
   selectionMethod = "t-test",
43 43
   selectionOptimisation = "Resubstitution",
... ...
@@ -119,7 +119,7 @@ crossValidate(measurements, outcome, ...)
119 119
 
120 120
 \method{train}{list}(x, outcomeTrain, ...)
121 121
 
122
-\method{train}{MultiAssayExperiment}(x, outcomeColumns, ...)
122
+\method{train}{MultiAssayExperiment}(x, outcome, ...)
123 123
 
124 124
 \method{predict}{trainedByClassifyR}(object, newData, ...)
125 125
 }
... ...
@@ -130,7 +130,8 @@ or a list of these objects containing the data.}
130 130
 \item{outcome}{A vector of class labels of class \code{\link{factor}} of the
131 131
 same length as the number of samples in \code{measurements} or a character vector of length 1 containing the
132 132
 column name in \code{measurements} if it is a \code{\link{DataFrame}}. Or a \code{\link{Surv}} object or a character vector of
133
-length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome.}
133
+length 2 or 3 specifying the time and event columns in \code{measurements} for survival outcome. If \code{measurements} is a
134
+\code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.}
134 135
 
135 136
 \item{...}{Parameters passed into \code{\link{prepareData}} which control subsetting and filtering of input data.}
136 137
 
... ...
@@ -162,8 +163,6 @@ with each element being a vector of assays to combine. Special value \code{"all"
162 163
 
163 164
 \item{characteristicsLabel}{A character specifying an additional label for the cross-validation run.}
164 165
 
165
-\item{outcomeColumns}{If \code{measurements} is a \code{\link{MultiAssayExperiment}}, the column name(s) in \code{colData(measurements)} representing the outcome.}
166
-
167 166
 \item{x}{Same as \code{measurements} but only training samples.}
168 167
 
169 168
 \item{outcomeTrain}{For the \code{train} function, either a factor vector of classes, a \code{\link{Surv}} object, or
... ...
@@ -19,7 +19,7 @@
19 19
   verbose = 1
20 20
 )
21 21
 
22
-\S4method{runTests}{MultiAssayExperiment}(measurements, outcomeColumns, ...)
22
+\S4method{runTests}{MultiAssayExperiment}(measurements, outcome, ...)
23 23
 }
24 24
 \arguments{
25 25
 \item{measurements}{Either a \code{\link{matrix}}, \code{\link{DataFrame}}
... ...
@@ -32,7 +32,10 @@ are passed into and used by the \code{DataFrame} method or passed onwards to \co
32 32
 
33 33
 \item{outcome}{Either a factor vector of classes, a \code{\link{Surv}} object, or
34 34
 a character string, or vector of such strings, containing column name(s) of column(s)
35
-containing either classes or time and event information about survival.}
35
+containing either classes or time and event information about survival. If
36
+\code{measurements} is a \code{MultiAssayExperiment}, the names of the column (class) or
37
+columns (survival) in the table extracted by \code{colData(data)} that contain(s) the samples'
38
+outcome to use for prediction.}
36 39
 
37 40
 \item{crossValParams}{An object of class \code{\link{CrossValParams}},
38 41
 specifying the kind of cross-validation to be done.}
... ...
@@ -52,10 +55,6 @@ and this can be left blank.}
52 55
 \item{verbose}{Default: 1. A number between 0 and 3 for the amount of
53 56
 progress messages to give.  A higher number will produce more messages as
54 57
 more lower-level functions print messages.}
55
-
56
-\item{outcomeColumns}{If \code{measurementsTrain} is a \code{MultiAssayExperiment}, the
57
-names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
58
-that contain(s)s the samples' outcome to use for prediction.}
59 58
 }
60 59
 \value{
61 60
 An object of class \code{\link{ClassifyResult}}.