Browse code

- calcPerformance gains a Sample C-index option to calculate a C-index for each individual. - .splitDataAndOutcomes fix for survival object creation.

Dario Strbenac authored on 15/07/2022 06:00:02
Showing 5 changed files

... ...
@@ -170,6 +170,8 @@ importFrom(S4Vectors,do.call)
170 170
 importFrom(S4Vectors,mcols)
171 171
 importFrom(dplyr,mutate)
172 172
 importFrom(dplyr,n)
173
+importFrom(genefilter,rowFtests)
174
+importFrom(genefilter,rowttests)
173 175
 importFrom(randomForest,importance)
174 176
 importFrom(randomForest,varUsed)
175 177
 importFrom(rlang,sym)
... ...
@@ -57,6 +57,7 @@
57 57
 #' there are two classes.}
58 58
 #' \item{\code{"AUC"}: Area Under the Curve. An area ranging from 0 to 1, under the ROC.}
59 59
 #' \item{\code{"C-index"}: For survival data, the concordance index, for models which produce risk scores. Ranges from 0 to 1.}
60
+#' \item{\code{"Sample C-index"}: Per-individual C-index.}
60 61
 #' }
61 62
 #' 
62 63
 #' @param actualOutcomes A factor vector or survival information specifying each sample's known outcome.
... ...
@@ -128,7 +129,7 @@ setMethod("calcCVperformance", "ClassifyResult",
128 129
                                                "Micro Precision", "Micro Recall",
129 130
                                                "Micro F1", "Macro Precision",
130 131
                                                "Macro Recall", "Macro F1", "Matthews Correlation Coefficient", "AUC", 
131
-                                               "C-index"))
132
+                                               "C-index", "Sample C-index"))
132 133
 {
133 134
   performanceType <- match.arg(performanceType)
134 135
   actualOutcomes <- actualOutcomes(result) # Extract the known outcomes of all samples.
... ...
@@ -143,7 +144,7 @@ setMethod("calcCVperformance", "ClassifyResult",
143 144
   }
144 145
   
145 146
   ### Performance for survival data
146
-  if(performanceType == "C-index") {
147
+  if(performanceType %in% c("C-index", "Sample C-index")) {
147 148
     samples <- factor(result@predictions[, "sample"], levels = sampleNames(result))
148 149
     performance <- .calcPerformance(actualOutcomes = actualOutcomes[match(result@predictions[, "sample"], sampleNames(result))],
149 150
                                     predictedOutcomes = result@predictions[, "risk"], 
... ...
@@ -180,30 +181,63 @@ setMethod("calcCVperformance", "ClassifyResult",
180 181
 {
181 182
   if(performanceType %in% c("Sample Error", "Sample Accuracy"))
182 183
   {
183
-    resultTable <- data.frame(sample = samples,
184
-                              actual = actualOutcomes,
185
-                              predicted = predictedOutcomes)
186
-    allIDs <- levels(resultTable[, "sample"])
187 184
     
188
-    sampleMetricValues <- sapply(allIDs, function(sampleID)
185
+    sampleMetricValues <- sapply(levels(samples), function(sampleID)
189 186
     {
190
-      sampleResult <- subset(resultTable, sample == sampleID)
191
-      if(nrow(sampleResult) == 0)
192
-        return(NA)
187
+      consider <- which(samples == sampleID)
193 188
       if(performanceType == "Sample Error")
194
-        sum(as.character(sampleResult[, "predicted"]) != as.character(sampleResult[, "actual"]))
189
+        sum(predictedOutcomes[consider] != as.character(actualOutcomes[consider]))
195 190
       else
196
-        sum(as.character(sampleResult[, "predicted"]) == as.character(sampleResult[, "actual"]))
191
+        sum(predictedOutcomes[consider] == as.character(actualOutcomes[consider]))
197 192
     })
198
-    performanceValues <- as.numeric(sampleMetricValues / table(factor(resultTable[, "sample"], levels = allIDs)))
199
-    names(performanceValues) <- allIDs
193
+    performanceValues <- as.numeric(sampleMetricValues / table(samples))
194
+    names(performanceValues) <- levels(samples)
200 195
     return(list(name = performanceType, values = performanceValues))
201 196
   }
202
-  
197
+    
203 198
   if(!is.null(grouping))
204 199
   {
205 200
     actualOutcomes <- split(actualOutcomes, grouping)
206 201
     predictedOutcomes <- split(predictedOutcomes, grouping)
202
+    allSamples <- levels(samples)
203
+    samples <- split(samples, grouping)
204
+  }
205
+    
206
+  if(performanceType == "Sample C-index")
207
+  {
208
+    performanceValues <- do.call(rbind, mapply(function(iterationSurv, iterationPredictions, iterationSamples)
209
+    {
210
+      do.call(rbind, lapply(allSamples, function(sampleID)
211
+      {
212
+        sampleIndex <- which(iterationSamples == sampleID)
213
+        otherIndices <- setdiff(seq_along(allSamples), sampleIndex)
214
+        concordants <- discordants <- 0
215
+        iterationSurv <- as.matrix(iterationSurv)
216
+        for(compareIndex in otherIndices)
217
+        {
218
+          if(iterationSurv[sampleIndex, "time"] < iterationSurv[compareIndex, "time"] && iterationPredictions[sampleIndex] > iterationPredictions[compareIndex] && iterationSurv[sampleIndex, "status"] == 1)
219
+          { # Reference sample has shorter time, it is not censored, greater risk. Concordant.
220
+            concordants <- concordants + 1
221
+          } else if(iterationSurv[sampleIndex, "time"] > iterationSurv[compareIndex, "time"] && iterationPredictions[sampleIndex] < iterationPredictions[compareIndex] && iterationSurv[compareIndex, "status"] == 1)
222
+          { # Reference sample has longer time, the comparison sample is not censored, lower risk. Concordant.
223
+            concordants <- concordants + 1
224
+          } else if(iterationSurv[sampleIndex, "time"] < iterationSurv[compareIndex, "time"] && iterationPredictions[sampleIndex] < iterationPredictions[compareIndex] && iterationSurv[sampleIndex, "status"] == 1)
225
+          { # Reference sample has shorter time, it is not censored, but lower risk than comparison sample. Discordant.
226
+            discordants <- discordants + 1
227
+          } else if(iterationSurv[sampleIndex, "time"] > iterationSurv[compareIndex, "time"] && iterationPredictions[sampleIndex] > iterationPredictions[compareIndex] && iterationSurv[compareIndex, "status"] == 1)
228
+          { # Reference sample has longer time, the comparison sample is not censored, but higher risk than comparison sample. Discordant.
229
+            discordants <- discordants + 1
230
+          }
231
+        }
232
+        data.frame(sample = sampleID, concordant = concordants, discordant = discordants)
233
+      }))
234
+    }, actualOutcomes, predictedOutcomes, samples, SIMPLIFY = FALSE))
235
+
236
+    sampleValues <- by(performanceValues[, c("concordant", "discordant")], performanceValues[, "sample"], colSums)
237
+    Cindex <- round(sapply(sampleValues, '[', 1) / (sapply(sampleValues, '[', 1) + sapply(sampleValues, '[', 2)), 2)
238
+    names(Cindex) <- names(sampleValues)
239
+    Cindex <- Cindex[!is.nan(Cindex)] # The individual with the smallest censored time will always have no useful inequalities.
240
+    return(list(name = performanceType, values = Cindex))
207 241
   }
208 242
   
209 243
   if(!is(actualOutcomes, "list")) actualOutcomes <- list(actualOutcomes)
... ...
@@ -49,6 +49,7 @@
49 49
 #'     head(ranked)
50 50
 #' @usage NULL
51 51
 #' @export
52
+#' @importFrom genefilter rowttests rowFtests
52 53
 setGeneric("differentMeansRanking", function(measurementsTrain, ...)
53 54
            standardGeneric("differentMeansRanking"))
54 55
 
... ...
@@ -40,25 +40,19 @@
40 40
     outcomes <- factor(outcomes)
41 41
   
42 42
   # Outcomes has columns, so it is tabular. It is inferred to represent survival data.
43
-  if(!is.null(ncol(outcomes)))
43
+  if(!is.null(ncol(outcomes)) && ncol(outcomes) %in% 2:3)
44 44
   {
45 45
     # Assume that event status is in the last column (second for two columns, third for three columns)
46
-    if(!is.null(dim(outcomes)) && length(dim(data)) == 2 && ncol(outcomes) %in% 2:3)
47
-    {
48
-      numberEventTypes <- length(unique(outcomes[, ncol(outcomes)]))
49
-      # Could be one or two kinds of events. All events might be uncensored or censored
50
-      # in a rare but not impossible scenario.
51
-      if(numberEventTypes > 2)
52
-        stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.")
53
-      
46
+    numberEventTypes <- length(unique(outcomes[, ncol(outcomes)]))
47
+    # Could be one or two kinds of events. All events might be uncensored or censored
48
+    # in a rare but not impossible scenario.
49
+    if(numberEventTypes > 2)
50
+      stop("Number of distinct event types in the last column exceeds 2 but must be 1 or 2.")
54 51
       
55
-      if(ncol(outcomes) == 2) # Typical, right-censored survival data.
56
-      {
57
-        outcomes <- survival::Surv(outcomes[, 1], outcomes[, 2])
58
-      } else { # Three columns. Therefore, counting process data.
59
-        outcomes <- survival::Surv(outcomes[, 1], outcomes[, 2], outcomes[, 3])
60
-      }
61
-    }
52
+    if(ncol(outcomes) == 2) # Typical, right-censored survival data.
53
+      outcomes <- survival::Surv(outcomes[, 1], outcomes[, 2])
54
+    else # Three columns. Therefore, counting process data.
55
+      outcomes <- survival::Surv(outcomes[, 1], outcomes[, 2], outcomes[, 3])
62 56
   }
63 57
   
64 58
   if(!is.null(restrict))
... ...
@@ -29,7 +29,7 @@ Pair of Factor Vectors}
29 29
   performanceType = c("Balanced Error", "Balanced Accuracy", "Error", "Accuracy",
30 30
     "Sample Error", "Sample Accuracy", "Micro Precision", "Micro Recall", "Micro F1",
31 31
     "Macro Precision", "Macro Recall", "Macro F1", "Matthews Correlation Coefficient",
32
-    "AUC", "C-index")
32
+    "AUC", "C-index", "Sample C-index")
33 33
 )
34 34
 }
35 35
 \arguments{
... ...
@@ -64,6 +64,7 @@ between -1 and 1 indicating how concordant the predicted classes are to the actu
64 64
 there are two classes.}
65 65
 \item{\code{"AUC"}: Area Under the Curve. An area ranging from 0 to 1, under the ROC.}
66 66
 \item{\code{"C-index"}: For survival data, the concordance index, for models which produce risk scores. Ranges from 0 to 1.}
67
+\item{\code{"Sample C-index"}: Per-individual C-index.}
67 68
 }}
68 69
 
69 70
 \item{result}{An object of class \code{\link{ClassifyResult}}.}