Browse code

Handle case with insufficient data

Updates error handling for case where
insufficient data is provided in the 5 component
(or 3 component) case. Updates vignette
accordingly. Also makes some changes to
other internal functions like greatly simplifying
.getNames()

Max McGrath authored on 01/09/2021 17:03:57
Showing6 changed files

... ...
@@ -16,11 +16,15 @@ Description: Discordant is a method to determine differential
16 16
 Encoding: UTF-8
17 17
 biocViews: ImmunoOncology, BiologicalQuestion, StatisticalMethod, 
18 18
     mRNAMicroarray, Microarray, Genetics, RNASeq
19
-Suggests: BiocStyle, knitr, testthat
20
-Imports: Rcpp, Biobase, stats, biwt, gtools, MASS, tools, dplyr, methods
19
+Suggests: 
20
+    BiocStyle,
21
+    knitr,
22
+    testthat (>= 3.0.0)
23
+Imports: Rcpp, Biobase, stats, biwt, gtools, MASS, tools, dplyr, methods, utils
21 24
 License: GPL (>= 2)
22 25
 URL: https://github.com/siskac/discordant
23 26
 NeedsCompilation: yes
24 27
 LinkingTo: Rcpp
25 28
 VignetteBuilder: knitr
26 29
 RoxygenNote: 7.1.1
30
+Config/testthat/edition: 3
... ...
@@ -13,4 +13,5 @@ import(tools)
13 13
 importFrom(Rcpp,sourceCpp)
14 14
 importFrom(dplyr,case_when)
15 15
 importFrom(methods,is)
16
+importFrom(utils,combn)
16 17
 useDynLib(discordant, .registration = TRUE)
... ...
@@ -56,8 +56,10 @@ createVectors <- function(x, y = NULL, groups,
56 56
     
57 57
     if(is.null(y)) {
58 58
         data <- exprs(x)
59
+        vector_names <- .getNames(exprs(x), y)
59 60
     } else {
60 61
         data <- rbind(exprs(x), exprs(y))
62
+        vector_names <- .getNames(exprs(x), exprs(y))
61 63
         featureSize <- dim(exprs(x))[1]
62 64
     }
63 65
     
... ...
@@ -88,7 +90,6 @@ createVectors <- function(x, y = NULL, groups,
88 90
         statVector2 <- as.vector(statMatrix2)
89 91
     }
90 92
     
91
-    vector_names <- .getNames(exprs(x), y)
92 93
     names(statVector1) <- vector_names
93 94
     names(statVector2) <- vector_names
94 95
     return(list(v1 = statVector1, v2 = statVector2))
... ...
@@ -126,9 +126,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
126 126
     if (subsampling) {
127 127
         subSize <- min(nrow(x), nrow(y))
128 128
         total_mu <- total_sigma <- total_nu <- 
129
-          total_tau <- total_pi <- rep(0, components) 
130
-        # NOTE: changed this from default 3 to components... need to figure out 
131
-        #   if that's right
129
+          total_tau <- total_pi <- rep(0, components)
132 130
         
133 131
         for(i in 1:iter) {
134 132
             # make sure pairs are independent
... ...
@@ -221,6 +219,7 @@ em.normal.partial.concordant <- function(data, class, components) {
221 219
 
222 220
     zx <- .unmap(class[,1], components = components)
223 221
     zy <- .unmap(class[,2], components = components)
222
+    .checkForMissingComponents(zx, zy)
224 223
     zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
225 224
 
226 225
     pi <- double(g*g)
... ...
@@ -277,9 +276,9 @@ em.normal.partial.concordant <- function(data, class, components) {
277 276
   }
278 277
   
279 278
   # Need to double check if this is true w/ Katerina
280
-  if (is.null(y) && subsampling) {
281
-    stop("y cannot be NULL if subsampling is TRUE")
282
-  }
279
+  # if (is.null(y) && subsampling) {
280
+  #   stop("y cannot be NULL if subsampling is TRUE")
281
+  # }
283 282
   
284 283
   if (!(components %in% c(3, 5))) {
285 284
     stop ("components must be equal to 3 or 5")
... ...
@@ -289,4 +288,14 @@ em.normal.partial.concordant <- function(data, class, components) {
289 288
                     range(v2)[1] < -1 || range(v2)[2] > 1)) {
290 289
     stop ("correlation vectors have values less than -1 and/or greater than 1.")
291 290
   }
291
+}
292
+
293
+.checkForMissingComponents <- function(zx, zy) {
294
+  sumZx <- colSums(zx)
295
+  sumZy <- colSums(zy)
296
+  if(any(c(sumZx, sumZy) == 0)) {
297
+    stop("Insufficient data for component estimation. Increase number of 
298
+         features or reduce number of components used. If subsampling=TRUE,
299
+         consider increasing subSize as well.")
300
+  }
292 301
 }
293 302
\ No newline at end of file
... ...
@@ -36,8 +36,8 @@ fishersTrans <- function(rho) {
36 36
         return( c(zx[k,] %o% zy[k,]) )
37 37
     }
38 38
     
39
-    zx <- unmap(class[,1], components = components)
40
-    zy <- unmap(class[,2], components = components)
39
+    zx <- .unmap(class[,1], components = components)
40
+    zy <- .unmap(class[,2], components = components)
41 41
     zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
42 42
     
43 43
     results <- subsampling_cpp(as.double(pdata[,1]), as.double(pdata[,2]), 
... ...
@@ -66,33 +66,15 @@ fishersTrans <- function(rho) {
66 66
     return(z)
67 67
 }
68 68
 
69
+#' @importFrom utils combn
69 70
 .getNames <- function(x, y = NULL) {
70
-    if(is.null(y) == FALSE) {
71
-        y <- exprs(y)
72
-        namesMatrix <- NULL
73
-        for(i in 1:nrow(x)) {
74
-            tempMatrix <- cbind(rep(rownames(x)[i], nrow(y)), rownames(y))
75
-            namesMatrix <- rbind(namesMatrix, tempMatrix)
76
-        }
71
+    if (is.null(y)) {
72
+        vector_names <- paste0(combn(rownames(x), 2)[1, ], "_", 
73
+                               combn(rownames(x), 2)[2, ])
77 74
     } else {
78
-        temp <- matrix(NA,nrow = nrow(x), ncol = nrow(x))
79
-        diag <- lower.tri(temp, diag = FALSE)
80
-        temp[diag] <- rep(1, sum(diag == TRUE))
81
-        
82
-        namesMatrix <- NULL
83
-        
84
-        for(i in 1:dim(temp)[1]) {
85
-            outputCol <- temp[,i]
86
-            index <- which(is.na(outputCol) == FALSE)
87
-            if(length(index) > 0) {
88
-                tempMatrix <- cbind(rep(rownames(x)[i],length(index)), 
89
-                                    rownames(x)[index])
90
-                namesMatrix <- rbind(namesMatrix, tempMatrix)
91
-            }
92
-        }
75
+        name_combns <- expand.grid(rownames(y), rownames(x))
76
+        vector_names <- paste0(name_combns[[2]], "_", name_combns[[1]])
93 77
     }
94 78
     
95
-    vector_names <- apply(namesMatrix, 1, function(k) paste(k[1],"_",k[2],
96
-                                                            sep = ""))
97 79
     return(vector_names)
98 80
 }
99 81
\ No newline at end of file
... ...
@@ -1,6 +1,6 @@
1 1
 ---
2 2
 title: "The discordant R Package: A Novel Approach to Differential Correlation"
3
-shorttitle: "Discordant"
3
+shorttitle: "discordant"
4 4
 author:
5 5
   - name: Charlotte Siska
6 6
     email: charlotte.siska@ucdenver.edu
... ...
@@ -84,14 +84,17 @@ are considered *differentially* correlated, as opposed to when correlation
84 84
 coefficients are in the *same* component then they are *equivalently* 
85 85
 correlated.
86 86
 
87
-  | 0 | - | +
88
-0 | 1 | 2 | 3
89
-- | 4 | 5 | 6
90
-+ | 7 | 8 | 9
91
-: (\#tab:table) Class Matrix for Three Component Mixture Model.
87
+\[
88
+\begin{array}{c|c c c}
89
+  \text{} & \text{0} & \text{-}  & \text{+}\\ 
90
+\hline
91
+   0 & 1 & 2  & 3 \\
92
+   - & 4 & 5  & 6 \\
93
+   + & 7 & 8  & 9 \\ 
94
+\end{array}
95
+\]
92 96
 
93
-The class matrix (Table 1) contains the classes that represent all possible 
97
+The class matrix (above) contains the classes that represent all possible 
94 98
 paired-correlation scenarios. These scenarios are based off the components in 
95 99
 the mixture models. Molecular features that have correlation coefficients in 
96 100
 different components are considered differentially correlated, as opposed to 
... ...
@@ -229,7 +232,7 @@ whether one or two matrices are parameters for this function. `createVectors()`
229 232
 is run as follows:
230 233
 
231 234
 ```{r}
232
-groups <- c(rep(1,10), rep(2,10))
235
+groups <- c(rep(1,10), rep(2,20))
233 236
 
234 237
 # Within -omics
235 238
 wthn_vectors <- createVectors(x = TCGA_GBM_transcript_microarray, 
... ...
@@ -418,58 +421,61 @@ data has a greater feature size than the microarray data. A dataset with small
418 421
 feature size will cause a segmentation fault with subsampling because not enough
419 422
 feature pairs are being used to estimate parameters in the mixture model.
420 423
 
421
-\begin{table}[h]
422
-\begin{center}
423
-\begin{tabular}{ c |  c c c c c}  
424
-  & 0 & - & -- & + & ++ \\
425
-\hline
426
-0 & 1 & 2 & 3 & 4 & 5 \\
427
-- & 6 & 7 & 8 & 9 & 10  \\  
428
-+ & 16 & 17 & 18 & 19 & 20 \\
429
-++ & 21 & 22 & 23 & 24 & 25
430
-\end{tabular}
431
-\caption{Class Matrix for Five Component Mixture Model}
432
-\end{center}
433
-\end{table}
434
-
435
-## Number of Components
424
+## Five Components
436 425
 
437 426
 We also provide the option to increase component size from three to five in the 
438 427
 mixture model. The number of classes in the class matrix increases, as seen in 
439
-Table 2. Incorporating the extra components means that it is possible to 
428
+the table below. Incorporating the extra components means that it is possible to
440 429
 identify elevated differential correlation, which is when there are associations
441
-in both groups in the same direction but one is more extreme. Using this options
430
+in both groups in the same direction but one is more extreme. Using this option
442 431
 introduces more parameters, which does have an effect on run-time. We also found
443 432
 that using the five mixture component mixture model reduces performance compared
444 433
 to the three component mixture model(@siska2). However, the option is 
445 434
 available if users wish to explore more types of differential correlation.
446 435
 
436
+\[
437
+\begin{array}{c|c c c c c}
438
+  \text{} & \text{0} & \text{-}  & \text{--} & \text{+} & \text{++} \\ 
439
+\hline
440
+  0 & 1 & 2 & 3 & 4 & 5 \\
441
+  - & 6 & 7 & 8 & 9 & 10  \\  
442
+  -- & 11 & 12 & 13 & 14 & 15 \\
443
+  + & 16 & 17 & 18 & 19 & 20 \\
444
+  ++ & 21 & 22 & 23 & 24 & 25
445
+\end{array}
446
+\]
447
+
447 448
 By default, `discordantRun()` uses a three component mixture model, but this may
448 449
 be changed to a five component mixture model by setting the argument
449
-`components = 5`.
450
-
451
-# Microarray Example
450
+`components = 5`. A greater amount of data (specifically a greater number of 
451
+features) is necessary to accurately estimate 5 components compared to 3.
452
+If an insufficient amount of data is used, `discordantRun()` will throw an error
453
+suggesting the user increase the number of features or reduce the chosen
454
+number of components. The data used for the within -omics analysis above does
455
+not have enough features to estimate 5 components, so an error is thrown below.
456
+
457
+```{r, error = TRUE}
458
+# Within -omics
459
+wthn_result <- discordantRun(v1 = wthn_vectors$v1, 
460
+                             v2 = wthn_vectors$v2, 
461
+                             x = TCGA_GBM_transcript_microarray,
462
+                             components = 5)
463
+```
452 464
 
453
-__This section is just a repeat of above section's code__ *Need to figure
454
-out additional example, ideall involving subsampling or 5 components or both*
465
+However, the between -omics analysis above is a suitable candidate for further
466
+analysis using five components, so an example of such an analysis is provided
467
+below.
455 468
 
456 469
 ```{r}
457
-data(TCGA_GBM_miRNA_microarray)
458
-data(TCGA_GBM_transcript_microarray)
459
-groups <- c(rep(1,10), rep(2,10))
460
-
461
-#Within -Omics
462
-wthn_vectors <- createVectors(TCGA_GBM_transcript_microarray, groups = groups)
463
-wthn_result <- discordantRun(wthn_vectors$v1, wthn_vectors$v2, 
464
-                             TCGA_GBM_transcript_microarray)
465
-
466
-#Between -Omics
467
-btwn_vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
468
-                              TCGA_GBM_transcript_microarray, groups = groups)
469
-btwn_result <- discordantRun(btwn_vectors$v1, btwn_vectors$v2, 
470
-                             TCGA_GBM_miRNA_microarray, 
471
-                             TCGA_GBM_transcript_microarray)
470
+# Between -omics
471
+btwn_result <- discordantRun(v1 = btwn_vectors$v1, 
472
+                             v2 = btwn_vectors$v2, 
473
+                             x = TCGA_GBM_miRNA_microarray, 
474
+                             y = TCGA_GBM_transcript_microarray,
475
+                             components = 5)
476
+
477
+# Between -omics
478
+round(head(btwn_result$probMatrix), 2)
472 479
 ```
473 480
 
474 481
 # Session Info