Browse code

Refactors discordantRun()

Adds .assignClass() to reduce repeated code.
Reorders if statements for clarity. Removes
unnecessary EM function args.

Max McGrath authored on 23/07/2021 17:21:57
Showing2 changed files

... ...
@@ -11,4 +11,5 @@ import(gtools)
11 11
 import(stats)
12 12
 import(tools)
13 13
 importFrom(Rcpp,sourceCpp)
14
+importFrom(dplyr,case_when)
14 15
 useDynLib(discordant, .registration = TRUE)
... ...
@@ -1,6 +1,6 @@
1 1
 # Applied from Lai et al 2007 Bioinformatics.
2 2
 #
3
-# Code written by Charlotte Siska
3
+# Code written by Charlotte Siska and Max McGrath
4 4
 # R functions fishersTrans, createVectors, discordantRun and makeTable
5 5
 # Email: charlotte.siska@ucdenver.edu
6 6
 #
... ...
@@ -12,163 +12,124 @@
12 12
 # R function for SparCC, adapted from https://bitbucket.org/yonatanf/sparcc
13 13
 # Email: hyfang@pku.edu.cn
14 14
 
15
+#' @export
15 16
 discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
16 17
                           subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
17 18
                           components = 3) {
18 19
   
19
-  .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
20
-                         components)
21
-  
22
-  if (transform) {
23
-    v1 <- fishersTrans(v1)
24
-    v2 <- fishersTrans(v2)
25
-  }
26
-  
27
-  x <- exprs(x)
28
-  if(is.null(y) == FALSE) { y <- exprs(y) }
29
-  featureSize = dim(x)[1]
30
-  
31
-  pdata <- cbind(v1, v2)
32
-  
33
-  param1 <- sd(v1)
34
-  param2 <- sd(v2)
35
-  
36
-  if(components == 3) {
37
-    class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
38
-    class[pdata[,1]>0+param1,1] <- 2
39
-    class[pdata[,1]<0-param1,1] <- 1
40
-    class[pdata[,2]>0+param2,2] <- 2
41
-    class[pdata[,2]<0-param2,2] <- 1
42
-    discordClass <- c(2,3,4,6,7,8)
43
-  }
44
-  
45
-  if(components == 5) {
46
-    class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
47
-    class[pdata[,1]>0+param1,1] <- 2
48
-    class[pdata[,1]<0-param1,1] <- 1
49
-    class[pdata[,1]>0+(2*param1),1] <- 4
50
-    class[pdata[,1]<0-(2*param1),1] <- 3
51
-    class[pdata[,2]>0+param2,2] <- 2
52
-    class[pdata[,2]<0-param2,2] <- 1
53
-    class[pdata[,2]>0+(2*param2),2] <- 4
54
-    class[pdata[,2]<0-(2*param2),2] <- 3
55
-    concordClass <- c(1,7,13,19,25)
56
-    discordClass <- setdiff(1:25,concordClass)
57
-  }
20
+    .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
21
+                           components)
22
+    
23
+    if (transform) {
24
+        v1 <- fishersTrans(v1)
25
+        v2 <- fishersTrans(v2)
26
+    }
58 27
   
59
-  if(subsampling == TRUE) {
60
-    subSize = nrow(x)
61
-    if(is.null(y) == FALSE & nrow(y) < subSize) {
62
-      subSize = nrow(y)
28
+    x <- exprs(x)
29
+    if (is.null(y) == FALSE) { y <- exprs(y) }
30
+    featureSize = dim(x)[1]
31
+    
32
+    pdata <- cbind(v1, v2)
33
+    param1 <- sd(v1)
34
+    param2 <- sd(v2)
35
+    class <- cbind(.assignClass(v1, param1, components),
36
+                   .assignClass(v2, param2, components))
37
+    
38
+    if (components == 3) {
39
+        discordClass <- c(2,3,4,6,7,8)
40
+    } else {
41
+        discordClass <- setdiff(1:25, c(1, 7, 13, 19, 25))
63 42
     }
64
-    total_mu <- rep(0,3)
65
-    total_sigma <- rep(0,3)
66
-    total_nu <- rep(0,3)
67
-    total_tau <- rep(0,3)
68
-    total_pi <- rep(0,3)
69
-    for(i in 1:iter) {
70
-      # make sure pairs are independent
71
-      rowIndex <- sample(nrow(x), subSize)
72
-      colIndex <- sample(nrow(y), subSize)
73
-      mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
74
-      mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
43
+    
44
+    if (subsampling) {
45
+        subSize <- min(nrow(x), nrow(y))
46
+        total_mu <- total_sigma <- total_nu <- 
47
+          total_tau <- total_pi <- rep(0, components) 
48
+        # NOTE: changed this from default 3 to components... need to figure out 
49
+        #   if that's right
50
+        
51
+        for(i in 1:iter) {
52
+            # make sure pairs are independent
53
+            rowIndex <- sample(nrow(x), subSize)
54
+            colIndex <- sample(nrow(y), subSize)
55
+            mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
56
+            mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
57
+            
58
+            subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
59
+                                                            colIndex[x]])
60
+            subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
61
+                                                            colIndex[x]])
62
+            
63
+            sub.pdata <- cbind(subSampV1, subSampV2)
64
+            sub.class <- cbind(.assignClass(subSampV1, param1, components),
65
+                               .assignClass(subSampV2, param2, components))
66
+            
67
+            pd <- em.normal.partial.concordant(sub.pdata, sub.class, components)
68
+            total_mu <- total_mu + pd$mu_sigma[1,]
69
+            total_sigma <- total_sigma + pd$mu_sigma[2,]
70
+            total_nu <- total_nu + pd$nu_tau[1,]
71
+            total_tau <- total_tau + pd$nu_tau[2,]
72
+            total_pi <- total_pi + pd$pi
73
+      }
75 74
       
76
-      subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
77
-                                                      colIndex[x]])
78
-      subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
79
-                                                      colIndex[x]])
75
+      mu <- total_mu / iter
76
+      sigma <- total_sigma / iter
77
+      nu <- total_nu / iter
78
+      tau <- total_tau / iter
79
+      pi <- total_pi / iter
80 80
       
81
-      sub.pdata <- cbind(subSampV1, subSampV2)
82
-      sub.class <- cbind(rep(0, subSize), rep(0, subSize))
83
-      if(components == 3) {
84
-        sub.class[sub.pdata[,1]>0+param1,1] <- 2
85
-        sub.class[sub.pdata[,1]<0-param1,1] <- 1
86
-        sub.class[sub.pdata[,2]>0+param2,2] <- 2
87
-        sub.class[sub.pdata[,2]<0-param2,2] <- 1
88
-      }
89
-      if(components == 5) {
90
-        sub.class[sub.pdata[,1]>0+param1,1] <- 2
91
-        sub.class[sub.pdata[,1]<0-param1,1] <- 1
92
-        sub.class[sub.pdata[,1]>0+(2*param1),1] <- 4
93
-        sub.class[sub.pdata[,1]<0-(2*param1),1] <- 3
94
-        sub.class[pdata[,2]>0+param2,2] <- 2
95
-        sub.class[pdata[,2]<0-param2,2] <- 1
96
-        sub.class[pdata[,2]>0+(2*param2),2] <- 4
97
-        sub.class[pdata[,2]<0-(2*param2),2] <- 3
98
-      }
99
-      pd <- em.normal.partial.concordant(sub.pdata, sub.class, tol = 0.001, 
100
-                                         restriction = 0, 
101
-                                         constrain = c(0,-sd(pdata),sd(pdata)),
102
-                                         iteration = 1000, 
103
-                                         components = components)
104
-      total_mu <- total_mu + pd$mu_sigma[1,]
105
-      total_sigma <- total_sigma + pd$mu_sigma[2,]
106
-      total_nu <- total_nu + pd$nu_tau[1,]
107
-      total_tau <- total_tau + pd$nu_tau[2,]
108
-      total_pi <- total_pi + pd$pi
81
+      finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
82
+                                   components)
83
+      zTable <- finalResult$z
84
+      classVector <- finalResult$class
85
+    } else {
86
+        pd <- em.normal.partial.concordant(pdata, class, components)
87
+        zTable <- pd$z
88
+        classVector <- pd$class
109 89
     }
110 90
     
111
-    mu <- total_mu / iter
112
-    sigma <- total_sigma / iter
113
-    nu <- total_nu / iter
114
-    tau <- total_tau / iter
115
-    pi <- total_pi / iter
91
+    discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass])/sum(x))
116 92
     
117
-    finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
118
-                                 components)
119
-    zTable <- finalResult$z
120
-    classVector <- finalResult$class
121
-  } else {
122
-    pd <- em.normal.partial.concordant(pdata, class, tol = 0.001, 
123
-                                       restriction = 0, 
124
-                                       constrain = c(0, -sd(pdata), sd(pdata)), 
125
-                                       iteration = 1000, 
126
-                                       components = components)
127
-    zTable <- pd$z
128
-    classVector <- pd$class
129
-  }
130
-  
131
-  discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass])/sum(x))
132
-  
133
-  if(is.null(y) == FALSE) {
134
-    discordPPMatrix <- matrix(discordPPV, nrow = featureSize, 
135
-                              byrow = FALSE)
136
-    classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
137
-    rownames(discordPPMatrix) <- rownames(x)
138
-    colnames(discordPPMatrix) <- rownames(y)
139
-    rownames(classMatrix) <- rownames(x)
140
-    colnames(classMatrix) <- rownames(y)
93
+    if(is.null(y)) {
94
+        discordPPMatrix <- matrix(NA,nrow = featureSize, ncol = featureSize)
95
+        classMatrix <- discordPPMatrix
96
+        diag <- lower.tri(discordPPMatrix, diag = FALSE)
97
+        discordPPMatrix[diag] <- discordPPV
98
+        classMatrix[diag] <- classVector
99
+        rownames(discordPPMatrix) <- rownames(x)
100
+        colnames(discordPPMatrix) <- rownames(x)
101
+        rownames(classMatrix) <- rownames(x)
102
+        colnames(classMatrix) <- rownames(x)
103
+        vector_names <- getNames(x)
104
+        names(discordPPV) <- vector_names
105
+        names(classVector) <- vector_names
106
+    } else {
107
+        discordPPMatrix <- matrix(discordPPV, nrow = featureSize, 
108
+                                  byrow = FALSE)
109
+        classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
110
+        rownames(discordPPMatrix) <- rownames(x)
111
+        colnames(discordPPMatrix) <- rownames(y)
112
+        rownames(classMatrix) <- rownames(x)
113
+        colnames(classMatrix) <- rownames(y)
114
+        
115
+        vector_names <- getNames(x,y)
116
+        names(discordPPV) <- vector_names
117
+        names(classVector) <- vector_names
118
+    }
141 119
     
142
-    vector_names <- getNames(x,y)
143
-    names(discordPPV) <- vector_names
144
-    names(classVector) <- vector_names
145
-  }
146
-  
147
-  if(is.null(y)) {
148
-    discordPPMatrix <- matrix(NA,nrow = featureSize, ncol = featureSize)
149
-    classMatrix <- discordPPMatrix
150
-    diag <- lower.tri(discordPPMatrix, diag = FALSE)
151
-    discordPPMatrix[diag] <- discordPPV
152
-    classMatrix[diag] <- classVector
153
-    rownames(discordPPMatrix) <- rownames(x)
154
-    colnames(discordPPMatrix) <- rownames(x)
155
-    rownames(classMatrix) <- rownames(x)
156
-    colnames(classMatrix) <- rownames(x)
157
-    vector_names <- getNames(x)
158
-    names(discordPPV) <- vector_names
159
-    names(classVector) <- vector_names
160
-  }
161
-  
162
-  zTable <- t(apply(zTable, 1, function(x) x/sum(x)))
163
-  rownames(zTable) <- vector_names
164
-  
165
-  return(list(discordPPMatrix = discordPPMatrix, discordPPVector = 
166
-                discordPPV, classMatrix = classMatrix, classVector = classVector, 
167
-              probMatrix = zTable, loglik = pd$loglik))
120
+    zTable <- t(apply(zTable, 1, function(x) x/sum(x)))
121
+    rownames(zTable) <- vector_names
122
+    
123
+    return(list(discordPPMatrix = discordPPMatrix, discordPPVector = discordPPV,
124
+                classMatrix = classMatrix, classVector = classVector, 
125
+                probMatrix = zTable, loglik = pd$loglik))
168 126
 }
169 127
 
170
-em.normal.partial.concordant <- function(data, class, tol=0.001, 
171
-    restriction=0, constrain=0, iteration=1000, components) {
128
+em.normal.partial.concordant <- function(data, class, components) {
129
+    tol <- 0.001
130
+    restriction <- 0
131
+    constrain <- 0
132
+    iteration <- 1000
172 133
     n <- as.integer(dim(data)[1])
173 134
     g <- as.integer(nlevels(as.factor(class)))
174 135
 
... ...
@@ -209,6 +170,21 @@ em.normal.partial.concordant <- function(data, class, tol=0.001,
209 170
                 z = array(results[[3]], dim = c(n, g*g))))
210 171
 }
211 172
 
173
+
174
+# Internal function to assign class to vector based on number of components and 
175
+#   each elements distance from zero
176
+#' @importFrom dplyr case_when
177
+.assignClass <- function(x, param, components) {
178
+  if (components == 3) {
179
+    rtn <- case_when(x < -param ~ 1, x > param ~ 2, TRUE ~ 0)
180
+  } else {
181
+    rtn <- case_when(x < -2*param ~ 3, x > 2*param ~ 4, x < -param ~ 1,
182
+                     x > param ~ 2, TRUE ~ 0)
183
+  }
184
+  return(rtn)
185
+}
186
+
187
+# Internal function to validate user inputs for discordantRun()
212 188
 .checkDiscordantInputs <- function(v1, v2, x, y, transform, 
213 189
                                    subsampling, subSize, iter, 
214 190
                                    components) {
... ...
@@ -217,6 +193,11 @@ em.normal.partial.concordant <- function(data, class, tol=0.001,
217 193
     stop("x and y (if present) must be type ExpressionSet")
218 194
   }
219 195
   
196
+  # Need to double check if this is true w/ Katerina
197
+  if (is.null(y) && subsampling) {
198
+    stop("y cannot be NULL if subsampling is TRUE")
199
+  }
200
+  
220 201
   if (!(components %in% c(3, 5))) {
221 202
     stop ("components must be equal to 3 or 5")
222 203
   }