Browse code

Data in vignette, NEWS, other small changes

Change breast data to voom transformed version,
remove use of paste() in stop() and warning(),
update news to reflect latest changes and move
it to main directory

Max McGrath authored on 15/10/2021 19:11:10
Showing1 changed files
... ...
@@ -145,10 +145,10 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
145 145
         }
146 146
         
147 147
         if (repeats >= floor(iter * .1)) {
148
-          stop(paste0("\nInsufficient data for subsampling. Increase number of",
148
+          stop("\nInsufficient data for subsampling. Increase number of",
149 149
                "\nfeatures, reduce numberof components used, or increase", 
150 150
                "\nsubSize if not at default value. Alternatively, set",
151
-               "\nsubsampling=FALSE."))
151
+               "\nsubsampling=FALSE.")
152 152
         }
153 153
       
154 154
       mu <- total_mu / iter
... ...
@@ -165,9 +165,8 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
165 165
         pd <- tryCatch({em.normal.partial.concordant(pdata, class, components)},
166 166
                        error = function(unused) return(NULL))
167 167
         if (is.null(pd)) {
168
-          stop(
169
-            paste0("\nInsufficient data for component estimation. Increase",
170
-                   "\nnumber of features or reduce number of components used."))
168
+          stop("\nInsufficient data for component estimation. Increase",
169
+               "\nnumber of features or reduce number of components used.")
171 170
         }
172 171
         zTable <- pd$z
173 172
         classVector <- pd$class
... ...
@@ -343,16 +342,16 @@ em.normal.partial.concordant <- function(data, class, components) {
343 342
       subSize <- floor(length(rownames(x)) / 2)
344 343
     } else if (subSize > floor(length(rownames(x)) / 2)) {
345 344
       subSize <- floor(length(rownames(x)) / 2)
346
-      warning(paste0("subSize argument too large. Using subSize ", subSize,
347
-                     " See vignette for more information."))
345
+      warning("subSize argument too large. Using subSize ", subSize,
346
+              " See vignette for more information.")
348 347
     }
349 348
   } else {
350 349
     if (is.null(subSize)) {
351 350
       subSize <- min(nrow(x), nrow(y))
352 351
     } else if (subSize > min(nrow(x), nrow(y))) {
353 352
       subSize <- min(nrow(x), nrow(y))
354
-      warning(paste0("subSize argument to large. Using subSize ", subSize,
355
-                     " See vignette for more information."))
353
+      warning("subSize argument to large. Using subSize ", subSize,
354
+              " See vignette for more information.")
356 355
     }
357 356
   }
358 357
   return(subSize)
Browse code

Relocate error handling

Errors are now caught by discordantRun() rather
than only being caught by discordant.cpp which
leads to more helpful error messages

Max McGrath authored on 13/10/2021 18:37:08
Showing1 changed files
... ...
@@ -145,10 +145,10 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
145 145
         }
146 146
         
147 147
         if (repeats >= floor(iter * .1)) {
148
-          stop("Insufficient data for subsampling. Increase number of
149
-               features, reduce number of components used, or increase 
150
-               subSize if not at default value. Alternatively, set
151
-               subsampling=FALSE.")
148
+          stop(paste0("\nInsufficient data for subsampling. Increase number of",
149
+               "\nfeatures, reduce numberof components used, or increase", 
150
+               "\nsubSize if not at default value. Alternatively, set",
151
+               "\nsubsampling=FALSE."))
152 152
         }
153 153
       
154 154
       mu <- total_mu / iter
... ...
@@ -162,7 +162,13 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
162 162
       zTable <- finalResult$z
163 163
       classVector <- finalResult$class
164 164
     } else {
165
-        pd <- em.normal.partial.concordant(pdata, class, components)
165
+        pd <- tryCatch({em.normal.partial.concordant(pdata, class, components)},
166
+                       error = function(unused) return(NULL))
167
+        if (is.null(pd)) {
168
+          stop(
169
+            paste0("\nInsufficient data for component estimation. Increase",
170
+                   "\nnumber of features or reduce number of components used."))
171
+        }
166 172
         zTable <- pd$z
167 173
         classVector <- pd$class
168 174
     }
... ...
@@ -184,7 +190,7 @@ em.normal.partial.concordant <- function(data, class, components) {
184 190
 
185 191
     zx <- .unmap(class[,1], components = components)
186 192
     zy <- .unmap(class[,2], components = components)
187
-    .checkForMissingComponents(zx, zy)
193
+    # .checkForMissingComponents(zx, zy)
188 194
     zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
189 195
 
190 196
     pi <- double(g*g)
... ...
@@ -320,15 +326,15 @@ em.normal.partial.concordant <- function(data, class, components) {
320 326
 # Internal function that checks whether all types of component are present
321 327
 #   in given vectors. If a certain component is not present, we run into a
322 328
 #   divide-by-zero error that crashes R
323
-.checkForMissingComponents <- function(zx, zy) {
324
-  sumZx <- colSums(zx)
325
-  sumZy <- colSums(zy)
326
-  if(any(c(sumZx, sumZy) == 0)) {
327
-    stop("Insufficient data for component estimation. Increase number of 
328
-         features or reduce number of components used. If subsampling=TRUE,
329
-         you may need set subsampling=FALSE.")
330
-  }
331
-}
329
+# .checkForMissingComponents <- function(zx, zy) {
330
+#   sumZx <- colSums(zx)
331
+#   sumZy <- colSums(zy)
332
+#   if(any(c(sumZx, sumZy) == 0)) {
333
+#     stop("Insufficient data for component estimation. Increase number of 
334
+#          features or reduce number of components used. If subsampling=TRUE,
335
+#          you may need set subsampling=FALSE.")
336
+#   }
337
+# }
332 338
 
333 339
 # Internal function to set sub size based on data
334 340
 .setSubSize <- function(x, y, subSize) {
Browse code

Update subsampling error handling, add subSize arg

If EM alg throws an error, subsampling will
now repeat that iteration up to ten percent of iter
times (after which it throws an error). Re-added
subSize argument as well (with minimum values
set)

Max McGrath authored on 29/09/2021 23:07:31
Showing1 changed files
... ...
@@ -22,6 +22,8 @@
22 22
 #' @param y ExpressionSet of -omics data, induces dual -omics analysis
23 23
 #' @param transform If TRUE v1 and v2 will be Fisher transformed
24 24
 #' @param subsampling If TRUE subsampling will be run
25
+#' @param subSize Indicates how many feature pairs to be used for subsampling. 
26
+#' Default is the feature size in x
25 27
 #' @param iter Number of iterations for subsampling. Default is 100
26 28
 #' @param components Number of components in mixture model.
27 29
 #' 
... ...
@@ -94,7 +96,7 @@
94 96
 #'                        TCGA_GBM_miRNA_microarray)
95 97
 #' @export
96 98
 discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
97
-                          subsampling = FALSE, iter = 100, 
99
+                          subsampling = FALSE, subSize = NULL, iter = 100, 
98 100
                           components = 3) {
99 101
   
100 102
     .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
... ...
@@ -113,23 +115,41 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
113 115
                    .assignClass(v2, param2, components))
114 116
     
115 117
     if (subsampling) {
116
-        subSize <- .setSubSize(x, y)
118
+        subSize <- .setSubSize(x, y, subSize)
117 119
         total_mu <- total_sigma <- total_nu <- 
118 120
           total_tau <- total_pi <- rep(0, components)
119 121
         
120
-        for(i in 1:iter) {
122
+        i <- 0
123
+        repeats <- 0
124
+        
125
+        while(i < iter && repeats < floor(iter * .1)) {
121 126
             subSamples <- .createSubsamples(x, y, v1, v2, subSize)
122 127
             sub.pdata <- cbind(subSamples$v1, subSamples$v2)
123 128
             sub.class <- cbind(.assignClass(subSamples$v1, param1, components),
124 129
                                .assignClass(subSamples$v2, param2, components))
125 130
             
126
-            pd <- em.normal.partial.concordant(sub.pdata, sub.class, components)
127
-            total_mu <- total_mu + pd$mu_sigma[1,]
128
-            total_sigma <- total_sigma + pd$mu_sigma[2,]
129
-            total_nu <- total_nu + pd$nu_tau[1,]
130
-            total_tau <- total_tau + pd$nu_tau[2,]
131
-            total_pi <- total_pi + pd$pi
132
-      }
131
+            pd <- tryCatch({em.normal.partial.concordant(sub.pdata, sub.class, 
132
+                                                   components)},
133
+                error = function(unused) return(NULL))
134
+            
135
+            if(is.null(pd)) { 
136
+              repeats <- repeats + 1 
137
+            } else {
138
+              i <- i + 1
139
+              total_mu <- total_mu + pd$mu_sigma[1,]
140
+              total_sigma <- total_sigma + pd$mu_sigma[2,]
141
+              total_nu <- total_nu + pd$nu_tau[1,]
142
+              total_tau <- total_tau + pd$nu_tau[2,]
143
+              total_pi <- total_pi + pd$pi
144
+            }
145
+        }
146
+        
147
+        if (repeats >= floor(iter * .1)) {
148
+          stop("Insufficient data for subsampling. Increase number of
149
+               features, reduce number of components used, or increase 
150
+               subSize if not at default value. Alternatively, set
151
+               subsampling=FALSE.")
152
+        }
133 153
       
134 154
       mu <- total_mu / iter
135 155
       sigma <- total_sigma / iter
... ...
@@ -311,11 +331,23 @@ em.normal.partial.concordant <- function(data, class, components) {
311 331
 }
312 332
 
313 333
 # Internal function to set sub size based on data
314
-.setSubSize <- function(x, y) {
334
+.setSubSize <- function(x, y, subSize) {
315 335
   if (is.null(y)) {
316
-    sampleNames <- rownames(x)
317
-    subSize <- floor(length(sampleNames) / 2)
336
+    if (is.null(subSize)) {
337
+      subSize <- floor(length(rownames(x)) / 2)
338
+    } else if (subSize > floor(length(rownames(x)) / 2)) {
339
+      subSize <- floor(length(rownames(x)) / 2)
340
+      warning(paste0("subSize argument too large. Using subSize ", subSize,
341
+                     " See vignette for more information."))
342
+    }
318 343
   } else {
319
-    subSize <- min(nrow(x), nrow(y))
344
+    if (is.null(subSize)) {
345
+      subSize <- min(nrow(x), nrow(y))
346
+    } else if (subSize > min(nrow(x), nrow(y))) {
347
+      subSize <- min(nrow(x), nrow(y))
348
+      warning(paste0("subSize argument to large. Using subSize ", subSize,
349
+                     " See vignette for more information."))
350
+    }
320 351
   }
352
+  return(subSize)
321 353
 }
322 354
\ No newline at end of file
Browse code

Remove subSize from documentation

Max McGrath authored on 29/09/2021 21:31:51
Showing1 changed files
... ...
@@ -22,8 +22,6 @@
22 22
 #' @param y ExpressionSet of -omics data, induces dual -omics analysis
23 23
 #' @param transform If TRUE v1 and v2 will be Fisher transformed
24 24
 #' @param subsampling If TRUE subsampling will be run
25
-#' @param subSize Indicates how many feature pairs to be used for subsampling. 
26
-#' Default is the feature size in x
27 25
 #' @param iter Number of iterations for subsampling. Default is 100
28 26
 #' @param components Number of components in mixture model.
29 27
 #' 
Browse code

Refactor discordantRun output chunk

Break output chunks into seperate function, reduce
repeated code in output chunk.

Max McGrath authored on 22/09/2021 23:34:22
Showing1 changed files
... ...
@@ -106,25 +106,16 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
106 106
         v1 <- fishersTrans(v1)
107 107
         v2 <- fishersTrans(v2)
108 108
     }
109
-  
110 109
     x <- exprs(x)
111 110
     if (!is.null(y)) { y <- exprs(y) }
112
-    featureSize = dim(x)[1]
113
-    
114 111
     pdata <- cbind(v1, v2)
115 112
     param1 <- sd(v1)
116 113
     param2 <- sd(v2)
117 114
     class <- cbind(.assignClass(v1, param1, components),
118 115
                    .assignClass(v2, param2, components))
119 116
     
120
-    if (components == 3) {
121
-        discordClass <- c(2,3,4,6,7,8)
122
-    } else {
123
-        discordClass <- setdiff(1:25, c(1, 7, 13, 19, 25))
124
-    }
125
-    
126 117
     if (subsampling) {
127
-        subSize = .setSubSize(x, y)
118
+        subSize <- .setSubSize(x, y)
128 119
         total_mu <- total_sigma <- total_nu <- 
129 120
           total_tau <- total_pi <- rep(0, components)
130 121
         
... ...
@@ -158,41 +149,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
158 149
         classVector <- pd$class
159 150
     }
160 151
     
161
-    discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass])/sum(x))
162
-    
163
-    if(is.null(y)) {
164
-        discordPPMatrix <- matrix(NA,nrow = featureSize, ncol = featureSize)
165
-        classMatrix <- discordPPMatrix
166
-        diag <- lower.tri(discordPPMatrix, diag = FALSE)
167
-        discordPPMatrix[diag] <- discordPPV
168
-        classMatrix[diag] <- classVector
169
-        rownames(discordPPMatrix) <- rownames(x)
170
-        colnames(discordPPMatrix) <- rownames(x)
171
-        rownames(classMatrix) <- rownames(x)
172
-        colnames(classMatrix) <- rownames(x)
173
-        vector_names <- .getNames(x)
174
-        names(discordPPV) <- vector_names
175
-        names(classVector) <- vector_names
176
-    } else {
177
-        discordPPMatrix <- matrix(discordPPV, nrow = featureSize, 
178
-                                  byrow = FALSE)
179
-        classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
180
-        rownames(discordPPMatrix) <- rownames(x)
181
-        colnames(discordPPMatrix) <- rownames(y)
182
-        rownames(classMatrix) <- rownames(x)
183
-        colnames(classMatrix) <- rownames(y)
184
-        
185
-        vector_names <- .getNames(x,y)
186
-        names(discordPPV) <- vector_names
187
-        names(classVector) <- vector_names
188
-    }
189
-    
190
-    zTable <- t(apply(zTable, 1, function(x) x/sum(x)))
191
-    rownames(zTable) <- vector_names
192
-    
193
-    return(list(discordPPMatrix = discordPPMatrix, discordPPVector = discordPPV,
194
-                classMatrix = classMatrix, classVector = classVector, 
195
-                probMatrix = zTable, loglik = pd$loglik))
152
+    rtn <- .prepareOutput(x, y, pd, zTable, classVector, components)
196 153
 }
197 154
 
198 155
 em.normal.partial.concordant <- function(data, class, components) {
... ...
@@ -241,6 +198,47 @@ em.normal.partial.concordant <- function(data, class, components) {
241 198
                 z = array(results[[3]], dim = c(n, g*g))))
242 199
 }
243 200
 
201
+.prepareOutput <- function(x, y, pd, zTable, classVector, components) {
202
+  
203
+    if (components == 3) {
204
+        discordClass <- c(2,3,4,6,7,8)
205
+    } else {
206
+        discordClass <- setdiff(1:25, c(1, 7, 13, 19, 25))
207
+    }
208
+    featureSize = dim(x)[1]
209
+    discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass]) / sum(x))
210
+    
211
+    if(is.null(y)) {
212
+        discordPPMatrix <- matrix(NA, nrow = featureSize, ncol = featureSize)
213
+        classMatrix <- discordPPMatrix
214
+        diag <- lower.tri(discordPPMatrix, diag = FALSE)
215
+        discordPPMatrix[diag] <- discordPPV
216
+        classMatrix[diag] <- classVector
217
+        colnames(discordPPMatrix) <- rownames(x)
218
+        colnames(classMatrix) <- rownames(x)
219
+        vector_names <- .getNames(x)
220
+    } else {
221
+        discordPPMatrix <- matrix(discordPPV, nrow = featureSize, 
222
+                                  byrow = FALSE)
223
+        classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
224
+        colnames(discordPPMatrix) <- rownames(y)
225
+        colnames(classMatrix) <- rownames(y)
226
+        vector_names <- .getNames(x,y)
227
+    }
228
+
229
+    rownames(discordPPMatrix) <- rownames(x)
230
+    rownames(classMatrix) <- rownames(x)
231
+    names(discordPPV) <- vector_names
232
+    names(classVector) <- vector_names
233
+    
234
+    zTable <- t(apply(zTable, 1, function(x) x / sum(x)))
235
+    rownames(zTable) <- vector_names
236
+    
237
+    return(list(discordPPMatrix = discordPPMatrix, discordPPVector = discordPPV,
238
+                classMatrix = classMatrix, classVector = classVector, 
239
+                probMatrix = zTable, loglik = pd$loglik))
240
+}
241
+
244 242
 
245 243
 # Internal function to assign class to vector based on number of components and 
246 244
 #   each elements distance from zero
... ...
@@ -255,28 +253,30 @@ em.normal.partial.concordant <- function(data, class, components) {
255 253
   return(rtn)
256 254
 }
257 255
 
256
+# Internal function to independently subsample data according to whether the
257
+#   analysis is within -omics or between -omics. 
258 258
 .createSubsamples <- function(x, y = NULL, v1, v2, subSize) {
259
-  if(is.null(y)) {
260
-    sampleNames <- rownames(x)
261
-    nameSet1 <- sample(sampleNames, subSize)
262
-    nameSet2 <- sample(setdiff(sampleNames, nameSet1))
263
-    nameSetA <- paste0(nameSet1, "_", nameSet2)
264
-    nameSetB <- paste0(nameSet2, "_", nameSet1)
265
-    subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
266
-    subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
267
-  } else {
268
-    # make sure pairs are independent
269
-    rowIndex <- sample(nrow(x), subSize)
270
-    colIndex <- sample(nrow(y), subSize)
271
-    mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
272
-    mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
273
-    subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
274
-                                                    colIndex[x]])
275
-    subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
276
-                                                    colIndex[x]])
277
-  }
278
-  return(list(v1 = subSampV1,
279
-              v2 = subSampV2))
259
+    if(is.null(y)) {
260
+        sampleNames <- rownames(x)
261
+        nameSet1 <- sample(sampleNames, subSize)
262
+        nameSet2 <- sample(setdiff(sampleNames, nameSet1), subSize)
263
+        nameSetA <- paste0(nameSet1, "_", nameSet2)
264
+        nameSetB <- paste0(nameSet2, "_", nameSet1)
265
+        subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
266
+        subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
267
+    } else {
268
+        # make sure pairs are independent
269
+        rowIndex <- sample(nrow(x), subSize)
270
+        colIndex <- sample(nrow(y), subSize)
271
+        mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
272
+        mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
273
+        subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
274
+                                                        colIndex[x]])
275
+        subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
276
+                                                        colIndex[x]])
277
+    }
278
+    return(list(v1 = subSampV1,
279
+                v2 = subSampV2))
280 280
 }
281 281
 
282 282
 # Internal function to validate user inputs for discordantRun()
... ...
@@ -289,11 +289,6 @@ em.normal.partial.concordant <- function(data, class, components) {
289 289
     stop("x and y (if present) must be type ExpressionSet")
290 290
   }
291 291
   
292
-  # Need to double check if this is true w/ Katerina
293
-  # if (is.null(y) && subsampling) {
294
-  #   stop("y cannot be NULL if subsampling is TRUE")
295
-  # }
296
-  
297 292
   if (!(components %in% c(3, 5))) {
298 293
     stop ("Components must be equal to 3 or 5")
299 294
   }
Browse code

Refactor subsampling, remove subSize argument

Parts of subsampling algorithm broken out into
seperate functions. subSize argument was removed
as it doesn't actually do anything (may add back
in later)

Max McGrath authored on 22/09/2021 22:39:44
Showing1 changed files
... ...
@@ -96,7 +96,7 @@
96 96
 #'                        TCGA_GBM_miRNA_microarray)
97 97
 #' @export
98 98
 discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
99
-                          subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
99
+                          subsampling = FALSE, iter = 100, 
100 100
                           components = 3) {
101 101
   
102 102
     .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
... ...
@@ -124,42 +124,15 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
124 124
     }
125 125
     
126 126
     if (subsampling) {
127
-        subSize <- min(nrow(x), nrow(y))
127
+        subSize = .setSubSize(x, y)
128 128
         total_mu <- total_sigma <- total_nu <- 
129 129
           total_tau <- total_pi <- rep(0, components)
130 130
         
131 131
         for(i in 1:iter) {
132
-            # make sure pairs are independent
133
-            rowIndex <- sample(nrow(x), subSize)
134
-            
135
-            if(is.null(y)) {
136
-              # colIndex <- sample(nrow(x), subSize)
137
-              sampleNames <- rownames(x)
138
-              
139
-              if (floor(length(sampleNames) / 2) < subSize) {
140
-                warning("Provided subSize too high. Using number of features divided by 2.")
141
-                subSize <- floor(length(sampleNames) / 2)
142
-              }
143
-              
144
-              nameSet1 <- sample(sampleNames, subSize)
145
-              nameSet2 <- setdiff(sampleNames, nameSet1)
146
-              nameSetA <- paste0(nameSet1, "_", nameSet2)
147
-              nameSetB <- paste0(nameSet2, "_", nameSet1)
148
-              subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
149
-              subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
150
-            } else {
151
-              colIndex <- sample(nrow(y), subSize)
152
-              mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
153
-              mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
154
-              subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
155
-                                                              colIndex[x]])
156
-              subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
157
-                                                              colIndex[x]])
158
-            }
159
-            
160
-            sub.pdata <- cbind(subSampV1, subSampV2)
161
-            sub.class <- cbind(.assignClass(subSampV1, param1, components),
162
-                               .assignClass(subSampV2, param2, components))
132
+            subSamples <- .createSubsamples(x, y, v1, v2, subSize)
133
+            sub.pdata <- cbind(subSamples$v1, subSamples$v2)
134
+            sub.class <- cbind(.assignClass(subSamples$v1, param1, components),
135
+                               .assignClass(subSamples$v2, param2, components))
163 136
             
164 137
             pd <- em.normal.partial.concordant(sub.pdata, sub.class, components)
165 138
             total_mu <- total_mu + pd$mu_sigma[1,]
... ...
@@ -282,9 +255,29 @@ em.normal.partial.concordant <- function(data, class, components) {
282 255
   return(rtn)
283 256
 }
284 257
 
285
-# .createSubsamples <- function(x, y = NULL, v1, v2) {
286
-#   
287
-# }
258
+.createSubsamples <- function(x, y = NULL, v1, v2, subSize) {
259
+  if(is.null(y)) {
260
+    sampleNames <- rownames(x)
261
+    nameSet1 <- sample(sampleNames, subSize)
262
+    nameSet2 <- sample(setdiff(sampleNames, nameSet1))
263
+    nameSetA <- paste0(nameSet1, "_", nameSet2)
264
+    nameSetB <- paste0(nameSet2, "_", nameSet1)
265
+    subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
266
+    subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
267
+  } else {
268
+    # make sure pairs are independent
269
+    rowIndex <- sample(nrow(x), subSize)
270
+    colIndex <- sample(nrow(y), subSize)
271
+    mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
272
+    mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
273
+    subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
274
+                                                    colIndex[x]])
275
+    subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
276
+                                                    colIndex[x]])
277
+  }
278
+  return(list(v1 = subSampV1,
279
+              v2 = subSampV2))
280
+}
288 281
 
289 282
 # Internal function to validate user inputs for discordantRun()
290 283
 #' @importFrom methods is
... ...
@@ -302,12 +295,12 @@ em.normal.partial.concordant <- function(data, class, components) {
302 295
   # }
303 296
   
304 297
   if (!(components %in% c(3, 5))) {
305
-    stop ("components must be equal to 3 or 5")
298
+    stop ("Components must be equal to 3 or 5")
306 299
   }
307 300
   
308 301
   if (transform && (range(v1)[1] < -1 || range(v1)[2] > 1 || 
309 302
                     range(v2)[1] < -1 || range(v2)[2] > 1)) {
310
-    stop ("correlation vectors have values less than -1 and/or greater than 1.")
303
+    stop ("Correlation vectors have values less than -1 and/or greater than 1.")
311 304
   }
312 305
 }
313 306
 
... ...
@@ -320,6 +313,16 @@ em.normal.partial.concordant <- function(data, class, components) {
320 313
   if(any(c(sumZx, sumZy) == 0)) {
321 314
     stop("Insufficient data for component estimation. Increase number of 
322 315
          features or reduce number of components used. If subsampling=TRUE,
323
-         consider increasing subSize as well.")
316
+         you may need set subsampling=FALSE.")
317
+  }
318
+}
319
+
320
+# Internal function to set sub size based on data
321
+.setSubSize <- function(x, y) {
322
+  if (is.null(y)) {
323
+    sampleNames <- rownames(x)
324
+    subSize <- floor(length(sampleNames) / 2)
325
+  } else {
326
+    subSize <- min(nrow(x), nrow(y))
324 327
   }
325 328
 }
326 329
\ No newline at end of file
Browse code

Enable subsampling for within -omics

Adds code to independently subsample a single
dataset for within -omics analysis

Max McGrath authored on 22/09/2021 21:52:00
Showing1 changed files
... ...
@@ -131,14 +131,31 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
131 131
         for(i in 1:iter) {
132 132
             # make sure pairs are independent
133 133
             rowIndex <- sample(nrow(x), subSize)
134
-            colIndex <- sample(nrow(y), subSize)
135
-            mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
136
-            mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
137 134
             
138
-            subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
139
-                                                            colIndex[x]])
140
-            subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
141
-                                                            colIndex[x]])
135
+            if(is.null(y)) {
136
+              # colIndex <- sample(nrow(x), subSize)
137
+              sampleNames <- rownames(x)
138
+              
139
+              if (floor(length(sampleNames) / 2) < subSize) {
140
+                warning("Provided subSize too high. Using number of features divided by 2.")
141
+                subSize <- floor(length(sampleNames) / 2)
142
+              }
143
+              
144
+              nameSet1 <- sample(sampleNames, subSize)
145
+              nameSet2 <- setdiff(sampleNames, nameSet1)
146
+              nameSetA <- paste0(nameSet1, "_", nameSet2)
147
+              nameSetB <- paste0(nameSet2, "_", nameSet1)
148
+              subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
149
+              subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
150
+            } else {
151
+              colIndex <- sample(nrow(y), subSize)
152
+              mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
153
+              mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
154
+              subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
155
+                                                              colIndex[x]])
156
+              subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
157
+                                                              colIndex[x]])
158
+            }
142 159
             
143 160
             sub.pdata <- cbind(subSampV1, subSampV2)
144 161
             sub.class <- cbind(.assignClass(subSampV1, param1, components),
... ...
@@ -265,6 +282,10 @@ em.normal.partial.concordant <- function(data, class, components) {
265 282
   return(rtn)
266 283
 }
267 284
 
285
+# .createSubsamples <- function(x, y = NULL, v1, v2) {
286
+#   
287
+# }
288
+
268 289
 # Internal function to validate user inputs for discordantRun()
269 290
 #' @importFrom methods is
270 291
 .checkDiscordantInputs <- function(v1, v2, x, y, transform, 
... ...
@@ -290,6 +311,9 @@ em.normal.partial.concordant <- function(data, class, components) {
290 311
   }
291 312
 }
292 313
 
314
+# Internal function that checks whether all types of component are present
315
+#   in given vectors. If a certain component is not present, we run into a
316
+#   divide-by-zero error that crashes R
293 317
 .checkForMissingComponents <- function(zx, zy) {
294 318
   sumZx <- colSums(zx)
295 319
   sumZy <- colSums(zy)
Browse code

Handle case with insufficient data

Updates error handling for case where
insufficient data is provided in the 5 component
(or 3 component) case. Updates vignette
accordingly. Also makes some changes to
other internal functions like greatly simplifying
.getNames()

Max McGrath authored on 01/09/2021 17:03:57
Showing1 changed files
... ...
@@ -126,9 +126,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
126 126
     if (subsampling) {
127 127
         subSize <- min(nrow(x), nrow(y))
128 128
         total_mu <- total_sigma <- total_nu <- 
129
-          total_tau <- total_pi <- rep(0, components) 
130
-        # NOTE: changed this from default 3 to components... need to figure out 
131
-        #   if that's right
129
+          total_tau <- total_pi <- rep(0, components)
132 130
         
133 131
         for(i in 1:iter) {
134 132
             # make sure pairs are independent
... ...
@@ -221,6 +219,7 @@ em.normal.partial.concordant <- function(data, class, components) {
221 219
 
222 220
     zx <- .unmap(class[,1], components = components)
223 221
     zy <- .unmap(class[,2], components = components)
222
+    .checkForMissingComponents(zx, zy)
224 223
     zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
225 224
 
226 225
     pi <- double(g*g)
... ...
@@ -277,9 +276,9 @@ em.normal.partial.concordant <- function(data, class, components) {
277 276
   }
278 277
   
279 278
   # Need to double check if this is true w/ Katerina
280
-  if (is.null(y) && subsampling) {
281
-    stop("y cannot be NULL if subsampling is TRUE")
282
-  }
279
+  # if (is.null(y) && subsampling) {
280
+  #   stop("y cannot be NULL if subsampling is TRUE")
281
+  # }
283 282
   
284 283
   if (!(components %in% c(3, 5))) {
285 284
     stop ("components must be equal to 3 or 5")
... ...
@@ -289,4 +288,14 @@ em.normal.partial.concordant <- function(data, class, components) {
289 288
                     range(v2)[1] < -1 || range(v2)[2] > 1)) {
290 289
     stop ("correlation vectors have values less than -1 and/or greater than 1.")
291 290
   }
291
+}
292
+
293
+.checkForMissingComponents <- function(zx, zy) {
294
+  sumZx <- colSums(zx)
295
+  sumZy <- colSums(zy)
296
+  if(any(c(sumZx, sumZy) == 0)) {
297
+    stop("Insufficient data for component estimation. Increase number of 
298
+         features or reduce number of components used. If subsampling=TRUE,
299
+         consider increasing subSize as well.")
300
+  }
292 301
 }
293 302
\ No newline at end of file
Browse code

Refactor and add function to NAMESPACE

Refactors createVectors a bit further, makes
all non-exported functions start with ".",
adds is() from methods package to NAMESPACE and
DESCRIPTION

Max McGrath authored on 12/08/2021 17:17:54
Showing1 changed files
... ...
@@ -160,7 +160,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
160 160
       tau <- total_tau / iter
161 161
       pi <- total_pi / iter
162 162
       
163
-      finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
163
+      finalResult <- .subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
164 164
                                    components)
165 165
       zTable <- finalResult$z
166 166
       classVector <- finalResult$class
... ...
@@ -182,7 +182,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
182 182
         colnames(discordPPMatrix) <- rownames(x)
183 183
         rownames(classMatrix) <- rownames(x)
184 184
         colnames(classMatrix) <- rownames(x)
185
-        vector_names <- getNames(x)
185
+        vector_names <- .getNames(x)
186 186
         names(discordPPV) <- vector_names
187 187
         names(classVector) <- vector_names
188 188
     } else {
... ...
@@ -194,7 +194,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
194 194
         rownames(classMatrix) <- rownames(x)
195 195
         colnames(classMatrix) <- rownames(y)
196 196
         
197
-        vector_names <- getNames(x,y)
197
+        vector_names <- .getNames(x,y)
198 198
         names(discordPPV) <- vector_names
199 199
         names(classVector) <- vector_names
200 200
     }
... ...
@@ -219,8 +219,8 @@ em.normal.partial.concordant <- function(data, class, components) {
219 219
         return( c(zx[k,] %o% zy[k,]) )
220 220
     }
221 221
 
222
-    zx <- unmap(class[,1], components = components)
223
-    zy <- unmap(class[,2], components = components)
222
+    zx <- .unmap(class[,1], components = components)
223
+    zy <- .unmap(class[,2], components = components)
224 224
     zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
225 225
 
226 226
     pi <- double(g*g)
... ...
@@ -267,6 +267,7 @@ em.normal.partial.concordant <- function(data, class, components) {
267 267
 }
268 268
 
269 269
 # Internal function to validate user inputs for discordantRun()
270
+#' @importFrom methods is
270 271
 .checkDiscordantInputs <- function(v1, v2, x, y, transform, 
271 272
                                    subsampling, subSize, iter, 
272 273
                                    components) {
Browse code

Update documentation to Roxygen2

Shifts .Rd files in /man from being manually
created to being generated via Roxygen2. Data
is now documented in R/data.R and functions
are documented in the same files they are defined
in. Will make package more comparable to other
KechrisLab packages and easier to maintain

Max McGrath authored on 04/08/2021 22:21:24
Showing1 changed files
... ...
@@ -12,6 +12,88 @@
12 12
 # R function for SparCC, adapted from https://bitbucket.org/yonatanf/sparcc
13 13
 # Email: hyfang@pku.edu.cn
14 14
 
15
+#' Run Discordant Algorithm
16
+#' 
17
+#' Runs discordant algorithm on two vectors of correlation coefficients.
18
+#' 
19
+#' @param v1 Vector of correlation coefficients in group 1
20
+#' @param v2 Vector of correlation coefficients in group 2
21
+#' @param x ExpressionSet of -omics data
22
+#' @param y ExpressionSet of -omics data, induces dual -omics analysis
23
+#' @param transform If TRUE v1 and v2 will be Fisher transformed
24
+#' @param subsampling If TRUE subsampling will be run
25
+#' @param subSize Indicates how many feature pairs to be used for subsampling. 
26
+#' Default is the feature size in x
27
+#' @param iter Number of iterations for subsampling. Default is 100
28
+#' @param components Number of components in mixture model.
29
+#' 
30
+#' @return
31
+#' \describe{
32
+#'   \item{discordPPVector}{Vector of differentially correlated posterior 
33
+#'   probabilities.}
34
+#'   \item{discordPPMatrix}{Matrix of differentially correlated posterior 
35
+#'   probabilities where rows and columns reflect features}
36
+#'   \item{classVector}{Vector of classes that have the highest posterior 
37
+#'   probability}
38
+#'   \item{classMatrix}{Matrix of classes that have hte highest posterior 
39
+#'   probability where rows and columns reflect features}
40
+#'   \item{probMatrix}{Matrix of posterior probabilities where rows are each 
41
+#'   molecular feature pair and columns are nine different classes}
42
+#'   \item{loglik}{Final log likelihood}
43
+#' }
44
+#' @details 
45
+#' The discordant algorithm is based on a Gaussian mixture model. If there are 
46
+#' three components, correlation coefficients are clustered into negative 
47
+#' correlations (-), positive correlations (+) and no correlation (0). If there 
48
+#' are five components, then there are two more classes for very negative 
49
+#' correlation (--) and very positive correlations (++). All possible 
50
+#' combinations for these components are made into classes. If there are three 
51
+#' components, there are 9 classes. If there are five components, there are 25 
52
+#' classes.
53
+#' 
54
+#' The posterior probabilities for each class are generated and outputted into 
55
+#' the value probMatrix. The value probMatrix is a matrix where each column is a
56
+#'  class and each row is a feature pair. The values discordPPVector and 
57
+#'  discordPPMatrix are the summed differential correlation posterior 
58
+#'  probability for each feature pair. The values classVector and classMatrix 
59
+#'  are the class with the highest posterior probability for each feature pair.
60
+#' @references 
61
+#' Siska C, Bowler R and Kechris K. The Discordant Method: A Novel Approach for 
62
+#' Differential Correlation (2015), Bioinformatics. 32 (5): 690-696.
63
+#' 
64
+#' Lai Y, Zhang F, Nayak TK, Modarres R, Lee NH and McCaffrey TA. Concordant 
65
+#' integrative gene set enrichment analysis of multiple large-scale two-sample 
66
+#' expression data sets. (2014) BMC Genomics 15, S6.
67
+#' 
68
+#' Lai Y, Adam B-l, Podolsky R, She J-X. A mixture model approach to the tests 
69
+#' of concordance and discordancd between two large-scale experiments with two 
70
+#' sample groups. (2007) Bioinformatics 23, 1243-1250.
71
+#' 
72
+#' @author Charlotte Siska \email{siska.charlotte@@gmail.com}
73
+#' @author Max McGrath \email{max.mcgrath@@ucdenver.edu}
74
+#' 
75
+#' @examples
76
+#' # Load Data
77
+#' data(TCGA_GBM_miRNA_microarray)
78
+#' data(TCGA_GBM_transcript_microarray)
79
+#' print(colnames(TCGA_GBM_transcript_microarray)) # look at groups
80
+#' groups <- c(rep(1,10), rep(2,20))
81
+#' 
82
+#' ## DC analysis on only transcripts pairs
83
+#' 
84
+#' vectors <- createVectors(TCGA_GBM_transcript_microarray, 
85
+#'                          groups = groups)
86
+#' result <- discordantRun(vectors$v1, vectors$v2, 
87
+#'                         TCGA_GBM_transcript_microarray)
88
+#' 
89
+#' ## DC analysis on miRNA-transcript pairs
90
+#' 
91
+#' vectors <- createVectors(TCGA_GBM_transcript_microarray, 
92
+#'                          TCGA_GBM_miRNA_microarray, groups = groups, 
93
+#'                          cor.method = c("pearson"))
94
+#' result <- discordantRun(vectors$v1, vectors$v2, 
95
+#'                         TCGA_GBM_transcript_microarray, 
96
+#'                        TCGA_GBM_miRNA_microarray)
15 97
 #' @export
16 98
 discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
17 99
                           subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
Browse code

Adds README, other small changes

Max McGrath authored on 23/07/2021 18:16:50
Showing1 changed files
... ...
@@ -26,7 +26,7 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
26 26
     }
27 27
   
28 28
     x <- exprs(x)
29
-    if (is.null(y) == FALSE) { y <- exprs(y) }
29
+    if (!is.null(y)) { y <- exprs(y) }
30 30
     featureSize = dim(x)[1]
31 31
     
32 32
     pdata <- cbind(v1, v2)
Browse code

Refactors discordantRun()

Adds .assignClass() to reduce repeated code.
Reorders if statements for clarity. Removes
unnecessary EM function args.

Max McGrath authored on 23/07/2021 17:21:57
Showing1 changed files
... ...
@@ -1,6 +1,6 @@
1 1
 # Applied from Lai et al 2007 Bioinformatics.
2 2
 #
3
-# Code written by Charlotte Siska
3
+# Code written by Charlotte Siska and Max McGrath
4 4
 # R functions fishersTrans, createVectors, discordantRun and makeTable
5 5
 # Email: charlotte.siska@ucdenver.edu
6 6
 #
... ...
@@ -12,163 +12,124 @@
12 12
 # R function for SparCC, adapted from https://bitbucket.org/yonatanf/sparcc
13 13
 # Email: hyfang@pku.edu.cn
14 14
 
15
+#' @export
15 16
 discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
16 17
                           subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
17 18
                           components = 3) {
18 19
   
19
-  .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
20
-                         components)
21
-  
22
-  if (transform) {
23
-    v1 <- fishersTrans(v1)
24
-    v2 <- fishersTrans(v2)
25
-  }
26
-  
27
-  x <- exprs(x)
28
-  if(is.null(y) == FALSE) { y <- exprs(y) }
29
-  featureSize = dim(x)[1]
30
-  
31
-  pdata <- cbind(v1, v2)
32
-  
33
-  param1 <- sd(v1)
34
-  param2 <- sd(v2)
35
-  
36
-  if(components == 3) {
37
-    class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
38
-    class[pdata[,1]>0+param1,1] <- 2
39
-    class[pdata[,1]<0-param1,1] <- 1
40
-    class[pdata[,2]>0+param2,2] <- 2
41
-    class[pdata[,2]<0-param2,2] <- 1
42
-    discordClass <- c(2,3,4,6,7,8)
43
-  }
44
-  
45
-  if(components == 5) {
46
-    class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
47
-    class[pdata[,1]>0+param1,1] <- 2
48
-    class[pdata[,1]<0-param1,1] <- 1
49
-    class[pdata[,1]>0+(2*param1),1] <- 4
50
-    class[pdata[,1]<0-(2*param1),1] <- 3
51
-    class[pdata[,2]>0+param2,2] <- 2
52
-    class[pdata[,2]<0-param2,2] <- 1
53
-    class[pdata[,2]>0+(2*param2),2] <- 4
54
-    class[pdata[,2]<0-(2*param2),2] <- 3
55
-    concordClass <- c(1,7,13,19,25)
56
-    discordClass <- setdiff(1:25,concordClass)
57
-  }
20
+    .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
21
+                           components)
22
+    
23
+    if (transform) {
24
+        v1 <- fishersTrans(v1)
25
+        v2 <- fishersTrans(v2)
26
+    }
58 27
   
59
-  if(subsampling == TRUE) {
60
-    subSize = nrow(x)
61
-    if(is.null(y) == FALSE & nrow(y) < subSize) {
62
-      subSize = nrow(y)
28
+    x <- exprs(x)
29
+    if (is.null(y) == FALSE) { y <- exprs(y) }
30
+    featureSize = dim(x)[1]
31
+    
32
+    pdata <- cbind(v1, v2)
33
+    param1 <- sd(v1)
34
+    param2 <- sd(v2)
35
+    class <- cbind(.assignClass(v1, param1, components),
36
+                   .assignClass(v2, param2, components))
37
+    
38
+    if (components == 3) {
39
+        discordClass <- c(2,3,4,6,7,8)
40
+    } else {
41
+        discordClass <- setdiff(1:25, c(1, 7, 13, 19, 25))
63 42
     }
64
-    total_mu <- rep(0,3)
65
-    total_sigma <- rep(0,3)
66
-    total_nu <- rep(0,3)
67
-    total_tau <- rep(0,3)
68
-    total_pi <- rep(0,3)
69
-    for(i in 1:iter) {
70
-      # make sure pairs are independent
71
-      rowIndex <- sample(nrow(x), subSize)
72
-      colIndex <- sample(nrow(y), subSize)
73
-      mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
74
-      mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
43
+    
44
+    if (subsampling) {
45
+        subSize <- min(nrow(x), nrow(y))
46
+        total_mu <- total_sigma <- total_nu <- 
47
+          total_tau <- total_pi <- rep(0, components) 
48
+        # NOTE: changed this from default 3 to components... need to figure out 
49
+        #   if that's right
50
+        
51
+        for(i in 1:iter) {
52
+            # make sure pairs are independent
53
+            rowIndex <- sample(nrow(x), subSize)
54
+            colIndex <- sample(nrow(y), subSize)
55
+            mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
56
+            mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
57
+            
58
+            subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
59
+                                                            colIndex[x]])
60
+            subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
61
+                                                            colIndex[x]])
62
+            
63
+            sub.pdata <- cbind(subSampV1, subSampV2)
64
+            sub.class <- cbind(.assignClass(subSampV1, param1, components),
65
+                               .assignClass(subSampV2, param2, components))
66
+            
67
+            pd <- em.normal.partial.concordant(sub.pdata, sub.class, components)
68
+            total_mu <- total_mu + pd$mu_sigma[1,]
69
+            total_sigma <- total_sigma + pd$mu_sigma[2,]
70
+            total_nu <- total_nu + pd$nu_tau[1,]
71
+            total_tau <- total_tau + pd$nu_tau[2,]
72
+            total_pi <- total_pi + pd$pi
73
+      }
75 74
       
76
-      subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
77
-                                                      colIndex[x]])
78
-      subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
79
-                                                      colIndex[x]])
75
+      mu <- total_mu / iter
76
+      sigma <- total_sigma / iter
77
+      nu <- total_nu / iter
78
+      tau <- total_tau / iter
79
+      pi <- total_pi / iter
80 80
       
81
-      sub.pdata <- cbind(subSampV1, subSampV2)
82
-      sub.class <- cbind(rep(0, subSize), rep(0, subSize))
83
-      if(components == 3) {
84
-        sub.class[sub.pdata[,1]>0+param1,1] <- 2
85
-        sub.class[sub.pdata[,1]<0-param1,1] <- 1
86
-        sub.class[sub.pdata[,2]>0+param2,2] <- 2
87
-        sub.class[sub.pdata[,2]<0-param2,2] <- 1
88
-      }
89
-      if(components == 5) {
90
-        sub.class[sub.pdata[,1]>0+param1,1] <- 2
91
-        sub.class[sub.pdata[,1]<0-param1,1] <- 1
92
-        sub.class[sub.pdata[,1]>0+(2*param1),1] <- 4
93
-        sub.class[sub.pdata[,1]<0-(2*param1),1] <- 3
94
-        sub.class[pdata[,2]>0+param2,2] <- 2
95
-        sub.class[pdata[,2]<0-param2,2] <- 1
96
-        sub.class[pdata[,2]>0+(2*param2),2] <- 4
97
-        sub.class[pdata[,2]<0-(2*param2),2] <- 3
98
-      }
99
-      pd <- em.normal.partial.concordant(sub.pdata, sub.class, tol = 0.001, 
100
-                                         restriction = 0, 
101
-                                         constrain = c(0,-sd(pdata),sd(pdata)),
102
-                                         iteration = 1000, 
103
-                                         components = components)
104
-      total_mu <- total_mu + pd$mu_sigma[1,]
105
-      total_sigma <- total_sigma + pd$mu_sigma[2,]
106
-      total_nu <- total_nu + pd$nu_tau[1,]
107
-      total_tau <- total_tau + pd$nu_tau[2,]
108
-      total_pi <- total_pi + pd$pi
81
+      finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
82
+                                   components)
83
+      zTable <- finalResult$z
84
+      classVector <- finalResult$class
85
+    } else {
86
+        pd <- em.normal.partial.concordant(pdata, class, components)
87
+        zTable <- pd$z
88
+        classVector <- pd$class
109 89
     }
110 90
     
111
-    mu <- total_mu / iter
112
-    sigma <- total_sigma / iter
113
-    nu <- total_nu / iter
114
-    tau <- total_tau / iter
115
-    pi <- total_pi / iter
91
+    discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass])/sum(x))
116 92
     
117
-    finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
118
-                                 components)
119
-    zTable <- finalResult$z
120
-    classVector <- finalResult$class
121
-  } else {
122
-    pd <- em.normal.partial.concordant(pdata, class, tol = 0.001, 
123
-                                       restriction = 0, 
124
-                                       constrain = c(0, -sd(pdata), sd(pdata)), 
125
-                                       iteration = 1000, 
126
-                                       components = components)
127
-    zTable <- pd$z
128
-    classVector <- pd$class
129
-  }
130
-  
131
-  discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass])/sum(x))
132
-  
133
-  if(is.null(y) == FALSE) {
134
-    discordPPMatrix <- matrix(discordPPV, nrow = featureSize, 
135
-                              byrow = FALSE)
136
-    classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
137
-    rownames(discordPPMatrix) <- rownames(x)
138
-    colnames(discordPPMatrix) <- rownames(y)
139
-    rownames(classMatrix) <- rownames(x)
140
-    colnames(classMatrix) <- rownames(y)
93
+    if(is.null(y)) {
94
+        discordPPMatrix <- matrix(NA,nrow = featureSize, ncol = featureSize)
95
+        classMatrix <- discordPPMatrix
96
+        diag <- lower.tri(discordPPMatrix, diag = FALSE)
97
+        discordPPMatrix[diag] <- discordPPV
98
+        classMatrix[diag] <- classVector
99
+        rownames(discordPPMatrix) <- rownames(x)
100
+        colnames(discordPPMatrix) <- rownames(x)
101
+        rownames(classMatrix) <- rownames(x)
102
+        colnames(classMatrix) <- rownames(x)
103
+        vector_names <- getNames(x)
104
+        names(discordPPV) <- vector_names
105
+        names(classVector) <- vector_names
106
+    } else {
107
+        discordPPMatrix <- matrix(discordPPV, nrow = featureSize, 
108
+                                  byrow = FALSE)
109
+        classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
110
+        rownames(discordPPMatrix) <- rownames(x)
111
+        colnames(discordPPMatrix) <- rownames(y)
112
+        rownames(classMatrix) <- rownames(x)
113
+        colnames(classMatrix) <- rownames(y)
114
+        
115
+        vector_names <- getNames(x,y)
116
+        names(discordPPV) <- vector_names
117
+        names(classVector) <- vector_names
118
+    }
141 119
     
142
-    vector_names <- getNames(x,y)
143
-    names(discordPPV) <- vector_names
144
-    names(classVector) <- vector_names
145
-  }
146
-  
147
-  if(is.null(y)) {
148
-    discordPPMatrix <- matrix(NA,nrow = featureSize, ncol = featureSize)
149
-    classMatrix <- discordPPMatrix
150
-    diag <- lower.tri(discordPPMatrix, diag = FALSE)
151
-    discordPPMatrix[diag] <- discordPPV
152
-    classMatrix[diag] <- classVector
153
-    rownames(discordPPMatrix) <- rownames(x)
154
-    colnames(discordPPMatrix) <- rownames(x)
155
-    rownames(classMatrix) <- rownames(x)
156
-    colnames(classMatrix) <- rownames(x)
157
-    vector_names <- getNames(x)
158
-    names(discordPPV) <- vector_names
159
-    names(classVector) <- vector_names
160
-  }
161
-  
162
-  zTable <- t(apply(zTable, 1, function(x) x/sum(x)))
163
-  rownames(zTable) <- vector_names
164
-  
165
-  return(list(discordPPMatrix = discordPPMatrix, discordPPVector = 
166
-                discordPPV, classMatrix = classMatrix, classVector = classVector, 
167
-              probMatrix = zTable, loglik = pd$loglik))
120
+    zTable <- t(apply(zTable, 1, function(x) x/sum(x)))
121
+    rownames(zTable) <- vector_names
122
+    
123
+    return(list(discordPPMatrix = discordPPMatrix, discordPPVector = discordPPV,
124
+                classMatrix = classMatrix, classVector = classVector, 
125
+                probMatrix = zTable, loglik = pd$loglik))
168 126
 }
169 127
 
170
-em.normal.partial.concordant <- function(data, class, tol=0.001, 
171
-    restriction=0, constrain=0, iteration=1000, components) {
128
+em.normal.partial.concordant <- function(data, class, components) {
129
+    tol <- 0.001
130
+    restriction <- 0
131
+    constrain <- 0
132
+    iteration <- 1000
172 133
     n <- as.integer(dim(data)[1])
173 134
     g <- as.integer(nlevels(as.factor(class)))
174 135
 
... ...
@@ -209,6 +170,21 @@ em.normal.partial.concordant <- function(data, class, tol=0.001,
209 170
                 z = array(results[[3]], dim = c(n, g*g))))
210 171
 }
211 172
 
173
+
174
+# Internal function to assign class to vector based on number of components and 
175
+#   each elements distance from zero
176
+#' @importFrom dplyr case_when
177
+.assignClass <- function(x, param, components) {
178
+  if (components == 3) {
179
+    rtn <- case_when(x < -param ~ 1, x > param ~ 2, TRUE ~ 0)
180
+  } else {
181
+    rtn <- case_when(x < -2*param ~ 3, x > 2*param ~ 4, x < -param ~ 1,
182
+                     x > param ~ 2, TRUE ~ 0)
183
+  }
184
+  return(rtn)
185
+}
186
+
187
+# Internal function to validate user inputs for discordantRun()
212 188
 .checkDiscordantInputs <- function(v1, v2, x, y, transform, 
213 189
                                    subsampling, subSize, iter, 
214 190
                                    components) {
... ...
@@ -217,6 +193,11 @@ em.normal.partial.concordant <- function(data, class, tol=0.001,
217 193
     stop("x and y (if present) must be type ExpressionSet")
218 194
   }
219 195
   
196
+  # Need to double check if this is true w/ Katerina
197
+  if (is.null(y) && subsampling) {
198
+    stop("y cannot be NULL if subsampling is TRUE")
199
+  }
200
+  
220 201
   if (!(components %in% c(3, 5))) {
221 202
     stop ("components must be equal to 3 or 5")
222 203
   }
Browse code

Improves param validation, fixes SparCC bug

Removes checkInputs in favor of seperate functions
for checking inputs for discordantRun and
createVectors. Also adds previous SparCC fix
to case where y is not null

Max McGrath authored on 22/07/2021 18:40:36
Showing1 changed files
... ...
@@ -16,25 +16,14 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
16 16
                           subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
17 17
                           components = 3) {
18 18
   
19
-  if(checkInputs(x,y)) {
20
-    stop("Please fix inputs.")
21
-  }
19
+  .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
20
+                         components)
22 21
   
23
-  if(transform == TRUE) {
24
-    if(range(v1)[1] < -1 || range(v1)[2] > 1 || range(v2)[1] < -1 || 
25
-       range(v2)[2] > 1) {
26
-      stop("correlation vectors have values less than -1 and/or greater 
27
-                than 1.")
28
-    }
22
+  if (transform) {
29 23
     v1 <- fishersTrans(v1)
30 24
     v2 <- fishersTrans(v2)
31 25
   }
32 26
   
33
-  check <- match(components, c(3,5))
34
-  if(is.na(check)) {
35
-    stop("components must be 3 or 5")
36
-  }
37
-  
38 27
   x <- exprs(x)
39 28
   if(is.null(y) == FALSE) { y <- exprs(y) }
40 29
   featureSize = dim(x)[1]
... ...
@@ -125,7 +114,8 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
125 114
     tau <- total_tau / iter
126 115
     pi <- total_pi / iter
127 116
     
128
-    finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, components)
117
+    finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi, 
118
+                                 components)
129 119
     zTable <- finalResult$z
130 120
     classVector <- finalResult$class
131 121
   } else {
... ...
@@ -177,30 +167,14 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
177 167
               probMatrix = zTable, loglik = pd$loglik))
178 168
 }
179 169
 
180
-#modified from package mclust
181
-unmap <- function(classification, components){
182
-    n <- length(classification)
183
-    # u <- sort(unique(classification)) # OG Code
184
-    u <- 0:(components - 1) # Max's potential fix
185
-    labs <- as.character(u)
186
-    k <- length(u)
187
-    z <- matrix(0, n, k)
188
-    for (j in 1:k) z[classification == u[j], j] <- 1
189
-    dimnames(z) <- list(NULL, labs)
190
-    return(z)
191
-}
192
-
193 170
 em.normal.partial.concordant <- function(data, class, tol=0.001, 
194
-    restriction=0, constrain=0, iteration=1000, components){
171
+    restriction=0, constrain=0, iteration=1000, components) {
195 172
     n <- as.integer(dim(data)[1])
196 173
     g <- as.integer(nlevels(as.factor(class)))
197 174
 
198 175
     yl.outer <- function(k, zx, zy){
199 176
         return( c(zx[k,] %o% zy[k,]) )
200 177
     }
201
-    yl.diag <- function(k, z){
202
-        return( c(diag(z[k,])) )
203
-    }
204 178
 
205 179
     zx <- unmap(class[,1], components = components)
206 180
     zy <- unmap(class[,2], components = components)
... ...
@@ -224,9 +198,31 @@ em.normal.partial.concordant <- function(data, class, tol=0.001,
224 198
                                                 as.integer(iteration), 
225 199
                                                 convergence)
226 200
     
227
-    return(list(model="PCD", convergence=results[[16]], 
228
-        pi=t(array(results[[5]], dim=c(g,g))), mu_sigma=rbind(results[[6]], 
229
-        results[[7]]), nu_tau=rbind(results[[8]], results[[9]]), 
230
-        loglik=results[[11]], class=apply(array(results[[3]], dim=c(n,g*g)),1,
231
-        order,decreasing=TRUE)[1,], z=array(results[[3]], dim=c(n,g*g))))
201
+    return(list(model = "PCD", 
202
+                convergence = results[[16]],
203
+                pi = t(array(results[[5]], dim=c(g,g))), 
204
+                mu_sigma = rbind(results[[6]], results[[7]]), 
205
+                nu_tau = rbind(results[[8]], results[[9]]),
206
+                loglik = results[[11]], 
207
+                class = apply(array(results[[3]], dim = c(n,g*g)), 
208
+                              1, order, decreasing = TRUE)[1,], 
209
+                z = array(results[[3]], dim = c(n, g*g))))
210
+}
211
+
212
+.checkDiscordantInputs <- function(v1, v2, x, y, transform, 
213
+                                   subsampling, subSize, iter, 
214
+                                   components) {
215
+  
216
+  if (!is(x, "ExpressionSet") || (!is.null(y) && !is(y, "ExpressionSet"))) {
217
+    stop("x and y (if present) must be type ExpressionSet")
218
+  }
219
+  
220
+  if (!(components %in% c(3, 5))) {
221
+    stop ("components must be equal to 3 or 5")
222
+  }
223
+  
224
+  if (transform && (range(v1)[1] < -1 || range(v1)[2] > 1 || 
225
+                    range(v2)[1] < -1 || range(v2)[2] > 1)) {
226
+    stop ("correlation vectors have values less than -1 and/or greater than 1.")
227
+  }
232 228
 }
233 229
\ No newline at end of file
Browse code

Breaks up discordant.R into multiple files

All exported / large functions are given their
own file, with fishersTrans and remaining
non-exported functions placed in utils.R

Max McGrath authored on 22/07/2021 17:38:58
Showing1 changed files
... ...
@@ -1,168 +1,187 @@
1 1
 # Applied from Lai et al 2007 Bioinformatics.
2
-
2
+#
3 3
 # Code written by Charlotte Siska
4
-# R functions fisherTrans, createVectors, discordantRun and makeTable
4
+# R functions fishersTrans, createVectors, discordantRun and makeTable
5 5
 # Email: charlotte.siska@ucdenver.edu
6
-
6
+#
7 7
 # Code written by Yinglei Lai
8 8
 # C code and R functions unmap and em.normal.partial.concordant
9 9
 # E-mail: ylai@gwu.edu
10
-
10
+#
11 11
 # Code written by Fang Huaying
12 12
 # R function for SparCC, adapted from https://bitbucket.org/yonatanf/sparcc
13 13
 # Email: hyfang@pku.edu.cn
14 14
 
15
-################################################################################
16
-# File: SparCC.R
17
-# Aim : SparCC 
18
-#-------------------------------------------------------------------------------
19
-# Author: Fang Huaying (Peking University)
20
-# Email : hyfang@pku.edu.cn
21
-# Date  : 11/12/2014
22
-#-------------------------------------------------------------------------------
23
-# SparCC for counts known
24
-#   function: SparCC.count
25
-#   input:
26
-#          x ------ nxp count data matrix, row is sample, col is variable
27
-#       imax ------ resampling times from posterior distribution. default 20
28
-#       kmax ------ max iteration steps for SparCC. default is 10
29
-#      alpha ------ the threshold for strong correlation. default is 0.1
30
-#       Vmin ------ minimal variance if negative variance appears. default is 1e-4
31
-#   output: a list structure
32
-#      cov.w ------ covariance estimation
33
-#      cor.w ------ correlation estimation
34
-#
35
-
36
-SparCC.count <- function(x, imax = 20, kmax = 10, alpha = 0.1, Vmin = 1e-4) {
37
-  # dimension for w (latent variables)
38
-  p <- ncol(x);
39
-  n <- nrow(x);
40
-  # posterior distribution (alpha)
41
-  x <- x + 1;
42
-  # store generate data
43
-  y <- matrix(0, n, p);
44
-  # store covariance/correlation matrix
45
-  cov.w <- cor.w <- matrix(0, p, p);
46
-  indLow <- lower.tri(cov.w, diag = TRUE);
47
-  # store covariance/correlation for several posterior samples
48
-  covs <- cors <- matrix(0, p * (p + 1) / 2, imax);
49
-  for(i in 1:imax) {
50
-    # generate fractions from posterior distribution
51
-    y <- t(apply(x, 1, function(x) rdirichlet(n = 1, alpha = x)));
52
-    # estimate covariance/correlation
53
-    cov_cor <- SparCC.frac(x = y, kmax = kmax, alpha = alpha, Vmin = Vmin);
54
-    # store variance/correlation only low triangle 
55
-    covs[, i] <- cov_cor$cov.w[indLow];
56
-    cors[, i] <- cov_cor$cor.w[indLow];
15
+discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
16
+                          subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
17
+                          components = 3) {
18
+  
19
+  if(checkInputs(x,y)) {
20
+    stop("Please fix inputs.")
57 21
   }
58
-  # calculate median for several posterior samples
59
-  cov.w[indLow] <- apply(covs, 1, median); 
60
-  cor.w[indLow] <- apply(cors, 1, median);
61
-  #
62
-  cov.w <- cov.w + t(cov.w);
63
-  diag(cov.w) <- diag(cov.w) / 2;
64
-  cor.w <- cor.w + t(cor.w);
65
-  diag(cor.w) <- 1;
66
-  #
67
-  return(list(cov.w = cov.w, cor.w = cor.w));
68
-}
69
-
70
-#-------------------------------------------------------------------------------
71
-# SparCC for fractions known
72
-#   function: SparCC.frac
73
-#   input:
74
-#          x ------ nxp fraction data matrix, row is sample, col is variable
75
-#       kmax ------ max iteration steps for SparCC. default is 10
76
-#      alpha ------ the threshold for strong correlation. default is 0.1
77
-#       Vmin ------ minimal variance if negative variance appears. default is 1e-4
78
-#   output: a list structure
79
-#      cov.w ------ covariance estimation
80
-#      cor.w ------ correlation estimation
81
-SparCC.frac <- function(x, kmax = 10, alpha = 0.1, Vmin = 1e-4) {
82
-  # Log transformation
83
-  x <- log(x);
84
-  p <- ncol(x);
85
-  # T0 = var(log(xi/xj)) variation matrix
86
-  TT <- stats::var(x);
87
-  T0 <- diag(TT) + rep(diag(TT), each = p) - 2 * TT;
88
-  # Variance and correlation coefficients for Basic SparCC  
89
-  rowT0 <- rowSums(T0);
90
-  var.w <- (rowT0 - sum(rowT0) / (2 * p - 2))/(p - 2);
91
-  var.w[var.w < Vmin] <- Vmin;
92
-  #cor.w <- (outer(var.w, var.w, "+") - T0 ) / 
93
-  #  sqrt(outer(var.w, var.w, "*")) / 2;
94
-  Is <- sqrt(1/var.w);
95
-  cor.w <- (var.w + rep(var.w, each = p) - T0) * Is * rep(Is, each = p) * 0.5;
96
-  # Truncated correlation in [-1, 1]
97
-  cor.w[cor.w <= - 1] <- - 1; 
98
-  cor.w[cor.w >= 1] <- 1;
99
-  # Left matrix of estimation equation
100
-  Lmat <- diag(rep(p - 2, p)) + 1; 
101
-  # Remove pairs
102
-  rp <- NULL;
103
-  # Left components
104
-  cp <- rep(TRUE, p);
105
-  # Do loops until max iteration or only 3 components left
106
-  k <- 0;  
107
-  while(k < kmax && sum(cp) > 3) {
108
-    # Left T0 = var(log(xi/xj)) after removing pairs
109
-    T02 <- T0;
110
-    # Store current correlation to find the strongest pair
111
-    curr_cor.w <- cor.w;
112
-    # Remove diagonal
113
-    diag(curr_cor.w) <- 0;
114
-    # Remove removed pairs
115
-    if(!is.null(rp)) {
116
-      curr_cor.w[rp] <- 0;
117
-    }
118
-    # Find the strongest pair in vector form
119
-    n_rp <- which.max(abs(curr_cor.w));
120
-    # Remove the pair if geater than alpha
121
-    if(abs(curr_cor.w[n_rp]) >= alpha) {
122
-      # Which pair in matrix form
123
-      t_id <- c(arrayInd(n_rp, .dim = c(p, p)));
124
-      Lmat[t_id, t_id] <- Lmat[t_id, t_id] - 1;
125
-      # Update remove pairs
126
-      n_rp <- c(n_rp, (p + 1) * sum(t_id) - 2 * p - n_rp);
127
-      rp <- c(rp, n_rp);
128
-      # Update T02
129
-      T02[rp] <- 0;
130
-      # Which component left
131
-      cp <- (diag(Lmat) > 0);
132
-      # Update variance and truncated lower by Vmin
133
-      var.w[cp] <- solve(Lmat[cp, cp], rowSums(T02[cp, cp]));
134
-      var.w[var.w <= Vmin] <- Vmin;
135
-      # Update correlation matrix and truncated by [-1, 1]
136
-      #cor.w <- (outer(var.w, var.w, "+") - T0 ) / 
137
-      #  sqrt(outer(var.w, var.w, "*")) / 2;    
138
-      Is <- sqrt(1/var.w);
139
-      cor.w <- (var.w + rep(var.w, each = p) - T0) * 
140
-        Is * rep(Is, each = p) * 0.5;
141
-      # Truncated correlation in [-1, 1]
142
-      cor.w[cor.w <= - 1] <- - 1;
143
-      cor.w[cor.w >= 1] <- 1;
22
+  
23
+  if(transform == TRUE) {
24
+    if(range(v1)[1] < -1 || range(v1)[2] > 1 || range(v2)[1] < -1 || 
25
+       range(v2)[2] > 1) {
26
+      stop("correlation vectors have values less than -1 and/or greater 
27
+                than 1.")
144 28
     }
145
-    else {
146
-      break;
29
+    v1 <- fishersTrans(v1)
30
+    v2 <- fishersTrans(v2)
31
+  }
32
+  
33
+  check <- match(components, c(3,5))
34
+  if(is.na(check)) {
35
+    stop("components must be 3 or 5")
36
+  }
37
+  
38
+  x <- exprs(x)
39
+  if(is.null(y) == FALSE) { y <- exprs(y) }
40
+  featureSize = dim(x)[1]
41
+  
42
+  pdata <- cbind(v1, v2)
43
+  
44
+  param1 <- sd(v1)
45
+  param2 <- sd(v2)
46
+  
47
+  if(components == 3) {
48
+    class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
49
+    class[pdata[,1]>0+param1,1] <- 2
50
+    class[pdata[,1]<0-param1,1] <- 1
51
+    class[pdata[,2]>0+param2,2] <- 2
52
+    class[pdata[,2]<0-param2,2] <- 1
53
+    discordClass <- c(2,3,4,6,7,8)
54
+  }
55
+