Browse code

Refactor subsampling, remove subSize argument

Parts of subsampling algorithm broken out into
seperate functions. subSize argument was removed
as it doesn't actually do anything (may add back
in later)

Max McGrath authored on 22/09/2021 22:39:44
Showing1 changed files

... ...
@@ -96,7 +96,7 @@
96 96
 #'                        TCGA_GBM_miRNA_microarray)
97 97
 #' @export
98 98
 discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, 
99
-                          subsampling = FALSE, subSize = dim(x)[1], iter = 100, 
99
+                          subsampling = FALSE, iter = 100, 
100 100
                           components = 3) {
101 101
   
102 102
     .checkDiscordantInputs(v1, v2, x, y, transform, subsampling, subSize, iter, 
... ...
@@ -124,42 +124,15 @@ discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE,
124 124
     }
125 125
     
126 126
     if (subsampling) {
127
-        subSize <- min(nrow(x), nrow(y))
127
+        subSize = .setSubSize(x, y)
128 128
         total_mu <- total_sigma <- total_nu <- 
129 129
           total_tau <- total_pi <- rep(0, components)
130 130
         
131 131
         for(i in 1:iter) {
132
-            # make sure pairs are independent
133
-            rowIndex <- sample(nrow(x), subSize)
134
-            
135
-            if(is.null(y)) {
136
-              # colIndex <- sample(nrow(x), subSize)
137
-              sampleNames <- rownames(x)
138
-              
139
-              if (floor(length(sampleNames) / 2) < subSize) {
140
-                warning("Provided subSize too high. Using number of features divided by 2.")
141
-                subSize <- floor(length(sampleNames) / 2)
142
-              }
143
-              
144
-              nameSet1 <- sample(sampleNames, subSize)
145
-              nameSet2 <- setdiff(sampleNames, nameSet1)
146
-              nameSetA <- paste0(nameSet1, "_", nameSet2)
147
-              nameSetB <- paste0(nameSet2, "_", nameSet1)
148
-              subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
149
-              subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
150
-            } else {
151
-              colIndex <- sample(nrow(y), subSize)
152
-              mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
153
-              mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
154
-              subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
155
-                                                              colIndex[x]])
156
-              subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
157
-                                                              colIndex[x]])
158
-            }
159
-            
160
-            sub.pdata <- cbind(subSampV1, subSampV2)
161
-            sub.class <- cbind(.assignClass(subSampV1, param1, components),
162
-                               .assignClass(subSampV2, param2, components))
132
+            subSamples <- .createSubsamples(x, y, v1, v2, subSize)
133
+            sub.pdata <- cbind(subSamples$v1, subSamples$v2)
134
+            sub.class <- cbind(.assignClass(subSamples$v1, param1, components),
135
+                               .assignClass(subSamples$v2, param2, components))
163 136
             
164 137
             pd <- em.normal.partial.concordant(sub.pdata, sub.class, components)
165 138
             total_mu <- total_mu + pd$mu_sigma[1,]
... ...
@@ -282,9 +255,29 @@ em.normal.partial.concordant <- function(data, class, components) {
282 255
   return(rtn)
283 256
 }
284 257
 
285
-# .createSubsamples <- function(x, y = NULL, v1, v2) {
286
-#   
287
-# }
258
+.createSubsamples <- function(x, y = NULL, v1, v2, subSize) {
259
+  if(is.null(y)) {
260
+    sampleNames <- rownames(x)
261
+    nameSet1 <- sample(sampleNames, subSize)
262
+    nameSet2 <- sample(setdiff(sampleNames, nameSet1))
263
+    nameSetA <- paste0(nameSet1, "_", nameSet2)
264
+    nameSetB <- paste0(nameSet2, "_", nameSet1)
265
+    subSampV1 <- ifelse(is.na(v1[nameSetA]), v1[nameSetB], v1[nameSetA])
266
+    subSampV2 <- ifelse(is.na(v2[nameSetA]), v2[nameSetB], v2[nameSetA])
267
+  } else {
268
+    # make sure pairs are independent
269
+    rowIndex <- sample(nrow(x), subSize)
270
+    colIndex <- sample(nrow(y), subSize)
271
+    mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
272
+    mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
273
+    subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], 
274
+                                                    colIndex[x]])
275
+    subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], 
276
+                                                    colIndex[x]])
277
+  }
278
+  return(list(v1 = subSampV1,
279
+              v2 = subSampV2))
280
+}
288 281
 
289 282
 # Internal function to validate user inputs for discordantRun()
290 283
 #' @importFrom methods is
... ...
@@ -302,12 +295,12 @@ em.normal.partial.concordant <- function(data, class, components) {
302 295
   # }
303 296
   
304 297
   if (!(components %in% c(3, 5))) {
305
-    stop ("components must be equal to 3 or 5")
298
+    stop ("Components must be equal to 3 or 5")
306 299
   }
307 300
   
308 301
   if (transform && (range(v1)[1] < -1 || range(v1)[2] > 1 || 
309 302
                     range(v2)[1] < -1 || range(v2)[2] > 1)) {
310
-    stop ("correlation vectors have values less than -1 and/or greater than 1.")
303
+    stop ("Correlation vectors have values less than -1 and/or greater than 1.")
311 304
   }
312 305
 }
313 306
 
... ...
@@ -320,6 +313,16 @@ em.normal.partial.concordant <- function(data, class, components) {
320 313
   if(any(c(sumZx, sumZy) == 0)) {
321 314
     stop("Insufficient data for component estimation. Increase number of 
322 315
          features or reduce number of components used. If subsampling=TRUE,
323
-         consider increasing subSize as well.")
316
+         you may need set subsampling=FALSE.")
317
+  }
318
+}
319
+
320
+# Internal function to set sub size based on data
321
+.setSubSize <- function(x, y) {
322
+  if (is.null(y)) {
323
+    sampleNames <- rownames(x)
324
+    subSize <- floor(length(sampleNames) / 2)
325
+  } else {
326
+    subSize <- min(nrow(x), nrow(y))
324 327
   }
325 328
 }
326 329
\ No newline at end of file