Browse code

trying to submit to bioconductor

Charlotte Siska authored on 23/10/2016 17:57:42
Showing27 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,17 @@
1
+Package: discordant
2
+Version: 0.99.0
3
+Date: 2016-10-21
4
+Title: The Discordant Method: A Novel Approach for Differential
5
+        Correlation
6
+Authors@R: c(person("Charlotte", "Siska", role = c("aut", "cre"), email = "charlotte.siska@ucdenver.edu"), person("Katerina", "Kechris", role = c("aut")))
7
+Author: Charlotte Siska [cre,aut], Katerina Kechris [aut]
8
+Maintainer: Charlotte Siska <charlotte.siska@ucdenver.edu>
9
+Description: Discordant is a method to determine differential correlation of molecular feature pairs from -omics data using mixture models. Algorithm is explained further in Siska et al.
10
+Encoding: latin1
11
+biocViews: BiologicalQuestion, StatisticalMethod, mRNAMicroarray,
12
+        Microarray, Genetics, RNASeq
13
+Imports: stats, biwt, gtools, MASS, tools
14
+License: GPL (>= 2)
15
+URL: https://github.com/siskac/discordant
16
+NeedsCompilation: yes
17
+Packaged: 2016-10-23 17:49:47 UTC; siskac
0 18
new file mode 100644
... ...
@@ -0,0 +1,9 @@
1
+useDynLib(discordant)
2
+
3
+import(MASS)
4
+import(gtools)
5
+import(biwt)
6
+import(tools)
7
+import(stats)
8
+
9
+export(createVectors, discordantRun, fishersTrans, splitMADOutlier)
0 10
new file mode 100644
... ...
@@ -0,0 +1,497 @@
1
+# Applied from Lai et al 2007 Bioinformatics.
2
+
3
+# Code written by Charlotte Siska
4
+# R functions fisherTrans, createVectors, discordantRun and makeTable
5
+# Email: charlotte.siska@ucdenver.edu
6
+
7
+# Code written by Yinglei Lai
8
+# C code and R functions unmap and em.normal.partial.concordant
9
+# E-mail: ylai@gwu.edu
10
+
11
+# Code written by Fang Huaying
12
+# R function for SparCC, adapted from https://bitbucket.org/yonatanf/sparcc
13
+# Email: hyfang@pku.edu.cn
14
+
15
+################################################################################
16
+# File: SparCC.R
17
+# Aim : SparCC 
18
+#-------------------------------------------------------------------------------
19
+# Author: Fang Huaying (Peking University)
20
+# Email : hyfang@pku.edu.cn
21
+# Date  : 11/12/2014
22
+#-------------------------------------------------------------------------------
23
+# SparCC for counts known
24
+#   function: SparCC.count
25
+#   input:
26
+#          x ------ nxp count data matrix, row is sample, col is variable
27
+#       imax ------ resampling times from posterior distribution. default 20
28
+#       kmax ------ max iteration steps for SparCC. default is 10
29
+#      alpha ------ the threshold for strong correlation. default is 0.1
30
+#       Vmin ------ minimal variance if negative variance appears. default is 1e-4
31
+#   output: a list structure
32
+#      cov.w ------ covariance estimation
33
+#      cor.w ------ correlation estimation
34
+#
35
+
36
+SparCC.count <- function(x, imax = 20, kmax = 10, alpha = 0.1, Vmin = 1e-4) {
37
+  # dimension for w (latent variables)
38
+  p <- ncol(x);
39
+  n <- nrow(x);
40
+  # posterior distribution (alpha)
41
+  x <- x + 1;
42
+  # store generate data
43
+  y <- matrix(0, n, p);
44
+  # store covariance/correlation matrix
45
+  cov.w <- cor.w <- matrix(0, p, p);
46
+  indLow <- lower.tri(cov.w, diag = TRUE);
47
+  # store covariance/correlation for several posterior samples
48
+  covs <- cors <- matrix(0, p * (p + 1) / 2, imax);
49
+  for(i in 1:imax) {
50
+    # generate fractions from posterior distribution
51
+    y <- t(apply(x, 1, function(x) rdirichlet(n = 1, alpha = x)));
52
+    # estimate covariance/correlation
53
+    cov_cor <- SparCC.frac(x = y, kmax = kmax, alpha = alpha, Vmin = Vmin);
54
+    # store variance/correlation only low triangle 
55
+    covs[, i] <- cov_cor$cov.w[indLow];
56
+    cors[, i] <- cov_cor$cor.w[indLow];
57
+  }
58
+  # calculate median for several posterior samples
59
+  cov.w[indLow] <- apply(covs, 1, median); 
60
+  cor.w[indLow] <- apply(cors, 1, median);
61
+  #
62
+  cov.w <- cov.w + t(cov.w);
63
+  diag(cov.w) <- diag(cov.w) / 2;
64
+  cor.w <- cor.w + t(cor.w);
65
+  diag(cor.w) <- 1;
66
+  #
67
+  return(list(cov.w = cov.w, cor.w = cor.w));
68
+}
69
+
70
+#-------------------------------------------------------------------------------
71
+# SparCC for fractions known
72
+#   function: SparCC.frac
73
+#   input:
74
+#          x ------ nxp fraction data matrix, row is sample, col is variable
75
+#       kmax ------ max iteration steps for SparCC. default is 10
76
+#      alpha ------ the threshold for strong correlation. default is 0.1
77
+#       Vmin ------ minimal variance if negative variance appears. default is 1e-4
78
+#   output: a list structure
79
+#      cov.w ------ covariance estimation
80
+#      cor.w ------ correlation estimation
81
+SparCC.frac <- function(x, kmax = 10, alpha = 0.1, Vmin = 1e-4) {
82
+  # Log transformation
83
+  x <- log(x);
84
+  p <- ncol(x);
85
+  # T0 = var(log(xi/xj)) variation matrix
86
+  TT <- stats::var(x);
87
+  T0 <- diag(TT) + rep(diag(TT), each = p) - 2 * TT;
88
+  # Variance and correlation coefficients for Basic SparCC  
89
+  rowT0 <- rowSums(T0);
90
+  var.w <- (rowT0 - sum(rowT0) / (2 * p - 2))/(p - 2);
91
+  var.w[var.w < Vmin] <- Vmin;
92
+  #cor.w <- (outer(var.w, var.w, "+") - T0 ) / 
93
+  #  sqrt(outer(var.w, var.w, "*")) / 2;
94
+  Is <- sqrt(1/var.w);
95
+  cor.w <- (var.w + rep(var.w, each = p) - T0) * Is * rep(Is, each = p) * 0.5;
96
+  # Truncated correlation in [-1, 1]
97
+  cor.w[cor.w <= - 1] <- - 1; 
98
+  cor.w[cor.w >= 1] <- 1;
99
+  # Left matrix of estimation equation
100
+  Lmat <- diag(rep(p - 2, p)) + 1; 
101
+  # Remove pairs
102
+  rp <- NULL;
103
+  # Left components
104
+  cp <- rep(TRUE, p);
105
+  # Do loops until max iteration or only 3 components left
106
+  k <- 0;  
107
+  while(k < kmax && sum(cp) > 3) {
108
+    # Left T0 = var(log(xi/xj)) after removing pairs
109
+    T02 <- T0;
110
+    # Store current correlation to find the strongest pair
111
+    curr_cor.w <- cor.w;
112
+    # Remove diagonal
113
+    diag(curr_cor.w) <- 0;
114
+    # Remove removed pairs
115
+    if(!is.null(rp)) {
116
+      curr_cor.w[rp] <- 0;
117
+    }
118
+    # Find the strongest pair in vector form
119
+    n_rp <- which.max(abs(curr_cor.w));
120
+    # Remove the pair if geater than alpha
121
+    if(abs(curr_cor.w[n_rp]) >= alpha) {
122
+      # Which pair in matrix form
123
+      t_id <- c(arrayInd(n_rp, .dim = c(p, p)));
124
+      Lmat[t_id, t_id] <- Lmat[t_id, t_id] - 1;
125
+      # Update remove pairs
126
+      n_rp <- c(n_rp, (p + 1) * sum(t_id) - 2 * p - n_rp);
127
+      rp <- c(rp, n_rp);
128
+      # Update T02
129
+      T02[rp] <- 0;
130
+      # Which component left
131
+      cp <- (diag(Lmat) > 0);
132
+      # Update variance and truncated lower by Vmin
133
+      var.w[cp] <- solve(Lmat[cp, cp], rowSums(T02[cp, cp]));
134
+      var.w[var.w <= Vmin] <- Vmin;
135
+      # Update correlation matrix and truncated by [-1, 1]
136
+      #cor.w <- (outer(var.w, var.w, "+") - T0 ) / 
137
+      #  sqrt(outer(var.w, var.w, "*")) / 2;    
138
+      Is <- sqrt(1/var.w);
139
+      cor.w <- (var.w + rep(var.w, each = p) - T0) * 
140
+        Is * rep(Is, each = p) * 0.5;
141
+      # Truncated correlation in [-1, 1]
142
+      cor.w[cor.w <= - 1] <- - 1;
143
+      cor.w[cor.w >= 1] <- 1;
144
+    }
145
+    else {
146
+      break;
147
+    }
148
+    # 
149
+    k <- k + 1;
150
+  }
151
+  # Covariance
152
+  Is <- sqrt(var.w);
153
+  cov.w <- cor.w * Is * rep(Is, each = p);
154
+  #
155
+  return(list(cov.w = cov.w, cor.w = cor.w));
156
+}
157
+
158
+
159
+
160
+#modified from package mclust
161
+unmap <- function(classification){
162
+    n <- length(classification)
163
+    u <- sort(unique(classification))
164
+    labs <- as.character(u)
165
+    k <- length(u)
166
+    z <- matrix(0, n, k)
167
+    for (j in 1:k) z[classification == u[j], j] <- 1
168
+    dimnames(z) <- list(NULL, labs)
169
+    return(z)
170
+}
171
+
172
+
173
+em.normal.partial.concordant <- function(data, class, tol=0.001, restriction=0, constrain=0, iteration=1000){
174
+    n <- as.integer(dim(data)[1])
175
+    g <- as.integer(nlevels(as.factor(class)))
176
+
177
+    yl.outer <- function(k, zx, zy){
178
+        return( c(zx[k,] %o% zy[k,]) )
179
+    }
180
+    yl.diag <- function(k, z){
181
+        return( c(diag(z[k,])) )
182
+    }
183
+
184
+    zx <- unmap(class[,1])
185
+    zy <- unmap(class[,2])
186
+    zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
187
+
188
+    pi <- double(g*g)
189
+    mu <- double(g)
190
+    sigma <- double(g)
191
+    nu <- double(g)
192
+    tau <- double(g)
193
+    loglik <- double(1)
194
+    convergence <- integer(1)
195
+    results <- .C("em_normal_partial_concordant", as.double(data[,1]), as.double(data[,2]), as.double(t(zxy)), n, pi, mu, sigma, nu, tau, g, loglik, as.double(tol), as.integer(restriction), as.integer(constrain), as.integer(iteration), convergence)
196
+    return(list(model="PCD", convergence=results[[16]], pi=t(array(results[[5]],dim=c(g,g))), mu_sigma=rbind(results[[6]], results[[7]]), nu_tau=rbind(results[[8]], results[[9]]), loglik=results[[11]], class=apply(array(results[[3]], dim=c(n,g*g)),1,order,decreasing=TRUE)[1,], z=array(results[[3]], dim=c(n,g*g))))
197
+}
198
+
199
+subSampleData <- function(pdata, class, mu, sigma, nu, tau, pi) {
200
+    n <- as.integer(dim(pdata)[1])
201
+    g <- as.integer(nlevels(as.factor(class)))
202
+
203
+    yl.outer <- function(k, zx, zy){
204
+        return( c(zx[k,] %o% zy[k,]) )
205
+    }
206
+    yl.diag <- function(k, z){
207
+        return( c(diag(z[k,])) )
208
+    }
209
+
210
+    zx <- unmap(class[,1])
211
+    zy <- unmap(class[,2])
212
+    zxy <- sapply(1:dim(zx)[1], yl.outer, zx, zy)
213
+
214
+    results <- .C("subsampling", as.double(pdata[,1]), as.double(pdata[,2]), as.double(t(zxy)), n, as.double(pi), as.double(mu), as.double(sigma), as.double(nu), as.double(tau), g)
215
+    return(list(pi=t(array(results[[5]],dim=c(g,g))), mu_sigma=rbind(results[[6]], results[[7]]), nu_tau=rbind(results[[8]], results[[9]]), class=apply(array(results[[3]], dim=c(n,g*g)),1,order,decreasing=TRUE)[1,], z=array(results[[3]], dim=c(n,g*g))))
216
+
217
+} 
218
+
219
+fishersTrans <- function(rho) {
220
+    r = (1+rho)/(1-rho)
221
+    z = 0.5*log(r,base = exp(1))
222
+    return(z)
223
+}
224
+
225
+getNames <- function(x, y = NULL) {
226
+    if(is.null(y) == FALSE) {
227
+        namesMatrix <- NULL
228
+        for(i in 1:nrow(x)) {
229
+            tempMatrix <- cbind(rep(rownames(x)[i],nrow(y)), rownames(y))
230
+            namesMatrix <- rbind(namesMatrix, tempMatrix)
231
+        }
232
+    }  
233
+
234
+    if(is.null(y)) {
235
+        temp <- matrix(NA,nrow = nrow(x), ncol = nrow(x))
236
+        diag <- lower.tri(temp, diag = FALSE)
237
+        temp[diag] <- rep(1, sum(diag == TRUE))
238
+
239
+        namesMatrix <- NULL
240
+
241
+        for(i in 1:dim(temp)[1]) {
242
+            outputRow <- temp[i,]
243
+            index <- which(is.na(outputRow) == FALSE)
244
+            if(length(index) > 0) {
245
+                tempMatrix <- cbind(rep(rownames(x)[i],length(index)), rownames(x)[index])
246
+                namesMatrix <- rbind(namesMatrix, tempMatrix)
247
+            }
248
+        }
249
+    }
250
+
251
+    vector_names <- apply(namesMatrix, 1, function(k) paste(k[1],"_",k[2],sep = ""))
252
+    return(vector_names)
253
+}
254
+
255
+createVectors <- function(x, y = NULL, groups, cor.method = c("spearman")) {
256
+    index1 <- which(groups == 1)
257
+    index2 <- which(groups == 2)
258
+
259
+    check <- match(cor.method, c("spearman", "pearson", "bwmc", "sparcc"))
260
+    if(is.na(check)) {
261
+        stop("Please enter spearman, pearson, bwmc or sparcc for correlation metric.")
262
+    }
263
+
264
+    if(is.null(y) == FALSE) {
265
+        data <- rbind(x, y)
266
+        data1 <- data[,index1]
267
+        data2 <- data[,index2]
268
+        featureSize = dim(x)[1]
269
+        if(cor.method == c("spearman") || cor.method == c("pearson")) {
270
+            statMatrix1 <- cor(t(data1), method = cor.method)
271
+            statMatrix2 <- cor(t(data2), method = cor.method)
272
+        }
273
+        if(cor.method == c("bwmc")) {
274
+            statMatrix1 <- biwt.cor(data1)
275
+            statMatrix2 <- biwt.cor(data2)
276
+        }
277
+        if(cor.method == c("sparcc")) {
278
+            if(min(data) < 0) {
279
+                stop("SparCC can only be applied to sequencing data.")
280
+            }
281
+            statMatrix1 <- SparCC.count(t(data1))
282
+            statMatrix2 <- SparCC.count(t(data2))
283
+        }
284
+        statMatrix1 <- statMatrix1[1:featureSize,(featureSize + 1):dim(data1)[1]]
285
+        statMatrix2 <- statMatrix2[1:featureSize,(featureSize + 1):dim(data1)[1]]
286
+        statVector1 <- as.vector(statMatrix1)
287
+        statVector2 <- as.vector(statMatrix2)
288
+        vector_names <- getNames(x,y)
289
+    }
290
+
291
+    if(is.null(y)) {
292
+        data1 <- x[,index1]
293
+        data2 <- x[,index2]
294
+        if(cor.method == c("spearman") || cor.method == c("pearson")) {
295
+            statMatrix1 <- cor(t(data1), method = cor.method)
296
+            statMatrix2 <- cor(t(data2), method = cor.method)
297
+        }   
298
+        if(cor.method == c("bwmc")) {
299
+            statMatrix1 <- biwt.cor(data1)
300
+            statMatrix2 <- biwt.cor(data2)
301
+        }   
302
+        if(cor.method == c("sparcc")) {
303
+            if(min(data) < 0) {
304
+                stop("SparCC can only be applied to sequencing data.")
305
+            }   
306
+        statMatrix1 <- SparCC.count(t(data1))
307
+        statMatrix2 <- SparCC.count(t(data2))
308
+        }   
309
+        statVector1 <- as.vector(statMatrix1)
310
+        statVector2 <- as.vector(statMatrix2)
311
+        diag <- lower.tri(statMatrix1, diag = FALSE)
312
+        statVector1 <- statMatrix1[diag]
313
+        statVector2 <- statMatrix2[diag]
314
+        vector_names <- getNames(x)
315
+    }
316
+
317
+    names(statVector1) <- vector_names
318
+    names(statVector2) <- vector_names
319
+
320
+    return(list(v1 = statVector1, v2 = statVector2))
321
+}
322
+
323
+discordantRun <- function(v1, v2, x, y = NULL, transform = TRUE, subsampling = FALSE, subSize = dim(x)[1], iter = 100, components = 3) {
324
+
325
+    if(transform == TRUE) {
326
+        if(range(v1)[1] < -1 || range(v1)[2] > 1 || range(v2)[1] < -1 || range(v2)[2] > 1) {
327
+            stop("correlation vectors have values less than -1 and/or greater than 1.")
328
+        }
329
+        v1 <- fishersTrans(v1)
330
+        v2 <- fishersTrans(v2)
331
+    }
332
+
333
+    check <- match(components, c(3,5))
334
+    if(is.na(check)) {
335
+        stop("components must be 3 or 5")
336
+    }
337
+
338
+    featureSize = dim(x)[1]
339
+
340
+    pdata <- cbind(v1, v2)
341
+	
342
+    param1 <- sd(v1)
343
+    param2 <- sd(v2)
344
+	
345
+    if(components == 3) {
346
+        class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
347
+        class[pdata[,1]>0+param1,1] <- 2
348
+        class[pdata[,1]<0-param1,1] <- 1
349
+        class[pdata[,2]>0+param2,2] <- 2
350
+        class[pdata[,2]<0-param2,2] <- 1
351
+        discordClass <- c(2,3,4,6,7,8)
352
+    }
353
+
354
+    if(components == 5) {
355
+        class <- cbind(rep(0, dim(pdata)[1]), rep(0, dim(pdata)[1]))
356
+        class[pdata[,1]>0+param1,1] <- 2
357
+        class[pdata[,1]<0-param1,1] <- 1
358
+        class[pdata[,1]>0+(2*param1),1] <- 4
359
+        class[pdata[,1]<0-(2*param1),1] <- 3
360
+        class[pdata[,2]>0+param2,2] <- 2
361
+        class[pdata[,2]<0-param2,2] <- 1
362
+        class[pdata[,2]>0+(2*param2),2] <- 4
363
+        class[pdata[,2]<0-(2*param2),2] <- 3
364
+        concordClass <- c(1,7,13,19,25)
365
+        discordClass <- setdiff(1:25,concordClass)
366
+    }
367
+
368
+    if(subsampling == TRUE) {
369
+        subSize = nrow(x)
370
+        if(is.null(y) == FALSE & nrow(y) < subSize) {
371
+            subSize = nrow(y)
372
+        }
373
+        total_mu <- rep(0,3)
374
+        total_sigma <- rep(0,3)
375
+        total_nu <- rep(0,3)
376
+        total_tau <- rep(0,3)
377
+        total_pi <- rep(0,3)
378
+        for(i in 1:iter) {
379
+        # make sure pairs are independent
380
+            rowIndex <- sample(nrow(x), subSize)
381
+            colIndex <- sample(nrow(y), subSize)
382
+            mat1 <- matrix(v1, nrow = nrow(x), byrow = FALSE)
383
+            mat2 <- matrix(v2, nrow = nrow(x), byrow = FALSE)
384
+
385
+            subSampV1 <- sapply(1:subSize, function(x) mat1[rowIndex[x], colIndex[x]])
386
+            subSampV2 <- sapply(1:subSize, function(x) mat2[rowIndex[x], colIndex[x]])
387
+
388
+            sub.pdata <- cbind(subSampV1, subSampV2)
389
+            sub.class <- cbind(rep(0, subSize), rep(0, subSize))
390
+            if(components == 3) {
391
+                sub.class[sub.pdata[,1]>0+param1,1] <- 2
392
+                sub.class[sub.pdata[,1]<0-param1,1] <- 1
393
+                sub.class[sub.pdata[,2]>0+param2,2] <- 2
394
+                sub.class[sub.pdata[,2]<0-param2,2] <- 1
395
+            }
396
+            if(components == 5) {
397
+                sub.class[sub.pdata[,1]>0+param1,1] <- 2
398
+                sub.class[sub.pdata[,1]<0-param1,1] <- 1
399
+                sub.class[sub.pdata[,1]>0+(2*param1),1] <- 4
400
+                sub.class[sub.pdata[,1]<0-(2*param1),1] <- 3
401
+                sub.class[pdata[,2]>0+param2,2] <- 2
402
+                sub.class[pdata[,2]<0-param2,2] <- 1
403
+                sub.class[pdata[,2]>0+(2*param2),2] <- 4
404
+                sub.class[pdata[,2]<0-(2*param2),2] <- 3
405
+            }
406
+            pd <- em.normal.partial.concordant(sub.pdata, sub.class, tol=0.001, restriction=0, constrain=c(0,-sd(pdata),sd(pdata)), iteration=1000)
407
+            total_mu <- total_mu + pd$mu_sigma[1,]
408
+            total_sigma <- total_sigma + pd$mu_sigma[2,]
409
+            total_nu <- total_nu + pd$nu_tau[1,]
410
+            total_tau <- total_tau + pd$nu_tau[2,]
411
+            total_pi <- total_pi + pd$pi
412
+        }
413
+
414
+        mu <- total_mu/iter
415
+        sigma <- total_sigma/iter
416
+        nu <- total_nu/iter
417
+        tau <- total_tau/iter
418
+        pi <- total_pi/iter
419
+
420
+        finalResult <- subSampleData(pdata, class, mu, sigma, nu, tau, pi)
421
+        zTable <- finalResult$z
422
+        classVector <- finalResult$class
423
+    } else {
424
+        pd <- em.normal.partial.concordant(pdata, class, tol=0.001, restriction=0, constrain=c(0,-sd(pdata),sd(pdata)), iteration=1000)
425
+        zTable <- pd$z
426
+        classVector <- pd$class
427
+    }
428
+
429
+    discordPPV <- apply(zTable, 1, function(x) sum(x[discordClass])/sum(x))
430
+
431
+    if(is.null(y) == FALSE) {
432
+        discordPPMatrix <- matrix(discordPPV, nrow = featureSize, byrow = FALSE)
433
+        classMatrix <- matrix(classVector, nrow = featureSize, byrow = FALSE)
434
+        rownames(discordPPMatrix) <- rownames(x)
435
+        colnames(discordPPMatrix) <- rownames(y)
436
+        rownames(classMatrix) <- rownames(x)
437
+        colnames(classMatrix) <- rownames(y)
438
+
439
+        vector_names <- getNames(x,y)
440
+        names(discordPPV) <- vector_names
441
+        names(classVector) <- vector_names
442
+    }   
443
+
444
+    if(is.null(y)) {
445
+        discordPPMatrix <- matrix(NA,nrow = featureSize, ncol = featureSize)
446
+        classMatrix <- discordPPMatrix
447
+        diag <- lower.tri(discordPPMatrix, diag = FALSE)
448
+        discordPPMatrix[diag] <- discordPPV
449
+        classMatrix[diag] <- classVector
450
+        rownames(discordPPMatrix) <- rownames(x)
451
+        colnames(discordPPMatrix) <- rownames(x)
452
+        rownames(classMatrix) <- rownames(x)
453
+        colnames(classMatrix) <- rownames(x)
454
+        vector_names <- getNames(x)
455
+        names(discordPPV) <- vector_names
456
+        names(classVector) <- vector_names
457
+    }
458
+
459
+    zTable <- t(apply(zTable, 1, function(x) x/sum(x)))
460
+    rownames(zTable) <- vector_names
461
+
462
+    return(list(discordPPMatrix = discordPPMatrix, discordPPVector = discordPPV, classMatrix = classMatrix, classVector = classVector, probMatrix = zTable, loglik = pd$loglik))
463
+}
464
+
465
+splitMADOutlier <- function(mat, filter0 = TRUE, threshold = 2) {
466
+    if(is.matrix(mat) == FALSE) {
467
+        mat <- as.matrix(mat)
468
+    }
469
+    maxMAD <- c()
470
+    mat.filtered <- NULL
471
+    index <- c()
472
+    for(i in 1:nrow(mat)) {
473
+        y <- mat[i,]
474
+        x <- y
475
+        if(length(which(is.infinite(x))) >0 ){
476
+            x <- x[-which(is.infinite(x))]
477
+        }
478
+        if(length(which(is.na(x)))> 0) {
479
+            x <- x[-which(is.na(x))]
480
+        }
481
+        median.x <- median(x)
482
+        left <- x[x <= median.x]
483
+        right <- x[x > median.x]
484
+        left.mad <- mad(left)
485
+        right.mad <- mad(right)
486
+        leftThresh <- median(left) - left.mad*threshold
487
+        rightThresh <- median(right) + right.mad*threshold 
488
+        left.check <- sum(left < leftThresh)
489
+        right.check <- sum(right > rightThresh)
490
+        if(left.check == 0 & right.check == 0) {
491
+            mat.filtered <- rbind(mat.filtered, y)
492
+            rownames(mat.filtered)[nrow(mat.filtered)] <- rownames(mat)[i]
493
+            index <- c(index,i)
494
+        }
495
+    }
496
+	return(list(mat.filtered = mat.filtered, index = index))
497
+}
0 498
new file mode 100644
1 499
Binary files /dev/null and b/build/vignette.rds differ
2 500
new file mode 100644
3 501
Binary files /dev/null and b/data/TCGA_Breast_RNASeq.RData differ
4 502
new file mode 100644
5 503
Binary files /dev/null and b/data/TCGA_Breast_RNASeq_voom.RData differ
6 504
new file mode 100644
7 505
Binary files /dev/null and b/data/TCGA_Breast_miRNASeq.RData differ
8 506
new file mode 100644
9 507
Binary files /dev/null and b/data/TCGA_Breast_miRNASeq_voom.RData differ
10 508
new file mode 100644
11 509
Binary files /dev/null and b/data/TCGA_GBM_miRNA_microarray.RData differ
12 510
new file mode 100644
13 511
Binary files /dev/null and b/data/TCGA_GBM_transcript_microarray.RData differ
14 512
new file mode 100644
... ...
@@ -0,0 +1 @@
1
+October 23, 2016: Package submitted.
0 2
new file mode 100644
... ...
@@ -0,0 +1,348 @@
1
+%\VignetteIndexEntry{Discordant}
2
+
3
+\documentclass[12pt]{article}
4
+\usepackage{Sweave}
5
+\usepackage{caption}
6
+\usepackage{cite}
7
+\usepackage{hyperref}
8
+\usepackage{amssymb}
9
+\begin{document}
10
+\title{The Discordant R Package: A Novel Approach to Differential Correlation}
11
+\author{Charlotte Siska and Katerina Kechris} 
12
+\maketitle
13
+
14
+\tableofcontents
15
+
16
+\pagebreak
17
+
18
+\section{Introduction}
19
+
20
+Discordant is an R package that identifies pairs of features that correlate differently between phenotypic groups, with application to -omics datasets. Discordant uses a mixture model that “bins” molecular feature pairs based on their type of coexpression. More information on the algorithm can be found in \cite{siska1, siska2}. The final output are posterior probabilities of differential correlation. This package can be used to determine differential correlation within one –omics dataset or between two –omics datasets (provided that both –omics datasets were taken from the same samples). Also, the type of data can be any type of –omics with normal or non-normal distributions. Some examples are metabolomics, transcriptomic, proteomics, etc. 
21
+
22
+The functions in the Discordant package provide a simple pipeline for intermediate R users to determine differentially correlated pairs. The final output is a table of molecular feature pairs and their respective posterior probabilities. Functions have been written to allow flexibility for users in how they interpret results, which will be discussed further. Currently, the package only supports the comparison between two phenotypic groups (e.g., disease vs control, mutant vs wildtype).
23
+
24
+\section{Discordant Algorithm}
25
+
26
+Discordant is originally derived from the Concordant algorithm written by \cite{lai1, lai2}. It was used to determine concordance between microarrays. We have applied it to determine differential correlation of features between groups \cite{siska1,siska2}.
27
+
28
+Using a three component mixture model and the EM algorithm, the model predicts if the correlation coefficients in phenotypic groups 1 and 2 for a molecular feature pair are dissimilar \cite{siska1}. The correlation coefficients are generated for all possible molecular feature pairs witin an -omics dataset or between two -omics datasets. The correlation coefficients are transformed into z scores using Fisher's tranformation. The three components are -, + and 0 which correspond respectively to a negative, positive or no correlation. Molecular features that have correlation coefficients in \textit{different} components are considered \textit{differentially} correlated, as opposed to when correlation coefficients are in the \textit{same} component then they are \textit{equivalently} correlated.
29
+
30
+\begin{table}
31
+\begin{center}
32
+\begin{tabular}{ c |  c c c }
33
+  & 0 & - & + \\
34
+\hline
35
+0 & 1 & 2 & 3 \\
36
+- & 4 & 5 & 6 \\
37
++ & 7 & 8 & 9 \\
38
+\end{tabular}
39
+\caption{Class Matrix for Three Component Mixture Model}
40
+\end{center}
41
+\end{table}
42
+
43
+The class matrix (Table 1) contains the classes that represent all possible paired-correlation scenarios. These scenarios are based off the components in the mixture models. Molecular features that have correlation coefficients in different components are considered differentially correlated, as opposed to when correlation coefficients are in the same component they are equivalently correlated. This can be visualized in the class matrix, where the rows represent the components for group 1 and the columns represent the components for group 2. The classes on the diagonal represent equivalent correlation (1, 5 and 9), and classes in the off-diagonal represent differential correlation (2, 3, 4, 6, 8).
44
+
45
+After running the EM algorithm, we have 9 posterior probabilities for each molecular feature pair that correspond to the 9 classes in the class matrix. Since we want to summarize the probability that the molecular feature pair is differentially correlated, we sum the posterior probabilities representing the off-diagonal classes in the class matrix.
46
+
47
+\section{Example Data}
48
+
49
+All datasets are originally from the Cancer Genome Atlas (TCGA) and can be found at \href{http://cancergenome.nih.gov/}{http://cancergenome.nih.gov/}.
50
+
51
+\begin{description}
52
+\item{\verb|TCGA_GBM_miRNA_microarray|}{  Data is miRNA expression values from 10 control and 20 tumor samples for a Glioblastoma multiforme (GBM) comparison. The feature size was originally 470, but after features with outliers were filtered out feature size reduces to 331. In this sample dataset, random 10 features are present. The dataset was generated with an Agilent miRNA microarray.}
53
+\item{\verb|TCGA_GBM_transcript_microarray|}{  Data is transcript (or mRNA) expression values from 10 control and 20 tumor samples in a GBM comparison. The feature size was originally 90797, but after features with outliers were filtered out feature size reduces to 72656. In this sample dataset, 20 random features are present. The dataset was generated with an Agilent 244k micorarray.}
54
+\item{\verb|TCGA_Breast_miRNASeq|}{  Data is miRNA counts from 15 control and 45 tumor samples in a Breast Cancer comparison. The feature size was originally 212, but after features with outliers were filtered out feature size reduces to 200. In this sample dataset, 100 random features are present. Dataset was generated using Illumina HiSeq miRNASeq.}
55
+\item{\verb|TCGA_Breast_RNASeq|}{  Data is transcript (or mRNA) counts from 15 control and 45 tumor samples in a Breast Cancer comparison. The feature size was originally 19414, but after features with outliers were filtered out feature size reduces to 16656. In this sample dataset, 100 random features are present. The dataset was generated using Illumina HiSeq RNASeq.}
56
+\item{\verb|TCGA_Breast_miRNASeq_voom|}{  This dataset is the voom-transformed \verb|TCGA_Breast_miRNASeq|.}
57
+\item{\verb|TCGA_Breast_RNASeq_vooim|}{  This dataset is the voom-transformed \verb|TCGA_Breast_RNASeq|.}
58
+\end{description}
59
+
60
+\section{Before Starting}
61
+
62
+\subsection{Required Inputs}
63
+
64
+Within –omics refers to when the Discordant analysis is performed within one –omics dataset where all molecular features within a -omics dataset are paired to each other (e.g. transcript-transcript pairs in a microarray transcriptomics experiment).
65
+
66
+Between -omics refers to analysis of two -omics datasets. Molecular feature pairs analyzed are between the two -omics, (e.g. transcript-protein, protein-metabolite) are paired.
67
+
68
+\begin{description}
69
+\item[x]{ m by n matrix where m are features and n are samples. If only this matrix is provided, a within -omics analysis is performed.}
70
+\item[y]{ m by n matrix where m are features and n are samples. Optional, will induce between -omics analysis. Samples must be matched with those in x.} 
71
+\item[groups]{ vector containing 1s and 2s that correspond to the location of samples in the columns of x (and y if provided). For example, the control group is group 1 and the experimental group 2, and the location of samples corresponding to the two groups matches the locations of 1s and 2s in the group vector}
72
+\end{description}
73
+
74
+\subsection{Outliers}
75
+
76
+In our work, we found that features with outliers would skew correlation and cause false positives. Our approach was to filter out features that had large outliers. With normal data, such as in microarrays, Grubbs' test can be used. The null hypothesis is that there are no outliers in the data, and so features with p-value $\le$ 0.05 are kept. A simple R function is found in the \verb|outliers| R package as \verb|grubbs.test|.
77
+
78
+Determining outliers in non-normal data is more complicated. We used the median absolute deviation (MAD). Normally, features are filtered if they are outside 2 or 3 MADs from the median \cite{leys}. This is not completely applicable to sequencing data, because sequencing data has large variance and a non-symmetrical distribution. We used "split MAD," which has been used before \cite{magwene}. A left MAD is determined based on data left to the median and a right MAD is determined based on data to the right of the median. If there are any feature outside a factor of the left or right MAD from the median, there are featured out.
79
+
80
+A function in Discordant is provided called \verb|split.madOutlier|. The number of MAD outside of the median can be changed with option \verb|threshold|. Another option is \verb|filter0| which if \verb|TRUE| will filter out any feature with at least one 0. Arguments returned are \verb|mat.filtered|, which is the filtered matrix and \verb|index| which is the index of features that are retained in \verb|mat.filtered|.
81
+
82
+\begin{verbatim}
83
+> data(TCGA_Breast_miRNASeq)
84
+> mat.filtered <- splitMADOutlier(TCGA_Breast_miRNASeq,
85
++       filter0 = TRUE, threshold = 4)
86
+\end{verbatim}
87
+
88
+\section{Create Correlation Vectors}
89
+
90
+To run the Discordant algorithm correlation vectors respective to each group are necessary for input, which are easy to create using the  $\verb|createVectors|$. Each correlation coefficient represents the correlation between two molecular features. The type of molecular feature pairs  depend if a within -omics or between -omics analysis is performed. Correlation between molecular features in the same -omics dataset is within -omics, and correlation between molecular features in two different -omics datasets is between -omics. Whether or not within -omics or between -omics analysis is performed depends on whether one or two matrices are parameters for this function. $\verb|createVectors|$ has two outputs:
91
+
92
+\begin{description}
93
+\item[v1]{Correlation vector of molecular feature pairs corresponding to samples labeled 1 in group parameter.}
94
+\item[v2]{Correlation vector of molecular feature pairs corresponding to samples labeled 2 in group parameter.}
95
+\end{description}
96
+
97
+\begin{verbatim}
98
+
99
+> data(TCGA_GBM_miRNA_microarray)
100
+> data(TCGA_GBM_transcript_microarray)
101
+> groups <- c(rep(1,10), rep(2,10))
102
+
103
+#Within -Omics
104
+> vectors <- createVectors(TCGA_GBM_transcript_microarray, 
105
++      groups = groups)
106
+#Between -Omics
107
+> vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
108
++      TCGA_GBM_transcript_microarray, groups = groups)
109
+\end{verbatim}
110
+
111
+\subsection{Correlation Metric}
112
+
113
+We also have included different options for correlation metrics. This argument is called $\verb|cor.method|$ and its default is $\verb|"spearman"|$. Other options are $\verb|"pearson"|$, $\verb|"bwmc"|$ and $\verb|"sparcc"|$. For information and comparison of Spearman, Pearson and biweight midcorrelation (bwmc) please read this paper by \href{http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-328}{Song et al}\cite{song}. We have also investigated correlation metrics in Discordant in relation to sequencing data, and found Spearman's correlation had the best performance\cite{siska2}.
114
+
115
+The algorithm for SparCC was introduced by Friedman et al\cite{friedman}. We use R code written by Huaying Fang\cite{fang} .
116
+
117
+Example
118
+
119
+\begin{verbatim}
120
+
121
+> data(TCGA_GBM_miRNA_microarray)
122
+> data(TCGA_GBM_transcript_microarray)
123
+> groups <- c(rep(1,10), rep(2,10))
124
+
125
+#Within -Omics
126
+> vectors <- createVectors(TCGA_GBM_transcript_microarray, 
127
++      groups = groups, cor.method = c("bwmc"))
128
+#Between -Omics
129
+> vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
130
++      TCGA_GBM_transcript_microarray, groups = groups,
131
++      cor.method = c("bwmc"))
132
+
133
+\end{verbatim}
134
+
135
+\section{Run the Discordant Algorithm}
136
+
137
+The Discordant Algorithm is implemented in the the function $\verb|discordantRun|$ which requires two correlation vectors and the original data. If the user wishes to generate their own correlation vector before inputting into the dataset, they can do so. However, the function will return an error message if the dimensions of the datasets inserted do not match the correlation vector.
138
+
139
+The posterior probability output of the Discordant algorithm are the differential correlation posterior probabilities (the sum of the off-diagonal of the class matrix described in Table 1). If the user wishes to observe more detailed information, alternative outputs are available. $\verb|discordantRun|$ has five outputs:
140
+
141
+\begin{description}
142
+\item[discordPPMatrix]{Matrix of differential correlation posterior probabilities where rows and columns reflect features. If only x was inputted, then the rows and column names are the feature names, and the upper diagonal of the matrix are NAs to avoid repeating results. If x and y are inputted, the row names are features from x and the column names are features from y.}
143
+\item[discordPPVector]{Vector of differential correlation posterior probabilities. This has the same information as discordPPMatrix, but in vector form.}
144
+\item[classMatrix]{Matrix of classes with the highest posterior probability. Row and column names are the same as in discordPPMatrix depending if only x is inputted or both x and y.}
145
+\item[classVector]{Vector of class with the highest posterior probability for each pair. This has the same information as classMatrix, but in vector form.}
146
+\item[probMatrix]{Matrix of all posterior probabilities, where each row represents a feature pair and nine columns that represent the class within the class matrix. The values across each row add up to 1. Posterior probabilities in discordPPMatrix and discordPPVector are summed up from columns 2, 3, 4, 6, 7 and 8 which correspond to differential correlation classes (Table 1).}
147
+\item[loglik]{The log likelihood.}
148
+\end{description}
149
+
150
+\begin{verbatim}
151
+> data(TCGA_GBM_miRNA_microarray)
152
+> data(TCGA_GBM_transcript_microarray)
153
+> groups <- c(rep(1,10), rep(2,10))
154
+
155
+#Within -omics
156
+> vectors <- createVectors(TCGA_GBM_transcript_microarray, 
157
++      groups = groups)
158
+> result <- discordantRun(vectors$v1, vectors$v2, 
159
++      TCGA_GBM_transcript_microarray}
160
+> result$discordPPMatrix[1:4,1:4]
161
+             A_23_P138644 A_23_P24296 A_24_P345312 A_24_P571870
162
+A_23_P138644           NA          NA           NA           NA
163
+A_23_P24296    0.33050233          NA           NA           NA
164
+A_24_P345312   0.24910061  0.47858580           NA           NA
165
+A_24_P571870   0.07052135  0.23310301   0.07028103           NA
166
+> head(result$discordPPVector)
167
+A_23_P24296_A_23_P138644 A_24_P345312_A_23_P138644 
168
+               0.33050233                0.24910061 
169
+A_24_P345312_A_23_P24296 A_24_P571870_A_23_P138644
170
+               0.44839316                0.07052135
171
+A_24_P571870_A_23_P24296 A_24_P571870_A_24_P345312 
172
+               0.56846815                0.91395405 
173
+> result$classMatrix[1:4,1:4]
174
+             A_23_P138644 A_23_P24296 A_24_P345312 A_24_P571870
175
+A_23_P138644           NA          NA           NA           NA
176
+A_23_P24296             1          NA           NA           NA
177
+A_24_P345312            1           3           NA           NA
178
+A_24_P571870            1           1            1           NA
179
+> head(result$classVector)
180
+A_23_P24296_A_23_P138644 A_24_P345312_A_23_P138644 
181
+                        1                         1
182
+A_24_P345312_A_23_P24296 A_24_P571870_A_23_P138644
183
+                        1                         1  
184
+A_24_P571870_A_23_P24296 A_24_P571870_A_24_P345312 
185
+                        4                         2
186
+> head(result$probMatrix)
187
+                                   1            2           3
188
+A_23_P24296_A_23_P138644  0.63574038 5.223996e-07 0.289699897
189
+A_24_P345312_A_23_P138644 0.74859348 2.151027e-02 0.012949903
190
+A_24_P345312_A_23_P24296  0.92584634 4.666010e-03 0.022843630
191
+A_24_P571870_A_23_P138644 0.55002415 1.359646e-03 0.016168496
192
+A_24_P571870_A_23_P24296  0.43013074 5.668452e-04 0.014985341
193
+A_24_P571870_A_24_P345312 0.05392322 8.877314e-01 0.001065886
194
+                                   4            5           6
195
+A_23_P24296_A_23_P138644  0.01293366 2.138702e-09 7.37399e-03
196
+A_24_P345312_A_23_P138644 0.19191496 1.124351e-03 4.15700e-03
197
+A_24_P345312_A_23_P24296  0.00260480 2.636222e-06 8.20242e-05
198
+A_24_P571870_A_23_P138644 0.40351609 2.017512e-04 1.47538e-02
199
+A_24_P571870_A_23_P24296  0.52075349 1.377220e-04 2.24260e-02
200
+A_24_P571870_A_24_P345312 0.00961640 3.201954e-02 2.38872e-04
201
+                                   7            8           9
202
+A_23_P24296_A_23_P138644  0.02049424 1.062232e-08 0.033757287
203
+A_24_P345312_A_23_P138644 0.01824136 3.271036e-04 0.001181551
204
+A_24_P345312_A_23_P24296  0.04019691 1.279636e-04 0.003629672
205
+A_24_P571870_A_23_P138644 0.01257563 1.944655e-05 0.001380934
206
+A_24_P571870_A_23_P24296  0.00972846 7.995730e-06 0.001263389
207
+A_24_P571870_A_24_P345312 0.00138369 1.391785e-02 0.000103189
208
+> result$loglik
209
+[1] 1129.558
210
+> # Between -omics
211
+> vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
212
++      TCGA_GBM_transcript_microarray, groups = groups)
213
+> result <- discordantRun(vectors$v1, vectors$v2, 
214
++       TCGA_GBM_miRNA_microarray, 
215
++       TCGA_GBM_transcript_microarray)
216
+> result$discordPPMatrix[1:3,1:3]
217
+               A_23_P138644 A_23_P24296 A_24_P345312
218
+hsa-miR-19b-5p    0.9734173  0.19568366   0.70425484
219
+hsa-miR-206-5p    0.8422418  0.03809339   0.91989561
220
+hsa-miR-369-5p    0.9509883  0.05088263   0.99194855
221
+> head(result$discordPPVector)
222
+hsa-miR-19b-5p_A_23_P138644  hsa-miR-19b-5p_A_23_P24296 
223
+                  0.9734173                   0.8422418 
224
+hsa-miR-19b-5p_A_24_P345312 hsa-miR-19b-5p_A_24_P571870 
225
+                  0.9509883                   0.8983236 
226
+ hsa-miR-19b-5p_A_32_P71885  hsa-miR-19b-5p_A_32_P82889 
227
+                  0.9154871                   0.6706972
228
+> result$classMatrix[1:3,1:3] 
229
+               A_23_P138644 A_23_P24296 A_24_P345312
230
+hsa-miR-19b-5p            7           1            4
231
+hsa-miR-206-5p            2           1            3
232
+hsa-miR-369-5p            4           1            2
233
+> head(result$classVector)
234
+hsa-miR-19b-5p_A_23_P138644  hsa-miR-19b-5p_A_23_P24296 
235
+                          7                           2 
236
+hsa-miR-19b-5p_A_24_P345312 hsa-miR-19b-5p_A_24_P571870 
237
+                          4                           7 
238
+ hsa-miR-19b-5p_A_32_P71885  hsa-miR-19b-5p_A_32_P82889 
239
+                          4                           4 
240
+> head(result$probMatrix)
241
+                                    1          2          3
242
+hsa-miR-19b-5p_A_23_P138644 0.0265823 1.6948e-02 7.4477e-09
243
+hsa-miR-19b-5p_A_23_P24296  0.1575141 8.3361e-01 2.2858e-10
244
+hsa-miR-19b-5p_A_24_P345312 0.0488204 2.8058e-05 1.1427e-03
245
+hsa-miR-19b-5p_A_24_P571870 0.1016756 4.2631e-02 7.0747e-08
246
+hsa-miR-19b-5p_A_32_P71885  0.0844775 3.4488e-06 2.7536e-02
247
+hsa-miR-19b-5p_A_32_P82889  0.2333993 1.1262e-01 1.1956e-07
248
+                                     4           5        6
249
+hsa-miR-19b-5p_A_23_P138644 1.2909e-11 2.82374e-12 2.26e-19
250
+hsa-miR-19b-5p_A_23_P24296  1.3327e-04 2.44029e-04 1.21e-14
251
+hsa-miR-19b-5p_A_24_P345312 9.4829e-01 1.90403e-04 1.50e-03
252
+hsa-miR-19b-5p_A_24_P571870 7.6664e-10 1.10858e-10 3.34e-17
253
+hsa-miR-19b-5p_A_32_P71885  8.6805e-01 1.23439e-05 1.98e-02
254
+hsa-miR-19b-5p_A_32_P82889  5.5782e-01 9.59033e-02 1.80e-08
255
+                                     7           8        9
256
+hsa-miR-19b-5p_A_23_P138644 8.4389e-01 1.12569e-01 3.33e-07
257
+hsa-miR-19b-5p_A_23_P24296  4.1233e-03 4.37311e-03 8.73e-12
258
+hsa-miR-19b-5p_A_24_P345312 2.3763e-05 2.67832e-09 8.07e-07
259
+hsa-miR-19b-5p_A_24_P571870 7.8738e-01 6.83115e-02 7.76e-07
260
+hsa-miR-19b-5p_A_32_P71885  4.9158e-05 3.95251e-10 2.30e-05
261
+hsa-miR-19b-5p_A_32_P82889  2.1812e-04 2.06579e-05 1.65e-10
262
+> result$loglik
263
+[1] 1169.986
264
+\end{verbatim}
265
+
266
+\subsection{Subsampling}
267
+
268
+Subsampling is an option to run the EM algorithm with a random sample of independent feature pairs are drawn. This is repeated for a number of samplings, and then the average of these parameters are used to maximize posterior probabilities for all feature pairs. This option was introduced to speed up Discordant method and to also solve the independence assumption. There are some implementation issues which are explained in Siska et al, submitted\cite{siska2}.
269
+
270
+The argument $\verb|subsampling|$ must be set to $\verb|TRUE|$ for subsampling to be used. The number of independent feature pairs to be subsampled is determined by the argument $\verb|subSize|$ whose default value is the number of rows in $\verb|x|$. The number of independent feature pairs must be less or equal to the number of features in $\verb|x|$ and $\verb|y|$. The number of random samplings to be run is set by the argument $\verb|iter|$, whose default value is 100.
271
+
272
+
273
+\begin{verbatim}
274
+> data(TCGA_GBM_miRNA_microarray)
275
+> data(TCGA_GBM_transcript_microarray)
276
+> groups <- c(rep(1,10), rep(2,10))
277
+
278
+> vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
279
++      TCGA_GBM_transcript_microarray, groups = groups)
280
+> result <- discordantRun(vectors$v1, vectors$v2, 
281
++      TCGA_Breast_miRNASeq, 
282
++      TCGA_Breast_RNASeq, subsampling = TRUE, 
283
++      iter = 200, subSize = 20)
284
+\end{verbatim}
285
+
286
+\begin{table}[h]
287
+\begin{center}
288
+\begin{tabular}{ c |  c c c c c}  
289
+  & 0 & - & -- & + & ++ \\
290
+\hline
291
+0 & 1 & 2 & 3 & 4 & 5 \\
292
+- & 6 & 7 & 8 & 9 & 10  \\  
293
+-- & 11 & 12 & 13 & 14 & 15 \\
294
++ & 16 & 17 & 18 & 19 & 20 \\
295
+\end{tabular}
296
+\caption{Class Matrix for Five Component Mixture Model}
297
+\end{center}
298
+\end{table}
299
+
300
+\subsection{Increase Component Size}
301
+
302
+We also provide the option to increase component size from three to five in the mixture model. The number of classes in the class matrix increases, as seen in Table 2. Incorporating the extra components means that it is possible to identify elevated differential correlation, which is when there are associations in both groups in the same direction but one is more extreme. Using this options introduces more parameters, which does have an effect on run-time. We also found that using the five mixture component mixture model reduces performance compared to the three component mixture model\cite{siska2}. However, the option is available if users wish to explore more types of differential correlation.
303
+
304
+The default is to run the three component mixture model and can be changed with option $\verb|components|$.
305
+
306
+Example:
307
+
308
+\begin{verbatim}
309
+> data(TCGA_GBM_miRNA_microarray)
310
+> data(TCGA_GBM_transcript_microarray)
311
+> groups <- c(rep(1,10), rep(2,10))
312
+
313
+> vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
314
++      TCGA_GBM_transcript_microarray, groups = groups)
315
+> result <- discordantRun(vectors$v1, vectors$v2, 
316
++      TCGA_GBM_transcript_microarray, 
317
++      TCGA_GBM_miRNA_microarray, components = 5)
318
+\end{verbatim}
319
+
320
+\section{Example Run with Microarrays}
321
+
322
+\begin{verbatim}
323
+data(TCGA_GBM_miRNA_microarray)
324
+data(TCGA_GBM_transcript_microarray)
325
+groups <- c(rep(1,10), rep(2,10))
326
+
327
+#Within -Omics
328
+
329
+vectors <- createVectors(TCGA_GBM_transcript_microarray, 
330
+      groups = groups)
331
+result <- discordantRun(vectors$v1, vectors$v2, 
332
+      TCGA_GBM_transcript_microarray)
333
+
334
+#Between -Omics
335
+
336
+vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
337
+      TCGA_GBM_transcript_microarray, groups = groups)
338
+result <- discordantRun(vectors$v1, vectors$v2, 
339
+      TCGA_GBM_miRNA_microarray, 
340
+      TCGA_GBM_transcript_microarray)
341
+\end{verbatim}
342
+
343
+\bibliographystyle{ieeetr}
344
+
345
+\bibliography{Discordant_bib_v3}
346
+
347
+\end{document}
0 348
new file mode 100644
1 349
Binary files /dev/null and b/inst/doc/Discordant_vignette.pdf differ
2 350
new file mode 100644
... ...
@@ -0,0 +1,19 @@
1
+\name{TCGA_Breast_RNASeq}
2
+\docType{data}
3
+\alias{TCGA_Breast_RNASeq}
4
+\title{TCGA Breast Cancer RNASeq Sample Dataset}
5
+\description{
6
+	This dataset contains TMM normalized RNA count values from RNASeq that was taken from the Cancer Genome Atlas, or TCGA. It has 100 features and 57 samples. The original dataset had 17972 features and 57 samples.
7
+}
8
+
9
+\references{
10
+National Institues of Health. The Cancer Genome Atlas. http://cancergenome.nih.gov/
11
+}
12
+
13
+\author{
14
+Charlotte Siska <siska.charlotte@gmail.edu>
15
+}
16
+
17
+\usage{TCGA_Breast_RNASeq}
18
+\format{A matrix of RNA count values}
19
+\keyword{datasets}
0 20
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+\name{TCGA_Breast_RNASeq_voom}
2
+\docType{data}
3
+\alias{TCGA_Breast_RNASeq_voom}
4
+\title{TCGA Breast Cancer RNASeq Sample Dataset}
5
+\description{
6
+	This dataset contains TMM normalized voom-transformed RNA count values from RNASeq that was taken from the Cancer Genome Atlas, or TCGA.
7
+}
8
+
9
+\references{
10
+Charity W Law, Yunshun Chen, Wei Shi, Gordon K Smyth. voom: precision weights unlock linear model analysis tools for RNA-seq read counts. 2014. Genome Biology, 15:R29.
11
+National Institues of Health. The Cancer Genome Atlas. http://cancergenome.nih.gov/
12
+}
13
+
14
+\author{
15
+Charlotte Siska <siska.charlotte@gmail.edu>
16
+}
17
+
18
+\usage{TCGA_Breast_RNASeq_voom}
19
+\format{A matrix of RNA count values}
20
+\keyword{datasets}
0 21
new file mode 100644
... ...
@@ -0,0 +1,19 @@
1
+\name{TCGA_Breast_miRNASeq}
2
+\docType{data}
3
+\alias{TCGA_Breast_miRNASeq}
4
+\title{TCGA Breast Cancer miRNASeq Sample Dataset}
5
+\description{
6
+	This dataset contains TMM normalized miRNA count values from miRNASeq that was taken from the Cancer Genome Atlas, or TCGA. The dataset has 100 miRNA and 57 samples. The original dataset has 212 miRNA and 57 samples.
7
+}
8
+
9
+\references{
10
+National Institues of Health. The Cancer Genome Atlas. http://cancergenome.nih.gov/
11
+}
12
+\author{
13
+Charlotte Siska <siska.charlotte@gmail.edu>
14
+}
15
+
16
+
17
+\usage{TCGA_Breast_miRNASeq}
18
+\format{A matrix of miRNA count values}
19
+\keyword{datasets}
0 20
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+\name{TCGA_Breast_miRNASeq_voom}
2
+\docType{data}
3
+\alias{TCGA_Breast_miRNASeq_voom}
4
+\title{TCGA Breast Cancer miRNASeq Sample Dataset}
5
+\description{
6
+	This dataset contains TMM normalized voom-transformed miRNA count values from miRNASeq that was taken from the Cancer Genome Atlas, or TCGA. The dataset has 100 miRNA and 57 samples. The original dataset has 212 miRNA and 57 samples.
7
+}
8
+
9
+\references{
10
+Charity W Law, Yunshun Chen, Wei Shi, Gordon K Smyth. voom: precision weights unlock linear model analysis tools for RNA-seq read counts. 2014. Genome Biology, 15:R29.
11
+National Institues of Health. The Cancer Genome Atlas. http://cancergenome.nih.gov/
12
+}
13
+
14
+\author{
15
+Charlotte Siska <siska.charlotte@gmail.edu>
16
+}
17
+
18
+\usage{TCGA_Breast_miRNASeq_voom}
19
+\format{A matrix of miRNA count values}
20
+\keyword{datasets}
0 21
new file mode 100644
... ...
@@ -0,0 +1,19 @@
1
+\name{TCGA_GBM_miRNA_microarray}
2
+\docType{data}
3
+\alias{TCGA_GBM_miRNA_microarray}
4
+\title{TCGA Glioblastoma Multiforme miRNA Sample Dataset}
5
+\description{
6
+	This dataset contains miRNA expression values from a microarray that was taken from the Cancer Genome Atlas, or TCGA. It has 10 features and 30 samples. The original dataset had 331 features and 30 samples.
7
+}
8
+
9
+\references{
10
+National Institues of Health. The Cancer Genome Atlas. http://cancergenome.nih.gov/
11
+}
12
+
13
+\author{
14
+Charlotte Siska <siska.charlotte@gmail.edu>
15
+}
16
+
17
+\usage{TCGA_GBM_miRNASample}
18
+\format{A matrix of miRNA expression values}
19
+\keyword{datasets}
0 20
new file mode 100644
... ...
@@ -0,0 +1,19 @@
1
+\name{TCGA_GBM_transcript_microarray}
2
+\docType{data}
3
+\alias{TCGA_GBM_transcript_microarray}
4
+\title{TCGA Glioblastoma Multiforme Transcript Sample Dataset}
5
+\description{
6
+	This dataset contains transcript expression values from a microarray that was taken from the Cancer Genome Atlas, or TCGA. It has 10 features and 30 samples. The original dataset had 72656 features and 30 samples.
7
+}
8
+
9
+\references{
10
+National Institues of Health. The Cancer Genome Atlas. http://cancergenome.nih.gov/
11
+}
12
+
13
+\author{
14
+Charlotte Siska <siska.charlotte@gmail.edu>
15
+}
16
+
17
+\usage{TCGA_GBM_transcript_microarray}
18
+\format{A matrix of transcript expression values}
19
+\keyword{datasets}
0 20
new file mode 100644
... ...
@@ -0,0 +1,54 @@
1
+
2
+\name{createVectors}
3
+\alias{createVectors}
4
+\title{Create Pearson's correlation coefficient vectors based on bivariate data}
5
+
6
+\description{
7
+  Calculates correlation coefficients based on two groups of omics bivariate data. Currently, only two groups of samples can be specified. Used to make input for discordantRun().
8
+}
9
+
10
+\usage{
11
+createVectors(x, y = NULL, groups, cor.method = c("spearman"))
12
+}
13
+
14
+\arguments{
15
+  \item{x}{m by n matrix where m are features and n are samples}
16
+  \item{y}{optional second m by n matrix, induces dual -omics analysis}
17
+  \item{groups}{n-length vector of 1s and 2s matching samples belonging to groups 1 and 2}
18
+  \item{cor.method}{correlation method to measure association. Options are "spearman", "pearson", "bwmc" and "sparcc"}
19
+}
20
+
21
+\value{
22
+  \item{v1}{List of correlation coefficients for group 1}
23
+  \item{v2}{List of correlation coefficients for group 2}
24
+}
25
+
26
+\references{
27
+Siska C, Bowler R and Kechris K. The Discordant Method: A Novel Approach for Differential Correlation. (2015) Bioinformatics. 32(5): 690-696.
28
+Friedman J and Alm EJ. Inferring Correlation Networks from Genomic Survey Data. (2012) PLoS Computational Biology. 8:9, e1002687.
29
+}
30
+
31
+\author{
32
+Charlotte Siska <siska.charlotte@gmail.edu>
33
+}
34
+
35
+\examples{
36
+
37
+## load data
38
+data("TCGA_GBM_miRNA_microarray") # loads matrix called TCGA_GBM_miRNA_microarray
39
+data("TCGA_GBM_transcript_microarray") # loads matrix called TCGA_GBM_transcript_microarray
40
+print(colnames(TCGA_GBM_transcript_microarray)) # look at groups
41
+
42
+groups <- c(rep(1,10), rep(2,20))
43
+
44
+# transcript-transcript pairs
45
+
46
+vectors <- createVectors(TCGA_GBM_transcript_microarray, groups = groups, cor.method = c("pearson"))
47
+
48
+# miRNA-transcript pairs
49
+
50
+vectors <- createVectors(TCGA_GBM_transcript_microarray, TCGA_GBM_miRNA_microarray, groups = groups)
51
+
52
+}
53
+
54
+\keyword{datagen}
0 55
new file mode 100644
... ...
@@ -0,0 +1,66 @@
1
+% File src/library/discordant/man/discordantRun.Rd
2
+
3
+\name{discordantRun}
4
+\alias{discordantRun}
5
+\title{Run Discordant Algorithm}
6
+
7
+\description{
8
+   Runs discordant algorithm on two vectors of correlation coefficients.
9
+}
10
+
11
+\usage{
12
+discordantRun(v1, v2, x, y = NULL, transform = TRUE, subsampling = FALSE, subSize = dim(x)[1], iter = 100, components = 3)
13
+}
14
+
15
+\arguments{
16
+  \item{v1}{Vector of Pearson correlation coefficients in group 1}
17
+  \item{v2}{Vector of Pearson correlation coefficients in group 2}
18
+  \item{x}{m by n matrix where m are features and n samples}
19
+  \item{y}{optional second m by n matrix, induces dual -omics analysis}
20
+  \item{transform}{If TRUE v1 and v2 will be Fisher transformed}
21
+  \item{subsampling}{If TRUE subsampling will be run}
22
+  \item{subSize}{Indicates how many feature pairs to be used for subsampling. Default is the feature size in x}
23
+  \item{iter}{Number of iterations for subsampling. Default is 100}
24
+  \item{components}{Number of components in mixture model.}
25
+}
26
+
27
+\value{
28
+  \item{discordPPVector}{Vector of differentially correlated posterior probabilities.}
29
+  \item{discordPPMatrix}{Matrix of differentially correlated posterior probabilities where rows and columns reflect features}
30
+  \item{classVector}{Vector of classes that have the highest posterior probability}
31
+  \item{classMatrix}{Matrix of classes that have hte highest posterior probability where rows and columns reflect features}
32
+  \item{probMatrix}{Matrix of posterior probabilities where rows are each molecular feature pair and columns are nine different classes}
33
+  \item{loglik}{Final log likelihood}
34
+}
35
+
36
+\references{
37
+ Siska C, Bowler R and Kechris K. The Discordant Method: A Novel Approach for Differential Correlation (2015), Bioinformatics. 32 (5): 690-696.
38
+Lai Y, Zhang F, Nayak TK, Modarres R, Lee NH and McCaffrey TA. Concordant integrative gene set enrichment analysis of multiple large-scale two-sample expression data sets. (2014) BMC Genomics 15, S6.
39
+Lai Y, Adam B-l, Podolsky R, She J-X. A mixture model approach to the tests of concordance and discordancd between two large-scale experiments with two sample groups. (2007) Bioinformatics 23, 1243-1250.
40
+}
41
+
42
+\author{
43
+Charlotte Siska <siska.charlotte@gmail.edu>
44
+}
45
+
46
+\examples{
47
+
48
+## load Data
49
+
50
+data(TCGA_GBM_miRNA_microarray) # loads matrix called TCGA_GBM_miRNA_microarray
51
+data(TCGA_GBM_transcript_microarray) # loads matrix called TCGA_GBM_transcript_microarray
52
+print(colnames(TCGA_GBM_transcript_microarray)) # look at groups
53
+groups <- c(rep(1,10), rep(2,20))
54
+
55
+## DC analysis on only transcripts pairs
56
+
57
+vectors <- createVectors(TCGA_GBM_transcript_microarray, groups = groups)
58
+result <- discordantRun(vectors$v1, vectors$v2, TCGA_GBM_transcript_microarray)
59
+
60
+## DC analysis on miRNA-transcript pairs
61
+
62
+vectors <- createVectors(TCGA_GBM_transcript_microarray, TCGA_GBM_miRNA_microarray, groups = groups, cor.method = c("pearson"))
63
+result <- discordantRun(vectors$v1, vectors$v2, TCGA_GBM_transcript_microarray, TCGA_GBM_miRNA_microarray)
64
+
65
+}
66
+\keyword{ model }
0 67
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+\name{fishersTrans}
2
+\alias{fishersTrans}
3
+\title{Fisher Transformation of Pearson Correlation Coefficients to Z Scores}
4
+
5
+\description{
6
+   Transforms Pearsons correlation coefficients into z scores using Fishers method.
7
+}
8
+
9
+\usage{
10
+fishersTrans(rho)
11
+}
12
+
13
+\arguments{
14
+  \item{rho}{Integer or numeric list of Pearson's correlation coefficients}
15
+}
16
+
17
+\references{
18
+Fisher, R.A. (1915). "Frequency distribution of the values of the correlation coefficient in samples of an indefinitely large population". Biometrika (Biometrika Trust) 10 (4).
19
+}
20
+
21
+\examples{
22
+
23
+## Create integer or list of Pearson's correlation coefficients.
24
+
25
+library(MASS)
26
+rhoV <- as.vector(cor(t(mvrnorm(10,rep(3,100),diag(100)))))
27
+
28
+## Determine Fisher-Transformed z scores of rho
29
+zV <- fishersTrans(rhoV)
30
+}
31
+
32
+\keyword{methods}
0 33
new file mode 100644
... ...
@@ -0,0 +1,38 @@
1
+\name{splitMADOutlier}
2
+\alias{splitMADOutlier}
3
+\title{Outliers using left and right MAD}
4
+
5
+\description{
6
+  Identify features with outliers using left and right median absolute deviation (MAD).
7
+}
8
+
9
+\usage{
10
+splitMADOutlier(mat, filter0 = TRUE, threshold = 2)
11
+}
12
+
13
+\arguments{
14
+  \item{mat}{mxn matrix of -omics data, where rows are features and columns samples.}
15
+  \item{filter0}{Option to filter out features if they have at least one 0 value. Default is TRUE.}
16
+  \item{threshold}{Threshold of how many MADs outside the left or right median is used to determine features with outliers.}
17
+}
18
+
19
+\value{
20
+  \item{mat.filtered}{Input matrix where features with outliers filtered out.}
21
+  \item{index}{Index of features that have no outliers.}
22
+}
23
+
24
+\references{
25
+Leys C, Klein O, Bernard P and Licata L. "Detecting Outliers: Do Not Use Standard Deviation Around the Mean, Use Absolute Deivation Around the Median." Journal of Experimental Social Psychology, 2013. 49(4), 764-766.
26
+Magwene, PM, Willis JH, Kelly JK and Siepel A. "The Statistics of Bulk Segregant Analysis Using Next Generation Sequencing." PLoS Computational Biology, 2011. 7(11), e1002255.
27
+}
28
+
29
+\examples{
30
+
31
+## Simulate matrix of continuous -omics data.
32
+data(TCGA_Breast_miRNASeq)
33
+
34
+## Filter matrix based on outliers.
35
+mat.filtered <- splitMADOutlier(TCGA_Breast_miRNASeq)$mat.filtered
36
+}
37
+
38
+\keyword{methods}
0 39
new file mode 100644
... ...
@@ -0,0 +1,236 @@
1
+#include <R.h>
2
+
3
+void em_normal_partial_concordant(double *x, double *y, double *zxy, int *n, double *pi, double *mu, double *sigma, double *nu, double *tau, int *g, double *loglik, double *tol, int *restriction, int *constrain, int *iteration, int *convergence) {
4
+	int i, j, k, flag, iter;
5
+	double temp, loglik2;
6
+
7
+
8
+	/*theoretical restriction on the null*/
9
+	if((*restriction)>0){
10
+	/*initialization*/
11
+		flag=1;
12
+		(*loglik) = 0;
13
+		for(k=0;k<*n;k++){	
14
+			(*loglik) = (*loglik) - 0.5*x[k]*x[k] - 0.5*log(2*PI) - 0.5*y[k]*y[k] - 0.5*log(2*PI);
15
+		}
16
+		/*iteration*/
17
+		
18
+		iter = 0;
19
+		while(flag>0 & iter<(*iteration)){
20
+		/*M step for pi, mu and sigma*/
21
+			for(i=0;i<(*g);i++){
22
+				pi[i] = 0;
23
+				for(k=0;k<(*n);k++){
24
+					for(j=0;j<(*g);j++){
25
+						pi[i] = pi[i] + zxy[(j*(*g)+i)*(*n)+k];
26
+					}
27
+				}
28
+				mu[i] = 0;
29
+				for(k=0;k<(*n);k++){
30
+					for(j=0;j<(*g);j++){
31
+						mu[i] = mu[i] + zxy[(j*(*g)+i)*(*n)+k]*x[k];
32
+					}
33
+				}
34
+				mu[i] = mu[i] / pi[i];
35
+				sigma[i] = 0;
36
+				for(k=0;k<(*n);k++){
37
+					for(j=0;j<(*g);j++){
38
+						sigma[i] = sigma[i] + zxy[(j*(*g)+i)*(*n)+k]*(x[k]-mu[i])*(x[k]-mu[i]);
39
+					}
40
+				}
41
+				sigma[i] = sigma[i] / pi[i];
42
+			}
43
+			for(j=0;j<(*g);j++){
44
+				pi[j] = 0;
45
+				for(k=0;k<(*n);k++){
46
+					for(i=0;i<(*g);i++){
47
+						pi[j] = pi[j] + zxy[(j*(*g)+i)*(*n)+k];
48
+					}
49
+				}
50
+				nu[j] = 0;
51
+				for(k=0;k<(*n);k++){
52
+					for(i=0;i<(*g);i++){
53
+						nu[j] = nu[j] + zxy[(j*(*g)+i)*(*n)+k]*y[k];
54
+					}
55
+				}
56
+				nu[j] = nu[j] / pi[j];
57
+				tau[j] = 0;
58
+				for(k=0;k<(*n);k++){
59
+					for(i=0;i<(*g);i++){
60
+						tau[j] = tau[j] + zxy[(j*(*g)+i)*(*n)+k]*(y[k]-nu[j])*(y[k]-nu[j]);
61
+					}
62
+				}
63
+				tau[j] = tau[j] / pi[j];
64
+			}
65
+			for(i=0;i<(*g);i++){
66
+				for(j=0;j<(*g);j++){
67
+					pi[i*(*g)+j] = 0;
68
+					for(k=0;k<(*n);k++){
69
+						pi[i*(*g)+j] = pi[i*(*g)+j] + zxy[(j*(*g)+i)*(*n)+k];
70
+					}
71
+					pi[i*(*g)+j] = pi[i*(*g)+j] / (double)(*n);
72
+				}
73
+			}
74
+			/*constrain*/
75
+			for(i=0;i<(*g);i++){
76
+				if(constrain[i]==0-1){
77
+					if(mu[i]>0.0){
78
+						mu[i]=0.0;
79
+					}
80
+					if(nu[i]>0.0){
81
+						nu[i]=0.0;
82
+					}
83
+				}
84
+				if(constrain[i]==0){
85
+					mu[i] = 0.0;
86
+					sigma[i] = 1.0;
87
+					nu[i] = 0.0;
88
+					tau[i] = 1.0;
89
+				}
90
+				if(constrain[i]==0+1){
91
+					if(mu[i]<0.0){
92
+						mu[i]=0.0;
93
+					}
94
+					if(nu[i]<0.0){
95
+						nu[i]=0.0;
96
+					}
97
+				}
98
+			}
99
+			/*check convergence*/
100
+			loglik2 = (*loglik);
101
+			(*loglik) = 0;
102
+			for(k=0;k<(*n);k++){
103
+				temp = 0;
104
+				for(i=0;i<(*g);i++){
105
+					for(j=0;j<(*g);j++){
106
+						temp = temp + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) );
107
+					}
108
+				}
109
+				(*loglik) = (*loglik) + log(temp);
110
+			}
111
+			if(fabs((*loglik)-loglik2)<(*tol)){
112
+				flag = 0;
113
+			}
114
+			/*E step for z*/
115
+			for(k=0;k<(*n);k++){
116
+				temp = 0;
117
+				for(i=0;i<(*g);i++){
118
+					for(j=0;j<(*g);j++){
119
+						temp = temp + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) );
120
+					}
121
+				}
122
+				for(i=0;i<(*g);i++){
123
+					for(j=0;j<(*g);j++){
124
+						zxy[(j*(*g)+i)*(*n)+k] = pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) )/temp;
125
+					}
126
+				}
127
+			}
128
+			iter = iter + 1;
129
+		}
130
+		(*convergence) = 0;
131
+		if(iter<(*iteration)){
132
+			(*convergence) = 1;
133
+		}
134
+	}
135
+	/*no restriction on the null*/
136
+	else{
137
+	/*initialization*/
138
+		flag=1;
139
+		(*loglik) = 0;
140
+		for(k=0;k<(*n);k++){
141
+			(*loglik) = (*loglik) - 0.5*x[k]*x[k] - 0.5*log(2*PI) - 0.5*y[k]*y[k] - 0.5*log(2*PI);
142
+		}
143
+		/*iteration*/
144
+		iter = 0;
145
+		while(flag>0 & iter<(*iteration)){
146
+		/*M step for pi, mu and sigma*/
147
+			for(i=0;i<(*g);i++){
148
+				pi[i] = 0;
149
+				for(k=0;k<(*n);k++){
150
+					for(j=0;j<(*g);j++){
151
+						pi[i] = pi[i] + zxy[(j*(*g)+i)*(*n)+k];
152
+					}
153
+				}
154
+				mu[i] = 0;
155
+				for(k=0;k<(*n);k++){
156
+					for(j=0;j<(*g);j++){
157
+						mu[i] = mu[i] + zxy[(j*(*g)+i)*(*n)+k]*x[k];
158
+					}
159
+				}
160
+				mu[i] = mu[i] / pi[i];
161
+				sigma[i] = 0;
162
+				for(k=0;k<(*n);k++){
163
+					for(j=0;j<(*g);j++){
164
+						sigma[i] = sigma[i] + zxy[(j*(*g)+i)*(*n)+k]*(x[k]-mu[i])*(x[k]-mu[i]);
165
+					}
166
+				}
167
+				sigma[i] = sigma[i] / pi[i];
168
+			}
169
+			for(j=0;j<(*g);j++){
170
+				pi[j] = 0;
171
+				for(k=0;k<(*n);k++){
172
+					for(i=0;i<(*g);i++){
173
+						pi[j] = pi[j] + zxy[(j*(*g)+i)*(*n)+k];
174
+					}
175
+				}
176
+				nu[j] = 0;
177
+				for(k=0;k<(*n);k++){
178
+					for(i=0;i<(*g);i++){
179
+						nu[j] = nu[j] + zxy[(j*(*g)+i)*(*n)+k]*y[k];
180
+					}
181
+				}
182
+				nu[j] = nu[j] / pi[j];
183
+				tau[j] = 0;
184
+				for(k=0;k<(*n);k++){
185
+					for(i=0;i<(*g);i++){
186
+						tau[j] = tau[j] + zxy[(j*(*g)+i)*(*n)+k]*(y[k]-nu[j])*(y[k]-nu[j]);
187
+					}
188
+				}
189
+				tau[j] = tau[j] / pi[j];
190
+			}
191
+			for(i=0;i<(*g);i++){
192
+				for(j=0;j<(*g);j++){
193
+					pi[i*(*g)+j] = 0;
194
+					for(k=0;k<(*n);k++){
195
+						pi[i*(*g)+j] = pi[i*(*g)+j] + zxy[(j*(*g)+i)*(*n)+k];
196
+					}
197
+					pi[i*(*g)+j] = pi[i*(*g)+j] / (double)(*n);
198
+				}
199
+			}
200
+			/*check convergence*/
201
+			loglik2 = (*loglik);
202
+			(*loglik) = 0;
203
+			for(k=0;k<(*n);k++){
204
+				temp = 0;
205
+				for(i=0;i<(*g);i++){
206
+					for(j=0;j<(*g);j++){
207
+						temp = temp + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) );
208
+					}
209
+				}
210
+				(*loglik) = (*loglik) + log(temp);
211
+			}
212
+			if(fabs((*loglik)-loglik2)<(*tol)){
213
+				flag = 0;
214
+			}
215
+			/*E step for z*/
216
+			for(k=0;k<(*n);k++){
217
+				temp = 0;
218
+				for(i=0;i<(*g);i++){
219
+					for(j=0;j<(*g);j++){
220
+						temp = temp + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) );
221
+					}
222
+				}
223
+				for(i=0;i<(*g);i++) {
224
+					for(j=0;j<(*g);j++){
225
+						zxy[(j*(*g)+i)*(*n)+k] = zxy[(j*(*g)+i)*(*n)+k] + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) )/temp;
226
+					}
227
+				}
228
+			}
229
+			iter = iter + 1;
230
+		}
231
+		(*convergence) = 0;
232
+		if(iter<(*iteration)){
233
+			(*convergence) = 1;
234
+		}
235
+	}
236
+}
0 237
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+#include <R.h>
2
+
3
+void subsampling(double *x, double *y, double *zxy, int *n, double *pi, double *mu, double *sigma, double *nu, double *tau, int *g) {
4
+	int i, j, k;
5
+	double temp;
6
+	for(k=0;k<(*n);k++){
7
+		temp = 0;
8
+		for(i=0;i<(*g);i++){
9
+			for(j=0;j<(*g);j++){
10
+				temp = temp + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) );
11
+			}
12
+		}
13
+		for(i=0;i<(*g);i++) {
14
+			for(j=0;j<(*g);j++){
15
+				zxy[(j*(*g)+i)*(*n)+k] = zxy[(j*(*g)+i)*(*n)+k] + pi[i*(*g)+j] * ( exp(0-0.5*(x[k]-mu[i])*(x[k]-mu[i])/sigma[i])/sqrt(2*PI*sigma[i]) )*( exp(0-0.5*(y[k]-nu[j])*(y[k]-nu[j])/tau[j])/sqrt(2*PI*tau[j]) )/temp;
16
+			}
17
+		}
18
+	}
19
+
20
+}
0 21
new file mode 100644
... ...
@@ -0,0 +1,70 @@
1
+@article{fang,
2
+  author = "Huaying Fang and Chengcheng Huang and Hongyu Zhao and Minghua Deng",
3
+  title = "CCLasso: correlation inference for compositional data through Lasso",
4
+  journal = "Bioinformatics",
5
+  volume = 31,
6
+  number = 19,
7
+  pages = "3172-3180",
8
+  year = 2015}
9
+
10
+@article{friedman,
11
+  author="Jonathan Friedman and Eric J. Alm",
12
+  title= "Inferring correlation networks from genomic survey data",
13
+  journal= "PLoS Computational Biology",
14
+  year = 2012}
15
+
16
+@article{lai1,
17
+  author = "Yinglei Lai and Bao-ling Adam and Robert Podolsky and Jin-Xiong She",
18
+  title = "A mixture model approach to the tests of concordance and discordance between two large-scale experiments with two-sample groups",
19
+  journal = "Bioinformatics",
20
+  volume = 23,
21
+  number = 10,
22
+  pages = "1243-1250",
23
+  year = 2007}
24
+
25
+@article{lai2,
26
+  author = "Yinglei Lai and Fanni Zhang and Tapan K. Naya and Reza Modarres and Norman H. Lee and Timonthy A. McCaffrey",
27
+  title = "Concordant integrative gene set enrichment analysis of multiple large-scale two-sample expression data sets",
28
+  journal = "BMC Genomics",
29
+  volume = 15,
30
+  year = 2014}
31
+
32
+@article{leys,
33
+  author = "Christophe Leys and Olivier Klein and Philippe Bernard and Laurent Licata",
34
+  title = "Detecting outliers: Do not use standard deviation around the mean, use absolute deviation around the median",
35
+  journal = "Journal of Experimental Social Psychology",
36
+  volume = 49,
37
+  number = 4,
38
+  year = 2013}
39
+
40
+@article{magwene,
41
+  author = "Paul M. Magwene and John H. Willis and John K. Kelley and Adam Siepel",
42
+  title = "The Statistics of Bulk Segregant Analysis Using Next Generation Sequencing",
43
+  journal = "PLoS Computational Biology",
44
+  volume = 7,
45
+  number = 11,
46
+  year = 2011}
47
+
48
+@article{siska1,
49
+  author = "Charlotte Siska and Russ Bowler and Katerina Kechris",
50
+  title = "The Discordant Method: a Novel Approach for Differential Correlation",
51
+  journal = "Bioinformatics",
52
+  volume = 32,
53
+  number = 5,
54
+  pages = "690-696",
55
+  year = 2015}
56
+
57
+@unpublished{siska2,
58
+  title = "Differential Correlation for Sequencing Data",
59
+  author = "Charlotte Siska. and Katerina Kechris",
60
+  note = "Submitted",
61
+  year = 2016}
62
+
63
+@article{song,
64
+  author= "Lin Song and Peter Langfelder and Steve Horvath",
65
+  title= "Comparison of co-expression measures: mutual information, correlation, and model based indices",
66
+  journal = "BMC Bioinformatics",
67
+  volume = 13,
68
+  number = 328,
69
+  year = 2012}
70
+
0 71
new file mode 100644
... ...
@@ -0,0 +1,355 @@
1
+%\VignetteIndexEntry{Discordant}
2
+
3
+\documentclass[12pt]{article}
4
+\usepackage{Sweave}
5
+\usepackage{caption}
6
+\usepackage{cite}
7
+\usepackage{hyperref}
8
+\usepackage{amssymb}
9
+\usepackage[utf8]{inputenc}
10
+\begin{document}
11
+\title{The Discordant R Package: A Novel Approach to Differential Correlation}
12
+\author{Charlotte Siska and Katerina Kechris} 
13
+\maketitle
14
+
15
+\tableofcontents
16
+
17
+\pagebreak
18
+
19
+\section{Introduction}
20
+
21
+Discordant is an R package that identifies pairs of features that correlate differently between phenotypic groups, with application to -omics datasets. Discordant uses a mixture model that “bins” molecular feature pairs based on their type of coexpression. More information on the algorithm can be found in \cite{siska1, siska2}. The final output are posterior probabilities of differential correlation. This package can be used to determine differential correlation within one –omics dataset or between two –omics datasets (provided that both –omics datasets were taken from the same samples). Also, the type of data can be any type of –omics with normal or non-normal distributions. Some examples are metabolomics, transcriptomic, proteomics, etc. 
22
+
23
+The functions in the Discordant package provide a simple pipeline for intermediate R users to determine differentially correlated pairs. The final output is a table of molecular feature pairs and their respective posterior probabilities. Functions have been written to allow flexibility for users in how they interpret results, which will be discussed further. Currently, the package only supports the comparison between two phenotypic groups (e.g., disease vs control, mutant vs wildtype).
24
+
25
+\section{Required packages}
26
+
27
+R packages WGCNA, gtools, MASS and tools are required.
28
+
29
+\section{Discordant Algorithm}
30
+
31
+Discordant is originally derived from the Concordant algorithm written by \cite{lai1, lai2}. It was used to determine concordance between microarrays. We have applied it to determine differential correlation of features between groups \cite{siska1,siska2}.
32
+
33
+Using a three component mixture model and the EM algorithm, the model predicts if the correlation coefficients in phenotypic groups 1 and 2 for a molecular feature pair are dissimilar \cite{siska1}. The correlation coefficients are generated for all possible molecular feature pairs witin an -omics dataset or between two -omics datasets. The correlation coefficients are transformed into z scores using Fisher's tranformation. The three components are -, + and 0 which correspond respectively to a negative, positive or no correlation. Molecular features that have correlation coefficients in \textit{different} components are considered \textit{differentially} correlated, as opposed to when correlation coefficients are in the \textit{same} component then they are \textit{equivalently} correlated.
34
+
35
+\begin{table}
36
+\begin{center}
37
+\begin{tabular}{ c |  c c c }
38
+  & 0 & - & + \\
39
+\hline
40
+0 & 1 & 2 & 3 \\
41
+- & 4 & 5 & 6 \\
42
++ & 7 & 8 & 9 \\
43
+\end{tabular}
44
+\caption{Class Matrix for Three Component Mixture Model}
45
+\end{center}
46
+\end{table}
47
+
48
+The class matrix (Table 1) contains the classes that represent all possible paired-correlation scenarios. These scenarios are based off the components in the mixture models. Molecular features that have correlation coefficients in different components are considered differentially correlated, as opposed to when correlation coefficients are in the same component they are equivalently correlated. This can be visualized in the class matrix, where the rows represent the components for group 1 and the columns represent the components for group 2. The classes on the diagonal represent equivalent correlation (1, 5 and 9), and classes in the off-diagonal represent differential correlation (2, 3, 4, 6, 8).
49
+
50
+After running the EM algorithm, we have 9 posterior probabilities for each molecular feature pair that correspond to the 9 classes in the class matrix. Since we want to summarize the probability that the molecular feature pair is differentially correlated, we sum the posterior probabilities representing the off-diagonal classes in the class matrix.
51
+
52
+\section{Example Data}
53
+
54
+All datasets are originally from the Cancer Genome Atlas (TCGA) and can be found at \href{http://cancergenome.nih.gov/}{http://cancergenome.nih.gov/}.
55
+
56
+\begin{description}
57
+\item{\verb|TCGA_GBM_miRNA_microarray|}{  Data is miRNA expression values from 10 control and 20 tumor samples for a Glioblastoma multiforme (GBM) Agilent miRNA micorarray. The feature size was originally 470, but after features with outliers were filtered out feature size reduces to 331. In this sample dataset, random 10 features are present.}
58
+\item{\verb|TCGA_GBM_transcript_microarray|}{  Data is transcript (or mRNA) expression values from 10 control and 20 tumor samples in a GBM Agilent 244k micorarray. The feature size was originally 90797, but after features with outliers were filtered out, feature size reduces to 72656. In this sample dataset, 20 random features are present.}
59
+\item{\verb|TCGA_Breast_miRNASeq|}{  Data is miRNA counts from 15 control and 45 tumor samples in a Breast Cancer Illumina HiSeq miRNASeq. The feature size was originally 212, but after features with outliers were filtered out feature size reduces to 200. In this sample dataset, 100 random features are present.}
60
+\item{\verb|TCGA_Breast_RNASeq|}{  Data is transcript (or mRNA) counts from 15 control and 45 tumor samples in a Breast Cancer Illumina HiSeq RNASeq. The feature size was originally 19414, but after features with outliers were filtered out feature size reduces to 16656. In this sample dataset, 100 random features are present.}
61
+\item{\verb|TCGA_Breast_miRNASeq_voom|}{  This dataset is the voom-transformed \verb|TCGA_Breast_miRNASeq|.}
62
+\item{\verb|TCGA_Breast_RNASeq_vooim|}{  This dataset is the voom-transformed \verb|TCGA_Breast_RNASeq|.}
63
+\end{description}
64
+
65
+\section{Before Starting}
66
+
67
+\subsection{Required Inputs}
68
+
69
+"Within" –omics refers to when the Discordant analysis is performed within one –omics dataset where all molecular features within a -omics dataset are paired to each other (e.g. transcript-transcript pairs in a microarray transcriptomics experiment).
70
+
71
+"Between" -omics refers to analysis of two -omics datasets. Molecular feature pairs analyzed are between the two -omics, (e.g. transcript-protein, protein-metabolite) are paired.
72
+
73
+\begin{description}
74
+\item[x]{ m by n matrix where m are features and n are samples. If only this matrix is provided, a within -omics analysis is performed.}
75
+\item[y]{ m by n matrix where m are features and n are samples. This is an optional argument which will induce between -omics analysis. Samples must be matched with those in x.} 
76
+\item[groups]{ vector containing 1s and 2s that correspond to the location of samples in the columns of x (and y if provided). For example, the control group is group 1 and the experimental group 2, and the location of samples corresponding to the two groups matches the locations of 1s and 2s in the group vector}
77
+\end{description}
78
+
79
+\subsection{Outliers}
80
+
81
+In our work, we found that features with outliers would skew correlation and cause false positives. Our approach was to filter out features that had large outliers. With normal data, such as in microarrays, Grubbs' test can be used. The null hypothesis is that there are no outliers in the data, and so features with p-value $\ge$ 0.05 are kept. A simple R function is found in the \verb|outliers| R package as \verb|grubbs.test|.
82
+
83
+Determining outliers in non-normal data is more complicated. We used the median absolute deviation (MAD). Normally, features are filtered if they are outside 2 or 3 MADs from the median \cite{leys}. This is not completely applicable to sequencing data, because sequencing data has large variance and a non-symmetrical distribution. We used 'split MAD', which has been used before \cite{magwene}. A left MAD is determined based on data left to the median and a right MAD is determined based on data to the right of the median. If there are any feature outside a factor of the left or right MAD from the median, there are featured out.
84
+
85
+A function in Discordant is provided called \verb|split.madOutlier|. The number of MAD outside of the median can be changed with option \verb|threshold|. Another option is \verb|filter0| which if \verb|TRUE| will filter out any feature with at least one 0. Arguments returned are \verb|mat.filtered|, which is the filtered matrix and \verb|index| which is the index of features that are retained in \verb|mat.filtered|.
86
+
87
+\begin{verbatim}
88
+> data(TCGA_Breast_miRNASeq)
89
+> mat.filtered <- splitMADOutlier(TCGA_Breast_miRNASeq,
90
++       filter0 = TRUE, threshold = 4)
91
+\end{verbatim}
92
+
93
+\section{Create Correlation Vectors}
94
+
95
+To run the Discordant algorithm, correlation vectors respective to each group are necessary for input, which are easy to create using the  $\verb|createVectors|$. Each correlation coefficient represents the correlation between two molecular features. The type of molecular feature pairs depends if a within -omics or between -omics analysis is performed. Correlation between molecular features in the same -omics dataset is within -omics, and correlation between molecular features in two different -omics datasets is between -omics. Whether or not within -omics or between -omics analysis is performed depends on whether one or two matrices are parameters for this function. $\verb|createVectors|$ has two outputs:
96
+
97
+
98
+\begin{description}
99
+\item[v1]{Correlation vector of molecular feature pairs corresponding to samples labeled 1 in group parameter.}
100
+\item[v2]{Correlation vector of molecular feature pairs corresponding to samples labeled 2 in group parameter.}
101
+\end{description}
102
+
103
+\begin{verbatim}
104
+
105
+> data(TCGA_GBM_miRNA_microarray)
106
+> data(TCGA_GBM_transcript_microarray)
107
+> groups <- c(rep(1,10), rep(2,10))
108
+
109
+#Within -Omics
110
+> wthn_vectors <- createVectors(TCGA_GBM_transcript_microarray, 
111
++      groups = groups)
112
+#Between -Omics
113
+> btwn_vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
114
++      TCGA_GBM_transcript_microarray, groups = groups)
115
+\end{verbatim}
116
+
117
+\subsection{Correlation Metric}
118
+
119
+We also have included different options for correlation metrics. This argument is called $\verb|cor.method|$ and its default value is $\verb|"spearman"|$. Other options are $\verb|"pearson"|$, $\verb|"bwmc"|$ and $\verb|"sparcc"|$. For information and comparison of Spearman, Pearson and biweight midcorrelation (bwmc) please read this paper by \href{http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-328}{Song et al}\cite{song}. We have also investigated correlation metrics in Discordant in relation to sequencing data, and found Spearman's correlation had the best performance\cite{siska2}.
120
+
121
+The algorithm for SparCC was introduced by Friedman et al\cite{friedman}. We use R code written by Huaying Fang\cite{fang} .
122
+
123
+Example
124
+
125
+\begin{verbatim}
126
+
127
+> data(TCGA_GBM_miRNA_microarray)
128
+> data(TCGA_GBM_transcript_microarray)
129
+> groups <- c(rep(1,10), rep(2,10))
130
+
131
+#Within -Omics
132
+> wthn_vectors <- createVectors(TCGA_GBM_transcript_microarray, 
133
++      groups = groups, cor.method = c("bwmc"))
134
+#Between -Omics
135
+> btwn_vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
136
++      TCGA_GBM_transcript_microarray, groups = groups,
137
++      cor.method = c("bwmc"))
138
+
139
+\end{verbatim}
140
+
141
+\section{Run the Discordant Algorithm}
142
+
143
+The Discordant Algorithm is implemented in the the function $\verb|discordantRun|$ which requires two correlation vectors and the original data. If the user wishes to generate their own correlation vector before inputting into the dataset, they can do so. However, the function will return an error message if the dimensions of the datasets inserted do not match the correlation vector.
144
+
145
+The posterior probability output of the Discordant algorithm are the differential correlation posterior probabilities (the sum of the off-diagonal of the class matrix described in Table 1). If the user wishes to observe more detailed information, alternative outputs are available. $\verb|discordantRun|$ has five outputs:
146
+
147
+\begin{description}
148
+\item[discordPPMatrix]{Matrix of differential correlation posterior probabilities where rows and columns reflect features. If only x was inputted, then the number of rows and columns are number of features in x. The rows and column names are the feature names, and the upper diagonal of the matrix are NAs to avoid repeating results. If x and y are inputted, the number of rows is the feature size of x and the number of columns the feature size of y. The row names are features from x and the column names are features from y.}
149
+\item[discordPPVector]{Vector of differential correlation posterior probabilities. The length is the number of feature pairs. The names of the vector are the feature pairs.}
150
+\item[classMatrix]{Matrix of classes with the highest posterior probability. Row and column names are the same as in discordPPMatrix depending if only x is inputted or both x and y.}
151
+\item[classVector]{Vector of class with the highest posterior probability for each pair. The length is the number of feature pairs. Names of vector correspond to the feature pairs, similar to discordPPVector.}
152
+\item[probMatrix]{Matrix of all posterior probabilities, where the number of rows is the number of feature pairs and the columns represent the class within the class matrix. The number of columns can be 9 or 25, depending on how many mixture components are chosen (discussed later). The values across each row add up to 1. Posterior probabilities in discordPPMatrix and discordPPVector are the summation of columns that correspond to differential correlation classes (Table 1).}
153
+\item[loglik]{The log likelihood.}
154
+\end{description}
155
+
156
+\begin{verbatim}
157
+> data(TCGA_GBM_miRNA_microarray)
158
+> data(TCGA_GBM_transcript_microarray)
159
+> groups <- c(rep(1,10), rep(2,10))
160
+
161
+#Within -omics
162
+> wthn_vectors <- createVectors(TCGA_GBM_transcript_microarray, 
163
++      groups = groups)
164
+> wthn_result <- discordantRun(vectors$v1, vectors$v2, 
165
++      TCGA_GBM_transcript_microarray)
166
+> wthn_result$discordPPMatrix[1:4,1:4]
167
+             A_23_P138644 A_23_P24296 A_24_P345312 A_24_P571870
168
+A_23_P138644           NA          NA           NA           NA
169
+A_23_P24296    0.33050233          NA           NA           NA
170
+A_24_P345312   0.24910061  0.47858580           NA           NA
171
+A_24_P571870   0.07052135  0.23310301   0.07028103           NA
172
+> head(wthn_result$discordPPVector)
173
+A_23_P24296_A_23_P138644 A_24_P345312_A_23_P138644 
174
+               0.33050233                0.24910061 
175
+A_24_P345312_A_23_P24296 A_24_P571870_A_23_P138644
176
+               0.44839316                0.07052135
177
+A_24_P571870_A_23_P24296 A_24_P571870_A_24_P345312 
178
+               0.56846815                0.91395405 
179
+> wthn_result$classMatrix[1:4,1:4]
180
+             A_23_P138644 A_23_P24296 A_24_P345312 A_24_P571870
181
+A_23_P138644           NA          NA           NA           NA
182
+A_23_P24296             1          NA           NA           NA
183
+A_24_P345312            1           3           NA           NA
184
+A_24_P571870            1           1            1           NA
185
+> head(wthn_result$classVector)
186
+A_23_P24296_A_23_P138644 A_24_P345312_A_23_P138644 
187
+                        1                         1
188
+A_24_P345312_A_23_P24296 A_24_P571870_A_23_P138644
189
+                        1                         1  
190
+A_24_P571870_A_23_P24296 A_24_P571870_A_24_P345312 
191
+                        4                         2
192
+> head(wthn_result$probMatrix)
193
+                                   1            2           3
194
+A_23_P24296_A_23_P138644  0.63574038 5.223996e-07 0.289699897
195
+A_24_P345312_A_23_P138644 0.74859348 2.151027e-02 0.012949903
196
+A_24_P345312_A_23_P24296  0.92584634 4.666010e-03 0.022843630
197
+A_24_P571870_A_23_P138644 0.55002415 1.359646e-03 0.016168496
198
+A_24_P571870_A_23_P24296  0.43013074 5.668452e-04 0.014985341
199
+A_24_P571870_A_24_P345312 0.05392322 8.877314e-01 0.001065886
200
+                                   4            5           6
201
+A_23_P24296_A_23_P138644  0.01293366 2.138702e-09 7.37399e-03
202
+A_24_P345312_A_23_P138644 0.19191496 1.124351e-03 4.15700e-03
203
+A_24_P345312_A_23_P24296  0.00260480 2.636222e-06 8.20242e-05
204
+A_24_P571870_A_23_P138644 0.40351609 2.017512e-04 1.47538e-02
205
+A_24_P571870_A_23_P24296  0.52075349 1.377220e-04 2.24260e-02
206
+A_24_P571870_A_24_P345312 0.00961640 3.201954e-02 2.38872e-04
207
+                                   7            8           9
208
+A_23_P24296_A_23_P138644  0.02049424 1.062232e-08 0.033757287
209
+A_24_P345312_A_23_P138644 0.01824136 3.271036e-04 0.001181551
210
+A_24_P345312_A_23_P24296  0.04019691 1.279636e-04 0.003629672
211
+A_24_P571870_A_23_P138644 0.01257563 1.944655e-05 0.001380934
212
+A_24_P571870_A_23_P24296  0.00972846 7.995730e-06 0.001263389
213
+A_24_P571870_A_24_P345312 0.00138369 1.391785e-02 0.000103189
214
+> wthn_result$loglik
215
+[1] 1129.558
216
+> # Between -omics
217
+> btwn_vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
218
++      TCGA_GBM_transcript_microarray, groups = groups)
219
+> btwn_result <- discordantRun(vectors$v1, vectors$v2, 
220
++       TCGA_GBM_miRNA_microarray, 
221
++       TCGA_GBM_transcript_microarray)
222
+> btwn_result$discordPPMatrix[1:3,1:3]
223
+               A_23_P138644 A_23_P24296 A_24_P345312
224
+hsa-miR-19b-5p    0.9734173  0.19568366   0.70425484
225
+hsa-miR-206-5p    0.8422418  0.03809339   0.91989561
226
+hsa-miR-369-5p    0.9509883  0.05088263   0.99194855
227
+> head(btwn_result$discordPPVector)
228
+hsa-miR-19b-5p_A_23_P138644  hsa-miR-19b-5p_A_23_P24296 
229
+                  0.9734173                   0.8422418 
230
+hsa-miR-19b-5p_A_24_P345312 hsa-miR-19b-5p_A_24_P571870 
231
+                  0.9509883                   0.8983236 
232
+ hsa-miR-19b-5p_A_32_P71885  hsa-miR-19b-5p_A_32_P82889 
233
+                  0.9154871                   0.6706972
234
+> btwn_result$classMatrix[1:3,1:3] 
235
+               A_23_P138644 A_23_P24296 A_24_P345312
236
+hsa-miR-19b-5p            7           1            4
237
+hsa-miR-206-5p            2           1            3
238
+hsa-miR-369-5p            4           1            2
239
+> head(btwn_result$classVector)
240
+hsa-miR-19b-5p_A_23_P138644  hsa-miR-19b-5p_A_23_P24296 
241
+                          7                           2 
242
+hsa-miR-19b-5p_A_24_P345312 hsa-miR-19b-5p_A_24_P571870 
243
+                          4                           7 
244
+ hsa-miR-19b-5p_A_32_P71885  hsa-miR-19b-5p_A_32_P82889 
245
+                          4                           4 
246
+> head(btwn_result$probMatrix)
247
+                                    1          2          3
248
+hsa-miR-19b-5p_A_23_P138644 0.0265823 1.6948e-02 7.4477e-09
249
+hsa-miR-19b-5p_A_23_P24296  0.1575141 8.3361e-01 2.2858e-10
250
+hsa-miR-19b-5p_A_24_P345312 0.0488204 2.8058e-05 1.1427e-03
251
+hsa-miR-19b-5p_A_24_P571870 0.1016756 4.2631e-02 7.0747e-08
252
+hsa-miR-19b-5p_A_32_P71885  0.0844775 3.4488e-06 2.7536e-02
253
+hsa-miR-19b-5p_A_32_P82889  0.2333993 1.1262e-01 1.1956e-07
254
+                                     4           5        6
255
+hsa-miR-19b-5p_A_23_P138644 1.2909e-11 2.82374e-12 2.26e-19
256
+hsa-miR-19b-5p_A_23_P24296  1.3327e-04 2.44029e-04 1.21e-14
257
+hsa-miR-19b-5p_A_24_P345312 9.4829e-01 1.90403e-04 1.50e-03
258
+hsa-miR-19b-5p_A_24_P571870 7.6664e-10 1.10858e-10 3.34e-17
259
+hsa-miR-19b-5p_A_32_P71885  8.6805e-01 1.23439e-05 1.98e-02
260
+hsa-miR-19b-5p_A_32_P82889  5.5782e-01 9.59033e-02 1.80e-08
261
+                                     7           8        9
262
+hsa-miR-19b-5p_A_23_P138644 8.4389e-01 1.12569e-01 3.33e-07
263
+hsa-miR-19b-5p_A_23_P24296  4.1233e-03 4.37311e-03 8.73e-12
264
+hsa-miR-19b-5p_A_24_P345312 2.3763e-05 2.67832e-09 8.07e-07
265
+hsa-miR-19b-5p_A_24_P571870 7.8738e-01 6.83115e-02 7.76e-07
266
+hsa-miR-19b-5p_A_32_P71885  4.9158e-05 3.95251e-10 2.30e-05
267
+hsa-miR-19b-5p_A_32_P82889  2.1812e-04 2.06579e-05 1.65e-10
268
+> btwn_result$loglik
269
+[1] 1169.986
270
+\end{verbatim}
271
+
272
+\subsection{Subsampling}
273
+
274
+Subsampling is an option to run the EM algorithm with a random sample of independent feature pairs. This is repeated for a number of samplings, and then the average of these parameters are used to maximize posterior probabilities for all feature pairs. This option was introduced to speed up Discordant method and to also address the independence assumption. There are some implementation issues which are explained in Siska et al, submitted\cite{siska2}.
275
+
276
+The argument $\verb|subsampling|$ must be set to $\verb|TRUE|$ for subsampling to be used. The number of independent feature pairs to be subsampled is determined by the argument $\verb|subSize|$ whose default value is the number of rows in $\verb|x|$. The number of independent feature pairs must be less or equal to the number of features in $\verb|x|$ and $\verb|y|$. The number of random samplings to be run is set by the argument $\verb|iter|$, whose default value is 100.
277
+
278
+For the subsampling example, we will use the sequencing data. The sequencing data has a greater feature size than the microarray data. A dataset with small feature size will cause a segmentation fault with subsampling because not enough feature pairs are being used to estimate parameters in the mixture model.
279
+
280
+\begin{verbatim}
281
+> data(TCGA_Breast_miRNASeq)
282
+> data(TCGA_Breast_RNASeq)
283
+> groups <- c(rep(1,10), rep(2,10))
284
+
285
+> vectors <- createVectors(TCGA_Breast_miRNASeq, 
286
++      TCGA_Breast_RNASeq, groups = groups)
287
+> result <- discordantRun(vectors$v1, vectors$v2, 
288
++      TCGA_Breast_miRNASeq, 
289
++      TCGA_Breast_RNASeq, subsampling = TRUE, 
290
++      iter = 200, subSize = 20)
291
+\end{verbatim}
292
+
293
+\begin{table}[h]
294
+\begin{center}
295
+\begin{tabular}{ c |  c c c c c}  
296
+  & 0 & - & -- & + & ++ \\
297
+\hline
298
+0 & 1 & 2 & 3 & 4 & 5 \\
299
+- & 6 & 7 & 8 & 9 & 10  \\  
300
+-- & 11 & 12 & 13 & 14 & 15 \\
301
++ & 16 & 17 & 18 & 19 & 20 \\
302
+\end{tabular}
303
+\caption{Class Matrix for Five Component Mixture Model}
304
+\end{center}
305
+\end{table}
306
+
307
+\subsection{Increase Component Size}
308
+
309
+We also provide the option to increase component size from three to five in the mixture model. The number of classes in the class matrix increases, as seen in Table 2. Incorporating the extra components means that it is possible to identify elevated differential correlation, which is when there are associations in both groups in the same direction but one is more extreme. Using this options introduces more parameters, which does have an effect on run-time. We also found that using the five mixture component mixture model reduces performance compared to the three component mixture model\cite{siska2}. However, the option is available if users wish to explore more types of differential correlation.
310
+
311
+The default is to run the three component mixture model and can be changed with option $\verb|components|$.
312
+
313
+Example:
314
+
315
+\begin{verbatim}
316
+> data(TCGA_GBM_miRNA_microarray)
317
+> data(TCGA_GBM_transcript_microarray)
318
+> groups <- c(rep(1,10), rep(2,10))
319
+
320
+> btwn_vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
321
++      TCGA_GBM_transcript_microarray, groups = groups)
322
+> bwtn_result <- discordantRun(vectors$v1, vectors$v2, 
323
++      TCGA_GBM_transcript_microarray, 
324
++      TCGA_GBM_miRNA_microarray, components = 5)
325
+\end{verbatim}
326
+
327
+\section{Example Run with Microarrays}
328
+
329
+\begin{verbatim}
330
+data(TCGA_GBM_miRNA_microarray)
331
+data(TCGA_GBM_transcript_microarray)
332
+groups <- c(rep(1,10), rep(2,10))
333
+
334
+#Within -Omics
335
+
336
+> wthn_vectors <- createVectors(TCGA_GBM_transcript_microarray, 
337
++     groups = groups)
338
+> wthn_result <- discordantRun(vectors$v1, vectors$v2, 
339
++     TCGA_GBM_transcript_microarray)
340
+
341
+#Between -Omics
342
+
343
+> btwn_vectors <- createVectors(TCGA_GBM_miRNA_microarray, 
344
++     TCGA_GBM_transcript_microarray, groups = groups)
345
+> btwn_result <- discordantRun(vectors$v1, vectors$v2, 
346
++     TCGA_GBM_miRNA_microarray, 
347
++     TCGA_GBM_transcript_microarray)
348
+\end{verbatim}
349
+
350
+\bibliographystyle{ieeetr}
351
+
352
+\bibliography{Discordant_bib_v3}
353
+