Browse code

Added the CoGAPS package.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/CoGAPS@48067 bc3139a8-67e5-0310-9ffc-ced21a209358

Chao-Jen Wong authored on 12/07/2010 17:26:34
Showing 34 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,18 @@
1
+Package: CoGAPS
2
+Version: 0.99.1
3
+Date: 2010-07-02
4
+Title: Coordinated Gene Activity in Pattern Sets 
5
+Author: Elana J. Fertig
6
+Description: Coordinated Gene Activity in Pattern Sets (CoGAPS) infers
7
+  biological processes which are active in individual gene sets from
8
+  corresponding microarray measurements.  CoGAPS achieves this inference by
9
+  combining a MCMC matrix decomposition algorithm (GAPS) with a novel statistic
10
+  inferring activity on gene sets. 
11
+Maintainer: Elana J. Fertig <ejfertig@jhmi.edu>, Michael F. Ochs <mfo@jhu.edu>
12
+SystemRequirements: GAPS-JAGS (==1.0.2)
13
+Depends: R (>= 2.9.0), rjags(>= 2.1.0), R.utils (>= 1.2.4)
14
+Imports: graphics, grDevices, methods, stats, utils
15
+License: GPL (== 2)
16
+URL: http://www.cancerbiostats.onc.jhmi.edu/cogaps.cfm
17
+biocViews: GeneExpression, Microarray, Bioinformatics 
18
+
0 19
new file mode 100644
... ...
@@ -0,0 +1,4 @@
1
+CoGAPS		Algorithm for coordination of activity in gene sets with patterns from GAPS
2
+GAPS		MCMC matrix decomposition algorithm
3
+calcCoGASPStat	Computes the CoGAPS gene set statistic
4
+LoadGAPSJAGSLib	Loads in the c++ libraries that perform the GAPS matrix decomposition.  Recommended to be performed before any run of CoGAPS.
0 5
new file mode 100644
... ...
@@ -0,0 +1,10 @@
1
+export(CoGAPS)
2
+export(GAPS)
3
+export(calcCoGAPSStat)
4
+export(plotGAPS)
5
+importFrom(graphics, matplot, title)
6
+importFrom(grDevices, dev.new, dev.off, pdf)
7
+importFrom(methods, is)
8
+importFrom(rjags, jags.model, load.module, jags.samples)  
9
+importFrom(stats, heatmap, runif)
10
+importFrom(utils, read.table, write.table)
0 11
new file mode 100644
... ...
@@ -0,0 +1,2 @@
1
+CoGAPS requires GAPS-JAGS available from http://www.cancerbiostats.onc.jhmi.edu/cogaps.cfm.  This c++ package is a redistribution of JAGS version 2.1.0 with a module implementing the GAPS matrix decomposition for microarray data.  GAPS-JAGS takes the place of the original JAGS package required for the successful installation and running of the rjags package. Please see the installation instructions in the users manual.  If you have any questions, please comment Elana Fertig <ejfertig@jhmi.edu> or Michael Ochs <mfo@jhu.edu>.
2
+
0 3
new file mode 100644
... ...
@@ -0,0 +1,120 @@
1
+# obtain mean and SD from output of data
2
+AtomChain2APNorm <- function(Afile, Pfile,
3
+                             nGene, nSample, nPattern,
4
+                             verbose = TRUE) {
5
+
6
+  if (verbose) {
7
+    message(sprintf("Finding the distribution of A and P matrices from output files %s and %s\n.",
8
+                  Afile, Pfile))
9
+  }
10
+
11
+  # determine the number of iterations from the file length
12
+
13
+  nLinesA <- countLines(Afile)
14
+  nLinesP <- countLines(Pfile)
15
+
16
+  nIterA <- (nLinesA)
17
+  nIterP <- (nLinesP)
18
+
19
+  if ( (nIterA != nIterP) || nIterA <= 1 || nIterP <= 1 ) {
20
+    stop(sprintf("Cannot normalize A and P matrices output at %d and %d iterations",
21
+                 nIterA, nIterP))
22
+  }
23
+
24
+  if (verbose) {
25
+    message("Computing mean of A and P matrices\n")
26
+  }
27
+  Anorm <- matrix(0, nrow = nGene,    ncol=nPattern)
28
+  Pnorm <- matrix(0, nrow = nPattern, ncol=nSample)
29
+
30
+  AConnection <- file(Afile,"r")
31
+  if (!isOpen(AConnection, "r")) {
32
+    stop(paste("Could not open",Afile,"for normalization.",sep=" "))
33
+  }
34
+
35
+  PConnection <- file(Pfile,"r")
36
+  if (!isOpen(PConnection, "r")) {
37
+    stop(paste("Could not open",Pfile,"for normalization.",sep=" "))
38
+  }
39
+  
40
+  for (i in 1:nIterA) {
41
+    AlineData <- scan(AConnection, sep="\t",nlines = 1, strip.white=T, quiet=T)
42
+    PlineData <- scan(PConnection, sep="\t", nlines = 1, strip.white=T, quiet=T)
43
+
44
+    if (length(AlineData) != nGene*nPattern) {
45
+      stop(paste("Read invalid record for A matrix from", Afile, sep=" "))
46
+    }
47
+
48
+    if (length(PlineData) != nPattern*nSample) {
49
+      stop(paste("Read invalid record for P matrix from", Pfile, sep=" "))
50
+    }
51
+
52
+    Atemp <- matrix(AlineData, nrow=nGene, ncol=nPattern)
53
+    Ptemp <- matrix(PlineData, nrow=nPattern, ncol=nSample)
54
+
55
+    normFactor <- apply(Ptemp,1,sum)
56
+    Ptempnorm <- Ptemp / matrix(rep(normFactor, nSample),
57
+                                nrow=nPattern, ncol=nSample)
58
+    Atempnorm <- Atemp * matrix(rep(normFactor, nGene),
59
+                                nrow=nGene, ncol=nPattern, byrow=T)
60
+
61
+    
62
+    Anorm <- Anorm + Atempnorm
63
+    Pnorm <- Pnorm + Ptempnorm
64
+  }
65
+  Anorm <- Anorm / nIterA
66
+  Pnorm <- Pnorm / nIterP
67
+
68
+  close(AConnection)
69
+  close(PConnection)
70
+  
71
+  if (verbose) {
72
+    message("Computing sd of A and P matrices\n")
73
+  }
74
+  Asd <- matrix(0, nrow=nGene, ncol=nPattern)
75
+  Psd <- matrix(0, nrow=nPattern, ncol=nSample)
76
+
77
+  AConnection <- file(Afile,"r")
78
+  if (!isOpen(AConnection, "r")) {
79
+    stop(paste("Could not open",Afile,"for normalization.",sep=" "))
80
+  }
81
+
82
+  PConnection <- file(Pfile,"r")
83
+  if (!isOpen(PConnection, "r")) {
84
+    stop(paste("Could not open",Pfile,"for normalization.",sep=" "))
85
+  }
86
+  
87
+  for (i in 1:nIterA) {
88
+    AlineData <- scan(AConnection, sep="\t",nlines = 1, strip.white=T, quiet=T)
89
+    PlineData <- scan(PConnection, sep="\t", nlines = 1, strip.white=T, quiet=T)
90
+
91
+    if (length(AlineData) != nGene*nPattern) {
92
+      stop(paste("Read invalid record for A matrix from", Afile, sep=" "))
93
+    }
94
+
95
+    if (length(PlineData) != nPattern*nSample) {
96
+      stop(paste("Read invalid record for P matrix from", Pfile, sep=" "))
97
+    }
98
+
99
+    Atemp <- matrix(AlineData, nrow=nGene,    ncol=nPattern)
100
+    Ptemp <- matrix(PlineData, nrow=nPattern, ncol=nSample)
101
+
102
+    normFactor <- apply(Ptemp,1,sum)
103
+    Ptempnorm <- Ptemp / matrix(rep(normFactor, nSample),
104
+                                nrow=nPattern, ncol=nSample)
105
+    Atempnorm <- Atemp * matrix(rep(normFactor, nGene),
106
+                                nrow=nGene, ncol=nPattern, byrow=T)
107
+    
108
+    Asd <- Asd + (Atempnorm - Anorm)^2
109
+    Psd <- Psd + (Ptempnorm - Pnorm)^2
110
+  }
111
+  Asd <- sqrt(Asd / (nIterA - 1))
112
+  Psd <- sqrt(Psd / (nIterP - 1))
113
+  
114
+  close(AConnection)
115
+  close(PConnection)
116
+
117
+  return(list(Amean=Anorm, Pmean=Pnorm, Asd=Asd, Psd=Psd))
118
+  
119
+  
120
+}
0 121
new file mode 100644
... ...
@@ -0,0 +1,55 @@
1
+# run the full algorithm
2
+CoGAPS <- function(data, unc, GStoGenes,
3
+                 outputDir, outputBase="",
4
+                 sep = "\t", isPercentError=FALSE,
5
+                 numPatterns,
6
+                 MaxAtomsA=2^32, alphaA=0.01,
7
+                 MaxAtomsP=2^32, alphaP=0.01,
8
+                 SAIter=1000000000, iter = 500000000, thin=-1,
9
+                 nPerm=500, verbose=TRUE, plot=FALSE, keepChain=FALSE) {
10
+
11
+
12
+  # decompose the data
13
+  matrixDecomp <- GAPS(data=data, unc=unc,
14
+                       outputDir=outputDir, outputBase=outputBase, 
15
+		       sep=sep, isPercentError=isPercentError,
16
+                       numPatterns=numPatterns, MaxAtomsA=MaxAtomsA, 
17
+		       alphaA=alphaA, MaxAtomsP=MaxAtomsP,
18
+                       alphaP=alphaP, SAIter=SAIter, iter=iter, thin=thin,
19
+                       verbose=verbose, keepChain=keepChain)
20
+
21
+  # plot patterns and show heatmap of Anorm
22
+  if (plot) {
23
+    plotGAPS(matrixDecomp$Amean, matrixDecomp$Pmean)
24
+  }
25
+
26
+  # compute the gene set scores
27
+  GSP <- calcCoGAPSStat(matrixDecomp$Amean, matrixDecomp$Asd, GStoGenes, nPerm)
28
+
29
+  # output gene set results
30
+  if (outputDir != "") {
31
+     outputGSPBase <- paste(outputDir, outputBase, sep="/")
32
+  } else {
33
+     outputGSPBase <- outputBase
34
+  }
35
+  if (outputBase != "") {
36
+    outputGSPBase <- paste(outputGSPBase, "_", sep="")
37
+  }
38
+  
39
+  UpFile <- paste(outputGSPBase, "GSUpStat.txt", sep="")
40
+  DownFile <- paste(outputGSPBase, "GSDownStat.txt", sep="")
41
+  ActEstFile <- paste(outputGSPBase, "GSActEst.txt", sep="")
42
+
43
+  write.table(GSP$GSUpreg, file=UpFile, sep=sep)
44
+  write.table(GSP$GSDownreg, file=DownFile, sep=sep)
45
+  write.table(GSP$GSActEst, file=ActEstFile, sep=sep)
46
+
47
+  return(list(meanChi2=matrixDecomp$meanChi2,
48
+              D=matrixDecomp$D, Sigma=matrixDecomp$Sigma,
49
+              Amean=matrixDecomp$Amean, Asd=matrixDecomp$Asd,
50
+              Pmean=matrixDecomp$Pmean, Psd=matrixDecomp$Psd,
51
+              meanMock=matrixDecomp$meanMock,
52
+              GSUpreg=GSP$GSUpreg, GSDownreg=GSP$GSDownreg, GSActEst=GSP$GSActEst))
53
+  
54
+  
55
+}
0 56
new file mode 100644
... ...
@@ -0,0 +1,287 @@
1
+# driver for the matrix decomposition
2
+GAPS <- function(data, unc,
3
+                 outputDir, outputBase="",
4
+                 sep = "\t", isPercentError=FALSE,
5
+                 numPatterns,
6
+                 MaxAtomsA=2^32, alphaA=0.01,
7
+                 MaxAtomsP=2^32, alphaP=0.01,
8
+                 SAIter=1000000000, iter = 500000000, thin=-1,
9
+                 verbose=TRUE, keepChain = FALSE) {
10
+  
11
+  # formatting data for use by algorithm
12
+  if (verbose) {
13
+    message("Formatting data for GAPS");
14
+  }
15
+
16
+  # process data
17
+  dataInFile = tolower(class(data))=="character"
18
+
19
+  # read in data from a file
20
+  if (dataInFile) {
21
+
22
+    if (verbose) {
23
+      message(paste("Reading in data from", data, sep=" "))
24
+    }
25
+
26
+    D <- as.matrix(read.table(file = data, sep = sep,
27
+                              row.names=1, header=TRUE))
28
+    
29
+  } else {
30
+    # data is input in matrix / array form
31
+    D <- as.matrix(data)
32
+  }
33
+
34
+  if (numPatterns >= nrow(D) || numPatterns >= ncol(D)) {
35
+    stop(sprintf("Cannot decompose data into more patterns (%d) than rows (%d) or columns (%d).", numPatterns, nrow(D), ncol(D)))
36
+  }
37
+  
38
+  # process uncertainty
39
+  sdInFile = tolower(class(unc))=="character"
40
+  if (sdInFile) {
41
+
42
+    # uncertainty is specified in a file
43
+    if (isPercentError) {
44
+      stop(paste("Uncertainty specified in file", unc, 
45
+                  "must be specified as additive error."))
46
+    }
47
+
48
+    if (verbose) {
49
+       message(paste("Reading in uncertainty from", unc, sep=" "))
50
+    }
51
+    
52
+    Sigma <- as.matrix(read.table(file = unc, sep = sep,
53
+                                  row.names=1,header=TRUE))
54
+    
55
+  } else {
56
+
57
+    
58
+    nrunc <- nrow(unc)
59
+    ncunc <- ncol(unc)
60
+    if (length(unc)==1) {
61
+      nrunc <- 1
62
+      ncunc <- 1
63
+    }
64
+    
65
+    # see if input uncertainty is a scalar
66
+    if ( nrunc == 1 && ncunc == 1 ) {
67
+
68
+      if (verbose) {
69
+        message("Using single unc value specified in input.")
70
+      }
71
+
72
+      # compute the error as either fractional or additive
73
+      if (isPercentError) {
74
+        Sigma <- pmax(unc*abs(D),unc)
75
+      } else {
76
+        Sigma <- matrix(data=unc, nrow=nrow(D), ncol=ncol(D))
77
+      }
78
+
79
+      # check that the uncertainty is positive
80
+      if (any(Sigma <= 0)) {
81
+        stop(paste("Uncertainty must be positive (Sigma = ", Sigma, ").", sep=""))
82
+      }
83
+    } else {
84
+
85
+      if (verbose) {
86
+        message("Using matrix unc value specified in input.")
87
+      }
88
+
89
+      # check that dimensions of uncertainty and data match
90
+      if (nrunc != nrow(D) || ncunc != ncol(D)) {
91
+        stop(paste("Dimensions of data (", nrow(D), ", ", ncol(D),
92
+                    ") and uncertainty (", nrow(Sigma), ", ", ncol(Sigma), ") do not agree",
93
+                    sep = ""))
94
+      }
95
+      # check that the uncertainty is positive
96
+      if (any(unc <= 0)) {
97
+        stop(paste("Uncertainty must be positive (unc = ", unc, ").", sep=""))
98
+      }
99
+
100
+      # ensure that matrix errors additive                                                        
101
+      if (isPercentError) {
102
+        stop("Uncertainty specified in an array must be additive error.")
103
+      }
104
+      Sigma <- as.matrix(unc)
105
+
106
+    }
107
+
108
+  }
109
+
110
+  # make a temporary file to outputBase which contains the row and column names
111
+  if (outputDir != "" && outputDir != ".") {
112
+     fullOutputPath <- paste(outputDir,outputBase,sep="/")
113
+     if (!file.exists(outputDir)) {
114
+        dir.create(outputDir)
115
+     }
116
+  } else {
117
+     fullOutputPath <- outputBase
118
+  }
119
+  
120
+
121
+  if (verbose) {
122
+    message(paste("Decomposing data of size:",
123
+                nrow(D), "by", ncol(D),
124
+                "into", numPatterns, "patterns.", sep=" "))
125
+  } 
126
+  
127
+  # get expected mass of each matrix element based on data
128
+  if (verbose) {
129
+    message("Initializing distribution parameters")
130
+  }
131
+
132
+  # computing the mean mass of atoms in A and P based on the data matrix
133
+  lambdaA <- alphaA * sqrt(numPatterns) / sqrt(mean(D)) 
134
+  lambdaP <- alphaP * sqrt(numPatterns) / sqrt(mean(D))
135
+
136
+  # estimate number of outputs based on expected number of matrix elements
137
+  if (thin <= 0) {
138
+    if (iter < 10000) {
139
+      outThin <- 1
140
+    } else {
141
+      outThin <- iter/10000
142
+    }
143
+  } else {
144
+    outThin <- thin
145
+  }
146
+
147
+  # assign the unique file identifier to be the number after the decimal
148
+  if (floor(outThin) - outThin < 1.e-10) {
149
+    outThin <- outThin + runif(1)
150
+  }
151
+
152
+
153
+  # initialize output arrays, including chain dimensions
154
+  Amean   <- array(0., c(nrow(D), numPatterns))
155
+  row.names(Amean) <- row.names(D)
156
+  colnames(Amean) <- paste("p",1:numPatterns,sep="")
157
+  Asd     <- array(0., c(nrow(D), numPatterns))
158
+  row.names(Asd) <- row.names(D)
159
+  colnames(Amean) <- paste("p",1:numPatterns,sep="")
160
+  Pmean   <- array(0., c(numPatterns, ncol(D)))
161
+  row.names(Pmean) <-   paste("p",1:numPatterns,sep="")
162
+  colnames(Pmean) <- colnames(D)
163
+  Psd     <- array(0., c(numPatterns, ncol(D)))
164
+  row.names(Psd) <- paste("p",1:numPatterns,sep="")
165
+  colnames(Psd) <- colnames(D)
166
+
167
+  meanChi2 <- 0. 
168
+  meanMock <- array(0., c(nrow(D),ncol(D)))
169
+
170
+  
171
+  if (verbose) {
172
+    message(paste("Running GAPS."))
173
+    message(paste(SAIter, "burn-in steps;", iter, "steps.", sep=" "))
174
+  }
175
+    
176
+  # run the mcmc
177
+  mdout <- RunGAPS(D, Sigma, numPatterns,
178
+                   MaxAtomsA, alphaA, lambdaA,
179
+                   MaxAtomsP, alphaP, lambdaP,
180
+                   SAIter, iter, outThin)
181
+
182
+  if (verbose) {
183
+    message(paste("Normalizing results."))
184
+  }
185
+
186
+  # obtain the mean and standard deviation of A and P from the chain files
187
+  FID <- sprintf('%.10f', outThin - floor(outThin))
188
+  AFile <- paste("AResults", FID, ".ChainValues.txt", sep="")
189
+  PFile <- paste("PResults", FID, ".ChainValues.txt", sep="")
190
+  normmd <- AtomChain2APNorm(Afile      = AFile,
191
+                             Pfile      = PFile,
192
+                             nGene      = nrow(D),
193
+                             nPattern   = numPatterns,
194
+                             nSample    = ncol(D))
195
+    
196
+  # extract chain data and store in above arrays
197
+  Amean[,] <- normmd$Amean
198
+  Asd[,] <- normmd$Asd
199
+  Pmean[,] <- normmd$Pmean
200
+  Psd[,] <- normmd$Psd
201
+
202
+  write.table(Amean[,],
203
+              file=paste(fullOutputPath,"Amean.",FID,".txt",sep=""),
204
+              sep="\t")
205
+  write.table(Asd[,],
206
+              file=paste(fullOutputPath,"Asd.",FID,".txt",sep=""),
207
+              sep="\t")
208
+  write.table(Pmean[,],
209
+              file=paste(fullOutputPath,"Pmean.",FID,".txt",sep=""),
210
+              sep="\t")
211
+  write.table(Psd[,],
212
+              file=paste(fullOutputPath,"Psd.",FID,".txt",sep=""),
213
+              sep="\t")
214
+    
215
+    
216
+  if (verbose) {
217
+    message(paste("Computing chi2")) 
218
+  }
219
+    
220
+  atmp <- as.matrix(Amean[,])
221
+  ptmp <- as.matrix(Pmean[,])
222
+  mtmp <- atmp %*% ptmp
223
+  meanMock[,] = array(mtmp, c(nrow(D), ncol(D), 1))
224
+  meanChi2 <- sum((D-meanMock[,])^2 / Sigma^2)
225
+
226
+  # process the diagnostic file and add appropriate means
227
+  ADiagFile <- paste("AResults",FID,".Diagnostics.txt",sep="")
228
+  PDiagFile <- paste("PResults",FID,".Diagnostics.txt",sep="")
229
+
230
+  ADiag <- read.table(ADiagFile, sep="\t", header=T, skip=10)
231
+  PDiag <- read.table(PDiagFile, sep="\t", header=T, skip=10)
232
+
233
+  chainKeepA  <- which(ADiag[,2]==0)
234
+  meanOfChi2A <- mean(ADiag[chainKeepA,4])
235
+  meanAtomsA  <- mean(ADiag[chainKeepA,5])
236
+
237
+  chi2DiagA <- paste("<number of atoms>", meanAtomsA,
238
+                     "<chi^2>:", meanOfChi2A,
239
+                     "; chi^2 of mean:", meanChi2, sep=" ")
240
+  cat("\n\nSummary\n", file=ADiagFile, append=TRUE)
241
+  cat(chi2DiagA, file=ADiagFile, append=TRUE)
242
+  cat("\n", file=ADiagFile, append=TRUE)
243
+    
244
+  chainKeepP  <- which(PDiag[,2]==0)
245
+  meanOfChi2P <- mean(PDiag[chainKeepP,4])
246
+  meanAtomsP  <- mean(PDiag[chainKeepP,5])
247
+
248
+  chi2DiagP <- paste("<number of atoms>", meanAtomsP,
249
+                     "<chi^2>:", meanOfChi2P,
250
+                     "; chi^2 of mean:", meanChi2, sep=" ")
251
+  cat("\n\nSummary\n", file=PDiagFile, append=TRUE)
252
+  cat(chi2DiagP, file=PDiagFile, append=TRUE)
253
+  cat("\n", file=PDiagFile, append=TRUE)
254
+    
255
+  if (fullOutputPath != "") {
256
+     file.rename(ADiagFile, paste(fullOutputPath, basename(ADiagFile), sep=""))
257
+     file.rename(PDiagFile, paste(fullOutputPath, basename(PDiagFile), sep=""))
258
+  }
259
+
260
+  # delete chain files or move to appropriate directory
261
+  if (!keepChain) {
262
+    message("Deleting chain files")
263
+    file.remove(AFile)
264
+    file.remove(PFile)
265
+  } else {
266
+    message(paste('Moving chain files to ', outputDir, sep=' '))
267
+    if (fullOutputPath != "") {
268
+       file.rename(AFile, paste(fullOutputPath, basename(AFile),sep=""))
269
+       file.rename(PFile, paste(fullOutputPath, basename(PFile),sep=""))
270
+    }
271
+  }
272
+  
273
+
274
+  if (verbose) {
275
+    message("Completed simulation.\n")
276
+    message("Return list contains chi^2 of solution and normalized, mean and standard deviation of A and P matrices.\n")
277
+  }
278
+
279
+  # output list of data for each chain
280
+  return(list(meanChi2=meanChi2,
281
+              D=D, Sigma=Sigma,
282
+              Amean=Amean, Asd=Asd,
283
+              Pmean=Pmean, Psd=Psd,
284
+              meanMock=meanMock))
285
+
286
+}
287
+                          
0 288
new file mode 100644
... ...
@@ -0,0 +1,44 @@
1
+# formats raw table data into a format required for MCMC with JAGS
2
+GetDataListGAPS <- function(data, unc, numPatterns,
3
+                            MaxAtomsA, alphaA, lambdaA, 
4
+                            MaxAtomsP, alphaP, lambdaP, SAIter, thin) {
5
+
6
+  # convert to matrix for output
7
+  D <- as.matrix(data)
8
+
9
+  # process the uncertainty
10
+  S <- as.matrix(unc)
11
+
12
+  # check the dimensionality
13
+  if (nrow(S) != nrow(D) || ncol(S) != ncol(D)) {
14
+    stop(cat("Dimensions of data (",
15
+             nrow(D), ", ", ncol(D),
16
+             ") must match dimensions of uncertainty (",
17
+             nrow(S), ", ", ncol(S), ").\n"))
18
+  }
19
+
20
+  # check that D and S are meet positivity requirements positive
21
+  if (any(D < 0)) {
22
+    warning("Data should be non-negative.\n");
23
+  }
24
+
25
+  if (any(S <= 0)) {
26
+    stop("Uncertainty must be strictly positive.\n");
27
+  }
28
+    
29
+
30
+  datalist = list(
31
+    numGenes = nrow(data), numSamples = ncol(data),
32
+    numPatterns = numPatterns,
33
+    D = D, S = S,
34
+    ADim = matrix(data=1,nrow=nrow(data), ncol=numPatterns),
35
+    MaxAtomsA = MaxAtomsA, alphaA = alphaA, lambdaA = lambdaA,
36
+    PDim = matrix(data=1,nrow=numPatterns, ncol=ncol(data)),
37
+    MaxAtomsP = MaxAtomsP, alphaP = alphaP, lambdaP = lambdaP,
38
+    SAIterA = SAIter, SAIterP = SAIter, fileID = thin)
39
+
40
+  return(datalist)
41
+  
42
+
43
+  
44
+}
0 45
new file mode 100644
... ...
@@ -0,0 +1,55 @@
1
+# runs matrix decomposition with MCMC using rjags
2
+RunGAPS <- function(data, unc, numPatterns,
3
+                    MaxAtomsA, alphaA, lambdaA, 
4
+                    MaxAtomsP, alphaP, lambdaP, 
5
+                    SAIter, iter, thin=1) {
6
+
7
+
8
+  # load in the matrix decomposition module
9
+  load.module(c("gaps"))
10
+
11
+  # get the data needed for MCMC
12
+  matrixDecompData <- GetDataListGAPS(data, unc, numPatterns,
13
+                                      MaxAtomsA, alphaA, lambdaA,
14
+                                      MaxAtomsP, alphaP, lambdaP,
15
+                                      SAIter, thin)
16
+  matrixDecompInits <- list(A = matrix(
17
+                              data=0, matrixDecompData$numGenes, numPatterns),
18
+                            P = matrix(
19
+                              data=0, numPatterns, matrixDecompData$numSamples))
20
+
21
+  # create the model file used by this decomposition
22
+  FID <- sprintf('%.10f', thin - floor(thin))
23
+  ModelFile <- paste("GAPS", FID, "bug", sep=".")
24
+
25
+  cat("var\n", file=ModelFile)
26
+  cat("\tA[numGenes, numPatterns], ADim[numGenes, numPatterns],\n",
27
+      file=ModelFile, append=TRUE)
28
+  cat("\tP[numPatterns, numSamples], PDim[numPatterns, numSamples],\n",
29
+      file=ModelFile, append=TRUE)
30
+  cat("\tD[numGenes, numSamples], S[numGenes, numSamples];\n",
31
+      file=ModelFile, append=TRUE)
32
+  cat("\nmodel {\n", file=ModelFile, append=TRUE)
33
+  cat("A[,] ~ datomicgibbs(ADim, MaxAtomsA, alphaA, lambdaA, SAIterA, fileID)\n", file=ModelFile, append=TRUE)
34
+  cat("P[,] ~ datomicgibbs(PDim, MaxAtomsP, alphaP, lambdaP, SAIterP, fileID)\n", file=ModelFile, append=TRUE)
35
+  cat("D[,] ~ gapsnorm(A,P,S)\n", file=ModelFile, append=TRUE)
36
+  cat("}\n", file=ModelFile, append=TRUE) 
37
+
38
+  # set the model being used
39
+  matrixDecompModel <- jags.model(file=ModelFile,
40
+                                  data=matrixDecompData,
41
+                                  inits=matrixDecompInits,
42
+                                  n.adapt=SAIter)
43
+  # update and specify which nodes are monitored
44
+  matrixDecompOutput <- jags.samples(model=matrixDecompModel,
45
+                                     variable.names=c('A','P'),  
46
+                                     n.iter=iter,
47
+                                     thin=iter)
48
+
49
+  file.remove(ModelFile)
50
+  
51
+  return(matrixDecompOutput)
52
+  
53
+
54
+}
55
+
0 56
new file mode 100644
... ...
@@ -0,0 +1,99 @@
1
+calcCoGAPSStat <- function (Amean, Asd, GStoGenes, numPerm=500) {
2
+
3
+  # calculate Z scores
4
+  zMatrix <- calcZ(Amean,Asd)
5
+  
6
+  # compute the p-value for each gene set belonging to each pattern
7
+
8
+  # check input arguments
9
+  if (!is(GStoGenes, "data.frame") && !is(GStoGenes, "list")) {
10
+    stop("GStoGenes must be a data.frame or list with format specified in the users manual.")
11
+  }
12
+	
13
+  if (is(GStoGenes, "list")) {
14
+    GStoGenesList <- GStoGenes
15
+  } else {
16
+    GStoGenesList <- list()
17
+    for (i in 1:dim(GStoGenes)[2]) {
18
+      GStoGenesList[[as.character(colnames(GStoGenes)[i])]] <- as.character(unique(GStoGenes[,i]))
19
+    }
20
+  }
21
+  
22
+  # get dimensions
23
+  numGS   <- length(names(GStoGenesList))
24
+  numPatt <- dim(zMatrix)[2]
25
+  numG    <- dim(zMatrix)[1]+0.9999
26
+
27
+  # initialize matrices
28
+  statsUp   <- matrix(ncol = numGS, nrow = numPatt)
29
+  statsDown <- matrix(ncol = numGS, nrow = numPatt)
30
+  actEst    <- matrix(ncol = numGS, nrow = numPatt)
31
+  results   <- new.env()
32
+  zPerm     <- matrix(ncol=numPerm,nrow=numPatt)
33
+
34
+  # do permutation test
35
+  for (gs in 1:numGS) {
36
+    genes <- GStoGenesList[[names(GStoGenesList)[gs]]]
37
+    index <- rownames(zMatrix) %in% genes
38
+    zValues <- zMatrix[index,1]
39
+    numGenes <- length(zValues)
40
+    label <- as.character(numGenes)
41
+    
42
+    if (!exists(label,env=results)) {
43
+      for (p in 1:numPatt) {
44
+        for (j in 1:numPerm) {
45
+          temp <- floor(runif(numGenes,1,numG))
46
+          temp2 <- zMatrix[temp,p]
47
+          zPerm[p,j] <- mean(temp2)
48
+        }
49
+      }
50
+      assign(label,zPerm,env=results)
51
+    }
52
+  }
53
+
54
+  # get p-value
55
+  for (p in 1:numPatt) {
56
+
57
+    for (gs in 1:numGS) {
58
+
59
+      genes <- GStoGenesList[[names(GStoGenesList)[gs]]]
60
+      index <- rownames(zMatrix) %in% genes
61
+      zValues <- zMatrix[index,p]
62
+      zScore <- mean(zValues)
63
+
64
+      numGenes <- length(zValues)
65
+      label <- as.character(numGenes)
66
+
67
+      permzValues <- get(label, env=results)[p,]
68
+      ordering <- order(permzValues)
69
+      permzValues <- permzValues[ordering]
70
+
71
+      statistic <- sum(zScore > permzValues)
72
+      statUpReg <- 1 - statistic/length(permzValues) 
73
+      statsUp[p,gs] <- max(statUpReg, 1/numPerm)
74
+
75
+      statistic <- sum(zScore < permzValues)
76
+      statDownReg <- 1 - statistic/length(permzValues) 
77
+      statsDown[p,gs] <- max(statDownReg,1/numPerm)
78
+
79
+      activity <- 1 - 2*max(statUpReg, 1/numPerm)
80
+      actEst[p,gs] <- activity
81
+    }
82
+
83
+  }
84
+
85
+  # format output
86
+  colnames(statsUp) <- names(GStoGenesList)
87
+  colnames(statsDown) <- names(GStoGenesList)
88
+  colnames(actEst) <- names(GStoGenesList)
89
+
90
+  rownames(statsUp) <- colnames(zMatrix)
91
+  rownames(statsDown) <- colnames(zMatrix)
92
+  rownames(actEst) <- colnames(zMatrix)
93
+
94
+  assign('GSUpreg', statsUp, env=results)
95
+  assign('GSDownreg', statsDown, env=results)
96
+  assign('GSActEst', actEst, env=results)
97
+  
98
+  return(results)
99
+}
0 100
new file mode 100644
... ...
@@ -0,0 +1,25 @@
1
+calcZ <- function (meanMat, sdMat) {
2
+
3
+  # compute the z score for each gene's association to each pattern
4
+
5
+  # find matrix dimensions
6
+  nrows <- dim(meanMat)[1]
7
+  ncols <- dim(meanMat)[2]
8
+
9
+  check <- dim(sdMat)[1]
10
+  if (nrows != check) {
11
+    stop("Number of rows in the mean and standard deviation of A do not agree.")
12
+  }
13
+
14
+  check <- dim(sdMat)[2]
15
+  if (ncols != check) {
16
+    stop("Number of columns in the mean and standard deviation of A do not agree.")
17
+  }
18
+
19
+  # compute the matrix of z scores
20
+  zMat <- meanMat/sdMat
21
+  rownames(zMat) <- rownames(meanMat)
22
+  colnames(zMat) <- colnames(meanMat)
23
+
24
+  return(zMat)
25
+}
0 26
new file mode 100644
... ...
@@ -0,0 +1,26 @@
1
+# plot the decomposed A and P matrices
2
+plotGAPS <- function(A, P, outputPDF="") {
3
+	if (outputPDF != "") {
4
+	  pdf(file=paste(outputPDF,"_Patterns",".pdf",sep=""))
5
+	} else {
6
+	  dev.new()
7
+	}
8
+
9
+	arrayIdx <- 1:ncol(P)
10
+	matplot(arrayIdx, t(P), type='l', lwd=10, xlim=c(1,ncol(P)), ylim=c(0,1))
11
+	title(main='Inferred patterns')
12
+  
13
+	if (outputPDF == "") {
14
+	  dev.new()
15
+	} else {
16
+	  dev.off()
17
+	  pdf(file=paste(outputPDF,"_Amplitude",".pdf",sep=""))
18
+	}
19
+
20
+	heatmap(A, Rowv=NA, Colv=NA) 
21
+	
22
+	if (outputPDF != "") {
23
+		dev.off()
24
+	}
25
+  
26
+}
0 27
new file mode 100644
1 28
Binary files /dev/null and b/data/EasySimGS.RData differ
2 29
new file mode 100644
3 30
Binary files /dev/null and b/data/GIST_TS_20084.RData differ
4 31
new file mode 100644
5 32
Binary files /dev/null and b/data/ModSim.RData differ
6 33
new file mode 100644
7 34
Binary files /dev/null and b/data/TFGSList.RData differ
8 35
new file mode 100644
... ...
@@ -0,0 +1,16 @@
1
+citHeader("To cite package 'CoGAPS' in publications use:")
2
+
3
+desc <- packageDescription("CoGAPS")
4
+year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", desc$Date)
5
+vers <- paste("R package version", desc$Version)
6
+
7
+citEntry(entry="article",
8
+         title = "{CoGAPS: an integrated R/C++ package to identify overlapping patterns of activation of biological processes from expression data}",
9
+         author = personList(as.person("Elana J. Fertig"),
10
+           as.person("Jie Ding"), as.person("Alexander V. Favorov"),
11
+           as.person("Giovanni Parmigiani"), as.person("Michael F. Ochs")),
12
+         year = 2010,
13
+         journal = "Bioinformatics",
14
+         note = "Manuscript Submitted",
15
+         textVersion = "E.J. Fertig, J. Ding, A.V. Favorov, G. Parmigiani, and M.F. Ochs (2010) CoGAPS: an integrated R/C++ package to identify overlapping patterns of activation of biological processes from expression data.  Manuscript Submitted to Bioinformatics.")
16
+           
0 17
new file mode 100644
... ...
@@ -0,0 +1,529 @@
1
+%% This BibTeX bibliography file was created using BibDesk.
2
+%% http://bibdesk.sourceforge.net/
3
+
4
+
5
+%% Created for Elana Fertig at 2010-02-04 11:48:54 -0500 
6
+
7
+
8
+%% Saved with string encoding Unicode (UTF-8) 
9
+
10
+
11
+
12
+@article{Tavazoie1999,
13
+	Abstract = {Technologies to measure whole-genome mRNA abundances and methods to organize and display such data are emerging as valuable tools for systems-level exploration of transcriptional regulatory networks. For instance, it has been shown that mRNA data from 118 genes, measured at several time points in the developing hindbrain of mice, can be hierarchically clustered into various patterns (or 'waves') whose members tend to participate in common processes. We have previously shown that hierarchical clustering can group together genes whose cis-regulatory elements are bound by the same proteins in vivo. Hierarchical clustering has also been used to organize genes into hierarchical dendograms on the basis of their expression across multiple growth conditions. The application of Fourier analysis to synchronized yeast mRNA expression data has identified cell-cycle periodic genes, many of which have expected cis-regulatory elements. Here we apply a systematic set of statistical algorithms, based on whole-genome mRNA data, partitional clustering and motif discovery, to identify transcriptional regulatory sub-networks in yeast-without any a priori knowledge of their structure or any assumptions about their dynamics. This approach uncovered new regulons (sets of co-regulated genes) and their putative cis-regulatory elements. We used statistical characterization of known regulons and motifs to derive criteria by which we infer the biological significance of newly discovered regulons and motifs. Our approach holds promise for the rapid elucidation of genetic network architecture in sequenced organisms in which little biology is known.},
14
+	Address = {Department of Genetics, Harvard Medical School, Boston, Massachusetts 02115, USA.},
15
+	Au = {Tavazoie, S and Hughes, JD and Campbell, MJ and Cho, RJ and Church, GM},
16
+	Author = {Tavazoie, S. and Hughes, J.D. and Campbell, M.J. and Cho, R.J. and Church, G.M.},
17
+	Cin = {Nat Genet. 1999 Jul;22(3):213-5. PMID: 10391202},
18
+	Crdt = {1999/07/03 10:00},
19
+	Da = {19990719},
20
+	Date-Added = {2010-02-04 11:43:20 -0500},
21
+	Date-Modified = {2010-02-04 11:48:43 -0500},
22
+	Dcom = {19990719},
23
+	Doi = {10.1038/10343},
24
+	Edat = {1999/07/03 10:00},
25
+	Issn = {1061-4036 (Print); 1061-4036 (Linking)},
26
+	Jid = {9216904},
27
+	Journal = {Nat Genet},
28
+	Jt = {Nature genetics},
29
+	Language = {eng},
30
+	Lr = {20061115},
31
+	Mh = {Animals; Cell Cycle/genetics; DNA/genetics; Gene Expression; *Genetic Techniques; Mice; Multigene Family; Open Reading Frames; RNA, Messenger/genetics/metabolism; Rhombencephalon/growth \& development/metabolism; Saccharomyces cerevisiae/cytology/genetics/metabolism},
32
+	Mhda = {2001/03/23 10:01},
33
+	Number = {3},
34
+	Own = {NLM},
35
+	Pages = {281--285},
36
+	Pl = {UNITED STATES},
37
+	Pmid = {10391217},
38
+	Pst = {ppublish},
39
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.},
40
+	Rn = {0 (RNA, Messenger); 9007-49-2 (DNA)},
41
+	Sb = {IM},
42
+	So = {Nat Genet. 1999 Jul;22(3):281-5. },
43
+	Stat = {MEDLINE},
44
+	Title = {Systematic determination of genetic network architecture.},
45
+	Volume = {22},
46
+	Year = {1999},
47
+	Bdsk-Url-1 = {http://dx.doi.org/10.1038/10343}}
48
+
49
+@article{Goeman2007,
50
+	Abstract = {MOTIVATION: Many statistical tests have been proposed in recent years for analyzing gene expression data in terms of gene sets, usually from Gene Ontology. These methods are based on widely different methodological assumptions. Some approaches test differential expression of each gene set against differential expression of the rest of the genes, whereas others test each gene set on its own. Also, some methods are based on a model in which the genes are the sampling units, whereas others treat the subjects as the sampling units. This article aims to clarify the assumptions behind different approaches and to indicate a preferential methodology of gene set testing. RESULTS: We identify some crucial assumptions which are needed by the majority of methods. P-values derived from methods that use a model which takes the genes as the sampling unit are easily misinterpreted, as they are based on a statistical model that does not resemble the biological experiment actually performed. Furthermore, because these models are based on a crucial and unrealistic independence assumption between genes, the P-values derived from such methods can be wildly anti-conservative, as a simulation experiment shows. We also argue that methods that competitively test each gene set against the rest of the genes create an unnecessary rift between single gene testing and gene set testing.},
51
+	Address = {Department of Medical Statistics and Bioinformatics, Leiden University Medical Center, Leiden, The Netherlands. j.j.goeman@lumc.nl},
52
+	Au = {Goeman, JJ and Buhlmann, P},
53
+	Author = {Goeman, J.J. and Buhlmann, P.},
54
+	Crdt = {2007/02/17 09:00},
55
+	Da = {20070501},
56
+	Date-Added = {2010-02-04 11:42:46 -0500},
57
+	Date-Modified = {2010-02-04 11:48:51 -0500},
58
+	Dcom = {20070522},
59
+	Dep = {20070215},
60
+	Doi = {10.1093/bioinformatics/btm051},
61
+	Edat = {2007/02/17 09:00},
62
+	Issn = {1367-4811 (Electronic); 1367-4803 (Linking)},
63
+	Jid = {9808944},
64
+	Journal = {Bioinformatics},
65
+	Jt = {Bioinformatics (Oxford, England)},
66
+	Keywords = {gene sets},
67
+	Language = {eng},
68
+	Lr = {20091104},
69
+	Mh = {*Algorithms; *Artifacts; *Data Interpretation, Statistical; *Databases, Genetic; Gene Expression Profiling/*methods; Information Storage and Retrieval/*methods; Reproducibility of Results; Sensitivity and Specificity},
70
+	Mhda = {2007/05/23 09:00},
71
+	Number = {8},
72
+	Own = {NLM},
73
+	Pages = {980--987},
74
+	Phst = {2007/02/15 {$[$}aheadofprint{$]$}},
75
+	Pii = {btm051},
76
+	Pl = {England},
77
+	Pmid = {17303618},
78
+	Pst = {ppublish},
79
+	Pt = {Comparative Study; Evaluation Studies; Journal Article},
80
+	Sb = {IM},
81
+	So = {Bioinformatics. 2007 Apr 15;23(8):980-7. Epub 2007 Feb 15. },
82
+	Stat = {MEDLINE},
83
+	Title = {Analyzing gene expression data in terms of gene sets: methodological issues.},
84
+	Volume = {23},
85
+	Year = {2007},
86
+	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btm051}}
87
+
88
+@article{Bidaut2004,
89
+	Abstract = {ClutrFree facilitates the visualization and interpretation of clusters or patterns computed from microarray data through a graphical user interface that displays patterns, membership information of the genes and annotation statistics simultaneously. ClutrFree creates a tree linking the patterns based on similarity, permitting the navigation among patterns identified by different algorithms or by the same algorithm with different parameters, and aids the inferring of conclusions from a microarray experiment. AVAILABILITY: The ClutrFree Java source code and compiled bytecode are available as a package under the GNU General Public License at http://bioinformatics.fccc.edu},
90
+	Address = {Division of Population Science, Fox Chase Cancer Center, 333 Cottman Avenue, Philadelphia, PA 19111, USA.},
91
+	Au = {Bidaut, G and Ochs, MF},
92
+	Author = {Bidaut, G. and Ochs, M.F.},
93
+	Crdt = {2004/05/18 05:00},
94
+	Da = {20041101},
95
+	Date-Added = {2010-02-02 12:36:53 -0500},
96
+	Date-Modified = {2010-02-02 16:00:15 -0500},
97
+	Dcom = {20050210},
98
+	Dep = {20040514},
99
+	Doi = {10.1093/bioinformatics/bth307},
100
+	Edat = {2004/05/18 05:00},
101
+	Gr = {CA06927/CA/NCI NIH HHS/United States},
102
+	Issn = {1367-4803 (Print); 1367-4803 (Linking)},
103
+	Jid = {9808944},
104
+	Journal = {Bioinformatics},
105
+	Jt = {Bioinformatics (Oxford, England)},
106
+	Language = {eng},
107
+	Lr = {20071114},
108
+	Mh = {*Algorithms; *Cluster Analysis; Computer Graphics; Oligonucleotide Array Sequence Analysis/*methods; Sequence Alignment/*methods; Sequence Analysis, DNA/*methods; *Software; *User-Computer Interface},
109
+	Mhda = {2005/02/11 09:00},
110
+	Number = {16},
111
+	Own = {NLM},
112
+	Pages = {2869--2871},
113
+	Phst = {2004/05/14 {$[$}aheadofprint{$]$}},
114
+	Pii = {bth307},
115
+	Pl = {England},
116
+	Pmid = {15145813},
117
+	Pst = {ppublish},
118
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, P.H.S.},
119
+	Sb = {IM},
120
+	So = {Bioinformatics. 2004 Nov 1;20(16):2869-71. Epub 2004 May 14. },
121
+	Stat = {MEDLINE},
122
+	Title = {ClutrFree: cluster tree visualization and interpretation.},
123
+	Volume = {20},
124
+	Year = {2004},
125
+	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/bth307}}
126
+
127
+@article{Carvalho2008,
128
+	Author = {Carvalho, C.M. and Chang, J. and Lucas, J. and Nevins, J.R. and Wang, Q. and West, M. },
129
+	Date-Added = {2010-02-02 12:36:36 -0500},
130
+	Date-Modified = {2010-02-02 12:36:36 -0500},
131
+	Journal = {J. Am. Stat. Assoc. },
132
+	Pages = {1438 - 1456},
133
+	Title = {High-dimensional sparse factor modelling: Applications in gene  expression genomics},
134
+	Volume = {103},
135
+	Year = {2008}}
136
+
137
+@article{Lee1999,
138
+	Abstract = {Is perception of the whole based on perception of its parts? There is psychological and physiological evidence for parts-based representations in the brain, and certain computational theories of object recognition rely on such representations. But little is known about how brains or computers might learn the parts of objects. Here we demonstrate an algorithm for non-negative matrix factorization that is able to learn parts of faces and semantic features of text. This is in contrast to other methods, such as principal components analysis and vector quantization, that learn holistic, not parts-based, representations. Non-negative matrix factorization is distinguished from the other methods by its use of non-negativity constraints. These constraints lead to a parts-based representation because they allow only additive, not subtractive, combinations. When non-negative matrix factorization is implemented as a neural network, parts-based representations emerge by virtue of two properties: the firing rates of neurons are never negative and synaptic strengths do not change sign.},
139
+	Address = {Bell Laboratories, Lucent Technologies, Murray Hill, New Jersey 07974, USA.},
140
+	Author = {Lee, D.D. and Seung, H.S.},
141
+	Cin = {Nature. 1999 Oct 21;401(6755):759. PMID: 10548097; Nature. 1999 Oct 21;401(6755):759-60. PMID: 10548098},
142
+	Crdt = {1999/11/05 08:00},
143
+	Da = {19991116},
144
+	Date = {1999 Oct 21},
145
+	Date-Added = {2010-02-02 12:35:49 -0500},
146
+	Date-Modified = {2010-02-02 15:57:17 -0500},
147
+	Dcom = {19991116},
148
+	Doi = {10.1038/44565},
149
+	Edat = {1999/11/05 08:00},
150
+	Issn = {0028-0836 (Print)},
151
+	Jid = {0410462},
152
+	Journal = {Nature},
153
+	Jt = {Nature},
154
+	Language = {eng},
155
+	Lr = {20061115},
156
+	Mh = {*Algorithms; Face; Humans; *Learning; Models, Neurological; Perception/physiology; Semantics},
157
+	Mhda = {2001/03/23 10:01},
158
+	Number = {6755},
159
+	Own = {NLM},
160
+	Pages = {788--791},
161
+	Pl = {ENGLAND},
162
+	Pmid = {10548103},
163
+	Pst = {ppublish},
164
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't},
165
+	Sb = {IM},
166
+	Status = {MEDLINE},
167
+	Title = {Learning the parts of objects by non-negative matrix factorization.},
168
+	Volume = {401},
169
+	Year = {1999},
170
+	Bdsk-Url-1 = {http://dx.doi.org/10.1038/44565}}
171
+
172
+@article{Kossenkov2009,
173
+	Abstract = {We explore a number of matrix factorization methods in terms of their ability to identify signatures of biological processes in a large gene expression study. We focus on the ability of these methods to find signatures in terms of gene ontology enhancement and on the interpretation of these signatures in the samples. Two Bayesian approaches, Bayesian Decomposition (BD) and Bayesian Factor Regression Modeling (BFRM), perform best. Differences in the strength of the signatures between the samples suggest that BD will be most useful for systems modeling and BFRM for biomarker discovery.},
174
+	Address = {The Wistar Institute, Philadelphia, Pennsylvania, USA.},
175
+	Au = {Kossenkov, AV and Ochs, MF},
176
+	Author = {Kossenkov, A.V. and Ochs, M.F.},
177
+	Crdt = {2009/11/10 06:00},
178
+	Da = {20091109},
179
+	Date-Added = {2010-02-02 12:33:48 -0500},
180
+	Date-Modified = {2010-02-02 15:55:21 -0500},
181
+	Dcom = {20100111},
182
+	Doi = {10.1016/S0076-6879(09)67003-8},
183
+	Edat = {2009/11/10 06:00},
184
+	Issn = {1557-7988 (Electronic); 1557-7988 (Linking)},
185
+	Jid = {0212271},
186
+	Journal = {Methods Enzymol},
187
+	Jt = {Methods in enzymology},
188
+	Keywords = {Markov Chain Monte Carlo, matrix factorization},
189
+	Language = {eng},
190
+	Mh = {Algorithms; Bayes Theorem; Biological Processes/physiology; *Cluster Analysis; *Data Interpretation, Statistical; Gene Expression Profiling/*methods; Gene Regulatory Networks; Microarray Analysis/*methods; Pattern Recognition, Automated/methods; Saccharomyces cerevisiae/genetics/physiology},
191
+	Mhda = {2010/01/12 06:00},
192
+	Own = {NLM},
193
+	Pages = {59--77},
194
+	Pii = {S0076-6879(09)67003-8},
195
+	Pl = {United States},
196
+	Pmid = {19897089},
197
+	Pst = {ppublish},
198
+	Pt = {Journal Article},
199
+	Sb = {IM},
200
+	So = {Methods Enzymol. 2009;467:59-77. },
201
+	Stat = {MEDLINE},
202
+	Title = {Matrix factorization for recovery of biological processes from microarray data.},
203
+	Volume = {467},
204
+	Year = {2009},
205
+	Bdsk-Url-1 = {http://dx.doi.org/10.1016/S0076-6879(09)67003-8}}
206
+
207
+@article{Subramanian2005,
208
+	Abstract = {Although genomewide RNA expression analysis has become a routine tool in biomedical research, extracting biological insight from such information remains a major challenge. Here, we describe a powerful analytical method called Gene Set Enrichment Analysis (GSEA) for interpreting gene expression data. The method derives its power by focusing on gene sets, that is, groups of genes that share common biological function, chromosomal location, or regulation. We demonstrate how GSEA yields insights into several cancer-related data sets, including leukemia and lung cancer. Notably, where single-gene analysis finds little similarity between two independent studies of patient survival in lung cancer, GSEA reveals many biological pathways in common. The GSEA method is embodied in a freely available software package, together with an initial database of 1,325 biologically defined gene sets.},
209
+	Address = {Broad Institute of Massachusetts Institute of Technology and Harvard, 320 Charles Street, Cambridge, MA 02141, USA.},
210
+	Au = {Subramanian, A and Tamayo, P and Mootha, VK and Mukherjee, S and Ebert, BL and Gillette, MA and Paulovich, A and Pomeroy, SL and Golub, TR and Lander, ES and Mesirov, JP},
211
+	Author = {Subramanian, A. and Tamayo, P. and Mootha, V.K. and Mukherjee, S. and Ebert, B.L. and Gillette, M.A. and Paulovich, A. and Pomeroy, S.L. and Golub, T.R. and Lander, E.S. and Mesirov, J.P.},
212
+	Cin = {Proc Natl Acad Sci U S A. 2005 Oct 25;102(43):15278-9. PMID: 16230612},
213
+	Crdt = {2005/10/04 09:00},
214
+	Da = {20051026},
215
+	Date-Added = {2010-02-01 16:29:02 -0500},
216
+	Date-Modified = {2010-02-02 15:59:33 -0500},
217
+	Dcom = {20051212},
218
+	Dep = {20050930},
219
+	Doi = {10.1073/pnas.0506580102},
220
+	Edat = {2005/10/04 09:00},
221
+	Issn = {0027-8424 (Print); 0027-8424 (Linking)},
222
+	Jid = {7505876},
223
+	Journal = {Proc Natl Acad Sci},
224
+	Jt = {Proceedings of the National Academy of Sciences of the United States of America},
225
+	Keywords = {gene set},
226
+	Language = {eng},
227
+	Lr = {20091118},
228
+	Mh = {Cell Line, Tumor; Female; Gene Expression Profiling/*methods; Genes, p53/physiology; Genome; Humans; Leukemia, Myeloid, Acute/genetics; Lung Neoplasms/genetics/mortality; Male; *Oligonucleotide Array Sequence Analysis; Precursor Cell Lymphoblastic Leukemia-Lymphoma/genetics},
229
+	Mhda = {2005/12/15 09:00},
230
+	Number = {43},
231
+	Oid = {NLM: PMC1239896},
232
+	Own = {NLM},
233
+	Pages = {15545--15550},
234
+	Phst = {2005/09/30 {$[$}aheadofprint{$]$}},
235
+	Pii = {0506580102},
236
+	Pl = {United States},
237
+	Pmc = {PMC1239896},
238
+	Pmid = {16199517},
239
+	Pst = {ppublish},
240
+	Pt = {Journal Article},
241
+	Sb = {IM},
242
+	So = {Proc Natl Acad Sci U S A. 2005 Oct 25;102(43):15545-50. Epub 2005 Sep 30. },
243
+	Stat = {MEDLINE},
244
+	Title = {Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles.},
245
+	Volume = {102},
246
+	Year = {2005},
247
+	Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.0506580102}}
248
+
249
+@article{Draghici2003,
250
+	Abstract = {Onto-Tools is a set of four seamlessly integrated databases: Onto-Express, Onto-Compare, Onto-Design and Onto-Translate. Onto-Express is able to automatically translate lists of genes found to be differentially regulated in a given condition into functional profiles characterizing the impact of the condition studied upon various biological processes and pathways. OE constructs functional profiles (using Gene Ontology terms) for the following categories: biochemical function, biological process, cellular role, cellular component, molecular function and chromosome location. Statistical significance values are calculated for each category. Once the initial exploratory analysis identified a number of relevant biological processes, specific mechanisms of interactions can be hypothesized for the conditions studied. Currently, many commercial arrays are available for the investigation of specific mechanisms. Each such array is characterized by a biological bias determined by the extent to which the genes present on the array represent specific pathways. Onto-Compare is a tool that allows efficient comparisons of any sets of commercial or custom arrays. Using Onto-Compare, a researcher can determine quickly which array, or set of arrays, covers best the hypotheses studied. In many situations, no commercial arrays are available for specific biological mechanisms. Onto-Design is a tool that allows the user to select genes that represent given functional categories. Onto-Translate allows the user to translate easily lists of accession numbers, UniGene clusters and Affymetrix probes into one another. All tools above are seamlessly integrated. The Onto-Tools are available online at http://vortex.cs.wayne.edu/Projects.html.},
251
+	Address = {Department of Computer Science, Wayne State University, 431 State Hall, Detroit, MI 48202, USA. sod@cs.wayne.edu},
252
+	Au = {Draghici, S and Khatri, P and Bhavsar, P and Shah, A and Krawetz, SA and Tainsky, MA},
253
+	Author = {Draghici, S. and Khatri, P. and Bhavsar, P. and Shah, A. and Krawetz, S.A. and Tainsky, M.A.},
254
+	Crdt = {2003/06/26 05:00},
255
+	Da = {20030625},
256
+	Date-Added = {2010-02-01 16:28:22 -0500},
257
+	Date-Modified = {2010-02-02 15:56:55 -0500},
258
+	Dcom = {20030818},
259
+	Edat = {2003/06/26 05:00},
260
+	Gr = {R01-NS045207-01/NS/NINDS NIH HHS/United States; R21-EB000990-01/EB/NIBIB NIH HHS/United States},
261
+	Issn = {1362-4962 (Electronic); 1362-4962 (Linking)},
262
+	Jid = {0411011},
263
+	Journal = {Nucleic Acids Res},
264
+	Jt = {Nucleic acids research},
265
+	Keywords = {gene set},
266
+	Language = {eng},
267
+	Lr = {20091118},
268
+	Mh = {Databases, Nucleic Acid; Gene Expression Profiling/*methods; Internet; Oligonucleotide Array Sequence Analysis/*methods; Proteins/genetics/physiology; *Software; Systems Integration},
269
+	Mhda = {2003/08/19 05:00},
270
+	Number = {13},
271
+	Oid = {NLM: PMC169030},
272
+	Own = {NLM},
273
+	Pages = {3775--3781},
274
+	Pl = {England},
275
+	Pmc = {PMC169030},
276
+	Pmid = {12824416},
277
+	Pst = {ppublish},
278
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.; Research Support, U.S. Gov't, P.H.S.},
279
+	Rn = {0 (Proteins)},
280
+	Sb = {IM},
281
+	So = {Nucleic Acids Res. 2003 Jul 1;31(13):3775-81. },
282
+	Stat = {MEDLINE},
283
+	Title = {{Onto-Tools, the toolkit of the modern biologist: Onto-Express, Onto-Compare, Onto-Design and Onto-Translate.}},
284
+	Volume = {31},
285
+	Year = {2003}}
286
+
287
+@inbook{Ochs2006,
288
+	Address = {London},
289
+	Author = {M.F. Ochs},
290
+	Date-Added = {2010-01-08 14:45:36 -0500},
291
+	Date-Modified = {2010-01-08 14:48:01 -0500},
292
+	Editor = {G. Parmigiani and E. S. Garrett and R. A. Irizarry and S. L. Zeger},
293
+	Pages = {388-408},
294
+	Publisher = {Springer-Verlag},
295
+	Series = {Statistics for Biology and Health},
296
+	Title = {The Analysis of Gene Expression Data The Analysis of Gene Expression Data: Methods and Software},
297
+	Year = {2006}}
298
+
299
+@article{Bidaut2006,
300
+	Abstract = {BACKGROUND: As numerous diseases involve errors in signal transduction, modern therapeutics often target proteins involved in cellular signaling. Interpretation of the activity of signaling pathways during disease development or therapeutic intervention would assist in drug development, design of therapy, and target identification. Microarrays provide a global measure of cellular response, however linking these responses to signaling pathways requires an analytic approach tuned to the underlying biology. An ongoing issue in pattern recognition in microarrays has been how to determine the number of patterns (or clusters) to use for data interpretation, and this is a critical issue as measures of statistical significance in gene ontology or pathways rely on proper separation of genes into groups. RESULTS: Here we introduce a method relying on gene annotation coupled to decompositional analysis of global gene expression data that allows us to estimate specific activity on strongly coupled signaling pathways and, in some cases, activity of specific signaling proteins. We demonstrate the technique using the Rosetta yeast deletion mutant data set, decompositional analysis by Bayesian Decomposition, and annotation analysis using ClutrFree. We determined from measurements of gene persistence in patterns across multiple potential dimensionalities that 15 basis vectors provides the correct dimensionality for interpreting the data. Using gene ontology and data on gene regulation in the Saccharomyces Genome Database, we identified the transcriptional signatures of several cellular processes in yeast, including cell wall creation, ribosomal disruption, chemical blocking of protein synthesis, and, critically, individual signatures of the strongly coupled mating and filamentation pathways. CONCLUSION: This works demonstrates that microarray data can provide downstream indicators of pathway activity either through use of gene ontology or transcription factor databases. This can be used to investigate the specificity and success of targeted therapeutics as well as to elucidate signaling activity in normal and disease processes.},
301
+	Address = {Fox Chase Cancer Center, 333 Cottman Avenue, Philadelphia, PA 19111, USA. ghbidaut@pcbi.upenn.edu},
302
+	Au = {Bidaut, G and Suhre, K and Claverie, JM and Ochs, MF},
303
+	Author = {Bidaut, G. and Suhre, K. and Claverie, J.-M. and Ochs, M.F.},
304
+	Da = {20060327},
305
+	Date-Added = {2010-01-08 14:39:10 -0500},
306
+	Date-Modified = {2010-01-08 14:39:10 -0500},
307
+	Dcom = {20060426},
308
+	Dep = {20060228},
309
+	Doi = {10.1186/1471-2105-7-99},
310
+	Edat = {2006/03/02 09:00},
311
+	Gr = {CA06927/CA/United States NCI; LM008309/LM/United States NLM},
312
+	Issn = {1471-2105 (Electronic)},
313
+	Jid = {100965194},
314
+	Journal = {BMC Bioinformatics},
315
+	Jt = {BMC bioinformatics},
316
+	Keywords = {Bayesian, Markov Chain Monte Carlo, signaling network, microarray, k25},
317
+	Language = {eng},
318
+	Lr = {20071114},
319
+	Mh = {Algorithms; Computer Simulation; Gene Expression Profiling/*methods; *Models, Biological; Oligonucleotide Array Sequence Analysis/*methods; Pattern Recognition, Automated/*methods; Saccharomyces cerevisiae Proteins/genetics/*metabolism; Signal Transduction/*physiology; Transcription Factors/genetics/*metabolism},
320
+	Mhda = {2006/04/28 09:00},
321
+	Own = {NLM},
322
+	Pages = {99},
323
+	Phst = {2005/09/22 {$[$}received{$]$}; 2006/02/28 {$[$}accepted{$]$}; 2006/02/28 {$[$}aheadofprint{$]$}},
324
+	Pii = {1471-2105-7-99},
325
+	Pl = {England},
326
+	Pmc = {PMC1413561},
327
+	Pmid = {16507110},
328
+	Pst = {epublish},
329
+	Pt = {Journal Article; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't},
330
+	Pubm = {Electronic},
331
+	Rn = {0 (Saccharomyces cerevisiae Proteins); 0 (Transcription Factors)},
332
+	Sb = {IM},
333
+	So = {BMC Bioinformatics. 2006 Feb 28;7:99. },
334
+	Stat = {MEDLINE},
335
+	Title = {Determination of strongly overlapping signaling activity from microarray data.},
336
+	Volume = {7},
337
+	Year = {2006},
338
+	Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-7-99}}
339
+
340
+@article{Kossenkov2007a,
341
+	Abstract = {Many biological processes rely on remodeling of the transcriptional response of cells through activation of transcription factors. Although determination of the activity level of transcription factors from microarray data can provide insight into developmental and disease processes, it requires careful analysis because of the multiple regulation of genes. We present a novel approach that handles both the assignment of genes to multiple patterns, as required by multiple regulation, and the linking of genes in prior probability distributions according to their known transcriptional regulators. We demonstrate the power of this approach in simulations and by application to yeast cell cycle and deletion mutant data. The results of simulations in the presence of increasing noise showed improved recovery of patterns in terms of chi2 fit. Analysis of the yeast data led to improved inference of biologically meaningful groups in comparison to other techniques, as demonstrated with ROC analysis. The new algorithm provides an approach for estimating the levels of transcription factor activity from microarray data, and therefore provides insights into biological response.},
342
+	Address = {The Wistar Institute, Philadelphia, PA, USA.},
343
+	Au = {Kossenkov, AV and Peterson, AJ and Ochs, MF},
344
+	Author = {A.V. Kossenkov and A.J. Peterson and M.F. Ochs},
345
+	Da = {20071003},
346
+	Date-Added = {2010-01-08 14:39:10 -0500},
347
+	Date-Modified = {2010-01-08 14:39:10 -0500},
348
+	Dcom = {20071102},
349
+	Edat = {2007/10/04 09:00},
350
+	Gr = {CA06973/CA/United States NCI; LM008309/LM/United States NLM},
351
+	Issn = {0926-9630 (Print)},
352
+	Jid = {9214582},
353
+	Journal = {Stud Health Technol Inform},
354
+	Jt = {Studies in health technology and informatics},
355
+	Keywords = {Bayesian, Markov Chain Monte Carlo, microarray},
356
+	Language = {eng},
357
+	Lr = {20080710},
358
+	Mh = {*Algorithms; Bayes Theorem; Computational Biology; *Gene Expression Regulation; Markov Chains; Models, Genetic; Monte Carlo Method; *Oligonucleotide Array Sequence Analysis; ROC Curve; Transcription Factors/*metabolism; Transcription, Genetic; Yeasts/genetics},
359
+	Mhda = {2007/11/06 09:00},
360
+	Number = {Pt 2},
361
+	Own = {NLM},
362
+	Pages = {1250--1254},
363
+	Pl = {Netherlands},
364
+	Pmid = {17911915},
365
+	Pst = {ppublish},
366
+	Pt = {Journal Article; Research Support, N.I.H., Extramural},
367
+	Pubm = {Print},
368
+	Rn = {0 (Transcription Factors)},
369
+	Sb = {T},
370
+	So = {Stud Health Technol Inform. 2007;129(Pt 2):1250-4. },
371
+	Stat = {MEDLINE},
372
+	Title = {Determining transcription factor activity from microarray data using {Bayesian Markov chain Monte Carlo} sampling.},
373
+	Volume = {129},
374
+	Year = {2007}}
375
+
376
+@article{Moloshok2002,
377
+	Abstract = {MOTIVATION: Microarray and gene chip technology provide high throughput tools for measuring gene expression levels in a variety of circumstances, including cellular response to drug treatment, cellular growth and development, tumorigenesis, among many other processes. In order to interpret the large data sets generated in experiments, data analysis techniques that consider biological knowledge during analysis will be extremely useful. We present here results showing the application of such a tool to expression data from yeast cell cycle experiments. RESULTS: Originally developed for spectroscopic analysis, Bayesian Decomposition (BD) includes two features which make it useful for microarray data analysis: the ability to assign genes to multiple coexpression groups and the ability to encode biological knowledge into the system. Here we demonstrate the ability of the algorithm to provide insight into the yeast cell cycle, including identification of five temporal patterns tied to cell cycle phases as well as the identification of a pattern tied to an approximately 40 min cell cycle oscillator. The genes are simultaneously assigned to the patterns, including partial assignment to multiple patterns when this is required to explain the expression profile. AVAILABILITY: The application is available free to academic users under a material transfer agreement. Go to http://bioinformatics.fccc.edu/ for more details.},
378
+	Address = {Bioinformatics Working Group, Fox Chase Cancer Center, Philadelphia, PA 19111, USA. td_moloshok@fccc.edu},
379
+	Au = {Moloshok, TD and Klevecz, RR and Grant, JD and Manion, FJ and Speier WF, 4th and Ochs, MF},
380
+	Author = {Moloshok, T.D. and Klevecz, R.R. and Grant, J.D. and Manion, F.J. and Speier IV, W.F. and Ochs, M.F.},
381
+	Da = {20020517},
382
+	Date-Added = {2010-01-08 14:39:10 -0500},
383
+	Date-Modified = {2010-01-08 14:39:10 -0500},
384
+	Dcom = {20021108},
385
+	Edat = {2002/05/23 10:00},
386
+	Gr = {CA06927/CA/United States NCI},
387
+	Issn = {1367-4803 (Print)},
388
+	Jid = {9808944},
389
+	Journal = {Bioinformatics},
390
+	Jt = {Bioinformatics (Oxford, England)},
391
+	Keywords = {Bayesian, signaling network, microarray},
392
+	Language = {eng},
393
+	Lr = {20071114},
394
+	Mh = {*Algorithms; *Bayes Theorem; Cell Cycle/genetics; Databases, Genetic; Gene Expression Regulation; Genome, Fungal; Markov Chains; *Models, Genetic; *Models, Statistical; Monte Carlo Method; Oligonucleotide Array Sequence Analysis/*methods; Pattern Recognition, Automated; Periodicity; Reproducibility of Results; Saccharomyces cerevisiae/genetics; Sensitivity and Specificity},
395
+	Mhda = {2002/11/26 04:00},
396
+	Number = {4},
397
+	Own = {NLM},
398
+	Pages = {566--575},
399
+	Pl = {England},
400
+	Pmid = {12016054},
401
+	Pst = {ppublish},
402
+	Pt = {Comparative Study; Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, P.H.S.},
403
+	Pubm = {Print},
404
+	Sb = {IM},
405
+	So = {Bioinformatics. 2002 Apr;18(4):566-75. },
406
+	Stat = {MEDLINE},
407
+	Title = {Application of {B}ayesian decomposition for analysing microarray data.},
408
+	Volume = {18},
409
+	Year = {2002},
410
+	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGCQpYJHZlcnNpb25UJHRvcFkkYXJjaGl2ZXJYJG9iamVjdHMSAAGGoNEHCFRyb290gAFfEA9OU0tleWVkQXJjaGl2ZXKoCwwXGBkdJCVVJG51bGzTDQ4PEBEUViRjbGFzc1dOUy5rZXlzWk5TLm9iamVjdHOAB6ISE4ACgAOiFRaABIAGWWFsaWFzRGF0YVxyZWxhdGl2ZVBhdGjSDRobHFdOUy5kYXRhgAVPEQJGAAAAAAJGAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAADD4wxdSCsAAAUWsv4VTW9sb3Nob2tfQkRfWWVhc3QucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABRay/8Xm4ONQREYgQ0FSTwACAAkAAAkgAAAAAAAAAAAAAAAAAAAAATIAABAACAAAw+NSrQAAABEACAAAxecZIwAAAAEAKAUWsv4FFrL9Ajn6RAI5RRoAFB2EABQZOQAP6lwACqQSAAqkEQAAetwAAgCFTWFjaW50b3NoIEhEOlVzZXJzOmVqZmVydGlnOkxpYnJhcnk6TWFpbDpJTUFQLWVqZmVydGlnQG1haWwubCMxNDE5Mzkub3JnOklOQk9YOk9jaHMuaW1hcG1ib3g6QXR0YWNobWVudHM6MzA4NzoyOk1vbG9zaG9rX0JEX1llYXN0LnBkZgAADgAsABUATQBvAGwAbwBzAGgAbwBrAF8AQgBEAF8AWQBlAGEAcwB0AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgB6VXNlcnMvZWpmZXJ0aWcvTGlicmFyeS9NYWlsL0lNQVAtZWpmZXJ0aWdAbWFpbC5saXF1aWRkaXJ0Lm9yZy9JTkJPWC9PY2hzLmltYXBtYm94L0F0dGFjaG1lbnRzLzMwODcvMi9Nb2xvc2hva19CRF9ZZWFzdC5wZGYAEwABLwAAFQACAA///wAA0h4fICFYJGNsYXNzZXNaJGNsYXNzbmFtZaMhIiNdTlNNdXRhYmxlRGF0YVZOU0RhdGFYTlNPYmplY3RfEHEuLi8uLi9MaWJyYXJ5L01haWwvSU1BUC1lamZlcnRpZ0BtYWlsLmxpcXVpZGRpcnQub3JnL0lOQk9YL09jaHMuaW1hcG1ib3gvQXR0YWNobWVudHMvMzA4Ny8yL01vbG9zaG9rX0JEX1llYXN0LnBkZtIeHyYnoicjXE5TRGljdGlvbmFyeQAIABEAGgAfACkAMgA3ADoAPwBBAFMAXABiAGkAcAB4AIMAhQCIAIoAjACPAJEAkwCdAKoArwC3ALkDAwMIAxEDHAMgAy4DNQM+A7IDtwO6AAAAAAAAAgEAAAAAAAAAKAAAAAAAAAAAAAAAAAAAA8c=}}
411
+
412
+@article{Ochs1999,
413
+	Abstract = {A frequent problem in analysis is the need to find two matrices, closely related to the underlying measurement process, which when multiplied together reproduce the matrix of data points. Such problems arise throughout science, for example, in imaging where both the calibration of the sensor and the true scene may be unknown and in localized spectroscopy where multiple components may be present in varying amounts in any spectrum. Since both matrices are unknown, such a decomposition is a bilinear problem. We report here a solution to this problem for the case in which the decomposition results in matrices with elements drawn from positive additive distributions. We demonstrate the power of the methodology on chemical shift images (CSI). The new method, Bayesian spectral decomposition (BSD), reduces the CSI data to a small number of basis spectra together with their localized amplitudes. We apply this new algorithm to a 19F nonlocalized study of the catabolism of 5-fluorouracil in human liver, 31P CSI studies of a human head and calf muscle, and simulations which show its strengths and limitations. In all cases, the dataset, viewed as a matrix with rows containing the individual NMR spectra, results from the multiplication of a matrix of generally nonorthogonal basis spectra (the spectral matrix) by a matrix of the amplitudes of each basis spectrum in the the individual voxels (the amplitude matrix). The results show that BSD can simultaneously determine both the basis spectra and their distribution. In principle, BSD should solve this bilinear problem for any dataset which results from multiplication of matrices representing positive additive distributions if the data overdetermine the solutions.},
414
+	Address = {NMR and Medical Spectroscopy, Fox Chase Cancer Center, Philadelphia, PA, USA.},
415
+	Au = {Ochs, MF and Stoyanova, RS and Arias-Mendoza, F and Brown, TR},
416
+	Author = {Ochs, M.F. and Stoyanova, R.S. and Arias-Mendoza, F. and Brown, T.R.},
417
+	Ci = {Copyright 1999 Academic Press.},
418
+	Da = {19990331},
419
+	Date-Added = {2010-01-08 14:39:10 -0500},
420
+	Date-Modified = {2010-01-08 14:39:10 -0500},
421
+	Dcom = {19990331},
422
+	Doi = {10.1006/jmre.1998.1639},
423
+	Edat = {1999/03/04},
424
+	Gr = {CA41078/CA/United States NCI; CA62556/CA/United States NCI},
425
+	Issn = {1090-7807 (Print)},
426
+	Jid = {9707935},
427
+	Journal = {J Magn Reson},
428
+	Jt = {Journal of magnetic resonance (San Diego, Calif. : 1997)},
429
+	Keywords = {Markov Chain Monte Carlo, Bayesian, k25},
430
+	Language = {eng},
431
+	Lr = {20071114},
432
+	Mh = {Adenosine Triphosphate/analysis; Bayes Theorem; Brain/metabolism; Fluorouracil/*metabolism; Humans; Image Processing, Computer-Assisted; Liver/*metabolism; Magnesium/analysis; Magnetic Resonance Spectroscopy/*methods; Muscle, Skeletal/metabolism},
433
+	Mhda = {1999/03/04 00:01},
434
+	Number = {1},
435
+	Own = {NLM},
436
+	Pages = {161--176},
437
+	Pii = {S1090-7807(98)91639-1},
438
+	Pl = {UNITED STATES},
439
+	Pmid = {10053145},
440
+	Pst = {ppublish},
441
+	Pt = {Comparative Study; Journal Article; Research Support, U.S. Gov't, P.H.S.},
442
+	Pubm = {Print},
443
+	Rn = {51-21-8 (Fluorouracil); 56-65-5 (Adenosine Triphosphate); 7439-95-4 (Magnesium)},
444
+	Sb = {IM},
445
+	So = {J Magn Reson. 1999 Mar;137(1):161-76. },
446
+	Stat = {MEDLINE},
447
+	Title = {A new method for spectral decomposition using a bilinear Bayesian approach.},
448
+	Volume = {137},
449
+	Year = {1999},
450
+	Bdsk-Url-1 = {http://dx.doi.org/10.1006/jmre.1998.1639}}
451
+
452
+@conference{Plummer2003,
453
+	Address = {Vienna, Austria},
454
+	Author = {M. Plummer},
455
+	Booktitle = {Proceedings of the 3rd Internation Workshop on Distributed Statistical Computing},
456
+	Date-Added = {2010-01-08 14:38:05 -0500},
457
+	Date-Modified = {2010-02-02 15:47:49 -0500},
458
+	Editor = {K. Hornik and F. Leisch and A. Zeileis},
459
+	Keywords = {Markov Chain Monte Carlo},
460
+	Month = {March 20-22},
461
+	Title = {{JAGS}: A program for analysis of {Bayesian} graphical models using {Gibbs} sampling},
462
+	Year = {2003}}
463
+
464
+@article{Ochs2009,
465
+	Abstract = {Cell signaling plays a central role in the etiology of cancer. Numerous therapeutics in use or under development target signaling proteins; however, off-target effects often limit assignment of positive clinical response to the intended target. As direct measurements of signaling protein activity are not generally feasible during treatment, there is a need for more powerful methods to determine if therapeutics inhibit their targets and when off-target effects occur. We have used the Bayesian Decomposition algorithm and data on transcriptional regulation to create a novel methodology, Differential Expression for Signaling Determination (DESIDE), for inferring signaling activity from microarray measurements. We applied DESIDE to deduce signaling activity in gastrointestinal stromal tumor cell lines treated with the targeted therapeutic imatinib mesylate (Gleevec). We detected the expected reduced activity in the KIT pathway, as well as unexpected changes in the p53 pathway. Pursuing these findings, we have determined that imatinib-induced DNA damage is responsible for the increased activity of p53, identifying a novel off-target activity for this drug. We then used DESIDE on data from resected, post-imatinib treatment tumor samples and identified a pattern in these tumors similar to that at late time points in the cell lines, and this pattern correlated with initial clinical response. The pattern showed increased activity of ETS domain-containing protein Elk-1 and signal transducers and activators of transcription 3 transcription factors, which are associated with the growth of side population cells. DESIDE infers the global reprogramming of signaling networks during treatment, permitting treatment modification that leverages ongoing drug development efforts, which is crucial for personalized medicine.},
466
+	Address = {Division of Oncology Biostatistics and Bioinformatics, Johns Hopkins University, Baltimore, Maryland 21205, USA. mfo@jhu.edu},
467
+	Au = {Ochs, MF and Rink, L and Tarn, C and Mburu, S and Taguchi, T and Eisenberg, B and Godwin, AK},
468
+	Author = {Ochs, M.F. and Rink, L. and Tarn, C. and Mburu, S. and Taguchi, T. and Eisenberg, B. and Godwin, A.K.},
469
+	Crdt = {2009/11/12 06:00},
470
+	Da = {20091204},
471
+	Date-Added = {2010-01-08 14:35:07 -0500},
472
+	Date-Modified = {2010-02-02 15:57:46 -0500},
473
+	Dcom = {20091221},
474
+	Dep = {20091110},
475
+	Doi = {10.1158/0008-5472.CAN-09-1709},
476
+	Edat = {2009/11/12 06:00},
477
+	Gr = {CA009035/CA/NCI NIH HHS/United States; CA106588/CA/NCI NIH HHS/United States; CA21661/CA/NCI NIH HHS/United States; LM009382/LM/NLM NIH HHS/United States},
478
+	Issn = {1538-7445 (Electronic); 1538-7445 (Linking)},
479
+	Jid = {2984705R},
480
+	Journal = {Cancer Res},
481
+	Jt = {Cancer research},
482
+	Language = {eng},
483
+	Mh = {Antineoplastic Agents/*pharmacology; Cell Line, Tumor; DNA Damage; Gastrointestinal Stromal Tumors/*drug therapy/*genetics/metabolism; Gene Expression Profiling; Humans; Piperazines/*pharmacology; Pyrimidines/*pharmacology; RNA, Messenger/biosynthesis/genetics; STAT3 Transcription Factor/metabolism; Signal Transduction; Tumor Suppressor Protein p53/genetics/metabolism; ets-Domain Protein Elk-1/metabolism},
484
+	Mhda = {2009/12/22 06:00},
485
+	Mid = {NIHMS149886},
486
+	Number = {23},
487
+	Oid = {NLM: NIHMS149886 {$[$}Available on 12/01/10{$]$}; NLM: PMC2789202 {$[$}Available on 12/01/10{$]$}},
488
+	Own = {NLM},
489
+	Pages = {9125--9132},
490
+	Phst = {2009/11/10 {$[$}aheadofprint{$]$}},
491
+	Pii = {0008-5472.CAN-09-1709},
492
+	Pl = {United States},
493
+	Pmc = {PMC2789202},
494
+	Pmcr = {2010/12/01},
495
+	Pmid = {19903850},
496
+	Pst = {ppublish},
497
+	Pt = {Journal Article; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't},
498
+	Rn = {0 (Antineoplastic Agents); 0 (ELK1 protein, human); 0 (Piperazines); 0 (Pyrimidines); 0 (RNA, Messenger); 0 (STAT3 Transcription Factor); 0 (STAT3 protein, human); 0 (TP53 protein, human); 0 (Tumor Suppressor Protein p53); 0 (ets-Domain Protein Elk-1); 152459-95-5 (imatinib)},
499
+	Sb = {IM},
500
+	So = {Cancer Res. 2009 Dec 1;69(23):9125-32. Epub 2009 Nov 10. },
501
+	Stat = {MEDLINE},
502
+	Title = {Detection of treatment-induced changes in signaling pathways in gastrointestinal stromal tumors using transcriptomic data.},
503
+	Volume = {69},
504
+	Year = {2009},
505
+	Bdsk-Url-1 = {http://dx.doi.org/10.1158/0008-5472.CAN-09-1709}}
506
+
507
+@conference{Skilling1998,
508
+	Address = {Dordrecht/Boston/London},
509
+	Author = {J. Skilling},
510
+	Booktitle = {Maximum Entropy and Bayesian Methods, Proceedings of the 17th International Workshop on Maxiumum Entropy and Bayesian Methods of Statistical Analysis},
511
+	Date-Added = {2010-01-08 14:27:23 -0500},
512
+	Date-Modified = {2010-01-08 14:29:16 -0500},
513
+	Editor = {G. J. Erickson and J. T. Rychert and C. R. Smith},
514
+	Publisher = {Kluwer Academic Publishers},
515
+	Title = {Massive inference and maximum entropy},
516
+	Year = {1998}}
517
+
518
+@article{Sibisi1997,
519
+	Author = {S. Sibisi and J. Skilling},
520
+	Date-Added = {2010-01-08 13:50:30 -0500},
521
+	Date-Modified = {2010-01-08 13:50:30 -0500},
522
+	Journal = {Journal of the Royal Statistical Society, B},
523
+	Keywords = {Bayesian, probability},
524
+	Number = {1},
525
+	Pages = {217-235},
526
+	Title = {Prior distributions on measure space},
527
+	Volume = {59},
528
+	Year = {1997},
529
+	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGCQpYJHZlcnNpb25UJHRvcFkkYXJjaGl2ZXJYJG9iamVjdHMSAAGGoNEHCFRyb290gAFfEA9OU0tleWVkQXJjaGl2ZXKoCwwXGBkdJCVVJG51bGzTDQ4PEBEUViRjbGFzc1dOUy5rZXlzWk5TLm9iamVjdHOAB6ISE4ACgAOiFRaABIAGWWFsaWFzRGF0YVxyZWxhdGl2ZVBhdGjSDRobHFdOUy5kYXRhgAVPEQHyAAAAAAHyAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAADD4wxdSCsAAAAQbOgfU2liaXNpU2tpbGxpbmdfSlJveSMxMDI4OURGLnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQKJ38SrsAMAAAAAAAAAAAADAAIAAAkgAAAAAAAAAAAAAAAAAAAAClJlZmVyZW5jZXMAEAAIAADD41KtAAAAEQAIAADEq+hDAAAAAQAQABBs6AAKpCgACqQRAAB63AACAFBNYWNpbnRvc2ggSEQ6VXNlcnM6ZWpmZXJ0aWc6RG9jdW1lbnRzOlJlZmVyZW5jZXM6U2liaXNpU2tpbGxpbmdfSlJveSMxMDI4OURGLnBkZgAOAE4AJgBTAGkAYgBpAHMAaQBTAGsAaQBsAGwAaQBuAGcAXwBKAFIAbwB5AGEAbABTAHQAYQB0AFMAbwBjAEIAXwAxADkAOQA3AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBKVXNlcnMvZWpmZXJ0aWcvRG9jdW1lbnRzL1JlZmVyZW5jZXMvU2liaXNpU2tpbGxpbmdfSlJveWFsU3RhdFNvY0JfMTk5Ny5wZGYAEwABLwAAFQACAA///wAA0h4fICFYJGNsYXNzZXNaJGNsYXNzbmFtZaMhIiNdTlNNdXRhYmxlRGF0YVZOU0RhdGFYTlNPYmplY3RfEDouLi8uLi8uLi9SZWZlcmVuY2VzL1NpYmlzaVNraWxsaW5nX0pSb3lhbFN0YXRTb2NCXzE5OTcucGRm0h4fJieiJyNcTlNEaWN0aW9uYXJ5AAgAEQAaAB8AKQAyADcAOgA/AEEAUwBcAGIAaQBwAHgAgwCFAIgAigCMAI8AkQCTAJ0AqgCvALcAuQKvArQCvQLIAswC2gLhAuoDJwMsAy8AAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAADPA==}}
0 530
new file mode 100644
... ...
@@ -0,0 +1,388 @@
1
+% \VignetteIndexEntry{GAPS/CoGAPS Users Manual}
2
+% \VignettePackage{CoGAPS}
3
+
4
+\documentclass{report}
5
+\usepackage{Sweave,fullpage,./chicago,graphicx,subfigure,amssymb,amsmath,color,hyperref,wrapfig}
6
+
7
+\author{Elana J. Fertig \\
8
+email: \texttt{ejfertig@jhmi.edu}}
9
+\title{GAPS/CoGAPS Users Manual}
10
+
11
+\begin{document}
12
+
13
+\maketitle
14
+\tableofcontents
15
+
16
+\chapter{Introduction}
17
+
18
+\par Gene Association in Pattern Sets (GAPS) infers underlying patterns in gene expression a matrix of microarray measurements.  This Markov chain Monte Carlo (MCMC) matrix decomposition which infers these patterns also infers the extent to which individual genes belong to these patterns.  The CoGAPS algorithm extends GAPS to infer the coordinated activity in sets of genes for each of the inferred patterns.
19
+
20
+\par The GAPS algorithm is implemented in a module of the open source, C++ MCMC software Just Another Gibbs Sampler (JAGS; \url{http://www-fis.iarc.fr/~martyn/software/jags/}; \cite{Plummer2003}).  We call the software package containing this implementation of the GAPS algorithm GAPS-JAGS. As an extension including a redistribution of JAGS, GAPS-JAGS is also licensed under the GNU General Public License version 2.  You may freely modify and redistribute GAPS under certain conditions that are described in the top level source directory file \nolinkurl{COPYING}.
21
+
22
+\par The R package CoGAPS is designed to facilitate the corresponding analysis of microarray measurements by calling libraries in the GAPS-JAGS package.  The installation instructions provided in Chapter \ref{install} will ensure proper interaction between the CoGAPS R package and GAPS-JAGS libraries.  Running instructions for the GAPS and CoGAPS analyses are provided in Sections \ref{GAPSRun} and \ref{CoGAPSRun}, respectively.  CoGAPS and GAPS-JAGS are freely available at \url{http://www.cancerbiostats.onc.jhmi.edu/CoGAPS.cfm}.
23
+
24
+\par Please contact Elana J. Fertig \url{ejfertig@jhmi.edu} or Michael F. Ochs \url{mfo@jhu.edu} for citation information. 
25
+
26
+\chapter{Installation Instructions} \label{install}
27
+
28
+\par The GAPS and CoGAPS algorithms are implemente