
Added the CoGAPS package.

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/CoGAPS@48067 bc3139a8-67e5-0310-9ffc-ced21a209358

c.wong authored on 12/07/2010 17:26:34
Showing 34 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,18 @@
1
+Package: CoGAPS
2
+Version: 0.99.1
3
+Date: 2010-07-02
4
+Title: Coordinated Gene Activity in Pattern Sets 
5
+Author: Elana J. Fertig
6
+Description: Coordinated Gene Activity in Pattern Sets (CoGAPS) infers
7
+  biological processes that are active in individual gene sets from
8
+  corresponding microarray measurements.  CoGAPS achieves this inference by
9
+  combining an MCMC matrix decomposition algorithm (GAPS) with a novel statistic
10
+  inferring activity on gene sets.
11
+Maintainer: Elana J. Fertig <ejfertig@jhmi.edu>, Michael F. Ochs <mfo@jhu.edu>
12
+SystemRequirements: GAPS-JAGS (==1.0.2)
13
+Depends: R (>= 2.9.0), rjags(>= 2.1.0), R.utils (>= 1.2.4)
14
+Imports: graphics, grDevices, methods, stats, utils
15
+License: GPL (== 2)
16
+URL: http://www.cancerbiostats.onc.jhmi.edu/cogaps.cfm
17
+biocViews: GeneExpression, Microarray, Bioinformatics 
18
+
0 19
new file mode 100644
... ...
@@ -0,0 +1,4 @@
1
+CoGAPS		Algorithm for coordination of activity in gene sets with patterns from GAPS
2
+GAPS		MCMC matrix decomposition algorithm
3
+calcCoGAPSStat	Computes the CoGAPS gene set statistic
4
+LoadGAPSJAGSLib	Loads the C++ library that performs the GAPS matrix decomposition.  Recommended before any run of CoGAPS.
0 5
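
For orientation, a minimal end-to-end call following the CoGAPS() signature added below in R/CoGAPS.R; the input file name, gene-set list, and parameter values here are placeholders rather than anything shipped with the package:

    library(CoGAPS)

    # hypothetical genes-by-samples expression matrix and gene-set list
    D      <- as.matrix(read.table("expression.txt", sep = "\t", row.names = 1, header = TRUE))
    gsList <- list(setA = c("gene1", "gene2"), setB = c("gene3", "gene4"))

    # 10% multiplicative uncertainty, three patterns; results and plots go to outputDir
    results <- CoGAPS(data = D, unc = 0.1, isPercentError = TRUE,
                      GStoGenes = gsList, numPatterns = 3,
                      outputDir = "CoGAPSout", plot = TRUE)
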
new file mode 100644
... ...
@@ -0,0 +1,10 @@
1
+export(CoGAPS)
2
+export(GAPS)
3
+export(calcCoGAPSStat)
4
+export(plotGAPS)
5
+importFrom(graphics, matplot, title)
6
+importFrom(grDevices, dev.new, dev.off, pdf)
7
+importFrom(methods, is)
8
+importFrom(rjags, jags.model, load.module, jags.samples)  
9
+importFrom(stats, heatmap, runif)
10
+importFrom(utils, read.table, write.table)
0 11
new file mode 100644
... ...
@@ -0,0 +1,2 @@
1
+CoGAPS requires GAPS-JAGS, available from http://www.cancerbiostats.onc.jhmi.edu/cogaps.cfm.  This C++ package is a redistribution of JAGS version 2.1.0 with a module implementing the GAPS matrix decomposition for microarray data.  GAPS-JAGS takes the place of the original JAGS package required for the successful installation and running of the rjags package.  Please see the installation instructions in the user's manual.  If you have any questions, please contact Elana Fertig <ejfertig@jhmi.edu> or Michael Ochs <mfo@jhu.edu>.
2
+
0 3
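
As a quick post-installation check (a suggestion, not something documented by the package itself), one can confirm that rjags sees the bundled "gaps" module, which is the module RunGAPS() loads internally:

    library(rjags)
    load.module("gaps")   # should succeed without error once GAPS-JAGS is installed
    list.modules()        # the listing should now include "gaps"
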
new file mode 100644
... ...
@@ -0,0 +1,120 @@
1
+# obtain mean and SD from output of data
2
+AtomChain2APNorm <- function(Afile, Pfile,
3
+                             nGene, nSample, nPattern,
4
+                             verbose = TRUE) {
5
+
6
+  if (verbose) {
7
+    message(sprintf("Finding the distribution of A and P matrices from output files %s and %s.",
8
+                  Afile, Pfile))
9
+  }
10
+
11
+  # determine the number of iterations from the file length
12
+
13
+  nLinesA <- countLines(Afile)
14
+  nLinesP <- countLines(Pfile)
15
+
16
+  nIterA <- (nLinesA)
17
+  nIterP <- (nLinesP)
18
+
19
+  if ( (nIterA != nIterP) || nIterA <= 1 || nIterP <= 1 ) {
20
+    stop(sprintf("Cannot normalize A and P matrices output at %d and %d iterations",
21
+                 nIterA, nIterP))
22
+  }
23
+
24
+  if (verbose) {
25
+    message("Computing mean of A and P matrices\n")
26
+  }
27
+  Anorm <- matrix(0, nrow = nGene,    ncol=nPattern)
28
+  Pnorm <- matrix(0, nrow = nPattern, ncol=nSample)
29
+
30
+  AConnection <- file(Afile,"r")
31
+  if (!isOpen(AConnection, "r")) {
32
+    stop(paste("Could not open",Afile,"for normalization.",sep=" "))
33
+  }
34
+
35
+  PConnection <- file(Pfile,"r")
36
+  if (!isOpen(PConnection, "r")) {
37
+    stop(paste("Could not open",Pfile,"for normalization.",sep=" "))
38
+  }
39
+  
40
+  for (i in 1:nIterA) {
41
+    AlineData <- scan(AConnection, sep="\t",nlines = 1, strip.white=T, quiet=T)
42
+    PlineData <- scan(PConnection, sep="\t", nlines = 1, strip.white=T, quiet=T)
43
+
44
+    if (length(AlineData) != nGene*nPattern) {
45
+      stop(paste("Read invalid record for A matrix from", Afile, sep=" "))
46
+    }
47
+
48
+    if (length(PlineData) != nPattern*nSample) {
49
+      stop(paste("Read invalid record for P matrix from", Pfile, sep=" "))
50
+    }
51
+
52
+    Atemp <- matrix(AlineData, nrow=nGene, ncol=nPattern)
53
+    Ptemp <- matrix(PlineData, nrow=nPattern, ncol=nSample)
54
+
55
+    normFactor <- apply(Ptemp,1,sum)
56
+    Ptempnorm <- Ptemp / matrix(rep(normFactor, nSample),
57
+                                nrow=nPattern, ncol=nSample)
58
+    Atempnorm <- Atemp * matrix(rep(normFactor, nGene),
59
+                                nrow=nGene, ncol=nPattern, byrow=T)
60
+
61
+    
62
+    Anorm <- Anorm + Atempnorm
63
+    Pnorm <- Pnorm + Ptempnorm
64
+  }
65
+  Anorm <- Anorm / nIterA
66
+  Pnorm <- Pnorm / nIterP
67
+
68
+  close(AConnection)
69
+  close(PConnection)
70
+  
71
+  if (verbose) {
72
+    message("Computing sd of A and P matrices\n")
73
+  }
74
+  Asd <- matrix(0, nrow=nGene, ncol=nPattern)
75
+  Psd <- matrix(0, nrow=nPattern, ncol=nSample)
76
+
77
+  AConnection <- file(Afile,"r")
78
+  if (!isOpen(AConnection, "r")) {
79
+    stop(paste("Could not open",Afile,"for normalization.",sep=" "))
80
+  }
81
+
82
+  PConnection <- file(Pfile,"r")
83
+  if (!isOpen(PConnection, "r")) {
84
+    stop(paste("Could not open",Pfile,"for normalization.",sep=" "))
85
+  }
86
+  
87
+  for (i in 1:nIterA) {
88
+    AlineData <- scan(AConnection, sep="\t",nlines = 1, strip.white=T, quiet=T)
89
+    PlineData <- scan(PConnection, sep="\t", nlines = 1, strip.white=T, quiet=T)
90
+
91
+    if (length(AlineData) != nGene*nPattern) {
92
+      stop(paste("Read invalid record for A matrix from", Afile, sep=" "))
93
+    }
94
+
95
+    if (length(PlineData) != nPattern*nSample) {
96
+      stop(paste("Read invalid record for P matrix from", Pfile, sep=" "))
97
+    }
98
+
99
+    Atemp <- matrix(AlineData, nrow=nGene,    ncol=nPattern)
100
+    Ptemp <- matrix(PlineData, nrow=nPattern, ncol=nSample)
101
+
102
+    normFactor <- apply(Ptemp,1,sum)
103
+    Ptempnorm <- Ptemp / matrix(rep(normFactor, nSample),
104
+                                nrow=nPattern, ncol=nSample)
105
+    Atempnorm <- Atemp * matrix(rep(normFactor, nGene),
106
+                                nrow=nGene, ncol=nPattern, byrow=T)
107
+    
108
+    Asd <- Asd + (Atempnorm - Anorm)^2
109
+    Psd <- Psd + (Ptempnorm - Pnorm)^2
110
+  }
111
+  Asd <- sqrt(Asd / (nIterA - 1))
112
+  Psd <- sqrt(Psd / (nIterP - 1))
113
+  
114
+  close(AConnection)
115
+  close(PConnection)
116
+
117
+  return(list(Amean=Anorm, Pmean=Pnorm, Asd=Asd, Psd=Psd))
118
+  
119
+  
120
+}
0 121
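
The per-iteration rescaling above leaves the reconstruction unchanged: each row of P is normalized to sum to one and the matching column of A is scaled up by the same factor, so A %*% P is preserved. A small self-contained sketch of that identity with made-up matrices:

    set.seed(1)
    Atemp <- matrix(runif(6), nrow = 3, ncol = 2)   # nGene x nPattern
    Ptemp <- matrix(runif(8), nrow = 2, ncol = 4)   # nPattern x nSample

    normFactor <- apply(Ptemp, 1, sum)
    Pnorm <- Ptemp / matrix(rep(normFactor, 4), nrow = 2, ncol = 4)
    Anorm <- Atemp * matrix(rep(normFactor, 3), nrow = 3, ncol = 2, byrow = TRUE)

    all.equal(Atemp %*% Ptemp, Anorm %*% Pnorm)     # TRUE: the fit to D is unchanged
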
new file mode 100644
... ...
@@ -0,0 +1,55 @@
1
+# run the full algorithm
2
+CoGAPS <- function(data, unc, GStoGenes,
3
+                 outputDir, outputBase="",
4
+                 sep = "\t", isPercentError=FALSE,
5
+                 numPatterns,
6
+                 MaxAtomsA=2^32, alphaA=0.01,
7
+                 MaxAtomsP=2^32, alphaP=0.01,
8
+                 SAIter=1000000000, iter = 500000000, thin=-1,
9
+                 nPerm=500, verbose=TRUE, plot=FALSE, keepChain=FALSE) {
10
+
11
+
12
+  # decompose the data
13
+  matrixDecomp <- GAPS(data=data, unc=unc,
14
+                       outputDir=outputDir, outputBase=outputBase, 
15
+		       sep=sep, isPercentError=isPercentError,
16
+                       numPatterns=numPatterns, MaxAtomsA=MaxAtomsA, 
17
+		       alphaA=alphaA, MaxAtomsP=MaxAtomsP,
18
+                       alphaP=alphaP, SAIter=SAIter, iter=iter, thin=thin,
19
+                       verbose=verbose, keepChain=keepChain)
20
+
21
+  # plot patterns and show heatmap of Anorm
22
+  if (plot) {
23
+    plotGAPS(matrixDecomp$Amean, matrixDecomp$Pmean)
24
+  }
25
+
26
+  # compute the gene set scores
27
+  GSP <- calcCoGAPSStat(matrixDecomp$Amean, matrixDecomp$Asd, GStoGenes, nPerm)
28
+
29
+  # output gene set results
30
+  if (outputDir != "") {
31
+     outputGSPBase <- paste(outputDir, outputBase, sep="/")
32
+  } else {
33
+     outputGSPBase <- outputBase
34
+  }
35
+  if (outputBase != "") {
36
+    outputGSPBase <- paste(outputGSPBase, "_", sep="")
37
+  }
38
+  
39
+  UpFile <- paste(outputGSPBase, "GSUpStat.txt", sep="")
40
+  DownFile <- paste(outputGSPBase, "GSDownStat.txt", sep="")
41
+  ActEstFile <- paste(outputGSPBase, "GSActEst.txt", sep="")
42
+
43
+  write.table(GSP$GSUpreg, file=UpFile, sep=sep)
44
+  write.table(GSP$GSDownreg, file=DownFile, sep=sep)
45
+  write.table(GSP$GSActEst, file=ActEstFile, sep=sep)
46
+
47
+  return(list(meanChi2=matrixDecomp$meanChi2,
48
+              D=matrixDecomp$D, Sigma=matrixDecomp$Sigma,
49
+              Amean=matrixDecomp$Amean, Asd=matrixDecomp$Asd,
50
+              Pmean=matrixDecomp$Pmean, Psd=matrixDecomp$Psd,
51
+              meanMock=matrixDecomp$meanMock,
52
+              GSUpreg=GSP$GSUpreg, GSDownreg=GSP$GSDownreg, GSActEst=GSP$GSActEst))
53
+  
54
+  
55
+}
0 56
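
Besides the returned list, the three gene-set statistics are written as tab-delimited files under outputDir; a sketch of reading them back, assuming the placeholder call shown earlier used outputDir = "CoGAPSout" and the default outputBase = "":

    # file names follow the paste() calls above; the "_" prefix is added only
    # when outputBase is non-empty
    gsUp   <- read.table("CoGAPSout/GSUpStat.txt",   sep = "\t")
    gsDown <- read.table("CoGAPSout/GSDownStat.txt", sep = "\t")
    gsAct  <- read.table("CoGAPSout/GSActEst.txt",   sep = "\t")

    # the same matrices are available directly in the return value:
    # results$GSUpreg, results$GSDownreg, results$GSActEst
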
new file mode 100644
... ...
@@ -0,0 +1,287 @@
1
+# driver for the matrix decomposition
2
+GAPS <- function(data, unc,
3
+                 outputDir, outputBase="",
4
+                 sep = "\t", isPercentError=FALSE,
5
+                 numPatterns,
6
+                 MaxAtomsA=2^32, alphaA=0.01,
7
+                 MaxAtomsP=2^32, alphaP=0.01,
8
+                 SAIter=1000000000, iter = 500000000, thin=-1,
9
+                 verbose=TRUE, keepChain = FALSE) {
10
+  
11
+  # formatting data for use by algorithm
12
+  if (verbose) {
13
+    message("Formatting data for GAPS");
14
+  }
15
+
16
+  # process data
17
+  dataInFile = tolower(class(data))=="character"
18
+
19
+  # read in data from a file
20
+  if (dataInFile) {
21
+
22
+    if (verbose) {
23
+      message(paste("Reading in data from", data, sep=" "))
24
+    }
25
+
26
+    D <- as.matrix(read.table(file = data, sep = sep,
27
+                              row.names=1, header=TRUE))
28
+    
29
+  } else {
30
+    # data is input in matrix / array form
31
+    D <- as.matrix(data)
32
+  }
33
+
34
+  if (numPatterns >= nrow(D) || numPatterns >= ncol(D)) {
35
+    stop(sprintf("Cannot decompose data into more patterns (%d) than rows (%d) or columns (%d).", numPatterns, nrow(D), ncol(D)))
36
+  }
37
+  
38
+  # process uncertainty
39
+  sdInFile = tolower(class(unc))=="character"
40
+  if (sdInFile) {
41
+
42
+    # uncertainty is specified in a file
43
+    if (isPercentError) {
44
+      stop(paste("Uncertainty specified in file", unc, 
45
+                  "must be specified as additive error."))
46
+    }
47
+
48
+    if (verbose) {
49
+       message(paste("Reading in uncertainty from", unc, sep=" "))
50
+    }
51
+    
52
+    Sigma <- as.matrix(read.table(file = unc, sep = sep,
53
+                                  row.names=1,header=TRUE))
54
+    
55
+  } else {
56
+
57
+    
58
+    nrunc <- nrow(unc)
59
+    ncunc <- ncol(unc)
60
+    if (length(unc)==1) {
61
+      nrunc <- 1
62
+      ncunc <- 1
63
+    }
64
+    
65
+    # see if input uncertainty is a scalar
66
+    if ( nrunc == 1 && ncunc == 1 ) {
67
+
68
+      if (verbose) {
69
+        message("Using single unc value specified in input.")
70
+      }
71
+
72
+      # compute the error as either fractional or additive
73
+      if (isPercentError) {
74
+        Sigma <- pmax(unc*abs(D),unc)
75
+      } else {
76
+        Sigma <- matrix(data=unc, nrow=nrow(D), ncol=ncol(D))
77
+      }
78
+
79
+      # check that the uncertainty is positive
80
+      if (any(Sigma <= 0)) {
81
+        stop(paste("Uncertainty must be positive (Sigma = ", Sigma, ").", sep=""))
82
+      }
83
+    } else {
84
+
85
+      if (verbose) {
86
+        message("Using matrix unc value specified in input.")
87
+      }
88
+
89
+      # check that dimensions of uncertainty and data match
90
+      if (nrunc != nrow(D) || ncunc != ncol(D)) {
91
+        stop(paste("Dimensions of data (", nrow(D), ", ", ncol(D),
92
+                    ") and uncertainty (", nrow(Sigma), ", ", ncol(Sigma), ") do not agree",
93
+                    sep = ""))
94
+      }
95
+      # check that the uncertainty is positive
96
+      if (any(unc <= 0)) {
97
+        stop(paste("Uncertainty must be positive (unc = ", unc, ").", sep=""))
98
+      }
99
+
100
+      # ensure that matrix errors additive                                                        
101
+      if (isPercentError) {
102
+        stop("Uncertainty specified in an array must be additive error.")
103
+      }
104
+      Sigma <- as.matrix(unc)
105
+
106
+    }
107
+
108
+  }
109
+
110
+  # make a temporary file to outputBase which contains the row and column names
111
+  if (outputDir != "" && outputDir != ".") {
112
+     fullOutputPath <- paste(outputDir,outputBase,sep="/")
113
+     if (!file.exists(outputDir)) {
114
+        dir.create(outputDir)
115
+     }
116
+  } else {
117
+     fullOutputPath <- outputBase
118
+  }
119
+  
120
+
121
+  if (verbose) {
122
+    message(paste("Decomposing data of size:",
123
+                nrow(D), "by", ncol(D),
124
+                "into", numPatterns, "patterns.", sep=" "))
125
+  } 
126
+  
127
+  # get expected mass of each matrix element based on data
128
+  if (verbose) {
129
+    message("Initializing distribution parameters")
130
+  }
131
+
132
+  # computing the mean mass of atoms in A and P based on the data matrix
133
+  lambdaA <- alphaA * sqrt(numPatterns) / sqrt(mean(D)) 
134
+  lambdaP <- alphaP * sqrt(numPatterns) / sqrt(mean(D))
135
+
136
+  # estimate number of outputs based on expected number of matrix elements
137
+  if (thin <= 0) {
138
+    if (iter < 10000) {
139
+      outThin <- 1
140
+    } else {
141
+      outThin <- iter/10000
142
+    }
143
+  } else {
144
+    outThin <- thin
145
+  }
146
+
147
+  # assign the unique file identifier to be the number after the decimal
148
+  if (floor(outThin) - outThin < 1.e-10) {
149
+    outThin <- outThin + runif(1)
150
+  }
151
+
152
+
153
+  # initialize output arrays, including chain dimensions
154
+  Amean   <- array(0., c(nrow(D), numPatterns))
155
+  row.names(Amean) <- row.names(D)
156
+  colnames(Amean) <- paste("p",1:numPatterns,sep="")
157
+  Asd     <- array(0., c(nrow(D), numPatterns))
158
+  row.names(Asd) <- row.names(D)
159
+  colnames(Asd) <- paste("p",1:numPatterns,sep="")
160
+  Pmean   <- array(0., c(numPatterns, ncol(D)))
161
+  row.names(Pmean) <-   paste("p",1:numPatterns,sep="")
162
+  colnames(Pmean) <- colnames(D)
163
+  Psd     <- array(0., c(numPatterns, ncol(D)))
164
+  row.names(Psd) <- paste("p",1:numPatterns,sep="")
165
+  colnames(Psd) <- colnames(D)
166
+
167
+  meanChi2 <- 0. 
168
+  meanMock <- array(0., c(nrow(D),ncol(D)))
169
+
170
+  
171
+  if (verbose) {
172
+    message(paste("Running GAPS."))
173
+    message(paste(SAIter, "burn-in steps;", iter, "steps.", sep=" "))
174
+  }
175
+    
176
+  # run the mcmc
177
+  mdout <- RunGAPS(D, Sigma, numPatterns,
178
+                   MaxAtomsA, alphaA, lambdaA,
179
+                   MaxAtomsP, alphaP, lambdaP,
180
+                   SAIter, iter, outThin)
181
+
182
+  if (verbose) {
183
+    message(paste("Normalizing results."))
184
+  }
185
+
186
+  # obtain the mean and standard deviation of A and P from the chain files
187
+  FID <- sprintf('%.10f', outThin - floor(outThin))
188
+  AFile <- paste("AResults", FID, ".ChainValues.txt", sep="")
189
+  PFile <- paste("PResults", FID, ".ChainValues.txt", sep="")
190
+  normmd <- AtomChain2APNorm(Afile      = AFile,
191
+                             Pfile      = PFile,
192
+                             nGene      = nrow(D),
193
+                             nPattern   = numPatterns,
194
+                             nSample    = ncol(D))
195
+    
196
+  # extract chain data and store in above arrays
197
+  Amean[,] <- normmd$Amean
198
+  Asd[,] <- normmd$Asd
199
+  Pmean[,] <- normmd$Pmean
200
+  Psd[,] <- normmd$Psd
201
+
202
+  write.table(Amean[,],
203
+              file=paste(fullOutputPath,"Amean.",FID,".txt",sep=""),
204
+              sep="\t")
205
+  write.table(Asd[,],
206
+              file=paste(fullOutputPath,"Asd.",FID,".txt",sep=""),
207
+              sep="\t")
208
+  write.table(Pmean[,],
209
+              file=paste(fullOutputPath,"Pmean.",FID,".txt",sep=""),
210
+              sep="\t")
211
+  write.table(Psd[,],
212
+              file=paste(fullOutputPath,"Psd.",FID,".txt",sep=""),
213
+              sep="\t")
214
+    
215
+    
216
+  if (verbose) {
217
+    message(paste("Computing chi2")) 
218
+  }
219
+    
220
+  atmp <- as.matrix(Amean[,])
221
+  ptmp <- as.matrix(Pmean[,])
222
+  mtmp <- atmp %*% ptmp
223
+  meanMock[,] = array(mtmp, c(nrow(D), ncol(D), 1))
224
+  meanChi2 <- sum((D-meanMock[,])^2 / Sigma^2)
225
+
226
+  # process the diagnostic file and add appropriate means
227
+  ADiagFile <- paste("AResults",FID,".Diagnostics.txt",sep="")
228
+  PDiagFile <- paste("PResults",FID,".Diagnostics.txt",sep="")
229
+
230
+  ADiag <- read.table(ADiagFile, sep="\t", header=T, skip=10)
231
+  PDiag <- read.table(PDiagFile, sep="\t", header=T, skip=10)
232
+
233
+  chainKeepA  <- which(ADiag[,2]==0)
234
+  meanOfChi2A <- mean(ADiag[chainKeepA,4])
235
+  meanAtomsA  <- mean(ADiag[chainKeepA,5])
236
+
237
+  chi2DiagA <- paste("<number of atoms>", meanAtomsA,
238
+                     "<chi^2>:", meanOfChi2A,
239
+                     "; chi^2 of mean:", meanChi2, sep=" ")
240
+  cat("\n\nSummary\n", file=ADiagFile, append=TRUE)
241
+  cat(chi2DiagA, file=ADiagFile, append=TRUE)
242
+  cat("\n", file=ADiagFile, append=TRUE)
243
+    
244
+  chainKeepP  <- which(PDiag[,2]==0)
245
+  meanOfChi2P <- mean(PDiag[chainKeepP,4])
246
+  meanAtomsP  <- mean(PDiag[chainKeepP,5])
247
+
248
+  chi2DiagP <- paste("<number of atoms>", meanAtomsP,
249
+                     "<chi^2>:", meanOfChi2P,
250
+                     "; chi^2 of mean:", meanChi2, sep=" ")
251
+  cat("\n\nSummary\n", file=PDiagFile, append=TRUE)
252
+  cat(chi2DiagP, file=PDiagFile, append=TRUE)
253
+  cat("\n", file=PDiagFile, append=TRUE)
254
+    
255
+  if (fullOutputPath != "") {
256
+     file.rename(ADiagFile, paste(fullOutputPath, basename(ADiagFile), sep=""))
257
+     file.rename(PDiagFile, paste(fullOutputPath, basename(PDiagFile), sep=""))
258
+  }
259
+
260
+  # delete chain files or move to appropriate directory
261
+  if (!keepChain) {
262
+    message("Deleting chain files")
263
+    file.remove(AFile)
264
+    file.remove(PFile)
265
+  } else {
266
+    message(paste('Moving chain files to ', outputDir, sep=' '))
267
+    if (fullOutputPath != "") {
268
+       file.rename(AFile, paste(fullOutputPath, basename(AFile),sep=""))
269
+       file.rename(PFile, paste(fullOutputPath, basename(PFile),sep=""))
270
+    }
271
+  }
272
+  
273
+
274
+  if (verbose) {
275
+    message("Completed simulation.\n")
276
+    message("Return list contains chi^2 of solution and normalized, mean and standard deviation of A and P matrices.\n")
277
+  }
278
+
279
+  # output list of data for each chain
280
+  return(list(meanChi2=meanChi2,
281
+              D=D, Sigma=Sigma,
282
+              Amean=Amean, Asd=Asd,
283
+              Pmean=Pmean, Psd=Psd,
284
+              meanMock=meanMock))
285
+
286
+}
287
+                          
0 288
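
GAPS() can also be called on its own when only the matrix decomposition is needed; a sketch with the same placeholder inputs as the earlier example (the shortened SAIter and iter values are illustrative only, not recommendations):

    decomp <- GAPS(data = D, unc = 0.1, isPercentError = TRUE,
                   numPatterns = 3, outputDir = "GAPSout",
                   SAIter = 1e7, iter = 1e7)

    dim(decomp$Amean)   # nrow(D) x numPatterns, normalized amplitude means
    dim(decomp$Pmean)   # numPatterns x ncol(D), rows normalized to sum to one
    decomp$meanChi2     # chi^2 of D against the mean reconstruction Amean %*% Pmean
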
new file mode 100644
... ...
@@ -0,0 +1,44 @@
1
+# formats raw table data into a format required for MCMC with JAGS
2
+GetDataListGAPS <- function(data, unc, numPatterns,
3
+                            MaxAtomsA, alphaA, lambdaA, 
4
+                            MaxAtomsP, alphaP, lambdaP, SAIter, thin) {
5
+
6
+  # convert to matrix for output
7
+  D <- as.matrix(data)
8
+
9
+  # process the uncertainty
10
+  S <- as.matrix(unc)
11
+
12
+  # check the dimensionality
13
+  if (nrow(S) != nrow(D) || ncol(S) != ncol(D)) {
14
+    stop(cat("Dimensions of data (",
15
+             nrow(D), ", ", ncol(D),
16
+             ") must match dimensions of uncertainty (",
17
+             nrow(S), ", ", ncol(S), ").\n"))
18
+  }
19
+
20
+  # check that D and S meet the positivity requirements
21
+  if (any(D < 0)) {
22
+    warning("Data should be non-negative.\n");
23
+  }
24
+
25
+  if (any(S <= 0)) {
26
+    stop("Uncertainty must be strictly positive.\n");
27
+  }
28
+    
29
+
30
+  datalist = list(
31
+    numGenes = nrow(data), numSamples = ncol(data),
32
+    numPatterns = numPatterns,
33
+    D = D, S = S,
34
+    ADim = matrix(data=1,nrow=nrow(data), ncol=numPatterns),
35
+    MaxAtomsA = MaxAtomsA, alphaA = alphaA, lambdaA = lambdaA,
36
+    PDim = matrix(data=1,nrow=numPatterns, ncol=ncol(data)),
37
+    MaxAtomsP = MaxAtomsP, alphaP = alphaP, lambdaP = lambdaP,
38
+    SAIterA = SAIter, SAIterP = SAIter, fileID = thin)
39
+
40
+  return(datalist)
41
+  
42
+
43
+  
44
+}
0 45
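
For reference, a sketch of the list this helper hands to rjags for a toy 10 x 5 data matrix with two patterns (the helper is internal, so outside the package namespace it would have to be reached as CoGAPS:::GetDataListGAPS; all values below are made up):

    D <- matrix(abs(rnorm(50, mean = 10)), nrow = 10, ncol = 5)
    S <- 0.1 * D
    dataList <- GetDataListGAPS(D, S, numPatterns = 2,
                                MaxAtomsA = 2^32, alphaA = 0.01, lambdaA = 0.01,
                                MaxAtomsP = 2^32, alphaP = 0.01, lambdaP = 0.01,
                                SAIter = 1000, thin = 1.5)
    str(dataList)   # D, S, ADim (10 x 2 of ones), PDim (2 x 5 of ones),
                    # prior parameters, SAIterA/SAIterP, and fileID (= thin)
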
new file mode 100644
... ...
@@ -0,0 +1,55 @@
1
+# runs matrix decomposition with MCMC using rjags
2
+RunGAPS <- function(data, unc, numPatterns,
3
+                    MaxAtomsA, alphaA, lambdaA, 
4
+                    MaxAtomsP, alphaP, lambdaP, 
5
+                    SAIter, iter, thin=1) {
6
+
7
+
8
+  # load in the matrix decomposition module
9
+  load.module(c("gaps"))
10
+
11
+  # get the data needed for MCMC
12
+  matrixDecompData <- GetDataListGAPS(data, unc, numPatterns,
13
+                                      MaxAtomsA, alphaA, lambdaA,
14
+                                      MaxAtomsP, alphaP, lambdaP,
15
+                                      SAIter, thin)
16
+  matrixDecompInits <- list(A = matrix(
17
+                              data=0, matrixDecompData$numGenes, numPatterns),
18
+                            P = matrix(
19
+                              data=0, numPatterns, matrixDecompData$numSamples))
20
+
21
+  # create the model file used by this decomposition
22
+  FID <- sprintf('%.10f', thin - floor(thin))
23
+  ModelFile <- paste("GAPS", FID, "bug", sep=".")
24
+
25
+  cat("var\n", file=ModelFile)
26
+  cat("\tA[numGenes, numPatterns], ADim[numGenes, numPatterns],\n",
27
+      file=ModelFile, append=TRUE)
28
+  cat("\tP[numPatterns, numSamples], PDim[numPatterns, numSamples],\n",
29
+      file=ModelFile, append=TRUE)
30
+  cat("\tD[numGenes, numSamples], S[numGenes, numSamples];\n",
31
+      file=ModelFile, append=TRUE)
32
+  cat("\nmodel {\n", file=ModelFile, append=TRUE)
33
+  cat("A[,] ~ datomicgibbs(ADim, MaxAtomsA, alphaA, lambdaA, SAIterA, fileID)\n", file=ModelFile, append=TRUE)
34
+  cat("P[,] ~ datomicgibbs(PDim, MaxAtomsP, alphaP, lambdaP, SAIterP, fileID)\n", file=ModelFile, append=TRUE)
35
+  cat("D[,] ~ gapsnorm(A,P,S)\n", file=ModelFile, append=TRUE)
36
+  cat("}\n", file=ModelFile, append=TRUE) 
37
+
38
+  # set the model being used
39
+  matrixDecompModel <- jags.model(file=ModelFile,
40
+                                  data=matrixDecompData,
41
+                                  inits=matrixDecompInits,
42
+                                  n.adapt=SAIter)
43
+  # update and specify which nodes are monitored
44
+  matrixDecompOutput <- jags.samples(model=matrixDecompModel,
45
+                                     variable.names=c('A','P'),  
46
+                                     n.iter=iter,
47
+                                     thin=iter)
48
+
49
+  file.remove(ModelFile)
50
+  
51
+  return(matrixDecompOutput)
52
+  
53
+
54
+}
55
+
0 56
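
For reference, the temporary model file GAPS.<FID>.bug assembled by the cat() calls above has the following contents (written verbatim by the code, with FID taken from the fractional part of thin):

    var
    	A[numGenes, numPatterns], ADim[numGenes, numPatterns],
    	P[numPatterns, numSamples], PDim[numPatterns, numSamples],
    	D[numGenes, numSamples], S[numGenes, numSamples];

    model {
    A[,] ~ datomicgibbs(ADim, MaxAtomsA, alphaA, lambdaA, SAIterA, fileID)
    P[,] ~ datomicgibbs(PDim, MaxAtomsP, alphaP, lambdaP, SAIterP, fileID)
    D[,] ~ gapsnorm(A,P,S)
    }
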
new file mode 100644
... ...
@@ -0,0 +1,99 @@
1
+calcCoGAPSStat <- function (Amean, Asd, GStoGenes, numPerm=500) {
2
+
3
+  # calculate Z scores
4
+  zMatrix <- calcZ(Amean,Asd)
5
+  
6
+  # compute the p-value for each gene set belonging to each pattern
7
+
8
+  # check input arguments
9
+  if (!is(GStoGenes, "data.frame") && !is(GStoGenes, "list")) {
10
+    stop("GStoGenes must be a data.frame or list with format specified in the users manual.")
11
+  }
12
+	
13
+  if (is(GStoGenes, "list")) {
14
+    GStoGenesList <- GStoGenes
15
+  } else {
16
+    GStoGenesList <- list()
17
+    for (i in 1:dim(GStoGenes)[2]) {
18
+      GStoGenesList[[as.character(colnames(GStoGenes)[i])]] <- as.character(unique(GStoGenes[,i]))
19
+    }
20
+  }
21
+  
22
+  # get dimensions
23
+  numGS   <- length(names(GStoGenesList))
24
+  numPatt <- dim(zMatrix)[2]
25
+  numG    <- dim(zMatrix)[1]+0.9999
26
+
27
+  # initialize matrices
28
+  statsUp   <- matrix(ncol = numGS, nrow = numPatt)
29
+  statsDown <- matrix(ncol = numGS, nrow = numPatt)
30
+  actEst    <- matrix(ncol = numGS, nrow = numPatt)
31
+  results   <- new.env()
32
+  zPerm     <- matrix(ncol=numPerm,nrow=numPatt)
33
+
34
+  # do permutation test
35
+  for (gs in 1:numGS) {
36
+    genes <- GStoGenesList[[names(GStoGenesList)[gs]]]
37
+    index <- rownames(zMatrix) %in% genes
38
+    zValues <- zMatrix[index,1]
39
+    numGenes <- length(zValues)
40
+    label <- as.character(numGenes)
41
+    
42
+    if (!exists(label,env=results)) {
43
+      for (p in 1:numPatt) {
44
+        for (j in 1:numPerm) {
45
+          temp <- floor(runif(numGenes,1,numG))
46
+          temp2 <- zMatrix[temp,p]
47
+          zPerm[p,j] <- mean(temp2)
48
+        }
49
+      }
50
+      assign(label,zPerm,env=results)
51
+    }
52
+  }
53
+
54
+  # get p-value
55
+  for (p in 1:numPatt) {
56
+
57
+    for (gs in 1:numGS) {
58
+
59
+      genes <- GStoGenesList[[names(GStoGenesList)[gs]]]
60
+      index <- rownames(zMatrix) %in% genes
61
+      zValues <- zMatrix[index,p]
62
+      zScore <- mean(zValues)
63
+
64
+      numGenes <- length(zValues)
65
+      label <- as.character(numGenes)
66
+
67
+      permzValues <- get(label, env=results)[p,]
68
+      ordering <- order(permzValues)
69
+      permzValues <- permzValues[ordering]
70
+
71
+      statistic <- sum(zScore > permzValues)
72
+      statUpReg <- 1 - statistic/length(permzValues) 
73
+      statsUp[p,gs] <- max(statUpReg, 1/numPerm)
74
+
75
+      statistic <- sum(zScore < permzValues)
76
+      statDownReg <- 1 - statistic/length(permzValues) 
77
+      statsDown[p,gs] <- max(statDownReg,1/numPerm)
78
+
79
+      activity <- 1 - 2*max(statUpReg, 1/numPerm)
80
+      actEst[p,gs] <- activity
81
+    }
82
+
83
+  }
84
+
85
+  # format output
86
+  colnames(statsUp) <- names(GStoGenesList)
87
+  colnames(statsDown) <- names(GStoGenesList)
88
+  colnames(actEst) <- names(GStoGenesList)
89
+
90
+  rownames(statsUp) <- colnames(zMatrix)
91
+  rownames(statsDown) <- colnames(zMatrix)
92
+  rownames(actEst) <- colnames(zMatrix)
93
+
94
+  assign('GSUpreg', statsUp, env=results)
95
+  assign('GSDownreg', statsDown, env=results)
96
+  assign('GSActEst', actEst, env=results)
97
+  
98
+  return(results)
99
+}
0 100
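
calcCoGAPSStat() can also be applied directly to the normalized amplitude mean and standard deviation returned by GAPS(); note that the return value is an environment, so the summary matrices are pulled out with $ or get(). A sketch reusing the placeholder objects from the earlier examples:

    gsStat <- calcCoGAPSStat(decomp$Amean, decomp$Asd, GStoGenes = gsList, numPerm = 500)

    gsStat$GSUpreg     # patterns x gene sets, permutation p-values for up-regulation
    gsStat$GSDownreg   # permutation p-values for down-regulation
    gsStat$GSActEst    # activity estimates, ranging from -1 to 1
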
new file mode 100644
... ...
@@ -0,0 +1,25 @@
1
+calcZ <- function (meanMat, sdMat) {
2
+
3
+  # compute the z score for each gene's association to each pattern
4
+
5
+  # find matrix dimensions
6
+  nrows <- dim(meanMat)[1]
7
+  ncols <- dim(meanMat)[2]
8
+
9
+  check <- dim(sdMat)[1]
10
+  if (nrows != check) {
11
+    stop("Number of rows in the mean and standard deviation of A do not agree.")
12
+  }
13
+
14
+  check <- dim(sdMat)[2]
15
+  if (ncols != check) {
16
+    stop("Number of columns in the mean and standard deviation of A do not agree.")
17
+  }
18
+
19
+  # compute the matrix of z scores
20
+  zMat <- meanMat/sdMat
21
+  rownames(zMat) <- rownames(meanMat)
22
+  colnames(zMat) <- colnames(meanMat)
23
+
24
+  return(zMat)
25
+}
0 26
new file mode 100644
... ...
@@ -0,0 +1,26 @@
1
+# plot the decomposed A and P matrices
2
+plotGAPS <- function(A, P, outputPDF="") {
3
+	if (outputPDF != "") {
4
+	  pdf(file=paste(outputPDF,"_Patterns",".pdf",sep=""))
5
+	} else {
6
+	  dev.new()
7
+	}
8
+
9
+	arrayIdx <- 1:ncol(P)
10
+	matplot(arrayIdx, t(P), type='l', lwd=10, xlim=c(1,ncol(P)), ylim=c(0,1))
11
+	title(main='Inferred patterns')
12
+  
13
+	if (outputPDF == "") {
14
+	  dev.new()
15
+	} else {
16
+	  dev.off()
17
+	  pdf(file=paste(outputPDF,"_Amplitude",".pdf",sep=""))
18
+	}
19
+
20
+	heatmap(A, Rowv=NA, Colv=NA) 
21
+	
22
+	if (outputPDF != "") {
23
+		dev.off()
24
+	}
25
+  
26
+}
0 27
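
plotGAPS() takes the amplitude and pattern matrices directly, so a saved decomposition can be re-plotted later; with a non-empty outputPDF prefix (a placeholder below) the figures go to <prefix>_Patterns.pdf and <prefix>_Amplitude.pdf instead of the screen:

    plotGAPS(decomp$Amean, decomp$Pmean)                       # on-screen devices
    plotGAPS(decomp$Amean, decomp$Pmean, outputPDF = "myRun")  # writes the two PDFs
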
new file mode 100644
1 28
Binary files /dev/null and b/data/EasySimGS.RData differ
2 29
new file mode 100644
3 30
Binary files /dev/null and b/data/GIST_TS_20084.RData differ
4 31
new file mode 100644
5 32
Binary files /dev/null and b/data/ModSim.RData differ
6 33
new file mode 100644
7 34
Binary files /dev/null and b/data/TFGSList.RData differ
8 35
new file mode 100644
... ...
@@ -0,0 +1,16 @@
1
+citHeader("To cite package 'CoGAPS' in publications use:")
2
+
3
+desc <- packageDescription("CoGAPS")
4
+year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", desc$Date)
5
+vers <- paste("R package version", desc$Version)
6
+
7
+citEntry(entry="article",
8
+         title = "{CoGAPS: an integrated R/C++ package to identify overlapping patterns of activation of biological processes from expression data}",
9
+         author = personList(as.person("Elana J. Fertig"),
10
+           as.person("Jie Ding"), as.person("Alexander V. Favorov"),
11
+           as.person("Giovanni Parmigiani"), as.person("Michael F. Ochs")),
12
+         year = 2010,
13
+         journal = "Bioinformatics",
14
+         note = "Manuscript Submitted",
15
+         textVersion = "E.J. Fertig, J. Ding, A.V. Favorov, G. Parmigiani, and M.F. Ochs (2010) CoGAPS: an integrated R/C++ package to identify overlapping patterns of activation of biological processes from expression data.  Manuscript Submitted to Bioinformatics.")
16
+           
0 17
new file mode 100644
... ...
@@ -0,0 +1,529 @@
1
+%% This BibTeX bibliography file was created using BibDesk.
2
+%% http://bibdesk.sourceforge.net/
3
+
4
+
5
+%% Created for Elana Fertig at 2010-02-04 11:48:54 -0500 
6
+
7
+
8
+%% Saved with string encoding Unicode (UTF-8) 
9
+
10
+
11
+
12
+@article{Tavazoie1999,
13
+	Abstract = {Technologies to measure whole-genome mRNA abundances and methods to organize and display such data are emerging as valuable tools for systems-level exploration of transcriptional regulatory networks. For instance, it has been shown that mRNA data from 118 genes, measured at several time points in the developing hindbrain of mice, can be hierarchically clustered into various patterns (or 'waves') whose members tend to participate in common processes. We have previously shown that hierarchical clustering can group together genes whose cis-regulatory elements are bound by the same proteins in vivo. Hierarchical clustering has also been used to organize genes into hierarchical dendograms on the basis of their expression across multiple growth conditions. The application of Fourier analysis to synchronized yeast mRNA expression data has identified cell-cycle periodic genes, many of which have expected cis-regulatory elements. Here we apply a systematic set of statistical algorithms, based on whole-genome mRNA data, partitional clustering and motif discovery, to identify transcriptional regulatory sub-networks in yeast-without any a priori knowledge of their structure or any assumptions about their dynamics. This approach uncovered new regulons (sets of co-regulated genes) and their putative cis-regulatory elements. We used statistical characterization of known regulons and motifs to derive criteria by which we infer the biological significance of newly discovered regulons and motifs. Our approach holds promise for the rapid elucidation of genetic network architecture in sequenced organisms in which little biology is known.},
14
+	Address = {Department of Genetics, Harvard Medical School, Boston, Massachusetts 02115, USA.},
15
+	Au = {Tavazoie, S and Hughes, JD and Campbell, MJ and Cho, RJ and Church, GM},
16
+	Author = {Tavazoie, S. and Hughes, J.D. and Campbell, M.J. and Cho, R.J. and Church, G.M.},
17
+	Cin = {Nat Genet. 1999 Jul;22(3):213-5. PMID: 10391202},
18
+	Crdt = {1999/07/03 10:00},
19
+	Da = {19990719},
20
+	Date-Added = {2010-02-04 11:43:20 -0500},
21
+	Date-Modified = {2010-02-04 11:48:43 -0500},
22
+	Dcom = {19990719},
23
+	Doi = {10.1038/10343},
24
+	Edat = {1999/07/03 10:00},
25
+	Issn = {1061-4036 (Print); 1061-4036 (Linking)},
26
+	Jid = {9216904},
27
+	Journal = {Nat Genet},
28
+	Jt = {Nature genetics},
29
+	Language = {eng},
30
+	Lr = {20061115},
31
+	Mh = {Animals; Cell Cycle/genetics; DNA/genetics; Gene Expression; *Genetic Techniques; Mice; Multigene Family; Open Reading Frames; RNA, Messenger/genetics/metabolism; Rhombencephalon/growth \& development/metabolism; Saccharomyces cerevisiae/cytology/genetics/metabolism},
32
+	Mhda = {2001/03/23 10:01},
33
+	Number = {3},
34
+	Own = {NLM},
35
+	Pages = {281--285},
36
+	Pl = {UNITED STATES},
37
+	Pmid = {10391217},
38
+	Pst = {ppublish},
39
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.},
40
+	Rn = {0 (RNA, Messenger); 9007-49-2 (DNA)},
41
+	Sb = {IM},
42
+	So = {Nat Genet. 1999 Jul;22(3):281-5. },
43
+	Stat = {MEDLINE},
44
+	Title = {Systematic determination of genetic network architecture.},
45
+	Volume = {22},
46
+	Year = {1999},
47
+	Bdsk-Url-1 = {http://dx.doi.org/10.1038/10343}}
48
+
49
+@article{Goeman2007,
50
+	Abstract = {MOTIVATION: Many statistical tests have been proposed in recent years for analyzing gene expression data in terms of gene sets, usually from Gene Ontology. These methods are based on widely different methodological assumptions. Some approaches test differential expression of each gene set against differential expression of the rest of the genes, whereas others test each gene set on its own. Also, some methods are based on a model in which the genes are the sampling units, whereas others treat the subjects as the sampling units. This article aims to clarify the assumptions behind different approaches and to indicate a preferential methodology of gene set testing. RESULTS: We identify some crucial assumptions which are needed by the majority of methods. P-values derived from methods that use a model which takes the genes as the sampling unit are easily misinterpreted, as they are based on a statistical model that does not resemble the biological experiment actually performed. Furthermore, because these models are based on a crucial and unrealistic independence assumption between genes, the P-values derived from such methods can be wildly anti-conservative, as a simulation experiment shows. We also argue that methods that competitively test each gene set against the rest of the genes create an unnecessary rift between single gene testing and gene set testing.},
51
+	Address = {Department of Medical Statistics and Bioinformatics, Leiden University Medical Center, Leiden, The Netherlands. j.j.goeman@lumc.nl},
52
+	Au = {Goeman, JJ and Buhlmann, P},
53
+	Author = {Goeman, J.J. and Buhlmann, P.},
54
+	Crdt = {2007/02/17 09:00},
55
+	Da = {20070501},
56
+	Date-Added = {2010-02-04 11:42:46 -0500},
57
+	Date-Modified = {2010-02-04 11:48:51 -0500},
58
+	Dcom = {20070522},
59
+	Dep = {20070215},
60
+	Doi = {10.1093/bioinformatics/btm051},
61
+	Edat = {2007/02/17 09:00},
62
+	Issn = {1367-4811 (Electronic); 1367-4803 (Linking)},
63
+	Jid = {9808944},
64
+	Journal = {Bioinformatics},
65
+	Jt = {Bioinformatics (Oxford, England)},
66
+	Keywords = {gene sets},
67
+	Language = {eng},
68
+	Lr = {20091104},
69
+	Mh = {*Algorithms; *Artifacts; *Data Interpretation, Statistical; *Databases, Genetic; Gene Expression Profiling/*methods; Information Storage and Retrieval/*methods; Reproducibility of Results; Sensitivity and Specificity},
70
+	Mhda = {2007/05/23 09:00},
71
+	Number = {8},
72
+	Own = {NLM},
73
+	Pages = {980--987},
74
+	Phst = {2007/02/15 {$[$}aheadofprint{$]$}},
75
+	Pii = {btm051},
76
+	Pl = {England},
77
+	Pmid = {17303618},
78
+	Pst = {ppublish},
79
+	Pt = {Comparative Study; Evaluation Studies; Journal Article},
80
+	Sb = {IM},
81
+	So = {Bioinformatics. 2007 Apr 15;23(8):980-7. Epub 2007 Feb 15. },
82
+	Stat = {MEDLINE},
83
+	Title = {Analyzing gene expression data in terms of gene sets: methodological issues.},
84
+	Volume = {23},
85
+	Year = {2007},
86
+	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btm051}}
87
+
88
+@article{Bidaut2004,
89
+	Abstract = {ClutrFree facilitates the visualization and interpretation of clusters or patterns computed from microarray data through a graphical user interface that displays patterns, membership information of the genes and annotation statistics simultaneously. ClutrFree creates a tree linking the patterns based on similarity, permitting the navigation among patterns identified by different algorithms or by the same algorithm with different parameters, and aids the inferring of conclusions from a microarray experiment. AVAILABILITY: The ClutrFree Java source code and compiled bytecode are available as a package under the GNU General Public License at http://bioinformatics.fccc.edu},
90
+	Address = {Division of Population Science, Fox Chase Cancer Center, 333 Cottman Avenue, Philadelphia, PA 19111, USA.},
91
+	Au = {Bidaut, G and Ochs, MF},
92
+	Author = {Bidaut, G. and Ochs, M.F.},
93
+	Crdt = {2004/05/18 05:00},
94
+	Da = {20041101},
95
+	Date-Added = {2010-02-02 12:36:53 -0500},
96
+	Date-Modified = {2010-02-02 16:00:15 -0500},
97
+	Dcom = {20050210},
98
+	Dep = {20040514},
99
+	Doi = {10.1093/bioinformatics/bth307},
100
+	Edat = {2004/05/18 05:00},
101
+	Gr = {CA06927/CA/NCI NIH HHS/United States},
102
+	Issn = {1367-4803 (Print); 1367-4803 (Linking)},
103
+	Jid = {9808944},
104
+	Journal = {Bioinformatics},
105
+	Jt = {Bioinformatics (Oxford, England)},
106
+	Language = {eng},
107
+	Lr = {20071114},
108
+	Mh = {*Algorithms; *Cluster Analysis; Computer Graphics; Oligonucleotide Array Sequence Analysis/*methods; Sequence Alignment/*methods; Sequence Analysis, DNA/*methods; *Software; *User-Computer Interface},
109
+	Mhda = {2005/02/11 09:00},
110
+	Number = {16},
111
+	Own = {NLM},
112
+	Pages = {2869--2871},
113
+	Phst = {2004/05/14 {$[$}aheadofprint{$]$}},
114
+	Pii = {bth307},
115
+	Pl = {England},
116
+	Pmid = {15145813},
117
+	Pst = {ppublish},
118
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, P.H.S.},
119
+	Sb = {IM},
120
+	So = {Bioinformatics. 2004 Nov 1;20(16):2869-71. Epub 2004 May 14. },
121
+	Stat = {MEDLINE},
122
+	Title = {ClutrFree: cluster tree visualization and interpretation.},
123
+	Volume = {20},
124
+	Year = {2004},
125
+	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/bth307}}
126
+
127
+@article{Carvalho2008,
128
+	Author = {Carvalho, C.M. and Chang, J. and Lucas, J. and Nevins, J.R. and Wang, Q. and West, M. },
129
+	Date-Added = {2010-02-02 12:36:36 -0500},
130
+	Date-Modified = {2010-02-02 12:36:36 -0500},
131
+	Journal = {J. Am. Stat. Assoc. },
132
+	Pages = {1438 - 1456},
133
+	Title = {High-dimensional sparse factor modelling: Applications in gene  expression genomics},
134
+	Volume = {103},
135
+	Year = {2008}}
136
+
137
+@article{Lee1999,
138
+	Abstract = {Is perception of the whole based on perception of its parts? There is psychological and physiological evidence for parts-based representations in the brain, and certain computational theories of object recognition rely on such representations. But little is known about how brains or computers might learn the parts of objects. Here we demonstrate an algorithm for non-negative matrix factorization that is able to learn parts of faces and semantic features of text. This is in contrast to other methods, such as principal components analysis and vector quantization, that learn holistic, not parts-based, representations. Non-negative matrix factorization is distinguished from the other methods by its use of non-negativity constraints. These constraints lead to a parts-based representation because they allow only additive, not subtractive, combinations. When non-negative matrix factorization is implemented as a neural network, parts-based representations emerge by virtue of two properties: the firing rates of neurons are never negative and synaptic strengths do not change sign.},
139
+	Address = {Bell Laboratories, Lucent Technologies, Murray Hill, New Jersey 07974, USA.},
140
+	Author = {Lee, D.D. and Seung, H.S.},
141
+	Cin = {Nature. 1999 Oct 21;401(6755):759. PMID: 10548097; Nature. 1999 Oct 21;401(6755):759-60. PMID: 10548098},
142
+	Crdt = {1999/11/05 08:00},
143
+	Da = {19991116},
144
+	Date = {1999 Oct 21},
145
+	Date-Added = {2010-02-02 12:35:49 -0500},
146
+	Date-Modified = {2010-02-02 15:57:17 -0500},
147
+	Dcom = {19991116},
148
+	Doi = {10.1038/44565},
149
+	Edat = {1999/11/05 08:00},
150
+	Issn = {0028-0836 (Print)},
151
+	Jid = {0410462},
152
+	Journal = {Nature},
153
+	Jt = {Nature},
154
+	Language = {eng},
155
+	Lr = {20061115},
156
+	Mh = {*Algorithms; Face; Humans; *Learning; Models, Neurological; Perception/physiology; Semantics},
157
+	Mhda = {2001/03/23 10:01},
158
+	Number = {6755},
159
+	Own = {NLM},
160
+	Pages = {788--791},
161
+	Pl = {ENGLAND},
162
+	Pmid = {10548103},
163
+	Pst = {ppublish},
164
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't},
165
+	Sb = {IM},
166
+	Status = {MEDLINE},
167
+	Title = {Learning the parts of objects by non-negative matrix factorization.},
168
+	Volume = {401},
169
+	Year = {1999},
170
+	Bdsk-Url-1 = {http://dx.doi.org/10.1038/44565}}
171
+
172
+@article{Kossenkov2009,
173
+	Abstract = {We explore a number of matrix factorization methods in terms of their ability to identify signatures of biological processes in a large gene expression study. We focus on the ability of these methods to find signatures in terms of gene ontology enhancement and on the interpretation of these signatures in the samples. Two Bayesian approaches, Bayesian Decomposition (BD) and Bayesian Factor Regression Modeling (BFRM), perform best. Differences in the strength of the signatures between the samples suggest that BD will be most useful for systems modeling and BFRM for biomarker discovery.},
174
+	Address = {The Wistar Institute, Philadelphia, Pennsylvania, USA.},
175
+	Au = {Kossenkov, AV and Ochs, MF},
176
+	Author = {Kossenkov, A.V. and Ochs, M.F.},
177
+	Crdt = {2009/11/10 06:00},
178
+	Da = {20091109},
179
+	Date-Added = {2010-02-02 12:33:48 -0500},
180
+	Date-Modified = {2010-02-02 15:55:21 -0500},
181
+	Dcom = {20100111},
182
+	Doi = {10.1016/S0076-6879(09)67003-8},
183
+	Edat = {2009/11/10 06:00},
184
+	Issn = {1557-7988 (Electronic); 1557-7988 (Linking)},
185
+	Jid = {0212271},
186
+	Journal = {Methods Enzymol},
187
+	Jt = {Methods in enzymology},
188
+	Keywords = {Markov Chain Monte Carlo, matrix factorization},
189
+	Language = {eng},
190
+	Mh = {Algorithms; Bayes Theorem; Biological Processes/physiology; *Cluster Analysis; *Data Interpretation, Statistical; Gene Expression Profiling/*methods; Gene Regulatory Networks; Microarray Analysis/*methods; Pattern Recognition, Automated/methods; Saccharomyces cerevisiae/genetics/physiology},
191
+	Mhda = {2010/01/12 06:00},
192
+	Own = {NLM},
193
+	Pages = {59--77},
194
+	Pii = {S0076-6879(09)67003-8},
195
+	Pl = {United States},
196
+	Pmid = {19897089},
197
+	Pst = {ppublish},
198
+	Pt = {Journal Article},
199
+	Sb = {IM},
200
+	So = {Methods Enzymol. 2009;467:59-77. },
201
+	Stat = {MEDLINE},
202
+	Title = {Matrix factorization for recovery of biological processes from microarray data.},
203
+	Volume = {467},
204
+	Year = {2009},
205
+	Bdsk-Url-1 = {http://dx.doi.org/10.1016/S0076-6879(09)67003-8}}
206
+
207
+@article{Subramanian2005,
208
+	Abstract = {Although genomewide RNA expression analysis has become a routine tool in biomedical research, extracting biological insight from such information remains a major challenge. Here, we describe a powerful analytical method called Gene Set Enrichment Analysis (GSEA) for interpreting gene expression data. The method derives its power by focusing on gene sets, that is, groups of genes that share common biological function, chromosomal location, or regulation. We demonstrate how GSEA yields insights into several cancer-related data sets, including leukemia and lung cancer. Notably, where single-gene analysis finds little similarity between two independent studies of patient survival in lung cancer, GSEA reveals many biological pathways in common. The GSEA method is embodied in a freely available software package, together with an initial database of 1,325 biologically defined gene sets.},
209
+	Address = {Broad Institute of Massachusetts Institute of Technology and Harvard, 320 Charles Street, Cambridge, MA 02141, USA.},
210
+	Au = {Subramanian, A and Tamayo, P and Mootha, VK and Mukherjee, S and Ebert, BL and Gillette, MA and Paulovich, A and Pomeroy, SL and Golub, TR and Lander, ES and Mesirov, JP},
211
+	Author = {Subramanian, A. and Tamayo, P. and Mootha, V.K. and Mukherjee, S. and Ebert, B.L. and Gillette, M.A. and Paulovich, A. and Pomeroy, S.L. and Golub, T.R. and Lander, E.S. and Mesirov, J.P.},
212
+	Cin = {Proc Natl Acad Sci U S A. 2005 Oct 25;102(43):15278-9. PMID: 16230612},
213
+	Crdt = {2005/10/04 09:00},
214
+	Da = {20051026},
215
+	Date-Added = {2010-02-01 16:29:02 -0500},
216
+	Date-Modified = {2010-02-02 15:59:33 -0500},
217
+	Dcom = {20051212},
218
+	Dep = {20050930},
219
+	Doi = {10.1073/pnas.0506580102},
220
+	Edat = {2005/10/04 09:00},
221
+	Issn = {0027-8424 (Print); 0027-8424 (Linking)},
222
+	Jid = {7505876},
223
+	Journal = {Proc Natl Acad Sci},
224
+	Jt = {Proceedings of the National Academy of Sciences of the United States of America},
225
+	Keywords = {gene set},
226
+	Language = {eng},
227
+	Lr = {20091118},
228
+	Mh = {Cell Line, Tumor; Female; Gene Expression Profiling/*methods; Genes, p53/physiology; Genome; Humans; Leukemia, Myeloid, Acute/genetics; Lung Neoplasms/genetics/mortality; Male; *Oligonucleotide Array Sequence Analysis; Precursor Cell Lymphoblastic Leukemia-Lymphoma/genetics},
229
+	Mhda = {2005/12/15 09:00},
230
+	Number = {43},
231
+	Oid = {NLM: PMC1239896},
232
+	Own = {NLM},
233
+	Pages = {15545--15550},
234
+	Phst = {2005/09/30 {$[$}aheadofprint{$]$}},
235
+	Pii = {0506580102},
236
+	Pl = {United States},
237
+	Pmc = {PMC1239896},
238
+	Pmid = {16199517},
239
+	Pst = {ppublish},
240
+	Pt = {Journal Article},
241
+	Sb = {IM},
242
+	So = {Proc Natl Acad Sci U S A. 2005 Oct 25;102(43):15545-50. Epub 2005 Sep 30. },
243
+	Stat = {MEDLINE},
244
+	Title = {Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles.},
245
+	Volume = {102},
246
+	Year = {2005},
247
+	Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.0506580102}}
248
+
249
+@article{Draghici2003,
250
+	Abstract = {Onto-Tools is a set of four seamlessly integrated databases: Onto-Express, Onto-Compare, Onto-Design and Onto-Translate. Onto-Express is able to automatically translate lists of genes found to be differentially regulated in a given condition into functional profiles characterizing the impact of the condition studied upon various biological processes and pathways. OE constructs functional profiles (using Gene Ontology terms) for the following categories: biochemical function, biological process, cellular role, cellular component, molecular function and chromosome location. Statistical significance values are calculated for each category. Once the initial exploratory analysis identified a number of relevant biological processes, specific mechanisms of interactions can be hypothesized for the conditions studied. Currently, many commercial arrays are available for the investigation of specific mechanisms. Each such array is characterized by a biological bias determined by the extent to which the genes present on the array represent specific pathways. Onto-Compare is a tool that allows efficient comparisons of any sets of commercial or custom arrays. Using Onto-Compare, a researcher can determine quickly which array, or set of arrays, covers best the hypotheses studied. In many situations, no commercial arrays are available for specific biological mechanisms. Onto-Design is a tool that allows the user to select genes that represent given functional categories. Onto-Translate allows the user to translate easily lists of accession numbers, UniGene clusters and Affymetrix probes into one another. All tools above are seamlessly integrated. The Onto-Tools are available online at http://vortex.cs.wayne.edu/Projects.html.},
251
+	Address = {Department of Computer Science, Wayne State University, 431 State Hall, Detroit, MI 48202, USA. sod@cs.wayne.edu},
252
+	Au = {Draghici, S and Khatri, P and Bhavsar, P and Shah, A and Krawetz, SA and Tainsky, MA},
253
+	Author = {Draghici, S. and Khatri, P. and Bhavsar, P. and Shah, A. and Krawetz, S.A. and Tainsky, M.A.},
254
+	Crdt = {2003/06/26 05:00},
255
+	Da = {20030625},
256
+	Date-Added = {2010-02-01 16:28:22 -0500},
257
+	Date-Modified = {2010-02-02 15:56:55 -0500},
258
+	Dcom = {20030818},
259
+	Edat = {2003/06/26 05:00},
260
+	Gr = {R01-NS045207-01/NS/NINDS NIH HHS/United States; R21-EB000990-01/EB/NIBIB NIH HHS/United States},
261
+	Issn = {1362-4962 (Electronic); 1362-4962 (Linking)},
262
+	Jid = {0411011},
263
+	Journal = {Nucleic Acids Res},
264
+	Jt = {Nucleic acids research},
265
+	Keywords = {gene set},
266
+	Language = {eng},
267
+	Lr = {20091118},
268
+	Mh = {Databases, Nucleic Acid; Gene Expression Profiling/*methods; Internet; Oligonucleotide Array Sequence Analysis/*methods; Proteins/genetics/physiology; *Software; Systems Integration},
269
+	Mhda = {2003/08/19 05:00},
270
+	Number = {13},
271
+	Oid = {NLM: PMC169030},
272
+	Own = {NLM},
273
+	Pages = {3775--3781},
274
+	Pl = {England},
275
+	Pmc = {PMC169030},
276
+	Pmid = {12824416},
277
+	Pst = {ppublish},
278
+	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.; Research Support, U.S. Gov't, P.H.S.},
279
+	Rn = {0 (Proteins)},
280
+	Sb = {IM},
281
+	So = {Nucleic Acids Res. 2003 Jul 1;31(13):3775-81. },
282
+	Stat = {MEDLINE},
283
+	Title = {{Onto-Tools, the toolkit of the modern biologist: Onto-Express, Onto-Compare, Onto-Design and Onto-Translate.}},
284
+	Volume = {31},
285
+	Year = {2003}}
286
+
287
+@inbook{Ochs2006,
288
+	Address = {London},
289
+	Author = {M.F. Ochs},
290
+	Date-Added = {2010-01-08 14:45:36 -0500},
291
+	Date-Modified = {2010-01-08 14:48:01 -0500},
292
+	Editor = {G. Parmigiani and E. S. Garrett and R. A. Irizarry and S. L. Zeger},
293
+	Pages = {388-408},
294
+	Publisher = {Springer-Verlag},
295
+	Series = {Statistics for Biology and Health},
296
+	Title = {The Analysis of Gene Expression Data The Analysis of Gene Expression Data: Methods and Software},
297
+	Year = {2006}}
298
+
299
+@article{Bidaut2006,
300
+	Abstract = {BACKGROUND: As numerous diseases involve errors in signal transduction, modern therapeutics often target proteins involved in cellular signaling. Interpretation of the activity of signaling pathways during disease development or therapeutic intervention would assist in drug development, design of therapy, and target identification. Microarrays provide a global measure of cellular response, however linking these responses to signaling pathways requires an analytic approach tuned to the underlying biology. An ongoing issue in pattern recognition in microarrays has been how to determine the number of patterns (or clusters) to use for data interpretation, and this is a critical issue as measures of statistical significance in gene ontology or pathways rely on proper separation of genes into groups. RESULTS: Here we introduce a method relying on gene annotation coupled to decompositional analysis of global gene expression data that allows us to estimate specific activity on strongly coupled signaling pathways and, in some cases, activity of specific signaling proteins. We demonstrate the technique using the Rosetta yeast deletion mutant data set, decompositional analysis by Bayesian Decomposition, and annotation analysis using ClutrFree. We determined from measurements of gene persistence in patterns across multiple potential dimensionalities that 15 basis vectors provides the correct dimensionality for interpreting the data. Using gene ontology and data on gene regulation in the Saccharomyces Genome Database, we identified the transcriptional signatures of several cellular processes in yeast, including cell wall creation, ribosomal disruption, chemical blocking of protein synthesis, and, critically, individual signatures of the strongly coupled mating and filamentation pathways. CONCLUSION: This works demonstrates that microarray data can provide downstream indicators of pathway activity either through use of gene ontology or transcription factor databases. This can be used to investigate the specificity and success of targeted therapeutics as well as to elucidate signaling activity in normal and disease processes.},
301
+	Address = {Fox Chase Cancer Center, 333 Cottman Avenue, Philadelphia, PA 19111, USA. ghbidaut@pcbi.upenn.edu},
302
+	Au = {Bidaut, G and Suhre, K and Claverie, JM and Ochs, MF},
303
+	Author = {Bidaut, G. and Suhre, K. and Claverie, J.-M. and Ochs, M.F.},
304
+	Da = {20060327},
305
+	Date-Added = {2010-01-08 14:39:10 -0500},
306
+	Date-Modified = {2010-01-08 14:39:10 -0500},
307
+	Dcom = {20060426},
308
+	Dep = {20060228},
309
+	Doi = {10.1186/1471-2105-7-99},
310
+	Edat = {2006/03/02 09:00},
311
+	Gr = {CA06927/CA/United States NCI; LM008309/LM/United States NLM},
312
+	Issn = {1471-2105 (Electronic)},
313
+	Jid = {100965194},
314
+	Journal = {BMC Bioinformatics},
315
+	Jt = {BMC bioinformatics},
316
+	Keywords = {Bayesian, Markov Chain Monte Carlo, signaling network, microarray, k25},
317
+	Language = {eng},
318
+	Lr = {20071114},
319
+	Mh = {Algorithms; Computer Simulation; Gene Expression Profiling/*methods; *Models, Biological; Oligonucleotide Array Sequence Analysis/*methods; Pattern Recognition, Automated/*methods; Saccharomyces cerevisiae Proteins/genetics/*metabolism; Signal Transduction/*physiology; Transcription Factors/genetics/*metabolism},
320
+	Mhda = {2006/04/28 09:00},
321
+	Own = {NLM},
322
+	Pages = {99},
323
+	Phst = {2005/09/22 {$[$}received{$]$}; 2006/02/28 {$[$}accepted{$]$}; 2006/02/28 {$[$}aheadofprint{$]$}},
324
+	Pii = {1471-2105-7-99},
325
+	Pl = {England},
326
+	Pmc = {PMC1413561},
327
+	Pmid = {16507110},
328
+	Pst = {epublish},
329
+	Pt = {Journal Article; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't},
330
+	Pubm = {Electronic},
331
+	Rn = {0 (Saccharomyces cerevisiae Proteins); 0 (Transcription Factors)},
332
+	Sb = {IM},
333
+	So = {BMC Bioinformatics. 2006 Feb 28;7:99. },
334
+	Stat = {MEDLINE},
335
+	Title = {Determination of strongly overlapping signaling activity from microarray data.},
336
+	Volume = {7},
337
+	Year = {2006},
338
+	Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-7-99}}
339
+
340
+@article{Kossenkov2007a,
341
+	Abstract = {Many biological processes rely on remodeling of the transcriptional response of cells through activation of transcription factors. Although determination of the activity level of transcription factors from microarray data can provide insight into developmental and disease processes, it requires careful analysis because of the multiple regulation of genes. We present a novel approach that handles both the assignment of genes to multiple patterns, as required by multiple regulation, and the linking of genes in prior probability distributions according to their known transcriptional regulators. We demonstrate the power of this approach in simulations and by application to yeast cell cycle and deletion mutant data. The results of simulations in the presence of increasing noise showed improved recovery of patterns in terms of chi2 fit. Analysis of the yeast data led to improved inference of biologically meaningful groups in comparison to other techniques, as demonstrated with ROC analysis. The new algorithm provides an approach for estimating the levels of transcription factor activity from microarray data, and therefore provides insights into biological response.},
342
+	Address = {The Wistar Institute, Philadelphia, PA, USA.},
343
+	Au = {Kossenkov, AV and Peterson, AJ and Ochs, MF},
344
+	Author = {A.V. Kossenkov and A.J. Peterson and M.F. Ochs},
345
+	Da = {20071003},
346
+	Date-Added = {2010-01-08 14:39:10 -0500},
347
+	Date-Modified = {2010-01-08 14:39:10 -0500},
348
+	Dcom = {20071102},
349
+	Edat = {2007/10/04 09:00},
350
+	Gr = {CA06973/CA/United States NCI; LM008309/LM/United States NLM},
351
+	Issn = {0926-9630 (Print)},
352
+	Jid = {9214582},
353
+	Journal = {Stud Health Technol Inform},
354
+	Jt = {Studies in health technology and informatics},
355
+	Keywords = {Bayesian, Markov Chain Monte Carlo, microarray},
356
+	Language = {eng},
357
+	Lr = {20080710},
358
+	Mh = {*Algorithms; Bayes Theorem; Computational Biology; *Gene Expression Regulation; Markov Chains; Models, Genetic; Monte Carlo Method; *Oligonucleotide Array Sequence Analysis; ROC Curve; Transcription Factors/*metabolism; Transcription, Genetic; Yeasts/genetics},
359
+	Mhda = {2007/11/06 09:00},
360
+	Number = {Pt 2},
361
+	Own = {NLM},
362
+	Pages = {1250--1254},
363
+	Pl = {Netherlands},
364
+	Pmid = {17911915},
365
+	Pst = {ppublish},
366
+	Pt = {Journal Article; Research Support, N.I.H., Extramural},
367
+	Pubm = {Print},
368
+	Rn = {0 (Transcription Factors)},
369
+	Sb = {T},
370
+	So = {Stud Health Technol Inform. 2007;129(Pt 2):1250-4. },
371
+	Stat = {MEDLINE},
372
+	Title = {Determining transcription factor activity from microarray data using {Bayesian Markov chain Monte Carlo} sampling.},
373
+	Volume = {129},
374
+	Year = {2007}}
375
+
376
+@article{Moloshok2002,
377
+	Abstract = {MOTIVATION: Microarray and gene chip technology provide high throughput tools for measuring gene expression levels in a variety of circumstances, including cellular response to drug treatment, cellular growth and development, tumorigenesis, among many other processes. In order to interpret the large data sets generated in experiments, data analysis techniques that consider biological knowledge during analysis will be extremely useful. We present here results showing the application of such a tool to expression data from yeast cell cycle experiments. RESULTS: Originally developed for spectroscopic analysis, Bayesian Decomposition (BD) includes two features which make it useful for microarray data analysis: the ability to assign genes to multiple coexpression groups and the ability to encode biological knowledge into the system. Here we demonstrate the ability of the algorithm to provide insight into the yeast cell cycle, including identification of five temporal patterns tied to cell cycle phases as well as the identification of a pattern tied to an approximately 40 min cell cycle oscillator. The genes are simultaneously assigned to the patterns, including partial assignment to multiple patterns when this is required to explain the expression profile. AVAILABILITY: The application is available free to academic users under a material transfer agreement. Go to http://bioinformatics.fccc.edu/ for more details.},
378
+	Address = {Bioinformatics Working Group, Fox Chase Cancer Center, Philadelphia, PA 19111, USA. td_moloshok@fccc.edu},
379
+	Au = {Moloshok, TD and Klevecz, RR and Grant, JD and Manion, FJ and Speier WF, 4th and Ochs, MF},
380
+	Author = {Moloshok, T.D. and Klevecz, R.R. and Grant, J.D. and Manion, F.J. and Speier IV, W.F. and Ochs, M.F.},
381
+	Da = {20020517},
382
+	Date-Added = {2010-01-08 14:39:10 -0500},
383
+	Date-Modified = {2010-01-08 14:39:10 -0500},
384
+	Dcom = {20021108},
385
+	Edat = {2002/05/23 10:00},
386
+	Gr = {CA06927/CA/United States NCI},
387
+	Issn = {1367-4803 (Print)},
388
+	Jid = {9808944},
389
+	Journal = {Bioinformatics},
390
+	Jt = {Bioinformatics (Oxford, England)},
391
+	Keywords = {Bayesian, signaling network, microarray},
392
+	Language = {eng},
393
+	Lr = {20071114},
394
+	Mh = {*Algorithms; *Bayes Theorem; Cell Cycle/genetics; Databases, Genetic; Gene Expression Regulation; Genome, Fungal; Markov Chains; *Models, Genetic; *Models, Statistical; Monte Carlo Method; Oligonucleotide Array Sequence Analysis/*methods; Pattern Recognition, Automated; Periodicity; Reproducibility of Results; Saccharomyces cerevisiae/genetics; Sensitivity and Specificity},
395
+	Mhda = {2002/11/26 04:00},
396
+	Number = {4},
397
+	Own = {NLM},
398
+	Pages = {566--575},
399
+	Pl = {England},
400
+	Pmid = {12016054},
401
+	Pst = {ppublish},
402
+	Pt = {Comparative Study; Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, P.H.S.},
403
+	Pubm = {Print},
404
+	Sb = {IM},
405
+	So = {Bioinformatics. 2002 Apr;18(4):566-75. },
406
+	Stat = {MEDLINE},
407
+	Title = {Application of {B}ayesian decomposition for analysing microarray data.},
408
+	Volume = {18},
409
+	Year = {2002},
410
+	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGCQpYJHZlcnNpb25UJHRvcFkkYXJjaGl2ZXJYJG9iamVjdHMSAAGGoNEHCFRyb290gAFfEA9OU0tleWVkQXJjaGl2ZXKoCwwXGBkdJCVVJG51bGzTDQ4PEBEUViRjbGFzc1dOUy5rZXlzWk5TLm9iamVjdHOAB6ISE4ACgAOiFRaABIAGWWFsaWFzRGF0YVxyZWxhdGl2ZVBhdGjSDRobHFdOUy5kYXRhgAVPEQJGAAAAAAJGAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAADD4wxdSCsAAAUWsv4VTW9sb3Nob2tfQkRfWWVhc3QucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABRay/8Xm4ONQREYgQ0FSTwACAAkAAAkgAAAAAAAAAAAAAAAAAAAAATIAABAACAAAw+NSrQAAABEACAAAxecZIwAAAAEAKAUWsv4FFrL9Ajn6RAI5RRoAFB2EABQZOQAP6lwACqQSAAqkEQAAetwAAgCFTWFjaW50b3NoIEhEOlVzZXJzOmVqZmVydGlnOkxpYnJhcnk6TWFpbDpJTUFQLWVqZmVydGlnQG1haWwubCMxNDE5Mzkub3JnOklOQk9YOk9jaHMuaW1hcG1ib3g6QXR0YWNobWVudHM6MzA4NzoyOk1vbG9zaG9rX0JEX1llYXN0LnBkZgAADgAsABUATQBvAGwAbwBzAGgAbwBrAF8AQgBEAF8AWQBlAGEAcwB0AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgB6VXNlcnMvZWpmZXJ0aWcvTGlicmFyeS9NYWlsL0lNQVAtZWpmZXJ0aWdAbWFpbC5saXF1aWRkaXJ0Lm9yZy9JTkJPWC9PY2hzLmltYXBtYm94L0F0dGFjaG1lbnRzLzMwODcvMi9Nb2xvc2hva19CRF9ZZWFzdC5wZGYAEwABLwAAFQACAA///wAA0h4fICFYJGNsYXNzZXNaJGNsYXNzbmFtZaMhIiNdTlNNdXRhYmxlRGF0YVZOU0RhdGFYTlNPYmplY3RfEHEuLi8uLi9MaWJyYXJ5L01haWwvSU1BUC1lamZlcnRpZ0BtYWlsLmxpcXVpZGRpcnQub3JnL0lOQk9YL09jaHMuaW1hcG1ib3gvQXR0YWNobWVudHMvMzA4Ny8yL01vbG9zaG9rX0JEX1llYXN0LnBkZtIeHyYnoicjXE5TRGljdGlvbmFyeQAIABEAGgAfACkAMgA3ADoAPwBBAFMAXABiAGkAcAB4AIMAhQCIAIoAjACPAJEAkwCdAKoArwC3ALkDAwMIAxEDHAMgAy4DNQM+A7IDtwO6AAAAAAAAAgEAAAAAAAAAKAAAAAAAAAAAAAAAAAAAA8c=}}
411
+
412
+@article{Ochs1999,
413
+	Abstract = {A frequent problem in analysis is the need to find two matrices, closely related to the underlying measurement process, which when multiplied together reproduce the matrix of data points. Such problems arise throughout science, for example, in imaging where both the calibration of the sensor and the true scene may be unknown and in localized spectroscopy where multiple components may be present in varying amounts in any spectrum. Since both matrices are unknown, such a decomposition is a bilinear problem. We report here a solution to this problem for the case in which the decomposition results in matrices with elements drawn from positive additive distributions. We demonstrate the power of the methodology on chemical shift images (CSI). The new method, Bayesian spectral decomposition (BSD), reduces the CSI data to a small number of basis spectra together with their localized amplitudes. We apply this new algorithm to a 19F nonlocalized study of the catabolism of 5-fluorouracil in human liver, 31P CSI studies of a human head and calf muscle, and simulations which show its strengths and limitations. In all cases, the dataset, viewed as a matrix with rows containing the individual NMR spectra, results from the multiplication of a matrix of generally nonorthogonal basis spectra (the spectral matrix) by a matrix of the amplitudes of each basis spectrum in the the individual voxels (the amplitude matrix). The results show that BSD can simultaneously determine both the basis spectra and their distribution. In principle, BSD should solve this bilinear problem for any dataset which results from multiplication of matrices representing positive additive distributions if the data overdetermine the solutions.},
414
+	Address = {NMR and Medical Spectroscopy, Fox Chase Cancer Center, Philadelphia, PA, USA.},
415
+	Au = {Ochs, MF and Stoyanova, RS and Arias-Mendoza, F and Brown, TR},
416
+	Author = {Ochs, M.F. and Stoyanova, R.S. and Arias-Mendoza, F. and Brown, T.R.},
417
+	Ci = {Copyright 1999 Academic Press.},
418
+	Da = {19990331},
419
+	Date-Added = {2010-01-08 14:39:10 -0500},
420
+	Date-Modified = {2010-01-08 14:39:10 -0500},
421
+	Dcom = {19990331},
422
+	Doi = {10.1006/jmre.1998.1639},
423
+	Edat = {1999/03/04},
424
+	Gr = {CA41078/CA/United States NCI; CA62556/CA/United States NCI},
425
+	Issn = {1090-7807 (Print)},
426
+	Jid = {9707935},
427
+	Journal = {J Magn Reson},
428
+	Jt = {Journal of magnetic resonance (San Diego, Calif. : 1997)},
429
+	Keywords = {Markov Chain Monte Carlo, Bayesian, k25},
430
+	Language = {eng},
431
+	Lr = {20071114},
432
+	Mh = {Adenosine Triphosphate/analysis; Bayes Theorem; Brain/metabolism; Fluorouracil/*metabolism; Humans; Image Processing, Computer-Assisted; Liver/*metabolism; Magnesium/analysis; Magnetic Resonance Spectroscopy/*methods; Muscle, Skeletal/metabolism},
433
+	Mhda = {1999/03/04 00:01},
434
+	Number = {1},
435
+	Own = {NLM},
436
+	Pages = {161--176},
437
+	Pii = {S1090-7807(98)91639-1},
438
+	Pl = {UNITED STATES},
439
+	Pmid = {10053145},
440
+	Pst = {ppublish},
441
+	Pt = {Comparative Study; Journal Article; Research Support, U.S. Gov't, P.H.S.},
442
+	Pubm = {Print},
443
+	Rn = {51-21-8 (Fluorouracil); 56-65-5 (Adenosine Triphosphate); 7439-95-4 (Magnesium)},
444
+	Sb = {IM},
445
+	So = {J Magn Reson. 1999 Mar;137(1):161-76. },
446
+	Stat = {MEDLINE},
447
+	Title = {A new method for spectral decomposition using a bilinear Bayesian approach.},
448
+	Volume = {137},
449
+	Year = {1999},
450
+	Bdsk-Url-1 = {http://dx.doi.org/10.1006/jmre.1998.1639}}
451
+
452
+@conference{Plummer2003,
453
+	Address = {Vienna, Austria},
454
+	Author = {M. Plummer},
455
+	Booktitle = {Proceedings of the 3rd International Workshop on Distributed Statistical Computing},
456
+	Date-Added = {2010-01-08 14:38:05 -0500},
457
+	Date-Modified = {2010-02-02 15:47:49 -0500},
458
+	Editor = {K. Hornik and F. Leisch and A. Zeileis},
459
+	Keywords = {Markov Chain Monte Carlo},
460
+	Month = {March 20-22},
461
+	Title = {{JAGS}: A program for analysis of {Bayesian} graphical models using {Gibbs} sampling},
462
+	Year = {2003}}
463
+
464
+@article{Ochs2009,
465
+	Abstract = {Cell signaling plays a central role in the etiology of cancer. Numerous therapeutics in use or under development target signaling proteins; however, off-target effects often limit assignment of positive clinical response to the intended target. As direct measurements of signaling protein activity are not generally feasible during treatment, there is a need for more powerful methods to determine if therapeutics inhibit their targets and when off-target effects occur. We have used the Bayesian Decomposition algorithm and data on transcriptional regulation to create a novel methodology, Differential Expression for Signaling Determination (DESIDE), for inferring signaling activity from microarray measurements. We applied DESIDE to deduce signaling activity in gastrointestinal stromal tumor cell lines treated with the targeted therapeutic imatinib mesylate (Gleevec). We detected the expected reduced activity in the KIT pathway, as well as unexpected changes in the p53 pathway. Pursuing these findings, we have determined that imatinib-induced DNA damage is responsible for the increased activity of p53, identifying a novel off-target activity for this drug. We then used DESIDE on data from resected, post-imatinib treatment tumor samples and identified a pattern in these tumors similar to that at late time points in the cell lines, and this pattern correlated with initial clinical response. The pattern showed increased activity of ETS domain-containing protein Elk-1 and signal transducers and activators of transcription 3 transcription factors, which are associated with the growth of side population cells. DESIDE infers the global reprogramming of signaling networks during treatment, permitting treatment modification that leverages ongoing drug development efforts, which is crucial for personalized medicine.},
466
+	Address = {Division of Oncology Biostatistics and Bioinformatics, Johns Hopkins University, Baltimore, Maryland 21205, USA. mfo@jhu.edu},
467
+	Au = {Ochs, MF and Rink, L and Tarn, C and Mburu, S and Taguchi, T and Eisenberg, B and Godwin, AK},
468
+	Author = {Ochs, M.F. and Rink, L. and Tarn, C. and Mburu, S. and Taguchi, T. and Eisenberg, B. and Godwin, A.K.},
469
+	Crdt = {2009/11/12 06:00},
470
+	Da = {20091204},
471
+	Date-Added = {2010-01-08 14:35:07 -0500},
472
+	Date-Modified = {2010-02-02 15:57:46 -0500},
473
+	Dcom = {20091221},
474
+	Dep = {20091110},
475
+	Doi = {10.1158/0008-5472.CAN-09-1709},
476
+	Edat = {2009/11/12 06:00},
477
+	Gr = {CA009035/CA/NCI NIH HHS/United States; CA106588/CA/NCI NIH HHS/United States; CA21661/CA/NCI NIH HHS/United States; LM009382/LM/NLM NIH HHS/United States},
478
+	Issn = {1538-7445 (Electronic); 1538-7445 (Linking)},
479
+	Jid = {2984705R},
480
+	Journal = {Cancer Res},
481
+	Jt = {Cancer research},
482
+	Language = {eng},
483
+	Mh = {Antineoplastic Agents/*pharmacology; Cell Line, Tumor; DNA Damage; Gastrointestinal Stromal Tumors/*drug therapy/*genetics/metabolism; Gene Expression Profiling; Humans; Piperazines/*pharmacology; Pyrimidines/*pharmacology; RNA, Messenger/biosynthesis/genetics; STAT3 Transcription Factor/metabolism; Signal Transduction; Tumor Suppressor Protein p53/genetics/metabolism; ets-Domain Protein Elk-1/metabolism},
484
+	Mhda = {2009/12/22 06:00},
485
+	Mid = {NIHMS149886},
486
+	Number = {23},
487
+	Oid = {NLM: NIHMS149886 {$[$}Available on 12/01/10{$]$}; NLM: PMC2789202 {$[$}Available on 12/01/10{$]$}},
488
+	Own = {NLM},
489
+	Pages = {9125--9132},
490
+	Phst = {2009/11/10 {$[$}aheadofprint{$]$}},
491
+	Pii = {0008-5472.CAN-09-1709},
492
+	Pl = {United States},
493
+	Pmc = {PMC2789202},
494
+	Pmcr = {2010/12/01},
495
+	Pmid = {19903850},
496
+	Pst = {ppublish},
497
+	Pt = {Journal Article; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't},
498
+	Rn = {0 (Antineoplastic Agents); 0 (ELK1 protein, human); 0 (Piperazines); 0 (Pyrimidines); 0 (RNA, Messenger); 0 (STAT3 Transcription Factor); 0 (STAT3 protein, human); 0 (TP53 protein, human); 0 (Tumor Suppressor Protein p53); 0 (ets-Domain Protein Elk-1); 152459-95-5 (imatinib)},
499
+	Sb = {IM},
500
+	So = {Cancer Res. 2009 Dec 1;69(23):9125-32. Epub 2009 Nov 10. },
501
+	Stat = {MEDLINE},
502
+	Title = {Detection of treatment-induced changes in signaling pathways in gastrointestinal stromal tumors using transcriptomic data.},
503
+	Volume = {69},
504
+	Year = {2009},
505
+	Bdsk-Url-1 = {http://dx.doi.org/10.1158/0008-5472.CAN-09-1709}}
506
+
507
+@conference{Skilling1998,
508
+	Address = {Dordrecht/Boston/London},
509
+	Author = {J. Skilling},
510
+	Booktitle = {Maximum Entropy and Bayesian Methods, Proceedings of the 17th International Workshop on Maximum Entropy and Bayesian Methods of Statistical Analysis},
511
+	Date-Added = {2010-01-08 14:27:23 -0500},
512
+	Date-Modified = {2010-01-08 14:29:16 -0500},
513
+	Editor = {G. J. Erickson and J. T. Rychert and C. R. Smith},
514
+	Publisher = {Kluwer Academic Publishers},
515
+	Title = {Massive inference and maximum entropy},
516
+	Year = {1998}}
517
+
518
+@article{Sibisi1997,
519
+	Author = {S. Sibisi and J. Skilling},
520
+	Date-Added = {2010-01-08 13:50:30 -0500},
521
+	Date-Modified = {2010-01-08 13:50:30 -0500},
522
+	Journal = {Journal of the Royal Statistical Society, B},
523
+	Keywords = {Bayesian, probability},
524
+	Number = {1},
525
+	Pages = {217-235},
526
+	Title = {Prior distributions on measure space},
527
+	Volume = {59},
528
+	Year = {1997},
529
+	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUGCQpYJHZlcnNpb25UJHRvcFkkYXJjaGl2ZXJYJG9iamVjdHMSAAGGoNEHCFRyb290gAFfEA9OU0tleWVkQXJjaGl2ZXKoCwwXGBkdJCVVJG51bGzTDQ4PEBEUViRjbGFzc1dOUy5rZXlzWk5TLm9iamVjdHOAB6ISE4ACgAOiFRaABIAGWWFsaWFzRGF0YVxyZWxhdGl2ZVBhdGjSDRobHFdOUy5kYXRhgAVPEQHyAAAAAAHyAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAADD4wxdSCsAAAAQbOgfU2liaXNpU2tpbGxpbmdfSlJveSMxMDI4OURGLnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQKJ38SrsAMAAAAAAAAAAAADAAIAAAkgAAAAAAAAAAAAAAAAAAAAClJlZmVyZW5jZXMAEAAIAADD41KtAAAAEQAIAADEq+hDAAAAAQAQABBs6AAKpCgACqQRAAB63AACAFBNYWNpbnRvc2ggSEQ6VXNlcnM6ZWpmZXJ0aWc6RG9jdW1lbnRzOlJlZmVyZW5jZXM6U2liaXNpU2tpbGxpbmdfSlJveSMxMDI4OURGLnBkZgAOAE4AJgBTAGkAYgBpAHMAaQBTAGsAaQBsAGwAaQBuAGcAXwBKAFIAbwB5AGEAbABTAHQAYQB0AFMAbwBjAEIAXwAxADkAOQA3AC4AcABkAGYADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgBKVXNlcnMvZWpmZXJ0aWcvRG9jdW1lbnRzL1JlZmVyZW5jZXMvU2liaXNpU2tpbGxpbmdfSlJveWFsU3RhdFNvY0JfMTk5Ny5wZGYAEwABLwAAFQACAA///wAA0h4fICFYJGNsYXNzZXNaJGNsYXNzbmFtZaMhIiNdTlNNdXRhYmxlRGF0YVZOU0RhdGFYTlNPYmplY3RfEDouLi8uLi8uLi9SZWZlcmVuY2VzL1NpYmlzaVNraWxsaW5nX0pSb3lhbFN0YXRTb2NCXzE5OTcucGRm0h4fJieiJyNcTlNEaWN0aW9uYXJ5AAgAEQAaAB8AKQAyADcAOgA/AEEAUwBcAGIAaQBwAHgAgwCFAIgAigCMAI8AkQCTAJ0AqgCvALcAuQKvArQCvQLIAswC2gLhAuoDJwMsAy8AAAAAAAACAQAAAAAAAAAoAAAAAAAAAAAAAAAAAAADPA==}}
0 530
new file mode 100644
... ...
@@ -0,0 +1,388 @@
1
+% \VignetteIndexEntry{GAPS/CoGAPS Users Manual}
2
+% \VignettePackage{CoGAPS}
3
+
4
+\documentclass{report}
5
+\usepackage{Sweave,fullpage,./chicago,graphicx,subfigure,amssymb,amsmath,color,hyperref,wrapfig}
6
+
7
+\author{Elana J. Fertig \\
8
+email: \texttt{ejfertig@jhmi.edu}}
9
+\title{GAPS/CoGAPS Users Manual}
10
+
11
+\begin{document}
12
+
13
+\maketitle
14
+\tableofcontents
15
+
16
+\chapter{Introduction}
17
+
18
+\par Gene Association in Pattern Sets (GAPS) infers underlying patterns in gene expression from a matrix of microarray measurements.  The Markov chain Monte Carlo (MCMC) matrix decomposition that infers these patterns also infers the extent to which individual genes belong to them.  The CoGAPS algorithm extends GAPS to infer the coordinated activity in sets of genes for each of the inferred patterns.
19
+
20
+\par The GAPS algorithm is implemented in a module of the open source, C++ MCMC software Just Another Gibbs Sampler (JAGS; \url{http://www-fis.iarc.fr/~martyn/software/jags/}; \cite{Plummer2003}).  We call the software package containing this implementation of the GAPS algorithm GAPS-JAGS. As an extension including a redistribution of JAGS, GAPS-JAGS is also licensed under the GNU General Public License version 2.  You may freely modify and redistribute GAPS under certain conditions that are described in the top level source directory file \nolinkurl{COPYING}.
21
+
22
+\par The R package CoGAPS is designed to facilitate the corresponding analysis of microarray measurements by calling libraries in the GAPS-JAGS package.  The installation instructions provided in Chapter \ref{install} will ensure proper interaction between the CoGAPS R package and GAPS-JAGS libraries.  Running instructions for the GAPS and CoGAPS analyses are provided in Sections \ref{GAPSRun} and \ref{CoGAPSRun}, respectively.  CoGAPS and GAPS-JAGS are freely available at \url{http://www.cancerbiostats.onc.jhmi.edu/CoGAPS.cfm}.
23
+
24
+\par Please contact Elana J. Fertig \url{ejfertig@jhmi.edu} or Michael F. Ochs \url{mfo@jhu.edu} for citation information. 
25
+
26
+\chapter{Installation Instructions} \label{install}
27
+
28
+\par The GAPS and CoGAPS algorithms are implemented in open source C++ software based upon JAGS version 2.1.0 (GAPS-JAGS) and in an R package that facilitates running them (CoGAPS, available through Bioconductor).  The CoGAPS package relies on rjags, the interface between R and JAGS.  It is important to link rjags to our redistribution of JAGS, GAPS-JAGS, to ensure proper interfacing between the GAPS algorithm and R.  
29
+
30
+\par The installation instructions provided in this section describe procedures to compile GAPS-JAGS (Section \ref{GAPSJAGSInstall}), to link the GAPS-JAGS libraries to rjags (Section \ref{rjagsInstall}), and to install the CoGAPS Bioconductor package (Section \ref{CoGAPSInstall}).  We recommend that users proceed with installation in this order; i.e., first install GAPS-JAGS according to Section \ref{GAPSJAGSInstall}, then install rjags according to Section \ref{rjagsInstall}, and finally install the CoGAPS Bioconductor package according to Section \ref{CoGAPSInstall}.  We note that the GAPS-JAGS and rjags installations follow the procedures used to install the standard JAGS software, with the caveat that JAGS is replaced by our software GAPS-JAGS.
31
+
32
+\section{GAPS-JAGS} \label{GAPSJAGSInstall}
33
+
34
+\par GAPS-JAGS is currently distributed as source code only and must be compiled before use.  In this section, we provide installation instructions for GAPS-JAGS on Unix, Mac, and Windows.  We note that we describe only the standard installation processes for GAPS-JAGS.  More detailed installation instructions can be found in the JAGS installation manual available at \url{http://sourceforge.net/projects/mcmc-jags/files/}.
35
+
36
+\subsection{Unix / Mac} \label{GAPSJAGSUnix}
37
+
38
+\par Successful installation of GAPS-JAGS requires primarily the following dependencies:
39
+\begin{description}
40
+\item[\textbf{automake}:]{Available from \url{http://www.gnu.org/software/automake/}.  Installation instructions are provided in the \texttt{INSTALL} file included with the package.}
41
+\item[\textbf{autoconf}:]{Available from \url{http://www.gnu.org/software/autoconf/}.  Installation instructions are provided in the \texttt{INSTALL} file included with the package.}
42
+\item[\textbf{Fortran compiler}:]{gfortran can be obtained from \url{http://gcc.gnu.org/fortran}.  Binaries for Mac OS 10.5 and earlier are available at \url{http://r.research.att.com/tools/}.}
43
+\item[\textbf{BLAS and LAPACK}:]{These libraries are typically available by default on most platforms.  If not provided on your machine, installation instructions are provided in the JAGS user manual available at \url{http://sourceforge.net/projects/mcmc-jags/files/}.}
44
+\end{description}
45
+
46
+\par To install GAPS-JAGS, download the source from \url{http://www.cancerbiostats.onc.jhmi.edu/CoGAPS.cfm} and enter the top directory of the downloaded source.  Then, GAPS-JAGS follows the typical GNU installation procedure of
47
+\begin{verbatim}
48
+./configure
49
+make
50
+sudo make install
51
+\end{verbatim}
52
+These commands will install GAPS-JAGS and its associated libraries into the default path (typically \nolinkurl{/usr/local}).
53
+
54
+\par The installation procedure above requires administrative privileges.  To install GAPS-JAGS locally into the directory \$GAPSJAGS\_PATH, the following commands can be used.
55
+\begin{verbatim}
56
+./configure --prefix=${GAPSJAGS_PATH}
57
+make
58
+make install
59
+\end{verbatim}
60
+
61
+\par More detailed installation instructions are provided in the file \nolinkurl{INSTALL} in the top-level source directory or the JAGS installation manual available at \url{http://sourceforge.net/projects/mcmc-jags/files/}.
62
+
63
+\par On Mac, we recommend using the most recent version of \texttt{gcc} available through \texttt{xtools} to ensure proper interaction between the GAPS-JAGS libraries and R.
64
+
65
+\subsection{Windows}
66
+
67
+\par We provide an executable that installs GAPS-JAGS on Windows, \texttt{gaps-jags-1.0.2-setup.exe}, at \url{http://www.cancerbiostats.onc.jhmi.edu/CoGAPS.cfm}.  To install GAPS-JAGS, download and run this executable, following the installation instructions noted on the screen.  Make a note of the directory to which GAPS-JAGS was installed for the installation of rjags (by default \nolinkurl{C:\\Program Files\\GAPS-JAGS\\GAPS-JAGS-1.0.2}).  If you wish to compile GAPS-JAGS yourself, follow the instructions in the JAGS installation manual (\url{http://sourceforge.net/projects/mcmc-jags/files/}).
68
+
69
+\section{rjags} \label{rjagsInstall}
70
+
71
+\par Throughout this section, we will assume that GAPS-JAGS was successfully installed into the directory \$GAPSJAGS\_PATH.
72
+
73
+\subsection{Unix / Mac}
74
+
75
+\par On Unix or Mac, use the following command inside R to install rjags and integrate the GAPS-JAGS libraries in \$GAPSJAGS\_PATH:
76
+\begin{verbatim}
77
+> install.packages("rjags", 
78
+  configure.args="--with-jags-include=${GAPSJAGS_PATH}/include/GAPS-JAGS 
79
+  --with-jags-lib=${GAPSJAGS_PATH}/lib 
80
+   --with-jags-modules=${GAPSJAGS_PATH}/lib/JAGS/modules-1.0.2")
81
+\end{verbatim}
82
+If this installation fails, try installing rjags from another mirror.  Installation on Mac may require adding the flag \texttt{type="source"} to \texttt{install.packages()}.
83
+
84
+\par Alternatively, the rjags source package can be obtained from CRAN at \url{http://cran.r-project.org/web/packages/rjags/index.html}.  In this case, download the rjags source and install it from the command line as follows:
85
+\begin{verbatim}
86
+R CMD INSTALL --configure-args=
87
+   "--with-jags-include=${GAPSJAGS_PATH}/include/GAPS-JAGS 
88
+    --with-jags-lib=${GAPSJAGS_PATH}/lib 
89
+    --with-jags-modules=${GAPSJAGS_PATH}/lib/JAGS/modules-1.0.2" 
90
+    rjags_2.1.0-4.tar.gz
91
+\end{verbatim}
92
+
93
+\par As before, this installation procedure requires administrative privileges to install the rjags package in R.  If you do not have administrative privileges, follow standard R procedures to install the package locally using the \texttt{lib.loc} option in \texttt{install.packages} or the \texttt{-l} flag in \texttt{R CMD INSTALL}.
94
+
95
+\par On some platforms, the dynamic libraries may not be properly linked for loading the rjags package required by CoGAPS, leading to an error message such as
96
+\begin{verbatim}
97
+Error in dyn.load(file, DLLpath=DLLpath, ...)  :
98
+  unable to load shared library 'rjags.so'
99
+  libjags.so.1 cannot open shared object file: No such file or directory
100
+Error : .onLoad failed in 'loadNamespace' for 'rjags'
101
+Error: package 'rjags' could not be loaded
102
+\end{verbatim}
103
+In this case, the user should either set the environment variable \texttt{LD\_LIBRARY\_PATH} to \texttt{\$GAPSJAGS\_PATH/lib} or manually load the dynamic libraries \texttt{libjags} and \texttt{libjrmath} as follows:
104
+\begin{verbatim}
105
+> dyn.load('${GAPSJAGS_PATH}/lib/libjags.so')
106
+> dyn.load('${GAPSJAGS_PATH}/lib/libjrmath.so')
107
+\end{verbatim}
108
+
109
+\subsection{Windows}
110
+
111
+\par Before installing or running rjags, the user must set the environment variables \texttt{JAGS\_HOME} and \texttt{JAGS\_ROOT} to specify the location of GAPS-JAGS.  By default, \texttt{gaps-jags-1.0.2-setup.exe} will install GAPS-JAGS into \nolinkurl{C:\\Program Files\\GAPS-JAGS\\GAPS-JAGS-1.0.2}.  These environment variables can be set globally in Windows through the following steps:
112
+\begin{enumerate}
113
+\item Open the \texttt{start} menu.
114
+\item Right click on the \texttt{My Computer} icon and select \texttt{properties}.
115
+\item Go to the \texttt{Advanced} tab.
116
+\item Click on the \texttt{Environment Variables} button.
117
+\item Select the \texttt{new} button under the \texttt{System variables} section.
118
+\item Set the variable name to be \texttt{JAGS\_HOME} and variable value to be \texttt{\$GAPSJAGS\_PATH}.
119
+\item Click on the \texttt{OK} button.
120
+\item Select the \texttt{new} button under the \texttt{System variables} section.
121
+\item Set the variable name to be \texttt{JAGS\_ROOT} and variable value to be \texttt{\$GAPSJAGS\_PATH}.
122
+\item Click on the \texttt{OK} button.
123
+\item Click on the \texttt{OK} button in the environment variables window.
124
+\item Click on the \texttt{OK} button in the \texttt{Advanced} pane and exit system properties.
125
+\end{enumerate}
126
+
127
+\par Alternatively, the user can set the environment variables \texttt{JAGS\_HOME} and \texttt{JAGS\_ROOT} locally through R using the following commands:
128
+\begin{verbatim}
129
+> Sys.setenv("JAGS_HOME"="${GAPSJAGS_PATH}")
130
+> Sys.setenv("JAGS_ROOT"="${GAPSJAGS_PATH}")
131
+\end{verbatim}
132
+In this case, the user must reenter these commands in each session of R in which rjags will be installed or run.
133
+
134
+\par Once the environment variables \texttt{JAGS\_HOME} and \texttt{JAGS\_ROOT} have been set, use the following command inside R to install rjags and integrate the GAPS-JAGS libraries in \$GAPSJAGS\_PATH:
135
+\begin{verbatim}
136
+> install.packages("rjags")
137
+\end{verbatim}
138
+
139
+\section{CoGAPS} \label{CoGAPSInstall}
140
+
141
+\par Before installing the CoGAPS R package, standard R procedures should be followed to install the package \texttt{R.utils}.  Once \texttt{R.utils} and \texttt{rjags} have been installed, the installation of CoGAPS follows the standard R package installation procedure.  Namely, download the package CoGAPS from \url{http://www.cancerbiostats.onc.jhmi.edu/CoGAPS.cfm}.  On Unix/Mac, install the package with the following command:
142
+\begin{verbatim}
143
+R CMD INSTALL CoGAPS_0.99.1.tar.gz
144
+\end{verbatim}
145
+
146
+\par In Windows, a zip file containing the CoGAPS library will be provided when GAPS-JAGS is released for Windows.
147
+
148
+Install the package with the following command in R:
149
+\begin{verbatim}
150
+install.packages('CoGAPS.zip', repos=NULL, source=TRUE)
151
+\end{verbatim}
152
+
153
+\par As described in Section \ref{rjagsInstall}, CoGAPS can be installed into local directories using the \texttt{lib.loc} option in \texttt{install.packages} or the \texttt{-l} flag in \texttt{R CMD INSTALL}.
154
+
155
+\chapter{Running Instructions}
156
+
157
+\par In this chapter, we describe how to run both the GAPS and CoGAPS algorithms.  We note that GAPS-JAGS will create temporary files in the working directory of your R session.  As a result, the user must change to a directory with write permissions before running GAPS-JAGS.
158
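+
+\par For example, one simple way to obtain a writable working directory is to switch to the session-specific temporary directory before calling \texttt{GAPS} or \texttt{CoGAPS}:
+\begin{verbatim}
+> setwd(tempdir())
+\end{verbatim}
+Note that R discards this temporary directory at the end of the session, so any results that should be kept ought instead to be written to a permanent directory with write permissions.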
+
159
+\section{GAPS} \label{GAPSRun}
160
+
161
+\par GAPS seeks a pattern matrix (${\bf{P}}$) and the corresponding distribution matrix of weights (${\bf{A}}$) whose product forms a mock data matrix (${\bf{M}}$) that represents the expression data ${\bf{D}}$ within noise limits ($\boldsymbol{\varepsilon}$).  That is,
162
+\begin{equation}
163
+{\bf{D}} = {\bf{M}} + \boldsymbol{\varepsilon} = {\bf{A}}{\bf{P}} + \boldsymbol{\varepsilon}.
164
+\label{eq:matrixDecomp}
165
+\end{equation}
166
+The number of rows in ${\bf{P}}$ (columns in ${\bf{A}}$) defines the number of biological patterns that GAPS will infer from the measured microarray data.  As in the Bayesian Decomposition algorithm \cite{Ochs2006}, the matrices ${\bf{A}}$ and ${\bf{P}}$ in GAPS are assumed to have the atomic prior described in \cite{Sibisi1997}.  In the GAPS implementation, $\alpha_{A}$ and $\alpha_{P}$ are corresponding parameters for the expected number of atoms which map to each matrix element in ${\bf{A}}$ and ${\bf{P}}$, respectively.  The corresponding matrices ${\bf{A}}$ and ${\bf{P}}$ are found with MCMC sampling implemented within JAGS \cite{Plummer2003}.
167
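+
+\par For concreteness, if ${\bf{D}}$ contains $m$ genes measured on $n$ arrays and $k$ patterns are requested (the \texttt{numPatterns} argument below), then the dimensions in eq. (\ref{eq:matrixDecomp}) are
+\begin{equation*}
+\underbrace{{\bf{D}}}_{m \times n} \approx \underbrace{{\bf{A}}}_{m \times k} \, \underbrace{{\bf{P}}}_{k \times n},
+\end{equation*}
+so that each column of ${\bf{A}}$ holds the amplitude of one pattern across all genes and each row of ${\bf{P}}$ holds that pattern's behavior across the arrays.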
+
168
+\par The GAPS algorithm is run by calling the \texttt{GAPS} function in the CoGAPS R package as follows:
169
+\begin{verbatim}
170
+> GAPS(data, unc, outputDir, outputBase="", sep="\t", isPercentError=FALSE, 
171
+     numPatterns, MaxAtomsA=2^32, alphaA=0.01, MaxAtomsP=2^32, alphaP=0.01, 
172
+     SAIter=1000000000, iter = 500000000, thin=-1, 
173
+     verbose=TRUE, keepChain=FALSE)
174
+\end{verbatim}
175
+
176
+\par \noindent \textbf{\underline{Input Arguments}}
177
+\begin{description}
178
+\item[data]{The matrix of m genes by n arrays of expression data.  The input can be either the data matrix itself or the file containing this data.  If the latter, GAPS will read in the data using \texttt{read.table(data, sep=sep, header=T, row.names=1)}.}
179
+\item[unc]{The matrix of m genes by n arrays of uncertainty (standard deviation) for the expression data.  The input can be either a file containing the uncertainty (using the format from data), a matrix containing the uncertainty, or a constant value.  If unc is a constant value, it can represent either a constant uncertainty or a constant percentage of the values in data as determined by isPercentError.}
180
+\item[numPatterns]{Number of patterns into which the data will be decomposed.  Must be less than the number of genes and number of arrays in the data.}
181
+\item[outputDir]{Directory to which to output result and diagnostic files created by GAPS. (Use "" to output results to the current directory).}
182
+\item[outputBase]{Prefix for all result and diagnostic files created by GAPS (optional; default="")}
183
+\item[sep]{Text delimiter for tables in data and unc (if specified in file) and any output tables (optional; default="\t")}
184
+\item[isPercentError]{Boolean indicating whether constant value in unc is the value of the uncertainty or the percentage of the data that is the uncertainty.}
185
+\item[MaxAtomsA]{Maximum number of atoms in the atomic domain used for the prior of the amplitude matrix in the decomposition \cite{Sibisi1997}.  The default value will typically be sufficient for most applications (optional; default=$2^{32}$).}
186
+\item[alphaA]{Sparsity parameter reflecting the expected number of atoms per element of the amplitude matrix in the decomposition.  To enforce sparsity, this parameter should typically be less than one. (optional; default=0.01)}
187
+\item[MaxAtomsP]{Maximum number of atoms in the atomic domain used for the prior of the pattern matrix in the decomposition \cite{Sibisi1997}.  The default value will typically be sufficient for most applications (optional; default=$2^{32}$).}
188
+\item[alphaP]{Sparsity parameter reflecting the expected number of atoms per element of the pattern matrix in the decomposition.  To enforce sparsity, this parameter should typically be less than one. (optional; default=0.01)}
189
+\item[SAIter]{Number of burn-in iterations for the MCMC matrix decomposition (optional; default=1000000000)}
190
+\item[iter]{Number of iterations to represent the distribution of amplitude and pattern matrices with the MCMC matrix decomposition (optional; default=500000000)}
191
+\item[thin]{Double whose integer part represents the number of iterations at which the samples are kept and decimal part provides an identifier for the output files from this implementation of GAPS.  If thin is an integer or not specified, this decimal file identifier is assigned randomly.  (optional; default=-1; code assigns number of iterations kept to be iter/10000 and file identifier to be runif(1)) }
192
+\item[verbose]{Boolean which specifies the amount of output to the user about the progress of the program. (optional; default=TRUE)}
193
+\item[keepChain]{Boolean which specifies if chain values of ${{\bf{A}}}$ and ${{\bf{P}}}$ are saved in outputDir (optional; default=FALSE).}
194
+\end{description}
195
+
196
+\par \noindent \textbf{\underline{List Items in Function Output}}
197
+\begin{description}
198
+\item[D]{Microarray data matrix.}
199
+\item[Sigma]{Data matrix with uncertainty of D.}
200
+\item[Amean]{Sampled mean value of the amplitude matrix ${{\bf{A}}}$.}
201
+\item[Asd]{Sampled standard deviation of the amplitude matrix ${{\bf{A}}}$.}
202
+\item[Pmean]{Sampled mean value of the pattern matrix ${\bf{P}}$.}
203
+\item[Psd]{Sampled standard deviation of the pattern matrix ${\bf{P}}$.}
204
+\item[meanMock]{Mock data obtained from matrix decomposition for sampled mean values (= ${\bf{Amean}}{\bf{Pmean}}$).}
205
+\item[meanChi2]{${\chi^2}$ value for the sampled mean values (${\bf{Amean}}$ and ${\bf{Pmean}}$) of the matrix decomposition.}
206
+\end{description}
207
+  
208
+\par \noindent \textbf{\underline{Side Effects}}
209
+\begin{itemize}
210
+  \item Creates the folder \texttt{outputDir} in which to put the results.
211
+  \item Creates diagnostic files with $\chi^{2}$ and the number of atoms in \texttt{outputDir}.
212
+  \item Creates files containing the mean and standard deviation of ${\bf{A}}$ and ${\bf{P}}$ estimated with MCMC in \texttt{outputDir}.
213
+  \item Creates files with values of ${\bf{A}}$ and ${\bf{P}}$ from the MCMC chain in \texttt{outputDir} if the input parameter \texttt{keepChain} is true.
214
+\end{itemize}
215
+
216
+\par Once the GAPS algorithm has been run, the inferred patterns and corresponding amplitudes can be displayed using the \texttt{plotGAPS} function as follows:
217
+\begin{verbatim}
218
+> plotGAPS(A, P, outputPDF="")
219
+\end{verbatim}
220
+
221
+\par \noindent \textbf{\underline{Input Arguments}}
222
+\begin{description}
223
+\item[A]{The amplitude matrix ${\bf{Amean}}$ obtained from GAPS.}
224
+\item[P]{The pattern matrix ${\bf{Pmean}}$ obtained from GAPS.}
225
+\item[outputPDF]{Name of a \texttt{pdf} file to which the results will be output.  (Optional; default="" will output plots to the screen.)}
226
+\end{description}
227
+
228
+\par \noindent \textbf{\underline{Side Effects}}
229
+\begin{itemize}
230
+  \item Saves the plots of ${\bf{Amean}}$ and ${\bf{Pmean}}$ to the \texttt{pdf} file \texttt{outputPDF}.
231
+\end{itemize}
232
+
233
+\subsection{Example}
234
+
235
+\par In this example, we perform the GAPS matrix decomposition on a simulated data set with known underlying patterns (ModSim) as follows.
236
+
237
+<<label=GAPSModSim, echo=TRUE, include=FALSE, cache=TRUE>>=
238
+library('CoGAPS')
239
+data('ModSim')
240
+nIter <- 500000
241
+results <- GAPS(data=ModSim.D, unc=0.01, isPercentError=FALSE,
242
+                numPatterns=3, SAIter=2*nIter, iter = nIter,
243
+                outputDir='ModSimResults')
244
+plotGAPS(results$Amean, results$Pmean, 'ModSimFigs')
245
+message('Deleting analysis results from GAPS for Vignette')
246
+unlink('ModSimResults', recursive=T)
247
+@ 
248
+Figure \ref{fig:ModSim} shows the results from plotting the GAPS estimates of ${\bf{A}}$ and ${\bf{P}}$ using \texttt{plotGAPS}, which has a fit to ${\bf{D}}$ of $\chi^{2}=\Sexpr{results$meanChi2}$.  
249
+\begin{figure}[ht]
250
+  \begin{center}
251
+    \subfigure[Inferred amplitude matrix]{
252
+      \includegraphics[width=0.45\linewidth]{ModSimFigs_Amplitude}
253
+    }
254
+    \subfigure[Inferred patterns]{
255
+      \includegraphics[width=0.45\linewidth]{ModSimFigs_Patterns}
256
+    }
257
+  \end{center}
258
+  \caption{Results from GAPS on a simulated data set with known true patterns.}
259
+  \label{fig:ModSim}
260
+\end{figure}
261
+Figure \ref{fig:ModSimPtrue} displays the true patterns used to create the ModSim data, stored in \texttt{ModSim.P.true}.
262
+\begin{figure}[ht]
263
+  \begin{center}
264
+<<label=ModSimPtrue, fig=TRUE, echo=FALSE, cache=TRUE>>=
265
+arrayIdx <- 1:ncol(ModSim.P.true)
266
+matplot(arrayIdx, t(ModSim.P.true), type='l', lwd=10)
267
+@ 
268
+  \end{center}
269
+  \caption{Known true patterns used to generate ModSim data.}
270
+  \label{fig:ModSimPtrue}
271
+\end{figure}
272
+
273
+
274
+\section{CoGAPS} \label{CoGAPSRun}
275
+
276
+\par CoGAPS infers coordinated activity in gene sets active in each row of the pattern matrix ${\bf{P}}$ found by GAPS.  Specifically, CoGAPS computes a $Z$-score based statistic, developed in \cite{Ochs2009}, on each column of the ${\bf{A}}$ matrix.  The resulting $Z$-score for pattern $p$ and gene set $i$, $\mathcal{G}_{i}$, with $G$ elements is given by
277
+\begin{equation}
278
+Z_{i,p} = \frac{1}{G} \sum\limits_{g \in \mathcal{G}_{i}} {\frac{{\bf{A}}_{gp}}{{\bf{Asd}}_{gp}}}
279
+\label{eq:avgZ}
280
+\end{equation}
281
+where $g$ indexes the genes in the set and ${\bf{Asd}}_{gp}$ is the standard deviation of ${\bf{A}}_{gp}$ obtained from the MCMC sampling in GAPS.  CoGAPS then uses permutation tests on randomly sampled gene sets to convert the $Z$-scores from eq. (\ref{eq:avgZ}) into $p$ values for each gene set.
282
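+
+\par As an illustration of eq. (\ref{eq:avgZ}) only (the package computes this statistic, together with the permutation-based $p$ values, in the \texttt{calcCoGAPSStat} function described below), the $Z$-score for a single hypothetical gene set in one pattern could be computed from the GAPS output as in the following minimal sketch, which assumes that \texttt{Amean} and \texttt{Asd} are matrices with gene names as row names:
+\begin{verbatim}
+> geneSet <- c("gene1", "gene2", "gene3")  # hypothetical gene set
+> p <- 1                                   # pattern (column of A) of interest
+> Z <- mean(Amean[geneSet, p] / Asd[geneSet, p])
+\end{verbatim}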
+
283
+\par The CoGAPS algorithm is run by calling the \texttt{CoGAPS} function in the CoGAPS R package as follows:
284
+\begin{verbatim}
285
+> CoGAPS(data, unc, GStoGenes, outputDir, outputBase="", sep="\t", 
286
+         isPercentError=FALSE, numPatterns, MaxAtomsA=2^32, alphaA=0.01, 
287
+         MaxAtomsP=2^32, alphaP=0.01, SAIter=1000000000, iter = 500000000, 
288
+         thin=-1, nPerm=500, verbose=TRUE, plot=FALSE, keepChain=FALSE)
289
+\end{verbatim}
290
+
291
+\par \noindent \textbf{\underline{Input Arguments}}
292
+\begin{description}
293
+\item[$\ldots$]{Input arguments from GAPS.}
294
+\item[GStoGenes]{List or data frame containing the genes in each gene set. If a list, gene set names are the list names and corresponding elements are the names of genes contained in each set. If a data frame, gene set names are in the first column and corresponding gene names are listed in rows beneath each gene set name. (A minimal example of the list format is sketched after these argument descriptions.)}
295
+\item[nPerm]{Number of permutations used for the null distribution in the gene set statistic. (optional; default=500).}
296
+\item[plot]{Boolean specifying whether to use \texttt{plotGAPS} to plot results from the run of \texttt{GAPS} within \texttt{CoGAPS} (optional; default=FALSE).}
297
+\end{description}
298
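+
+\par As a minimal illustration of the list format for \texttt{GStoGenes} described above, the following sketch builds two hypothetical gene sets (the set and gene names are placeholders only):
+\begin{verbatim}
+> gs <- list(set1=c("gene1", "gene2"),
+             set2=c("gene3", "gene4", "gene5"))
+\end{verbatim}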
+  
299
+\par \noindent \textbf{\underline{List Items in Function Output}}
300
+\begin{description}
301
+\item[$\ldots$]{Output list from GAPS.}
302
+\item[GSUpreg]{p-values for upregulation of each gene set in each pattern.}
303
+\item[GSDownreg]{p-values for downregulation of each gene set in each pattern.}
304
+\item[GSActEst]{p-values for activity of each gene set in each pattern.}
305
+\end{description}
306
+
307
+\par \noindent \textbf{\underline{Side Effects}}
308
+\begin{itemize}
309
+  \item Side effects from the \texttt{GAPS} algorithm.
310
+  \item Creates files containing \texttt{GSUpreg}, \texttt{GSDownreg}, and \texttt{GSActEst} in \texttt{outputDir}.
311
+\end{itemize}
312
+
313
+\par The CoGAPS algorithm can also be run manually by first running the GAPS algorithm described in Section \ref{GAPSRun} and then calling the function \texttt{calcCoGAPSStat} as follows:
314
+\begin{verbatim}
315
+> calcCoGAPSStat(Amean, Asd, GStoGenes, numPerm=500)
316
+\end{verbatim}
317
+The input arguments for \texttt{calcCoGAPSStat} are as described in the previous sections.  This function will output a list containing \texttt{GSUpreg}, \texttt{GSDownreg}, and \texttt{GSActEst}.
318
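+
+\par For example, a manual two-step analysis equivalent to the simulated gene set example below is sketched here, using the gene sets \texttt{gs} provided with the \texttt{EasySimGS} data (the output directory name is an arbitrary placeholder):
+\begin{verbatim}
+> library('CoGAPS')
+> data('EasySimGS')
+> nIter <- 5e+05
+> results <- GAPS(data=DGS, unc=0.01, isPercentError=FALSE,
+                  numPatterns=3, SAIter=2*nIter, iter=nIter,
+                  outputDir='GSResultsManual')
+> GSstat <- calcCoGAPSStat(results$Amean, results$Asd,
+                           GStoGenes=gs, numPerm=500)
+\end{verbatim}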
+
319
+\subsection{Examples}
320
+
321
+\subsubsection{Simulated data}
322
+
323
+\par In this example, we use simulated data in EasySimGS (DGS) with three known patterns (PGS) and corresponding amplitudes (AGS), with specified activity in two gene sets (gs).  In this data set, each gene set is overexpressed in one of the simulated patterns and underexpressed in another.
324
+
325
+<<label=SimCoGAPS,echo=TRUE>>=
326
+library('CoGAPS')
327
+data('EasySimGS')
328
+nIter <- 5e+05 
329
+results <- CoGAPS(data=DGS, unc=0.01, isPercentError=FALSE,
330
+                  GStoGenes=gs,
331
+                  numPatterns=3,
332
+                  SAIter = 2*nIter, iter = nIter,
333
+                  outputDir='GSResults', plot=FALSE)
334
+plotGAPS(results$Amean, results$Pmean, 'GSFigs')
335
+message('Deleting analysis results from CoGAPS for Vignette')
336
+unlink('GSResults', recursive=T)
337
+@
338
+
339
+Figure \ref{fig:GS} shows the results from plotting the CoGAPS estimates of ${\bf{A}}$ and ${\bf{P}}$ for this simulated gene set data using \texttt{plotGAPS}.
340
+\begin{figure}[ht]
341
+\begin{center}
342
+\subfigure[Inferred amplitude matrix]{
343
+\includegraphics[width=0.45\linewidth]{GSFigs_Amplitude}
344
+}
345
+\subfigure[Inferred patterns]{
346
+\includegraphics[width=0.45\linewidth]{GSFigs_Patterns}
347
+}
348
+\end{center}
349
+\caption{Results from GAPS on the simulated gene set data.}
350
+\label{fig:GS}
351
+\end{figure}
352
+
353
+\par Moreover, estimates of the gene set activity are provided in \texttt{results\$GSActEst}, with $p$-values for upregulation in \texttt{results\$GSUpreg} and for downregulation in \texttt{results\$GSDownreg}.
354
+
355
+\subsubsection{GIST data}
356
+
357
+\par We also provide the code that would be used for the CoGAPS analysis of GIST data (GIST\_TS\_20084) with gene sets defined by transcription factors (TFGSList), as in the DESIDE analysis of \cite{Ochs2009}.  To enable quick package installation, we do not evaluate this code in the vignette, but leave it for the user to compare to the results of \cite{Ochs2009}.
358
+<<eval=FALSE>>=
359
+library('CoGAPS')
360
+data('GIST_TS_20084')
361
+data('TFGSList')
362
+nIter <- 50000000 
363
+results <- CoGAPS(data=GIST.D, unc=GIST.S,
364
+                  GStoGenes=tf2ugFC,
365
+                  numPatterns=5,
366
+                  SAIter = 2*nIter, iter = nIter,
367
+                  outputDir='GISTResults', plot=FALSE)
368
+plotGAPS(results$Amean, results$Pmean, 'GISTFigs')
369
+message('Deleting analysis results from CoGAPS for Vignette')
370
+unlink('GISTResults', recursive=T)
371
+@  
372
+
373
+\chapter{Feedback}
374
+
375
+\par Please send feedback to Elana Fertig \texttt{ejfertig@jhmi.edu} or Michael Ochs \texttt{mfo@jhu.edu}.
376
+
377
+\par If you want to send a bug report, please make it reproducible: send the data, describe what you think should happen, and describe what actually happened.
378
+
379
+\chapter{Acknowledgments}
380
+
381
+\par We would like to thank Martyn Plummer for the JAGS package and speedy feedback to bugs which facilitated our development of the GAPS-JAGS software.  Special thanks to paper co-authors Jie Ding, Alexander V. Favorov, and Giovanni Parmigiani for statistical advice in developing the GAPS / CoGAPS algorithms.  Additional thanks to Simina M. Boca, Ludmila V. Danilova, Jeffrey Leek, and Svitlana Tyekcheva for their useful feedback.
382
+
383
+\par This work was funded by NLM LM009382 and NSF Grant 0342111. 
384
+
385
+\bibliographystyle{plain}
386
+\bibliography{AppNote}
387
+
388
+\end{document}
0 389
new file mode 100644
1 390
Binary files /dev/null and b/inst/doc/CoGAPSUsersManual.pdf differ
2 391
new file mode 100644
... ...
@@ -0,0 +1,264 @@
1
+% -*- LaTeX -*-
2
+%%% ====================================================================
3
+%%%  @LaTeX-style-file{
4
+%%%     author          = "Glenn Paulley",
5
+%%%     version         = "4",
6
+%%%     date            = "31 August 1992",
7
+%%%     time            = "09:42:44 199",
8
+%%%     filename        = "chicago.sty",
9
+%%%     address         = "Data Structuring Group
10
+%%%                        Department of Computer Science
11
+%%%                        University of Waterloo
12
+%%%                        Waterloo, Ontario, Canada
13
+%%%                        N2L 3G1",
14
+%%%     telephone       = "(519) 885-1211",
15
+%%%     FAX             = "(519) 885-1208",
16
+%%%     checksum        = "44674 264 1050 10394",
17
+%%%     email           = "gnpaulle@bluebox.uwaterloo.ca",
18
+%%%     codetable       = "ISO/ASCII",
19
+%%%     keywords        = "",
20
+%%%     supported       = "yes",
21
+%%%     abstract        = "Contains the LaTeX style command definitions
22
+%%%                        for the Chicago BibTeX styles chicago.bst and
23
+%%%                        chicagoa.bst. For details, see below.",
24
+%%%     docstring       = "The checksum field above contains a CRC-16
25
+%%%                        checksum as the first value, followed by the
26
+%%%                        equivalent of the standard UNIX wc (word
27
+%%%                        count) utility output of lines, words, and
28
+%%%                        characters.  This is produced by Robert
29
+%%%                        Solovay's checksum utility.",
30
+%%%  }
31
+%%% ====================================================================
32
+%
33
+% chicago.sty: Style file for use with bibtex style chicago.bst, for
34
+% bibliographies formatted according to the 13th Edition of the Chicago
35
+% Manual of Style.
36
+%
37
+% 'newapa.bst' was made from 'plain.bst', 'named.bst', and 'apalike.bst',
38
+% with lots of tweaking to make it look like APA style, along with tips
39
+% from Young Ryu and Brian Reiser's modifications of 'apalike.bst'.
40
+% newapa.sty formed the basis of this style, chicago.sty. Author-date
41
+% references in newapa.bst formed the basis for chicago.bst. Chicagoa.bst
42
+% supports annotations.
43
+%
44
+% Version 4 (August, 1992):
45
+% - fixed chicago.bst and chicagoa.bst to handle long author lists in
46
+%   sorting
47
+% - fixed chicago.bst and chicagoa.bst so that missing page numbers in
48
+%   ``article'' entries are handled correctly
49
+% - modified chicago.sty to format entries with 2nd and subsequent lines
50
+%   indented.
51
+%
52
+%   Citation format: (author-last-name year)
53
+%             (author-last-name and author-last-name year)
54
+%             (author-last-name et al. year)
55
+%             (author-last-name)
56
+%             author-last-name
57
+%             author-last-name (year)
58
+%             (author-last-name and author-last-name)
59
+%             (author-last-name et al.)
60
+%             (year) or (year,year)
61
+%             year or year,year
62
+%
63
+%   Reference list ordering: alphabetical by author or whatever passes
64
+%    for author in the absence of one.
65
+%
66
+% This BibTeX style has support for abbreviated author lists and for
67
+%    year-only citations.  This is done by having the citations
68
+%    actually look like
69
+%
70
+%    \citeauthoryear{full-author-info}{abbrev-author-info}{year}
71
+%
72
+% The LaTeX style has to have the following (or similar)
73
+%
74
+%     \let\@internalcite\cite
75
+%     \def\fullcite{\def\citeauthoryear##1##2##3{##1, ##3}\@internalcite}
76
+%     \def\fullciteA{\def\citeauthoryear##1##2##3{##1}\@internalcite}
77
+%     \def\shortcite{\def\citeauthoryear##1##2##3{##2, ##3}\@internalcite}
78
+%     \def\shortciteA{\def\citeauthoryear##1##2##3{##2}\@internalcite}
79
+%     \def\citeyear{\def\citeauthoryear##1##2##3{##3}\@internalcite}
80
+%
81
+% -------------------------------------------------------------------------
82
+% This file implements citations for the ``chicago'' bibliography style.
83
+%  Place it in a file called chicago.sty in the TeX search path.
84
+%(Placing it in the same directory as the LaTeX document should also work.)
85
+%
86
+%    This file is a modification of the ``newapa'' LaTeX style,
87
+%    originally adapted by Steven Spencer from the ``apalike'' LaTeX style.
88
+%    It was originally modified by Stephen N. Spencer, with further
89
+%    modifications by Young U. Ryu.
90
+%
91
+% The ``chicago'' BibTeX bibliography style creates citations with labels:
92
+%       \citeauthoryear{author-info}{abbrev. author-info}{year}
93
+%
94
+% These labels are processed by the following LaTeX commands:
95
+%
96
+%  \cite{key}
97
+%    which produces citations with full author list and year.
98
+%    eg. (Brown 1978; Jarke, Turner, Stohl, et al. 1985)
99
+%  \citeNP{key}
100
+%    which produces citations with full author list and year, but without
101
+%    enclosing parentheses:
102
+%    eg. Brown 1978; Jarke, Turner and Stohl 1985
103
+%  \citeA{key}
104
+%    which produces citations with only the full author list.
105
+%    eg. (Brown; Jarke, Turner and Stohl)
106
+%  \citeANP{key}
107
+%    which produces citations with only the full author list, without
108
+%    parentheses eg. Brown; Jarke, Turner and Stohl
109
+%  \citeN{key}
110
+%    which produces citations with the full author list and year, but
111
+%    can be used as nouns in a sentence; no parentheses appear around
112
+%    the author names, but only around the year.
113
+%      eg. Shneiderman (1978) states that......
114
+%    \citeN should only be used for a single citation.
115
+%  \shortcite{key}
116
+%    which produces citations with abbreviated author list and year.
117
+%  \shortciteNP{key}
118
+%    which produces citations with abbreviated author list and year.
119
+%  \shortciteA{key}
120
+%    which produces only the abbreviated author list.
121
+%  \shortciteANP{key}
122
+%    which produces only the abbreviated author list.
123
+%  \shortciteN{key}
124
+%    which produces the abbreviated author list and year, with only the
125
+%    year in parentheses. Use with only one citation.
126
+%  \citeyear{key}
127
+%    which produces the year information only, within parentheses.
128
+%  \citeyearNP{key}
129
+%    which produces the year information only.
130
+%
131
+% Abbreviated author lists use the ``et al.'' construct.
132
+%
133
+% `NP' means `no parentheses'.
134
+%
135
+% This LaTeX style file must be used with the ``chicago'' or ``chicagoa''
136
+% (annotated chicago style) BibTeX styles.
137
+%
138
+\typeout{Using Chicago Manual of Style bibliography: 31 August 1992}
139
+%
140
+% -------------------------------------------------------------------------
141
+%
142
+% Citation macros.
143
+%
144
+\let\@internalcite\cite
145
+%
146
+\def\cite{\def\@citeseppen{-1000}%
147
+    \def\@cite##1##2{(##1\if@tempswa , ##2\fi)}%
148
+    \def\citeauthoryear##1##2##3{##1 ##3}\@internalcite}
149
+\def\citeNP{\def\@citeseppen{-1000}%
150
+    \def\@cite##1##2{##1\if@tempswa , ##2\fi}%
151
+    \def\citeauthoryear##1##2##3{##1 ##3}\@internalcite}
152
+\def\citeN{\def\@citeseppen{-1000}%
153
+    \def\@cite##1##2{##1\if@tempswa , ##2)\else{)}\fi}%
154
+    \def\citeauthoryear##1##2##3{##1 (##3}\@citedata}
155
+\def\citeA{\def\@citeseppen{-1000}%
156
+    \def\@cite##1##2{(##1\if@tempswa , ##2\fi)}%
157
+    \def\citeauthoryear##1##2##3{##1}\@internalcite}
158
+\def\citeANP{\def\@citeseppen{-1000}%
159
+    \def\@cite##1##2{##1\if@tempswa , ##2\fi}%
160
+    \def\citeauthoryear##1##2##3{##1}\@internalcite}
161
+%
162
+\def\shortcite{\def\@citeseppen{-1000}%
163
+    \def\@cite##1##2{(##1\if@tempswa , ##2\fi)}%
164
+    \def\citeauthoryear##1##2##3{##2 ##3}\@internalcite}
165
+\def\shortciteNP{\def\@citeseppen{-1000}%
166
+    \def\@cite##1##2{##1\if@tempswa , ##2\fi}%
167
+    \def\citeauthoryear##1##2##3{##2 ##3}\@internalcite}