Browse code

vignette coming together

Tom Sherman authored on 01/08/2018 20:53:40
Showing26 changed files

... ...
@@ -23,7 +23,7 @@ exportClasses(CogapsResult)
23 23
 importClassesFrom(S4Vectors,Annotated)
24 24
 importClassesFrom(S4Vectors,character_OR_NULL)
25 25
 importClassesFrom(SingleCellExperiment,LinearEmbeddingMatrix)
26
-importFrom(BiocParallel,SnowParam)
26
+importFrom(BiocParallel,MulticoreParam)
27 27
 importFrom(BiocParallel,bplapply)
28 28
 importFrom(Rcpp,evalCpp)
29 29
 importFrom(SummarizedExperiment,assay)
... ...
@@ -85,6 +85,11 @@ checkpointInFile=NULL, transposeData=FALSE, ...)
85 85
     if (!is(data, "character"))
86 86
         checkDataMatrix(data, uncertainty, params)
87 87
 
88
+    # check single cell parameter
89
+    if (!is.null(allParams$gaps@distributed))
90
+        if (allParams$gaps@distributed == "single-cell" & !allParams$gaps@singleCell)
91
+            warning("running single-cell CoGAPS with singleCell=FALSE")
92
+
88 93
     # convert data to matrix
89 94
     if (is(data, "data.frame"))
90 95
         data <- data.matrix(data)
... ...
@@ -132,7 +137,7 @@ checkpointInFile=NULL, transposeData=FALSE, ...)
132 137
 #' @inheritParams CoGAPS
133 138
 #' @return CogapsResult object
134 139
 #' @importFrom methods new
135
-scCoGAPS <- function(data, params=new("CogapsParams"), nThreads=NULL,
140
+scCoGAPS <- function(data, params=new("CogapsParams"), nThreads=1,
136 141
 messages=TRUE, outputFrequency=500, uncertainty=NULL,
137 142
 checkpointOutFile="gaps_checkpoint.out", checkpointInterval=1000,
138 143
 checkpointInFile=NULL, transposeData=FALSE, ...)
... ...
@@ -151,7 +156,7 @@ checkpointInFile=NULL, transposeData=FALSE, ...)
151 156
 #' @inheritParams CoGAPS
152 157
 #' @return CogapsResult object
153 158
 #' @importFrom methods new
154
-GWCoGAPS <- function(data, params=new("CogapsParams"), nThreads=NULL,
159
+GWCoGAPS <- function(data, params=new("CogapsParams"), nThreads=1,
155 160
 messages=TRUE, outputFrequency=500, uncertainty=NULL,
156 161
 checkpointOutFile="gaps_checkpoint.out", checkpointInterval=1000,
157 162
 checkpointInFile=NULL, transposeData=FALSE, ...)
... ...
@@ -7,8 +7,8 @@
7 7
 #' @param allParams list of all parameters used in computation
8 8
 #' @param uncertainty uncertainty matrix (same supported types as data)
9 9
 #' @return list
10
-#' @importFrom BiocParallel bplapply SnowParam
11
-distributedCogaps <- function(data, allParams, uncertainty)
10
+#' @importFrom BiocParallel bplapply MulticoreParam
11
+distributedCogaps <- function(data, allParams, uncertainty, BPPARAM=NULL)
12 12
 {
13 13
     FUN <- function(set, data, allParams, uncertainty, fixedMatrix=NULL)
14 14
     {
... ...
@@ -21,10 +21,11 @@ distributedCogaps <- function(data, allParams, uncertainty)
21 21
     # randomly sample either rows or columns into subsets to break the data up
22 22
     set.seed(allParams$gaps@seed)
23 23
     sets <- createSets(data, allParams)
24
-    snow <- SnowParam(workers=length(sets), type="SOCK")
25
-
24
+    if (is.null(BPPARAM))
25
+        BPPARAM <- BiocParallel::MulticoreParam(workers=length(sets))
26
+    
26 27
     # run Cogaps normally on each subset of the data
27
-    initialResult <- bplapply(sets, FUN, BPPARAM=snow, data=data,
28
+    initialResult <- bplapply(sets, FUN, BPPARAM=BPPARAM, data=data,
28 29
         allParams=allParams, uncertainty=uncertainty)
29 30
 
30 31
     # match patterns in either A or P matrix
... ...
@@ -36,7 +37,7 @@ distributedCogaps <- function(data, allParams, uncertainty)
36 37
         == "genome-wide", "P", "A")
37 38
 
38 39
     # ru final phase with fixed matrix
39
-    finalResult <- bplapply(sets, FUN, data=data, BPPARAM=snow,
40
+    finalResult <- bplapply(sets, FUN, BPPARAM=BPPARAM, data=data, 
40 41
         allParams=allParams, uncertainty=uncertainty, fixedMatrix=consensusMatrix)
41 42
 
42 43
     # get result 
... ...
@@ -27,26 +27,8 @@ NULL
27 27
 
28 28
 #' Simulated data
29 29
 #' @docType data
30
-#' @name SimpSim.D
31
-#' @usage SimpSim.D
32
-NULL
33
-
34
-#' Simulated data
35
-#' @docType data
36
-#' @name SimpSim.S
37
-#' @usage SimpSim.S
38
-NULL
39
-
40
-#' Simulated data
41
-#' @docType data
42
-#' @name SimpSim.A
43
-#' @usage SimpSim.A
44
-NULL
45
-
46
-#' Simulated data
47
-#' @docType data
48
-#' @name SimpSim.P
49
-#' @usage SimpSim.P
30
+#' @name SimpSim.data
31
+#' @usage SimpSim.data
50 32
 NULL
51 33
 
52 34
 #' CoGAPS result from running on simulated data
... ...
@@ -57,20 +39,14 @@ NULL
57 39
 
58 40
 #' Sample GIST gene expression data from Ochs et al. (2009)
59 41
 #' @docType data
60
-#' @name GIST.D
61
-#' @usage GIST.D
42
+#' @name GIST.data_frame
43
+#' @usage GIST.data_frame
62 44
 NULL
63 45
 
64 46
 #' Sample GIST gene expression data from Ochs et al. (2009)
65 47
 #' @docType data
66
-#' @name GIST.S
67
-#' @usage GIST.S
68
-NULL
69
-
70
-#' Simulated dataset to quantify gene set membership in the GIST dataset 
71
-#' @docType data
72
-#' @name GIST.GeneSets
73
-#' @usage GIST.GeneSets
48
+#' @name GIST.matrix
49
+#' @usage GIST.matrix
74 50
 NULL
75 51
 
76 52
 #' CoGAPS result from running on GIST dataset
... ...
@@ -79,12 +55,6 @@ NULL
79 55
 #' @usage GIST.result
80 56
 NULL
81 57
 
82
-#' Simulated dataset to quantify gene set membership.
83
-#' @docType data
84
-#' @name GSets
85
-#' @usage GSets
86
-NULL
87
-
88 58
 #' Gene sets defined by transcription factors defined from TRANSFAC.
89 59
 #' @docType data
90 60
 #' @name tf2ugFC
... ...
@@ -79,6 +79,8 @@ setValidity("CogapsParams",
79 79
             "random seed must be an integer greater than zero"
80 80
         if (object@minNS <= 1 | object@minNS %% 1 != 0)
81 81
             "minNS must be an integer greater than one"
82
+        if (object@nSets <= 1 | object@nSets %% 1 != 0)
83
+            "minNS must be an integer greater than one"
82 84
     }
83 85
 )
84 86
 
... ...
@@ -104,14 +106,16 @@ setGeneric("setParam", function(object, whichParam, value)
104 106
 #'
105 107
 #' @description these parameters  are interrelated so they must be set together
106 108
 #' @param object an object of type CogapsParams
107
-#' @param cut a distributed CoGAPS parameter 
108
-#' @param minNS a distributed CoGAPS parameter 
109
-#' @param maxNS a distributed CoGAPS parameter 
109
+#' @param nSets number of sets to break data into
110
+#' @param cut number of branches at which to cut dendrogram used in
111
+#' pattern matching
112
+#' @param minNS minimum of individual set contributions a cluster must contain
113
+#' @param maxNS maximum of individual set contributions a cluster can contain
110 114
 #' @return the modified params object
111 115
 #' @examples
112 116
 #'  params <- new("CogapsParams")
113 117
 #'  params <- setDistributedParams(3, 2, 4)
114
-setGeneric("setDistributedParams", function(object, cut=NULL,
118
+setGeneric("setDistributedParams", function(object, nSets, cut=NULL,
115 119
 minNS=NULL, maxNS=NULL)
116 120
     {standardGeneric("setDistributedParams")})
117 121
 
... ...
@@ -57,7 +57,7 @@ function(object, whichParam, value)
57 57
         object@maxGibbsMassA <- value
58 58
         object@maxGibbsMassP <- value
59 59
     }
60
-    else if (whichParam %in% c("cut", "minNS", "maxNS"))
60
+    else if (whichParam %in% c("nSets", "cut", "minNS", "maxNS"))
61 61
     {
62 62
         stop("please set this parameter with setDistributedParams")
63 63
     }
... ...
@@ -73,19 +73,15 @@ function(object, whichParam, value)
73 73
 #' @aliases setDistributedParams
74 74
 #' @importFrom methods slot
75 75
 setMethod("setDistributedParams", signature(object="CogapsParams"),
76
-function(object, cut, minNS, maxNS)
76
+function(object, nSets, cut, minNS, maxNS)
77 77
 {
78
-    object@cut <- cut
79
-    object@minNS <- minNS
80
-    object@maxNS <- maxNS
78
+    object@nSets <- nSets
81 79
 
82
-    if (is.null(object@cut))
83
-        object@cut <- object@nPatterns
84
-    if (is.null(object@minNS))
85
-        object@minNS <- ceiling(object@nSets / 2)
86
-    if (is.null(object@maxNS))
87
-        object@maxNS <- object@minNS + object@nSets
80
+    object@cut <- ifelse(is.null(cut), object@nPatterns, cut)
81
+    object@minNS <- ifelse(is.null(minNS), ceiling(object@nSets / 2), minNS)
82
+    object@maxNS <- ifelse(is.null(maxNS), object@minNS + object@nSets, maxNS)
88 83
 
84
+    validObject(object)
89 85
     return(object)
90 86
 })
91 87
 
... ...
@@ -15,18 +15,18 @@ function(object)
15 15
 #' @importFrom grDevices rainbow
16 16
 plot.CogapsResult <- function(x, ...)
17 17
 {
18
-    nSamples <- nrow(object@sampleFactors)
19
-    nFactors <- ncol(object@sampleFactors)
18
+    nSamples <- nrow(x@sampleFactors)
19
+    nFactors <- ncol(x@sampleFactors)
20 20
     colors <- rainbow(nFactors)
21 21
     xlimits <- c(0, nSamples + 1)
22
-    ylimits <- c(0, max(object@sampleFactors) * 1.1)
22
+    ylimits <- c(0, max(x@sampleFactors) * 1.1)
23 23
 
24 24
     plot(NULL, xlim=xlimits, ylim=ylimits, ylab="Relative Amplitude")
25 25
 
26 26
     for (i in 1:nFactors)
27 27
     {
28
-        lines(x=1:nSamples, y=object@sampleFactors[,i], col=colors[i])
29
-        points(x=1:nSamples, y=object@sampleFactors[,i], col=colors[i], pch=i)
28
+        lines(x=1:nSamples, y=x@sampleFactors[,i], col=colors[i])
29
+        points(x=1:nSamples, y=x@sampleFactors[,i], col=colors[i], pch=i)
30 30
     }
31 31
 
32 32
     legend("top", paste("Pattern", 1:nFactors, sep = ""), pch = 1:nFactors,
33 33
Binary files a/data/GIST.RData and b/data/GIST.RData differ
34 34
Binary files a/data/SimpSim.RData and b/data/SimpSim.RData differ
35 35
deleted file mode 100644
... ...
@@ -1,12 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/Package.R
3
-\docType{data}
4
-\name{GIST.GeneSets}
5
-\alias{GIST.GeneSets}
6
-\title{Simulated dataset to quantify gene set membership in the GIST dataset}
7
-\usage{
8
-GIST.GeneSets
9
-}
10
-\description{
11
-Simulated dataset to quantify gene set membership in the GIST dataset
12
-}
13 0
similarity index 79%
14 1
rename from man/GIST.S.Rd
15 2
rename to man/GIST.data_frame.Rd
... ...
@@ -1,11 +1,11 @@
1 1
 % Generated by roxygen2: do not edit by hand
2 2
 % Please edit documentation in R/Package.R
3 3
 \docType{data}
4
-\name{GIST.S}
5
-\alias{GIST.S}
4
+\name{GIST.data_frame}
5
+\alias{GIST.data_frame}
6 6
 \title{Sample GIST gene expression data from Ochs et al. (2009)}
7 7
 \usage{
8
-GIST.S
8
+GIST.data_frame
9 9
 }
10 10
 \description{
11 11
 Sample GIST gene expression data from Ochs et al. (2009)
12 12
similarity index 83%
13 13
rename from man/GIST.D.Rd
14 14
rename to man/GIST.matrix.Rd
... ...
@@ -1,11 +1,11 @@
1 1
 % Generated by roxygen2: do not edit by hand
2 2
 % Please edit documentation in R/Package.R
3 3
 \docType{data}
4
-\name{GIST.D}
5
-\alias{GIST.D}
4
+\name{GIST.matrix}
5
+\alias{GIST.matrix}
6 6
 \title{Sample GIST gene expression data from Ochs et al. (2009)}
7 7
 \usage{
8
-GIST.D
8
+GIST.matrix
9 9
 }
10 10
 \description{
11 11
 Sample GIST gene expression data from Ochs et al. (2009)
12 12
deleted file mode 100644
... ...
@@ -1,12 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/Package.R
3
-\docType{data}
4
-\name{GSets}
5
-\alias{GSets}
6
-\title{Simulated dataset to quantify gene set membership.}
7
-\usage{
8
-GSets
9
-}
10
-\description{
11
-Simulated dataset to quantify gene set membership.
12
-}
13 0
deleted file mode 100644
... ...
@@ -1,12 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/Package.R
3
-\docType{data}
4
-\name{SimpSim.A}
5
-\alias{SimpSim.A}
6
-\title{Simulated data}
7
-\usage{
8
-SimpSim.A
9
-}
10
-\description{
11
-Simulated data
12
-}
13 0
deleted file mode 100644
... ...
@@ -1,12 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/Package.R
3
-\docType{data}
4
-\name{SimpSim.P}
5
-\alias{SimpSim.P}
6
-\title{Simulated data}
7
-\usage{
8
-SimpSim.P
9
-}
10
-\description{
11
-Simulated data
12
-}
13 0
deleted file mode 100644
... ...
@@ -1,12 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/Package.R
3
-\docType{data}
4
-\name{SimpSim.S}
5
-\alias{SimpSim.S}
6
-\title{Simulated data}
7
-\usage{
8
-SimpSim.S
9
-}
10
-\description{
11
-Simulated data
12
-}
13 0
similarity index 75%
14 1
rename from man/SimpSim.D.Rd
15 2
rename to man/SimpSim.data.Rd
... ...
@@ -1,11 +1,11 @@
1 1
 % Generated by roxygen2: do not edit by hand
2 2
 % Please edit documentation in R/Package.R
3 3
 \docType{data}
4
-\name{SimpSim.D}
5
-\alias{SimpSim.D}
4
+\name{SimpSim.data}
5
+\alias{SimpSim.data}
6 6
 \title{Simulated data}
7 7
 \usage{
8
-SimpSim.D
8
+SimpSim.data
9 9
 }
10 10
 \description{
11 11
 Simulated data
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{distributedCogaps}
5 5
 \title{CoGAPS Distributed Matrix Factorization Algorithm}
6 6
 \usage{
7
-distributedCogaps(data, allParams, uncertainty)
7
+distributedCogaps(data, allParams, uncertainty, BPPARAM = NULL)
8 8
 }
9 9
 \arguments{
10 10
 \item{data}{File name or R object (see details for supported types)}
... ...
@@ -6,19 +6,23 @@
6 6
 \alias{setDistributedParams,CogapsParams-method}
7 7
 \title{set the value of parameters for distributed CoGAPS}
8 8
 \usage{
9
-setDistributedParams(object, cut = NULL, minNS = NULL, maxNS = NULL)
9
+setDistributedParams(object, nSets, cut = NULL, minNS = NULL,
10
+  maxNS = NULL)
10 11
 
11
-\S4method{setDistributedParams}{CogapsParams}(object, cut = NULL,
12
+\S4method{setDistributedParams}{CogapsParams}(object, nSets, cut = NULL,
12 13
   minNS = NULL, maxNS = NULL)
13 14
 }
14 15
 \arguments{
15 16
 \item{object}{an object of type CogapsParams}
16 17
 
17
-\item{cut}{a distributed CoGAPS parameter}
18
+\item{nSets}{number of sets to break data into}
18 19
 
19
-\item{minNS}{a distributed CoGAPS parameter}
20
+\item{cut}{number of branches at which to cut dendrogram used in
21
+pattern matching}
20 22
 
21
-\item{maxNS}{a distributed CoGAPS parameter}
23
+\item{minNS}{minimum of individual set contributions a cluster must contain}
24
+
25
+\item{maxNS}{maximum of individual set contributions a cluster can contain}
22 26
 }
23 27
 \value{
24 28
 the modified params object
... ...
@@ -127,7 +127,6 @@ const Rcpp::Nullable<Rcpp::NumericMatrix> &fixedMatrix)
127 127
         {
128 128
             GAPS_ASSERT(!Rf_isNull(allParams["whichMatrixFixed"]));
129 129
             std::string which = Rcpp::as<std::string>(allParams["whichMatrixFixed"]);
130
-            gaps_printf("%s %c\n", which.c_str(), which[0]);
131 130
             runner.setFixedMatrix(which[0], convertRMatrix(Rcpp::NumericMatrix(fixedMatrix), which[0]=='P'));
132 131
         }
133 132
 
... ...
@@ -174,22 +174,63 @@ void GapsRunner::updateSampler(unsigned nA, unsigned nP)
174 174
     }
175 175
 }
176 176
 
177
+// sum coef * log(i) for i = 1 to total, fit coef from number of atoms
178
+// approximates sum of number of atoms (stirling approx to factorial)
179
+// this should be proportional to total number of updates
180
+static double estimatedNumUpdates(double current, double total, float nAtoms)
181
+{
182
+    double coef = nAtoms / std::log(current);
183
+    return coef * std::log(std::sqrt(2 * total * gaps::algo::pi)) +
184
+        total * coef * std::log(total) - total * coef;
185
+}
186
+
187
+double GapsRunner::estimatedPercentComplete() const
188
+{
189
+    double nIter = static_cast<double>(mCurrentIteration);
190
+    double nAtomsA = static_cast<double>(mASampler.nAtoms());
191
+    double nAtomsP = static_cast<double>(mPSampler.nAtoms());
192
+    
193
+    if (mPhase == 'S')
194
+    {
195
+        nIter += mMaxIterations;
196
+    }
197
+
198
+    double totalIter = 2.0 * static_cast<double>(mMaxIterations);
199
+
200
+    double estimatedCompleted = estimatedNumUpdates(nIter, nIter, nAtomsA) + 
201
+        estimatedNumUpdates(nIter, nIter, nAtomsP);
202
+
203
+    double estimatedTotal = estimatedNumUpdates(nIter, totalIter, nAtomsA) + 
204
+        estimatedNumUpdates(nIter, totalIter, nAtomsP);
205
+
206
+    return estimatedCompleted / estimatedTotal;
207
+}
208
+
177 209
 void GapsRunner::displayStatus()
178 210
 {
179 211
     if (mPrintMessages && mOutputFrequency > 0 && ((mCurrentIteration + 1) % mOutputFrequency) == 0)
180 212
     {
181 213
         bpt::time_duration diff = bpt_now() - mStartTime;
182
-        unsigned elapsedSeconds = static_cast<unsigned>(diff.total_seconds());
214
+        double nSecondsCurrent = diff.total_seconds();
215
+        double nSecondsTotal = nSecondsCurrent / estimatedPercentComplete();
216
+
217
+        unsigned elapsedSeconds = static_cast<unsigned>(nSecondsCurrent);
218
+        unsigned totalSeconds = static_cast<unsigned>(nSecondsTotal);
219
+
220
+        unsigned elapsedHours = elapsedSeconds / 3600;
221
+        elapsedSeconds -= elapsedHours * 3600;
222
+        unsigned elapsedMinutes = elapsedSeconds / 60;
223
+        elapsedSeconds -= elapsedMinutes * 60;
183 224
 
184
-        unsigned hours = elapsedSeconds / 3600;
185
-        elapsedSeconds -= hours * 3600;
186
-        unsigned minutes = elapsedSeconds / 60;
187
-        elapsedSeconds -= minutes * 60;
188
-        unsigned seconds = elapsedSeconds;
225
+        unsigned totalHours = totalSeconds / 3600;
226
+        totalSeconds -= totalHours * 3600;
227
+        unsigned totalMinutes = totalSeconds / 60;
228
+        totalSeconds -= totalMinutes * 60;
189 229
 
190
-        gaps_printf("%d of %d, Atoms: %lu(%lu), ChiSq: %.0f, elapsed time: %02d:%02d:%02d\n",
230
+        gaps_printf("%d of %d, Atoms: %lu(%lu), ChiSq: %.0f, Time: %02d:%02d:%02d / %02d:%02d:%02d\n",
191 231
             mCurrentIteration + 1, mMaxIterations, mASampler.nAtoms(),
192
-            mPSampler.nAtoms(), mASampler.chi2(), hours, minutes, seconds);
232
+            mPSampler.nAtoms(), mASampler.chi2(), elapsedHours, elapsedMinutes,
233
+            elapsedSeconds, totalHours, totalMinutes, totalSeconds);
193 234
         gaps_flush();
194 235
     }
195 236
 }
... ...
@@ -61,6 +61,7 @@ private:
61 61
         
62 62
     void runOnePhase();
63 63
     void updateSampler(unsigned nA, unsigned nP);
64
+    double estimatedPercentComplete() const;
64 65
     void displayStatus();
65 66
     void createCheckpoint();
66 67
 
... ...
@@ -156,9 +156,9 @@ inline std::string buildReport()
156 156
 #endif
157 157
 
158 158
 #if defined( __GAPS_AVX__ )
159
-    std::string simd = "AVX enabled\n";
159
+    std::string simd = "SIMD: AVX instructions enabled\n";
160 160
 #elif defined( __GAPS_SSE__ )
161
-    std::string simd = "SSE enabled\n";
161
+    std::string simd = "SIMD: SSE instructions enabled\n";
162 162
 #else
163 163
     std::string simd = "SIMD not enabled\n";
164 164
 #endif
... ...
@@ -1,6 +1,6 @@
1 1
 ---
2 2
 title: "CoGAPS - Coordinated Gene Association in Pattern Sets"
3
-author: "Genevieve Stein-O'Brien, Thomas Sherman, Hyejune Limb, Elana Fertig"
3
+author: "Thomas Sherman, Genevieve Stein-O'Brien, Hyejune Limb, Elana Fertig"
4 4
 date: "`r doc_date()`"
5 5
 bibliography: References.bib
6 6
 vignette: >
... ...
@@ -17,15 +17,23 @@ library(CoGAPS)
17 17
 
18 18
 # Introduction
19 19
 
20
-Coordinated Gene Association in Pattern Sets (CoGAPS) infers underlying patterns
21
-in a matrix of measurements that can be interpreted as arising from the
22
-multiplication of two lower dimensional matrices. 
23
-
24
-The Markov chain Monte Carlo (MCMC) matrix factorization that infers patterns
25
-also infers the extent to which individual genes belong to these patterns. The
26
-CoGAPS algorithm extends GAPS to infer the coordinated activity in sets of genes
27
-for each of the inferred patterns based upon (5) and to refine gene set
28
-membership based upon (2).
20
+Coordinated Gene Association in Pattern Sets (CoGAPS) is a technique for latent
21
+space learning in gene expression data. CoGAPS is a member of the
22
+Nonnegative Matrix Factorization (NMF) class of algorithms. NMFs factorize a
23
+data matrix into two related matrices containing gene weights, the
24
+Amplitude (A) matrix, and sample weights, the Pattern (P) Matrix. Each column
25
+of A or row of P defines a feature and together this set of features defines
26
+the latent space among genes and samples, respectively. In NMF, the values of
27
+the elements in the A and P matrices are constrained to be greater than or
28
+equal to zero. This constraint simultaneously reflects the non-negative nature
29
+of gene expression data and enforces the additive nature of the resulting
30
+feature dimensions, generating solutions that are biologically intuitive to
31
+interpret @SEUNG_1999.
32
+
33
+CoGAPS has two extensions that allow it to scale up to large data sets,
34
+Genome-Wide CoGAPS (GWCoGAPS) and Single-Cell CoGAPS (scCOGAPS). This package
35
+presents a unified R interface for all three methods, with a parallel,
36
+efficient underlying implementation in C++.
29 37
 
30 38
 # Installing CoGAPS
31 39
 
... ...
@@ -37,7 +45,7 @@ source("https://bioconductor.org/biocLite.R")
37 45
 biocLite("CoGAPS")
38 46
 ```
39 47
 
40
-The most up to date version of *CoGAPS* can be installed directly from the 
48
+The most up-to-date version of *CoGAPS* can be installed directly from the 
41 49
 *FertigLab* Github Repository:
42 50
 
43 51
 ```{r eval=FALSE}
... ...
@@ -49,8 +57,8 @@ devtools::install_github("FertigLab/CoGAPS")
49 57
 ```
50 58
 
51 59
 There is also an option to install the development version of *CoGAPS*, 
52
-although this is not guaranteed to be stable and should not be used for
53
-for publications.
60
+while this version has the latest experimental features, it is not guaranteed
61
+to be stable.
54 62
 
55 63
 ```{r eval=FALSE}
56 64
 ## Method 1 using biocLite
... ...
@@ -60,36 +68,48 @@ biocLite("FertigLab/CoGAPS", ref="develop", dependencies = TRUE, build_vignettes
60 68
 devtools::install_github("FertigLab/CoGAPS", ref="develop")
61 69
 ```
62 70
 
63
-# Running CoGAPS
71
+# Package Overview
64 72
 
65
-Here we walk through running *CoGAPS* on the *GIST* data set from Ochs et
66
-al. (2009), which contains gene expression data from gastrointestinal
67
-stromal tumor cell lines treated with Gleevec. The matrix contains bulk gene
68
-expression data with 1363 genes and 9 samples.
73
+We first give a walkthrough of the package features using a simple, simulated
74
+data set. In later sections we provide two example workflows on real data
75
+sets.
69 76
 
70
-## Running with Default Parameters
77
+## Running CoGAPS with Default Parameters
71 78
 
72
-The only required argument to *CoGAPS* is the data set. The simplest call to
73
-*CoGAPS* is the following:
79
+The only required argument to `CoGAPS` is the data set. This can be a `matrix`,
80
+`data.frame`, `SummarizedExperiment`, `SingleCellExperiment` or the path of a
81
+file (`tsv`, `csv`, `mtx`) containing the data.
74 82
 
75
-```{r eval=FALSE}
83
+```{r}
76 84
 # load data
77
-data(GIST)
85
+data(SimpSim)
78 86
 
79 87
 # run CoGAPS
80
-result <- CoGAPS(GIST.D)
88
+CoGAPS(SimpSim.data)
81 89
 ```
82 90
 
91
+While CoGAPS is running it periodically prints status messages. For example,
92
+`20000 of 25000, Atoms: 2932(80), ChiSq: 9728, time: 00:00:29 / 00:01:19`. This
93
+message tells us that CoGAPS is at iteration 20000 out of 25000 for this phase,
94
+and that 29 seconds out of an estimated 1 minute 19 seconds have passed. It 
95
+also tells us the size of the atomic domain which is a core component of the 
96
+algorithm but can be ignored for now. Finally, the ChiSq value tells us how
97
+closely the A and P matrices reconstruct the original data. In general, we want
98
+this value to go down - but it is not a perfect measurment of how well CoGAPS
99
+is finding the biological processes contained in the data. CoGAPS also prints
100
+a message indicating which phase is currently happening. There are two phases
101
+to the algorithm - *Calibration* and *Sampling*.
102
+
83 103
 ## Setting Parameters
84 104
 
85 105
 ### Model Parameters
86 106
 
87
-Most of the time we'll want to set some parameters before running *CoGAPS*.
88
-Parameters are managed with a *CogapsParams* object. This object will
107
+Most of the time we'll want to set some parameters before running CoGAPS.
108
+Parameters are managed with a `CogapsParams` object. This object will
89 109
 store all parameters needed to run CoGAPS and provides a simple interface for
90 110
 viewing and setting the parameter values.
91 111
 
92
-```{r eval=FALSE}
112
+```{r}
93 113
 # create new parameters object
94 114
 params <- new("CogapsParams")
95 115
 
... ...
@@ -105,219 +125,178 @@ getParam(params, "nPatterns")
105 125
 ``` 
106 126
 
107 127
 Once we've created the parameters object we can pass it along with our data to
108
-*CoGAPS*.
128
+`CoGAPS`.
109 129
 
110
-```{r eval=FALSE}
130
+```{r}
111 131
 # run CoGAPS with specified model parameters
112
-result <- CoGAPS(GIST.D, params)
132
+CoGAPS(SimpSim.data, params)
113 133
 ```
114 134
 
115 135
 ### Run Configuration Options
116 136
 
117
-The *CogapsParams* class manages the model parameters - i.e. the parameters
118
-that affect the result. There are also a few parameters that are passed directly
119
-to *CoGAPS* that control things like displaying the status of the run.
137
+The `CogapsParams` class manages the model parameters - i.e. the parameters
138
+that affect the result. There are also a few parameters that are passed
139
+directly to `CoGAPS` that control things like displaying the status of the run.
120 140
 
121
-```{r eval=FALSE}
141
+```{r}
122 142
 # run CoGAPS with specified output frequency
123
-result <- CoGAPS(GIST.D, params, outputFrequency=200)
143
+CoGAPS(SimpSim.data, params, outputFrequency=250)
124 144
 ```
125 145
 
126
-There are several other arguments that are passed directly to *CoGAPS* which
146
+There are several other arguments that are passed directly to `CoGAPS` which
127 147
 are covered in later sections.
128 148
 
129
-# Visualizing Output
149
+## Visualizing Output
130 150
 
131
-*CoGAPS* returns a *CogapsResult* object that can be passed on to the analysis
132
-and plotting functions provided in the package. By default, the *plot* function
133
-displays the patterns across samples.
151
+`CoGAPS` returns a `CogapsResult` object that can be passed on to the analysis
152
+and plotting functions provided in the package. By default, the `plot` function
153
+displays how the patterns vary across the samples. (Note that we pass the 
154
+`nIterations` parameter here directly, this is allowed for any parameters in 
155
+the `CogapsParams` class and will always take precedent over the values given
156
+in `params`).
134 157
 
135
-```{r eval=FALSE}
136
-# show result summary
137
-result
158
+```{r}
159
+# store result
160
+result <- CoGAPS(SimpSim.data, params, nIterations=5000, outputFrequency=2500)
138 161
 
139 162
 # plot CogapsResult object returned from CoGAPS
140 163
 plot(result)
141 164
 ```
142 165
 
143
-## Pattern Markers
144
-
145
-# Selecting Appropiate Number of Patterns
166
+In the example workflows we'll explore some more analysis functions provided in
167
+the package.
146 168
 
147
-Selecting the best value for *nPatterns* is the most difficult part of the
148
-analysis. For starters, there is not one "best" value for the number of
149
-patterns - various numbers of patterns can capture various levels of
150
-granularity in the data. To further complicate the problem, there's not a
151
-clear way to compare runs for different numbers of patterns.
169
+## Running CoGAPS in Parallel
152 170
 
153
-Here we show the simplest approach of selecting dimensionality by plotting the
154
-error and selecting the least number of patterns that sufficiently reduce the
155
-error. We also introduce another way to pass parameters - any parameter in the 
156
-*CogapsParams* class can be passed by name directly to the *CoGAPS* function,
157
-overwriting the value contained in `params`.
171
+Non-Negative Matrix Factorization algorithms typically require long computation
172
+times and CoGAPS is no exception. In order to scale CoGAPS up to the size of
173
+data sets seen in practice we need to take advantage of modern hardware
174
+and parallelize the algorithm.
158 175
 
159
-```{r eval=FALSE}
160
-# define the range of patterns we are searching over
161
-pattern_range <- c(3,5,8)
176
+### Multi-Threaded Parallelization
162 177
 
163
-# run CoGAPS with each value in range
164
-resultList <- lapply(pattern_range, function(p) CoGAPS(GIST.D, params, nPatterns=p, nIterations=3000, outputFrequency=2500))
178
+The simplest way to run CoGAPS in parallel is to provide the `nThreads`
179
+argument to `CoGAPS`. This allows the underlying algorithm to run on multiple
180
+threads and has no effect on the mathematics of the algorithm i.e. this is
181
+still standard CoGAPS. The precise number of threads to use depends on many
182
+things like hardware and data size. The best approach is to play around with
183
+different values and see how it effects the estimated time.
165 184
 
166
-# plot chi-sq values for each run
167
-chisq <- sapply(resultList, function(result) getMeanChiSq(result))
168
-plot(pattern_range, chisq)
185
+```{r}
186
+CoGAPS(SimpSim.data, nIterations=10000, outputFrequency=5000, nThreads=1)
187
+CoGAPS(SimpSim.data, nIterations=10000, outputFrequency=5000, nThreads=4)
169 188
 ```
170 189
 
171
-# Running CoGAPS in Parallel
172
-
173
-## Single Process Parallelization
190
+Note this method relies on CoGAPS being compiled with OpenMP support, use
191
+`buildReport` to check.
174 192
 
175
-*CoGAPS* can be run across multiple cores by setting the *nCores* parameter.
176
-This is different from (Running Distributed CoGAPS)[running-distributed-cogaps].
177
-The algorithm will run in the normal fashion, without splitting up the data.
178
-
179
-```{r eval=FALSE}
180
-params <- setParam(params, "nCores", 4)
193
+```{r}
194
+cat(CoGAPS::buildReport())
181 195
 ```
182 196
 
183
-## Running Distributed CoGAPS
184
-
185
-For extremely large datasets (greater than a few thousand rows or columns), it
186
-is much more efficient to break up the data into random subsets and perform
187
-*CoGAPS* on each subset in parallel - stitching the results back together
188
-at the end. The full explanation of this method can be seen here
189
- (CITE GWCOGAPS).
190
-
191
-To run *CoGAPS* in this distributed way there are two distinct functions.
192
-Genome-Wide CoGAPS - *GWCoGAPS* and Single Cell CoGAPS - *scCoGAPS*. 
193
-*GWCoGAPS* is used to break the genes (rows) into subsets and *scCoGAPS* is
194
-used to break the samples (columns - cells in single cell data) into subsets.
195
-Both methods follow a similiar workflow and the underlying method is 
196
-essentially the same.
197
-
198
-The first step is breaking the data into subsets. This is done as a separate
199
-step where all subsets are then saved back to a file. In the case where the
200
-data set is so large it requires a machine with large memory, this allows the
201
-user to only pay for the machine for enough time to split the data. The actual 
202
-algorithm should use a reasonable amount of memory on each machine during
203
-the distributed computation.
204
-
205
-```{r eval=FALSE}
206
-# splitting data into subsets for GWCoGAPS
207
-gw_sim_name <- createGWCoGAPSSets(as.matrix(GIST.D), as.matrix(GIST.S), nSets=4, "gw_example")
197
+### Distributed CoGAPS (GWCoGAPS/scCoGAPS)
198
+
199
+For large datasets (greater than a few thousand genes or samples) the
200
+multi-threaded parallelization isn't enough. It is more efficient to break up
201
+the data into subsets and perform CoGAPS on each subset in parallel, stitching 
202
+the results back together at the end. The CoGAPS extensions, GWCOGAPS and 
203
+scCoGAPS, each implement a version of this method (CITE).
204
+
205
+In order to use these extensions, some additional parameters are required.
206
+`nSets` specifies the number of subsets to break the data set into. `cut`,
207
+`minNS`, and `maxNS` control the process of matching patterns across subsets
208
+and in general should not be changed from defaults. More information about
209
+these parameters are available in the original papers (CITE). These parameters
210
+need to be set with a different function than `setParam` since they depend
211
+on each other. Here we only set `nSets` (always required), but we have the
212
+option to pass the other parameters as well.
213
+
214
+```{r}
215
+params <- setDistributedParams(params, nSets=3)
216
+```
208 217
 
209
-# splitting data into subsets for scCoGAPS
210
-sc_sim_name <- createscCoGAPSSets(t(as.matrix(GIST.D)), nSets=4, "sc_example")
218
+Setting `nSets` requires balancing available hardware and run time against the
219
+size of your data. In general, `nSets` should be less than or equal to the
220
+number of nodes/cores that are available. If that is true, then the more subsets
221
+you create, the faster CoGAPS will run - however, some robustness can be lost
222
+when the subsets get too small. The general rule of thumb is to set `nSets`
223
+so that each subset has between 1000 and 5000 genes or cells. We will see an
224
+example of this on real data in the next two sections.
225
+
226
+Once the distributed parameters have been set we can call CoGAPS either by
227
+setting the `distributed` parameter or by using the provided wrapper functions.
228
+The following calls are equivalent:
229
+
230
+```{r}
231
+# genome-wide CoGAPS
232
+CoGAPS(SimpSim.data, params, distributed="genome-wide")
233
+GWCoGAPS(SimpSim.data, params)
234
+
235
+# single-cell CoGAPS
236
+CoGAPS(SimpSim.data, params, distributed="single-cell", singleCell=TRUE)
237
+scCoGAPS(SimpSim.data, params)
211 238
 ```
212 239
 
213
-Notice that we pass the transpose of the *GIST* data to *scCoGAPS* since
214
-the normal dataset is 1363 x 9 and we want a data set with a large number of
215
-samples.
240
+Notice that we also set the parameter `singleCell=TRUE`. This makes some 
241
+adjustments to the algorithm to account for the sparsity in single-cell data.
242
+The `scCoGAPS` wrapper automatically sets this parameter for us.
216 243
 
217
-It is neccesary to name the simulation so that the computation portion knows
218
-what the subset files are called. For convenience the name provided to the
219
-subset function is returned back so that it can be saved.
244
+The parallel backend for this computation is managed by the package `BiocParallel`
245
+and there is an option for the user to specifiy which backend they want. See the
246
+[Additional Features](#setting-parallel-backend-for-gwcogapssccogaps)
247
+section for more information.
220 248
 
221
-Running the computation is just as easy as running normal *CoGAPS*. The results
222
-will be in the exact same format as *CoGAPS*, only the intermediate computation
223
-is different. Notice here that no *params* object is accepted - all parameters
224
-must be passed by name.
249
+In general it is preferred to pass a file name to `GWCoGAPS`/`scCoGAPS` since
250
+otherwise the entire data set must be copied across multiple processes which
251
+will slow things down and potentially cause an out-of-memory error. We will
252
+see examples of this in the next two sections.
225 253
 
226
-```{r eval=FALSE}
227
-GWCoGAPS(gw_sim_name, nPatterns=3, nIterations=1000)
228
-scCoGAPS(sc_sim_name, nPatterns=3, nIterations=1000)
229
-```
254
+# Workflow Example - Bulk Data
230 255
 
231
-# Supported Data Formats
256
+THIS SECTION UNDER CONSTRUCTION
232 257
 
233
-CoGAPS supports both R objects and file names as input. Any of these data types
234
-can be passed directly to *CoGAPS* as the *data* parameter.
258
+Here we walk through running CoGAPS on the *GIST* data set from @OCHS_2009,
259
+which contains gene expression data from gastrointestinal
260
+stromal tumor cell lines treated with Gleevec. The matrix contains bulk gene
261
+expression data with 1363 genes and 9 samples.
235 262
 
236
-R object
263
+# Workflow Example - Single Cell Data
237 264
 
238
-* matrix
239
-* data.frame
240
-* SummarizedExperiment
241
-* SingleCellExperiment
265
+THIS SECTION UNDER CONSTRUCTION
242 266
 
243
-File
267
+# Additional Features of CoGAPS
244 268
 
245
-* .mtx
246
-* .tsv
247
-* .csv
269
+## Checkpoint System - Saving/Loading CoGAPS Runs
248 270
 
249
-# Transposing Data
271
+## Manual Pipeline for GWCoGAPS/scCoGAPS
250 272
 
251
-If your data is stored as samples x genes, *CoGAPS* allows you to pass
252
-*transposeData* and will automatically and efficiently read the transpose
253
-of your data to get the required gense x samples configuration.
273
+THIS SECTION UNDER CONSTRUCTION
254 274
 
255
-# Obscure Options
275
+## Setting Parallel Backend for GWCoGAPS/scCoGAPS
256 276
 
257
-## Running CoGAPS on Single Cell Data
277
+## Transposing Data
258 278
 
259
-The size and distribution of single cell data creates additional considerations
260
-when running *CoGAPS*. The section
261
-[Running Distributed CoGAPS](#running-distributed-cogaps) gives an overview
262
-for running CoGAPS on large single cell datasets. With regards to the 
263
-distribution of single cell data, it is neccesary to set the parameter
264
-**singleCell**, i.e. `params <- setParam(params, "singleCell" , TRUE)`.
279
+If your data is stored as samples x genes, `CoGAPS` allows you to pass
280
+`transposeData=TRUE` and will automatically read the transpose of your data
281
+to get the required genes x samples configuration.
265 282
 
266 283
 ## Passing Uncertainty Matrix
267 284
 
268 285
 In addition to providing the data, the user can also specify an uncertainty
269 286
 measurement - the standard deviation of each entry in the data matrix. By
270
-default, *CoGAPS* assumes that the standard deviation matrix is 10% of the
271
-data.
287
+default, `CoGAPS` assumes that the standard deviation matrix is 10% of the
288
+data matrix. This is a reasonable heuristic to use, but for specific types
289
+of data you may be able to provide better information.
272 290
 
273 291
 ```{r eval=FALSE}
274 292
 # run CoGAPS with custom uncertainty
275
-result <- CoGAPS(GIST.D, params, uncertainty=as.matrix(GIST.S))
293
+result <- CoGAPS(GIST.matrix, params, uncertainty=GIST.uncertainty)
276 294
 ```
277 295
 
278
-# Using CoGAPS-based statistics to infer gene membership in annotated gene sets
279
-
280
-## CoGAPSStat
281
-
282
-The function *calcCoGAPSStat* is used to infer gene set activity in each
283
-pattern from the CoGAPS matrix factorization. *calcCoGAPSStat* calculates the
284
-gene set statistics for each column of the feature matrix using a Z-score, the
285
-input gene set and permutation tests. The function outputs a list containing:
286
-
287
-* *GSUpreg* lists p-values for upregulation of each gene set in each pattern
288
-* *GSDownreg* lists p-values for downregulation of each gene set in each pattern
289
-* *GSActEst* provides gene set activity through conversion of p-values to
290
-activity estimates of each gene set in each pattern
291
-
292
-## computeGeneGSProb
293
-
294
-Now using the *computeGeneGSProb* function, we can use the gene set statistic
295
-(returned from calcCoGAPSStat) to compute a statistic to quantify the
296
-likelihood of membership for each gene annotated to a set based on its
297
-inferred activity. The statistic used to infer membership compares the 
298
-expression pattern of a gene annotated as a member of a gene set to the common
299
-expression pattern of all annotated members of that gene set. The function
300
-outputs the p-value of a set membership for each gene specified in GStoGenes. 
301
-
302 296
 # Citing CoGAPS
303 297
 
304
-If you use the CoGAPS package for your analysis please cite: (1) EJ Fertig,
305
-J Ding, AV Favorov, G Parmigiani, and MF Ochs (2010) CoGAPS: an R/C++ package to
306
-identify patterns and biological process activity in transcriptomic data.
307
-Bioinformatics 26: 2792-2793.
308
-
309
-To cite the CoGAPS algorithm use: (3) MF Ochs (2003) Bayesian Decomposition in
310
-The Analysis of Gene Expression Data: Methods and Software G Parmigiani,
311
-E Garrett, R Irizarry, and S Zeger, ed. New York: Springer Verlag.
312
-
313
-To cite the gene set statistic use: (5) MF Ochs, L Rink, C Tarn, S Mburu,
314
-T Taguchi, B Eisenberg, and AK Godwin (2009) Detection of treatment-induced
315
-changes in signaling pathways in gastrointestinal stromal tumors using
316
-transcriptomic data. Cancer Research 69: 9125-9132.
298
+If you use the CoGAPS package for your analysis, please cite @FERTIG_2010
317 299
 
318
-To site the set-membership refinement statistic use: (2) EJ Fertig, AV Favorov,
319
-and MF Ochs (2012) Identifying context-specific transcription factor targets
320
-from prior knowledge and gene expression data. 2012 IEEE International
321
-Conference on Bioinformatics and Biomedicine, B310, in press.
300
+If you use the gene set statistic, please cite @OCHS_2009
322 301
 
323 302
 # References
... ...
@@ -1,570 +1,46 @@
1
-%% This BibTeX bibliography file was created using BibDesk.
2
-%% http://bibdesk.sourceforge.net/
3
-
4
-
5
-%% Created for Elana Fertig at 2012-08-02 15:30:36 -0400 
6
-
7
-
8
-%% Saved with string encoding Unicode (UTF-8) 
9
-
10
-
11
-
12
-@conference{Fertig2012,
13
-	Address = {Philadelphia, PA, USA},
14
-	Author = {EJ Fertig and AV Favorov and MF Ochs},
15
-	Booktitle = {IEEE International Conference on Bioinformatics and Biomedicine},
16
-	Date-Added = {2012-08-02 15:29:22 -0400},
17
-	Date-Modified = {2012-08-02 15:30:29 -0400},
18
-	Number = {B310},
19
-	Title = {Identifying context-specific transcription factor targets from prior knowledge and gene expression data.},
20
-	Year = {2012}}
21
-
22
-@incollection{Ochs2003,
23
-	Address = {New York},
24
-	Author = {MF Ochs},
25
-	Booktitle = {The Analysis of Gene Expression Data: Methods and Software},
26
-	Date-Added = {2012-08-02 15:27:27 -0400},
27
-	Date-Modified = {2012-08-02 15:29:07 -0400},
28
-	Editor = {G Parmigiani and E Garrett and R Irizarry and S Zeger},
29
-	Publisher = {Springer-Verlag},
30
-	Title = {Bayesian Decomposition},
31
-	Year = {2003}}
32
-
33
-@article{Fertig2010,
34
-	Abstract = {SUMMARY: Coordinated Gene Activity in Pattern Sets (CoGAPS) provides an integrated package for isolating gene expression driven by a biological process, enhancing inference of biological processes from transcriptomic data. CoGAPS improves on other enrichment measurement methods by combining a Markov chain Monte Carlo (MCMC) matrix factorization algorithm (GAPS) with a threshold-independent statistic inferring activity on gene sets. The software is provided as open source C++ code built on top of JAGS software with an R interface. AVAILABILITY: The R package CoGAPS and the C++ package GAPS-JAGS are provided open source under the GNU Lesser Public License (GLPL) with a users manual containing installation and operating instructions. CoGAPS is available through Bioconductor and depends on the rjags package available through CRAN to interface CoGAPS with GAPS-JAGS. URL: http://www.cancerbiostats.onc.jhmi.edu/cogaps.cfm .},
35
-	Author = {Fertig, EJ and Ding, J and Favorov, AV and Parmigiani, G and Ochs, MF},
36
-	Date-Added = {2012-08-01 12:56:23 -0400},
37
-	Date-Modified = {2012-08-01 12:57:24 -0400},
38
-	Doi = {10.1093/bioinformatics/btq503},
1
+@article{FERTIG_2010,
2
+	Author = {Fertig, Elana J. and Ding, Jie and Favorov, Alexander V. and Parmigiani, Giovanni and Ochs, Michael F.},
3
+	Year = {2010},
4
+	Title = {CoGAPS: an R/C++ package to identify patterns and biological process activity in transcriptomic data},
39 5
 	Journal = {Bioinformatics},
40
-	Journal-Full = {Bioinformatics (Oxford, England)},
41
-	Mesh = {Computational Biology; Gene Expression; Gene Expression Profiling; Genomics; Markov Chains; Oligonucleotide Array Sequence Analysis; Software},
42
-	Month = {Nov},
43
-	Number = {21},
44
-	Pages = {2792-3},
45
-	Pmc = {PMC3025742},
46
-	Pmid = {20810601},
47
-	Pst = {ppublish},
48
-	Title = {{CoGAPS: an R/C++ package to identify patterns and biological process activity in transcriptomic data}},
49 6
 	Volume = {26},
50
-	Year = {2010},
51
-	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btq503}}
7
+	Number = {21},
8
+	Pages = {2792-2793},
9
+	Doi = {10.1093/bioinformatics/btq503}}
52 10
 
53
-@article{Tavazoie1999,
54
-	Abstract = {Technologies to measure whole-genome mRNA abundances and methods to organize and display such data are emerging as valuable tools for systems-level exploration of transcriptional regulatory networks. For instance, it has been shown that mRNA data from 118 genes, measured at several time points in the developing hindbrain of mice, can be hierarchically clustered into various patterns (or 'waves') whose members tend to participate in common processes. We have previously shown that hierarchical clustering can group together genes whose cis-regulatory elements are bound by the same proteins in vivo. Hierarchical clustering has also been used to organize genes into hierarchical dendograms on the basis of their expression across multiple growth conditions. The application of Fourier analysis to synchronized yeast mRNA expression data has identified cell-cycle periodic genes, many of which have expected cis-regulatory elements. Here we apply a systematic set of statistical algorithms, based on whole-genome mRNA data, partitional clustering and motif discovery, to identify transcriptional regulatory sub-networks in yeast-without any a priori knowledge of their structure or any assumptions about their dynamics. This approach uncovered new regulons (sets of co-regulated genes) and their putative cis-regulatory elements. We used statistical characterization of known regulons and motifs to derive criteria by which we infer the biological significance of newly discovered regulons and motifs. Our approach holds promise for the rapid elucidation of genetic network architecture in sequenced organisms in which little biology is known.},
55
-	Address = {Department of Genetics, Harvard Medical School, Boston, Massachusetts 02115, USA.},
56
-	Au = {Tavazoie, S and Hughes, JD and Campbell, MJ and Cho, RJ and Church, GM},
57
-	Author = {Tavazoie, S. and Hughes, J.D. and Campbell, M.J. and Cho, R.J. and Church, G.M.},
58
-	Cin = {Nat Genet. 1999 Jul;22(3):213-5. PMID: 10391202},
59
-	Crdt = {1999/07/03 10:00},
60
-	Da = {19990719},
61
-	Date-Added = {2010-02-04 11:43:20 -0500},
62
-	Date-Modified = {2010-02-04 11:48:43 -0500},
63
-	Dcom = {19990719},
64
-	Doi = {10.1038/10343},
65
-	Edat = {1999/07/03 10:00},
66
-	Issn = {1061-4036 (Print); 1061-4036 (Linking)},
67
-	Jid = {9216904},
68
-	Journal = {Nat Genet},
69
-	Jt = {Nature genetics},
70
-	Language = {eng},
71
-	Lr = {20061115},
72
-	Mh = {Animals; Cell Cycle/genetics; DNA/genetics; Gene Expression; *Genetic Techniques; Mice; Multigene Family; Open Reading Frames; RNA, Messenger/genetics/metabolism; Rhombencephalon/growth \& development/metabolism; Saccharomyces cerevisiae/cytology/genetics/metabolism},
73
-	Mhda = {2001/03/23 10:01},
74
-	Number = {3},
75
-	Own = {NLM},
76
-	Pages = {281--285},
77
-	Pl = {UNITED STATES},
78
-	Pmid = {10391217},
79
-	Pst = {ppublish},
80
-	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.},
81
-	Rn = {0 (RNA, Messenger); 9007-49-2 (DNA)},
82
-	Sb = {IM},
83
-	So = {Nat Genet. 1999 Jul;22(3):281-5.},
84
-	Stat = {MEDLINE},
85
-	Title = {Systematic determination of genetic network architecture.},
86
-	Volume = {22},
11
+@article{SEUNG_1999,
12
+	Author = {Seung, Sebastian and Lee, Daniel D.},
87 13
 	Year = {1999},
88
-	Bdsk-Url-1 = {http://dx.doi.org/10.1038/10343}}
89
-
90
-@article{Goeman2007,
91
-	Abstract = {MOTIVATION: Many statistical tests have been proposed in recent years for analyzing gene expression data in terms of gene sets, usually from Gene Ontology. These methods are based on widely different methodological assumptions. Some approaches test differential expression of each gene set against differential expression of the rest of the genes, whereas others test each gene set on its own. Also, some methods are based on a model in which the genes are the sampling units, whereas others treat the subjects as the sampling units. This article aims to clarify the assumptions behind different approaches and to indicate a preferential methodology of gene set testing. RESULTS: We identify some crucial assumptions which are needed by the majority of methods. P-values derived from methods that use a model which takes the genes as the sampling unit are easily misinterpreted, as they are based on a statistical model that does not resemble the biological experiment actually performed. Furthermore, because these models are based on a crucial and unrealistic independence assumption between genes, the P-values derived from such methods can be wildly anti-conservative, as a simulation experiment shows. We also argue that methods that competitively test each gene set against the rest of the genes create an unnecessary rift between single gene testing and gene set testing.},
92
-	Address = {Department of Medical Statistics and Bioinformatics, Leiden University Medical Center, Leiden, The Netherlands. j.j.goeman@lumc.nl},
93
-	Au = {Goeman, JJ and Buhlmann, P},
94
-	Author = {Goeman, J.J. and Buhlmann, P.},
95
-	Crdt = {2007/02/17 09:00},
96
-	Da = {20070501},
97
-	Date-Added = {2010-02-04 11:42:46 -0500},
98
-	Date-Modified = {2010-02-04 11:48:51 -0500},
99
-	Dcom = {20070522},
100
-	Dep = {20070215},
101
-	Doi = {10.1093/bioinformatics/btm051},
102
-	Edat = {2007/02/17 09:00},
103
-	Issn = {1367-4811 (Electronic); 1367-4803 (Linking)},
104
-	Jid = {9808944},
105
-	Journal = {Bioinformatics},
106
-	Jt = {Bioinformatics (Oxford, England)},
107
-	Keywords = {gene sets},
108
-	Language = {eng},
109
-	Lr = {20091104},
110
-	Mh = {*Algorithms; *Artifacts; *Data Interpretation, Statistical; *Databases, Genetic; Gene Expression Profiling/*methods; Information Storage and Retrieval/*methods; Reproducibility of Results; Sensitivity and Specificity},
111
-	Mhda = {2007/05/23 09:00},
112
-	Number = {8},
113
-	Own = {NLM},
114
-	Pages = {980--987},
115
-	Phst = {2007/02/15 {$[$}aheadofprint{$]$}},
116
-	Pii = {btm051},
117
-	Pl = {England},
118
-	Pmid = {17303618},
119
-	Pst = {ppublish},
120
-	Pt = {Comparative Study; Evaluation Studies; Journal Article},
121
-	Sb = {IM},
122
-	So = {Bioinformatics. 2007 Apr 15;23(8):980-7. Epub 2007 Feb 15.},
123
-	Stat = {MEDLINE},
124
-	Title = {Analyzing gene expression data in terms of gene sets: methodological issues.},
125
-	Volume = {23},
126
-	Year = {2007},
127
-	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btm051}}
128
-
129
-@article{Bidaut2004,
130
-	Abstract = {ClutrFree facilitates the visualization and interpretation of clusters or patterns computed from microarray data through a graphical user interface that displays patterns, membership information of the genes and annotation statistics simultaneously. ClutrFree creates a tree linking the patterns based on similarity, permitting the navigation among patterns identified by different algorithms or by the same algorithm with different parameters, and aids the inferring of conclusions from a microarray experiment. AVAILABILITY: The ClutrFree Java source code and compiled bytecode are available as a package under the GNU General Public License at http://bioinformatics.fccc.edu},
131
-	Address = {Division of Population Science, Fox Chase Cancer Center, 333 Cottman Avenue, Philadelphia, PA 19111, USA.},
132
-	Au = {Bidaut, G and Ochs, MF},
133
-	Author = {Bidaut, G. and Ochs, M.F.},
134
-	Crdt = {2004/05/18 05:00},
135
-	Da = {20041101},
136
-	Date-Added = {2010-02-02 12:36:53 -0500},
137
-	Date-Modified = {2010-02-02 16:00:15 -0500},
138
-	Dcom = {20050210},
139
-	Dep = {20040514},
140
-	Doi = {10.1093/bioinformatics/bth307},
141
-	Edat = {2004/05/18 05:00},
142
-	Gr = {CA06927/CA/NCI NIH HHS/United States},
143
-	Issn = {1367-4803 (Print); 1367-4803 (Linking)},
144
-	Jid = {9808944},
145
-	Journal = {Bioinformatics},
146
-	Jt = {Bioinformatics (Oxford, England)},
147
-	Language = {eng},
148
-	Lr = {20071114},
149
-	Mh = {*Algorithms; *Cluster Analysis; Computer Graphics; Oligonucleotide Array Sequence Analysis/*methods; Sequence Alignment/*methods; Sequence Analysis, DNA/*methods; *Software; *User-Computer Interface},
150
-	Mhda = {2005/02/11 09:00},
151
-	Number = {16},
152
-	Own = {NLM},
153
-	Pages = {2869--2871},
154
-	Phst = {2004/05/14 {$[$}aheadofprint{$]$}},
155
-	Pii = {bth307},
156
-	Pl = {England},
157
-	Pmid = {15145813},
158
-	Pst = {ppublish},
159
-	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, P.H.S.},
160
-	Sb = {IM},
161
-	So = {Bioinformatics. 2004 Nov 1;20(16):2869-71. Epub 2004 May 14.},
162
-	Stat = {MEDLINE},
163
-	Title = {ClutrFree: cluster tree visualization and interpretation.},
164
-	Volume = {20},
165
-	Year = {2004},
166
-	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/bth307}}
167
-
168
-@article{Carvalho2008,
169
-	Author = {Carvalho, C.M. and Chang, J. and Lucas, J. and Nevins, J.R. and Wang, Q. and West, M.},
170
-	Date-Added = {2010-02-02 12:36:36 -0500},
171
-	Date-Modified = {2010-02-02 12:36:36 -0500},
172
-	Journal = {J. Am. Stat. Assoc.},
173
-	Pages = {1438 - 1456},
174
-	Title = {High-dimensional sparse factor modelling: Applications in gene expression genomics},
175
-	Volume = {103},
176
-	Year = {2008}}
177
-
178
-@article{Lee1999,
179
-	Abstract = {Is perception of the whole based on perception of its parts? There is psychological and physiological evidence for parts-based representations in the brain, and certain computational theories of object recognition rely on such representations. But little is known about how brains or computers might learn the parts of objects. Here we demonstrate an algorithm for non-negative matrix factorization that is able to learn parts of faces and semantic features of text. This is in contrast to other methods, such as principal components analysis and vector quantization, that learn holistic, not parts-based, representations. Non-negative matrix factorization is distinguished from the other methods by its use of non-negativity constraints. These constraints lead to a parts-based representation because they allow only additive, not subtractive, combinations. When non-negative matrix factorization is implemented as a neural network, parts-based representations emerge by virtue of two properties: the firing rates of neurons are never negative and synaptic strengths do not change sign.},
180
-	Address = {Bell Laboratories, Lucent Technologies, Murray Hill, New Jersey 07974, USA.},
181
-	Author = {Lee, D.D. and Seung, H.S.},
182
-	Cin = {Nature. 1999 Oct 21;401(6755):759. PMID: 10548097; Nature. 1999 Oct 21;401(6755):759-60. PMID: 10548098},
183
-	Crdt = {1999/11/05 08:00},
184
-	Da = {19991116},
185
-	Date = {1999 Oct 21},
186
-	Date-Added = {2010-02-02 12:35:49 -0500},
187
-	Date-Modified = {2010-02-02 15:57:17 -0500},
188
-	Dcom = {19991116},
189
-	Doi = {10.1038/44565},
190
-	Edat = {1999/11/05 08:00},
191
-	Issn = {0028-0836 (Print)},
192
-	Jid = {0410462},
14
+	Title = {Learning the Parts of Objects by Non-Negative Matrix Factorization},
193 15
 	Journal = {Nature},
194
-	Jt = {Nature},
195
-	Language = {eng},
196
-	Lr = {20061115},
197
-	Mh = {*Algorithms; Face; Humans; *Learning; Models, Neurological; Perception/physiology; Semantics},
198
-	Mhda = {2001/03/23 10:01},
199
-	Number = {6755},
200
-	Own = {NLM},
201
-	Pages = {788--791},
202
-	Pl = {ENGLAND},
203
-	Pmid = {10548103},
204
-	Pst = {ppublish},
205
-	Pt = {Journal Article; Research Support, Non-U.S. Gov't},
206
-	Sb = {IM},
207
-	Status = {MEDLINE},
208
-	Title = {Learning the parts of objects by non-negative matrix factorization.},
209 16
 	Volume = {401},
210
-	Year = {1999},
211
-	Bdsk-Url-1 = {http://dx.doi.org/10.1038/44565}}
212
-
213
-@article{Kossenkov2009,
214
-	Abstract = {We explore a number of matrix factorization methods in terms of their ability to identify signatures of biological processes in a large gene expression study. We focus on the ability of these methods to find signatures in terms of gene ontology enhancement and on the interpretation of these signatures in the samples. Two Bayesian approaches, Bayesian Decomposition (BD) and Bayesian Factor Regression Modeling (BFRM), perform best. Differences in the strength of the signatures between the samples suggest that BD will be most useful for systems modeling and BFRM for biomarker discovery.},
215
-	Address = {The Wistar Institute, Philadelphia, Pennsylvania, USA.},
216
-	Au = {Kossenkov, AV and Ochs, MF},
217
-	Author = {Kossenkov, A.V. and Ochs, M.F.},
218
-	Crdt = {2009/11/10 06:00},
219
-	Da = {20091109},
220
-	Date-Added = {2010-02-02 12:33:48 -0500},
221
-	Date-Modified = {2010-02-02 15:55:21 -0500},
222
-	Dcom = {20100111},
223
-	Doi = {10.1016/S0076-6879(09)67003-8},
224
-	Edat = {2009/11/10 06:00},
225
-	Issn = {1557-7988 (Electronic); 1557-7988 (Linking)},
226
-	Jid = {0212271},
227
-	Journal = {Methods Enzymol},
228
-	Jt = {Methods in enzymology},
229
-	Keywords = {Markov Chain Monte Carlo, matrix factorization},
230
-	Language = {eng},
231
-	Mh = {Algorithms; Bayes Theorem; Biological Processes/physiology; *Cluster Analysis; *Data Interpretation, Statistical; Gene Expression Profiling/*methods; Gene Regulatory Networks; Microarray Analysis/*methods; Pattern Recognition, Automated/methods; Saccharomyces cerevisiae/genetics/physiology},
232
-	Mhda = {2010/01/12 06:00},
233
-	Own = {NLM},
234
-	Pages = {59--77},
235
-	Pii = {S0076-6879(09)67003-8},
236
-	Pl = {United States},
237
-	Pmid = {19897089},
238
-	Pst = {ppublish},
239
-	Pt = {Journal Article},
240
-	Sb = {IM},
241
-	So = {Methods Enzymol. 2009;467:59-77.},
242
-	Stat = {MEDLINE},
243
-	Title = {Matrix factorization for recovery of biological processes from microarray data.},
244
-	Volume = {467},
245
-	Year = {2009},
246
-	Bdsk-Url-1 = {http://dx.doi.org/10.1016/S0076-6879(09)67003-8}}
247
-
248
-@article{Subramanian2005,
249
-	Abstract = {Although genomewide RNA expression analysis has become a routine tool in biomedical research, extracting biological insight from such information remains a major challenge. Here, we describe a powerful analytical method called Gene Set Enrichment Analysis (GSEA) for interpreting gene expression data. The method derives its power by focusing on gene sets, that is, groups of genes that share common biological function, chromosomal location, or regulation. We demonstrate how GSEA yields insights into several cancer-related data sets, including leukemia and lung cancer. Notably, where single-gene analysis finds little similarity between two independent studies of patient survival in lung cancer, GSEA reveals many biological pathways in common. The GSEA method is embodied in a freely available software package, together with an initial database of 1,325 biologically defined gene sets.},
250
-	Address = {Broad Institute of Massachusetts Institute of Technology and Harvard, 320 Charles Street, Cambridge, MA 02141, USA.},
251
-	Au = {Subramanian, A and Tamayo, P and Mootha, VK and Mukherjee, S and Ebert, BL and Gillette, MA and Paulovich, A and Pomeroy, SL and Golub, TR and Lander, ES and Mesirov, JP},
252
-	Author = {Subramanian, A. and Tamayo, P. and Mootha, V.K. and Mukherjee, S. and Ebert, B.L. and Gillette, M.A. and Paulovich, A. and Pomeroy, S.L. and Golub, T.R. and Lander, E.S. and Mesirov, J.P.},
253
-	Cin = {Proc Natl Acad Sci U S A. 2005 Oct 25;102(43):15278-9. PMID: 16230612},
254
-	Crdt = {2005/10/04 09:00},
255
-	Da = {20051026},
256
-	Date-Added = {2010-02-01 16:29:02 -0500},
257
-	Date-Modified = {2010-02-02 15:59:33 -0500},
258
-	Dcom = {20051212},
259
-	Dep = {20050930},
260
-	Doi = {10.1073/pnas.0506580102},
261
-	Edat = {2005/10/04 09:00},
262
-	Issn = {0027-8424 (Print); 0027-8424 (Linking)},
263
-	Jid = {7505876},
264
-	Journal = {Proc Natl Acad Sci},
265
-	Jt = {Proceedings of the National Academy of Sciences of the United States of America},
266
-	Keywords = {gene set},
267
-	Language = {eng},
268
-	Lr = {20091118},
269
-	Mh = {Cell Line, Tumor; Female; Gene Expression Profiling/*methods; Genes, p53/physiology; Genome; Humans; Leukemia, Myeloid, Acute/genetics; Lung Neoplasms/genetics/mortality; Male; *Oligonucleotide Array Sequence Analysis; Precursor Cell Lymphoblastic Leukemia-Lymphoma/genetics},
270
-	Mhda = {2005/12/15 09:00},
271
-	Number = {43},
272
-	Oid = {NLM: PMC1239896},
273
-	Own = {NLM},
274
-	Pages = {15545--15550},
275
-	Phst = {2005/09/30 {$[$}aheadofprint{$]$}},
276
-	Pii = {0506580102},
277
-	Pl = {United States},
278
-	Pmc = {PMC1239896},
279
-	Pmid = {16199517},
280
-	Pst = {ppublish},
281
-	Pt = {Journal Article},
282
-	Sb = {IM},
283
-	So = {Proc Natl Acad Sci U S A. 2005 Oct 25;102(43):15545-50. Epub 2005 Sep 30.},
284
-	Stat = {MEDLINE},
285
-	Title = {Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles.},
286
-	Volume = {102},
287
-	Year = {2005},
288
-	Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.0506580102}}
289
-
290
-@article{Draghici2003,
291
-	Abstract = {Onto-Tools is a set of four seamlessly integrated databases: Onto-Express, Onto-Compare, Onto-Design and Onto-Translate. Onto-Express is able to automatically translate lists of genes found to be differentially regulated in a given condition into functional profiles characterizing the impact of the condition studied upon various biological processes and pathways. OE constructs functional profiles (using Gene Ontology terms) for the following categories: biochemical function, biological process, cellular role, cellular component, molecular function and chromosome location. Statistical significance values are calculated for each category. Once the initial exploratory analysis identified a number of relevant biological processes, specific mechanisms of interactions can be hypothesized for the conditions studied. Currently, many commercial arrays are available for the investigation of specific mechanisms. Each such array is characterized by a biological bias determined by the extent to which the genes present on the array represent specific pathways. Onto-Compare is a tool that allows efficient comparisons of any sets of commercial or custom arrays. Using Onto-Compare, a researcher can determine quickly which array, or set of arrays, covers best the hypotheses studied. In many situations, no commercial arrays are available for specific biological mechanisms. Onto-Design is a tool that allows the user to select genes that represent given functional categories. Onto-Translate allows the user to translate easily lists of accession numbers, UniGene clusters and Affymetrix probes into one another. All tools above are seamlessly integrated. The Onto-Tools are available online at http://vortex.cs.wayne.edu/Projects.html.},
292
-	Address = {Department of Computer Science, Wayne State University, 431 State Hall, Detroit, MI 48202, USA. sod@cs.wayne.edu},
293
-	Au = {Draghici, S and Khatri, P and Bhavsar, P and Shah, A and Krawetz, SA and Tainsky, MA},
294
-	Author = {Draghici, S. and Khatri, P. and Bhavsar, P. and Shah, A. and Krawetz, S.A. and Tainsky, M.A.},
295
-	Crdt = {2003/06/26 05:00},
296
-	Da = {20030625},
297
-	Date-Added = {2010-02-01 16:28:22 -0500},
298
-	Date-Modified = {2010-02-02 15:56:55 -0500},
299
-	Dcom = {20030818},
300
-	Edat = {2003/06/26 05:00},
301
-	Gr = {R01-NS045207-01/NS/NINDS NIH HHS/United States; R21-EB000990-01/EB/NIBIB NIH HHS/United States},
302
-	Issn = {1362-4962 (Electronic); 1362-4962 (Linking)},
303
-	Jid = {0411011},
304
-	Journal = {Nucleic Acids Res},
305
-	Jt = {Nucleic acids research},
306
-	Keywords = {gene set},
307
-	Language = {eng},
308
-	Lr = {20091118},
309
-	Mh = {Databases, Nucleic Acid; Gene Expression Profiling/*methods; Internet; Oligonucleotide Array Sequence Analysis/*methods; Proteins/genetics/physiology; *Software; Systems Integration},
310
-	Mhda = {2003/08/19 05:00},
311
-	Number = {13},
312
-	Oid = {NLM: PMC169030},
313
-	Own = {NLM},
314
-	Pages = {3775--3781},
315
-	Pl = {England},
316
-	Pmc = {PMC169030},
317
-	Pmid = {12824416},
318
-	Pst = {ppublish},
319
-	Pt = {Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.; Research Support, U.S. Gov't, P.H.S.},
320
-	Rn = {0 (Proteins)},
321
-	Sb = {IM},
322
-	So = {Nucleic Acids Res. 2003 Jul 1;31(13):3775-81.},
323
-	Stat = {MEDLINE},
324
-	Title = {{Onto-Tools, the toolkit of the modern biologist: Onto-Express, Onto-Compare, Onto-Design and Onto-Translate.}},
325
-	Volume = {31},
326
-	Year = {2003}}
327
-
328
-@inbook{Ochs2006,
329
-	Address = {London},
330
-	Author = {M.F. Ochs},
331
-	Date-Added = {2010-01-08 14:45:36 -0500},
332
-	Date-Modified = {2010-01-08 14:48:01 -0500},
333
-	Editor = {G. Parmigiani and E. S. Garrett and R. A. Irizarry and S. L. Zeger},
334
-	Pages = {388-408},
335
-	Publisher = {Springer-Verlag},
336
-	Series = {Statistics for Biology and Health},
337
-	Title = {The Analysis of Gene Expression Data The Analysis of Gene Expression Data: Methods and Software},
338
-	Year = {2006}}
339
-
340
-@article{Bidaut2006,
341
-	Abstract = {BACKGROUND: As numerous diseases involve errors in signal transduction, modern therapeutics often target proteins involved in cellular signaling. Interpretation of the activity of signaling pathways during disease development or therapeutic intervention would assist in drug development, design of therapy, and target identification. Microarrays provide a global measure of cellular response, however linking these responses to signaling pathways requires an analytic approach tuned to the underlying biology. An ongoing issue in pattern recognition in microarrays has been how to determine the number of patterns (or clusters) to use for data interpretation, and this is a critical issue as measures of statistical significance in gene ontology or pathways rely on proper separation of genes into groups. RESULTS: Here we introduce a method relying on gene annotation coupled to decompositional analysis of global gene expression data that allows us to estimate specific activity on strongly coupled signaling pathways and, in some cases, activity of specific signaling proteins. We demonstrate the technique using the Rosetta yeast deletion mutant data set, decompositional analysis by Bayesian Decomposition, and annotation analysis using ClutrFree. We determined from measurements of gene persistence in patterns across multiple potential dimensionalities that 15 basis vectors provides the correct dimensionality for interpreting the data. Using gene ontology and data on gene regulation in the Saccharomyces Genome Database, we identified the transcriptional signatures of several cellular processes in yeast, including cell wall creation, ribosomal disruption, chemical blocking of protein synthesis, and, critically, individual signatures of the strongly coupled mating and filamentation pathways. CONCLUSION: This works demonstrates that microarray data can provide downstream indicators of pathway activity either through use of gene ontology or transcription factor databases. This can be used to investigate the specificity and success of targeted therapeutics as well as to elucidate signaling activity in normal and disease processes.},
342
-	Address = {Fox Chase Cancer Center, 333 Cottman Avenue, Philadelphia, PA 19111, USA. ghbidaut@pcbi.upenn.edu},
343
-	Au = {Bidaut, G and Suhre, K and Claverie, JM and Ochs, MF},
344
-	Author = {Bidaut, G. and Suhre, K. and Claverie, J.-M. and Ochs, M.F.},
345
-	Da = {20060327},
346
-	Date-Added = {2010-01-08 14:39:10 -0500},
347
-	Date-Modified = {2010-01-08 14:39:10 -0500},
348
-	Dcom = {20060426},
349
-	Dep = {20060228},
350
-	Doi = {10.1186/1471-2105-7-99},
351
-	Edat = {2006/03/02 09:00},
352
-	Gr = {CA06927/CA/United States NCI; LM008309/LM/United States NLM},
353
-	Issn = {1471-2105 (Electronic)},
354
-	Jid = {100965194},
355
-	Journal = {BMC Bioinformatics},
356
-	Jt = {BMC bioinformatics},
357
-	Keywords = {Bayesian, Markov Chain Monte Carlo, signaling network, microarray, k25},
358
-	Language = {eng},
359
-	Lr = {20071114},
360
-	Mh = {Algorithms; Computer Simulation; Gene Expression Profiling/*methods; *Models, Biological; Oligonucleotide Array Sequence Analysis/*methods; Pattern Recognition, Automated/*methods; Saccharomyces cerevisiae Proteins/genetics/*metabolism; Signal Transduction/*physiology; Transcription Factors/genetics/*metabolism},
361
-	Mhda = {2006/04/28 09:00},
362
-	Own = {NLM},
363
-	Pages = {99},
364
-	Phst = {2005/09/22 {$[$}received{$]$}; 2006/02/28 {$[$}accepted{$]$}; 2006/02/28 {$[$}aheadofprint{$]$}},
365
-	Pii = {1471-2105-7-99},
366
-	Pl = {England},
367
-	Pmc = {PMC1413561},
368
-	Pmid = {16507110},
369
-	Pst = {epublish},
370
-	Pt = {Journal Article; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't},
371
-	Pubm = {Electronic},
372
-	Rn = {0 (Saccharomyces cerevisiae Proteins); 0 (Transcription Factors)},
373
-	Sb = {IM},
374
-	So = {BMC Bioinformatics. 2006 Feb 28;7:99.},
375
-	Stat = {MEDLINE},
376
-	Title = {Determination of strongly overlapping signaling activity from microarray data.},
377
-	Volume = {7},
378
-	Year = {2006},
379
-	Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-7-99}}
380
-
381
-@article{Kossenkov2007a,
382
-	Abstract = {Many biological processes rely on remodeling of the transcriptional response of cells through activation of transcription factors. Although determination of the activity level of transcription factors from microarray data can provide insight into developmental and disease processes, it requires careful analysis because of the multiple regulation of genes. We present a novel approach that handles both the assignment of genes to multiple patterns, as required by multiple regulation, and the linking of genes in prior probability distributions according to their known transcriptional regulators. We demonstrate the power of this approach in simulations and by application to yeast cell cycle and deletion mutant data. The results of simulations in the presence of increasing noise showed improved recovery of patterns in terms of chi2 fit. Analysis of the yeast data led to improved inference of biologically meaningful groups in comparison to other techniques, as demonstrated with ROC analysis. The new algorithm provides an approach for estimating the levels of transcription factor activity from microarray data, and therefore provides insights into biological response.},
383
-	Address = {The Wistar Institute, Philadelphia, PA, USA.},
384
-	Au = {Kossenkov, AV and Peterson, AJ and Ochs, MF},
385
-	Author = {A.V. Kossenkov and A.J. Peterson and M.F. Ochs},
386
-	Da = {20071003},
387
-	Date-Added = {2010-01-08 14:39:10 -0500},
388
-	Date-Modified = {2010-01-08 14:39:10 -0500},
389
-	Dcom = {20071102},
390
-	Edat = {2007/10/04 09:00},
391
-	Gr = {CA06973/CA/United States NCI; LM008309/LM/United States NLM},
392
-	Issn = {0926-9630 (Print)},
393
-	Jid = {9214582},
394
-	Journal = {Stud Health Technol Inform},
395
-	Jt = {Studies in health technology and informatics},
396
-	Keywords = {Bayesian, Markov Chain Monte Carlo, microarray},
397
-	Language = {eng},
398
-	Lr = {20080710},
399
-	Mh = {*Algorithms; Bayes Theorem; Computational Biology; *Gene Expression Regulation; Markov Chains; Models, Genetic; Monte Carlo Method; *Oligonucleotide Array Sequence Analysis; ROC Curve; Transcription Factors/*metabolism; Transcription, Genetic; Yeasts/genetics},
400
-	Mhda = {2007/11/06 09:00},
401
-	Number = {Pt 2},
402
-	Own = {NLM},
403
-	Pages = {1250--1254},
404
-	Pl = {Netherlands},
405
-	Pmid = {17911915},
406
-	Pst = {ppublish},
407
-	Pt = {Journal Article; Research Support, N.I.H., Extramural},
408
-	Pubm = {Print},
409
-	Rn = {0 (Transcription Factors)},
410
-	Sb = {T},
411
-	So = {Stud Health Technol Inform. 2007;129(Pt 2):1250-4.},
412
-	Stat = {MEDLINE},
413
-	Title = {Determining transcription factor activity from microarray data using {Bayesian Markov chain Monte Carlo} sampling.},
414
-	Volume = {129},
415
-	Year = {2007}}
416
-
417
-@article{Moloshok2002,
418
-	Abstract = {MOTIVATION: Microarray and gene chip technology provide high throughput tools for measuring gene expression levels in a variety of circumstances, including cellular response to drug treatment, cellular growth and development, tumorigenesis, among many other processes. In order to interpret the large data sets generated in experiments, data analysis techniques that consider biological knowledge during analysis will be extremely useful. We present here results showing the application of such a tool to expression data from yeast cell cycle experiments. RESULTS: Originally developed for spectroscopic analysis, Bayesian Decomposition (BD) includes two features which make it useful for microarray data analysis: the ability to assign genes to multiple coexpression groups and the ability to encode biological knowledge into the system. Here we demonstrate the ability of the algorithm to provide insight into the yeast cell cycle, including identification of five temporal patterns tied to cell cycle phases as well as the identification of a pattern tied to an approximately 40 min cell cycle oscillator. The genes are simultaneously assigned to the patterns, including partial assignment to multiple patterns when this is required to explain the expression profile. AVAILABILITY: The application is available free to academic users under a material transfer agreement. Go to http://bioinformatics.fccc.edu/ for more details.},
419
-	Address = {Bioinformatics Working Group, Fox Chase Cancer Center, Philadelphia, PA 19111, USA. td_moloshok@fccc.edu},
420
-	Au = {Moloshok, TD and Klevecz, RR and Grant, JD and Manion, FJ and Speier WF, 4th and Ochs, MF},
421
-	Author = {Moloshok, T.D. and Klevecz, R.R. and Grant, J.D. and Manion, F.J. and Speier IV, W.F. and Ochs, M.F.},
422
-	Da = {20020517},
423
-	Date-Added = {2010-01-08 14:39:10 -0500},
424
-	Date-Modified = {2010-01-08 14:39:10 -0500},
425
-	Dcom = {20021108},
426
-	Edat = {2002/05/23 10:00},
427
-	Gr = {CA06927/CA/United States NCI},
428
-	Issn = {1367-4803 (Print)},
429
-	Jid = {9808944},
430
-	Journal = {Bioinformatics},
431
-	Jt = {Bioinformatics (Oxford, England)},
432
-	Keywords = {Bayesian, signaling network, microarray},
433
-	Language = {eng},
434
-	Lr = {20071114},
435
-	Mh = {*Algorithms; *Bayes Theorem; Cell Cycle/genetics; Databases, Genetic; Gene Expression Regulation; Genome, Fungal; Markov Chains; *Models, Genetic; *Models, Statistical; Monte Carlo Method; Oligonucleotide Array Sequence Analysis/*methods; Pattern Recognition, Automated; Periodicity; Reproducibility of Results; Saccharomyces cerevisiae/genetics; Sensitivity and Specificity},
436
-	Mhda = {2002/11/26 04:00},
437
-	Number = {4},
438
-	Own = {NLM},
439
-	Pages = {566--575},
440
-	Pl = {England},
441
-	Pmid = {12016054},
442
-	Pst = {ppublish},
443
-	Pt = {Comparative Study; Journal Article; Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, P.H.S.},
444
-	Pubm = {Print},
445
-	Sb = {IM},
446
-	So = {Bioinformatics. 2002 Apr;18(4):566-75.},
447
-	Stat = {MEDLINE},
448
-	Title = {Application of {B}ayesian decomposition for analysing microarray data.},
449
-	Volume = {18},
450
-	Year = {2002},
451
-	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RAkYAAAAAAkYAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAMPjDF1IKwAABRay/hVNb2xvc2hva19CRF9ZZWFzdC5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFFrL/xebg41BERiBDQVJPAAIACQAACSAAAAAAAAAAAAAAAAAAAAABMgAAEAAIAADD41KtAAAAEQAIAADF5xkjAAAAAQAoBRay/gUWsv0COfpEAjlFGgAUHYQAFBk5AA/qXAAKpBIACqQRAAB63AACAIVNYWNpbnRvc2ggSEQ6VXNlcnM6ZWpmZXJ0aWc6TGlicmFyeTpNYWlsOklNQVAtZWpmZXJ0aWdAbWFpbC5sIzE0MTkzOS5vcmc6SU5CT1g6T2Nocy5pbWFwbWJveDpBdHRhY2htZW50czozMDg3OjI6TW9sb3Nob2tfQkRfWWVhc3QucGRmAAAOACwAFQBNAG8AbABvAHMAaABvAGsAXwBCAEQAXwBZAGUAYQBzAHQALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAHpVc2Vycy9lamZlcnRpZy9MaWJyYXJ5L01haWwvSU1BUC1lamZlcnRpZ0BtYWlsLmxpcXVpZGRpcnQub3JnL0lOQk9YL09jaHMuaW1hcG1ib3gvQXR0YWNobWVudHMvMzA4Ny8yL01vbG9zaG9rX0JEX1llYXN0LnBkZgATAAEvAAAVAAIAD///AACABdIcHR4fWCRjbGFzc2VzWiRjbGFzc25hbWWjHyAhXU5TTXV0YWJsZURhdGFWTlNEYXRhWE5TT2JqZWN0XxBxLi4vLi4vTGlicmFyeS9NYWlsL0lNQVAtZWpmZXJ0aWdAbWFpbC5saXF1aWRkaXJ0Lm9yZy9JTkJPWC9PY2hzLmltYXBtYm94L0F0dGFjaG1lbnRzLzMwODcvMi9Nb2xvc2hva19CRF9ZZWFzdC5wZGbSHB0kJaIlIVxOU0RpY3Rpb25hcnkSAAGGoF8QD05TS2V5ZWRBcmNoaXZlcgAIABEAFgAfACgAMgA1ADoAPABFAEsAUgBdAGUAbABvAHEAcwB2AHgAegB8AIYAkwCYAKAC6gLsAvEC+gMFAwkDFwMeAycDmwOgA6MDsAO1AAAAAAAAAgEAAAAAAAAAKAAAAAAAAAAAAAAAAAAAA8c=}}
17
+	Number = {6755},
18
+	Pages = {788-791},
19
+	Doi = {10.1038/44565}}
452 20
 
453
-@article{Ochs1999,
454
-	Abstract = {A frequent problem in analysis is the need to find two matrices, closely related to the underlying measurement process, which when multiplied together reproduce the matrix of data points. Such problems arise throughout science, for example, in imaging where both the calibration of the sensor and the true scene may be unknown and in localized spectroscopy where multiple components may be present in varying amounts in any spectrum. Since both matrices are unknown, such a decomposition is a bilinear problem. We report here a solution to this problem for the case in which the decomposition results in matrices with elements drawn from positive additive distributions. We demonstrate the power of the methodology on chemical shift images (CSI). The new method, Bayesian spectral decomposition (BSD), reduces the CSI data to a small number of basis spectra together with their localized amplitudes. We apply this new algorithm to a 19F nonlocalized study of the catabolism of 5-fluorouracil in human liver, 31P CSI studies of a human head and calf muscle, and simulations which show its strengths and limitations. In all cases, the dataset, viewed as a matrix with rows containing the individual NMR spectra, results from the multiplication of a matrix of generally nonorthogonal basis spectra (the spectral matrix) by a matrix of the amplitudes of each basis spectrum in the the individual voxels (the amplitude matrix). The results show that BSD can simultaneously determine both the basis spectra and their distribution. In principle, BSD should solve this bilinear problem for any dataset which results from multiplication of matrices representing positive additive distributions if the data overdetermine the solutions.},
455
-	Address = {NMR and Medical Spectroscopy, Fox Chase Cancer Center, Philadelphia, PA, USA.},
456
-	Au = {Ochs, MF and Stoyanova, RS and Arias-Mendoza, F and Brown, TR},
457
-	Author = {Ochs, M.F. and Stoyanova, R.S. and Arias-Mendoza, F. and Brown, T.R.},
458
-	Ci = {Copyright 1999 Academic Press.},
459
-	Da = {19990331},
460
-	Date-Added = {2010-01-08 14:39:10 -0500},
461
-	Date-Modified = {2010-01-08 14:39:10 -0500},
462
-	Dcom = {19990331},
463
-	Doi = {10.1006/jmre.1998.1639},
464
-	Edat = {1999/03/04},
465
-	Gr = {CA41078/CA/United States NCI; CA62556/CA/United States NCI},
466
-	Issn = {1090-7807 (Print)},
467
-	Jid = {9707935},
468
-	Journal = {J Magn Reson},
469
-	Jt = {Journal of magnetic resonance (San Diego, Calif. : 1997)},
470
-	Keywords = {Markov Chain Monte Carlo, Bayesian, k25},
471
-	Language = {eng},
472
-	Lr = {20071114},
473
-	Mh = {Adenosine Triphosphate/analysis; Bayes Theorem; Brain/metabolism; Fluorouracil/*metabolism; Humans; Image Processing, Computer-Assisted; Liver/*metabolism; Magnesium/analysis; Magnetic Resonance Spectroscopy/*methods; Muscle, Skeletal/metabolism},
474
-	Mhda = {1999/03/04 00:01},
475
-	Number = {1},
476
-	Own = {NLM},
477
-	Pages = {161--176},
478
-	Pii = {S1090-7807(98)91639-1},
479
-	Pl = {UNITED STATES},
480
-	Pmid = {10053145},
481
-	Pst = {ppublish},
482
-	Pt = {Comparative Study; Journal Article; Research Support, U.S. Gov't, P.H.S.},
483
-	Pubm = {Print},
484
-	Rn = {51-21-8 (Fluorouracil); 56-65-5 (Adenosine Triphosphate); 7439-95-4 (Magnesium)},
485
-	Sb = {IM},
486
-	So = {J Magn Reson. 1999 Mar;137(1):161-76.},
487
-	Stat = {MEDLINE},
488
-	Title = {A new method for spectral decomposition using a bilinear Bayesian approach.},
489
-	Volume = {137},
490
-	Year = {1999},
491
-	Bdsk-Url-1 = {http://dx.doi.org/10.1006/jmre.1998.1639}}
21
+@conference{FERTIG_2012,
22
+	Author = {Fertig, Elana J. and Favorov, Alexander V. and Ochs, Michael F.},
23
+ 	Year = {2012},
24
+	Title = {Identifying Context-Specific Transcription Factor Targets from Prior Knowledge and Gene Expression Data},
25
+	Booktitle = {IEEE International Conference on Bioinformatics and Biomedicine},
26
+	Address = {Philadelphia, PA, USA}}
492 27
 
493
-@conference{Plummer2003,
494
-	Address = {Vienna, Austria},
495
-	Author = {M. Plummer},
496
-	Booktitle = {Proceedings of the 3rd Internation Workshop on Distributed Statistical Computing},
497
-	Date-Added = {2010-01-08 14:38:05 -0500},
498
-	Date-Modified = {2010-02-02 15:47:49 -0500},
499
-	Editor = {K. Hornik and F. Leisch and A. Zeileis},
500
-	Keywords = {Markov Chain Monte Carlo},
501
-	Month = {March 20-22},
502
-	Title = {{JAGS}: A program for analysis of {Bayesian} graphical models using {Gibbs} sampling},
503
-	Year = {2003}}
28
+@incollection{OCHS_2003,
29
+	Author = {Ochs, Michael F.},
30
+	Year = {2003},
31
+	Title = {Bayesian Decomposition},
32
+	Booktitle = {The Analysis of Gene Expression Data: Methods and Software},
33
+	Editor = {Parmigiani, Giovanni and Garett, Elizabeth S. and Irizarry, Rafael A. and Zeger, Scott L.},
34
+	Address = {New York},
35
+ 	Publisher = {Springer-Verlag}}
504 36
 
505
-@article{Ochs2009,
506
-	Abstract = {Cell signaling plays a central role in the etiology of cancer. Numerous therapeutics in use or under development target signaling proteins; however, off-target effects often limit assignment of positive clinical response to the intended target. As direct measurements of signaling protein activity are not generally feasible during treatment, there is a need for more powerful methods to determine if therapeutics inhibit their targets and when off-target effects occur. We have used the Bayesian Decomposition algorithm and data on transcriptional regulation to create a novel methodology, Differential Expression for Signaling Determination (DESIDE), for inferring signaling activity from microarray measurements. We applied DESIDE to deduce signaling activity in gastrointestinal stromal tumor cell lines treated with the targeted therapeutic imatinib mesylate (Gleevec). We detected the expected reduced activity in the KIT pathway, as well as unexpected changes in the p53 pathway. Pursuing these findings, we have determined that imatinib-induced DNA damage is responsible for the increased activity of p53, identifying a novel off-target activity for this drug. We then used DESIDE on data from resected, post-imatinib treatment tumor samples and identified a pattern in these tumors similar to that at late time points in the cell lines, and this pattern correlated with initial clinical response. The pattern showed increased activity of ETS domain-containing protein Elk-1 and signal transducers and activators of transcription 3 transcription factors, which are associated with the growth of side population cells. DESIDE infers the global reprogramming of signaling networks during treatment, permitting treatment modification that leverages ongoing drug development efforts, which is crucial for personalized medicine.},
507
-	Address = {Division of Oncology Biostatistics and Bioinformatics, Johns Hopkins University, Baltimore, Maryland 21205, USA. mfo@jhu.edu},
508
-	Au = {Ochs, MF and Rink, L and Tarn, C and Mburu, S and Taguchi, T and Eisenberg, B and Godwin, AK},
509
-	Author = {Ochs, M.F. and Rink, L. and Tarn, C. and Mburu, S. and Taguchi, T. and Eisenberg, B. and Godwin, A.K.},
510
-	Crdt = {2009/11/12 06:00},
511
-	Da = {20091204},
512
-	Date-Added = {2010-01-08 14:35:07 -0500},
513
-	Date-Modified = {2010-02-02 15:57:46 -0500},
514
-	Dcom = {20091221},
515
-	Dep = {20091110},
516
-	Doi = {10.1158/0008-5472.CAN-09-1709},
517
-	Edat = {2009/11/12 06:00},
518
-	Gr = {CA009035/CA/NCI NIH HHS/United States; CA106588/CA/NCI NIH HHS/United States; CA21661/CA/NCI NIH HHS/United States; LM009382/LM/NLM NIH HHS/United States},
519
-	Issn = {1538-7445 (Electronic); 1538-7445 (Linking)},
520
-	Jid = {2984705R},
521
-	Journal = {Cancer Res},
522
-	Jt = {Cancer research},
523
-	Language = {eng},
524
-	Mh = {Antineoplastic Agents/*pharmacology; Cell Line, Tumor; DNA Damage; Gastrointestinal Stromal Tumors/*drug therapy/*genetics/metabolism; Gene Expression Profiling; Humans; Piperazines/*pharmacology; Pyrimidines/*pharmacology; RNA, Messenger/biosynthesis/genetics; STAT3 Transcription Factor/metabolism; Signal Transduction; Tumor Suppressor Protein p53/genetics/metabolism; ets-Domain Protein Elk-1/metabolism},
525
-	Mhda = {2009/12/22 06:00},
526
-	Mid = {NIHMS149886},
527
-	Number = {23},
528
-	Oid = {NLM: NIHMS149886 {$[$}Available on 12/01/10{$]$}; NLM: PMC2789202 {$[$}Available on 12/01/10{$]$}},
529
-	Own = {NLM},
530
-	Pages = {9125--9132},
531
-	Phst = {2009/11/10 {$[$}aheadofprint{$]$}},
532
-	Pii = {0008-5472.CAN-09-1709},
533
-	Pl = {United States},
534
-	Pmc = {PMC2789202},
535
-	Pmcr = {2010/12/01},
536
-	Pmid = {19903850},
537
-	Pst = {ppublish},
538
-	Pt = {Journal Article; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov't},
539
-	Rn = {0 (Antineoplastic Agents); 0 (ELK1 protein, human); 0 (Piperazines); 0 (Pyrimidines); 0 (RNA, Messenger); 0 (STAT3 Transcription Factor); 0 (STAT3 protein, human); 0 (TP53 protein, human); 0 (Tumor Suppressor Protein p53); 0 (ets-Domain Protein Elk-1); 152459-95-5 (imatinib)},
540
-	Sb = {IM},
541
-	So = {Cancer Res. 2009 Dec 1;69(23):9125-32. Epub 2009 Nov 10.},
542
-	Stat = {MEDLINE},
543
-	Title = {Detection of treatment-induced changes in signaling pathways in gastrointestinal stromal tumors using transcriptomic data.},
544
-	Volume = {69},
37
+@article{OCHS_2009,
38
+	Author = {Ochs, Michael F. and Rink, Lori and Tarn, Chi and Mburu, Sarah and Taguchi, Takahiro and Eisenberg, Burton and Godwin, Andrew K.},
545 39
 	Year = {2009},
546
-	Bdsk-Url-1 = {http://dx.doi.org/10.1158/0008-5472.CAN-09-1709}}
547
-
548
-@conference{Skilling1998,
549
-	Address = {Dordrecht/Boston/London},
550
-	Author = {J. Skilling},
551
-	Booktitle = {Maximum Entropy and Bayesian Methods, Proceedings of the 17th International Workshop on Maxiumum Entropy and Bayesian Methods of Statistical Analysis},
552
-	Date-Added = {2010-01-08 14:27:23 -0500},
553
-	Date-Modified = {2010-01-08 14:29:16 -0500},
554
-	Editor = {G. J. Erickson and J. T. Rychert and C. R. Smith},
555
-	Publisher = {Kluwer Academic Publishers},
556
-	Title = {Massive inference and maximum entropy},
557
-	Year = {1998}}
40
+	Title = {Detection of Treatment-Induced Changes in Signaling Pathways in Gastrointestinal Stromal Tumors Using Transcriptomic Data},
41
+	Journal = {Cancer Research},
42
+	Volume = {69},
43
+	Number = {23},
44
+	Pages = {9125-9132},
45
+	Doi = {10.1158/0008-5472.CAN-09-1709}}
558 46
 
559
-@article{Sibisi1997,
560
-	Author = {S. Sibisi and J. Skilling},
561
-	Date-Added = {2010-01-08 13:50:30 -0500},
562
-	Date-Modified = {2010-01-08 13:50:30 -0500},
563
-	Journal = {Journal of the Royal Statistical Society, B},
564
-	Keywords = {Bayesian, probability},
565
-	Number = {1},
566
-	Pages = {217-235},
567
-	Title = {Prior distributions on measure space},
568
-	Volume = {59},
569
-	Year = {1997},
570
-	Bdsk-File-1 = {YnBsaXN0MDDUAQIDBAUIJidUJHRvcFgkb2JqZWN0c1gkdmVyc2lvblkkYXJjaGl2ZXLRBgdUcm9vdIABqAkKFRYXGyIjVSRudWxs0wsMDQ4RFFpOUy5vYmplY3RzV05TLmtleXNWJGNsYXNzog8QgASABqISE4ACgAOAB1lhbGlhc0RhdGFccmVsYXRpdmVQYXRo0hgNGRpXTlMuZGF0YU8RAfIAAAAAAfIAAgAADE1hY2ludG9zaCBIRAAAAAAAAAAAAAAAAAAAAMkCCRlIKwAAAAnLqh9TaWJpc2lTa2lsbGluZ19KUm95YWwjOUNDRTEucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACczhxKuwAwAAAAAAAAAA/////wAACSAAAAAAAAAAAAAAAAAAAAAKUmVmZXJlbmNlcwAQAAgAAMkCT2kAAAARAAgAAMSr6EMAAAABABAACcuqAAhtRgAITPkAAJMnAAIAUE1hY2ludG9zaCBIRDpVc2VyczplamZlcnRpZzpEb2N1bWVudHM6UmVmZXJlbmNlczpTaWJpc2lTa2lsbGluZ19KUm95YWwjOUNDRTEucGRmAA4ATgAmAFMAaQBiAGkAcwBpAFMAawBpAGwAbABpAG4AZwBfAEoAUgBvAHkAYQBsAFMAdABhAHQAUwBvAGMAQgBfADEAOQA5ADcALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEpVc2Vycy9lamZlcnRpZy9Eb2N1bWVudHMvUmVmZXJlbmNlcy9TaWJpc2lTa2lsbGluZ19KUm95YWxTdGF0U29jQl8xOTk3LnBkZgATAAEvAAAVAAIAD///AACABdIcHR4fWCRjbGFzc2VzWiRjbGFzc25hbWWjHyAhXU5TTXV0YWJsZURhdGFWTlNEYXRhWE5TT2JqZWN0XxBiLi4vLi4vLi4vLi4vLi4vLi4vLi4vLi4vVXNlcnMvZWpmZXJ0aWcvRG9jdW1lbnRzL1JlZmVyZW5jZXMvU2liaXNpU2tpbGxpbmdfSlJveWFsU3RhdFNvY0JfMTk5Ny5wZGbSHB0kJaIlIVxOU0RpY3Rpb25hcnkSAAGGoF8QD05TS2V5ZWRBcmNoaXZlcgAIABEAFgAfACgAMgA1ADoAPABFAEsAUgBdAGUAbABvAHEAcwB2AHgAegB8AIYAkwCYAKAClgKYAp0CpgKxArUCwwLKAtMDOAM9A0ADTQNSAAAAAAAAAgEAAAAAAAAAKAAAAAAAAAAAAAAAAAAAA2Q=}}
571 47
new file mode 100644
... ...
@@ -0,0 +1,71 @@
1
+## Pattern Markers
2
+
3
+## Selecting Appropiate Number of Patterns
4
+
5
+Selecting the best value for *nPatterns* is the most difficult part of the
6
+analysis. For starters, there is not one "best" value for the number of
7
+patterns - various numbers of patterns can capture various levels of
8
+granularity in the data. To further complicate the problem, there's not a
9
+clear way to compare runs for different numbers of patterns.
10
+
11
+Here we show the simplest approach of selecting dimensionality by plotting the
12
+error and selecting the least number of patterns that sufficiently reduce the
13
+error. We also introduce another way to pass parameters - any parameter in the 
14
+*CogapsParams* class can be passed by name directly to the *CoGAPS* function,
15
+overwriting the value contained in `params`.
16
+
17
+```{r eval=FALSE}
18
+# define the range of patterns we are searching over
19
+pattern_range <- c(3,5,8)
20
+
21
+# run CoGAPS with each value in range
22
+resultList <- lapply(pattern_range, function(p) CoGAPS(GIST.D, params, nPatterns=p, nIterations=3000, outputFrequency=2500))
23
+
24
+# plot chi-sq values for each run
25
+chisq <- sapply(resultList, function(result) getMeanChiSq(result))
26
+plot(pattern_range, chisq)
27
+```
28
+
29
+## CoGAPS-based statistics
30
+
31
+### CoGAPSStat
32
+
33
+The function *calcCoGAPSStat* is used to infer gene set activity in each
34
+pattern from the CoGAPS matrix factorization. *calcCoGAPSStat* calculates the
35
+gene set statistics for each column of the feature matrix using a Z-score, the
36
+input gene set and permutation tests. The function outputs a list containing:
37
+
38
+* *GSUpreg* lists p-values for upregulation of each gene set in each pattern
39
+* *GSDownreg* lists p-values for downregulation of each gene set in each pattern
40
+* *GSActEst* provides gene set activity through conversion of p-values to
41
+activity estimates of each gene set in each pattern
42
+
43
+### computeGeneGSProb
44
+
45
+Now using the *computeGeneGSProb* function, we can use the gene set statistic
46
+(returned from calcCoGAPSStat) to compute a statistic to quantify the
47
+likelihood of membership for each gene annotated to a set based on its
48
+inferred activity. The statistic used to infer membership compares the 
49
+expression pattern of a gene annotated as a member of a gene set to the common
50
+expression pattern of all annotated members of that gene set. The function
51
+outputs the p-value of a set membership for each gene specified in GStoGenes. 
52
+
53
+
54
+
55
+Notice that we pass the transpose of the *GIST* data to *scCoGAPS* since
56
+the normal dataset is 1363 x 9 and we want a data set with a large number of
57
+samples.
58
+
59
+It is neccesary to name the simulation so that the computation portion knows
60
+what the subset files are called. For convenience the name provided to the
61
+subset function is returned back so that it can be saved.
62
+
63
+Running the computation is just as easy as running normal *CoGAPS*. The results
64
+will be in the exact same format as *CoGAPS*, only the intermediate computation
65
+is different. Notice here that no *params* object is accepted - all parameters
66
+must be passed by name.
67
+
68
+```{r eval=FALSE}
69
+GWCoGAPS(gw_sim_name, nPatterns=3, nIterations=1000)
70
+scCoGAPS(sc_sim_name, nPatterns=3, nIterations=1000)
71
+```