% Storey (2002), JRSS-B 64: fixed-rejection-region approach to FDR/pFDR and the q-value.
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
@article{storey:2002,
  author = {Storey, John D.},
  title = {A direct approach to false discovery rates},
  journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  volume = {64},
  pages = {479--498},
  year = {2002},
  publisher = {Blackwell Publishing Ltd},
  keywords = {false discovery rate; multiple comparisons; positive false discovery rate; p-values; q-values; sequential p-value methods; simultaneous inference},
  abstract = {Multiple-hypothesis testing involves guarding against much more complicated errors than single-hypothesis testing. Whereas we typically control the type I error rate for a single-hypothesis test, a compound error rate is controlled for multiple-hypothesis tests. For example, controlling the false discovery rate FDR traditionally involves intricate sequential p-value rejection methods based on the observed data. Whereas a sequential p-value method fixes the error rate and estimates its corresponding rejection region, we propose the opposite approach---we fix the rejection region and then estimate its corresponding error rate. This new approach offers increased applicability, accuracy and power. We apply the methodology to both the positive false discovery rate pFDR and FDR, and provide evidence for its benefits. It is shown that pFDR is probably the quantity of interest over FDR. Also discussed is the calculation of the q-value, the pFDR analogue of the p-value, which eliminates the need to set the error rate beforehand as is traditionally done. Some simple numerical examples are presented that show that this new approach can yield an increase of over eight times in power compared with the Benjamini-Hochberg FDR method.},
  isi = {000177425500009},
  isi-recid = {126051257},
  isi-ref-recids = {112504863 90155838 115373815 122784094 87253760 119531800 126051258 126051259 119668320 112504865},
  times-cited = {1132},
  date-added = {2011-10-30 22:26:25 -0400},
  date-modified = {2011-10-30 22:26:25 -0400},
}

  % Storey and Tibshirani (2003), PNAS 100(16): q-values for genomewide studies.
  % NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
  @article{storey:tibshirani:2003,
    author = {Storey, John D. and Tibshirani, Robert},
    title = {Statistical significance for genomewide studies},
    journal = {Proceedings of the National Academy of Sciences of the United States of America},
    shortjournal = {Proc Natl Acad Sci U S A},
    volume = {100},
    number = {16},
    pages = {9440--9445},
    month = aug,
    year = {2003},
    doi = {10.1073/pnas.1530509100},
    pmc = {PMC170937},
    pmid = {12883005},
    pst = {ppublish},
    mesh = {Algorithms; Alternative Splicing; Animals; Binding Sites; Exons; Gene Expression Regulation; Genetic Linkage; Genetic Techniques; Genome; Humans; Oligonucleotide Array Sequence Analysis; Statistics as Topic; Transcription, Genetic},
    abstract = {With the increase in genomewide experiments and the sequencing of multiple genomes, the analysis of large data sets has become commonplace in biology. It is often the case that thousands of features in a genomewide data set are tested against some null hypothesis, where a number of features are expected to be significant. Here we propose an approach to measuring statistical significance in these genomewide studies based on the concept of the false discovery rate. This approach offers a sensible balance between the number of true and false positives that is automatically calibrated and easily interpreted. In doing so, a measure of statistical significance called the q value is associated with each tested feature. The q value is similar to the well known p value, except it is a measure of significance in terms of the false discovery rate rather than the false positive rate. Our approach avoids a flood of false positive results, while offering a more liberal criterion than what has been used in genome scans for linkage.},
    date-added = {2011-10-30 22:16:49 -0400},
    date-modified = {2011-10-30 22:16:49 -0400},
  }

% Woo, Leek and Storey (2011), Bioinformatics 27(4): modular optimal discovery procedure (mODP).
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
@article{woo:2011,
  author = {Woo, Sangsoon and Leek, Jeffrey T. and Storey, John D.},
  title = {A computationally efficient modular optimal discovery procedure},
  journal = {Bioinformatics},
  volume = {27},
  number = {4},
  pages = {509--515},
  year = {2011},
  doi = {10.1093/bioinformatics/btq701},
  url = {http://bioinformatics.oxfordjournals.org/content/27/4/509.abstract},
  eprint = {http://bioinformatics.oxfordjournals.org/content/27/4/509.full.pdf+html},
  abstract = {Motivation: It is well known that patterns of differential gene expression across biological conditions are often shared by many genes, particularly those within functional groups. Taking advantage of these patterns can lead to increased statistical power and biological clarity when testing for differential expression in a microarray experiment. The optimal discovery procedure (ODP), which maximizes the expected number of true positives for each fixed number of expected false positives, is a framework aimed at this goal. Storey et al. introduced an estimator of the ODP for identifying differentially expressed genes. However, their ODP estimator grows quadratically in computational time with respect to the number of genes. Reducing this computational burden is a key step in making the ODP practical for usage in a variety of high-throughput problems. Results: Here, we propose a new estimate of the ODP called the modular ODP (mODP). The existing `full ODP' requires that the likelihood function for each gene be evaluated according to the parameter estimates for all genes. The mODP assigns genes to modules according to a Kullback--Leibler distance, and then evaluates the statistic only at the module-averaged parameter estimates. We show that the mODP is relatively insensitive to the choice of the number of modules, but dramatically reduces the computational complexity from quadratic to linear in the number of genes. We compare the full ODP algorithm and mODP on simulated data and gene expression data from a recent study of Moroccan Amazighs. The mODP and full ODP algorithm perform very similarly across a range of comparisons. Availability: The mODP methodology has been implemented into EDGE, a comprehensive gene expression analysis software package in R, available at http://genomine.org/edge/. Contact: jstorey@princeton.edu Supplementary information: Supplementary data are available at Bioinformatics online.},
}

% Storey (2007), JRSS-B 69(3): the optimal discovery procedure (ODP).
@article{storey:2007,
  author = {Storey, John D.},
  title = {The optimal discovery procedure: a new approach to simultaneous significance testing},
  journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  volume = {69},
  number = {3},
  pages = {347--368},
  year = {2007},
  publisher = {Blackwell Publishing Ltd},
  issn = {1467-9868},
  doi = {10.1111/j.1467-9868.2007.005592.x},
  url = {http://dx.doi.org/10.1111/j.1467-9868.2007.005592.x},
  keywords = {Classification, False discovery rate, Multiple-hypothesis testing, Optimal discovery procedure, q-value, Single-thresholding procedure},
}

% Idaghdour, Storey, Jadallah and Gibson, PLoS Genetics 4: expression signature in Moroccan Amazighs.
% NOTE(review): year is taken from the citation key; number and pages were added from the published record -- confirm.
@article{gibson:2008,
  author = {Idaghdour, Y. and Storey, John D. and Jadallah, S. J. and Gibson, G.},
  title = {A Genome-Wide Gene Expression Signature of Environmental Geography in Leukocytes of {Moroccan} {Amazighs}},
  journal = {PLoS Genetics},
  volume = {4},
  number = {4},
  pages = {e1000052},
  year = {2008},
  doi = {10.1371/journal.pgen.1000052},
}

% Storey, Dai and Leek (2007), Biostatistics 8(2): estimated ODP for comparative microarray experiments.
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
@article{storey:dai:leek:2007,
  author = {Storey, John D. and Dai, James Y. and Leek, Jeffrey T.},
  title = {The optimal discovery procedure for large-scale significance testing, with applications to comparative microarray experiments},
  journal = {Biostatistics},
  volume = {8},
  number = {2},
  pages = {414--432},
  year = {2007},
  doi = {10.1093/biostatistics/kxl019},
  url = {http://biostatistics.oxfordjournals.org/content/8/2/414.abstract},
  eprint = {http://biostatistics.oxfordjournals.org/content/8/2/414.full.pdf+html},
  abstract = {As much of the focus of genetics and molecular biology has shifted toward the systems level, it has become increasingly important to accurately extract biologically relevant signal from thousands of related measurements. The common property among these high-dimensional biological studies is that the measured features have a rich and largely unknown underlying structure. One example of much recent interest is identifying differentially expressed genes in comparative microarray experiments. We propose a new approach aimed at optimally performing many hypothesis tests in a high-dimensional study. This approach estimates the optimal discovery procedure (ODP), which has recently been introduced and theoretically shown to optimally perform multiple significance tests. Whereas existing procedures essentially use data from only one feature at a time, the ODP approach uses the relevant information from the entire data set when testing each feature. In particular, we propose a generally applicable estimate of the ODP for identifying differentially expressed genes in microarray experiments. This microarray method consistently shows favorable performance over five highly used existing methods. For example, in testing for differential expression between two breast cancer tumor types, the ODP provides increases from 72\% to 185\% in the number of genes called significant at a false discovery rate of 3\%. Our proposed microarray method is freely available to academic users in the open-source, point-and-click EDGE software package.},
}

% Storey, Xiao, Leek, Tompkins and Davis (2005), PNAS 102(36): significance analysis for time course microarrays.
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
@article{storey:2005,
  author = {Storey, John D. and Xiao, Wenzhong and Leek, Jeffrey T. and Tompkins, Ronald G. and Davis, Ronald W.},
  title = {Significance analysis of time course microarray experiments},
  journal = {Proceedings of the National Academy of Sciences of the United States of America},
  volume = {102},
  number = {36},
  pages = {12837--12842},
  year = {2005},
  doi = {10.1073/pnas.0504609102},
  url = {http://www.pnas.org/content/102/36/12837.abstract},
  eprint = {http://www.pnas.org/content/102/36/12837.full.pdf+html},
  abstract = {Characterizing the genome-wide dynamic regulation of gene expression is important and will be of much interest in the future. However, there is currently no established method for identifying differentially expressed genes in a time course study. Here we propose a significance method for analyzing time course microarray studies that can be applied to the typical types of comparisons and sampling schemes. This method is applied to two studies on humans. In one study, genes are identified that show differential expression over time in response to in vivo endotoxin administration. By using our method, 7,409 genes are called significant at a 1\% false-discovery rate level, whereas several existing approaches fail to identify any genes. In another study, 417 genes are identified at a 10\% false-discovery rate level that show expression changing with age in the kidney cortex. Here it is also shown that as many as 47\% of the genes change with age in a manner more complex than simple exponential growth or decay. The methodology proposed here has been implemented in the freely distributed and open-source edge software package.},
}

% Leek, Monsen, Dabney and Storey (2006), Bioinformatics 22(4): the EDGE software package.
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
@article{leek:2006,
  author = {Leek, Jeffrey T. and Monsen, Eva and Dabney, Alan R. and Storey, John D.},
  title = {{EDGE}: extraction and analysis of differential gene expression},
  journal = {Bioinformatics},
  volume = {22},
  number = {4},
  pages = {507--508},
  year = {2006},
  doi = {10.1093/bioinformatics/btk005},
  url = {http://bioinformatics.oxfordjournals.org/content/22/4/507.abstract},
  eprint = {http://bioinformatics.oxfordjournals.org/content/22/4/507.full.pdf+html},
  abstract = {Summary: EDGE (Extraction of Differential Gene Expression) is an open source, point-and-click software program for the significance analysis of DNA microarray experiments. EDGE can perform both standard and time course differential expression analysis. The functions are based on newly developed statistical theory and methods. This document introduces the EDGE software package. Availability: EDGE is freely available for non-commercial users. EDGE can be downloaded for Windows, Macintosh and Linux/UNIX from http://faculty.washington.edu/jstorey/edge Contact: jtleek@u.washington.edu},
}

% Hedenfalk et al. (2001), New England Journal of Medicine 344(8): expression profiles in hereditary breast cancer.
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
% NOTE(review): the original had empty, unterminated URL and eprint fields; they are dropped because they carried no value and unbalanced the braces.
@article{hedenfalk:2001,
  author = {Hedenfalk, Ingrid and Duggan, David and Chen, Yidong and Radmacher, Michael and Bittner, Michael and Simon, Richard and Meltzer, Paul and Gusterson, Barry and Esteller, Manel and Raffeld, Mark and Yakhini, Zohar and Ben-Dor, Amir and Dougherty, Edward and Kononen, Juha and Bubendorf, Lukas and Fehrle, Wilfrid and Pittaluga, Stefania and Gruvberger, Sofia and Loman, Niklas and Johannsson, Oskar and Olsson, H{\aa}kan and Wilfond, Benjamin and Sauter, Guido and Kallioniemi, Olli-P. and Borg, {\AA}ke and Trent, Jeffrey},
  title = {Gene-Expression Profiles in Hereditary Breast Cancer},
  journal = {New England Journal of Medicine},
  volume = {344},
  number = {8},
  pages = {539--548},
  year = {2001},
  doi = {10.1056/NEJM200102223440801},
  note = {PMID: 11207349},
}



    % Rodwell et al. (2004), PLoS Biology 2(12): transcriptional profile of aging in the human kidney.
    % NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
    @article{rodwell:2004,
      author = {Rodwell, Graham E. J. and Sonu, Rebecca and Zahn, Jacob M. and Lund, James and Wilhelmy, Julie and Wang, Lingli and Xiao, Wenzhong and Mindrinos, Michael and Crane, Emily and Segal, Eran and Myers, Bryan D. and Brooks, James D. and Davis, Ronald W. and Higgins, John and Owen, Art B. and Kim, Stuart K.},
      title = {A Transcriptional Profile of Aging in the Human Kidney},
      journal = {PLoS Biology},
      volume = {2},
      number = {12},
      pages = {e427},
      month = nov,
      year = {2004},
      publisher = {Public Library of Science},
      doi = {10.1371/journal.pbio.0020427},
    }

  % Storey (2003), Annals of Statistics 31: Bayesian interpretation of the pFDR and the q-value.
  % NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
  @article{storey:2003,
    author = {Storey, John D.},
    title = {The positive false discovery rate: A {Bayesian} interpretation and the q-value},
    journal = {Annals of Statistics},
    volume = {31},
    pages = {2013--2035},
    year = {2003},
  }

    % Leek and Storey (2007), PLoS Genetics 3(9): surrogate variable analysis.
    % NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
    % The abstract field holds the article's "Author Summary"; the XML markup from the publisher export has been stripped.
    @article{leek:2007,
      author = {Leek, Jeffrey T. and Storey, John D.},
      title = {Capturing Heterogeneity in Gene Expression Studies by Surrogate Variable Analysis},
      journal = {PLoS Genetics},
      volume = {3},
      number = {9},
      pages = {e161},
      month = sep,
      year = {2007},
      publisher = {Public Library of Science},
      doi = {10.1371/journal.pgen.0030161},
      abstract = {Author Summary: In scientific and medical studies, great care must be taken when collecting data to understand the relationship between two variables, such as a drug and its effect on a disease. In any given study there will be many other variables at play, such as the effects of age and sex on the disease. We show that in studies where the expression levels of thousands of genes are measured at once, these issues become surprisingly critical. Due to the complexity of our genomes, environment, and demographic features, there are many sources of variation when analyzing gene expression levels. In any given study, it is impossible to measure every single variable that may be influencing how our genes are expressed. Despite this, we show that by considering all expression levels simultaneously, one can actually recover the effects of these important missed variables and essentially produce an analysis as if all relevant variables were included. As opposed to traditional studies, the massive amount of data available in this setting is what makes the method, called surrogate variable analysis, possible. We hypothesize that surrogate variable analysis will be useful in many large-scale gene expression studies.},
    }

  % Leek and Storey (2008), PNAS 105(48): framework for multiple testing dependence.
  % NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
  @article{leek:2008,
    author = {Leek, Jeffrey T. and Storey, John D.},
    title = {A general framework for multiple testing dependence},
    journal = {Proceedings of the National Academy of Sciences of the United States of America},
    shortjournal = {Proc Natl Acad Sci U S A},
    volume = {105},
    number = {48},
    pages = {18718--18723},
    month = dec,
    year = {2008},
    doi = {10.1073/pnas.0808709105},
    pmc = {PMC2586646},
    pmid = {19033188},
    pst = {ppublish},
    mesh = {Algorithms; Computer Simulation; Models, Statistical; Software; Statistics as Topic},
    abstract = {We develop a general framework for performing large-scale significance testing in the presence of arbitrarily strong dependence. We derive a low-dimensional set of random vectors, called a dependence kernel, that fully captures the dependence structure in an observed high-dimensional dataset. This result shows a surprising reversal of the ``curse of dimensionality'' in the high-dimensional hypothesis testing setting. We show theoretically that conditioning on a dependence kernel is sufficient to render statistical tests independent regardless of the level of dependence in the observed data. This framework for multiple testing dependence has implications in a variety of common multiple testing problems, such as in gene expression studies, brain imaging, and spatial epidemiology.},
    date-added = {2011-10-30 22:16:12 -0400},
    date-modified = {2011-10-30 22:16:12 -0400},
    bdsk-url-1 = {http://dx.doi.org/10.1073/pnas.0808709105},
  }

% Mecham, Nelson and Storey (2010), Bioinformatics 26(10): supervised normalization of microarrays (snm).
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
@article{mecham:2010,
  author = {Mecham, Brigham H. and Nelson, Peter S. and Storey, John D.},
  title = {Supervised normalization of microarrays},
  journal = {Bioinformatics},
  volume = {26},
  number = {10},
  pages = {1308--1315},
  year = {2010},
  doi = {10.1093/bioinformatics/btq118},
  url = {http://bioinformatics.oxfordjournals.org/content/26/10/1308.abstract},
  eprint = {http://bioinformatics.oxfordjournals.org/content/26/10/1308.full.pdf+html},
  abstract = {Motivation: A major challenge in utilizing microarray technologies to measure nucleic acid abundances is `normalization', the goal of which is to separate biologically meaningful signal from other confounding sources of signal, often due to unavoidable technical factors. It is intuitively clear that true biological signal and confounding factors need to be simultaneously considered when performing normalization. However, the most popular normalization approaches do not utilize what is known about the study, both in terms of the biological variables of interest and the known technical factors in the study, such as batch or array processing date. Results: We show here that failing to include all study-specific biological and technical variables when performing normalization leads to biased downstream analyses. We propose a general normalization framework that fits a study-specific model employing every known variable that is relevant to the expression study. The proposed method is generally applicable to the full range of existing probe designs, as well as to both single-channel and dual-channel arrays. We show through real and simulated examples that the method has favorable operating characteristics in comparison to some of the most highly used normalization methods. Availability: An R package called snm implementing the methodology will be made available from Bioconductor (http://bioconductor.org). Contact: jstorey@princeton.edu Supplementary information: Supplementary data are available at Bioinformatics online.},
}

% Calvano et al. (2005), Nature 437: network-based analysis of systemic inflammation in humans.
% NOTE(review): the entry header was missing here; the citation key is reconstructed -- confirm against citing documents.
% The issue number is taken from the "n7061" component of the article URL.
@article{calvano:2005,
  author = {Calvano, S. E. and Xiao, W. and Richards, D. R. and Felciano, R. M. and Baker, H. V. and Cho, R. J. and Chen, R. O. and Brownstein, B. H. and Cobb, J. P. and Tschoeke, S. K. and Miller-Graziano, C. and Moldawer, L. L. and Mindrinos, M. N. and Davis, R. W. and Tompkins, R. G. and Lowry, S. F.},
  title = {A network-based analysis of systemic inflammation in humans},
  journal = {Nature},
  volume = {437},
  number = {7061},
  pages = {1032--1037},
  year = {2005},
  doi = {10.1038/nature03985},
  url = {http://www.nature.com/nature/journal/v437/n7061/full/nature03985.html},
}