##------------------------------------------------------------------- ## Name: TSPM.R ## R code for the paper by Paul L. Auer and R.W. Doerge: ## "A Two-Stage Poisson Model for Testing RNA-Seq Data" ## Date: February 2011 ## Contact: Paul Auer plivermo@fhcrc.org ## R.W. Doerge doerge@purdue.edu ## Example: ## counts <- matrix(0, nrow=1000, ncol=10) ## for(i in 1:1000){ ## lambda <- rpois(n=1, lambda=10) ## counts[i,] <- rpois(n=10, lambda=lambda) ## } ## x1 <- gl(n=2, k=5, labels=c("T", "C")) ## x0 <- rep(1, times=10) ## lib.size <- apply(counts,2,sum) ## result <- TSPM(counts, x1, x0, lib.size) ##--------------------------------------------------------------------- ####################################################################### ###### The TSPM function ############################################## ####################################################################### TSPM <- function(counts, x1, x0, lib.size, alpha.wh=0.05){ ## Input: #counts: a matrix of RNA-Seq gene counts (genes are rows, samples are # columns) #x1: a vector of treatment group factors (under the alternative # hypothesis) #x0: a vector of treatment group factors (under the null hypothesis) #lib.size: a vector of RNA-Seq library sizes. This could simply be obtained # by specifying lib.size <- apply(counts,2,sum). It may also be any # other appropriate scaling factor. #alpha.wh: the significance threshold to use for deciding whether a gene is # overdispersed. # Defaults to 0.05. ## Output: #log.fold.change: a vector containing the estimated log fold changes for # each gene #pvalues: a vector containing the raw p-values testing differential # expression for each gene. #index.over.disp: a vector of integer values containing the indices of the # over-dispersed genes. #index.not.over.disp: a vector of integer values containing the indices of the # non-over-dispersed genes. #padj: a vector containing the p-values after adjusting for # multiple testing using the # method of Benjamini-Hochberg ######## The main loop that fits the GLMs to each gene ##################### ### Initializing model parameters #### n <- dim(counts)[1] per.gene.disp <- NULL LRT <- NULL score.test <- NULL LFC <- NULL ###### Fitting the GLMs for each gene ################# for(i in 1:n){ ### Fit full and reduced models ### model.1 <- glm(as.numeric(counts[i,]) ~ x1, offset=log(lib.size), family=poisson) model.0 <- glm(as.numeric(counts[i,]) ~ x0, offset=log(lib.size), family=poisson) ### Obtain diagonals of Hat matrix from the full model fit ### hats <- hatvalues(model.1) ### Obtain Pearson overdispersion estimate #### per.gene.disp[i] <- sum(residuals(model.1, type="pearson")^2)/model.1$df.residual ### Obtain Likelihood ratio statistic #### LRT[i] <- deviance(model.0)-deviance(model.1) ### Obtain score test statistic #### score.test[i] <- 1/(2*length(counts[i,])) * sum(residuals(model.1, type="pearson")^2 - ((counts[i,] - hats*model.1$fitted.values)/model.1$fitted.values))^2 ### Obtain the estimated log fold change ### LFC[i] <- -model.1$coef[2] } ## Initialize parameters for Working-Hotelling bands around the score TSs ### qchi <- qchisq(df=1, (1:n-0.5)/n) MSE <- 2 UL <- NULL #### Obtain the upper boundary of the WH bands ############################ xbar <- mean(qchi) bottom <- sum((qchi-xbar)^2) top <- (qchi-xbar)^2 s <- sqrt(MSE*(1/n) + (top/bottom)) W <- sqrt(2*qf(df1=1, df2=n-1, p=1-(alpha.wh/n))) UL <- pmax(qchi + W*s,1) ###### Obtain the indices of the over-dispersed and not-over-dispersed genes, # respectively ########## cutoff <- min(which(sort(score.test)-UL > 0)) temp <- cutoff-1 + seq(cutoff:length(score.test)) over.disp <- which(score.test %in% sort(score.test)[temp]) not.over.disp <- setdiff(1:length(score.test), over.disp) ###### Compute p-values #################################### p.f <- pf(LRT[over.disp]/per.gene.disp[over.disp], df1=1, df2=model.1$df.residual, lower.tail=FALSE) p.chi <- pchisq(LRT[not.over.disp], df=1, lower.tail=FALSE) p <- NULL p[over.disp] <- p.f p[not.over.disp] <- p.chi ##### Adjust the p-values using the B-H method #################### p.bh.f <- p.adjust(p.f, method="BH") p.bh.chi <- p.adjust(p.chi, method="BH") final.p.bh.tagwise <- NULL final.p.bh.tagwise[over.disp] <- p.bh.f final.p.bh.tagwise[not.over.disp] <- p.bh.chi ### Output ### list(log.fold.change=LFC, pvalues=p, index.over.disp=over.disp, index.not.over.disp=not.over.disp, padj=final.p.bh.tagwise) }