Browse code

Remove zinb functions

Davide Risso authored on 04/11/2016 23:55:09
Showing 6 changed files

... ...
@@ -27,7 +27,6 @@ Imports:
27 27
   grDevices,
28 28
   hexbin,
29 29
   limma,
30
-  MASS,
31 30
   matrixStats,
32 31
   mixtools,
33 32
   RColorBrewer,
... ...
@@ -11,11 +11,9 @@ export(UQ_FN_POS)
11 11
 export(biplot_colored)
12 12
 export(biplot_interactive)
13 13
 export(estimate_ziber)
14
-export(estimate_zinb)
15 14
 export(factor_sample_filter)
16 15
 export(impute_expectation)
17 16
 export(impute_null)
18
-export(impute_zinb)
19 17
 export(lm_adjust)
20 18
 export(make_design)
21 19
 export(metric_sample_filter)
... ...
@@ -47,7 +45,6 @@ importFrom(DT,formatSignif)
47 45
 importFrom(DT,renderDataTable)
48 46
 importFrom(DT,selectRows)
49 47
 importFrom(EDASeq,betweenLaneNormalization)
50
-importFrom(MASS,glm.nb)
51 48
 importFrom(NMF,aheatmap)
52 49
 importFrom(RColorBrewer,brewer.pal)
53 50
 importFrom(RUVSeq,RUVg)
54 51
deleted file mode 100644
... ...
@@ -1,176 +0,0 @@
1
-#' Parameter estimation of zero-inflated negative binomial model
2
-#'
3
-#' This function implements an EM algorithm to estimate the parameters of a zero-inflated negative binomial model
4
-#'
5
-#' This function implements an expectation-maximization algorithm for a zero-inflated
6
-#' model, with a constant gene-specific mean parameter, gene-specific dispersion and
7
-#' a mixing probability that depends solely on the mean.
8
-#'
9
-#' @param Y matrix. An integer data matrix (genes in rows, cells in columns)
10
-#' @param maxiter numeric. The maximum number of iterations.
11
-#' @param verbose logical. Whether or not to print the value of the likelihood at each value.
12
-#'
13
-#' @importFrom MASS glm.nb
14
-#' @export
15
-#'
16
-#' @return a list with the following elements:
17
-#' \itemize{
18
-#' \item{mu}{the mean of the negative binomial component}
19
-#' \item{theta}{the dispersion parameter of the negative binomial component}
20
-#' \item{pi}{the mixing probability}
21
-#' \item{coefs}{the coefficients of the logistic regression}
22
-#' \item{p_z}{the probability P(Z=0|Y), useful as weights in weighted principal components analysis}
23
-#' \item{expected_value}{the expected value E[Y]}
24
-#' \item{variance}{the variance Var(Y)}
25
-#' \item{loglik}{the log-likelihood}
26
-#' \item{convergence}{0 if the algorithm converged and 1 if maxiter was reached}
27
-#' }
28
-estimate_zinb <- function(Y, maxiter=10, verbose=FALSE) {
29
-
30
-  if(!all( .isWholeNumber(Y) )){
31
-    stop("Expression matrix contains non-integer values.")
32
-  }
33
-
34
-  n <- ncol(Y)
35
-  J <- nrow(Y)
36
-
37
-  # 0. Initial estimate of mu, pi and z
38
-  mu0 <- rowMeans(Y)
39
-
40
-  pi0 <- sapply(seq_len(n), function(i) {
41
-    y <- as.numeric(Y[,i]<=0)
42
-    fit <- glm(y~log(mu0), family = binomial)
43
-    return(fitted.values(fit))
44
-  })
45
-
46
-  zhat <- pi0/(pi0 + (1 - pi0) * dnbinom(0, size = 1, mu = matrix(mu0, nrow=J, ncol=n)))
47
-  zhat[Y>0] <- 0
48
-  thetahat <- rep(1, J)
49
-
50
-  coefs_mu <- log(mu0)
51
-  coefs_pi <- sapply(seq_len(n), function(i) {
52
-    fit <- suppressWarnings(glm(zhat[,i]~log(mu0), family = binomial(link = logit)))
53
-    return(coefficients(fit))
54
-  })
55
-
56
-  X <- matrix(rep(1, n), ncol=1)
57
-  W <- model.matrix(~log(mu0))
58
-  linkobj <- binomial()
59
-
60
-  ll_new <- loglik_small(c(coefs_mu, coefs_pi[1,], coefs_pi[2,], log(thetahat)), Y, Y>0, X, W, J, n*2, 0, 0, linkobj)
61
-  ll_old <- 2 * ll_new
62
-
63
-  ## EM iteration
64
-  iter <- 0
65
-  while (abs((ll_old - ll_new)/ll_old) > 1e-4 & iter<maxiter) {
66
-    ll_old <- ll_new
67
-    fit_mu <- bplapply(seq_len(J), function(i) {
68
-      fit <- suppressWarnings(glm.nb(Y[i,] ~ 1, weights = (1 - zhat[i,]), init.theta = thetahat[i], start=coefs_mu[i]))
69
-      return(list(coefs=coefficients(fit), theta=fit$theta))
70
-    })
71
-    coefs_mu <- sapply(fit_mu, function(x) x$coefs)
72
-    muhat <- exp(coefs_mu)
73
-    thetahat <- sapply(fit_mu, function(x) x$theta)
74
-
75
-    fit_pi <- bplapply(seq_len(n), function(i) {
76
-      fit <- suppressWarnings(glm(zhat[,i]~log(muhat), family = binomial(link = logit), start=coefs_pi[,i]))
77
-      return(list(coefs=coefficients(fit), fitted=fitted.values(fit)))
78
-    })
79
-    coefs_pi <- sapply(fit_pi, function(x) x$coefs)
80
-    pihat <- sapply(fit_pi, function(x) x$fitted)
81
-
82
-    zhat <- pihat/(pihat + (1 - pihat) * dnbinom(0, size = matrix(thetahat, nrow=J, ncol=n), mu = matrix(muhat, nrow=J, ncol=n)))
83
-    zhat[Y>0] <- 0
84
-
85
-    W <- model.matrix(~log(muhat))
86
-    ll_new <- loglik_small(c(coefs_mu, coefs_pi[1,], coefs_pi[2,], log(thetahat)), Y, Y>0, X, W, J, n*2, 0, 0, linkobj)
87
-
88
-    if(verbose) {
89
-      print(ll_new)
90
-    }
91
-
92
-    iter <- iter + 1
93
-  }
94
-
95
-  convergence <- 0
96
-  if(iter == maxiter) {
97
-    convergence <- 1
98
-  }
99
-
100
-  p_z <- 1 - (Y == 0) * pihat / (pihat + (1 - pihat) * (1 + muhat / thetahat)^(-thetahat))
101
-  eval <- (1 - pihat) * muhat
102
-  variance <- (1 - pihat) * muhat * (1 + muhat * (1/thetahat + pihat))
103
-
104
-  return(list(mu=muhat, theta=thetahat, pi=pihat, coefs=coefs_pi,
105
-              p_z=p_z, expected_value=eval, variance=variance,
106
-              loglik=ll_new, convergence=convergence))
107
-}
108
-
109
-#' Log-likelihood function of the zero-inflated negative binomial model
110
-#'
111
-#' This function computes the log-likelihood of a standard regression model
112
-#'
113
-#' This is a (hopefully) memory-efficient implementation of the log-likelihood of a
114
-#' zero-inflated negative binomial regression model.
115
-#' In this attempt, the design matrices don't have n*J rows, but n and J, respectively.
116
-#' The computation is a bit slower, but the memory usage should be much smaller for
117
-#' large J and n.
118
-#'
119
-#' @param parms a vector of parameters: should contain the values of beta, followed by those of alpha, followed by the log(1/phi)
120
-#' @param Y the data matrix (genes in rows, cells in columns)
121
-#' @param Y1 a logical indicator of Y>0
122
-#' @param X the design matrix for the regression on mu (n x k_X)
123
-#' @param W the design matrix for the regression on pi (J x k_W)
124
-#' @param kx the number of beta parameters
125
-#' @param kw the number of alpha parameters
126
-#' @param offsetx the offset for the regression on X
127
-#' @param offsetw the offset for the regression on W
128
-#' @param linkobj the link function object for the regression on pi (typically the result of binomial())
129
-loglik_small <- function(parms, Y, Y1, X, W, kx, kw, offsetx, offsetw, linkobj) {
130
-
131
-  J <- nrow(Y)
132
-  n <- ncol(Y)
133
-
134
-  beta <- matrix(parms[1:kx], ncol=J, nrow=ncol(X))
135
-  mu <- t(exp(X %*% beta + offsetx))
136
-
137
-  alpha <- matrix(parms[(kx + 1):(kw+kx)], nrow=ncol(W), ncol=n, byrow=TRUE)
138
-  eta <- W %*% alpha + offsetw
139
-  pi <- logistic(eta)
140
-
141
-  theta <- matrix(exp(parms[(kw + kx + 1):(kw + kx + J)]), nrow=J, ncol=n)
142
-
143
-  loglik0 <- log(pi + exp(log(1 - pi) + suppressWarnings(dnbinom(0, size = theta, mu = mu, log = TRUE))))
144
-  loglik1 <- log(1 - pi) + suppressWarnings(dnbinom(Y, size = theta, mu = mu, log = TRUE))
145
-
146
-  return(sum(loglik0[which(Y==0)]) + sum(loglik1[which(Y>0)]))
147
-}
148
-
149
-#' Imputation of zero counts based on zero-inflated negative binomial model
150
-#'
151
-#' This function is used to impute the data, by weighting the observed zeros by their
152
-#' probability of coming from the zero-inflation part of the distribution P(Z=1|Y).
153
-#'
154
-#' @details The imputation is carried out with the following formula:
155
-#' y_{ij}* =  y_{ij} * Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) + mu_{ij} * Pr(Z_{ij} = 1 | Y_{ij} = y_{ij}).
156
-#' Note that for y_{ij} > 0, Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) = 1 and hence the data are not imputed.
157
-#'
158
-#' @export
159
-#'
160
-#' @param expression the data matrix (genes in rows, cells in columns)
161
-#' @param impute_args arguments for imputation (not used)
162
-#' 
163
-#' @return the imputed expression matrix.
164
-impute_zinb <- function(expression, impute_args) {
165
-  pars <- estimate_zinb(expression)
166
-  w <- pars$p_z
167
-  imputed <- expression * w + pars$mu * (1 - w)
168
-  return(imputed)
169
-}
170
-
171
-logit <- binomial()$linkfun
172
-logistic <- binomial()$linkinv
173
-
174
-.isWholeNumber <- function(x, tol = .Machine$double.eps^0.5) {
175
-  abs(x - round(x)) < tol
176
-}
177 0
deleted file mode 100644
... ...
@@ -1,38 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/zinb.R
3
-\name{estimate_zinb}
4
-\alias{estimate_zinb}
5
-\title{Parameter estimation of zero-inflated negative binomial model}
6
-\usage{
7
-estimate_zinb(Y, maxiter = 10, verbose = FALSE)
8
-}
9
-\arguments{
10
-\item{Y}{matrix. An integer data matrix (genes in rows, cells in columns)}
11
-
12
-\item{maxiter}{numeric. The maximum number of iterations.}
13
-
14
-\item{verbose}{logical. Whether or not to print the value of the likelihood at each value.}
15
-}
16
-\value{
17
-a list with the following elements:
18
-\itemize{
19
-\item{mu}{the mean of the negative binomial component}
20
-\item{theta}{the dispersion parameter of the negative binomial component}
21
-\item{pi}{the mixing probability}
22
-\item{coefs}{the coefficients of the logistic regression}
23
-\item{p_z}{the probability P(Z=0|Y), useful as weights in weighted principal components analysis}
24
-\item{expected_value}{the expected value E[Y]}
25
-\item{variance}{the variance Var(Y)}
26
-\item{loglik}{the log-likelihood}
27
-\item{convergence}{0 if the algorithm converged and 1 if maxiter was reached}
28
-}
29
-}
30
-\description{
31
-This function implements an EM algorithm to estimate the parameters of a zero-inflated negative binomial model
32
-}
33
-\details{
34
-This function implements an expectation-maximization algorithm for a zero-inflated
35
-model, with a constant gene-specific mean parameter, gene-specific dispersion and
36
-a mixing probability that depends solely on the mean.
37
-}
38
-
39 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/zinb.R
3
-\name{impute_zinb}
4
-\alias{impute_zinb}
5
-\title{Imputation of zero counts based on zero-inflated negative binomial model}
6
-\usage{
7
-impute_zinb(expression, impute_args)
8
-}
9
-\arguments{
10
-\item{expression}{the data matrix (genes in rows, cells in columns)}
11
-
12
-\item{impute_args}{arguments for imputation (not used)}
13
-}
14
-\value{
15
-the imputed expression matrix.
16
-}
17
-\description{
18
-This function is used to impute the data, by weighting the observed zeros by their
19
-probability of coming from the zero-inflation part of the distribution P(Z=1|Y).
20
-}
21
-\details{
22
-The imputation is carried out with the following formula:
23
-y_{ij}* =  y_{ij} * Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) + mu_{ij} * Pr(Z_{ij} = 1 | Y_{ij} = y_{ij}).
24
-Note that for y_{ij} > 0, Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) = 1 and hence the data are not imputed.
25
-}
26
-
27 0
deleted file mode 100644
... ...
@@ -1,40 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/zinb.R
3
-\name{loglik_small}
4
-\alias{loglik_small}
5
-\title{Log-likelihood function of the zero-inflated negative binomial model}
6
-\usage{
7
-loglik_small(parms, Y, Y1, X, W, kx, kw, offsetx, offsetw, linkobj)
8
-}
9
-\arguments{
10
-\item{parms}{a vector of parameters: should contain the values of beta, followed by those of alpha, followed by the log(1/phi)}
11
-
12
-\item{Y}{the data matrix (genes in rows, cells in columns)}
13
-
14
-\item{Y1}{a logical indicator of Y>0}
15
-
16
-\item{X}{the design matrix for the regression on mu (n x k_X)}
17
-
18
-\item{W}{the design matrix for the regression on pi (J x k_W)}
19
-
20
-\item{kx}{the number of beta parameters}
21
-
22
-\item{kw}{the number of alpha parameters}
23
-
24
-\item{offsetx}{the offset for the regression on X}
25
-
26
-\item{offsetw}{the offset for the regression on W}
27
-
28
-\item{linkobj}{the link function object for the regression on pi (typically the result of binomial())}
29
-}
30
-\description{
31
-This function computes the log-likelihood of a standard regression model
32
-}
33
-\details{
34
-This is a (hopefully) memory-efficient implementation of the log-likelihood of a
35
-zero-inflated negative binomial regression model.
36
-In this attempt, the design matrices don't have n*J rows, but n and J, respectively.
37
-The computation is a bit slower, but the memory usage should be much smaller for
38
-large J and n.
39
-}
40
-