... | ... |
@@ -11,11 +11,9 @@ export(UQ_FN_POS) |
11 | 11 |
export(biplot_colored) |
12 | 12 |
export(biplot_interactive) |
13 | 13 |
export(estimate_ziber) |
14 |
-export(estimate_zinb) |
|
15 | 14 |
export(factor_sample_filter) |
16 | 15 |
export(impute_expectation) |
17 | 16 |
export(impute_null) |
18 |
-export(impute_zinb) |
|
19 | 17 |
export(lm_adjust) |
20 | 18 |
export(make_design) |
21 | 19 |
export(metric_sample_filter) |
... | ... |
@@ -47,7 +45,6 @@ importFrom(DT,formatSignif) |
47 | 45 |
importFrom(DT,renderDataTable) |
48 | 46 |
importFrom(DT,selectRows) |
49 | 47 |
importFrom(EDASeq,betweenLaneNormalization) |
50 |
-importFrom(MASS,glm.nb) |
|
51 | 48 |
importFrom(NMF,aheatmap) |
52 | 49 |
importFrom(RColorBrewer,brewer.pal) |
53 | 50 |
importFrom(RUVSeq,RUVg) |
54 | 51 |
deleted file mode 100644 |
... | ... |
@@ -1,176 +0,0 @@ |
1 |
-#' Parameter estimation of zero-inflated negative binomial model |
|
2 |
-#' |
|
3 |
-#' This function implements an EM algorithm to estimate the parameters of a zero-inflated negative binomial model |
|
4 |
-#' |
|
5 |
-#' This function implements an expectation-maximization algorithm for a zero-inflated |
|
6 |
-#' model, with a constant gene-specific mean parameter, gene-specific dispersion and |
|
7 |
-#' a mixing probability that depends solely on the mean. |
|
8 |
-#' |
|
9 |
-#' @param Y matrix. An integer data matrix (genes in rows, cells in columns) |
|
10 |
-#' @param maxiter numeric. The maximum number of iterations. |
|
11 |
-#' @param verbose logical. Whether or not to print the value of the likelihood at each value. |
|
12 |
-#' |
|
13 |
-#' @importFrom MASS glm.nb |
|
14 |
-#' @export |
|
15 |
-#' |
|
16 |
-#' @return a list with the following elements: |
|
17 |
-#' \itemize{ |
|
18 |
-#' \item{mu}{the mean of the negative binomial component} |
|
19 |
-#' \item{theta}{the dispersion parameter of the negative binomial component} |
|
20 |
-#' \item{pi}{the mixing probability} |
|
21 |
-#' \item{coefs}{the coefficients of the logistic regression} |
|
22 |
-#' \item{p_z}{the probability P(Z=0|Y), useful as weights in weighted principal components analysis} |
|
23 |
-#' \item{expected_value}{the expected value E[Y]} |
|
24 |
-#' \item{variance}{the variance Var(Y)} |
|
25 |
-#' \item{loglik}{the log-likelihood} |
|
26 |
-#' \item{convergence}{0 if the algorithm converged and 1 if maxiter was reached} |
|
27 |
-#' } |
|
28 |
-estimate_zinb <- function(Y, maxiter=10, verbose=FALSE) { |
|
29 |
- |
|
30 |
- if(!all( .isWholeNumber(Y) )){ |
|
31 |
- stop("Expression matrix contains non-integer values.") |
|
32 |
- } |
|
33 |
- |
|
34 |
- n <- ncol(Y) |
|
35 |
- J <- nrow(Y) |
|
36 |
- |
|
37 |
- # 0. Initial estimate of mu, pi and z |
|
38 |
- mu0 <- rowMeans(Y) |
|
39 |
- |
|
40 |
- pi0 <- sapply(seq_len(n), function(i) { |
|
41 |
- y <- as.numeric(Y[,i]<=0) |
|
42 |
- fit <- glm(y~log(mu0), family = binomial) |
|
43 |
- return(fitted.values(fit)) |
|
44 |
- }) |
|
45 |
- |
|
46 |
- zhat <- pi0/(pi0 + (1 - pi0) * dnbinom(0, size = 1, mu = matrix(mu0, nrow=J, ncol=n))) |
|
47 |
- zhat[Y>0] <- 0 |
|
48 |
- thetahat <- rep(1, J) |
|
49 |
- |
|
50 |
- coefs_mu <- log(mu0) |
|
51 |
- coefs_pi <- sapply(seq_len(n), function(i) { |
|
52 |
- fit <- suppressWarnings(glm(zhat[,i]~log(mu0), family = binomial(link = logit))) |
|
53 |
- return(coefficients(fit)) |
|
54 |
- }) |
|
55 |
- |
|
56 |
- X <- matrix(rep(1, n), ncol=1) |
|
57 |
- W <- model.matrix(~log(mu0)) |
|
58 |
- linkobj <- binomial() |
|
59 |
- |
|
60 |
- ll_new <- loglik_small(c(coefs_mu, coefs_pi[1,], coefs_pi[2,], log(thetahat)), Y, Y>0, X, W, J, n*2, 0, 0, linkobj) |
|
61 |
- ll_old <- 2 * ll_new |
|
62 |
- |
|
63 |
- ## EM iteration |
|
64 |
- iter <- 0 |
|
65 |
- while (abs((ll_old - ll_new)/ll_old) > 1e-4 & iter<maxiter) { |
|
66 |
- ll_old <- ll_new |
|
67 |
- fit_mu <- bplapply(seq_len(J), function(i) { |
|
68 |
- fit <- suppressWarnings(glm.nb(Y[i,] ~ 1, weights = (1 - zhat[i,]), init.theta = thetahat[i], start=coefs_mu[i])) |
|
69 |
- return(list(coefs=coefficients(fit), theta=fit$theta)) |
|
70 |
- }) |
|
71 |
- coefs_mu <- sapply(fit_mu, function(x) x$coefs) |
|
72 |
- muhat <- exp(coefs_mu) |
|
73 |
- thetahat <- sapply(fit_mu, function(x) x$theta) |
|
74 |
- |
|
75 |
- fit_pi <- bplapply(seq_len(n), function(i) { |
|
76 |
- fit <- suppressWarnings(glm(zhat[,i]~log(muhat), family = binomial(link = logit), start=coefs_pi[,i])) |
|
77 |
- return(list(coefs=coefficients(fit), fitted=fitted.values(fit))) |
|
78 |
- }) |
|
79 |
- coefs_pi <- sapply(fit_pi, function(x) x$coefs) |
|
80 |
- pihat <- sapply(fit_pi, function(x) x$fitted) |
|
81 |
- |
|
82 |
- zhat <- pihat/(pihat + (1 - pihat) * dnbinom(0, size = matrix(thetahat, nrow=J, ncol=n), mu = matrix(muhat, nrow=J, ncol=n))) |
|
83 |
- zhat[Y>0] <- 0 |
|
84 |
- |
|
85 |
- W <- model.matrix(~log(muhat)) |
|
86 |
- ll_new <- loglik_small(c(coefs_mu, coefs_pi[1,], coefs_pi[2,], log(thetahat)), Y, Y>0, X, W, J, n*2, 0, 0, linkobj) |
|
87 |
- |
|
88 |
- if(verbose) { |
|
89 |
- print(ll_new) |
|
90 |
- } |
|
91 |
- |
|
92 |
- iter <- iter + 1 |
|
93 |
- } |
|
94 |
- |
|
95 |
- convergence <- 0 |
|
96 |
- if(iter == maxiter) { |
|
97 |
- convergence <- 1 |
|
98 |
- } |
|
99 |
- |
|
100 |
- p_z <- 1 - (Y == 0) * pihat / (pihat + (1 - pihat) * (1 + muhat / thetahat)^(-thetahat)) |
|
101 |
- eval <- (1 - pihat) * muhat |
|
102 |
- variance <- (1 - pihat) * muhat * (1 + muhat * (1/thetahat + pihat)) |
|
103 |
- |
|
104 |
- return(list(mu=muhat, theta=thetahat, pi=pihat, coefs=coefs_pi, |
|
105 |
- p_z=p_z, expected_value=eval, variance=variance, |
|
106 |
- loglik=ll_new, convergence=convergence)) |
|
107 |
-} |
|
108 |
- |
|
109 |
-#' Log-likelihood function of the zero-inflated negative binomial model |
|
110 |
-#' |
|
111 |
-#' This function computes the log-likelihood of a standard regression model |
|
112 |
-#' |
|
113 |
-#' This is a (hopefully) memory-efficient implementation of the log-likelihood of a |
|
114 |
-#' zero-inflated negative binomial regression model. |
|
115 |
-#' In this attempt, the design matrices don't have n*J rows, but n and J, respectively. |
|
116 |
-#' The computation is a bit slower, but the memory usage should be much smaller for |
|
117 |
-#' large J and n. |
|
118 |
-#' |
|
119 |
-#' @param parms a vector of parameters: should contain the values of beta, followed by those of alpha, followed by the log(1/phi) |
|
120 |
-#' @param Y the data matrix (genes in rows, cells in columns) |
|
121 |
-#' @param Y1 a logical indicator of Y>0 |
|
122 |
-#' @param X the design matrix for the regression on mu (n x k_X) |
|
123 |
-#' @param W the design matrix for the regression on pi (J x k_W) |
|
124 |
-#' @param kx the number of beta parameters |
|
125 |
-#' @param kw the number of alpha parameters |
|
126 |
-#' @param offsetx the offset for the regression on X |
|
127 |
-#' @param offsetw the offset for the regression on W |
|
128 |
-#' @param linkobj the link function object for the regression on pi (typically the result of binomial()) |
|
129 |
-loglik_small <- function(parms, Y, Y1, X, W, kx, kw, offsetx, offsetw, linkobj) { |
|
130 |
- |
|
131 |
- J <- nrow(Y) |
|
132 |
- n <- ncol(Y) |
|
133 |
- |
|
134 |
- beta <- matrix(parms[1:kx], ncol=J, nrow=ncol(X)) |
|
135 |
- mu <- t(exp(X %*% beta + offsetx)) |
|
136 |
- |
|
137 |
- alpha <- matrix(parms[(kx + 1):(kw+kx)], nrow=ncol(W), ncol=n, byrow=TRUE) |
|
138 |
- eta <- W %*% alpha + offsetw |
|
139 |
- pi <- logistic(eta) |
|
140 |
- |
|
141 |
- theta <- matrix(exp(parms[(kw + kx + 1):(kw + kx + J)]), nrow=J, ncol=n) |
|
142 |
- |
|
143 |
- loglik0 <- log(pi + exp(log(1 - pi) + suppressWarnings(dnbinom(0, size = theta, mu = mu, log = TRUE)))) |
|
144 |
- loglik1 <- log(1 - pi) + suppressWarnings(dnbinom(Y, size = theta, mu = mu, log = TRUE)) |
|
145 |
- |
|
146 |
- return(sum(loglik0[which(Y==0)]) + sum(loglik1[which(Y>0)])) |
|
147 |
-} |
|
148 |
- |
|
149 |
-#' Imputation of zero counts based on zero-inflated negative binomial model |
|
150 |
-#' |
|
151 |
-#' This function is used to impute the data, by weighting the observed zeros by their |
|
152 |
-#' probability of coming from the zero-inflation part of the distribution P(Z=1|Y). |
|
153 |
-#' |
|
154 |
-#' @details The imputation is carried out with the following formula: |
|
155 |
-#' y_{ij}* = y_{ij} * Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) + mu_{ij} * Pr(Z_{ij} = 1 | Y_{ij} = y_{ij}). |
|
156 |
-#' Note that for y_{ij} > 0, Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) = 1 and hence the data are not imputed. |
|
157 |
-#' |
|
158 |
-#' @export |
|
159 |
-#' |
|
160 |
-#' @param expression the data matrix (genes in rows, cells in columns) |
|
161 |
-#' @param impute_args arguments for imputation (not used) |
|
162 |
-#' |
|
163 |
-#' @return the imputed expression matrix. |
|
164 |
-impute_zinb <- function(expression, impute_args) { |
|
165 |
- pars <- estimate_zinb(expression) |
|
166 |
- w <- pars$p_z |
|
167 |
- imputed <- expression * w + pars$mu * (1 - w) |
|
168 |
- return(imputed) |
|
169 |
-} |
|
170 |
- |
|
171 |
-logit <- binomial()$linkfun |
|
172 |
-logistic <- binomial()$linkinv |
|
173 |
- |
|
174 |
-.isWholeNumber <- function(x, tol = .Machine$double.eps^0.5) { |
|
175 |
- abs(x - round(x)) < tol |
|
176 |
-} |
177 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,38 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/zinb.R |
|
3 |
-\name{estimate_zinb} |
|
4 |
-\alias{estimate_zinb} |
|
5 |
-\title{Parameter estimation of zero-inflated negative binomial model} |
|
6 |
-\usage{ |
|
7 |
-estimate_zinb(Y, maxiter = 10, verbose = FALSE) |
|
8 |
-} |
|
9 |
-\arguments{ |
|
10 |
-\item{Y}{matrix. An integer data matrix (genes in rows, cells in columns)} |
|
11 |
- |
|
12 |
-\item{maxiter}{numeric. The maximum number of iterations.} |
|
13 |
- |
|
14 |
-\item{verbose}{logical. Whether or not to print the value of the likelihood at each value.} |
|
15 |
-} |
|
16 |
-\value{ |
|
17 |
-a list with the following elements: |
|
18 |
-\itemize{ |
|
19 |
-\item{mu}{the mean of the negative binomial component} |
|
20 |
-\item{theta}{the dispersion parameter of the negative binomial component} |
|
21 |
-\item{pi}{the mixing probability} |
|
22 |
-\item{coefs}{the coefficients of the logistic regression} |
|
23 |
-\item{p_z}{the probability P(Z=0|Y), useful as weights in weighted principal components analysis} |
|
24 |
-\item{expected_value}{the expected value E[Y]} |
|
25 |
-\item{variance}{the variance Var(Y)} |
|
26 |
-\item{loglik}{the log-likelihood} |
|
27 |
-\item{convergence}{0 if the algorithm converged and 1 if maxiter was reached} |
|
28 |
-} |
|
29 |
-} |
|
30 |
-\description{ |
|
31 |
-This function implements an EM algorithm to estimate the parameters of a zero-inflated negative binomial model |
|
32 |
-} |
|
33 |
-\details{ |
|
34 |
-This function implements an expectation-maximization algorithm for a zero-inflated |
|
35 |
-model, with a constant gene-specific mean parameter, gene-specific dispersion and |
|
36 |
-a mixing probability that depends solely on the mean. |
|
37 |
-} |
|
38 |
- |
39 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,26 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/zinb.R |
|
3 |
-\name{impute_zinb} |
|
4 |
-\alias{impute_zinb} |
|
5 |
-\title{Imputation of zero counts based on zero-inflated negative binomial model} |
|
6 |
-\usage{ |
|
7 |
-impute_zinb(expression, impute_args) |
|
8 |
-} |
|
9 |
-\arguments{ |
|
10 |
-\item{expression}{the data matrix (genes in rows, cells in columns)} |
|
11 |
- |
|
12 |
-\item{impute_args}{arguments for imputation (not used)} |
|
13 |
-} |
|
14 |
-\value{ |
|
15 |
-the imputed expression matrix. |
|
16 |
-} |
|
17 |
-\description{ |
|
18 |
-This function is used to impute the data, by weighting the observed zeros by their |
|
19 |
-probability of coming from the zero-inflation part of the distribution P(Z=1|Y). |
|
20 |
-} |
|
21 |
-\details{ |
|
22 |
-The imputation is carried out with the following formula: |
|
23 |
-y_{ij}* = y_{ij} * Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) + mu_{ij} * Pr(Z_{ij} = 1 | Y_{ij} = y_{ij}). |
|
24 |
-Note that for y_{ij} > 0, Pr(Z_{ij} = 0 | Y_{ij}=y_{ij}) = 1 and hence the data are not imputed. |
|
25 |
-} |
|
26 |
- |
27 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,40 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/zinb.R |
|
3 |
-\name{loglik_small} |
|
4 |
-\alias{loglik_small} |
|
5 |
-\title{Log-likelihood function of the zero-inflated negative binomial model} |
|
6 |
-\usage{ |
|
7 |
-loglik_small(parms, Y, Y1, X, W, kx, kw, offsetx, offsetw, linkobj) |
|
8 |
-} |
|
9 |
-\arguments{ |
|
10 |
-\item{parms}{a vector of parameters: should contain the values of beta, followed by those of alpha, followed by the log(1/phi)} |
|
11 |
- |
|
12 |
-\item{Y}{the data matrix (genes in rows, cells in columns)} |
|
13 |
- |
|
14 |
-\item{Y1}{a logical indicator of Y>0} |
|
15 |
- |
|
16 |
-\item{X}{the design matrix for the regression on mu (n x k_X)} |
|
17 |
- |
|
18 |
-\item{W}{the design matrix for the regression on pi (J x k_W)} |
|
19 |
- |
|
20 |
-\item{kx}{the number of beta parameters} |
|
21 |
- |
|
22 |
-\item{kw}{the number of alpha parameters} |
|
23 |
- |
|
24 |
-\item{offsetx}{the offset for the regression on X} |
|
25 |
- |
|
26 |
-\item{offsetw}{the offset for the regression on W} |
|
27 |
- |
|
28 |
-\item{linkobj}{the link function object for the regression on pi (typically the result of binomial())} |
|
29 |
-} |
|
30 |
-\description{ |
|
31 |
-This function computes the log-likelihood of a standard regression model |
|
32 |
-} |
|
33 |
-\details{ |
|
34 |
-This is a (hopefully) memory-efficient implementation of the log-likelihood of a |
|
35 |
-zero-inflated negative binomial regression model. |
|
36 |
-In this attempt, the design matrices don't have n*J rows, but n and J, respectively. |
|
37 |
-The computation is a bit slower, but the memory usage should be much smaller for |
|
38 |
-large J and n. |
|
39 |
-} |
|
40 |
- |