...
|
...
|
@@ -259,3 +259,165 @@ impute_expectation <- function(expression,impute_args) {
|
259
|
259
|
impute_null <- function(expression,impute_args) {
|
260
|
260
|
return(expression)
|
261
|
261
|
}
|
|
262
|
+
|
|
263
|
+#' Fast parameter estimation of zero-inflated bernoulli model
|
|
264
|
+#'
|
|
265
|
+#' This function implements Newton's method for solving zero of
|
|
266
|
+#' Expectation-Maximization equation at the limit of parameter convergence: a
|
|
267
|
+#' zero-inflated bernoulli model of transcript detection, modeling gene
|
|
268
|
+#' expression state (off of on) as a bernoulli draw on a gene-specific
|
|
269
|
+#' expression rate (Z in {0,1}). Detection conditioned on expression is a
|
|
270
|
+#' logistic function of gene-level features. The bernoulli model is modeled
|
|
271
|
+#' numerically by a logistic model with an intercept.
|
|
272
|
+#'
|
|
273
|
+#' @param x matrix. An expression data matrix (genes in rows, cells in columns)
|
|
274
|
+#' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1).
|
|
275
|
+#' Default 0.
|
|
276
|
+#' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in
|
|
277
|
+#' rows, features in columns)
|
|
278
|
+#' @param bulk_model logical. Use median log-expression of gene in detected
|
|
279
|
+#' fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is
|
|
280
|
+#' specified.
|
|
281
|
+#' @param pos_controls logical. TRUE for all genes that are known to be
|
|
282
|
+#' expressed in all cells.
|
|
283
|
+#' @param rate_tol numeric. Convergence treshold on expression rates (0-1).
|
|
284
|
+#' @param maxiter numeric. The maximum number of steps per gene. Default
|
|
285
|
+#' 100.
|
|
286
|
+#' @param verbose logical. Whether or not to print the value of the likelihood
|
|
287
|
+#' at each iteration.
|
|
288
|
+#'
|
|
289
|
+#' @export
|
|
290
|
+#'
|
|
291
|
+#' @return a list with the following elements: \itemize{ \item{W}{ coefficients
|
|
292
|
+#' of sample-specific logistic drop-out model } \item{Alpha}{ intercept and
|
|
293
|
+#' gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{
|
|
294
|
+#' coefficient of gene-specific logistic expression model }
|
|
295
|
+#' \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)}
|
|
296
|
+#' \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in
|
|
297
|
+#' weighted PCA} \item{expected_state}{ the expected
|
|
298
|
+#' value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood}
|
|
299
|
+#' \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was
|
|
300
|
+#' reached} }
|
|
301
|
+#'
|
|
302
|
+#' @examples
|
|
303
|
+#' mat <- matrix(rpois(1000, lambda = 3), ncol=10)
|
|
304
|
+#' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10)
|
|
305
|
+#' ziber_out = suppressWarnings(fast_estimate_ziber(mat,
|
|
306
|
+#' bulk_model = TRUE,
|
|
307
|
+#' pos_controls = 1:10))
|
|
308
|
+#'
|
|
309
|
+
|
|
310
|
+fast_estimate_ziber = function(x, fp_tresh = 0,
|
|
311
|
+ gfeatM = NULL, bulk_model = FALSE,
|
|
312
|
+ pos_controls = NULL,
|
|
313
|
+ rate_tol = 0.01, maxiter = 100,
|
|
314
|
+ verbose = FALSE){
|
|
315
|
+
|
|
316
|
+ Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)[1]))
|
|
317
|
+
|
|
318
|
+ if(is.null(gfeatM) & bulk_model){
|
|
319
|
+ x2 = x
|
|
320
|
+ x2[x2 <= fp_tresh] = NA
|
|
321
|
+ Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE))))
|
|
322
|
+ }else{
|
|
323
|
+ if(is.null(gfeatM)){
|
|
324
|
+ stop("No gene-level features specified")
|
|
325
|
+ }
|
|
326
|
+ Alpha = t(gfeatM)
|
|
327
|
+ }
|
|
328
|
+
|
|
329
|
+ # Prepare Unwanted Variation and Sample-Intrinsic Intercept
|
|
330
|
+ Alpha = rbind(Alpha,rep(1,dim(x)[1]))
|
|
331
|
+ K = dim(Alpha)[1]
|
|
332
|
+ W = matrix(0,dim(x)[2],K)
|
|
333
|
+
|
|
334
|
+ if(is.null(pos_controls)){
|
|
335
|
+ stop("Must supply positive controls genes to fit FNR characteristic")
|
|
336
|
+ }
|
|
337
|
+
|
|
338
|
+ # Fit W once using control genes
|
|
339
|
+ cY = Y[,pos_controls]
|
|
340
|
+ cAlpha = Alpha[,pos_controls]
|
|
341
|
+ glm_pval = rep(NA,nrow(Y))
|
|
342
|
+
|
|
343
|
+ for (i in 1:nrow(Y)){
|
|
344
|
+
|
|
345
|
+ fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 +
|
|
346
|
+ t(cAlpha),family=binomial(logit)))
|
|
347
|
+ glm_pval[i] = summary(fit)$coef[,4][1]
|
|
348
|
+
|
|
349
|
+ if((glm_pval[i] < .01) & fit$converged){
|
|
350
|
+ W[i,] = fit$coefficients
|
|
351
|
+ } else {
|
|
352
|
+ if(!fit$converged){
|
|
353
|
+ warning(paste0("Sample ",i," failed GLM fit, applying",
|
|
354
|
+ " expression independent model."))
|
|
355
|
+ }else{
|
|
356
|
+ warning(paste0("Sample ",i," exhibits expression dependence",
|
|
357
|
+ " consistent with null, applying expression ",
|
|
358
|
+ "independent model."))
|
|
359
|
+ }
|
|
360
|
+
|
|
361
|
+ # Only intercept with pseudocounts
|
|
362
|
+ W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) -
|
|
363
|
+ log((sum(1-cY[i,]) +
|
|
364
|
+ 0.5)/(ncol(cY) + 1)))
|
|
365
|
+ }
|
|
366
|
+ }
|
|
367
|
+
|
|
368
|
+ fnr_character = .likfn(1,W,Alpha)
|
|
369
|
+
|
|
370
|
+ f_fun = function(x,y,theta){
|
|
371
|
+ x - mean(1/(1+(1-y)*((1/x - 1)/theta)))
|
|
372
|
+ }
|
|
373
|
+ df_fun = function(x,y,theta){
|
|
374
|
+ 1 - mean((theta - y*theta)/(1 + y*(-1 + x) + (-1 + theta)*x)^2)
|
|
375
|
+ }
|
|
376
|
+
|
|
377
|
+ rates = rep(0.5,ncol(fnr_character))
|
|
378
|
+ rates[pos_controls] = 1
|
|
379
|
+ convergences = rep(0,ncol(fnr_character))
|
|
380
|
+ convergences[pos_controls] = NA
|
|
381
|
+ for(i in which(!pos_controls)){
|
|
382
|
+ rate_est = rates[i]
|
|
383
|
+ y = Y[,i]
|
|
384
|
+ theta = 1 - fnr_character[,i]
|
|
385
|
+ dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
|
|
386
|
+ niter = 0
|
|
387
|
+ while(abs(dr) > rate_tol){
|
|
388
|
+ niter = niter + 1
|
|
389
|
+ rate_est = rate_est + dr
|
|
390
|
+ dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
|
|
391
|
+ if(niter >= maxiter){
|
|
392
|
+ convergences[i] = 1
|
|
393
|
+ break
|
|
394
|
+ }
|
|
395
|
+ }
|
|
396
|
+ rates[i] = rate_est
|
|
397
|
+ }
|
|
398
|
+
|
|
399
|
+ # Prepare Gene-Intrinsic Intercept
|
|
400
|
+ X = matrix(1,dim(x)[2],1)
|
|
401
|
+ Beta = matrix(boot::logit(rates),1,dim(x)[1])
|
|
402
|
+
|
|
403
|
+ # Expression States
|
|
404
|
+ Z = Y*0
|
|
405
|
+ Z[,pos_controls] = 1
|
|
406
|
+ Z[,!pos_controls] = .pzfn(Y[,!pos_controls],
|
|
407
|
+ W,Alpha[,!pos_controls],
|
|
408
|
+ X,Beta[,!pos_controls])
|
|
409
|
+
|
|
410
|
+ # Likelihood
|
|
411
|
+ matA = .likfn(1,X,Beta[,!pos_controls]) # Expression
|
|
412
|
+ matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection
|
|
413
|
+ is_perfect = (matC == 0) | (matA == 0) | (matA == 1)
|
|
414
|
+ EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) +
|
|
415
|
+ sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) +
|
|
416
|
+ sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE)
|
|
417
|
+
|
|
418
|
+ return(list(W = W, X = X, Alpha = Alpha, Beta = Beta,
|
|
419
|
+ fnr_character = t(fnr_character),
|
|
420
|
+ p_nodrop = 1 - t((Y <= fp_tresh)*Z),
|
|
421
|
+ expected_state = t(Z), loglik = EL2,
|
|
422
|
+ convergence = convergences, glm_pval = glm_pval))
|
|
423
|
+}
|
262
|
424
|
\ No newline at end of file
|