The ... param was allowing users to misspell key arguments to the
model parameters, essentially causing silent failures (see
to the celda() wrapper function, removed ... from all of the relevant
signatures, and updated the docs to reflect the new landscape.
Also updated Josh's copyright while I was at it.
Fixes #212 Fixes #211
Former-commit-id: 4bba23567a4672520f3dcea16781929b0bcd1ddd
... | ... |
@@ -10,27 +10,41 @@ available_models = c("celda_C", "celda_G", "celda_CG") |
10 | 10 |
#' @param counts A count matrix |
11 | 11 |
#' @param model Which celda sub-model to run. Options include "celda_C" (cell clustering), "celda_G" (gene clustering), "celda_CG" (gene and cell clustering) |
12 | 12 |
#' @param sample.label A numeric vector, character vector, or factor indicating the originating sample for each cell (column) in the count matrix. By default, every cell will be assumed to be from an independent sample. |
13 |
-#' @param nchains The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1 |
|
13 |
+#' @param K Number of desired cell subpopulation clusters. Required for celda_C and celda_CG models. |
|
14 |
+#' @param L Number of desired gene clusters. Required for celda_G and celda_CG models. |
|
15 |
+#' @param alpha Non-zero concentration parameter for sample Dirichlet distribution (celda_C / celda_CG only) |
|
16 |
+#' @param beta Non-zero concentration parameter for gene Dirichlet distribution |
|
17 |
+#' @param gamma The Dirichlet distribution parameter for Psi; adds a pseudocount to each gene within each transcriptional state (celda_G / celda_CG only) |
|
18 |
+#' @param delta The Dirichlet distribution parameter for Eta; adds a gene pseudocount to the numbers of genes each state (celda_G / celda_CG only) |
|
19 |
+#' @param nchains The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1 (celda_G / celda_CG only) |
|
14 | 20 |
#' @param bestChainsOnly Return only the best chain (by final log-likelihood) per K/L combination. |
15 | 21 |
#' @param cores The number of cores to use for parallell Gibbs sampling. Defaults to 1. |
16 | 22 |
#' @param seed The base seed for random number generation |
17 | 23 |
#' @param verbose Print log messages during celda chain execution |
18 | 24 |
#' @param logfile_prefix Prefix for log files from worker threads and main process. |
19 |
-#' @param ... Model specific parameters |
|
20 | 25 |
#' @return Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters |
21 | 26 |
#' @import foreach |
22 | 27 |
#' @export |
23 |
-celda = function(counts, model, sample.label=NULL, nchains=1, bestChainsOnly=TRUE, cores=1, seed=12345, verbose=FALSE, logfile_prefix="Celda", ...) { |
|
28 |
+celda = function(counts, model, sample.label=NULL, K=NULL, L=NULL, alpha=1, beta=1, |
|
29 |
+ delta=1, gamma=1, max.iter=200, z.init=NULL, y.init=NULL, |
|
30 |
+ stop.iter=10, split.on.iter=10, process.counts=FALSE, |
|
31 |
+ nchains=1, bestChainsOnly=TRUE, cores=1, |
|
32 |
+ seed=12345, verbose=FALSE, logfile_prefix="Celda") { |
|
33 |
+ |
|
24 | 34 |
message(paste(Sys.time(), "Starting celda.")) |
25 |
- validateArgs(counts, model, sample.label, nchains, cores, seed, ...) |
|
35 |
+ params.list = buildParamList(counts, model, sample.label, K, L, alpha, beta, delta, |
|
36 |
+ gamma, max.iter, z.init, y.init, stop.iter, split.on.iter, |
|
37 |
+ process.counts, nchains, cores, seed) |
|
38 |
+ |
|
26 | 39 |
|
27 | 40 |
# Redirect stderr from the worker threads if user asks for verbose |
28 | 41 |
logfile = paste0(logfile_prefix, "_main_log.txt") |
42 |
+ params.list$logfile = logfile |
|
29 | 43 |
cl = if (verbose) parallel::makeCluster(cores, outfile=logfile) else parallel::makeCluster(cores) |
30 | 44 |
doParallel::registerDoParallel(cl) |
31 | 45 |
|
32 | 46 |
# Details for each model parameter / chain combination |
33 |
- runs = expand.grid(chain=1:nchains, ...) |
|
47 |
+ runs = expand.grid(plyr::compact(list(chain=1:nchains, K=K, L=L))) |
|
34 | 48 |
runs$index = as.numeric(rownames(runs)) |
35 | 49 |
if (verbose) print(runs) |
36 | 50 |
|
... | ... |
@@ -42,20 +56,22 @@ celda = function(counts, model, sample.label=NULL, nchains=1, bestChainsOnly=TRU |
42 | 56 |
# count matrix was used. |
43 | 57 |
counts = processCounts(counts) |
44 | 58 |
count.checksum = digest::digest(counts, algo="md5") |
59 |
+ params.list$count.checksum = count.checksum |
|
60 |
+ |
|
61 |
+ res.list = foreach(i = 1:nrow(runs), .export=model, .combine = c, .multicombine=TRUE) %dopar% { |
|
62 |
+ chain.params = params.list |
|
63 |
+ chain.params$seed = all.seeds[ifelse(i %% nchains == 0, nchains, i %% nchains)] |
|
45 | 64 |
|
46 |
- |
|
47 |
- res.list = foreach(i = 1:nrow(runs), .export=model, .combine = c, .multicombine=TRUE) %dopar% { |
|
48 |
- chain.seed = all.seeds[ifelse(i %% nchains == 0, nchains, i %% nchains)] |
|
49 |
- |
|
50 |
- if (verbose) { |
|
65 |
+ if (isTRUE(verbose)) { |
|
51 | 66 |
## Generate a unique log file name based on given prefix and parameters |
52 |
- logfile = paste0(logfile_prefix, "_", paste(paste(colnames(runs), runs[i,], sep="-"), collapse="_"), "_Seed-", chain.seed, "_log.txt") |
|
53 |
- res = do.call(model, c(list(counts=counts, sample.label=sample.label, count.checksum=count.checksum, seed=chain.seed, logfile=logfile, process.counts=F), c(runs[i,-1]))) |
|
67 |
+ chain.params$logfile = paste0(logfile_prefix, "_", paste(paste(colnames(runs), runs[i,], sep="-"), collapse="_"), "_Seed-", chain.params$seed, "_log.txt") |
|
68 |
+ res = do.call(model, chain.params) |
|
54 | 69 |
} else { |
55 |
- res = suppressMessages(do.call(model, c(list(counts=counts, sample.label=sample.label, count.checksum=count.checksum, seed=chain.seed, logfile=NULL, process.counts=F), c(runs[i,-1])))) |
|
70 |
+ chain.params$logfile = NULL |
|
71 |
+ res = suppressMessages(do.call(model, chain.params)) |
|
56 | 72 |
} |
57 | 73 |
return(list(res)) |
58 |
- } |
|
74 |
+ } |
|
59 | 75 |
parallel::stopCluster(cl) |
60 | 76 |
celda.res = list(run.params=runs, res.list=res.list, |
61 | 77 |
content.type=model, count.checksum=count.checksum) |
... | ... |
@@ -80,10 +96,43 @@ celda = function(counts, model, sample.label=NULL, nchains=1, bestChainsOnly=TRU |
80 | 96 |
} |
81 | 97 |
|
82 | 98 |
|
99 |
+# Build a list of parameters tailored to the specific celda model being run, |
|
100 |
+# validating the provided parameters along the way |
|
101 |
+buildParamList = function(counts, model, sample.label, K, L, alpha, beta, delta, |
|
102 |
+ gamma, max.iter, z.init, y.init, stop.iter, split.on.iter, |
|
103 |
+ process.counts, nchains, cores, seed) { |
|
104 |
+ |
|
105 |
+ validateArgs(counts, model, sample.label, nchains, cores, seed, K=K, L=L) |
|
106 |
+ |
|
107 |
+ params.list = list(counts=counts, |
|
108 |
+ sample.label=sample.label, |
|
109 |
+ max.iter=max.iter, |
|
110 |
+ stop.iter=stop.iter, |
|
111 |
+ split.on.iter=split.on.iter, |
|
112 |
+ process.counts=process.counts) |
|
113 |
+ |
|
114 |
+ if (model %in% c("celda_C", "celda_CG")) { |
|
115 |
+ params.list$alpha = alpha |
|
116 |
+ params.list$beta = beta |
|
117 |
+ params.list$K = K |
|
118 |
+ z.init=z.init |
|
119 |
+ } |
|
120 |
+ if (model %in% c("celda_G", "celda_CG")) { |
|
121 |
+ params.list$beta = beta |
|
122 |
+ params.list$delta = delta |
|
123 |
+ params.list$gamma = gamma |
|
124 |
+ params.list$L = L |
|
125 |
+ y.init=y.init |
|
126 |
+ } |
|
127 |
+ |
|
128 |
+ return(params.list) |
|
129 |
+} |
|
130 |
+ |
|
131 |
+ |
|
83 | 132 |
# Sanity check arguments to celda() to ensure a smooth run. |
84 | 133 |
# See parameter descriptions from celda() documentation. |
85 | 134 |
validateArgs = function(counts, model, sample.label, |
86 |
- nchains, cores, seed, K=NULL, L=NULL, ...) { |
|
135 |
+ nchains, cores, seed, K=NULL, L=NULL) { #, ...) { |
|
87 | 136 |
model_args = names(formals(model)) |
88 | 137 |
if ("K" %in% model_args && is.null(K)) { |
89 | 138 |
stop("Must provide a K parameter when running a celda_C or celda_CG model") |
... | ... |
@@ -84,15 +84,13 @@ simulateCells.celda_C = function(model, S=10, C.Range=c(10, 100), N.Range=c(100, |
84 | 84 |
#' @param z.init Initial values of z. If NULL, z will be randomly sampled. Default NULL. |
85 | 85 |
#' @param process.counts Whether to cast the counts matrix to integer and round(). Defaults to TRUE. |
86 | 86 |
#' @param logfile If NULL, messages will be displayed as normal. If set to a file name, messages will be redirected messages to the file. Default NULL. |
87 |
-#' @param ... additonal parameters |
|
88 | 87 |
#' @return An object of class celda_C with clustering results and Gibbs sampling statistics |
89 | 88 |
#' @export |
90 |
-celda_C = function(counts, sample.label=NULL, K, |
|
91 |
- alpha=1, beta=1, |
|
92 |
- stop.iter = 10, split.on.iter=10, max.iter=200, |
|
93 |
- count.checksum=NULL, seed=12345, |
|
94 |
- z.init = NULL, process.counts=TRUE, logfile=NULL, ...) { |
|
95 |
- |
|
89 |
+celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1, |
|
90 |
+ stop.iter = 10, split.on.iter=10, max.iter=200, |
|
91 |
+ count.checksum=NULL, seed=12345, |
|
92 |
+ z.init = c(), process.counts=TRUE, logfile=NULL) { |
|
93 |
+ |
|
96 | 94 |
## Error checking and variable processing |
97 | 95 |
if (isTRUE(process.counts)) { |
98 | 96 |
counts = processCounts(counts) |
... | ... |
@@ -130,6 +128,7 @@ celda_C = function(counts, sample.label=NULL, K, |
130 | 128 |
iter = 1L |
131 | 129 |
num.iter.without.improvement = 0L |
132 | 130 |
do.cell.split = TRUE |
131 |
+ logMessages(date(), "Max iter:", max.iter, logfile=logfile, append=TRUE) |
|
133 | 132 |
while(iter <= max.iter & num.iter.without.improvement <= stop.iter) { |
134 | 133 |
|
135 | 134 |
next.z = cC.calcGibbsProbZ(counts=counts, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.by.C=n.by.C, n.CP=n.CP, z=z, s=s, K=K, nG=nG, nM=nM, alpha=alpha, beta=beta) |
... | ... |
@@ -323,13 +323,12 @@ simulateCells.celda_CG = function(model, S=10, C.Range=c(50,100), N.Range=c(500, |
323 | 323 |
#' @param y.init Initial values of y. If NULL, y will be randomly sampled. Default NULL. |
324 | 324 |
#' @param logfile The name of the logfile to redirect messages to. |
325 | 325 |
#' @param count.checksum An MD5 checksum for the provided counts matrix |
326 |
-#' @param ... Additional parameters |
|
327 | 326 |
#' @export |
328 | 327 |
celda_CG = function(counts, sample.label=NULL, K, L, |
329 |
- alpha=1, beta=1, delta=1, gamma=1, |
|
328 |
+ alpha=1, beta=1, delta=1, gamma=1, |
|
330 | 329 |
max.iter=200, stop.iter = 10, split.on.iter=10, |
331 | 330 |
seed=12345, count.checksum=NULL, |
332 |
- z.init = NULL, y.init = NULL, logfile=NULL, ...) { |
|
331 |
+ z.init = NULL, y.init = NULL, logfile=NULL) { |
|
333 | 332 |
|
334 | 333 |
## Error checking and variable processing |
335 | 334 |
if (isTRUE(processCounts)) { |
... | ... |
@@ -198,14 +198,12 @@ cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y, |
198 | 198 |
#' @param y.init Initial values of y. If NULL, y will be randomly sampled. Default NULL. |
199 | 199 |
#' @param process.counts Whether to cast the counts matrix to integer and round(). Defaults to TRUE. |
200 | 200 |
#' @param logfile The name of the logfile to redirect messages to. |
201 |
-#' @param ... Additional parameters |
|
202 | 201 |
#' @keywords LDA gene clustering gibbs |
203 | 202 |
#' @export |
204 |
-celda_G = function(counts, L, |
|
205 |
- beta=1, delta=1, gamma=1, |
|
203 |
+celda_G = function(counts, L, beta=1, delta=1, gamma=1, |
|
206 | 204 |
stop.iter=10, split.on.iter=10, max.iter=200, |
207 |
- count.checksum=NULL, seed=12345, |
|
208 |
- y.init=NULL, process.counts=TRUE, logfile=NULL, ...) { |
|
205 |
+ count.checksum=NULL, seed=12345, |
|
206 |
+ y.init=NULL, process.counts=TRUE, logfile=NULL) { |
|
209 | 207 |
|
210 | 208 |
## Error checking and variable processing |
211 | 209 |
if (isTRUE(process.counts)) { |
... | ... |
@@ -4,9 +4,11 @@ |
4 | 4 |
\alias{celda} |
5 | 5 |
\title{Run the celda Bayesian hierarchical model on a matrix of counts.} |
6 | 6 |
\usage{ |
7 |
-celda(counts, model, sample.label = NULL, nchains = 1, |
|
8 |
- bestChainsOnly = TRUE, cores = 1, seed = 12345, verbose = FALSE, |
|
9 |
- logfile_prefix = "Celda", ...) |
|
7 |
+celda(counts, model, sample.label = NULL, K = NULL, L = NULL, alpha = 1, |
|
8 |
+ beta = 1, delta = 1, gamma = 1, max.iter = 200, z.init = NULL, |
|
9 |
+ y.init = NULL, stop.iter = 10, split.on.iter = 10, |
|
10 |
+ process.counts = FALSE, nchains = 1, bestChainsOnly = TRUE, cores = 1, |
|
11 |
+ seed = 12345, verbose = FALSE, logfile_prefix = "Celda") |
|
10 | 12 |
} |
11 | 13 |
\arguments{ |
12 | 14 |
\item{counts}{A count matrix} |
... | ... |
@@ -15,7 +17,19 @@ celda(counts, model, sample.label = NULL, nchains = 1, |
15 | 17 |
|
16 | 18 |
\item{sample.label}{A numeric vector, character vector, or factor indicating the originating sample for each cell (column) in the count matrix. By default, every cell will be assumed to be from an independent sample.} |
17 | 19 |
|
18 |
-\item{nchains}{The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1} |
|
20 |
+\item{K}{Number of desired cell subpopulation clusters. Required for celda_C and celda_CG models.} |
|
21 |
+ |
|
22 |
+\item{L}{Number of desired gene clusters. Required for celda_G and celda_CG models.} |
|
23 |
+ |
|
24 |
+\item{alpha}{Non-zero concentration parameter for sample Dirichlet distribution (celda_C / celda_CG only)} |
|
25 |
+ |
|
26 |
+\item{beta}{Non-zero concentration parameter for gene Dirichlet distribution} |
|
27 |
+ |
|
28 |
+\item{delta}{The Dirichlet distribution parameter for Eta; adds a gene pseudocount to the numbers of genes each state (celda_G / celda_CG only)} |
|
29 |
+ |
|
30 |
+\item{gamma}{The Dirichlet distribution parameter for Psi; adds a pseudocount to each gene within each transcriptional state (celda_G / celda_CG only)} |
|
31 |
+ |
|
32 |
+\item{nchains}{The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1 (celda_G / celda_CG only)} |
|
19 | 33 |
|
20 | 34 |
\item{bestChainsOnly}{Return only the best chain (by final log-likelihood) per K/L combination.} |
21 | 35 |
|
... | ... |
@@ -26,8 +40,6 @@ celda(counts, model, sample.label = NULL, nchains = 1, |
26 | 40 |
\item{verbose}{Print log messages during celda chain execution} |
27 | 41 |
|
28 | 42 |
\item{logfile_prefix}{Prefix for log files from worker threads and main process.} |
29 |
- |
|
30 |
-\item{...}{Model specific parameters} |
|
31 | 43 |
} |
32 | 44 |
\value{ |
33 | 45 |
Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters |
... | ... |
@@ -6,8 +6,8 @@ |
6 | 6 |
\usage{ |
7 | 7 |
celda_C(counts, sample.label = NULL, K, alpha = 1, beta = 1, |
8 | 8 |
stop.iter = 10, split.on.iter = 10, max.iter = 200, |
9 |
- count.checksum = NULL, seed = 12345, z.init = NULL, |
|
10 |
- process.counts = TRUE, logfile = NULL, ...) |
|
9 |
+ count.checksum = NULL, seed = 12345, z.init = c(), |
|
10 |
+ process.counts = TRUE, logfile = NULL) |
|
11 | 11 |
} |
12 | 12 |
\arguments{ |
13 | 13 |
\item{counts}{A numeric count matrix} |
... | ... |
@@ -35,8 +35,6 @@ celda_C(counts, sample.label = NULL, K, alpha = 1, beta = 1, |
35 | 35 |
\item{process.counts}{Whether to cast the counts matrix to integer and round(). Defaults to TRUE.} |
36 | 36 |
|
37 | 37 |
\item{logfile}{If NULL, messages will be displayed as normal. If set to a file name, messages will be redirected messages to the file. Default NULL.} |
38 |
- |
|
39 |
-\item{...}{additonal parameters} |
|
40 | 38 |
} |
41 | 39 |
\value{ |
42 | 40 |
An object of class celda_C with clustering results and Gibbs sampling statistics |
... | ... |
@@ -7,7 +7,7 @@ |
7 | 7 |
celda_CG(counts, sample.label = NULL, K, L, alpha = 1, beta = 1, |
8 | 8 |
delta = 1, gamma = 1, max.iter = 200, stop.iter = 10, |
9 | 9 |
split.on.iter = 10, seed = 12345, count.checksum = NULL, |
10 |
- z.init = NULL, y.init = NULL, logfile = NULL, ...) |
|
10 |
+ z.init = NULL, y.init = NULL, logfile = NULL) |
|
11 | 11 |
} |
12 | 12 |
\arguments{ |
13 | 13 |
\item{counts}{A numeric count matrix.} |
... | ... |
@@ -41,8 +41,6 @@ celda_CG(counts, sample.label = NULL, K, L, alpha = 1, beta = 1, |
41 | 41 |
\item{y.init}{Initial values of y. If NULL, y will be randomly sampled. Default NULL.} |
42 | 42 |
|
43 | 43 |
\item{logfile}{The name of the logfile to redirect messages to.} |
44 |
- |
|
45 |
-\item{...}{Additional parameters} |
|
46 | 44 |
} |
47 | 45 |
\description{ |
48 | 46 |
celda Cell and Gene Clustering Model |
... | ... |
@@ -6,8 +6,7 @@ |
6 | 6 |
\usage{ |
7 | 7 |
celda_G(counts, L, beta = 1, delta = 1, gamma = 1, stop.iter = 10, |
8 | 8 |
split.on.iter = 10, max.iter = 200, count.checksum = NULL, |
9 |
- seed = 12345, y.init = NULL, process.counts = TRUE, logfile = NULL, |
|
10 |
- ...) |
|
9 |
+ seed = 12345, y.init = NULL, process.counts = TRUE, logfile = NULL) |
|
11 | 10 |
} |
12 | 11 |
\arguments{ |
13 | 12 |
\item{counts}{A numeric count matrix} |
... | ... |
@@ -35,8 +34,6 @@ celda_G(counts, L, beta = 1, delta = 1, gamma = 1, stop.iter = 10, |
35 | 34 |
\item{process.counts}{Whether to cast the counts matrix to integer and round(). Defaults to TRUE.} |
36 | 35 |
|
37 | 36 |
\item{logfile}{The name of the logfile to redirect messages to.} |
38 |
- |
|
39 |
-\item{...}{Additional parameters} |
|
40 | 37 |
} |
41 | 38 |
\description{ |
42 | 39 |
Provides cluster assignments for all genes in a provided single-cell |