... | ... |
@@ -89,13 +89,12 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU |
89 | 89 |
## Prepare IDs and filenames |
90 | 90 |
IDs <- paste0(exp.table$mark, '-', exp.table$condition, '-rep', exp.table$replicate) |
91 | 91 |
datafiles <- file.path(inputfolder, basename(as.character(exp.table$file))) |
92 |
- names(datafiles) <- basename(datafiles) |
|
93 |
- names(IDs) <- basename(datafiles) |
|
94 |
- rownames(exp.table) <- basename(datafiles) |
|
95 |
- filenames <- paste0(IDs, '_binsize', binsize.string, '_stepsize', stepsize.string, '.RData') |
|
96 |
- names(filenames) <- basename(datafiles) |
|
97 |
- filenames.stepsize <- paste0(IDs, '_binsize', stepsize.string, '_stepsize', stepsize.string, '.RData') # filenames for saving counts at stepsize |
|
98 |
- names(filenames.stepsize) <- basename(datafiles) |
|
92 |
+ names(datafiles) <- IDs |
|
93 |
+ rownames(exp.table) <- IDs |
|
94 |
+ filenames <- paste0(IDs, '_', as.character(exp.table$file), '_binsize', binsize.string, '_stepsize', stepsize.string, '.RData') |
|
95 |
+ names(filenames) <- IDs |
|
96 |
+ filenames.stepsize <- paste0(IDs, '_', as.character(exp.table$file), '_binsize', stepsize.string, '_stepsize', stepsize.string, '.RData') # filenames for saving counts at stepsize |
|
97 |
+ names(filenames.stepsize) <- IDs |
|
99 | 98 |
## Inputfiles |
100 | 99 |
inputfiles <- unique(unlist(strsplit(as.character(exp.table$controlFiles), '\\|'))) |
101 | 100 |
inputfiles <- inputfiles[!is.na(inputfiles)] |
... | ... |
@@ -245,19 +244,21 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU |
245 | 244 |
|
246 | 245 |
### Count reads in bins ### |
247 | 246 |
if (!file.exists(binpath)) { dir.create(binpath) } |
248 |
- parallel.helper <- function(file, input) { |
|
247 |
+ parallel.helper <- function(ID, input, inputfile=NULL) { |
|
249 | 248 |
if (!input) { |
250 |
- savename <- file.path(binpath, filenames[basename(file)]) |
|
251 |
- savename.stepsize <- file.path(binpath, filenames.stepsize[basename(file)]) |
|
252 |
- pairedEndReads <- exp.table[basename(file),'pairedEndReads'] |
|
249 |
+ savename <- file.path(binpath, filenames[ID]) |
|
250 |
+ savename.stepsize <- file.path(binpath, filenames.stepsize[ID]) |
|
251 |
+ pairedEndReads <- exp.table[ID,'pairedEndReads'] |
|
252 |
+ file <- datafiles[ID] |
|
253 | 253 |
} else { |
254 |
- savename <- file.path(binpath, inputfilenames[basename(file)]) |
|
255 |
- savename.stepsize <- file.path(binpath, inputfilenames.stepsize[basename(file)]) |
|
256 |
- pairedEndReads <- exp.table[grep(basename(file), exp.table$controlFiles),'pairedEndReads'] |
|
254 |
+ savename <- file.path(binpath, inputfilenames[basename(inputfile)]) |
|
255 |
+ savename.stepsize <- file.path(binpath, inputfilenames.stepsize[basename(inputfile)]) |
|
256 |
+ pairedEndReads <- exp.table[grep(basename(inputfile), exp.table$controlFiles),'pairedEndReads'] |
|
257 | 257 |
if (any(pairedEndReads != pairedEndReads[1])) { |
258 | 258 |
stop("Multiple definitions of 'pairedEndReads' for file ", file, ".") |
259 | 259 |
} |
260 | 260 |
pairedEndReads <- pairedEndReads[1] |
261 |
+ file <- inputfile |
|
261 | 262 |
} |
262 | 263 |
if (!file.exists(savename)) { |
263 | 264 |
tC <- tryCatch({ |
... | ... |
@@ -265,7 +266,7 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU |
265 | 266 |
if (!input) { |
266 | 267 |
exp.table.input <- exp.table |
267 | 268 |
} |
268 |
- binlist <- binReads(file=file, experiment.table=exp.table.input, assembly=chrom.lengths.df, pairedEndReads=pairedEndReads, binsizes=NULL, reads.per.bin=NULL, bins=list(pre.bins, pre.bins.stepsize), stepsizes=c(stepsize, stepsize), chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']]) |
|
269 |
+ binlist <- binReads(file=file, experiment.table=exp.table.input, ID=ID, assembly=chrom.lengths.df, pairedEndReads=pairedEndReads, binsizes=NULL, reads.per.bin=NULL, bins=list(pre.bins, pre.bins.stepsize), stepsizes=c(stepsize, stepsize), chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']]) |
|
269 | 270 |
ptm <- startTimedMessage("Saving to file ", savename, " ...") |
270 | 271 |
bins <- binlist[[1]] |
271 | 272 |
save(bins, file=savename) |
... | ... |
@@ -280,33 +281,33 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU |
280 | 281 |
} |
281 | 282 |
} |
282 | 283 |
## Unparallelized |
283 |
-# for (file in datafiles) { |
|
284 |
-# parallel.helper(file, input=FALSE) |
|
284 |
+# for (ID in IDs) { |
|
285 |
+# parallel.helper(ID, input=FALSE) |
|
285 | 286 |
# } |
286 | 287 |
# for (file in inputfiles) { |
287 |
-# parallel.helper(file, input=TRUE) |
|
288 |
+# parallel.helper(ID=NULL, input=TRUE, inputfile=file) |
|
288 | 289 |
# } |
289 | 290 |
## Parallelized |
290 | 291 |
if (numcpu > 1) { |
291 | 292 |
ptm <- startTimedMessage("Binning data ...") |
292 |
- temp <- foreach (file = datafiles, .packages=c("chromstaR")) %dopar% { |
|
293 |
- parallel.helper(file, input=FALSE) |
|
293 |
+ temp <- foreach (ID = IDs, .packages=c("chromstaR")) %dopar% { |
|
294 |
+ parallel.helper(ID, input=FALSE) |
|
294 | 295 |
} |
295 | 296 |
stopTimedMessage(ptm) |
296 | 297 |
} else { |
297 |
- for (file in datafiles) { |
|
298 |
- parallel.helper(file, input=FALSE) |
|
298 |
+ for (ID in IDs) { |
|
299 |
+ parallel.helper(ID, input=FALSE) |
|
299 | 300 |
} |
300 | 301 |
} |
301 | 302 |
if (numcpu > 1) { |
302 | 303 |
ptm <- startTimedMessage("Binning input ...") |
303 | 304 |
temp <- foreach (file = inputfiles, .packages=c("chromstaR")) %dopar% { |
304 |
- parallel.helper(file, input=TRUE) |
|
305 |
+ parallel.helper(ID=NULL, input=TRUE, inputfile=file) |
|
305 | 306 |
} |
306 | 307 |
stopTimedMessage(ptm) |
307 | 308 |
} else { |
308 | 309 |
for (file in inputfiles) { |
309 |
- parallel.helper(file, input=TRUE) |
|
310 |
+ parallel.helper(ID=NULL, input=TRUE, inputfile=file) |
|
310 | 311 |
} |
311 | 312 |
} |
312 | 313 |
|
... | ... |
@@ -328,8 +329,6 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU |
328 | 329 |
savename <- file.path(unipath, basename(file)) |
329 | 330 |
if (!file.exists(savename)) { |
330 | 331 |
tC <- tryCatch({ |
331 |
- mark <- strsplit(basename(file), '-')[[1]][1] |
|
332 |
- condition <- strsplit(basename(file), '-')[[1]][2] |
|
333 | 332 |
input.files <- NULL |
334 | 333 |
if (length(inputfiles)>0) { |
335 | 334 |
input.files <- inputfiles |
... | ... |
@@ -356,16 +355,16 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU |
356 | 355 |
} |
357 | 356 |
if (numcpu > 1) { |
358 | 357 |
ptm <- startTimedMessage("Univariate peak calling ...") |
359 |
- temp <- foreach (filename = names(files), .packages=c("chromstaR")) %dopar% { |
|
360 |
- file <- files[filename] |
|
361 |
- inputfiles <- inputfiles.list[[filename]] |
|
358 |
+ temp <- foreach (ID = names(files), .packages=c("chromstaR")) %dopar% { |
|
359 |
+ file <- files[ID] |
|
360 |
+ inputfiles <- inputfiles.list[[ID]] |
|
362 | 361 |
parallel.helper(file, inputfiles) |
363 | 362 |
} |
364 | 363 |
stopTimedMessage(ptm) |
365 | 364 |
} else { |
366 |
- for (filename in names(files)) { |
|
367 |
- file <- files[filename] |
|
368 |
- inputfiles <- inputfiles.list[[filename]] |
|
365 |
+ for (ID in names(files)) { |
|
366 |
+ file <- files[ID] |
|
367 |
+ inputfiles <- inputfiles.list[[ID]] |
|
369 | 368 |
parallel.helper(file, inputfiles) |
370 | 369 |
} |
371 | 370 |
} |
... | ... |
@@ -9,6 +9,7 @@ |
9 | 9 |
#' @aliases binning |
10 | 10 |
#' @param file A file with aligned reads. Alternatively a \code{\link{GRanges}} with aligned reads. |
11 | 11 |
#' @param experiment.table An \code{\link{experiment.table}} containing the supplied \code{file}. This is necessary to uniquely identify the file in later steps of the workflow. Set to \code{NULL} if you don't have it (not recommended). |
12 |
+#' @param ID Optional ID to select a row from the \code{experiment.table}. Only necessary if the experiment table contains the same file in multiple positions in column 'file'. |
|
12 | 13 |
#' @inheritParams readBamFileAsGRanges |
13 | 14 |
#' @inheritParams readBedFileAsGRanges |
14 | 15 |
#' @param binsizes An integer vector specifying the bin sizes to use. |
... | ... |
@@ -37,7 +38,7 @@ |
37 | 38 |
#' stepsizes=500, chromosomes='chr12') |
38 | 39 |
#'print(binned) |
39 | 40 |
#' |
40 |
-binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, stepsizes=binsizes/2, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE, format=NULL) { |
|
41 |
+binReads <- function(file, experiment.table=NULL, ID=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, stepsizes=binsizes/2, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE, format=NULL) { |
|
41 | 42 |
|
42 | 43 |
## Determine format |
43 | 44 |
if (is.null(format)) { |
... | ... |
@@ -78,6 +79,12 @@ binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chrom |
78 | 79 |
if (!is.null(experiment.table)) { |
79 | 80 |
check.experiment.table(experiment.table) |
80 | 81 |
info <- experiment.table[basename(as.character(experiment.table$file))==basename(file),] |
82 |
+ if (nrow(info) > 1) { |
|
83 |
+ if (is.null(ID)) { |
|
84 |
+ stop("Please specify argument 'ID' if multiple files in 'experiment.table' are identical.") |
|
85 |
+ } |
|
86 |
+ info <- info[ID, ] |
|
87 |
+ } |
|
81 | 88 |
ID <- paste0(info$mark, '-', info$condition, '-rep', info$rep) |
82 | 89 |
info$ID <- ID |
83 | 90 |
if (pairedEndReads != info$pairedEndReads) { |
... | ... |
@@ -331,7 +331,7 @@ runMultivariate <- function(binned.data, stepbins, info, comb.states, use.states |
331 | 331 |
get.posteriors = as.logical(get.posteriors), # bool* keep_posteriors |
332 | 332 |
densities = double(length=lenDensities), # double* densities |
333 | 333 |
keep.densities = as.logical(keep.densities), # bool* keep_densities |
334 |
- states = integer(length=length(binned.data)), # int* states |
|
334 |
+ states = double(length=length(binned.data)), # double* states |
|
335 | 335 |
maxPosterior = double(length=length(binned.data)), # double* maxPosterior |
336 | 336 |
A = double(length=length(comb.states)*length(comb.states)), # double* A |
337 | 337 |
proba = double(length=length(comb.states)), # double* proba |
... | ... |
@@ -292,7 +292,7 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in |
292 | 292 |
read.counts.to.remove <- max(c(read.counts.to.remove.1, 2*minlow)) |
293 | 293 |
index.filtered <- which(counts>0 & counts<=read.counts.to.remove) |
294 | 294 |
counts[index.filtered] <- 0 |
295 |
- if (length(index.filtered)>0) { |
|
295 |
+ if (length(index.filtered)>0 & ioffset == 1) { |
|
296 | 296 |
message(paste0("Replaced read counts <= ",read.counts.to.remove," by 0. This was done because the selected bin size is considered too big for this dataset: The mean of the read counts (zeros removed) is bigger than the specified max.mean = ",max.mean,". Check the fits!")) |
297 | 297 |
} |
298 | 298 |
} |
... | ... |
@@ -412,6 +412,7 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in |
412 | 412 |
if (ioffset == 1) { |
413 | 413 |
### Make return object ### |
414 | 414 |
result <- list() |
415 |
+ class(result) <- class.univariate.hmm |
|
415 | 416 |
result$info <- attr(binned.data, 'info') |
416 | 417 |
## Parameters |
417 | 418 |
# Weights |
... | ... |
@@ -444,8 +445,6 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in |
444 | 445 |
## Convergence info |
445 | 446 |
convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, max.mean=max.mean, read.cutoff=max(hmm$counts)) |
446 | 447 |
result$convergenceInfo <- convergenceInfo |
447 |
- ## Add class |
|
448 |
- class(result) <- class.univariate.hmm |
|
449 | 448 |
} |
450 | 449 |
|
451 | 450 |
if (ioffset == 1) { ptm <- startTimedMessage("Collecting counts and posteriors ...") } |
... | ... |
@@ -46,4 +46,4 @@ Please refer to the [vignette](https://github.com/ataudt/chromstaR/blob/master/v |
46 | 46 |
Report Errors |
47 | 47 |
------------- |
48 | 48 |
|
49 |
-If you encounter errors of any kind, please file an [issue here](https://github.com/ataudt/chromstaR/issues/new). I will try to react within one day. |
|
49 |
+If you encounter errors of any kind, please file an [issue here](https://github.com/ataudt/chromstaR/issues/new). I will try to react within three working days. |
... | ... |
@@ -5,9 +5,9 @@ |
5 | 5 |
\alias{binning} |
6 | 6 |
\title{Convert aligned reads from various file formats into read counts in equidistant bins} |
7 | 7 |
\usage{ |
8 |
-binReads(file, experiment.table = NULL, assembly, bamindex = file, |
|
9 |
- chromosomes = NULL, pairedEndReads = FALSE, min.mapq = 10, |
|
10 |
- remove.duplicate.reads = TRUE, max.fragment.width = 1000, |
|
8 |
+binReads(file, experiment.table = NULL, ID = NULL, assembly, |
|
9 |
+ bamindex = file, chromosomes = NULL, pairedEndReads = FALSE, |
|
10 |
+ min.mapq = 10, remove.duplicate.reads = TRUE, max.fragment.width = 1000, |
|
11 | 11 |
blacklist = NULL, binsizes = 1000, stepsizes = binsizes/2, |
12 | 12 |
reads.per.bin = NULL, bins = NULL, variable.width.reference = NULL, |
13 | 13 |
use.bamsignals = TRUE, format = NULL) |
... | ... |
@@ -17,6 +17,8 @@ binReads(file, experiment.table = NULL, assembly, bamindex = file, |
17 | 17 |
|
18 | 18 |
\item{experiment.table}{An \code{\link{experiment.table}} containing the supplied \code{file}. This is necessary to uniquely identify the file in later steps of the workflow. Set to \code{NULL} if you don't have it (not recommended).} |
19 | 19 |
|
20 |
+\item{ID}{Optional ID to select a row from the \code{experiment.table}. Only necessary if the experiment table contains the same file in multiple positions in column 'file'.} |
|
21 |
+ |
|
20 | 22 |
\item{assembly}{Please see \code{\link[GenomeInfoDb]{fetchExtendedChromInfoFromUCSC}} for available assemblies. Only necessary when importing BED files. BAM files are handled automatically. Alternatively a data.frame with columns 'chromosome' and 'length'.} |
21 | 23 |
|
22 | 24 |
\item{bamindex}{BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued.} |
... | ... |
@@ -273,7 +273,7 @@ void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* max |
273 | 273 |
// ===================================================================================================================================================== |
274 | 274 |
// This function takes parameters from R, creates a multivariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R. |
275 | 275 |
// ===================================================================================================================================================== |
276 |
-void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity) |
|
276 |
+void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, double* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity) |
|
277 | 277 |
{ |
278 | 278 |
|
279 | 279 |
// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"} |
... | ... |
@@ -20,7 +20,7 @@ extern "C" |
20 | 20 |
void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* verbosity); |
21 | 21 |
|
22 | 22 |
extern "C" |
23 |
-void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity); |
|
23 |
+void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, double* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity); |
|
24 | 24 |
|
25 | 25 |
extern "C" |
26 | 26 |
void univariate_cleanup(); |
... | ... |
@@ -32,4 +32,4 @@ extern "C" |
32 | 32 |
void array3D_which_max(double* array3D, int* dim, int* ind_max); |
33 | 33 |
|
34 | 34 |
extern "C" |
35 |
-void array2D_which_max(double* array3D, int* dim, int* ind_max, double* value_max); |
|
36 | 35 |
\ No newline at end of file |
36 |
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max); |
|
37 | 37 |
\ No newline at end of file |
... | ... |
@@ -4,7 +4,7 @@ |
4 | 4 |
|
5 | 5 |
|
6 | 6 |
R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP}; |
7 |
-R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP}; |
|
7 |
+R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP}; |
|
8 | 8 |
R_NativePrimitiveArgType arg4[] = {INTSXP}; |
9 | 9 |
R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP}; |
10 | 10 |
R_NativePrimitiveArgType arg6[] = {REALSXP, INTSXP, INTSXP, REALSXP}; |