Browse code

experiment.table can contain multiple of the same files in column "file", save "state" column as double instead of integer

chakalakka authored on 18/05/2017 11:49:42
Showing 9 changed files

... ...
@@ -89,13 +89,12 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU
89 89
     ## Prepare IDs and filenames
90 90
     IDs <- paste0(exp.table$mark, '-', exp.table$condition, '-rep', exp.table$replicate)
91 91
     datafiles <- file.path(inputfolder, basename(as.character(exp.table$file)))
92
-    names(datafiles) <- basename(datafiles)
93
-    names(IDs) <- basename(datafiles)
94
-    rownames(exp.table) <- basename(datafiles)
95
-    filenames <- paste0(IDs, '_binsize', binsize.string, '_stepsize', stepsize.string, '.RData')
96
-    names(filenames) <- basename(datafiles)
97
-    filenames.stepsize <- paste0(IDs, '_binsize', stepsize.string, '_stepsize', stepsize.string, '.RData') # filenames for saving counts at stepsize
98
-    names(filenames.stepsize) <- basename(datafiles)
92
+    names(datafiles) <- IDs
93
+    rownames(exp.table) <- IDs
94
+    filenames <- paste0(IDs, '_', as.character(exp.table$file), '_binsize', binsize.string, '_stepsize', stepsize.string, '.RData')
95
+    names(filenames) <- IDs
96
+    filenames.stepsize <- paste0(IDs, '_', as.character(exp.table$file), '_binsize', stepsize.string, '_stepsize', stepsize.string, '.RData') # filenames for saving counts at stepsize
97
+    names(filenames.stepsize) <- IDs
99 98
     ## Inputfiles
100 99
     inputfiles <- unique(unlist(strsplit(as.character(exp.table$controlFiles), '\\|')))
101 100
     inputfiles <- inputfiles[!is.na(inputfiles)]
... ...
@@ -245,19 +244,21 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU
245 244
     
246 245
     ### Count reads in bins ###
247 246
     if (!file.exists(binpath)) { dir.create(binpath) }
248
-    parallel.helper <- function(file, input) {
247
+    parallel.helper <- function(ID, input, inputfile=NULL) {
249 248
         if (!input) {
250
-            savename <- file.path(binpath, filenames[basename(file)])
251
-            savename.stepsize <- file.path(binpath, filenames.stepsize[basename(file)])
252
-            pairedEndReads <- exp.table[basename(file),'pairedEndReads']
249
+            savename <- file.path(binpath, filenames[ID])
250
+            savename.stepsize <- file.path(binpath, filenames.stepsize[ID])
251
+            pairedEndReads <- exp.table[ID,'pairedEndReads']
252
+            file <- datafiles[ID]
253 253
         } else {
254
-            savename <- file.path(binpath, inputfilenames[basename(file)])
255
-            savename.stepsize <- file.path(binpath, inputfilenames.stepsize[basename(file)])
256
-            pairedEndReads <- exp.table[grep(basename(file), exp.table$controlFiles),'pairedEndReads']
254
+            savename <- file.path(binpath, inputfilenames[basename(inputfile)])
255
+            savename.stepsize <- file.path(binpath, inputfilenames.stepsize[basename(inputfile)])
256
+            pairedEndReads <- exp.table[grep(basename(inputfile), exp.table$controlFiles),'pairedEndReads']
257 257
             if (any(pairedEndReads != pairedEndReads[1])) {
258 258
                 stop("Multiple definitions of 'pairedEndReads' for file ", file, ".")
259 259
             }
260 260
             pairedEndReads <- pairedEndReads[1]
261
+            file <- inputfile
261 262
         }
262 263
         if (!file.exists(savename)) {
263 264
             tC <- tryCatch({
... ...
@@ -265,7 +266,7 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU
265 266
                 if (!input) {
266 267
                     exp.table.input <- exp.table
267 268
                 }
268
-                binlist <- binReads(file=file, experiment.table=exp.table.input, assembly=chrom.lengths.df, pairedEndReads=pairedEndReads, binsizes=NULL, reads.per.bin=NULL, bins=list(pre.bins, pre.bins.stepsize), stepsizes=c(stepsize, stepsize), chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']])
269
+                binlist <- binReads(file=file, experiment.table=exp.table.input, ID=ID, assembly=chrom.lengths.df, pairedEndReads=pairedEndReads, binsizes=NULL, reads.per.bin=NULL, bins=list(pre.bins, pre.bins.stepsize), stepsizes=c(stepsize, stepsize), chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']])
269 270
                 ptm <- startTimedMessage("Saving to file ", savename, " ...")
270 271
                 bins <- binlist[[1]]
271 272
                 save(bins, file=savename)
... ...
@@ -280,33 +281,33 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU
280 281
         }
281 282
     }
282 283
     ## Unparallelized
283
-#     for (file in datafiles) {
284
-#         parallel.helper(file, input=FALSE)
284
+#     for (ID in IDs) {
285
+#         parallel.helper(ID, input=FALSE)
285 286
 #     }
286 287
 #     for (file in inputfiles) {
287
-#         parallel.helper(file, input=TRUE)
288
+#         parallel.helper(ID=NULL, input=TRUE, inputfile=file)
288 289
 #     }
289 290
     ## Parallelized
290 291
     if (numcpu > 1) {
291 292
         ptm <- startTimedMessage("Binning data ...")
292
-        temp <- foreach (file = datafiles, .packages=c("chromstaR")) %dopar% {
293
-            parallel.helper(file, input=FALSE)
293
+        temp <- foreach (ID = IDs, .packages=c("chromstaR")) %dopar% {
294
+            parallel.helper(ID, input=FALSE)
294 295
         }
295 296
         stopTimedMessage(ptm)
296 297
     } else {
297
-        for (file in datafiles) {
298
-            parallel.helper(file, input=FALSE)
298
+        for (ID in IDs) {
299
+            parallel.helper(ID, input=FALSE)
299 300
         }
300 301
     }
301 302
     if (numcpu > 1) {
302 303
         ptm <- startTimedMessage("Binning input ...")
303 304
         temp <- foreach (file = inputfiles, .packages=c("chromstaR")) %dopar% {
304
-            parallel.helper(file, input=TRUE)
305
+            parallel.helper(ID=NULL, input=TRUE, inputfile=file)
305 306
         }
306 307
         stopTimedMessage(ptm)
307 308
     } else {
308 309
         for (file in inputfiles) {
309
-            parallel.helper(file, input=TRUE)
310
+            parallel.helper(ID=NULL, input=TRUE, inputfile=file)
310 311
         }
311 312
     }
312 313
     
... ...
@@ -328,8 +329,6 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU
328 329
         savename <- file.path(unipath, basename(file))
329 330
         if (!file.exists(savename)) {
330 331
             tC <- tryCatch({
331
-                mark <- strsplit(basename(file), '-')[[1]][1]
332
-                condition <- strsplit(basename(file), '-')[[1]][2]
333 332
                 input.files <- NULL
334 333
                 if (length(inputfiles)>0) {
335 334
                     input.files <- inputfiles
... ...
@@ -356,16 +355,16 @@ Chromstar <- function(inputfolder, experiment.table, outputfolder, configfile=NU
356 355
     }
357 356
     if (numcpu > 1) {
358 357
         ptm <- startTimedMessage("Univariate peak calling ...")
359
-        temp <- foreach (filename = names(files), .packages=c("chromstaR")) %dopar% {
360
-            file <- files[filename]
361
-            inputfiles <- inputfiles.list[[filename]]
358
+        temp <- foreach (ID = names(files), .packages=c("chromstaR")) %dopar% {
359
+            file <- files[ID]
360
+            inputfiles <- inputfiles.list[[ID]]
362 361
             parallel.helper(file, inputfiles)
363 362
         }
364 363
         stopTimedMessage(ptm)
365 364
     } else {
366
-        for (filename in names(files)) {
367
-            file <- files[filename]
368
-            inputfiles <- inputfiles.list[[filename]]
365
+        for (ID in names(files)) {
366
+            file <- files[ID]
367
+            inputfiles <- inputfiles.list[[ID]]
369 368
             parallel.helper(file, inputfiles)
370 369
         }
371 370
     }
... ...
@@ -9,6 +9,7 @@
9 9
 #' @aliases binning
10 10
 #' @param file A file with aligned reads. Alternatively a \code{\link{GRanges}} with aligned reads.
11 11
 #' @param experiment.table An \code{\link{experiment.table}} containing the supplied \code{file}. This is necessary to uniquely identify the file in later steps of the workflow. Set to \code{NULL} if you don't have it (not recommended).
12
+#' @param ID Optional ID to select a row from the \code{experiment.table}. Only necessary if the experiment table contains the same file in multiple positions in column 'file'.
12 13
 #' @inheritParams readBamFileAsGRanges
13 14
 #' @inheritParams readBedFileAsGRanges
14 15
 #' @param binsizes An integer vector specifying the bin sizes to use.
... ...
@@ -37,7 +38,7 @@
37 38
 #'                   stepsizes=500, chromosomes='chr12')
38 39
 #'print(binned)
39 40
 #'
40
-binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, stepsizes=binsizes/2, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE, format=NULL) {
41
+binReads <- function(file, experiment.table=NULL, ID=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, stepsizes=binsizes/2, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE, format=NULL) {
41 42
 
42 43
     ## Determine format
43 44
     if (is.null(format)) {
... ...
@@ -78,6 +79,12 @@ binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chrom
78 79
     if (!is.null(experiment.table)) {
79 80
         check.experiment.table(experiment.table)
80 81
         info <- experiment.table[basename(as.character(experiment.table$file))==basename(file),]
82
+        if (nrow(info) > 1) {
83
+            if (is.null(ID)) {
84
+                stop("Please specify argument 'ID' if multiple files in 'experiment.table' are identical.")
85
+            }
86
+            info <- info[ID, ]
87
+        }
81 88
         ID <- paste0(info$mark, '-', info$condition, '-rep', info$rep)
82 89
         info$ID <- ID
83 90
         if (pairedEndReads != info$pairedEndReads) {
... ...
@@ -331,7 +331,7 @@ runMultivariate <- function(binned.data, stepbins, info, comb.states, use.states
331 331
             get.posteriors = as.logical(get.posteriors), # bool* keep_posteriors
332 332
             densities = double(length=lenDensities), # double* densities
333 333
             keep.densities = as.logical(keep.densities), # bool* keep_densities
334
-            states = integer(length=length(binned.data)), # int* states
334
+            states = double(length=length(binned.data)), # double* states
335 335
             maxPosterior = double(length=length(binned.data)), # double* maxPosterior
336 336
             A = double(length=length(comb.states)*length(comb.states)), # double* A
337 337
             proba = double(length=length(comb.states)), # double* proba
... ...
@@ -292,7 +292,7 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
292 292
                 read.counts.to.remove <- max(c(read.counts.to.remove.1, 2*minlow))
293 293
                 index.filtered <- which(counts>0 & counts<=read.counts.to.remove)
294 294
                 counts[index.filtered] <- 0
295
-                if (length(index.filtered)>0) {
295
+                if (length(index.filtered)>0 & ioffset == 1) {
296 296
                     message(paste0("Replaced read counts <= ",read.counts.to.remove," by 0. This was done because the selected bin size is considered too big for this dataset: The mean of the read counts (zeros removed) is bigger than the specified max.mean = ",max.mean,". Check the fits!"))
297 297
                 }
298 298
             }
... ...
@@ -412,6 +412,7 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
412 412
         if (ioffset == 1) {
413 413
             ### Make return object ###
414 414
                 result <- list()
415
+                class(result) <- class.univariate.hmm
415 416
                 result$info <- attr(binned.data, 'info')
416 417
             ## Parameters
417 418
                 # Weights
... ...
@@ -444,8 +445,6 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
444 445
             ## Convergence info
445 446
                 convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, max.mean=max.mean, read.cutoff=max(hmm$counts))
446 447
                 result$convergenceInfo <- convergenceInfo
447
-            ## Add class
448
-                class(result) <- class.univariate.hmm
449 448
         }
450 449
         
451 450
         if (ioffset == 1) { ptm <- startTimedMessage("Collecting counts and posteriors ...") }
... ...
@@ -46,4 +46,4 @@ Please refer to the [vignette](https://github.com/ataudt/chromstaR/blob/master/v
46 46
 Report Errors
47 47
 -------------
48 48
 
49
-If you encounter errors of any kind, please file an [issue here](https://github.com/ataudt/chromstaR/issues/new). I will try to react within one day.
49
+If you encounter errors of any kind, please file an [issue here](https://github.com/ataudt/chromstaR/issues/new). I will try to react within three working days.
... ...
@@ -5,9 +5,9 @@
5 5
 \alias{binning}
6 6
 \title{Convert aligned reads from various file formats into read counts in equidistant bins}
7 7
 \usage{
8
-binReads(file, experiment.table = NULL, assembly, bamindex = file,
9
-  chromosomes = NULL, pairedEndReads = FALSE, min.mapq = 10,
10
-  remove.duplicate.reads = TRUE, max.fragment.width = 1000,
8
+binReads(file, experiment.table = NULL, ID = NULL, assembly,
9
+  bamindex = file, chromosomes = NULL, pairedEndReads = FALSE,
10
+  min.mapq = 10, remove.duplicate.reads = TRUE, max.fragment.width = 1000,
11 11
   blacklist = NULL, binsizes = 1000, stepsizes = binsizes/2,
12 12
   reads.per.bin = NULL, bins = NULL, variable.width.reference = NULL,
13 13
   use.bamsignals = TRUE, format = NULL)
... ...
@@ -17,6 +17,8 @@ binReads(file, experiment.table = NULL, assembly, bamindex = file,
17 17
 
18 18
 \item{experiment.table}{An \code{\link{experiment.table}} containing the supplied \code{file}. This is necessary to uniquely identify the file in later steps of the workflow. Set to \code{NULL} if you don't have it (not recommended).}
19 19
 
20
+\item{ID}{Optional ID to select a row from the \code{experiment.table}. Only necessary if the experiment table contains the same file in multiple positions in column 'file'.}
21
+
20 22
 \item{assembly}{Please see \code{\link[GenomeInfoDb]{fetchExtendedChromInfoFromUCSC}} for available assemblies. Only necessary when importing BED files. BAM files are handled automatically. Alternatively a data.frame with columns 'chromosome' and 'length'.}
21 23
 
22 24
 \item{bamindex}{BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued.}
... ...
@@ -273,7 +273,7 @@ void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* max
273 273
 // =====================================================================================================================================================
274 274
 // This function takes parameters from R, creates a multivariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R.
275 275
 // =====================================================================================================================================================
276
-void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity)
276
+void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, double* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity)
277 277
 {
278 278
 
279 279
 	// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}
... ...
@@ -20,7 +20,7 @@ extern "C"
20 20
 void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* verbosity);
21 21
 
22 22
 extern "C"
23
-void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity);
23
+void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, double* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity);
24 24
 
25 25
 extern "C"
26 26
 void univariate_cleanup();
... ...
@@ -32,4 +32,4 @@ extern "C"
32 32
 void array3D_which_max(double* array3D, int* dim, int* ind_max);
33 33
 
34 34
 extern "C"
35
-void array2D_which_max(double* array3D, int* dim, int* ind_max, double* value_max);
36 35
\ No newline at end of file
36
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max);
37 37
\ No newline at end of file
... ...
@@ -4,7 +4,7 @@
4 4
 
5 5
 
6 6
 R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
7
-R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
7
+R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
8 8
 R_NativePrimitiveArgType arg4[] = {INTSXP};
9 9
 R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP};
10 10
 R_NativePrimitiveArgType arg6[] = {REALSXP, INTSXP, INTSXP, REALSXP};