... | ... |
@@ -9,29 +9,27 @@ |
9 | 9 |
#' @param configfile A file specifying the parameters of this function (without \code{inputfolder}, \code{outputfolder} and \code{configfile}). Having the parameters in a file can be handy if many samples with the same parameter settings are to be run. If a \code{configfile} is specified, it will take priority over the command line parameters. |
10 | 10 |
#' @param numCPU The numbers of CPUs that are used. Should not be more than available on your machine. |
11 | 11 |
#' @param reuse.existing.files A logical indicating whether or not existing files in \code{outputfolder} should be reused. |
12 |
-#' @param binsizes An integer vector with bin sizes. If more than one value is given, output files will be produced for each bin size. |
|
13 |
-#' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly. Output files are produced for each value. |
|
14 |
-#' @param format Format of the sequencing data. One of \code{c('bam')}. |
|
15 |
-#' @param chromosomes If only a subset of the chromosomes should be binned, specify them here. |
|
16 |
-#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed. |
|
17 |
-#' @param min.mapq Minimum mapping quality when importing from BAM files. |
|
12 |
+ |
|
13 |
+#' @param stepsize Fraction of the binsize that the sliding window is offset at each step. Example: If \code{stepsize=0.1} and \code{binsizes=c(200000,500000)}, the actual stepsize in basepairs is 20000 and 50000, respectively. |
|
14 |
+#' @inheritParams align2binned |
|
15 |
+ |
|
18 | 16 |
#' @param correction.method Correction methods to be used for the binned read counts. Currently any combination of \code{c('GC')}. |
19 | 17 |
#' @param GC.BSgenome A \code{BSgenome} object which contains the DNA sequence that is used for the GC correction. |
20 | 18 |
#' @param callCNVs A logical indicating whether CNVs should be called. |
21 | 19 |
#' @param callSCEs A logical indicating whether SCEs should be called. |
22 |
-#' @param eps Convergence threshold for the Baum-Welch algorithm. |
|
23 |
-#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. \code{max.time=-1} means no limit. |
|
24 |
-#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. \code{max.iter=-1} means no limit. |
|
25 |
-#' @param num.trials The number of trials to find a fit where state \code{most.frequent.state} is most frequent. Each time, the HMM is seeded with different random initial values. |
|
26 |
-#' @param states A subset or all of \code{c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")}. This vector defines the states that are used in the Hidden Markov Model. The order of the entries should not be changed. |
|
20 |
+ |
|
21 |
+#' @inheritParams univariate.findCNVs |
|
27 | 22 |
#' @param most.frequent.state One of the states that were given in \code{states}. The specified state is assumed to be the most frequent one when calling CNVs. This can help the fitting procedure to converge into the correct fit. Default is 'disomy'. |
28 | 23 |
#' @param most.frequent.state.SCE One of the states that were given in \code{states}. The specified state is assumed to be the most frequent one when calling SCEs. This can help the fitting procedure to converge into the correct fit. Default is 'monosomy'. |
24 |
+ |
|
25 |
+#' @inheritParams getSCEcoordinates |
|
26 |
+ |
|
29 | 27 |
#' @param cluster.plots A logical indicating whether plots should be clustered by similarity. |
30 | 28 |
#' @author Aaron Taudt |
31 | 29 |
#' @import foreach |
32 | 30 |
#' @import doParallel |
33 | 31 |
#' @export |
34 |
-Aneufinder <- function(inputfolder, outputfolder, configfile=NULL, numCPU=1, reuse.existing.files=TRUE, binsizes=500000, reads.per.bin=NULL, format='bam', chromosomes=NULL, remove.duplicate.reads=TRUE, min.mapq=10, correction.method=NULL, GC.BSgenome=NULL, callCNVs=TRUE, callSCEs=FALSE, eps=0.1, max.time=60, max.iter=5000, num.trials=15, states=c('zero-inflation','nullsomy','monosomy','disomy','trisomy','tetrasomy','multisomy'), most.frequent.state='disomy', most.frequent.state.SCE='monosomy', cluster.plots=TRUE) { |
|
32 |
+Aneufinder <- function(inputfolder, outputfolder, configfile=NULL, numCPU=1, reuse.existing.files=TRUE, binsizes=500000, reads.per.bin=NULL, stepsize=NULL, format='bam', chromosomes=NULL, remove.duplicate.reads=TRUE, min.mapq=10, correction.method=NULL, GC.BSgenome=NULL, callCNVs=TRUE, callSCEs=FALSE, eps=0.1, max.time=60, max.iter=5000, num.trials=15, states=c('zero-inflation','nullsomy','monosomy','disomy','trisomy','tetrasomy','multisomy'), most.frequent.state='disomy', most.frequent.state.SCE='monosomy', resolution=c(3,6), min.segwidth=2, min.reads=50, cluster.plots=TRUE) { |
|
35 | 33 |
|
36 | 34 |
#======================= |
37 | 35 |
### Helper functions ### |
... | ... |
@@ -63,7 +61,7 @@ if (class(GC.BSgenome)=='BSgenome') { |
63 | 61 |
} |
64 | 62 |
|
65 | 63 |
## Put options into list and merge with conf |
66 |
-params <- list(numCPU=numCPU, reuse.existing.files=reuse.existing.files, binsizes=binsizes, reads.per.bin=reads.per.bin, format=format, chromosomes=chromosomes, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, correction.method=correction.method, GC.BSgenome=GC.BSgenome, callCNVs=callCNVs, callSCEs=callSCEs, eps=eps, max.time=max.time, max.iter=max.iter, num.trials=num.trials, states=states, most.frequent.state=most.frequent.state, most.frequent.state.SCE=most.frequent.state.SCE, cluster.plots=cluster.plots) |
|
64 |
+params <- list(numCPU=numCPU, reuse.existing.files=reuse.existing.files, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, format=format, chromosomes=chromosomes, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, correction.method=correction.method, GC.BSgenome=GC.BSgenome, callCNVs=callCNVs, callSCEs=callSCEs, eps=eps, max.time=max.time, max.iter=max.iter, num.trials=num.trials, states=states, most.frequent.state=most.frequent.state, most.frequent.state.SCE=most.frequent.state.SCE, resolution=resolution, min.segwidth=min.segwidth, min.reads=min.reads, cluster.plots=cluster.plots) |
|
67 | 65 |
conf <- c(conf, params[setdiff(names(params),names(conf))]) |
68 | 66 |
|
69 | 67 |
## Input checks |
... | ... |
@@ -74,7 +72,7 @@ if (! conf[['format']] %in% c('bam')) { |
74 | 72 |
## Helpers |
75 | 73 |
binsizes <- conf[['binsizes']] |
76 | 74 |
reads.per.bins <- conf[['reads.per.bin']] |
77 |
-patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=F, trim=T),'_')) |
|
75 |
+patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=T, trim=T),'_')) |
|
78 | 76 |
patterns <- setdiff(patterns, c('reads.per.bin__','binsize__')) |
79 | 77 |
pattern <- NULL #ease R CMD check |
80 | 78 |
|
... | ... |
@@ -123,7 +121,7 @@ temp <- foreach (file = files, .packages=c('aneufinder')) %dopar% { |
123 | 121 |
if (length(c(binsizes.todo,rpbin.todo)) > 0) { |
124 | 122 |
tC <- tryCatch({ |
125 | 123 |
if (conf[['format']]=='bam') { |
126 |
- bam2binned(bamfile=file, binsizes=binsizes.todo, reads.per.bin=rpbin.todo, chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']], outputfolder.binned=binpath.uncorrected, save.as.RData=TRUE, reads.store=TRUE, outputfolder.reads=readspath) |
|
124 |
+ bam2binned(bamfile=file, binsizes=binsizes.todo, reads.per.bin=rpbin.todo, stepsize=conf[['stepsize']], chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']], outputfolder.binned=binpath.uncorrected, save.as.RData=TRUE, reads.store=TRUE, outputfolder.reads=readspath) |
|
127 | 125 |
} |
128 | 126 |
}, error = function(err) { |
129 | 127 |
stop(file,'\n',err) |
... | ... |
@@ -217,7 +215,7 @@ if (conf[['callCNVs']]==TRUE) { |
217 | 215 |
### Plotting CNV ### |
218 | 216 |
#=================== |
219 | 217 |
if (!file.exists(CNVplotpath)) { dir.create(CNVplotpath) } |
220 |
- patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=F, trim=T),'_')) |
|
218 |
+ patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=T, trim=T),'_')) |
|
221 | 219 |
patterns <- setdiff(patterns, c('reads.per.bin__','binsize__')) |
222 | 220 |
files <- list.files(CNVpath, full.names=TRUE, recursive=T, pattern='.RData$') |
223 | 221 |
|
... | ... |
@@ -230,7 +228,7 @@ if (conf[['callCNVs']]==TRUE) { |
230 | 228 |
if (!file.exists(savename)) { |
231 | 229 |
pdf(file=savename, width=10, height=7) |
232 | 230 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
233 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
231 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
234 | 232 |
for (ifile in ifiles) { |
235 | 233 |
tC <- tryCatch({ |
236 | 234 |
model <- get(load(ifile)) |
... | ... |
@@ -249,7 +247,7 @@ if (conf[['callCNVs']]==TRUE) { |
249 | 247 |
message("Plotting heatmaps ...", appendLF=F); ptm <- proc.time() |
250 | 248 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
251 | 249 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
252 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
250 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
253 | 251 |
savename=file.path(CNVplotpath,paste0('genomeHeatmap_',sub('_$','',pattern),'.pdf')) |
254 | 252 |
if (!file.exists(savename)) { |
255 | 253 |
suppressMessages(heatmapGenomewide(ifiles, file=savename, plot.SCE=F, cluster=conf[['cluster.plots']])) |
... | ... |
@@ -257,7 +255,7 @@ if (conf[['callCNVs']]==TRUE) { |
257 | 255 |
} |
258 | 256 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
259 | 257 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
260 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
258 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
261 | 259 |
savename=file.path(CNVplotpath,paste0('aneuploidyHeatmap_',sub('_$','',pattern),'.pdf')) |
262 | 260 |
if (!file.exists(savename)) { |
263 | 261 |
ggplt <- suppressMessages(heatmapAneuploidies(ifiles, cluster=conf[['cluster.plots']])) |
... | ... |
@@ -276,7 +274,7 @@ if (conf[['callCNVs']]==TRUE) { |
276 | 274 |
if (!file.exists(savename)) { |
277 | 275 |
pdf(file=savename, width=20, height=5) |
278 | 276 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
279 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
277 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
280 | 278 |
for (ifile in ifiles) { |
281 | 279 |
tC <- tryCatch({ |
282 | 280 |
model <- get(load(ifile)) |
... | ... |
@@ -298,7 +296,7 @@ if (conf[['callCNVs']]==TRUE) { |
298 | 296 |
if (!file.exists(savename)) { |
299 | 297 |
pdf(file=savename, width=12*1.4, height=2*4.6) |
300 | 298 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
301 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
299 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
302 | 300 |
for (ifile in ifiles) { |
303 | 301 |
tC <- tryCatch({ |
304 | 302 |
model <- get(load(ifile)) |
... | ... |
@@ -320,7 +318,7 @@ if (conf[['callCNVs']]==TRUE) { |
320 | 318 |
savename <- file.path(CNVbrowserpath,paste0(pattern)) |
321 | 319 |
if (!file.exists(paste0(savename,'.bed.gz'))) { |
322 | 320 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
323 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
321 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
324 | 322 |
exportCNVs(ifiles, filename=savename, cluster=conf[['cluster.plots']], export.CNV=TRUE, export.SCE=FALSE) |
325 | 323 |
} |
326 | 324 |
} |
... | ... |
@@ -340,7 +338,9 @@ if (conf[['callSCEs']]==TRUE) { |
340 | 338 |
tC <- tryCatch({ |
341 | 339 |
savename <- file.path(SCEpath,basename(file)) |
342 | 340 |
if (!file.exists(savename)) { |
343 |
- model <- findSCEs(file, eps=conf[['eps']], max.time=conf[['max.time']], max.iter=conf[['max.iter']], num.trials=conf[['num.trials']], states=conf[['states']], most.frequent.state=conf[['most.frequent.state.SCE']]) |
|
341 |
+ binned.data <- get(load(file)) |
|
342 |
+ reads.file <- list.files(readspath, full.names=T, pattern=paste0('^',attr(binned.data,'ID'),'.RData$')) |
|
343 |
+ model <- findSCEs(file, eps=conf[['eps']], max.time=conf[['max.time']], max.iter=conf[['max.iter']], num.trials=conf[['num.trials']], states=conf[['states']], most.frequent.state=conf[['most.frequent.state.SCE']], resolution=conf[['resolution']], min.segwidth=conf[['min.segwidth']], fragments=reads.file, min.reads=conf[['min.reads']]) |
|
344 | 344 |
save(model, file=savename) |
345 | 345 |
} |
346 | 346 |
}, error = function(err) { |
... | ... |
@@ -353,7 +353,7 @@ if (conf[['callSCEs']]==TRUE) { |
353 | 353 |
### Plotting SCE ### |
354 | 354 |
#=================== |
355 | 355 |
if (!file.exists(SCEplotpath)) { dir.create(SCEplotpath) } |
356 |
- patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=F, trim=T),'_')) |
|
356 |
+ patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=T, trim=T),'_')) |
|
357 | 357 |
patterns <- setdiff(patterns, c('reads.per.bin__','binsize__')) |
358 | 358 |
files <- list.files(SCEpath, full.names=TRUE, recursive=T, pattern='.RData$') |
359 | 359 |
|
... | ... |
@@ -366,7 +366,7 @@ if (conf[['callSCEs']]==TRUE) { |
366 | 366 |
if (!file.exists(savename)) { |
367 | 367 |
pdf(file=savename, width=10, height=7) |
368 | 368 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
369 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
369 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
370 | 370 |
for (ifile in ifiles) { |
371 | 371 |
tC <- tryCatch({ |
372 | 372 |
model <- get(load(ifile)) |
... | ... |
@@ -385,7 +385,7 @@ if (conf[['callSCEs']]==TRUE) { |
385 | 385 |
message("Plotting heatmaps ...", appendLF=F); ptm <- proc.time() |
386 | 386 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
387 | 387 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
388 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
388 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
389 | 389 |
savename=file.path(SCEplotpath,paste0('genomeHeatmap_',sub('_$','',pattern),'.pdf')) |
390 | 390 |
if (!file.exists(savename)) { |
391 | 391 |
suppressMessages(heatmapGenomewide(ifiles, file=savename, plot.SCE=T, cluster=conf[['cluster.plots']])) |
... | ... |
@@ -393,7 +393,7 @@ if (conf[['callSCEs']]==TRUE) { |
393 | 393 |
} |
394 | 394 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
395 | 395 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
396 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
396 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
397 | 397 |
savename=file.path(SCEplotpath,paste0('aneuploidyHeatmap_',sub('_$','',pattern),'.pdf')) |
398 | 398 |
if (!file.exists(savename)) { |
399 | 399 |
pdf(savename, width=30, height=0.3*length(ifiles)) |
... | ... |
@@ -412,7 +412,7 @@ if (conf[['callSCEs']]==TRUE) { |
412 | 412 |
if (!file.exists(savename)) { |
413 | 413 |
pdf(file=savename, width=20, height=5) |
414 | 414 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
415 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
415 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
416 | 416 |
for (ifile in ifiles) { |
417 | 417 |
tC <- tryCatch({ |
418 | 418 |
model <- get(load(ifile)) |
... | ... |
@@ -434,7 +434,7 @@ if (conf[['callSCEs']]==TRUE) { |
434 | 434 |
if (!file.exists(savename)) { |
435 | 435 |
pdf(file=savename, width=12*1.4, height=2*4.6) |
436 | 436 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
437 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
437 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
438 | 438 |
for (ifile in ifiles) { |
439 | 439 |
tC <- tryCatch({ |
440 | 440 |
model <- get(load(ifile)) |
... | ... |
@@ -456,7 +456,7 @@ if (conf[['callSCEs']]==TRUE) { |
456 | 456 |
savename <- file.path(SCEbrowserpath,sub('_$','',pattern)) |
457 | 457 |
if (!file.exists(paste0(savename,'.bed.gz'))) { |
458 | 458 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
459 |
- ifiles <- grep(pattern, ifiles, value=T) |
|
459 |
+ ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T) |
|
460 | 460 |
exportCNVs(ifiles, filename=savename, cluster=conf[['cluster.plots']], export.CNV=TRUE, export.SCE=TRUE) |
461 | 461 |
} |
462 | 462 |
} |
... | ... |
@@ -2,10 +2,9 @@ |
2 | 2 |
|
3 | 3 |
#' Convert aligned reads from various file formats into read counts in equidistant bins |
4 | 4 |
#' |
5 |
-#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. Convert signal values in .bedGraph format to signal counts in equidistant windows. |
|
5 |
+#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. |
|
6 | 6 |
#' |
7 | 7 |
#' Convert aligned reads or signal values from various file formats into read counts in equidistant windows (bins). bam2binned() and bed2binned() use 'GenomicRanges::countOverlaps()' to calculate the read counts. Therefore, with small binsizes and large read lengths, one read fragment will often overlap more than one bin. |
8 |
-#' bedGraph2binned() sets the maximum signal value in a bin as value for that bin. |
|
9 | 8 |
#' |
10 | 9 |
#' @name binning |
11 | 10 |
#' @examples |
... | ... |
@@ -21,13 +20,13 @@ NULL |
21 | 20 |
#' @param bamfile A file in BAM format. |
22 | 21 |
#' @param bamindex BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued. |
23 | 22 |
#' @export |
24 |
-bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
23 |
+bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
25 | 24 |
call <- match.call() |
26 | 25 |
underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='') |
27 | 26 |
message("\n",call[[1]],"():") |
28 | 27 |
message(underline) |
29 | 28 |
ptm <- proc.time() |
30 |
- binned.data <- align2binned(bamfile, format="bam", ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, numbins=numbins, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
29 |
+ binned.data <- align2binned(bamfile, format="bam", ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
31 | 30 |
time <- proc.time() - ptm |
32 | 31 |
message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s") |
33 | 32 |
return(binned.data) |
... | ... |
@@ -36,30 +35,13 @@ bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEn |
36 | 35 |
#' @describeIn binning Bin reads in BED format |
37 | 36 |
#' @inheritParams align2binned |
38 | 37 |
#' @param bedfile A file in BED format. |
39 |
-#' @export |
|
40 |
-bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
38 |
+bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
41 | 39 |
call <- match.call() |
42 | 40 |
underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='') |
43 | 41 |
message("\n",call[[1]],"():") |
44 | 42 |
message(underline) |
45 | 43 |
ptm <- proc.time() |
46 |
- binned.data <- align2binned(bedfile, format="bed", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, numbins=numbins, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
47 |
- time <- proc.time() - ptm |
|
48 |
- message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s") |
|
49 |
- return(binned.data) |
|
50 |
-} |
|
51 |
- |
|
52 |
-#' @describeIn binning Bin reads in bedGraph format |
|
53 |
-#' @inheritParams align2binned |
|
54 |
-#' @param bedGraphfile A file in bedGraph format. |
|
55 |
-#' @export |
|
56 |
-bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGraphfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
57 |
- call <- match.call() |
|
58 |
- underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='') |
|
59 |
- message("\n",call[[1]],"():") |
|
60 |
- message(underline) |
|
61 |
- ptm <- proc.time() |
|
62 |
- binned.data <- align2binned(bedGraphfile, format="bedGraph", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, numbins=numbins, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
44 |
+ binned.data <- align2binned(bedfile, format="bed", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
63 | 45 |
time <- proc.time() - ptm |
64 | 46 |
message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s") |
65 | 47 |
return(binned.data) |
... | ... |
@@ -67,10 +49,10 @@ bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGrap |
67 | 49 |
|
68 | 50 |
#' Convert aligned reads from various file formats into read counts in equidistant bins |
69 | 51 |
#' |
70 |
-#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. Convert signal values in .bedGraph format to signal counts in equidistant windows. |
|
52 |
+#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. |
|
71 | 53 |
#' |
72 | 54 |
#' @param file A file with aligned reads. |
73 |
-#' @param format One of \code{c('bam', 'bed', 'bedGraph')}. |
|
55 |
+#' @param format One of \code{c('bam', 'bed')}. |
|
74 | 56 |
#' @param ID An identifier that will be used to identify the file throughout the workflow and in plotting. |
75 | 57 |
#' @param bamindex Index file if \code{format='bam'} with or without the .bai ending. If this file does not exist it will be created and a warning is issued. |
76 | 58 |
#' @param pairedEndReads Set to \code{TRUE} if you have paired-end reads in your file. |
... | ... |
@@ -78,7 +60,7 @@ bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGrap |
78 | 60 |
#' @param outputfolder.binned Folder to which the binned data will be saved. If the specified folder does not exist, it will be created. |
79 | 61 |
#' @param binsizes An integer vector with bin sizes. If more than one value is given, output files will be produced for each bin size. |
80 | 62 |
#' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly. Output files are produced for each value. |
81 |
-#' @param numbins Number of bins per chromosome. Each chromosome will have a different binsize! DO NOT USE UNLESS YOU KNOW WHAT YOU ARE DOING. Output files are produced for each value. |
|
63 |
+#' @param stepsize Fraction of the binsize that the sliding window is offset at each step. Example: If \code{stepsize=0.1} and \code{binsizes=c(200000,500000)}, the actual stepsize in basepairs is 20000 and 50000, respectively. |
|
82 | 64 |
#' @param chromosomes If only a subset of the chromosomes should be binned, specify them here. |
83 | 65 |
#' @param save.as.RData If set to \code{FALSE}, no output file will be written. Instead, a \link{GenomicRanges} object containing the binned data will be returned. Only the first binsize will be processed in this case. |
84 | 66 |
#' @param calc.complexity A logical indicating whether or not to estimate library complexity. |
... | ... |
@@ -91,7 +73,7 @@ bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGrap |
91 | 73 |
#' @param reads.overwrite Whether or not an existing file with read fragments should be overwritten. |
92 | 74 |
#' @param reads.only If \code{TRUE} only read fragments are stored and/or returned and no binning is done. |
93 | 75 |
#' @return The function produces a \link{GRanges} object with one meta data column 'reads' that contains the read count. This binned data will be either written to file (\code{save.as.RData=FALSE}) or given as return value (\code{save.as.RData=FALSE}). |
94 |
-align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedEndReads=FALSE, chrom.length.file, outputfolder.binned="binned_data", binsizes=200000, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
76 |
+align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedEndReads=FALSE, chrom.length.file, outputfolder.binned="binned_data", binsizes=200000, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
95 | 77 |
|
96 | 78 |
## Check user input |
97 | 79 |
if (reads.return==FALSE & reads.only==FALSE) { |
... | ... |
@@ -137,21 +119,6 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE |
137 | 119 |
} |
138 | 120 |
data <- data[mcols(data)$mapq >= min.mapq] |
139 | 121 |
} |
140 |
- ## BEDGraph (0-based) |
|
141 |
- } else if (format == "bedGraph") { |
|
142 |
- # File with chromosome lengths (1-based) |
|
143 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
144 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
145 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
146 |
- # File with reads, determine classes first for faster import |
|
147 |
- tab5rows <- read.table(file, nrows=5) |
|
148 |
- classes.in.bed <- sapply(tab5rows, class) |
|
149 |
- classes <- rep("NULL",length(classes.in.bed)) |
|
150 |
- classes[1:4] <- classes.in.bed[1:4] |
|
151 |
- data <- read.table(file, colClasses=classes) |
|
152 |
- # Convert to GRanges object |
|
153 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data)), signal=data[,4]) # start+1 to go from [0,x) -> [1,x] |
|
154 |
- seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
|
155 | 122 |
} |
156 | 123 |
time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
157 | 124 |
|
... | ... |
@@ -208,60 +175,44 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE |
208 | 175 |
coverage <- list(coverage=coverage, genome.covered=genome.covered, coverage.per.chrom=coverage.per.chrom, genome.covered.per.chrom=genome.covered.per.chrom) |
209 | 176 |
|
210 | 177 |
|
211 |
- ### Loop over all binsizes ### |
|
212 | 178 |
## Pad binsizes and reads.per.bin with each others value |
213 | 179 |
numcountsperbp <- length(data) / sum(as.numeric(seqlengths(data))) |
214 | 180 |
binsizes.add <- round(reads.per.bin / numcountsperbp, -2) |
215 | 181 |
reads.per.bin.add <- round(binsizes * numcountsperbp, 2) |
216 | 182 |
binsizes <- c(binsizes, binsizes.add) |
217 | 183 |
reads.per.bin <- c(reads.per.bin.add, reads.per.bin) |
218 |
- if (is.null(numbins)) { |
|
219 |
- length.binsizes <- length(binsizes) |
|
220 |
- } else { |
|
221 |
- length.binsizes <- length(numbins) |
|
222 |
- } |
|
184 |
+ length.binsizes <- length(binsizes) |
|
223 | 185 |
|
186 |
+ ### Loop over all binsizes ### |
|
187 |
+ data.plus <- data[strand(data)=='+'] |
|
188 |
+ data.minus <- data[strand(data)=='-'] |
|
224 | 189 |
for (ibinsize in 1:length.binsizes) { |
225 |
- if (is.null(numbins)) { |
|
226 |
- binsize <- binsizes[ibinsize] |
|
227 |
- readsperbin <- reads.per.bin[ibinsize] |
|
228 |
- message("Binning into bin size ",binsize," with on average ",readsperbin," reads per bin") |
|
229 |
- } else { |
|
230 |
- numbin <- numbins[ibinsize] |
|
231 |
- message("Binning with number of bins ",numbin) |
|
232 |
- } |
|
190 |
+ binsize <- binsizes[ibinsize] |
|
191 |
+ readsperbin <- reads.per.bin[ibinsize] |
|
192 |
+ message("Binning into bin size ",binsize," with on average ",readsperbin," reads per bin") |
|
233 | 193 |
|
234 |
- ### Iterate over all chromosomes |
|
194 |
+ ### Loop over chromosomes ### |
|
235 | 195 |
message(" binning genome ...", appendLF=F); ptm <- proc.time() |
236 | 196 |
binned.data <- GenomicRanges::GRangesList() |
237 | 197 |
for (chromosome in chroms2use) { |
238 | 198 |
|
239 |
- if (is.null(numbins)) { |
|
240 |
- ## Check last incomplete bin |
|
241 |
- incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0 |
|
242 |
- if (incomplete.bin) { |
|
243 |
- numbin <- floor(seqlengths(data)[chromosome]/binsize) # floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end |
|
244 |
- } else { |
|
245 |
- numbin <- seqlengths(data)[chromosome]/binsize |
|
246 |
- } |
|
247 |
- if (numbin == 0) { |
|
248 |
- warning("chromosome ",chromosome," is smaller than binsize. Skipped.") |
|
249 |
- next |
|
250 |
- } |
|
251 |
- ## Initialize vectors |
|
252 |
- chroms <- rep(chromosome,numbin) |
|
253 |
- reads <- rep(0,numbin) |
|
254 |
- start <- seq(from=1, by=binsize, length.out=numbin) |
|
255 |
- end <- seq(from=binsize, by=binsize, length.out=numbin) |
|
256 |
-# end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired |
|
199 |
+ ## Check last incomplete bin |
|
200 |
+ incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0 |
|
201 |
+ if (incomplete.bin) { |
|
202 |
+ numbin <- floor(seqlengths(data)[chromosome]/binsize) # floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end |
|
257 | 203 |
} else { |
258 |
- ## Initialize vectors |
|
259 |
- chroms <- rep(chromosome,numbin) |
|
260 |
- start <- round(seq(from=1, to=seqlengths(data)[chromosome], length.out=numbin+1)) |
|
261 |
- end <- start[-1] - 1 |
|
262 |
- end[length(end)] <- end[length(end)] + 1 |
|
263 |
- start <- start[-length(start)] |
|
204 |
+ numbin <- seqlengths(data)[chromosome]/binsize |
|
264 | 205 |
} |
206 |
+ if (numbin == 0) { |
|
207 |
+ warning("chromosome ",chromosome," is smaller than binsize. Skipped.") |
|
208 |
+ next |
|
209 |
+ } |
|
210 |
+ ## Initialize vectors |
|
211 |
+ chroms <- rep(chromosome,numbin) |
|
212 |
+ reads <- rep(0,numbin) |
|
213 |
+ start <- seq(from=1, by=binsize, length.out=numbin) |
|
214 |
+ end <- seq(from=binsize, by=binsize, length.out=numbin) |
|
215 |
+# end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired |
|
265 | 216 |
|
266 | 217 |
## Create binned chromosome as GRanges object |
267 | 218 |
i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbin), |
... | ... |
@@ -273,44 +224,35 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE |
273 | 224 |
suppressWarnings( |
274 | 225 |
binned.data[[chromosome]] <- i.binned.data |
275 | 226 |
) |
276 |
- } |
|
227 |
+ |
|
228 |
+ } ### end loop chromosomes ### |
|
277 | 229 |
time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
278 | 230 |
binned.data <- unlist(binned.data) |
279 | 231 |
names(binned.data) <- NULL |
280 | 232 |
|
281 |
- ## Count overlaps |
|
282 |
- message(" counting overlaps ...", appendLF=F); ptm <- proc.time() |
|
283 |
- if (format=="bam" | format=="bed") { |
|
284 |
- mcounts <- GenomicRanges::countOverlaps(binned.data, data[strand(data)=='-']) |
|
285 |
- pcounts <- GenomicRanges::countOverlaps(binned.data, data[strand(data)=='+']) |
|
286 |
- counts <- mcounts + pcounts |
|
287 |
- } else if (format=="bedGraph") { |
|
288 |
- # Take the max value from all regions that fall into / overlap a given bin as read counts |
|
289 |
- midx <- as.matrix(findOverlaps(binned.data, data)) |
|
290 |
- counts <- rep(0,length(binned.data)) |
|
291 |
- signal <- mcols(data)$signal |
|
292 |
- rle <- rle(midx[,1]) |
|
293 |
- read.idx <- rle$values |
|
294 |
- max.idx <- cumsum(rle$lengths) |
|
295 |
- maxvalues <- rep(NA, length(read.idx)) |
|
296 |
- maxvalues[1] <- max(signal[midx[1:(max.idx[1]),2]]) |
|
297 |
- for (i1 in 2:length(read.idx)) { |
|
298 |
- maxvalues[i1] <- max(signal[midx[(max.idx[i1-1]+1):(max.idx[i1]),2]]) |
|
299 |
- } |
|
300 |
- counts[read.idx] <- maxvalues |
|
301 |
- mcounts <- rep(NA, length(counts)) |
|
302 |
- pcounts <- rep(NA, length(counts)) |
|
303 |
- } |
|
304 |
- if (is.null(numbins)) { |
|
305 |
- mcols(binned.data)$counts <- counts |
|
306 |
- mcols(binned.data)$mcounts <- mcounts |
|
307 |
- mcols(binned.data)$pcounts <- pcounts |
|
233 |
+ ### Loop over offsets ### |
|
234 |
+ countmatrices <- list() |
|
235 |
+ if (is.null(stepsize)) { |
|
236 |
+ offsets <- 0 |
|
308 | 237 |
} else { |
309 |
- mcols(binned.data)$counts <- counts / (end - start) |
|
310 |
- mcols(binned.data)$mcounts <- mcounts / (end - start) |
|
311 |
- mcols(binned.data)$pcounts <- pcounts / (end - start) |
|
238 |
+ offsets <- seq(from=0, to=binsize, by=as.integer(stepsize*binsize)) |
|
312 | 239 |
} |
313 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
240 |
+ for (ioff in offsets) { |
|
241 |
+ ## Count overlaps |
|
242 |
+ message(paste0(" counting overlaps for offset ",ioff," ..."), appendLF=F); ptm <- proc.time() |
|
243 |
+ binned.data.shift <- suppressWarnings( shift(binned.data, shift=ioff) ) |
|
244 |
+ if (format=="bam" | format=="bed") { |
|
245 |
+ mcounts <- suppressWarnings( GenomicRanges::countOverlaps(binned.data.shift, data.minus) ) |
|
246 |
+ pcounts <- suppressWarnings( GenomicRanges::countOverlaps(binned.data.shift, data.plus) ) |
|
247 |
+ counts <- mcounts + pcounts |
|
248 |
+ } |
|
249 |
+ countmatrix <- matrix(c(counts,mcounts,pcounts), ncol=3) |
|
250 |
+ colnames(countmatrix) <- c('counts','mcounts','pcounts') |
|
251 |
+ countmatrices[[as.character(ioff)]] <- countmatrix |
|
252 |
+ time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
253 |
+ } |
|
254 |
+ mcols(binned.data) <- as(countmatrices[['0']],'DataFrame') |
|
255 |
+ attr(binned.data,'offset.counts') <- countmatrices |
|
314 | 256 |
|
315 | 257 |
if (length(binned.data) == 0) { |
316 | 258 |
warning(paste0("The bin size of ",binsize," with reads per bin ",reads.per.bin," is larger than any of the chromosomes.")) |
... | ... |
@@ -328,11 +270,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE |
328 | 270 |
### Save or return the binned data ### |
329 | 271 |
if (save.as.RData==TRUE) { |
330 | 272 |
# Save to file |
331 |
- if (is.null(numbins)) { |
|
332 |
- filename <- paste0(ID,"_binsize_",format(binsize, scientific=T, trim=T),"_reads.per.bin_",readsperbin,"_.RData") |
|
333 |
- } else { |
|
334 |
- filename <- paste0(ID,"_numbin_",format(numbin, scientific=T, trim=T),"_.RData") |
|
335 |
- } |
|
273 |
+ filename <- paste0(ID,"_binsize_",format(binsize, scientific=T, trim=T),"_reads.per.bin_",readsperbin,"_.RData") |
|
336 | 274 |
message("Saving to file ...", appendLF=F); ptm <- proc.time() |
337 | 275 |
# attr(binned.data, 'call') <- call # do not store along with GRanges because it inflates disk usage |
338 | 276 |
save(binned.data, file=file.path(outputfolder.binned,filename) ) |
... | ... |
@@ -342,7 +280,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE |
342 | 280 |
return(binned.data) |
343 | 281 |
} |
344 | 282 |
|
345 |
- } |
|
283 |
+ } ### end loop binsizes ### |
|
346 | 284 |
|
347 | 285 |
|
348 | 286 |
} |
... | ... |
@@ -37,10 +37,15 @@ insertchr <- function(hmm.gr) { |
37 | 37 |
exportCNVs <- function(hmm.list, filename, cluster=TRUE, export.CNV=TRUE, export.SCE=TRUE) { |
38 | 38 |
|
39 | 39 |
## Get segments and SCE coordinates |
40 |
- temp <- getSegments(hmm.list, cluster=cluster, getSCE=export.SCE) |
|
40 |
+ hmm.list <- loadHmmsFromFiles(hmm.list) |
|
41 |
+ temp <- getSegments(hmm.list, cluster=cluster) |
|
41 | 42 |
hmm.grl <- temp$segments |
43 |
+ if (cluster) { |
|
44 |
+ hmm.list <- hmm.list[temp$clustering$order] |
|
45 |
+ } |
|
42 | 46 |
if (export.SCE) { |
43 |
- sce <- temp$sce |
|
47 |
+ sce <- lapply(hmm.list,'[[','sce') |
|
48 |
+ names(sce) <- lapply(hmm.list,'[[','ID') |
|
44 | 49 |
sce <- sce[!unlist(lapply(sce, is.null))] |
45 | 50 |
sce <- sce[lapply(sce, length)!=0] |
46 | 51 |
if (length(sce)==0) { |
... | ... |
@@ -6,8 +6,8 @@ |
6 | 6 |
#' |
7 | 7 |
#' \code{findCNVs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission densitiy (only zero read counts), 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper |
8 | 8 |
#' @author Aaron Taudt |
9 |
-#' @param method Currently only \code{c('univariate')}. |
|
10 | 9 |
#' @inheritParams univariate.findCNVs |
10 |
+#' @param method One of \code{c('univariate','bivariate')}. In the univariate case strand information is discarded, while in the bivariate case strand information is used for the fitting. |
|
11 | 11 |
#' @examples |
12 | 12 |
#'## Get an example BAM file with single-cell-sequencing reads |
13 | 13 |
#'bamfile <- system.file("extdata/BB140820_I_002.bam", package="aneufinder") |
... | ... |
@@ -18,7 +18,7 @@ |
18 | 18 |
#'## Check the fit |
19 | 19 |
#'plot(model, type='histogram') |
20 | 20 |
#' @export |
21 |
-findCNVs <- function(binned.data, ID=NULL, method='univariate', eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy") { |
|
21 |
+findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy", method="univariate", algorithm="EM", allow.odd.states=TRUE, initial.params=NULL) { |
|
22 | 22 |
|
23 | 23 |
## Intercept user input |
24 | 24 |
if (class(binned.data) != 'GRanges') { |
... | ... |
@@ -37,8 +37,10 @@ findCNVs <- function(binned.data, ID=NULL, method='univariate', eps=0.1, init="s |
37 | 37 |
ptm <- proc.time() |
38 | 38 |
message("Find CNVs for ID = ",ID, ":") |
39 | 39 |
|
40 |
- if (method=='univariate') { |
|
41 |
- model <- univariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, strand=strand, states=states, most.frequent.state=most.frequent.state) |
|
40 |
+ if (method == 'univariate') { |
|
41 |
+ model <- univariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, strand=strand, states=states, most.frequent.state=most.frequent.state, algorithm=algorithm, initial.params=initial.params) |
|
42 |
+ } else if (method == 'bivariate') { |
|
43 |
+ model <- bivariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state, initial.params=initial.params) |
|
42 | 44 |
} |
43 | 45 |
|
44 | 46 |
attr(model, 'call') <- call |
... | ... |
@@ -70,8 +72,10 @@ findCNVs <- function(binned.data, ID=NULL, method='univariate', eps=0.1, init="s |
70 | 72 |
#' @param strand Run the HMM only for the specified strand. One of \code{c('+', '-', '*')}. |
71 | 73 |
#' @param states A subset or all of \code{c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")}. This vector defines the states that are used in the Hidden Markov Model. The order of the entries should not be changed. |
72 | 74 |
#' @param most.frequent.state One of the states that were given in \code{states} or 'none'. The specified state is assumed to be the most frequent one. This can help the fitting procedure to converge into the correct fit. |
75 |
+#' @param algorithm One of \code{c('baumWelch','EM')}. The expectation maximization (\code{'EM'}) will find the most likely states and fit the best parameters to the data, the \code{'baumWelch'} will find the most likely states using the initial parameters. |
|
76 |
+#' @param initial.params A \code{\link{aneuHMM}} object or file containing such an object from which initial starting parameters will be extracted. |
|
73 | 77 |
#' @return An \code{\link{aneuHMM}} object. |
74 |
-univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy") { |
|
78 |
+univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy", algorithm="EM", initial.params=NULL) { |
|
75 | 79 |
|
76 | 80 |
### Define cleanup behaviour ### |
77 | 81 |
on.exit(.C("R_univariate_cleanup")) |
... | ... |
@@ -94,6 +98,23 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
94 | 98 |
if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer") |
95 | 99 |
if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'") |
96 | 100 |
if (!most.frequent.state %in% c(states,'none')) stop("argument 'most.frequent.state' must be one of c(",paste(c(states,'none'), collapse=","),")") |
101 |
+ if (!algorithm %in% c('baumWelch','EM')) { |
|
102 |
+ stop("argument 'algorithm' expects one of c('baumWelch','EM')") |
|
103 |
+ } |
|
104 |
+ if (algorithm == 'baumWelch' & num.trials>1) { |
|
105 |
+ warning("Set 'num.trials <- 1' because 'algorithm==\"baumWelch\"'.") |
|
106 |
+ num.trials <- 1 |
|
107 |
+ } |
|
108 |
+ initial.params <- loadHmmsFromFiles(initial.params)[[1]] |
|
109 |
+ if (class(initial.params)!=class.univariate.hmm & !is.null(initial.params)) { |
|
110 |
+ stop("argument 'initial.params' expects a ",class.univariate.hmm," object or file that contains such an object") |
|
111 |
+ } |
|
112 |
+ if (algorithm == 'baumWelch' & is.null(initial.params)) { |
|
113 |
+ warning("'initial.params' should be specified if 'algorithm=\"baumWelch\"") |
|
114 |
+ } |
|
115 |
+ if (!is.null(initial.params)) { |
|
116 |
+ init <- 'initial.params' |
|
117 |
+ } |
|
97 | 118 |
|
98 | 119 |
warlist <- list() |
99 | 120 |
if (is.null(eps.try) | num.trials==1) eps.try <- eps |
... | ... |
@@ -102,6 +123,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
102 | 123 |
temp <- initializeStates(states) |
103 | 124 |
state.labels <- temp$labels |
104 | 125 |
state.distributions <- temp$distributions |
126 |
+ multiplicity <- temp$multiplicity |
|
105 | 127 |
dependent.states.mask <- state.labels %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy") |
106 | 128 |
numstates <- length(states) |
107 | 129 |
numbins <- length(binned.data) |
... | ... |
@@ -114,6 +136,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
114 | 136 |
select <- 'counts' |
115 | 137 |
} |
116 | 138 |
counts <- mcols(binned.data)[,select] |
139 |
+ algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM')) |
|
117 | 140 |
|
118 | 141 |
### Make return object |
119 | 142 |
result <- list() |
... | ... |
@@ -121,10 +144,10 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
121 | 144 |
result$ID <- ID |
122 | 145 |
result$bins <- binned.data |
123 | 146 |
## Quality info |
124 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins, 'complexity.preseqR'), bhattacharyya=NA) |
|
147 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA) |
|
125 | 148 |
result$qualityInfo <- qualityInfo |
126 | 149 |
|
127 |
- # Check if there are reads in the data, otherwise HMM will blow up |
|
150 |
+ # Check if there are counts in the data, otherwise HMM will blow up |
|
128 | 151 |
if (any(is.na(counts))) { |
129 | 152 |
stop(paste0("ID = ",ID,": NAs found in reads.")) |
130 | 153 |
} |
... | ... |
@@ -156,13 +179,24 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
156 | 179 |
message(paste0("Trial ",i_try," / ",num.trials)) |
157 | 180 |
|
158 | 181 |
## Initial parameters |
159 |
- if (init == 'random') { |
|
182 |
+ if (init == 'initial.params') { |
|
183 |
+ A.initial <- initial.params$transitionProbs |
|
184 |
+ proba.initial <- initial.params$startProbs |
|
185 |
+ size.initial <- initial.params$distributions[,'size'] |
|
186 |
+ prob.initial <- initial.params$distributions[,'prob'] |
|
187 |
+ size.initial[is.na(size.initial)] <- 0 |
|
188 |
+ prob.initial[is.na(prob.initial)] <- 0 |
|
189 |
+ } else if (init == 'random') { |
|
160 | 190 |
A.initial <- matrix(runif(numstates^2), ncol=numstates) |
161 | 191 |
A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/") |
162 | 192 |
proba.initial <- runif(numstates) |
163 | 193 |
# Distributions for dependent states |
164 | 194 |
size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask) |
165 | 195 |
prob.initial <- runif(1) * dependent.states.mask |
196 |
+ # Assign initials for the nullsomy distribution |
|
197 |
+ index <- which('nullsomy'==state.labels) |
|
198 |
+ size.initial[index] <- 1 |
|
199 |
+ prob.initial[index] <- 0.5 |
|
166 | 200 |
} else if (init == 'standard') { |
167 | 201 |
A.initial <- matrix(NA, ncol=numstates, nrow=numstates) |
168 | 202 |
for (irow in 1:numstates) { |
... | ... |
@@ -172,8 +206,9 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
172 | 206 |
} |
173 | 207 |
} |
174 | 208 |
proba.initial <- rep(1/numstates, numstates) |
175 |
- mean.initial.monosomy <- mean(counts[counts>0])/2 |
|
176 |
- var.initial.monosomy <- var(counts[counts>0])/2 |
|
209 |
+ divf <- max(multiplicity[most.frequent.state], 1) |
|
210 |
+ mean.initial.monosomy <- mean(counts[counts>0])/divf |
|
211 |
+ var.initial.monosomy <- var(counts[counts>0])/divf |
|
177 | 212 |
if (is.na(mean.initial.monosomy)) { |
178 | 213 |
mean.initial.monosomy <- 1 |
179 | 214 |
} |
... | ... |
@@ -197,11 +232,11 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
197 | 232 |
size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask]) |
198 | 233 |
prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask]) |
199 | 234 |
} |
235 |
+ # Assign initials for the nullsomy distribution |
|
236 |
+ index <- which('nullsomy'==state.labels) |
|
237 |
+ size.initial[index] <- 1 |
|
238 |
+ prob.initial[index] <- 0.5 |
|
200 | 239 |
} |
201 |
- # Assign initials for the nullsomy distribution |
|
202 |
- index <- which('nullsomy'==state.labels) |
|
203 |
- size.initial[index] <- 1 |
|
204 |
- prob.initial[index] <- 0.5 |
|
205 | 240 |
|
206 | 241 |
hmm <- .C("R_univariate_hmm", |
207 | 242 |
counts = as.integer(counts), # int* O |
... | ... |
@@ -226,7 +261,8 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
226 | 261 |
use.initial.params = as.logical(1), # bool* use_initial_params |
227 | 262 |
num.threads = as.integer(num.threads), # int* num_threads |
228 | 263 |
error = as.integer(0), # int* error (error handling) |
229 |
- count.cutoff = as.integer(count.cutoff) # int* count.cutoff |
|
264 |
+ count.cutoff = as.integer(count.cutoff), # int* count.cutoff |
|
265 |
+ algorithm = as.integer(algorithm) # int* algorithm |
|
230 | 266 |
) |
231 | 267 |
|
232 | 268 |
hmm$eps <- eps.try |
... | ... |
@@ -237,13 +273,6 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
237 | 273 |
# Store model in list |
238 | 274 |
hmm$counts <- NULL |
239 | 275 |
modellist[[as.character(i_try)]] <- hmm |
240 |
-# # Check if most.frequent.state is more than half of actual most frequent state and stop trials |
|
241 |
-# if (most.frequent.state %in% state.labels) { |
|
242 |
-# imostfrequent <- which(state.labels==most.frequent.state) |
|
243 |
-# if (hmm$weights[imostfrequent] / max(hmm$weights) > 0.5) { |
|
244 |
-# break |
|
245 |
-# } |
|
246 |
-# } |
|
247 | 276 |
init <- 'random' |
248 | 277 |
} else if (num.trials == 1) { |
249 | 278 |
if (hmm$loglik.delta > eps) { |
... | ... |
@@ -303,7 +332,8 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
303 | 332 |
use.initial.params = as.logical(1), # bool* use_initial_params |
304 | 333 |
num.threads = as.integer(num.threads), # int* num_threads |
305 | 334 |
error = as.integer(0), # int* error (error handling) |
306 |
- count.cutoff = as.integer(count.cutoff) # int* count.cutoff |
|
335 |
+ count.cutoff = as.integer(count.cutoff), # int* count.cutoff |
|
336 |
+ algorithm = as.integer(algorithm) |
|
307 | 337 |
) |
308 | 338 |
} |
309 | 339 |
|
... | ... |
@@ -346,9 +376,9 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
346 | 376 |
result$transitionProbs.initial <- transitionProbs.initial |
347 | 377 |
# Initial probs |
348 | 378 |
result$startProbs <- hmm$proba |
349 |
- names(result$startProbs) <- paste0("P(",state.labels,")") |
|
379 |
+ names(result$startProbs) <- state.labels |
|
350 | 380 |
result$startProbs.initial <- hmm$proba.initial |
351 |
- names(result$startProbs.initial) <- paste0("P(",state.labels,")") |
|
381 |
+ names(result$startProbs.initial) <- state.labels |
|
352 | 382 |
# Distributions |
353 | 383 |
distributions <- data.frame() |
354 | 384 |
distributions.initial <- data.frame() |
... | ... |
@@ -376,7 +406,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
376 | 406 |
convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error) |
377 | 407 |
result$convergenceInfo <- convergenceInfo |
378 | 408 |
## Quality info |
379 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins, 'complexity.preseqR'), bhattacharyya=qc.bhattacharyya(result)) |
|
409 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result)) |
|
380 | 410 |
result$qualityInfo <- qualityInfo |
381 | 411 |
} else if (hmm$error == 1) { |
382 | 412 |
warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
... | ... |
@@ -389,9 +419,426 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
389 | 419 |
## Issue warnings |
390 | 420 |
result$warnings <- warlist |
391 | 421 |
|
392 |
-# return(list(result, modellist)) |
|
393 | 422 |
## Return results |
394 | 423 |
return(result) |
395 | 424 |
} |
396 | 425 |
|
397 | 426 |
|
427 |
+#' Find copy number variations (bivariate) |
|
428 |
+#' |
|
429 |
+#' \code{bivariate.findCNVs} finds CNVs using read count information from both strands. |
|
430 |
+#' |
|
431 |
+#' @inheritParams univariate.findCNVs |
|
432 |
+#' @param allow.odd.states If set to \code{TRUE}, all states are allowed. If \code{FALSE}, only states which have an even multiplicity (plus nullsomy-monosomy states) are allowed, e.g. disomy-disomy, monosomy-trisomy. |
|
433 |
+bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, allow.odd.states=TRUE, states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy", algorithm='EM', initial.params=NULL) { |
|
434 |
+ |
|
435 |
+ ## Intercept user input |
|
436 |
+ if (class(binned.data) != 'GRanges') { |
|
437 |
+ binned.data <- get(load(binned.data)) |
|
438 |
+ if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'counts' or a file that contains such an object") |
|
439 |
+ } |
|
440 |
+ if (is.null(ID)) { |
|
441 |
+ ID <- attr(binned.data, 'ID') |
|
442 |
+ } |
|
443 |
+ if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric") |
|
444 |
+ if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer") |
|
445 |
+ if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer") |
|
446 |
+ if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer") |
|
447 |
+ if (!is.null(eps.try)) { |
|
448 |
+ if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric") |
|
449 |
+ } |
|
450 |
+ if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer") |
|
451 |
+ initial.params <- loadHmmsFromFiles(initial.params)[[1]] |
|
452 |
+ if (class(initial.params)!=class.bivariate.hmm & !is.null(initial.params)) { |
|
453 |
+ stop("argument 'initial.params' expects a ",class.bivariate.hmm," object or file that contains such an object") |
|
454 |
+ } |
|
455 |
+ if (algorithm == 'baumWelch' & is.null(initial.params)) { |
|
456 |
+ warning("'initial.params' should be specified if 'algorithm=\"baumWelch\"") |
|
457 |
+ } |
|
458 |
+ if (!is.null(initial.params)) { |
|
459 |
+ init <- 'initial.params' |
|
460 |
+ } |
|
461 |
+ |
|
462 |
+ warlist <- list() |
|
463 |
+ if (is.null(eps.try)) eps.try <- eps |
|
464 |
+ |
|
465 |
+ ## Variables |
|
466 |
+ num.bins <- length(binned.data) |
|
467 |
+ temp <- initializeStates(states) |
|
468 |
+ multiplicity <- temp$multiplicity |
|
469 |
+ state.labels <- temp$labels |
|
470 |
+ algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM')) |
|
471 |
+ |
|
472 |
+ ## Get counts |
|
473 |
+ select <- 'counts' |
|
474 |
+ counts <- matrix(c(mcols(binned.data)[,paste0('m',select)], mcols(binned.data)[,paste0('p',select)]), ncol=2, dimnames=list(bin=1:num.bins, strand=c('minus','plus'))) |
|
475 |
+ maxcounts <- max(counts) |
|
476 |
+ |
|
477 |
+ ## Filter high counts out, makes HMM faster |
|
478 |
+ count.cutoff <- quantile(counts, count.cutoff.quantile) |
|
479 |
+ names.count.cutoff <- names(count.cutoff) |
|
480 |
+ count.cutoff <- ceiling(count.cutoff) |
|
481 |
+ mask <- counts > count.cutoff |
|
482 |
+ counts[mask] <- count.cutoff |
|
483 |
+ numfiltered <- length(which(mask)) |
|
484 |
+ if (numfiltered > 0) { |
|
485 |
+ message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n")) |
|
486 |
+ } |
|
487 |
+ |
|
488 |
+ ### Make return object |
|
489 |
+ result <- list() |
|
490 |
+ class(result) <- class.bivariate.hmm |
|
491 |
+ result$ID <- ID |
|
492 |
+ result$bins <- binned.data |
|
493 |
+ ## Quality info |
|
494 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA) |
|
495 |
+ result$qualityInfo <- qualityInfo |
|
496 |
+ |
|
497 |
+ # Check if there are counts in the data, otherwise HMM will blow up |
|
498 |
+ if (!any(counts!=0)) { |
|
499 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done.")) |
|
500 |
+ result$warnings <- warlist |
|
501 |
+ return(result) |
|
502 |
+ } else if (any(counts<0)) { |
|
503 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done.")) |
|
504 |
+ result$warnings <- warlist |
|
505 |
+ return(result) |
|
506 |
+ } |
|
507 |
+ |
|
508 |
+ if (init=='initial.params') { |
|
509 |
+ uni.transitionProbs <- initial.params$univariateParams$transitionProbs |
|
510 |
+ uni.startProbs <- initial.params$univariateParams$startProbs |
|
511 |
+ distributions <- initial.params$distributions |
|
512 |
+ uni.weights <- initial.params$univariateParams$weights |
|
513 |
+ uni.states <- names(uni.weights) |
|
514 |
+ num.uni.states <- length(uni.states) |
|
515 |
+ num.models <- length(distributions) |
|
516 |
+ num.bins <- length(initial.params$bins) |
|
517 |
+ comb.states <- factor(names(initial.params$startProbs), levels=names(initial.params$startProbs)) |
|
518 |
+ num.comb.states <- length(comb.states) |
|
519 |
+ states.list <- list(minus=initial.params$bins$mstate, plus=initial.params$bins$pstate) |
|
520 |
+ comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states)) |
|
521 |
+ A.initial <- initial.params$transitionProbs |
|
522 |
+ proba.initial <- initial.params$startProbs |
|
523 |
+ use.initial <- TRUE |
|
524 |
+ } else { |
|
525 |
+ ### Stack the strands and run one univariate findCNVs |
|
526 |
+ message("") |
|
527 |
+ message(paste(rep('-',getOption('width')), collapse='')) |
|
528 |
+ message("Running univariate") |
|
529 |
+ binned.data.minus <- binned.data |
|
530 |
+ strand(binned.data.minus) <- '-' |
|
531 |
+ binned.data.minus$counts <- binned.data.minus$mcounts |
|
532 |
+ binned.data.plus <- binned.data |
|
533 |
+ strand(binned.data.plus) <- '+' |
|
534 |
+ binned.data.plus$counts <- binned.data.plus$pcounts |
|
535 |
+ binned.data.stacked <- c(binned.data.minus, binned.data.plus) |
|
536 |
+ mask.attributes <- c('qualityInfo', 'ID', 'min.mapq') |
|
537 |
+ attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes] |
|
538 |
+ |
|
539 |
+ model.stacked <- univariate.findCNVs(binned.data.stacked, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=1, states=states, most.frequent.state=most.frequent.state) |
|
540 |
+ model.minus <- model.stacked |
|
541 |
+ model.minus$bins <- model.minus$bins[strand(model.minus$bins)=='-'] |
|
542 |
+ model.minus$segments <- model.minus$segments[strand(model.minus$segments)=='-'] |
|
543 |
+ model.plus <- model.stacked |
|
544 |
+ model.plus$bins <- model.plus$bins[strand(model.plus$bins)=='+'] |
|
545 |
+ model.plus$segments <- model.plus$segments[strand(model.plus$segments)=='+'] |
|
546 |
+ |
|
547 |
+ models <- list(minus=model.minus, plus=model.plus) |
|
548 |
+ |
|
549 |
+ ## Extract counts and other stuff |
|
550 |
+ uni.transitionProbs <- lapply(models, '[[', 'transitionProbs')[[1]] |
|
551 |
+ uni.startProbs <- lapply(models, '[[', 'startProbs')[[1]] |
|
552 |
+ distributions <- lapply(models, '[[', 'distributions') |
|
553 |
+ uni.weights <- lapply(models, '[[', 'weights')[[1]] |
|
554 |
+ uni.states <- names(uni.weights) |
|
555 |
+ num.uni.states <- length(uni.states) |
|
556 |
+ num.models <- length(distributions) |
|
557 |
+ num.bins <- length(models[[1]]$bins) |
|
558 |
+ comb.states <- vector() |
|
559 |
+ levels.state <- levels(models[[1]]$bins$state) |
|
560 |
+ for (i1 in 1:length(levels.state)) { |
|
561 |
+ for (i2 in 1:length(levels.state)) { |
|
562 |
+ comb.state <- paste(levels.state[i1], levels.state[i2]) |
|
563 |
+ if (allow.odd.states) { |
|
564 |
+ comb.states[length(comb.states)+1] <- comb.state |
|
565 |
+ } else { |
|
566 |
+ state.multiplicity <- multiplicity[levels.state[i1]] + multiplicity[levels.state[i2]] |
|
567 |
+ # Only even states and null-mono states (for sex chromosomes) |
|
568 |
+ if (state.multiplicity %% 2 == 0 | state.multiplicity == 1) { |
|
569 |
+ comb.states[length(comb.states)+1] <- comb.state |
|
570 |
+ } |
|
571 |
+ } |
|
572 |
+ } |
|
573 |
+ } |
|
574 |
+ comb.states <- factor(comb.states, levels=comb.states) |
|
575 |
+ num.comb.states <- length(comb.states) |
|
576 |
+ states.list <- lapply(models, function(x) { x$bins$state }) |
|
577 |
+ comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states)) |
|
578 |
+ A.initial <- double(length=num.comb.states*num.comb.states) |
|
579 |
+ proba.initial <- double(length=num.comb.states) |
|
580 |
+ use.initial <- FALSE |
|
581 |
+ } |
|
582 |
+ |
|
583 |
+ ### Prepare the multivariate HMM |
|
584 |
+ message("") |
|
585 |
+ message(paste(rep('-',getOption('width')), collapse='')) |
|
586 |
+ message("Preparing bivariate HMM\n") |
|
587 |
+ |
|
588 |
+ ## Pre-compute z-values for each number of counts |
|
589 |
+ message("Computing pre z-matrix...", appendLF=F); ptm <- proc.time() |
|
590 |
+ z.per.count <- array(NA, dim=c(maxcounts+1, num.models, num.uni.states), dimnames=list(counts=0:maxcounts, strand=names(distributions), state=uni.states)) |
|
591 |
+ xcounts <- 0:maxcounts |
|
592 |
+ for (istrand in 1:num.models) { |
|
593 |
+ for (istate in 1:num.uni.states) { |
|
594 |
+ if (distributions[[istrand]][istate,'type']=='dnbinom') { |
|
595 |
+ size <- distributions[[istrand]][istate,'size'] |
|
596 |
+ prob <- distributions[[istrand]][istate,'prob'] |
|
597 |
+ u <- pnbinom(xcounts, size, prob) |
|
598 |
+ } else if (distributions[[istrand]][istate,'type']=='delta') { |
|
599 |
+ u <- rep(1, length(xcounts)) |
|
600 |
+ } else if (distributions[[istrand]][istate,'type']=='dgeom') { |
|
601 |
+ prob <- distributions[[istrand]][istate,'prob'] |
|
602 |
+ u <- pgeom(xcounts, prob) |
|
603 |
+ } |
|
604 |
+ qnorm_u <- qnorm(u) |
|
605 |
+ mask <- qnorm_u==Inf |
|
606 |
+ qnorm_u[mask] <- qnorm(1-1e-16) |
|
607 |
+ z.per.count[ , istrand, istate] <- qnorm_u |
|
608 |
+ } |
|
609 |
+ } |
|
610 |
+ time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
611 |
+ |
|
612 |
+ ## Compute the z matrix |
|
613 |
+ message("Transfering values into z-matrix...", appendLF=F); ptm <- proc.time() |
|
614 |
+ z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(distributions), state=uni.states)) |
|
615 |
+ for (istrand in 1:num.models) { |
|
616 |
+ for (istate in 1:num.uni.states) { |
|
617 |
+ z.per.bin[ , istrand, istate] = z.per.count[counts[,istrand]+1, istrand, istate] |
|
618 |
+ } |
|
619 |
+ } |
|
620 |
+ remove(z.per.count) |
|
621 |
+ time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
622 |
+ |
|
623 |
+ ## Calculate correlation matrix |
|
624 |
+ message("Computing inverse of correlation matrix...", appendLF=F); ptm <- proc.time() |
|
625 |
+ correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(distributions), strand=names(distributions), comb.state=comb.states)) |
|
626 |
+ correlationMatrixInverse <- correlationMatrix |
|
627 |
+ determinant <- rep(0, num.comb.states) |
|
628 |
+ names(determinant) <- comb.states |
|
629 |
+ usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states |
|
630 |
+ names(usestateTF) <- comb.states |
|
631 |
+ for (comb.state in comb.states) { |
|
632 |
+ state <- strsplit(comb.state, ' ')[[1]] |
|
633 |
+ mask <- which(comb.states.per.bin==comb.state) |
|
634 |
+ # Subselect z |
|
635 |
+ z.temp <- matrix(NA, ncol=num.models, nrow=length(mask)) |
|
636 |
+ for (istrand in 1:num.models) { |
|
637 |
+ z.temp[,istrand] <- z.per.bin[mask, istrand, state[istrand]] |
|
638 |
+ } |
|
639 |
+ temp <- tryCatch({ |
|
640 |
+ if (nrow(z.temp) > 1) { |
|
641 |
+ correlationMatrix[,,comb.state] <- cor(z.temp) |
|
642 |
+ determinant[comb.state] <- det( correlationMatrix[,,comb.state] ) |
|
643 |
+ correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state]) |
|
644 |
+ usestateTF[comb.state] <- TRUE |
|
645 |
+ } else { |
|
646 |
+ correlationMatrix[,,comb.state] <- diag(num.models) |
|
647 |
+ determinant[comb.state] <- det( correlationMatrix[,,comb.state] ) |
|
648 |
+ correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state]) |
|
649 |
+ usestateTF[comb.state] <- TRUE |
|
650 |
+ } |
|
651 |
+ }, warning = function(war) { |
|
652 |
+ correlationMatrix[,,comb.state] <<- diag(num.models) |
|
653 |
+ determinant[comb.state] <<- det( correlationMatrix[,,comb.state] ) |
|
654 |
+ correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state]) |
|
655 |
+ usestateTF[comb.state] <<- TRUE |
|
656 |
+ war |
|
657 |
+ }, error = function(err) { |
|
658 |
+ correlationMatrix[,,comb.state] <<- diag(num.models) |
|
659 |
+ determinant[comb.state] <<- det( correlationMatrix[,,comb.state] ) |
|
660 |
+ correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state]) |
|
661 |
+ usestateTF[comb.state] <<- TRUE |
|
662 |
+ err |
|
663 |
+ }) |
|
664 |
+ } |
|
665 |
+ time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
666 |
+ |
|
667 |
+ # Use only states with valid correlationMatrixInverse (all states are valid in this version) |
|
668 |
+ correlationMatrix = correlationMatrix[,,usestateTF] |
|
669 |
+ correlationMatrixInverse = correlationMatrixInverse[,,usestateTF] |
|
670 |
+ comb.states = comb.states[usestateTF] |
|
671 |
+ comb.states <- droplevels(comb.states) |
|
672 |
+ determinant = determinant[usestateTF] |
|
673 |
+ num.comb.states <- length(comb.states) |
|
674 |
+ |
|
675 |
+ ### Calculate multivariate densities for each state ### |
|
676 |
+ message("Calculating multivariate densities...", appendLF=F); ptm <- proc.time() |
|
677 |
+ densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states)) |
|
678 |
+ for (comb.state in comb.states) { |
|
679 |
+ istate <- which(comb.state==comb.states) |
|
680 |
+ state <- strsplit(comb.state, ' ')[[1]] |
|
681 |
+ z.temp <- matrix(NA, ncol=num.models, nrow=num.bins) |
|
682 |
+ product <- 1 |
|
683 |
+ for (istrand in 1:num.models) { |
|
684 |
+ z.temp[,istrand] <- z.per.bin[, istrand, state[istrand]] |
|
685 |
+ if (distributions[[istrand]][state[istrand],'type'] == 'dnbinom') { |
|
686 |
+ size <- distributions[[istrand]][state[istrand],'size'] |
|
687 |
+ prob <- distributions[[istrand]][state[istrand],'prob'] |
|
688 |
+ product <- product * dnbinom(counts[,istrand], size, prob) |
|
689 |
+ } else if (distributions[[istrand]][state[istrand],'type'] == 'dgeom') { |
|
690 |
+ prob <- distributions[[istrand]][state[istrand],'prob'] |
|
691 |
+ product <- product * dgeom(counts[,istrand], prob) |
|
692 |
+ } else if (distributions[[istrand]][state[istrand],'type'] == 'delta') { |
|
693 |
+ product <- product * ifelse(counts[,istrand]==0, 1, 0) |
|
694 |
+ } |
|
695 |
+ } |
|
696 |
+ exponent <- -0.5 * apply( ( z.temp %*% (correlationMatrixInverse[ , , istate] - diag(num.models)) ) * z.temp, 1, sum) |
|
697 |
+ densities[,istate] <- product * determinant[istate]^(-0.5) * exp( exponent ) |
|
698 |
+ } |
|
699 |
+ # Check if densities are > 1 |
|
700 |
+# if (any(densities>1)) stop("Densities > 1") |
|
701 |
+# if (any(densities<0)) stop("Densities < 0") |
|
702 |
+ densities[densities>1] <- 1 |
|
703 |
+ densities[densities<0] <- 0 |
|
704 |
+ # Check if densities are 0 everywhere in some bins |
|
705 |
+ check <- which(apply(densities, 1, sum) == 0) |
|
706 |
+ if (length(check)>0) { |
|
707 |
+ if (check[1]==1) { |
|
708 |
+ densities[1,] <- rep(1e-10, ncol(densities)) |
|
709 |
+ check <- check[-1] |
|
710 |
+ } |
|
711 |
+ for (icheck in check) { |
|
712 |
+ densities[icheck,] <- densities[icheck-1,] |
|
713 |
+ } |
|
714 |
+ } |
|
715 |
+ time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
716 |
+ |
|
717 |
+ ### Define cleanup behaviour ### |
|
718 |
+ on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states))) |
|
719 |
+ |
|
720 |
+ ### Run the multivariate HMM |
|
721 |
+ # Call the C function |
|
722 |
+ hmm <- .C("R_multivariate_hmm", |
|
723 |
+ densities = as.double(densities), # double* D |
|
724 |
+ num.bins = as.integer(num.bins), # int* T |
|
725 |
+ num.comb.states = as.integer(num.comb.states), # int* N |
|
726 |
+ num.strands = as.integer(num.models), # int* Nmod |
|
727 |
+ comb.states = as.integer(comb.states), # int* comb_states |
|
728 |
+ num.iterations = as.integer(max.iter), # int* maxiter |
|
729 |
+ time.sec = as.integer(max.time), # double* maxtime |
|
730 |
+ loglik.delta = as.double(eps), # double* eps |
|
731 |
+ states = integer(length=num.bins), # int* states |
|
732 |
+ A = double(length=num.comb.states*num.comb.states), # double* A |
|
733 |
+ proba = double(length=num.comb.states), # double* proba |
|
734 |
+ loglik = double(length=1), # double* loglik |
|
735 |
+ A.initial = as.vector(A.initial), # double* initial_A |
|
736 |
+ proba.initial = as.vector(proba.initial), # double* initial_proba |
|
737 |
+ use.initial.params = as.logical(use.initial), # bool* use_initial_params |
|
738 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
739 |
+ error = as.integer(0), # error handling |
|
740 |
+ algorithm = as.integer(algorithm) # int* algorithm |
|
741 |
+ ) |
|
742 |
+ |
|
743 |
+ ### Check convergence ### |
|
744 |
+ war <- NULL |
|
745 |
+ if (hmm$loglik.delta > eps) { |
|
746 |
+ war <- warning(paste0("ID = ",ID,": HMM did not converge!\n")) |
|
747 |
+ } |
|
748 |
+ if (hmm$error == 1) { |
|
749 |
+ warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
|
750 |
+ } else if (hmm$error == 2) { |
|
751 |
+ warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library")) |
|
752 |
+ } |
|
753 |
+ |
|
754 |
+ ### Make return object ### |
|
755 |
+ if (hmm$error == 0) { |
|
756 |
+ result <- list() |
|
757 |
+ class(result) <- class.bivariate.hmm |
|
758 |
+ result$ID <- ID |
|
759 |
+ ## Bin coordinates and states |
|
760 |
+ result$bins <- binned.data |
|
761 |
+ result$bins$state <- comb.states[hmm$states] |
|
762 |
+ # Get states as factors in data.frame |
|
763 |
+ matrix.states <- matrix(unlist(strsplit(as.character(result$bins$state), split=' ')), byrow=T, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(distributions))) |
|
764 |
+ names <- c('mstate','pstate') |
|
765 |
+ for (i1 in 1:num.models) { |
|
766 |
+ mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=uni.states) |
|
767 |
+ } |
|
768 |
+ ## Segmentation |
|
769 |
+ message("Making segmentation ...", appendLF=F); ptm <- proc.time() |
|
770 |
+ gr <- result$bins |
|
771 |
+ red.gr.list <- GRangesList() |
|
772 |
+ for (state in comb.states) { |
|
773 |
+ red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
774 |
+ if (length(red.gr)>0) { |
|
775 |
+ mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr)) |
|
776 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
777 |
+ } |
|
778 |
+ } |
|
779 |
+ red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
780 |
+ result$segments <- red.gr |
|
781 |
+ seqlengths(result$segments) <- seqlengths(result$bins) |
|
782 |
+ time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
783 |
+ ## CNV state for both strands combined |
|
784 |
+ # Bins |
|
785 |
+ state <- multiplicity[result$bins$mstate] + multiplicity[result$bins$pstate] |
|
786 |
+ state[state>max(multiplicity)] <- max(multiplicity) |
|
787 |
+ multiplicity.inverse <- names(multiplicity) |
|
788 |
+ names(multiplicity.inverse) <- multiplicity |
|
789 |
+ state <- multiplicity.inverse[as.character(state)] |
|
790 |
+ state[(result$bins$mstate=='nullsomy' | result$bins$pstate=='nullsomy') & state=='zero-inflation'] <- 'nullsomy' |
|
791 |
+ result$bins$state <- factor(state, levels=names(multiplicity)) |
|
792 |
+ # Segments |
|
793 |
+ str <- strsplit(as.character(result$segments$state),' ') |
|
794 |
+ result$segments$mstate <- factor(unlist(lapply(str, '[[', 1)), levels=state.labels) |
|
795 |
+ result$segments$pstate <- factor(unlist(lapply(str, '[[', 2)), levels=state.labels) |
|
796 |
+ state <- multiplicity[result$segments$mstate] + multiplicity[result$segments$pstate] |
|
797 |
+ state[state>max(multiplicity)] <- max(multiplicity) |
|
798 |
+ multiplicity.inverse <- names(multiplicity) |
|
799 |
+ names(multiplicity.inverse) <- multiplicity |
|
800 |
+ state <- multiplicity.inverse[as.character(state)] |
|
801 |
+ state[(result$segments$mstate=='nullsomy' | result$segments$pstate=='nullsomy') & state=='zero-inflation'] <- 'nullsomy' |
|
802 |
+ result$segments$state <- factor(state, levels=names(multiplicity)) |
|
803 |
+ ## Parameters |
|
804 |
+ # Weights |
|
805 |
+ tstates <- table(result$bins$state) |
|
806 |
+ result$weights <- tstates/sum(tstates) |
|
807 |
+ # Transition matrices |
|
808 |
+ result$transitionProbs <- matrix(hmm$A, ncol=num.comb.states) |
|
809 |
+ colnames(result$transitionProbs) <- comb.states |
|
810 |
+ rownames(result$transitionProbs) <- comb.states |
|
811 |
+ result$transitionProbs.initial <- matrix(hmm$A.initial, ncol=num.comb.states) |
|
812 |
+ colnames(result$transitionProbs.initial) <- comb.states |
|
813 |
+ rownames(result$transitionProbs.initial) <- comb.states |
|
814 |
+ # Initial probs |
|
815 |
+ result$startProbs <- hmm$proba |
|
816 |
+ names(result$startProbs) <- comb.states |
|
817 |
+ result$startProbs.initial <- hmm$proba.initial |
|
818 |
+ names(result$startProbs.initial) <- comb.states |
|
819 |
+ # Distributions |
|
820 |
+ result$distributions <- distributions |
|
821 |
+ ## Convergence info |
|
822 |
+ convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec) |
|
823 |
+ result$convergenceInfo <- convergenceInfo |
|
824 |
+ ## Quality info |
|
825 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result)) |
|
826 |
+ result$qualityInfo <- qualityInfo |
|
827 |
+ ## Univariate infos |
|
828 |
+ univariateParams <- list(transitionProbs=uni.transitionProbs, startProbs=uni.startProbs, distributions=distributions[[1]], weights=uni.weights) |
|
829 |
+ result$univariateParams <- univariateParams |
|
830 |
+ } else if (hmm$error == 1) { |
|
831 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
|
832 |
+ } else if (hmm$error == 2) { |
|
833 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library")) |
|
834 |
+ } |
|
835 |
+ |
|
836 |
+ ## Issue warnings |
|
837 |
+ result$warnings <- warlist |
|
838 |
+ |
|
839 |
+ ## Return results |
|
840 |
+ return(result) |
|
841 |
+} |
|
842 |
+ |
|
843 |
+ |
|
844 |
+ |
... | ... |
@@ -4,10 +4,11 @@ |
4 | 4 |
#' |
5 | 5 |
#' \code{findSCEs} classifies the binned read counts into several states which represent the number of chromatids on each strand. |
6 | 6 |
#' |
7 |
-#' \code{findSCEs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission densitiy (only zero read counts), 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper |
|
7 |
+#' \code{findSCEs} uses a Hidden Markov Model to classify the binned read counts: state 'zero-inflation' with a delta function as emission densitiy (only zero read counts), 'nullsomy' with geometric distribution, 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A expectation-maximization (EM) algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper |
|
8 | 8 |
#' @author Aaron Taudt |
9 |
-#' @inheritParams univariate.findSCEs |
|
10 |
-#' @inheritParams bivariate.findSCEs |
|
9 |
+#' @inheritParams univariate.findCNVs |
|
10 |
+#' @inheritParams bivariate.findCNVs |
|
11 |
+#' @inheritParams getSCEcoordinates |
|
11 | 12 |
#' @return An \code{\link{aneuBiHMM}} object. |
12 | 13 |
#' @examples |
13 | 14 |
#'## Get an example BAM file with single-cell-sequencing reads |
... | ... |
@@ -19,7 +20,7 @@ |
19 | 20 |
#'## Check the fit |
20 | 21 |
#'plot(model, type='histogram') |
21 | 22 |
#' @export |
22 |
-findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', allow.odd.states=TRUE, states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy") { |
|
23 |
+findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=5, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', allow.odd.states=TRUE, states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy", algorithm="EM", initial.params=NULL, resolution=c(3,6), min.segwidth=2, fragments=NULL, min.reads=50) { |
|
23 | 24 |
|
24 | 25 |
## Intercept user input |
25 | 26 |
if (class(binned.data) != 'GRanges') { |
... | ... |
@@ -38,7 +39,23 @@ findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1 |
38 | 39 |
ptm <- proc.time() |
39 | 40 |
message("Find CNVs for ID = ",ID, ":") |
40 | 41 |
|
41 |
- model <- bivariate.findSCEs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state) |
|
42 |
+ model <- bivariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state, algorithm=algorithm, initial.params=initial.params) |
|
43 |
+ model$sce <- getSCEcoordinates(model, resolution=resolution, min.segwidth=min.segwidth, fragments=fragments, min.reads=min.reads) |
|
44 |
+ |
|
45 |
+# ## Find CNV calls for offset counts using the parameters from the normal run |
|
46 |
+# offsets <- setdiff(names(attr(binned.data,'offset.counts')), 0) |
|
47 |
+# if (!is.null(offsets)) { |
|
48 |
+# offset.models <- list() |
|
49 |
+# for (ioff in offsets) { |
|
50 |
+# message(paste0("Finding SCE for offset ",ioff)) |
|
51 |
+# off.counts <- attr(binned.data,'offset.counts')[[as.character(ioff)]] |
|
52 |
+# off.binned.data <- binned.data |
|
53 |
+# mcols(off.binned.data)[names(mcols(binned.data)) %in% names(off.counts)] <- as(off.counts, 'DataFrame') |
|
54 |
+# off.model <- suppressMessages( bivariate.findCNVs(off.binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state, algorithm='baumWelch', initial.params=model) ) |
|
55 |
+# offset.models[[as.character(ioff)]] <- off.model |
|
56 |
+# } |
|
57 |
+# } |
|
58 |
+# return(offset.models) |
|
42 | 59 |
|
43 | 60 |
attr(model, 'call') <- call |
44 | 61 |
time <- proc.time() - ptm |
... | ... |
@@ -48,740 +65,6 @@ findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1 |
48 | 65 |
} |
49 | 66 |
|
50 | 67 |
|
51 |
-#' Find copy number variations (univariate) |
|
52 |
-#' |
|
53 |
-#' \code{univariate.findSCEs} classifies the input binned read counts into several states which represent copy-number-variation. |
|
54 |
-#' |
|
55 |
-#' \code{univariate.findSCEs} is almost the same as \code{findCNVs}. However, it has slighlty different parameter settings and variable preparations because it is used by \code{bivariate.findSCEs} for CNV calling on both strands separately. |
|
56 |
-#' |
|
57 |
-#' @param binned.data A \link{GRanges} object with binned read counts. |
|
58 |
-#' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example. |
|
59 |
-#' @param eps Convergence threshold for the Baum-Welch algorithm. |
|
60 |
-#' @param init One of the following initialization procedures: |
|
61 |
-#' \describe{ |
|
62 |
-#' \item{\code{standard}}{The negative binomial of state 'disomy' will be initialized with \code{mean=mean(counts)}, \code{var=var(counts)}. This procedure usually gives good convergence.} |
|
63 |
-#' \item{\code{random}}{Mean and variance of the negative binomial of state 'disomy' will be initialized with random values (in certain boundaries, see source code). Try this if the \code{standard} procedure fails to produce a good fit.} |
|
64 |
-#' } |
|
65 |
-#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. The default -1 is no limit. |
|
66 |
-#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. The default -1 is no limit. |
|
67 |
-#' @param num.trials The number of trials to find a fit where state \code{most.frequent.state} is most frequent. Each time, the HMM is seeded with different random initial values. |
|
68 |
-#' @param eps.try If code num.trials is set to greater than 1, \code{eps.try} is used for the trial runs. If unset, \code{eps} is used. |
|
69 |
-#' @param num.threads Number of threads to use. Setting this to >1 may give increased performance. |
|
70 |
-#' @param count.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. Set \code{count.cutoff.quantile=1} in this case. |
|
71 |
-#' @param strand Run the HMM only for the specified strand. One of \code{c('+', '-', '*')}. |
|
72 |
-#' @param states A subset or all of \code{c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")}. This vector defines the states that are used in the Hidden Markov Model. The order of the entries should not be changed. |
|
73 |
-#' @param most.frequent.state One of the states that were given in \code{states} or 'none'. The specified state is assumed to be the most frequent one. This can help the fitting procedure to converge into the correct fit. If set to 'none', no state is assumed to be most frequent. |
|
74 |
-#' @return An \code{\link{aneuHMM}} object. |
|
75 |
-univariate.findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy") { |
|
76 |
- |
|
77 |
- ### Define cleanup behaviour ### |
|
78 |
- on.exit(.C("R_univariate_cleanup")) |
|
79 |
- |
|
80 |
- ## Intercept user input |
|
81 |
- if (class(binned.data) != 'GRanges') { |
|
82 |
- binned.data <- get(load(binned.data)) |
|
83 |
- if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'counts' or a file that contains such an object") |
|
84 |
- } |
|
85 |
- if (is.null(ID)) { |
|
86 |
- ID <- attr(binned.data, 'ID') |
|
87 |
- } |
|
88 |
- if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric") |
|
89 |
- if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer") |
|
90 |
- if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer") |
|
91 |
- if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer") |
|
92 |
- if (!is.null(eps.try)) { |
|
93 |
- if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric") |
|
94 |
- } |
|
95 |
- if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer") |
|
96 |
- if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'") |
|
97 |
- if (!most.frequent.state %in% c(states,'none')) stop("argument 'most.frequent.state' must be one of c(",paste(c(states,'none'), collapse=","),")") |
|
98 |
- |
|
99 |
- warlist <- list() |
|
100 |
- if (is.null(eps.try) | num.trials==1) eps.try <- eps |
|
101 |
- |
|
102 |
- ## Assign variables |
|
103 |
- temp <- initializeStates(states) |
|
104 |
- state.labels <- temp$labels |
|
105 |
- state.distributions <- temp$distributions |
|
106 |
- dependent.states.mask <- state.labels %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
107 |
- numstates <- length(states) |
|
108 |
- numbins <- length(binned.data) |
|
109 |
- iniproc <- which(init==c("standard","random")) # transform to int |
|
110 |
- if (strand=='+') { |
|
111 |
- select <- 'pcounts' |
|
112 |
- } else if (strand=='-') { |
|
113 |
- select <- 'mcounts' |
|
114 |
- } else if (strand=='*') { |
|
115 |
- select <- 'counts' |
|
116 |
- } |
|
117 |
- counts <- mcols(binned.data)[,select] |
|
118 |
- |
|
119 |
- ### Make return object |
|
120 |
- result <- list() |
|
121 |
- class(result) <- class.univariate.hmm |
|
122 |
- result$ID <- ID |
|
123 |
- result$bins <- binned.data |
|
124 |
- ## Quality info |
|
125 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins, 'complexity.preseqR'), bhattacharyya=NA) |
|
126 |
- result$qualityInfo <- qualityInfo |
|
127 |
- |
|
128 |
- # Check if there are counts in the data, otherwise HMM will blow up |
|
129 |
- if (any(is.na(counts))) { |
|
130 |
- stop(paste0("ID = ",ID,": NAs found in reads.")) |
|
131 |
- } |
|
132 |
- if (!any(counts!=0)) { |
|
133 |
- warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done.")) |
|
134 |
- result$warnings <- warlist |
|
135 |
- return(result) |
|
136 |
- } else if (any(counts<0)) { |
|
137 |
- warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done.")) |
|
138 |
- result$warnings <- warlist |
|
139 |
- return(result) |
|
140 |
- } |
|
141 |
- |
|
142 |
- |
|
143 |
- # Filter high counts out, makes HMM faster |
|
144 |
- count.cutoff <- quantile(counts, count.cutoff.quantile) |
|
145 |
- names.count.cutoff <- names(count.cutoff) |
|
146 |
- count.cutoff <- ceiling(count.cutoff) |
|
147 |
- mask <- counts > count.cutoff |
|
148 |
- counts[mask] <- count.cutoff |
|
149 |
- numfiltered <- length(which(mask)) |
|
150 |
- if (numfiltered > 0) { |
|
151 |
- message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n")) |
|
152 |
- } |
|
153 |
- |
|
154 |
- ## Call univariate in a for loop to enable multiple trials |
|
155 |
- modellist <- list() |
|
156 |
- for (i_try in 1:num.trials) { |
|
157 |
- message(paste0("Trial ",i_try," / ",num.trials)) |
|
158 |
- |
|
159 |
- ## Initial parameters |
|
160 |
- if (init == 'random') { |
|
161 |
- A.initial <- matrix(runif(numstates^2), ncol=numstates) |
|
162 |
- A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/") |
|
163 |
- proba.initial <- runif(numstates) |
|
164 |
- # Distributions for dependent states |
|
165 |
- size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask) |
|
166 |
- prob.initial <- runif(1) * dependent.states.mask |
|
167 |
- } else if (init == 'standard') { |
|
168 |
- A.initial <- matrix(NA, ncol=numstates, nrow=numstates) |
|
169 |
- for (irow in 1:numstates) { |
|
170 |
- for (icol in 1:numstates) { |
|
171 |
- if (irow==icol) { A.initial[irow,icol] <- 0.9 } |
|
172 |
- else { A.initial[irow,icol] <- 0.1/(numstates-1) } |
|
173 |
- } |
|
174 |
- } |
|
175 |
- proba.initial <- rep(1/numstates, numstates) |
|
176 |
- mean.initial.monosomy <- mean(counts[counts>0]) |
|
177 |
- var.initial.monosomy <- var(counts[counts>0]) |
|
178 |
- if (is.na(mean.initial.monosomy)) { |
|
179 |
- mean.initial.monosomy <- 1 |
|
180 |
- } |
|
181 |
- if (is.na(var.initial.monosomy)) { |
|
182 |
- var.initial.monosomy <- mean.initial.monosomy + 1 |
|
183 |
- } |
|
184 |
- if (mean.initial.monosomy >= var.initial.monosomy) { |
|
185 |
- mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask) |
|
186 |
- var.initial <- (mean.initial.monosomy+1) * cumsum(dependent.states.mask) |
|
187 |
- size.initial <- rep(0,numstates) |
|
188 |
- prob.initial <- rep(0,numstates) |
|
189 |
- mask <- dependent.states.mask |
|
190 |
- size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask]) |
|
191 |
- prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask]) |
|
192 |
- } else { |
|
193 |
- mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask) |
|
194 |
- var.initial <- var.initial.monosomy * cumsum(dependent.states.mask) |
|
195 |
- size.initial <- rep(0,numstates) |
|
196 |
- prob.initial <- rep(0,numstates) |
|
197 |
- mask <- dependent.states.mask |
|
198 |
- size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask]) |
|
199 |
- prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask]) |
|
200 |
- } |
|
201 |
- } |
|
202 |
- # Assign initials for the nullsomy distribution |
|
203 |