git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/chromstaR@126431 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: chromstaR |
2 | 2 |
Type: Package |
3 | 3 |
Title: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data |
4 |
-Version: 1.1.1 |
|
4 |
+Version: 1.1.2 |
|
5 | 5 |
Date: 2016-10-19 |
6 | 6 |
Author: Aaron Taudt, Maria Colome Tatche, Matthias Heinig, Minh Anh Nguyen |
7 | 7 |
Maintainer: Aaron Taudt <aaron.taudt@gmail.com> |
... | ... |
@@ -25,7 +25,8 @@ Imports: |
25 | 25 |
reshape2, |
26 | 26 |
Rsamtools, |
27 | 27 |
GenomicAlignments, |
28 |
- bamsignals |
|
28 |
+ bamsignals, |
|
29 |
+ mvtnorm |
|
29 | 30 |
Suggests: |
30 | 31 |
knitr, |
31 | 32 |
BiocStyle, |
... | ... |
@@ -36,5 +37,5 @@ License: Artistic-2.0 |
36 | 37 |
LazyLoad: yes |
37 | 38 |
VignetteBuilder: knitr |
38 | 39 |
Packaged: 2014-05-03 00:00:00 CET+1; Taudt |
39 |
-biocViews: Software, DifferentialPeakCalling, HiddenMarkovModel, ChIPSeq, MultipleComparison |
|
40 |
+biocViews: Software, DifferentialPeakCalling, HiddenMarkovModel, ChIPSeq, HistoneModification, MultipleComparison |
|
40 | 41 |
RoxygenNote: 5.0.1 |
... | ... |
@@ -1,5 +1,8 @@ |
1 | 1 |
# Generated by roxygen2: do not edit by hand |
2 | 2 |
|
3 |
+S3method(print,combinedMultiHMM) |
|
4 |
+S3method(print,multiHMM) |
|
5 |
+S3method(print,uniHMM) |
|
3 | 6 |
export(Chromstar) |
4 | 7 |
export(bin2dec) |
5 | 8 |
export(binReads) |
... | ... |
@@ -7,6 +10,7 @@ export(callPeaksMultivariate) |
7 | 10 |
export(callPeaksReplicates) |
8 | 11 |
export(callPeaksUnivariate) |
9 | 12 |
export(changeFDR) |
13 |
+export(changePostCutoff) |
|
10 | 14 |
export(collapseBins) |
11 | 15 |
export(combineMultivariates) |
12 | 16 |
export(dec2bin) |
... | ... |
@@ -31,6 +35,7 @@ export(plotHistogram) |
31 | 35 |
export(readBamFileAsGRanges) |
32 | 36 |
export(readBedFileAsGRanges) |
33 | 37 |
export(readCustomBedFile) |
38 |
+export(removeCondition) |
|
34 | 39 |
export(stateBrewer) |
35 | 40 |
export(transitionFrequencies) |
36 | 41 |
export(unis2pseudomulti) |
... | ... |
@@ -57,9 +62,11 @@ importFrom(S4Vectors,runmean) |
57 | 62 |
importFrom(S4Vectors,subjectHits) |
58 | 63 |
importFrom(bamsignals,bamCount) |
59 | 64 |
importFrom(grDevices,col2rgb) |
65 |
+importFrom(grDevices,rgb2hsv) |
|
60 | 66 |
importFrom(graphics,hist) |
61 | 67 |
importFrom(methods,as) |
62 | 68 |
importFrom(methods,is) |
69 |
+importFrom(mvtnorm,rmvnorm) |
|
63 | 70 |
importFrom(reshape2,melt) |
64 | 71 |
importFrom(stats,aggregate) |
65 | 72 |
importFrom(stats,cutree) |
... | ... |
@@ -1,3 +1,15 @@ |
1 |
+CHANGES IN VERSION 1.1.2 |
|
2 |
+------------------------ |
|
3 |
+ |
|
4 |
+NEW FEATURES |
|
5 |
+ |
|
6 |
+ o Proper print() methods for all objects. |
|
7 |
+ |
|
8 |
+BUG FIXES |
|
9 |
+ |
|
10 |
+ o Fixed a bug where chromosomes with a single bin where making problems. |
|
11 |
+ |
|
12 |
+ |
|
1 | 13 |
CHANGES IN VERSION 1.1.1 |
2 | 14 |
------------------------ |
3 | 15 |
|
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-#' Wrapper function for the \code{\link{chromstaR}} package |
|
1 |
+#' Wrapper function for the \pkg{\link{chromstaR}} package |
|
2 | 2 |
#' |
3 | 3 |
#' This function performs \code{\link[chromstaR:binReads]{binning}}, \code{\link[chromstaR:callPeaksUnivariate]{univariate peak calling}} and \code{\link[chromstaR:callPeaksMultivariate]{multivariate peak calling}} from a list of input files. |
4 | 4 |
#' |
... | ... |
@@ -7,7 +7,7 @@ |
7 | 7 |
#' Convert aligned reads from .bam or .bed(.gz) files into read counts in equidistant windows (bins). This function uses \code{\link[GenomicRanges]{countOverlaps}} to calculate the read counts, or alternatively \code{\link[bamsignals]{bamProfile}} if option \code{use.bamsignals} is set (only effective for .bam files). |
8 | 8 |
#' |
9 | 9 |
#' @aliases binning |
10 |
-#' @param file A file with aligned reads. Alternatively a \code{\link{GRanges}} with aligned reads if format is set to 'GRanges'. |
|
10 |
+#' @param file A file with aligned reads. Alternatively a \code{\link{GRanges}} with aligned reads. |
|
11 | 11 |
#' @param experiment.table An \code{\link{experiment.table}} containing the supplied \code{file}. This is necessary to uniquely identify the file in later steps of the workflow. Set to \code{NULL} if you don't have it (not recommended). |
12 | 12 |
#' @inheritParams readBamFileAsGRanges |
13 | 13 |
#' @inheritParams readBedFileAsGRanges |
... | ... |
@@ -17,6 +17,7 @@ |
17 | 17 |
#' @param variable.width.reference A BAM file that is used as reference to produce variable width bins. See \code{\link{variableWidthBins}} for details. |
18 | 18 |
#' @param chromosomes If only a subset of the chromosomes should be binned, specify them here. |
19 | 19 |
#' @param use.bamsignals If \code{TRUE} the \pkg{\link[bamsignals]{bamsignals}} package is used for parsing of BAM files. This gives tremendous speed advantage for only one binsize but linearly increases for multiple binsizes, while \code{use.bamsignals=FALSE} has a binsize dependent runtime and might be faster if many binsizes are calculated. |
20 |
+#' @param format One of \code{c('bed','bam','GRanges')} or \code{NULL} if the format is to be determined automatically. |
|
20 | 21 |
#' @return If only one bin size was specified for option \code{binsizes}, the function returns a single \code{\link{GRanges}} object with meta data column 'counts' that contains the read count. If multiple \code{binsizes} were specified , the function returns a named \code{list()} of \link{GRanges} objects. |
21 | 22 |
#' @importFrom Rsamtools BamFile indexBam |
22 | 23 |
#' @importFrom bamsignals bamCount |
... | ... |
@@ -35,14 +36,21 @@ |
35 | 36 |
#' chromosomes='chr12') |
36 | 37 |
#'print(binned) |
37 | 38 |
#' |
38 |
-binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE) { |
|
39 |
+binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE, format=NULL) { |
|
39 | 40 |
|
40 | 41 |
## Determine format |
41 |
- if (is.character(file)) { |
|
42 |
- file.clean <- sub('\\.gz$','', file) |
|
43 |
- format <- rev(strsplit(file.clean, '\\.')[[1]])[1] |
|
44 |
- } else if (class(file)=='GRanges') { |
|
45 |
- format <- 'GRanges' |
|
42 |
+ if (is.null(format)) { |
|
43 |
+ if (is.character(file)) { |
|
44 |
+ file.clean <- sub('\\.gz$','', file) |
|
45 |
+ format <- rev(strsplit(file.clean, '\\.')[[1]])[1] |
|
46 |
+ } else if (class(file)=='GRanges') { |
|
47 |
+ format <- 'GRanges' |
|
48 |
+ } else { |
|
49 |
+ stop("Could not determine format automatically. Please specify it via the 'format' parameter.") |
|
50 |
+ } |
|
51 |
+ } |
|
52 |
+ if (! format %in% c('bed','bam','GRanges')) { |
|
53 |
+ stop("Unknown format. Needs to be one of 'bed', 'bam' or 'GRanges'.") |
|
46 | 54 |
} |
47 | 55 |
|
48 | 56 |
if (format=='bed') { |
... | ... |
@@ -111,6 +119,8 @@ binReads <- function(file, experiment.table=NULL, assembly, bamindex=file, chrom |
111 | 119 |
## GRanges (1-based) |
112 | 120 |
data <- file |
113 | 121 |
chrom.lengths <- seqlengths(data) |
122 |
+ } else { |
|
123 |
+ stop("Unknown file format.") |
|
114 | 124 |
} |
115 | 125 |
|
116 | 126 |
## Select chromosomes to bin |
... | ... |
@@ -78,6 +78,11 @@ callPeaksMultivariate <- function(hmms, use.states, max.states=NULL, per.chrom=T |
78 | 78 |
### Make return object ### |
79 | 79 |
result <- list() |
80 | 80 |
result$info <- hmm$info |
81 |
+ if (is.null(result$info)) { |
|
82 |
+ n <- 1 |
|
83 |
+ result$info <- data.frame(file=rep(NA, n), mark=1:n, condition=1:n, replicate=1, pairedEndReads=rep(NA, n), controlFiles=rep(NA, n)) |
|
84 |
+ result$info$ID <- paste0(result$info$mark, '-', result$info$condition, '-rep', result$info$replicate) |
|
85 |
+ } |
|
81 | 86 |
## Bin coordinates, posteriors and states |
82 | 87 |
result$bins <- hmm$bins |
83 | 88 |
result$bins$score <- NULL |
... | ... |
@@ -133,7 +138,7 @@ callPeaksMultivariate <- function(hmms, use.states, max.states=NULL, per.chrom=T |
133 | 138 |
p <- prepareMultivariate(hmms, use.states=use.states, max.states=max.states, chromosomes=chromosomes) |
134 | 139 |
|
135 | 140 |
if (is.null(chromosomes)) { |
136 |
- chromosomes <- seqlevels(p$bins) |
|
141 |
+ chromosomes <- intersect(seqlevels(p$bins), unique(seqnames(p$bins))) |
|
137 | 142 |
} |
138 | 143 |
|
139 | 144 |
## Run multivariate per chromosome |
... | ... |
@@ -167,7 +172,7 @@ callPeaksMultivariate <- function(hmms, use.states, max.states=NULL, per.chrom=T |
167 | 172 |
for (chrom in chromosomes) { |
168 | 173 |
ptm <- startTimedMessage("Chromosome = ", chrom, "\n") |
169 | 174 |
bins <- p$bins[seqnames(p$bins)==chrom] |
170 |
- model <- suppressMessages( runMultivariate(bins=bins, info=p$info, comb.states=p$comb.states, use.states=p$use.states, distributions=p$distributions, weights=p$weights, correlationMatrix=p$correlationMatrix, correlationMatrixInverse=p$correlationMatrixInverse, determinant=p$determinant, max.iter=max.iter, max.time=max.time, eps=eps, num.threads=1, keep.posteriors=keep.posteriors, keep.densities=keep.densities, verbosity=verbosity) ) |
|
175 |
+ model <- runMultivariate(bins=bins, info=p$info, comb.states=p$comb.states, use.states=p$use.states, distributions=p$distributions, weights=p$weights, correlationMatrix=p$correlationMatrix, correlationMatrixInverse=p$correlationMatrixInverse, determinant=p$determinant, max.iter=max.iter, max.time=max.time, eps=eps, num.threads=1, keep.posteriors=keep.posteriors, keep.densities=keep.densities, verbosity=verbosity) |
|
171 | 176 |
message("Time spent for chromosome = ", chrom, ":", appendLF=FALSE) |
172 | 177 |
stopTimedMessage(ptm) |
173 | 178 |
models[[chrom]] <- model |
... | ... |
@@ -396,6 +401,11 @@ prepareMultivariate = function(hmms, use.states=NULL, max.states=NULL, chromosom |
396 | 401 |
} |
397 | 402 |
info <- do.call(rbind, info) |
398 | 403 |
rownames(info) <- NULL |
404 |
+ if (is.null(info)) { |
|
405 |
+ n <- nummod |
|
406 |
+ info <- data.frame(file=rep(NA, n), mark=1:n, condition=1:n, replicate=rep(1, n), pairedEndReads=rep(NA, n), controlFiles=rep(NA, n)) |
|
407 |
+ info$ID <- paste0(info$mark, '-', info$condition, '-rep', info$replicate) |
|
408 |
+ } |
|
399 | 409 |
bins$counts <- counts |
400 | 410 |
colnames(bins$counts) <- info$ID |
401 | 411 |
maxcounts <- max(bins$counts) |
... | ... |
@@ -507,7 +517,8 @@ prepareMultivariate = function(hmms, use.states=NULL, max.states=NULL, chromosom |
507 | 517 |
} |
508 | 518 |
temp <- tryCatch({ |
509 | 519 |
if (nrow(z.temp) > 100) { |
510 |
- correlationMatrix[,,istate] <- cor(z.temp) |
|
520 |
+ correlationMatrix[,,istate] <- suppressWarnings( cor(z.temp) ) |
|
521 |
+ correlationMatrix[,,istate][is.na(correlationMatrix[,,istate])] <- 0 |
|
511 | 522 |
determinant[istate] <- det( correlationMatrix[,,istate] ) |
512 | 523 |
correlationMatrixInverse[,,istate] <- chol2inv(chol(correlationMatrix[,,istate])) |
513 | 524 |
} else { |
... | ... |
@@ -516,12 +527,10 @@ prepareMultivariate = function(hmms, use.states=NULL, max.states=NULL, chromosom |
516 | 527 |
correlationMatrixInverse[,,istate] <- diag(nummod) # solve(diag(x)) == diag(x) |
517 | 528 |
} |
518 | 529 |
0 |
519 |
- }, warning = function(war) { |
|
520 |
- 1 |
|
521 | 530 |
}, error = function(err) { |
522 |
- 1 |
|
531 |
+ 2 |
|
523 | 532 |
}) |
524 |
- if (temp!=0) { |
|
533 |
+ if (temp==2) { |
|
525 | 534 |
correlationMatrix[,,istate] <- diag(nummod) |
526 | 535 |
determinant[istate] <- 1 |
527 | 536 |
correlationMatrixInverse[,,istate] <- diag(nummod) |
... | ... |
@@ -572,3 +581,13 @@ prepareMultivariate = function(hmms, use.states=NULL, max.states=NULL, chromosom |
572 | 581 |
return(out) |
573 | 582 |
} |
574 | 583 |
|
584 |
+ |
|
585 |
+### Get real transition probabilities ### |
|
586 |
+transProbs <- function(model) { |
|
587 |
+ bins <- model$bins |
|
588 |
+ df <- data.frame(from = bins$combination[-length(bins)], to = bins$combination[-1]) |
|
589 |
+ t <- table(df) |
|
590 |
+ t <- t[rownames(model$transitionProbs), colnames(model$transitionProbs)] |
|
591 |
+ t <- sweep(t, MARGIN = 1, STATS = rowSums(t), FUN = '/') |
|
592 |
+ return(t) |
|
593 |
+} |
... | ... |
@@ -64,8 +64,8 @@ callPeaksReplicates <- function(hmm.list, max.states=32, force.equal=FALSE, eps= |
64 | 64 |
|
65 | 65 |
## Univariate replicateInfo |
66 | 66 |
ids <- sapply(hmms, function(x) { x$info$ID }) |
67 |
- weight.univariate <- unlist(lapply(hmms, function(x) { x$weights['modified'] })) |
|
68 |
- total.count <- unlist(lapply(hmms, function(x) { sum(x$bins$counts) })) |
|
67 |
+ weight.univariate <- sapply(hmms, function(x) { x$weights['modified'] }) |
|
68 |
+ total.count <- sapply(hmms, function(x) { sum(x$bins$counts) }) |
|
69 | 69 |
info.df <- data.frame(total.count=total.count, weight.univariate=weight.univariate) |
70 | 70 |
if (!is.null(unlist(ids))) { |
71 | 71 |
rownames(info.df) <- ids |
... | ... |
@@ -57,7 +57,7 @@ callPeaksUnivariate <- function(binned.data, input.data=NULL, prefit.on.chr=NULL |
57 | 57 |
} |
58 | 58 |
if (!is.null(input.data)) { |
59 | 59 |
if (class(input.data) == 'character') { |
60 |
- message("Loading input file ",input.data) |
|
60 |
+ message("Loading input file(s) ", paste0(input.data, collapse=', ')) |
|
61 | 61 |
input.datas <- loadHmmsFromFiles(input.data) |
62 | 62 |
input.data <- input.datas[[1]] |
63 | 63 |
if (length(input.datas) > 1) { |
... | ... |
@@ -8,24 +8,25 @@ |
8 | 8 |
#' @param model A \code{\link{uniHMM}} or \code{\link{multiHMM}} object with posteriors. |
9 | 9 |
#' @param post.cutoff A vector of posterior cutoff values between 0 and 1 the same length as \code{ncol(model$bins$posteriors)}. If only one value is given, it will be reused for all columns. Values close to 1 will yield more stringent peak calls with lower false positive but higher false negative rate. |
10 | 10 |
#' @return The input object is returned with adjusted peak calls. |
11 |
-# #' @examples |
|
12 |
-# #'## Get an example BAM file |
|
13 |
-# #'file <- system.file("extdata", "euratrans", |
|
14 |
-# #' "lv-H3K27me3-BN-male-bio2-tech1.bam", |
|
15 |
-# #' package="chromstaRData") |
|
16 |
-# #'## Bin the file into bin size 1000bp |
|
17 |
-# #'data(rn4_chrominfo) |
|
18 |
-# #'binned <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, |
|
19 |
-# #' chromosomes='chr12') |
|
20 |
-# #'## Fit the univariate Hidden Markov Model |
|
21 |
-# #'# !Keep posteriors to change the post.cutoff later! |
|
22 |
-# #'hmm <- callPeaksUnivariate(binned, max.time=60, eps=1, |
|
23 |
-# #' keep.posteriors=TRUE) |
|
24 |
-# #'## Compare fits with different post.cutoffs |
|
25 |
-# #'plotHistogram(changePostCutoff(hmm, post.cutoff=0.01)) + ylim(0,0.25) |
|
26 |
-# #'plotHistogram(hmm) + ylim(0,0.25) |
|
27 |
-# #'plotHistogram(changePostCutoff(hmm, post.cutoff=0.99)) + ylim(0,0.25) |
|
28 |
-# #' |
|
11 |
+#' @export |
|
12 |
+#' @examples |
|
13 |
+#'## Get an example BAM file |
|
14 |
+#'file <- system.file("extdata", "euratrans", |
|
15 |
+#' "lv-H3K27me3-BN-male-bio2-tech1.bam", |
|
16 |
+#' package="chromstaRData") |
|
17 |
+#'## Bin the file into bin size 1000bp |
|
18 |
+#'data(rn4_chrominfo) |
|
19 |
+#'binned <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, |
|
20 |
+#' chromosomes='chr12') |
|
21 |
+#'## Fit the univariate Hidden Markov Model |
|
22 |
+#'# !Keep posteriors to change the post.cutoff later! |
|
23 |
+#'hmm <- callPeaksUnivariate(binned, max.time=60, eps=1, |
|
24 |
+#' keep.posteriors=TRUE) |
|
25 |
+#'## Compare fits with different post.cutoffs |
|
26 |
+#'plotHistogram(changePostCutoff(hmm, post.cutoff=0.01)) + ylim(0,0.25) |
|
27 |
+#'plotHistogram(hmm) + ylim(0,0.25) |
|
28 |
+#'plotHistogram(changePostCutoff(hmm, post.cutoff=0.99)) + ylim(0,0.25) |
|
29 |
+#' |
|
29 | 30 |
changePostCutoff <- function(model, post.cutoff=0.5) { |
30 | 31 |
|
31 | 32 |
if (!is.numeric(post.cutoff)) { |
... | ... |
@@ -108,10 +109,27 @@ changePostCutoff.multivariate <- function(model, post.cutoff) { |
108 | 109 |
multiHMM$bins$state <- factor(states) |
109 | 110 |
multiHMM$bins$posteriors <- post |
110 | 111 |
multiHMM$mapping <- mapping |
112 |
+ multiHMM$peaks <- model$peaks |
|
111 | 113 |
model <- combineMultivariates(list(multiHMM), mode='full') |
112 | 114 |
} else { |
113 | 115 |
stop("Supply either a uniHMM, multiHMM or combinedMultiHMM object.") |
114 | 116 |
} |
117 |
+ |
|
118 |
+ ## Redo peaks |
|
119 |
+ ptm <- startTimedMessage("Recalculating peaks ...") |
|
120 |
+ binstates <- dec2bin(model$segments$state, colnames=colnames(post)) |
|
121 |
+ segments <- model$segments |
|
122 |
+ mcols(segments) <- NULL |
|
123 |
+ for (icol in 1:ncol(binstates)) { |
|
124 |
+ segments$peakScores <- model$segments$peakScores[,icol] |
|
125 |
+ segments$state <- binstates[,icol] |
|
126 |
+ red.df <- suppressMessages( collapseBins(as.data.frame(segments), column2collapseBy = 'state') ) |
|
127 |
+ red.df <- red.df[red.df$state == 1,] |
|
128 |
+ red.df$state <- NULL |
|
129 |
+ model$peaks[[icol]] <- methods::as(red.df, 'GRanges') |
|
130 |
+ } |
|
131 |
+ stopTimedMessage(ptm) |
|
132 |
+ |
|
115 | 133 |
## Return model |
116 | 134 |
model$post.cutoff <- threshold |
117 | 135 |
return(model) |
... | ... |
@@ -150,10 +168,11 @@ changePostCutoff.univariate <- function(model, post.cutoff) { |
150 | 168 |
ptm <- startTimedMessage("Making segmentation ...") |
151 | 169 |
gr <- model$bins |
152 | 170 |
df <- as.data.frame(gr) |
153 |
- red.df <- suppressMessages(collapseBins(df, column2collapseBy='state', columns2average=c('score'), columns2drop=c('width',grep('posteriors', names(df), value=TRUE), 'counts'))) |
|
154 |
- red.gr <- GRanges(seqnames=red.df[,1], ranges=IRanges(start=red.df[,2], end=red.df[,3]), strand=red.df[,4], state=red.df[,'state'], score=red.df[,'mean.score']) |
|
155 |
- model$segments <- red.gr |
|
156 |
- seqlengths(model$segments) <- seqlengths(model$bins)[seqlevels(model$segments)] |
|
171 |
+ red.df <- suppressMessages(collapseBins(df, column2collapseBy='state', columns2drop=c('width', 'posterior.modified', grep('posteriors', names(df), value=TRUE), 'counts'))) |
|
172 |
+ segments <- methods::as(red.df, 'GRanges') |
|
173 |
+ model$peaks <- segments[segments$state == 'modified'] |
|
174 |
+ model$peaks$state <- NULL |
|
175 |
+ seqlengths(model$peaks) <- seqlengths(model$bins)[seqlevels(model$peaks)] |
|
157 | 176 |
stopTimedMessage(ptm) |
158 | 177 |
|
159 | 178 |
## Return model |
... | ... |
@@ -144,12 +144,22 @@ combineMultivariates <- function(hmms, mode) { |
144 | 144 |
stopTimedMessage(ptm) |
145 | 145 |
} |
146 | 146 |
ptm <- startTimedMessage("Concatenating HMMs ...") |
147 |
+ # Slightly more complicated selection procedure for conditions in case one mark is missing |
|
148 |
+ conds.help <- lapply(infos, function(x) { unique(x$condition) }) |
|
149 |
+ conditions <- conds.help[[which.max(sapply(conds.help, length))]] |
|
150 |
+ infos <- do.call(rbind, infos) |
|
151 |
+ infos$condition <- factor(infos$condition, levels=conditions) |
|
152 |
+ infos <- infos[order(infos$condition, infos$mark, infos$replicate),] |
|
153 |
+ infos$condition <- as.character(infos$condition) |
|
154 |
+ # Reorder everything according to conditions |
|
147 | 155 |
counts <- do.call(cbind, counts) |
156 |
+ counts <- counts[,infos$ID] |
|
148 | 157 |
posteriors <- do.call(cbind, posteriors) |
158 |
+ posteriors <- posteriors[,infos$ID] |
|
149 | 159 |
peakScores <- do.call(cbind, peakScores) |
150 |
- infos <- do.call(rbind, infos) |
|
151 |
- conditions <- unique(infos$condition) |
|
160 |
+ peakScores <- peakScores[,infos$ID] |
|
152 | 161 |
binstates <- do.call(cbind, binstates) |
162 |
+ binstates <- binstates[,infos$ID] |
|
153 | 163 |
states <- factor(bin2dec(binstates)) |
154 | 164 |
names(states) <- NULL |
155 | 165 |
stopTimedMessage(ptm) |
... | ... |
@@ -3,8 +3,9 @@ |
3 | 3 |
#' Plotting functions for enrichment analysis of \code{\link{multiHMM}} or \code{\link{combinedMultiHMM}} objects with any annotation of interest, specified as a \code{\link[GenomicRanges]{GRanges}} object. |
4 | 4 |
#' |
5 | 5 |
#' @name enrichment_analysis |
6 |
-#' @param combinations A vector with combinations for which the fold enrichment will be calculated. If \code{NULL} all combinations will be considered. |
|
6 |
+#' @param combinations A vector with combinations for which the enrichment will be calculated. If \code{NULL} all combinations will be considered. |
|
7 | 7 |
#' @param marks A vector with marks for which the enrichment is plotted. If \code{NULL} all marks will be considered. |
8 |
+#' @param logscale Set to \code{TRUE} to plot fold enrichment on log-scale. Ignored if \code{statistic = 'fraction'}. |
|
8 | 9 |
#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object containing the plot or a list() with \code{\link[ggplot2:ggplot]{ggplot}} objects if several plots are returned. For \code{plotFoldEnrichHeatmap} a named array with fold enrichments if \code{plot=FALSE}. |
9 | 10 |
#' @author Aaron Taudt |
10 | 11 |
#' @seealso \code{\link{plotting}} |
... | ... |
@@ -54,6 +55,7 @@ |
54 | 55 |
#'# Fold enrichment with different biotypes, showing that protein coding genes are |
55 | 56 |
#'# enriched with (bivalent) promoter combinations [H3K4me3] and [H3K4me3+H3K27me3], |
56 | 57 |
#'# while rRNA is enriched with the empty [] and repressive combinations [H3K27me3]. |
58 |
+#' tss <- resize(genes, width = 3, fix = 'start') |
|
57 | 59 |
#' biotypes <- split(tss, tss$biotype) |
58 | 60 |
#' plotFoldEnrichHeatmap(model, annotations=biotypes) + coord_flip() |
59 | 61 |
#' |
... | ... |
@@ -64,11 +66,12 @@ NULL |
64 | 66 |
#' @param hmm A \code{\link{combinedMultiHMM}} or \code{\link{multiHMM}} object or a file that contains such an object. |
65 | 67 |
#' @param annotations A \code{list()} with \code{\link{GRanges}} objects containing coordinates of multiple annotations The names of the list entries will be used to name the return values. |
66 | 68 |
#' @param plot A logical indicating whether the plot or an array with the fold enrichment values is returned. |
69 |
+#' @param what One of \code{c('combinations','peaks','counts','transitions')} specifying on which feature the statistic is calculated. |
|
67 | 70 |
#' @importFrom S4Vectors subjectHits queryHits |
68 | 71 |
#' @importFrom IRanges subsetByOverlaps |
69 | 72 |
#' @importFrom reshape2 melt |
70 | 73 |
#' @export |
71 |
-plotFoldEnrichHeatmap <- function(hmm, annotations, what="combinations", combinations=NULL, marks=NULL, plot=TRUE) { |
|
74 |
+plotFoldEnrichHeatmap <- function(hmm, annotations, what="combinations", combinations=NULL, marks=NULL, plot=TRUE, logscale=TRUE) { |
|
72 | 75 |
|
73 | 76 |
hmm <- loadHmmsFromFiles(hmm, check.class=c(class.multivariate.hmm, class.combined.multivariate.hmm))[[1]] |
74 | 77 |
## Variables |
... | ... |
@@ -76,9 +79,17 @@ plotFoldEnrichHeatmap <- function(hmm, annotations, what="combinations", combina |
76 | 79 |
if (class(hmm) == class.combined.multivariate.hmm) { |
77 | 80 |
} else if (class(hmm) == class.multivariate.hmm) { |
78 | 81 |
# Rename 'combination' to 'combination.' for coherence with combinedMultiHMM |
79 |
- names(mcols(bins))[grep('combination', names(mcols(bins)))] <- 'combination.' |
|
82 |
+ names(mcols(bins))[grep('combination', names(mcols(bins)))] <- paste0('combination.', unique(hmm$info$condition)) |
|
80 | 83 |
} |
81 | 84 |
conditions <- sub('combination.', '', grep('combination', names(mcols(bins)), value=TRUE)) |
85 |
+ comb.levels <- levels(mcols(bins)[,paste0('combination.', conditions[1])]) |
|
86 |
+ ## Create new column combination with all conditions combined |
|
87 |
+ combs <- list() |
|
88 |
+ for (condition in conditions) { |
|
89 |
+ combs[[condition]] <- paste0(condition, ":", mcols(bins)[,paste0('combination.', condition)]) |
|
90 |
+ } |
|
91 |
+ combs$sep <- ', ' |
|
92 |
+ bins$transitions <- factor(do.call(paste, combs)) |
|
82 | 93 |
if (is.null(combinations)) { |
83 | 94 |
comb.levels <- levels(mcols(bins)[,paste0('combination.', conditions[1])]) |
84 | 95 |
} else { |
... | ... |
@@ -93,85 +104,137 @@ plotFoldEnrichHeatmap <- function(hmm, annotations, what="combinations", combina |
93 | 104 |
annotationsAtBins <- lapply(annotations, function(x) { IRanges::subsetByOverlaps(x, bins) }) |
94 | 105 |
feature.lengths <- lapply(annotationsAtBins, function(x) { sum(as.numeric(width(x))) }) |
95 | 106 |
|
96 |
- if (what == 'peaks') { |
|
97 |
- binstates <- dec2bin(bins$state, colnames=hmm$info$ID) |
|
98 |
- } |
|
99 |
- |
|
100 |
- ## Fold enrichment |
|
101 | 107 |
ggplts <- list() |
102 | 108 |
folds <- list() |
103 | 109 |
maxfolds <- list() |
104 |
- for (condition in conditions) { |
|
105 |
- if (what == 'combinations') { |
|
106 |
- bins$combination <- mcols(bins)[,paste0('combination.', condition)] |
|
107 |
- fold <- array(NA, dim=c(length(annotationsAtBins), length(comb.levels)), dimnames=list(annotation=names(annotationsAtBins), combination=comb.levels)) |
|
108 |
- for (icomb in 1:length(comb.levels)) { |
|
109 |
- mask <- bins$combination == comb.levels[icomb] |
|
110 |
- bins.mask <- bins[mask] |
|
111 |
- combstate.length <- sum(as.numeric(width(bins.mask))) |
|
112 |
- for (ifeat in 1:length(annotationsAtBins)) { |
|
113 |
- feature <- annotationsAtBins[[ifeat]] |
|
114 |
- ind <- findOverlaps(bins.mask, feature) |
|
110 |
+ minfolds <- list() |
|
115 | 111 |
|
116 |
- binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))] |
|
117 |
- sum.binsinfeature <- sum(as.numeric(width(binsinfeature))) |
|
118 |
- |
|
119 |
- featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))] |
|
120 |
- sum.featuresinbins <- sum(as.numeric(width(featuresinbins))) |
|
121 |
- |
|
122 |
- fold[ifeat,icomb] <- sum.binsinfeature / combstate.length / feature.lengths[[ifeat]] * genome |
|
123 |
- } |
|
124 |
- } |
|
125 |
- |
|
126 |
- } else if (what == 'peaks') { |
|
127 |
- fold <- array(NA, dim=c(length(annotationsAtBins), length(mark.levels)), dimnames=list(annotation=names(annotationsAtBins), mark=mark.levels)) |
|
128 |
- for (imark in 1:length(mark.levels)) { |
|
129 |
- mark <- mark.levels[imark] |
|
130 |
- colmask <- hmm$info$mark == mark |
|
131 |
- colmask <- colmask & (!duplicated(paste0(hmm$info$mark, hmm$info$condition))) # remove replicates |
|
132 |
- if (condition != "") { |
|
133 |
- colmask <- colmask & (hmm$info$condition == condition) |
|
134 |
- } |
|
135 |
- binstates.cond <- binstates[,colmask] |
|
136 |
- mask <- binstates.cond |
|
137 |
- bins.mask <- bins[mask] |
|
138 |
- combstate.length <- sum(as.numeric(width(bins.mask))) |
|
139 |
- for (ifeat in 1:length(annotationsAtBins)) { |
|
140 |
- feature <- annotationsAtBins[[ifeat]] |
|
141 |
- ind <- findOverlaps(bins.mask, feature) |
|
142 |
- |
|
143 |
- binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))] |
|
144 |
- sum.binsinfeature <- sum(as.numeric(width(binsinfeature))) |
|
145 |
- |
|
146 |
- featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))] |
|
147 |
- sum.featuresinbins <- sum(as.numeric(width(featuresinbins))) |
|
148 |
- |
|
149 |
- fold[ifeat,imark] <- sum.binsinfeature / combstate.length / feature.lengths[[ifeat]] * genome |
|
150 |
- } |
|
112 |
+ if (what == 'peaks') { |
|
113 |
+ binstates <- dec2bin(bins$state, colnames=hmm$info$ID) |
|
114 |
+ } |
|
115 |
+ if (what == 'transitions') { |
|
116 |
+ bins$combination <- bins$transitions |
|
117 |
+ comb.levels <- names(sort(table(bins$combination), decreasing = TRUE)) |
|
118 |
+ fold <- array(NA, dim=c(length(annotationsAtBins), length(comb.levels)), dimnames=list(annotation=names(annotationsAtBins), combination=comb.levels)) |
|
119 |
+ for (icomb in 1:length(comb.levels)) { |
|
120 |
+ mask <- bins$combination == comb.levels[icomb] |
|
121 |
+ bins.mask <- bins[mask] |
|
122 |
+ combstate.length <- sum(as.numeric(width(bins.mask))) |
|
123 |
+ for (ifeat in 1:length(annotationsAtBins)) { |
|
124 |
+ feature <- annotationsAtBins[[ifeat]] |
|
125 |
+ ind <- findOverlaps(bins.mask, feature) |
|
126 |
+ |
|
127 |
+ binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))] |
|
128 |
+ sum.binsinfeature <- sum(as.numeric(width(binsinfeature))) |
|
129 |
+ |
|
130 |
+ featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))] |
|
131 |
+ sum.featuresinbins <- sum(as.numeric(width(featuresinbins))) |
|
132 |
+ |
|
133 |
+ fold[ifeat,icomb] <- sum.binsinfeature / combstate.length / feature.lengths[[ifeat]] * genome |
|
151 | 134 |
} |
152 |
- |
|
153 | 135 |
} |
154 |
- |
|
155 | 136 |
if (plot) { |
156 | 137 |
df <- reshape2::melt(fold, value.name='foldEnrichment') |
157 |
- maxfolds[[condition]] <- max(df$foldEnrichment, na.rm=TRUE) |
|
158 |
- if (what == 'combinations') { |
|
159 |
- ggplt <- ggplot(df) + geom_tile(aes_string(x='combination', y='annotation', fill='foldEnrichment')) |
|
160 |
- } else if (what == 'peaks') { |
|
161 |
- ggplt <- ggplot(df) + geom_tile(aes_string(x='mark', y='annotation', fill='foldEnrichment')) |
|
138 |
+ if (logscale) { |
|
139 |
+ df$foldEnrichment <- log(df$foldEnrichment) |
|
162 | 140 |
} |
141 |
+ foldsnoinf <- setdiff(df$foldEnrichment, c(Inf, -Inf)) |
|
142 |
+ maxfolds[[1]] <- max(foldsnoinf, na.rm=TRUE) |
|
143 |
+ minfolds[[1]] <- min(foldsnoinf, na.rm=TRUE) |
|
144 |
+ ggplt <- ggplot(df) + geom_tile(aes_string(x='combination', y='annotation', fill='foldEnrichment')) |
|
163 | 145 |
ggplt <- ggplt + theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) |
164 |
- ggplts[[condition]] <- ggplt |
|
146 |
+ ggplts[[1]] <- ggplt |
|
165 | 147 |
} else { |
166 |
- folds[[condition]] <- fold |
|
148 |
+ folds[[1]] <- fold |
|
149 |
+ } |
|
150 |
+ } |
|
151 |
+ |
|
152 |
+ ## Fold enrichment |
|
153 |
+ if (what %in% c('peaks', 'combinations')) { |
|
154 |
+ for (condition in conditions) { |
|
155 |
+ if (what == 'combinations') { |
|
156 |
+ bins$combination <- mcols(bins)[,paste0('combination.', condition)] |
|
157 |
+ fold <- array(NA, dim=c(length(annotationsAtBins), length(comb.levels)), dimnames=list(annotation=names(annotationsAtBins), combination=comb.levels)) |
|
158 |
+ for (icomb in 1:length(comb.levels)) { |
|
159 |
+ mask <- bins$combination == comb.levels[icomb] |
|
160 |
+ bins.mask <- bins[mask] |
|
161 |
+ combstate.length <- sum(as.numeric(width(bins.mask))) |
|
162 |
+ for (ifeat in 1:length(annotationsAtBins)) { |
|
163 |
+ feature <- annotationsAtBins[[ifeat]] |
|
164 |
+ ind <- findOverlaps(bins.mask, feature) |
|
165 |
+ |
|
166 |
+ binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))] |
|
167 |
+ sum.binsinfeature <- sum(as.numeric(width(binsinfeature))) |
|
168 |
+ |
|
169 |
+ featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))] |
|
170 |
+ sum.featuresinbins <- sum(as.numeric(width(featuresinbins))) |
|
171 |
+ |
|
172 |
+ fold[ifeat,icomb] <- sum.binsinfeature / combstate.length / feature.lengths[[ifeat]] * genome |
|
173 |
+ } |
|
174 |
+ } |
|
175 |
+ |
|
176 |
+ } else if (what == 'peaks') { |
|
177 |
+ fold <- array(NA, dim=c(length(annotationsAtBins), length(mark.levels)), dimnames=list(annotation=names(annotationsAtBins), mark=mark.levels)) |
|
178 |
+ for (imark in 1:length(mark.levels)) { |
|
179 |
+ mark <- mark.levels[imark] |
|
180 |
+ colmask <- hmm$info$mark == mark |
|
181 |
+ colmask <- colmask & (!duplicated(paste0(hmm$info$mark, hmm$info$condition))) # remove replicates |
|
182 |
+ if (condition != "") { |
|
183 |
+ colmask <- colmask & (hmm$info$condition == condition) |
|
184 |
+ } |
|
185 |
+ binstates.cond <- binstates[,colmask] |
|
186 |
+ mask <- binstates.cond |
|
187 |
+ bins.mask <- bins[mask] |
|
188 |
+ combstate.length <- sum(as.numeric(width(bins.mask))) |
|
189 |
+ for (ifeat in 1:length(annotationsAtBins)) { |
|
190 |
+ feature <- annotationsAtBins[[ifeat]] |
|
191 |
+ ind <- findOverlaps(bins.mask, feature) |
|
192 |
+ |
|
193 |
+ binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))] |
|
194 |
+ sum.binsinfeature <- sum(as.numeric(width(binsinfeature))) |
|
195 |
+ |
|
196 |
+ featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))] |
|
197 |
+ sum.featuresinbins <- sum(as.numeric(width(featuresinbins))) |
|
198 |
+ |
|
199 |
+ fold[ifeat,imark] <- sum.binsinfeature / combstate.length / feature.lengths[[ifeat]] * genome |
|
200 |
+ } |
|
201 |
+ } |
|
202 |
+ |
|
203 |
+ } |
|
204 |
+ |
|
205 |
+ if (plot) { |
|
206 |
+ df <- reshape2::melt(fold, value.name='foldEnrichment') |
|
207 |
+ if (logscale) { |
|
208 |
+ df$foldEnrichment <- log(df$foldEnrichment) |
|
209 |
+ } |
|
210 |
+ foldsnoinf <- setdiff(df$foldEnrichment, c(Inf, -Inf)) |
|
211 |
+ maxfolds[[condition]] <- max(foldsnoinf, na.rm=TRUE) |
|
212 |
+ minfolds[[condition]] <- min(foldsnoinf, na.rm=TRUE) |
|
213 |
+ if (what == 'combinations') { |
|
214 |
+ ggplt <- ggplot(df) + geom_tile(aes_string(x='combination', y='annotation', fill='foldEnrichment')) |
|
215 |
+ } else if (what == 'peaks') { |
|
216 |
+ ggplt <- ggplot(df) + geom_tile(aes_string(x='mark', y='annotation', fill='foldEnrichment')) |
|
217 |
+ } |
|
218 |
+ ggplt <- ggplt + theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) |
|
219 |
+ ggplts[[condition]] <- ggplt |
|
220 |
+ } else { |
|
221 |
+ folds[[condition]] <- fold |
|
222 |
+ } |
|
167 | 223 |
} |
168 | 224 |
} |
169 | 225 |
if (plot) { |
170 |
- maxfolds <- unlist(maxfolds) |
|
171 |
- ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_fill_gradientn(colors=grDevices::colorRampPalette(c("blue","white","red"))(20), values=c(seq(0,1,length.out=10), seq(1,max(maxfolds, na.rm=TRUE),length.out=10)), rescaler=function(x,...) {x}, oob=identity, limits=c(0,max(maxfolds, na.rm=TRUE))) }) |
|
226 |
+ maxfold <- max(unlist(maxfolds), na.rm=TRUE) |
|
227 |
+ minfold <- min(unlist(minfolds), na.rm=TRUE) |
|
228 |
+ limits <- max(abs(maxfold), abs(minfold)) |
|
229 |
+ if (logscale) { |
|
230 |
+ ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_fill_gradientn(name='log(observed/expected)', colors=grDevices::colorRampPalette(c("blue","white","red"))(20), values=c(seq(-limits,0,length.out=10), seq(0,limits,length.out=10)), rescaler=function(x,...) {x}, oob=identity, limits=c(-limits, limits)) }) |
|
231 |
+ ggplts <- lapply(ggplts, function(ggplt) { ggplt$data$foldEnrichment[ggplt$data$foldEnrichment == -Inf] <- -limits; ggplt }) |
|
232 |
+ } else { |
|
233 |
+ ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_fill_gradientn(name='observed/expected', colors=grDevices::colorRampPalette(c("blue","white","red"))(20), values=c(seq(0,1,length.out=10), seq(1,maxfold,length.out=10)), rescaler=function(x,...) {x}, oob=identity, limits=c(0,maxfold)) }) |
|
234 |
+ } |
|
172 | 235 |
} |
173 | 236 |
|
174 |
- if (class(hmm) == class.multivariate.hmm) { |
|
237 |
+ if (class(hmm) == class.multivariate.hmm | what == 'transitions') { |
|
175 | 238 |
if (plot) { |
176 | 239 |
return(ggplts[[1]]) |
177 | 240 |
} else { |
... | ... |
@@ -203,12 +266,12 @@ plotEnrichCountHeatmap <- function(hmm, annotation, bp.around.annotation=10000, |
203 | 266 |
conditions <- sub('combination.', '', grep('combination', names(mcols(bins)), value=TRUE)) |
204 | 267 |
comb.levels <- levels(mcols(bins)[,paste0('combination.', conditions[1])]) |
205 | 268 |
## Create new column combination with all conditions combined |
206 |
- combinations <- list() |
|
269 |
+ combs <- list() |
|
207 | 270 |
for (condition in conditions) { |
208 |
- combinations[[condition]] <- paste0(condition, ":", mcols(bins)[,paste0('combination.', condition)]) |
|
271 |
+ combs[[condition]] <- paste0(condition, ":", mcols(bins)[,paste0('combination.', condition)]) |
|
209 | 272 |
} |
210 |
- combinations$sep <- ', ' |
|
211 |
- bins$combination <- factor(do.call(paste, combinations)) |
|
273 |
+ combs$sep <- ', ' |
|
274 |
+ bins$combination <- factor(do.call(paste, combs)) |
|
212 | 275 |
} else if (class(hmm) == class.multivariate.hmm) { |
213 | 276 |
comb.levels <- levels(bins$combination) |
214 | 277 |
} |
... | ... |
@@ -254,18 +317,19 @@ plotEnrichCountHeatmap <- function(hmm, annotation, bp.around.annotation=10000, |
254 | 317 |
rownames(ext.index) <- 1:nrow(ext.index) |
255 | 318 |
stopTimedMessage(ptm) |
256 | 319 |
|
257 |
- ## Go through combinations and then tracks to get the read counts |
|
320 |
+ ## Go through combinations and then IDs to get the read counts |
|
258 | 321 |
ptm <- startTimedMessage("Getting read counts") |
259 | 322 |
counts <- list() |
260 |
- for (combination in levels(bins$combination)) { |
|
323 |
+ combinations <- names(sort(table(bins$combination[index]), decreasing = TRUE)) |
|
324 |
+ for (combination in combinations) { |
|
261 | 325 |
counts[[combination]] <- list() |
262 | 326 |
index.combination <- which(bins$combination[index]==combination) |
263 | 327 |
ext.index.combination <- ext.index[index.combination,] |
264 | 328 |
if (is.null(dim(ext.index.combination))) { |
265 | 329 |
ext.index.combination <- array(ext.index.combination, dim=c(1,dim(ext.index)[[2]]), dimnames=list(anno=rownames(ext.index)[index.combination], position=dimnames(ext.index)[[2]])) |
266 | 330 |
} |
267 |
- for (ntrack in colnames(bins$counts)) { |
|
268 |
- counts[[combination]][[ntrack]] <- array(bins$counts[ext.index.combination,ntrack], dim=dim(ext.index.combination), dimnames=dimnames(ext.index.combination)) |
|
331 |
+ for (nID in colnames(bins$counts)) { |
|
332 |
+ counts[[combination]][[nID]] <- array(bins$counts[ext.index.combination,nID], dim=dim(ext.index.combination), dimnames=dimnames(ext.index.combination)) |
|
269 | 333 |
} |
270 | 334 |
} |
271 | 335 |
stopTimedMessage(ptm) |
... | ... |
@@ -287,16 +351,16 @@ plotEnrichCountHeatmap <- function(hmm, annotation, bp.around.annotation=10000, |
287 | 351 |
comb2keep <- names(num.comb)[num.comb/sum(num.comb) > 0.005] |
288 | 352 |
counts <- counts[comb2keep] |
289 | 353 |
df <- reshape2::melt(counts) |
290 |
- names(df) <- c('id','position','RPKM','track','combination') |
|
354 |
+ names(df) <- c('id','position','RPKM','ID','combination') |
|
291 | 355 |
df$id <- factor(df$id, levels=rev(unique(df$id))) |
292 | 356 |
df$combination <- factor(df$combination, levels=unique(df$combination)) |
293 |
- df$track <- factor(df$track, levels=colnames(bins$counts)) |
|
357 |
+ df$ID <- factor(df$ID, levels=hmm$info$ID) |
|
294 | 358 |
|
295 | 359 |
## Plot as heatmap |
296 | 360 |
ggplt <- ggplot(df) + geom_tile(aes_string(x='position', y='id', color='combination')) |
297 | 361 |
ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$combination)))) |
298 | 362 |
ggplt <- ggplt + geom_tile(aes_string(x='position', y='id', fill='RPKM'), alpha=0.6) |
299 |
- ggplt <- ggplt + facet_wrap( ~ track, nrow=1) + custom_theme |
|
363 |
+ ggplt <- ggplt + facet_wrap( ~ ID, nrow=1) + custom_theme |
|
300 | 364 |
ggplt <- ggplt + xlab('distance from annotation in [bp]') + ylab('') |
301 | 365 |
ggplt <- ggplt + scale_fill_continuous(trans='log1p', low='white', high='black') |
302 | 366 |
# Insert horizontal lines |
... | ... |
@@ -315,7 +379,7 @@ plotEnrichCountHeatmap <- function(hmm, annotation, bp.around.annotation=10000, |
315 | 379 |
#' @inheritParams enrichmentAtAnnotation |
316 | 380 |
#' @importFrom reshape2 melt |
317 | 381 |
#' @export |
318 |
-plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c("start","inside","end"), num.intervals=20, what='combinations', combinations=NULL, marks=NULL, statistic='fold') { |
|
382 |
+plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c("start","inside","end"), num.intervals=20, what='combinations', combinations=NULL, marks=NULL, statistic='fold', logscale=TRUE) { |
|
319 | 383 |
|
320 | 384 |
## Check user input |
321 | 385 |
if ((!what %in% c('combinations','peaks','counts')) | length(what) > 1) { |
... | ... |
@@ -326,11 +390,17 @@ plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c |
326 | 390 |
} |
327 | 391 |
|
328 | 392 |
## Variables |
329 |
- hmm <- loadHmmsFromFiles(hmm, check.class=c(class.multivariate.hmm, class.combined.multivariate.hmm))[[1]] |
|
330 |
- hmm$bins$counts <- rpkm.matrix(hmm$bins$counts, binsize=mean(width(hmm$bins))) |
|
393 |
+ hmm <- loadHmmsFromFiles(hmm, check.class=c(class.univariate.hmm, class.multivariate.hmm, class.combined.multivariate.hmm))[[1]] |
|
331 | 394 |
bins <- hmm$bins |
332 |
- if (class(hmm) == class.combined.multivariate.hmm) { |
|
395 |
+ if (class(hmm) == class.univariate.hmm) { |
|
396 |
+ bins$counts <- rpkm.vector(hmm$bins$counts, binsize=mean(width(hmm$bins))) |
|
397 |
+ mcols(bins)['combination.'] <- bins$state |
|
398 |
+ bins$state <- c('zero-inflation' = 0, 'unmodified' = 0, 'modified' = 1)[bins$state] |
|
399 |
+ hmm$info <- data.frame(file=NA, mark=1, condition=1, replicate=1, pairedEndReads=NA, controlFiles=NA, ID='1-1-rep1') |
|
400 |
+ } else if (class(hmm) == class.combined.multivariate.hmm) { |
|
401 |
+ bins$counts <- rpkm.matrix(hmm$bins$counts, binsize=mean(width(hmm$bins))) |
|
333 | 402 |
} else if (class(hmm) == class.multivariate.hmm) { |
403 |
+ bins$counts <- rpkm.matrix(hmm$bins$counts, binsize=mean(width(hmm$bins))) |
|
334 | 404 |
# Rename 'combination' to 'combination.' for coherence with combinedMultiHMM |
335 | 405 |
names(mcols(bins))[grep('combination', names(mcols(bins)))] <- 'combination.' |
336 | 406 |
} |
... | ... |
@@ -348,10 +418,12 @@ plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c |
348 | 418 |
|
349 | 419 |
if (what %in% c('peaks','counts')) { |
350 | 420 |
### Get fold enrichment |
351 |
- enrich <- enrichmentAtAnnotation(hmm$bins, hmm$info, annotation, bp.around.annotation=bp.around.annotation, region=region, what=what, num.intervals=num.intervals, statistic=statistic) |
|
421 |
+ enrich <- enrichmentAtAnnotation(bins, hmm$info, annotation, bp.around.annotation=bp.around.annotation, region=region, what=what, num.intervals=num.intervals, statistic=statistic) |
|
352 | 422 |
} |
353 | 423 |
ggplts <- list() |
354 | 424 |
maxfolds <- list() |
425 |
+ minfolds <- list() |
|
426 |
+ maxcols <- list() |
|
355 | 427 |
for (condition in conditions) { |
356 | 428 |
if (what == 'combinations') { |
357 | 429 |
### Get fold enrichment |
... | ... |
@@ -382,37 +454,59 @@ plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c |
382 | 454 |
df$position[df$L1 == 'inside'] <- df$position[df$L1 == 'inside'] * bp.around.annotation |
383 | 455 |
if (what == 'combinations') { |
384 | 456 |
df <- df[df$combination %in% comb.levels,] |
457 |
+ df$combination <- factor(df$combination, levels=comb.levels) |
|
385 | 458 |
} else if (what %in% c('peaks','counts')) { |
386 |
- df$mark <- sub("-.*", "", df$track) |
|
459 |
+ df$mark <- sub("-.*", "", df$ID) |
|
387 | 460 |
df <- df[df$mark %in% mark.levels, ] |
388 |
- df$condition <- sapply(strsplit(as.character(df$track), '-'), '[[', 2) |
|
461 |
+ df$condition <- sapply(strsplit(as.character(df$ID), '-'), '[[', 2) |
|
389 | 462 |
if (condition != "") { |
390 | 463 |
df <- df[df$condition == condition, ] |
391 | 464 |
} |
392 | 465 |
} |
466 |
+ if (logscale & !(statistic=='fraction')) { |
|
467 |
+ df$value <- log(df$value) |
|
468 |
+ } |
|
393 | 469 |
|
394 | 470 |
### Plot |
395 | 471 |
if (what == 'combinations') { |
396 | 472 |
ggplt <- ggplot(df) + geom_line(aes_string(x='position', y='value', col='combination'), size=2) |
397 | 473 |
if (statistic == 'fold') { |
398 |
- ggplt <- ggplt + ylab('fold enrichment') |
|
399 |
- ggplt <- ggplt + geom_hline(yintercept=1, lty=2) |
|
474 |
+ if (logscale) { |
|
475 |
+ ggplt <- ggplt + ylab('log(observed/expected)') |
|
476 |
+ ggplt <- ggplt + geom_hline(yintercept=0, lty=2) |
|
477 |
+ } else { |
|
478 |
+ ggplt <- ggplt + ylab('observed/expected') |
|
479 |
+ ggplt <- ggplt + geom_hline(yintercept=1, lty=2) |
|
480 |
+ } |
|
400 | 481 |
} else if (statistic == 'fraction') { |
401 | 482 |
ggplt <- ggplt + ylab('fraction') |
402 | 483 |
} |
403 |
- ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$combination)))) |
|
484 |
+ # ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$combination)))) |
|
485 |
+ maxcols[[condition]] <- length(unique(df$combination)) |
|
404 | 486 |
} else if (what == 'peaks') { |
405 | 487 |
ggplt <- ggplot(df) + geom_line(aes_string(x='position', y='value', col='mark'), size=2) |
406 | 488 |
if (statistic == 'fold') { |
407 |
- ggplt <- ggplt + ylab('fold enrichment') |
|
408 |
- ggplt <- ggplt + geom_hline(yintercept=1, lty=2) |
|
489 |
+ if (logscale) { |
|
490 |
+ ggplt <- ggplt + ylab('log(observed/expected)') |
|
491 |
+ ggplt <- ggplt + geom_hline(yintercept=0, lty=2) |
|
492 |
+ } else { |
|
493 |
+ ggplt <- ggplt + ylab('observed/expected') |
|
494 |
+ ggplt <- ggplt + geom_hline(yintercept=1, lty=2) |
|
495 |
+ } |
|
409 | 496 |
} else if (statistic == 'fraction') { |
410 | 497 |
ggplt <- ggplt + ylab('fraction') |
411 | 498 |
} |
412 |
- ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$mark)))) |
|
499 |
+ # ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$mark)))) |
|
500 |
+ maxcols[[condition]] <- length(unique(df$mark)) |
|
413 | 501 |
} else if (what == 'counts') { |
414 |
- ggplt <- ggplot(df) + geom_line(aes_string(x='position', y='value', col='track'), size=2) + ylab('RPKM') |
|
415 |
- ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$track)))) |
|
502 |
+ ggplt <- ggplot(df) + geom_line(aes_string(x='position', y='value', col='ID'), size=2) |
|
503 |
+ if (logscale) { |
|
504 |
+ ggplt <- ggplt + ylab('log(RPKM)') |
|
505 |
+ } else { |
|
506 |
+ ggplt <- ggplt + ylab('RPKM') |
|
507 |
+ } |
|
508 |
+ # ggplt <- ggplt + scale_color_manual(values = getDistinctColors(length(unique(df$ID)))) |
|
509 |
+ maxcols[[condition]] <- length(unique(df$ID)) |
|
416 | 510 |
} |
417 | 511 |
ggplt <- ggplt + theme_bw() + xlab('distance from annotation in [bp]') |
418 | 512 |
if (length(region)>=2 & 'inside' %in% region) { |
... | ... |
@@ -420,16 +514,23 @@ plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c |
420 | 514 |
labels <- c(-bp.around.annotation, -bp.around.annotation/2, '0%', '50%', '100%', bp.around.annotation/2, bp.around.annotation) |
421 | 515 |
ggplt <- ggplt + scale_x_continuous(breaks=breaks, labels=labels) |
422 | 516 |
} |
423 |
- maxfolds[[condition]] <- max(df$value, na.rm=TRUE) |
|
517 |
+ foldsnoinf <- setdiff(df$value, c(Inf, -Inf)) |
|
518 |
+ maxfolds[[condition]] <- max(foldsnoinf, na.rm=TRUE) |
|
519 |
+ minfolds[[condition]] <- min(foldsnoinf, na.rm=TRUE) |
|
424 | 520 |
ggplts[[condition]] <- ggplt |
425 | 521 |
} |
426 |
- maxfolds <- unlist(maxfolds) |
|
522 |
+ maxfold <- max(unlist(maxfolds), na.rm=TRUE) |
|
523 |
+ minfold <- min(unlist(minfolds), na.rm=TRUE) |
|
524 |
+ maxcol <- max(unlist(maxcols), na.rm=TRUE) |
|
427 | 525 |
if (statistic == 'fraction' & what %in% c('combinations','peaks')) { |
428 | 526 |
ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_y_continuous(limits=c(0,1)) }) |
429 | 527 |
} else { |
430 |
- ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_y_continuous(limits=c(0,max(maxfolds, na.rm=TRUE)*1.1)) }) |
|
528 |
+ ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_y_continuous(limits=c(minfold*(1-sign(minfold)*0.1),maxfold*(1+sign(maxfold)*0.1))) }) |
|
431 | 529 |
} |
432 |
- if (class(hmm) == class.multivariate.hmm) { |
|
530 |
+ ggplts <- lapply(ggplts, function(ggplt) { ggplt + scale_color_manual(values=getDistinctColors(maxcol)) }) # Add color here like this because of weird bug |
|
531 |
+ if (class(hmm) == class.univariate.hmm) { |
|
532 |
+ return(ggplts[[1]]) |
|
533 |
+ } else if (class(hmm) == class.multivariate.hmm) { |
|
433 | 534 |
return(ggplts[[1]]) |
434 | 535 |
} else if (class(hmm) == class.combined.multivariate.hmm) { |
435 | 536 |
return(ggplts) |
... | ... |
@@ -448,12 +549,12 @@ plotEnrichment <- function(hmm, annotation, bp.around.annotation=10000, region=c |
448 | 549 |
#' @param annotation A \code{\link{GRanges}} object with the annotation of interest. |
449 | 550 |
#' @param bp.around.annotation An integer specifying the number of basepairs up- and downstream of the annotation for which the enrichment will be calculated. |
450 | 551 |
#' @param region A combination of \code{c('start','inside','end')} specifying the region of the annotation for which the enrichment will be calculated. Select \code{'start'} if you have a point-sized annotation like transcription start sites. Select \code{c('start','inside','end')} if you have long annotations like genes. |
451 |
-#' @param what One of \code{c('combinations','peaks','counts')} specifying which statistic to calculate. |
|
552 |
+#' @param what One of \code{c('combinations','peaks','counts')} specifying on which feature the statistic is calculated. |
|
452 | 553 |
#' @param num.intervals Number of intervals for enrichment 'inside' of annotation. |
453 | 554 |
#' @param statistic The statistic to calculate. Either 'fold' for fold enrichments or 'fraction' for fraction of bins falling into the annotation. |
454 | 555 |
#' @return A \code{list()} containing \code{data.frame()}s for enrichment of combinatorial states and binary states at the start, end and inside of the annotation. |
455 | 556 |
#' @importFrom S4Vectors as.factor subjectHits queryHits |
456 |
-enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation=10000, region=c('start','inside','end'), what=c('combinations','peaks','counts'), num.intervals=21, statistic='fold') { |
|
557 |
+enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation=10000, region=c('start','inside','end'), what='combinations', num.intervals=21, statistic='fold') { |
|
457 | 558 |
|
458 | 559 |
## Check user input |
459 | 560 |
if ((!what %in% c('combinations','peaks','counts')) | length(what) > 1) { |
... | ... |
@@ -475,7 +576,11 @@ enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation= |
475 | 576 |
if ('peaks' %in% what) { |
476 | 577 |
binstates <- dec2bin(bins$state, colnames=info$ID) |
477 | 578 |
# Remove replicates |
478 |
- binstates <- binstates[ ,info.dedup$ID] |
|
579 |
+ if (ncol(binstates) > 1) { |
|
580 |
+ binstates <- binstates[ ,info.dedup$ID] |
|
581 |
+ } else { |
|
582 |
+ binstates <- matrix(binstates[ ,info.dedup$ID], ncol=1) |
|
583 |
+ } |
|
479 | 584 |
colsums.binstates <- colSums(binstates) |
480 | 585 |
} |
481 | 586 |
if ('counts' %in% what) { |
... | ... |
@@ -490,9 +595,9 @@ enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation= |
490 | 595 |
widths.annotation <- width(annotation) - 1 |
491 | 596 |
annotation.1bp <- resize(annotation, 1, fix='start') |
492 | 597 |
# Initialize arrays |
493 |
- if ('peaks' %in% what) binstates.inside <- array(dim=c(num.intervals+1, length(info.dedup$ID)), dimnames=list(interval=intervals, track=info.dedup$ID)) |
|
598 |
+ if ('peaks' %in% what) binstates.inside <- array(dim=c(num.intervals+1, length(info.dedup$ID)), dimnames=list(interval=intervals, ID=info.dedup$ID)) |
|
494 | 599 |
if ('combinations' %in% what) combinations.inside <- array(dim=c(num.intervals+1, length(levels(bins$combination))), dimnames=list(interval=intervals, combination=levels(bins$combination))) |
495 |
- if ('counts' %in% what) counts.inside <- array(dim=c(num.intervals+1, length(info$ID)), dimnames=list(interval=intervals, track=info$ID)) |
|
600 |
+ if ('counts' %in% what) counts.inside <- array(dim=c(num.intervals+1, length(info$ID)), dimnames=list(interval=intervals, ID=info$ID)) |
|
496 | 601 |
|
497 | 602 |
for (interval in intervals) { |
498 | 603 |
shift <- widths.annotation * interval * c(1,-1,1)[as.integer(strand(annotation))] |
... | ... |
@@ -506,10 +611,15 @@ enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation= |
506 | 611 |
index <- c(index.inside.plus, index.inside.minus) |
507 | 612 |
index <- index[index>0 & index<=length(bins)] # index could cross chromosome boundaries, but we risk it |
508 | 613 |
if ('peaks' %in% what) { |
614 |
+ if (ncol(binstates) > 1) { |
|
615 |
+ binstates.index <- binstates[index,] |
|
616 |
+ } else { |
|
617 |
+ binstates.index <- matrix(binstates[index,], ncol=1) |
|
618 |
+ } |
|
509 | 619 |
if (statistic == 'fraction') { |
510 |
- binstates.inside[as.character(interval),] <- colSums(binstates[index,]) / length(index) # or colMeans |
|
620 |
+ binstates.inside[as.character(interval),] <- colSums(binstates.index) / length(index) # or colMeans |
|
511 | 621 |
} else if (statistic == 'fold') { |
512 |
- binstates.inside[as.character(interval),] <- colSums(binstates[index,]) / length(index) / colsums.binstates * length(bins) |
|
622 |
+ binstates.inside[as.character(interval),] <- colSums(binstates.index) / length(index) / colsums.binstates * length(bins) |
|
513 | 623 |
} |
514 | 624 |
} |
515 | 625 |
if ('combinations' %in% what) { |
... | ... |
@@ -546,17 +656,22 @@ enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation= |
546 | 656 |
index.start.plus <- index.start.plus[!is.na(index.start.plus)] |
547 | 657 |
index.start.minus <- index.start.minus[!is.na(index.start.minus)] |
548 | 658 |
# Occurrences at every bin position relative to feature |
549 |
- if ('peaks' %in% what) binstates.start <- array(dim=c(length(-lag:lag), length(info.dedup$ID)), dimnames=list(lag=-lag:lag, track=info.dedup$ID)) |
|
659 |
+ if ('peaks' %in% what) binstates.start <- array(dim=c(length(-lag:lag), length(info.dedup$ID)), dimnames=list(lag=-lag:lag, ID=info.dedup$ID)) |
|
550 | 660 |
if ('combinations' %in% what) combinations.start <- array(dim=c(length(-lag:lag), length(levels(bins$combination))), dimnames=list(lag=-lag:lag, combination=levels(bins$combination))) |
551 |
- if ('counts' %in% what) counts.start <- array(dim=c(length(-lag:lag), length(info$ID)), dimnames=list(lag=-lag:lag, track=info$ID)) |
|
661 |
+ if ('counts' %in% what) counts.start <- array(dim=c(length(-lag:lag), length(info$ID)), dimnames=list(lag=-lag:lag, ID=info$ID)) |
|
552 | 662 |
for (ilag in -lag:lag) { |
553 | 663 |
index <- c(index.start.plus+ilag, index.start.minus-ilag) |
554 | 664 |
index <- index[index>0 & index<=length(bins)] |
555 | 665 |
if ('peaks' %in% what) { |
666 |
+ if (ncol(binstates) > 1) { |
|
667 |
+ binstates.index <- binstates[index,] |
|
668 |
+ } else { |
|
669 |
+ binstates.index <- matrix(binstates[index,], ncol=1) |
|
670 |
+ } |
|
556 | 671 |
if (statistic == 'fraction') { |
557 |
- binstates.start[as.character(ilag),] <- colSums(binstates[index,]) / length(index) |
|
672 |
+ binstates.start[as.character(ilag),] <- colSums(binstates.index) / length(index) |
|
558 | 673 |
} else if (statistic == 'fold') { |
559 |
- binstates.start[as.character(ilag),] <- colSums(binstates[index,]) / length(index) / colsums.binstates * length(bins) |
|
674 |
+ binstates.start[as.character(ilag),] <- colSums(binstates.index) / length(index) / colsums.binstates * length(bins) |
|
560 | 675 |
} |
561 | 676 |
} |
562 | 677 |
if ('combinations' %in% what) { |
... | ... |
@@ -596,17 +711,22 @@ enrichmentAtAnnotation <- function(bins, info, annotation, bp.around.annotation= |
596 | 711 |
index.end.plus <- index.end.plus[!is.na(index.end.plus)] |
597 | 712 |
index.end.minus <- index.end.minus[!is.na(index.end.minus)] |
598 | 713 |
# Occurrences at every bin position relative to feature |
599 |
- if ('peaks' %in% what) binstates.end <- array(dim=c(length(-lag:lag), length(info.dedup$ID)), dimnames=list(lag=-lag:lag, track=info.dedup$ID)) |
|
714 |
+ if ('peaks' %in% what) binstates.end <- array(dim=c(length(-lag:lag), length(info.dedup$ID)), dimnames=list(lag=-lag:lag, ID=info.dedup$ID)) |
|
600 | 715 |
if ('combinations' %in% what) combinations.end <- array(dim=c(length(-lag:lag), length(levels(bins$combination))), dimnames=list(lag=-lag:lag, combination=levels(bins$combination))) |
601 |
- if ('counts' %in% what) counts.end <- array(dim=c(length(-lag:lag), length(info$ID)), dimnames=list(lag=-lag:lag, track=info$ID)) |
|
716 |
+ if ('counts' %in% what) counts.end <- array(dim=c(length(-lag:lag), length(info$ID)), dimnames=list(lag=-lag:lag, ID=info$ID)) |
|
602 | 717 |
for (ilag in -lag:lag) { |
603 | 718 |
index <- c(index.end.plus+ilag, index.end.minus-ilag) |
604 | 719 |
index <- index[index>0 & index<=length(bins)] |
605 | 720 |
if ('peaks' %in% what) { |
721 |
+ if (ncol(binstates) > 1) { |
|
722 |
+ binstates.index <- binstates[index,] |
|
723 |
+ } else { |
|
724 |
+ binstates.index <- matrix(binstates[index,], ncol=1) |
|
725 |
+ } |
|
606 | 726 |
if (statistic == 'fraction') { |
607 |
- binstates.end[as.character(ilag),] <- colSums(binstates[index,]) / length(index) |
|
727 |
+ binstates.end[as.character(ilag),] <- colSums(binstates.index) / length(index) |
|
608 | 728 |
} else if (statistic == 'fold') { |
609 |
- binstates.end[as.character(ilag),] <- colSums(binstates[index,]) / length(index) / colsums.binstates * length(bins) |
|
729 |
+ binstates.end[as.character(ilag),] <- colSums(binstates.index) / length(index) / colsums.binstates * length(bins) |
|
610 | 730 |
} |
611 | 731 |
} |
612 | 732 |
if ('combinations' %in% what) { |
... | ... |
@@ -874,9 +874,9 @@ exportGRangesAsBedFile <- function(gr, trackname, filename, namecol='combination |
874 | 874 |
|
875 | 875 |
# Write first line to file |
876 | 876 |
if (append) { |
877 |
- message('appending to file ',filename) |
|
877 |
+ ptm <- startTimedMessage('Appending to file ',filename, ' ...') |
|
878 | 878 |
} else { |
879 |
- message('Writing to file ',filename) |
|
879 |
+ ptm <- startTimedMessage('Writing to file ',filename, ' ...') |
|
880 | 880 |
cat("", file=filename.gz) |
881 | 881 |
} |
882 | 882 |
|
... | ... |
@@ -888,6 +888,10 @@ exportGRangesAsBedFile <- function(gr, trackname, filename, namecol='combination |
888 | 888 |
gr$score <- 0 |
889 | 889 |
} else { |
890 | 890 |
gr$score <- mcols(gr)[,scorecol] |
891 |
+ # Check if scorecolumn follows the UCSC convention |
|
892 |
+ if (min(gr$score) < 0 | max(gr$score) > 1000 | all(gr$score<=1.2) | !is.integer(gr$score)) { |
|
893 |
+ warning("Column '", scorecol, "' should contain integer values between 0 and 1000 for compatibility with the UCSC convention.") |
|
894 |
+ } |
|
891 | 895 |
} |
892 | 896 |
regions <- as.data.frame(gr)[c('chromosome','start','end','score','strand')] |
893 | 897 |
regions$strand <- sub("\\*", ".", regions$strand) |
... | ... |
@@ -897,13 +901,15 @@ exportGRangesAsBedFile <- function(gr, trackname, filename, namecol='combination |
897 | 901 |
regions$name <- mcols(gr)[,namecol] |
898 | 902 |
} |
899 | 903 |
regions <- regions[c('chromosome','start','end','name','score','strand')] |
900 |
- # Make score integer |
|
901 |
- if (min(regions$score) < 0 | max(regions$score) > 1000 | all(regions$score<=1.2) | !is.integer(regions$score)) { |
|
902 |
- warning("Column '", scorecol, "' should contain integer values between 0 and 1000 for compatibility with the UCSC convention.") |
|
903 |
- } |
|
904 |
- df <- cbind(regions, thickStart=regions$start, thickEnd=regions$end) |
|
904 |
+ df <- regions |
|
905 |
+ # Convert from 1-based closed to 0-based half open |
|
906 |
+ df$start <- df$start - 1 |
|
907 |
+ |
|
905 | 908 |
|
906 | 909 |
if (!is.null(colorcol)) { |
910 |
+ df <- cbind(regions, thickStart=regions$start, thickEnd=regions$end) |
|
911 |
+ # Convert from 1-based closed to 0-based half open |
|
912 |
+ df$thickStart <- df$thickStart - 1 |
|
907 | 913 |
# Generate the colors for each element in 'namecol' |
908 | 914 |
if (colorcol %in% names(mcols(gr))) { |
909 | 915 |
if (is.null(colors)) { |
... | ... |
@@ -921,9 +927,6 @@ exportGRangesAsBedFile <- function(gr, trackname, filename, namecol='combination |
921 | 927 |
} |
922 | 928 |
} |
923 | 929 |
|
924 |
- # Convert from 1-based closed to 0-based half open |
|
925 |
- df$start <- df$start - 1 |
|
926 |
- df$thickStart <- df$thickStart - 1 |
|
927 | 930 |
if (nrow(df) == 0) { |
928 | 931 |
warning('No regions in input') |
929 | 932 |
} else { |
... | ... |
@@ -931,5 +934,6 @@ exportGRangesAsBedFile <- function(gr, trackname, filename, namecol='combination |
931 | 934 |
} |
932 | 935 |
|
933 | 936 |
close(filename.gz) |
937 |
+ stopTimedMessage(ptm) |
|
934 | 938 |
|
935 | 939 |
} |
... | ... |
@@ -19,11 +19,15 @@ genomicFrequencies <- function(multi.hmm, combinations=NULL, per.mark=FALSE) { |
19 | 19 |
|
20 | 20 |
multi.hmm <- loadHmmsFromFiles(multi.hmm, check.class=c(class.multivariate.hmm, class.combined.multivariate.hmm))[[1]] |
21 | 21 |
bins <- multi.hmm$bins |
22 |
+ segs <- multi.hmm$segments |
|
23 |
+ peaks <- multi.hmm$peaks |
|
22 | 24 |
|
23 | 25 |
if (per.mark) { |
24 |
- binstates <- dec2bin(bins$state, colnames=multi.hmm$info$ID) |
|
25 |
- t <- colSums(binstates) / nrow(binstates) |
|
26 |
- return(t) |
|
26 |
+ # binstates <- dec2bin(bins$state, colnames=multi.hmm$info$ID) |
|
27 |
+ # t <- colSums(binstates) / nrow(binstates) |
|
28 |
+ t <- sapply(peaks, function(peak) { sum(as.numeric(width(peak))) }) / sum(as.numeric(width(bins))) |
|
29 |
+ s <- sapply(peaks, length) |
|
30 |
+ return(list(frequency=t, domains=s)) |
|
27 | 31 |
} |
28 | 32 |
|
29 | 33 |
if (class(multi.hmm)==class.multivariate.hmm) { |
... | ... |
@@ -35,18 +39,22 @@ genomicFrequencies <- function(multi.hmm, combinations=NULL, per.mark=FALSE) { |
35 | 39 |
} |
36 | 40 |
t <- table(bins$combination) / length(bins) |
37 | 41 |
t <- t[names(t) %in% comb.levels] |
38 |
- return(t) |
|
42 |
+ s <- table(segs$combination) |
|
43 |
+ s <- s[names(s) %in% comb.levels] |
|
44 |
+ return(list(frequency=t, domains=s)) |
|
39 | 45 |
|
40 | 46 |
} else if (class(multi.hmm)==class.combined.multivariate.hmm) { |
41 | 47 |
|
42 | 48 |
if (is.null(combinations)) { |
43 |
- comb.levels <- unique(as.vector(sapply(mcols(bins)[grepl('combination', names(mcols(bins)))], levels))) |
|
49 |
+ comb.levels <- unique(as.vector(sapply(getCombinations(bins), levels))) |
|
44 | 50 |
} else { |
45 | 51 |
comb.levels <- combinations |
46 | 52 |
} |
47 | 53 |
t <- sapply(mcols(bins)[grepl('combination', names(mcols(bins)))], function(x) { table(x) / length(bins) }) |
48 | 54 |
t <- t[rownames(t) %in% comb.levels,] |
49 |
- return(t) |
|
55 |
+ s <- sapply(mcols(segs)[grepl('combination', names(mcols(segs)))], table) |
|
56 |
+ s <- s[rownames(s) %in% comb.levels,] |
|
57 |
+ return(list(frequency=t, domains=s)) |
|
50 | 58 |
|
51 | 59 |
} |
52 | 60 |
} |
... | ... |
@@ -140,6 +148,13 @@ transitionFrequencies <- function(multi.hmms=NULL, combined.hmm=NULL, zero.state |
140 | 148 |
# Cumulative frequencies |
141 | 149 |
freqtrans$cumulative.frequency <- cumsum(freqtrans$frequency) |
142 | 150 |
stopTimedMessage(ptm) |
151 |
+ |
|
152 |
+ ### Number of domains ### |
|
153 |
+ ptm <- startTimedMessage("Number of domains ...") |
|
154 |
+ rle.gentrans <- rle(as.character(gentrans)) |
|
155 |
+ ndomains <- table(rle.gentrans$values) |
|
156 |
+ freqtrans$domains <- ndomains[freqtrans$transition] |
|
157 |
+ stopTimedMessage(ptm) |
|
143 | 158 |
|
144 | 159 |
### Assigning groups for frequency table ### |
145 | 160 |
ptm <- startTimedMessage("Assigning groups ...") |
... | ... |
@@ -157,6 +172,9 @@ transitionFrequencies <- function(multi.hmms=NULL, combined.hmm=NULL, zero.state |
157 | 172 |
## Remove unneeded column |
158 | 173 |
freqtrans$transition <- NULL |
159 | 174 |
|
175 |
+ ## Reorder columns |
|
176 |
+ freqtrans <- freqtrans[, c(grep('combination', names(freqtrans), value=TRUE), 'domains', 'frequency', 'cumulative.frequency', 'group')] |
|
177 |
+ |
|
160 | 178 |
## Return value ## |
161 | 179 |
return(list(table=freqtrans, per.bin=gentrans)) |
162 | 180 |
|
... | ... |
@@ -6,24 +6,40 @@ |
6 | 6 |
#' |
7 | 7 |
#' The function computes the euclidian distance between all \code{\link{colors}} and iteratively selects those that have the furthest closes distance to the set of already selected colors. |
8 | 8 |
#' |
9 |
-#' @param n Number of colors to select. |
|
9 |
+#' @param n Number of colors to select. If \code{n} is a character vector, \code{length(n)} will be taken as the number of colors and the colors will be named by \code{n}. |
|
10 | 10 |
#' @param start.color Color to start the selection process from. |
11 | 11 |
#' @param exclude.colors Character vector with colors that should not be used. |
12 | 12 |
#' @param exclude.rgb.above Exclude colors where all RGB values are above. This is useful to exclude whitish colors. |
13 |
+#' @param exclude.brightness.above Exclude colors where the 'brightness' value in HSV space is above. This is useful to obtain a matt palette. |
|
13 | 14 |
#' @return A character vector with colors. |
14 | 15 |
#' @author Aaron Taudt |
15 |
-#' @importFrom grDevices col2rgb |
|
16 |
-#' @importFrom stats dist |
|
17 | 16 |
#' @export |
17 |
+#' @importFrom grDevices col2rgb rgb2hsv |
|
18 |
+#' @importFrom stats dist |
|
18 | 19 |
#' @examples |
19 | 20 |
#'cols <- getDistinctColors(5) |
20 | 21 |
#'pie(rep(1,5), labels=cols, col=cols) |
21 | 22 |
#' |
22 |
-getDistinctColors <- function(n, start.color='blue4', exclude.colors=c('white','black','gray','grey'), exclude.rgb.above=210) { |
|
23 |
+getDistinctColors <- function(n, start.color='blue4', exclude.colors=c('white','black','gray','grey','\\<yellow\\>', 'yellow1', 'lemonchiffon'), exclude.brightness.above=1, exclude.rgb.above=210) { |
|
23 | 24 |
|
25 |
+ n.names <- NULL |
|
26 |
+ if (is.character(n)) { |
|
27 |
+ n.names <- n |
|
28 |
+ n <- length(n) |
|
29 |
+ } else if (is.factor(n)) { |
|
30 |
+ n.names <- as.character(n) |
|
31 |
+ n <- length(n) |
|
32 |
+ } |
|
24 | 33 |
cols <- grDevices::colors() |
34 |
+ |
|
25 | 35 |
# Exclude unwanted colors |
26 | 36 |
cols <- grep(paste(exclude.colors, collapse='|'), cols, invert=TRUE, value=TRUE) |
37 |
+ # Exclude too bright colors |
|
38 |
+ colsrgb <- grDevices::col2rgb(cols) |
|
39 |
+ colshsv <- t(mapply(grDevices::rgb2hsv, r=colsrgb[1,], g=colsrgb[2,], b=colsrgb[3,])) |
|
40 |
+ rownames(colshsv) <- cols |
|
41 |
+ colshsv <- colshsv[colshsv[,3] <= exclude.brightness.above,] |
|
42 |
+ cols <- rownames(colshsv) |
|
27 | 43 |
# Get RGB values |
28 | 44 |
rgbs <- t(grDevices::col2rgb(cols)) |
29 | 45 |
rownames(rgbs) <- cols |
... | ... |
@@ -52,6 +68,7 @@ getDistinctColors <- function(n, start.color='blue4', exclude.colors=c('white',' |
52 | 68 |
if (length(colors) > length(selected.cols)) { |
53 | 69 |
warning("Recycling colors because we only have ", length(selected.cols), " distinct colors to choose from.") |
54 | 70 |
} |
71 |
+ names(colors) <- n.names |
|
55 | 72 |
return(colors) |
56 | 73 |
|
57 | 74 |
} |
... | ... |
@@ -15,9 +15,15 @@ |
15 | 15 |
#' |
16 | 16 |
#'@examples |
17 | 17 |
#'## Make fixed-width bins of size 500kb and 1Mb |
18 |
-#'bins <- fixedWidthBins(assembly='mm10', chromosome.format='NCBI', binsizes=c(5e5,1e6)) |
|
18 |
+#'data(rn4_chrominfo) |
|
19 |
+#'chrom.lengths <- rn4_chrominfo$length |
|
20 |
+#'names(chrom.lengths) <- rn4_chrominfo$chromosome |
|
21 |
+#'bins <- fixedWidthBins(chrom.lengths=chrom.lengths, binsizes=c(5e5,1e6)) |
|
19 | 22 |
#'bins |
20 | 23 |
#' |
24 |
+#'## Make bins using NCBI server (requires internet connection) |
|
25 |
+#'# bins <- fixedWidthBins(assembly='mm10', chromosome.format='NCBI', binsizes=c(5e5,1e6)) |
|
26 |
+#' |
|
21 | 27 |
fixedWidthBins <- function(bamfile=NULL, assembly=NULL, chrom.lengths=NULL, chromosome.format, binsizes=1e6, chromosomes=NULL) { |
22 | 28 |
|
23 | 29 |
### Check user input ### |
... | ... |
@@ -78,11 +84,16 @@ fixedWidthBins <- function(bamfile=NULL, assembly=NULL, chrom.lengths=NULL, chro |
78 | 84 |
|
79 | 85 |
ptm <- startTimedMessage("Making fixed-width bins for bin size ", binsize, " ...") |
80 | 86 |
chrom.lengths.floor <- floor(chrom.lengths / binsize) * binsize |
81 |
- bins <- unlist(GenomicRanges::tileGenome(chrom.lengths.floor[chroms2use], tilewidth=binsize), use.names=FALSE) |
|
87 |
+ clfloor2use <- chrom.lengths.floor[chroms2use] |
|
88 |
+ clfloor2use <- clfloor2use[clfloor2use >= binsize] |
|
89 |
+ if (length(clfloor2use) == 0) { |
|
90 |
+ stop("All selected chromosomes are smaller than binsize ", binsize) |
|
91 |
+ } |
|
92 |
+ bins <- unlist(GenomicRanges::tileGenome(clfloor2use, tilewidth=binsize), use.names=FALSE) |
|
82 | 93 |
if (any(width(bins)!=binsize)) { |
83 | 94 |
stop("tileGenome failed") |
84 | 95 |
} |
85 |
- seqlengths(bins) <- chrom.lengths[chroms2use] |
|
96 |
+ seqlengths(bins) <- chrom.lengths[seqlevels(bins)] |
|
86 | 97 |
bins.list[[as.character(binsize)]] <- bins |
87 | 98 |
|
88 | 99 |
skipped.chroms <- setdiff(seqlevels(bins), as.character(unique(seqnames(bins)))) |
... | ... |
@@ -54,9 +54,9 @@ plotHistograms <- function(model, ...) { |
54 | 54 |
|
55 | 55 |
model <- suppressMessages( loadHmmsFromFiles(model, check.class=class.multivariate.hmm)[[1]] ) |
56 | 56 |
## Make fake uni.hmm and plot |
57 |
- binmapping <- dec2bin(names(model$mapping)) |
|
57 |
+ binmapping <- dec2bin(levels(model$bins$state), colnames=model$info$ID) |
|
58 | 58 |
ggplts <- list() |
59 |
- for (i1 in 1:length(model$info$ID)) { |
|
59 |
+ for (i1 in 1:ncol(model$bins$counts)) { |
|
60 | 60 |
uni.hmm <- list() |
61 | 61 |
uni.hmm$info <- model$info[i1,] |
62 | 62 |
uni.hmm$bins <- model$bins |
... | ... |
@@ -371,3 +371,71 @@ plotBoxplot <- function(model) { |
371 | 371 |
} |
372 | 372 |
|
373 | 373 |
|
374 |
+#' Plot a genome browser view |
|
375 |
+#' |
|
376 |
+#' Plot a simple genome browser view. This is useful for scripted genome browser snapshots. |
|
377 |
+#' |
|
378 |
+#' @param counts A \code{\link[GenomicRanges]{GRanges}} object with meta-data column 'counts'. |
|
379 |
+#' @param peaklist A named list() of \code{\link[GenomicRanges]{GRanges}} objects containing peak coordinates. |
|
380 |
+#' @param chr,start,end Chromosome, start and end coordinates for the plot. |
|
381 |
+#' @param countcol A character giving the color for the counts. |
|
382 |
+#' @param peakcols A character vector with colors for the peaks in \code{peaklist}. |
|
383 |
+#' @param style One of \code{c('peaks', 'density')}. |
|
384 |
+#' @param peakTrackHeight Relative height of the tracks given in \code{peaklist} compared to the \code{counts}. |
|
385 |
+#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object. |
|
386 |
+plotGenomeBrowser <- function(counts, peaklist=NULL, chr, start, end, countcol='black', peakcols=NULL, style='peaks', peakTrackHeight=5) { |
|
387 |
+ |
|
388 |
+ ## Select ranges to plot |
|
389 |
+ ranges2plot <- reduce(counts[counts@seqnames == chr & start(counts) >= start & start(counts) <= end]) |
|
390 |
+ |
|
391 |
+ ## Counts |
|
392 |
+ counts <- subsetByOverlaps(counts, ranges2plot) |
|
393 |
+ |
|
394 |
+ if (style == 'peaks') { |
|
395 |
+ # df.start <- data.frame(x=start(counts), counts=counts$counts) |
|
396 |
+ # df.end <- data.frame(x=end(counts), counts=counts$counts) |
|
397 |
+ # df <- rbind(df.start, df.end) |
|
398 |
+ # df <- df[order(df$x),] |
|
399 |
+ df <- data.frame(x=(start(counts)+end(counts))/2, counts=counts$counts) # plot triangles centered at middle of the bin |
|
400 |
+ ggplt <- ggplot(df) + geom_area(aes_string(x='x', y='counts')) + theme(panel.grid = element_blank(), panel.background = element_blank(), axis.text.x = element_blank(), axis.title = element_blank(), axis.ticks.x = element_blank(), axis.line = element_blank()) |
|
401 |
+ maxcounts <- max(counts$counts) |
|
402 |
+ ggplt <- ggplt + scale_y_continuous(breaks=c(0, maxcounts)) |
|
403 |
+ } else if (style == 'density') { |
|
404 |
+ df <- data.frame(xmin=start(counts), xmax=end(counts), counts=counts$counts) |
|
405 |
+ |
|
406 |
+ # # Rolling mean |
|
407 |
+ # n <- 10 |
|
408 |
+ # cx <- cumsum(df$counts) |
|
409 |
+ # rsum <- (cx[n:length(df$counts)] - c(0, cx[1:(length(df$counts) - n)])) / n |
|
410 |
+ # df$counts[(n%/%2 + 1):(length(df$counts)-(n-1)%/%2)] <- rsum |
|
411 |
+ |
|
412 |
+ # # Expand high peaks |
|
413 |
+ # fact <- 1e-5 |
|
414 |
+ # df$xmin <- df$xmin - (end-start)*df$counts * fact |
|
415 |
+ # df$xmax <- df$xmax + (end-start)*df$counts * fact |
|
416 |
+ |
|
417 |
+ ggplt <- ggplot(df) + geom_rect(aes_string(xmin='xmin', xmax='xmax', ymin=0, ymax=4, alpha='counts')) + theme(panel.grid = element_blank(), panel.background = element_blank(), axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.line = element_blank()) |
|
418 |
+ } else { |
|
419 |
+ stop("Unknown value '", style, "' for parameter 'style'. Must be one of c('peaks', 'density').") |
|
420 |
+ } |
|
421 |
+ |
|
422 |
+ ## Peaks |
|
423 |
+ if (!is.null(peaklist)) { |
|
424 |
+ if (is.null(peakcols)) { |
|
425 |
+ peakcols <- getDistinctColors(length(peaklist)) |
|
426 |
+ } |
|
427 |
+ for (i1 in 1:length(peaklist)) { |
|
428 |
+ p <- peakTrackHeight |
|
429 |
+ peaks <- subsetByOverlaps(peaklist[[i1]], ranges2plot) |
|
430 |
+ if (length(peaks) > 0) { |
|
431 |
+ df <- data.frame(start=start(peaks), end=end(peaks), ymin=-p*i1, ymax=-p*i1+0.9*p) |
|
432 |
+ ggplt <- ggplt + geom_rect(data=df, mapping=aes_string(xmin='start', xmax='end', ymin='ymin', ymax='ymax'), col=peakcols[i1], fill=peakcols[i1]) |
|
433 |
+ } |
|
434 |
+ trackname <- names(peaklist)[i1] |
|
435 |
+ df <- data.frame(x=start(counts)[1], y=-p*i1+0.5*p, label=trackname) |
|
436 |
+ ggplt <- ggplt + geom_text(data=df, mapping=aes_string(x='x', y='y', label='label'), vjust=0.5, hjust=0.5, col=peakcols[i1]) |
|
437 |
+ } |
|
438 |
+ } |
|
439 |
+ |
|
440 |
+ return(ggplt) |
|
441 |
+} |
|
374 | 442 |
\ No newline at end of file |
375 | 443 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,49 @@ |
1 |
+#' Print uniHMM object |
|
2 |
+#' |
|
3 |
+#' @param x An \code{\link{uniHMM}} object. |
|
4 |
+#' @param ... Ignored. |
|
5 |
+#' @return An invisible \code{NULL}. |
|
6 |
+#' @export |
|
7 |
+print.uniHMM <- function(x, ...) { |
|
8 |
+ |
|
9 |
+ message("$info") |
|
10 |
+ print(x$info) |
|
11 |
+ message("\n$peaks") |
|
12 |
+ print(x$peaks) |
|
13 |
+ message("\nUse the list operator $ to access all elements of this object.") |
|
14 |
+ |
|
15 |
+} |
|
16 |
+ |
|
17 |
+ |
|
18 |
+#' Print multiHMM object |
|
19 |
+#' |
|
20 |
+#' @param x An \code{\link{multiHMM}} object. |
|
21 |
+#' @param ... Ignored. |
|
22 |
+#' @return An invisible \code{NULL}. |
|
23 |
+#' @export |
|
24 |
+print.multiHMM <- function(x, ...) { |
|
25 |
+ |
|
26 |
+ message("$info") |
|
27 |
+ print(x$info) |
|
28 |
+ message("\n$segments") |
|
29 |
+ print(x$segments) |
|
30 |
+ message("\nUse the list operator $ to access all elements of this object.") |
|
31 |
+ |
|
32 |
+} |
|
33 |
+ |
|
34 |
+ |
|
35 |
+#' Print combinedMultiHMM object |
|
36 |
+#' |
|
37 |
+#' @param x An \code{\link{combinedMultiHMM}} object. |
|
38 |
+#' @param ... Ignored. |
|
39 |
+#' @return An invisible \code{NULL}. |
|
40 |
+#' @export |
|
41 |
+print.combinedMultiHMM <- function(x, ...) { |
|
42 |
+ |
|
43 |
+ message("$info") |
|
44 |
+ print(x$info) |
|
45 |
+ message("\n$segments") |
|
46 |
+ print(x$segments) |
|
47 |
+ message("\nUse the list operator $ to access all elements of this object.") |
|
48 |
+ |
|
49 |
+} |
... | ... |
@@ -35,10 +35,12 @@ readCustomBedFile <- function(bedfile, col.names=c('chromosome','start','end','n |
35 | 35 |
} else { |
36 | 36 |
data$strand <- '*' |
37 | 37 |
} |
38 |
- # Adjust chromosome format |
|
39 |
- data$chromosome <- sub('^chr', '', data$chromosome) |
|
40 |
- if (chromosome.format=='UCSC') { |
|
41 |
- data$chromosome <- paste0('chr', data$chromosome) |
|
38 |
+ if (nrow(data) > 0) { |
|
39 |
+ # Adjust chromosome format |
|
40 |
+ data$chromosome <- sub('^chr', '', data$chromosome) |
|
41 |
+ if (chromosome.format=='UCSC') { |
|
42 |
+ data$chromosome <- paste0('chr', data$chromosome) |
|
43 |
+ } |
|
42 | 44 |
} |
43 | 45 |
# Convert to GRanges object |
44 | 46 |
gr <- methods::as(data, 'GRanges') |
45 | 47 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,136 @@ |
1 |
+#' Remove condition from model |
|
2 |
+#' |
|
3 |
+#' Remove a condition from a \code{\link{combinedMultiHMM}} object. |
|
4 |
+#' |
|
5 |
+#' @param model A \code{\link{combinedMultiHMM}} object or file which contains such an object. |
|
6 |
+#' @param conditions A character vector with the condition(s) to be removed. |
|
7 |
+#' @return The input \code{\link{combinedMultiHMM}} object with specified conditions removed. |
|
8 |
+#' @export |
|
9 |
+#' @examples |
|
10 |
+#'## Get an example HMM |
|
11 |
+#'file <- system.file("data","combined_mode-differential.RData", |
|
12 |
+#' package="chromstaR") |
|
13 |
+#'model <- get(load(file)) |
|
14 |
+#' |
|
15 |
+#'## Print available conditions |
|
16 |
+#'print(unique(model$info$condition)) |
|
17 |
+#' |
|
18 |
+#'## Remove condition SHR |
|
19 |
+#'new.model <- removeCondition(model, conditions='SHR') |
|
20 |
+#' |
|
21 |
+removeCondition <- function(model, conditions) { |
|
22 |
+ |
|
23 |
+ model <- loadHmmsFromFiles(model, check.class=class.combined.multivariate.hmm)[[1]] |
|
24 |
+ |
|
25 |
+ ### Check input ### |
|
26 |
+ cond.setdiff <- setdiff(conditions, model$info$condition) |
|
27 |
+ if (length(cond.setdiff) == length(conditions)) { |
|
28 |
+ warning("Conditions ", paste0(cond.setdiff, collapse=', '), " could not be found. Doing nothing.") |
|
29 |
+ return(model) |
|
30 |
+ } |
|
31 |
+ if (length(cond.setdiff) > 0) { |
|
32 |
+ warning("Conditions ", paste0(cond.setdiff, collapse=', '), " could not be found.") |
|
33 |
+ } |
|
34 |
+ conditions <- setdiff(conditions, cond.setdiff) |
|
35 |
+ |
|
36 |
+ ### Info ### |
|
37 |
+ info <- model$info |
|
38 |
+ mask <- !info$condition %in% conditions |
|
39 |
+ info <- info[mask,] |
|
40 |
+ model$info <- info |
|
41 |
+ |
|
42 |
+ ### Bins ### |
|
43 |
+ # State |
|
44 |
+ if (!is.null(model$bins$state)) { |
|
45 |
+ ptm <- startTimedMessage("Recalculating states ...") |
|
46 |
+ states <- model$bins$state |
|
47 |
+ binstates <- dec2bin(states, colnames=model$info$ID) |
|
48 |
+ removeconds <- paste0(paste0('-', conditions, '-'), collapse='|') |
|
49 |
+ binstates <- binstates[, grep(removeconds, colnames(binstates), invert=TRUE)] |
|
50 |
+ state <- factor(bin2dec(binstates)) |
|
51 |
+ model$bins$state <- state |
|
52 |
+ stopTimedMessage(ptm) |
|
53 |
+ } |
|
54 |
+ # Combinations |
|
55 |
+ ptm <- startTimedMessage("Selecting conditions ...") |
|
56 |
+ removeconds <- paste0('combination.', conditions) |
|
57 |
+ mcols(model$bins)[removeconds] <- NULL |
|
58 |
+ # Counts |
|
59 |
+ if (!is.null(model$bins$counts)) { |
|
60 |
+ counts <- model$bins$counts |
|
61 |
+ removeconds <- paste0(paste0('-', conditions, '-'), collapse='|') |
|
62 |
+ keepconds <- grep(removeconds, colnames(counts), invert=TRUE, value=TRUE) |
|
63 |
+ counts <- counts[,keepconds] |
|
64 |
+ if (class(counts) != "matrix") { |
|
65 |
+ counts <- matrix(counts, ncol=1, dimnames=list(NULL, keepconds)) |
|
66 |
+ } |
|
67 |
+ model$bins$counts <- counts |
|
68 |
+ } |
|
69 |
+ if (!is.null(model$bins$posteriors)) { |
|
70 |
+ posteriors <- model$bins$posteriors |
|
71 |
+ removeconds <- paste0(paste0('-', conditions, '-'), collapse='|') |
|
72 |
+ keepconds <- grep(removeconds, colnames(posteriors), invert=TRUE, value=TRUE) |
|