... | ... |
@@ -1,15 +1,37 @@ |
1 | 1 |
Package: chromstaR |
2 | 2 |
Type: Package |
3 |
-Title: Analysis of ChIP-Seq data for single and multiple histone modifications |
|
4 |
-Version: 0.9.4 |
|
3 |
+Title: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data |
|
4 |
+Version: 0.98.0 |
|
5 | 5 |
Date: 2014-05-03 |
6 | 6 |
Author: Aaron Taudt, Maria Colome Tatche, Matthias Heinig, Minh Anh Nguyen |
7 | 7 |
Maintainer: Aaron Taudt <a.s.taudt@umcg.nl> |
8 |
-Description: not yet |
|
9 |
-Depends: R (>= 3.1.0), S4Vectors, IRanges, GenomicRanges, ggplot2, reshape2 |
|
10 |
-Imports: Rsamtools, GenomicAlignments, BSgenome, BiocGenerics |
|
11 |
-Suggests: grid, rtracklayer, mvtnorm, biomaRt, biovizBase, ggbio, |
|
8 |
+Description: This package implements functions for combinatorial and differential analysis of ChIP-seq data. It includes uni- and multivariate peak-calling, export to genome browser viewable files, and functions for enrichment analyses. |
|
9 |
+Depends: |
|
10 |
+ R (>= 3.3.0), |
|
11 |
+ GenomicRanges, |
|
12 |
+ ggplot2, |
|
13 |
+Imports: |
|
14 |
+ utils, |
|
15 |
+ grDevices, |
|
16 |
+ graphics, |
|
17 |
+ stats, |
|
18 |
+ S4Vectors, |
|
19 |
+ GenomeInfoDb, |
|
20 |
+ IRanges, |
|
21 |
+ reshape2, |
|
22 |
+ Rsamtools, |
|
23 |
+ GenomicAlignments, |
|
24 |
+ BSgenome, |
|
25 |
+ BiocGenerics |
|
26 |
+Suggests: |
|
27 |
+ grid, |
|
28 |
+ rtracklayer, |
|
29 |
+ biomaRt, |
|
30 |
+ biovizBase, |
|
31 |
+ ggbio, |
|
12 | 32 |
testthat |
13 |
-License: GPL |
|
33 |
+License: GPL-3 |
|
14 | 34 |
LazyLoad: yes |
15 | 35 |
Packaged: 2014-05-03 00:00:00 CET+1; Taudt |
36 |
+biocViews: Software, DifferentialPeakCalling, HiddenMarkovModel, ChIPSeq, MultipleComparison |
|
37 |
+RoxygenNote: 5.0.1 |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-# Generated by roxygen2 (4.1.1): do not edit by hand |
|
1 |
+# Generated by roxygen2: do not edit by hand |
|
2 | 2 |
|
3 | 3 |
S3method(plot,GRanges) |
4 | 4 |
S3method(plot,character) |
... | ... |
@@ -43,17 +43,37 @@ export(summarizeDiffPeaks) |
43 | 43 |
export(summarizePeaks) |
44 | 44 |
export(transitionFrequencies) |
45 | 45 |
export(unis2pseudomulti) |
46 |
-import(BiocGenerics) |
|
46 |
+import(GenomeInfoDb) |
|
47 | 47 |
import(GenomicRanges) |
48 | 48 |
import(IRanges) |
49 |
-import(S4Vectors) |
|
50 | 49 |
import(ggplot2) |
51 | 50 |
import(reshape2) |
51 |
+importFrom(BiocGenerics,unlist) |
|
52 | 52 |
importFrom(GenomicAlignments,first) |
53 |
-importFrom(GenomicAlignments,readGAlignmentPairsFromBam) |
|
54 |
-importFrom(GenomicAlignments,readGAlignmentsFromBam) |
|
53 |
+importFrom(GenomicAlignments,readGAlignmentPairs) |
|
54 |
+importFrom(GenomicAlignments,readGAlignments) |
|
55 | 55 |
importFrom(Rsamtools,ScanBamParam) |
56 | 56 |
importFrom(Rsamtools,indexBam) |
57 | 57 |
importFrom(Rsamtools,scanBamFlag) |
58 | 58 |
importFrom(Rsamtools,scanBamHeader) |
59 |
-useDynLib(chromstaR) |
|
59 |
+importFrom(S4Vectors,as.factor) |
|
60 |
+importFrom(S4Vectors,queryHits) |
|
61 |
+importFrom(S4Vectors,subjectHits) |
|
62 |
+importFrom(grDevices,col2rgb) |
|
63 |
+importFrom(graphics,hist) |
|
64 |
+importFrom(graphics,plot) |
|
65 |
+importFrom(stats,aggregate) |
|
66 |
+importFrom(stats,cutree) |
|
67 |
+importFrom(stats,dist) |
|
68 |
+importFrom(stats,dnbinom) |
|
69 |
+importFrom(stats,dnorm) |
|
70 |
+importFrom(stats,hclust) |
|
71 |
+importFrom(stats,pnbinom) |
|
72 |
+importFrom(stats,qnorm) |
|
73 |
+importFrom(stats,rbinom) |
|
74 |
+importFrom(stats,rnbinom) |
|
75 |
+importFrom(stats,runif) |
|
76 |
+importFrom(stats,stepfun) |
|
77 |
+importFrom(utils,read.table) |
|
78 |
+importFrom(utils,write.table) |
|
79 |
+useDynLib(chromstaR, .registration = TRUE, .fixes = "") |
... | ... |
@@ -9,6 +9,7 @@ |
9 | 9 |
#' @param percentages Set to \code{TRUE} if you want to have percentages (0 to 1) instead of fold enrichments returned. Note that in this case different features are not directly comparable. |
10 | 10 |
#' @return A named array with fold enrichments. If \code{percentages=TRUE} a list with arrays of percentage (0 to 1) enrichments. |
11 | 11 |
#' @author Aaron Taudt |
12 |
+#' @importFrom S4Vectors subjectHits queryHits |
|
12 | 13 |
#' @export |
13 | 14 |
foldEnrichment <- function(multi.hmm, featurelist, combinations=NULL, percentages=FALSE) { |
14 | 15 |
|
... | ... |
@@ -35,11 +36,11 @@ foldEnrichment <- function(multi.hmm, featurelist, combinations=NULL, percentage |
35 | 36 |
feature <- featurelist[[ifeat]] |
36 | 37 |
ind <- findOverlaps(bins.mask, feature) |
37 | 38 |
|
38 |
- binsinfeature <- bins.mask[unique(queryHits(ind))] |
|
39 |
+ binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))] |
|
39 | 40 |
sum.binsinfeature <- sum(as.numeric(width(binsinfeature))) |
40 | 41 |
perc.combstate.in.feature[ifeat,icomb] <- sum.binsinfeature / combstate.length |
41 | 42 |
|
42 |
- featuresinbins <- feature[unique(subjectHits(ind))] |
|
43 |
+ featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))] |
|
43 | 44 |
sum.featuresinbins <- sum(as.numeric(width(featuresinbins))) |
44 | 45 |
perc.feature.in.combstate[ifeat,icomb] <- sum.featuresinbins / feature.lengths[[ifeat]] |
45 | 46 |
|
... | ... |
@@ -105,6 +106,7 @@ expressionOverlap <- function(multi.hmm, expression, combinations=NULL, return.m |
105 | 106 |
#' @param combinations A vector with combinations for which the expression overlap will be calculated. If \code{NULL} all combinations will be considered. |
106 | 107 |
#' @return A list with vectors of mean expression values per percentile for each combinatorial state. |
107 | 108 |
#' @author Aaron Taudt |
109 |
+#' @importFrom S4Vectors queryHits |
|
108 | 110 |
#' @export |
109 | 111 |
expressionAtPercentageOverlap <- function(multi.hmm, expression, combinations=NULL) { |
110 | 112 |
|
... | ... |
@@ -122,7 +124,7 @@ expressionAtPercentageOverlap <- function(multi.hmm, expression, combinations=NU |
122 | 124 |
for (icomb in 1:length(comb.levels)) { |
123 | 125 |
mask <- bins$combination == comb.levels[icomb] |
124 | 126 |
ind <- findOverlaps(expression, bins[mask]) |
125 |
- rle <- rle(queryHits(ind)) |
|
127 |
+ rle <- rle(S4Vectors::queryHits(ind)) |
|
126 | 128 |
expression$num.bins <- 0 |
127 | 129 |
expression$num.bins[rle$values] <- rle$lengths |
128 | 130 |
expression$genewidth <- width(expression) |
... | ... |
@@ -206,7 +208,7 @@ transitionFrequencies <- function(multi.hmms=NULL, zero.states="", combstates=NU |
206 | 208 |
levels.combstates <- setdiff(levels.combstates, zero.states) |
207 | 209 |
levels.combstates <- gsub('-','_',levels.combstates) |
208 | 210 |
for (combination in levels.combstates) { |
209 |
- mask <- intersect(grep(paste0('\\<',combination,'\\>'), freqtrans$transition), grep(paste(paste0('\\<',setdiff(levels.combstates,combination),'\\>'), collapse='|'), freqtrans$transition, invert=T)) |
|
211 |
+ mask <- intersect(grep(paste0('\\<',combination,'\\>'), freqtrans$transition), grep(paste(paste0('\\<',setdiff(levels.combstates,combination),'\\>'), collapse='|'), freqtrans$transition, invert=TRUE)) |
|
210 | 212 |
freqtrans$group[mask] <- paste0('stage-specific ',combination) |
211 | 213 |
mask <- sapply(gregexpr(paste0('\\<',combination,'\\>'),freqtrans$transition), length) == num.models |
212 | 214 |
freqtrans$group[mask] <- paste0('constant ', combination) |
... | ... |
@@ -21,13 +21,13 @@ bin.annotation = function(annotation.file, reference.genome.file, outputfolder=" |
21 | 21 |
|
22 | 22 |
# Read in the data |
23 | 23 |
cat("Reading file",basename(annotation.file),"...") |
24 |
- annotation = read.table(annotation.file, sep="\t", header=TRUE) |
|
24 |
+ annotation = utils::read.table(annotation.file, sep="\t", header=TRUE) |
|
25 | 25 |
annotation$txDiff = annotation$txEnd - annotation$txStart |
26 | 26 |
annotation$cdsDiff = annotation$cdsEnd - annotation$cdsStart |
27 | 27 |
cat(" done\n") |
28 | 28 |
|
29 | 29 |
# Reference genome |
30 |
- reference.genome = read.table(reference.genome.file, row.names=1) |
|
30 |
+ reference.genome = utils::read.table(reference.genome.file, row.names=1) |
|
31 | 31 |
|
32 | 32 |
# Do the loop for all binsizes |
33 | 33 |
for (binsize in binsizes) { |
... | ... |
@@ -64,6 +64,7 @@ bedGraph2binned <- function(bedGraphfile, assembly, chrom.length.file=NULL, outp |
64 | 64 |
#' @param remove.duplicate.reads Logical indicating whether duplicate reads should be removed. NOTE: Duplicate removal only works if your duplicate reads are properly flagged in the BAM file. |
65 | 65 |
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments due to mapping errors of paired end reads. |
66 | 66 |
#' @return The function produces a \code{\link{GRanges}} object with one meta data column 'reads' that contains the read count. This binned data will be either written to file (\code{save.as.RData=FALSE}) or given as return value (\code{save.as.RData=FALSE}). |
67 |
+#' @importFrom utils read.table |
|
67 | 68 |
align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=FALSE, chrom.length.file=NULL, outputfolder="binned_data", binsizes=500, chromosomes=NULL, save.as.RData=FALSE, min.mapq=10, downsample.to.reads=NULL, remove.duplicate.reads=FALSE, max.fragment.width=1000) { |
68 | 69 |
|
69 | 70 |
## Create outputfolder if not exists |
... | ... |
@@ -72,25 +73,25 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
72 | 73 |
} |
73 | 74 |
|
74 | 75 |
### Read in the data |
75 |
- message("Reading file ",basename(file)," ...", appendLF=F); ptm <- proc.time() |
|
76 |
+ ptm <- startTimedMessage("Reading file ",basename(file)," ...") |
|
76 | 77 |
## BED (0-based) |
77 | 78 |
if (format == "bed") { |
78 | 79 |
if (!is.null(chrom.length.file)) { |
79 | 80 |
# File with chromosome lengths (1-based) |
80 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
81 |
+ chrom.lengths.df <- utils::read.table(chrom.length.file) |
|
81 | 82 |
chrom.lengths <- chrom.lengths.df[,2] |
82 | 83 |
names(chrom.lengths) <- chrom.lengths.df[,1] |
83 | 84 |
} else { |
84 | 85 |
chrom.lengths <- getChromLengths(assembly=assembly) |
85 | 86 |
} |
86 | 87 |
# File with reads, determine classes first for faster import (0-based) |
87 |
- tab5rows <- read.table(file, nrows=5) |
|
88 |
+ tab5rows <- utils::read.table(file, nrows=5) |
|
88 | 89 |
classes.in.bed <- sapply(tab5rows, class) |
89 | 90 |
classes <- rep("NULL",length(classes.in.bed)) |
90 | 91 |
classes[1:3] <- classes.in.bed[1:3] |
91 |
- data <- read.table(file, colClasses=classes) |
|
92 |
+ data <- utils::read.table(file, colClasses=classes) |
|
92 | 93 |
# Convert to GRanges object |
93 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data))) # start+1 to go from [0,x) -> [1,x] |
|
94 |
+ data <- GenomicRanges::GRanges(seqnames=data[,1], ranges=IRanges(start=data[,2]+1, end=data[,3]), strand="*") # start+1 to go from [0,x) -> [1,x] |
|
94 | 95 |
seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
95 | 96 |
## BAM (1-based) |
96 | 97 |
} else if (format == "bam") { |
... | ... |
@@ -99,23 +100,23 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
99 | 100 |
} else if (format == "bedGraph") { |
100 | 101 |
if (!is.null(chrom.length.file)) { |
101 | 102 |
# File with chromosome lengths (1-based) |
102 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
103 |
+ chrom.lengths.df <- utils::read.table(chrom.length.file) |
|
103 | 104 |
chrom.lengths <- chrom.lengths.df[,2] |
104 | 105 |
names(chrom.lengths) <- chrom.lengths.df[,1] |
105 | 106 |
} else { |
106 | 107 |
chrom.lengths <- getChromLengths(assembly=assembly) |
107 | 108 |
} |
108 | 109 |
# File with reads, determine classes first for faster import |
109 |
- tab5rows <- read.table(file, nrows=5) |
|
110 |
+ tab5rows <- utils::read.table(file, nrows=5) |
|
110 | 111 |
classes.in.bed <- sapply(tab5rows, class) |
111 | 112 |
classes <- rep("NULL",length(classes.in.bed)) |
112 | 113 |
classes[1:4] <- classes.in.bed[1:4] |
113 |
- data <- read.table(file, colClasses=classes) |
|
114 |
+ data <- utils::read.table(file, colClasses=classes) |
|
114 | 115 |
# Convert to GRanges object |
115 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data)), signal=data[,4]) # start+1 to go from [0,x) -> [1,x] |
|
116 |
+ data <- GenomicRanges::GRanges(seqnames=data[,1], ranges=IRanges(start=data[,2]+1, end=data[,3]), strand="*", signal=data[,4]) # start+1 to go from [0,x) -> [1,x] |
|
116 | 117 |
seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
117 | 118 |
} |
118 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
119 |
+ stopTimedMessage(ptm) |
|
119 | 120 |
|
120 | 121 |
## Select chromosomes to bin |
121 | 122 |
if (is.null(chromosomes)) { |
... | ... |
@@ -126,9 +127,9 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
126 | 127 |
### Downsampling |
127 | 128 |
if (!is.null(downsample.to.reads)) { |
128 | 129 |
if (downsample.to.reads <= length(data)) { |
129 |
- message("downsampling to ", downsample.to.reads, " reads ...", appendLF=F); ptm <- proc.time() |
|
130 |
+ ptm <- startTimedMessage("downsampling to ", downsample.to.reads, " reads ...") |
|
130 | 131 |
data <- data[sort(sample(1:length(data), size=downsample.to.reads, replace=FALSE))] |
131 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
132 |
+ stopTimedMessage(ptm) |
|
132 | 133 |
} else { |
133 | 134 |
warning("No downsampling done because there are only ", length(data), " reads in the data.") |
134 | 135 |
} |
... | ... |
@@ -139,7 +140,7 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
139 | 140 |
message("Binning into bin size ",binsize) |
140 | 141 |
|
141 | 142 |
### Iterate over all chromosomes |
142 |
- message(" binning genome ...", appendLF=F); ptm <- proc.time() |
|
143 |
+ ptm <- startTimedMessage(" binning genome ...") |
|
143 | 144 |
binned.data <- GenomicRanges::GRangesList() |
144 | 145 |
for (chromosome in chroms2use) { |
145 | 146 |
## Check last incomplete bin |
... | ... |
@@ -161,9 +162,9 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
161 | 162 |
# end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired |
162 | 163 |
|
163 | 164 |
## Create binned chromosome as GRanges object |
164 |
- i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbins), |
|
165 |
+ i.binned.data <- GenomicRanges::GRanges(seqnames = rep(chromosome, numbins), |
|
165 | 166 |
ranges = IRanges(start=start, end=end), |
166 |
- strand = Rle(strand("*"), numbins) |
|
167 |
+ strand = "*" |
|
167 | 168 |
) |
168 | 169 |
seqlengths(i.binned.data) <- seqlengths(data)[chromosome] |
169 | 170 |
|
... | ... |
@@ -171,12 +172,12 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
171 | 172 |
binned.data[[chromosome]] <- i.binned.data |
172 | 173 |
) |
173 | 174 |
} |
174 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
175 |
+ stopTimedMessage(ptm) |
|
175 | 176 |
binned.data <- unlist(binned.data) |
176 | 177 |
names(binned.data) <- NULL |
177 | 178 |
|
178 | 179 |
## Count overlaps |
179 |
- message(" counting overlaps ...", appendLF=F); ptm <- proc.time() |
|
180 |
+ ptm <- startTimedMessage(" counting overlaps ...") |
|
180 | 181 |
if (format=="bam" | format=="bed") { |
181 | 182 |
reads <- GenomicRanges::countOverlaps(binned.data, data) |
182 | 183 |
} else if (format=="bedGraph") { |
... | ... |
@@ -194,8 +195,8 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
194 | 195 |
} |
195 | 196 |
reads[read.idx] <- maxvalues |
196 | 197 |
} |
197 |
- mcols(binned.data)$reads <- reads |
|
198 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
198 |
+ mcols(binned.data)$counts <- reads |
|
199 |
+ stopTimedMessage(ptm) |
|
199 | 200 |
|
200 | 201 |
if (length(binned.data) == 0) { |
201 | 202 |
warning(paste0("The bin size of ",binsize," is larger than any of the chromosomes.")) |
... | ... |
@@ -204,10 +205,10 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
204 | 205 |
|
205 | 206 |
if (save.as.RData==TRUE) { |
206 | 207 |
# Print to file |
207 |
- filename <- paste0(basename(file),"_binsize",format(binsize, scientific=F, trim=T),".RData") |
|
208 |
- message("Saving to file ...", appendLF=F); ptm <- proc.time() |
|
208 |
+ filename <- paste0(basename(file),"_binsize",format(binsize, scientific=F, trim=TRUE),".RData") |
|
209 |
+ ptm <- startTimedMessage("Saving to file ...") |
|
209 | 210 |
save(binned.data, file=file.path(outputfolder,filename) ) |
210 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
211 |
+ stopTimedMessage(ptm) |
|
211 | 212 |
} else { |
212 | 213 |
return(binned.data) |
213 | 214 |
} |
... | ... |
@@ -229,7 +230,7 @@ align2binned <- function(file, format, assembly, bamindex=file, pairedEndReads=F |
229 | 230 |
#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=NULL} to keep all reads. |
230 | 231 |
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments due to mapping errors of paired end reads. |
231 | 232 |
#' @importFrom Rsamtools indexBam scanBamHeader ScanBamParam scanBamFlag |
232 |
-#' @importFrom GenomicAlignments readGAlignmentPairsFromBam readGAlignmentsFromBam first |
|
233 |
+#' @importFrom GenomicAlignments readGAlignmentPairs readGAlignments first |
|
233 | 234 |
bam2GRanges <- function(file, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, keep.duplicate.reads=TRUE, min.mapq=10, max.fragment.width=1000) { |
234 | 235 |
|
235 | 236 |
## Check if bamindex exists |
... | ... |
@@ -258,25 +259,25 @@ bam2GRanges <- function(file, bamindex=file, chromosomes=NULL, pairedEndReads=FA |
258 | 259 |
warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.')) |
259 | 260 |
} |
260 | 261 |
## Import the file into GRanges |
261 |
- gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
|
262 |
+ gr <- GenomicRanges::GRanges(seqnames=chroms2use, ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
|
262 | 263 |
if (keep.duplicate.reads) { |
263 | 264 |
if (pairedEndReads) { |
264 |
- data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq')) |
|
265 |
+ data.raw <- GenomicAlignments::readGAlignmentPairs(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq')) |
|
265 | 266 |
} else { |
266 |
- data.raw <- GenomicAlignments::readGAlignmentsFromBam(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq')) |
|
267 |
+ data.raw <- GenomicAlignments::readGAlignments(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq')) |
|
267 | 268 |
} |
268 | 269 |
} else { |
269 | 270 |
if (pairedEndReads) { |
270 |
- data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=F))) |
|
271 |
+ data.raw <- GenomicAlignments::readGAlignmentPairs(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=FALSE))) |
|
271 | 272 |
} else { |
272 |
- data.raw <- GenomicAlignments::readGAlignmentsFromBam(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=F))) |
|
273 |
+ data.raw <- GenomicAlignments::readGAlignments(file, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=FALSE))) |
|
273 | 274 |
} |
274 | 275 |
} |
275 | 276 |
## Filter by mapping quality |
276 | 277 |
if (pairedEndReads) { |
277 | 278 |
message("Converting to GRanges ...", appendLF=FALSE); ptm <- proc.time() |
278 | 279 |
data <- as(data.raw, 'GRanges') # treat as one fragment |
279 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
280 |
+ stopTimedMessage(ptm) |
|
280 | 281 |
|
281 | 282 |
message("Filtering reads ...", appendLF=FALSE); ptm <- proc.time() |
282 | 283 |
if (!is.null(min.mapq)) { |
... | ... |
@@ -290,11 +291,11 @@ bam2GRanges <- function(file, bamindex=file, chromosomes=NULL, pairedEndReads=FA |
290 | 291 |
# Filter out too long fragments |
291 | 292 |
data <- data[width(data)<=max.fragment.width] |
292 | 293 |
} |
293 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
294 |
+ stopTimedMessage(ptm) |
|
294 | 295 |
} else { |
295 | 296 |
message("Converting to GRanges ...", appendLF=FALSE); ptm <- proc.time() |
296 | 297 |
data <- as(data.raw, 'GRanges') |
297 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
298 |
+ stopTimedMessage(ptm) |
|
298 | 299 |
|
299 | 300 |
message("Filtering reads ...", appendLF=FALSE); ptm <- proc.time() |
300 | 301 |
if (!is.null(min.mapq)) { |
... | ... |
@@ -304,7 +305,7 @@ bam2GRanges <- function(file, bamindex=file, chromosomes=NULL, pairedEndReads=FA |
304 | 305 |
} |
305 | 306 |
data <- data[mcols(data)$mapq >= min.mapq] |
306 | 307 |
} |
307 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
308 |
+ stopTimedMessage(ptm) |
|
308 | 309 |
} |
309 | 310 |
return(data) |
310 | 311 |
|
... | ... |
@@ -87,7 +87,7 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
87 | 87 |
## Prepare the HMM |
88 | 88 |
params <- prepare.multivariate(modellist, use.states=use.states, num.states=num.states) |
89 | 89 |
bins <- params$bins |
90 |
- reads <- params$reads |
|
90 |
+ reads <- params$counts |
|
91 | 91 |
numbins <- params$numbins |
92 | 92 |
nummod <- params$nummod |
93 | 93 |
comb.states2use <- params$comb.states |
... | ... |
@@ -113,11 +113,11 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
113 | 113 |
} |
114 | 114 |
|
115 | 115 |
### Define cleanup behaviour ### |
116 |
- on.exit(.C("R_multivariate_cleanup", as.integer(nummod))) |
|
116 |
+ on.exit(.C("C_multivariate_cleanup", as.integer(nummod))) |
|
117 | 117 |
|
118 | 118 |
## Starting multivariate HMM |
119 | 119 |
message("\nStarting multivariate HMM") |
120 |
- message("Using the following combinatorial states, covering ", mean(comb.states.per.bin %in% comb.states2use)*100, "% of the bins:\n", paste(comb.states2use, collapse=" "),"\n", appendLF=F) |
|
120 |
+ message("Using the following combinatorial states, covering ", mean(comb.states.per.bin %in% comb.states2use)*100, "% of the bins:\n", paste(comb.states2use, collapse=" "),"\n", appendLF=FALSE) |
|
121 | 121 |
|
122 | 122 |
# Prepare input for C function |
123 | 123 |
rs <- unlist(lapply(distributions,"[",2:3,'size')) |
... | ... |
@@ -176,7 +176,7 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
176 | 176 |
max.time.temp <- checkpoint.after.time |
177 | 177 |
} |
178 | 178 |
# Call the C function |
179 |
- hmm <- .C("R_multivariate_hmm", |
|
179 |
+ hmm <- .C("C_multivariate_hmm", |
|
180 | 180 |
reads = as.integer(as.vector(reads)), # int* multiO |
181 | 181 |
num.bins = as.integer(numbins), # int* T |
182 | 182 |
num.states = as.integer(numstates2use), # int* N |
... | ... |
@@ -218,9 +218,9 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
218 | 218 |
result$IDs <- IDs |
219 | 219 |
## Bin coordinates, posteriors and states |
220 | 220 |
result$bins <- bins |
221 |
- result$bins$reads <- reads |
|
221 |
+ result$bins$counts <- reads |
|
222 | 222 |
if (get.posteriors) { |
223 |
- message("Transforming posteriors to `per sample` representation ...", appendLF=F); ptm <- proc.time() |
|
223 |
+ ptm <- startTimedMessage("Transforming posteriors to `per sample` representation ...") |
|
224 | 224 |
hmm$posteriors <- matrix(hmm$posteriors, ncol=hmm$num.states) |
225 | 225 |
colnames(hmm$posteriors) <- paste0("P(",hmm$comb.states,")") |
226 | 226 |
result$bins$score <- do.call(pmax,as.list(as.data.frame(hmm$posteriors))) |
... | ... |
@@ -228,9 +228,9 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
228 | 228 |
post.per.track <- hmm$posteriors %*% binstates |
229 | 229 |
colnames(post.per.track) <- result$IDs |
230 | 230 |
result$bins$posteriors <- post.per.track |
231 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
231 |
+ stopTimedMessage(ptm) |
|
232 | 232 |
} |
233 |
- message("Calculating states from posteriors ...", appendLF=F); ptm <- proc.time() |
|
233 |
+ ptm <- startTimedMessage("Calculating states from posteriors ...") |
|
234 | 234 |
if (!is.null(use.states$state)) { |
235 | 235 |
state.levels <- levels(use.states$state) |
236 | 236 |
} else { |
... | ... |
@@ -241,14 +241,14 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
241 | 241 |
} else { |
242 | 242 |
result$bins$state <- factor(hmm$states, levels=state.levels) |
243 | 243 |
} |
244 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
244 |
+ stopTimedMessage(ptm) |
|
245 | 245 |
if (keep.densities) { |
246 | 246 |
result$bins$densities <- matrix(hmm$densities, ncol=hmm$num.states) |
247 | 247 |
colnames(result$bins$densities) <- hmm$comb.states |
248 | 248 |
} |
249 | 249 |
|
250 | 250 |
## Segmentation |
251 |
- message("Making segmentation ...", appendLF=F); ptm <- proc.time() |
|
251 |
+ ptm <- startTimedMessage("Making segmentation ...") |
|
252 | 252 |
df <- as.data.frame(result$bins) |
253 | 253 |
ind.readcols <- grep('^reads', names(df)) |
254 | 254 |
ind.postcols <- grep('^posteriors', names(df)) |
... | ... |
@@ -265,7 +265,7 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
265 | 265 |
if (!keep.posteriors) { |
266 | 266 |
result$bins$posteriors <- NULL |
267 | 267 |
} |
268 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
268 |
+ stopTimedMessage(ptm) |
|
269 | 269 |
## Add combinations |
270 | 270 |
if (!is.null(use.states)) { |
271 | 271 |
mapping <- use.states$combination |
... | ... |
@@ -277,7 +277,7 @@ callPeaksMultivariate <- function(modellist, use.states, num.states=NULL, chromo |
277 | 277 |
## Parameters |
278 | 278 |
# Weights |
279 | 279 |
tstates <- table(hmm$states) |
280 |
- result$weights <- sort(tstates/sum(tstates), decreasing=T) |
|
280 |
+ result$weights <- sort(tstates/sum(tstates), decreasing=TRUE) |
|
281 | 281 |
result$weights.univariate <- weights |
282 | 282 |
# Transition matrices |
283 | 283 |
result$transitionProbs <- matrix(hmm$A, ncol=numstates2use, byrow=TRUE) |
... | ... |
@@ -14,6 +14,7 @@ |
14 | 14 |
#' @param max.distance This number is used as a cutoff to group replicates based on their distance matrix. The lower this number, the more similar replicates have to be to be grouped together. |
15 | 15 |
#' @return Output is a \code{\link{chromstaR_multivariateHMM}} object with additional entry \code{replicateInfo}. If only one \code{\link{chromstaR_univariateHMM}} was given as input, a simple list() with the \code{replicateInfo} is returned. |
16 | 16 |
#' @seealso \code{\link{chromstaR_multivariateHMM}}, \code{\link{callPeaksUnivariate}}, \code{\link{callPeaksMultivariate}} |
17 |
+#' @importFrom stats hclust cutree dist |
|
17 | 18 |
#' @export |
18 | 19 |
callPeaksReplicates <- function(hmm.list, max.states=32, force.equal=FALSE, eps=0.01, max.iter=NULL, max.time=NULL, keep.posteriors=FALSE, num.threads=1, max.distance=0.2) { |
19 | 20 |
|
... | ... |
@@ -28,8 +29,8 @@ callPeaksReplicates <- function(hmm.list, max.states=32, force.equal=FALSE, eps= |
28 | 29 |
info.df <- multimodel$replicateInfo$info |
29 | 30 |
cor.matrix <- multimodel$replicateInfo$correlation |
30 | 31 |
dist.matrix <- multimodel$replicateInfo$distance |
31 |
- hc <- hclust(dist.matrix) |
|
32 |
- info.df$group <- cutree(hc, h=max.distance) |
|
32 |
+ hc <- stats::hclust(dist.matrix) |
|
33 |
+ info.df$group <- stats::cutree(hc, h=max.distance) |
|
33 | 34 |
|
34 | 35 |
### Call peaks for several replicates ### |
35 | 36 |
} else { |
... | ... |
@@ -39,7 +40,7 @@ callPeaksReplicates <- function(hmm.list, max.states=32, force.equal=FALSE, eps= |
39 | 40 |
## Univariate replicateInfo |
40 | 41 |
ids <- unlist(lapply(hmms, '[[', 'ID')) |
41 | 42 |
weight.univariate <- unlist(lapply(hmms, function(x) { x$weights['modified'] })) |
42 |
- total.count <- unlist(lapply(hmms, function(x) { sum(x$bins$reads) })) |
|
43 |
+ total.count <- unlist(lapply(hmms, function(x) { sum(x$bins$counts) })) |
|
43 | 44 |
info.df <- data.frame(total.count=total.count, weight.univariate=weight.univariate) |
44 | 45 |
rownames(info.df) <- ids |
45 | 46 |
|
... | ... |
@@ -64,9 +65,9 @@ callPeaksReplicates <- function(hmm.list, max.states=32, force.equal=FALSE, eps= |
64 | 65 |
cor.matrix <- cor(binstates) |
65 | 66 |
weight.multivariate <- apply(binstates, 2, sum) / nrow(binstates) |
66 | 67 |
info.df$weight.multivariate <- weight.multivariate |
67 |
- dist.matrix <- dist(cor.matrix) |
|
68 |
- hc <- hclust(dist.matrix) |
|
69 |
- info.df$group <- cutree(hc, h=max.distance) |
|
68 |
+ dist.matrix <- stats::dist(cor.matrix) |
|
69 |
+ hc <- stats::hclust(dist.matrix) |
|
70 |
+ info.df$group <- stats::cutree(hc, h=max.distance) |
|
70 | 71 |
} |
71 | 72 |
|
72 | 73 |
## Make return object |
... | ... |
@@ -11,7 +11,7 @@ |
11 | 11 |
#' @param eps Convergence threshold for the Baum-Welch algorithm. |
12 | 12 |
#' @param init One of the following initialization procedures: |
13 | 13 |
#' \describe{ |
14 |
-#' \item{\code{standard}}{The negative binomial of state 'unmodified' will be initialized with \code{mean=mean(reads)}, \code{var=var(reads)} and the negative binomial of state 'modified' with \code{mean=mean(reads)+1}, \code{var=var(reads)}. This procedure usually gives the fastest convergence.} |
|
14 |
+#' \item{\code{standard}}{The negative binomial of state 'unmodified' will be initialized with \code{mean=mean(counts)}, \code{var=var(counts)} and the negative binomial of state 'modified' with \code{mean=mean(counts)+1}, \code{var=var(counts)}. This procedure usually gives the fastest convergence.} |
|
15 | 15 |
#' \item{\code{random}}{Mean and variance of the negative binomials will be initialized with random values (in certain boundaries, see source code). Try this if the \code{'standard'} procedure fails to produce a good fit.} |
16 | 16 |
#' \item{\code{empiric}}{Yet another way to initialize the Baum-Welch. Try this if the other two methods fail to produce a good fit.} |
17 | 17 |
#' } |
... | ... |
@@ -23,7 +23,7 @@ |
23 | 23 |
#' @param read.cutoff The default (\code{TRUE}) enables filtering of high read counts. Set \code{read.cutoff=FALSE} to disable this filtering. |
24 | 24 |
#' @param read.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.absolute} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering. |
25 | 25 |
#' @param read.cutoff.absolute Read counts above this value will be set to the read count specified by this value. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.quantile} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering. |
26 |
-#' @param max.mean If \code{mean(reads)>max.mean}, bins with low read counts will be set to 0. This is a workaround to obtain good fits in the case of large bin sizes. |
|
26 |
+#' @param max.mean If \code{mean(counts)>max.mean}, bins with low read counts will be set to 0. This is a workaround to obtain good fits in the case of large bin sizes. |
|
27 | 27 |
#' @param FDR False discovery rate. code{NULL} means that the state with maximum posterior probability will be chosen, irrespective of its absolute probability (default=code{NULL}). |
28 | 28 |
#' @param control If set to \code{TRUE}, the binned data will be treated as control experiment. That means only state 'zero-inflation' and 'unmodified' will be used in the HMM. |
29 | 29 |
#' @param keep.posteriors If set to \code{TRUE} (default=\code{FALSE}), posteriors will be available in the output. This is useful to change the FDR later, but increases the necessary disk space to store the result. |
... | ... |
@@ -75,7 +75,7 @@ callPeaksUnivariate2 <- function(binned.data, ID, prefit.on.chr, short=TRUE, eps |
75 | 75 |
#' @param eps Convergence threshold for the Baum-Welch algorithm. |
76 | 76 |
#' @param init One of the following initialization procedures: |
77 | 77 |
#' \describe{ |
78 |
-#' \item{\code{standard}}{The negative binomial of state 'unmodified' will be initialized with \code{mean=mean(reads)}, \code{var=var(reads)} and the negative binomial of state 'modified' with \code{mean=mean(reads)+1}, \code{var=var(reads)}. This procedure usually gives the fastest convergence.} |
|
78 |
+#' \item{\code{standard}}{The negative binomial of state 'unmodified' will be initialized with \code{mean=mean(counts)}, \code{var=var(counts)} and the negative binomial of state 'modified' with \code{mean=mean(counts)+1}, \code{var=var(counts)}. This procedure usually gives the fastest convergence.} |
|
79 | 79 |
#' \item{\code{random}}{Mean and variance of the negative binomials will be initialized with random values (in certain boundaries, see source code). Try this if the \code{'standard'} procedure fails to produce a good fit.} |
80 | 80 |
#' \item{\code{empiric}}{Yet another way to initialize the Baum-Welch. Try this if the other two methods fail to produce a good fit.} |
81 | 81 |
#' } |
... | ... |
@@ -87,7 +87,7 @@ callPeaksUnivariate2 <- function(binned.data, ID, prefit.on.chr, short=TRUE, eps |
87 | 87 |
#' @param read.cutoff The default (\code{TRUE}) enables filtering of high read counts. Set \code{read.cutoff=FALSE} to disable this filtering. |
88 | 88 |
#' @param read.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.absolute} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering. |
89 | 89 |
#' @param read.cutoff.absolute Read counts above this value will be set to the read count specified by this value. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.quantile} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering. |
90 |
-#' @param max.mean If \code{mean(reads)>max.mean}, bins with low read counts will be set to 0. This is a workaround to obtain good fits in the case of large bin sizes. |
|
90 |
+#' @param max.mean If \code{mean(counts)>max.mean}, bins with low read counts will be set to 0. This is a workaround to obtain good fits in the case of large bin sizes. |
|
91 | 91 |
#' @param FDR False discovery rate. code{NULL} means that the state with maximum posterior probability will be chosen, irrespective of its absolute probability (default=code{NULL}). |
92 | 92 |
#' @param control If set to \code{TRUE}, the binned data will be treated as control experiment. That means only state 'zero-inflation' and 'unmodified' will be used in the HMM. |
93 | 93 |
#' @param keep.posteriors If set to \code{TRUE} (default=\code{FALSE}), posteriors will be available in the output. This is useful to change the FDR later, but increases the necessary disk space to store the result. |
... | ... |
@@ -100,6 +100,7 @@ callPeaksUnivariate2 <- function(binned.data, ID, prefit.on.chr, short=TRUE, eps |
100 | 100 |
#' @param verbosity Verbosity level for the fitting procedure. 0 - No output, 1 - Iterations are printed. |
101 | 101 |
#' @author Aaron Taudt, Maria Coome Tatche |
102 | 102 |
#' @seealso \code{\link{chromstaR_univariateHMM}}, \code{\link{callPeaksMultivariate}} |
103 |
+#' @importFrom stats runif |
|
103 | 104 |
#' @examples |
104 | 105 |
#'## Get an example BED-file with ChIP-seq reads for H3K36me3 in brain tissue |
105 | 106 |
#'bedfile <- system.file("extdata/brain/BI.Brain_Angular_Gyrus.H3K36me3.112.chr22.bed.gz", |
... | ... |
@@ -114,7 +115,7 @@ callPeaksUnivariate2 <- function(binned.data, ID, prefit.on.chr, short=TRUE, eps |
114 | 115 |
callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max.time=NULL, max.iter=NULL, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff=TRUE, read.cutoff.quantile=1, read.cutoff.absolute=500, max.mean=Inf, FDR=0.5, control=FALSE, keep.posteriors=FALSE, keep.densities=FALSE, checkpoint.after.iter=NULL, checkpoint.after.time=NULL, checkpoint.file=paste0('chromstaR_checkpoint_',ID,'.cpt'), checkpoint.overwrite=TRUE, checkpoint.use.existing=FALSE, verbosity=1) { |
115 | 116 |
|
116 | 117 |
### Define cleanup behaviour ### |
117 |
- on.exit(.C("R_univariate_cleanup")) |
|
118 |
+ on.exit(.C("C_univariate_cleanup")) |
|
118 | 119 |
|
119 | 120 |
### Intercept user input ### |
120 | 121 |
IDcheck <- ID #trigger error if not defined |
... | ... |
@@ -156,7 +157,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
156 | 157 |
continue.from.univariate.hmm <- TRUE |
157 | 158 |
} else if (class(binned.data) == 'GRanges') { |
158 | 159 |
} else { |
159 |
- stop("argument 'binned.data' expects a GRanges with meta-column 'reads' or a file that contains such an object") |
|
160 |
+ stop("argument 'binned.data' expects a GRanges with meta-column 'counts' or a file that contains such an object") |
|
160 | 161 |
} |
161 | 162 |
|
162 | 163 |
### Assign variables ### |
... | ... |
@@ -165,29 +166,29 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
165 | 166 |
} |
166 | 167 |
numstates <- length(state.labels) |
167 | 168 |
numbins <- length(binned.data) |
168 |
- reads <- mcols(binned.data)$reads |
|
169 |
+ counts <- mcols(binned.data)$counts |
|
169 | 170 |
iniproc <- which(init==c("standard","random","empiric")) # transform to int |
170 | 171 |
if (keep.densities) { lenDensities <- numbins * numstates } else { lenDensities <- 1 } |
171 | 172 |
|
172 |
- ### Check if there are reads in the data, otherwise HMM will blow up ### |
|
173 |
- if (!any(reads!=0)) { |
|
174 |
- stop("All reads in data are zero. No univariate HMM done.") |
|
173 |
+ ### Check if there are counts in the data, otherwise HMM will blow up ### |
|
174 |
+ if (!any(counts!=0)) { |
|
175 |
+ stop("All counts in data are zero. No univariate HMM done.") |
|
175 | 176 |
} |
176 | 177 |
|
177 |
- ### Filter high reads out, makes HMM faster ### |
|
178 |
+ ### Filter high counts out, makes HMM faster ### |
|
178 | 179 |
numfiltered <- 0 |
179 | 180 |
if (!continue.from.univariate.hmm) { |
180 | 181 |
if (read.cutoff) { |
181 |
- read.cutoff.by.quantile <- quantile(reads, read.cutoff.quantile) |
|
182 |
+ read.cutoff.by.quantile <- quantile(counts, read.cutoff.quantile) |
|
182 | 183 |
read.cutoff.by.quantile <- as.integer(read.cutoff.by.quantile) |
183 | 184 |
read.cutoff.absolute <- min(read.cutoff.by.quantile, read.cutoff.absolute) |
184 |
- mask <- reads > read.cutoff.absolute |
|
185 |
- reads[mask] <- read.cutoff.absolute |
|
185 |
+ mask <- counts > read.cutoff.absolute |
|
186 |
+ counts[mask] <- read.cutoff.absolute |
|
186 | 187 |
numfiltered <- length(which(mask)) |
187 | 188 |
} |
188 | 189 |
} else { |
189 |
- mask <- reads > read.cutoff.absolute |
|
190 |
- reads[mask] <- read.cutoff.absolute |
|
190 |
+ mask <- counts > read.cutoff.absolute |
|
191 |
+ counts[mask] <- read.cutoff.absolute |
|
191 | 192 |
numfiltered <- length(which(mask)) |
192 | 193 |
} |
193 | 194 |
if (numfiltered > 0) { |
... | ... |
@@ -195,29 +196,29 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
195 | 196 |
} |
196 | 197 |
|
197 | 198 |
### Filter out low read counts that arise when the bin size is larger than optimal (should correct the result to near optimal again) ### |
198 |
- hist <- hist(reads[reads>0], breaks=0:max(reads), right=FALSE, plot=FALSE) |
|
199 |
+ hist <- graphics::hist(counts[counts>0], breaks=0:max(counts), right=FALSE, plot=FALSE) |
|
199 | 200 |
maxhist <- which.max(hist$counts) |
200 | 201 |
if (maxhist-1 > max.mean) { # -1 to get from 1-based histogram indices to (0-based) read counts |
201 |
- # Two empirical rules to remove low reads |
|
202 |
+ # Two empirical rules to remove low counts |
|
202 | 203 |
read.counts.to.remove.1 <- which(hist$counts[1:maxhist]<=hist$counts[2]) -1 |
203 | 204 |
minlow <- which.min(hist$counts[2:maxhist]) |
204 | 205 |
read.counts.to.remove <- max(c(read.counts.to.remove.1, 2*minlow)) |
205 |
- index.filtered <- which(reads>0 & reads<=read.counts.to.remove) |
|
206 |
- reads[index.filtered] <- 0 |
|
206 |
+ index.filtered <- which(counts>0 & counts<=read.counts.to.remove) |
|
207 |
+ counts[index.filtered] <- 0 |
|
207 | 208 |
if (length(index.filtered)>0) { |
208 | 209 |
message(paste0("Replaced read counts <= ",read.counts.to.remove," by 0. This was done because the selected bin size is considered too big for this dataset: The mean of the read counts (zeros removed) is bigger than the specified max.mean = ",max.mean,". Check the fits!")) |
209 | 210 |
} |
210 | 211 |
} |
211 | 212 |
|
212 | 213 |
# ### Initialization of emission distributions ### |
213 |
-# reads.greater.0 <- reads[reads>0] |
|
214 |
-# mean.reads <- mean(reads.greater.0) |
|
214 |
+# reads.greater.0 <- counts[counts>0] |
|
215 |
+# mean.counts <- mean(reads.greater.0) |
|
215 | 216 |
# var.reads <- var(reads.greater.0) |
216 | 217 |
# imean <- vector() |
217 | 218 |
# ivar <- vector() |
218 | 219 |
# if (init=="standard") { |
219 |
-# imean['unmodified'] <- mean.reads |
|
220 |
-# imean['modified'] <- mean.reads + 1 |
|
220 |
+# imean['unmodified'] <- mean.counts |
|
221 |
+# imean['modified'] <- mean.counts + 1 |
|
221 | 222 |
# ivar['unmodified'] <- var.reads |
222 | 223 |
# ivar['modified'] <- var.reads |
223 | 224 |
# # Make sure variance is bigger than mean for negative binomial |
... | ... |
@@ -228,13 +229,13 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
228 | 229 |
# } |
229 | 230 |
# } else if (init=="random") { |
230 | 231 |
# for (istate in c('unmodified','modified')) { |
231 |
-# imean[istate] <- runif(1, min=0, max=10*mean.reads) |
|
232 |
-# ivar[istate] <- runif(1, min=imean[istate], max=20*mean.reads) |
|
232 |
+# imean[istate] <- stats::runif(1, min=0, max=10*mean.counts) |
|
233 |
+# ivar[istate] <- stats::runif(1, min=imean[istate], max=20*mean.counts) |
|
233 | 234 |
# } |
234 | 235 |
# } else if (init=="empiric") { |
235 |
-# imean['unmodified'] <- mean.reads/2 |
|
236 |
+# imean['unmodified'] <- mean.counts/2 |
|
236 | 237 |
# ivar['unmodified'] <- imean['unmodified']*2 |
237 |
-# imean['modified'] <- mean.reads*2 |
|
238 |
+# imean['modified'] <- mean.counts*2 |
|
238 | 239 |
# ivar['modified'] <- imean['modified']*2 |
239 | 240 |
# } else if (init %in% seqlevels(binned.data)) { |
240 | 241 |
# } else if { |
... | ... |
@@ -282,8 +283,8 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
282 | 283 |
max.time.temp <- checkpoint.after.time |
283 | 284 |
} |
284 | 285 |
## Call the Baum-Welch |
285 |
- hmm <- .C("R_univariate_hmm", |
|
286 |
- reads = as.integer(reads), # double* O |
|
286 |
+ hmm <- .C("C_univariate_hmm", |
|
287 |
+ counts = as.integer(counts), # double* O |
|
287 | 288 |
num.bins = as.integer(numbins), # int* T |
288 | 289 |
num.states = as.integer(numstates), # int* N |
289 | 290 |
size = double(length=numstates), # double* size |
... | ... |
@@ -306,7 +307,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
306 | 307 |
use.initial.params = as.logical(use.initial), # bool* use_initial_params |
307 | 308 |
num.threads = as.integer(num.threads), # int* num_threads |
308 | 309 |
error = as.integer(0), # int* error (error handling) |
309 |
- read.cutoff = as.integer(max(reads)), # int* read_cutoff |
|
310 |
+ read.cutoff = as.integer(max(counts)), # int* read_cutoff |
|
310 | 311 |
verbosity = as.integer(verbosity) # int* verbosity |
311 | 312 |
) |
312 | 313 |
## Adjust parameters for the next round |
... | ... |
@@ -353,7 +354,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
353 | 354 |
# FDR |
354 | 355 |
result$FDR <- FDR |
355 | 356 |
## Convergence info |
356 |
- convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, read.cutoff=max(hmm$reads)) |
|
357 |
+ convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, read.cutoff=max(hmm$counts)) |
|
357 | 358 |
result$convergenceInfo <- convergenceInfo |
358 | 359 |
## Add class |
359 | 360 |
class(result) <- class.univariate.hmm |
... | ... |
@@ -379,8 +380,8 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
379 | 380 |
modellist <- list() |
380 | 381 |
for (i_try in 1:num.trials) { |
381 | 382 |
if (verbosity>=1) message("------------------------------------ Try ",i_try," of ",num.trials," -------------------------------------") |
382 |
- hmm <- .C("R_univariate_hmm", |
|
383 |
- reads = as.integer(reads), # double* O |
|
383 |
+ hmm <- .C("C_univariate_hmm", |
|
384 |
+ counts = as.integer(counts), # double* O |
|
384 | 385 |
num.bins = as.integer(numbins), # int* T |
385 | 386 |
num.states = as.integer(numstates), # int* N |
386 | 387 |
size = double(length=numstates), # double* size |
... | ... |
@@ -403,7 +404,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
403 | 404 |
use.initial.params = as.logical(0), # bool* use_initial_params |
404 | 405 |
num.threads = as.integer(num.threads), # int* num_threads |
405 | 406 |
error = as.integer(0), # int* error (error handling) |
406 |
- read.cutoff = as.integer(max(reads)), # int* read_cutoff |
|
407 |
+ read.cutoff = as.integer(max(counts)), # int* read_cutoff |
|
407 | 408 |
verbosity = as.integer(verbosity) # int* verbosity |
408 | 409 |
) |
409 | 410 |
|
... | ... |
@@ -424,8 +425,8 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
424 | 425 |
|
425 | 426 |
# Rerun the HMM with different epsilon and initial parameters from trial run |
426 | 427 |
if (verbosity>=1) message("------------------------- Rerunning try ",indexmax," with eps = ",eps," -------------------------") |
427 |
- hmm <- .C("R_univariate_hmm", |
|
428 |
- reads = as.integer(reads), # double* O |
|
428 |
+ hmm <- .C("C_univariate_hmm", |
|
429 |
+ counts = as.integer(counts), # double* O |
|
429 | 430 |
num.bins = as.integer(numbins), # int* T |
430 | 431 |
num.states = as.integer(numstates), # int* N |
431 | 432 |
size = double(length=numstates), # double* size |
... | ... |
@@ -448,7 +449,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
448 | 449 |
use.initial.params = as.logical(1), # bool* use_initial_params |
449 | 450 |
num.threads = as.integer(num.threads), # int* num_threads |
450 | 451 |
error = as.integer(0), # int* error (error handling) |
451 |
- read.cutoff = as.integer(max(reads)), # int* read_cutoff |
|
452 |
+ read.cutoff = as.integer(max(counts)), # int* read_cutoff |
|
452 | 453 |
verbosity = as.integer(verbosity) # int* verbosity |
453 | 454 |
) |
454 | 455 |
} |
... | ... |
@@ -468,7 +469,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
468 | 469 |
result <- list() |
469 | 470 |
result$ID <- ID |
470 | 471 |
## Get states |
471 |
- message("Calculating states from posteriors ...", appendLF=F); ptm <- proc.time() |
|
472 |
+ ptm <- startTimedMessage("Calculating states from posteriors ...") |
|
472 | 473 |
hmm$posteriors <- matrix(hmm$posteriors, ncol=hmm$num.states) |
473 | 474 |
colnames(hmm$posteriors) <- paste0("P(",state.labels,")") |
474 | 475 |
threshold <- 1-FDR |
... | ... |
@@ -480,7 +481,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
480 | 481 |
## Bin coordinates, posteriors and states |
481 | 482 |
result$bins <- GRanges(seqnames=seqnames(binned.data), |
482 | 483 |
ranges=ranges(binned.data), |
483 |
- reads=hmm$reads, |
|
484 |
+ counts=hmm$counts, |
|
484 | 485 |
state=states) |
485 | 486 |
result$bins$score <- apply(hmm$posteriors, 1, max) |
486 | 487 |
result$bins$posteriors <- hmm$posteriors |
... | ... |
@@ -488,18 +489,18 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
488 | 489 |
result$bins$densities <- matrix(hmm$densities, ncol=hmm$num.states) |
489 | 490 |
} |
490 | 491 |
seqlengths(result$bins) <- seqlengths(binned.data) |
491 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
492 |
+ stopTimedMessage(ptm) |
|
492 | 493 |
## Segmentation |
493 |
- message("Making segmentation ...", appendLF=F); ptm <- proc.time() |
|
494 |
+ ptm <- startTimedMessage("Making segmentation ...") |
|
494 | 495 |
gr <- result$bins |
495 |
- red.df <- suppressMessages(collapseBins(as.data.frame(gr), column2collapseBy='state', columns2average=c('reads','score','posteriors.P.modified.'), columns2drop=c('width','posteriors.P.zero.inflation.','posteriors.P.unmodified.'))) |
|
496 |
- red.gr <- GRanges(seqnames=red.df[,1], ranges=IRanges(start=red.df[,2], end=red.df[,3]), strand=red.df[,4], mean.reads=red.df[,'mean.reads'], state=red.df[,'state'], score=red.df[,'mean.score'], mean.posterior.modified=red.df[,'mean.posteriors.P.modified.']) |
|
496 |
+ red.df <- suppressMessages(collapseBins(as.data.frame(gr), column2collapseBy='state', columns2average=c('counts','score','posteriors.P.modified.'), columns2drop=c('width','posteriors.P.zero.inflation.','posteriors.P.unmodified.'))) |
|
497 |
+ red.gr <- GRanges(seqnames=red.df[,1], ranges=IRanges(start=red.df[,2], end=red.df[,3]), strand=red.df[,4], mean.counts=red.df[,'mean.counts'], state=red.df[,'state'], score=red.df[,'mean.score'], mean.posterior.modified=red.df[,'mean.posteriors.P.modified.']) |
|
497 | 498 |
result$segments <- red.gr |
498 | 499 |
seqlengths(result$segments) <- seqlengths(binned.data) |
499 | 500 |
if (!keep.posteriors) { |
500 | 501 |
result$bins$posteriors <- NULL |
501 | 502 |
} |
502 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
503 |
+ stopTimedMessage(ptm) |
|
503 | 504 |
## Parameters |
504 | 505 |
# Weights |
505 | 506 |
result$weights <- hmm$weights |
... | ... |
@@ -529,7 +530,7 @@ callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max. |
529 | 530 |
# FDR |
530 | 531 |
result$FDR <- FDR |
531 | 532 |
## Convergence info |
532 |
- convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, max.mean=max.mean, read.cutoff=max(hmm$reads)) |
|
533 |
+ convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, max.mean=max.mean, read.cutoff=max(hmm$counts)) |
|
533 | 534 |
result$convergenceInfo <- convergenceInfo |
534 | 535 |
## Add class |
535 | 536 |
class(result) <- class.univariate.hmm |
... | ... |
@@ -45,7 +45,7 @@ changeFDR <- function(model, FDR=0.5, separate.zeroinflation=TRUE, averages=TRUE |
45 | 45 |
if (is(model,class.univariate.hmm)) { |
46 | 46 |
if (is.null(model$bins$posteriors)) stop("Cannot recalculate states because posteriors are missing. Run 'callPeaksUnivariate' again with option 'keep.posteriors' set to TRUE.") |
47 | 47 |
## Calculate states |
48 |
- message("Calculating states from posteriors ...", appendLF=F); ptm <- proc.time() |
|
48 |
+ ptm <- startTimedMessage("Calculating states from posteriors ...") |
|
49 | 49 |
states <- rep(NA,length(model$bins)) |
50 | 50 |
if (separate.zeroinflation) { |
51 | 51 |
states[ model$bins$posteriors[,3]<threshold & model$bins$posteriors[,2]<=model$bins$posteriors[,1] ] <- 1 |
... | ... |
@@ -57,9 +57,9 @@ changeFDR <- function(model, FDR=0.5, separate.zeroinflation=TRUE, averages=TRUE |
57 | 57 |
states <- state.labels[2:3][states] |
58 | 58 |
} |
59 | 59 |
model$bins$state <- states |
60 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
60 |
+ stopTimedMessage(ptm) |
|
61 | 61 |
## Redo segmentation |
62 |
- message("Making segmentation ...", appendLF=F); ptm <- proc.time() |
|
62 |
+ ptm <- startTimedMessage("Making segmentation ...") |
|
63 | 63 |
gr <- model$bins |
64 | 64 |
df <- as.data.frame(gr) |
65 | 65 |
if (averages==TRUE) { |
... | ... |
@@ -71,7 +71,7 @@ changeFDR <- function(model, FDR=0.5, separate.zeroinflation=TRUE, averages=TRUE |
71 | 71 |
} |
72 | 72 |
model$segments <- red.gr |
73 | 73 |
seqlengths(model$segments) <- seqlengths(model$bins) |
74 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
74 |
+ stopTimedMessage(ptm) |
|
75 | 75 |
# ## Redo weights |
76 | 76 |
# model$weights <- table(model$bins$state) / length(model$bins) |
77 | 77 |
|
... | ... |
@@ -79,7 +79,7 @@ changeFDR <- function(model, FDR=0.5, separate.zeroinflation=TRUE, averages=TRUE |
79 | 79 |
} else if (is(model,class.multivariate.hmm)) { |
80 | 80 |
if (is.null(model$bins$posteriors)) stop("Cannot recalculate states because posteriors are missing. Run 'callPeaksMultivariate' again with option 'keep.posteriors' set to TRUE.") |
81 | 81 |
## Calculate states |
82 |
- message("Calculating states from posteriors ...", appendLF=F); ptm <- proc.time() |
|
82 |
+ ptm <- startTimedMessage("Calculating states from posteriors ...") |
|
83 | 83 |
states <- factor(bin2dec(model$bins$posteriors >= threshold), levels=levels(model$bins$state)) |
84 | 84 |
model$bins$state <- states |
85 | 85 |
## Combinations |
... | ... |
@@ -89,9 +89,9 @@ changeFDR <- function(model, FDR=0.5, separate.zeroinflation=TRUE, averages=TRUE |
89 | 89 |
# names(mapping) <- levels(model$bins$state) |
90 | 90 |
model$bins$combination <- factor(mapping[as.character(model$bins$state)], levels=mapping[as.character(levels(model$bins$state))]) |
91 | 91 |
} |
92 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
92 |
+ stopTimedMessage(ptm) |
|
93 | 93 |
## Redo segmentation |
94 |
- message("Making segmentation ...", appendLF=F); ptm <- proc.time() |
|
94 |
+ ptm <- startTimedMessage("Making segmentation ...") |
|
95 | 95 |
df <- as.data.frame(model$bins) |
96 | 96 |
ind.readcols <- grep('^reads', names(df)) |
97 | 97 |
ind.postcols <- grep('^posteriors', names(df)) |
... | ... |
@@ -116,7 +116,7 @@ changeFDR <- function(model, FDR=0.5, separate.zeroinflation=TRUE, averages=TRUE |
116 | 116 |
} |
117 | 117 |
model$segments <- red.gr |
118 | 118 |
seqlengths(model$segments) <- seqlengths(model$bins) |
119 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
119 |
+ stopTimedMessage(ptm) |
|
120 | 120 |
# ## Redo weights |
121 | 121 |
# model$weights <- table(model$bins$state) / length(model$bins) |
122 | 122 |
} else { |
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
|
2 | 2 |
|
3 |
-#' Combinatorial and differential ChIP-seq analysis |
|
3 |
+#' Combinatorial and differential chromatin state analysis for ChIP-seq data |
|
4 | 4 |
#' |
5 | 5 |
#' This package implements functions for the combinatorial and differential analysis of ChIP-seq data. It was developed for histone modifications with a broad profile but is also suitable for the analysis of transcription factor binding data. A Hidden Markov Model with a mixture of Negative Binomials as emission densities is used to call peaks. Please read our publication for a detailed description of the method. TODO: insert publication. |
6 | 6 |
#' |
... | ... |
@@ -60,7 +60,7 @@ collapseBins = function(data, column2collapseBy=NULL, columns2sumUp=NULL, column |
60 | 60 |
ind_maxcols <- columns2getMax |
61 | 61 |
|
62 | 62 |
## Make the comparison vector |
63 |
- message('Making comparison vector ...', appendLF=F); ptm <- proc.time() |
|
63 |
+ ptm <- startTimedMessage('Making comparison vector ...') |
|
64 | 64 |
if (is.null(column2collapseBy)) { |
65 | 65 |
c <- data$start |
66 | 66 |
cShift1 <- rep(NA,length(c)) |
... | ... |
@@ -85,13 +85,13 @@ collapseBins = function(data, column2collapseBy=NULL, columns2sumUp=NULL, column |
85 | 85 |
compare[1] <- TRUE |
86 | 86 |
numcollapsedbins <- length(which(compare==TRUE)) |
87 | 87 |
numbins <- nrow(data) |
88 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
88 |
+ stopTimedMessage(ptm) |
|
89 | 89 |
if (any(is.na(compare))) { |
90 | 90 |
stop("NAs in vector 'compare'") |
91 | 91 |
} |
92 | 92 |
|
93 | 93 |
## Select the collapsed rows |
94 |
- message('Selecting rows ...', appendLF=F); ptm <- proc.time() |
|
94 |
+ ptm <- startTimedMessage('Selecting rows ...') |
|
95 | 95 |
collapsed.bins <- list() |
96 | 96 |
collapsed.bins[[names(data)[1]]] <- data[which(compare),1] #which to remove NAs which shouldn't be there in the first place |
97 | 97 |
collapsed.bins[[names(data)[2]]] <- data[which(compare),2] |
... | ... |
@@ -104,7 +104,7 @@ collapseBins = function(data, column2collapseBy=NULL, columns2sumUp=NULL, column |
104 | 104 |
collapsed.bins[(lcb+1):(lcb+lmc)] <- data[which(compare), ind_morecols] |
105 | 105 |
names(collapsed.bins)[(lcb+1):(lcb+lmc)] <- names(data)[ind_morecols] |
106 | 106 |
} |
107 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
107 |
+ stopTimedMessage(ptm) |
|
108 | 108 |
|
109 | 109 |
## Sum up columns |
110 | 110 |
xfuns <- list(sum, mean, max) |
... | ... |
@@ -117,7 +117,7 @@ collapseBins = function(data, column2collapseBy=NULL, columns2sumUp=NULL, column |
117 | 117 |
columns2x <- columns2xs[[ix]] |
118 | 118 |
ind_xcols <- inds_xcols[[ix]] |
119 | 119 |
if (!is.null(columns2x)) { |
120 |
- message('Calculating ',xstring,' ...', appendLF=F); ptm <- proc.time() |
|
120 |
+ ptm <- startTimedMessage('Calculating ',xstring,' ...') |
|
121 | 121 |
xcols <- as.matrix(data[,columns2x]) |
122 | 122 |
collapsed.xcols <- matrix(NA, nrow=numcollapsedbins, ncol=length(columns2x)) |
123 | 123 |
icount <- 1 |
... | ... |
@@ -153,7 +153,7 @@ collapseBins = function(data, column2collapseBy=NULL, columns2sumUp=NULL, column |
153 | 153 |
collapsed.bins[(lcb+1):(lcb+lsc)] <- as.data.frame(collapsed.xcols) |
154 | 154 |
names(collapsed.bins)[(lcb+1):(lcb+lsc)] <- paste(xstring, names(data)[ind_xcols], sep='.') |
155 | 155 |
} |
156 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
156 |
+ stopTimedMessage(ptm) |
|
157 | 157 |
} |
158 | 158 |
} |
159 | 159 |
|
... | ... |
@@ -10,7 +10,8 @@ |
10 | 10 |
#' @param multi.hmm.list A list of \code{\link{chromstaR_multivariateHMM}} objects or a vector of files that contain such objects. |
11 | 11 |
#' @param window.size.bp Window size in base-pairs that will be used to average the results. |
12 | 12 |
#' @param direction.of.adding Calculation of the combinatorial entropy only makes sense between different modifications. Choose 'between.hmms' if you have one modification per HMM, which is the case in a differential analysis. Choose 'inside.hmms' if you have multiple modifications per HMM, which is the case in a combinatorial analysis. In this case it is up to the user to make sure that the \code{IDs} between HMMs are identical. |
13 |
-#' @import BiocGenerics |
|
13 |
+#' @importFrom BiocGenerics unlist |
|
14 |
+#' @importFrom S4Vectors subjectHits queryHits |
|
14 | 15 |
#' @export |
15 | 16 |
combinatorialEntropy <- function(multi.hmm.list, window.size.bp=NULL, direction.of.adding='between.hmms') { |
16 | 17 |
|
... | ... |
@@ -18,7 +19,7 @@ combinatorialEntropy <- function(multi.hmm.list, window.size.bp=NULL, direction. |
18 | 19 |
#=================================== |
19 | 20 |
### Direction of adding between HMMs |
20 | 21 |
#=================================== |
21 |
- message("Calculating entropy ...", appendLF=F); ptm <- proc.time() |
|
22 |
+ ptm <- startTimedMessage("Calculating entropy ...") |
|
22 | 23 |
segments <- GRangesList() |
23 | 24 |
for (i1 in 1:length(multi.hmm.list)) { |
24 | 25 |
hmm <- suppressMessages( loadMultiHmmsFromFiles(multi.hmm.list[[i1]])[[1]] ) |
... | ... |
@@ -32,8 +33,8 @@ combinatorialEntropy <- function(multi.hmm.list, window.size.bp=NULL, direction. |
32 | 33 |
segments[[i1]] <- isegments |
33 | 34 |
} |
34 | 35 |
remove(hmm) |
35 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
36 |
- message("Making consensus template ...", appendLF=F); ptm <- proc.time() |
|
36 |
+ stopTimedMessage(ptm) |
|
37 |
+ ptm <- startTimedMessage("Making consensus template ...") |
|
37 | 38 |
consensus <- GenomicRanges::disjoin(BiocGenerics::unlist(segments)) |
38 | 39 |
conentropy <- matrix(NA, ncol=length(segments), nrow=length(consensus)) |
39 | 40 |
for (i1 in 1:length(segments)) { |
... | ... |
@@ -42,13 +43,13 @@ combinatorialEntropy <- function(multi.hmm.list, window.size.bp=NULL, direction. |
42 | 43 |
conentropy[,i1] <- segment$entropy[mind] |
43 | 44 |
} |
44 | 45 |
consensus$entropy <- rowSums(conentropy) |
45 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
46 |
+ stopTimedMessage(ptm) |
|
46 | 47 |
|
47 | 48 |
} else if (direction.of.adding == 'inside.hmms') { |
48 | 49 |
#================================== |
49 | 50 |
### Direction of adding inside HMMs |
50 | 51 |
#================================== |
51 |
- message("Making consensus templates ...", appendLF=F); ptm <- proc.time() |
|
52 |
+ ptm <- startTimedMessage("Making consensus templates ...") |
|
52 | 53 |
segments <- GRangesList() |
53 | 54 |
binstates <- list() |
54 | 55 |
for (i1 in 1:length(multi.hmm.list)) { |
... | ... |
@@ -71,17 +72,17 @@ combinatorialEntropy <- function(multi.hmm.list, window.size.bp=NULL, direction. |
71 | 72 |
} |
72 | 73 |
} |
73 | 74 |
} |
74 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
75 |
- message("Calculating entropy ...", appendLF=F); ptm <- proc.time() |
|
75 |
+ stopTimedMessage(ptm) |
|
76 |
+ ptm <- startTimedMessage("Calculating entropy ...") |
|
76 | 77 |
entropy <- matrix(NA, nrow=length(consensus), ncol=length(IDs)) |
77 | 78 |
colnames(entropy) <- IDs |
78 | 79 |
for (i1 in 1:length(IDs)) { |
79 | 80 |
num.samples <- length(which(!is.na(conbinstates[1,,i1]))) |
80 |
- num.1s <- rowSums(conbinstates[,,i1], na.rm=T) |
|
81 |
+ num.1s <- rowSums(conbinstates[,,i1], na.rm=TRUE) |
|
81 | 82 |
entropy[,i1] <- log(choose(num.samples, num.1s)) |
82 | 83 |
} |
83 | 84 |
consensus$entropy <- rowSums(entropy) |
84 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
85 |
+ stopTimedMessage(ptm) |
|
85 | 86 |
} |
86 | 87 |
|
87 | 88 |
#=========================== |
... | ... |
@@ -90,23 +91,23 @@ combinatorialEntropy <- function(multi.hmm.list, window.size.bp=NULL, direction. |
90 | 91 |
# Remove NAs that occur when num.bins differ between HMMs |
91 | 92 |
gr <- consensus[!is.na(consensus$entropy)] |
92 | 93 |
if (!is.null(window.size.bp)) { |
93 |
- message("Averaging over window size ",window.size.bp,"bp ...", appendLF=F); ptm <- proc.time() |
|
94 |
+ ptm <- startTimedMessage("Averaging over window size ",window.size.bp,"bp ...") |
|
94 | 95 |
tg <- unlist(tileGenome(seqlengths(gr), tilewidth=window.size.bp)) |
95 | 96 |
tg.cut <- subsetByOverlaps(tg, gr) |
96 | 97 |
mind <- findOverlaps(gr, tg.cut) |
97 |
- gr.extended <- gr[queryHits(mind)] |
|
98 |
- rlemind <- rle(subjectHits(mind)) |
|
98 |
+ gr.extended <- gr[S4Vectors::queryHits(mind)] |
|
99 |
+ rlemind <- rle(S4Vectors::subjectHits(mind)) |
|
99 | 100 |
index.last <- cumsum(rlemind$lengths) |
100 | 101 |
index.first <- c(1,index.last[-length(index.last)]+1) |
101 | 102 |
start(gr.extended)[index.first] <- start(tg.cut) |
102 | 103 |
end(gr.extended)[index.last] <- end(tg.cut) |
103 | 104 |
gr.extended$weighted.entropy <- width(gr.extended)*gr.extended$entropy |
104 |
- gr.extended$index <- subjectHits(mind) |
|
105 |
+ gr.extended$index <- S4Vectors::subjectHits(mind) |
|
105 | 106 |
df <- as.data.frame(gr.extended)[,c('seqnames','start','end','width','index','weighted.entropy')] |
106 | 107 |
df <- suppressMessages( collapseBins(df, column2collapseBy='index', columns2sumUp=c('width','weighted.entropy')) ) |
107 | 108 |
gr <- GRanges(seqnames=df$seqnames, ranges=IRanges(start=df$start, end=df$end), entropy=df$sum.weighted.entropy/df$sum.width) |
108 | 109 |
seqlengths(gr) <- seqlengths(gr.extended) |
109 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
110 |
+ stopTimedMessage(ptm) |
|
110 | 111 |
} |
111 | 112 |
|
112 | 113 |
return(gr) |
... | ... |
@@ -31,13 +31,13 @@ combineMultivariates <- function(conditions=NULL, marks=NULL) { |
31 | 31 |
## Add combinatorial states |
32 | 32 |
combs <- list() |
33 | 33 |
combs[[names(conditions)[1]]] <- hmm$bins$combination |
34 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
34 |
+ stopTimedMessage(ptm) |
|
35 | 35 |
|
36 | 36 |
for (i1 in 2:length(conditions)) { |
37 | 37 |
message("Processing condition ",names(conditions)[i1]," ...", appendLF=FALSE); ptm <- proc.time() |
38 | 38 |
hmm <- suppressMessages( loadMultiHmmsFromFiles(conditions[[i1]])[[1]] ) |
39 | 39 |
combs[[names(conditions)[i1]]] <- hmm$bins$combination |
40 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
40 |
+ stopTimedMessage(ptm) |
|
41 | 41 |
} |
42 | 42 |
combs.df <- as(combs,'DataFrame') |
43 | 43 |
|
... | ... |
@@ -72,7 +72,7 @@ combineMultivariates <- function(conditions=NULL, marks=NULL) { |
72 | 72 |
names(mapping) <- comblevels |
73 | 73 |
states[[mark]][[cond]] <- c('',mark)[mapping[hmm$bins$combination]+1] # no need to coerce to character here because the order is the same |
74 | 74 |
} |
75 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
75 |
+ stopTimedMessage(ptm) |
|
76 | 76 |
} |
77 | 77 |
### Paste the marks over each condition |
78 | 78 |
message("Pasting into combinatorial states") |
... | ... |
@@ -86,7 +86,7 @@ combineMultivariates <- function(conditions=NULL, marks=NULL) { |
86 | 86 |
comb <- sub('^\\+','', comb) |
87 | 87 |
comb <- sub('\\+$','', comb) |
88 | 88 |
combs[[cond]] <- comb |
89 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
89 |
+ stopTimedMessage(ptm) |
|
90 | 90 |
} |
91 | 91 |
combs.df <- as.data.frame(combs) |
92 | 92 |
combs.df <- as(combs.df, 'DataFrame') |
... | ... |
@@ -105,7 +105,7 @@ combineMultivariates <- function(conditions=NULL, marks=NULL) { |
105 | 105 |
combined.segments <- as(segments.df, 'GRanges') |
106 | 106 |
seqlengths(combined.segments) <- seqlengths(bins) |
107 | 107 |
mcols(bins)$state <- NULL |
108 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
108 |
+ stopTimedMessage(ptm) |
|
109 | 109 |
|
110 | 110 |
### Redo the segmentation for each condition separately |
111 | 111 |
message("Redoing segmentation for each condition separately ...", appendLF=FALSE); ptm <- proc.time() |
... | ... |
@@ -120,7 +120,7 @@ combineMultivariates <- function(conditions=NULL, marks=NULL) { |
120 | 120 |
seqlengths(segments.cond) <- seqlengths(bins) |
121 | 121 |
segments[[cond]] <- segments.cond |
122 | 122 |
} |
123 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
123 |
+ stopTimedMessage(ptm) |
|
124 | 124 |
|
125 | 125 |
### Make return object |
126 | 126 |
hmm <- list() |
... | ... |
@@ -8,22 +8,23 @@ |
8 | 8 |
#' @param binsize Force the resulting \link{GRanges} to have this bin size. |
9 | 9 |
#' @return A \link{GRanges} object. |
10 | 10 |
#' @author Aaron Taudt |
11 |
+#' @importFrom utils read.table |
|
11 | 12 |
bed2GRanges <- function(bedfile, chrom.length.file, skip=1, binsize=NULL) { |
12 | 13 |
|
13 | 14 |
# File with chromosome lengths (1-based) |
14 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
15 |
+ chrom.lengths.df <- utils::read.table(chrom.length.file) |
|
15 | 16 |
chrom.lengths <- chrom.lengths.df[,2] |
16 | 17 |
names(chrom.lengths) <- chrom.lengths.df[,1] |
17 | 18 |
# File with reads, determine classes first for faster import (0-based) |
18 |
- tab5rows <- read.table(bedfile, nrows=5, skip=skip) |
|
19 |
+ tab5rows <- utils::read.table(bedfile, nrows=5, skip=skip) |
|
19 | 20 |
classes.in.bed <- sapply(tab5rows, class) |
20 | 21 |
classes <- rep("NULL",length(classes.in.bed)) |
21 | 22 |
classes[1:4] <- classes.in.bed[1:4] |
22 |
- data <- read.table(bedfile, colClasses=classes, skip=skip) |
|
23 |
+ data <- utils::read.table(bedfile, colClasses=classes, skip=skip) |
|
23 | 24 |
# Convert to GRanges object |
24 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), |
|
25 |
+ data <- GenomicRanges::GRanges(seqnames=data[,1], |
|
25 | 26 |
ranges=IRanges(start=data[,2]+1, end=data[,3]+1), # +1 to match coordinate systems |
26 |
- strand=Rle(strand("*"), nrow(data)), |
|
27 |
+ strand="*", |
|
27 | 28 |
state=data[,4]) |
28 | 29 |
seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
29 | 30 |
chroms.in.data <- seqlevels(data) |
... | ... |
@@ -39,7 +40,7 @@ bed2GRanges <- function(bedfile, chrom.length.file, skip=1, binsize=NULL) { |
39 | 40 |
# } else { |
40 | 41 |
# numbins <- chrom.lengths[chrom]/binsize |
41 | 42 |
# } |
42 |
-# ibinned <- GenomicRanges::GRanges(seqnames=Rle(rep(chrom, numbins)), |
|
43 |
+# ibinned <- GenomicRanges::GRanges(seqnames=rep(chrom, numbins), |
|
43 | 44 |
# ranges=IRanges(start=seq(from=1, by=binsize, length=numbins), end=seq(from=binsize, by=binsize, length=numbins))) |
44 | 45 |
# seqlengths(ibinned) <- chrom.lengths[chrom] |
45 | 46 |
# suppressWarnings( binned <- c(binned, ibinned) ) |
... | ... |
@@ -77,9 +78,9 @@ bed2GRanges <- function(bedfile, chrom.length.file, skip=1, binsize=NULL) { |
77 | 78 |
infstarts <- seq(starts[1], ends[length(ends)]-1, by=binsize) |
78 | 79 |
infends <- seq(starts[1]-1+binsize, ends[length(ends)]-2+binsize, by=binsize) |
79 | 80 |
|
80 |
- inflated.chrom <- GenomicRanges::GRanges(seqnames=Rle(infchroms), |
|
81 |
+ inflated.chrom <- GenomicRanges::GRanges(seqnames=infchroms, |
|
81 | 82 |
ranges=IRanges(start=infstarts, end=infends), |
82 |
- strand=Rle(strand('*'), sum(numbins)), |
|
83 |
+ strand="*", |
|
83 | 84 |
state=infstates) |
84 | 85 |
suppressWarnings( inflated.data[[i1]] <- inflated.chrom ) |
85 | 86 |
} |
... | ... |
@@ -2,7 +2,7 @@ correlation.analysis <- function(multi.hmm, IDs=NULL, plot=TRUE) { |
2 | 2 |
|
3 | 3 |
## Intercept user input |
4 | 4 |
if (check.multivariate.model(multi.hmm)!=0) { |
5 |
- message("Loading multivariate HMM from file ...", appendLF=F) |
|
5 |
+ message("Loading multivariate HMM from file ...", appendLF=FALSE) |
|
6 | 6 |
multi.hmm <- get(load(multi.hmm)) |
7 | 7 |
message(" done") |
8 | 8 |
if (check.multivariate.model(multi.hmm)!=0) stop("argument 'multi.hmm' expects a multivariate hmm object or a file that contains a multivariate hmm (type ?multi.hmm for help)") |
... | ... |
@@ -15,7 +15,7 @@ |
15 | 15 |
#'new.states.decimal <- bin2dec(states.binary) |
16 | 16 |
#' @name conversion |
17 | 17 |
NULL |
18 |
-#' |
|
18 |
+ |
|
19 | 19 |
#' @describeIn conversion Decimal to binary conversion. |
20 | 20 |
#' @param dec A vector with whole numbers. |
21 | 21 |
#' @param colnames The column names for the returned matrix. If specified, \code{ndigits} will be the length of \code{colnames}. |
... | ... |
@@ -1,6 +1,7 @@ |
1 | 1 |
#' Binned read counts |
2 | 2 |
#' |
3 |
-#' A \link{GRanges} object which contains binned read counts as meta data column \code{reads}. It is output of the various \link{binning} functions.#' @name binned.data |
|
3 |
+#' A \link{GRanges} object which contains binned read counts as meta data column \code{reads}. It is output of the various \link{binning} functions. |
|
4 |
+#' @name binned.data |
|
4 | 5 |
NULL |
5 | 6 |
|
6 | 7 |
#' Format of the \code{chrom.length.file} |
... | ... |
@@ -33,16 +33,16 @@ plotHeatmapFoldEnrichment <- function(enrichment.table) { |
33 | 33 |
et <- enrichment.table |
34 | 34 |
|
35 | 35 |
## Get columns with fold enrichment values |
36 |
- fe.names <- unlist(lapply(as.list(names(et)), function(x) { grep(x=x, pattern='fold_enrich', value=T) } )) |
|
36 |
+ fe.names <- unlist(lapply(as.list(names(et)), function(x) { grep(x=x, pattern='fold_enrich', value=TRUE) } )) |
|
37 | 37 |
fe <- et[,c('state',fe.names)] |
38 | 38 |
names(fe) <- gsub(pattern='\\.fold_enrich', replacement='', x=names(fe)) |
39 | 39 |
df.feat <- melt(fe, id.vars='state', variable.name='feature', value.name='fold.enrichment') |
40 | 40 |
|
41 | 41 |
## Get columns with p-values |
42 |
- p.dep.names <- unlist(lapply(as.list(names(et)), function(x) { grep(x=x, pattern='p_dep', value=T) } )) |
|
42 |
+ p.dep.names <- unlist(lapply(as.list(names(et)), function(x) { grep(x=x, pattern='p_dep', value=TRUE) } )) |
|
43 | 43 |
pv <- et[,c('state',p.dep.names)] |
44 | 44 |
df.p.dep <- melt(pv, id.vars='state', variable.name='feature', value.name='p.value') |
45 |
- p.enr.names <- unlist(lapply(as.list(names(et)), function(x) { grep(x=x, pattern='p_enr', value=T) } )) |
|
45 |
+ p.enr.names <- unlist(lapply(as.list(names(et)), function(x) { grep(x=x, pattern='p_enr', value=TRUE) } )) |
|
46 | 46 |
pv <- et[,c('state',p.enr.names)] |
47 | 47 |
df.p.enr <- melt(pv, id.vars='state', variable.name='feature', value.name='p.value') |
48 | 48 |
# Combine |
... | ... |
@@ -9,7 +9,7 @@ |
9 | 9 |
#' @param region A combination of \code{c('start','inside','end')} specifying the region of the annotation for which the enrichment will be calculated. |
10 | 10 |
#' @param what A combination of \code{c('combstates','binstates','reads')} specifying which statistic to calculate. |
11 | 11 |
#' @return A \code{list()} containing \code{data.frame()}s for enrichment of combinatorial states and binary states at the start, end and inside of the annotation. |
12 |
-#' @import S4Vectors |
|
12 |
+#' @importFrom S4Vectors as.factor subjectHits queryHits |
|
13 | 13 |
#' @export |
14 | 14 |
enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region=c('start','inside','end'), what=c('combstates','binstates','reads'), intervals=21) { |
15 | 15 |
|
... | ... |
@@ -26,7 +26,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
26 | 26 |
eCurve <- list() |
27 | 27 |
eCurve$combstates <- list() |
28 | 28 |
eCurve$binstates <- list() |
29 |
- eCurve$reads <- list() |
|
29 |
+ eCurve$counts <- list() |
|
30 | 30 |
|
31 | 31 |
## Get combinatorial and binary states |
32 | 32 |
combstates <- hmm$bins$combination |
... | ... |
@@ -41,7 +41,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
41 | 41 |
|
42 | 42 |
# Get bins that overlap annotation |
43 | 43 |
ind <- findOverlaps(hmm$bins, annotation) |
44 |
- bins.per.annotation <- table(subjectHits(ind)) # Table is sorted! |
|
44 |
+ bins.per.annotation <- table(S4Vectors::subjectHits(ind)) # Table is sorted! |
|
45 | 45 |
annotation$num.bins.spanning <- rep(0, length(annotation)) |
46 | 46 |
annotation$num.bins.spanning[as.numeric(names(bins.per.annotation))] <- bins.per.annotation |
47 | 47 |
strand.per.annotation <- S4Vectors::as.factor(strand(annotation[as.numeric(names(bins.per.annotation))])) |
... | ... |
@@ -50,18 +50,18 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
50 | 50 |
# States and strands per bin-that-overlaps-an-annotation |
51 | 51 |
anno.binstates <- NULL |
52 | 52 |
if ('binstates' %in% what) { |
53 |
- anno.binstates <- binstates[queryHits(ind),] |
|
53 |
+ anno.binstates <- binstates[S4Vectors::queryHits(ind),] |
|
54 | 54 |
} |
55 | 55 |
anno.combstates <- NULL |
56 | 56 |
if ('combstates' %in% what) { |
57 |
- anno.combstates <- combstates[queryHits(ind)] |
|
57 |
+ anno.combstates <- combstates[S4Vectors::queryHits(ind)] |
|
58 | 58 |
} |
59 | 59 |
anno.reads <- NULL |
60 | 60 |
if ('reads' %in% what) { |
61 |
- anno.reads <- hmm$bins$reads[queryHits(ind),] |
|
61 |
+ anno.reads <- hmm$bins$counts[S4Vectors::queryHits(ind),] |
|
62 | 62 |
colnames(anno.reads) <- NULL |
63 | 63 |
} |
64 |
- anno.strands <- strand(annotation)[subjectHits(ind)] |
|
64 |
+ anno.strands <- strand(annotation)[S4Vectors::subjectHits(ind)] |
|
65 | 65 |
|
66 | 66 |
# Relative coordinate of every bin (TSS=0, TES=1) |
67 | 67 |
relcoord <- list() |
... | ... |
@@ -77,7 +77,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
77 | 77 |
# Collect in data.frame |
78 | 78 |
anno.df <- data.frame(as.data.frame(ind), strand=as.character(anno.strands)) |
79 | 79 |
anno.df$binstate <- anno.binstates |
80 |
- anno.df$reads <- anno.reads |
|
80 |
+ anno.df$counts <- anno.reads |
|
81 | 81 |
anno.df$combstate <- anno.combstates |
82 | 82 |
# Reorder to add stuff that was calculated from sorted table |
83 | 83 |
anno.df <- cbind(anno.df[order(anno.df$subjectHits),], relcoord=relcoord) |
... | ... |
@@ -98,7 +98,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
98 | 98 |
binstates.inside <- matrix(NA, nrow=length(intervals), ncol=ncol(anno.binstates), dimnames=list(interval=intervals, track=hmm$IDs)) |
99 | 99 |
for (interval in intervals) { |
100 | 100 |
mask <- anno.df$interval==interval |
101 |
- binstates.inside[as.character(interval),] <- colMeans(anno.df[,grepl('binstate',names(anno.df))][mask,], na.rm=T) |
|
101 |
+ binstates.inside[as.character(interval),] <- colMeans(anno.df[,grepl('binstate',names(anno.df))][mask,], na.rm=TRUE) |
|
102 | 102 |
} |
103 | 103 |
eCurve$binstates$inside <- binstates.inside |
104 | 104 |
} |
... | ... |
@@ -115,9 +115,9 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
115 | 115 |
reads.inside <- matrix(NA, nrow=length(intervals), ncol=ncol(anno.binstates), dimnames=list(interval=intervals, track=hmm$IDs)) |
116 | 116 |
for (interval in intervals) { |
117 | 117 |
mask <- anno.df$interval==interval |
118 |
- reads.inside[as.character(interval),] <- colMeans(anno.df[,grepl('reads',names(anno.df))][mask,], na.rm=T) |
|
118 |
+ reads.inside[as.character(interval),] <- colMeans(anno.df[,grepl('reads',names(anno.df))][mask,], na.rm=TRUE) |
|
119 | 119 |
} |
120 |
- eCurve$reads$inside <- reads.inside |
|
120 |
+ eCurve$counts$inside <- reads.inside |
|
121 | 121 |
} |
122 | 122 |
} |
123 | 123 |
|
... | ... |
@@ -143,7 +143,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
143 | 143 |
fold[is.na(fold)] <- 0 |
144 | 144 |
combstates.start[as.character(ilag),] <- fold |
145 | 145 |
} |
146 |
- if ('reads' %in% what) reads.start[as.character(ilag),] <- colMeans(hmm$bins$reads[index,]) |
|
146 |
+ if ('reads' %in% what) reads.start[as.character(ilag),] <- colMeans(hmm$bins$counts[index,]) |
|
147 | 147 |
} |
148 | 148 |
if ('binstates' %in% what) { |
149 | 149 |
rownames(binstates.start) <- as.numeric(rownames(binstates.start)) * binsize |
... | ... |
@@ -155,7 +155,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
155 | 155 |
} |
156 | 156 |
if ('reads' %in% what) { |
157 | 157 |
rownames(reads.start) <- as.numeric(rownames(reads.start)) * binsize |
158 |
- eCurve$reads$start <- reads.start |
|
158 |
+ eCurve$counts$start <- reads.start |
|
159 | 159 |
} |
160 | 160 |
} |
161 | 161 |
if ('end' %in% region) { |
... | ... |
@@ -180,7 +180,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
180 | 180 |
fold[is.na(fold)] <- 0 |
181 | 181 |
combstates.start[as.character(ilag),] <- fold |
182 | 182 |
} |
183 |
- if ('reads' %in% what) reads.end[as.character(ilag),] <- colMeans(hmm$bins$reads[index,]) |
|
183 |
+ if ('reads' %in% what) reads.end[as.character(ilag),] <- colMeans(hmm$bins$counts[index,]) |
|
184 | 184 |
} |
185 | 185 |
if ('binstates' %in% what) { |
186 | 186 |
rownames(binstates.end) <- as.numeric(rownames(binstates.end)) * binsize |
... | ... |
@@ -192,7 +192,7 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
192 | 192 |
} |
193 | 193 |
if ('reads' %in% what) { |
194 | 194 |
rownames(reads.end) <- as.numeric(rownames(reads.end)) * binsize |
195 |
- eCurve$reads$end <- reads.end |
|
195 |
+ eCurve$counts$end <- reads.end |
|
196 | 196 |
} |
197 | 197 |
} |
198 | 198 |
|
... | ... |
@@ -200,31 +200,32 @@ enrichmentCurve <- function(hmm, annotation, bp.around.annotation=10000, region= |
200 | 200 |
|
201 | 201 |
} |
202 | 202 |
|
203 |
+#' @importFrom S4Vectors subjectHits queryHits |
|
203 | 204 |
plot.cross.correlation <- function(hmm, annotation, bp.around.annotation=10000) { |
204 | 205 |
|
205 |
- ## Debugging |
|
206 |
- library(biomaRt) |
|
207 |
- bp.around.annotation=10000 |
|
208 |
-# hg19 <- useMart('ENSEMBL_MART_ENSEMBL', host='grch37.ensembl.org', dataset='hsapiens_gene_ensembl') |
|
209 |
-# filters <- listFilters(hg19) |
|
210 |
-# attributes <- listAttributes(hg19) |
|
211 |
-# hg19.genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand'), mart=hg19) |
|
212 |
-# names(hg19.genes)[1:5] <- c('ID','chrom','start','end','strand') |
|
213 |
-# genes <- GRanges(seqnames=paste0('chr',hg19.genes$chrom), ranges=IRanges(start=hg19.genes$start, end=hg19.genes$end), strand=hg19.genes$strand, gene_id=hg19.genes$ID) |
|
214 |
-# file <- "multiresults_all_chrom/multivariate_mark_H3K27ac_patient_149_binsize_1000.RData" |
|
206 |
+# ## Debugging |
|
207 |
+# library(biomaRt) |
|
208 |
+# bp.around.annotation=10000 |
|
209 |
+# # hg19 <- biomaRt::useMart('ENSEMBL_MART_ENSEMBL', host='grch37.ensembl.org', dataset='hsapiens_gene_ensembl') |
|
210 |
+# # filters <- biomaRt::listFilters(hg19) |
|
211 |
+# # attributes <- biomaRt::listAttributes(hg19) |
|
212 |
+# # hg19.genes <- biomaRt::getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand'), mart=hg19) |
|
213 |
+# # names(hg19.genes)[1:5] <- c('ID','chrom','start','end','strand') |
|
214 |
+# # genes <- GRanges(seqnames=paste0('chr',hg19.genes$chrom), ranges=IRanges(start=hg19.genes$start, end=hg19.genes$end), strand=hg19.genes$strand, gene_id=hg19.genes$ID) |
|
215 |
+# # file <- "multiresults_all_chrom/multivariate_mark_H3K27ac_patient_149_binsize_1000.RData" |
|
216 |
+# # hmm <- get(load(file)) |
|
217 |
+# # annotation <- genes |
|
218 |
+# |
|
219 |
+# rat.ensembl59 <- biomaRt::useMart('ENSEMBL_MART_ENSEMBL', host='aug2010.archive.ensembl.org', dataset='rnorvegicus_gene_ensembl') |
|
220 |
+# filters <- biomaRt::listFilters(rat.ensembl59) |
|
221 |
+# attributes <- biomaRt::listAttributes(rat.ensembl59) |
|
222 |
+# rat.genes <- biomaRt::getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand'), mart=rat.ensembl59) |
|
223 |
+# names(rat.genes)[1:5] <- c('ID','chrom','start','end','strand') |
|
224 |
+# genes <- GRanges(seqnames=paste0('chr',rat.genes$chrom), ranges=IRanges(start=rat.genes$start, end=rat.genes$end), strand=rat.genes$strand, gene_id=rat.genes$ID) |
|
225 |
+# file <- 'multiresults/euratrans_lv_binsize_1000.RData' |
|
215 | 226 |
# hmm <- get(load(file)) |
216 | 227 |
# annotation <- genes |
217 | 228 |
|
218 |
- rat.ensembl59 <- useMart('ENSEMBL_MART_ENSEMBL', host='aug2010.archive.ensembl.org', dataset='rnorvegicus_gene_ensembl') |
|
219 |
- filters <- listFilters(rat.ensembl59) |
|
220 |
- attributes <- listAttributes(rat.ensembl59) |
|
221 |
- rat.genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand'), mart=rat.ensembl59) |
|
222 |
- names(rat.genes)[1:5] <- c('ID','chrom','start','end','strand') |
|
223 |
- genes <- GRanges(seqnames=paste0('chr',rat.genes$chrom), ranges=IRanges(start=rat.genes$start, end=rat.genes$end), strand=rat.genes$strand, gene_id=rat.genes$ID) |
|
224 |
- file <- 'multiresults/euratrans_lv_binsize_1000.RData' |
|
225 |
- hmm <- get(load(file)) |
|
226 |
- annotation <- genes |
|
227 |
- |
|
228 | 229 |
## Variables |
229 | 230 |
binsize <- width(hmm$bins)[1] |
230 | 231 |
lag <- round(bp.around.annotation/binsize) |
... | ... |
@@ -234,26 +235,26 @@ plot.cross.correlation <- function(hmm, annotation, bp.around.annotation=10000) |
234 | 235 |
# Select only annotations that span more than 10 bins |
235 | 236 |
annotation.sub <- annotation |
236 | 237 |
# Not necessary but makes it easier to debug |
237 |
- annotation.sub <- keepSeqlevels(annotation.sub, seqlevels(hmm$bins)) |
|
238 |
+ annotation.sub <- GenomeInfoDb::keepSeqlevels(annotation.sub, seqlevels(hmm$bins)) |
|
238 | 239 |
seqlengths(annotation.sub) <- seqlengths(hmm$bins) |
239 | 240 |
annotation.sub <- sort(annotation.sub) |
240 | 241 |
|
241 | 242 |
# Get bins that overlap annotation |
242 | 243 |
ind <- findOverlaps(hmm$bins, annotation.sub) |
243 |
- bins.per.annotation <- table(subjectHits(ind)) # Table is sorted! |
|
244 |
+ bins.per.annotation <- table(S4Vectors::subjectHits(ind)) # Table is sorted! |
|
244 | 245 |
annotation.sub$num.bins.spanning <- rep(0, length(annotation.sub)) |
245 | 246 |
annotation.sub$num.bins.spanning[as.numeric(names(bins.per.annotation))] <- bins.per.annotation |
246 | 247 |
strand.per.annotation <- as.factor(strand(annotation.sub[as.numeric(names(bins.per.annotation))])) |
247 | 248 |
names(strand.per.annotation) <- names(bins.per.annotation) |
248 | 249 |
|
249 | 250 |
# Get binary states |
250 |
- binstates <- dec2bin(hmm$bins$state, ndigits=ncol(hmm$bins$reads)) |
|
251 |
+ binstates <- dec2bin(hmm$bins$state, ndigits=ncol(hmm$bins$counts)) |
|
251 | 252 |
|
252 | 253 |
# States, reads and strands per bin-that-overlaps-an-annotation |
253 |
- ind.binstates <- binstates[queryHits(ind),] |
|
254 |
- ind.reads <- hmm$bins$reads[queryHits(ind),] |
|
254 |
+ ind.binstates <- binstates[S4Vectors::queryHits(ind),] |
|
255 |
+ ind.reads <- hmm$bins$counts[S4Vectors::queryHits(ind),] |
|
255 | 256 |
colnames(ind.reads) <- NULL |
256 |
- ind.strands <- strand(annotation.sub)[subjectHits(ind)] |
|
257 |
+ ind.strands <- strand(annotation.sub)[S4Vectors::subjectHits(ind)] |
|
257 | 258 |
|
258 | 259 |
# Relative coordinate of every bin (TSS=0, TTS=1) |
259 | 260 |
relcoord <- list() |
... | ... |
@@ -284,13 +285,13 @@ plot.cross.correlation <- function(hmm, annotation, bp.around.annotation=10000) |
284 | 285 |
ind.df$interval <- intervals[findInterval(ind.df$relcoord, intervals)] |
285 | 286 |
|
286 | 287 |
# Mean over intervals |
287 |
- binstates.inside <- matrix(NA, nrow=length(intervals), ncol=ncol(ind.binstates), dimnames=list(interval=intervals, track=colnames(hmm$bins$reads))) |
|
288 |
- reads.inside <- matrix(NA, nrow=length(intervals), ncol=ncol(ind.binstates), dimnames=list(interval=intervals, track=colnames(hmm$bins$reads))) |
|
288 |
+ binstates.inside <- matrix(NA, nrow=length(intervals), ncol=ncol(ind.binstates), dimnames=list(interval=intervals, track=colnames(hmm$bins$counts))) |
|
289 |
+ reads.inside <- matrix(NA, nrow=length(intervals), ncol=ncol(ind.binstates), dimnames=list(interval=intervals, track=colnames(hmm$bins$counts))) |
|
289 | 290 |
for (interval in intervals) { |
290 | 291 |
i1 <- which(interval==intervals) |
291 | 292 |
mask <- ind.df$interval==interval |
292 |
- binstates.inside[i1,] <- colMeans(ind.df[,grepl('binstate',names(ind.df))][mask,], na.rm=T) |
|
293 |
- reads.inside[i1,] <- colMeans(ind.df[,grepl('read',names(ind.df))][mask,], na.rm=T) |
|
293 |
+ binstates.inside[i1,] <- colMeans(ind.df[,grepl('binstate',names(ind.df))][mask,], na.rm=TRUE) |
|
294 |
+ reads.inside[i1,] <- colMeans(ind.df[,grepl('read',names(ind.df))][mask,], na.rm=TRUE) |
|
294 | 295 |
} |
295 | 296 |
|
296 | 297 |
### 10000 bp before and after annotation ### |
... | ... |
@@ -301,14 +302,14 @@ plot.cross.correlation <- function(hmm, annotation, bp.around.annotation=10000) |
301 | 302 |
index.start.plus <- index.start.plus[!is.na(index.start.plus)] |
302 | 303 |
index.start.minus <- index.start.minus[!is.na(index.start.minus)] |
303 | 304 |
# Occurrences at every bin position relative to feature |
304 |
- binstates.before <- array(dim=c(length(-lag:0), ncol(hmm$bins$reads)), dimnames=list(lag=-lag:0, track=colnames(hmm$bins$reads))) |
|
305 |
- reads.before <- array(dim=c(length(-lag:0), ncol(hmm$bins$reads)), dimnames=list(lag=-lag:0, track=colnames(hmm$bins$reads))) |
|
305 |
+ binstates.before <- array(dim=c(length(-lag:0), ncol(hmm$bins$counts)), dimnames=list(lag=-lag:0, track=colnames(hmm$bins$counts))) |
|
306 |
+ reads.before <- array(dim=c(length(-lag:0), ncol(hmm$bins$counts)), dimnames=list(lag=-lag:0, track=colnames(hmm$bins$counts))) |
|