... | ... |
@@ -6,7 +6,7 @@ hmmList2GRangesList <- function(hmm.list, reduce=TRUE, numCPU=1, consensus=FALSE |
6 | 6 |
## Transform to GRanges |
7 | 7 |
cat('transforming to GRanges\n') |
8 | 8 |
if (numCPU > 1) { |
9 |
- library(doParallel) |
|
9 |
+ suppressMessages( library(doParallel) ) |
|
10 | 10 |
cl <- makeCluster(numCPU) |
11 | 11 |
registerDoParallel(cl) |
12 | 12 |
cfun <- function(...) { GRangesList(...) } |
... | ... |
@@ -28,7 +28,7 @@ hmmList2GRangesList <- function(hmm.list, reduce=TRUE, numCPU=1, consensus=FALSE |
28 | 28 |
} |
29 | 29 |
if (consensus) { |
30 | 30 |
consensus.gr <- disjoin(unlist(hmm.grl)) |
31 |
- constates <- matrix(NA, ncol=length(hmm.grl), nrow=length(hmm.grl[[1]])) |
|
31 |
+ constates <- matrix(NA, ncol=length(hmm.grl), nrow=length(consensus.gr)) |
|
32 | 32 |
for (i1 in 1:length(hmm.grl)) { |
33 | 33 |
hmm.gr <- hmm.grl[[i1]] |
34 | 34 |
splt <- split(hmm.gr, mcols(hmm.gr)$state) |
... | ... |
@@ -38,8 +38,8 @@ hmmList2GRangesList <- function(hmm.list, reduce=TRUE, numCPU=1, consensus=FALSE |
38 | 38 |
} |
39 | 39 |
} |
40 | 40 |
if (consensus) { |
41 |
- meanstates <- apply(constates, 1, mean) |
|
42 |
- mcols(consensus.gr) <- meanstates |
|
41 |
+ meanstates <- apply(constates, 1, mean, na.rm=T) |
|
42 |
+ mcols(consensus.gr)$meanstate <- meanstates |
|
43 | 43 |
return(list(grl=hmm.grl, consensus=consensus.gr, constates=constates)) |
44 | 44 |
} else { |
45 | 45 |
return(hmm.grl) |
... | ... |
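The consensus branch above disjoins the union of all per-model segments and then records, per disjoint piece, which state of each model covers it; the corrected nrow and the na.rm=T make the mean state well defined for pieces that some models do not cover. A minimal standalone sketch of that logic (toy coordinates and state levels, not package data):

suppressMessages( library(GenomicRanges) )
state.levels <- c('monosomy','disomy','trisomy')
gr1 <- GRanges('chr1', IRanges(c(1,101), c(100,200)), state=factor(c('disomy','trisomy'), levels=state.levels))
gr2 <- GRanges('chr1', IRanges(1, 150), state=factor('disomy', levels=state.levels))
hmm.grl <- GRangesList(gr1, gr2)
consensus.gr <- disjoin(unlist(hmm.grl))
constates <- matrix(NA, ncol=length(hmm.grl), nrow=length(consensus.gr))
for (i1 in 1:length(hmm.grl)) {
	splt <- split(hmm.grl[[i1]], mcols(hmm.grl[[i1]])$state)
	constates[,i1] <- as.matrix(findOverlaps(consensus.gr, splt, select='first'))
}
mcols(consensus.gr)$meanstate <- apply(constates, 1, mean, na.rm=TRUE)
consensus.gr   # pieces not covered by a model are NA in 'constates' and skipped in the mean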
@@ -131,8 +131,8 @@ bed2GRanges <- function(bedfile, chrom.length.file, skip=1, binsize=NULL) { |
131 | 131 |
|
132 | 132 |
## Inflate every range with bins |
133 | 133 |
if (!is.null(binsize)) { |
134 |
- inflated.data <- GRangesList() |
|
135 | 134 |
grl <- split(data, seqnames(data)) |
135 |
+ inflated.data <- GRangesList() |
|
136 | 136 |
for (i1 in 1:length(grl)) { |
137 | 137 |
rgr <- ranges(grl[[i1]]) |
138 | 138 |
widths <- width(rgr) |
... | ... |
@@ -156,7 +156,7 @@ bed2GRanges <- function(bedfile, chrom.length.file, skip=1, binsize=NULL) { |
156 | 156 |
ranges=IRanges(start=infstarts, end=infends), |
157 | 157 |
strand=Rle(strand('*'), sum(numbins)), |
158 | 158 |
state=infstates) |
159 |
- inflated.data[[i1]] <- inflated.chrom |
|
159 |
+ suppressWarnings( inflated.data[[i1]] <- inflated.chrom ) |
|
160 | 160 |
} |
161 | 161 |
inflated.data <- unlist(inflated.data) |
162 | 162 |
seqlengths(inflated.data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
... | ... |
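For reference, the inflation step above turns every reduced range into consecutive bins of width 'binsize'. A toy sketch of the start/end arithmetic for a single range (invented numbers; variable names mirror the function):

library(IRanges)
binsize <- 1000
rgr <- IRanges(start=2001, end=5000)                       # one reduced range of width 3000
numbins <- width(rgr) / binsize                            # 3 bins
infstarts <- start(rgr) + binsize * (seq_len(numbins) - 1)
infends <- infstarts + binsize - 1
IRanges(start=infstarts, end=infends)                      # 2001-3000, 3001-4000, 4001-5000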
@@ -1,125 +1,120 @@ |
1 |
-# =============================================== |
|
2 |
-# Write color-coded tracks with univariate states |
|
3 |
-# =============================================== |
|
4 |
-export.hmm2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", numCPU=1) { |
|
5 |
- |
|
6 |
- ## Intercept user input |
|
7 |
- if (check.univariate.modellist(uni.hmm.list)!=0) { |
|
8 |
- cat("Loading univariate HMMs from files ...") |
|
9 |
- mlist <- NULL |
|
10 |
- for (modelfile in uni.hmm.list) { |
|
11 |
- mlist[[length(mlist)+1]] <- get(load(modelfile)) |
|
12 |
- } |
|
13 |
- uni.hmm.list <- mlist |
|
14 |
- remove(mlist) |
|
15 |
- cat(" done\n") |
|
16 |
- if (check.univariate.modellist(uni.hmm.list)!=0) stop("argument 'uni.hmm.list' expects a list of univariate hmms or a list of files that contain univariate hmms") |
|
1 |
+# ==================================== |
|
2 |
+# Write color-coded tracks with states |
|
3 |
+# ==================================== |
|
4 |
+export.hmm2bed <- function(hmm.list, file="view_me_in_genome_browser", numCPU=1) { |
|
5 |
+ |
|
6 |
+ ## Function definitions |
|
7 |
+ insertchr <- function(hmm.gr) { |
|
8 |
+ # Change chromosome names from '1' to 'chr1' if necessary |
|
9 |
+ mask <- which(!grepl('chr', seqnames(hmm.gr))) |
|
10 |
+ mcols(hmm.gr)$chromosome <- as.character(seqnames(hmm.gr)) |
|
11 |
+ mcols(hmm.gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(hmm.gr)$chromosome[mask]) |
|
12 |
+ mcols(hmm.gr)$chromosome <- as.factor(mcols(hmm.gr)$chromosome) |
|
13 |
+ return(hmm.gr) |
|
17 | 14 |
} |
18 | 15 |
|
16 |
+ ## Load models |
|
17 |
+ hmm.list <- loadHmmsFromFiles(hmm.list) |
|
18 |
+ |
|
19 | 19 |
## Transform to GRanges |
20 |
- cat('transforming to GRanges\n') |
|
21 |
- if (numCPU > 1) { |
|
22 |
- library(doParallel) |
|
23 |
- cl <- makeCluster(numCPU) |
|
24 |
- registerDoParallel(cl) |
|
25 |
- cfun <- function(...) { GRangesList(...) } |
|
26 |
- uni.hmm.grl <- foreach (uni.hmm = uni.hmm.list, .packages='aneufinder', .combine='cfun', .multicombine=TRUE) %dopar% { |
|
27 |
- hmm2GRanges(uni.hmm, reduce=T) |
|
28 |
- } |
|
29 |
- consensus.gr <- disjoin(unlist(uni.hmm.grl)) |
|
30 |
- constates <- foreach (uni.hmm.gr = uni.hmm.grl, .packages='GenomicRanges', .combine='cbind') %dopar% { |
|
31 |
- splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state) |
|
32 |
- mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first')) |
|
33 |
- } |
|
34 |
- stopCluster(cl) |
|
35 |
- } else { |
|
36 |
- uni.hmm.grl <- GRangesList() |
|
37 |
- for (uni.hmm in uni.hmm.list) { |
|
38 |
- uni.hmm.grl[[length(uni.hmm.grl)+1]] <- hmm2GRanges(uni.hmm, reduce=T) |
|
39 |
- } |
|
40 |
- consensus.gr <- disjoin(unlist(uni.hmm.grl)) |
|
41 |
- constates <- matrix(NA, ncol=length(uni.hmm.grl), nrow=length(uni.hmm.grl[[1]])) |
|
42 |
- for (i1 in 1:length(uni.hmm.grl)) { |
|
43 |
- uni.hmm.gr <- uni.hmm.grl[[i1]] |
|
44 |
- splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state) |
|
45 |
- mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first')) |
|
46 |
- constates[,i1] <- mind |
|
47 |
- } |
|
48 |
- } |
|
49 |
- meanstates <- apply(constates, 1, mean) |
|
50 |
- mcols(consensus.gr) <- meanstates |
|
20 |
+ temp <- hmmList2GRangesList(hmm.list, reduce=T, numCPU=numCPU, consensus=T) |
|
21 |
+ consensus.gr <- insertchr(temp$consensus) |
|
22 |
+ hmm.grl <- lapply(temp$grl, insertchr) |
|
51 | 23 |
|
52 | 24 |
# Variables |
53 |
- nummod <- length(uni.hmm.list) |
|
25 |
+ nummod <- length(hmm.list) |
|
26 |
+ numstates <- hmm.list[[1]]$num.states |
|
54 | 27 |
file <- paste(file,"bed", sep=".") |
55 | 28 |
|
56 | 29 |
# Generate the colors |
57 |
- colors <- state.colors[state.labels] |
|
30 |
+ colors <- state.colors[levels(hmm.grl[[1]]$state)] |
|
58 | 31 |
RGBs <- t(col2rgb(colors)) |
59 | 32 |
RGBs <- apply(RGBs,1,paste,collapse=",") |
33 |
+ cf <- colorRamp(colors, space='rgb', interpolate='linear') |
|
34 |
+ consensusRGBs <- round(cf((mcols(consensus.gr)$meanstate-1)/(numstates-1)), 0) |
|
35 |
+ itemconsensusRgb <- apply(consensusRGBs,1,paste,collapse=",") |
|
60 | 36 |
|
61 | 37 |
# Write first line to file |
62 |
- cat("browser hide all\n", file=file) |
|
38 |
+ cat('writing to file',file,'\n') |
|
39 |
+# cat("browser hide all\n", file=file) |
|
40 |
+ cat("", file=file) |
|
63 | 41 |
|
64 | 42 |
### Write every model to file ### |
65 | 43 |
for (imod in 1:nummod) { |
66 |
- uni.hmm <- uni.hmm.list[[imod]] |
|
67 |
- uni.hmm.gr <- uni.hmm.grl[[imod]] |
|
68 |
- priority <- 50 + imod |
|
69 |
- cat(paste("track name=",uni.hmm$ID," description=\"univariate calls for ",names(uni.hmm.list)[imod],"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
70 |
- |
|
71 |
- # Change chromosome names from '1' to 'chr1' if necessary |
|
72 |
- mask <- which(!grepl('chr', seqnames(uni.hmm.gr))) |
|
73 |
- mcols(uni.hmm.gr)$chromosome <- as.character(seqnames(uni.hmm.gr)) |
|
74 |
- mcols(uni.hmm.gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(uni.hmm.gr)$chromosome[mask]) |
|
75 |
- mcols(uni.hmm.gr)$chromosome <- as.factor(mcols(uni.hmm.gr)$chromosome) |
|
76 |
- |
|
77 |
- # Collapse the calls |
|
78 |
- collapsed.calls <- as.data.frame(uni.hmm.gr)[c('chromosome','start','end','state')] |
|
79 |
- |
|
80 |
- # Append to file |
|
44 |
+ cat('writing hmm',imod,'/',nummod,'\r') |
|
45 |
+ hmm <- hmm.list[[imod]] |
|
46 |
+ hmm.gr <- hmm.grl[[imod]] |
|
47 |
+ priority <- 51 + imod |
|
48 |
+		cat(paste0("track name=\"",hmm$ID," state\""," description=\"",hmm$ID," state","\" visibility=1 itemRgb=On priority=",priority,"\n"), file=file, append=TRUE)
|
49 |
+ collapsed.calls <- as.data.frame(hmm.gr)[c('chromosome','start','end','state')] |
|
81 | 50 |
itemRgb <- RGBs[as.character(collapsed.calls$state)] |
82 | 51 |
numsegments <- nrow(collapsed.calls) |
83 | 52 |
df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
84 | 53 |
write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
85 | 54 |
} |
55 |
+ cat('\n') |
|
56 |
+ |
|
57 |
+ ### Write consensus model to file ### |
|
58 |
+ cat('writing mean states to file\n') |
|
59 |
+ priority <- 50 |
|
60 |
+ cat(paste0("track name=\"","average state","\" description=\"","average state","\" visibility=1 itemRgb=On priority=",priority,"\n"), file=file, append=TRUE) |
|
61 |
+ collapsed.calls <- as.data.frame(consensus.gr)[c('chromosome','start','end','meanstate')] |
|
62 |
+ numsegments <- nrow(collapsed.calls) |
|
63 |
+ df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemconsensusRgb) |
|
64 |
+ write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
86 | 65 |
|
87 | 66 |
} |
88 | 67 |
|
89 | 68 |
|
90 |
-# ==================================== |
|
91 |
-# Write signal tracks from binned data |
|
92 |
-# ==================================== |
|
93 |
-export.binned2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") { |
|
94 |
- |
|
95 |
- # Check user input |
|
96 |
- if (is.null(names(binned.data.list))) { |
|
97 |
- stop("Please name the list entries. The names will be used as track name for the Genome Browser file.") |
|
69 |
+# ============================= |
|
70 |
+# Write signal tracks from hmms |
|
71 |
+# ============================= |
|
72 |
+export.hmm2wiggle <- function(hmm.list, file="view_me_in_genome_browser", numCPU=1) { |
|
73 |
+ |
|
74 |
+ ## Function definitions |
|
75 |
+ insertchr <- function(hmm.gr) { |
|
76 |
+ # Change chromosome names from '1' to 'chr1' if necessary |
|
77 |
+ mask <- which(!grepl('chr', seqnames(hmm.gr))) |
|
78 |
+ mcols(hmm.gr)$chromosome <- as.character(seqnames(hmm.gr)) |
|
79 |
+ mcols(hmm.gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(hmm.gr)$chromosome[mask]) |
|
80 |
+ mcols(hmm.gr)$chromosome <- as.factor(mcols(hmm.gr)$chromosome) |
|
81 |
+ return(hmm.gr) |
|
98 | 82 |
} |
99 | 83 |
|
84 |
+ ## Load models |
|
85 |
+ hmm.list <- loadHmmsFromFiles(hmm.list) |
|
86 |
+ |
|
87 |
+ ## Transform to GRanges |
|
88 |
+ temp <- hmmList2GRangesList(hmm.list, reduce=F, numCPU=numCPU, consensus=T) |
|
89 |
+ hmm.grl <- temp$grl |
|
90 |
+ consensus.gr <- insertchr(temp$consensus) |
|
91 |
+ hmm.grl <- lapply(hmm.grl, insertchr) |
|
92 |
+ |
|
100 | 93 |
# Variables |
101 |
- file <- paste(file,"wig", sep=".") |
|
94 |
+ nummod <- length(hmm.list) |
|
95 |
+ numstates <- hmm.list[[1]]$num.states |
|
96 |
+ file <- paste(file,"wiggle", sep=".") |
|
102 | 97 |
|
103 |
- # Write to file |
|
104 |
- cat("browser hide all\n", file=file) |
|
98 |
+ # Write first line to file |
|
99 |
+ cat('writing to file',file,'\n') |
|
100 |
+# cat("browser hide all\n", file=file) |
|
101 |
+ cat("", file=file) |
|
105 | 102 |
|
106 |
- # Go through binned.data.list |
|
107 |
- for (i1 in 1:length(binned.data.list)) { |
|
108 |
- binned.data <- binned.data.list[[i1]] |
|
109 |
- names(binned.data) <- binned.data.names |
|
110 |
- |
|
111 |
- # Write track information |
|
112 |
- name <- names(binned.data.list)[i1] |
|
113 |
- binsize <- diff(binned.data$start[1:2]) |
|
114 |
- description <- paste("binned read counts ",binsize,"bp", sep="") |
|
115 |
- cat(paste("track type=wiggle_0 name=\"",name,"\" description=\"",description,"\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",i1,"\n", sep=""), file=file, append=TRUE) |
|
103 |
+ ### Write every model to file ### |
|
104 |
+ for (imod in 1:nummod) { |
|
105 |
+ cat('writing hmm',imod,'/',nummod,'\r') |
|
106 |
+ hmm <- hmm.list[[imod]] |
|
107 |
+ hmm.gr <- hmm.grl[[imod]] |
|
108 |
+ priority <- 50 + imod |
|
109 |
+ binsize <- width(hmm.gr[1]) |
|
110 |
+ cat(paste("track type=wiggle_0 name=\"",hmm$ID," read count","\" description=\"",hmm$ID," read count","\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
116 | 111 |
# Write read data |
117 |
- for (chrom in unique(binned.data$chrom)) { |
|
112 |
+ for (chrom in unique(hmm.gr$chromosome)) { |
|
118 | 113 |
cat(paste("fixedStep chrom=",chrom," start=1 step=",binsize," span=",binsize,"\n", sep=""), file=file, append=TRUE) |
119 |
- write.table(binned.data$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE) |
|
114 |
+ write.table(mcols(hmm.gr[hmm.gr$chromosome==chrom])$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE) |
|
120 | 115 |
} |
121 | 116 |
} |
122 |
- |
|
117 |
+ cat('\n') |
|
123 | 118 |
} |
124 | 119 |
|
125 | 120 |
|
... | ... |
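The 'average state' track above maps the fractional mean state onto the state colors with colorRamp(). A small sketch of that mapping with made-up colors (the package takes them from state.colors):

numstates <- 6
colors <- c('gray30','red','orange','green','blue','purple')   # hypothetical state colors
cf <- colorRamp(colors, space='rgb', interpolate='linear')
meanstate <- 2.5                                               # a fractional mean copy-number state
rgb.triple <- round(cf((meanstate - 1) / (numstates - 1)), 0)  # rescale states 1..6 to [0,1]
paste(rgb.triple, collapse=',')                                # "R,G,B" string for the itemRgb column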
@@ -1,7 +1,11 @@ |
1 |
-findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) { |
|
1 |
+findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) { |
|
2 | 2 |
|
3 | 3 |
## Intercept user input |
4 | 4 |
IDcheck <- ID #trigger error if not defined |
5 |
+ if (class(binned.data) != 'GRanges') { |
|
6 |
+ binned.data <- get(load(binned.data)) |
|
7 |
+ if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'reads' or a file that contains such an object") |
|
8 |
+ } |
|
5 | 9 |
if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric") |
6 | 10 |
if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer") |
7 | 11 |
if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer") |
... | ... |
@@ -28,28 +32,58 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
28 | 32 |
} |
29 | 33 |
|
30 | 34 |
# Filter high reads out, makes HMM faster |
31 |
- read.cutoff <- as.integer(quantile(reads, 0.9999)) |
|
32 |
- if (filter.reads) { |
|
33 |
- mask <- reads > read.cutoff |
|
34 |
- reads[mask] <- read.cutoff |
|
35 |
- numfiltered <- length(which(mask)) |
|
36 |
- cat(paste0("Replaced read counts > ",read.cutoff," (99.99% quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.\n")) |
|
37 |
-# if (numfiltered > 0) { |
|
38 |
-# warning(paste("There are very high read counts in your data (probably artificial). Replaced read counts > ",read.cutoff," (99.99% quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep="")) |
|
39 |
-# } |
|
35 |
+ read.cutoff <- as.integer(quantile(reads, read.cutoff.quantile)) |
|
36 |
+ mask <- reads > read.cutoff |
|
37 |
+ reads[mask] <- read.cutoff |
|
38 |
+ numfiltered <- length(which(mask)) |
|
39 |
+ if (numfiltered > 0) { |
|
40 |
+		cat(paste0("Replaced read counts > ",read.cutoff," (",100*read.cutoff.quantile,"% quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering.\n"))
|
40 | 41 |
} |
41 | 42 |
|
42 |
- |
|
43 | 43 |
## Call univariate in a for loop to enable multiple trials |
44 | 44 |
modellist <- list() |
45 | 45 |
for (i_try in 1:num.trials) { |
46 | 46 |
cat("\n\nTry ",i_try," of ",num.trials," ------------------------------\n") |
47 |
+ |
|
48 |
+ ## Initial parameters |
|
49 |
+ if (init == 'random') { |
|
50 |
+ A.initial <- matrix(runif(numstates^2), ncol=numstates) |
|
51 |
+ A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/") |
|
52 |
+ proba.initial <- runif(numstates) |
|
53 |
+ size.initial <- runif(1, min=0, max=1000) * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F)) |
|
54 |
+ prob.initial <- runif(1) * ifelse(state.distributions=='dnbinom', T, F) |
|
55 |
+ prob.initial[state.distributions=='dgeom'] <- runif(1) |
|
56 |
+ lambda.initial <- runif(1, min=0, max=1000) * cumsum(ifelse(state.distributions=='dpois', T, F)) |
|
57 |
+ } else if (init == 'standard') { |
|
58 |
+ A.initial <- matrix(NA, ncol=numstates, nrow=numstates) |
|
59 |
+ for (irow in 1:numstates) { |
|
60 |
+ for (icol in 1:numstates) { |
|
61 |
+ if (irow==icol) { A.initial[irow,icol] <- 0.9 } |
|
62 |
+ else { A.initial[irow,icol] <- 0.1/(numstates-1) } |
|
63 |
+ } |
|
64 |
+ } |
|
65 |
+ proba.initial <- rep(1/numstates, numstates) |
|
66 |
+ mean.initial <- mean(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F)) |
|
67 |
+ var.initial <- var(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F)) |
|
68 |
+ size.initial <- rep(0,numstates) |
|
69 |
+ prob.initial <- rep(0,numstates) |
|
70 |
+ mask <- state.distributions=='dnbinom' |
|
71 |
+ size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask]) |
|
72 |
+ prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask]) |
|
73 |
+ mask <- state.distributions=='dbinom' |
|
74 |
+ size.initial[mask] <- dbinom.size(mean.initial[mask], var.initial[mask]) |
|
75 |
+ prob.initial[mask] <- dbinom.prob(mean.initial[mask], var.initial[mask]) |
|
76 |
+ prob.initial[state.distributions=='dgeom'] <- 0.9 |
|
77 |
+ lambda.initial <- mean(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dpois', T, F)) |
|
78 |
+ } |
|
79 |
+ |
|
47 | 80 |
hmm <- .C("R_univariate_hmm", |
48 | 81 |
reads = as.integer(reads), # int* O |
49 | 82 |
num.bins = as.integer(numbins), # int* T |
50 | 83 |
num.states = as.integer(numstates), # int* N |
51 | 84 |
size = double(length=numstates), # double* size |
52 | 85 |
prob = double(length=numstates), # double* prob |
86 |
+ lambda = double(length=numstates), # double* lambda |
|
53 | 87 |
num.iterations = as.integer(max.iter), # int* maxiter |
54 | 88 |
time.sec = as.integer(max.time), # double* maxtime |
55 | 89 |
loglik.delta = as.double(eps.try), # double* eps |
... | ... |
@@ -60,11 +94,12 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
60 | 94 |
weights = double(length=numstates), # double* weights |
61 | 95 |
distr.type = as.integer(state.distributions), # int* distr_type |
62 | 96 |
ini.proc = as.integer(iniproc), # int* iniproc |
63 |
- size.initial = double(length=numstates), # double* initial_size |
|
64 |
- prob.initial = double(length=numstates), # double* initial_prob |
|
65 |
- A.initial = double(length=numstates*numstates), # double* initial_A |
|
66 |
- proba.initial = double(length=numstates), # double* initial_proba |
|
67 |
- use.initial.params = as.logical(0), # bool* use_initial_params |
|
97 |
+ size.initial = as.vector(size.initial), # double* initial_size |
|
98 |
+ prob.initial = as.vector(prob.initial), # double* initial_prob |
|
99 |
+ lambda.initial = as.vector(lambda.initial), # double* initial_lambda |
|
100 |
+ A.initial = as.vector(A.initial), # double* initial_A |
|
101 |
+ proba.initial = as.vector(proba.initial), # double* initial_proba |
|
102 |
+ use.initial.params = as.logical(1), # bool* use_initial_params |
|
68 | 103 |
num.threads = as.integer(num.threads), # int* num_threads |
69 | 104 |
error = as.integer(0), # int* error (error handling) |
70 | 105 |
read.cutoff = as.integer(read.cutoff) # int* read_cutoff |
... | ... |
@@ -95,6 +130,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
95 | 130 |
num.states = as.integer(numstates), # int* N |
96 | 131 |
size = double(length=numstates), # double* size |
97 | 132 |
prob = double(length=numstates), # double* prob |
133 |
+ lambda = double(length=numstates), # double* lambda |
|
98 | 134 |
num.iterations = as.integer(max.iter), # int* maxiter |
99 | 135 |
time.sec = as.integer(max.time), # double* maxtime |
100 | 136 |
loglik.delta = as.double(eps), # double* eps |
... | ... |
@@ -107,6 +143,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
107 | 143 |
ini.proc = as.integer(iniproc), # int* iniproc |
108 | 144 |
size.initial = as.vector(hmm$size), # double* initial_size |
109 | 145 |
prob.initial = as.vector(hmm$prob), # double* initial_prob |
146 |
+ lambda.initial = as.vector(hmm$lambda), # double* initial_lambda |
|
110 | 147 |
A.initial = as.vector(hmm$A), # double* initial_A |
111 | 148 |
proba.initial = as.vector(hmm$proba), # double* initial_proba |
112 | 149 |
use.initial.params = as.logical(1), # bool* use_initial_params |
... | ... |
@@ -117,6 +154,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
117 | 154 |
} |
118 | 155 |
|
119 | 156 |
# Add useful entries |
157 |
+ hmm$read.cutoff <- read.cutoff |
|
120 | 158 |
hmm$ID <- ID |
121 | 159 |
names(hmm$weights) <- state.labels |
122 | 160 |
hmm$coordinates <- data.frame(as.character(seqnames(binned.data)), start(ranges(binned.data)), end(ranges(binned.data))) |
... | ... |
@@ -125,32 +163,44 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
125 | 163 |
class(hmm) <- class.aneufinder.hmm |
126 | 164 |
hmm$states <- factor(state.labels, levels=state.labels)[hmm$states+1] |
127 | 165 |
hmm$eps <- eps |
128 |
- hmm$A <- matrix(hmm$A, ncol=hmm$num.states, byrow=TRUE) |
|
166 |
+ hmm$A <- matrix(hmm$A, ncol=hmm$num.states) |
|
129 | 167 |
rownames(hmm$A) <- state.labels |
130 | 168 |
colnames(hmm$A) <- state.labels |
131 |
- hmm$distributions <- data.frame(type=state.distributions, size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob)) |
|
132 |
- rownames(hmm$distributions) <- state.labels |
|
133 |
- # Treat 'null-mixed' separately |
|
134 |
- if ('null-mixed' %in% state.labels) { |
|
135 |
- hmm$distributions['null-mixed','mu'] <- (1-hmm$distributions['null-mixed','prob'])/hmm$distributions['null-mixed','prob'] |
|
136 |
- hmm$distributions['null-mixed','variance'] <- hmm$distributions['null-mixed','mu']/hmm$distributions['null-mixed','prob'] |
|
137 |
- hmm$distributions['null-mixed','size'] <- NA |
|
169 |
+ hmm$distributions <- data.frame() |
|
170 |
+ hmm$distributions.initial <- data.frame() |
|
171 |
+ for (idistr in 1:length(hmm$distr.type)) { |
|
172 |
+ distr <- levels(state.distributions)[hmm$distr.type[idistr]] |
|
173 |
+ if (distr == 'dnbinom') { |
|
174 |
+ hmm$distributions <- rbind(hmm$distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
175 |
+ hmm$distributions.initial <- rbind(hmm$distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
176 |
+ } else if (distr == 'dgeom') { |
|
177 |
+ hmm$distributions <- rbind(hmm$distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], lambda=NA, mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr]))) |
|
178 |
+ hmm$distributions.initial <- rbind(hmm$distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], lambda=NA, mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr]))) |
|
179 |
+ } else if (distr == 'delta') { |
|
180 |
+ hmm$distributions <- rbind(hmm$distributions, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
181 |
+ hmm$distributions.initial <- rbind(hmm$distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
182 |
+ } else if (distr == 'dpois') { |
|
183 |
+ hmm$distributions <- rbind(hmm$distributions, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda[idistr], mu=hmm$lambda[idistr], variance=hmm$lambda[idistr])) |
|
184 |
+ hmm$distributions.initial <- rbind(hmm$distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda.initial[idistr], mu=hmm$lambda.initial[idistr], variance=hmm$lambda.initial[idistr])) |
|
185 |
+ } else if (distr == 'dbinom') { |
|
186 |
+ hmm$distributions <- rbind(hmm$distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
187 |
+ hmm$distributions.initial <- rbind(hmm$distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
188 |
+ } |
|
138 | 189 |
} |
139 |
- hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE) |
|
190 |
+ rownames(hmm$distributions) <- state.labels |
|
191 |
+ rownames(hmm$distributions.initial) <- state.labels |
|
192 |
+ hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states) |
|
140 | 193 |
rownames(hmm$A.initial) <- state.labels |
141 | 194 |
colnames(hmm$A.initial) <- state.labels |
142 |
- hmm$distributions.initial <- data.frame(type=state.distributions, size=hmm$size.initial, prob=hmm$prob.initial, mu=fmean(hmm$size.initial,hmm$prob.initial), variance=fvariance(hmm$size.initial,hmm$prob.initial)) |
|
143 |
- rownames(hmm$distributions.initial) <- state.labels |
|
144 |
- hmm$distributions.initial['nullsomy',2:5] <- c(0,1,0,0) |
|
145 |
- hmm$filter.reads <- filter.reads |
|
146 | 195 |
|
147 | 196 |
# Delete redundant entries |
148 | 197 |
hmm$size <- NULL |
149 | 198 |
hmm$prob <- NULL |
199 |
+ hmm$lambda <- NULL |
|
150 | 200 |
hmm$size.initial <- NULL |
151 | 201 |
hmm$prob.initial <- NULL |
202 |
+ hmm$lambda.initial <- NULL |
|
152 | 203 |
hmm$use.initial.params <- NULL |
153 |
- hmm$read.cutoff <- NULL |
|
154 | 204 |
hmm$distr.type <- NULL |
155 | 205 |
|
156 | 206 |
# Issue warnings |
... | ... |
@@ -160,7 +210,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, m |
160 | 210 |
} |
161 | 211 |
} |
162 | 212 |
if (hmm$error == 1) { |
163 |
- stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem.") |
|
213 |
+		stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers; they could be the cause of this problem. Try again with a lower value for 'read.cutoff.quantile'.")
|
164 | 214 |
} else if (hmm$error == 2) { |
165 | 215 |
stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.") |
166 | 216 |
} |
... | ... |
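A hedged usage sketch for the new interface (the file name and ID below are invented): 'binned.data' may now also be the path to a saved GRanges with a 'reads' meta-column, and counts above the 'read.cutoff.quantile' quantile are capped before the Baum-Welch.

model <- findCNVs("binned.reads.RData", ID="cell-01",
                  eps=0.001, init="random", num.trials=3,
                  read.cutoff.quantile=0.999)
model$distributions   # fitted emission parameters per state
model$read.cutoff     # the cap that was applied to the read counts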
@@ -1,8 +1,8 @@ |
1 | 1 |
# ======================================================= |
2 | 2 |
# Some global variables that can be used in all functions |
3 | 3 |
# ======================================================= |
4 |
-state.labels <- c("nullsomy","null-mixed","monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
5 |
-state.distributions <- factor(c('delta','dgeom','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom')) |
|
4 |
+state.labels <- c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
5 |
+state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dpois','dbinom')) |
|
6 | 6 |
coordinate.names <- c("chrom","start","end") |
7 | 7 |
binned.data.names <- c(coordinate.names,"reads") |
8 | 8 |
class.aneufinder.hmm <- "aneufinder.hmm" |
... | ... |
@@ -13,18 +13,44 @@ get.state.colors <- function() { return(state.colors[state.labels]) } |
13 | 13 |
# ============================================================================ |
14 | 14 |
# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob) |
15 | 15 |
# ============================================================================ |
16 |
-fsize <- function(mean, variance) { |
|
16 |
+dnbinom.size <- function(mean, variance) { |
|
17 | 17 |
return(mean^2 / (variance - mean)) |
18 | 18 |
} |
19 | 19 |
|
20 |
-fprob <- function(mean, variance) { |
|
20 |
+dnbinom.prob <- function(mean, variance) { |
|
21 | 21 |
return(mean/variance) |
22 | 22 |
} |
23 | 23 |
|
24 |
-fmean <- function(size, prob) { |
|
24 |
+dnbinom.mean <- function(size, prob) { |
|
25 | 25 |
return(size/prob - size) |
26 | 26 |
} |
27 | 27 |
|
28 |
-fvariance <- function(size, prob) { |
|
28 |
+dnbinom.variance <- function(size, prob) { |
|
29 | 29 |
return( (size - prob*size) / prob^2 ) |
30 | 30 |
} |
31 |
+ |
|
32 |
+dgeom.mean <- function(prob) { |
|
33 |
+ return( (1-prob)/prob ) |
|
34 |
+} |
|
35 |
+ |
|
36 |
+dgeom.variance <- function(prob) { |
|
37 |
+ return( (1-prob)/prob^2 ) |
|
38 |
+} |
|
39 |
+ |
|
40 |
+dbinom.size <- function(mean, variance) { |
|
41 |
+ return( mean^2/(mean-variance) ) |
|
42 |
+} |
|
43 |
+ |
|
44 |
+dbinom.prob <- function(mean, variance) { |
|
45 |
+ return( (mean-variance)/mean ) |
|
46 |
+} |
|
47 |
+ |
|
48 |
+dbinom.mean <- function(size, prob) { |
|
49 |
+ return( size*prob ) |
|
50 |
+} |
|
51 |
+ |
|
52 |
+dbinom.variance <- function(size, prob) { |
|
53 |
+ return( size*prob * (1-prob) ) |
|
54 |
+} |
|
55 |
+ |
|
56 |
+ |
... | ... |
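A quick sanity check of the moment transforms defined above: converting a (mean, variance) pair to (size, prob) and back should recover the original values (numbers are arbitrary; the binomial case needs variance < mean).

m <- 50; v <- 200
size <- dnbinom.size(m, v)      # 50^2 / (200 - 50)
prob <- dnbinom.prob(m, v)      # 50 / 200
c(dnbinom.mean(size, prob), dnbinom.variance(size, prob))    # 50 200
v2 <- 40
c(dbinom.mean(dbinom.size(m, v2), dbinom.prob(m, v2)),
  dbinom.variance(dbinom.size(m, v2), dbinom.prob(m, v2)))   # 50 40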
@@ -80,6 +80,15 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
80 | 80 |
} else if (model$distributions[istate,'type']=='dnbinom') { |
81 | 81 |
# negative binomials |
82 | 82 |
distributions[[length(distributions)+1]] <- weights[istate] * dnbinom(x, model$distributions[istate,'size'], model$distributions[istate,'prob']) |
83 |
+ } else if (model$distributions[istate,'type']=='dpois') { |
|
84 |
+ # poissons |
|
85 |
+ distributions[[length(distributions)+1]] <- weights[istate] * dpois(x, model$distributions[istate,'lambda']) |
|
86 |
+ } else if (model$distributions[istate,'type']=='dbinom') { |
|
87 |
+ # binomials |
|
88 |
+ s <- model$distributions[istate,'size'] |
|
89 |
+ p <- model$distributions[istate,'prob'] |
|
90 |
+# distributions[[length(distributions)+1]] <- weights[istate] * dbinom(x, model$distributions[istate,'size'], model$distributions[istate,'prob']) # only defined for integer 'size' |
|
91 |
+ distributions[[length(distributions)+1]] <- weights[istate] * choose(s,x) * p^x * (1-p)^(s-x) |
|
83 | 92 |
} |
84 | 93 |
} |
85 | 94 |
distributions <- as.data.frame(distributions) |
... | ... |
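As the comment above notes, dbinom() is only defined for integer 'size', which is why the density is spelled out with choose(); choose(n, k) also accepts non-integer n, and for integer size the two expressions agree:

s <- 10; p <- 0.3; x <- 0:10
all.equal(choose(s, x) * p^x * (1-p)^(s-x), dbinom(x, s, p))   # TRUE
choose(10.4, 3)   # also works for a non-integer estimated 'size'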
@@ -23,6 +23,7 @@ The HMM object is output of the function \code{\link{findCNVs}} and is basically |
23 | 23 |
\item{proba.initial}{Initial \option{proba} at the beginning of the Baum-Welch.} |
24 | 24 |
\item{num.threads}{Number of threads that were used in the Baum-Welch.} |
25 | 25 |
\item{error}{This is 0 unless an error occurred during the Baum-Welch.} |
26 |
+\item{read.cutoff}{The read cutoff as specified by the option \option{read.cutoff.quantile}.} |
|
26 | 27 |
\item{eps}{Convergence threshold for the Baum-Welch.} |
27 | 28 |
\item{distributions}{Estimated parameters of the employed distributions. See \code{\link{findCNVs}} for a description of which distribution is used for which state.} |
28 | 29 |
\item{distributions.initial}{Distribution parameters at the beginning of the Baum-Welch.} |
... | ... |
@@ -24,13 +24,13 @@ Initialization procedure for the distributions. Currently only 'standard' is imp |
24 | 24 |
\item{output.if.not.converged}{ |
25 | 25 |
If set to FALSE, no output will be given in the case that the Baum-Welch did not converge to the desired \option{eps}. If set to TRUE, a model will be returned even if the Baum-Welch did not converge to the desired \option{eps}. |
26 | 26 |
} |
27 |
-\item{filter.reads}{ |
|
28 |
-If set to TRUE, then read counts which are above the 99.99\% quantile of the data are set to this value. A warning is issued in this case. This filtering can increase the speed of the Baum-Welch and is unlikely to affect the results. |
|
27 |
+\item{read.cutoff.quantile}{ |
|
28 |
+Read counts above this quantile are set to the value of this quantile. This filtering can increase the speed of the Baum-Welch and is unlikely to affect the results. A lower quantile can also help if you are getting the error message 'A nan occurred during the Baum-Welch! ...'.
|
29 | 29 |
} |
30 | 30 |
} |
31 | 31 |
|
32 | 32 |
\details{ |
33 |
-The Hidden Markov Model which is used to classify the bins uses 7 states: state 'nullsomy' with a delta function as emission densitiy (only zero read counts), 'null-mixed' with a geometric distribution to account, 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with Negative Binomials as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. |
|
33 |
+The Hidden Markov Model which is used to classify the bins uses 7 states: state 'nullsomy' with a delta function as emission density (only zero read counts), 'null-mixed' with a geometric distribution (see \code{\link{dgeom}}), 'monosomy', 'disomy', 'trisomy', 'tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method.
|
34 | 34 |
TODO: insert paper |
35 | 35 |
} |
36 | 36 |
\value{ |
... | ... |
@@ -1,13 +1,13 @@ |
1 | 1 |
#include "utility.h" |
2 | 2 |
#include "scalehmm.h" |
3 | 3 |
#include "loghmm.h" |
4 |
-#include <omp.h> // parallelization options |
|
4 |
+// #include <omp.h> // parallelization options |
|
5 | 5 |
|
6 | 6 |
// =================================================================================================================================================== |
7 | 7 |
// This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R. |
8 | 8 |
// =================================================================================================================================================== |
9 | 9 |
extern "C" { |
10 |
-void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
10 |
+void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double* lambda, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, int* iniproc, double* initial_size, double* initial_prob, double* initial_lambda, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
11 | 11 |
{ |
12 | 12 |
|
13 | 13 |
// Define logging level |
... | ... |
@@ -16,8 +16,8 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m |
16 | 16 |
FILELog::ReportingLevel() = FILELog::FromString("ERROR"); |
17 | 17 |
// FILELog::ReportingLevel() = FILELog::FromString("DEBUG2"); |
18 | 18 |
|
19 |
- // Parallelization settings |
|
20 |
- omp_set_num_threads(*num_threads); |
|
19 |
+// // Parallelization settings |
|
20 |
+// omp_set_num_threads(*num_threads); |
|
21 | 21 |
|
22 | 22 |
// Print some information |
23 | 23 |
FILE_LOG(logINFO) << "number of states = " << *N; |
... | ... |
@@ -75,43 +75,7 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m |
75 | 75 |
FILE_LOG(logINFO) << "data mean = " << mean << ", data variance = " << variance; |
76 | 76 |
Rprintf("data mean = %g, data variance = %g\n", mean, variance); |
77 | 77 |
|
78 |
- // Go through all states of the hmm and assign the density functions |
|
79 |
- // This loop assumes that the negative binomial states come last and are consecutive |
|
80 |
- double imean, ivariance; |
|
81 |
- for (int i_state=0; i_state<*N; i_state++) |
|
82 |
- { |
|
83 |
- if (*use_initial_params) { |
|
84 |
- FILE_LOG(logINFO) << "Using given parameters for size and prob"; |
|
85 |
- Rprintf("Using given parameters for size and prob\n"); |
|
86 |
- imean = (1-initial_prob[i_state])*initial_size[i_state] / initial_prob[i_state]; |
|
87 |
- ivariance = imean / initial_prob[i_state]; |
|
88 |
- FILE_LOG(logDEBUG2) << "imean = " << imean; |
|
89 |
- FILE_LOG(logDEBUG2) << "ivariance = " << ivariance; |
|
90 |
- } else { |
|
91 |
- |
|
92 |
- if (*iniproc == 1) |
|
93 |
- { |
|
94 |
- // Simple initialization based on data mean, assumed to be the disomic mean |
|
95 |
- if (distr_type[i_state] == 1) { } |
|
96 |
- else if (distr_type[i_state] == 2) { } |
|
97 |
- else if (distr_type[i_state] == 3) |
|
98 |
- { |
|
99 |
- for (int ii_state=i_state; ii_state<*N; ii_state++) |
|
100 |
- { |
|
101 |
- imean = mean/2 * (ii_state-i_state+1); |
|
102 |
- ivariance = imean * 5; |
|
103 |
-// ivariance = variance/2 * (i_state-1); |
|
104 |
- // Calculate r and p from mean and variance |
|
105 |
- initial_size[ii_state] = pow(imean,2)/(ivariance-imean); |
|
106 |
- initial_prob[ii_state] = imean/ivariance; |
|
107 |
- } |
|
108 |
- break; |
|
109 |
- } |
|
110 |
- } |
|
111 |
- |
|
112 |
- } |
|
113 |
- } |
|
114 |
- |
|
78 |
+ // Create the emission densities and initialize |
|
115 | 79 |
for (int i_state=0; i_state<*N; i_state++) |
116 | 80 |
{ |
117 | 81 |
if (distr_type[i_state] == 1) |
... | ... |
@@ -123,7 +87,7 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m |
123 | 87 |
else if (distr_type[i_state] == 2) |
124 | 88 |
{ |
125 | 89 |
FILE_LOG(logDEBUG1) << "Using geometric distribution for state " << i_state; |
126 |
- Geometric *d = new Geometric(O, *T, 0.9); // delete is done inside ~ScaleHMM() |
|
90 |
+ Geometric *d = new Geometric(O, *T, initial_prob[i_state]); // delete is done inside ~ScaleHMM() |
|
127 | 91 |
hmm->densityFunctions.push_back(d); |
128 | 92 |
} |
129 | 93 |
else if (distr_type[i_state] == 3) |
... | ... |
@@ -132,6 +96,18 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m |
132 | 96 |
NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM() |
133 | 97 |
hmm->densityFunctions.push_back(d); |
134 | 98 |
} |
99 |
+ else if (distr_type[i_state] == 4) |
|
100 |
+ { |
|
101 |
+ FILE_LOG(logDEBUG1) << "Using poisson for state " << i_state; |
|
102 |
+ Poisson *d = new Poisson(O, *T, initial_lambda[i_state]); // delete is done inside ~ScaleHMM() |
|
103 |
+ hmm->densityFunctions.push_back(d); |
|
104 |
+ } |
|
105 |
+ else if (distr_type[i_state] == 5) |
|
106 |
+ { |
|
107 |
+ FILE_LOG(logDEBUG1) << "Using binomial for state " << i_state; |
|
108 |
+ NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM() |
|
109 |
+ hmm->densityFunctions.push_back(d); |
|
110 |
+ } |
|
135 | 111 |
else |
136 | 112 |
{ |
137 | 113 |
FILE_LOG(logWARNING) << "Density not specified, using default negative binomial for state " << i_state; |
... | ... |
@@ -189,7 +165,7 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m |
189 | 165 |
proba[i] = hmm->get_proba(i); |
190 | 166 |
for (int j=0; j<*N; j++) |
191 | 167 |
{ |
192 |
- A[i * (*N) + j] = hmm->get_A(i,j); |
|
168 |
+ A[i * (*N) + j] = hmm->get_A(j,i); |
|
193 | 169 |
} |
194 | 170 |
} |
195 | 171 |
|
... | ... |
@@ -214,6 +190,17 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m |
214 | 190 |
size[i] = 0; |
215 | 191 |
prob[i] = 1; |
216 | 192 |
} |
193 |
+ else if (hmm->densityFunctions[i]->get_name() == POISSON) |
|
194 |
+ { |
|
195 |
+ Poisson* d = (Poisson*)(hmm->densityFunctions[i]); |
|
196 |
+ lambda[i] = d->get_lambda(); |
|
197 |
+ } |
|
198 |
+ else if (hmm->densityFunctions[i]->get_name() == BINOMIAL) |
|
199 |
+ { |
|
200 |
+ Binomial* d = (Binomial*)(hmm->densityFunctions[i]); |
|
201 |
+ size[i] = d->get_size(); |
|
202 |
+ prob[i] = d->get_prob(); |
|
203 |
+ } |
|
217 | 204 |
} |
218 | 205 |
*loglik = hmm->get_logP(); |
219 | 206 |
hmm->calc_weights(weights); |
... | ... |
@@ -91,6 +91,214 @@ void Normal::set_stdev(double stdev) |
91 | 91 |
} |
92 | 92 |
|
93 | 93 |
|
94 |
+// ============================================================ |
|
95 |
+// Poisson density |
|
96 |
+// ============================================================ |
|
97 |
+ |
|
98 |
+// Constructor and Destructor --------------------------------- |
|
99 |
+Poisson::Poisson(int* observations, int T, double lambda) |
|
100 |
+{ |
|
101 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
102 |
+ this->obs = observations; |
|
103 |
+ this->T = T; |
|
104 |
+ this->lambda = lambda; |
|
105 |
+ this->lxfactorials = NULL; |
|
106 |
+ // Precompute the lxfactorials that are used in computing the densities |
|
107 |
+ if (this->obs != NULL) |
|
108 |
+ { |
|
109 |
+ this->max_obs = intMax(observations, T); |
|
110 |
+ this->lxfactorials = (double*) calloc(max_obs+1, sizeof(double)); |
|
111 |
+ this->lxfactorials[0] = 0.0; // Not necessary, already 0 because of calloc |
|
112 |
+ this->lxfactorials[1] = 0.0; |
|
113 |
+ for (int j=2; j<=max_obs; j++) |
|
114 |
+ { |
|
115 |
+ this->lxfactorials[j] = this->lxfactorials[j-1] + log(j); |
|
116 |
+ } |
|
117 |
+ } |
|
118 |
+} |
|
119 |
+ |
|
120 |
+Poisson::~Poisson() |
|
121 |
+{ |
|
122 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
123 |
+ if (this->lxfactorials != NULL) |
|
124 |
+ { |
|
125 |
+ free(this->lxfactorials); |
|
126 |
+ } |
|
127 |
+} |
|
128 |
+ |
|
129 |
+// Methods ---------------------------------------------------- |
|
130 |
+void Poisson::calc_logdensities(double* logdens) |
|
131 |
+{ |
|
132 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
133 |
+ double logl = log(this->lambda); |
|
134 |
+ double l = this->lambda; |
|
135 |
+ double lxfactorial; |
|
136 |
+ // Select strategy for computing densities |
|
137 |
+ if (this->max_obs <= this->T) |
|
138 |
+ { |
|
139 |
+ FILE_LOG(logDEBUG2) << "Precomputing densities in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
140 |
+ double logdens_per_read [this->max_obs+1]; |
|
141 |
+ for (int j=0; j<=this->max_obs; j++) |
|
142 |
+ { |
|
143 |
+ logdens_per_read[j] = j*logl - l - this->lxfactorials[j]; |
|
144 |
+ } |
|
145 |
+ for (int t=0; t<this->T; t++) |
|
146 |
+ { |
|
147 |
+ logdens[t] = logdens_per_read[(int) this->obs[t]]; |
|
148 |
+ FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t]; |
|
149 |
+ if (isnan(logdens[t])) |
|
150 |
+ { |
|
151 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
152 |
+ FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t]; |
|
153 |
+ throw nan_detected; |
|
154 |
+ } |
|
155 |
+ } |
|
156 |
+ } |
|
157 |
+ else |
|
158 |
+ { |
|
159 |
+ FILE_LOG(logDEBUG2) << "Computing densities in " << __func__ << " for every t, because max(O)>T"; |
|
160 |
+ for (int t=0; t<this->T; t++) |
|
161 |
+ { |
|
162 |
+ lxfactorial = this->lxfactorials[(int) this->obs[t]]; |
|
163 |
+ logdens[t] = this->obs[t]*logl - l - lxfactorial; |
|
164 |
+ FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t]; |
|
165 |
+ if (isnan(logdens[t])) |
|
166 |
+ { |
|
167 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
168 |
+ FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t]; |
|
169 |
+ throw nan_detected; |
|
170 |
+ } |
|
171 |
+ } |
|
172 |
+ } |
|
173 |
+} |
|
174 |
+ |
|
175 |
+void Poisson::calc_densities(double* dens) |
|
176 |
+{ |
|
177 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
178 |
+ double logl = log(this->lambda); |
|
179 |
+ double l = this->lambda; |
|
180 |
+ double lxfactorial; |
|
181 |
+ // Select strategy for computing densities |
|
182 |
+ if (this->max_obs <= this->T) |
|
183 |
+ { |
|
184 |
+ FILE_LOG(logDEBUG2) << "Precomputing densities in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
185 |
+ double dens_per_read [this->max_obs+1]; |
|
186 |
+ for (int j=0; j<=this->max_obs; j++) |
|
187 |
+ { |
|
188 |
+ dens_per_read[j] = exp( j*logl - l - this->lxfactorials[j] ); |
|
189 |
+ } |
|
190 |
+ for (int t=0; t<this->T; t++) |
|
191 |
+ { |
|
192 |
+ dens[t] = dens_per_read[(int) this->obs[t]]; |
|
193 |
+ FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t]; |
|
194 |
+ if (isnan(dens[t])) |
|
195 |
+ { |
|
196 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
197 |
+ FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t]; |
|
198 |
+ throw nan_detected; |
|
199 |
+ } |
|
200 |
+ } |
|
201 |
+ } |
|
202 |
+ else |
|
203 |
+ { |
|
204 |
+ FILE_LOG(logDEBUG2) << "Computing densities in " << __func__ << " for every t, because max(O)>T"; |
|
205 |
+ for (int t=0; t<this->T; t++) |
|
206 |
+ { |
|
207 |
+ lxfactorial = this->lxfactorials[(int) this->obs[t]]; |
|
208 |
+ dens[t] = exp( this->obs[t]*logl - l - lxfactorial ); |
|
209 |
+ FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t]; |
|
210 |
+ if (isnan(dens[t])) |
|
211 |
+ { |
|
212 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
213 |
+ FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t]; |
|
214 |
+ throw nan_detected; |
|
215 |
+ } |
|
216 |
+ } |
|
217 |
+ } |
|
218 |
+} |
|
219 |
+ |
|
220 |
+void Poisson::update(double* weights) |
|
221 |
+{ |
|
222 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
223 |
+ double numerator, denominator; |
|
224 |
+ // Update lambda |
|
225 |
+ numerator=denominator=0.0; |
|
226 |
+// clock_t time, dtime; |
|
227 |
+// time = clock(); |
|
228 |
+ for (int t=0; t<this->T; t++) |
|
229 |
+ { |
|
230 |
+ numerator += weights[t] * this->obs[t]; |
|
231 |
+ denominator += weights[t]; |
|
232 |
+ } |
|
233 |
+	this->lambda = numerator/denominator; // Update of lambda (weighted mean of the observations)
|
234 |
+// dtime = clock() - time; |
|
235 |
+// FILE_LOG(logDEBUG1) << "updateL(): "<<dtime<< " clicks"; |
|
236 |
+ FILE_LOG(logDEBUG1) << "l = " << this->lambda; |
|
237 |
+ |
|
238 |
+} |
|
239 |
+ |
|
240 |
+void Poisson::update_constrained(double** weights, int fromState, int toState) |
|
241 |
+{ |
|
242 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
243 |
+ FILE_LOG(logDEBUG1) << "l = "<<this->lambda; |
|
244 |
+ double numerator, denominator; |
|
245 |
+ // Update lambda |
|
246 |
+ numerator=denominator=0.0; |
|
247 |
+// clock_t time, dtime; |
|
248 |
+// time = clock(); |
|
249 |
+ for (int i=0; i<toState-fromState; i++) |
|
250 |
+ { |
|
251 |
+ for (int t=0; t<this->T; t++) |
|
252 |
+ { |
|
253 |
+ numerator += weights[i+fromState][t] * this->obs[t]; |
|
254 |
+ denominator += weights[i+fromState][t] * (i+1); |
|
255 |
+ } |
|
256 |
+ } |
|
257 |
+	this->lambda = numerator/denominator; // Update of lambda (state fromState+i is constrained to mean (i+1)*lambda)
|
258 |
+// dtime = clock() - time; |
|
259 |
+// FILE_LOG(logDEBUG1) << "updateL(): "<<dtime<< " clicks"; |
|
260 |
+ FILE_LOG(logDEBUG1) << "l = "<<this->lambda; |
|
261 |
+ |
|
262 |
+} |
|
263 |
+ |
|
264 |
+// Getter and Setter ------------------------------------------ |
|
265 |
+double Poisson::get_mean() |
|
266 |
+{ |
|
267 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
268 |
+ return( this->lambda ); |
|
269 |
+} |
|
270 |
+ |
|
271 |
+void Poisson::set_mean(double mean) |
|
272 |
+{ |
|
273 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
274 |
+ this->lambda = mean; |
|
275 |
+} |
|
276 |
+ |
|
277 |
+double Poisson::get_variance() |
|
278 |
+{ |
|
279 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
280 |
+ return( this->lambda ); |
|
281 |
+} |
|
282 |
+ |
|
283 |
+void Poisson::set_variance(double variance) |
|
284 |
+{ |
|
285 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
286 |
+ this->lambda = variance; |
|
287 |
+} |
|
288 |
+ |
|
289 |
+DensityName Poisson::get_name() |
|
290 |
+{ |
|
291 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
292 |
+ return(POISSON); |
|
293 |
+} |
|
294 |
+ |
|
295 |
+double Poisson::get_lambda() |
|
296 |
+{ |
|
297 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
298 |
+ return(this->lambda); |
|
299 |
+} |
|
300 |
+ |
|
301 |
+ |
|
94 | 302 |
// ============================================================ |
95 | 303 |
// Negative Binomial density |
96 | 304 |
// ============================================================ |
... | ... |
@@ -226,7 +434,7 @@ void NegativeBinomial::update(double* weights) |
226 | 434 |
{ |
227 | 435 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
228 | 436 |
double eps = 1e-4, kmax; |
229 |
- double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR; |
|
437 |
+ double numerator, denominator, size0, dSize, F, dFdSize, DigammaSize, DigammaSizePlusDSize; |
|
230 | 438 |
// Update prob (p) |
231 | 439 |
numerator=denominator=0.0; |
232 | 440 |
// clock_t time, dtime; |
... | ... |
@@ -236,86 +444,86 @@ void NegativeBinomial::update(double* weights) |
236 | 444 |
numerator+=weights[t]*this->size; |
237 | 445 |
denominator+=weights[t]*(this->size+this->obs[t]); |
238 | 446 |
} |
239 |
- this->prob = numerator/denominator; // Update of size (r) is now done with updated prob |
|
447 |
+ this->prob = numerator/denominator; // Update of size is now done with updated prob |
|
240 | 448 |
double logp = log(this->prob); |
241 | 449 |
// dtime = clock() - time; |
242 | 450 |
// FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks"; |
243 |
- // Update of size (r) with Newton Method |
|
244 |
- rhere = this->size; |
|
245 |
- dr = 0.00001; |
|
451 |
+ // Update of size with Newton Method |
|
452 |
+ size0 = this->size; |
|
453 |
+ dSize = 0.00001; |
|
246 | 454 |
kmax = 20; |
247 | 455 |
// time = clock(); |
248 | 456 |
// Select strategy for computing digammas |
249 | 457 |
if (this->max_obs <= this->T) |
250 | 458 |
{ |
251 | 459 |
FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T"; |
252 |
- double DigammaRplusX[this->max_obs+1], DigammaRplusDRplusX[this->max_obs+1]; |
|
460 |
+ double DigammaSizePlusX[this->max_obs+1], DigammaSizePlusDSizePlusX[this->max_obs+1]; |
|
253 | 461 |
for (int k=1; k<kmax; k++) |
254 | 462 |
{ |
255 |
- Fr=dFrdr=0.0; |
|
256 |
- DigammaR = digamma(rhere); // boost::math::digamma<>(rhere); |
|
257 |
- DigammaRplusDR = digamma(rhere + dr); // boost::math::digamma<>(rhere+dr); |
|
463 |
+ F=dFdSize=0.0; |
|
464 |
+ DigammaSize = digamma(size0); // boost::math::digamma<>(size0); |
|
465 |
+ DigammaSizePlusDSize = digamma(size0 + dSize); // boost::math::digamma<>(size0+dSize); |
|
258 | 466 |
// Precompute the digammas by iterating over all possible values of the observation vector |
259 | 467 |
for (int j=0; j<=this->max_obs; j++) |
260 | 468 |
{ |
261 |
- DigammaRplusX[j] = digamma(rhere+j); |
|
262 |
- DigammaRplusDRplusX[j] = digamma(rhere+dr+j); |
|
469 |
+ DigammaSizePlusX[j] = digamma(size0+j); |
|
470 |
+ DigammaSizePlusDSizePlusX[j] = digamma(size0+dSize+j); |
|
263 | 471 |
} |
264 | 472 |
for(int t=0; t<this->T; t++) |
265 | 473 |
{ |
266 | 474 |
if(this->obs[t]==0) |
267 | 475 |
{ |
268 |
- Fr+=weights[t]*logp; |
|
269 |
- //dFrdr+=0; |
|
476 |
+ F += weights[t] * logp; |
|
477 |
+ //dFdSize+=0; |
|
270 | 478 |
} |
271 | 479 |
if(this->obs[t]!=0) |
272 | 480 |
{ |
273 |
- Fr+=weights[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]); |
|
274 |
- dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]); |
|
481 |
+ F += weights[t] * (logp - DigammaSize + DigammaSizePlusX[(int)obs[t]]); |
|
482 |
+ dFdSize += weights[t]/dSize * (DigammaSize - DigammaSizePlusDSize + DigammaSizePlusDSizePlusX[(int)obs[t]] - DigammaSizePlusX[(int)obs[t]]); |
|
275 | 483 |
} |
276 | 484 |
} |
277 |
- if(fabs(Fr)<eps) |
|
485 |
+ if(fabs(F)<eps) |
|
278 | 486 |
{ |
279 | 487 |
break; |
280 | 488 |
} |
281 |
- if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr; |
|
282 |
- if(Fr/dFrdr>rhere) rhere=rhere/2.0; |
|
489 |
+ if(F/dFdSize<size0) size0=size0-F/dFdSize; |
|
490 |
+ if(F/dFdSize>size0) size0=size0/2.0; |
|
283 | 491 |
} |
284 | 492 |
} |
285 | 493 |
else |
286 | 494 |
{ |
287 | 495 |
FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T"; |
288 |
- double DigammaRplusX, DigammaRplusDRplusX; |
|
496 |
+ double DigammaSizePlusX, DigammaSizePlusDSizePlusX; |
|
289 | 497 |
for (int k=1; k<kmax; k++) |
290 | 498 |
{ |
291 |
- Fr = dFrdr = 0.0; |
|
292 |
- DigammaR = digamma(rhere); // boost::math::digamma<>(rhere); |
|
293 |
- DigammaRplusDR = digamma(rhere + dr); // boost::math::digamma<>(rhere+dr); |
|
499 |
+ F = dFdSize = 0.0; |
|
500 |
+ DigammaSize = digamma(size0); // boost::math::digamma<>(size0); |
|
501 |
+ DigammaSizePlusDSize = digamma(size0 + dSize); // boost::math::digamma<>(size0+dSize); |
|
294 | 502 |
for(int t=0; t<this->T; t++) |
295 | 503 |
{ |
296 |
- DigammaRplusX = digamma(rhere+this->obs[t]); //boost::math::digamma<>(rhere+this->obs[ti]); |
|
297 |
- DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]); |
|
504 |
+ DigammaSizePlusX = digamma(size0+this->obs[t]); //boost::math::digamma<>(size0+this->obs[ti]); |
|
505 |
+ DigammaSizePlusDSizePlusX = digamma(size0+dSize+this->obs[t]); // boost::math::digamma<>(size0+dSize+this->obs[ti]); |
|
298 | 506 |
if(this->obs[t]==0) |
299 | 507 |
{ |
300 |
- Fr+=weights[t]*logp; |
|
301 |
- //dFrdr+=0; |
|
508 |
+ F+=weights[t]*logp; |
|
509 |
+ //dFdSize+=0; |
|
302 | 510 |
} |
303 | 511 |
if(this->obs[t]!=0) |
304 | 512 |
{ |
305 |
- Fr+=weights[t]*(logp-DigammaR+DigammaRplusX); |
|
306 |
- dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX); |
|
513 |
+ F += weights[t] * (logp - DigammaSize + DigammaSizePlusX); |
|
514 |
+ dFdSize += weights[t]/dSize * (DigammaSize - DigammaSizePlusDSize + DigammaSizePlusDSizePlusX - DigammaSizePlusX); |
|
307 | 515 |
} |
308 | 516 |
} |
309 |
- if(fabs(Fr)<eps) |
|
517 |
+ if(fabs(F)<eps) |
|
310 | 518 |
{ |
311 | 519 |
break; |
312 | 520 |
} |
313 |
- if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr; |
|
314 |
- if(Fr/dFrdr>rhere) rhere=rhere/2.0; |
|
521 |
+ if(F/dFdSize<size0) size0=size0-F/dFdSize; |
|
522 |
+ if(F/dFdSize>size0) size0=size0/2.0; |
|
315 | 523 |
} |
316 | 524 |
} |
317 |
- this->size = rhere; |
|
318 |
- FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob; |
|
525 |
+ this->size = size0; |
|
526 |
+ FILE_LOG(logDEBUG1) << "size = "<<this->size << ", prob = "<<this->prob; |
|
319 | 527 |
|
320 | 528 |
// dtime = clock() - time; |
321 | 529 |
// FILE_LOG(logDEBUG1) << "updateR(): "<<dtime<< " clicks"; |
... | ... |
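The renamed update above is still the same quasi-Newton iteration for the negative binomial 'size': F is the derivative of the posterior-weighted log-likelihood with respect to size, and its slope is taken either from a finite difference (dSize) or, in the constrained update, from the trigamma function. A rough standalone R sketch of that iteration (toy observations and weights, unconstrained case; the trigamma form is used for the slope):

x <- c(0, 3, 7, 12, 5); w <- c(0.1, 0.9, 0.8, 0.7, 0.95)   # reads and posterior weights (made up)
size <- 5; prob <- 0.4
for (k in 1:20) {
	Fsize <- sum(w * (log(prob) - digamma(size) + digamma(size + x)))    # dlogL/dsize
	dFdSize <- sum(w * (-trigamma(size) + trigamma(size + x)))           # d2logL/dsize2
	if (abs(Fsize) < 1e-4) break
	if (Fsize/dFdSize < size) size <- size - Fsize/dFdSize else size <- size/2   # guarded Newton step
}
size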
@@ -325,9 +533,10 @@ void NegativeBinomial::update(double* weights) |
325 | 533 |
void NegativeBinomial::update_constrained(double** weights, int fromState, int toState) |
326 | 534 |
{ |
327 | 535 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
328 |
- FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob; |
|
536 |
+ FILE_LOG(logDEBUG1) << "size = "<<this->size << ", prob = "<<this->prob; |
|
329 | 537 |
double eps = 1e-4, kmax; |
330 |
- double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR; |
|
538 |
+// double numerator, denominator, size0, dSize, F, dFdSize, DigammaSize, DigammaSizePlusDSize; |
|
539 |
+ double numerator, denominator, size0, dSize, F, dFdSize, DigammaSize, TrigammaSize; |
|
331 | 540 |
double logp = log(this->prob); |
332 | 541 |
// Update prob (p) |
333 | 542 |
numerator=denominator=0.0; |
@@ -341,91 +550,99 @@ void NegativeBinomial::update_constrained(double** weights, int fromState, int toState)
             denominator += weights[i+fromState][t] * (this->size*(i+1) + this->obs[t]);
         }
     }
-    this->prob = numerator/denominator; // Update of size (r) is now done with old prob
+    this->prob = numerator/denominator; // Update of size is now done with old prob
 //  dtime = clock() - time;
 //  FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks";
-    // Update of size (r) with Newton Method
-    rhere = this->size;
-    dr = 0.00001;
+    // Update of size with Newton Method
+    size0 = this->size;
+    dSize = 0.00001;
     kmax = 20;
 //  time = clock();
     // Select strategy for computing digammas
     if (this->max_obs <= this->T)
     {
         FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T";
-        double DigammaRplusX[this->max_obs+1], DigammaRplusDRplusX[this->max_obs+1];
+//      double DigammaSizePlusX[this->max_obs+1], DigammaSizePlusDSizePlusX[this->max_obs+1];
+        double DigammaSizePlusX[this->max_obs+1], TrigammaSizePlusX[this->max_obs+1];
         for (int k=1; k<kmax; k++)
         {
-            Fr=dFrdr=0.0;
+            F=dFdSize=0.0;
             for (int i=0; i<toState-fromState; i++)
             {
-                DigammaR = digamma((i+1)*rhere); // boost::math::digamma<>(rhere);
-                DigammaRplusDR = digamma((i+1)*(rhere + dr)); // boost::math::digamma<>(rhere+dr);
+                DigammaSize = digamma((i+1)*size0); // boost::math::digamma<>(size0);
+                TrigammaSize = trigamma((i+1)*size0); // boost::math::digamma<>(size0);
+//              DigammaSizePlusDSize = digamma((i+1)*(size0 + dSize)); // boost::math::digamma<>(size0+dSize);
                 // Precompute the digammas by iterating over all possible values of the observation vector
                 for (int j=0; j<=this->max_obs; j++)
                 {
-                    DigammaRplusX[j] = digamma((i+1)*rhere+j);
-                    DigammaRplusDRplusX[j] = digamma((i+1)*(rhere+dr)+j);
+                    DigammaSizePlusX[j] = digamma((i+1)*size0+j);
+//                  DigammaSizePlusDSizePlusX[j] = digamma((i+1)*(size0+dSize)+j);
+                    TrigammaSizePlusX[j] = trigamma((i+1)*size0+j);
                 }
                 for(int t=0; t<this->T; t++)
                 {
                     if(this->obs[t]==0)
                     {
-                        Fr += weights[i+fromState][t] * (i+1) * logp;
-                        //dFrdr+=0;
+                        F += weights[i+fromState][t] * (i+1) * logp;
+                        //dFdSize+=0;
                     }
                     if(this->obs[t]!=0)
                     {
-                        Fr += weights[i+fromState][t] * (i+1) * (logp - DigammaR + DigammaRplusX[(int)obs[t]]);
-                        dFrdr += weights[i+fromState][t] / dr * (i+1) * (DigammaR - DigammaRplusDR + DigammaRplusDRplusX[(int)obs[t]] - DigammaRplusX[(int)obs[t]]);
+                        F += weights[i+fromState][t] * (i+1) * (logp - DigammaSize + DigammaSizePlusX[(int)obs[t]]);
+//                      dFdSize += weights[i+fromState][t] / dSize * (i+1) * (DigammaSize - DigammaSizePlusDSize + DigammaSizePlusDSizePlusX[(int)obs[t]] - DigammaSizePlusX[(int)obs[t]]);
+                        dFdSize += weights[i+fromState][t] * pow((i+1),2) * (-TrigammaSize + TrigammaSizePlusX[(int)obs[t]]);
                     }
                 }
-                if(fabs(Fr)<eps)
+                if(fabs(F)<eps)
                 {
                     break;
                 }
             }
-            if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr;
-            if(Fr/dFrdr>rhere) rhere=rhere/2.0;
+            if(F/dFdSize<size0) size0=size0-F/dFdSize;
+            if(F/dFdSize>size0) size0=size0/2.0;
         }
     }
     else
     {
         FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T";
-        double DigammaRplusX, DigammaRplusDRplusX;
+//      double DigammaSizePlusX, DigammaSizePlusDSizePlusX;
+        double DigammaSizePlusX, TrigammaSizePlusX;
         for (int k=1; k<kmax; k++)
         {
-            Fr = dFrdr = 0.0;
+            F = dFdSize = 0.0;
            for (int i=0; i<toState-fromState; i++)
            {
-                DigammaR = digamma((i+1)*rhere); // boost::math::digamma<>(rhere);
-                DigammaRplusDR = digamma((i+1)*(rhere + dr)); // boost::math::digamma<>(rhere+dr);
+                DigammaSize = digamma((i+1)*size0); // boost::math::digamma<>(size0);
+                TrigammaSize = trigamma((i+1)*size0); // boost::math::digamma<>(size0);
+//              DigammaSizePlusDSize = digamma((i+1)*(size0 + dSize)); // boost::math::digamma<>(size0+dSize);
                 for(int t=0; t<this->T; t++)
                 {
-                    DigammaRplusX = digamma((i+1)*rhere+this->obs[t]); //boost::math::digamma<>(rhere+this->obs[ti]);
-                    DigammaRplusDRplusX = digamma((i+1)*(rhere+dr)+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]);
+                    DigammaSizePlusX = digamma((i+1)*size0+this->obs[t]); //boost::math::digamma<>(size0+this->obs[ti]);
+//                  DigammaSizePlusDSizePlusX = digamma((i+1)*(size0+dSize)+this->obs[t]); // boost::math::digamma<>(size0+dSize+this->obs[ti]);
+                    TrigammaSizePlusX = trigamma((i+1)*size0+this->obs[t]);
                     if(this->obs[t]==0)
                     {
-                        Fr += weights[i+fromState][t] * (i+1) * logp;
-                        //dFrdr+=0;
+                        F += weights[i+fromState][t] * (i+1) * logp;
+                        //dFdSize+=0;
                     }
                     if(this->obs[t]!=0)
                     {
-                        Fr += weights[i+fromState][t] * (i+1) * (logp - DigammaR + DigammaRplusX);
-                        dFrdr += weights[i+fromState][t] / dr * (i+1) * (DigammaR - DigammaRplusDR + DigammaRplusDRplusX - DigammaRplusX);
+                        F += weights[i+fromState][t] * (i+1) * (logp - DigammaSize + DigammaSizePlusX);
+//                      dFdSize += weights[i+fromState][t] / dSize * (i+1) * (DigammaSize - DigammaSizePlusDSize + DigammaSizePlusDSizePlusX - DigammaSizePlusX);
+                        dFdSize += weights[i+fromState][t] * pow((i+1),2) * (-TrigammaSize + TrigammaSizePlusX);
                     }
                 }
            }
-            if(fabs(Fr)<eps)
+            if(fabs(F)<eps)
            {
                break;
            }
-            if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr;
-            if(Fr/dFrdr>rhere) rhere=rhere/2.0;
+            if(F/dFdSize<size0) size0=size0-F/dFdSize;
+            if(F/dFdSize>size0) size0=size0/2.0;
        }
    }
-    this->size = rhere;
-    FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob;
+    this->size = size0;
+    FILE_LOG(logDEBUG1) << "size = "<<this->size << ", prob = "<<this->prob;
     this->mean = this->fmean(this->size, this->prob);
     this->variance = this->fvariance(this->size, this->prob);

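The prob update at the top of this hunk has a closed form: with m = i+1 indexing the states that share this emission, the new prob is the ratio of the weighted counts to the weighted sums of m*size plus the counts. A minimal sketch, not part of the patch; the numerator accumulation sits just above the hunk and is assumed here to mirror the univariate update (weights times obs), so treat it as an illustration only:

    // Hedged sketch of the closed-form prob M-step that precedes the Newton update of size.
    // Only the denominator line is visible in the hunk; the numerator is an assumption.
    static double update_prob_constrained(double** weights, int* obs, int T,
                                          int fromState, int toState, double size)
    {
        double numerator = 0.0, denominator = 0.0;
        for (int i = 0; i < toState - fromState; i++)
        {
            for (int t = 0; t < T; t++)
            {
                numerator   += weights[i + fromState][t] * obs[t];
                denominator += weights[i + fromState][t] * (size * (i + 1) + obs[t]);
            }
        }
        return numerator / denominator;   // becomes this->prob
    }

As the in-line comment in the hunk notes, the subsequent size update still works with logp computed from the prob held before this assignment.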
@@ -506,6 +723,396 @@ double NegativeBinomial::get_prob()
 }


+// ============================================================
+// Binomial density
+// ============================================================
+
+// Constructor and Destructor ---------------------------------
+Binomial::Binomial(int* observations, int T, double size, double prob)
+{
+    FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
+    this->obs = observations;
+    this->T = T;
+    this->size = size;
+    this->prob = prob;
+}
+
+Binomial::~Binomial()
+{
+    FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
+}
+
+// Methods ----------------------------------------------------
+void Binomial::calc_logdensities(double* logdens)
+{
+    FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
+    double logp = log(this->prob);
+    double log1minusp = log(1-this->prob);
+    // Select strategy for computing gammas
+    if (this->max_obs <= this->T)
+    {
+        FILE_LOG(logDEBUG2) << "Precomputing densities in " << __func__ << " for every obs[t], because max(O)<=T";
+        double logdens_per_read [this->max_obs+1];
+        for (int j=0; j<=this->max_obs; j++)
+        {
+            logdens_per_read[j] = lchoose(this->size, j) + j * logp + (this->size-j) * log1minusp;
+        }
+        for (int t=0; t<this->T; t++)
+        {
+            logdens[t] = logdens_per_read[(int) this->obs[t]];
+            FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t];
+            if (isnan(logdens[t]))
+            {
+                FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
+                FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t];
+                throw nan_detected;
+            }
+        }
+    }
+    else
+    {
+        FILE_LOG(logDEBUG2) << "Computing densities in " << __func__ << " for every t, because max(O)>T";
+        int j;
+        for (int t=0; t<this->T; t++)
+        {
+            j = (int) this->obs[t];
+            logdens[t] = lchoose(this->size, j) + j * logp + (this->size-j) * log1minusp;
+            FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t];
+            if (isnan(logdens[t]))
+            {
+                FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
+                FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t];
+                throw nan_detected;
+            }
+        }
+    }
+}
+
+void Binomial::calc_densities(double* dens)
+{
+    FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
+    double logp = log(this->prob);
+    double log1minusp = log(1-this->prob);
+    // Select strategy for computing gammas
+    if (this->max_obs <= this->T)
+    {
+        FILE_LOG(logDEBUG2) << "Precomputing densities in " << __func__ << " for every obs[t], because max(O)<=T";
+        double dens_per_read [this->max_obs+1];
+        for (int j=0; j<=this->max_obs; j++)
+        {
+            dens_per_read[j] = exp( lchoose(this->size, j) + j * logp + (this->size-j) * log1minusp );
+        }
+        for (int t=0; t<this->T; t++)
+        {
+            dens[t] = dens_per_read[(int) this->obs[t]];
+            FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t];
+            if (isnan(dens[t]))
+            {
+                FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
+                FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t];
+                throw nan_detected;
+            }
+        }
+    }
+    else
+    {
+        FILE_LOG(logDEBUG2) << "Computing densities in " << __func__ << " for every t, because max(O)>T";
+        int j;
+        for (int t=0; t<this->T; t++)
+        {
+            j = (int) this->obs[t];
+            dens[t] = exp( lchoose(this->size, j) + j * logp + (this->size-j) * log1minusp );
+            FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t];
+            if (isnan(dens[t]))
+            {
+                FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
+                FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t];
+                throw nan_detected;
+            }
+        }
+    }
+}
+
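Both density methods of the new Binomial class evaluate the log-density through lchoose(). Written out with lgamma it is the ordinary binomial pmf on the log scale; the sketch below is self-contained and not part of the patch (names are illustrative):

    #include <cmath>

    // For 0 <= x <= size, lchoose(size, x) equals
    // lgamma(size+1) - lgamma(x+1) - lgamma(size-x+1), so the expression used in
    // calc_logdensities() / calc_densities() is the usual binomial log-pmf.
    static double binomial_logdensity(double size, double prob, double x)
    {
        return std::lgamma(size + 1.0) - std::lgamma(x + 1.0) - std::lgamma(size - x + 1.0)
               + x * std::log(prob) + (size - x) * std::log(1.0 - prob);
    }

calc_densities() exponentiates the same expression, and the per-read-count precomputation is switched on under exactly the same max(O) <= T condition as in the negative binomial methods.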
+void Binomial::update(double* weights)
+{
+    FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
+    double eps = 1e-4, kmax;
+    double numerator, denominator, size0, dSize, F, dFdSize, DigammaSizePlus1, DigammaSizePlusDSizePlus1;
+    // Update prob (p)
+    numerator=denominator=0.0;
+//  clock_t time, dtime;
+//  time = clock();
+    for (int t=0; t<this->T; t++)
+    {
+        numerator += weights[t] * this->obs[t];
+        denominator += weights[t] * this->size;
+    }
+    this->prob = numerator/denominator; // Update of size is now done with updated prob
+    double log1minusp = log(1-this->prob);
+//  dtime = clock() - time;
+//  FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks";
+    // Update of size with Newton Method
+    size0 = this->size;
+    dSize = 0.00001;
+    kmax = 20;
+//  time = clock();
+    // Select strategy for computing digammas
+    if (this->max_obs <= this->T)
+    {
+        FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T";
+        double DigammaSizeMinusXPlus1[this->max_obs+1], DigammaSizePlusDSizeMinusXPlus1[this->max_obs+1];
+        for (int k=1; k<kmax; k++)
+        {
+            F=dFdSize=0.0;
+            DigammaSizePlus1 = digamma(size0+1);
+            DigammaSizePlusDSizePlus1 = digamma((size0+dSize)+1);
+            // Precompute the digammas by iterating over all possible values of the observation vector
+            for (int j=0; j<=this->max_obs; j++)
+            {
+                DigammaSizeMinusXPlus1[j] = digamma(size0-j+1);
+                DigammaSizePlusDSizeMinusXPlus1[j] = digamma((size0+dSize)-j+1);
+            }
+            for(int t=0; t<this->T; t++)
+            {
+                if(this->obs[t]==0)
+                {
+                    F += weights[t] * log1minusp;
+                    //dFdSize+=0;
+                }
+                if(this->obs[t]!=0)
+                {
+                    F += weights[t] * (DigammaSizePlus1 - DigammaSizeMinusXPlus1[(int)this->obs[t]] + log1minusp);
+                    dFdSize += weights[t]/dSize * (DigammaSizePlusDSizePlus1-DigammaSizePlus1 - DigammaSizePlusDSizeMinusXPlus1[(int)this->obs[t]]+DigammaSizeMinusXPlus1[(int)this->obs[t]]);
+                }
+            }
+            if(fabs(F)<eps)
+            {
+                break;
+            }
+            if(F/dFdSize<size0) size0=size0-F/dFdSize;
+            if(F/dFdSize>size0) size0=size0/2.0;
+        }
+    }
+    else
+    {
+        FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T";
+        double DigammaSizePlusX, DigammaSizePlusDSizePlusX, DigammaSizeMinusXPlus1, DigammaSizePlusDSizeMinusXPlus1;
+        for (int k=1; k<kmax; k++)
+        {
+            F = dFdSize = 0.0;
+            DigammaSizePlus1 = digamma(size0+1);
+            DigammaSizePlusDSizePlus1 = digamma((size0+dSize)+1);
+            for(int t=0; t<this->T; t++)
+            {
+                DigammaSizeMinusXPlus1 = digamma(size0-(int)this->obs[t]+1);
+                DigammaSizePlusDSizeMinusXPlus1 = digamma((size0+dSize)-(int)this->obs[t]+1);
+                if(this->obs[t]==0)
+                {
+                    F += weights[t] * log1minusp;
+                    //dFdSize+=0;
+                }
+                if(this->obs[t]!=0)
+                {
+                    F += weights[t] * (DigammaSizePlus1 - DigammaSizeMinusXPlus1 + log1minusp);
+                    dFdSize += weights[t]/dSize * (DigammaSizePlusDSizePlus1-DigammaSizePlus1 - DigammaSizePlusDSizeMinusXPlus1+DigammaSizeMinusXPlus1);
+                }
+            }
+            if(fabs(F)<eps)
+            {
+                break;
+            }
+            if(F/dFdSize<size0) size0=size0-F/dFdSize;
+            if(F/dFdSize>size0) size0=size0/2.0;
+        }
+    }
+    this->size = size0;
+    FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob;
+
+//  dtime = clock() - time;
+//  FILE_LOG(logDEBUG1) << "updateR(): "<<dtime<< " clicks";