... | ... |
@@ -11,15 +11,22 @@ find.aneuploidies, |
11 | 11 |
|
12 | 12 |
plot.distribution, |
13 | 13 |
plot.BAIT, |
14 |
+plot.genome.overview, |
|
14 | 15 |
get.state.table, |
16 |
+get.dendrogram, |
|
17 |
+get.reproducibility.one2many, |
|
18 |
+get.reproducibility.many2many, |
|
15 | 19 |
|
16 | 20 |
univariate2bed, |
17 | 21 |
binned2wiggle, |
18 |
-multivariate2bed, |
|
19 | 22 |
binned2GRanges, |
20 | 23 |
hmm2GRanges, |
24 |
+hmmList2GRangesList, |
|
25 |
+loadHmmsFromFiles, |
|
21 | 26 |
bed2GRanges, |
22 | 27 |
|
28 |
+get.state.labels, |
|
29 |
+get.state.colors, |
|
23 | 30 |
mixedsort, |
24 | 31 |
mixedorder |
25 | 32 |
) |
... | ... |
@@ -1,27 +1,11 @@ |
1 | 1 |
# ================================================================================== |
2 | 2 |
# Generate a data.frame with majority-state for each chromosome and input file/model |
3 | 3 |
# ================================================================================== |
4 |
-get.state.table <- function(modellist) { |
|
5 |
- |
|
6 |
- ## Intercept user input |
|
7 |
- if (check.univariate.modellist(modellist)!=0) { |
|
8 |
- cat("Loading univariate HMMs from files ...") |
|
9 |
- mlist <- NULL |
|
10 |
- for (modelfile in modellist) { |
|
11 |
- mlist[[length(mlist)+1]] <- get(load(modelfile)) |
|
12 |
- } |
|
13 |
- modellist <- mlist |
|
14 |
- remove(mlist) |
|
15 |
- cat(" done\n") |
|
16 |
- if (check.univariate.modellist(modellist)!=0) stop("argument 'modellist' expects a list of univariate hmms or a list of files that contain univariate hmms") |
|
17 |
- } |
|
18 |
- |
|
19 |
- # ----------------------------------- |
|
20 |
- # Percentage of aneuploid chromosomes |
|
21 |
- # ----------------------------------- |
|
4 |
+get.state.table <- function(modellist, numCPU=1) { |
|
22 | 5 |
|
23 | 6 |
## Transform to GRanges |
24 |
- grlist <- lapply(modellist, hmm2GRanges, reduce=F) |
|
7 |
+# grlist <- lapply(modellist, hmm2GRanges, reduce=F) |
|
8 |
+ grlist <- hmmList2GRangesList(modellist, reduce=F, numCPU=numCPU) |
|
25 | 9 |
|
26 | 10 |
## Split by chromosome |
27 | 11 |
grsplitlist <- lapply(grlist, function(gr) { split(gr, seqnames(gr)) }) |
... | ... |
@@ -33,56 +17,209 @@ get.state.table <- function(modellist) { |
33 | 17 |
states <- array(dim=c(length(modellist), length(tablell[[1]]), length(tablell[[1]][[1]])), dimnames=list('model'=1:length(tablell), 'chromosome'=names(tablell[[1]]), 'state'=names(tablell[[1]][[1]]))) |
34 | 18 |
for (i1 in 1:dim(states)[1]) { |
35 | 19 |
for (i2 in 1:dim(states)[2]) { |
36 |
- states[i1,i2,] <- tablell[[i1]][[i2]] |
|
20 |
+ tc <- tryCatch({ |
|
21 |
+ states[i1,i2,] <- tablell[[i1]][[i2]] |
|
22 |
+ }, error = function(err) { |
|
23 |
+ err |
|
24 |
+ }) |
|
37 | 25 |
} |
38 | 26 |
} |
39 | 27 |
|
40 | 28 |
## Get the majority state per chromosome of each model/file |
41 | 29 |
majorstate.f <- apply(states, c(1,2), function(x) { names(x)[which.max(x)] }) |
42 |
- factor2number <- 1:length(state.labels) -1 |
|
43 |
- names(factor2number) <- state.labels |
|
44 |
- majorstate <- majorstate.f |
|
30 |
+ majorstate <- matrix(NA, ncol=ncol(majorstate.f), nrow=nrow(majorstate.f)) |
|
31 |
+ colnames(majorstate) <- colnames(majorstate.f) |
|
32 |
+ rownames(majorstate) <- rownames(majorstate.f) |
|
45 | 33 |
for (i1 in 1:ncol(majorstate.f)) { |
46 |
- majorstate[,i1] <- factor2number[as.character(majorstate.f[,i1])] |
|
34 |
+ majorstate[,i1] <- factor(majorstate.f[,i1], levels=state.labels) |
|
47 | 35 |
} |
48 |
- majorstate <- as.data.frame(majorstate) |
|
36 |
+ majorstate.f <- matrix(state.labels[majorstate], ncol=ncol(majorstate), nrow=nrow(majorstate)) |
|
37 |
+ colnames(majorstate.f) <- colnames(majorstate) |
|
38 |
+ rownames(majorstate.f) <- rownames(majorstate) |
|
49 | 39 |
|
50 | 40 |
## Plot the percentages of aneuploidies/states per chromosome |
51 | 41 |
library(ggplot2) |
52 | 42 |
library(reshape2) |
53 |
- df <- melt(majorstate.f, measure.vars=colnames(majorstate), variable.name="chromosome", value.name="state") |
|
43 |
+ df <- melt(majorstate.f, varnames=c("model","chromosome"), value.name="state") |
|
54 | 44 |
df$state <- factor(df$state, levels=state.labels) |
55 |
- ggplt <- ggplot(df) + geom_bar(aes(n=nrow(majorstate), x=chromosome, fill=state, y=..count../n)) + theme_bw() + ylab('number of samples') + scale_fill_manual(values=state.colors) + scale_y_continuous(labels=scales::percent_format()) |
|
45 |
+ ggplt <- ggplot(df) + geom_bar(aes(x=chromosome, fill=state, y=..count..)) + theme_bw() + ylab('number of samples') + scale_fill_manual(values=state.colors) |
|
46 |
+ |
|
47 |
+ ## Return results |
|
48 |
+ l <- list(table=majorstate, plot=ggplt) |
|
49 |
+ return(l) |
|
50 |
+ |
|
51 |
+ |
|
52 |
+} |
|
56 | 53 |
|
57 |
- # ---------- |
|
58 |
- # Dendrogram |
|
59 |
- # ---------- |
|
54 |
+ |
|
55 |
+# ================================================================================== |
|
56 |
+# Build a dendrogram |
|
57 |
+# ================================================================================== |
|
58 |
+get.dendrogram <- function(modellist, numCPU=1) { |
|
60 | 59 |
|
61 | 60 |
### Generate the GRanges consensus template with variable bins that can hold the states of each model |
61 |
+ ## Load the files |
|
62 |
+ modellist <- loadHmmsFromFiles(modellist) |
|
63 |
+ |
|
62 | 64 |
## Transform to GRanges in reduced representation |
63 |
- grlred <- GRangesList() |
|
64 |
- for (i1 in 1:length(modellist)) { |
|
65 |
- grlred[[i1]] <- hmm2GRanges(modellist[[i1]], reduce=T) |
|
66 |
- } |
|
65 |
+ temp <- hmmList2GRangesList(modellist, reduce=TRUE, numCPU=numCPU, consensus=TRUE) |
|
66 |
+ grlred <- temp$grl |
|
67 |
+ consensus <- temp$consensus |
|
68 |
+ |
|
67 | 69 |
## Split into non-overlapping fragments |
68 |
- consensus <- disjoin(unlist(grlred)) |
|
69 | 70 |
## Overlap each models' states with that consensus template |
70 |
- constates <- matrix(NA, ncol=length(modellist), nrow=length(consensus)) |
|
71 |
- for (i1 in 1:length(modellist)) { |
|
72 |
- splt <- split(grlred[[i1]], mcols(grlred[[i1]])$state) |
|
71 |
+ cat('calculate overlap\n') |
|
72 |
+ constates <- foreach (gr = grlred, .packages='GenomicRanges', .combine='cbind') %do% { |
|
73 |
+ splt <- split(gr, mcols(gr)$state) |
|
73 | 74 |
mind <- as.matrix(findOverlaps(consensus, splt)) |
74 |
- constates[,i1] <- mind[,'subjectHits'] |
|
75 |
+ col <- matrix(-1, nrow=length(consensus), ncol=1) |
|
76 |
+ col[mind[,'queryHits'],1] <- mind[,'subjectHits'] |
|
77 |
+ col |
|
75 | 78 |
} |
79 |
+ colnames(constates) <- unlist(lapply(modellist, '[[', 'ID')) |
|
80 |
+ |
|
76 | 81 |
## Distance measure |
82 |
+ cat('calculating distance\n') |
|
77 | 83 |
wcor <- cov.wt(constates, wt=as.numeric(width(consensus)), cor=T) |
78 | 84 |
dist <- as.dist(1-wcor$cor) |
79 | 85 |
## Dendrogram |
80 | 86 |
hc <- hclust(dist) |
81 | 87 |
|
82 | 88 |
## Return results |
83 |
- l <- list(table=majorstate, percentage.plot=ggplt, dendrogram=hc) |
|
89 |
+ return(hc) |
|
90 |
+ |
|
91 |
+} |
|
92 |
+ |
|
93 |
+ |
|
94 |
+# ================================================================================== |
|
95 |
+# Get a simple measure of reproducibility between a multiple (num.query.samples) |
|
96 |
+# cell sample (query) and single cell (subjects) samples |
|
97 |
+# ================================================================================== |
|
98 |
+get.reproducibility.one2many <- function(query, num.query.samples=1, subjects, num.tests=10, numCPU=1) { |
|
99 |
+ |
|
100 |
+ if (check.univariate.model(query)!=0) { |
|
101 |
+ cat("loading univariate HMM from file\n") |
|
102 |
+ query <- get(load(query)) |
|
103 |
+ if (check.univariate.model(query)!=0) stop("argument 'query' expects a univariate hmm or a file that contains a univariate hmm") |
|
104 |
+ } |
|
105 |
+ |
|
106 |
+ ## Transform to GRanges |
|
107 |
+ query.gr <- hmm2GRanges(query, reduce=F) |
|
108 |
+ subjects.grl <- hmmList2GRangesList(subjects, reduce=T, numCPU=numCPU) |
|
109 |
+ |
|
110 |
+ ## Overlap each subject's states with query |
|
111 |
+ cat('overlapping each subject\'s states with query\n') |
|
112 |
+ library(doParallel) |
|
113 |
+ cl <- makeCluster(numCPU) |
|
114 |
+ registerDoParallel(cl) |
|
115 |
+ constates <- foreach (subject.gr = subjects.grl, .packages='GenomicRanges', .combine='cbind') %dopar% { |
|
116 |
+ splt <- split(subject.gr, mcols(subject.gr)$state) |
|
117 |
+ mind <- as.matrix(findOverlaps(query.gr, splt, select='first')) |
|
118 |
+ } |
|
119 |
+ stopCluster(cl) |
|
120 |
+ |
|
121 |
+ ## Comparing sample of subjects to query |
|
122 |
+ cat('comparing samples of subjects to query\n') |
|
123 |
+ means <- matrix(NA, nrow=num.tests, ncol=length(levels(mcols(query.gr)$state))) |
|
124 |
+ colnames(means) <- levels(mcols(query.gr)$state) |
|
125 |
+ meanconstates <- matrix(NA, nrow=nrow(constates), ncol=num.tests) |
|
126 |
+ for (itest in 1:num.tests) { |
|
127 |
+ cat('test',itest,'\n') |
|
128 |
+ |
|
129 |
+ ## Sample the columns to compare to the query |
|
130 |
+ cols <- sample(1:length(subjects), num.query.samples) |
|
131 |
+ |
|
132 |
+ ## Mean of each position |
|
133 |
+ meanconstates[,itest] <- apply(constates[,cols], 1, mean, na.rm=T) |
|
134 |
+ |
|
135 |
+ ## Subjects-mean of each query-level |
|
136 |
+ meanlevel <- NULL |
|
137 |
+ varlevel <- NULL |
|
138 |
+ for (level in levels(mcols(query.gr)$state)) { |
|
139 |
+ meanlevel[level] <- mean(meanconstates[mcols(query.gr)$state==level]) |
|
140 |
+ varlevel[level] <- var(meanconstates[mcols(query.gr)$state==level]) |
|
141 |
+ } |
|
142 |
+ means[itest,] <- meanlevel |
|
143 |
+ } |
|
144 |
+ ## Plot the results |
|
145 |
+ library(ggplot2) |
|
146 |
+ library(reshape2) |
|
147 |
+ df <- melt(data.frame(state=mcols(query.gr)$state, meanstate=meanconstates)) |
|
148 |
+ ggplt <- ggplot(df) + geom_boxplot(aes(x=state, y=value, fill=variable)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + scale_y_continuous(labels=levels(mcols(query.gr)$state)) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples')) |
|
149 |
+ |
|
150 |
+ df <- melt(means, varnames=c('test','state')) |
|
151 |
+ ggplt1 <- ggplot(df) + geom_boxplot(aes(x=state, y=value)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + scale_y_continuous(labels=levels(mcols(query.gr)$state)) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples')) |
|
152 |
+ |
|
153 |
+ ## Comparing all subjects to query (no means) |
|
154 |
+ cat('comparing all samples to query\n') |
|
155 |
+ df.constates <- as.data.frame(cbind(mcols(query.gr)$state, constates)) |
|
156 |
+ names(df.constates)[1] <- 'query' |
|
157 |
+ df <- melt(df.constates, id.vars='query', value.name='state') |
|
158 |
+ df <- df[seq(from=1, to=nrow(df), by=10000),] |
|
159 |
+ df$query <- factor(levels(mcols(query.gr)$state)[df$query], levels=levels(mcols(query.gr)$state)) |
|
160 |
+ |
|
161 |
+ ggplt2 <- ggplot(df) + geom_boxplot(aes(x=query, y=state)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + scale_y_continuous(labels=levels(mcols(query.gr)$state)) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples')) |
|
162 |
+ |
|
163 |
+ df$state <- factor(levels(mcols(query.gr)$state)[df$state], levels=levels(mcols(query.gr)$state)) |
|
164 |
+ ggplt3 <- ggplot(df) + geom_jitter(aes(x=query, y=state), position=position_jitter(width=0.3, height=0.2)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples')) |
|
165 |
+ |
|
166 |
+ |
|
167 |
+ |
|
168 |
+ |
|
169 |
+ |
|
170 |
+ ## Return results |
|
171 |
+ l <- list(meanplot=ggplt, meanmeanplot=ggplt1, allbox=ggplt2, alljitter=ggplt3) |
|
84 | 172 |
return(l) |
85 | 173 |
|
86 | 174 |
|
175 |
+ |
|
87 | 176 |
} |
88 | 177 |
|
178 |
+ |
|
179 |
+ |
|
180 |
+# ================================================================================== |
|
181 |
+# Get a simple measure of reproducibility between sets of single cell samples |
|
182 |
+# ================================================================================== |
|
183 |
+get.reproducibility.many2many <- function(samples, num.tests=10, size.test=10, numCPU=1) { |
|
184 |
+ |
|
185 |
+ ## Transform to GRanges |
|
186 |
+ temp <- hmmList2GRangesList(samples, numCPU=numCPU, reduce=T, consensus=T) |
|
187 |
+ samples.grl <- temp$grl |
|
188 |
+ consensus <- temp$consensus |
|
189 |
+ constates <- temp$constates |
|
190 |
+ |
|
191 |
+ ## Comparing samples against each other |
|
192 |
+ cat('comparing samples against each other\n') |
|
193 |
+ meanconstates <- array(dim=c(length(consensus), 2, num.tests)) |
|
194 |
+ for (itest in 1:num.tests) { |
|
195 |
+ cat('test',itest,'\n') |
|
196 |
+ |
|
197 |
+ ## Sample the columns to compare to the query |
|
198 |
+ cols <- sample(1:length(samples), 2*size.test) |
|
199 |
+ cols1 <- cols[1:size.test] |
|
200 |
+ cols2 <- cols[(size.test+1):(2*size.test)] |
|
201 |
+ |
|
202 |
+ ## Mean of each position |
|
203 |
+ if (size.test==1) { |
|
204 |
+ meanconstates[,1,itest] <- constates[,cols1] |
|
205 |
+ meanconstates[,2,itest] <- constates[,cols2] |
|
206 |
+ } else { |
|
207 |
+ meanconstates[,1,itest] <- apply(constates[,cols1], 1, mean, na.rm=T) |
|
208 |
+ meanconstates[,2,itest] <- apply(constates[,cols2], 1, mean, na.rm=T) |
|
209 |
+ } |
|
210 |
+ |
|
211 |
+ } |
|
212 |
+ ## Mean along all tests |
|
213 |
+ means <- apply(meanconstates, c(1,2), mean) |
|
214 |
+ means <- cbind(as.data.frame(means), width=width(consensus)) |
|
215 |
+ |
|
216 |
+ ## Plot the results |
|
217 |
+ library(ggplot2) |
|
218 |
+ library(reshape2) |
|
219 |
+ states <- levels(mcols(samples.grl[[1]])$state) |
|
220 |
+ ggplt1 <- ggplot(as.data.frame(means)) + geom_point(aes(x=V1, y=V2, alpha=width)) + theme_bw() + coord_cartesian(ylim=c(1,length(states))) + scale_x_continuous(breaks=1:length(states), labels=states) + scale_y_continuous(breaks=1:length(states), labels=states) + xlab(paste0(size.test,' single cell samples')) + ylab(paste0(size.test,' single cell samples')) |
|
221 |
+ |
|
222 |
+ ## Return results |
|
223 |
+ return(ggplt1) |
|
224 |
+ |
|
225 |
+} |
... | ... |
@@ -90,7 +90,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
90 | 90 |
cat('Automatically determining binsizes...') |
91 | 91 |
gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), |
92 | 92 |
ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
93 |
- autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr))) |
|
93 |
+ autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr),flag=scanBamFlag(isDuplicate=F))) |
|
94 | 94 |
numreadsperbp <- length(autodata) / sum(as.numeric(chrom.lengths[chroms2use])) |
95 | 95 |
## Pad binsizes and reads.per.bin with each others value |
96 | 96 |
binsizes.add <- round(reads.per.bin / numreadsperbp, -2) |
... | ... |
@@ -161,7 +161,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
161 | 161 |
|
162 | 162 |
if (format=="bam") { |
163 | 163 |
cat("reading reads from file... \r") |
164 |
- data <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(i.binned.data))) |
|
164 |
+ data <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(i.binned.data),flag=scanBamFlag(isDuplicate=F))) |
|
165 | 165 |
} |
166 | 166 |
|
167 | 167 |
## Count overlaps |
... | ... |
@@ -1,3 +1,71 @@ |
1 |
+loadHmmsFromFiles <- function(uni.hmm.list) { |
|
2 |
+ |
|
3 |
+ ## Intercept user input |
|
4 |
+ if (check.univariate.modellist(uni.hmm.list)!=0) { |
|
5 |
+ cat("loading univariate HMMs from files\n") |
|
6 |
+ mlist <- NULL |
|
7 |
+ for (modelfile in uni.hmm.list) { |
|
8 |
+ mlist[[length(mlist)+1]] <- get(load(modelfile)) |
|
9 |
+ } |
|
10 |
+ uni.hmm.list <- mlist |
|
11 |
+ remove(mlist) |
|
12 |
+ if (check.univariate.modellist(uni.hmm.list)!=0) stop("argument 'uni.hmm.list' expects a list of univariate hmms or a list of files that contain univariate hmms") |
|
13 |
+ } |
|
14 |
+ |
|
15 |
+ return(uni.hmm.list) |
|
16 |
+ |
|
17 |
+} |
|
18 |
+ |
|
19 |
+hmmList2GRangesList <- function(uni.hmm.list, reduce=TRUE, numCPU=1, consensus=FALSE) { |
|
20 |
+ |
|
21 |
+ ## Load models |
|
22 |
+ uni.hmm.list <- loadHmmsFromFiles(uni.hmm.list) |
|
23 |
+ |
|
24 |
+ ## Transform to GRanges |
|
25 |
+ cat('transforming to GRanges\n') |
|
26 |
+ if (numCPU > 1) { |
|
27 |
+ library(doParallel) |
|
28 |
+ cl <- makeCluster(numCPU) |
|
29 |
+ registerDoParallel(cl) |
|
30 |
+ cfun <- function(...) { GRangesList(...) } |
|
31 |
+ uni.hmm.grl <- foreach (uni.hmm = uni.hmm.list, .packages='aneufinder', .combine='cfun', .multicombine=TRUE) %dopar% { |
|
32 |
+ hmm2GRanges(uni.hmm, reduce=reduce) |
|
33 |
+ } |
|
34 |
+ if (consensus) { |
|
35 |
+ consensus.gr <- disjoin(unlist(uni.hmm.grl)) |
|
36 |
+ constates <- foreach (uni.hmm.gr = uni.hmm.grl, .packages='GenomicRanges', .combine='cbind') %dopar% { |
|
37 |
+ splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state) |
|
38 |
+ mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first')) |
|
39 |
+ } |
|
40 |
+ } |
|
41 |
+ stopCluster(cl) |
|
42 |
+ } else { |
|
43 |
+ uni.hmm.grl <- GRangesList() |
|
44 |
+ for (uni.hmm in uni.hmm.list) { |
|
45 |
+ uni.hmm.grl[[length(uni.hmm.grl)+1]] <- hmm2GRanges(uni.hmm, reduce=reduce) |
|
46 |
+ } |
|
47 |
+ if (consensus) { |
|
48 |
+ consensus.gr <- disjoin(unlist(uni.hmm.grl)) |
|
49 |
+ constates <- matrix(NA, ncol=length(uni.hmm.grl), nrow=length(uni.hmm.grl[[1]])) |
|
50 |
+ for (i1 in 1:length(uni.hmm.grl)) { |
|
51 |
+ uni.hmm.gr <- uni.hmm.grl[[i1]] |
|
52 |
+ splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state) |
|
53 |
+ mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first')) |
|
54 |
+ constates[,i1] <- mind |
|
55 |
+ } |
|
56 |
+ } |
|
57 |
+ } |
|
58 |
+ if (consensus) { |
|
59 |
+ meanstates <- apply(constates, 1, mean) |
|
60 |
+ mcols(consensus.gr) <- meanstates |
|
61 |
+ return(list(grl=uni.hmm.grl, consensus=consensus.gr, constates=constates)) |
|
62 |
+ } else { |
|
63 |
+ return(uni.hmm.grl) |
|
64 |
+ } |
|
65 |
+ |
|
66 |
+ |
|
67 |
+} |
|
68 |
+ |
|
1 | 69 |
binned2GRanges <- function(binned.data, chrom.length.file=NULL, offset=0) { |
2 | 70 |
|
3 | 71 |
library(GenomicRanges) |
... | ... |
@@ -1,50 +1,84 @@ |
1 | 1 |
# =============================================== |
2 | 2 |
# Write color-coded tracks with univariate states |
3 | 3 |
# =============================================== |
4 |
-univariate2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", threshold=0.5, chrom.length.file=NULL) { |
|
4 |
+univariate2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", numCPU=1) { |
|
5 |
+ |
|
6 |
+ ## Intercept user input |
|
7 |
+ if (check.univariate.modellist(uni.hmm.list)!=0) { |
|
8 |
+ cat("Loading univariate HMMs from files ...") |
|
9 |
+ mlist <- NULL |
|
10 |
+ for (modelfile in uni.hmm.list) { |
|
11 |
+ mlist[[length(mlist)+1]] <- get(load(modelfile)) |
|
12 |
+ } |
|
13 |
+ uni.hmm.list <- mlist |
|
14 |
+ remove(mlist) |
|
15 |
+ cat(" done\n") |
|
16 |
+ if (check.univariate.modellist(uni.hmm.list)!=0) stop("argument 'uni.hmm.list' expects a list of univariate hmms or a list of files that contain univariate hmms") |
|
17 |
+ } |
|
5 | 18 |
|
6 |
- # Check user input |
|
7 |
- if (is.null(names(uni.hmm.list))) { |
|
8 |
- stop("Please name the list entries. The names will be used as track name for the Genome Browser file.") |
|
19 |
+ ## Transform to GRanges |
|
20 |
+ cat('transforming to GRanges\n') |
|
21 |
+ if (numCPU > 1) { |
|
22 |
+ library(doParallel) |
|
23 |
+ cl <- makeCluster(numCPU) |
|
24 |
+ registerDoParallel(cl) |
|
25 |
+ cfun <- function(...) { GRangesList(...) } |
|
26 |
+ uni.hmm.grl <- foreach (uni.hmm = uni.hmm.list, .packages='aneufinder', .combine='cfun', .multicombine=TRUE) %dopar% { |
|
27 |
+ hmm2GRanges(uni.hmm, reduce=T) |
|
28 |
+ } |
|
29 |
+ consensus.gr <- disjoin(unlist(uni.hmm.grl)) |
|
30 |
+ constates <- foreach (uni.hmm.gr = uni.hmm.grl, .packages='GenomicRanges', .combine='cbind') %dopar% { |
|
31 |
+ splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state) |
|
32 |
+ mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first')) |
|
33 |
+ } |
|
34 |
+ stopCluster(cl) |
|
35 |
+ } else { |
|
36 |
+ uni.hmm.grl <- GRangesList() |
|
37 |
+ for (uni.hmm in uni.hmm.list) { |
|
38 |
+ uni.hmm.grl[[length(uni.hmm.grl)+1]] <- hmm2GRanges(uni.hmm, reduce=T) |
|
39 |
+ } |
|
40 |
+ consensus.gr <- disjoin(unlist(uni.hmm.grl)) |
|
41 |
+ constates <- matrix(NA, ncol=length(uni.hmm.grl), nrow=length(uni.hmm.grl[[1]])) |
|
42 |
+ for (i1 in 1:length(uni.hmm.grl)) { |
|
43 |
+ uni.hmm.gr <- uni.hmm.grl[[i1]] |
|
44 |
+ splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state) |
|
45 |
+ mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first')) |
|
46 |
+ constates[,i1] <- mind |
|
47 |
+ } |
|
9 | 48 |
} |
49 |
+ meanstates <- apply(constates, 1, mean) |
|
50 |
+ mcols(consensus.gr) <- meanstates |
|
10 | 51 |
|
11 | 52 |
# Variables |
12 | 53 |
nummod <- length(uni.hmm.list) |
13 | 54 |
file <- paste(file,"bed", sep=".") |
14 | 55 |
|
15 | 56 |
# Generate the colors |
16 |
- colors <- gcolors[state.labels] |
|
57 |
+ colors <- state.colors[state.labels] |
|
17 | 58 |
RGBs <- t(col2rgb(colors)) |
18 | 59 |
RGBs <- apply(RGBs,1,paste,collapse=",") |
19 | 60 |
|
20 | 61 |
# Write first line to file |
21 | 62 |
cat("browser hide all\n", file=file) |
22 | 63 |
|
64 |
+ ### Write every model to file ### |
|
23 | 65 |
for (imod in 1:nummod) { |
24 | 66 |
uni.hmm <- uni.hmm.list[[imod]] |
67 |
+ uni.hmm.gr <- uni.hmm.grl[[imod]] |
|
25 | 68 |
priority <- 50 + imod |
26 |
- cat(paste("track name=",names(uni.hmm.list)[imod]," description=\"univariate calls for ",names(uni.hmm.list)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
69 |
+ cat(paste("track name=",uni.hmm$ID," description=\"univariate calls for ",names(uni.hmm.list)[imod],"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
70 |
+ |
|
71 |
+ # Change chromosome names from '1' to 'chr1' if necessary |
|
72 |
+ mask <- which(!grepl('chr', seqnames(uni.hmm.gr))) |
|
73 |
+ mcols(uni.hmm.gr)$chromosome <- as.character(seqnames(uni.hmm.gr)) |
|
74 |
+ mcols(uni.hmm.gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(uni.hmm.gr)$chromosome[mask]) |
|
75 |
+ mcols(uni.hmm.gr)$chromosome <- as.factor(mcols(uni.hmm.gr)$chromosome) |
|
27 | 76 |
|
28 | 77 |
# Collapse the calls |
29 |
- calls <- cbind(uni.hmm$coordinates, name=uni.hmm$states) |
|
30 |
- collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
31 |
- |
|
32 |
- # Check length of chromosomes if chrom.length.file was given |
|
33 |
- if (!is.null(chrom.length.file)) { |
|
34 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
35 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
36 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
37 |
- for (chrom in levels(as.factor(collapsed_calls$chrom))) { |
|
38 |
- index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom]) |
|
39 |
- collapsed_calls$end[index] <- chrom.lengths[chrom] |
|
40 |
- if (length(index) >= 1) { |
|
41 |
- warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
42 |
- } |
|
43 |
- } |
|
44 |
- } |
|
78 |
+ collapsed.calls <- as.data.frame(uni.hmm.gr)[c('chromosome','start','end','state')] |
|
45 | 79 |
|
46 | 80 |
# Append to file |
47 |
- itemRgb <- RGBs[as.character(collapsed.calls$name)] |
|
81 |
+ itemRgb <- RGBs[as.character(collapsed.calls$state)] |
|
48 | 82 |
numsegments <- nrow(collapsed.calls) |
49 | 83 |
df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
50 | 84 |
write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
... | ... |
@@ -89,73 +123,3 @@ binned2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") { |
89 | 123 |
} |
90 | 124 |
|
91 | 125 |
|
92 |
-# =============================================================== |
|
93 |
-# Write color-coded tracks with multivariate combinatorial states |
|
94 |
-# =============================================================== |
|
95 |
-multivariate2bed <- function(multi.hmm, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) { |
|
96 |
- |
|
97 |
- # Variables |
|
98 |
- file <- paste(file,".bed", sep="") |
|
99 |
- combstates <- multi.hmm$comb.states |
|
100 |
- if (is.null(numstates)) { |
|
101 |
- numstates <- length(combstates) |
|
102 |
- } else if (numstates > length(combstates)) { |
|
103 |
- numstates <- length(combstates) |
|
104 |
- } |
|
105 |
- if (exclude.state.zero) { |
|
106 |
- combstates2use <- setdiff(combstates,0) |
|
107 |
- combstates2use <- combstates2use[1:numstates] |
|
108 |
- combstates2use <- combstates2use[!is.na(combstates2use)] |
|
109 |
- numstates <- length(combstates2use) |
|
110 |
- } |
|
111 |
- |
|
112 |
- # Collapse the calls |
|
113 |
- calls <- cbind(multi.hmm$coordinates, name=multi.hmm$states) |
|
114 |
- collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
115 |
- # Select only desired states |
|
116 |
- mask <- rep(FALSE,nrow(collapsed.calls)) |
|
117 |
- for (istate in combstates2use) { |
|
118 |
- mask <- mask | istate==collapsed.calls$name |
|
119 |
- } |
|
120 |
- collapsed.calls <- collapsed.calls[mask,] |
|
121 |
- |
|
122 |
- # Check length of chromosomes if chrom.length.file was given |
|
123 |
- if (!is.null(chrom.length.file)) { |
|
124 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
125 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
126 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
127 |
- for (chrom in levels(as.factor(collapsed.calls$chrom))) { |
|
128 |
- index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom]) |
|
129 |
- collapsed.calls$end[index] <- chrom.lengths[chrom] |
|
130 |
- if (length(index) >= 1) { |
|
131 |
- warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
132 |
- } |
|
133 |
- } |
|
134 |
- } |
|
135 |
- |
|
136 |
- # Generate the colors for each combinatorial state |
|
137 |
- colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)] |
|
138 |
- step <- length(colors) %/% numstates |
|
139 |
- colors <- colors[seq(1,by=step,length=numstates)] |
|
140 |
- RGBs <- t(col2rgb(colors)) |
|
141 |
- RGBs <- apply(RGBs,1,paste,collapse=",") |
|
142 |
- itemRgb <- RGBs[as.factor(collapsed.calls$name)] |
|
143 |
- |
|
144 |
- # Write to file |
|
145 |
- cat("browser hide all\n", file=file) |
|
146 |
- numsegments <- nrow(collapsed.calls) |
|
147 |
- df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
|
148 |
- |
|
149 |
- if (separate.tracks) { |
|
150 |
- for (istate in combstates2use) { |
|
151 |
- priority <- 100 + which(istate==combstates2use) |
|
152 |
- cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
153 |
- write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
154 |
- } |
|
155 |
- } else { |
|
156 |
- cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE) |
|
157 |
- write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
158 |
- } |
|
159 |
- |
|
160 |
-} |
|
161 |
- |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-find.aneuploidies <- function(binned.data, ID, use.states=0:5, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) { |
|
1 |
+find.aneuploidies <- function(binned.data, ID, use.states=0:6, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) { |
|
2 | 2 |
|
3 | 3 |
## Intercept user input |
4 | 4 |
IDcheck <- ID #trigger error if not defined |
... | ... |
@@ -79,6 +79,10 @@ find.aneuploidies <- function(binned.data, ID, use.states=0:5, eps=0.001, init=" |
79 | 79 |
colnames(hmm$A) <- use.state.labels |
80 | 80 |
hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob)) |
81 | 81 |
rownames(hmm$distributions) <- use.state.labels |
82 |
+ # Treat 'null-mixed' separately |
|
83 |
+ hmm$distributions[2,'mu'] <- (1-hmm$distributions[2,'prob'])/hmm$distributions[2,'prob'] |
|
84 |
+ hmm$distributions[2,'variance'] <- hmm$distributions[2,'mu']/hmm$distributions[2,'prob'] |
|
85 |
+ hmm$distributions[2,'size'] <- NA |
|
82 | 86 |
hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE) |
83 | 87 |
rownames(hmm$A.initial) <- use.state.labels |
84 | 88 |
colnames(hmm$A.initial) <- use.state.labels |
... | ... |
@@ -143,6 +147,10 @@ find.aneuploidies <- function(binned.data, ID, use.states=0:5, eps=0.001, init=" |
143 | 147 |
colnames(hmm$A) <- use.state.labels |
144 | 148 |
hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob)) |
145 | 149 |
rownames(hmm$distributions) <- use.state.labels |
150 |
+ # Treat 'null-mixed' separately |
|
151 |
+ hmm$distributions[2,'mu'] <- (1-hmm$distributions[2,'prob'])/hmm$distributions[2,'prob'] |
|
152 |
+ hmm$distributions[2,'variance'] <- hmm$distributions[2,'mu']/hmm$distributions[2,'prob'] |
|
153 |
+ hmm$distributions[2,'size'] <- NA |
|
146 | 154 |
hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE) |
147 | 155 |
rownames(hmm$A.initial) <- use.state.labels |
148 | 156 |
colnames(hmm$A.initial) <- use.state.labels |
... | ... |
@@ -1,11 +1,13 @@ |
1 | 1 |
# ======================================================= |
2 | 2 |
# Some global variables that can be used in all functions |
3 | 3 |
# ======================================================= |
4 |
-state.labels <- c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
4 |
+state.labels <- c("nullsomy","null-mixed","monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
5 | 5 |
coordinate.names <- c("chrom","start","end") |
6 | 6 |
binned.data.names <- c(coordinate.names,"reads") |
7 | 7 |
class.aneufinder.univariate <- "aneufinder.univariate.hmm" |
8 |
-state.colors <- c("mapped"="gray68","nullsomy"="gray20","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black") |
|
8 |
+state.colors <- c("mapped"="gray68","nullsomy"="gray20","null-mixed"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black") |
|
9 |
+get.state.labels <- function() { return(state.labels) } |
|
10 |
+get.state.colors <- function() { return(state.colors[state.labels]) } |
|
9 | 11 |
|
10 | 12 |
# ============================================================================ |
11 | 13 |
# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob) |
... | ... |
@@ -70,9 +70,12 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
70 | 70 |
x <- 0:rightxlim |
71 | 71 |
distributions <- list(x) |
72 | 72 |
|
73 |
- # unmappable |
|
73 |
+ # zero-inflation |
|
74 | 74 |
distributions[[length(distributions)+1]] <- c(weights[1],rep(0,length(x)-1)) |
75 |
- for (istate in 2:numstates) { |
|
75 |
+ # geometric |
|
76 |
+ distributions[[length(distributions)+1]] <- weights[2] * dgeom(x, model$distributions[2,'prob']) |
|
77 |
+ # negative binomials |
|
78 |
+ for (istate in 3:numstates) { |
|
76 | 79 |
distributions[[length(distributions)+1]] <- weights[istate] * dnbinom(x, model$distributions[istate,'size'], model$distributions[istate,'prob']) |
77 | 80 |
} |
78 | 81 |
distributions <- as.data.frame(distributions) |
... | ... |
@@ -195,3 +198,85 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') { |
195 | 198 |
} |
196 | 199 |
d <- dev.off() |
197 | 200 |
} |
201 |
+ |
|
202 |
+ |
|
203 |
+ |
|
204 |
+# ------------------------------------------------------------ |
|
205 |
+# Plot overview |
|
206 |
+# ------------------------------------------------------------ |
|
207 |
+plot.genome.overview <- function(modellist, file='aneufinder_genome_overview', numCPU=1) { |
|
208 |
+ |
|
209 |
+ ## Function definitions |
|
210 |
+ reformat <- function(x) { |
|
211 |
+ out_list <- list() |
|
212 |
+ |
|
213 |
+ for ( i in 2:length(x) ) { |
|
214 |
+ out_list[[i]] <- c(x[i-1], x[i]) |
|
215 |
+ } |
|
216 |
+ mt <- do.call("rbind",out_list) |
|
217 |
+ df <- data.frame(mt) |
|
218 |
+ colnames(df) <- c("start", "end") |
|
219 |
+ df |
|
220 |
+ } |
|
221 |
+ |
|
222 |
+ ## Load and transform to GRanges |
|
223 |
+ cat('transforming to GRanges\n') |
|
224 |
+ uni.hmm.grl <- hmmList2GRangesList(modellist, reduce=FALSE) |
|
225 |
+ |
|
226 |
+ ## Setup page |
|
227 |
+ library(grid) |
|
228 |
+ library(ggplot2) |
|
229 |
+ nrows <- length(uni.hmm.grl) # rows for plotting genomes |
|
230 |
+ ncols <- 1 |
|
231 |
+ png(file=paste0(file, '.png'), width=ncols*60, height=nrows*5, units='cm', res=150) |
|
232 |
+ grid.newpage() |
|
233 |
+ layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T) |
|
234 |
+ pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,rep(10,length(uni.hmm.grl)))))) |
|
235 |
+ # Main title |
|
236 |
+ grid.text(file, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26)) |
|
237 |
+ |
|
238 |
+ ## Prepare some variables for plotting |
|
239 |
+ gr <- uni.hmm.grl[[1]] |
|
240 |
+ len <- seqlengths(gr) |
|
241 |
+ chr.names <- levels(seqnames(gr)) |
|
242 |
+ len <- as.numeric(len) |
|
243 |
+ len <- c(0,len) |
|
244 |
+ len <- cumsum(len) |
|
245 |
+ df <- reformat(len) |
|
246 |
+ df$col <- rep(c("grey47","grey77"), 12) |
|
247 |
+ df$breaks <- df[,1] + ((df[,2]-df[,1])/2) |
|
248 |
+ df$chr.names <- chr.names |
|
249 |
+ |
|
250 |
+ my_theme <- theme( |
|
251 |
+ legend.position="none", |
|
252 |
+ panel.background=element_blank(), |
|
253 |
+ panel.border=element_blank(), |
|
254 |
+ panel.grid.major=element_blank(), |
|
255 |
+ panel.grid.minor=element_blank(), |
|
256 |
+ plot.background=element_blank() |
|
257 |
+ ) |
|
258 |
+ |
|
259 |
+ ## Go through models and plot |
|
260 |
+ for (i1 in 1:length(uni.hmm.grl)) { |
|
261 |
+ cat('plotting model',i1,'\n') |
|
262 |
+ # Get the i,j matrix positions of the regions that contain this subplot |
|
263 |
+ matchidx <- as.data.frame(which(layout == i1+ncols, arr.ind = TRUE)) |
|
264 |
+ |
|
265 |
+ trans_gr <- biovizBase::transformToGenome(uni.hmm.grl[[i1]], space.skip = 0) |
|
266 |
+ |
|
267 |
+ dfplot <- as.data.frame(trans_gr) |
|
268 |
+ dfplot$reads <- stats::runmed(uni.hmm.grl[[i1]]$reads, 15) |
|
269 |
+ |
|
270 |
+ ggplt <- ggplot(dfplot, aes(x=.start, y=reads)) |
|
271 |
+ ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads, col=state), size=0.2) |
|
272 |
+ ggplt <- ggplt + geom_rect(data=df, aes(xmin=start, xmax=end, ymin=0, ymax=Inf, fill = col), alpha=I(0.3), inherit.aes = F) |
|
273 |
+ ggplt <- ggplt + scale_x_continuous(breaks = df$breaks, labels=df$chr.names, expand = c(0,0)) + scale_y_continuous(expand=c(0,5)) + scale_fill_manual(values = c("grey47","grey77")) + scale_color_manual(values=state.colors, drop=F) + xlab("chromosomes") + my_theme |
|
274 |
+ suppressWarnings( |
|
275 |
+ print(ggplt + ylim(0,30), vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col)) |
|
276 |
+ ) |
|
277 |
+ |
|
278 |
+ |
|
279 |
+ } |
|
280 |
+ d <- dev.off() |
|
281 |
+} |
|
282 |
+ |
... | ... |
@@ -1,5 +1,6 @@ |
1 | 1 |
#include "utility.h" |
2 | 2 |
#include "scalehmm.h" |
3 |
+#include "loghmm.h" |
|
3 | 4 |
#include <omp.h> // parallelization options |
4 | 5 |
|
5 | 6 |
// =================================================================================================================================================== |
... | ... |
@@ -13,7 +14,7 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do |
13 | 14 |
// FILE* pFile = fopen("chromStar.log", "w"); |
14 | 15 |
// Output2FILE::Stream() = pFile; |
15 | 16 |
FILELog::ReportingLevel() = FILELog::FromString("ERROR"); |
16 |
-// FILELog::ReportingLevel() = FILELog::FromString("DEBUG1"); |
|
17 |
+// FILELog::ReportingLevel() = FILELog::FromString("DEBUG2"); |
|
17 | 18 |
|
18 | 19 |
// Parallelization settings |
19 | 20 |
omp_set_num_threads(*num_threads); |
... | ... |
@@ -53,6 +54,7 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do |
53 | 54 |
// Create the HMM |
54 | 55 |
FILE_LOG(logDEBUG1) << "Creating a univariate HMM"; |
55 | 56 |
ScaleHMM* hmm = new ScaleHMM(*T, *N); |
57 |
+// LogHMM* hmm = new LogHMM(*T, *N); |
|
56 | 58 |
hmm->set_cutoff(*read_cutoff); |
57 | 59 |
// Initialize the transition probabilities and proba |
58 | 60 |
hmm->initialize_transition_probs(initial_A, *use_initial_params); |
... | ... |
@@ -99,16 +101,22 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do |
99 | 101 |
|
100 | 102 |
} |
101 | 103 |
|
102 |
- if (i_state >= 1) |
|
104 |
+ if (i_state == 0) |
|
103 | 105 |
{ |
104 |
- FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state; |
|
105 |
- NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM() |
|
106 |
+ FILE_LOG(logDEBUG1) << "Using only zeros for state " << i_state; |
|
107 |
+ ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM() |
|
106 | 108 |
hmm->densityFunctions.push_back(d); |
107 | 109 |
} |
108 |
- else if (i_state == 0) |
|
110 |
+ else if (i_state == 1) |
|
109 | 111 |
{ |
110 |
- FILE_LOG(logDEBUG1) << "Using only zeros for state " << i_state; |
|
111 |
- ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM() |
|
112 |
+ FILE_LOG(logDEBUG1) << "Using geometric distribution for state " << i_state; |
|
113 |
+ Geometric *d = new Geometric(O, *T, 0.9); // delete is done inside ~ScaleHMM() |
|
114 |
+ hmm->densityFunctions.push_back(d); |
|
115 |
+ } |
|
116 |
+ else if (i_state >= 2) |
|
117 |
+ { |
|
118 |
+ FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state; |
|
119 |
+ NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM() |
|
112 | 120 |
hmm->densityFunctions.push_back(d); |
113 | 121 |
} |
114 | 122 |
else |
... | ... |
@@ -181,6 +189,12 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do |
181 | 189 |
size[i] = d->get_size(); |
182 | 190 |
prob[i] = d->get_prob(); |
183 | 191 |
} |
192 |
+ else if (hmm->densityFunctions[i]->get_name() == GEOMETRIC) |
|
193 |
+ { |
|
194 |
+ Geometric* d = (Geometric*)(hmm->densityFunctions[i]); |
|
195 |
+ size[i] = 0; |
|
196 |
+ prob[i] = d->get_prob(); |
|
197 |
+ } |
|
184 | 198 |
else if (hmm->densityFunctions[i]->get_name() == ZERO_INFLATION) |
185 | 199 |
{ |
186 | 200 |
// These values for a Negative Binomial define a zero-inflation (delta distribution) |
... | ... |
@@ -222,7 +222,7 @@ void NegativeBinomial::calc_densities(double* dens) |
222 | 222 |
} |
223 | 223 |
} |
224 | 224 |
|
225 |
-void NegativeBinomial::update(double* weight) |
|
225 |
+void NegativeBinomial::update(double* weights) |
|
226 | 226 |
{ |
227 | 227 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
228 | 228 |
double eps = 1e-4, kmax; |
... | ... |
@@ -233,8 +233,8 @@ void NegativeBinomial::update(double* weight) |
233 | 233 |
// time = clock(); |
234 | 234 |
for (int t=0; t<this->T; t++) |
235 | 235 |
{ |
236 |
- numerator+=weight[t]*this->size; |
|
237 |
- denominator+=weight[t]*(this->size+this->obs[t]); |
|
236 |
+ numerator+=weights[t]*this->size; |
|
237 |
+ denominator+=weights[t]*(this->size+this->obs[t]); |
|
238 | 238 |
} |
239 | 239 |
this->prob = numerator/denominator; // Update of size (r) is now done with updated prob |
240 | 240 |
double logp = log(this->prob); |
... | ... |
@@ -265,13 +265,13 @@ void NegativeBinomial::update(double* weight) |
265 | 265 |
{ |
266 | 266 |
if(this->obs[t]==0) |
267 | 267 |
{ |
268 |
- Fr+=weight[t]*logp; |
|
268 |
+ Fr+=weights[t]*logp; |
|
269 | 269 |
//dFrdr+=0; |
270 | 270 |
} |
271 | 271 |
if(this->obs[t]!=0) |
272 | 272 |
{ |
273 |
- Fr+=weight[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]); |
|
274 |
- dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]); |
|
273 |
+ Fr+=weights[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]); |
|
274 |
+ dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]); |
|
275 | 275 |
} |
276 | 276 |
} |
277 | 277 |
if(fabs(Fr)<eps) |
... | ... |
@@ -297,13 +297,122 @@ void NegativeBinomial::update(double* weight) |
297 | 297 |
DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]); |
298 | 298 |
if(this->obs[t]==0) |
299 | 299 |
{ |
300 |
- Fr+=weight[t]*logp; |
|
300 |
+ Fr+=weights[t]*logp; |
|
301 | 301 |
//dFrdr+=0; |
302 | 302 |
} |
303 | 303 |
if(this->obs[t]!=0) |
304 | 304 |
{ |
305 |
- Fr+=weight[t]*(logp-DigammaR+DigammaRplusX); |
|
306 |
- dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX); |
|
305 |
+ Fr+=weights[t]*(logp-DigammaR+DigammaRplusX); |
|
306 |
+ dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX); |
|
307 |
+ } |
|
308 |
+ } |
|
309 |
+ if(fabs(Fr)<eps) |
|
310 |
+ { |
|
311 |
+ break; |
|
312 |
+ } |
|
313 |
+ if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr; |
|
314 |
+ if(Fr/dFrdr>rhere) rhere=rhere/2.0; |
|
315 |
+ } |
|
316 |
+ } |
|
317 |
+ this->size = rhere; |
|
318 |
+ FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob; |
|
319 |
+ |
|
320 |
+// dtime = clock() - time; |
|
321 |
+// FILE_LOG(logDEBUG1) << "updateR(): "<<dtime<< " clicks"; |
|
322 |
+ |
|
323 |
+} |
|
324 |
+ |
|
325 |
+void NegativeBinomial::update_constrained(double** weights, int fromState, int toState) |
|
326 |
+{ |
|
327 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
328 |
+ double eps = 1e-4, kmax; |
|
329 |
+ double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR; |
|
330 |
+ // Update prob (p) |
|
331 |
+ numerator=denominator=0.0; |
|
332 |
+// clock_t time, dtime; |
|
333 |
+// time = clock(); |
|
334 |
+ for (int i=0; i<toState-fromState; i++) |
|
335 |
+ { |
|
336 |
+ for (int t=0; t<this->T; t++) |
|
337 |
+ { |
|
338 |
+ numerator+=weights[i+fromState][t]*this->size*(i+1); |
|
339 |
+ denominator+=weights[i+fromState][t]*(this->size*(i+1)+this->obs[t]); |
|
340 |
+ } |
|
341 |
+ } |
|
342 |
+ this->prob = numerator/denominator; // Update of size (r) is now done with updated prob |
|
343 |
+ double logp = log(this->prob); |
|
344 |
+// dtime = clock() - time; |
|
345 |
+// FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks"; |
|
346 |
+ // Update of size (r) with Newton Method |
|
347 |
+ rhere = this->size; |
|
348 |
+ dr = 0.00001; |
|
349 |
+ kmax = 20; |
|
350 |
+// time = clock(); |
|
351 |
+ // Select strategy for computing digammas |
|
352 |
+ if (this->max_obs <= this->T) |
|
353 |
+ { |
|
354 |
+ FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
355 |
+ double DigammaRplusX[this->max_obs+1], DigammaRplusDRplusX[this->max_obs+1]; |
|
356 |
+ for (int k=1; k<kmax; k++) |
|
357 |
+ { |
|
358 |
+ Fr=dFrdr=0.0; |
|
359 |
+ for (int i=0; i<toState-fromState; i++) |
|
360 |
+ { |
|
361 |
+ DigammaR = digamma((i+1)*rhere); // boost::math::digamma<>(rhere); |
|
362 |
+ DigammaRplusDR = digamma((i+1)*(rhere + dr)); // boost::math::digamma<>(rhere+dr); |
|
363 |
+ // Precompute the digammas by iterating over all possible values of the observation vector |
|
364 |
+ for (int j=0; j<=this->max_obs; j++) |
|
365 |
+ { |
|
366 |
+ DigammaRplusX[j] = digamma((i+1)*rhere+j); |
|
367 |
+ DigammaRplusDRplusX[j] = digamma((i+1)*(rhere+dr)+j); |
|
368 |
+ } |
|
369 |
+ for(int t=0; t<this->T; t++) |
|
370 |
+ { |
|
371 |
+ if(this->obs[t]==0) |
|
372 |
+ { |
|
373 |
+ Fr+=weights[i+fromState][t]*logp; |
|
374 |
+ //dFrdr+=0; |
|
375 |
+ } |
|
376 |
+ if(this->obs[t]!=0) |
|
377 |
+ { |
|
378 |
+ Fr+=weights[i+fromState][t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]); |
|
379 |
+ dFrdr+=weights[i+fromState][t]/((i+1)*dr)*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]); |
|
380 |
+ } |
|
381 |
+ } |
|
382 |
+ if(fabs(Fr)<eps) |
|
383 |
+ { |
|
384 |
+ break; |
|
385 |
+ } |
|
386 |
+ if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr; |
|
387 |
+ if(Fr/dFrdr>rhere) rhere=rhere/2.0; |
|
388 |
+ } |
|
389 |
+ } |
|
390 |
+ } |
|
391 |
+ else |
|
392 |
+ { |
|
393 |
+ FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T"; |
|
394 |
+ double DigammaRplusX, DigammaRplusDRplusX; |
|
395 |
+ for (int k=1; k<kmax; k++) |
|
396 |
+ { |
|
397 |
+ Fr = dFrdr = 0.0; |
|
398 |
+ for (int i=0; i<toState-fromState; i++) |
|
399 |
+ { |
|
400 |
+ DigammaR = digamma((i+1)*rhere); // boost::math::digamma<>(rhere); |
|
401 |
+ DigammaRplusDR = digamma((i+1)*(rhere + dr)); // boost::math::digamma<>(rhere+dr); |
|
402 |
+ for(int t=0; t<this->T; t++) |
|
403 |
+ { |
|
404 |
+ DigammaRplusX = digamma((i+1)*rhere+this->obs[t]); //boost::math::digamma<>(rhere+this->obs[ti]); |
|
405 |
+ DigammaRplusDRplusX = digamma((i+1)*(rhere+dr)+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]); |
|
406 |
+ if(this->obs[t]==0) |
|
407 |
+ { |
|
408 |
+ Fr+=weights[i+fromState][t]*logp; |
|
409 |
+ //dFrdr+=0; |
|
410 |
+ } |
|
411 |
+ if(this->obs[t]!=0) |
|
412 |
+ { |
|
413 |
+ Fr+=weights[i+fromState][t]*(logp-DigammaR+DigammaRplusX); |
|
414 |
+ dFrdr+=weights[i+fromState][t]/((i+1)*dr)*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX); |
|
415 |
+ } |
|
307 | 416 |
} |
308 | 417 |
} |
309 | 418 |
if(fabs(Fr)<eps) |
... | ... |
@@ -437,7 +546,7 @@ void ZeroInflation::calc_logdensities(double* logdensity) |
437 | 546 |
if(this->obs[t]==0) |
438 | 547 |
{ |
439 | 548 |
logdensity[t] = 0.0; |
440 |
- }; |
|
549 |
+ } |
|
441 | 550 |
if(this->obs[t]>0) |
442 | 551 |
{ |
443 | 552 |
logdensity[t] = -100; // -INFINITY gives nan's somewhere downstream |
... | ... |
@@ -481,3 +590,186 @@ void ZeroInflation::set_variance(double variance) |
481 | 590 |
} |
482 | 591 |
|
483 | 592 |
|
593 |
+// ============================================================ |
|
594 |
+// Geometric density |
|
595 |
+// ============================================================ |
|
596 |
+ |
|
597 |
+// Constructor and Destructor --------------------------------- |
|
598 |
+Geometric::Geometric(int* observations, int T, double prob) |
|
599 |
+{ |
|
600 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
601 |
+ this->obs = observations; |
|
602 |
+ this->T = T; |
|
603 |
+ this->prob = prob; |
|
604 |
+ if (this->obs != NULL) |
|
605 |
+ { |
|
606 |
+ this->max_obs = intMax(observations, T); |
|
607 |
+ } |
|
608 |
+} |
|
609 |
+ |
|
610 |
+Geometric::~Geometric() |
|
611 |
+{ |
|
612 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
613 |
+} |
|
614 |
+ |
|
615 |
+// Methods ---------------------------------------------------- |
|
616 |
+void Geometric::calc_logdensities(double* logdens) |
|
617 |
+{ |
|
618 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
619 |
+ double logp = log(this->prob); |
|
620 |
+ double log1minusp = log(1-this->prob); |
|
621 |
+ // Select strategy for computing logdensities |
|
622 |
+ if (this->max_obs <= this->T) |
|
623 |
+ { |
|
624 |
+ FILE_LOG(logDEBUG2) << "Precomputing logdensities in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
625 |
+ double logdens_per_read [this->max_obs+1]; |
|
626 |
+ for (int j=0; j<=this->max_obs; j++) |
|
627 |
+ { |
|
628 |
+ logdens_per_read[j] = logp + j * log1minusp; |
|
629 |
+ } |
|
630 |
+ for (int t=0; t<this->T; t++) |
|
631 |
+ { |
|
632 |
+ logdens[t] = logdens_per_read[(int) this->obs[t]]; |
|
633 |
+ FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t]; |
|
634 |
+ if (isnan(logdens[t])) |
|
635 |
+ { |
|
636 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
637 |
+ FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t]; |
|
638 |
+ throw nan_detected; |
|
639 |
+ } |
|
640 |
+ } |
|
641 |
+ } |
|
642 |
+ else |
|
643 |
+ { |
|
644 |
+ FILE_LOG(logDEBUG2) << "Computing logdensities in " << __func__ << " for every t, because max(O)>T"; |
|
645 |
+ for (int t=0; t<this->T; t++) |
|
646 |
+ { |
|
647 |
+ logdens[t] = logp + this->obs[t] * log1minusp; |
|
648 |
+ FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t]; |
|
649 |
+ if (isnan(logdens[t])) |
|
650 |
+ { |
|
651 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
652 |
+ FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t]; |
|
653 |
+ throw nan_detected; |
|
654 |
+ } |
|
655 |
+ } |
|
656 |
+ } |
|
657 |
+} |
|
658 |
+ |
|
659 |
+void Geometric::calc_densities(double* dens) |
|
660 |
+{ |
|
661 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
662 |
+ double p = this->prob; |
|
663 |
+ double oneminusp = 1-this->prob; |
|
664 |
+ // Select strategy for computing gammas |
|
665 |
+ if (this->max_obs <= this->T) |
|
666 |
+ { |
|
667 |
+ FILE_LOG(logDEBUG2) << "Precomputing densities in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
668 |
+ double dens_per_read [this->max_obs+1]; |
|
669 |
+ for (int j=0; j<=this->max_obs; j++) |
|
670 |
+ { |
|
671 |
+ dens_per_read[j] = p * pow(oneminusp,j); |
|
672 |
+ } |
|
673 |
+ for (int t=0; t<this->T; t++) |
|
674 |
+ { |
|
675 |
+ dens[t] = dens_per_read[(int) this->obs[t]]; |
|
676 |
+ FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t]; |
|
677 |
+ if (isnan(dens[t])) |
|
678 |
+ { |
|
679 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
680 |
+ FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t]; |
|
681 |
+ throw nan_detected; |
|
682 |
+ } |
|
683 |
+ } |
|
684 |
+ } |
|
685 |
+ else |
|
686 |
+ { |
|
687 |
+ FILE_LOG(logDEBUG2) << "Computing densities in " << __func__ << " for every t, because max(O)>T"; |
|
688 |
+ for (int t=0; t<this->T; t++) |
|
689 |
+ { |
|
690 |
+ dens[t] = p * pow(oneminusp,this->obs[t]); |
|
691 |
+ FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t]; |
|
692 |
+ if (isnan(dens[t])) |
|
693 |
+ { |
|
694 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
695 |
+ FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t]; |
|
696 |
+ throw nan_detected; |
|
697 |
+ } |
|
698 |
+ } |
|
699 |
+ } |
|
700 |
+} |
|
701 |
+ |
|
702 |
+void Geometric::update(double* weight) |
|
703 |
+{ |
|
704 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
705 |
+ double numerator, denominator; |
|
706 |
+ // Update prob (p) |
|
707 |
+ numerator=denominator=0.0; |
|
708 |
+ for (int t=0; t<this->T; t++) |
|
709 |
+ { |
|
710 |
+ numerator+=weight[t]; |
|
711 |
+ denominator+=weight[t]*(1+this->obs[t]); |
|
712 |
+ } |
|
713 |
+ this->prob = numerator/denominator; |
|
714 |
+ FILE_LOG(logDEBUG1) << "p = "<<this->prob; |
|
715 |
+} |
|
716 |
+ |
|
717 |
+double Geometric::fprob(double mean, double variance) |
|
718 |
+{ |
|
719 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
720 |
+ return( mean / variance ); |
|
721 |
+} |
|
722 |
+ |
|
723 |
+double Geometric::fmean(double prob) |
|
724 |
+{ |
|
725 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
726 |
+ return( (1-prob) / prob ); |
|
727 |
+} |
|
728 |
+ |
|
729 |
+double Geometric::fvariance(double prob) |
|
730 |
+{ |
|
731 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
732 |
+ return( (1-prob) / (prob*prob) ); |
|
733 |
+} |
|
734 |
+ |
|
735 |
+// Getter and Setter ------------------------------------------ |
|
736 |
+double Geometric::get_mean() |
|
737 |
+{ |
|
738 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
739 |
+ return( this->fmean( this->prob ) ); |
|
740 |
+} |
|
741 |
+ |
|
742 |
+void Geometric::set_mean(double mean) |
|
743 |
+{ |
|
744 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
745 |
+ double variance = this->get_variance(); |
|
746 |
+ this->prob = this->fprob( mean, variance ); |
|
747 |
+} |
|
748 |
+ |
|
749 |
+double Geometric::get_variance() |
|
750 |
+{ |
|
751 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
752 |
+ return( this->fvariance( this->prob ) ); |
|
753 |
+} |
|
754 |
+ |
|
755 |
+void Geometric::set_variance(double variance) |
|
756 |
+{ |
|
757 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
758 |
+ double mean = this->get_mean(); |
|
759 |
+ this->prob = this->fprob( mean, variance ); |
|
760 |
+} |
|
761 |
+ |
|
762 |
+DensityName Geometric::get_name() |
|
763 |
+{ |
|
764 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
765 |
+ return(GEOMETRIC); |
|
766 |
+} |
|
767 |
+ |
|
768 |
+double Geometric::get_prob() |
|
769 |
+{ |
|
770 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
771 |
+ return(this->prob); |
|
772 |
+} |
|
773 |
+ |
|
774 |
+ |
|
775 |
+ |
... | ... |
@@ -4,7 +4,7 @@ |
4 | 4 |
#ifndef DENSITIES_H |
5 | 5 |
#define DENSITIES_H |
6 | 6 |
|
7 |
-enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL}; |
|
7 |
+enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL, GEOMETRIC}; |
|
8 | 8 |
|
9 | 9 |
class Density |
10 | 10 |
{ |
... | ... |
@@ -15,6 +15,7 @@ class Density |
15 | 15 |
virtual void calc_logdensities(double* logdensity) {}; |
16 | 16 |
virtual void calc_densities(double* density) {}; |
17 | 17 |
virtual void update(double* weight) {}; |
18 |
+ virtual void update_constrained(double** weights, int fromState, int toState) {}; |
|
18 | 19 |
// Getter and Setter |
19 | 20 |
virtual DensityName get_name() {}; |
20 | 21 |
virtual double get_mean() {}; |
... | ... |
@@ -59,7 +60,7 @@ class NegativeBinomial : public Density |
59 | 60 |
{ |
60 | 61 |
public: |
61 | 62 |
// Constructor and Destructor |
62 |
- NegativeBinomial(int* observations, int T, double mean, double variance); |
|
63 |
+ NegativeBinomial(int* observations, int T, double size, double prob); |
|
63 | 64 |
~NegativeBinomial(); |
64 | 65 |
|
65 | 66 |
// Methods |
... | ... |
@@ -67,6 +68,7 @@ class NegativeBinomial : public Density |
67 | 68 |
void calc_densities(double* density); |
68 | 69 |
void calc_logdensities(double* logdensity); |
69 | 70 |
void update(double* weights); |
71 |
+ void update_constrained(double** weights, int fromState, int toState); |
|
70 | 72 |
double fsize(double mean, double variance); |
71 | 73 |
double fprob(double mean, double variance); |
72 | 74 |
double fmean(double size, double prob); |
... | ... |
@@ -119,4 +121,39 @@ class ZeroInflation : public Density |
119 | 121 |
}; |
120 | 122 |
|
121 | 123 |
|
124 |
+class Geometric : public Density |
|
125 |
+{ |
|
126 |
+ public: |
|
127 |
+ // Constructor and Destructor |
|
128 |
+ Geometric(int* observations, int T, double prob); |
|
129 |
+ ~Geometric(); |
|
130 |
+ |
|
131 |
+ // Methods |
|
132 |
+ DensityName get_name(); |
|
133 |
+ void calc_densities(double* density); |
|
134 |
+ void calc_logdensities(double* logdensity); |
|
135 |
+ void update(double* weights); |
|
136 |
+ double fprob(double mean, double variance); |
|
137 |
+ double fmean(double prob); |
|
138 |
+ double fvariance(double prob); |
|
139 |
+ |
|
140 |
+ // Getter and Setter |
|
141 |
+ double get_mean(); |
|
142 |
+ void set_mean(double mean); |
|
143 |
+ double get_variance(); |
|
144 |
+ void set_variance(double variance); |
|
145 |
+ double get_prob(); |
|
146 |
+ |
|
147 |
+ private: |
|
148 |
+ // Member variables |
|
149 |
+ int T; ///< length of observation vector |
|
150 |
+ int* obs; ///< vector [T] of observations |
|
151 |
+ int max_obs; ///< maximum observation |
|
152 |
+ double prob; ///< probability parameter of the geometric distribution |
|
153 |
+ double mean; ///< mean of the geometric distribution |
|
154 |
+ double variance; ///< variance of the geometric distribution |
|
155 |
+ |
|
156 |
+}; |
|
157 |
+ |
|
158 |
+ |
|
122 | 159 |
#endif |
123 | 160 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,723 @@ |
1 |
+#include "loghmm.h" |
|
2 |
+ |
|
3 |
+// ============================================================ |
|
4 |
+// Hidden Markov Model implemented with scaling strategy |
|
5 |
+// ============================================================ |
|
6 |
+ |
|
7 |
+// Public ===================================================== |
|
8 |
+ |
|
9 |
+// Constructor and Destructor --------------------------------- |
|
10 |
+LogHMM::LogHMM(int T, int N) |
|
11 |
+{ |
|
12 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
13 |
+ this->T = T; |
|
14 |
+ this->N = N; |
|
15 |
+ this->A = allocDoubleMatrix(N, N); |
|
16 |
+ this->logA = allocDoubleMatrix(N, N); |
|
17 |
+ this->logalpha = allocDoubleMatrix(T, N); |
|
18 |
+ this->logbeta = allocDoubleMatrix(T, N); |
|
19 |
+ this->logdensities = allocDoubleMatrix(N, T); |
|
20 |
+// this->tdensities = allocDoubleMatrix(T, N); |
|
21 |
+ this->proba = (double*) calloc(N, sizeof(double)); |
|
22 |
+ this->logproba = (double*) calloc(N, sizeof(double)); |
|
23 |
+ this->gamma = allocDoubleMatrix(N, T); |
|
24 |
+ this->sumgamma = (double*) calloc(N, sizeof(double)); |
|
25 |
+ this->sumxi = allocDoubleMatrix(N, N); |
|
26 |
+ this->logP = -INFINITY; |
|
27 |
+ this->dlogP = INFINITY; |
|
28 |
+ this->sumdiff_state_last = 0; |
|
29 |
+ this->sumdiff_posterior = 0.0; |
|
30 |
+// this->use_tdens = false; |
|
31 |
+ |
|
32 |
+} |
|
33 |
+ |
|
34 |
+LogHMM::~LogHMM() |
|
35 |
+{ |
|
36 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
37 |
+ freeDoubleMatrix(this->A, this->N); |
|
38 |
+ freeDoubleMatrix(this->logA, this->N); |
|
39 |
+ freeDoubleMatrix(this->logalpha, this->T); |
|
40 |
+ freeDoubleMatrix(this->logbeta, this->T); |
|
41 |
+ freeDoubleMatrix(this->logdensities, this->N); |
|
42 |
+// freeDoubleMatrix(this->tdensities, this->T); |
|
43 |
+ freeDoubleMatrix(this->gamma, this->N); |
|
44 |
+ freeDoubleMatrix(this->sumxi, this->N); |
|
45 |
+ free(this->proba); |
|
46 |
+ free(this->logproba); |
|
47 |
+ free(this->sumgamma); |
|
48 |
+ for (int iN=0; iN<this->N; iN++) |
|
49 |
+ { |
|
50 |
+ FILE_LOG(logDEBUG1) << "Deleting density functions"; |
|
51 |
+ delete this->densityFunctions[iN]; |
|
52 |
+ } |
|
53 |
+} |
|
54 |
+ |
|
55 |
+// Methods ---------------------------------------------------- |
|
56 |
+void LogHMM::initialize_transition_probs(double* initial_A, bool use_initial_params) |
|
57 |
+{ |
|
58 |
+ |
|
59 |
+ if (use_initial_params) |
|
60 |
+ { |
|
61 |
+ for (int iN=0; iN<this->N; iN++) |
|
62 |
+ { |
|
63 |
+ for (int jN=0; jN<this->N; jN++) |
|
64 |
+ { |
|
65 |
+ // convert from vector to matrix representation |
|
66 |
+ this->A[jN][iN] = initial_A[iN*this->N + jN]; |
|
67 |
+ this->logA[jN][iN] = log(this->A[jN][iN]); |
|
68 |
+ } |
|
69 |
+ } |
|
70 |
+ } |
|
71 |
+ else |
|
72 |
+ { |
|
73 |
+ double self = 0.9; |
|
74 |
+// self = 1.0 / this->N; // set to uniform |
|
75 |
+ double other = (1.0 - self) / (this->N - 1.0); |
|
76 |
+ for (int iN=0; iN<this->N; iN++) |
|
77 |
+ { |
|
78 |
+ for (int jN=0; jN<this->N; jN++) |
|
79 |
+ { |
|
80 |
+ if (iN == jN) |
|
81 |
+ { |
|
82 |
+ this->A[iN][jN] = self; |
|
83 |
+ this->logA[iN][jN] = log(this->A[iN][jN]); |
|
84 |
+ } |
|
85 |
+ else |
|
86 |
+ { |
|
87 |
+ this->A[iN][jN] = other; |
|
88 |
+ this->logA[iN][jN] = log(this->A[iN][jN]); |
|
89 |
+ } |
|
90 |
+ // Save value to initial A |
|
91 |
+ initial_A[iN*this->N + jN] = this->A[iN][jN]; |
|
92 |
+ } |
|
93 |
+ } |
|
94 |
+ } |
|
95 |
+ |
|
96 |
+} |
|
97 |
+ |
|
98 |
+void LogHMM::initialize_proba(double* initial_proba, bool use_initial_params) |
|
99 |
+{ |
|
100 |
+ |
|
101 |
+ if (use_initial_params) |
|
102 |
+ { |
|
103 |
+ for (int iN=0; iN<this->N; iN++) |
|
104 |
+ { |
|
105 |
+ this->proba[iN] = initial_proba[iN]; |
|
106 |
+ this->proba[iN] = log(this->proba[iN]); |
|
107 |
+ } |
|
108 |
+ } |
|
109 |
+ else |
|
110 |
+ { |
|
111 |
+ for (int iN=0; iN<this->N; iN++) |
|
112 |
+ { |
|
113 |
+ this->proba[iN] = (double)1/this->N; |
|
114 |
+ this->proba[iN] = log(this->proba[iN]); |
|
115 |
+ // Save value to initial proba |
|
116 |
+ initial_proba[iN] = this->proba[iN]; |
|
117 |
+ } |
|
118 |
+ } |
|
119 |
+ |
|
120 |
+} |
|
121 |
+ |
|
122 |
+void LogHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
|
123 |
+{ |
|
124 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
125 |
+ |
|
126 |
+ double logPold = -INFINITY; |
|
127 |
+ double logPnew; |
|
128 |
+ double** gammaold = allocDoubleMatrix(this->N, this->T); |
|
129 |
+ |
|
130 |
+ // Parallelization settings |
|
131 |
+// omp_set_nested(1); |
|
132 |
+ |
|
133 |
+ // measuring the time |
|
134 |
+ this->baumWelchStartTime_sec = time(NULL); |
|
135 |
+ |
|
136 |
+ // Print some initial information |
|
137 |
+ FILE_LOG(logINFO) << ""; |
|
138 |
+ FILE_LOG(logINFO) << "INITIAL PARAMETERS"; |
|
139 |
+// this->print_uni_params(); |
|
140 |
+ this->print_uni_iteration(0); |
|
141 |
+ |
|
142 |
+ R_CheckUserInterrupt(); |
|
143 |
+ |
|
144 |
+ // Do the Baum-Welch |
|
145 |
+ int iteration = 0; |
|
146 |
+ while (((this->baumWelchTime_real < *maxtime) or (*maxtime < 0)) and ((iteration < *maxiter) or (*maxiter < 0))) |
|
147 |
+ { |
|
148 |
+ |
|
149 |
+ iteration++; |
|
150 |
+ |
|
151 |
+ FILE_LOG(logDEBUG1) << "Calling calc_densities() from baumWelch()"; |
|
152 |
+ try { this->calc_densities(); } catch(...) { throw; } |
|
153 |
+ R_CheckUserInterrupt(); |
|
154 |
+ |
|
155 |
+ FILE_LOG(logDEBUG1) << "Calling forward() from baumWelch()"; |
|
156 |
+ try { this->forward(); } catch(...) { throw; } |
|
157 |
+ R_CheckUserInterrupt(); |
|
158 |
+ |
|
159 |
+ FILE_LOG(logDEBUG1) << "Calling backward() from baumWelch()"; |
|
160 |
+ try { this->backward(); } catch(...) { throw; } |
|
161 |
+ R_CheckUserInterrupt(); |
|
162 |
+ |
|
163 |
+ FILE_LOG(logDEBUG1) << "Calling calc_loglikelihood() from baumWelch()"; |
|
164 |
+ this->calc_loglikelihood(); |
|
165 |
+ logPnew = this->logP; |
|
166 |
+ if(isnan(logPnew)) |
|
167 |
+ { |
|
168 |
+ FILE_LOG(logERROR) << "logPnew = " << logPnew; |
|
169 |
+ throw nan_detected; |
|
170 |
+ } |
|
171 |
+ this->dlogP = logPnew - logPold; |
|
172 |
+ |
|
173 |
+ FILE_LOG(logDEBUG1) << "Calling calc_sumxi() from baumWelch()"; |
|
174 |
+ this->calc_sumxi(); |
|
175 |
+ R_CheckUserInterrupt(); |
|
176 |
+ |
|
177 |
+ FILE_LOG(logDEBUG1) << "Calling calc_sumgamma() from baumWelch()"; |
|
178 |
+ this->calc_sumgamma(); |
|
179 |
+ R_CheckUserInterrupt(); |
|
180 |
+ |
|
181 |
+// clock_t clocktime = clock(), dtime; |
|
182 |
+ // difference in posterior |
|
183 |
+ FILE_LOG(logDEBUG1) << "Calculating differences in posterior in baumWelch()"; |
|
184 |
+ double postsum = 0.0; |
|
185 |
+ for (int t=0; t<this->T; t++) |
|
186 |
+ { |
|
187 |
+ for (int iN=0; iN<this->N; iN++) |
|
188 |
+ { |
|
189 |
+ postsum += fabs(this->gamma[iN][t] - gammaold[iN][t]); |
|
190 |
+ gammaold[iN][t] = this->gamma[iN][t]; |
|
191 |
+ } |
|
192 |
+ } |
|
193 |
+ this->sumdiff_posterior = postsum; |
|
194 |
+// dtime = clock() - clocktime; |
|
195 |