... | ... |
@@ -9,7 +9,16 @@ bedGraph2binned, |
9 | 9 |
|
10 | 10 |
find.aneuploidies, |
11 | 11 |
|
12 |
-plot.distribution |
|
12 |
+plot.distribution, |
|
13 |
+ |
|
14 |
+univariate2bed, |
|
15 |
+binned2wiggle, |
|
16 |
+multivariate2bed, |
|
17 |
+binned2GRanges, |
|
18 |
+hmm2GRanges, |
|
19 |
+ |
|
20 |
+mixedsort, |
|
21 |
+mixedorder |
|
13 | 22 |
) |
14 | 23 |
|
15 | 24 |
# Import all packages listed as Imports or Depends |
... | ... |
@@ -51,12 +51,24 @@ check.integer = function(testvar) { |
51 | 51 |
check.univariate.modellist = function(modellist) { |
52 | 52 |
if (!is(modellist,"list")) return(1) |
53 | 53 |
for (model in modellist) { |
54 |
- if (!is(model,"chromStar.univariate.model")) return(2) |
|
54 |
+ if (!is(model,class.chromstar.univariate)) return(2) |
|
55 | 55 |
} |
56 | 56 |
return(0) |
57 | 57 |
} |
58 | 58 |
check.univariate.model = function(model) { |
59 |
- if (!is(model,"chromStar.univariate.model")) return(1) |
|
59 |
+ if (!is(model,class.chromstar.univariate)) return(1) |
|
60 |
+ return(0) |
|
61 |
+} |
|
62 |
+ |
|
63 |
+check.multivariate.modellist = function(modellist) { |
|
64 |
+ if (!is(modellist,"list")) return(1) |
|
65 |
+ for (model in modellist) { |
|
66 |
+ if (!is(model,class.chromstar.multivariate)) return(2) |
|
67 |
+ } |
|
68 |
+ return(0) |
|
69 |
+} |
|
70 |
+check.multivariate.model = function(model) { |
|
71 |
+ if (!is(model,class.chromstar.multivariate)) return(1) |
|
60 | 72 |
return(0) |
61 | 73 |
} |
62 | 74 |
|
... | ... |
@@ -19,7 +19,14 @@ collapse.bins = function(data, column2collapseBy=NULL, columns2sumUp=NULL) { |
19 | 19 |
cShift1 = rep(NA,length(c)) |
20 | 20 |
cShift1[-1] = c[-length(c)] |
21 | 21 |
} |
22 |
- compare = c != cShift1 |
|
22 |
+ compare_custom = c != cShift1 |
|
23 |
+ # Make the comparison vector to separate chromosomes |
|
24 |
+ c <- as.integer(data[,1]) |
|
25 |
+ cShift1 = rep(NA,length(c)) |
|
26 |
+ cShift1[-1] = c[-length(c)] |
|
27 |
+ compare_chrom = c != cShift1 |
|
28 |
+ # Combine the vectors |
|
29 |
+ compare <- compare_custom | compare_chrom |
|
23 | 30 |
compare[1] = TRUE |
24 | 31 |
numcollapsedbins = length(which(compare==TRUE)) |
25 | 32 |
numbins = nrow(data) |
26 | 33 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,51 @@ |
1 |
+binned2GRanges <- function(binned.data) { |
|
2 |
+ |
|
3 |
+ gr <- GenomicRanges::GRanges( |
|
4 |
+ seqnames = Rle(binned.data$chrom), |
|
5 |
+ ranges = IRanges(start=binned.data$start, end=binned.data$end), |
|
6 |
+ strand = Rle(strand("*"), nrow(binned.data)), |
|
7 |
+ reads = binned.data$reads |
|
8 |
+ ) |
|
9 |
+ return(gr) |
|
10 |
+ |
|
11 |
+} |
|
12 |
+ |
|
13 |
+hmm2GRanges <- function(hmm, reduce=TRUE) { |
|
14 |
+ |
|
15 |
+ ### Check user input ### |
|
16 |
+ if (check.multivariate.model(hmm)!=0 & check.univariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate or multivariate hmm object (type ?uni.hmm or ?multi.hmm for help)") |
|
17 |
+ if (check.logical(reduce)!=0) stop("argument 'reduce' expects TRUE or FALSE") |
|
18 |
+ |
|
19 |
+ ### Create GRanges ### |
|
20 |
+ # Transfer coordinates |
|
21 |
+ gr <- GenomicRanges::GRanges( |
|
22 |
+ seqnames = Rle(hmm$coordinates$chrom), |
|
23 |
+ ranges = IRanges(start=hmm$coordinates$start, end=hmm$coordinates$end), |
|
24 |
+ strand = Rle(strand("*"), nrow(hmm$coordinates)) |
|
25 |
+ ) |
|
26 |
+ # Seqlengths gives warnings because our coordinates are 0-based |
|
27 |
+# for (chrom in unique(hmm$coordinates$chrom)) { |
|
28 |
+# seqlengths(gr)[names(seqlengths(gr))==chrom] <- max(hmm$coordinates$end[hmm$coordinates$chrom==chrom]) |
|
29 |
+# } |
|
30 |
+ |
|
31 |
+ if (reduce) { |
|
32 |
+ # Reduce state by state |
|
33 |
+ red.gr.list <- GenomicRanges::GRangesList() |
|
34 |
+ for (state in unique(hmm$states)) { |
|
35 |
+ red.gr <- GenomicRanges::reduce(gr[hmm$states==state]) |
|
36 |
+ mcols(red.gr)$states <- rep(as.factor(state),length(red.gr)) |
|
37 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
38 |
+ } |
|
39 |
+ # Merge and sort |
|
40 |
+ red.gr <- sort(unlist(red.gr.list)) |
|
41 |
+ remove(red.gr.list) |
|
42 |
+ return(red.gr) |
|
43 |
+ } else { |
|
44 |
+ mcols(gr)$reads <- hmm$reads |
|
45 |
+ mcols(gr)$posteriors <- hmm$posteriors |
|
46 |
+ mcols(gr)$states <- hmm$states |
|
47 |
+ return(gr) |
|
48 |
+ } |
|
49 |
+ |
|
50 |
+} |
|
51 |
+ |
0 | 52 |
deleted file mode 100644 |
... | ... |
@@ -1,291 +0,0 @@ |
1 |
-# ---------------------------------------------------------------- |
|
2 |
-# Write univariate states to file |
|
3 |
-# ---------------------------------------------------------------- |
|
4 |
-univariate2bed <- function(modellist, file="view_me_in_genome_browser", threshold=0.5, separate.zeroinflation=FALSE, chrom.length.file=NULL) { |
|
5 |
- |
|
6 |
- # Check user input |
|
7 |
- if (is.null(names(modellist))) { |
|
8 |
- stop("Please name the list entries. The names will be used as track name for the Genome Browser file.") |
|
9 |
- } |
|
10 |
- |
|
11 |
- # Variables |
|
12 |
- nummod <- length(modellist) |
|
13 |
- file <- paste(file,"bed", sep=".") |
|
14 |
- |
|
15 |
- # Generate the colors |
|
16 |
- colors <- c("zeroinflation"="black", "unmodified"="gray48","modified"="orangered3") |
|
17 |
- RGBs <- t(col2rgb(colors)) |
|
18 |
- RGBs <- apply(RGBs,1,paste,collapse=",") |
|
19 |
- |
|
20 |
- # Write first line to file |
|
21 |
- cat("browser hide all\n", file=file) |
|
22 |
- |
|
23 |
- for (imod in 1:nummod) { |
|
24 |
- model <- modellist[[imod]] |
|
25 |
- priority <- 50 + imod |
|
26 |
- cat(paste("track name=",names(modellist)[imod]," description=\"univariate calls for ",names(modellist)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
27 |
- |
|
28 |
- # Collapse the calls |
|
29 |
- if (separate.zeroinflation) { |
|
30 |
- calls <- cbind(model$coordinates, name=model$states.with.zeroinflation) |
|
31 |
- collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
32 |
- } else { |
|
33 |
- calls <- cbind(model$coordinates, name=model$states) |
|
34 |
- collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
35 |
- collapsed.calls <- collapsed.calls[collapsed.calls$name=="modified",] |
|
36 |
- } |
|
37 |
- |
|
38 |
- # Check length of chromosomes if chrom.length.file was given |
|
39 |
- if (!is.null(chrom.length.file)) { |
|
40 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
41 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
42 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
43 |
- for (chrom in levels(as.factor(collapsed_calls$chrom))) { |
|
44 |
- index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom]) |
|
45 |
- collapsed_calls$end[index] <- chrom.lengths[chrom] |
|
46 |
- if (length(index) >= 1) { |
|
47 |
- warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
48 |
- } |
|
49 |
- } |
|
50 |
- } |
|
51 |
- |
|
52 |
- # Append to file |
|
53 |
- itemRgb <- RGBs[as.character(collapsed.calls$name)] |
|
54 |
- numsegments <- nrow(collapsed.calls) |
|
55 |
- df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
|
56 |
- write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
57 |
- } |
|
58 |
- |
|
59 |
-} |
|
60 |
- |
|
61 |
-# ---------------------------------------------------------------- |
|
62 |
-# |
|
63 |
-# ---------------------------------------------------------------- |
|
64 |
-make.ucsc.file.combinatorial <- function(modellist, numstates=NULL, file="combinatorial-states-ucsc", threshold=0.5, chrom.length.file=NULL) { |
|
65 |
- |
|
66 |
- # Define variables |
|
67 |
- nummod <- length(modellist) |
|
68 |
- file <- paste(file,".BED", sep="") |
|
69 |
- coordinates <- modellist[[1]]$coordinates |
|
70 |
- |
|
71 |
- # Write first line to file |
|
72 |
- cat("browser position chr1:1-100000\n", file=file) |
|
73 |
- |
|
74 |
- # Get the combinatorial states |
|
75 |
- cat("Getting the combinatorial states ...\n") |
|
76 |
- combstates <- combinatorial.states(modellist) |
|
77 |
- states <- as.numeric(names(sort(table(combstates), decreasing=TRUE))) |
|
78 |
- |
|
79 |
- # Clean up to reduce memory usage |
|
80 |
- remove(modellist) |
|
81 |
- |
|
82 |
- if (is.null(numstates)) { |
|
83 |
- numstates <- length(states) |
|
84 |
- } else if (numstates > length(states)) { |
|
85 |
- numstates <- length(states) |
|
86 |
- } |
|
87 |
- # Go through each state and write to file |
|
88 |
- for (istate in states[1:numstates]) { |
|
89 |
- cat("\nAnalyzing state",istate,"\n") |
|
90 |
- cat(paste("track name=\"combinatorial state ",istate,"\" description=\"univariate calls for state ",istate,", threshold = ",threshold,"\" visibility=1\n", sep=""), file=file, append=TRUE) |
|
91 |
- |
|
92 |
- calls <- coordinates[ combstates == istate , ] |
|
93 |
- if (length(calls$start) >= 1) { |
|
94 |
- # Collapse the bins |
|
95 |
- cat("collapsing the calls\n") |
|
96 |
- collapsed_calls <- collapse.bins(calls) |
|
97 |
- } else { |
|
98 |
- cat("no bins in state",istate,"\n") |
|
99 |
- collapsed_calls <- NULL |
|
100 |
- } |
|
101 |
- |
|
102 |
- if (!is.null(collapsed_calls)) { |
|
103 |
- # Check length of chromosomes if chrom.length.file was given |
|
104 |
- if (!is.null(chrom.length.file)) { |
|
105 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
106 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
107 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
108 |
- for (chrom in levels(as.factor(collapsed_calls$chrom))) { |
|
109 |
- index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom]) |
|
110 |
- collapsed_calls$end[index] <- chrom.lengths[chrom] |
|
111 |
- if (length(index) >= 1) { |
|
112 |
- warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
113 |
- } |
|
114 |
- } |
|
115 |
- } |
|
116 |
- |
|
117 |
- cat("appending to file ...\n") |
|
118 |
- write.table(format(as.data.frame(collapsed_calls)[,1:3], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
119 |
- } |
|
120 |
- } |
|
121 |
- |
|
122 |
-} |
|
123 |
- |
|
124 |
- |
|
125 |
-# ---------------------------------------------------------------- |
|
126 |
-# |
|
127 |
-# ---------------------------------------------------------------- |
|
128 |
-make.ucsc.file.multivariate <- function(model, numstates=NULL, file="multivariate-states-ucsc", chrom.length.file=NULL) { |
|
129 |
- |
|
130 |
- # Variable assignments |
|
131 |
- states <- model$stateorder |
|
132 |
- coordinates <- model$coordinates |
|
133 |
- file <- paste(file,".BED", sep="") |
|
134 |
- |
|
135 |
- # Write first line to file |
|
136 |
- cat("browser position chr1:1-100000\n", file=file) |
|
137 |
- |
|
138 |
- # Determine state of bin |
|
139 |
- cat("Determining state of bins ...\n") |
|
140 |
- combstates <- apply(model$posteriors,1,which.max) |
|
141 |
- combstates <- states[combstates] |
|
142 |
- |
|
143 |
- if (is.null(numstates)) { |
|
144 |
- numstates <- length(states) |
|
145 |
- } else if (numstates > length(states)) { |
|
146 |
- numstates <- length(states) |
|
147 |
- } |
|
148 |
- # Go through each state and write to file |
|
149 |
- for (istate in states[1:numstates]) { |
|
150 |
- cat("\nAnalyzing state",istate,"\n") |
|
151 |
- cat(paste("track name=\"multivariate state ",istate,"\" description=\"multivariate calls for state ",istate,"\" visibility=1 color=0,0,255\n", sep=""), file=file, append=TRUE) |
|
152 |
- |
|
153 |
- calls <- coordinates[ combstates == istate , ] |
|
154 |
- if (length(calls$start) >= 1) { |
|
155 |
- # Collapse the bins |
|
156 |
- cat("collapsing the calls\n") |
|
157 |
- collapsed_calls <- collapse.bins(calls) |
|
158 |
- } else { |
|
159 |
- cat("no bins in state",istate,"\n") |
|
160 |
- collapsed_calls <- NULL |
|
161 |
- } |
|
162 |
- |
|
163 |
- if (!is.null(collapsed_calls)) { |
|
164 |
- # Check length of chromosomes if chrom.length.file was given |
|
165 |
- if (!is.null(chrom.length.file)) { |
|
166 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
167 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
168 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
169 |
- for (chrom in levels(as.factor(collapsed_calls$chrom))) { |
|
170 |
- index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom]) |
|
171 |
- collapsed_calls$end[index] <- chrom.lengths[chrom] |
|
172 |
- if (length(index) >= 1) { |
|
173 |
- warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
174 |
- } |
|
175 |
- } |
|
176 |
- } |
|
177 |
- |
|
178 |
- cat("appending to file ...\n") |
|
179 |
- write.table(format(as.data.frame(collapsed_calls)[,1:3], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
180 |
- } |
|
181 |
- } |
|
182 |
- |
|
183 |
-} |
|
184 |
- |
|
185 |
- |
|
186 |
-# ---------------------------------------------------------------- |
|
187 |
-# Write signal tracks from binned data |
|
188 |
-# ---------------------------------------------------------------- |
|
189 |
-bin2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") { |
|
190 |
- |
|
191 |
- # Check user input |
|
192 |
- if (is.null(names(binned.data.list))) { |
|
193 |
- stop("Please name the list entries. The names will be used as track name for the Genome Browser file.") |
|
194 |
- } |
|
195 |
- |
|
196 |
- # Variables |
|
197 |
- file <- paste(file,"wig", sep=".") |
|
198 |
- |
|
199 |
- # Write to file |
|
200 |
- cat("browser hide all\n", file=file) |
|
201 |
- |
|
202 |
- # Go through binned.data.list |
|
203 |
- for (i1 in 1:length(binned.data.list)) { |
|
204 |
- binned.data <- binned.data.list[[i1]] |
|
205 |
- names(binned.data) <- c("chrom","start","end","reads") |
|
206 |
- |
|
207 |
- # Write track information |
|
208 |
- name <- names(binned.data.list)[i1] |
|
209 |
- binsize <- diff(binned.data$start[1:2]) |
|
210 |
- description <- paste("binned read counts ",binsize,"bp", sep="") |
|
211 |
- cat(paste("track type=wiggle_0 name=\"",name,"\" description=\"",description,"\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",i1,"\n", sep=""), file=file, append=TRUE) |
|
212 |
- # Write read data |
|
213 |
- for (chrom in unique(binned.data$chrom)) { |
|
214 |
- cat(paste("fixedStep chrom=",chrom," start=1 step=",binsize," span=",binsize,"\n", sep=""), file=file, append=TRUE) |
|
215 |
- write.table(binned.data$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE) |
|
216 |
- } |
|
217 |
- } |
|
218 |
- |
|
219 |
-} |
|
220 |
- |
|
221 |
- |
|
222 |
-# ---------------------------------------------------------------- |
|
223 |
-# Write color-coded (multivariate) combinatorial states to file |
|
224 |
-# ---------------------------------------------------------------- |
|
225 |
-multivariate2bed <- function(model, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) { |
|
226 |
- |
|
227 |
- # Variables |
|
228 |
- file <- paste(file,".bed", sep="") |
|
229 |
- combstates <- model$stateorder |
|
230 |
- if (is.null(numstates)) { |
|
231 |
- numstates <- length(combstates) |
|
232 |
- } else if (numstates > length(combstates)) { |
|
233 |
- numstates <- length(combstates) |
|
234 |
- } |
|
235 |
- if (exclude.state.zero) { |
|
236 |
- combstates2use <- setdiff(combstates,0) |
|
237 |
- combstates2use <- combstates2use[1:numstates] |
|
238 |
- combstates2use <- combstates2use[!is.na(combstates2use)] |
|
239 |
- numstates <- length(combstates2use) |
|
240 |
- } |
|
241 |
- |
|
242 |
- # Collapse the calls |
|
243 |
- calls <- cbind(model$coordinates, name=model$states) |
|
244 |
- collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
245 |
- # Select only desired states |
|
246 |
- mask <- rep(FALSE,nrow(collapsed.calls)) |
|
247 |
- for (istate in combstates2use) { |
|
248 |
- mask <- mask | istate==collapsed.calls$name |
|
249 |
- } |
|
250 |
- collapsed.calls <- collapsed.calls[mask,] |
|
251 |
- |
|
252 |
- # Check length of chromosomes if chrom.length.file was given |
|
253 |
- if (!is.null(chrom.length.file)) { |
|
254 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
255 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
256 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
257 |
- for (chrom in levels(as.factor(collapsed.calls$chrom))) { |
|
258 |
- index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom]) |
|
259 |
- collapsed.calls$end[index] <- chrom.lengths[chrom] |
|
260 |
- if (length(index) >= 1) { |
|
261 |
- warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
262 |
- } |
|
263 |
- } |
|
264 |
- } |
|
265 |
- |
|
266 |
- # Generate the colors for each combinatorial state |
|
267 |
- colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)] |
|
268 |
- step <- length(colors) %/% numstates |
|
269 |
- colors <- colors[seq(1,by=step,length=numstates)] |
|
270 |
- RGBs <- t(col2rgb(colors)) |
|
271 |
- RGBs <- apply(RGBs,1,paste,collapse=",") |
|
272 |
- itemRgb <- RGBs[as.factor(collapsed.calls$name)] |
|
273 |
- |
|
274 |
- # Write to file |
|
275 |
- cat("browser hide all\n", file=file) |
|
276 |
- numsegments <- nrow(collapsed.calls) |
|
277 |
- df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
|
278 |
- |
|
279 |
- if (separate.tracks) { |
|
280 |
- for (istate in combstates2use) { |
|
281 |
- priority <- 100 + which(istate==combstates2use) |
|
282 |
- cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
283 |
- write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
284 |
- } |
|
285 |
- } else { |
|
286 |
- cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE) |
|
287 |
- write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
288 |
- } |
|
289 |
- |
|
290 |
-} |
|
291 |
- |
292 | 0 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,161 @@ |
1 |
+# =============================================== |
|
2 |
+# Write color-coded tracks with univariate states |
|
3 |
+# =============================================== |
|
4 |
+univariate2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", threshold=0.5, chrom.length.file=NULL) { |
|
5 |
+ |
|
6 |
+ # Check user input |
|
7 |
+ if (is.null(names(uni.hmm.list))) { |
|
8 |
+ stop("Please name the list entries. The names will be used as track name for the Genome Browser file.") |
|
9 |
+ } |
|
10 |
+ |
|
11 |
+ # Variables |
|
12 |
+ nummod <- length(uni.hmm.list) |
|
13 |
+ file <- paste(file,"bed", sep=".") |
|
14 |
+ |
|
15 |
+ # Generate the colors |
|
16 |
+ colors <- gcolors[state.labels] |
|
17 |
+ RGBs <- t(col2rgb(colors)) |
|
18 |
+ RGBs <- apply(RGBs,1,paste,collapse=",") |
|
19 |
+ |
|
20 |
+ # Write first line to file |
|
21 |
+ cat("browser hide all\n", file=file) |
|
22 |
+ |
|
23 |
+ for (imod in 1:nummod) { |
|
24 |
+ uni.hmm <- uni.hmm.list[[imod]] |
|
25 |
+ priority <- 50 + imod |
|
26 |
+ cat(paste("track name=",names(uni.hmm.list)[imod]," description=\"univariate calls for ",names(uni.hmm.list)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
27 |
+ |
|
28 |
+ # Collapse the calls |
|
29 |
+ calls <- cbind(uni.hmm$coordinates, name=uni.hmm$states) |
|
30 |
+ collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
31 |
+ |
|
32 |
+ # Check length of chromosomes if chrom.length.file was given |
|
33 |
+ if (!is.null(chrom.length.file)) { |
|
34 |
+ chrom.lengths.df <- read.table(chrom.length.file) |
|
35 |
+ chrom.lengths <- chrom.lengths.df[,2] |
|
36 |
+ names(chrom.lengths) <- chrom.lengths.df[,1] |
|
37 |
+ for (chrom in levels(as.factor(collapsed_calls$chrom))) { |
|
38 |
+ index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom]) |
|
39 |
+ collapsed_calls$end[index] <- chrom.lengths[chrom] |
|
40 |
+ if (length(index) >= 1) { |
|
41 |
+ warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
42 |
+ } |
|
43 |
+ } |
|
44 |
+ } |
|
45 |
+ |
|
46 |
+ # Append to file |
|
47 |
+ itemRgb <- RGBs[as.character(collapsed.calls$name)] |
|
48 |
+ numsegments <- nrow(collapsed.calls) |
|
49 |
+ df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
|
50 |
+ write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
51 |
+ } |
|
52 |
+ |
|
53 |
+} |
|
54 |
+ |
|
55 |
+ |
|
56 |
+# ==================================== |
|
57 |
+# Write signal tracks from binned data |
|
58 |
+# ==================================== |
|
59 |
+binned2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") { |
|
60 |
+ |
|
61 |
+ # Check user input |
|
62 |
+ if (is.null(names(binned.data.list))) { |
|
63 |
+ stop("Please name the list entries. The names will be used as track name for the Genome Browser file.") |
|
64 |
+ } |
|
65 |
+ |
|
66 |
+ # Variables |
|
67 |
+ file <- paste(file,"wig", sep=".") |
|
68 |
+ |
|
69 |
+ # Write to file |
|
70 |
+ cat("browser hide all\n", file=file) |
|
71 |
+ |
|
72 |
+ # Go through binned.data.list |
|
73 |
+ for (i1 in 1:length(binned.data.list)) { |
|
74 |
+ binned.data <- binned.data.list[[i1]] |
|
75 |
+ names(binned.data) <- binned.data.names |
|
76 |
+ |
|
77 |
+ # Write track information |
|
78 |
+ name <- names(binned.data.list)[i1] |
|
79 |
+ binsize <- diff(binned.data$start[1:2]) |
|
80 |
+ description <- paste("binned read counts ",binsize,"bp", sep="") |
|
81 |
+ cat(paste("track type=wiggle_0 name=\"",name,"\" description=\"",description,"\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",i1,"\n", sep=""), file=file, append=TRUE) |
|
82 |
+ # Write read data |
|
83 |
+ for (chrom in unique(binned.data$chrom)) { |
|
84 |
+ cat(paste("fixedStep chrom=",chrom," start=1 step=",binsize," span=",binsize,"\n", sep=""), file=file, append=TRUE) |
|
85 |
+ write.table(binned.data$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE) |
|
86 |
+ } |
|
87 |
+ } |
|
88 |
+ |
|
89 |
+} |
|
90 |
+ |
|
91 |
+ |
|
92 |
+# =============================================================== |
|
93 |
+# Write color-coded tracks with multivariate combinatorial states |
|
94 |
+# =============================================================== |
|
95 |
+multivariate2bed <- function(multi.hmm, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) { |
|
96 |
+ |
|
97 |
+ # Variables |
|
98 |
+ file <- paste(file,".bed", sep="") |
|
99 |
+ combstates <- multi.hmm$comb.states |
|
100 |
+ if (is.null(numstates)) { |
|
101 |
+ numstates <- length(combstates) |
|
102 |
+ } else if (numstates > length(combstates)) { |
|
103 |
+ numstates <- length(combstates) |
|
104 |
+ } |
|
105 |
+ if (exclude.state.zero) { |
|
106 |
+ combstates2use <- setdiff(combstates,0) |
|
107 |
+ combstates2use <- combstates2use[1:numstates] |
|
108 |
+ combstates2use <- combstates2use[!is.na(combstates2use)] |
|
109 |
+ numstates <- length(combstates2use) |
|
110 |
+ } |
|
111 |
+ |
|
112 |
+ # Collapse the calls |
|
113 |
+ calls <- cbind(multi.hmm$coordinates, name=multi.hmm$states) |
|
114 |
+ collapsed.calls <- collapse.bins(calls, column2collapseBy=4) |
|
115 |
+ # Select only desired states |
|
116 |
+ mask <- rep(FALSE,nrow(collapsed.calls)) |
|
117 |
+ for (istate in combstates2use) { |
|
118 |
+ mask <- mask | istate==collapsed.calls$name |
|
119 |
+ } |
|
120 |
+ collapsed.calls <- collapsed.calls[mask,] |
|
121 |
+ |
|
122 |
+ # Check length of chromosomes if chrom.length.file was given |
|
123 |
+ if (!is.null(chrom.length.file)) { |
|
124 |
+ chrom.lengths.df <- read.table(chrom.length.file) |
|
125 |
+ chrom.lengths <- chrom.lengths.df[,2] |
|
126 |
+ names(chrom.lengths) <- chrom.lengths.df[,1] |
|
127 |
+ for (chrom in levels(as.factor(collapsed.calls$chrom))) { |
|
128 |
+ index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom]) |
|
129 |
+ collapsed.calls$end[index] <- chrom.lengths[chrom] |
|
130 |
+ if (length(index) >= 1) { |
|
131 |
+ warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome") |
|
132 |
+ } |
|
133 |
+ } |
|
134 |
+ } |
|
135 |
+ |
|
136 |
+ # Generate the colors for each combinatorial state |
|
137 |
+ colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)] |
|
138 |
+ step <- length(colors) %/% numstates |
|
139 |
+ colors <- colors[seq(1,by=step,length=numstates)] |
|
140 |
+ RGBs <- t(col2rgb(colors)) |
|
141 |
+ RGBs <- apply(RGBs,1,paste,collapse=",") |
|
142 |
+ itemRgb <- RGBs[as.factor(collapsed.calls$name)] |
|
143 |
+ |
|
144 |
+ # Write to file |
|
145 |
+ cat("browser hide all\n", file=file) |
|
146 |
+ numsegments <- nrow(collapsed.calls) |
|
147 |
+ df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb) |
|
148 |
+ |
|
149 |
+ if (separate.tracks) { |
|
150 |
+ for (istate in combstates2use) { |
|
151 |
+ priority <- 100 + which(istate==combstates2use) |
|
152 |
+ cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE) |
|
153 |
+ write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
154 |
+ } |
|
155 |
+ } else { |
|
156 |
+ cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE) |
|
157 |
+ write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE) |
|
158 |
+ } |
|
159 |
+ |
|
160 |
+} |
|
161 |
+ |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) { |
|
1 |
+find.aneuploidies <- function(binned.data, eps=0.001, init="standard", max.time=-1, max.it=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) { |
|
2 | 2 |
|
3 | 3 |
## Intercept user input |
4 | 4 |
if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric") |
... | ... |
@@ -21,6 +21,7 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu |
21 | 21 |
# state.labels # assigned globally outside this function |
22 | 22 |
numstates <- length(state.labels) |
23 | 23 |
numbins <- length(binned.data$reads) |
24 |
+ iniproc <- which(init==c("standard","random","empiric")) # transform to int |
|
24 | 25 |
|
25 | 26 |
# Check if there are reads in the data, otherwise HMM will blow up |
26 | 27 |
if (!any(binned.data$reads!=0)) { |
... | ... |
@@ -28,13 +29,13 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu |
28 | 29 |
} |
29 | 30 |
|
30 | 31 |
# Filter high reads out, makes HMM faster |
32 |
+ read.cutoff <- as.integer(quantile(binned.data$reads, 0.9999)) |
|
31 | 33 |
if (filter.reads) { |
32 |
- limit <- 10*ceiling(var(binned.data$reads)) |
|
33 |
- mask <- binned.data$reads > limit |
|
34 |
- binned.data$reads[mask] <- limit |
|
34 |
+ mask <- binned.data$reads > read.cutoff |
|
35 |
+ binned.data$reads[mask] <- read.cutoff |
|
35 | 36 |
numfiltered <- length(which(mask)) |
36 | 37 |
if (numfiltered > 0) { |
37 |
- warning(paste("There are very high read counts (probably artificial) in your data. Replaced read counts > ",limit," (10*variance) by ",limit," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep="")) |
|
38 |
+ warning(paste("There are very high read counts in your data (probably artificial). Replaced read counts > ",read.cutoff," (99.99% quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep="")) |
|
38 | 39 |
} |
39 | 40 |
} |
40 | 41 |
|
... | ... |
@@ -47,8 +48,8 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu |
47 | 48 |
reads = as.integer(binned.data$reads), # double* O |
48 | 49 |
num.bins = as.integer(numbins), # int* T |
49 | 50 |
num.states = as.integer(numstates), # int* N |
50 |
- means = double(length=numstates), # double* means |
|
51 |
- variances = double(length=numstates), # double* variances |
|
51 |
+ size = double(length=numstates), # double* size |
|
52 |
+ prob = double(length=numstates), # double* prob |
|
52 | 53 |
num.iterations = as.integer(max.it), # int* maxiter |
53 | 54 |
time.sec = as.integer(max.time), # double* maxtime |
54 | 55 |
loglik.delta = as.double(eps.try), # double* eps |
... | ... |
@@ -57,24 +58,28 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu |
57 | 58 |
proba = double(length=numstates), # double* proba |
58 | 59 |
loglik = double(length=1), # double* loglik |
59 | 60 |
weights = double(length=numstates), # double* weights |
60 |
- means.initial = double(length=numstates), # double* initial_means |
|
61 |
- variances.initial = double(length=numstates), # double* initial_variances |
|
61 |
+ ini.proc = as.integer(iniproc), # int* iniproc |
|
62 |
+ size.initial = double(length=numstates), # double* initial_size |
|
63 |
+ prob.initial = double(length=numstates), # double* initial_prob |
|
62 | 64 |
A.initial = double(length=numstates*numstates), # double* initial_A |
63 | 65 |
proba.initial = double(length=numstates), # double* initial_proba |
64 | 66 |
use.initial.params = as.logical(0), # bool* use_initial_params |
65 |
- num.threads = as.integer(num.threads) # int* num_threads |
|
67 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
68 |
+ error = as.integer(0), # int* error (error handling) |
|
69 |
+ read.cutoff = as.integer(read.cutoff) # int* read_cutoff |
|
66 | 70 |
) |
67 | 71 |
|
72 |
+ names(hmm$weights) <- state.labels |
|
68 | 73 |
hmm$eps <- eps.try |
69 | 74 |
hmm$A <- matrix(hmm$A, ncol=hmm$num.states, byrow=TRUE) |
70 | 75 |
rownames(hmm$A) <- state.labels |
71 | 76 |
colnames(hmm$A) <- state.labels |
72 |
- hmm$distributions <- cbind(mean=hmm$means, variance=hmm$variances, size=fsize(hmm$means,hmm$variances), prob=fprob(hmm$means,hmm$variances)) |
|
77 |
+ hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob)) |
|
73 | 78 |
rownames(hmm$distributions) <- state.labels |
74 | 79 |
hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE) |
75 | 80 |
rownames(hmm$A.initial) <- state.labels |
76 | 81 |
colnames(hmm$A.initial) <- state.labels |
77 |
- hmm$distributions.initial <- cbind(mean=hmm$means.initial, variance=hmm$variances.initial, size=fsize(hmm$means.initial,hmm$variances.initial), prob=fprob(hmm$means.initial,hmm$variances.initial)) |
|
82 |
+ hmm$distributions.initial <- cbind(size=hmm$size.initial, prob=hmm$prob.initial, mu=fmean(hmm$size.initial,hmm$prob.initial), variance=fvariance(hmm$size.initial,hmm$prob.initial)) |
|
78 | 83 |
rownames(hmm$distributions.initial) <- state.labels |
79 | 84 |
if (num.trials > 1) { |
80 | 85 |
if (hmm$loglik.delta > hmm$eps) { |
... | ... |
@@ -96,51 +101,58 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu |
96 | 101 |
# Rerun the HMM with different epsilon and initial parameters from trial run |
97 | 102 |
cat("\n\nRerunning try ",indexmax," with eps =",eps,"--------------------\n") |
98 | 103 |
hmm <- .C("R_univariate_hmm", |
99 |
- reads <- as.integer(binned.data$reads), # double* O |
|
100 |
- num.bins <- as.integer(numbins), # int* T |
|
101 |
- num.states <- as.integer(numstates), # int* N |
|
102 |
- means <- double(length=numstates), # double* means |
|
103 |
- variances <- double(length=numstates), # double* variances |
|
104 |
- num.iterations <- as.integer(max.it), # int* maxiter |
|
105 |
- time.sec <- as.integer(max.time), # double* maxtime |
|
106 |
- loglik.delta <- as.double(eps), # double* eps |
|
107 |
- posteriors <- double(length=numbins * numstates), # double* posteriors |
|
108 |
- A <- double(length=numstates*numstates), # double* A |
|
109 |
- proba <- double(length=numstates), # double* proba |
|
110 |
- loglik <- double(length=1), # double* loglik |
|
111 |
- weights <- double(length=numstates), # double* weights |
|
112 |
- means.initial <- as.vector(hmm$distributions[,'mean']), # double* initial_means |
|
113 |
- variances.initial <- as.vector(hmm$distributions[,'variance']), # double* initial_variances |
|
114 |
- A.initial <- as.vector(hmm$A), # double* initial_A |
|
115 |
- proba.initial <- as.vector(hmm$proba), # double* initial_proba |
|
116 |
- use.initial.params <- as.logical(1), # bool* use_initial_params |
|
117 |
- num.threads <- as.integer(num.threads) # int* num_threads |
|
104 |
+ reads = as.integer(binned.data$reads), # double* O |
|
105 |
+ num.bins = as.integer(numbins), # int* T |
|
106 |
+ num.states = as.integer(numstates), # int* N |
|
107 |
+ size = double(length=numstates), # double* size |
|
108 |
+ prob = double(length=numstates), # double* prob |
|
109 |
+ num.iterations = as.integer(max.it), # int* maxiter |
|
110 |
+ time.sec = as.integer(max.time), # double* maxtime |
|
111 |
+ loglik.delta = as.double(eps), # double* eps |
|
112 |
+ posteriors = double(length=numbins * numstates), # double* posteriors |
|
113 |
+ A = double(length=numstates*numstates), # double* A |
|
114 |
+ proba = double(length=numstates), # double* proba |
|
115 |
+ loglik = double(length=1), # double* loglik |
|
116 |
+ weights = double(length=numstates), # double* weights |
|
117 |
+ ini.proc = as.integer(iniproc), # int* iniproc |
|
118 |
+ size.initial = as.vector(hmm$distributions[,'size']), # double* initial_size |
|
119 |
+ prob.initial = as.vector(hmm$distributions[,'prob']), # double* initial_prob |
|
120 |
+ A.initial = as.vector(hmm$A), # double* initial_A |
|
121 |
+ proba.initial = as.vector(hmm$proba), # double* initial_proba |
|
122 |
+ use.initial.params = as.logical(1), # bool* use_initial_params |
|
123 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
124 |
+ error = as.integer(0), # int* error (error handling) |
|
125 |
+ read.cutoff = as.integer(read.cutoff) # int* read_cutoff |
|
118 | 126 |
) |
119 | 127 |
} |
120 | 128 |
|
121 | 129 |
# Add useful entries |
130 |
+ names(hmm$weights) <- state.labels |
|
122 | 131 |
hmm$coordinates <- binned.data[,coordinate.names] |
123 | 132 |
hmm$posteriors <- matrix(hmm$posteriors, ncol=hmm$num.states) |
124 | 133 |
colnames(hmm$posteriors) <- paste("P(",state.labels,")", sep="") |
125 |
- class(hmm) <- "aneufinder.model" |
|
134 |
+ class(hmm) <- class.chromstar.univariate |
|
126 | 135 |
hmm$states <- state.labels[apply(hmm$posteriors, 1, which.max)] |
127 | 136 |
hmm$eps <- eps |
128 | 137 |
hmm$A <- matrix(hmm$A, ncol=hmm$num.states, byrow=TRUE) |
129 | 138 |
rownames(hmm$A) <- state.labels |
130 | 139 |
colnames(hmm$A) <- state.labels |
131 |
- hmm$distributions <- cbind(mean=hmm$means, variance=hmm$variances, size=fsize(hmm$means,hmm$variances), prob=fprob(hmm$means,hmm$variances)) |
|
140 |
+ hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob)) |
|
132 | 141 |
rownames(hmm$distributions) <- state.labels |
133 | 142 |
hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE) |
134 | 143 |
rownames(hmm$A.initial) <- state.labels |
135 | 144 |
colnames(hmm$A.initial) <- state.labels |
136 |
- hmm$distributions.initial <- cbind(mean=hmm$means.initial, variance=hmm$variances.initial, size=fsize(hmm$means.initial,hmm$variances.initial), prob=fprob(hmm$means.initial,hmm$variances.initial)) |
|
145 |
+ hmm$distributions.initial <- cbind(size=hmm$size.initial, prob=hmm$prob.initial, mu=fmean(hmm$size.initial,hmm$prob.initial), variance=fvariance(hmm$size.initial,hmm$prob.initial)) |
|
137 | 146 |
rownames(hmm$distributions.initial) <- state.labels |
147 |
+ hmm$filter.reads <- filter.reads |
|
138 | 148 |
|
139 | 149 |
# Delete redundant entries |
140 |
- hmm$r <- NULL |
|
141 |
- hmm$p <- NULL |
|
142 |
- hmm$r.initial <- NULL |
|
143 |
- hmm$p.initial <- NULL |
|
150 |
+ hmm$size <- NULL |
|
151 |
+ hmm$prob <- NULL |
|
152 |
+ hmm$size.initial <- NULL |
|
153 |
+ hmm$prob.initial <- NULL |
|
154 |
+ hmm$use.initial.params <- NULL |
|
155 |
+ hmm$read.cutoff <- NULL |
|
144 | 156 |
|
145 | 157 |
# Issue warnings |
146 | 158 |
if (num.trials == 1) { |
... | ... |
@@ -148,6 +160,11 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu |
148 | 160 |
war <- warning("HMM did not converge!\n") |
149 | 161 |
} |
150 | 162 |
} |
163 |
+ if (hmm$error == 1) { |
|
164 |
+ stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem.") |
|
165 |
+ } else if (hmm$error == 2) { |
|
166 |
+ stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.") |
|
167 |
+ } |
|
151 | 168 |
|
152 | 169 |
# Return results |
153 | 170 |
if (!is.null(war)) { |
154 | 171 |
similarity index 83% |
155 | 172 |
rename from R/helper.R |
156 | 173 |
rename to R/global.R |
... | ... |
@@ -4,6 +4,8 @@ |
4 | 4 |
state.labels <- c("unmappable","monosomy","disomy","trisomy","tetrasomy") |
5 | 5 |
coordinate.names <- c("chrom","start","end") |
6 | 6 |
binned.data.names <- c(coordinate.names,"reads") |
7 |
+class.chromstar.univariate <- "chromstar.univariate.hmm" |
|
8 |
+gcolors <- c("unmappable"="gray68","monosomy"="red1","disomy"="springgreen3","trisomy"="red2","tetrasomy"="red3") |
|
7 | 9 |
|
8 | 10 |
# ============================================================================ |
9 | 11 |
# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob) |
... | ... |
@@ -1,10 +1,10 @@ |
1 | 1 |
# ------------------------------------------------------------ |
2 | 2 |
# Plot a read histogram with univariate fits |
3 | 3 |
# ------------------------------------------------------------ |
4 |
-plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) { |
|
4 |
+plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NULL) { |
|
5 | 5 |
|
6 | 6 |
## Load libraries |
7 |
-# library(ggplot2) |
|
7 |
+ library(ggplot2) |
|
8 | 8 |
|
9 | 9 |
# ----------------------------------------- |
10 | 10 |
# Get right x limit |
... | ... |
@@ -18,15 +18,16 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) |
18 | 18 |
} |
19 | 19 |
|
20 | 20 |
## Plot settings |
21 |
+ cols <- gcolors[c("unmappaple","monosomy","disomy","trisomy","tetrasomy")] |
|
21 | 22 |
|
22 | 23 |
# Select the rows to plot |
23 | 24 |
selectmask <- rep(TRUE,length(model$reads)) |
24 | 25 |
numchrom <- length(table(model$coordinates$chrom)) |
25 |
- if (!is.null(chr)) { |
|
26 |
- if (! chr %in% levels(model$coordinates$chrom)) { |
|
27 |
- stop(chr," can't be found in the model coordinates.") |
|
26 |
+ if (!is.null(chrom)) { |
|
27 |
+ if (! chrom %in% levels(model$coordinates$chrom)) { |
|
28 |
+ stop(chrom," can't be found in the model coordinates.") |
|
28 | 29 |
} |
29 |
- selectchrom <- model$coordinates$chrom == chr |
|
30 |
+ selectchrom <- model$coordinates$chrom == chrom |
|
30 | 31 |
selectmask <- selectmask & selectchrom |
31 | 32 |
numchrom <- 1 |
32 | 33 |
} |
... | ... |
@@ -41,13 +42,18 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) |
41 | 42 |
} |
42 | 43 |
} |
43 | 44 |
if (!is.null(state)) { |
44 |
- states <- get.states(model) |
|
45 |
- selectmask <- selectmask & states==state |
|
45 |
+ selectmask <- selectmask & model$states==state |
|
46 | 46 |
} |
47 | 47 |
if (length(which(selectmask)) != length(model$reads)) { |
48 | 48 |
reads <- model$reads[selectmask] |
49 |
- posteriors <- model$posteriors[selectmask,] |
|
50 |
- weights <- apply(posteriors,2,mean) |
|
49 |
+# posteriors <- model$posteriors[selectmask,] |
|
50 |
+# weights <- apply(posteriors,2,mean) |
|
51 |
+ states <- model$states[selectmask] |
|
52 |
+ weights <- rep(NA, 3) |
|
53 |
+ weights[1] <- length(which(states=="zero-inflation")) |
|
54 |
+ weights[2] <- length(which(states=="unmodified")) |
|
55 |
+ weights[3] <- length(which(states=="modified")) |
|
56 |
+ weights <- weights / length(states) |
|
51 | 57 |
} else { |
52 | 58 |
reads <- model$reads |
53 | 59 |
weights <- model$weights |
... | ... |
@@ -88,7 +94,7 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) |
88 | 94 |
} |
89 | 95 |
|
90 | 96 |
# Make legend and colors correct |
91 |
-# ggplt <- ggplt + scale_color_manual(name="components", values=cols) |
|
97 |
+ ggplt <- ggplt + scale_color_manual(name="components", values=cols) |
|
92 | 98 |
|
93 | 99 |
return(ggplt) |
94 | 100 |
|
... | ... |
@@ -101,10 +107,10 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) |
101 | 107 |
plot.boxplot <- function(model) { |
102 | 108 |
|
103 | 109 |
## Load libraries |
104 |
-# library(ggplot2) |
|
110 |
+ library(ggplot2) |
|
105 | 111 |
|
106 | 112 |
## Plot settings |
107 |
- cols <- c("unmodified"="gray48","modified"="orangered3") |
|
113 |
+ cols <- gcolors[c("unmodified","modified")] |
|
108 | 114 |
|
109 | 115 |
## Boxplot |
110 | 116 |
components <- c("unmodified","modified")[as.factor(get.states(model))] |
... | ... |
@@ -1,47 +1,58 @@ |
1 |
-#include "scalehmm.h" |
|
2 | 1 |
#include "utility.h" |
2 |
+#include "scalehmm.h" |
|
3 |
+#include <omp.h> // parallelization options |
|
3 | 4 |
|
4 |
-// --------------------------------------------------------------- |
|
5 |
-// void R_univariate_hmm() |
|
5 |
+// =================================================================================================================================================== |
|
6 | 6 |
// This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R. |
7 |
-// --------------------------------------------------------------- |
|
7 |
+// =================================================================================================================================================== |
|
8 | 8 |
extern "C" { |
9 |
-void R_univariate_hmm(int* O, int* T, int* N, double* means, double* variances, int* maxiter, int* maxtime, double* eps, double* posteriors, double* A, double* proba, double* loglik, double* weights, double* initial_means, double* initial_variances, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads) { |
|
9 |
+void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
10 |
+{ |
|
10 | 11 |
|
11 | 12 |
// Define logging level |
12 |
-// FILE* pFile = fopen("aneufinder.log", "w"); |
|
13 |
+// FILE* pFile = fopen("chromStar.log", "w"); |
|
13 | 14 |
// Output2FILE::Stream() = pFile; |
14 |
-// FILELog::ReportingLevel() = FILELog::FromString("INFO"); |
|
15 |
- FILELog::ReportingLevel() = FILELog::FromString("ITERATION"); |
|
16 |
-// FILELog::ReportingLevel() = FILELog::FromString("DEBUG1"); |
|
15 |
+ FILELog::ReportingLevel() = FILELog::FromString("ERROR"); |
|
17 | 16 |
|
18 | 17 |
// Parallelization settings |
19 | 18 |
omp_set_num_threads(*num_threads); |
20 | 19 |
|
21 | 20 |
// Print some information |
22 | 21 |
FILE_LOG(logINFO) << "number of states = " << *N; |
22 |
+ Rprintf("number of states = %d\n", *N); |
|
23 | 23 |
FILE_LOG(logINFO) << "number of bins = " << *T; |
24 |
+ Rprintf("number of bins = %d\n", *T); |
|
24 | 25 |
if (*maxiter < 0) |
25 | 26 |
{ |
26 | 27 |
FILE_LOG(logINFO) << "maximum number of iterations = none"; |
28 |
+ Rprintf("maximum number of iterations = none\n"); |
|
27 | 29 |
} else { |
28 | 30 |
FILE_LOG(logINFO) << "maximum number of iterations = " << *maxiter; |
31 |
+ Rprintf("maximum number of iterations = %d\n", *maxiter); |
|
29 | 32 |
} |
30 | 33 |
if (*maxtime < 0) |
31 | 34 |
{ |
32 | 35 |
FILE_LOG(logINFO) << "maximum running time = none"; |
36 |
+ Rprintf("maximum running time = none\n"); |
|
33 | 37 |
} else { |
34 | 38 |
FILE_LOG(logINFO) << "maximum running time = " << *maxtime << " sec"; |
39 |
+ Rprintf("maximum running time = %d sec\n", *maxtime); |
|
35 | 40 |
} |
36 | 41 |
FILE_LOG(logINFO) << "epsilon = " << *eps; |
42 |
+ Rprintf("epsilon = %g\n", *eps); |
|
43 |
+ |
|
37 | 44 |
FILE_LOG(logDEBUG3) << "observation vector"; |
38 | 45 |
for (int t=0; t<50; t++) { |
39 | 46 |
FILE_LOG(logDEBUG3) << "O["<<t<<"] = " << O[t]; |
40 | 47 |
} |
41 | 48 |
|
49 |
+ // Flush Rprintf statements to console |
|
50 |
+ R_FlushConsole(); |
|
51 |
+ |
|
42 | 52 |
// Create the HMM |
43 | 53 |
FILE_LOG(logDEBUG1) << "Creating a univariate HMM"; |
44 | 54 |
ScaleHMM* hmm = new ScaleHMM(*T, *N); |
55 |
+ hmm->set_cutoff(*read_cutoff); |
|
45 | 56 |
// Initialize the transition probabilities and proba |
46 | 57 |
hmm->initialize_transition_probs(initial_A, *use_initial_params); |
47 | 58 |
hmm->initialize_proba(initial_proba, *use_initial_params); |
... | ... |
@@ -59,60 +70,82 @@ void R_univariate_hmm(int* O, int* T, int* N, double* means, double* variances, |
59 | 70 |
} |
60 | 71 |
variance = variance / *T; |
61 | 72 |
FILE_LOG(logINFO) << "data mean = " << mean << ", data variance = " << variance; |
73 |
+ Rprintf("data mean = %g, data variance = %g\n", mean, variance); |
|
62 | 74 |
|
63 | 75 |
// Go through all states of the hmm and assign the density functions |
64 |
- srand (clock()); |
|
65 |
- int rand1, rand2; |
|
66 | 76 |
double imean, ivariance; |
67 |
- for (int istate=0; istate<*N; istate++) |
|
77 |
+ for (int i_state=0; i_state<*N; i_state++) |
|
68 | 78 |
{ |
69 |
- |
|
70 | 79 |
if (*use_initial_params) { |
71 |
- FILE_LOG(logINFO) << "Using given parameters for mean and variance"; |
|
72 |
- imean = initial_means[istate]; |
|
73 |
- ivariance = initial_variances[istate]; |
|
80 |
+ FILE_LOG(logINFO) << "Using given parameters for size and prob"; |
|
81 |
+ Rprintf("Using given parameters for size and prob\n"); |
|
82 |
+ imean = (1-initial_prob[i_state])*initial_size[i_state] / initial_prob[i_state]; |
|
83 |
+ ivariance = imean / initial_prob[i_state]; |
|
84 |
+ FILE_LOG(logDEBUG2) << "imean = " << imean; |
|
85 |
+ FILE_LOG(logDEBUG2) << "ivariance = " << ivariance; |
|
74 | 86 |
} else { |
75 |
- // Simple initialization |
|
76 |
- imean = mean * pow(2, istate-2); |
|
77 |
- ivariance = variance * pow(2, istate-2); |
|
78 |
- initial_means[istate] = imean; |
|
79 |
- initial_variances[istate] = ivariance; |
|
87 |
+ |
|
88 |
+ if (*iniproc == 1) |
|
89 |
+ { |
|
90 |
+ // Simple initialization |
|
91 |
+ imean = mean * pow(2, i_state-2); |
|
92 |
+ ivariance = variance * pow(2, i_state-2); |
|
93 |
+ } |
|
94 |
+ |
|
95 |
+ // Calculate r and p from mean and variance |
|
96 |
+ initial_size[i_state] = pow(imean,2)/(ivariance-imean); |
|
97 |
+ initial_prob[i_state] = imean/ivariance; |
|
98 |
+ |
|
80 | 99 |
} |
81 |
- FILE_LOG(logDEBUG3) << "imean = " << imean; |
|
82 |
- FILE_LOG(logDEBUG3) << "ivariance = " << ivariance; |
|
83 | 100 |
|
84 |
- if (istate == 0) |
|
101 |
+ if (i_state >= 1) |
|
102 |
+ { |
|
103 |
+ FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state; |
|
104 |
+ NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM() |
|
105 |
+ hmm->densityFunctions.push_back(d); |
|
106 |
+ } |
|
107 |
+ else if (i_state == 0) |
|
85 | 108 |
{ |
109 |
+ FILE_LOG(logDEBUG1) << "Using only zeros for state " << i_state; |
|
86 | 110 |
ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM() |
87 |
- FILE_LOG(logDEBUG1) << "Using "<< d->get_name() <<" for state " << istate; |
|
88 | 111 |
hmm->densityFunctions.push_back(d); |
89 | 112 |
} |
90 | 113 |
else |
91 | 114 |
{ |
92 |
- NegativeBinomial *d = new NegativeBinomial(O, *T, imean, ivariance); // delete is done inside ~ScaleHMM() |
|
93 |
- FILE_LOG(logDEBUG1) << "Using "<< d->get_name() <<" for state " << istate; |
|
115 |
+ FILE_LOG(logWARNING) << "Density not specified, using default negative binomial for state " << i_state; |
|
116 |
+ NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); |
|
94 | 117 |
hmm->densityFunctions.push_back(d); |
95 | 118 |
} |
96 |
- |
|
97 | 119 |
} |
98 | 120 |
|
121 |
+ // Flush Rprintf statements to console |
|
122 |
+ R_FlushConsole(); |
|
123 |
+ |
|
99 | 124 |
// Do the Baum-Welch to estimate the parameters |
100 | 125 |
FILE_LOG(logDEBUG1) << "Starting Baum-Welch estimation"; |
101 |
- hmm->baumWelch(maxiter, maxtime, eps); |
|
126 |
+ try |
|
127 |
+ { |
|
128 |
+ hmm->baumWelch(maxiter, maxtime, eps); |
|
129 |
+ } |
|
130 |
+ catch (std::exception& e) |
|
131 |
+ { |
|
132 |
+ FILE_LOG(logERROR) << "Error in Baum-Welch: " << e.what(); |
|
133 |
+ Rprintf("Error in Baum-Welch: %s\n", e.what()); |
|
134 |
+ if (e.what()=="nan detected") { *error = 1; } |
|
135 |
+ else { *error = 2; } |
|
136 |
+ } |
|
137 |
+ |
|
102 | 138 |
FILE_LOG(logDEBUG1) << "Finished with Baum-Welch estimation"; |
103 | 139 |
// Compute the posteriors and save results directly to the R pointer |
104 |
- double** post = allocDoubleMatrix(*N, *T); |
|
105 |
- hmm->get_posteriors(post); |
|
106 | 140 |
FILE_LOG(logDEBUG1) << "Recode posteriors into column representation"; |
107 | 141 |
#pragma omp parallel for |
108 | 142 |
for (int iN=0; iN<*N; iN++) |
109 | 143 |
{ |
110 | 144 |
for (int t=0; t<*T; t++) |
111 | 145 |
{ |
112 |
- posteriors[t + iN * (*T)] = post[iN][t]; |
|
146 |
+ posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t); |
|
113 | 147 |
} |
114 | 148 |
} |
115 |
- freeDoubleMatrix(post, *N); |
|
116 | 149 |
|
117 | 150 |
FILE_LOG(logDEBUG1) << "Return parameters"; |
118 | 151 |
// also return the estimated transition matrix and the initial probs |
... | ... |
@@ -128,8 +161,18 @@ void R_univariate_hmm(int* O, int* T, int* N, double* means, double* variances, |
128 | 161 |
// copy the estimated distribution params |
129 | 162 |
for (int i=0; i<*N; i++) |
130 | 163 |
{ |
131 |
- means[i] = hmm->densityFunctions[i]->get_mean(); |
|
132 |
- variances[i] = hmm->densityFunctions[i]->get_variance(); |
|
164 |
+ if (hmm->densityFunctions[i]->get_name() == NEGATIVE_BINOMIAL) |
|
165 |
+ { |
|
166 |
+ NegativeBinomial* d = (NegativeBinomial*)(hmm->densityFunctions[i]); |
|
167 |
+ size[i] = d->get_size(); |
|
168 |
+ prob[i] = d->get_prob(); |
|
169 |
+ } |
|
170 |
+ else if (hmm->densityFunctions[i]->get_name() == ZERO_INFLATION) |
|
171 |
+ { |
|
172 |
+ // These values for a Negative Binomial define a zero-inflation (delta distribution) |
|
173 |
+ size[i] = 0; |
|
174 |
+ prob[i] = 1; |
|
175 |
+ } |
|
133 | 176 |
} |
134 | 177 |
*loglik = hmm->get_logP(); |
135 | 178 |
hmm->calc_weights(weights); |
... | ... |
@@ -96,75 +96,165 @@ void Normal::set_stdev(double stdev) |
96 | 96 |
// ============================================================ |
97 | 97 |
|
98 | 98 |
// Constructor and Destructor --------------------------------- |
99 |
-NegativeBinomial::NegativeBinomial(int* observations, int T, double mean, double variance) |
|
99 |
+NegativeBinomial::NegativeBinomial(int* observations, int T, double size, double prob) |
|
100 | 100 |
{ |
101 | 101 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
102 | 102 |
this->obs = observations; |
103 | 103 |
this->T = T; |
104 |
- this->mean = mean; |
|
105 |
- this->variance = variance; |
|
106 |
- this->size = this->fsize(mean, variance); |
|
107 |
- this->prob = this->fprob(mean, variance); |
|
108 |
- this->max_obs = intMax(observations, T); |
|
104 |
+ this->size = size; |
|
105 |
+ this->prob = prob; |
|
106 |
+ this->lxfactorials = NULL; |
|
107 |
+ // Precompute the lxfactorials that are used in computing the densities |
|
108 |
+ if (this->obs != NULL) |
|
109 |
+ { |
|
110 |
+ this->max_obs = intMax(observations, T); |
|
111 |
+ this->lxfactorials = (double*) calloc(max_obs+1, sizeof(double)); |
|
112 |
+ this->lxfactorials[0] = 0.0; // Not necessary, already 0 because of calloc |
|
113 |
+ this->lxfactorials[1] = 0.0; |
|
114 |
+ for (int j=2; j<=max_obs; j++) |
|
115 |
+ { |
|
116 |
+ this->lxfactorials[j] = this->lxfactorials[j-1] + log(j); |
|
117 |
+ } |
|
118 |
+ } |
|
109 | 119 |
} |
110 | 120 |
|
111 | 121 |
NegativeBinomial::~NegativeBinomial() |
112 | 122 |
{ |
113 | 123 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
124 |
+ if (this->lxfactorials != NULL) |
|
125 |
+ { |
|
126 |
+ free(this->lxfactorials); |
|
127 |
+ } |
|
114 | 128 |
} |
115 | 129 |
|
116 | 130 |
// Methods ---------------------------------------------------- |
117 |
-void NegativeBinomial::calc_densities(double* density) |
|
131 |
+void NegativeBinomial::calc_logdensities(double* logdens) |
|
118 | 132 |
{ |
119 | 133 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
120 |
- for (int t=0; t<this->T; t++) |
|
134 |
+ double logp = log(this->prob); |
|
135 |
+ double log1minusp = log(1-this->prob); |
|
136 |
+ double lGammaR,lGammaRplusX,lxfactorial; |
|
137 |
+ lGammaR=lgamma(this->size); |
|
138 |
+ // Select strategy for computing gammas |
|
139 |
+ if (this->max_obs <= this->T) |
|
121 | 140 |
{ |
122 |
- density[t] = dnbinom(this->obs[t], this->size, this->prob, 0); |
|
141 |
+ FILE_LOG(logDEBUG2) << "Precomputing gammas in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
142 |
+ double logdens_per_read [this->max_obs+1]; |
|
143 |
+ for (int j=0; j<=this->max_obs; j++) |
|
144 |
+ { |
|
145 |
+ logdens_per_read[j] = lgamma(this->size + j) - lGammaR - lxfactorials[j] + this->size * logp + j * log1minusp; |
|
146 |
+ } |
|
147 |
+ for (int t=0; t<this->T; t++) |
|
148 |
+ { |
|
149 |
+ logdens[t] = logdens_per_read[(int) this->obs[t]]; |
|
150 |
+ FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t]; |
|
151 |
+ if (isnan(logdens[t])) |
|
152 |
+ { |
|
153 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
154 |
+ FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t]; |
|
155 |
+ throw nan_detected; |
|
156 |
+ } |
|
157 |
+ } |
|
123 | 158 |
} |
124 |
-} |
|
159 |
+ else |
|
160 |
+ { |
|
161 |
+ FILE_LOG(logDEBUG2) << "Computing gammas in " << __func__ << " for every t, because max(O)>T"; |
|
162 |
+ for (int t=0; t<this->T; t++) |
|
163 |
+ { |
|
164 |
+ lGammaRplusX = lgamma(this->size + this->obs[t]); |
|
165 |
+ lxfactorial = this->lxfactorials[(int) this->obs[t]]; |
|
166 |
+ logdens[t] = lGammaRplusX - lGammaR - lxfactorial + this->size * logp + this->obs[t] * log1minusp; |
|
167 |
+ FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t]; |
|
168 |
+ if (isnan(logdens[t])) |
|
169 |
+ { |
|
170 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
171 |
+ FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t]; |
|
172 |
+ throw nan_detected; |
|
173 |
+ } |
|
174 |
+ } |
|
175 |
+ } |
|
176 |
+} |
|
125 | 177 |
|
126 |
-void NegativeBinomial::calc_logdensities(double* logdensity) |
|
178 |
+void NegativeBinomial::calc_densities(double* dens) |
|
127 | 179 |
{ |
128 | 180 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
129 |
- for (int t=0; t<this->T; t++) |
|
181 |
+ double logp = log(this->prob); |
|
182 |
+ double log1minusp = log(1-this->prob); |
|
183 |
+ double lGammaR,lGammaRplusX,lxfactorial; |
|
184 |
+ lGammaR=lgamma(this->size); |
|
185 |
+ // Select strategy for computing gammas |
|
186 |
+ if (this->max_obs <= this->T) |
|
130 | 187 |
{ |
131 |
- logdensity[t] = dnbinom(this->obs[t], this->size, this->prob, 1); |
|
188 |
+ FILE_LOG(logDEBUG2) << "Precomputing gammas in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
189 |
+ double dens_per_read [this->max_obs+1]; |
|
190 |
+ for (int j=0; j<=this->max_obs; j++) |
|
191 |
+ { |
|
192 |
+ dens_per_read[j] = exp( lgamma(this->size + j) - lGammaR - lxfactorials[j] + this->size * logp + j * log1minusp ); |
|
193 |
+ } |
|
194 |
+ for (int t=0; t<this->T; t++) |
|
195 |
+ { |
|
196 |
+ dens[t] = dens_per_read[(int) this->obs[t]]; |
|
197 |
+ FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t]; |
|
198 |
+ if (isnan(dens[t])) |
|
199 |
+ { |
|
200 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
201 |
+ FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t]; |
|
202 |
+ throw nan_detected; |
|
203 |
+ } |
|
204 |
+ } |
|
132 | 205 |
} |
133 |
-} |
|
206 |
+ else |
|
207 |
+ { |
|
208 |
+ FILE_LOG(logDEBUG2) << "Computing gammas in " << __func__ << " for every t, because max(O)>T"; |
|
209 |
+ for (int t=0; t<this->T; t++) |
|
210 |
+ { |
|
211 |
+ lGammaRplusX = lgamma(this->size + this->obs[t]); |
|
212 |
+ lxfactorial = this->lxfactorials[(int) this->obs[t]]; |
|
213 |
+ dens[t] = exp( lGammaRplusX - lGammaR - lxfactorial + this->size * logp + this->obs[t] * log1minusp ); |
|
214 |
+ FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t]; |
|
215 |
+ if (isnan(dens[t])) |
|
216 |
+ { |
|
217 |
+ FILE_LOG(logERROR) << __PRETTY_FUNCTION__; |
|
218 |
+ FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t]; |
|
219 |
+ throw nan_detected; |
|
220 |
+ } |
|
221 |
+ } |
|
222 |
+ } |
|
223 |
+} |
|
134 | 224 |
|
135 |
-void NegativeBinomial::update(double* weights) |
|
225 |
+void NegativeBinomial::update(double* weight) |
|
136 | 226 |
{ |
137 | 227 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
138 | 228 |
double eps = 1e-4, kmax; |
139 |
- double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR, logp; |
|
229 |
+ double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR; |
|
140 | 230 |
// Update p |
141 | 231 |
numerator=denominator=0.0; |
142 |
- clock_t time, dtime; |
|
143 |
- time = clock(); |
|
232 |
+// clock_t time, dtime; |
|
233 |
+// time = clock(); |
|
144 | 234 |
for (int t=0; t<this->T; t++) |
145 | 235 |
{ |
146 |
- numerator+=weights[t]*this->size; |
|
147 |
- denominator+=weights[t]*(this->size+this->obs[t]); |
|
236 |
+ numerator+=weight[t]*this->size; |
|
237 |
+ denominator+=weight[t]*(this->size+this->obs[t]); |
|
148 | 238 |
} |
149 | 239 |
this->prob = numerator/denominator; // Update of r is now done with updated p |
150 |
- dtime = clock() - time; |
|
151 |
- FILE_LOG(logDEBUG1) << "update prob: "<<dtime<< " clicks"; |
|
240 |
+ double logp = log(this->prob); |
|
241 |
+// dtime = clock() - time; |
|
242 |
+// FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks"; |
|
152 | 243 |
// Update of r with Newton Method |
153 |
- logp = log(this->prob); |
|
154 | 244 |
rhere = this->size; |
155 | 245 |
dr = 0.00001; |
156 | 246 |
kmax = 20; |
157 |
- time = clock(); |
|
247 |
+// time = clock(); |
|
158 | 248 |
// Select strategy for computing digammas |
159 | 249 |
if (this->max_obs <= this->T) |
160 | 250 |
{ |
161 |
- FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(obs)<=T"; |
|
251 |
+ FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T"; |
|
162 | 252 |
double DigammaRplusX[this->max_obs+1], DigammaRplusDRplusX[this->max_obs+1]; |
163 | 253 |
for (int k=1; k<kmax; k++) |
164 | 254 |
{ |
165 | 255 |
Fr=dFrdr=0.0; |
166 |
- DigammaR = digamma(rhere); |
|
167 |
- DigammaRplusDR = digamma(rhere + dr); |
|
256 |
+ DigammaR = digamma(rhere); // boost::math::digamma<>(rhere); |
|
257 |
+ DigammaRplusDR = digamma(rhere + dr); // boost::math::digamma<>(rhere+dr); |
|
168 | 258 |
// Precompute the digammas by iterating over all possible values of the observation vector |
169 | 259 |
for (int j=0; j<=this->max_obs; j++) |
170 | 260 |
{ |
... | ... |
@@ -175,13 +265,13 @@ void NegativeBinomial::update(double* weights) |
175 | 265 |
{ |
176 | 266 |
if(this->obs[t]==0) |
177 | 267 |
{ |
178 |
- Fr+=weights[t]*logp; |
|
268 |
+ Fr+=weight[t]*logp; |
|
179 | 269 |
//dFrdr+=0; |
180 | 270 |
} |
181 | 271 |
if(this->obs[t]!=0) |
182 | 272 |
{ |
183 |
- Fr+=weights[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]); |
|
184 |
- dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]); |
|
273 |
+ Fr+=weight[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]); |
|
274 |
+ dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]); |
|
185 | 275 |
} |
186 | 276 |
} |
187 | 277 |
if(fabs(Fr)<eps) |
... | ... |
@@ -194,25 +284,26 @@ void NegativeBinomial::update(double* weights) |
194 | 284 |
} |
195 | 285 |
else |
196 | 286 |
{ |
197 |
- FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(obs)>T"; |
|
287 |
+ FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T"; |
|
198 | 288 |
double DigammaRplusX, DigammaRplusDRplusX; |
199 | 289 |
for (int k=1; k<kmax; k++) |
200 | 290 |
{ |
201 | 291 |
Fr = dFrdr = 0.0; |
202 |
- DigammaR = digamma(rhere); |
|
203 |
- DigammaRplusDR = digamma(rhere + dr); |
|
292 |
+ DigammaR = digamma(rhere); // boost::math::digamma<>(rhere); |
|
293 |
+ DigammaRplusDR = digamma(rhere + dr); // boost::math::digamma<>(rhere+dr); |
|
204 | 294 |
for(int t=0; t<this->T; t++) |
205 | 295 |
{ |
206 |
- DigammaRplusX = digamma(rhere+this->obs[t]); |
|
207 |
- DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]); |
|
296 |
+ DigammaRplusX = digamma(rhere+this->obs[t]); //boost::math::digamma<>(rhere+this->obs[ti]); |
|
297 |
+ DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]); |
|
208 | 298 |
if(this->obs[t]==0) |
209 | 299 |
{ |
210 |
- Fr+=weights[t]*logp; |
|
300 |
+ Fr+=weight[t]*logp; |
|
301 |
+ //dFrdr+=0; |
|
211 | 302 |
} |
212 | 303 |
if(this->obs[t]!=0) |
213 | 304 |
{ |
214 |
- Fr+=weights[t]*(logp-DigammaR+DigammaRplusX); |
|
215 |
- dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX); |
|
305 |
+ Fr+=weight[t]*(logp-DigammaR+DigammaRplusX); |
|
306 |
+ dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX); |
|
216 | 307 |
} |
217 | 308 |
} |
218 | 309 |
if(fabs(Fr)<eps) |
... | ... |
@@ -224,74 +315,30 @@ void NegativeBinomial::update(double* weights) |
224 | 315 |
} |
225 | 316 |
} |
226 | 317 |
this->size = rhere; |
227 |
- FILE_LOG(logDEBUG1) << "size = "<<this->size << ", prob = "<<this->prob; |
|
228 |
- |
|
229 |
- // Update mean and variance |
|
230 |
- this->mean = this->fmean(this->size, this->prob); |
|
231 |
- this->variance = this->fvariance(this->size, this->prob); |
|
318 |
+ FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob; |
|
232 | 319 |
|
233 |
- dtime = clock() - time; |
|
234 |
- FILE_LOG(logDEBUG1) << "update size: "<<dtime<< " clicks"; |
|
320 |
+// dtime = clock() - time; |
|
321 |
+// FILE_LOG(logDEBUG1) << "updateR(): "<<dtime<< " clicks"; |
|
235 | 322 |
|
236 | 323 |
} |
237 | 324 |
|
238 |
-double NegativeBinomial::fsize(double mean, double variance) |
|
239 |
-{ |
|
240 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
241 |
- return( mean*mean / (variance - mean) ); |
|
242 |
-} |
|
243 |
- |
|
244 |
-double NegativeBinomial::fprob(double mean, double variance) |
|
245 |
-{ |
|
246 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
247 |
- return( mean / variance ); |
|
248 |
-} |
|
249 |
- |
|
250 |
-double NegativeBinomial::fmean(double size, double prob) |
|
251 |
-{ |
|
252 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
253 |
- return( size/prob - size ); |
|
254 |
-} |
|
255 |
- |
|
256 |
-double NegativeBinomial::fvariance(double size, double prob) |
|
257 |
-{ |
|
258 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
259 |
- return( (size - prob*size) / (prob*prob) ); |
|
260 |
-} |
|
261 |
- |
|
262 | 325 |
// Getter and Setter ------------------------------------------ |
263 |
-DensityName NegativeBinomial::get_name() |
|
264 |
-{ |
|
265 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
266 |
- return(NEGATIVE_BINOMIAL); |
|
267 |
-} |
|
268 |
- |
|
269 | 326 |
double NegativeBinomial::get_mean() |
270 | 327 |
{ |
271 | 328 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
272 |
- return(this->mean); |
|
273 |
-} |
|
274 |
- |
|
275 |
-void NegativeBinomial::set_mean(double mean) |
|
276 |
-{ |
|
277 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
278 |
- this->mean = mean; |
|
279 |
- this->size = fsize(mean, this->variance); |
|
280 |
- this->prob = fprob(mean, this->variance); |
|
329 |
+ return this->size*(1-this->prob)/this->prob; |
|
281 | 330 |
} |
282 | 331 |
|
283 | 332 |
double NegativeBinomial::get_variance() |
284 | 333 |
{ |
285 | 334 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
286 |
- return(this->variance); |
|
335 |
+ return this->size*(1-this->prob)/this->prob/this->prob; |
|
287 | 336 |
} |
288 | 337 |
|
289 |
-void NegativeBinomial::set_variance(double variance) |
|
338 |
+DensityName NegativeBinomial::get_name() |
|
290 | 339 |
{ |
291 | 340 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
292 |
- this->variance = variance; |
|
293 |
- this->size = fsize(mean, this->variance); |
|
294 |
- this->prob = fprob(mean, this->variance); |
|
341 |
+ return(NEGATIVE_BINOMIAL); |
|
295 | 342 |
} |
296 | 343 |
|
297 | 344 |
double NegativeBinomial::get_size() |
... | ... |
@@ -85,6 +85,7 @@ class NegativeBinomial : public Density |
85 | 85 |
double mean; ///< mean of the negative binomial |
86 | 86 |
double variance; ///< variance of the negative binomial |
87 | 87 |
int max_obs; ///< maximum observation |
88 |
+ double* lxfactorials; ///< vector of precomputed factorials (x!) |
|
88 | 89 |
|
89 | 90 |
// Methods |
90 | 91 |
double fsize(double mean, double variance); |
91 | 92 |
deleted file mode 100644 |
... | ... |
@@ -1,860 +0,0 @@ |
1 |
-#include "loghmm.h" |
|
2 |
- |
|
3 |
-/* initialize the model */ |
|
4 |
-LogHMM::LogHMM(int T, int N) |
|
5 |
-{ |
|
6 |
- FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
7 |
- FILE_LOG(logDEBUG2) << "Initializing univariate LogHMM"; |
|
8 |
- this->xvariate = UNIVARIATE; |
|
9 |
- this->T = T; |
|
10 |
- this->N = N; |
|
11 |
- this->A = allocDoubleMatrix(N, N); |
|
12 |
- this->logA = allocDoubleMatrix(N, N); |
|
13 |
- this->logalpha = allocDoubleMatrix(T, N); |
|
14 |
- this->logbeta = allocDoubleMatrix(T, N); |