... | ... |
@@ -34,7 +34,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
34 | 34 |
tab5rows <- read.table(file, nrows=5) |
35 | 35 |
classes.in.bed <- sapply(tab5rows, class) |
36 | 36 |
classes <- rep("NULL",length(classes.in.bed)) |
37 |
- classes[1:3] <- classes.in.bed[1:3] |
|
37 |
+ classes[c(1:3,6)] <- classes.in.bed[c(1:3,6)] |
|
38 | 38 |
data <- read.table(file, colClasses=classes) |
39 | 39 |
# Convert to GRanges object |
40 | 40 |
data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]+1), strand=Rle(strand("*"), nrow(data))) # +1 to match coordinate systems |
... | ... |
@@ -86,11 +86,15 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
86 | 86 |
chroms2use <- intersect(chroms2use, names(chrom.lengths)) |
87 | 87 |
|
88 | 88 |
## Determine binsize automatically |
89 |
- if (!is.null(reads.per.bin)) { |
|
89 |
+# if (!is.null(reads.per.bin)) { |
|
90 | 90 |
cat('Automatically determining binsizes...') |
91 | 91 |
gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), |
92 | 92 |
ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
93 |
- autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr),flag=scanBamFlag(isDuplicate=F))) |
|
93 |
+ if (format=='bam') { |
|
94 |
+ autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr),flag=scanBamFlag(isDuplicate=F))) |
|
95 |
+ } else if (format=='bed' | format=='bedGraph') { |
|
96 |
+ autodata <- data |
|
97 |
+ } |
|
94 | 98 |
numreadsperbp <- length(autodata) / sum(as.numeric(chrom.lengths[chroms2use])) |
95 | 99 |
## Pad binsizes and reads.per.bin with each others value |
96 | 100 |
binsizes.add <- round(reads.per.bin / numreadsperbp, -2) |
... | ... |
@@ -98,7 +102,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
98 | 102 |
binsizes <- c(binsizes, binsizes.add) |
99 | 103 |
reads.per.bin <- c(reads.per.bin.add, reads.per.bin) |
100 | 104 |
cat(' done\n') |
101 |
- } |
|
105 |
+# } |
|
102 | 106 |
|
103 | 107 |
### Do the loop for all binsizes |
104 | 108 |
if (is.null(numbins)) { |
... | ... |
@@ -167,7 +171,9 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
167 | 171 |
## Count overlaps |
168 | 172 |
cat("counting overlaps... \r") |
169 | 173 |
if (format=="bam" | format=="bed") { |
170 |
- reads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome]) |
|
174 |
+ mreads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome & strand(data)=='-']) |
|
175 |
+ preads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome & strand(data)=='+']) |
|
176 |
+ reads <- mreads + preads |
|
171 | 177 |
} else if (format=="bedGraph") { |
172 | 178 |
# Take the max value from all regions that fall into / overlap a given bin as read count |
173 | 179 |
midx <- as.matrix(findOverlaps(i.binned.data, data[seqnames(data)==chromosome])) |
... | ... |
@@ -182,14 +188,20 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
182 | 188 |
maxvalues[i1] <- max(signal[midx[(max.idx[i1-1]+1):(max.idx[i1]),2]]) |
183 | 189 |
} |
184 | 190 |
reads[read.idx] <- maxvalues |
191 |
+ mreads <- rep(NA, length(reads)) |
|
192 |
+ preads <- rep(NA, length(reads)) |
|
185 | 193 |
} |
186 | 194 |
|
187 | 195 |
## Concatenate |
188 | 196 |
cat("concatenate... \r") |
189 | 197 |
if (is.null(numbins)) { |
190 | 198 |
mcols(i.binned.data)$reads <- reads |
199 |
+ mcols(i.binned.data)$mreads <- mreads |
|
200 |
+ mcols(i.binned.data)$preads <- preads |
|
191 | 201 |
} else { |
192 | 202 |
mcols(i.binned.data)$reads <- reads / (end - start) |
203 |
+ mcols(i.binned.data)$mreads <- mreads / (end - start) |
|
204 |
+ mcols(i.binned.data)$preads <- preads / (end - start) |
|
193 | 205 |
} |
194 | 206 |
|
195 | 207 |
if (separate.chroms==TRUE) { |
... | ... |
@@ -197,9 +209,9 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
197 | 209 |
if (save.as.RData==TRUE) { |
198 | 210 |
## Print to file |
199 | 211 |
if (is.null(numbins)) { |
200 |
- filename <- paste0(basename(file),"_binsize_",binsize,"_reads.per.bin_",readsperbin,"_",chromosome,".RData") |
|
212 |
+ filename <- paste0(basename(file),"_binsize_",format(binsize, scientific=F, trim=T),"_reads.per.bin_",readsperbin,"_",chromosome,"_.RData") |
|
201 | 213 |
} else { |
202 |
- filename <- paste0(basename(file),"_numbin_",numbin,"_",chromosome,".RData") |
|
214 |
+ filename <- paste0(basename(file),"_numbin_",format(numbin, scientific=F, trim=T),"_",chromosome,"_.RData") |
|
203 | 215 |
} |
204 | 216 |
cat("save... \r") |
205 | 217 |
save(binned.data, file=file.path(outputfolder,filename) ) |
... | ... |
@@ -217,9 +229,9 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
217 | 229 |
if (save.as.RData==TRUE) { |
218 | 230 |
# Print to file |
219 | 231 |
if (is.null(numbins)) { |
220 |
- filename <- paste(basename(file),"_binsize_",binsize,"_reads.per.bin_",readsperbin,".RData", sep="") |
|
232 |
+ filename <- paste0(basename(file),"_binsize_",format(binsize, scientific=F, trim=T),"_reads.per.bin_",readsperbin,"_.RData") |
|
221 | 233 |
} else { |
222 |
- filename <- paste(basename(file),"_numbin_",numbin,".RData", sep="") |
|
234 |
+ filename <- paste0(basename(file),"_numbin_",format(numbin, scientific=F, trim=T),"_.RData") |
|
223 | 235 |
} |
224 | 236 |
cat("Saving to file ...") |
225 | 237 |
save(binned.data, file=file.path(outputfolder,filename) ) |
... | ... |
@@ -51,12 +51,16 @@ check.integer = function(testvar) { |
51 | 51 |
check.univariate.modellist = function(modellist) { |
52 | 52 |
if (!is(modellist,"list")) return(1) |
53 | 53 |
for (model in modellist) { |
54 |
- if (!is(model,class.aneufinder.hmm)) return(2) |
|
54 |
+ if (!is(model,class.univariate.hmm)) return(2) |
|
55 | 55 |
} |
56 | 56 |
return(0) |
57 | 57 |
} |
58 | 58 |
check.univariate.model = function(model) { |
59 |
- if (!is(model,class.aneufinder.hmm)) return(1) |
|
59 |
+ if (!is(model,class.univariate.hmm)) return(1) |
|
60 |
+ return(0) |
|
61 |
+} |
|
62 |
+check.multivariate.model = function(model) { |
|
63 |
+ if (!is(model,class.multivariate.hmm)) return(1) |
|
60 | 64 |
return(0) |
61 | 65 |
} |
62 | 66 |
|
... | ... |
@@ -72,7 +72,7 @@ hmm2GRanges <- function(hmm, reduce=TRUE) { |
72 | 72 |
|
73 | 73 |
# library(GenomicRanges) |
74 | 74 |
### Check user input ### |
75 |
- if (check.univariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate hmm object (type ?hmm for help)") |
|
75 |
+ if (check.univariate.model(hmm)!=0 & check.multivariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate or multivariate hmm object (type ?hmm for help)") |
|
76 | 76 |
if (check.logical(reduce)!=0) stop("argument 'reduce' expects TRUE or FALSE") |
77 | 77 |
|
78 | 78 |
### Create GRanges ### |
... | ... |
@@ -94,6 +94,9 @@ hmm2GRanges <- function(hmm, reduce=TRUE) { |
94 | 94 |
for (state in ustates) { |
95 | 95 |
red.gr <- GenomicRanges::reduce(gr[hmm$states==state]) |
96 | 96 |
mcols(red.gr)$state <- rep(factor(state, levels=levels),length(red.gr)) |
97 |
+ if (class(hmm)==class.multivariate.hmm) { |
|
98 |
+ mcols(red.gr)$state.separate <- matrix(rep(strsplit(as.character(state), split=' ')[[1]], length(red.gr)), byrow=T, nrow=length(red.gr)) |
|
99 |
+ } |
|
97 | 100 |
red.gr.list[[length(red.gr.list)+1]] <- red.gr |
98 | 101 |
} |
99 | 102 |
# Merge and sort |
... | ... |
@@ -104,6 +107,9 @@ hmm2GRanges <- function(hmm, reduce=TRUE) { |
104 | 107 |
mcols(gr)$reads <- hmm$reads |
105 | 108 |
mcols(gr)$posteriors <- hmm$posteriors |
106 | 109 |
mcols(gr)$state <- hmm$states |
110 |
+ if (class(hmm)==class.multivariate.hmm) { |
|
111 |
+ mcols(gr)$state.separate <- as.matrix(hmm$states.separate) |
|
112 |
+ } |
|
107 | 113 |
return(gr) |
108 | 114 |
} |
109 | 115 |
|
... | ... |
@@ -1,4 +1,17 @@ |
1 |
-findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) { |
|
1 |
+findCNVs <- function(binned.data, ID, method='univariate', eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) { |
|
2 |
+ |
|
3 |
+ if (method=='univariate') { |
|
4 |
+ model <- univariate.findCNVs(binned.data, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile) |
|
5 |
+ } else if (method=='bivariate') { |
|
6 |
+ model <- bivariate.findCNVs(binned.data, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile) |
|
7 |
+ } |
|
8 |
+ |
|
9 |
+ return(model) |
|
10 |
+ |
|
11 |
+} |
|
12 |
+ |
|
13 |
+ |
|
14 |
+univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) { |
|
2 | 15 |
|
3 | 16 |
## Intercept user input |
4 | 17 |
IDcheck <- ID #trigger error if not defined |
... | ... |
@@ -32,12 +45,14 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max |
32 | 45 |
} |
33 | 46 |
|
34 | 47 |
# Filter high reads out, makes HMM faster |
35 |
- read.cutoff <- as.integer(quantile(reads, read.cutoff.quantile)) |
|
48 |
+ read.cutoff <- quantile(reads, read.cutoff.quantile) |
|
49 |
+ names.read.cutoff <- names(read.cutoff) |
|
50 |
+ read.cutoff <- as.integer(read.cutoff) |
|
36 | 51 |
mask <- reads > read.cutoff |
37 | 52 |
reads[mask] <- read.cutoff |
38 | 53 |
numfiltered <- length(which(mask)) |
39 | 54 |
if (numfiltered > 0) { |
40 |
- cat(paste0("Replaced read counts > ",read.cutoff," (",names(read.cutoff)," quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering.\n")) |
|
55 |
+ cat(paste0("Replaced read counts > ",read.cutoff," (",names.read.cutoff," quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n")) |
|
41 | 56 |
} |
42 | 57 |
|
43 | 58 |
## Call univariate in a for loop to enable multiple trials |
... | ... |
@@ -81,6 +96,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max |
81 | 96 |
reads = as.integer(reads), # int* O |
82 | 97 |
num.bins = as.integer(numbins), # int* T |
83 | 98 |
num.states = as.integer(numstates), # int* N |
99 |
+ state.labels = as.integer(state.labels), # int* state_labels |
|
84 | 100 |
size = double(length=numstates), # double* size |
85 | 101 |
prob = double(length=numstates), # double* prob |
86 | 102 |
lambda = double(length=numstates), # double* lambda |
... | ... |
@@ -128,6 +144,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max |
128 | 144 |
reads = as.integer(reads), # int* O |
129 | 145 |
num.bins = as.integer(numbins), # int* T |
130 | 146 |
num.states = as.integer(numstates), # int* N |
147 |
+ state.labels = as.integer(state.labels), # int* state_labels |
|
131 | 148 |
size = double(length=numstates), # double* size |
132 | 149 |
prob = double(length=numstates), # double* prob |
133 | 150 |
lambda = double(length=numstates), # double* lambda |
... | ... |
@@ -160,8 +177,9 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max |
160 | 177 |
hmm$coordinates <- data.frame(as.character(seqnames(binned.data)), start(ranges(binned.data)), end(ranges(binned.data))) |
161 | 178 |
names(hmm$coordinates) <- coordinate.names |
162 | 179 |
hmm$seqlengths <- seqlengths(binned.data) |
163 |
- class(hmm) <- class.aneufinder.hmm |
|
164 |
- hmm$states <- factor(state.labels, levels=state.labels)[hmm$states+1] |
|
180 |
+ class(hmm) <- class.univariate.hmm |
|
181 |
+ hmm$states <- state.labels[hmm$states] |
|
182 |
+ hmm$state.labels <- state.labels |
|
165 | 183 |
hmm$eps <- eps |
166 | 184 |
hmm$A <- matrix(hmm$A, ncol=hmm$num.states) |
167 | 185 |
rownames(hmm$A) <- state.labels |
... | ... |
@@ -224,3 +242,225 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max |
224 | 242 |
return(hmm) |
225 | 243 |
} |
226 | 244 |
} |
245 |
+ |
|
246 |
+ |
|
247 |
+bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) { |
|
248 |
+ |
|
249 |
+ ### Split into strands and run univariate findCNVs |
|
250 |
+ binned.data.plus <- binned.data |
|
251 |
+ binned.data.plus$reads <- binned.data$preads |
|
252 |
+ plus.model <- univariate.findCNVs(binned.data.plus, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile) |
|
253 |
+ binned.data.minus <- binned.data |
|
254 |
+ binned.data.minus$reads <- binned.data$mreads |
|
255 |
+ minus.model <- univariate.findCNVs(binned.data.minus, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile) |
|
256 |
+ models <- list(plus=plus.model, minus=minus.model) |
|
257 |
+ |
|
258 |
+ ### Prepare the multivariate HMM |
|
259 |
+ ## Extract reads and other stuff |
|
260 |
+ coordinates <- models[[1]]$coordinates |
|
261 |
+ seqlengths <- models[[1]]$seqlengths |
|
262 |
+ distributions <- lapply(models, '[[', 'distributions') |
|
263 |
+ weights <- lapply(models, '[[', 'weights') |
|
264 |
+ num.uni.states <- models[[1]]$num.states |
|
265 |
+ num.models <- length(models) |
|
266 |
+ num.bins <- models[[1]]$num.bins |
|
267 |
+ comb.states <- paste(levels(models[[1]]$states), levels(models[[2]]$states)) |
|
268 |
+ comb.states <- factor(comb.states, levels=comb.states) |
|
269 |
+ comb.states.per.bin <- factor(do.call(paste, lapply(models, '[[', 'states')), levels=levels(comb.states)) |
|
270 |
+ num.comb.states <- length(comb.states) |
|
271 |
+ reads <- matrix(c(models$plus$reads, models$minus$reads), ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models))) |
|
272 |
+ maxreads <- max(reads) |
|
273 |
+ |
|
274 |
+ ## Pre-compute z-values for each number of reads |
|
275 |
+ cat("Computing pre z-matrix...") |
|
276 |
+ z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states)) |
|
277 |
+ xreads <- 0:maxreads |
|
278 |
+ for (istrand in 1:num.models) { |
|
279 |
+ model <- models[[istrand]] |
|
280 |
+ for (istate in 1:num.uni.states) { |
|
281 |
+ if (model$distributions[istate,'type']=='dnbinom') { |
|
282 |
+ size <- model$distributions[istate,'size'] |
|
283 |
+ prob <- model$distributions[istate,'prob'] |
|
284 |
+ u <- pnbinom(xreads, size, prob) |
|
285 |
+ } else if (model$distributions[istate,'type']=='delta') { |
|
286 |
+ u <- rep(1, length(xreads)) |
|
287 |
+ } |
|
288 |
+ qnorm_u <- qnorm(u) |
|
289 |
+ mask <- qnorm_u==Inf |
|
290 |
+ qnorm_u[mask] <- qnorm(1-1e-16) |
|
291 |
+ z.per.read[ , istrand, istate] <- qnorm_u |
|
292 |
+ } |
|
293 |
+ } |
|
294 |
+ cat(" done\n") |
|
295 |
+ |
|
296 |
+ ## Compute the z matrix |
|
297 |
+ cat("Transfering values into z-matrix...") |
|
298 |
+ z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(models), levels(modellist[[1]]$states))) |
|
299 |
+ for (istrand in 1:num.models) { |
|
300 |
+ model <- models[[istrand]] |
|
301 |
+ for (istate in 1:num.uni.states) { |
|
302 |
+ z.per.bin[ , istrand, istate] = z.per.read[reads[,istrand]+1, istrand, istate] |
|
303 |
+ } |
|
304 |
+ } |
|
305 |
+ remove(z.per.read) |
|
306 |
+ cat(" done\n") |
|
307 |
+ |
|
308 |
+ ## Calculate correlation matrix |
|
309 |
+ cat("Computing inverse of correlation matrix...") |
|
310 |
+ correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(models), strand=names(models), comb.state=comb.states)) |
|
311 |
+ correlationMatrixInverse <- correlationMatrix |
|
312 |
+ determinant <- rep(0, num.comb.states) |
|
313 |
+ names(determinant) <- comb.states |
|
314 |
+ usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states |
|
315 |
+ names(usestateTF) <- comb.states |
|
316 |
+ for (state in comb.states) { |
|
317 |
+ istate <- strsplit(state, ' ')[[1]] |
|
318 |
+ mask <- which(comb.states.per.bin==state) |
|
319 |
+ # Subselect z |
|
320 |
+ z.temp <- matrix(NA, ncol=length(istate), nrow=length(mask)) |
|
321 |
+ for (i1 in 1:length(istate)) { |
|
322 |
+ z.temp[,i1] <- z.per.bin[mask, i1, istate[i1]] |
|
323 |
+ } |
|
324 |
+ temp <- tryCatch({ |
|
325 |
+ correlationMatrix[,,state] <- cor(z.temp) |
|
326 |
+ determinant[state] <- det( correlationMatrix[,,state] ) |
|
327 |
+ correlationMatrixInverse[,,state] <- solve(correlationMatrix[,,state]) |
|
328 |
+ if (length(z.temp)>0) { |
|
329 |
+ usestateTF[state] <- TRUE |
|
330 |
+ } else { |
|
331 |
+ usestateTF[state] <- FALSE |
|
332 |
+ } |
|
333 |
+ }, warning = function(war) { |
|
334 |
+ usestateTF[state] <<- FALSE |
|
335 |
+ war |
|
336 |
+ }, error = function(err) { |
|
337 |
+ usestateTF[state] <<- FALSE |
|
338 |
+ err |
|
339 |
+ }) |
|
340 |
+ } |
|
341 |
+# remove(z.per.bin) |
|
342 |
+ cat(" done\n") |
|
343 |
+ |
|
344 |
+ # Use nullsomy state anyways |
|
345 |
+ usestateTF[1] <- TRUE |
|
346 |
+ correlationMatrix = correlationMatrix[,,usestateTF] |
|
347 |
+ correlationMatrixInverse = correlationMatrixInverse[,,usestateTF] |
|
348 |
+ comb.states = comb.states[usestateTF] |
|
349 |
+ comb.states <- droplevels(comb.states) |
|
350 |
+ determinant = determinant[usestateTF] |
|
351 |
+ num.comb.states <- length(comb.states) |
|
352 |
+ |
|
353 |
+ ## Calculate multivariate densities for each state |
|
354 |
+ cat("Calculating multivariate densities...") |
|
355 |
+ densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states)) |
|
356 |
+ for (state in comb.states) { |
|
357 |
+ i <- which(state==comb.states) |
|
358 |
+ istate <- strsplit(state, ' ')[[1]] |
|
359 |
+ product <- 1 |
|
360 |
+ for (istrand in 1:num.models) { |
|
361 |
+ if (models[[istrand]]$distributions[istate[istrand],'type'] == 'dnbinom') { |
|
362 |
+ exponent <- -0.5 * apply( ( z.per.bin[ , , i] %*% (correlationMatrixInverse[ , , i] - diag(num.models)) ) * z.per.bin[ , , i], 1, sum) |
|
363 |
+ size <- models[[istrand]]$distributions[istate[istrand],'size'] |
|
364 |
+ prob <- models[[istrand]]$distributions[istate[istrand],'prob'] |
|
365 |
+ product <- product * dnbinom(reads[,istrand], size, prob) |
|
366 |
+ densities[,i] <- product * determinant[i]^(-0.5) * exp( exponent ) |
|
367 |
+ } |
|
368 |
+ else if (models[[istrand]]$distributions[istate[istrand],'type'] == 'delta') { |
|
369 |
+ densities[,i] <- densities[,i] * ifelse(reads[,istrand]==0, 1, 0) |
|
370 |
+ } |
|
371 |
+ } |
|
372 |
+ } |
|
373 |
+ # Check if densities are > 1 |
|
374 |
+ if (any(densities>1)) stop("Densities > 1") |
|
375 |
+ if (any(densities<0)) stop("Densities < 0") |
|
376 |
+ densities[densities>1] <- 1 |
|
377 |
+ densities[densities<0] <- 0 |
|
378 |
+ # Check if densities are 0 everywhere in some bins |
|
379 |
+ check <- which(apply(densities, 1, sum) == 0) |
|
380 |
+ if (length(check)>0) { |
|
381 |
+ if (check[1]==1) { |
|
382 |
+ densities[1,] <- rep(1e-10, ncol(densities)) |
|
383 |
+ check <- check[-1] |
|
384 |
+ } |
|
385 |
+ for (icheck in check) { |
|
386 |
+ densities[icheck,] <- densities[icheck-1,] |
|
387 |
+ } |
|
388 |
+ } |
|
389 |
+ cat(" done\n") |
|
390 |
+ |
|
391 |
+ |
|
392 |
+ ### Run the multivariate HMM |
|
393 |
+ # Call the C function |
|
394 |
+ use.initial <- FALSE |
|
395 |
+ hmm <- .C("R_multivariate_hmm", |
|
396 |
+ densities = as.double(densities), # double* D |
|
397 |
+ num.bins = as.integer(num.bins), # int* T |
|
398 |
+ num.comb.states = as.integer(num.comb.states), # int* N |
|
399 |
+ num.strands = as.integer(num.models), # int* Nmod |
|
400 |
+ comb.states = as.integer(comb.states), # int* comb_states |
|
401 |
+ num.iterations = as.integer(max.iter), # int* maxiter |
|
402 |
+ time.sec = as.integer(max.time), # double* maxtime |
|
403 |
+ loglik.delta = as.double(eps), # double* eps |
|
404 |
+ states = integer(length=num.bins), # int* states |
|
405 |
+ A = double(length=num.comb.states*num.comb.states), # double* A |
|
406 |
+ proba = double(length=num.comb.states), # double* proba |
|
407 |
+ loglik = double(length=1), # double* loglik |
|
408 |
+ A.initial = double(length=num.comb.states*num.comb.states), # double* initial_A |
|
409 |
+ proba.initial = double(length=num.comb.states), # double* initial_proba |
|
410 |
+ use.initial.params = as.logical(use.initial), # bool* use_initial_params |
|
411 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
412 |
+ error = as.integer(0) # error handling |
|
413 |
+ ) |
|
414 |
+ |
|
415 |
+ # Add useful entries |
|
416 |
+ class(hmm) <- class.multivariate.hmm |
|
417 |
+ hmm$coordinates <- coordinates |
|
418 |
+ hmm$seqlengths <- seqlengths |
|
419 |
+ hmm$densities <- densities # reassign because of matrix layout |
|
420 |
+ hmm$reads <- reads |
|
421 |
+ hmm$states <- comb.states[hmm$states] |
|
422 |
+ # Get states as factors in data.frame |
|
423 |
+ matrix.states <- matrix(unlist(strsplit(as.character(hmm$states), split=' ')), byrow=T, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models))) |
|
424 |
+ toFactor <- function(column) { factor(column, levels=levels(models[[1]]$states)) } |
|
425 |
+ hmm$states.separate <- list() |
|
426 |
+ for (i1 in 1:num.models) { |
|
427 |
+ hmm$states.separate[[names(models)[i1]]] <- toFactor(matrix.states[,i1]) |
|
428 |
+ } |
|
429 |
+ hmm$states.separate <- as.data.frame(hmm$states.separate) |
|
430 |
+ hmm$comb.states <- comb.states |
|
431 |
+ hmm$eps <- eps |
|
432 |
+ hmm$A <- matrix(hmm$A, ncol=num.comb.states) |
|
433 |
+ colnames(hmm$A) <- comb.states |
|
434 |
+ rownames(hmm$A) <- comb.states |
|
435 |
+ names(hmm$proba) <- comb.states |
|
436 |
+ hmm$ID <- ID |
|
437 |
+ hmm$distributions.univariate <- distributions |
|
438 |
+ hmm$weights.univariate <- weights |
|
439 |
+ hmm$A.initial <- matrix(hmm$A.initial, ncol=num.comb.states) |
|
440 |
+ colnames(hmm$A.initial) <- comb.states |
|
441 |
+ rownames(hmm$A.initial) <- comb.states |
|
442 |
+ names(hmm$proba.initial) <- comb.states |
|
443 |
+ hmm$correlation.matrix <- correlationMatrix |
|
444 |
+ hmm$correlation.matrix.inverse <- correlationMatrixInverse |
|
445 |
+ |
|
446 |
+ # Delete redundant entries |
|
447 |
+ hmm$use.initial.params <- NULL |
|
448 |
+ |
|
449 |
+ # Check convergence |
|
450 |
+ war <- NULL |
|
451 |
+ if (hmm$loglik.delta > hmm$eps) { |
|
452 |
+ war <- warning("HMM did not converge!\n") |
|
453 |
+ } |
|
454 |
+ if (hmm$error == 1) { |
|
455 |
+ stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem.") |
|
456 |
+ } else if (hmm$error == 2) { |
|
457 |
+ stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.") |
|
458 |
+ } |
|
459 |
+ |
|
460 |
+ hmms <- list(bivariate=hmm, univariate.plus=models$plus, univariate.minus=models$minus) |
|
461 |
+ class(hmms) <- class.hmm.list |
|
462 |
+ return(hmms) |
|
463 |
+ |
|
464 |
+} |
|
465 |
+ |
|
466 |
+ |
... | ... |
@@ -1,11 +1,13 @@ |
1 | 1 |
# ======================================================= |
2 | 2 |
# Some global variables that can be used in all functions |
3 | 3 |
# ======================================================= |
4 |
-state.labels <- c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
4 |
+state.labels <- factor(c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")) |
|
5 | 5 |
state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dpois','dbinom')) |
6 | 6 |
coordinate.names <- c("chrom","start","end") |
7 | 7 |
binned.data.names <- c(coordinate.names,"reads") |
8 |
-class.aneufinder.hmm <- "aneufinder.hmm" |
|
8 |
+class.univariate.hmm <- "aneufinder.univariate.hmm" |
|
9 |
+class.multivariate.hmm <- "aneufinder.multivariate.hmm" |
|
10 |
+class.hmm.list <- "aneufinder.hmm.list" |
|
9 | 11 |
state.colors <- c("mapped"="gray68","nullsomy"="gray20","null-mixed"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black") |
10 | 12 |
get.state.labels <- function() { return(state.labels) } |
11 | 13 |
get.state.colors <- function() { return(state.colors[state.labels]) } |
... | ... |
@@ -48,7 +48,7 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
48 | 48 |
states <- model$states[selectmask] |
49 | 49 |
weights <- rep(NA, 3) |
50 | 50 |
for (i1 in 1:length(levels(model$states))) { |
51 |
- weights[i1] <- length(which(states==state.labels[i1])) |
|
51 |
+ weights[i1] <- length(which(states==model$state.labels[i1])) |
|
52 | 52 |
} |
53 | 53 |
weights <- weights / length(states) |
54 | 54 |
} else { |
... | ... |
@@ -66,6 +66,7 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
66 | 66 |
ggplt <- ggplot(data.frame(reads)) + geom_histogram(aes(x=reads, y=..density..), binwidth=1, color='black', fill='white') + xlim(0,rightxlim) + theme_bw() + xlab("read count") |
67 | 67 |
|
68 | 68 |
### Add fits to the histogram |
69 |
+ c.state.labels <- as.character(model$state.labels) |
|
69 | 70 |
numstates <- model$num.states |
70 | 71 |
x <- 0:rightxlim |
71 | 72 |
distributions <- list(x) |
... | ... |
@@ -92,12 +93,12 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
92 | 93 |
} |
93 | 94 |
} |
94 | 95 |
distributions <- as.data.frame(distributions) |
95 |
- names(distributions) <- c("x",state.labels) |
|
96 |
+ names(distributions) <- c("x",c.state.labels) |
|
96 | 97 |
# Total |
97 | 98 |
distributions$total <- apply(distributions[-1], 1, sum) |
98 | 99 |
|
99 | 100 |
# Reshape the data.frame for plotting with ggplot |
100 |
- distributions <- reshape(distributions, direction="long", varying=1+1:(numstates+1), v.names="density", timevar="state", times=c(state.labels,"total")) |
|
101 |
+ distributions <- reshape(distributions, direction="long", varying=1+1:(numstates+1), v.names="density", timevar="state", times=c(c.state.labels,"total")) |
|
101 | 102 |
### Plot the distributions |
102 | 103 |
if (is.null(state)) { |
103 | 104 |
ggplt <- ggplt + geom_line(aes(x=x, y=density, group=state, col=state), data=distributions) |
... | ... |
@@ -108,9 +109,9 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
108 | 109 |
# Make legend and colors correct |
109 | 110 |
lmeans <- round(model$distributions[,'mu'], 2) |
110 | 111 |
lvars <- round(model$distributions[,'variance'], 2) |
111 |
- legend <- paste(state.labels, ", mean=", lmeans, ", var=", lvars, sep='') |
|
112 |
+ legend <- paste(c.state.labels, ", mean=", lmeans, ", var=", lvars, sep='') |
|
112 | 113 |
legend <- c(legend, paste0('total, mean(data)=', round(mean(reads),2), ', var(data)=', round(var(reads),2))) |
113 |
- ggplt <- ggplt + scale_color_manual(breaks=c(state.labels, 'total'), values=state.colors[c(state.labels,'total')], labels=legend) |
|
114 |
+ ggplt <- ggplt + scale_color_manual(breaks=c(c.state.labels, 'total'), values=state.colors[c(c.state.labels,'total')], labels=legend) |
|
114 | 115 |
ggplt <- ggplt + theme(legend.position=c(1,1), legend.justification=c(1,1)) |
115 | 116 |
|
116 | 117 |
return(ggplt) |
... | ... |
@@ -139,9 +140,22 @@ plot.boxplot <- function(model) { |
139 | 140 |
|
140 | 141 |
|
141 | 142 |
# ------------------------------------------------------------ |
142 |
-# Plot like BAIT |
|
143 |
+# Plot state categorization for all chromosomes |
|
143 | 144 |
# ------------------------------------------------------------ |
144 |
-plot.BAIT <- function(model, file='aneufinder_BAIT_plots') { |
|
145 |
+plot.chromosomes <- function(model, file=NULL) { |
|
146 |
+ |
|
147 |
+ if (class(model)==class.univariate.hmm) { |
|
148 |
+ plot.chromosomes.univariate(model, file=file) |
|
149 |
+ } else if (class(model)==class.multivariate.hmm) { |
|
150 |
+ plot.chromosomes.bivariate(model, file=file) |
|
151 |
+ } |
|
152 |
+ |
|
153 |
+} |
|
154 |
+ |
|
155 |
+# ------------------------------------------------------------ |
|
156 |
+# Plot state categorization for all chromosomes |
|
157 |
+# ------------------------------------------------------------ |
|
158 |
+plot.chromosomes.univariate <- function(model, file=NULL) { |
|
145 | 159 |
|
146 | 160 |
## Convert to GRanges |
147 | 161 |
gr <- hmm2GRanges(model, reduce=F) |
... | ... |
@@ -157,12 +171,14 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') { |
157 | 171 |
library(ggplot2) |
158 | 172 |
nrows <- 2 # rows for plotting chromosomes |
159 | 173 |
ncols <- ceiling(num.chroms/nrows) |
160 |
- png(file=paste0(file, '.png'), width=ncols*6, height=nrows*21, units='cm', res=150) |
|
174 |
+ if (!is.null(file)) { |
|
175 |
+ pdf(file=paste0(file, '.pdf'), width=ncols*2.4, height=nrows*8.3) |
|
176 |
+ } |
|
161 | 177 |
grid.newpage() |
162 | 178 |
layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T) |
163 | 179 |
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,21,21)))) |
164 | 180 |
# Main title |
165 |
- grid.text(file, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26)) |
|
181 |
+ grid.text(model$ID, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26)) |
|
166 | 182 |
|
167 | 183 |
## Go through chromosomes and plot |
168 | 184 |
for (i1 in 1:num.chroms) { |
... | ... |
@@ -181,8 +197,10 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') { |
181 | 197 |
|
182 | 198 |
# Plot the read counts |
183 | 199 |
dfplot <- as.data.frame(grl[[i1]]) |
184 |
- dfplot.points <- dfplot[dfplot$reads>=custom.xlim,] |
|
185 |
- dfplot$reads <- stats::runmed(dfplot$reads, 11) |
|
200 |
+ # Set values to great for plotting to limit |
|
201 |
+ dfplot$reads[dfplot$reads>=custom.xlim] <- custom.xlim |
|
202 |
+ dfplot.points <- dfplot[dfplot$reads>=custom.xlim,] |
|
203 |
+ dfplot.points$reads <- rep(custom.xlim, nrow(dfplot.points)) |
|
186 | 204 |
empty_theme <- theme(axis.line=element_blank(), |
187 | 205 |
axis.text.x=element_blank(), |
188 | 206 |
axis.text.y=element_blank(), |
... | ... |
@@ -197,19 +215,111 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') { |
197 | 215 |
plot.background=element_blank()) |
198 | 216 |
ggplt <- ggplot(dfplot, aes(x=start, y=reads)) # data |
199 | 217 |
ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads, col=state), size=0.2) # read count |
200 |
- ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim) # set x- and y-limits |
|
201 | 218 |
ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20') # chromosome backbone as simple rectangle |
202 |
- ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads, col=state)) # outliers |
|
219 |
+ ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads, col=state), size=5, shape=21) # outliers |
|
203 | 220 |
ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F) # do not drop levels if not present |
204 | 221 |
ggplt <- ggplt + empty_theme # no axes whatsoever |
205 | 222 |
ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2)) # chromosome names |
223 |
+ ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim) # set x- and y-limits |
|
206 | 224 |
ggplt <- ggplt + coord_flip() |
207 | 225 |
suppressWarnings( |
208 | 226 |
print(ggplt, vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col)) |
209 | 227 |
) |
210 | 228 |
|
211 | 229 |
} |
212 |
- d <- dev.off() |
|
230 |
+ if (!is.null(file)) { |
|
231 |
+ d <- dev.off() |
|
232 |
+ } |
|
233 |
+} |
|
234 |
+ |
|
235 |
+ |
|
236 |
+ |
|
237 |
+# ------------------------------------------------------------ |
|
238 |
+# Plot state categorization for all chromosomes |
|
239 |
+# ------------------------------------------------------------ |
|
240 |
+plot.chromosomes.bivariate <- function(model, file=NULL) { |
|
241 |
+ |
|
242 |
+ ## Convert to GRanges |
|
243 |
+ gr <- hmm2GRanges(model, reduce=F) |
|
244 |
+ grl <- split(gr, seqnames(gr)) |
|
245 |
+ |
|
246 |
+ ## Get some variables |
|
247 |
+ num.chroms <- length(levels(seqnames(gr))) |
|
248 |
+ maxseqlength <- max(seqlengths(gr)) |
|
249 |
+ custom.xlim <- model$distributions.univariate[[1]]['monosomy','mu'] * 10 |
|
250 |
+ |
|
251 |
+ ## Setup page |
|
252 |
+ library(grid) |
|
253 |
+ library(ggplot2) |
|
254 |
+ nrows <- 2 # rows for plotting chromosomes |
|
255 |
+ ncols <- ceiling(num.chroms/nrows) |
|
256 |
+ if (!is.null(file)) { |
|
257 |
+ pdf(file=paste0(file, '.pdf'), width=ncols*2.4, height=nrows*8.3) |
|
258 |
+ } |
|
259 |
+ grid.newpage() |
|
260 |
+ layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T) |
|
261 |
+ pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,21,21)))) |
|
262 |
+ # Main title |
|
263 |
+ grid.text(model$ID, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26)) |
|
264 |
+ |
|
265 |
+ ## Go through chromosomes and plot |
|
266 |
+ for (i1 in 1:num.chroms) { |
|
267 |
+ # Get the i,j matrix positions of the regions that contain this subplot |
|
268 |
+ matchidx <- as.data.frame(which(layout == i1+ncols, arr.ind = TRUE)) |
|
269 |
+ |
|
270 |
+ # Percentage of chromosome in state |
|
271 |
+ tstate <- table(mcols(grl[[i1]])$state) |
|
272 |
+ pstate.all <- tstate / sum(tstate) |
|
273 |
+ pstate <- round(pstate.all*100)[-1] # without 'nullsomy / unmapped' state |
|
274 |
+ pstring <- apply(pstate, 1, function(x) { paste0(": ", x, "%") }) |
|
275 |
+ pstring <- paste0(names(pstring), pstring) |
|
276 |
+ pstring <- paste(pstring[which.max(pstate)], collapse="\n") |
|
277 |
+ pstring2 <- round(pstate.all*100)[1] # only 'nullsomy / unmapped' |
|
278 |
+ pstring2 <- paste0(names(pstring2), ": ", pstring2, "%") |
|
279 |
+ |
|
280 |
+ ## Convert to data.frame for plotting and prepare the data.frame |
|
281 |
+ dfplot <- as.data.frame(grl[[i1]]) |
|
282 |
+ dfplot$reads.minus <- - dfplot$reads.minus |
|
283 |
+ # Set values to great for plotting to limit |
|
284 |
+ dfplot$reads.plus[dfplot$reads.plus>=custom.xlim] <- custom.xlim |
|
285 |
+ dfplot$reads.minus[dfplot$reads.minus<=-custom.xlim] <- -custom.xlim |
|
286 |
+ dfplot.points.plus <- dfplot[dfplot$reads.plus>=custom.xlim,] |
|
287 |
+ dfplot.points.plus$reads <- rep(custom.xlim, nrow(dfplot.points.plus)) |
|
288 |
+ dfplot.points.minus <- dfplot[dfplot$reads.minus<=-custom.xlim,] |
|
289 |
+ dfplot.points.minus$reads <- rep(-custom.xlim, nrow(dfplot.points.minus)) |
|
290 |
+ # No axes and labels |
|
291 |
+ empty_theme <- theme(axis.line=element_blank(), |
|
292 |
+ axis.text.x=element_blank(), |
|
293 |
+ axis.text.y=element_blank(), |
|
294 |
+ axis.ticks=element_blank(), |
|
295 |
+ axis.title.x=element_text(size=20), |
|
296 |
+ axis.title.y=element_blank(), |
|
297 |
+ legend.position="none", |
|
298 |
+ panel.background=element_blank(), |
|
299 |
+ panel.border=element_blank(), |
|
300 |
+ panel.grid.major=element_blank(), |
|
301 |
+ panel.grid.minor=element_blank(), |
|
302 |
+ plot.background=element_blank()) |
|
303 |
+ # Plot |
|
304 |
+ ggplt <- ggplot(dfplot, aes(x=start, y=reads.plus)) # data |
|
305 |
+ ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads.plus, col=state.separate.plus), size=0.2) # read count |
|
306 |
+ ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads.minus, col=state.separate.minus), size=0.2) # read count |
|
307 |
+ ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim, ymax=0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20') # chromosome backbone as simple rectangle |
|
308 |
+ ggplt <- ggplt + geom_point(data=dfplot.points.plus, mapping=aes(x=start, y=reads, col=state.separate.plus), size=5, shape=21) # outliers |
|
309 |
+ ggplt <- ggplt + geom_point(data=dfplot.points.minus, mapping=aes(x=start, y=reads, col=state.separate.minus), size=5, shape=21) # outliers |
|
310 |
+ ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F) # do not drop levels if not present |
|
311 |
+ ggplt <- ggplt + empty_theme # no axes whatsoever |
|
312 |
+ ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2)) # chromosome names |
|
313 |
+ ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-custom.xlim,custom.xlim) # set x- and y-limits |
|
314 |
+ ggplt <- ggplt + coord_flip() |
|
315 |
+ suppressWarnings( |
|
316 |
+ print(ggplt, vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col)) |
|
317 |
+ ) |
|
318 |
+ |
|
319 |
+ } |
|
320 |
+ if (!is.null(file)) { |
|
321 |
+ d <- dev.off() |
|
322 |
+ } |
|
213 | 323 |
} |
214 | 324 |
|
215 | 325 |
|
... | ... |
@@ -241,7 +351,7 @@ plot.genome.overview <- function(modellist, file='aneufinder_genome_overview', n |
241 | 351 |
library(ggplot2) |
242 | 352 |
nrows <- length(uni.hmm.grl) # rows for plotting genomes |
243 | 353 |
ncols <- 1 |
244 |
- png(file=paste0(file, '.png'), width=ncols*60, height=nrows*5, units='cm', res=150) |
|
354 |
+ pdf(file=paste0(file, '.pdf'), width=ncols*24, height=nrows*2) |
|
245 | 355 |
grid.newpage() |
246 | 356 |
layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T) |
247 | 357 |
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,rep(10,length(uni.hmm.grl)))))) |
... | ... |
@@ -7,7 +7,7 @@ |
7 | 7 |
// This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R. |
8 | 8 |
// =================================================================================================================================================== |
9 | 9 |
extern "C" { |
10 |
-void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double* lambda, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, int* iniproc, double* initial_size, double* initial_prob, double* initial_lambda, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
10 |
+void R_univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, double* lambda, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, int* iniproc, double* initial_size, double* initial_prob, double* initial_lambda, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
11 | 11 |
{ |
12 | 12 |
|
13 | 13 |
// Define logging level |
... | ... |
@@ -155,7 +155,7 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double |
155 | 155 |
{ |
156 | 156 |
posterior_per_t[iN] = hmm->get_posterior(iN, t); |
157 | 157 |
} |
158 |
- states[t] = argMax(posterior_per_t, *N); |
|
158 |
+ states[t] = state_labels[argMax(posterior_per_t, *N)]; |
|
159 | 159 |
} |
160 | 160 |
|
161 | 161 |
FILE_LOG(logDEBUG1) << "Return parameters"; |
... | ... |
@@ -210,3 +210,135 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double |
210 | 210 |
} |
211 | 211 |
} // extern C |
212 | 212 |
|
213 |
+ |
|
214 |
+// ===================================================================================================================================================== |
|
215 |
+// This function takes parameters from R, creates a multivariate HMM object, runs the Baum-Welch and returns the result to R. |
|
216 |
+// ===================================================================================================================================================== |
|
217 |
+extern "C" { |
|
218 |
+void R_multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error) |
|
219 |
+{ |
|
220 |
+ |
|
221 |
+ // Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"} |
|
222 |
+// FILE* pFile = fopen("chromStar.log", "w"); |
|
223 |
+// Output2FILE::Stream() = pFile; |
|
224 |
+// FILELog::ReportingLevel() = FILELog::FromString("ITERATION"); |
|
225 |
+// FILELog::ReportingLevel() = FILELog::FromString("DEBUG2"); |
|
226 |
+ FILELog::ReportingLevel() = FILELog::FromString("ERROR"); |
|
227 |
+ |
|
228 |
+ // Parallelization settings |
|
229 |
+// omp_set_num_threads(*num_threads); |
|
230 |
+ |
|
231 |
+ // Print some information |
|
232 |
+ FILE_LOG(logINFO) << "number of states = " << *N; |
|
233 |
+ Rprintf("number of states = %d\n", *N); |
|
234 |
+ FILE_LOG(logINFO) << "number of bins = " << *T; |
|
235 |
+ Rprintf("number of bins = %d\n", *T); |
|
236 |
+ if (*maxiter < 0) |
|
237 |
+ { |
|
238 |
+ FILE_LOG(logINFO) << "maximum number of iterations = none"; |
|
239 |
+ Rprintf("maximum number of iterations = none\n"); |
|
240 |
+ } else { |
|
241 |
+ FILE_LOG(logINFO) << "maximum number of iterations = " << *maxiter; |
|
242 |
+ Rprintf("maximum number of iterations = %d\n", *maxiter); |
|
243 |
+ } |
|
244 |
+ if (*maxtime < 0) |
|
245 |
+ { |
|
246 |
+ FILE_LOG(logINFO) << "maximum running time = none"; |
|
247 |
+ Rprintf("maximum running time = none\n"); |
|
248 |
+ } else { |
|
249 |
+ FILE_LOG(logINFO) << "maximum running time = " << *maxtime << " sec"; |
|
250 |
+ Rprintf("maximum running time = %d sec\n", *maxtime); |
|
251 |
+ } |
|
252 |
+ FILE_LOG(logINFO) << "epsilon = " << *eps; |
|
253 |
+ Rprintf("epsilon = %g\n", *eps); |
|
254 |
+ FILE_LOG(logINFO) << "number of modifications = " << *Nmod; |
|
255 |
+ Rprintf("number of modifications = %d\n", *Nmod); |
|
256 |
+ |
|
257 |
+ // Flush Rprintf statements to console |
|
258 |
+ R_FlushConsole(); |
|
259 |
+ |
|
260 |
+ // Recode the densities vector to matrix representation |
|
261 |
+// clock_t clocktime = clock(), dtime; |
|
262 |
+ double** multiD = allocDoubleMatrix(*N, *T); |
|
263 |
+ for (int iN=0; iN<*N; iN++) |
|
264 |
+ { |
|
265 |
+ for (int t=0; t<*T; t++) |
|
266 |
+ { |
|
267 |
+ multiD[iN][t] = D[iN*(*T)+t]; |
|
268 |
+ } |
|
269 |
+ } |
|
270 |
+// dtime = clock() - clocktime; |
|
271 |
+// FILE_LOG(logDEBUG1) << "recoding densities vector to matrix representation: " << dtime << " clicks"; |
|
272 |
+ |
|
273 |
+ // Create the HMM |
|
274 |
+ FILE_LOG(logDEBUG1) << "Creating the multivariate HMM"; |
|
275 |
+ ScaleHMM* hmm = new ScaleHMM(*T, *N, *Nmod, multiD); |
|
276 |
+ // Initialize the transition probabilities and proba |
|
277 |
+ hmm->initialize_transition_probs(initial_A, *use_initial_params); |
|
278 |
+ hmm->initialize_proba(initial_proba, *use_initial_params); |
|
279 |
+ |
|
280 |
+ // Print logproba and A |
|
281 |
+// for (int iN=0; iN<*N; iN++) |
|
282 |
+// { |
|
283 |
+// FILE_LOG(logDEBUG) << "proba["<<iN<<"] = " <<exp(hmm->logproba[iN]); |
|
284 |
+// for (int jN=0; jN<*N; jN++) |
|
285 |
+// { |
|
286 |
+// FILE_LOG(logDEBUG) << "A["<<iN<<"]["<<jN<<"] = " << hmm->A[iN][jN]; |
|
287 |
+// } |
|
288 |
+// } |
|
289 |
+ |
|
290 |
+ // Estimate the parameters |
|
291 |
+ FILE_LOG(logDEBUG1) << "Starting Baum-Welch estimation"; |
|
292 |
+ try |
|
293 |
+ { |
|
294 |
+ hmm->baumWelch(maxiter, maxtime, eps); |
|
295 |
+ } |
|
296 |
+ catch (std::exception& e) |
|
297 |
+ { |
|
298 |
+ FILE_LOG(logERROR) << "Error in Baum-Welch: " << e.what(); |
|
299 |
+ Rprintf("Error in Baum-Welch: %s\n", e.what()); |
|
300 |
+ if (e.what()=="nan detected") { *error = 1; } |
|
301 |
+ else { *error = 2; } |
|
302 |
+ } |
|
303 |
+ FILE_LOG(logDEBUG1) << "Finished with Baum-Welch estimation"; |
|
304 |
+ |
|
305 |
+// // Compute the posteriors and save results directly to the R pointer |
|
306 |
+// FILE_LOG(logDEBUG1) << "Recode posteriors into column representation"; |
|
307 |
+// for (int iN=0; iN<*N; iN++) |
|
308 |
+// { |
|
309 |
+// for (int t=0; t<*T; t++) |
|
310 |
+// { |
|
311 |
+// posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t); |
|
312 |
+// } |
|
313 |
+// } |
|
314 |
+ |
|
315 |
+ // Compute the states from posteriors |
|
316 |
+ FILE_LOG(logDEBUG1) << "Computing states from posteriors"; |
|
317 |
+ double posterior_per_t [*N]; |
|
318 |
+ for (int t=0; t<*T; t++) |
|
319 |
+ { |
|
320 |
+ for (int iN=0; iN<*N; iN++) |
|
321 |
+ { |
|
322 |
+ posterior_per_t[iN] = hmm->get_posterior(iN, t); |
|
323 |
+ } |
|
324 |
+ states[t] = comb_states[argMax(posterior_per_t, *N)]; |
|
325 |
+ } |
|
326 |
+ |
|
327 |
+ FILE_LOG(logDEBUG1) << "Return parameters"; |
|
328 |
+ // also return the estimated transition matrix and the initial probs |
|
329 |
+ for (int i=0; i<*N; i++) |
|
330 |
+ { |
|
331 |
+ proba[i] = hmm->get_proba(i); |
|
332 |
+ for (int j=0; j<*N; j++) |
|
333 |
+ { |
|
334 |
+ A[i * (*N) + j] = hmm->get_A(j,i); |
|
335 |
+ } |
|
336 |
+ } |
|
337 |
+ *loglik = hmm->get_logP(); |
|
338 |
+ |
|
339 |
+ FILE_LOG(logDEBUG1) << "Deleting the hmm"; |
|
340 |
+ delete hmm; |
|
341 |
+ freeDoubleMatrix(multiD, *N); |
|
342 |
+} |
|
343 |
+} // extern C |
|
344 |
+ |
... | ... |
@@ -1382,4 +1382,40 @@ double Geometric::get_prob() |
1382 | 1382 |
} |
1383 | 1383 |
|
1384 | 1384 |
|
1385 |
+// ============================================================ |
|
1386 |
+// Multivariate Copula Approximation |
|
1387 |
+// ============================================================ |
|
1388 |
+ |
|
1389 |
+// Constructor and Destructor --------------------------------- |
|
1390 |
+MVCopulaApproximation::MVCopulaApproximation(int** multiobservations, int T, std::vector<Density*> marginals, double* cor_matrix_inv, double cor_matrix_determinant) |
|
1391 |
+{ |
|
1392 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
1393 |
+ this->multi_obs = multiobservations; |
|
1394 |
+ this->T = T; |
|
1395 |
+ // these are the marginal distributions (we need their CDF function) |
|
1396 |
+ this->marginals = marginals; |
|
1397 |
+ this->Nmod = this->marginals.size(); |
|
1398 |
+ this->cor_matrix_inv = cor_matrix_inv; |
|
1399 |
+ this->cor_matrix_determinant = cor_matrix_determinant; |
|
1400 |
+} |
|
1401 |
+ |
|
1402 |
+MVCopulaApproximation::~MVCopulaApproximation() |
|
1403 |
+{ |
|
1404 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
1405 |
+ for (int imod; imod<this->Nmod; imod++) |
|
1406 |
+ { |
|
1407 |
+ delete this->marginals[imod]; |
|
1408 |
+ } |
|
1409 |
+} |
|
1410 |
+ |
|
1411 |
+// Methods ---------------------------------------------------- |
|
1412 |
+ |
|
1413 |
+// Getter and Setter ------------------------------------------ |
|
1414 |
+DensityName MVCopulaApproximation::get_name() |
|
1415 |
+{ |
|
1416 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
1417 |
+ return(OTHER); |
|
1418 |
+} |
|
1419 |
+ |
|
1420 |
+ |
|
1385 | 1421 |
|
... | ... |
@@ -1,10 +1,12 @@ |
1 | 1 |
#include <Rmath.h> // dnorm(), dnbinom() and digamma() etc. |
2 |
+#include <vector> // storing density functions in MVCopula |
|
2 | 3 |
#include "utility.h" // FILE_LOG(), intMax() |
3 | 4 |
|
4 | 5 |
#ifndef DENSITIES_H |
5 | 6 |
#define DENSITIES_H |
6 | 7 |
|
7 |
-enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL, GEOMETRIC, POISSON, BINOMIAL}; |
|
8 |
+enum whichvariate {UNIVARIATE, MULTIVARIATE}; |
|
9 |
+enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL, GEOMETRIC, POISSON, BINOMIAL, OTHER}; |
|
8 | 10 |
|
9 | 11 |
class Density |
10 | 12 |
{ |
... | ... |
@@ -230,4 +232,26 @@ class Geometric : public Density |
230 | 232 |
}; |
231 | 233 |
|
232 | 234 |
|
235 |
+class MVCopulaApproximation : public Density { |
|
236 |
+ public: |
|
237 |
+ // Constructor and Destructor |
|
238 |
+ MVCopulaApproximation(int** multiobservations, int T, std::vector<Density*> marginals, double* cor_matrix_inv, double cor_matrix_determinant); |
|
239 |
+ ~MVCopulaApproximation(); |
|
240 |
+ |
|
241 |
+ // Methods |
|
242 |
+ |
|
243 |
+ // Getters and Setters |
|
244 |
+ DensityName get_name(); |
|
245 |
+ |
|
246 |
+ private: |
|
247 |
+ // Member variables |
|
248 |
+ int Nmod; ///< number of modifications |
|
249 |
+ int** multi_obs; ///< matrix [Nmod x T] of observations |
|
250 |
+ int T; ///< length of observation vector |
|
251 |
+ std::vector<Density*> marginals; ///< vector [Nmod] of marginal distributions |
|
252 |
+ double* cor_matrix_inv; ///< vector with elements of the inverse of the correlation matrix |
|
253 |
+ double cor_matrix_determinant; ///< determinant of the correlation matrix |
|
254 |
+}; |
|
255 |
+ |
|
256 |
+ |
|
233 | 257 |
#endif |
... | ... |
@@ -10,6 +10,8 @@ |
10 | 10 |
ScaleHMM::ScaleHMM(int T, int N) |
11 | 11 |
{ |
12 | 12 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
13 |
+ FILE_LOG(logDEBUG2) << "Initializing univariate ScaleHMM"; |
|
14 |
+ this->xvariate = UNIVARIATE; |
|
13 | 15 |
this->T = T; |
14 | 16 |
this->N = N; |
15 | 17 |
this->A = allocDoubleMatrix(N, N); |
... | ... |
@@ -30,6 +32,29 @@ ScaleHMM::ScaleHMM(int T, int N) |
30 | 32 |
|
31 | 33 |
} |
32 | 34 |
|
35 |
+ |
|
36 |
+ScaleHMM::ScaleHMM(int T, int N, int Nmod, double** densities) |
|
37 |
+{ |
|
38 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
39 |
+ FILE_LOG(logDEBUG2) << "Initializing multivariate ScaleHMM"; |
|
40 |
+ this->xvariate = MULTIVARIATE; |
|
41 |
+ this->T = T; |
|
42 |
+ this->N = N; |
|
43 |
+ this->A = allocDoubleMatrix(N, N); |
|
44 |
+ this->scalefactoralpha = (double*) calloc(T, sizeof(double)); |
|
45 |
+ this->scalealpha = allocDoubleMatrix(T, N); |
|
46 |
+ this->scalebeta = allocDoubleMatrix(T, N); |
|
47 |
+ this->densities = densities; |
|
48 |
+ this->proba = (double*) calloc(N, sizeof(double)); |
|
49 |
+ this->gamma = allocDoubleMatrix(N, T); |
|
50 |
+ this->sumgamma = (double*) calloc(N, sizeof(double)); |
|
51 |
+ this->sumxi = allocDoubleMatrix(N, N); |
|
52 |
+ this->logP = -INFINITY; |
|
53 |
+ this->dlogP = INFINITY; |
|
54 |
+ this->Nmod = Nmod; |
|
55 |
+ |
|
56 |
+} |
|
57 |
+ |
|
33 | 58 |
ScaleHMM::~ScaleHMM() |
34 | 59 |
{ |
35 | 60 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
... | ... |
@@ -37,16 +62,19 @@ ScaleHMM::~ScaleHMM() |
37 | 62 |
free(this->scalefactoralpha); |
38 | 63 |
freeDoubleMatrix(this->scalealpha, this->T); |
39 | 64 |
freeDoubleMatrix(this->scalebeta, this->T); |
40 |
- freeDoubleMatrix(this->densities, this->N); |
|
41 | 65 |
// freeDoubleMatrix(this->tdensities, this->T); |
42 | 66 |
freeDoubleMatrix(this->gamma, this->N); |
43 | 67 |
freeDoubleMatrix(this->sumxi, this->N); |
44 | 68 |
free(this->proba); |
45 | 69 |
free(this->sumgamma); |
46 |
- for (int iN=0; iN<this->N; iN++) |
|
70 |
+ if (this->xvariate == UNIVARIATE) |
|
47 | 71 |
{ |
48 |
- FILE_LOG(logDEBUG1) << "Deleting density functions"; |
|
49 |
- delete this->densityFunctions[iN]; |
|
72 |
+ freeDoubleMatrix(this->densities, this->N); |
|
73 |
+ for (int iN=0; iN<this->N; iN++) |
|
74 |
+ { |
|
75 |
+ FILE_LOG(logDEBUG1) << "Deleting density functions"; |
|
76 |
+ delete this->densityFunctions[iN]; |
|
77 |
+ } |
|
50 | 78 |
} |
51 | 79 |
} |
52 | 80 |
|
... | ... |
@@ -79,7 +107,7 @@ void ScaleHMM::initialize_transition_probs(double* initial_A, bool use_initial_p |
79 | 107 |
else |
80 | 108 |
this->A[iN][jN] = other; |
81 | 109 |
// Save value to initial A |
82 |
- initial_A[iN*this->N + jN] = this->A[jN][iN]; |
|
110 |
+ initial_A[jN*this->N + iN] = this->A[iN][jN]; |
|
83 | 111 |
} |
84 | 112 |
} |
85 | 113 |
} |
... | ... |
@@ -123,10 +151,17 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
123 | 151 |
this->baumWelchStartTime_sec = time(NULL); |
124 | 152 |
|
125 | 153 |
// Print some initial information |
126 |
- FILE_LOG(logINFO) << ""; |
|
127 |
- FILE_LOG(logINFO) << "INITIAL PARAMETERS"; |
|
128 |
-// this->print_uni_params(); |
|
129 |
- this->print_uni_iteration(0); |
|
154 |
+ if (this->xvariate == UNIVARIATE) |
|
155 |
+ { |
|
156 |
+ FILE_LOG(logINFO) << ""; |
|
157 |
+ FILE_LOG(logINFO) << "INITIAL PARAMETERS"; |
|
158 |
+// this->print_uni_params(); |
|
159 |
+ this->print_uni_iteration(0); |
|
160 |
+ } |
|
161 |
+ else if (this->xvariate == MULTIVARIATE) |
|
162 |
+ { |
|
163 |
+ this->print_multi_iteration(0); |
|
164 |
+ } |
|
130 | 165 |
|
131 | 166 |
R_CheckUserInterrupt(); |
132 | 167 |
|
... | ... |
@@ -137,9 +172,12 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
137 | 172 |
|
138 | 173 |
iteration++; |
139 | 174 |
|
140 |
- FILE_LOG(logDEBUG1) << "Calling calc_densities() from baumWelch()"; |
|
141 |
- try { this->calc_densities(); } catch(...) { throw; } |
|
142 |
- R_CheckUserInterrupt(); |
|
175 |
+ if (this->xvariate == UNIVARIATE) |
|
176 |
+ { |
|
177 |
+ FILE_LOG(logDEBUG1) << "Calling calc_densities() from baumWelch()"; |
|
178 |
+ try { this->calc_densities(); } catch(...) { throw; } |
|
179 |
+ R_CheckUserInterrupt(); |
|
180 |
+ } |
|
143 | 181 |
|
144 | 182 |
FILE_LOG(logDEBUG1) << "Calling forward() from baumWelch()"; |
145 | 183 |
try { this->forward(); } catch(...) { throw; } |
... | ... |
@@ -167,26 +205,36 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
167 | 205 |
this->calc_sumgamma(); |
168 | 206 |
R_CheckUserInterrupt(); |
169 | 207 |
|
170 |
-// clock_t clocktime = clock(), dtime; |
|
171 |
- // difference in posterior |
|
172 |
- FILE_LOG(logDEBUG1) << "Calculating differences in posterior in baumWelch()"; |
|
173 |
- double postsum = 0.0; |
|
174 |
- for (int t=0; t<this->T; t++) |
|
208 |
+ if (this->xvariate == UNIVARIATE) |
|
175 | 209 |
{ |
176 |
- for (int iN=0; iN<this->N; iN++) |
|
210 |
+// clock_t clocktime = clock(), dtime; |
|
211 |
+ // difference in posterior |
|
212 |
+ FILE_LOG(logDEBUG1) << "Calculating differences in posterior in baumWelch()"; |
|
213 |
+ double postsum = 0.0; |
|
214 |
+ for (int t=0; t<this->T; t++) |
|
177 | 215 |
{ |
178 |
- postsum += fabs(this->gamma[iN][t] - gammaold[iN][t]); |
|
179 |
- gammaold[iN][t] = this->gamma[iN][t]; |
|
216 |
+ for (int iN=0; iN<this->N; iN++) |
|
217 |
+ { |
|
218 |
+ postsum += fabs(this->gamma[iN][t] - gammaold[iN][t]); |
|
219 |
+ gammaold[iN][t] = this->gamma[iN][t]; |
|
220 |
+ } |
|
180 | 221 |
} |
222 |
+ this->sumdiff_posterior = postsum; |
|
223 |
+// dtime = clock() - clocktime; |
|
224 |
+// FILE_LOG(logDEBUG) << "differences in posterior: " << dtime << " clicks"; |
|
181 | 225 |
} |
182 |
- this->sumdiff_posterior = postsum; |
|
183 |
-// dtime = clock() - clocktime; |
|
184 |
-// FILE_LOG(logDEBUG) << "differences in posterior: " << dtime << " clicks"; |
|
185 | 226 |
|
186 | 227 |
R_CheckUserInterrupt(); |
187 | 228 |
|
188 | 229 |
// Print information about current iteration |
189 |
- this->print_uni_iteration(iteration); |
|
230 |
+ if (this->xvariate == UNIVARIATE) |
|
231 |
+ { |
|
232 |
+ this->print_uni_iteration(iteration); |
|
233 |
+ } |
|
234 |
+ else if (this->xvariate == MULTIVARIATE) |
|
235 |
+ { |
|
236 |
+ this->print_multi_iteration(iteration); |
|
237 |
+ } |
|
190 | 238 |
|
191 | 239 |
// Check convergence |
192 | 240 |
if(this->dlogP < *eps) //it has converged |
... | ... |
@@ -254,81 +302,51 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
254 | 302 |
} |
255 | 303 |
} |
256 | 304 |
|
257 |
- clock_t clocktime = clock(), dtime; |
|
258 |
- |
|
259 |
-// // Update all distributions independantly |
|
260 |
-// for (int iN=0; iN<this->N; iN++) |
|
261 |
-// { |
|
262 |
-// this->densityFunctions[iN]->update(this->gamma[iN]); |
|
263 |
-// } |
|
264 |
- |
|
265 |
- // Update distribution of state 'null-mixed' and 'monosomic', set others as multiples of 'monosomic' |
|
266 |
- // This loop assumes that the negative binomial states come last and are consecutive |
|
267 |
- int xsomy = 1; |
|
268 |
- for (int iN=0; iN<this->N; iN++) |
|
305 |
+ if (this->xvariate == UNIVARIATE) |
|
269 | 306 |
{ |
270 |
- if (this->densityFunctions[iN]->get_name() == ZERO_INFLATION) {} |
|
271 |
- if (this->densityFunctions[iN]->get_name() == GEOMETRIC) |
|
272 |
- { |
|
273 |
- this->densityFunctions[iN]->update(this->gamma[iN]); |
|
274 |
- } |
|
275 |
- if (this->densityFunctions[iN]->get_name() == NEGATIVE_BINOMIAL) |
|
307 |
+// clock_t clocktime = clock(), dtime; |
|
308 |
+// |
|
309 |
+// // Update all distributions independantly |
|
310 |
+// for (int iN=0; iN<this->N; iN++) |
|
311 |
+// { |
|
312 |
+// this->densityFunctions[iN]->update(this->gamma[iN]); |
|
313 |
+// } |
|
314 |
+ |
|
315 |
+ // Update distribution of state 'null-mixed' and 'monosomic', set others as multiples of 'monosomic' |
|
316 |
+ // This loop assumes that the negative binomial states come last and are consecutive |
|
317 |
+ int xsomy = 1; |
|
318 |
+ for (int iN=0; iN<this->N; iN++) |
|
276 | 319 |
{ |
277 |
- if (xsomy==1) |
|
320 |
+ if (this->densityFunctions[iN]->get_name() == ZERO_INFLATION) {} |
|
321 |
+ if (this->densityFunctions[iN]->get_name() == GEOMETRIC) |
|
322 |
+ { |
|
323 |
+ this->densityFunctions[iN]->update(this->gamma[iN]); |
|
324 |
+ } |
|
325 |
+ if (this->densityFunctions[iN]->get_name() == NEGATIVE_BINOMIAL) |
|
278 | 326 |
{ |
279 |
- FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance(); |
|
280 |
- this->densityFunctions[iN]->update_constrained(this->gamma, iN, this->N); |
|
281 |
- double mean1 = this->densityFunctions[iN]->get_mean(); |
|
282 |
- double variance1 = this->densityFunctions[iN]->get_variance(); |
|
283 |
- FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance(); |
|
284 |
- // Set others as multiples |
|
285 |
- for (int jN=iN+1; jN<this->N; jN++) |
|
327 |
+ if (xsomy==1) |
|
286 | 328 |
{ |
287 |
- this->densityFunctions[jN]->set_mean(mean1 * (jN-iN+1)); |
|
288 |
- this->densityFunctions[jN]->set_variance(variance1 * (jN-iN+1)); |
|
289 |
- FILE_LOG(logDEBUG1) << "mean(state="<<jN<<") = " << this->densityFunctions[jN]->get_mean() << ", var(state="<<jN<<") = " << this->densityFunctions[jN]->get_variance(); |
|
329 |
+ FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance(); |
|
330 |
+ this->densityFunctions[iN]->update_constrained(this->gamma, iN, this->N); |
|
331 |
+ double mean1 = this->densityFunctions[iN]->get_mean(); |
|
332 |
+ double variance1 = this->densityFunctions[iN]->get_variance(); |
|
333 |
+ FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance(); |
|
334 |
+ // Set others as multiples |
|
335 |
+ for (int jN=iN+1; jN<this->N; jN++) |
|
336 |
+ { |
|
337 |
+ this->densityFunctions[jN]->set_mean(mean1 * (jN-iN+1)); |
|
338 |
+ this->densityFunctions[jN]->set_variance(variance1 * (jN-iN+1)); |
|
339 |
+ FILE_LOG(logDEBUG1) << "mean(state="<<jN<<") = " << this->densityFunctions[jN]->get_mean() << ", var(state="<<jN<<") = " << this->densityFunctions[jN]->get_variance(); |
|
340 |
+ } |
|
341 |
+ break; |
|
290 | 342 |
} |
291 |
- break; |
|
343 |
+ xsomy++; |
|
292 | 344 |
} |
293 |
- xsomy++; |
|
294 | 345 |
} |
346 |
+// dtime = clock() - clocktime; |
|
347 |
+// FILE_LOG(logDEBUG) << "updating distributions: " << dtime << " clicks"; |
|
348 |
+ R_CheckUserInterrupt(); |
|
295 | 349 |
} |
296 |
- |
|
297 |
-// // Update distribution of state 'null-mixed' and 'disomic', set others as multiples of 'disomic'/2 |
|
298 |
-// // This loop assumes that the negative binomial states come last and are consecutive |
|
299 |
-// int xsomy = 1; |
|
300 |
-// for (int iN=0; iN<this->N; iN++) |
|
301 |
-// { |
|
302 |
-// if (this->densityFunctions[iN]->get_name() == ZERO_INFLATION) {} |
|
303 |
-// if (this->densityFunctions[iN]->get_name() == GEOMETRIC) |
|
304 |
-// { |
|
305 |
-// this->densityFunctions[iN]->update(this->gamma[iN]); |
|
306 |
-// } |
|
307 |
-// if (this->densityFunctions[iN]->get_name() == NEGATIVE_BINOMIAL) |
|
308 |
-// { |
|
309 |
-// if (xsomy==2) |
|
310 |
-// { |
|
311 |
-// FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance(); |
|
312 |
-// this->densityFunctions[iN]->update(this->gamma[iN]); |
|
313 |
-// double mean1 = this->densityFunctions[iN]->get_mean() / xsomy; |
|
314 |
-// double variance1 = this->densityFunctions[iN]->get_variance() / xsomy; |
|
315 |
-// FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance(); |
|
316 |
-// // Set others as multiples |
|
317 |
-// for (int jN=2; jN<this->N; jN++) |
|
318 |
-// { |
|
319 |
-// this->densityFunctions[jN]->set_mean(mean1 * (jN-iN+2)); |
|
320 |
-// this->densityFunctions[jN]->set_variance(variance1 * (jN-iN+2)); |
|
321 |
-// FILE_LOG(logDEBUG1) << "mean(state="<<jN<<") = " << this->densityFunctions[jN]->get_mean() << ", var(state="<<jN<<") = " << this->densityFunctions[jN]->get_variance(); |
|
322 |
-// } |
|
323 |
-// break; |
|
324 |
-// } |
|
325 |
-// xsomy++; |
|
326 |
-// } |
|
327 |
-// } |
|
328 |
- |
|
329 |
- dtime = clock() - clocktime; |
|
330 |
- FILE_LOG(logDEBUG) << "updating distributions: " << dtime << " clicks"; |
|
331 |
- R_CheckUserInterrupt(); |
|
332 | 350 |
|
333 | 351 |
} /* main loop end */ |
334 | 352 |
|
... | ... |
@@ -338,7 +356,7 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
338 | 356 |
FILE_LOG(logINFO) << "FINAL ESTIMATION RESULTS"; |
339 | 357 |
// this->print_uni_params(); |
340 | 358 |
|
341 |
- /* free memory */ |
|
359 |
+ // free memory |
|
342 | 360 |
freeDoubleMatrix(gammaold, this->N); |
343 | 361 |
|
344 | 362 |
// Return values |
... | ... |
@@ -816,6 +834,37 @@ void ScaleHMM::print_uni_iteration(int iteration) |
816 | 834 |
R_FlushConsole(); |
817 | 835 |
} |
818 | 836 |
|
837 |
+void ScaleHMM::print_multi_iteration(int iteration) |
|
838 |
+{ |
|
839 |
+ FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
|
840 |
+ this->baumWelchTime_real = difftime(time(NULL),this->baumWelchStartTime_sec); |
|
841 |
+ int bs = 86; |
|
842 |
+ char buffer [bs]; |
|
843 |
+ if (iteration % 20 == 0) |
|
844 |
+ { |
|
845 |
+ snprintf(buffer, bs, "%10s%20s%20s%15s", "Iteration", "log(P)", "dlog(P)", "Time in sec"); |
|
846 |
+ FILE_LOG(logITERATION) << buffer; |
|
847 |
+ Rprintf("%s\n", buffer); |
|
848 |
+ } |
|
849 |
+ if (iteration == 0) |
|
850 |
+ { |
|
851 |
+ snprintf(buffer, bs, "%10s%20s%20s%*d", "0", "-inf", "-", 15, this->baumWelchTime_real); |
|
852 |
+ } |
|
853 |
+ else if (iteration == 1) |
|
854 |
+ { |
|
855 |
+ snprintf(buffer, bs, "%*d%*f%20s%*d", 10, iteration, 20, this->logP, "inf", 15, this->baumWelchTime_real); |
|
856 |
+ } |
|
857 |
+ else |
|
858 |
+ { |
|
859 |
+ snprintf(buffer, bs, "%*d%*f%*f%*d", 10, iteration, 20, this->logP, 20, this->dlogP, 15, this->baumWelchTime_real); |
|
860 |
+ } |
|
861 |
+ FILE_LOG(logITERATION) << buffer; |
|
862 |
+ Rprintf("%s\n", buffer); |
|
863 |
+ |
|
864 |
+ // Flush Rprintf statements to R console |
|
865 |
+ R_FlushConsole(); |
|
866 |
+} |
|
867 |
+ |
|
819 | 868 |
void ScaleHMM::print_uni_params() |
820 | 869 |
{ |
821 | 870 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
... | ... |
@@ -13,6 +13,7 @@ class ScaleHMM { |
13 | 13 |
public: |
14 | 14 |
// Constructor and Destructor |
15 | 15 |
ScaleHMM(int T, int N); |
16 |
+ ScaleHMM(int T, int N, int Nmod, double** densities); |
|
16 | 17 |
~ScaleHMM(); |
17 | 18 |
|
18 | 19 |
// Member variables |
... | ... |
@@ -38,6 +39,7 @@ class ScaleHMM { |
38 | 39 |
int T; ///< length of observed sequence |
39 | 40 |
int* obs; ///< vector [T] of observations |
40 | 41 |
int N; ///< number of states |
42 |
+ int Nmod; ///< number of modifications / marks |
|
41 | 43 |
int cutoff; ///< a cutoff for observations |
42 | 44 |
double* sumgamma; ///< vector[N] of sum of posteriors (gamma values) |
43 | 45 |
double** sumxi; ///< matrix[N x N] of xi values |
... | ... |
@@ -56,6 +58,7 @@ class ScaleHMM { |
56 | 58 |
int sumdiff_state_last; ///< sum of the difference in the state 1 assignments from one iteration to the next |
57 | 59 |
double sumdiff_posterior; ///< sum of the difference in posterior (gamma) values from one iteration to the next |
58 | 60 |
// bool use_tdens; ///< switch for using the tdensities in the calculations |
61 |
+ whichvariate xvariate; ///< enum which stores if UNIVARIATE or MULTIVARIATE |
|
59 | 62 |
|
60 | 63 |
// Methods |
61 | 64 |
void forward(); ///< calculate forward variables (alpha) |
... | ... |
@@ -65,6 +68,7 @@ class ScaleHMM { |
65 | 68 |
void calc_loglikelihood(); |
66 | 69 |
void calc_densities(); |
67 | 70 |
void print_uni_iteration(int iteration); |
71 |
+ void print_multi_iteration(int iteration); |
|
68 | 72 |
void print_uni_params(); |
69 | 73 |
}; |
70 | 74 |
|