... | ... |
@@ -14,7 +14,6 @@ plot.distribution, |
14 | 14 |
univariate2bed, |
15 | 15 |
binned2wiggle, |
16 | 16 |
multivariate2bed, |
17 |
-binned2GRanges, |
|
18 | 17 |
hmm2GRanges, |
19 | 18 |
|
20 | 19 |
mixedsort, |
... | ... |
@@ -22,6 +21,6 @@ mixedorder |
22 | 21 |
) |
23 | 22 |
|
24 | 23 |
# Import all packages listed as Imports or Depends |
25 |
-# import( |
|
26 |
-# GenomicRanges, IRanges |
|
27 |
-# ) |
|
24 |
+import( |
|
25 |
+ GenomicRanges, IRanges |
|
26 |
+) |
... | ... |
@@ -20,23 +20,25 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
20 | 20 |
dir.create(outputfolder) |
21 | 21 |
} |
22 | 22 |
|
23 |
- ## Read in the data |
|
23 |
+ ### Read in the data |
|
24 |
+ ## BED (0-based) |
|
24 | 25 |
if (format == "bed") { |
25 | 26 |
cat("Reading file",basename(file),"...") |
26 |
- # File with chromosome lengths |
|
27 |
+ # File with chromosome lengths (1-based) |
|
27 | 28 |
chrom.lengths.df <- read.table(chrom.length.file) |
28 | 29 |
chrom.lengths <- chrom.lengths.df[,2] |
29 | 30 |
names(chrom.lengths) <- chrom.lengths.df[,1] |
30 |
- # File with reads, determine classes first for faster import |
|
31 |
+ # File with reads, determine classes first for faster import (0-based) |
|
31 | 32 |
tab5rows <- read.table(file, nrows=5) |
32 | 33 |
classes.in.bed <- sapply(tab5rows, class) |
33 | 34 |
classes <- rep("NULL",length(classes.in.bed)) |
34 | 35 |
classes[1:3] <- classes.in.bed[1:3] |
35 | 36 |
data <- read.table(file, colClasses=classes) |
36 | 37 |
# Convert to GRanges object |
37 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2], end=data[,3]), strand=Rle(strand("*"), nrow(data))) |
|
38 |
+ data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]+1), strand=Rle(strand("*"), nrow(data))) # +1 to match coordinate systems |
|
38 | 39 |
seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
39 | 40 |
chroms.in.data <- seqlevels(data) |
41 |
+ ## BAM (1-based) |
|
40 | 42 |
} else if (format == "bam") { |
41 | 43 |
library(Rsamtools) # TODO: put this in Imports in finished package |
42 | 44 |
library(GenomicAlignments) # TODO: put this in Imports in finished package |
... | ... |
@@ -44,9 +46,10 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
44 | 46 |
file.header <- Rsamtools::scanBamHeader(file)[[1]] |
45 | 47 |
chrom.lengths <- file.header$targets |
46 | 48 |
chroms.in.data <- names(chrom.lengths) |
49 |
+ ## BEDGraph (0-based) |
|
47 | 50 |
} else if (format == "bedGraph") { |
48 | 51 |
cat("Reading file",basename(file),"...") |
49 |
- # File with chromosome lengths |
|
52 |
+ # File with chromosome lengths (1-based) |
|
50 | 53 |
chrom.lengths.df <- read.table(chrom.length.file) |
51 | 54 |
chrom.lengths <- chrom.lengths.df[,2] |
52 | 55 |
names(chrom.lengths) <- chrom.lengths.df[,1] |
... | ... |
@@ -57,7 +60,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
57 | 60 |
classes[1:4] <- classes.in.bed[1:4] |
58 | 61 |
data <- read.table(file, colClasses=classes) |
59 | 62 |
# Convert to GRanges object |
60 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2], end=data[,3]), strand=Rle(strand("*"), nrow(data)), signal=data[,4]) |
|
63 |
+ data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]+1), strand=Rle(strand("*"), nrow(data)), signal=data[,4]) # +1 to match coordinate systems |
|
61 | 64 |
seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
62 | 65 |
chroms.in.data <- seqlevels(data) |
63 | 66 |
} |
... | ... |
@@ -69,7 +72,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
69 | 72 |
cat("Binning into binsize",binsize,"\n") |
70 | 73 |
|
71 | 74 |
### Iterate over all chromosomes |
72 |
- binned.data.allchroms <- NULL |
|
75 |
+ binned.data <- GenomicRanges::GRanges() |
|
73 | 76 |
if (is.null(chromosomes)) { |
74 | 77 |
chromosomes <- chroms.in.data |
75 | 78 |
} |
... | ... |
@@ -86,38 +89,43 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
86 | 89 |
## Check last incomplete bin |
87 | 90 |
incomplete.bin <- chrom.lengths[chromosome] %% binsize > 0 |
88 | 91 |
if (incomplete.bin) { |
89 |
- numbins <- ceiling(chrom.lengths[chromosome]/binsize) |
|
92 |
+ numbins <- floor(chrom.lengths[chromosome]/binsize) # floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end |
|
90 | 93 |
} else { |
91 | 94 |
numbins <- chrom.lengths[chromosome]/binsize |
92 | 95 |
} |
96 |
+ if (numbins == 0) { |
|
97 |
+ warning("chromosome ",chromosome," is smaller than binsize. Skipped.") |
|
98 |
+ next |
|
99 |
+ } |
|
93 | 100 |
## Initialize vectors |
94 | 101 |
cat("initialize vectors...\r") |
95 | 102 |
chroms <- rep(chromosome,numbins) |
96 | 103 |
reads <- rep(0,numbins) |
97 |
- start <- seq(from=0, by=binsize, length.out=numbins) |
|
98 |
- end <- seq(from=binsize-1, by=binsize, length.out=numbins) |
|
99 |
- end[length(end)] <- chrom.lengths[chromosome] |
|
104 |
+ start <- seq(from=1, by=binsize, length.out=numbins) |
|
105 |
+ end <- seq(from=binsize+1, by=binsize, length.out=numbins) |
|
106 |
+# end[length(end)] <- chrom.lengths[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired |
|
100 | 107 |
|
101 | 108 |
## Create binned chromosome as GRanges object |
102 | 109 |
cat("creating GRanges container... \r") |
103 |
- ichrom <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbins), |
|
110 |
+ i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbins), |
|
104 | 111 |
ranges = IRanges(start=start, end=end), |
105 | 112 |
strand = Rle(strand("*"), numbins) |
106 | 113 |
) |
114 |
+ seqlengths(i.binned.data) <- chrom.lengths[chromosome] |
|
107 | 115 |
|
108 | 116 |
if (format=="bam") { |
109 | 117 |
cat("reading reads from file... \r") |
110 |
- data <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(ichrom))) |
|
118 |
+ data <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(i.binned.data))) |
|
111 | 119 |
} |
112 | 120 |
|
113 | 121 |
## Count overlaps |
114 | 122 |
cat("counting overlaps... \r") |
115 | 123 |
if (format=="bam" | format=="bed") { |
116 |
- reads <- GenomicRanges::countOverlaps(ichrom, data[seqnames(data)==chromosome]) |
|
124 |
+ reads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome]) |
|
117 | 125 |
} else if (format=="bedGraph") { |
118 | 126 |
# Take the max value from all regions that fall into / overlap a given bin as read count |
119 |
- midx <- as.matrix(findOverlaps(ichrom, data[seqnames(data)==chromosome])) |
|
120 |
- reads <- rep(0,length(ichrom)) |
|
127 |
+ midx <- as.matrix(findOverlaps(i.binned.data, data[seqnames(data)==chromosome])) |
|
128 |
+ reads <- rep(0,length(i.binned.data)) |
|
121 | 129 |
signal <- mcols(data)$signal |
122 | 130 |
rle <- rle(midx[,1]) |
123 | 131 |
read.idx <- rle$values |
... | ... |
@@ -132,10 +140,10 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
132 | 140 |
|
133 | 141 |
## Concatenate |
134 | 142 |
cat("concatenate... \r") |
135 |
- binned.data <- data.frame(chroms,start,end,reads) |
|
136 |
- names(binned.data) <- binned.data.names |
|
143 |
+ mcols(i.binned.data)$reads <- reads |
|
137 | 144 |
|
138 | 145 |
if (separate.chroms==TRUE) { |
146 |
+ binned.data <- i.binned.data |
|
139 | 147 |
if (save.as.RData==TRUE) { |
140 | 148 |
## Print to file |
141 | 149 |
filename <- paste(basename(file),"_binsize_",binsize,"_",chromosome,".RData", sep="") |
... | ... |
@@ -146,23 +154,20 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold |
146 | 154 |
return(binned.data) |
147 | 155 |
} |
148 | 156 |
} else { |
149 |
- binned.data.allchroms[[length(binned.data.allchroms)+1]] <- binned.data |
|
157 |
+ binned.data <- suppressWarnings(BiocGenerics::append(binned.data, i.binned.data)) |
|
150 | 158 |
} |
151 | 159 |
cat(" \r") |
152 | 160 |
|
153 | 161 |
} |
154 |
- if (separate.chroms!=TRUE) { |
|
155 |
- cat("Concatenating chromosomes ...") |
|
156 |
- binned.data.allchroms <- do.call("rbind",binned.data.allchroms) |
|
157 |
- cat(" done\n") |
|
162 |
+ if (separate.chroms==FALSE) { |
|
158 | 163 |
if (save.as.RData==TRUE) { |
159 | 164 |
# Print to file |
160 | 165 |
filename <- paste(basename(file),"_binsize_",binsize,".RData", sep="") |
161 | 166 |
cat("Saving to file ...") |
162 |
- save(binned.data.allchroms, file=file.path(outputfolder,filename) ) |
|
167 |
+ save(binned.data, file=file.path(outputfolder,filename) ) |
|
163 | 168 |
cat(" done\n") |
164 | 169 |
} else { |
165 |
- return(binned.data.allchroms) |
|
170 |
+ return(binned.data) |
|
166 | 171 |
} |
167 | 172 |
} |
168 | 173 |
|
... | ... |
@@ -1,19 +1,6 @@ |
1 |
-binned2GRanges <- function(binned.data) { |
|
2 |
- |
|
3 |
- library(GenomicRanges) |
|
4 |
- gr <- GenomicRanges::GRanges( |
|
5 |
- seqnames = Rle(binned.data$chrom), |
|
6 |
- ranges = IRanges(start=binned.data$start, end=binned.data$end), |
|
7 |
- strand = Rle(strand("*"), nrow(binned.data)), |
|
8 |
- reads = binned.data$reads |
|
9 |
- ) |
|
10 |
- return(gr) |
|
11 |
- |
|
12 |
-} |
|
13 |
- |
|
14 | 1 |
hmm2GRanges <- function(hmm, reduce=TRUE) { |
15 | 2 |
|
16 |
- library(GenomicRanges) |
|
3 |
+# library(GenomicRanges) |
|
17 | 4 |
### Check user input ### |
18 | 5 |
if (check.univariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate hmm object (type ?uni.hmm for help)") |
19 | 6 |
if (check.logical(reduce)!=0) stop("argument 'reduce' expects TRUE or FALSE") |
... | ... |
@@ -25,10 +12,9 @@ hmm2GRanges <- function(hmm, reduce=TRUE) { |
25 | 12 |
ranges = IRanges(start=hmm$coordinates$start, end=hmm$coordinates$end), |
26 | 13 |
strand = Rle(strand("*"), nrow(hmm$coordinates)) |
27 | 14 |
) |
28 |
- # Seqlengths gives warnings because our coordinates are 0-based |
|
29 |
-# for (chrom in unique(hmm$coordinates$chrom)) { |
|
30 |
-# seqlengths(gr)[names(seqlengths(gr))==chrom] <- max(hmm$coordinates$end[hmm$coordinates$chrom==chrom]) |
|
31 |
-# } |
|
15 |
+ seqlengths(gr) <- hmm$seqlengths[names(seqlengths(gr))] |
|
16 |
+ # Reorder seqlevels |
|
17 |
+ gr <- GenomicRanges::keepSeqlevels(gr, names(hmm$seqlengths)) |
|
32 | 18 |
|
33 | 19 |
if (reduce) { |
34 | 20 |
# Reduce state by state |
... | ... |
@@ -16,25 +16,24 @@ find.aneuploidies <- function(binned.data, use.states=0:3, eps=0.001, init="stan |
16 | 16 |
war <- NULL |
17 | 17 |
if (is.null(eps.try)) eps.try <- eps |
18 | 18 |
|
19 |
- names(binned.data) <- binned.data.names # defined globally outside this function |
|
20 |
- |
|
21 | 19 |
## Assign variables |
22 | 20 |
# state.labels # assigned globally outside this function |
23 | 21 |
use.state.labels <- state.labels[use.states+1] |
24 | 22 |
numstates <- length(use.states) |
25 |
- numbins <- length(binned.data$reads) |
|
23 |
+ numbins <- length(binned.data) |
|
24 |
+ reads <- mcols(binned.data)$reads |
|
26 | 25 |
iniproc <- which(init==c("standard","random","empiric")) # transform to int |
27 | 26 |
|
28 | 27 |
# Check if there are reads in the data, otherwise HMM will blow up |
29 |
- if (!any(binned.data$reads!=0)) { |
|
28 |
+ if (!any(reads!=0)) { |
|
30 | 29 |
stop("All reads in data are zero. No univariate HMM done.") |
31 | 30 |
} |
32 | 31 |
|
33 | 32 |
# Filter high reads out, makes HMM faster |
34 |
- read.cutoff <- as.integer(quantile(binned.data$reads, 0.9999)) |
|
33 |
+ read.cutoff <- as.integer(quantile(reads, 0.9999)) |
|
35 | 34 |
if (filter.reads) { |
36 |
- mask <- binned.data$reads > read.cutoff |
|
37 |
- binned.data$reads[mask] <- read.cutoff |
|
35 |
+ mask <- reads > read.cutoff |
|
36 |
+ reads[mask] <- read.cutoff |
|
38 | 37 |
numfiltered <- length(which(mask)) |
39 | 38 |
if (numfiltered > 0) { |
40 | 39 |
warning(paste("There are very high read counts in your data (probably artificial). Replaced read counts > ",read.cutoff," (99.99% quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep="")) |
... | ... |
@@ -47,16 +46,16 @@ find.aneuploidies <- function(binned.data, use.states=0:3, eps=0.001, init="stan |
47 | 46 |
for (i_try in 1:num.trials) { |
48 | 47 |
cat("\n\nTry ",i_try," of ",num.trials," ------------------------------\n") |
49 | 48 |
hmm <- .C("R_univariate_hmm", |
50 |
- reads = as.integer(binned.data$reads), # int* O |
|
49 |
+ reads = as.integer(reads), # int* O |
|
51 | 50 |
num.bins = as.integer(numbins), # int* T |
52 | 51 |
num.states = as.integer(numstates), # int* N |
53 |
- use.states = as.integer(use.states), # int* states |
|
52 |
+ use.states = as.integer(use.states), # int* statelabels |
|
54 | 53 |
size = double(length=numstates), # double* size |
55 | 54 |
prob = double(length=numstates), # double* prob |
56 | 55 |
num.iterations = as.integer(max.iter), # int* maxiter |
57 | 56 |
time.sec = as.integer(max.time), # double* maxtime |
58 | 57 |
loglik.delta = as.double(eps.try), # double* eps |
59 |
- posteriors = double(length=numbins * numstates), # double* posteriors |
|
58 |
+ states = integer(length=numbins), # int* states |
|
60 | 59 |
A = double(length=numstates*numstates), # double* A |
61 | 60 |
proba = double(length=numstates), # double* proba |
62 | 61 |
loglik = double(length=1), # double* loglik |
... | ... |
@@ -89,7 +88,6 @@ find.aneuploidies <- function(binned.data, use.states=0:3, eps=0.001, init="stan |
89 | 88 |
warning("HMM did not converge in trial run ",i_try,"!\n") |
90 | 89 |
} |
91 | 90 |
# Store model in list |
92 |
- hmm$posteriors <- NULL |
|
93 | 91 |
hmm$reads <- NULL |
94 | 92 |
modellist[[i_try]] <- hmm |
95 | 93 |
} |
... | ... |
@@ -104,16 +102,16 @@ find.aneuploidies <- function(binned.data, use.states=0:3, eps=0.001, init="stan |
104 | 102 |
# Rerun the HMM with different epsilon and initial parameters from trial run |
105 | 103 |
cat("\n\nRerunning try ",indexmax," with eps =",eps,"--------------------\n") |
106 | 104 |
hmm <- .C("R_univariate_hmm", |
107 |
- reads = as.integer(binned.data$reads), # int* O |
|
105 |
+ reads = as.integer(reads), # int* O |
|
108 | 106 |
num.bins = as.integer(numbins), # int* T |
109 | 107 |
num.states = as.integer(numstates), # int* N |
110 |
- use.states = as.integer(use.states), # int* states |
|
108 |
+ use.states = as.integer(use.states), # int* statelabels |
|
111 | 109 |
size = double(length=numstates), # double* size |
112 | 110 |
prob = double(length=numstates), # double* prob |
113 | 111 |
num.iterations = as.integer(max.iter), # int* maxiter |
114 | 112 |
time.sec = as.integer(max.time), # double* maxtime |
115 | 113 |
loglik.delta = as.double(eps), # double* eps |
116 |
- posteriors = double(length=numbins * numstates), # double* posteriors |
|
114 |
+ states = integer(length=numbins), # int* states |
|
117 | 115 |
A = double(length=numstates*numstates), # double* A |
118 | 116 |
proba = double(length=numstates), # double* proba |
119 | 117 |
loglik = double(length=1), # double* loglik |
... | ... |
@@ -132,11 +130,11 @@ find.aneuploidies <- function(binned.data, use.states=0:3, eps=0.001, init="stan |
132 | 130 |
|
133 | 131 |
# Add useful entries |
134 | 132 |
names(hmm$weights) <- use.state.labels |
135 |
- hmm$coordinates <- binned.data[,coordinate.names] |
|
136 |
- hmm$posteriors <- matrix(hmm$posteriors, ncol=hmm$num.states) |
|
137 |
- colnames(hmm$posteriors) <- paste("P(",use.state.labels,")", sep="") |
|
133 |
+ hmm$coordinates <- data.frame(as.character(seqnames(binned.data)), start(ranges(binned.data)), end(ranges(binned.data))) |
|
134 |
+ names(hmm$coordinates) <- coordinate.names |
|
135 |
+ hmm$seqlengths <- seqlengths(binned.data) |
|
138 | 136 |
class(hmm) <- class.aneufinder.univariate |
139 |
- hmm$states <- use.state.labels[apply(hmm$posteriors, 1, which.max)] |
|
137 |
+ hmm$states <- use.state.labels[hmm$states+1] |
|
140 | 138 |
hmm$eps <- eps |
141 | 139 |
hmm$A <- matrix(hmm$A, ncol=hmm$num.states, byrow=TRUE) |
142 | 140 |
rownames(hmm$A) <- use.state.labels |
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
// This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R. |
7 | 7 |
// =================================================================================================================================================== |
8 | 8 |
extern "C" { |
9 |
-void R_univariate_hmm(int* O, int* T, int* N, int* states, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
9 |
+void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff) |
|
10 | 10 |
{ |
11 | 11 |
|
12 | 12 |
// Define logging level |
... | ... |
@@ -89,8 +89,8 @@ void R_univariate_hmm(int* O, int* T, int* N, int* states, double* size, double* |
89 | 89 |
if (*iniproc == 1) |
90 | 90 |
{ |
91 | 91 |
// Simple initialization based on data mean, assumed to be the disomic mean |
92 |
- imean = mean/2 * states[i_state]; |
|
93 |
- ivariance = variance/2 * states[i_state]; |
|
92 |
+ imean = mean/2 * statelabels[i_state]; |
|
93 |
+ ivariance = variance/2 * statelabels[i_state]; |
|
94 | 94 |
} |
95 | 95 |
|
96 | 96 |
// Calculate r and p from mean and variance |
... | ... |
@@ -137,15 +137,28 @@ void R_univariate_hmm(int* O, int* T, int* N, int* states, double* size, double* |
137 | 137 |
} |
138 | 138 |
|
139 | 139 |
FILE_LOG(logDEBUG1) << "Finished with Baum-Welch estimation"; |
140 |
- // Compute the posteriors and save results directly to the R pointer |
|
141 |
- FILE_LOG(logDEBUG1) << "Recode posteriors into column representation"; |
|
142 |
- #pragma omp parallel for |
|
143 |
- for (int iN=0; iN<*N; iN++) |
|
140 |
+ |
|
141 |
+// // Compute the posteriors and save results directly to the R pointer |
|
142 |
+// FILE_LOG(logDEBUG1) << "Recode posteriors into column representation"; |
|
143 |
+// #pragma omp parallel for |
|
144 |
+// for (int iN=0; iN<*N; iN++) |
|
145 |
+// { |
|
146 |
+// for (int t=0; t<*T; t++) |
|
147 |
+// { |
|
148 |
+// posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t); |
|
149 |
+// } |
|
150 |
+// } |
|
151 |
+ |
|
152 |
+ // Compute the states from posteriors |
|
153 |
+ FILE_LOG(logDEBUG1) << "Computing states from posteriors"; |
|
154 |
+ double posterior_per_t [*N]; |
|
155 |
+ for (int t=0; t<*T; t++) |
|
144 | 156 |
{ |
145 |
- for (int t=0; t<*T; t++) |
|
157 |
+ for (int iN=0; iN<*N; iN++) |
|
146 | 158 |
{ |
147 |
- posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t); |
|
159 |
+ posterior_per_t[iN] = hmm->get_posterior(iN, t); |
|
148 | 160 |
} |
161 |
+ states[t] = statelabels[argMax(posterior_per_t, *N)]; |
|
149 | 162 |
} |
150 | 163 |
|
151 | 164 |
FILE_LOG(logDEBUG1) << "Return parameters"; |
... | ... |
@@ -17,7 +17,7 @@ ScaleHMM::ScaleHMM(int T, int N) |
17 | 17 |
this->scalealpha = allocDoubleMatrix(T, N); |
18 | 18 |
this->scalebeta = allocDoubleMatrix(T, N); |
19 | 19 |
this->densities = allocDoubleMatrix(N, T); |
20 |
- this->tdensities = allocDoubleMatrix(T, N); |
|
20 |
+// this->tdensities = allocDoubleMatrix(T, N); |
|
21 | 21 |
this->proba = (double*) calloc(N, sizeof(double)); |
22 | 22 |
this->gamma = allocDoubleMatrix(N, T); |
23 | 23 |
this->sumgamma = (double*) calloc(N, sizeof(double)); |
... | ... |
@@ -26,7 +26,7 @@ ScaleHMM::ScaleHMM(int T, int N) |
26 | 26 |
this->dlogP = INFINITY; |
27 | 27 |
this->sumdiff_state_last = 0; |
28 | 28 |
this->sumdiff_posterior = 0.0; |
29 |
- this->use_tdens = false; |
|
29 |
+// this->use_tdens = false; |
|
30 | 30 |
|
31 | 31 |
} |
32 | 32 |
|
... | ... |
@@ -38,7 +38,7 @@ ScaleHMM::~ScaleHMM() |
38 | 38 |
freeDoubleMatrix(this->scalealpha, this->T); |
39 | 39 |
freeDoubleMatrix(this->scalebeta, this->T); |
40 | 40 |
freeDoubleMatrix(this->densities, this->N); |
41 |
- freeDoubleMatrix(this->tdensities, this->T); |
|
41 |
+// freeDoubleMatrix(this->tdensities, this->T); |
|
42 | 42 |
freeDoubleMatrix(this->gamma, this->N); |
43 | 43 |
freeDoubleMatrix(this->sumxi, this->N); |
44 | 44 |
free(this->proba); |
... | ... |
@@ -213,7 +213,7 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps) |
213 | 213 |
this->print_uni_iteration(iteration); |
214 | 214 |
|
215 | 215 |
// Check convergence |
216 |
- if(fabs(this->dlogP) < *eps) //it has converged |
|
216 |
+ if(this->dlogP < *eps) //it has converged |
|
217 | 217 |
{ |
218 | 218 |
FILE_LOG(logINFO) << "\nConvergence reached!\n"; |
219 | 219 |
Rprintf("\nConvergence reached!\n\n"); |
... | ... |
@@ -371,8 +371,8 @@ void ScaleHMM::forward() |
371 | 371 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
372 | 372 |
clock_t time = clock(), dtime; |
373 | 373 |
|
374 |
- if (not this->use_tdens) |
|
375 |
- { |
|
374 |
+// if (not this->use_tdens) |
|
375 |
+// { |
|
376 | 376 |
|
377 | 377 |
double alpha [this->N]; |
378 | 378 |
// Initialization |
... | ... |
@@ -424,61 +424,61 @@ void ScaleHMM::forward() |
424 | 424 |
} |
425 | 425 |
} |
426 | 426 |
|
427 |
- } |
|
428 |
- else if (this->use_tdens) |
|
429 |
- { |
|
430 |
- |
|
431 |
- double alpha [this->N]; |
|
432 |
- // Initialization |
|
433 |
- this->scalefactoralpha[0] = 0.0; |
|
434 |
- for (int iN=0; iN<this->N; iN++) |
|
435 |
- { |
|
436 |
- alpha[iN] = this->proba[iN] * this->tdensities[0][iN]; |
|
437 |
- FILE_LOG(logDEBUG4) << "alpha["<<iN<<"] = " << alpha[iN]; |
|
438 |
- this->scalefactoralpha[0] += alpha[iN]; |
|
439 |
- } |
|
440 |
- FILE_LOG(logDEBUG4) << "scalefactoralpha["<<0<<"] = " << scalefactoralpha[0]; |
|
441 |
- for (int iN=0; iN<this->N; iN++) |
|
442 |
- { |
|
443 |
- this->scalealpha[0][iN] = alpha[iN] / this->scalefactoralpha[0]; |
|
444 |
- FILE_LOG(logDEBUG4) << "scalealpha["<<0<<"]["<<iN<<"] = " << scalealpha[0][iN]; |
|
445 |
- } |
|
446 |
- // Induction |
|
447 |
- for (int t=1; t<this->T; t++) |
|
448 |
- { |
|
449 |
- this->scalefactoralpha[t] = 0.0; |
|
450 |
- for (int iN=0; iN<this->N; iN++) |
|
451 |
- { |
|
452 |
- double helpsum = 0.0; |
|
453 |
- for (int jN=0; jN<this->N; jN++) |
|
454 |
- { |
|
455 |
- helpsum += this->scalealpha[t-1][jN] * this->A[jN][iN]; |
|
456 |
- } |
|
457 |
- alpha[iN] = helpsum * this->tdensities[t][iN]; |
|
458 |
- FILE_LOG(logDEBUG4) << "alpha["<<iN<<"] = " << alpha[iN]; |
|
459 |
- this->scalefactoralpha[t] += alpha[iN]; |
|
460 |
- } |
|
461 |
- FILE_LOG(logDEBUG4) << "scalefactoralpha["<<t<<"] = " << scalefactoralpha[t]; |
|
462 |
- for (int iN=0; iN<this->N; iN++) |
|
463 |
- { |
|
464 |
- this->scalealpha[t][iN] = alpha[iN] / this->scalefactoralpha[t]; |
|
465 |
- FILE_LOG(logDEBUG4) << "scalealpha["<<t<<"]["<<iN<<"] = " << scalealpha[t][iN]; |
|
466 |
- if(isnan(this->scalealpha[t][iN])) |
|
467 |
- { |
|
468 |
- FILE_LOG(logWARNING) << __PRETTY_FUNCTION__; |
|
469 |
- for (int jN=0; jN<this->N; jN++) |
|
470 |
- { |
|
471 |
- FILE_LOG(logWARNING) << "scalealpha["<<t-1<<"]["<<jN<<"] = " << scalealpha[t-1][jN]; |
|
472 |
- FILE_LOG(logWARNING) << "A["<<iN<<"]["<<jN<<"] = " << A[iN][jN]; |
|
473 |
- } |
|
474 |
- FILE_LOG(logWARNING) << "scalefactoralpha["<<t<<"] = "<<scalefactoralpha[t] << ", tdensities = "<<tdensities[t][iN]; |
|
475 |
- FILE_LOG(logWARNING) << "scalealpha["<<t<<"]["<<iN<<"] = " << scalealpha[t][iN]; |
|
476 |
- exit(1); |
|
477 |
- } |
|
478 |
- } |
|
479 |
- } |
|
480 |
- |
|
481 |
- } |
|
427 |
+// } |
|
428 |
+// else if (this->use_tdens) |
|
429 |
+// { |
|
430 |
+// |
|
431 |
+// double alpha [this->N]; |
|
432 |
+// // Initialization |
|
433 |
+// this->scalefactoralpha[0] = 0.0; |
|
434 |
+// for (int iN=0; iN<this->N; iN++) |
|
435 |
+// { |
|
436 |
+// alpha[iN] = this->proba[iN] * this->tdensities[0][iN]; |
|
437 |
+// FILE_LOG(logDEBUG4) << "alpha["<<iN<<"] = " << alpha[iN]; |
|
438 |
+// this->scalefactoralpha[0] += alpha[iN]; |
|
439 |
+// } |
|
440 |
+// FILE_LOG(logDEBUG4) << "scalefactoralpha["<<0<<"] = " << scalefactoralpha[0]; |
|
441 |
+// for (int iN=0; iN<this->N; iN++) |
|
442 |
+// { |
|
443 |
+// this->scalealpha[0][iN] = alpha[iN] / this->scalefactoralpha[0]; |
|
444 |
+// FILE_LOG(logDEBUG4) << "scalealpha["<<0<<"]["<<iN<<"] = " << scalealpha[0][iN]; |
|
445 |
+// } |
|
446 |
+// // Induction |
|
447 |
+// for (int t=1; t<this->T; t++) |
|
448 |
+// { |
|
449 |
+// this->scalefactoralpha[t] = 0.0; |
|
450 |
+// for (int iN=0; iN<this->N; iN++) |
|
451 |
+// { |
|
452 |
+// double helpsum = 0.0; |
|
453 |
+// for (int jN=0; jN<this->N; jN++) |
|
454 |
+// { |
|
455 |
+// helpsum += this->scalealpha[t-1][jN] * this->A[jN][iN]; |
|
456 |
+// } |
|
457 |
+// alpha[iN] = helpsum * this->tdensities[t][iN]; |
|
458 |
+// FILE_LOG(logDEBUG4) << "alpha["<<iN<<"] = " << alpha[iN]; |
|
459 |
+// this->scalefactoralpha[t] += alpha[iN]; |
|
460 |
+// } |
|
461 |
+// FILE_LOG(logDEBUG4) << "scalefactoralpha["<<t<<"] = " << scalefactoralpha[t]; |
|
462 |
+// for (int iN=0; iN<this->N; iN++) |
|
463 |
+// { |
|
464 |
+// this->scalealpha[t][iN] = alpha[iN] / this->scalefactoralpha[t]; |
|
465 |
+// FILE_LOG(logDEBUG4) << "scalealpha["<<t<<"]["<<iN<<"] = " << scalealpha[t][iN]; |
|
466 |
+// if(isnan(this->scalealpha[t][iN])) |
|
467 |
+// { |
|
468 |
+// FILE_LOG(logWARNING) << __PRETTY_FUNCTION__; |
|
469 |
+// for (int jN=0; jN<this->N; jN++) |
|
470 |
+// { |
|
471 |
+// FILE_LOG(logWARNING) << "scalealpha["<<t-1<<"]["<<jN<<"] = " << scalealpha[t-1][jN]; |
|
472 |
+// FILE_LOG(logWARNING) << "A["<<iN<<"]["<<jN<<"] = " << A[iN][jN]; |
|
473 |
+// } |
|
474 |
+// FILE_LOG(logWARNING) << "scalefactoralpha["<<t<<"] = "<<scalefactoralpha[t] << ", tdensities = "<<tdensities[t][iN]; |
|
475 |
+// FILE_LOG(logWARNING) << "scalealpha["<<t<<"]["<<iN<<"] = " << scalealpha[t][iN]; |
|
476 |
+// exit(1); |
|
477 |
+// } |
|
478 |
+// } |
|
479 |
+// } |
|
480 |
+// |
|
481 |
+// } |
|
482 | 482 |
|
483 | 483 |
dtime = clock() - time; |
484 | 484 |
FILE_LOG(logDEBUG) << "forward(): " << dtime << " clicks"; |
... | ... |
@@ -489,8 +489,8 @@ void ScaleHMM::backward() |
489 | 489 |
FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; |
490 | 490 |
clock_t time = clock(), dtime; |
491 | 491 |
|
492 |
- if (not this->use_tdens) |
|
493 |
- { |
|
492 |
+// if (not this->use_tdens) |
|
493 |
+// { |
|
494 | 494 |
|
495 | 495 |
double beta [this->N]; |
496 | 496 |
// Initialization |
... | ... |
@@ -538,57 +538,57 @@ void ScaleHMM::backward() |
538 | 538 |
} |
539 | 539 |
} |
540 | 540 |
|
541 |
- } |
|
542 |
- else if (this->use_tdens) |
|
543 |
- { |
|
544 |
- |
|
545 |
- double beta [this->N]; |
|
546 |
- // Initialization |
|
547 |
- for (int iN=0; iN<this->N; iN++) |
|
548 |
- { |
|
549 |
- beta[iN] = 1.0; |
|
550 |
- FILE_LOG(logDEBUG4) << "beta["<<iN<<"] = " << beta[iN]; |
|
551 |
- } |
|
552 |
- FILE_LOG(logDEBUG4) << "scalefactoralpha["<<T-1<<"] = " << scalefactoralpha[T-1]; |
|
553 |
- for (int iN=0; iN<this->N; iN++) |
|
554 |
- { |
|
555 |
- this->scalebeta[T-1][iN] = beta[iN] / this->scalefactoralpha[T-1]; |
|
556 |
- FILE_LOG(logDEBUG4) << "scalebeta["<<T-1<<"]["<<iN<<"] = " << scalebeta[T-1][iN]; |
|
557 |
- } |
|
558 |
- // Induction |
|
559 |
- for (int t=this->T-2; t>=0; t--) |
|
560 |
- { |
|
561 |
- for (int iN=0; iN<this->N; iN++) |
|
562 |
- { |
|
563 |
- FILE_LOG(logDEBUG4) << "Calculating backward variable for state " << iN; |
|
564 |
- beta[iN] = 0.0; |
|
565 |
- for(int jN=0; jN<this->N; jN++) |
|
566 |
- { |
|
567 |
- beta[iN] += this->A[iN][jN] * this->tdensities[t+1][jN] * this->scalebeta[t+1][jN]; |
|
568 |
- } |
|
569 |
- } |
|
570 |
- FILE_LOG(logDEBUG4) << "scalefactoralpha["<<t<<"] = " << scalefactoralpha[t]; |
|
571 |
- for (int iN=0; iN<this->N; iN++) |
|
572 |
- { |
|
573 |
- this->scalebeta[t][iN] = beta[iN] / this->scalefactoralpha[t]; |
|
574 |
- FILE_LOG(logDEBUG4) << "scalebeta["<<t<<"]["<<iN<<"] = " << scalebeta[t][iN]; |
|
575 |
- if (isnan(this->scalebeta[t][iN])) |
|
576 |
- { |
|
577 |
- FILE_LOG(logWARNING) << __PRETTY_FUNCTION__; |
|
578 |
- for (int jN=0; jN<this->N; jN++) |
|
579 |
- { |
|
580 |
- FILE_LOG(logWARNING) << "scalebeta["<<jN<<"]["<<t+1<<"] = " << scalebeta[t+1][jN]; |
|
581 |
- FILE_LOG(logWARNING) << "A["<<iN<<"]["<<jN<<"] = " << A[iN][jN]; |
|
582 |
- FILE_LOG(logWARNING) << "tdensities["<<t+1<<"]["<<jN<<"] = " << tdensities[t+1][jN]; |
|
583 |
- } |
|
584 |
- FILE_LOG(logWARNING) << "this->scalefactoralpha[t]["<<t<<"] = "<<this->scalefactoralpha[t] << ", tdensities = "<<densities[t][iN]; |
|
585 |
- FILE_LOG(logWARNING) << "scalebeta["<<iN<<"]["<<t<<"] = " << scalebeta[t][iN]; |
|
586 |
- exit(1); |
|
587 |
- } |
|
588 |
- } |
|
589 |
- } |
|
590 |
- |
|
591 |
- } |
|
541 |
+// } |
|
542 |
+// else if (this->use_tdens) |
|
543 |
+// { |
|
544 |
+// |
|
545 |
+// double beta [this->N]; |
|
546 |
+// // Initialization |
|
547 |
+// for (int iN=0; iN<this->N; iN++) |
|
548 |
+// { |
|
549 |
+// beta[iN] = 1.0; |
|
550 |
+// FILE_LOG(logDEBUG4) << "beta["<<iN<<"] = " << beta[iN]; |
|
551 |
+// } |
|
552 |
+// FILE_LOG(logDEBUG4) << "scalefactoralpha["<<T-1<<"] = " << scalefactoralpha[T-1]; |
|
553 |
+// for (int iN=0; iN<this->N; iN++) |
|
554 |
+// { |
|
555 |
+// this->scalebeta[T-1][iN] = beta[iN] / this->scalefactoralpha[T-1]; |
|
556 |
+// FILE_LOG(logDEBUG4) << "scalebeta["<<T-1<<"]["<<iN<<"] = " << scalebeta[T-1][iN]; |
|
557 |
+// } |
|
558 |
+// // Induction |
|
559 |
+// for (int t=this->T-2; t>=0; t--) |
|
560 |
+// { |
|
561 |
+// for (int iN=0; iN<this->N; iN++) |
|
562 |
+// { |
|
563 |
+// FILE_LOG(logDEBUG4) << "Calculating backward variable for state " << iN; |
|
564 |
+// beta[iN] = 0.0; |
|
565 |
+// for(int jN=0; jN<this->N; jN++) |
|
566 |
+// { |
|
567 |
+// beta[iN] += this->A[iN][jN] * this->tdensities[t+1][jN] * this->scalebeta[t+1][jN]; |
|
568 |
+// } |
|
569 |
+// } |
|
570 |
+// FILE_LOG(logDEBUG4) << "scalefactoralpha["<<t<<"] = " << scalefactoralpha[t]; |
|
571 |
+// for (int iN=0; iN<this->N; iN++) |
|
572 |
+// { |
|
573 |
+// this->scalebeta[t][iN] = beta[iN] / this->scalefactoralpha[t]; |
|
574 |
+// FILE_LOG(logDEBUG4) << "scalebeta["<<t<<"]["<<iN<<"] = " << scalebeta[t][iN]; |
|
575 |
+// if (isnan(this->scalebeta[t][iN])) |
|
576 |
+// { |
|
577 |
+// FILE_LOG(logWARNING) << __PRETTY_FUNCTION__; |
|
578 |
+// for (int jN=0; jN<this->N; jN++) |
|
579 |
+// { |
|
580 |
+// FILE_LOG(logWARNING) << "scalebeta["<<jN<<"]["<<t+1<<"] = " << scalebeta[t+1][jN]; |
|
581 |
+// FILE_LOG(logWARNING) << "A["<<iN<<"]["<<jN<<"] = " << A[iN][jN]; |
|
582 |
+// FILE_LOG(logWARNING) << "tdensities["<<t+1<<"]["<<jN<<"] = " << tdensities[t+1][jN]; |
|
583 |
+// } |
|
584 |
+// FILE_LOG(logWARNING) << "this->scalefactoralpha[t]["<<t<<"] = "<<this->scalefactoralpha[t] << ", tdensities = "<<densities[t][iN]; |
|
585 |
+// FILE_LOG(logWARNING) << "scalebeta["<<iN<<"]["<<t<<"] = " << scalebeta[t][iN]; |
|
586 |
+// exit(1); |
|
587 |
+// } |
|
588 |
+// } |
|
589 |
+// } |
|
590 |
+// |
|
591 |
+// } |
|
592 | 592 |
|
593 | 593 |
dtime = clock() - time; |
594 | 594 |
FILE_LOG(logDEBUG) << "backward(): " << dtime << " clicks"; |
... | ... |
@@ -641,8 +641,8 @@ void ScaleHMM::calc_sumxi() |
641 | 641 |
} |
642 | 642 |
|
643 | 643 |
// Compute the sumxi |
644 |
- if (not this->use_tdens) |
|
645 |
- { |
|
644 |
+// if (not this->use_tdens) |
|
645 |
+// { |
|
646 | 646 |
|
647 | 647 |
#pragma omp parallel for |
648 | 648 |
for (int iN=0; iN<this->N; iN++) |
... | ... |
@@ -658,25 +658,25 @@ void ScaleHMM::calc_sumxi() |
658 | 658 |
} |
659 | 659 |
} |
660 | 660 |
|
661 |
- } |
|
662 |
- else if (this->use_tdens) |
|
663 |
- { |
|
664 |
- |
|
665 |
- #pragma omp parallel for |
|
666 |
- for (int iN=0; iN<this->N; iN++) |
|
667 |
- { |
|
668 |
- FILE_LOG(logDEBUG3) << "Calculating sumxi["<<iN<<"][jN]"; |
|
669 |
- for (int t=0; t<this->T-1; t++) |
|
670 |
- { |
|
671 |
- for (int jN=0; jN<this->N; jN++) |
|
672 |
- { |
|
673 |
- xi = this->scalealpha[t][iN] * this->A[iN][jN] * this->tdensities[t+1][jN] * this->scalebeta[t+1][jN]; |
|
674 |
- this->sumxi[iN][jN] += xi; |
|
675 |
- } |
|
676 |
- } |
|
677 |
- } |
|
678 |
- |
|
679 |
- } |
|
661 |
+// } |
|
662 |
+// else if (this->use_tdens) |
|
663 |
+// { |
|
664 |
+// |
|
665 |
+// #pragma omp parallel for |
|
666 |
+// for (int iN=0; iN<this->N; iN++) |
|
667 |
+// { |
|
668 |
+// FILE_LOG(logDEBUG3) << "Calculating sumxi["<<iN<<"][jN]"; |
|
669 |
+// for (int t=0; t<this->T-1; t++) |
|
670 |
+// { |
|
671 |
+// for (int jN=0; jN<this->N; jN++) |
|
672 |
+// { |
|
673 |
+// xi = this->scalealpha[t][iN] * this->A[iN][jN] * this->tdensities[t+1][jN] * this->scalebeta[t+1][jN]; |
|
674 |
+// this->sumxi[iN][jN] += xi; |
|
675 |
+// } |
|
676 |
+// } |
|
677 |
+// } |
|
678 |
+// |
|
679 |
+// } |
|
680 | 680 |
|
681 | 681 |
dtime = clock() - time; |
682 | 682 |
FILE_LOG(logDEBUG) << "calc_sumxi(): " << dtime << " clicks"; |
... | ... |
@@ -708,18 +708,18 @@ void ScaleHMM::calc_densities() |
708 | 708 |
this->densityFunctions[iN]->calc_densities(this->densities[iN]); |
709 | 709 |
} |
710 | 710 |
|
711 |
- if (this->use_tdens) |
|
712 |
- { |
|
713 |
- |
|
714 |
- for (int t=0; t<this->T; t++) |
|
715 |
- { |
|
716 |
- for (int iN=0; iN<this->N; iN++) |
|
717 |
- { |
|
718 |
- this->tdensities[t][iN] = this->densities[iN][t]; |
|
719 |
- } |
|
720 |
- } |
|
721 |
- |
|
722 |
- } |
|
711 |
+// if (this->use_tdens) |
|
712 |
+// { |
|
713 |
+// |
|
714 |
+// for (int t=0; t<this->T; t++) |
|
715 |
+// { |
|
716 |
+// for (int iN=0; iN<this->N; iN++) |
|
717 |
+// { |
|
718 |
+// this->tdensities[t][iN] = this->densities[iN][t]; |
|
719 |
+// } |
|
720 |
+// } |
|
721 |
+// |
|
722 |
+// } |
|
723 | 723 |
|
724 | 724 |
dtime = clock() - time; |
725 | 725 |
FILE_LOG(logDEBUG) << "calc_densities(): " << dtime << " clicks"; |
... | ... |
@@ -46,7 +46,7 @@ class ScaleHMM { |
46 | 46 |
double** scalealpha; ///< matrix [T x N] of forward probabilities |
47 | 47 |
double** scalebeta; ///< matrix [T x N] of backward probabilities |
48 | 48 |
double** densities; ///< matrix [N x T] of density values |
49 |
- double** tdensities; ///< matrix [T x N] of density values, for use in multivariate |
|
49 |
+// double** tdensities; ///< matrix [T x N] of density values, for use in multivariate !increases speed, but on cost of RAM usage and that seems to be limiting |
|
50 | 50 |
double* sumgamma; ///< vector[N] of sum of posteriors (gamma values) |
51 | 51 |
double** sumxi; ///< matrix[N x N] of xi values |
52 | 52 |
double** gamma; ///< matrix[N x T] of posteriors |
... | ... |
@@ -55,7 +55,7 @@ class ScaleHMM { |
55 | 55 |
int baumWelchTime_real; ///< elapsed time from start of the 0th iteration |
56 | 56 |
int sumdiff_state_last; ///< sum of the difference in the state 1 assignments from one iteration to the next |
57 | 57 |
double sumdiff_posterior; ///< sum of the difference in posterior (gamma) values from one iteration to the next |
58 |
- bool use_tdens; ///< switch for using the tdensities in the calculations |
|
58 |
+// bool use_tdens; ///< switch for using the tdensities in the calculations |
|
59 | 59 |
|
60 | 60 |
// Methods |
61 | 61 |
void forward(); ///< calculate forward variables (alpha) |