Browse code

updated code, made it faster

chakalakka authored on 26/06/2014 16:54:03
Showing 19 changed files

... ...
@@ -9,7 +9,16 @@ bedGraph2binned,
9 9
 
10 10
 find.aneuploidies,
11 11
 
12
-plot.distribution
12
+plot.distribution,
13
+
14
+univariate2bed,
15
+binned2wiggle,
16
+multivariate2bed,
17
+binned2GRanges,
18
+hmm2GRanges,
19
+
20
+mixedsort,
21
+mixedorder
13 22
 )
14 23
 
15 24
 # Import all packages listed as Imports or Depends
... ...
@@ -51,12 +51,24 @@ check.integer = function(testvar) {
51 51
 check.univariate.modellist = function(modellist) {
52 52
 	if (!is(modellist,"list")) return(1)
53 53
 	for (model in modellist) {
54
-		if (!is(model,"chromStar.univariate.model")) return(2)
54
+		if (!is(model,class.chromstar.univariate)) return(2)
55 55
 	}
56 56
 	return(0)
57 57
 }
58 58
 check.univariate.model = function(model) {
59
-	if (!is(model,"chromStar.univariate.model")) return(1)
59
+	if (!is(model,class.chromstar.univariate)) return(1)
60
+	return(0)
61
+}
62
+
63
+check.multivariate.modellist = function(modellist) {
64
+	if (!is(modellist,"list")) return(1)
65
+	for (model in modellist) {
66
+		if (!is(model,class.chromstar.multivariate)) return(2)
67
+	}
68
+	return(0)
69
+}
70
+check.multivariate.model = function(model) {
71
+	if (!is(model,class.chromstar.multivariate)) return(1)
60 72
 	return(0)
61 73
 }
62 74
 
... ...
@@ -19,7 +19,14 @@ collapse.bins = function(data, column2collapseBy=NULL, columns2sumUp=NULL) {
19 19
 		cShift1 = rep(NA,length(c))
20 20
 		cShift1[-1] = c[-length(c)]
21 21
 	}
22
-	compare = c != cShift1
22
+	compare_custom = c != cShift1
23
+	# Make the comparison vector to separate chromosomes
24
+	c <- as.integer(data[,1])
25
+	cShift1 = rep(NA,length(c))
26
+	cShift1[-1] = c[-length(c)]
27
+	compare_chrom = c != cShift1
28
+	# Combine the vectors
29
+	compare <- compare_custom | compare_chrom
23 30
 	compare[1] = TRUE
24 31
 	numcollapsedbins = length(which(compare==TRUE))
25 32
 	numbins = nrow(data)
26 33
new file mode 100644
... ...
@@ -0,0 +1,51 @@
1
+binned2GRanges <- function(binned.data) {
2
+
3
+	gr <- GenomicRanges::GRanges(
4
+			seqnames = Rle(binned.data$chrom),
5
+			ranges = IRanges(start=binned.data$start, end=binned.data$end),
6
+			strand = Rle(strand("*"), nrow(binned.data)),
7
+			reads = binned.data$reads
8
+			)
9
+	return(gr)
10
+
11
+}
12
+
13
+hmm2GRanges <- function(hmm, reduce=TRUE) {
14
+
15
+	### Check user input ###
16
+	if (check.multivariate.model(hmm)!=0 & check.univariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate or multivariate hmm object (type ?uni.hmm or ?multi.hmm for help)")
17
+	if (check.logical(reduce)!=0) stop("argument 'reduce' expects TRUE or FALSE")
18
+
19
+	### Create GRanges ###
20
+	# Transfer coordinates
21
+	gr <- GenomicRanges::GRanges(
22
+			seqnames = Rle(hmm$coordinates$chrom),
23
+			ranges = IRanges(start=hmm$coordinates$start, end=hmm$coordinates$end),
24
+			strand = Rle(strand("*"), nrow(hmm$coordinates))
25
+			)
26
+	# Seqlengths gives warnings because our coordinates are 0-based
27
+# 	for (chrom in unique(hmm$coordinates$chrom)) {
28
+# 		seqlengths(gr)[names(seqlengths(gr))==chrom] <- max(hmm$coordinates$end[hmm$coordinates$chrom==chrom])
29
+# 	}
30
+
31
+	if (reduce) {
32
+		# Reduce state by state
33
+		red.gr.list <- GenomicRanges::GRangesList()
34
+		for (state in unique(hmm$states)) {
35
+			red.gr <- GenomicRanges::reduce(gr[hmm$states==state])
36
+			mcols(red.gr)$states <- rep(as.factor(state),length(red.gr))
37
+			red.gr.list[[length(red.gr.list)+1]] <- red.gr
38
+		}
39
+		# Merge and sort
40
+		red.gr <- sort(unlist(red.gr.list))
41
+		remove(red.gr.list)
42
+		return(red.gr)
43
+	} else {
44
+		mcols(gr)$reads <- hmm$reads
45
+		mcols(gr)$posteriors <- hmm$posteriors
46
+		mcols(gr)$states <- hmm$states
47
+		return(gr)
48
+	}
49
+
50
+}
51
+
0 52
deleted file mode 100644
... ...
@@ -1,291 +0,0 @@
1
-# ----------------------------------------------------------------
2
-# Write univariate states to file
3
-# ----------------------------------------------------------------
4
-univariate2bed <- function(modellist, file="view_me_in_genome_browser", threshold=0.5, separate.zeroinflation=FALSE, chrom.length.file=NULL) {
5
-
6
-	# Check user input
7
-	if (is.null(names(modellist))) {
8
-		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
9
-	}
10
-
11
-	# Variables
12
-	nummod <- length(modellist)
13
-	file <- paste(file,"bed", sep=".")
14
-
15
-	# Generate the colors
16
-	colors <- c("zeroinflation"="black", "unmodified"="gray48","modified"="orangered3")
17
-	RGBs <- t(col2rgb(colors))
18
-	RGBs <- apply(RGBs,1,paste,collapse=",")
19
-
20
-	# Write first line to file
21
-	cat("browser hide all\n", file=file)
22
-	
23
-	for (imod in 1:nummod) {
24
-		model <- modellist[[imod]]
25
-		priority <- 50 + imod
26
-		cat(paste("track name=",names(modellist)[imod]," description=\"univariate calls for ",names(modellist)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
27
-
28
-		# Collapse the calls
29
-		if (separate.zeroinflation) {
30
-			calls <- cbind(model$coordinates, name=model$states.with.zeroinflation)
31
-			collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
32
-		} else {
33
-			calls <- cbind(model$coordinates, name=model$states)
34
-			collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
35
-			collapsed.calls <- collapsed.calls[collapsed.calls$name=="modified",]
36
-		}
37
-
38
-		# Check length of chromosomes if chrom.length.file was given
39
-		if (!is.null(chrom.length.file)) {
40
-			chrom.lengths.df <- read.table(chrom.length.file)
41
-			chrom.lengths <- chrom.lengths.df[,2]
42
-			names(chrom.lengths) <- chrom.lengths.df[,1]
43
-			for (chrom in levels(as.factor(collapsed_calls$chrom))) {
44
-				index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
45
-				collapsed_calls$end[index] <- chrom.lengths[chrom]
46
-				if (length(index) >= 1) {
47
-					warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
48
-				}
49
-			}
50
-		}
51
-
52
-		# Append to file
53
-		itemRgb <- RGBs[as.character(collapsed.calls$name)]
54
-		numsegments <- nrow(collapsed.calls)
55
-		df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
56
-		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
57
-	}
58
-
59
-}
60
-
61
-# ----------------------------------------------------------------
62
-# 
63
-# ----------------------------------------------------------------
64
-make.ucsc.file.combinatorial <- function(modellist, numstates=NULL, file="combinatorial-states-ucsc", threshold=0.5, chrom.length.file=NULL) {
65
-	
66
-	# Define variables
67
-	nummod <- length(modellist)
68
-	file <- paste(file,".BED", sep="")
69
-	coordinates <- modellist[[1]]$coordinates
70
-	
71
-	# Write first line to file
72
-	cat("browser position chr1:1-100000\n", file=file)
73
-
74
-	# Get the combinatorial states
75
-	cat("Getting the combinatorial states ...\n")
76
-	combstates <- combinatorial.states(modellist)
77
-	states <- as.numeric(names(sort(table(combstates), decreasing=TRUE)))
78
-
79
-	# Clean up to reduce memory usage
80
-	remove(modellist)
81
-
82
-	if (is.null(numstates)) {
83
-		numstates <- length(states)
84
-	} else if (numstates > length(states)) {
85
-		numstates <- length(states)
86
-	}
87
-	# Go through each state and write to file
88
-	for (istate in states[1:numstates]) {
89
-		cat("\nAnalyzing state",istate,"\n")
90
-		cat(paste("track name=\"combinatorial state ",istate,"\" description=\"univariate calls for state ",istate,", threshold = ",threshold,"\" visibility=1\n", sep=""), file=file, append=TRUE)
91
-
92
-		calls <- coordinates[ combstates == istate , ]
93
-		if (length(calls$start) >= 1) {
94
-			# Collapse the bins
95
-			cat("collapsing the calls\n")
96
-			collapsed_calls <- collapse.bins(calls)
97
-		} else {
98
-			cat("no bins in state",istate,"\n")
99
-			collapsed_calls <- NULL
100
-		}
101
-
102
-		if (!is.null(collapsed_calls)) {
103
-			# Check length of chromosomes if chrom.length.file was given
104
-			if (!is.null(chrom.length.file)) {
105
-				chrom.lengths.df <- read.table(chrom.length.file)
106
-				chrom.lengths <- chrom.lengths.df[,2]
107
-				names(chrom.lengths) <- chrom.lengths.df[,1]
108
-				for (chrom in levels(as.factor(collapsed_calls$chrom))) {
109
-					index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
110
-					collapsed_calls$end[index] <- chrom.lengths[chrom]
111
-					if (length(index) >= 1) {
112
-						warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
113
-					}
114
-				}
115
-			}
116
-
117
-			cat("appending to file ...\n")
118
-			write.table(format(as.data.frame(collapsed_calls)[,1:3], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
119
-		}
120
-	}
121
-
122
-}
123
-
124
-
125
-# ----------------------------------------------------------------
126
-# 
127
-# ----------------------------------------------------------------
128
-make.ucsc.file.multivariate <- function(model, numstates=NULL, file="multivariate-states-ucsc", chrom.length.file=NULL) {
129
-
130
-	# Variable assignments
131
-	states <- model$stateorder
132
-	coordinates <- model$coordinates
133
-	file <- paste(file,".BED", sep="")
134
-
135
-	# Write first line to file
136
-	cat("browser position chr1:1-100000\n", file=file)
137
-
138
-	# Determine state of bin
139
-	cat("Determining state of bins ...\n")
140
-	combstates <- apply(model$posteriors,1,which.max)
141
-	combstates <- states[combstates]
142
-
143
-	if (is.null(numstates)) {
144
-		numstates <- length(states)
145
-	} else if (numstates > length(states)) {
146
-		numstates <- length(states)
147
-	}
148
-	# Go through each state and write to file
149
-	for (istate in states[1:numstates]) {
150
-		cat("\nAnalyzing state",istate,"\n")
151
-		cat(paste("track name=\"multivariate state ",istate,"\" description=\"multivariate calls for state ",istate,"\" visibility=1 color=0,0,255\n", sep=""), file=file, append=TRUE)
152
-
153
-		calls <- coordinates[ combstates == istate , ]
154
-		if (length(calls$start) >= 1) {
155
-			# Collapse the bins
156
-			cat("collapsing the calls\n")
157
-			collapsed_calls <- collapse.bins(calls)
158
-		} else {
159
-			cat("no bins in state",istate,"\n")
160
-			collapsed_calls <- NULL
161
-		}
162
-
163
-		if (!is.null(collapsed_calls)) {
164
-			# Check length of chromosomes if chrom.length.file was given
165
-			if (!is.null(chrom.length.file)) {
166
-				chrom.lengths.df <- read.table(chrom.length.file)
167
-				chrom.lengths <- chrom.lengths.df[,2]
168
-				names(chrom.lengths) <- chrom.lengths.df[,1]
169
-				for (chrom in levels(as.factor(collapsed_calls$chrom))) {
170
-					index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
171
-					collapsed_calls$end[index] <- chrom.lengths[chrom]
172
-					if (length(index) >= 1) {
173
-						warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
174
-					}
175
-				}
176
-			}
177
-
178
-			cat("appending to file ...\n")
179
-			write.table(format(as.data.frame(collapsed_calls)[,1:3], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
180
-		}
181
-	}
182
-
183
-}
184
-
185
-
186
-# ----------------------------------------------------------------
187
-# Write signal tracks from binned data
188
-# ----------------------------------------------------------------
189
-bin2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") {
190
-	
191
-	# Check user input
192
-	if (is.null(names(binned.data.list))) {
193
-		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
194
-	}
195
-
196
-	# Variables
197
-	file <- paste(file,"wig", sep=".")
198
-
199
-	# Write to file
200
-	cat("browser hide all\n", file=file)
201
-	
202
-	# Go through binned.data.list
203
-	for (i1 in 1:length(binned.data.list)) {
204
-		binned.data <- binned.data.list[[i1]]
205
-		names(binned.data) <- c("chrom","start","end","reads")
206
-		
207
-		# Write track information
208
-		name <- names(binned.data.list)[i1]
209
-		binsize <- diff(binned.data$start[1:2])
210
-		description <- paste("binned read counts ",binsize,"bp", sep="")
211
-		cat(paste("track type=wiggle_0 name=\"",name,"\" description=\"",description,"\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",i1,"\n", sep=""), file=file, append=TRUE)
212
-		# Write read data
213
-		for (chrom in unique(binned.data$chrom)) {
214
-			cat(paste("fixedStep chrom=",chrom," start=1 step=",binsize," span=",binsize,"\n", sep=""), file=file, append=TRUE)
215
-			write.table(binned.data$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE)
216
-		}
217
-	}
218
-
219
-}
220
-
221
-
222
-# ----------------------------------------------------------------
223
-# Write color-coded (multivariate) combinatorial states to file
224
-# ----------------------------------------------------------------
225
-multivariate2bed <- function(model, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) {
226
-
227
-	# Variables
228
-	file <- paste(file,".bed", sep="")
229
-	combstates <- model$stateorder
230
-	if (is.null(numstates)) {
231
-		numstates <- length(combstates)
232
-	} else if (numstates > length(combstates)) {
233
-		numstates <- length(combstates)
234
-	}
235
-	if (exclude.state.zero) {
236
-		combstates2use <- setdiff(combstates,0)
237
-		combstates2use <- combstates2use[1:numstates]
238
-		combstates2use <- combstates2use[!is.na(combstates2use)]
239
-		numstates <- length(combstates2use)
240
-	}
241
-
242
-	# Collapse the calls
243
-	calls <- cbind(model$coordinates, name=model$states)
244
-	collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
245
-	# Select only desired states
246
-	mask <- rep(FALSE,nrow(collapsed.calls))
247
-	for (istate in combstates2use) {
248
-		mask <- mask | istate==collapsed.calls$name
249
-	}
250
-	collapsed.calls <- collapsed.calls[mask,]
251
-
252
-	# Check length of chromosomes if chrom.length.file was given
253
-	if (!is.null(chrom.length.file)) {
254
-		chrom.lengths.df <- read.table(chrom.length.file)
255
-		chrom.lengths <- chrom.lengths.df[,2]
256
-		names(chrom.lengths) <- chrom.lengths.df[,1]
257
-		for (chrom in levels(as.factor(collapsed.calls$chrom))) {
258
-			index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom])
259
-			collapsed.calls$end[index] <- chrom.lengths[chrom]
260
-			if (length(index) >= 1) {
261
-				warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
262
-			}
263
-		}
264
-	}
265
-
266
-	# Generate the colors for each combinatorial state
267
-	colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)]
268
-	step <- length(colors) %/% numstates
269
-	colors <- colors[seq(1,by=step,length=numstates)]
270
-	RGBs <- t(col2rgb(colors))
271
-	RGBs <- apply(RGBs,1,paste,collapse=",")
272
-	itemRgb <- RGBs[as.factor(collapsed.calls$name)]
273
-
274
-	# Write to file
275
-	cat("browser hide all\n", file=file)
276
-	numsegments <- nrow(collapsed.calls)
277
-	df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
278
-
279
-	if (separate.tracks) {
280
-		for (istate in combstates2use) {
281
-			priority <- 100 + which(istate==combstates2use)
282
-			cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
283
-			write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
284
-		}
285
-	} else {
286
-		cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE)
287
-		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
288
-	}
289
-
290
-}
291
-
292 0
new file mode 100644
... ...
@@ -0,0 +1,161 @@
1
+# ===============================================
2
+# Write color-coded tracks with univariate states
3
+# ===============================================
4
+univariate2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", threshold=0.5, chrom.length.file=NULL) {
5
+
6
+	# Check user input
7
+	if (is.null(names(uni.hmm.list))) {
8
+		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
9
+	}
10
+
11
+	# Variables
12
+	nummod <- length(uni.hmm.list)
13
+	file <- paste(file,"bed", sep=".")
14
+
15
+	# Generate the colors
16
+	colors <- gcolors[state.labels]
17
+	RGBs <- t(col2rgb(colors))
18
+	RGBs <- apply(RGBs,1,paste,collapse=",")
19
+
20
+	# Write first line to file
21
+	cat("browser hide all\n", file=file)
22
+	
23
+	for (imod in 1:nummod) {
24
+		uni.hmm <- uni.hmm.list[[imod]]
25
+		priority <- 50 + imod
26
+		cat(paste("track name=",names(uni.hmm.list)[imod]," description=\"univariate calls for ",names(uni.hmm.list)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
27
+
28
+		# Collapse the calls
29
+		calls <- cbind(uni.hmm$coordinates, name=uni.hmm$states)
30
+		collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
31
+
32
+		# Check length of chromosomes if chrom.length.file was given
33
+		if (!is.null(chrom.length.file)) {
34
+			chrom.lengths.df <- read.table(chrom.length.file)
35
+			chrom.lengths <- chrom.lengths.df[,2]
36
+			names(chrom.lengths) <- chrom.lengths.df[,1]
37
+			for (chrom in levels(as.factor(collapsed_calls$chrom))) {
38
+				index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
39
+				collapsed_calls$end[index] <- chrom.lengths[chrom]
40
+				if (length(index) >= 1) {
41
+					warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
42
+				}
43
+			}
44
+		}
45
+
46
+		# Append to file
47
+		itemRgb <- RGBs[as.character(collapsed.calls$name)]
48
+		numsegments <- nrow(collapsed.calls)
49
+		df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
50
+		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
51
+	}
52
+
53
+}
54
+
55
+
56
+# ====================================
57
+# Write signal tracks from binned data
58
+# ====================================
59
+binned2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") {
60
+	
61
+	# Check user input
62
+	if (is.null(names(binned.data.list))) {
63
+		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
64
+	}
65
+
66
+	# Variables
67
+	file <- paste(file,"wig", sep=".")
68
+
69
+	# Write to file
70
+	cat("browser hide all\n", file=file)
71
+	
72
+	# Go through binned.data.list
73
+	for (i1 in 1:length(binned.data.list)) {
74
+		binned.data <- binned.data.list[[i1]]
75
+		names(binned.data) <- binned.data.names
76
+		
77
+		# Write track information
78
+		name <- names(binned.data.list)[i1]
79
+		binsize <- diff(binned.data$start[1:2])
80
+		description <- paste("binned read counts ",binsize,"bp", sep="")
81
+		cat(paste("track type=wiggle_0 name=\"",name,"\" description=\"",description,"\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",i1,"\n", sep=""), file=file, append=TRUE)
82
+		# Write read data
83
+		for (chrom in unique(binned.data$chrom)) {
84
+			cat(paste("fixedStep chrom=",chrom," start=1 step=",binsize," span=",binsize,"\n", sep=""), file=file, append=TRUE)
85
+			write.table(binned.data$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE)
86
+		}
87
+	}
88
+
89
+}
90
+
91
+
92
+# ===============================================================
93
+# Write color-coded tracks with multivariate combinatorial states
94
+# ===============================================================
95
+multivariate2bed <- function(multi.hmm, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) {
96
+
97
+	# Variables
98
+	file <- paste(file,".bed", sep="")
99
+	combstates <- multi.hmm$comb.states
100
+	if (is.null(numstates)) {
101
+		numstates <- length(combstates)
102
+	} else if (numstates > length(combstates)) {
103
+		numstates <- length(combstates)
104
+	}
105
+	if (exclude.state.zero) {
106
+		combstates2use <- setdiff(combstates,0)
107
+		combstates2use <- combstates2use[1:numstates]
108
+		combstates2use <- combstates2use[!is.na(combstates2use)]
109
+		numstates <- length(combstates2use)
110
+	}
111
+
112
+	# Collapse the calls
113
+	calls <- cbind(multi.hmm$coordinates, name=multi.hmm$states)
114
+	collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
115
+	# Select only desired states
116
+	mask <- rep(FALSE,nrow(collapsed.calls))
117
+	for (istate in combstates2use) {
118
+		mask <- mask | istate==collapsed.calls$name
119
+	}
120
+	collapsed.calls <- collapsed.calls[mask,]
121
+
122
+	# Check length of chromosomes if chrom.length.file was given
123
+	if (!is.null(chrom.length.file)) {
124
+		chrom.lengths.df <- read.table(chrom.length.file)
125
+		chrom.lengths <- chrom.lengths.df[,2]
126
+		names(chrom.lengths) <- chrom.lengths.df[,1]
127
+		for (chrom in levels(as.factor(collapsed.calls$chrom))) {
128
+			index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom])
129
+			collapsed.calls$end[index] <- chrom.lengths[chrom]
130
+			if (length(index) >= 1) {
131
+				warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
132
+			}
133
+		}
134
+	}
135
+
136
+	# Generate the colors for each combinatorial state
137
+	colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)]
138
+	step <- length(colors) %/% numstates
139
+	colors <- colors[seq(1,by=step,length=numstates)]
140
+	RGBs <- t(col2rgb(colors))
141
+	RGBs <- apply(RGBs,1,paste,collapse=",")
142
+	itemRgb <- RGBs[as.factor(collapsed.calls$name)]
143
+
144
+	# Write to file
145
+	cat("browser hide all\n", file=file)
146
+	numsegments <- nrow(collapsed.calls)
147
+	df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
148
+
149
+	if (separate.tracks) {
150
+		for (istate in combstates2use) {
151
+			priority <- 100 + which(istate==combstates2use)
152
+			cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
153
+			write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
154
+		}
155
+	} else {
156
+		cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE)
157
+		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
158
+	}
159
+
160
+}
161
+
... ...
@@ -1,4 +1,4 @@
1
-find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) {
1
+find.aneuploidies <- function(binned.data, eps=0.001, init="standard", max.time=-1, max.it=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) {
2 2
 
3 3
 	## Intercept user input
4 4
 	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
... ...
@@ -21,6 +21,7 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu
21 21
 # 	state.labels # assigned globally outside this function
22 22
 	numstates <- length(state.labels)
23 23
 	numbins <- length(binned.data$reads)
24
+	iniproc <- which(init==c("standard","random","empiric")) # transform to int
24 25
 
25 26
 	# Check if there are reads in the data, otherwise HMM will blow up
26 27
 	if (!any(binned.data$reads!=0)) {
... ...
@@ -28,13 +29,13 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu
28 29
 	}
29 30
 
30 31
 	# Filter high reads out, makes HMM faster
32
+	read.cutoff <- as.integer(quantile(binned.data$reads, 0.9999))
31 33
 	if (filter.reads) {
32
-		limit <- 10*ceiling(var(binned.data$reads))
33
-		mask <- binned.data$reads > limit
34
-		binned.data$reads[mask] <- limit
34
+		mask <- binned.data$reads > read.cutoff
35
+		binned.data$reads[mask] <- read.cutoff
35 36
 		numfiltered <- length(which(mask))
36 37
 		if (numfiltered > 0) {
37
-			warning(paste("There are very high read counts (probably artificial) in your data. Replaced read counts > ",limit," (10*variance) by ",limit," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep=""))
38
+			warning(paste("There are very high read counts in your data (probably artificial). Replaced read counts > ",read.cutoff," (99.99% quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep=""))
38 39
 		}
39 40
 	}
40 41
 	
... ...
@@ -47,8 +48,8 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu
47 48
 			reads = as.integer(binned.data$reads), # double* O
48 49
 			num.bins = as.integer(numbins), # int* T
49 50
 			num.states = as.integer(numstates), # int* N
50
-			means = double(length=numstates), # double* means
51
-			variances = double(length=numstates), # double* variances
51
+			size = double(length=numstates), # double* size
52
+			prob = double(length=numstates), # double* prob
52 53
 			num.iterations = as.integer(max.it), #  int* maxiter
53 54
 			time.sec = as.integer(max.time), # double* maxtime
54 55
 			loglik.delta = as.double(eps.try), # double* eps
... ...
@@ -57,24 +58,28 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu
57 58
 			proba = double(length=numstates), # double* proba
58 59
 			loglik = double(length=1), # double* loglik
59 60
 			weights = double(length=numstates), # double* weights
60
-			means.initial = double(length=numstates), # double* initial_means
61
-			variances.initial = double(length=numstates), # double* initial_variances
61
+			ini.proc = as.integer(iniproc), # int* iniproc
62
+			size.initial = double(length=numstates), # double* initial_size
63
+			prob.initial = double(length=numstates), # double* initial_prob
62 64
 			A.initial = double(length=numstates*numstates), # double* initial_A
63 65
 			proba.initial = double(length=numstates), # double* initial_proba
64 66
 			use.initial.params = as.logical(0), # bool* use_initial_params
65
-			num.threads = as.integer(num.threads) # int* num_threads
67
+			num.threads = as.integer(num.threads), # int* num_threads
68
+			error = as.integer(0), # int* error (error handling)
69
+			read.cutoff = as.integer(read.cutoff) # int* read_cutoff
66 70
 		)
67 71
 
72
+		names(hmm$weights) <- state.labels
68 73
 		hmm$eps <- eps.try
69 74
 		hmm$A <- matrix(hmm$A, ncol=hmm$num.states, byrow=TRUE)
70 75
 		rownames(hmm$A) <- state.labels
71 76
 		colnames(hmm$A) <- state.labels
72
-		hmm$distributions <- cbind(mean=hmm$means, variance=hmm$variances, size=fsize(hmm$means,hmm$variances), prob=fprob(hmm$means,hmm$variances))
77
+		hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob))
73 78
 		rownames(hmm$distributions) <- state.labels
74 79
 		hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE)
75 80
 		rownames(hmm$A.initial) <- state.labels
76 81
 		colnames(hmm$A.initial) <- state.labels
77
-		hmm$distributions.initial <- cbind(mean=hmm$means.initial, variance=hmm$variances.initial, size=fsize(hmm$means.initial,hmm$variances.initial), prob=fprob(hmm$means.initial,hmm$variances.initial))
82
+		hmm$distributions.initial <- cbind(size=hmm$size.initial, prob=hmm$prob.initial, mu=fmean(hmm$size.initial,hmm$prob.initial), variance=fvariance(hmm$size.initial,hmm$prob.initial))
78 83
 		rownames(hmm$distributions.initial) <- state.labels
79 84
 		if (num.trials > 1) {
80 85
 			if (hmm$loglik.delta > hmm$eps) {
... ...
@@ -96,51 +101,58 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu
96 101
 		# Rerun the HMM with different epsilon and initial parameters from trial run
97 102
 		cat("\n\nRerunning try ",indexmax," with eps =",eps,"--------------------\n")
98 103
 		hmm <- .C("R_univariate_hmm",
99
-			reads <- as.integer(binned.data$reads), # double* O
100
-			num.bins <- as.integer(numbins), # int* T
101
-			num.states <- as.integer(numstates), # int* N
102
-			means <- double(length=numstates), # double* means
103
-			variances <- double(length=numstates), # double* variances
104
-			num.iterations <- as.integer(max.it), #  int* maxiter
105
-			time.sec <- as.integer(max.time), # double* maxtime
106
-			loglik.delta <- as.double(eps), # double* eps
107
-			posteriors <- double(length=numbins * numstates), # double* posteriors
108
-			A <- double(length=numstates*numstates), # double* A
109
-			proba <- double(length=numstates), # double* proba
110
-			loglik <- double(length=1), # double* loglik
111
-			weights <- double(length=numstates), # double* weights
112
-			means.initial <- as.vector(hmm$distributions[,'mean']), # double* initial_means
113
-			variances.initial <- as.vector(hmm$distributions[,'variance']), # double* initial_variances
114
-			A.initial <- as.vector(hmm$A), # double* initial_A
115
-			proba.initial <- as.vector(hmm$proba), # double* initial_proba
116
-			use.initial.params <- as.logical(1), # bool* use_initial_params
117
-			num.threads <- as.integer(num.threads) # int* num_threads
104
+			reads = as.integer(binned.data$reads), # double* O
105
+			num.bins = as.integer(numbins), # int* T
106
+			num.states = as.integer(numstates), # int* N
107
+			size = double(length=numstates), # double* size
108
+			prob = double(length=numstates), # double* prob
109
+			num.iterations = as.integer(max.it), #  int* maxiter
110
+			time.sec = as.integer(max.time), # double* maxtime
111
+			loglik.delta = as.double(eps), # double* eps
112
+			posteriors = double(length=numbins * numstates), # double* posteriors
113
+			A = double(length=numstates*numstates), # double* A
114
+			proba = double(length=numstates), # double* proba
115
+			loglik = double(length=1), # double* loglik
116
+			weights = double(length=numstates), # double* weights
117
+			ini.proc = as.integer(iniproc), # int* iniproc
118
+			size.initial = as.vector(hmm$distributions[,'size']), # double* initial_size
119
+			prob.initial = as.vector(hmm$distributions[,'prob']), # double* initial_prob
120
+			A.initial = as.vector(hmm$A), # double* initial_A
121
+			proba.initial = as.vector(hmm$proba), # double* initial_proba
122
+			use.initial.params = as.logical(1), # bool* use_initial_params
123
+			num.threads = as.integer(num.threads), # int* num_threads
124
+			error = as.integer(0), # int* error (error handling)
125
+			read.cutoff = as.integer(read.cutoff) # int* read_cutoff
118 126
 		)
119 127
 	}
120 128
 
121 129
 	# Add useful entries
130
+	names(hmm$weights) <- state.labels
122 131
 	hmm$coordinates <- binned.data[,coordinate.names]
123 132
 	hmm$posteriors <- matrix(hmm$posteriors, ncol=hmm$num.states)
124 133
 	colnames(hmm$posteriors) <- paste("P(",state.labels,")", sep="")
125
-	class(hmm) <- "aneufinder.model"
134
+	class(hmm) <- class.chromstar.univariate
126 135
 	hmm$states <- state.labels[apply(hmm$posteriors, 1, which.max)]
127 136
 	hmm$eps <- eps
128 137
 	hmm$A <- matrix(hmm$A, ncol=hmm$num.states, byrow=TRUE)
129 138
 	rownames(hmm$A) <- state.labels
130 139
 	colnames(hmm$A) <- state.labels
131
-	hmm$distributions <- cbind(mean=hmm$means, variance=hmm$variances, size=fsize(hmm$means,hmm$variances), prob=fprob(hmm$means,hmm$variances))
140
+	hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob))
132 141
 	rownames(hmm$distributions) <- state.labels
133 142
 	hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE)
134 143
 	rownames(hmm$A.initial) <- state.labels
135 144
 	colnames(hmm$A.initial) <- state.labels
136
-	hmm$distributions.initial <- cbind(mean=hmm$means.initial, variance=hmm$variances.initial, size=fsize(hmm$means.initial,hmm$variances.initial), prob=fprob(hmm$means.initial,hmm$variances.initial))
145
+	hmm$distributions.initial <- cbind(size=hmm$size.initial, prob=hmm$prob.initial, mu=fmean(hmm$size.initial,hmm$prob.initial), variance=fvariance(hmm$size.initial,hmm$prob.initial))
137 146
 	rownames(hmm$distributions.initial) <- state.labels
147
+	hmm$filter.reads <- filter.reads
138 148
 
139 149
 	# Delete redundant entries
140
-	hmm$r <- NULL
141
-	hmm$p <- NULL
142
-	hmm$r.initial <- NULL
143
-	hmm$p.initial <- NULL
150
+	hmm$size <- NULL
151
+	hmm$prob <- NULL
152
+	hmm$size.initial <- NULL
153
+	hmm$prob.initial <- NULL
154
+	hmm$use.initial.params <- NULL
155
+	hmm$read.cutoff <- NULL
144 156
 
145 157
 	# Issue warnings
146 158
 	if (num.trials == 1) {
... ...
@@ -148,6 +160,11 @@ find.aneuploidies <- function(binned.data, eps=0.001, max.time=-1, max.it=-1, nu
148 160
 			war <- warning("HMM did not converge!\n")
149 161
 		}
150 162
 	}
163
+	if (hmm$error == 1) {
164
+		stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem.")
165
+	} else if (hmm$error == 2) {
166
+		stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.")
167
+	}
151 168
 
152 169
 	# Return results
153 170
 	if (!is.null(war)) {
154 171
similarity index 83%
155 172
rename from R/helper.R
156 173
rename to R/global.R
... ...
@@ -4,6 +4,8 @@
4 4
 state.labels <- c("unmappable","monosomy","disomy","trisomy","tetrasomy")
5 5
 coordinate.names <- c("chrom","start","end")
6 6
 binned.data.names <- c(coordinate.names,"reads")
7
+class.chromstar.univariate <- "chromstar.univariate.hmm"
8
+gcolors <- c("unmappable"="gray68","monosomy"="red1","disomy"="springgreen3","trisomy"="red2","tetrasomy"="red3")
7 9
  
8 10
 # ============================================================================
9 11
 # Functions for a Negative Binomial to transform (mean,variance)<->(size,prob)
10 12
similarity index 100%
11 13
rename from R/mixedsort.R
12 14
rename to R/gtools.R
... ...
@@ -1,10 +1,10 @@
1 1
 # ------------------------------------------------------------
2 2
 # Plot a read histogram with univariate fits
3 3
 # ------------------------------------------------------------
4
-plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) {
4
+plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NULL) {
5 5
 
6 6
 	## Load libraries
7
-# 	library(ggplot2)
7
+	library(ggplot2)
8 8
 
9 9
 	# -----------------------------------------
10 10
 	# Get right x limit
... ...
@@ -18,15 +18,16 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL)
18 18
 	}
19 19
 
20 20
 	## Plot settings
21
+	cols <- gcolors[c("unmappaple","monosomy","disomy","trisomy","tetrasomy")]
21 22
 
22 23
 	# Select the rows to plot
23 24
 	selectmask <- rep(TRUE,length(model$reads))
24 25
 	numchrom <- length(table(model$coordinates$chrom))
25
-	if (!is.null(chr)) {
26
-		if (! chr %in% levels(model$coordinates$chrom)) {
27
-			stop(chr," can't be found in the model coordinates.")
26
+	if (!is.null(chrom)) {
27
+		if (! chrom %in% levels(model$coordinates$chrom)) {
28
+			stop(chrom," can't be found in the model coordinates.")
28 29
 		}
29
-		selectchrom <- model$coordinates$chrom == chr
30
+		selectchrom <- model$coordinates$chrom == chrom
30 31
 		selectmask <- selectmask & selectchrom
31 32
 		numchrom <- 1
32 33
 	}
... ...
@@ -41,13 +42,18 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL)
41 42
 		}
42 43
 	}
43 44
 	if (!is.null(state)) {
44
-		states <- get.states(model)
45
-		selectmask <- selectmask & states==state
45
+		selectmask <- selectmask & model$states==state
46 46
 	}
47 47
 	if (length(which(selectmask)) != length(model$reads)) {
48 48
 		reads <- model$reads[selectmask]
49
-		posteriors <- model$posteriors[selectmask,]
50
-		weights <- apply(posteriors,2,mean)
49
+# 		posteriors <- model$posteriors[selectmask,]
50
+# 		weights <- apply(posteriors,2,mean)
51
+		states <- model$states[selectmask]
52
+		weights <- rep(NA, 3)
53
+		weights[1] <- length(which(states=="zero-inflation"))
54
+		weights[2] <- length(which(states=="unmodified"))
55
+		weights[3] <- length(which(states=="modified"))
56
+		weights <- weights / length(states)
51 57
 	} else {
52 58
 		reads <- model$reads
53 59
 		weights <- model$weights
... ...
@@ -88,7 +94,7 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL)
88 94
 	}
89 95
 	
90 96
 	# Make legend and colors correct
91
-# 	ggplt <- ggplt + scale_color_manual(name="components", values=cols)
97
+	ggplt <- ggplt + scale_color_manual(name="components", values=cols)
92 98
 
93 99
 	return(ggplt)
94 100
 
... ...
@@ -101,10 +107,10 @@ plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL)
101 107
 plot.boxplot <- function(model) {
102 108
 
103 109
 	## Load libraries
104
-# 	library(ggplot2)
110
+	library(ggplot2)
105 111
 
106 112
 	## Plot settings
107
-	cols <- c("unmodified"="gray48","modified"="orangered3")
113
+	cols <- gcolors[c("unmodified","modified")]
108 114
 
109 115
 	## Boxplot
110 116
 	components <- c("unmodified","modified")[as.factor(get.states(model))]
... ...
@@ -1,2 +0,0 @@
1
-PKG_LIBS = `gsl-config --libs`
2
-PKG_CPPFLAGS = `gsl-config --cflags` -fopenmp
... ...
@@ -1,47 +1,58 @@
1
-#include "scalehmm.h"
2 1
 #include "utility.h"
2
+#include "scalehmm.h"
3
+#include <omp.h> // parallelization options
3 4
 
4
-// ---------------------------------------------------------------
5
-// void R_univariate_hmm()
5
+// ===================================================================================================================================================
6 6
 // This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R.
7
-// ---------------------------------------------------------------
7
+// ===================================================================================================================================================
8 8
 extern "C" {
9
-void R_univariate_hmm(int* O, int* T, int* N, double* means, double* variances, int* maxiter, int* maxtime, double* eps, double* posteriors, double* A, double* proba, double* loglik, double* weights, double* initial_means, double* initial_variances, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads) {
9
+void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff)
10
+{
10 11
 
11 12
 	// Define logging level
12
-// 	FILE* pFile = fopen("aneufinder.log", "w");
13
+// 	FILE* pFile = fopen("chromStar.log", "w");
13 14
 // 	Output2FILE::Stream() = pFile;
14
-//  	FILELog::ReportingLevel() = FILELog::FromString("INFO");
15
- 	FILELog::ReportingLevel() = FILELog::FromString("ITERATION");
16
-//  	FILELog::ReportingLevel() = FILELog::FromString("DEBUG1");
15
+ 	FILELog::ReportingLevel() = FILELog::FromString("ERROR");
17 16
 
18 17
 	// Parallelization settings
19 18
 	omp_set_num_threads(*num_threads);
20 19
 
21 20
 	// Print some information
22 21
 	FILE_LOG(logINFO) << "number of states = " << *N;
22
+	Rprintf("number of states = %d\n", *N);
23 23
 	FILE_LOG(logINFO) << "number of bins = " << *T;
24
+	Rprintf("number of bins = %d\n", *T);
24 25
 	if (*maxiter < 0)
25 26
 	{
26 27
 		FILE_LOG(logINFO) << "maximum number of iterations = none";
28
+		Rprintf("maximum number of iterations = none\n");
27 29
 	} else {
28 30
 		FILE_LOG(logINFO) << "maximum number of iterations = " << *maxiter;
31
+		Rprintf("maximum number of iterations = %d\n", *maxiter);
29 32
 	}
30 33
 	if (*maxtime < 0)
31 34
 	{
32 35
 		FILE_LOG(logINFO) << "maximum running time = none";
36
+		Rprintf("maximum running time = none\n");
33 37
 	} else {
34 38
 		FILE_LOG(logINFO) << "maximum running time = " << *maxtime << " sec";
39
+		Rprintf("maximum running time = %d sec\n", *maxtime);
35 40
 	}
36 41
 	FILE_LOG(logINFO) << "epsilon = " << *eps;
42
+	Rprintf("epsilon = %g\n", *eps);
43
+
37 44
 	FILE_LOG(logDEBUG3) << "observation vector";
38 45
 	for (int t=0; t<50; t++) {
39 46
 		FILE_LOG(logDEBUG3) << "O["<<t<<"] = " << O[t];
40 47
 	}
41 48
 
49
+	// Flush Rprintf statements to console
50
+	R_FlushConsole();
51
+
42 52
 	// Create the HMM
43 53
 	FILE_LOG(logDEBUG1) << "Creating a univariate HMM";
44 54
 	ScaleHMM* hmm = new ScaleHMM(*T, *N);
55
+	hmm->set_cutoff(*read_cutoff);
45 56
 	// Initialize the transition probabilities and proba
46 57
 	hmm->initialize_transition_probs(initial_A, *use_initial_params);
47 58
 	hmm->initialize_proba(initial_proba, *use_initial_params);
... ...
@@ -59,60 +70,82 @@ void R_univariate_hmm(int* O, int* T, int* N, double* means, double* variances,
59 70
 	}
60 71
 	variance = variance / *T;
61 72
 	FILE_LOG(logINFO) << "data mean = " << mean << ", data variance = " << variance;		
73
+	Rprintf("data mean = %g, data variance = %g\n", mean, variance);		
62 74
 	
63 75
 	// Go through all states of the hmm and assign the density functions
64
-	srand (clock());
65
-	int rand1, rand2;
66 76
 	double imean, ivariance;
67
-	for (int istate=0; istate<*N; istate++)
77
+	for (int i_state=0; i_state<*N; i_state++)
68 78
 	{
69
-
70 79
 		if (*use_initial_params) {
71
-			FILE_LOG(logINFO) << "Using given parameters for mean and variance";
72
-			imean = initial_means[istate];
73
-			ivariance = initial_variances[istate];
80
+			FILE_LOG(logINFO) << "Using given parameters for size and prob";
81
+			Rprintf("Using given parameters for size and prob\n");
82
+			imean = (1-initial_prob[i_state])*initial_size[i_state] / initial_prob[i_state];
83
+			ivariance = imean / initial_prob[i_state];
84
+			FILE_LOG(logDEBUG2) << "imean = " << imean;
85
+			FILE_LOG(logDEBUG2) << "ivariance = " << ivariance;
74 86
 		} else {
75
-			// Simple initialization
76
-			imean = mean * pow(2, istate-2);
77
-			ivariance = variance * pow(2, istate-2);
78
-			initial_means[istate] = imean;
79
-			initial_variances[istate] = ivariance;
87
+
88
+			if (*iniproc == 1)
89
+			{
90
+				// Simple initialization
91
+				imean = mean * pow(2, i_state-2);
92
+				ivariance = variance * pow(2, i_state-2);
93
+			}
94
+
95
+			// Calculate r and p from mean and variance
96
+			initial_size[i_state] = pow(imean,2)/(ivariance-imean);
97
+			initial_prob[i_state] = imean/ivariance;
98
+
80 99
 		}
81
-		FILE_LOG(logDEBUG3) << "imean = " << imean;
82
-		FILE_LOG(logDEBUG3) << "ivariance = " << ivariance;
83 100
 
84
-		if (istate == 0)
101
+		if (i_state >= 1)
102
+		{
103
+			FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state;
104
+			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM()
105
+			hmm->densityFunctions.push_back(d);
106
+		}
107
+		else if (i_state == 0)
85 108
 		{
109
+			FILE_LOG(logDEBUG1) << "Using only zeros for state " << i_state;
86 110
 			ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM()
87
-			FILE_LOG(logDEBUG1) << "Using "<< d->get_name() <<" for state " << istate;
88 111
 			hmm->densityFunctions.push_back(d);
89 112
 		}
90 113
 		else
91 114
 		{
92
-			NegativeBinomial *d = new NegativeBinomial(O, *T, imean, ivariance); // delete is done inside ~ScaleHMM()
93
-			FILE_LOG(logDEBUG1) << "Using "<< d->get_name() <<" for state " << istate;
115
+			FILE_LOG(logWARNING) << "Density not specified, using default negative binomial for state " << i_state;
116
+			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]);
94 117
 			hmm->densityFunctions.push_back(d);
95 118
 		}
96
-
97 119
 	}
98 120
 
121
+	// Flush Rprintf statements to console
122
+	R_FlushConsole();
123
+
99 124
 	// Do the Baum-Welch to estimate the parameters
100 125
 	FILE_LOG(logDEBUG1) << "Starting Baum-Welch estimation";
101
-	hmm->baumWelch(maxiter, maxtime, eps);
126
+	try
127
+	{
128
+		hmm->baumWelch(maxiter, maxtime, eps);
129
+	}
130
+	catch (std::exception& e)
131
+	{
132
+		FILE_LOG(logERROR) << "Error in Baum-Welch: " << e.what();
133
+		Rprintf("Error in Baum-Welch: %s\n", e.what());
134
+		if (e.what()=="nan detected") { *error = 1; }
135
+		else { *error = 2; }
136
+	}
137
+		
102 138
 	FILE_LOG(logDEBUG1) << "Finished with Baum-Welch estimation";
103 139
 	// Compute the posteriors and save results directly to the R pointer
104
-	double** post = allocDoubleMatrix(*N, *T);
105
-	hmm->get_posteriors(post);
106 140
 	FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
107 141
 	#pragma omp parallel for
108 142
 	for (int iN=0; iN<*N; iN++)
109 143
 	{
110 144
 		for (int t=0; t<*T; t++)
111 145
 		{
112
-			posteriors[t + iN * (*T)] = post[iN][t];
146
+			posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
113 147
 		}
114 148
 	}
115
-	freeDoubleMatrix(post, *N);
116 149
 
117 150
 	FILE_LOG(logDEBUG1) << "Return parameters";
118 151
 	// also return the estimated transition matrix and the initial probs
... ...
@@ -128,8 +161,18 @@ void R_univariate_hmm(int* O, int* T, int* N, double* means, double* variances,
128 161
 	// copy the estimated distribution params
129 162
 	for (int i=0; i<*N; i++)
130 163
 	{
131
-		means[i] = hmm->densityFunctions[i]->get_mean();
132
-		variances[i] = hmm->densityFunctions[i]->get_variance();
164
+		if (hmm->densityFunctions[i]->get_name() == NEGATIVE_BINOMIAL) 
165
+		{
166
+			NegativeBinomial* d = (NegativeBinomial*)(hmm->densityFunctions[i]);
167
+			size[i] = d->get_size();
168
+			prob[i] = d->get_prob();
169
+		}
170
+		else if (hmm->densityFunctions[i]->get_name() == ZERO_INFLATION)
171
+		{
172
+			// These values for a Negative Binomial define a zero-inflation (delta distribution)
173
+			size[i] = 0;
174
+			prob[i] = 1;
175
+		}
133 176
 	}
134 177
 	*loglik = hmm->get_logP();
135 178
 	hmm->calc_weights(weights);
... ...
@@ -96,75 +96,165 @@ void Normal::set_stdev(double stdev)
96 96
 // ============================================================
97 97
 
98 98
 // Constructor and Destructor ---------------------------------
99
-NegativeBinomial::NegativeBinomial(int* observations, int T, double mean, double variance)
99
+NegativeBinomial::NegativeBinomial(int* observations, int T, double size, double prob)
100 100
 {
101 101
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
102 102
 	this->obs = observations;
103 103
 	this->T = T;
104
-	this->mean = mean;
105
-	this->variance = variance;
106
-	this->size = this->fsize(mean, variance);
107
-	this->prob = this->fprob(mean, variance);
108
-	this->max_obs = intMax(observations, T);
104
+	this->size = size;
105
+	this->prob = prob;
106
+	this->lxfactorials = NULL;
107
+	// Precompute the lxfactorials that are used in computing the densities
108
+	if (this->obs != NULL)
109
+	{
110
+		this->max_obs = intMax(observations, T);
111
+		this->lxfactorials = (double*) calloc(max_obs+1, sizeof(double));
112
+		this->lxfactorials[0] = 0.0;	// Not necessary, already 0 because of calloc
113
+		this->lxfactorials[1] = 0.0;
114
+		for (int j=2; j<=max_obs; j++)
115
+		{
116
+			this->lxfactorials[j] = this->lxfactorials[j-1] + log(j);
117
+		}
118
+	}
109 119
 }
110 120
 
111 121
 NegativeBinomial::~NegativeBinomial()
112 122
 {
113 123
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
124
+	if (this->lxfactorials != NULL)
125
+	{
126
+		free(this->lxfactorials);
127
+	}
114 128
 }
115 129
 
116 130
 // Methods ----------------------------------------------------
117
-void NegativeBinomial::calc_densities(double* density)
131
+void NegativeBinomial::calc_logdensities(double* logdens)
118 132
 {
119 133
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
120
-	for (int t=0; t<this->T; t++)
134
+	double logp = log(this->prob);
135
+	double log1minusp = log(1-this->prob);
136
+	double lGammaR,lGammaRplusX,lxfactorial;
137
+	lGammaR=lgamma(this->size);
138
+	// Select strategy for computing gammas
139
+	if (this->max_obs <= this->T)
121 140
 	{
122
-		density[t] = dnbinom(this->obs[t], this->size, this->prob, 0);
141
+		FILE_LOG(logDEBUG2) << "Precomputing gammas in " << __func__ << " for every obs[t], because max(O)<=T";
142
+		double logdens_per_read [this->max_obs+1];
143
+		for (int j=0; j<=this->max_obs; j++)
144
+		{
145
+			logdens_per_read[j] = lgamma(this->size + j) - lGammaR - lxfactorials[j] + this->size * logp + j * log1minusp;
146
+		}
147
+		for (int t=0; t<this->T; t++)
148
+		{
149
+			logdens[t] = logdens_per_read[(int) this->obs[t]];
150
+			FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t];
151
+			if (isnan(logdens[t]))
152
+			{
153
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
154
+				FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t];
155
+				throw nan_detected;
156
+			}
157
+		}
123 158
 	}
124
-}
159
+	else
160
+	{
161
+		FILE_LOG(logDEBUG2) << "Computing gammas in " << __func__ << " for every t, because max(O)>T";
162
+		for (int t=0; t<this->T; t++)
163
+		{
164
+			lGammaRplusX = lgamma(this->size + this->obs[t]);
165
+			lxfactorial = this->lxfactorials[(int) this->obs[t]];
166
+			logdens[t] = lGammaRplusX - lGammaR - lxfactorial + this->size * logp + this->obs[t] * log1minusp;
167
+			FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t];
168
+			if (isnan(logdens[t]))
169
+			{
170
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
171
+				FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t];
172
+				throw nan_detected;
173
+			}
174
+		}
175
+	}
176
+} 
125 177
 
126
-void NegativeBinomial::calc_logdensities(double* logdensity)
178
+void NegativeBinomial::calc_densities(double* dens)
127 179
 {
128 180
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
129
-	for (int t=0; t<this->T; t++)
181
+	double logp = log(this->prob);
182
+	double log1minusp = log(1-this->prob);
183
+	double lGammaR,lGammaRplusX,lxfactorial;
184
+	lGammaR=lgamma(this->size);
185
+	// Select strategy for computing gammas
186
+	if (this->max_obs <= this->T)
130 187
 	{
131
-		logdensity[t] = dnbinom(this->obs[t], this->size, this->prob, 1);
188
+		FILE_LOG(logDEBUG2) << "Precomputing gammas in " << __func__ << " for every obs[t], because max(O)<=T";
189
+		double dens_per_read [this->max_obs+1];
190
+		for (int j=0; j<=this->max_obs; j++)
191
+		{
192
+			dens_per_read[j] = exp( lgamma(this->size + j) - lGammaR - lxfactorials[j] + this->size * logp + j * log1minusp );
193
+		}
194
+		for (int t=0; t<this->T; t++)
195
+		{
196
+			dens[t] = dens_per_read[(int) this->obs[t]];
197
+			FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t];
198
+			if (isnan(dens[t]))
199
+			{
200
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
201
+				FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t];
202
+				throw nan_detected;
203
+			}
204
+		}
132 205
 	}
133
-}
206
+	else
207
+	{
208
+		FILE_LOG(logDEBUG2) << "Computing gammas in " << __func__ << " for every t, because max(O)>T";
209
+		for (int t=0; t<this->T; t++)
210
+		{
211
+			lGammaRplusX = lgamma(this->size + this->obs[t]);
212
+			lxfactorial = this->lxfactorials[(int) this->obs[t]];
213
+			dens[t] = exp( lGammaRplusX - lGammaR - lxfactorial + this->size * logp + this->obs[t] * log1minusp );
214
+			FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t];
215
+			if (isnan(dens[t]))
216
+			{
217
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
218
+				FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t];
219
+				throw nan_detected;
220
+			}
221
+		}
222
+	}
223
+} 
134 224
 
135
-void NegativeBinomial::update(double* weights)
225
+void NegativeBinomial::update(double* weight)
136 226
 {
137 227
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
138 228
 	double eps = 1e-4, kmax;
139
-	double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR, logp;
229
+	double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR;
140 230
 	// Update p
141 231
 	numerator=denominator=0.0;
142
-	clock_t time, dtime;
143
-	time = clock();
232
+// 	clock_t time, dtime;
233
+// 	time = clock();
144 234
 	for (int t=0; t<this->T; t++)
145 235
 	{
146
-		numerator+=weights[t]*this->size;
147
-		denominator+=weights[t]*(this->size+this->obs[t]);
236
+		numerator+=weight[t]*this->size;
237
+		denominator+=weight[t]*(this->size+this->obs[t]);
148 238
 	}
149 239
 	this->prob = numerator/denominator; // Update of r is now done with updated p
150
-	dtime = clock() - time;
151
-	FILE_LOG(logDEBUG1) << "update prob: "<<dtime<< " clicks";
240
+	double logp = log(this->prob);
241
+// 	dtime = clock() - time;
242
+// 	FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks";
152 243
 	// Update of r with Newton Method
153
-	logp = log(this->prob);
154 244
 	rhere = this->size;
155 245
 	dr = 0.00001;
156 246
 	kmax = 20;
157
-	time = clock();
247
+// 	time = clock();
158 248
 	// Select strategy for computing digammas
159 249
 	if (this->max_obs <= this->T)
160 250
 	{
161
-		FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(obs)<=T";
251
+		FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T";
162 252
 		double DigammaRplusX[this->max_obs+1], DigammaRplusDRplusX[this->max_obs+1];
163 253
 		for (int k=1; k<kmax; k++)
164 254
 		{
165 255
 			Fr=dFrdr=0.0;
166
-			DigammaR = digamma(rhere);
167
-			DigammaRplusDR = digamma(rhere + dr);
256
+			DigammaR = digamma(rhere); // boost::math::digamma<>(rhere);
257
+			DigammaRplusDR = digamma(rhere + dr); // boost::math::digamma<>(rhere+dr);
168 258
 			// Precompute the digammas by iterating over all possible values of the observation vector
169 259
 			for (int j=0; j<=this->max_obs; j++)
170 260
 			{
... ...
@@ -175,13 +265,13 @@ void NegativeBinomial::update(double* weights)
175 265
 			{
176 266
 				if(this->obs[t]==0)
177 267
 				{
178
-					Fr+=weights[t]*logp;
268
+					Fr+=weight[t]*logp;
179 269
 					//dFrdr+=0;
180 270
 				}
181 271
 				if(this->obs[t]!=0)
182 272
 				{
183
-					Fr+=weights[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]);
184
-					dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]);
273
+					Fr+=weight[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]);
274
+					dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]);
185 275
 				}
186 276
 			}
187 277
 			if(fabs(Fr)<eps)
... ...
@@ -194,25 +284,26 @@ void NegativeBinomial::update(double* weights)
194 284
 	}
195 285
 	else
196 286
 	{
197
-		FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(obs)>T";
287
+		FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T";
198 288
 		double DigammaRplusX, DigammaRplusDRplusX;
199 289
 		for (int k=1; k<kmax; k++)
200 290
 		{
201 291
 			Fr = dFrdr = 0.0;
202
-			DigammaR = digamma(rhere);
203
-			DigammaRplusDR = digamma(rhere + dr);
292
+			DigammaR = digamma(rhere); // boost::math::digamma<>(rhere);
293
+			DigammaRplusDR = digamma(rhere + dr); // boost::math::digamma<>(rhere+dr);
204 294
 			for(int t=0; t<this->T; t++)
205 295
 			{
206
-				DigammaRplusX = digamma(rhere+this->obs[t]);
207
-				DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]);
296
+				DigammaRplusX = digamma(rhere+this->obs[t]); //boost::math::digamma<>(rhere+this->obs[ti]);
297
+				DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]);
208 298
 				if(this->obs[t]==0)
209 299
 				{
210
-					Fr+=weights[t]*logp;
300
+					Fr+=weight[t]*logp;
301
+					//dFrdr+=0;
211 302
 				}
212 303
 				if(this->obs[t]!=0)
213 304
 				{
214
-					Fr+=weights[t]*(logp-DigammaR+DigammaRplusX);
215
-					dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX);
305
+					Fr+=weight[t]*(logp-DigammaR+DigammaRplusX);
306
+					dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX);
216 307
 				}
217 308
 			}
218 309
 			if(fabs(Fr)<eps)
... ...
@@ -224,74 +315,30 @@ void NegativeBinomial::update(double* weights)
224 315
 		}
225 316
 	}
226 317
 	this->size = rhere;
227
-	FILE_LOG(logDEBUG1) << "size = "<<this->size << ", prob = "<<this->prob;
228
-
229
-	// Update mean and variance
230
-	this->mean = this->fmean(this->size, this->prob);
231
-	this->variance = this->fvariance(this->size, this->prob);
318
+	FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob;
232 319
 
233
-	dtime = clock() - time;
234
-	FILE_LOG(logDEBUG1) << "update size: "<<dtime<< " clicks";
320
+// 	dtime = clock() - time;
321
+// 	FILE_LOG(logDEBUG1) << "updateR(): "<<dtime<< " clicks";
235 322
 
236 323
 }
237 324
 
238
-double NegativeBinomial::fsize(double mean, double variance)
239
-{
240
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
241
-	return( mean*mean / (variance - mean) );
242
-}
243
-
244
-double NegativeBinomial::fprob(double mean, double variance)
245
-{
246
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
247
-	return( mean / variance );
248
-}
249
-
250
-double NegativeBinomial::fmean(double size, double prob)
251
-{
252
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
253
-	return( size/prob - size );
254
-}
255
-
256
-double NegativeBinomial::fvariance(double size, double prob)
257
-{
258
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
259
-	return( (size - prob*size) / (prob*prob) );
260
-}
261
-
262 325
 // Getter and Setter ------------------------------------------
263
-DensityName NegativeBinomial::get_name()
264
-{
265
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
266
-	return(NEGATIVE_BINOMIAL);
267
-}
268
-
269 326
 double NegativeBinomial::get_mean()
270 327
 {
271 328
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
272
-	return(this->mean);
273
-}
274
-
275
-void NegativeBinomial::set_mean(double mean)
276
-{
277
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
278
-	this->mean = mean;
279
-	this->size = fsize(mean, this->variance);
280
-	this->prob = fprob(mean, this->variance);
329
+	return this->size*(1-this->prob)/this->prob;
281 330
 }
282 331
 
283 332
 double NegativeBinomial::get_variance()
284 333
 {
285 334
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
286
-	return(this->variance);
335
+	return this->size*(1-this->prob)/this->prob/this->prob;
287 336
 }
288 337
 
289
-void NegativeBinomial::set_variance(double variance)
338
+DensityName NegativeBinomial::get_name()
290 339
 {
291 340
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
292
-	this->variance = variance;
293
-	this->size = fsize(mean, this->variance);
294
-	this->prob = fprob(mean, this->variance);
341
+	return(NEGATIVE_BINOMIAL);
295 342
 }
296 343
 
297 344
 double NegativeBinomial::get_size()
... ...
@@ -85,6 +85,7 @@ class NegativeBinomial : public Density
85 85
 		double mean; ///< mean of the negative binomial
86 86
 		double variance; ///< variance of the negative binomial
87 87
 		int max_obs; ///< maximum observation
88
+		double* lxfactorials; ///< vector of precomputed factorials (x!)
88 89
 
89 90
 		// Methods
90 91
 		double fsize(double mean, double variance);
91 92
deleted file mode 100644
... ...
@@ -1,860 +0,0 @@
1
-#include "loghmm.h"
2
-
3
-/* initialize the model */
4
-LogHMM::LogHMM(int T, int N)
5
-{
6
-	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
7
-	FILE_LOG(logDEBUG2) << "Initializing univariate LogHMM";
8
-	this->xvariate = UNIVARIATE;
9
-	this->T = T;
10
-	this->N = N;
11
-	this->A = allocDoubleMatrix(N, N);
12
-	this->logA = allocDoubleMatrix(N, N);
13
-	this->logalpha = allocDoubleMatrix(T, N);
14
-	this->logbeta = allocDoubleMatrix(T, N);