Browse code

analysis and plotting improved

chakalakka authored on 19/09/2014 09:42:08
Showing 15 changed files

... ...
@@ -11,15 +11,22 @@ find.aneuploidies,
11 11
 
12 12
 plot.distribution,
13 13
 plot.BAIT,
14
+plot.genome.overview,
14 15
 get.state.table,
16
+get.dendrogram,
17
+get.reproducibility.one2many,
18
+get.reproducibility.many2many,
15 19
 
16 20
 univariate2bed,
17 21
 binned2wiggle,
18
-multivariate2bed,
19 22
 binned2GRanges,
20 23
 hmm2GRanges,
24
+hmmList2GRangesList,
25
+loadHmmsFromFiles,
21 26
 bed2GRanges,
22 27
 
28
+get.state.labels,
29
+get.state.colors,
23 30
 mixedsort,
24 31
 mixedorder
25 32
 )
... ...
@@ -1,27 +1,11 @@
1 1
 # ==================================================================================
2 2
 # Generate a data.frame with majority-state for each chromosome and input file/model
3 3
 # ==================================================================================
4
-get.state.table <- function(modellist) {
5
-
6
-	## Intercept user input
7
-	if (check.univariate.modellist(modellist)!=0) {
8
-		cat("Loading univariate HMMs from files ...")
9
-		mlist <- NULL
10
-		for (modelfile in modellist) {
11
-			mlist[[length(mlist)+1]] <- get(load(modelfile))
12
-		}
13
-		modellist <- mlist
14
-		remove(mlist)
15
-		cat(" done\n")
16
-		if (check.univariate.modellist(modellist)!=0) stop("argument 'modellist' expects a list of univariate hmms or a list of files that contain univariate hmms")
17
-	}
18
-	
19
-	# -----------------------------------
20
-	# Percentage of aneuploid chromosomes
21
-	# -----------------------------------
4
+get.state.table <- function(modellist, numCPU=1) {
22 5
 
23 6
 	## Transform to GRanges
24
-	grlist <- lapply(modellist, hmm2GRanges, reduce=F)
7
+# 	grlist <- lapply(modellist, hmm2GRanges, reduce=F)
8
+	grlist <- hmmList2GRangesList(modellist, reduce=F, numCPU=numCPU)
25 9
 
26 10
 	## Split by chromosome
27 11
 	grsplitlist <- lapply(grlist, function(gr) { split(gr, seqnames(gr)) })
... ...
@@ -33,56 +17,209 @@ get.state.table <- function(modellist) {
33 17
 	states <- array(dim=c(length(modellist), length(tablell[[1]]), length(tablell[[1]][[1]])), dimnames=list('model'=1:length(tablell), 'chromosome'=names(tablell[[1]]), 'state'=names(tablell[[1]][[1]])))
34 18
 	for (i1 in 1:dim(states)[1]) {
35 19
 		for (i2 in 1:dim(states)[2]) {
36
-			states[i1,i2,] <- tablell[[i1]][[i2]]
20
+			tc <- tryCatch({
21
+				states[i1,i2,] <- tablell[[i1]][[i2]]
22
+			}, error = function(err) {
23
+				err
24
+			})
37 25
 		}
38 26
 	}
39 27
 
40 28
 	## Get the majority state per chromosome of each model/file
41 29
 	majorstate.f <- apply(states, c(1,2), function(x) { names(x)[which.max(x)] })
42
-	factor2number <- 1:length(state.labels) -1
43
-	names(factor2number) <- state.labels
44
-	majorstate <- majorstate.f
30
+	majorstate <- matrix(NA, ncol=ncol(majorstate.f), nrow=nrow(majorstate.f))
31
+	colnames(majorstate) <- colnames(majorstate.f)
32
+	rownames(majorstate) <- rownames(majorstate.f)
45 33
 	for (i1 in 1:ncol(majorstate.f)) {
46
-		majorstate[,i1] <- factor2number[as.character(majorstate.f[,i1])]
34
+		majorstate[,i1] <- factor(majorstate.f[,i1], levels=state.labels)
47 35
 	}
48
-	majorstate <- as.data.frame(majorstate)
36
+	majorstate.f <- matrix(state.labels[majorstate], ncol=ncol(majorstate), nrow=nrow(majorstate))
37
+	colnames(majorstate.f) <- colnames(majorstate)
38
+	rownames(majorstate.f) <- rownames(majorstate)
49 39
 
50 40
 	## Plot the percentages of aneuploidies/states per chromosome
51 41
 	library(ggplot2)
52 42
 	library(reshape2)
53
-	df <- melt(majorstate.f, measure.vars=colnames(majorstate), variable.name="chromosome", value.name="state")
43
+	df <- melt(majorstate.f, varnames=c("model","chromosome"), value.name="state")
54 44
 	df$state <- factor(df$state, levels=state.labels)
55
-	ggplt <- ggplot(df) + geom_bar(aes(n=nrow(majorstate), x=chromosome, fill=state, y=..count../n)) + theme_bw() + ylab('number of samples') + scale_fill_manual(values=state.colors) + scale_y_continuous(labels=scales::percent_format())
45
+	ggplt <- ggplot(df) + geom_bar(aes(x=chromosome, fill=state, y=..count..)) + theme_bw() + ylab('number of samples') + scale_fill_manual(values=state.colors)
46
+
47
+	## Return results
48
+	l <- list(table=majorstate, plot=ggplt)
49
+	return(l)
50
+
51
+
52
+}
56 53
 
57
-	# ----------
58
-	# Dendrogram
59
-	# ----------
54
+
55
+# ==================================================================================
56
+# Build a dendrogram
57
+# ==================================================================================
58
+get.dendrogram <- function(modellist, numCPU=1) {
60 59
 
61 60
 	### Generate the GRanges consensus template with variable bins that can hold the states of each model
61
+	## Load the files
62
+	modellist <- loadHmmsFromFiles(modellist)
63
+
62 64
 	## Transform to GRanges in reduced representation
63
-	grlred <- GRangesList()
64
-	for (i1 in 1:length(modellist)) {
65
-		grlred[[i1]] <- hmm2GRanges(modellist[[i1]], reduce=T)
66
-	}
65
+	temp <- hmmList2GRangesList(modellist, reduce=TRUE, numCPU=numCPU, consensus=TRUE)
66
+	grlred <- temp$grl
67
+	consensus <- temp$consensus
68
+
67 69
 	## Split into non-overlapping fragments
68
-	consensus <- disjoin(unlist(grlred))
69 70
 	## Overlap each models' states with that consensus template
70
-	constates <- matrix(NA, ncol=length(modellist), nrow=length(consensus))
71
-	for (i1 in 1:length(modellist)) {
72
-		splt <- split(grlred[[i1]], mcols(grlred[[i1]])$state)
71
+	cat('calculate overlap\n')
72
+	constates <- foreach (gr = grlred, .packages='GenomicRanges', .combine='cbind') %do% {
73
+		splt <- split(gr, mcols(gr)$state)
73 74
 		mind <- as.matrix(findOverlaps(consensus, splt))
74
-		constates[,i1] <- mind[,'subjectHits']
75
+		col <- matrix(-1, nrow=length(consensus), ncol=1)
76
+		col[mind[,'queryHits'],1] <- mind[,'subjectHits']
77
+		col
75 78
 	}
79
+	colnames(constates) <- unlist(lapply(modellist, '[[', 'ID'))
80
+		
76 81
 	## Distance measure
82
+	cat('calculating distance\n')
77 83
 	wcor <- cov.wt(constates, wt=as.numeric(width(consensus)), cor=T)
78 84
 	dist <- as.dist(1-wcor$cor)
79 85
 	## Dendrogram
80 86
 	hc <- hclust(dist)
81 87
 
82 88
 	## Return results
83
-	l <- list(table=majorstate, percentage.plot=ggplt, dendrogram=hc)
89
+	return(hc)
90
+
91
+}
92
+
93
+
94
+# ==================================================================================
95
+# Get a simple measure of reproducibility between a multiple (num.query.samples)
96
+# cell sample (query) and single cell (subjects) samples
97
+# ==================================================================================
98
+get.reproducibility.one2many <- function(query, num.query.samples=1, subjects, num.tests=10, numCPU=1) {
99
+
100
+	if (check.univariate.model(query)!=0) {
101
+		cat("loading univariate HMM from file\n")
102
+		query <- get(load(query))
103
+		if (check.univariate.model(query)!=0) stop("argument 'query' expects a univariate hmm or a file that contains a univariate hmm")
104
+	}
105
+
106
+	## Transform to GRanges
107
+	query.gr <- hmm2GRanges(query, reduce=F)
108
+	subjects.grl <- hmmList2GRangesList(subjects, reduce=T, numCPU=numCPU)
109
+
110
+	## Overlap each subject's states with query
111
+	cat('overlapping each subject\'s states with query\n')
112
+	library(doParallel)
113
+	cl <- makeCluster(numCPU)
114
+	registerDoParallel(cl)
115
+	constates <- foreach (subject.gr = subjects.grl, .packages='GenomicRanges', .combine='cbind') %dopar% {
116
+		splt <- split(subject.gr, mcols(subject.gr)$state)
117
+		mind <- as.matrix(findOverlaps(query.gr, splt, select='first'))
118
+	}
119
+	stopCluster(cl)
120
+
121
+	## Comparing sample of subjects to query
122
+	cat('comparing samples of subjects to query\n')
123
+	means <- matrix(NA, nrow=num.tests, ncol=length(levels(mcols(query.gr)$state)))
124
+	colnames(means) <- levels(mcols(query.gr)$state)
125
+	meanconstates <- matrix(NA, nrow=nrow(constates), ncol=num.tests)
126
+	for (itest in 1:num.tests) {
127
+		cat('test',itest,'\n')
128
+
129
+		## Sample the columns to compare to the query
130
+		cols <- sample(1:length(subjects), num.query.samples)
131
+
132
+		## Mean of each position
133
+		meanconstates[,itest] <- apply(constates[,cols], 1, mean, na.rm=T)
134
+
135
+		## Subjects-mean of each query-level
136
+		meanlevel <- NULL
137
+		varlevel <- NULL
138
+		for (level in levels(mcols(query.gr)$state)) {
139
+			meanlevel[level] <- mean(meanconstates[mcols(query.gr)$state==level])
140
+			varlevel[level] <- var(meanconstates[mcols(query.gr)$state==level])
141
+		}
142
+		means[itest,] <- meanlevel
143
+	}
144
+	## Plot the results
145
+	library(ggplot2)
146
+	library(reshape2)
147
+	df <- melt(data.frame(state=mcols(query.gr)$state, meanstate=meanconstates))
148
+	ggplt <- ggplot(df) + geom_boxplot(aes(x=state, y=value, fill=variable)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + scale_y_continuous(labels=levels(mcols(query.gr)$state)) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples'))
149
+
150
+	df <- melt(means, varnames=c('test','state'))
151
+	ggplt1 <- ggplot(df) + geom_boxplot(aes(x=state, y=value)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + scale_y_continuous(labels=levels(mcols(query.gr)$state)) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples'))
152
+
153
+	## Comparing all subjects to query (no means)
154
+	cat('comparing all samples to query\n')
155
+	df.constates <- as.data.frame(cbind(mcols(query.gr)$state, constates))
156
+	names(df.constates)[1] <- 'query'
157
+	df <- melt(df.constates, id.vars='query', value.name='state')
158
+	df <- df[seq(from=1, to=nrow(df), by=10000),]
159
+	df$query <- factor(levels(mcols(query.gr)$state)[df$query], levels=levels(mcols(query.gr)$state))
160
+
161
+	ggplt2 <- ggplot(df) + geom_boxplot(aes(x=query, y=state)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + scale_y_continuous(labels=levels(mcols(query.gr)$state)) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples'))
162
+
163
+	df$state <- factor(levels(mcols(query.gr)$state)[df$state], levels=levels(mcols(query.gr)$state))
164
+	ggplt3 <- ggplot(df) + geom_jitter(aes(x=query, y=state), position=position_jitter(width=0.3, height=0.2)) + theme_bw() + coord_cartesian(ylim=c(1,length(levels(mcols(query.gr)$state)))) + xlab(paste0(num.query.samples,' cell sample')) + ylab(paste0('single cell samples'))
165
+	
166
+	
167
+	
168
+
169
+
170
+	## Return results
171
+	l <- list(meanplot=ggplt, meanmeanplot=ggplt1, allbox=ggplt2, alljitter=ggplt3)
84 172
 	return(l)
85 173
 
86 174
 
175
+
87 176
 }
88 177
 
178
+
179
+
180
+# ==================================================================================
181
+# Get a simple measure of reproducibility between sets of single cell samples
182
+# ==================================================================================
183
+get.reproducibility.many2many <- function(samples, num.tests=10, size.test=10, numCPU=1) {
184
+
185
+	## Transform to GRanges
186
+	temp <- hmmList2GRangesList(samples, numCPU=numCPU, reduce=T, consensus=T)
187
+	samples.grl <- temp$grl
188
+	consensus <- temp$consensus
189
+	constates <- temp$constates
190
+
191
+	## Comparing samples against each other
192
+	cat('comparing samples against each other\n')
193
+	meanconstates <- array(dim=c(length(consensus), 2, num.tests))
194
+	for (itest in 1:num.tests) {
195
+		cat('test',itest,'\n')
196
+
197
+		## Sample the columns to compare to the query
198
+		cols <- sample(1:length(samples), 2*size.test)
199
+		cols1 <- cols[1:size.test]
200
+		cols2 <- cols[(size.test+1):(2*size.test)]
201
+
202
+		## Mean of each position
203
+		if (size.test==1) {
204
+			meanconstates[,1,itest] <- constates[,cols1]
205
+			meanconstates[,2,itest] <- constates[,cols2]
206
+		} else {
207
+			meanconstates[,1,itest] <- apply(constates[,cols1], 1, mean, na.rm=T)
208
+			meanconstates[,2,itest] <- apply(constates[,cols2], 1, mean, na.rm=T)
209
+		}
210
+
211
+	}
212
+	## Mean along all tests
213
+	means <- apply(meanconstates, c(1,2), mean)
214
+	means <- cbind(as.data.frame(means), width=width(consensus))
215
+
216
+	## Plot the results
217
+	library(ggplot2)
218
+	library(reshape2)
219
+	states <- levels(mcols(samples.grl[[1]])$state)
220
+	ggplt1 <- ggplot(as.data.frame(means)) + geom_point(aes(x=V1, y=V2, alpha=width)) + theme_bw() + coord_cartesian(ylim=c(1,length(states))) + scale_x_continuous(breaks=1:length(states), labels=states) + scale_y_continuous(breaks=1:length(states), labels=states) + xlab(paste0(size.test,' single cell samples')) + ylab(paste0(size.test,' single cell samples'))
221
+
222
+	## Return results
223
+	return(ggplt1)
224
+
225
+}
... ...
@@ -90,7 +90,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
90 90
 		cat('Automatically determining binsizes...')
91 91
 		gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use),
92 92
 																	ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
93
-		autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr)))
93
+		autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr),flag=scanBamFlag(isDuplicate=F)))
94 94
 		numreadsperbp <- length(autodata) / sum(as.numeric(chrom.lengths[chroms2use]))
95 95
 		## Pad binsizes and reads.per.bin with each others value
96 96
 		binsizes.add <- round(reads.per.bin / numreadsperbp, -2)
... ...
@@ -161,7 +161,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
161 161
 
162 162
 			if (format=="bam") {
163 163
 				cat("reading reads from file...               \r")
164
-				data <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(i.binned.data)))
164
+				data <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(i.binned.data),flag=scanBamFlag(isDuplicate=F)))
165 165
 			}
166 166
 
167 167
 			## Count overlaps
... ...
@@ -1,3 +1,71 @@
1
+loadHmmsFromFiles <- function(uni.hmm.list) {
2
+
3
+	## Intercept user input
4
+	if (check.univariate.modellist(uni.hmm.list)!=0) {
5
+		cat("loading univariate HMMs from files\n")
6
+		mlist <- NULL
7
+		for (modelfile in uni.hmm.list) {
8
+			mlist[[length(mlist)+1]] <- get(load(modelfile))
9
+		}
10
+		uni.hmm.list <- mlist
11
+		remove(mlist)
12
+		if (check.univariate.modellist(uni.hmm.list)!=0) stop("argument 'uni.hmm.list' expects a list of univariate hmms or a list of files that contain univariate hmms")
13
+	}
14
+	
15
+	return(uni.hmm.list)
16
+
17
+}
18
+
19
+hmmList2GRangesList <- function(uni.hmm.list, reduce=TRUE, numCPU=1, consensus=FALSE) {
20
+
21
+	## Load models
22
+	uni.hmm.list <- loadHmmsFromFiles(uni.hmm.list)
23
+
24
+	## Transform to GRanges
25
+	cat('transforming to GRanges\n')
26
+	if (numCPU > 1) {
27
+		library(doParallel)
28
+		cl <- makeCluster(numCPU)
29
+		registerDoParallel(cl)
30
+		cfun <- function(...) { GRangesList(...) }
31
+		uni.hmm.grl <- foreach (uni.hmm = uni.hmm.list, .packages='aneufinder', .combine='cfun', .multicombine=TRUE) %dopar% {
32
+			hmm2GRanges(uni.hmm, reduce=reduce)
33
+		}
34
+		if (consensus) {
35
+			consensus.gr <- disjoin(unlist(uni.hmm.grl))
36
+			constates <- foreach (uni.hmm.gr = uni.hmm.grl, .packages='GenomicRanges', .combine='cbind') %dopar% {
37
+				splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state)
38
+				mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first'))
39
+			}
40
+		}
41
+		stopCluster(cl)
42
+	} else {
43
+		uni.hmm.grl <- GRangesList()
44
+		for (uni.hmm in uni.hmm.list) {
45
+			uni.hmm.grl[[length(uni.hmm.grl)+1]] <- hmm2GRanges(uni.hmm, reduce=reduce)
46
+		}
47
+		if (consensus) {
48
+			consensus.gr <- disjoin(unlist(uni.hmm.grl))
49
+			constates <- matrix(NA, ncol=length(uni.hmm.grl), nrow=length(uni.hmm.grl[[1]]))
50
+			for (i1 in 1:length(uni.hmm.grl)) {
51
+				uni.hmm.gr <- uni.hmm.grl[[i1]]
52
+				splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state)
53
+				mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first'))
54
+				constates[,i1] <- mind
55
+			}
56
+		}
57
+	}
58
+	if (consensus) {
59
+		meanstates <- apply(constates, 1, mean)
60
+		mcols(consensus.gr) <- meanstates
61
+		return(list(grl=uni.hmm.grl, consensus=consensus.gr, constates=constates))
62
+	} else {
63
+		return(uni.hmm.grl)
64
+	}
65
+
66
+
67
+}
68
+
1 69
 binned2GRanges <- function(binned.data, chrom.length.file=NULL, offset=0) {
2 70
 
3 71
 	library(GenomicRanges)
... ...
@@ -1,50 +1,84 @@
1 1
 # ===============================================
2 2
 # Write color-coded tracks with univariate states
3 3
 # ===============================================
4
-univariate2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", threshold=0.5, chrom.length.file=NULL) {
4
+univariate2bed <- function(uni.hmm.list, file="view_me_in_genome_browser", numCPU=1) {
5
+
6
+	## Intercept user input
7
+	if (check.univariate.modellist(uni.hmm.list)!=0) {
8
+		cat("Loading univariate HMMs from files ...")
9
+		mlist <- NULL
10
+		for (modelfile in uni.hmm.list) {
11
+			mlist[[length(mlist)+1]] <- get(load(modelfile))
12
+		}
13
+		uni.hmm.list <- mlist
14
+		remove(mlist)
15
+		cat(" done\n")
16
+		if (check.univariate.modellist(uni.hmm.list)!=0) stop("argument 'uni.hmm.list' expects a list of univariate hmms or a list of files that contain univariate hmms")
17
+	}
5 18
 
6
-	# Check user input
7
-	if (is.null(names(uni.hmm.list))) {
8
-		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
19
+	## Transform to GRanges
20
+	cat('transforming to GRanges\n')
21
+	if (numCPU > 1) {
22
+		library(doParallel)
23
+		cl <- makeCluster(numCPU)
24
+		registerDoParallel(cl)
25
+		cfun <- function(...) { GRangesList(...) }
26
+		uni.hmm.grl <- foreach (uni.hmm = uni.hmm.list, .packages='aneufinder', .combine='cfun', .multicombine=TRUE) %dopar% {
27
+			hmm2GRanges(uni.hmm, reduce=T)
28
+		}
29
+		consensus.gr <- disjoin(unlist(uni.hmm.grl))
30
+		constates <- foreach (uni.hmm.gr = uni.hmm.grl, .packages='GenomicRanges', .combine='cbind') %dopar% {
31
+			splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state)
32
+			mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first'))
33
+		}
34
+		stopCluster(cl)
35
+	} else {
36
+		uni.hmm.grl <- GRangesList()
37
+		for (uni.hmm in uni.hmm.list) {
38
+			uni.hmm.grl[[length(uni.hmm.grl)+1]] <- hmm2GRanges(uni.hmm, reduce=T)
39
+		}
40
+		consensus.gr <- disjoin(unlist(uni.hmm.grl))
41
+		constates <- matrix(NA, ncol=length(uni.hmm.grl), nrow=length(uni.hmm.grl[[1]]))
42
+		for (i1 in 1:length(uni.hmm.grl)) {
43
+			uni.hmm.gr <- uni.hmm.grl[[i1]]
44
+			splt <- split(uni.hmm.gr, mcols(uni.hmm.gr)$state)
45
+			mind <- as.matrix(findOverlaps(consensus.gr, splt, select='first'))
46
+			constates[,i1] <- mind
47
+		}
9 48
 	}
49
+	meanstates <- apply(constates, 1, mean)
50
+	mcols(consensus.gr) <- meanstates
10 51
 
11 52
 	# Variables
12 53
 	nummod <- length(uni.hmm.list)
13 54
 	file <- paste(file,"bed", sep=".")
14 55
 
15 56
 	# Generate the colors
16
-	colors <- gcolors[state.labels]
57
+	colors <- state.colors[state.labels]
17 58
 	RGBs <- t(col2rgb(colors))
18 59
 	RGBs <- apply(RGBs,1,paste,collapse=",")
19 60
 
20 61
 	# Write first line to file
21 62
 	cat("browser hide all\n", file=file)
22 63
 	
64
+	### Write every model to file ###
23 65
 	for (imod in 1:nummod) {
24 66
 		uni.hmm <- uni.hmm.list[[imod]]
67
+		uni.hmm.gr <- uni.hmm.grl[[imod]]
25 68
 		priority <- 50 + imod
26
-		cat(paste("track name=",names(uni.hmm.list)[imod]," description=\"univariate calls for ",names(uni.hmm.list)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
69
+		cat(paste("track name=",uni.hmm$ID," description=\"univariate calls for ",names(uni.hmm.list)[imod],"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
70
+
71
+		# Change chromosome names from '1' to 'chr1' if necessary
72
+		mask <- which(!grepl('chr', seqnames(uni.hmm.gr)))
73
+		mcols(uni.hmm.gr)$chromosome <- as.character(seqnames(uni.hmm.gr))
74
+		mcols(uni.hmm.gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(uni.hmm.gr)$chromosome[mask])
75
+		mcols(uni.hmm.gr)$chromosome <- as.factor(mcols(uni.hmm.gr)$chromosome)
27 76
 
28 77
 		# Collapse the calls
29
-		calls <- cbind(uni.hmm$coordinates, name=uni.hmm$states)
30
-		collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
31
-
32
-		# Check length of chromosomes if chrom.length.file was given
33
-		if (!is.null(chrom.length.file)) {
34
-			chrom.lengths.df <- read.table(chrom.length.file)
35
-			chrom.lengths <- chrom.lengths.df[,2]
36
-			names(chrom.lengths) <- chrom.lengths.df[,1]
37
-			for (chrom in levels(as.factor(collapsed_calls$chrom))) {
38
-				index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
39
-				collapsed_calls$end[index] <- chrom.lengths[chrom]
40
-				if (length(index) >= 1) {
41
-					warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
42
-				}
43
-			}
44
-		}
78
+		collapsed.calls <- as.data.frame(uni.hmm.gr)[c('chromosome','start','end','state')]
45 79
 
46 80
 		# Append to file
47
-		itemRgb <- RGBs[as.character(collapsed.calls$name)]
81
+		itemRgb <- RGBs[as.character(collapsed.calls$state)]
48 82
 		numsegments <- nrow(collapsed.calls)
49 83
 		df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
50 84
 		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
... ...
@@ -89,73 +123,3 @@ binned2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") {
89 123
 }
90 124
 
91 125
 
92
-# ===============================================================
93
-# Write color-coded tracks with multivariate combinatorial states
94
-# ===============================================================
95
-multivariate2bed <- function(multi.hmm, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) {
96
-
97
-	# Variables
98
-	file <- paste(file,".bed", sep="")
99
-	combstates <- multi.hmm$comb.states
100
-	if (is.null(numstates)) {
101
-		numstates <- length(combstates)
102
-	} else if (numstates > length(combstates)) {
103
-		numstates <- length(combstates)
104
-	}
105
-	if (exclude.state.zero) {
106
-		combstates2use <- setdiff(combstates,0)
107
-		combstates2use <- combstates2use[1:numstates]
108
-		combstates2use <- combstates2use[!is.na(combstates2use)]
109
-		numstates <- length(combstates2use)
110
-	}
111
-
112
-	# Collapse the calls
113
-	calls <- cbind(multi.hmm$coordinates, name=multi.hmm$states)
114
-	collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
115
-	# Select only desired states
116
-	mask <- rep(FALSE,nrow(collapsed.calls))
117
-	for (istate in combstates2use) {
118
-		mask <- mask | istate==collapsed.calls$name
119
-	}
120
-	collapsed.calls <- collapsed.calls[mask,]
121
-
122
-	# Check length of chromosomes if chrom.length.file was given
123
-	if (!is.null(chrom.length.file)) {
124
-		chrom.lengths.df <- read.table(chrom.length.file)
125
-		chrom.lengths <- chrom.lengths.df[,2]
126
-		names(chrom.lengths) <- chrom.lengths.df[,1]
127
-		for (chrom in levels(as.factor(collapsed.calls$chrom))) {
128
-			index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom])
129
-			collapsed.calls$end[index] <- chrom.lengths[chrom]
130
-			if (length(index) >= 1) {
131
-				warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
132
-			}
133
-		}
134
-	}
135
-
136
-	# Generate the colors for each combinatorial state
137
-	colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)]
138
-	step <- length(colors) %/% numstates
139
-	colors <- colors[seq(1,by=step,length=numstates)]
140
-	RGBs <- t(col2rgb(colors))
141
-	RGBs <- apply(RGBs,1,paste,collapse=",")
142
-	itemRgb <- RGBs[as.factor(collapsed.calls$name)]
143
-
144
-	# Write to file
145
-	cat("browser hide all\n", file=file)
146
-	numsegments <- nrow(collapsed.calls)
147
-	df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
148
-
149
-	if (separate.tracks) {
150
-		for (istate in combstates2use) {
151
-			priority <- 100 + which(istate==combstates2use)
152
-			cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
153
-			write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
154
-		}
155
-	} else {
156
-		cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE)
157
-		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
158
-	}
159
-
160
-}
161
-
... ...
@@ -1,4 +1,4 @@
1
-find.aneuploidies <- function(binned.data, ID, use.states=0:5, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) {
1
+find.aneuploidies <- function(binned.data, ID, use.states=0:6, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) {
2 2
 
3 3
 	## Intercept user input
4 4
 	IDcheck <- ID  #trigger error if not defined
... ...
@@ -79,6 +79,10 @@ find.aneuploidies <- function(binned.data, ID, use.states=0:5, eps=0.001, init="
79 79
 		colnames(hmm$A) <- use.state.labels
80 80
 		hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob))
81 81
 		rownames(hmm$distributions) <- use.state.labels
82
+		# Treat 'null-mixed' separately
83
+			hmm$distributions[2,'mu'] <- (1-hmm$distributions[2,'prob'])/hmm$distributions[2,'prob']
84
+			hmm$distributions[2,'variance'] <- hmm$distributions[2,'mu']/hmm$distributions[2,'prob']
85
+			hmm$distributions[2,'size'] <- NA
82 86
 		hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE)
83 87
 		rownames(hmm$A.initial) <- use.state.labels
84 88
 		colnames(hmm$A.initial) <- use.state.labels
... ...
@@ -143,6 +147,10 @@ find.aneuploidies <- function(binned.data, ID, use.states=0:5, eps=0.001, init="
143 147
 	colnames(hmm$A) <- use.state.labels
144 148
 	hmm$distributions <- cbind(size=hmm$size, prob=hmm$prob, mu=fmean(hmm$size,hmm$prob), variance=fvariance(hmm$size,hmm$prob))
145 149
 	rownames(hmm$distributions) <- use.state.labels
150
+	# Treat 'null-mixed' separately
151
+		hmm$distributions[2,'mu'] <- (1-hmm$distributions[2,'prob'])/hmm$distributions[2,'prob']
152
+		hmm$distributions[2,'variance'] <- hmm$distributions[2,'mu']/hmm$distributions[2,'prob']
153
+		hmm$distributions[2,'size'] <- NA
146 154
 	hmm$A.initial <- matrix(hmm$A.initial, ncol=hmm$num.states, byrow=TRUE)
147 155
 	rownames(hmm$A.initial) <- use.state.labels
148 156
 	colnames(hmm$A.initial) <- use.state.labels
... ...
@@ -1,11 +1,13 @@
1 1
 # =======================================================
2 2
 # Some global variables that can be used in all functions
3 3
 # =======================================================
4
-state.labels <- c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")
4
+state.labels <- c("nullsomy","null-mixed","monosomy","disomy","trisomy","tetrasomy","multisomy")
5 5
 coordinate.names <- c("chrom","start","end")
6 6
 binned.data.names <- c(coordinate.names,"reads")
7 7
 class.aneufinder.univariate <- "aneufinder.univariate.hmm"
8
-state.colors <- c("mapped"="gray68","nullsomy"="gray20","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black")
8
+state.colors <- c("mapped"="gray68","nullsomy"="gray20","null-mixed"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black")
9
+get.state.labels <- function() { return(state.labels) }
10
+get.state.colors <- function() { return(state.colors[state.labels]) }
9 11
  
10 12
 # ============================================================================
11 13
 # Functions for a Negative Binomial to transform (mean,variance)<->(size,prob)
... ...
@@ -70,9 +70,12 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL
70 70
 	x <- 0:rightxlim
71 71
 	distributions <- list(x)
72 72
 
73
-	# unmappable
73
+	# zero-inflation
74 74
 	distributions[[length(distributions)+1]] <- c(weights[1],rep(0,length(x)-1))
75
-	for (istate in 2:numstates) {
75
+	# geometric
76
+	distributions[[length(distributions)+1]] <- weights[2] * dgeom(x, model$distributions[2,'prob'])
77
+	# negative binomials
78
+	for (istate in 3:numstates) {
76 79
 		distributions[[length(distributions)+1]] <- weights[istate] * dnbinom(x, model$distributions[istate,'size'], model$distributions[istate,'prob'])
77 80
 	}
78 81
 	distributions <- as.data.frame(distributions)
... ...
@@ -195,3 +198,85 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') {
195 198
 	}
196 199
 	d <- dev.off()
197 200
 }
201
+
202
+
203
+
204
+# ------------------------------------------------------------
205
+# Plot overview
206
+# ------------------------------------------------------------
207
+plot.genome.overview <- function(modellist, file='aneufinder_genome_overview', numCPU=1) {
208
+	
209
+	## Function definitions
210
+	reformat <- function(x) {
211
+	out_list <- list() 
212
+
213
+		for ( i in 2:length(x) ) {
214
+			out_list[[i]] <- c(x[i-1], x[i])
215
+		}
216
+	mt <- do.call("rbind",out_list)
217
+	df <- data.frame(mt)
218
+	colnames(df) <- c("start", "end")
219
+	df
220
+	}
221
+
222
+	## Load and transform to GRanges
223
+	cat('transforming to GRanges\n')
224
+	uni.hmm.grl <- hmmList2GRangesList(modellist, reduce=FALSE)
225
+
226
+	## Setup page
227
+	library(grid)
228
+	library(ggplot2)
229
+	nrows <- length(uni.hmm.grl)	# rows for plotting genomes
230
+	ncols <- 1
231
+	png(file=paste0(file, '.png'), width=ncols*60, height=nrows*5, units='cm', res=150)
232
+	grid.newpage()
233
+	layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T)
234
+	pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,rep(10,length(uni.hmm.grl))))))
235
+	# Main title
236
+	grid.text(file, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26))
237
+
238
+	## Prepare some variables for plotting
239
+	gr <- uni.hmm.grl[[1]]
240
+	len <- seqlengths(gr)
241
+	chr.names <- levels(seqnames(gr))
242
+	len <- as.numeric(len)
243
+	len <- c(0,len)
244
+	len <- cumsum(len)
245
+	df <- reformat(len)
246
+	df$col <- rep(c("grey47","grey77"), 12)
247
+	df$breaks <- df[,1] + ((df[,2]-df[,1])/2)
248
+	df$chr.names <- chr.names 
249
+
250
+	my_theme <- theme(
251
+				legend.position="none",
252
+				panel.background=element_blank(),
253
+				panel.border=element_blank(),
254
+				panel.grid.major=element_blank(),
255
+				panel.grid.minor=element_blank(),
256
+				plot.background=element_blank()
257
+	)
258
+		
259
+	## Go through models and plot
260
+	for (i1 in 1:length(uni.hmm.grl)) {
261
+		cat('plotting model',i1,'\n')
262
+		# Get the i,j matrix positions of the regions that contain this subplot
263
+		matchidx <- as.data.frame(which(layout == i1+ncols, arr.ind = TRUE))
264
+
265
+		trans_gr <- biovizBase::transformToGenome(uni.hmm.grl[[i1]], space.skip = 0)
266
+
267
+		dfplot <- as.data.frame(trans_gr)
268
+		dfplot$reads <- stats::runmed(uni.hmm.grl[[i1]]$reads, 15)
269
+
270
+		ggplt <- ggplot(dfplot, aes(x=.start, y=reads))
271
+		ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads, col=state), size=0.2)
272
+		ggplt <- ggplt + geom_rect(data=df, aes(xmin=start, xmax=end, ymin=0, ymax=Inf, fill = col),  alpha=I(0.3), inherit.aes = F)
273
+		ggplt <- ggplt + scale_x_continuous(breaks = df$breaks, labels=df$chr.names, expand = c(0,0)) + scale_y_continuous(expand=c(0,5)) + scale_fill_manual(values = c("grey47","grey77")) + scale_color_manual(values=state.colors, drop=F) + xlab("chromosomes") + my_theme
274
+		suppressWarnings(
275
+		print(ggplt + ylim(0,30), vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col))
276
+		)
277
+
278
+
279
+	}
280
+	d <- dev.off()
281
+}
282
+
... ...
@@ -1,5 +1,6 @@
1 1
 #include "utility.h"
2 2
 #include "scalehmm.h"
3
+#include "loghmm.h"
3 4
 #include <omp.h> // parallelization options
4 5
 
5 6
 // ===================================================================================================================================================
... ...
@@ -13,7 +14,7 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do
13 14
 // 	FILE* pFile = fopen("chromStar.log", "w");
14 15
 // 	Output2FILE::Stream() = pFile;
15 16
  	FILELog::ReportingLevel() = FILELog::FromString("ERROR");
16
-//  	FILELog::ReportingLevel() = FILELog::FromString("DEBUG1");
17
+//  	FILELog::ReportingLevel() = FILELog::FromString("DEBUG2");
17 18
 
18 19
 	// Parallelization settings
19 20
 	omp_set_num_threads(*num_threads);
... ...
@@ -53,6 +54,7 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do
53 54
 	// Create the HMM
54 55
 	FILE_LOG(logDEBUG1) << "Creating a univariate HMM";
55 56
 	ScaleHMM* hmm = new ScaleHMM(*T, *N);
57
+// 	LogHMM* hmm = new LogHMM(*T, *N);
56 58
 	hmm->set_cutoff(*read_cutoff);
57 59
 	// Initialize the transition probabilities and proba
58 60
 	hmm->initialize_transition_probs(initial_A, *use_initial_params);
... ...
@@ -99,16 +101,22 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do
99 101
 
100 102
 		}
101 103
 
102
-		if (i_state >= 1)
104
+		if (i_state == 0)
103 105
 		{
104
-			FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state;
105
-			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM()
106
+			FILE_LOG(logDEBUG1) << "Using only zeros for state " << i_state;
107
+			ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM()
106 108
 			hmm->densityFunctions.push_back(d);
107 109
 		}
108
-		else if (i_state == 0)
110
+		else if (i_state == 1)
109 111
 		{
110
-			FILE_LOG(logDEBUG1) << "Using only zeros for state " << i_state;
111
-			ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM()
112
+			FILE_LOG(logDEBUG1) << "Using geometric distribution for state " << i_state;
113
+			Geometric *d = new Geometric(O, *T, 0.9); // delete is done inside ~ScaleHMM()
114
+			hmm->densityFunctions.push_back(d);
115
+		}
116
+		else if (i_state >= 2)
117
+		{
118
+			FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state;
119
+			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM()
112 120
 			hmm->densityFunctions.push_back(d);
113 121
 		}
114 122
 		else
... ...
@@ -181,6 +189,12 @@ void R_univariate_hmm(int* O, int* T, int* N, int* statelabels, double* size, do
181 189
 			size[i] = d->get_size();
182 190
 			prob[i] = d->get_prob();
183 191
 		}
192
+		else if (hmm->densityFunctions[i]->get_name() == GEOMETRIC)
193
+		{
194
+			Geometric* d = (Geometric*)(hmm->densityFunctions[i]);
195
+			size[i] = 0;
196
+			prob[i] = d->get_prob();
197
+		}
184 198
 		else if (hmm->densityFunctions[i]->get_name() == ZERO_INFLATION)
185 199
 		{
186 200
 			// These values for a Negative Binomial define a zero-inflation (delta distribution)
... ...
@@ -222,7 +222,7 @@ void NegativeBinomial::calc_densities(double* dens)
222 222
 	}
223 223
 } 
224 224
 
225
-void NegativeBinomial::update(double* weight)
225
+void NegativeBinomial::update(double* weights)
226 226
 {
227 227
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
228 228
 	double eps = 1e-4, kmax;
... ...
@@ -233,8 +233,8 @@ void NegativeBinomial::update(double* weight)
233 233
 // 	time = clock();
234 234
 	for (int t=0; t<this->T; t++)
235 235
 	{
236
-		numerator+=weight[t]*this->size;
237
-		denominator+=weight[t]*(this->size+this->obs[t]);
236
+		numerator+=weights[t]*this->size;
237
+		denominator+=weights[t]*(this->size+this->obs[t]);
238 238
 	}
239 239
 	this->prob = numerator/denominator; // Update of size (r) is now done with updated prob
240 240
 	double logp = log(this->prob);
... ...
@@ -265,13 +265,13 @@ void NegativeBinomial::update(double* weight)
265 265
 			{
266 266
 				if(this->obs[t]==0)
267 267
 				{
268
-					Fr+=weight[t]*logp;
268
+					Fr+=weights[t]*logp;
269 269
 					//dFrdr+=0;
270 270
 				}
271 271
 				if(this->obs[t]!=0)
272 272
 				{
273
-					Fr+=weight[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]);
274
-					dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]);
273
+					Fr+=weights[t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]);
274
+					dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]);
275 275
 				}
276 276
 			}
277 277
 			if(fabs(Fr)<eps)
... ...
@@ -297,13 +297,122 @@ void NegativeBinomial::update(double* weight)
297 297
 				DigammaRplusDRplusX = digamma(rhere+dr+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]);
298 298
 				if(this->obs[t]==0)
299 299
 				{
300
-					Fr+=weight[t]*logp;
300
+					Fr+=weights[t]*logp;
301 301
 					//dFrdr+=0;
302 302
 				}
303 303
 				if(this->obs[t]!=0)
304 304
 				{
305
-					Fr+=weight[t]*(logp-DigammaR+DigammaRplusX);
306
-					dFrdr+=weight[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX);
305
+					Fr+=weights[t]*(logp-DigammaR+DigammaRplusX);
306
+					dFrdr+=weights[t]/dr*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX);
307
+				}
308
+			}
309
+			if(fabs(Fr)<eps)
310
+			{
311
+				break;
312
+			}
313
+			if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr;
314
+			if(Fr/dFrdr>rhere) rhere=rhere/2.0;
315
+		}
316
+	}
317
+	this->size = rhere;
318
+	FILE_LOG(logDEBUG1) << "r = "<<this->size << ", p = "<<this->prob;
319
+
320
+// 	dtime = clock() - time;
321
+// 	FILE_LOG(logDEBUG1) << "updateR(): "<<dtime<< " clicks";
322
+
323
+}
324
+
325
+void NegativeBinomial::update_constrained(double** weights, int fromState, int toState)
326
+{
327
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
328
+	double eps = 1e-4, kmax;
329
+	double numerator, denominator, rhere, dr, Fr, dFrdr, DigammaR, DigammaRplusDR;
330
+	// Update prob (p)
331
+	numerator=denominator=0.0;
332
+// 	clock_t time, dtime;
333
+// 	time = clock();
334
+	for (int i=0; i<toState-fromState; i++)
335
+	{
336
+		for (int t=0; t<this->T; t++)
337
+		{
338
+			numerator+=weights[i+fromState][t]*this->size*(i+1);
339
+			denominator+=weights[i+fromState][t]*(this->size*(i+1)+this->obs[t]);
340
+		}
341
+	}
342
+	this->prob = numerator/denominator; // Update of size (r) is now done with updated prob
343
+	double logp = log(this->prob);
344
+// 	dtime = clock() - time;
345
+// 	FILE_LOG(logDEBUG1) << "updateP(): "<<dtime<< " clicks";
346
+	// Update of size (r) with Newton Method
347
+	rhere = this->size;
348
+	dr = 0.00001;
349
+	kmax = 20;
350
+// 	time = clock();
351
+	// Select strategy for computing digammas
352
+	if (this->max_obs <= this->T)
353
+	{
354
+		FILE_LOG(logDEBUG2) << "Precomputing digammas in " << __func__ << " for every obs[t], because max(O)<=T";
355
+		double DigammaRplusX[this->max_obs+1], DigammaRplusDRplusX[this->max_obs+1];
356
+		for (int k=1; k<kmax; k++)
357
+		{
358
+			Fr=dFrdr=0.0;
359
+			for (int i=0; i<toState-fromState; i++)
360
+			{
361
+				DigammaR = digamma((i+1)*rhere); // boost::math::digamma<>(rhere);
362
+				DigammaRplusDR = digamma((i+1)*(rhere + dr)); // boost::math::digamma<>(rhere+dr);
363
+				// Precompute the digammas by iterating over all possible values of the observation vector
364
+				for (int j=0; j<=this->max_obs; j++)
365
+				{
366
+					DigammaRplusX[j] = digamma((i+1)*rhere+j);
367
+					DigammaRplusDRplusX[j] = digamma((i+1)*(rhere+dr)+j);
368
+				}
369
+				for(int t=0; t<this->T; t++)
370
+				{
371
+					if(this->obs[t]==0)
372
+					{
373
+						Fr+=weights[i+fromState][t]*logp;
374
+						//dFrdr+=0;
375
+					}
376
+					if(this->obs[t]!=0)
377
+					{
378
+						Fr+=weights[i+fromState][t]*(logp-DigammaR+DigammaRplusX[(int)obs[t]]);
379
+						dFrdr+=weights[i+fromState][t]/((i+1)*dr)*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX[(int)obs[t]]-DigammaRplusX[(int)obs[t]]);
380
+					}
381
+				}
382
+				if(fabs(Fr)<eps)
383
+				{
384
+					break;
385
+				}
386
+				if(Fr/dFrdr<rhere) rhere=rhere-Fr/dFrdr;
387
+				if(Fr/dFrdr>rhere) rhere=rhere/2.0;
388
+			}
389
+		}
390
+	}
391
+	else
392
+	{
393
+		FILE_LOG(logDEBUG2) << "Computing digammas in " << __func__ << " for every t, because max(O)>T";
394
+		double DigammaRplusX, DigammaRplusDRplusX;
395
+		for (int k=1; k<kmax; k++)
396
+		{
397
+			Fr = dFrdr = 0.0;
398
+			for (int i=0; i<toState-fromState; i++)
399
+			{
400
+				DigammaR = digamma((i+1)*rhere); // boost::math::digamma<>(rhere);
401
+				DigammaRplusDR = digamma((i+1)*(rhere + dr)); // boost::math::digamma<>(rhere+dr);
402
+				for(int t=0; t<this->T; t++)
403
+				{
404
+					DigammaRplusX = digamma((i+1)*rhere+this->obs[t]); //boost::math::digamma<>(rhere+this->obs[ti]);
405
+					DigammaRplusDRplusX = digamma((i+1)*(rhere+dr)+this->obs[t]); // boost::math::digamma<>(rhere+dr+this->obs[ti]);
406
+					if(this->obs[t]==0)
407
+					{
408
+						Fr+=weights[i+fromState][t]*logp;
409
+						//dFrdr+=0;
410
+					}
411
+					if(this->obs[t]!=0)
412
+					{
413
+						Fr+=weights[i+fromState][t]*(logp-DigammaR+DigammaRplusX);
414
+						dFrdr+=weights[i+fromState][t]/((i+1)*dr)*(DigammaR-DigammaRplusDR+DigammaRplusDRplusX-DigammaRplusX);
415
+					}
307 416
 				}
308 417
 			}
309 418
 			if(fabs(Fr)<eps)
... ...
@@ -437,7 +546,7 @@ void ZeroInflation::calc_logdensities(double* logdensity)
437 546
 		if(this->obs[t]==0)
438 547
 		{
439 548
 			logdensity[t] = 0.0;
440
-		};
549
+		}
441 550
 		if(this->obs[t]>0)
442 551
 		{
443 552
 			logdensity[t] = -100; // -INFINITY gives nan's somewhere downstream
... ...
@@ -481,3 +590,186 @@ void ZeroInflation::set_variance(double variance)
481 590
 }
482 591
 
483 592
 
593
+// ============================================================
594
+// Geometric density
595
+// ============================================================
596
+
597
+// Constructor and Destructor ---------------------------------
598
+Geometric::Geometric(int* observations, int T, double prob)
599
+{
600
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
601
+	this->obs = observations;
602
+	this->T = T;
603
+	this->prob = prob;
604
+	if (this->obs != NULL)
605
+	{
606
+		this->max_obs = intMax(observations, T);
607
+	}
608
+}
609
+
610
+Geometric::~Geometric()
611
+{
612
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
613
+}
614
+
615
+// Methods ----------------------------------------------------
616
+void Geometric::calc_logdensities(double* logdens)
617
+{
618
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
619
+	double logp = log(this->prob);
620
+	double log1minusp = log(1-this->prob);
621
+	// Select strategy for computing logdensities
622
+	if (this->max_obs <= this->T)
623
+	{
624
+		FILE_LOG(logDEBUG2) << "Precomputing logdensities in " << __func__ << " for every obs[t], because max(O)<=T";
625
+		double logdens_per_read [this->max_obs+1];
626
+		for (int j=0; j<=this->max_obs; j++)
627
+		{
628
+			logdens_per_read[j] = logp + j * log1minusp;
629
+		}
630
+		for (int t=0; t<this->T; t++)
631
+		{
632
+			logdens[t] = logdens_per_read[(int) this->obs[t]];
633
+			FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t];
634
+			if (isnan(logdens[t]))
635
+			{
636
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
637
+				FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t];
638
+				throw nan_detected;
639
+			}
640
+		}
641
+	}
642
+	else
643
+	{
644
+		FILE_LOG(logDEBUG2) << "Computing logdensities in " << __func__ << " for every t, because max(O)>T";
645
+		for (int t=0; t<this->T; t++)
646
+		{
647
+			logdens[t] = logp + this->obs[t] * log1minusp;
648
+			FILE_LOG(logDEBUG4) << "logdens["<<t<<"] = " << logdens[t];
649
+			if (isnan(logdens[t]))
650
+			{
651
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
652
+				FILE_LOG(logERROR) << "logdens["<<t<<"] = "<< logdens[t];
653
+				throw nan_detected;
654
+			}
655
+		}
656
+	}
657
+} 
658
+
659
+void Geometric::calc_densities(double* dens)
660
+{
661
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
662
+	double p = this->prob;
663
+	double oneminusp = 1-this->prob;
664
+	// Select strategy for computing gammas
665
+	if (this->max_obs <= this->T)
666
+	{
667
+		FILE_LOG(logDEBUG2) << "Precomputing densities in " << __func__ << " for every obs[t], because max(O)<=T";
668
+		double dens_per_read [this->max_obs+1];
669
+		for (int j=0; j<=this->max_obs; j++)
670
+		{
671
+			dens_per_read[j] = p * pow(oneminusp,j);
672
+		}
673
+		for (int t=0; t<this->T; t++)
674
+		{
675
+			dens[t] = dens_per_read[(int) this->obs[t]];
676
+			FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t];
677
+			if (isnan(dens[t]))
678
+			{
679
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
680
+				FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t];
681
+				throw nan_detected;
682
+			}
683
+		}
684
+	}
685
+	else
686
+	{
687
+		FILE_LOG(logDEBUG2) << "Computing densities in " << __func__ << " for every t, because max(O)>T";
688
+		for (int t=0; t<this->T; t++)
689
+		{
690
+			dens[t] = p * pow(oneminusp,this->obs[t]);
691
+			FILE_LOG(logDEBUG4) << "dens["<<t<<"] = " << dens[t];
692
+			if (isnan(dens[t]))
693
+			{
694
+				FILE_LOG(logERROR) << __PRETTY_FUNCTION__;
695
+				FILE_LOG(logERROR) << "dens["<<t<<"] = "<< dens[t];
696
+				throw nan_detected;
697
+			}
698
+		}
699
+	}
700
+} 
701
+
702
+void Geometric::update(double* weight)
703
+{
704
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
705
+	double numerator, denominator;
706
+	// Update prob (p)
707
+	numerator=denominator=0.0;
708
+	for (int t=0; t<this->T; t++)
709
+	{
710
+		numerator+=weight[t];
711
+		denominator+=weight[t]*(1+this->obs[t]);
712
+	}
713
+	this->prob = numerator/denominator;
714
+	FILE_LOG(logDEBUG1) << "p = "<<this->prob;
715
+}
716
+
717
+double Geometric::fprob(double mean, double variance)
718
+{
719
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
720
+	return( mean / variance );
721
+}
722
+
723
+double Geometric::fmean(double prob)
724
+{
725
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
726
+	return( (1-prob) / prob );
727
+}
728
+
729
+double Geometric::fvariance(double prob)
730
+{
731
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
732
+	return( (1-prob) / (prob*prob) );
733
+}
734
+
735
+// Getter and Setter ------------------------------------------
736
+double Geometric::get_mean()
737
+{
738
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
739
+	return( this->fmean( this->prob ) );
740
+}
741
+
742
+void Geometric::set_mean(double mean)
743
+{
744
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
745
+	double variance = this->get_variance();
746
+	this->prob = this->fprob( mean, variance );
747
+}
748
+
749
+double Geometric::get_variance()
750
+{
751
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
752
+	return( this->fvariance( this->prob ) );
753
+}
754
+
755
+void Geometric::set_variance(double variance)
756
+{
757
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
758
+	double mean = this->get_mean();
759
+	this->prob = this->fprob( mean, variance );
760
+}
761
+
762
+DensityName Geometric::get_name()
763
+{
764
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
765
+	return(GEOMETRIC);
766
+}
767
+
768
+double Geometric::get_prob()
769
+{
770
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
771
+	return(this->prob);
772
+}
773
+
774
+
775
+
... ...
@@ -4,7 +4,7 @@
4 4
 #ifndef DENSITIES_H
5 5
 #define DENSITIES_H
6 6
 
7
-enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL};
7
+enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL, GEOMETRIC};
8 8
 
9 9
 class Density
10 10
 {
... ...
@@ -15,6 +15,7 @@ class Density
15 15
 		virtual void calc_logdensities(double* logdensity) {};
16 16
 		virtual void calc_densities(double* density) {};
17 17
 		virtual void update(double* weight) {}; 
18
+		virtual void update_constrained(double** weights, int fromState, int toState) {};
18 19
 		// Getter and Setter
19 20
 		virtual DensityName get_name() {};
20 21
 		virtual double get_mean() {};
... ...
@@ -59,7 +60,7 @@ class NegativeBinomial : public Density
59 60
 {
60 61
 	public:
61 62
 		// Constructor and Destructor
62
-		NegativeBinomial(int* observations, int T, double mean, double variance);
63
+		NegativeBinomial(int* observations, int T, double size, double prob);
63 64
 		~NegativeBinomial();
64 65
 
65 66
 		// Methods
... ...
@@ -67,6 +68,7 @@ class NegativeBinomial : public Density
67 68
 		void calc_densities(double* density);
68 69
 		void calc_logdensities(double* logdensity);
69 70
 		void update(double* weights);
71
+		void update_constrained(double** weights, int fromState, int toState);
70 72
 		double fsize(double mean, double variance);
71 73
 		double fprob(double mean, double variance);
72 74
 		double fmean(double size, double prob);
... ...
@@ -119,4 +121,39 @@ class ZeroInflation : public Density
119 121
 };
120 122
 
121 123
 
124
+class Geometric : public Density
125
+{
126
+	public:
127
+		// Constructor and Destructor
128
+		Geometric(int* observations, int T, double prob);
129
+		~Geometric();
130
+
131
+		// Methods
132
+		DensityName get_name();
133
+		void calc_densities(double* density);
134
+		void calc_logdensities(double* logdensity);
135
+		void update(double* weights);
136
+		double fprob(double mean, double variance);
137
+		double fmean(double prob);
138
+		double fvariance(double prob);
139
+
140
+		// Getter and Setter
141
+		double get_mean();
142
+		void set_mean(double mean);
143
+		double get_variance();
144
+		void set_variance(double variance);
145
+		double get_prob();
146
+
147
+	private:
148
+		// Member variables
149
+		int T; ///< length of observation vector
150
+		int* obs; ///< vector [T] of observations
151
+		int max_obs; ///< maximum observation
152
+		double prob; ///< probability parameter of the geometric distribution
153
+		double mean; ///< mean of the geometric distribution
154
+		double variance; ///< variance of the geometric distribution
155
+
156
+};
157
+
158
+
122 159
 #endif
123 160
new file mode 100644
... ...
@@ -0,0 +1,723 @@
1
+#include "loghmm.h"
2
+
3
+// ============================================================
4
+// Hidden Markov Model implemented with scaling strategy
5
+// ============================================================
6
+
7
+// Public =====================================================
8
+
9
+// Constructor and Destructor ---------------------------------
10
+LogHMM::LogHMM(int T, int N)
11
+{
12
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
13
+	this->T = T;
14
+	this->N = N;
15
+	this->A = allocDoubleMatrix(N, N);
16
+	this->logA = allocDoubleMatrix(N, N);
17
+	this->logalpha = allocDoubleMatrix(T, N);
18
+	this->logbeta = allocDoubleMatrix(T, N);
19
+	this->logdensities = allocDoubleMatrix(N, T);
20
+// 	this->tdensities = allocDoubleMatrix(T, N);
21
+	this->proba = (double*) calloc(N, sizeof(double));
22
+	this->logproba = (double*) calloc(N, sizeof(double));
23
+	this->gamma = allocDoubleMatrix(N, T);
24
+	this->sumgamma = (double*) calloc(N, sizeof(double));
25
+	this->sumxi = allocDoubleMatrix(N, N);
26
+	this->logP = -INFINITY;
27
+	this->dlogP = INFINITY;
28
+	this->sumdiff_state_last = 0;
29
+	this->sumdiff_posterior = 0.0;
30
+// 	this->use_tdens = false;
31
+
32
+}
33
+
34
+LogHMM::~LogHMM()
35
+{
36
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
37
+	freeDoubleMatrix(this->A, this->N);
38
+	freeDoubleMatrix(this->logA, this->N);
39
+	freeDoubleMatrix(this->logalpha, this->T);
40
+	freeDoubleMatrix(this->logbeta, this->T);
41
+	freeDoubleMatrix(this->logdensities, this->N);
42
+// 	freeDoubleMatrix(this->tdensities, this->T);
43
+	freeDoubleMatrix(this->gamma, this->N);
44
+	freeDoubleMatrix(this->sumxi, this->N);
45
+	free(this->proba);
46
+	free(this->logproba);
47
+	free(this->sumgamma);
48
+	for (int iN=0; iN<this->N; iN++)
49
+	{
50
+		FILE_LOG(logDEBUG1) << "Deleting density functions"; 
51
+		delete this->densityFunctions[iN];
52
+	}
53
+}
54
+
55
+// Methods ----------------------------------------------------
56
+void LogHMM::initialize_transition_probs(double* initial_A, bool use_initial_params)
57
+{
58
+
59
+	if (use_initial_params)
60
+	{
61
+		for (int iN=0; iN<this->N; iN++)
62
+		{
63
+			for (int jN=0; jN<this->N; jN++)
64
+			{
65
+				// convert from vector to matrix representation
66
+				this->A[jN][iN] = initial_A[iN*this->N + jN];
67
+				this->logA[jN][iN] = log(this->A[jN][iN]);
68
+			}
69
+		}
70
+	}
71
+	else
72
+	{
73
+		double self = 0.9;
74
+// 		self = 1.0 / this->N; // set to uniform
75
+		double other = (1.0 - self) / (this->N - 1.0);
76
+		for (int iN=0; iN<this->N; iN++)
77
+		{
78
+			for (int jN=0; jN<this->N; jN++)
79
+			{
80
+				if (iN == jN)
81
+				{
82
+					this->A[iN][jN] = self;
83
+					this->logA[iN][jN] = log(this->A[iN][jN]);
84
+				}
85
+				else
86
+				{
87
+					this->A[iN][jN] = other;
88
+					this->logA[iN][jN] = log(this->A[iN][jN]);
89
+				}
90
+				// Save value to initial A
91
+				initial_A[iN*this->N + jN] = this->A[iN][jN];
92
+			}
93
+		}
94
+	}
95
+	
96
+}
97
+
98
+void LogHMM::initialize_proba(double* initial_proba, bool use_initial_params)
99
+{
100
+
101
+	if (use_initial_params)
102
+	{
103
+		for (int iN=0; iN<this->N; iN++)
104
+		{
105
+			this->proba[iN] = initial_proba[iN];
106
+			this->proba[iN] = log(this->proba[iN]);
107
+		}
108
+	}
109
+	else
110
+	{
111
+		for (int iN=0; iN<this->N; iN++)
112
+		{
113
+			this->proba[iN] = (double)1/this->N;
114
+			this->proba[iN] = log(this->proba[iN]);
115
+			// Save value to initial proba
116
+			initial_proba[iN] = this->proba[iN];
117
+		}
118
+	}
119
+
120
+}
121
+
122
+void LogHMM::baumWelch(int* maxiter, int* maxtime, double* eps)
123
+{
124
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
125
+
126
+	double logPold = -INFINITY;
127
+	double logPnew;
128
+	double** gammaold = allocDoubleMatrix(this->N, this->T);
129
+
130
+	// Parallelization settings
131
+// 	omp_set_nested(1);
132
+	
133
+	// measuring the time
134
+	this->baumWelchStartTime_sec = time(NULL);
135
+
136
+	// Print some initial information
137
+	FILE_LOG(logINFO) << "";
138
+	FILE_LOG(logINFO) << "INITIAL PARAMETERS";
139
+// 	this->print_uni_params();
140
+	this->print_uni_iteration(0);
141
+
142
+	R_CheckUserInterrupt();
143
+
144
+	// Do the Baum-Welch
145
+	int iteration = 0;
146
+	while (((this->baumWelchTime_real < *maxtime) or (*maxtime < 0)) and ((iteration < *maxiter) or (*maxiter < 0)))
147
+	{
148
+
149
+		iteration++;
150
+		
151
+		FILE_LOG(logDEBUG1) << "Calling calc_densities() from baumWelch()";
152
+		try { this->calc_densities(); } catch(...) { throw; }
153
+		R_CheckUserInterrupt();
154
+
155
+		FILE_LOG(logDEBUG1) << "Calling forward() from baumWelch()";
156
+		try { this->forward(); } catch(...) { throw; }
157
+		R_CheckUserInterrupt();
158
+
159
+		FILE_LOG(logDEBUG1) << "Calling backward() from baumWelch()";
160
+		try { this->backward(); } catch(...) { throw; }
161
+		R_CheckUserInterrupt();
162
+
163
+		FILE_LOG(logDEBUG1) << "Calling calc_loglikelihood() from baumWelch()";
164
+		this->calc_loglikelihood();
165
+		logPnew = this->logP;
166
+		if(isnan(logPnew))
167
+		{
168
+			FILE_LOG(logERROR) << "logPnew = " << logPnew;
169
+			throw nan_detected;
170
+		}
171
+		this->dlogP = logPnew - logPold;
172
+
173
+		FILE_LOG(logDEBUG1) << "Calling calc_sumxi() from baumWelch()";
174
+		this->calc_sumxi();
175
+		R_CheckUserInterrupt();
176
+
177
+		FILE_LOG(logDEBUG1) << "Calling calc_sumgamma() from baumWelch()";
178
+		this->calc_sumgamma();
179
+		R_CheckUserInterrupt();
180
+
181
+// 		clock_t clocktime = clock(), dtime;
182
+		// difference in posterior
183
+		FILE_LOG(logDEBUG1) << "Calculating differences in posterior in baumWelch()";
184
+		double postsum = 0.0;
185
+		for (int t=0; t<this->T; t++)
186
+		{
187
+			for (int iN=0; iN<this->N; iN++)
188
+			{
189
+				postsum += fabs(this->gamma[iN][t] - gammaold[iN][t]);
190
+				gammaold[iN][t] = this->gamma[iN][t];
191
+			}
192
+		}
193
+		this->sumdiff_posterior = postsum;
194
+// 		dtime = clock() - clocktime;