Browse code

initial project version

chakalakka authored on 02/05/2014 11:14:29
Showing 38 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+Package: anafinder
2
+Type: Package
3
+Title: Anaploidy analysis of Seq-data using a Hidden Markov Model
4
+Version: 0.1
5
+Date: 2014-05-01
6
+Author: Aaron Taudt, David Porubsky
7
+Maintainer: Aaron Taudt <a.s.taudt@umcg.nl>, David Porubsky <d.porubsky@umcg.nl>
8
+Description: not yet
9
+Depends: R (>= 2.15.2)
10
+Imports: Rsamtools, GenomicRanges, IRanges, ggplot2
11
+Suggests: 
12
+License: GPL
13
+LazyLoad: yes
14
+Packaged: 2014-02-13 13:29:00 CET+1; Taudt
0 15
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+useDynLib(chromstar)
2
+
3
+# Export all names
4
+# exportPattern(".")
5
+export(
6
+bed2bin,
7
+bam2bin,
8
+bedGraph2bin,
9
+)
10
+
11
+# Import all packages listed as Imports or Depends
12
+import(
13
+	Rsamtools, GenomicRanges, IRanges, ggplot2
14
+)
0 15
new file mode 100644
... ...
@@ -0,0 +1,170 @@
1
+bedGraph2bin <- function(bedGraphfile, chrom.length.file, outputfolder="binned_data", binsizes=200, chromosomes=NULL, separate.chroms=TRUE, save.as.RData=TRUE) {
2
+	return(align2bin(bedGraphfile, format="bedGraph", chrom.length.file=chrom.length.file, outputfolder=outputfolder, binsizes=binsizes, chromosomes=chromosomes, separate.chroms=separate.chroms, save.as.RData=save.as.RData))
3
+}
4
+
5
+bam2bin <- function(bamfile, bamindex=bamfile, outputfolder="binned_data", binsizes=200, chromosomes=NULL, separate.chroms=TRUE, save.as.RData=TRUE) {
6
+	return(align2bin(bamfile, format="bam", index=bamindex, outputfolder=outputfolder, binsizes=binsizes, chromosomes=chromosomes, separate.chroms=separate.chroms, save.as.RData=save.as.RData))
7
+}
8
+
9
+bed2bin <- function(bedfile, chrom.length.file, outputfolder="binned_data", binsizes=200, chromosomes=NULL, separate.chroms=TRUE, save.as.RData=TRUE) {
10
+	return(align2bin(bedfile, format="bed", chrom.length.file=chrom.length.file, outputfolder=outputfolder, binsizes=binsizes, chromosomes=chromosomes, separate.chroms=separate.chroms, save.as.RData=save.as.RData))
11
+}
12
+
13
+align2bin <- function(file, format, index=file, chrom.length.file, outputfolder="binned_data", binsizes=200, chromosomes=NULL, separate.chroms=TRUE, save.as.RData=TRUE) {
14
+
15
+	## Load libraries
16
+# 	library(GenomicRanges)
17
+
18
+	## Create outputfolder if not exists
19
+	if (!file.exists(outputfolder) & save.as.RData==TRUE) {
20
+		dir.create(outputfolder)
21
+	}
22
+
23
+	## Read in the data
24
+	if (format == "bed") {
25
+		cat("Reading file",basename(file),"...")
26
+		# File with chromosome lengths
27
+		chrom.lengths.df <- read.table(chrom.length.file)
28
+		chrom.lengths <- chrom.lengths.df[,2]
29
+		names(chrom.lengths) <- chrom.lengths.df[,1]
30
+		# File with reads, determine classes first for faster import
31
+		tab5rows <- read.table(file, nrows=5)
32
+		classes.in.bed <- sapply(tab5rows, class)
33
+		classes <- rep("NULL",length(classes.in.bed))
34
+		classes[1:3] <- classes.in.bed[1:3]
35
+		data <- read.table(file, colClasses=classes)
36
+		# Convert to GRanges object
37
+		data <- GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2], end=data[,3]), strand=Rle(strand("*"), nrow(data)))
38
+		seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
39
+		chroms.in.data <- seqlevels(data)
40
+	} else if (format == "bam") {
41
+		library(Rsamtools)
42
+		cat("Reading header of",basename(file),"...")
43
+		file.header <- scanBamHeader(file)[[1]]
44
+		chrom.lengths <- file.header$targets
45
+		chroms.in.data <- names(chrom.lengths)
46
+	} else if (format == "bedGraph") {
47
+		cat("Reading file",basename(file),"...")
48
+		# File with chromosome lengths
49
+		chrom.lengths.df <- read.table(chrom.length.file)
50
+		chrom.lengths <- chrom.lengths.df[,2]
51
+		names(chrom.lengths) <- chrom.lengths.df[,1]
52
+		# File with reads, determine classes first for faster import
53
+		tab5rows <- read.table(file, nrows=5)
54
+		classes.in.bed <- sapply(tab5rows, class)
55
+		classes <- rep("NULL",length(classes.in.bed))
56
+		classes[1:4] <- classes.in.bed[1:4]
57
+		data <- read.table(file, colClasses=classes)
58
+		# Convert to GRanges object
59
+		data <- GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2], end=data[,3]), strand=Rle(strand("*"), nrow(data)), signal=data[,4])
60
+		seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
61
+		chroms.in.data <- seqlevels(data)
62
+	}
63
+	cat(" done\n")
64
+
65
+ 
66
+	### Do the loop for all binsizes
67
+	for (binsize in binsizes) {
68
+		cat("Binning into binsize",binsize,"\n")
69
+
70
+		### Iterate over all chromosomes
71
+		binned.data.allchroms <- NULL
72
+		if (is.null(chromosomes)) {
73
+			chromosomes <- chroms.in.data
74
+		}
75
+		for (chromosome in chromosomes) {
76
+			## Check if chromosome exists in data
77
+			if ( !(chromosome %in% chroms.in.data) ) {
78
+				warning("Skipped chromosome ",chromosome,", not in the data!")
79
+				next
80
+			} else if ( !(chromosome %in% names(chrom.lengths)) ) {
81
+				warning("Skipped chromosome ",chromosome,", no length found!")
82
+				next
83
+			}
84
+			cat(chromosome,"                              \n")
85
+			## Check last incomplete bin
86
+			incomplete.bin <- chrom.lengths[chromosome] %% binsize > 0
87
+			if (incomplete.bin) {
88
+				numbins <- ceiling(chrom.lengths[chromosome]/binsize)
89
+			} else {
90
+				numbins <- chrom.lengths[chromosome]/binsize
91
+			}
92
+			## Initialize vectors
93
+			cat("initialize vectors...\r")
94
+			chroms <- rep(chromosome,numbins)
95
+			reads <- rep(0,numbins)
96
+			start <- seq(from=0, by=binsize, length.out=numbins)
97
+			end <- seq(from=binsize-1, by=binsize, length.out=numbins)
98
+			end[length(end)] <- chrom.lengths[chromosome]
99
+
100
+			## Create binned chromosome as GRanges object
101
+			cat("creating GRanges container...            \r")
102
+			ichrom <- GRanges(seqnames = Rle(chromosome, numbins),
103
+							ranges = IRanges(start=start, end=end),
104
+							strand = Rle(strand("*"), numbins)
105
+							)
106
+
107
+			if (format=="bam") {
108
+				cat("reading reads from file...               \r")
109
+				data <- readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(ichrom)))
110
+			}
111
+
112
+			## Count overlaps
113
+			cat("counting overlaps...                     \r")
114
+			if (format=="bam" | format=="bed") {
115
+				reads <- countOverlaps(ichrom, data[seqnames(data)==chromosome])
116
+			} else if (format=="bedGraph") {
117
+				# Take the max value from all regions that fall into / overlap a given bin as read count
118
+				midx <- as.matrix(findOverlaps(ichrom, data[seqnames(data)==chromosome]))
119
+				reads <- rep(0,length(ichrom))
120
+				signal <- mcols(data)$signal
121
+				rle <- rle(midx[,1])
122
+				read.idx <- rle$values
123
+				max.idx <- cumsum(rle$lengths)
124
+				maxvalues <- rep(NA, length(read.idx))
125
+				maxvalues[1] <- max(signal[midx[1:(max.idx[1]),2]])
126
+				for (i1 in 2:length(read.idx)) {
127
+					maxvalues[i1] <- max(signal[midx[(max.idx[i1-1]+1):(max.idx[i1]),2]])
128
+				}
129
+				reads[read.idx] <- maxvalues
130
+			}
131
+			
132
+			## Concatenate
133
+			cat("concatenate...                           \r")
134
+			binned.data <- data.frame(chroms,start,end,reads)
135
+			names(binned.data) <- c("chrom","start","end","reads")
136
+
137
+			if (separate.chroms==TRUE) {
138
+				if (save.as.RData==TRUE) {
139
+					## Print to file
140
+					filename <- paste(basename(file),"_binsize_",binsize,"_",chromosome,".RData", sep="")
141
+					cat("save...                                  \r")
142
+					save(binned.data, file=file.path(outputfolder,filename) )
143
+				} else {
144
+					cat("                                         \r")
145
+					return(binned.data)
146
+				}
147
+			} else {
148
+				binned.data.allchroms[[length(binned.data.allchroms)+1]] <- binned.data
149
+			}
150
+			cat("                                         \r")
151
+
152
+		}
153
+		if (separate.chroms!=TRUE) {
154
+			cat("Concatenating chromosomes ...")
155
+			binned.data.allchroms <- do.call("rbind",binned.data.allchroms)
156
+			cat(" done\n")
157
+			if (save.as.RData==TRUE) {
158
+				# Print to file
159
+				filename <- paste(basename(file),"_binsize_",binsize,".RData", sep="")
160
+				cat("Saving to file ...")
161
+				save(binned.data.allchroms, file=file.path(outputfolder,filename) )
162
+				cat(" done\n")
163
+			} else {
164
+				return(binned.data.allchroms)
165
+			}
166
+		}
167
+
168
+	}
169
+
170
+}
0 171
new file mode 100644
... ...
@@ -0,0 +1,67 @@
1
+check.positive.integer = function(testvar) {
2
+	if (!is(testvar,"numeric") & !is(testvar,"integer")) return(1)
3
+	if (length(testvar)>1) return(2)
4
+	if (testvar <= 0) return(3)
5
+	if (testvar != as.integer(testvar)) return(4)
6
+	return(0)
7
+}
8
+check.positive.integer.vector = function(testvec) {
9
+	if (!is(testvec,"numeric") & !is(testvec,"integer")) return(1)
10
+	for (elem in testvec) {
11
+		if (elem <= 0) return(2)
12
+		if (elem != as.integer(elem)) return(3)
13
+	}
14
+	return(0)
15
+}
16
+check.nonnegative.integer.vector = function(testvec) {
17
+	if (!is(testvec,"numeric") & !is(testvec,"integer")) return(1)
18
+	for (elem in testvec) {
19
+		if (elem < 0) return(2)
20
+		if (elem != as.integer(elem)) return(3)
21
+	}
22
+	return(0)
23
+}
24
+check.positive = function(testvar) {
25
+	if (!is(testvar,"numeric") & !is(testvar,"integer")) return(1)
26
+	if (length(testvar)>1) return(2)
27
+	if (testvar <= 0) return(3)
28
+	return(0)
29
+}
30
+check.positive.vector = function(testvec) {
31
+	if (!is(testvec,"numeric") & !is(testvec,"integer")) return(1)
32
+	for (elem in testvec) {
33
+		if (elem <= 0) return(2)
34
+	}
35
+	return(0)
36
+}
37
+check.nonnegative.vector = function(testvec) {
38
+	if (!is(testvec,"numeric") & !is(testvec,"integer")) return(1)
39
+	for (elem in testvec) {
40
+		if (elem < 0) return(2)
41
+	}
42
+	return(0)
43
+}
44
+check.integer = function(testvar) {
45
+	if (!is(testvar,"numeric") & !is(testvar,"integer")) return(1)
46
+	if (length(testvar)>1) return(2)
47
+	if (testvar != as.integer(testvar)) return(3)
48
+	return(0)
49
+}
50
+
51
+check.univariate.modellist = function(modellist) {
52
+	if (!is(modellist,"list")) return(1)
53
+	for (model in modellist) {
54
+		if (!is(model,"chromStar.univariate.model")) return(2)
55
+	}
56
+	return(0)
57
+}
58
+check.univariate.model = function(model) {
59
+	if (!is(model,"chromStar.univariate.model")) return(1)
60
+	return(0)
61
+}
62
+
63
+check.logical = function(testbool) {
64
+	if (!is(testbool,"logical")) return(1)
65
+	if (length(testbool)>1) return(2)
66
+	return(0)
67
+}
0 68
new file mode 100644
... ...
@@ -0,0 +1,90 @@
1
+collapse.bins = function(data, column2collapseBy=NULL, columns2sumUp=NULL) {
2
+
3
+	# Indexing stuff
4
+	ind_coords = 1:3
5
+	ind_morecols = setdiff(1:ncol(data), c(ind_coords,columns2sumUp))
6
+	ind_sumcols = columns2sumUp
7
+
8
+	# Make the comparison vector
9
+	if (is.null(column2collapseBy)) {
10
+		c = data$start
11
+		cShift1 = rep(NA,length(c))
12
+		cShift1[2:length(cShift1)] = data$end[-length(c)] + 1
13
+	} else {
14
+		if (is(data[,column2collapseBy], "factor")) {
15
+			c <- as.integer(data[,column2collapseBy])
16
+		} else {
17
+			c = data[,column2collapseBy]
18
+		}
19
+		cShift1 = rep(NA,length(c))
20
+		cShift1[-1] = c[-length(c)]
21
+	}
22
+	compare = c != cShift1
23
+	compare[1] = TRUE
24
+	numcollapsedbins = length(which(compare==TRUE))
25
+	numbins = nrow(data)
26
+
27
+	# Select the collapsed rows
28
+	collapsed.bins = NULL
29
+	collapsed.bins$chrom = data[compare,1]
30
+	collapsed.bins$start = data[compare,2]
31
+	collapsed.bins$end = data[c((which(compare==TRUE)-1)[-1],numbins), 3]
32
+	if (length(ind_morecols)==1) {
33
+		collapsed.bins[[4]] = data[compare, ind_morecols]
34
+	} else if (length(ind_morecols)>1) {
35
+		lcb = length(collapsed.bins)
36
+		lmc = length(ind_morecols)
37
+		collapsed.bins[(lcb+1):(lcb+lmc)] = data[compare, ind_morecols]
38
+	}
39
+
40
+	# Sum up columns
41
+	if (!is.null(columns2sumUp)) {
42
+		sumcols = as.matrix(data[,columns2sumUp])
43
+		collapsed_sumcols = matrix(rep(NA,numcollapsedbins*length(columns2sumUp)), ncol=length(columns2sumUp))
44
+		pb = txtProgressBar(min=1, max=length(compare), style=3)
45
+		icount = 1
46
+		i1_lasttrue = 1
47
+		for (i1 in 1:length(compare)) {
48
+			if (compare[i1]==TRUE) {
49
+				if (length(columns2sumUp)==1) {
50
+					collapsed_sumcols[icount-1] = sum(sumcols[i1_lasttrue:(i1-1),])
51
+				} else {
52
+					if (i1_lasttrue==i1-1 | i1==1) {
53
+						collapsed_sumcols[icount-1,] = as.numeric(sumcols[i1_lasttrue,])
54
+					} else {
55
+						collapsed_sumcols[icount-1,] = apply(sumcols[i1_lasttrue:(i1-1),],2,sum)
56
+					}
57
+				}
58
+					
59
+				icount = icount+1
60
+				i1_lasttrue = i1
61
+				setTxtProgressBar(pb, i1)
62
+			}
63
+		}
64
+		i1 = i1+1
65
+		if (length(columns2sumUp)==1) {
66
+			collapsed_sumcols[icount-1] = sum(sumcols[i1_lasttrue:(i1-1),])
67
+		} else {
68
+			if (i1_lasttrue==i1-1 | i1==1) {
69
+				collapsed_sumcols[icount-1,] = as.numeric(sumcols[i1_lasttrue,])
70
+			} else {
71
+				collapsed_sumcols[icount-1,] = apply(sumcols[i1_lasttrue:(i1-1),],2,sum)
72
+			}
73
+		}
74
+		setTxtProgressBar(pb, length(compare))
75
+		close(pb)
76
+
77
+		lcb = length(collapsed.bins)
78
+		lsc = length(ind_sumcols)
79
+		collapsed.bins[(lcb+1):(lcb+lsc)] = as.data.frame(collapsed_sumcols)
80
+	}
81
+
82
+	# Create the return data.frame with same order as input data.frame
83
+	collapsed.bins.reordered = NULL
84
+	collapsed.bins.reordered[c(ind_coords,ind_morecols,ind_sumcols)] = collapsed.bins
85
+	names(collapsed.bins.reordered) = names(data)
86
+
87
+	return(as.data.frame(collapsed.bins.reordered))
88
+
89
+}
90
+
0 91
new file mode 100644
... ...
@@ -0,0 +1,291 @@
1
+# ----------------------------------------------------------------
2
+# Write univariate states to file
3
+# ----------------------------------------------------------------
4
+univariate2bed <- function(modellist, file="view_me_in_genome_browser", threshold=0.5, separate.zeroinflation=FALSE, chrom.length.file=NULL) {
5
+
6
+	# Check user input
7
+	if (is.null(names(modellist))) {
8
+		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
9
+	}
10
+
11
+	# Variables
12
+	nummod <- length(modellist)
13
+	file <- paste(file,"bed", sep=".")
14
+
15
+	# Generate the colors
16
+	colors <- c("zeroinflation"="black", "unmodified"="gray48","modified"="orangered3")
17
+	RGBs <- t(col2rgb(colors))
18
+	RGBs <- apply(RGBs,1,paste,collapse=",")
19
+
20
+	# Write first line to file
21
+	cat("browser hide all\n", file=file)
22
+	
23
+	for (imod in 1:nummod) {
24
+		model <- modellist[[imod]]
25
+		priority <- 50 + imod
26
+		cat(paste("track name=",names(modellist)[imod]," description=\"univariate calls for ",names(modellist)[imod],", threshold = ",threshold,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
27
+
28
+		# Collapse the calls
29
+		if (separate.zeroinflation) {
30
+			calls <- cbind(model$coordinates, name=model$states.with.zeroinflation)
31
+			collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
32
+		} else {
33
+			calls <- cbind(model$coordinates, name=model$states)
34
+			collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
35
+			collapsed.calls <- collapsed.calls[collapsed.calls$name=="modified",]
36
+		}
37
+
38
+		# Check length of chromosomes if chrom.length.file was given
39
+		if (!is.null(chrom.length.file)) {
40
+			chrom.lengths.df <- read.table(chrom.length.file)
41
+			chrom.lengths <- chrom.lengths.df[,2]
42
+			names(chrom.lengths) <- chrom.lengths.df[,1]
43
+			for (chrom in levels(as.factor(collapsed_calls$chrom))) {
44
+				index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
45
+				collapsed_calls$end[index] <- chrom.lengths[chrom]
46
+				if (length(index) >= 1) {
47
+					warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
48
+				}
49
+			}
50
+		}
51
+
52
+		# Append to file
53
+		itemRgb <- RGBs[as.character(collapsed.calls$name)]
54
+		numsegments <- nrow(collapsed.calls)
55
+		df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
56
+		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
57
+	}
58
+
59
+}
60
+
61
+# ----------------------------------------------------------------
62
+# 
63
+# ----------------------------------------------------------------
64
+make.ucsc.file.combinatorial <- function(modellist, numstates=NULL, file="combinatorial-states-ucsc", threshold=0.5, chrom.length.file=NULL) {
65
+	
66
+	# Define variables
67
+	nummod <- length(modellist)
68
+	file <- paste(file,".BED", sep="")
69
+	coordinates <- modellist[[1]]$coordinates
70
+	
71
+	# Write first line to file
72
+	cat("browser position chr1:1-100000\n", file=file)
73
+
74
+	# Get the combinatorial states
75
+	cat("Getting the combinatorial states ...\n")
76
+	combstates <- combinatorial.states(modellist)
77
+	states <- as.numeric(names(sort(table(combstates), decreasing=TRUE)))
78
+
79
+	# Clean up to reduce memory usage
80
+	remove(modellist)
81
+
82
+	if (is.null(numstates)) {
83
+		numstates <- length(states)
84
+	} else if (numstates > length(states)) {
85
+		numstates <- length(states)
86
+	}
87
+	# Go through each state and write to file
88
+	for (istate in states[1:numstates]) {
89
+		cat("\nAnalyzing state",istate,"\n")
90
+		cat(paste("track name=\"combinatorial state ",istate,"\" description=\"univariate calls for state ",istate,", threshold = ",threshold,"\" visibility=1\n", sep=""), file=file, append=TRUE)
91
+
92
+		calls <- coordinates[ combstates == istate , ]
93
+		if (length(calls$start) >= 1) {
94
+			# Collapse the bins
95
+			cat("collapsing the calls\n")
96
+			collapsed_calls <- collapse.bins(calls)
97
+		} else {
98
+			cat("no bins in state",istate,"\n")
99
+			collapsed_calls <- NULL
100
+		}
101
+
102
+		if (!is.null(collapsed_calls)) {
103
+			# Check length of chromosomes if chrom.length.file was given
104
+			if (!is.null(chrom.length.file)) {
105
+				chrom.lengths.df <- read.table(chrom.length.file)
106
+				chrom.lengths <- chrom.lengths.df[,2]
107
+				names(chrom.lengths) <- chrom.lengths.df[,1]
108
+				for (chrom in levels(as.factor(collapsed_calls$chrom))) {
109
+					index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
110
+					collapsed_calls$end[index] <- chrom.lengths[chrom]
111
+					if (length(index) >= 1) {
112
+						warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
113
+					}
114
+				}
115
+			}
116
+
117
+			cat("appending to file ...\n")
118
+			write.table(format(as.data.frame(collapsed_calls)[,1:3], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
119
+		}
120
+	}
121
+
122
+}
123
+
124
+
125
+# ----------------------------------------------------------------
126
+# 
127
+# ----------------------------------------------------------------
128
+make.ucsc.file.multivariate <- function(model, numstates=NULL, file="multivariate-states-ucsc", chrom.length.file=NULL) {
129
+
130
+	# Variable assignments
131
+	states <- model$stateorder
132
+	coordinates <- model$coordinates
133
+	file <- paste(file,".BED", sep="")
134
+
135
+	# Write first line to file
136
+	cat("browser position chr1:1-100000\n", file=file)
137
+
138
+	# Determine state of bin
139
+	cat("Determining state of bins ...\n")
140
+	combstates <- apply(model$posteriors,1,which.max)
141
+	combstates <- states[combstates]
142
+
143
+	if (is.null(numstates)) {
144
+		numstates <- length(states)
145
+	} else if (numstates > length(states)) {
146
+		numstates <- length(states)
147
+	}
148
+	# Go through each state and write to file
149
+	for (istate in states[1:numstates]) {
150
+		cat("\nAnalyzing state",istate,"\n")
151
+		cat(paste("track name=\"multivariate state ",istate,"\" description=\"multivariate calls for state ",istate,"\" visibility=1 color=0,0,255\n", sep=""), file=file, append=TRUE)
152
+
153
+		calls <- coordinates[ combstates == istate , ]
154
+		if (length(calls$start) >= 1) {
155
+			# Collapse the bins
156
+			cat("collapsing the calls\n")
157
+			collapsed_calls <- collapse.bins(calls)
158
+		} else {
159
+			cat("no bins in state",istate,"\n")
160
+			collapsed_calls <- NULL
161
+		}
162
+
163
+		if (!is.null(collapsed_calls)) {
164
+			# Check length of chromosomes if chrom.length.file was given
165
+			if (!is.null(chrom.length.file)) {
166
+				chrom.lengths.df <- read.table(chrom.length.file)
167
+				chrom.lengths <- chrom.lengths.df[,2]
168
+				names(chrom.lengths) <- chrom.lengths.df[,1]
169
+				for (chrom in levels(as.factor(collapsed_calls$chrom))) {
170
+					index <- which(collapsed_calls$chrom == chrom & collapsed_calls$end > chrom.lengths[chrom])
171
+					collapsed_calls$end[index] <- chrom.lengths[chrom]
172
+					if (length(index) >= 1) {
173
+						warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
174
+					}
175
+				}
176
+			}
177
+
178
+			cat("appending to file ...\n")
179
+			write.table(format(as.data.frame(collapsed_calls)[,1:3], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
180
+		}
181
+	}
182
+
183
+}
184
+
185
+
186
+# ----------------------------------------------------------------
187
+# Write signal tracks from binned data
188
+# ----------------------------------------------------------------
189
+bin2wiggle <- function(binned.data.list, file="view_me_in_genome_browser") {
190
+	
191
+	# Check user input
192
+	if (is.null(names(binned.data.list))) {
193
+		stop("Please name the list entries. The names will be used as track name for the Genome Browser file.")
194
+	}
195
+
196
+	# Variables
197
+	file <- paste(file,"wig", sep=".")
198
+
199
+	# Write to file
200
+	cat("browser hide all\n", file=file)
201
+	
202
+	# Go through binned.data.list
203
+	for (i1 in 1:length(binned.data.list)) {
204
+		binned.data <- binned.data.list[[i1]]
205
+		names(binned.data) <- c("chrom","start","end","reads")
206
+		
207
+		# Write track information
208
+		name <- names(binned.data.list)[i1]
209
+		binsize <- diff(binned.data$start[1:2])
210
+		description <- paste("binned read counts ",binsize,"bp", sep="")
211
+		cat(paste("track type=wiggle_0 name=\"",name,"\" description=\"",description,"\" visibility=full autoScale=on color=90,90,90 maxHeightPixels=100:50:20 graphType=bar priority=",i1,"\n", sep=""), file=file, append=TRUE)
212
+		# Write read data
213
+		for (chrom in unique(binned.data$chrom)) {
214
+			cat(paste("fixedStep chrom=",chrom," start=1 step=",binsize," span=",binsize,"\n", sep=""), file=file, append=TRUE)
215
+			write.table(binned.data$reads, file=file, append=TRUE, row.names=FALSE, col.names=FALSE)
216
+		}
217
+	}
218
+
219
+}
220
+
221
+
222
+# ----------------------------------------------------------------
223
+# Write color-coded (multivariate) combinatorial states to file
224
+# ----------------------------------------------------------------
225
+multivariate2bed <- function(model, separate.tracks=FALSE, exclude.state.zero=TRUE, numstates=NULL, file="view_me_in_genome_browser", chrom.length.file=NULL) {
226
+
227
+	# Variables
228
+	file <- paste(file,".bed", sep="")
229
+	combstates <- model$stateorder
230
+	if (is.null(numstates)) {
231
+		numstates <- length(combstates)
232
+	} else if (numstates > length(combstates)) {
233
+		numstates <- length(combstates)
234
+	}
235
+	if (exclude.state.zero) {
236
+		combstates2use <- setdiff(combstates,0)
237
+		combstates2use <- combstates2use[1:numstates]
238
+		combstates2use <- combstates2use[!is.na(combstates2use)]
239
+		numstates <- length(combstates2use)
240
+	}
241
+
242
+	# Collapse the calls
243
+	calls <- cbind(model$coordinates, name=model$states)
244
+	collapsed.calls <- collapse.bins(calls, column2collapseBy=4)
245
+	# Select only desired states
246
+	mask <- rep(FALSE,nrow(collapsed.calls))
247
+	for (istate in combstates2use) {
248
+		mask <- mask | istate==collapsed.calls$name
249
+	}
250
+	collapsed.calls <- collapsed.calls[mask,]
251
+
252
+	# Check length of chromosomes if chrom.length.file was given
253
+	if (!is.null(chrom.length.file)) {
254
+		chrom.lengths.df <- read.table(chrom.length.file)
255
+		chrom.lengths <- chrom.lengths.df[,2]
256
+		names(chrom.lengths) <- chrom.lengths.df[,1]
257
+		for (chrom in levels(as.factor(collapsed.calls$chrom))) {
258
+			index <- which(collapsed.calls$chrom == chrom & collapsed.calls$end > chrom.lengths[chrom])
259
+			collapsed.calls$end[index] <- chrom.lengths[chrom]
260
+			if (length(index) >= 1) {
261
+				warning("Adjusted entry in chromosome ",chrom,", because it was longer than the length of the chromosome")
262
+			}
263
+		}
264
+	}
265
+
266
+	# Generate the colors for each combinatorial state
267
+	colors <- colors()[grep(colors(), pattern="white|grey|gray|snow", invert=T)]
268
+	step <- length(colors) %/% numstates
269
+	colors <- colors[seq(1,by=step,length=numstates)]
270
+	RGBs <- t(col2rgb(colors))
271
+	RGBs <- apply(RGBs,1,paste,collapse=",")
272
+	itemRgb <- RGBs[as.factor(collapsed.calls$name)]
273
+
274
+	# Write to file
275
+	cat("browser hide all\n", file=file)
276
+	numsegments <- nrow(collapsed.calls)
277
+	df <- cbind(collapsed.calls, score=rep(0,numsegments), strand=rep(".",numsegments), thickStart=collapsed.calls$start, thickEnd=collapsed.calls$end, itemRgb=itemRgb)
278
+
279
+	if (separate.tracks) {
280
+		for (istate in combstates2use) {
281
+			priority <- 100 + which(istate==combstates2use)
282
+			cat(paste("track name=\"comb.state ",istate,"\" description=\"multivariate calls for combinatorial state ",istate,"\" visibility=1 itemRgb=On priority=",priority,"\n", sep=""), file=file, append=TRUE)
283
+			write.table(format(df[df$name==istate,], scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
284
+		}
285
+	} else {
286
+		cat(paste("track name=\"comb.state\" description=\"multivariate combinatorial states\" visibility=1 itemRgb=On priority=100\n", sep=""), file=file, append=TRUE)
287
+		write.table(format(df, scientific=FALSE), file=file, append=TRUE, row.names=FALSE, col.names=FALSE, quote=FALSE)
288
+	}
289
+
290
+}
291
+
0 292
new file mode 100644
... ...
@@ -0,0 +1,23 @@
1
+get.states <- function(model, threshold=0.5, separate.zeroinflation=FALSE) {
2
+	numbins <- nrow(model$posteriors)
3
+	states <- rep(NA,numbins)
4
+	if (is(model,"chromStar.univariate.model")) {
5
+		if (separate.zeroinflation) {
6
+			states[ model$posteriors[,3]<=threshold & model$posteriors[,2]<=model$posteriors[,1] ] <- 1
7
+			states[ model$posteriors[,3]<=threshold & model$posteriors[,2]>=model$posteriors[,1] ] <- 2
8
+			states[ model$posteriors[,3]>threshold ] <- 3
9
+			states <- as.factor(c("zeroinflation","unmodified","modified"))[states]
10
+		} else {
11
+			states <- ifelse(model$posteriors[,3]>threshold, 2, 1)
12
+			states <- as.factor(c("unmodified","modified"))[states]
13
+		}
14
+		return(states)
15
+	} else if (is(model,"chromStar.multivariate.model")) {
16
+		states <- model$stateorder[apply(model$posteriors, 1, which.max)]
17
+		return(states)
18
+	} else {
19
+		stop("Supply either a univariate or multivariate chromStar model")
20
+	}
21
+	
22
+}
23
+
0 24
new file mode 100644
... ...
@@ -0,0 +1,52 @@
1
+mixedorder = function (x) 
2
+{
3
+    if (length(x) < 1) 
4
+        return(NULL)
5
+    else if (length(x) == 1) 
6
+        return(1)
7
+    if (is.numeric(x)) 
8
+        return(order(x))
9
+    delim = "\\$\\@\\$"
10
+    numeric <- function(x) {
11
+        suppressWarnings(as.numeric(x))
12
+    }
13
+    nonnumeric <- function(x) {
14
+        suppressWarnings(ifelse(is.na(as.numeric(x)), toupper(x), 
15
+            NA))
16
+    }
17
+    x <- as.character(x)
18
+    which.nas <- which(is.na(x))
19
+    which.blanks <- which(x == "")
20
+    if (length(which.blanks) > 0) 
21
+        x[which.blanks] <- -Inf
22
+    if (length(which.nas) > 0) 
23
+        x[which.nas] <- Inf
24
+    delimited <- gsub("([+-]{0,1}[0-9]+\\.{0,1}[0-9]*([eE][\\+\\-]{0,1}[0-9]+\\.{0,1}[0-9]*){0,1})", 
25
+        paste(delim, "\\1", delim, sep = ""), x)
26
+    step1 <- strsplit(delimited, delim)
27
+    step1 <- lapply(step1, function(x) x[x > ""])
28
+    step1.numeric <- lapply(step1, numeric)
29
+    step1.character <- lapply(step1, nonnumeric)
30
+    maxelem <- max(sapply(step1, length))
31
+    step1.numeric.t <- lapply(1:maxelem, function(i) sapply(step1.numeric, 
32
+        function(x) x[i]))
33
+    step1.character.t <- lapply(1:maxelem, function(i) sapply(step1.character, 
34
+        function(x) x[i]))
35
+    rank.numeric <- sapply(step1.numeric.t, rank)
36
+    rank.character <- sapply(step1.character.t, function(x) as.numeric(factor(x)))
37
+    rank.numeric[!is.na(rank.character)] <- 0
38
+    rank.character <- t(t(rank.character) + apply(matrix(rank.numeric), 
39
+        2, max, na.rm = TRUE))
40
+    rank.overall <- ifelse(is.na(rank.character), rank.numeric, 
41
+        rank.character)
42
+    order.frame <- as.data.frame(rank.overall)
43
+    if (length(which.nas) > 0) 
44
+        order.frame[which.nas, ] <- Inf
45
+    retval <- do.call("order", order.frame)
46
+    return(retval)
47
+}
48
+
49
+mixedsort = function (x) 
50
+{
51
+	x[mixedorder(x)]
52
+}
0 53
new file mode 100644
... ...
@@ -0,0 +1,146 @@
1
+# ------------------------------------------------------------
2
+# Plot a read histogram with univariate fits
3
+# ------------------------------------------------------------
4
+plot.distribution <- function(model, state=NULL, chr=NULL, start=NULL, end=NULL) {
5
+
6
+	## Load libraries
7
+# 	library(ggplot2)
8
+
9
+	# -----------------------------------------
10
+	# Get right x limit
11
+	get_rightxlim <- function(histdata, reads) {
12
+		rightxlim1 <- median(reads[reads>0])*7
13
+		breaks <- histdata$breaks[1:length(histdata$counts)]
14
+		counts <- histdata$counts
15
+		rightxlim2 <- breaks[counts<=5 & breaks>median(reads)][1]
16
+		rightxlim <- min(c(rightxlim1,rightxlim2), na.rm=TRUE)
17
+		return(rightxlim)
18
+	}
19
+
20
+	## Plot settings
21
+	cols <- c("unmodified"="gray48","modified"="orangered3", "total"="black")
22
+
23
+	# Select the rows to plot
24
+	selectmask <- rep(TRUE,length(model$reads))
25
+	numchrom <- length(table(model$coordinates$chrom))
26
+	if (!is.null(chr)) {
27
+		if (! chr %in% levels(model$coordinates$chrom)) {
28
+			stop(chr," can't be found in the model coordinates.")
29
+		}
30
+		selectchrom <- model$coordinates$chrom == chr
31
+		selectmask <- selectmask & selectchrom
32
+		numchrom <- 1
33
+	}
34
+	if (numchrom == 1) {
35
+		if (!is.null(start)) {
36
+			selectstart <- model$coordinates$start >= start
37
+			selectmask <- selectmask & selectstart
38
+		}
39
+		if (!is.null(end)) {
40
+			selectend <- model$coordinates$end <= end
41
+			selectmask <- selectmask & selectend
42
+		}
43
+	}
44
+	if (!is.null(state)) {
45
+		states <- get.states(model)
46
+		selectmask <- selectmask & states==state
47
+	}
48
+	if (length(which(selectmask)) != length(model$reads)) {
49
+		reads <- model$reads[selectmask]
50
+		posteriors <- model$posteriors[selectmask,]
51
+		softweights <- apply(posteriors,2,mean)
52
+	} else {
53
+		reads <- model$reads
54
+		softweights <- model$softweights
55
+	}
56
+
57
+	# Find the x limits
58
+	breaks <- max(reads)
59
+	if (max(reads)==0) { breaks <- 1 }
60
+	histdata <- hist(reads, right=FALSE, breaks=breaks, plot=FALSE)
61
+	rightxlim <- get_rightxlim(histdata, reads)
62
+
63
+	# Plot the histogram
64
+	ggplt <- ggplot(data.frame(reads)) + geom_histogram(aes(x=reads, y=..density..), binwidth=1, color='black', fill='white') + xlim(0,rightxlim) + theme_bw() + xlab("read count")
65
+
66
+	### Add fits to the histogram
67
+	numstates <- length(softweights)
68
+	x <- 0:rightxlim
69
+	distributions <- data.frame(x)
70
+
71
+	# Unmodified
72
+	distributions$unmodified <- (1-softweights[3]) * dzinbinom(x, softweights[1], model$distributions[2,'r'], model$distributions[2,'p'])
73
+	# Modified
74
+	distributions$modified <- softweights[3] * dnbinom(x, model$distributions[3,'r'], model$distributions[3,'p'])
75
+	# Total
76
+	distributions$total <- distributions$unmodified + distributions$modified
77
+
78
+	### Plot the distributions
79
+	if (is.null(state)) {
80
+		ggplt <- ggplt + geom_line(aes(x=x, y=unmodified, color="unmodified", group=1), data=distributions, size=1)
81
+		ggplt <- ggplt + geom_line(aes(x=x, y=modified, color="modified", group=1), data=distributions, size=1)
82
+		ggplt <- ggplt + geom_line(aes(x=x, y=total, color="total", group=1), data=distributions, size=1)
83
+	} else {
84
+		if (state==0) ggplt <- ggplt + geom_line(aes(x=x, y=unmodified, color="unmodified", group=1), data=distributions, size=1)
85
+		if (state==1) ggplt <- ggplt + geom_line(aes(x=x, y=modified, color="modified", group=1), data=distributions, size=1)
86
+	}
87
+	
88
+	# Make legend and colors correct
89
+	ggplt <- ggplt + scale_color_manual(name="components", values=cols)
90
+
91
+	return(ggplt)
92
+
93
+}
94
+
95
+# ------------------------------------------------------------
96
+# Plot a read histogram in normal space of the given state
97
+# ------------------------------------------------------------
98
+plot.distribution.normal <- function(model, state=0) {
99
+
100
+	## Load libraries
101
+# 	library(ggplot2)
102
+
103
+	## Intercept user input
104
+	if (state!=0 & state!=1) { stop("state has to be either 0 or 1") }
105
+
106
+	## Plot settings
107
+	cols <- c("unmodified"="gray48","modified"="orangered3")
108
+
109
+	## Transform the reads
110
+	states <- get.states(model)
111
+	df <- data.frame(bin=1:length(model$reads), reads=model$reads, state=as.factor(states))
112
+	# Transform to uniform space
113
+	df$ureads[df$state==0] <- pzinbinom(df$reads[df$state==0], model$softweights[1], model$distributions[2,'r'], model$distributions[2,'p'])
114
+	df$ureads[df$state==1] <- pnbinom(df$reads[df$state==1], model$distributions[3,'r'], model$distributions[3,'p'])
115
+	# Transform to normal space
116
+	df$nreads <- qnorm(df$ureads)
117
+
118
+	## Make the plots
119
+	subset <- df$nreads[df$state==state]
120
+	breaks <- c(-Inf,sort(as.numeric(names(table(subset)))))
121
+	x <- seq(-4,4,0.1)
122
+	title <- paste("Transformed emission density for state ",state, sep="")
123
+	ggplt <- ggplot() + geom_histogram(data=data.frame(ureads=subset), aes(x=ureads, y=..density..), breaks=breaks, right=TRUE, col='black', fill=cols[state+1]) + theme_bw() + geom_line(data=data.frame(x=x, y=dnorm(x, mean=0, sd=1)), aes(x=x, y=y)) + xlab("transformed reads") + ylim(0,0.5) + labs(title=title)
124
+	return(ggplt)
125
+
126
+}
127
+
128
+# ------------------------------------------------------------
129
+# Plot a boxplot of the univariate calls
130
+# ------------------------------------------------------------
131
+plot.boxplot <- function(model) {
132
+
133
+	## Load libraries
134
+# 	library(ggplot2)
135
+
136
+	## Plot settings
137
+	cols <- c("unmodified"="gray48","modified"="orangered3")
138
+
139
+	## Boxplot
140
+	components <- c("unmodified","modified")[as.factor(get.states(model))]
141
+	df <- data.frame(component=components, reads=model$reads)
142
+	ggplt <- ggplot() + theme_bw() + geom_boxplot(data=df, aes(x=component, y=reads, fill=component)) + scale_fill_manual(values=cols)
143
+	return(ggplt)
144
+
145
+}
146
+
0 147
new file mode 100644
... ...
@@ -0,0 +1,138 @@
1
+simulate.univariate = function(coordinates, transition, emission, initial=1) {
2
+
3
+	# Calculate some variables
4
+	numstates = ncol(transition)
5
+	if (numstates!=3) {
6
+		stop("The transition matrix is expected to have 3 columns and 3 rows")
7
+	}
8
+	numbins = nrow(coordinates)
9
+
10
+	## Make state vector from transition matrix
11
+	# Generate sample of random numbers
12
+	rsample = runif(numbins,0,1)
13
+	# Integrate transition matrix by row and add -1 in front
14
+	cumtransition = cbind(rep(-1,numstates), t(apply(transition, 1, cumsum)))
15
+	# Generate the state vector by going through each state
16
+	cat("Generating states from transition matrix...")
17
+	states = matrix(rep(NA,numstates*numbins), ncol=numstates)
18
+	for (irow in 1:numstates) {
19
+		states[,irow] = findInterval(rsample, cumtransition[irow,], rightmost.closed=TRUE)
20
+	}
21
+	statevec = rep(NA,numbins)
22
+	statevec[1] = initial
23
+	for (ibin in 2:numbins) {
24
+		statevec[ibin] = states[ibin,statevec[ibin-1]]
25
+	}
26
+	cat(" done\n")
27
+
28
+	## Make reads from state vector and distributions
29
+	# Generate the read counts by drawing from distribution
30
+	cat("Generating reads from emission parameters and states...")
31
+	reads = rep(NA, numbins)
32
+	numbins.in.state = aggregate(rep(1,length(statevec)), list(state=statevec), sum)
33
+	reads[statevec==1] = 0
34
+	if (!is.na(numbins.in.state[2,'x'])) {
35
+		reads[statevec==2] = rnbinom(numbins.in.state[2,'x'], size=emission[2,'r'], prob=emission[2,'p'])
36
+	}
37
+	if (!is.na(numbins.in.state[3,'x'])) {
38
+		reads[statevec==3] = rnbinom(numbins.in.state[3,'x'], size=emission[3,'r'], prob=emission[3,'p'])
39
+	}
40
+	cat(" done\n")
41
+
42
+	# Return the output
43
+	out = list(coordinates = coordinates,
44
+				states = statevec,
45
+				reads = reads,
46
+				transition = transition,
47
+				emission = emission
48
+				)
49
+	return(out)
50
+
51
+}
52
+
53
+
54
+
55
+simulate.multivariate = function(coordinates, transition, emissions, weights, sigma, use.states, initial=1) {
56
+
57
+	lib = require(mvtnorm)
58
+	if (lib == FALSE) {
59
+		install.packages("mvtnorm")
60
+		library(mvtnorm)
61
+	}
62
+
63
+	# Calculate some variables
64
+	numstates = ncol(transition)
65
+	numbins = nrow(coordinates)
66
+	nummod = length(emissions)
67
+
68
+	## Make state vector from transition matrix
69
+	# Generate sample of random numbers
70
+	rsample = runif(numbins,0,1)
71
+	# Integrate transition matrix by row and add -1 in front
72
+	cumtransition = cbind(rep(-1,numstates), t(apply(transition, 1, cumsum)))
73
+	# Generate the state vector by going through each state
74
+	cat("Generating states from transition matrix...")
75
+	states = matrix(rep(NA,numstates*numbins), ncol=numstates)
76
+	for (irow in 1:numstates) {
77
+		states[,irow] = findInterval(rsample, cumtransition[irow,], rightmost.closed=TRUE)
78
+	}
79
+	statevec = rep(NA,numbins)
80
+	statevec[1] = initial
81
+	for (ibin in 2:numbins) {
82
+		statevec[ibin] = states[ibin,statevec[ibin-1]]
83
+	}
84
+	# Replace the states by combinatorial states
85
+	statevec = use.states[statevec]
86
+	cat(" done\n")
87
+
88
+	## Make reads from state vector and emission distributions
89
+	reads = matrix(rep(NA, nummod*numbins), ncol=nummod)
90
+
91
+	rs = unlist(lapply(emissions, "[", 2:3, 'r'))
92
+	ps = unlist(lapply(emissions, "[", 2:3, 'p'))
93
+	ws = unlist(lapply(weights, "[", 1))
94
+	for (istate in use.states) {
95
+		cat("Generating reads for state ",istate,"\n")
96
+		i = which(use.states==istate)
97
+		
98
+		# Convert istate to binary representation
99
+		binary_state = rev(as.integer(intToBits(istate))[1:nummod])
100
+		binary_stateTF = unlist(list(c(TRUE,FALSE), c(FALSE,TRUE))[binary_state+1])
101
+
102
+		# Draw from the multivariate normal
103
+		n = length(which(statevec==istate))
104
+		if (n == 0) next
105
+		cat("drawing from multivariate normal...             \r")
106
+		z = matrix(rmvnorm(n, mean=rep(0,nummod), sigma=sigma[,,i]), ncol=nummod)
107
+		# Transform to uniform space
108
+		cat("transforming to uniform space...                \r")
109
+		u = matrix(apply(z, 2, pnorm), ncol=nummod)
110
+		# Transform to count space using marginals
111
+		cat("transform to count space...                     \r")
112
+		irs = rs[binary_stateTF]
113
+		ips = ps[binary_stateTF]
114
+		for (imod in 1:nummod) {
115
+			mask = statevec==istate
116
+			if (binary_state[imod] == 0) {
117
+				reads[mask,imod] = qzinbinom(u[,imod], w=ws[imod], size=irs[imod], prob=ips[imod])
118
+			} else {
119
+				reads[mask,imod] = qnbinom(u[,imod], size=irs[imod], prob=ips[imod])
120
+			}
121
+		}
122
+		cat("                                                \r")
123
+			
124
+	}
125
+
126
+	# Return the output
127
+	out = list(coordinates = coordinates,
128
+				states = statevec,
129
+				reads = reads,
130
+				emissions = emissions,
131
+				transition = transition,
132
+				sigma = sigma,
133
+				use.states = use.states,
134
+				weights = weights
135
+				)
136
+	return(out)
137
+
138
+}
0 139
new file mode 100644
... ...
@@ -0,0 +1,165 @@
1
+univariate.from.binned.data = function(binned.data, eps=0.001, max.time=-1, max.it=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, filter.reads=TRUE) {
2
+
3
+	## Intercept user input
4
+	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
5
+	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
6
+	if (check.integer(max.it)!=0) stop("argument 'max.it' expects an integer")
7
+	if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer")
8
+	if (!is.null(eps.try)) {
9
+		if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric")
10
+	}
11
+	if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer")
12
+	if (check.logical(output.if.not.converged)!=0) stop("argument 'output.if.not.converged' expects a logical (TRUE or FALSE)")
13
+
14
+
15
+	war = NULL
16
+	if (is.null(eps.try)) eps.try = eps
17
+
18
+	coordinatenames = c("chrom","start","end","reads")
19
+	names(binned.data) = coordinatenames
20
+
21
+	## Assign variables
22
+	# number of states
23
+	numstates = 3
24
+	# prepare densities to use
25
+	densityNames = c("ZI", "NB", "NB")
26
+	enum = c("ZI","NB","ZINB","Other")
27
+	densityNames = factor(densityNames, levels=enum)
28
+	densityNames = as.numeric(densityNames) - 1
29
+	# length of observation sequence
30
+	numbins = length(binned.data$reads) 
31
+
32
+	# Check if there are reads in the data, otherwise HMM will blow up
33
+	if (!any(binned.data$reads!=0)) {
34
+		stop("All reads in data are zero. No univariate HMM done.")
35
+	}
36
+
37
+	# Filter high reads out, makes HMM faster
38
+	if (filter.reads) {
39
+		limit <- 10*ceiling(var(binned.data$reads))
40
+		mask <- binned.data$reads > limit
41
+		binned.data$reads[mask] <- limit
42
+		numfiltered <- length(which(mask))
43
+		if (numfiltered > 0) {
44
+			warning(paste("There are very high read counts (probably artificial) in your data. Replaced read counts > ",limit," (10*variance) by ",limit," in ",numfiltered," bins. Set option 'filter.reads=FALSE' to disable this filtering.", sep=""))
45
+		}
46
+	}
47
+	
48
+	
49
+	## Call univariate in a for loop to enable multiple trials
50
+	modellist = list()
51
+	for (i_try in 1:num.trials) {
52
+		cat("\n\nTry ",i_try," of ",num.trials," ------------------------------\n")
53
+		z = .C("R_univariate_hmm",
54
+			as.integer(binned.data$reads), # double* O
55
+			as.integer(numbins), # int* T
56
+			as.integer(numstates), # int* N
57
+			as.integer(densityNames), # I pass them but they are fixed
58
+			double(length=numstates), # double* r
59
+			double(length=numstates), # double* p
60
+			as.integer(max.it), #  int* maxiter
61
+			as.integer(max.time), # double* maxtime
62
+			as.double(eps.try), # double* eps
63
+			double(length=numbins * numstates), # double* post
64
+			double(length=numstates*numstates), # double* A
65
+			double(length=numstates), # double* proba
66
+			double(length=1), # double* loglik
67
+			double(length=numstates), # double* softweights
68
+			double(length=numstates), # double* initial r
69
+			double(length=numstates), # double* initial p
70
+			double(length=numstates*numstates), # double* initial A
71
+			double(length=numstates), # double* initial proba
72
+			as.logical(0), # bool* use_initial_params
73
+			as.integer(num.threads) # int* num_threads
74
+		)
75
+
76
+		model = NULL
77
+		model$loglik = z[[13]]
78
+		model$iteration = z[[7]]
79
+		model$time.in.sec = z[[8]]
80
+		model$delta.loglik = z[[9]]
81
+		model$epsilon = eps.try
82
+		model$proba = z[[12]]
83
+		model$A = matrix(z[[11]], ncol=numstates, byrow=TRUE)
84
+		model$distributions = cbind(r=z[[5]], p=z[[6]])
85
+		model$softweights = z[[14]]
86
+		model$proba.initial = z[[18]]
87
+		model$A.initial = matrix(z[[17]], ncol=numstates, byrow=TRUE)
88
+		model$distributions.initial = cbind(r=z[[15]], p=z[[16]])
89
+		model$num.threads = z[[20]]
90
+		if (num.trials > 1) {
91
+			if (model$delta.loglik > model$epsilon) {
92
+				warning("HMM did not converge in trial run ",i_try,"!\n")
93
+			}
94
+			# Store model in list
95
+			modellist[[i_try]] = model
96
+		}
97
+	}
98
+
99
+	if (num.trials > 1) {
100
+
101
+		# Select fit with best loglikelihood
102
+		indexmax = which.max(unlist(lapply(modellist,"[[","loglik")))
103
+		model = modellist[[indexmax]]
104
+
105
+		# Rerun the HMM with different epsilon and initial parameters from trial run
106
+		cat("\n\nRerunning try ",indexmax," with eps =",eps,"--------------------\n")
107
+		z = .C("R_univariate_hmm",
108
+			as.integer(binned.data$reads), # double* O
109
+			as.integer(numbins), # int* T
110
+			as.integer(numstates), # int* N
111
+			as.integer(densityNames), # I pass them but they are fixed
112
+			double(length=numstates), # double* r
113
+			double(length=numstates), # double* p
114
+			as.integer(max.it), #  int* maxiter
115
+			as.integer(max.time), # double* maxtime
116
+			as.double(eps), # double* eps
117
+			double(length=numbins * numstates), # double* post
118
+			double(length=numstates*numstates), # double* A
119
+			double(length=numstates), # double* proba
120
+			double(length=1), # double* loglik
121
+			double(length=numstates), # double* softweights
122
+			as.vector(model$distributions[,'r']), # double* initial r
123
+			as.vector(model$distributions[,'p']), # double* initial p
124
+			as.vector(model$A), # double* initial A
125
+			as.vector(model$proba), # double* initial proba
126
+			as.logical(1), # bool* use_initial_params
127
+			as.integer(num.threads) # int* num_threads
128
+		)
129
+	}
130
+
131
+	model = NULL
132
+	model$coordinates = binned.data[,coordinatenames[1:3]]
133
+	model$reads = binned.data$reads
134
+	model$posteriors = matrix(z[[10]], ncol=numstates)
135
+	colnames(model$posteriors) = c("P(state = zero inflation)","P(state = unmodified)","P(state = modified)")
136
+	class(model) = "chromStar.univariate.model"
137
+	model$states = get.states(model)
138
+	model$states.with.zeroinflation = get.states(model, separate.zeroinflation=TRUE)
139
+	model$loglik = z[[13]]
140
+	model$iteration = z[[7]]
141
+	model$time.in.sec = z[[8]]
142
+	model$delta.loglik = z[[9]]
143
+	model$epsilon = eps
144
+	model$proba = z[[12]]
145
+	model$A = matrix(z[[11]], ncol=numstates, byrow=TRUE)
146
+	model$distributions = cbind(r=z[[5]], p=z[[6]])
147
+	model$softweights = z[[14]]
148
+	model$proba.initial = z[[18]]
149
+	model$A.initial = matrix(z[[17]], ncol=numstates, byrow=TRUE)
150
+	model$distributions.initial = cbind(r=z[[15]], p=z[[16]])
151
+	model$num.threads = z[[20]]
152
+	if (num.trials == 1) {
153
+		if (model$delta.loglik > model$epsilon) {
154
+			war = warning("HMM did not converge!\n")
155
+		}
156
+	}
157
+
158
+	if (!is.null(war)) {
159
+		if (output.if.not.converged == TRUE) {
160
+			return(model)
161
+		}
162
+	} else {
163
+		return(model)
164
+	}
165
+}
0 166
new file mode 100644
... ...
@@ -0,0 +1,66 @@
1
+dzinbinom = function(x, w, size, prob, mu) {
2
+	if (w < 0 || w > 1) {
3
+		warning("NAs returned, w needs to be between 0 and 1")
4
+		return(rep(NA, length(x)))
5
+	}
6
+	regular.density = dnbinom(x, size=size, prob=prob, mu=mu)
7
+	zi = rep(0, length(x))
8
+	zi[x==0] = w
9
+	density = zi + (1-w)*regular.density
10
+		
11
+	return(density)
12
+}
13
+
14
+pzinbinom = function(q, w, size, prob, mu, lower.tail=TRUE) {
15
+	if (w < 0 || w > 1) {
16
+		warning("NAs returned, w needs to be between 0 and 1")
17
+		return(rep(NA, length(q)))
18
+	}
19
+	# Use stepfun to make also non-integer values possible
20
+	x = 0:max(q)
21
+	cdf = stepfun(x, c(0, cumsum(dzinbinom(x, w, size, prob, mu))))
22
+	p = cdf(q)
23
+	p[p>1] = 1
24
+
25
+	if (lower.tail==TRUE) {
26
+		return(p)
27
+	} else {
28
+		return(1-p)
29
+	}
30
+}
31
+	
32
+qzinbinom = function(p, w, size, prob, mu, lower.tail=TRUE) {
33
+	if (w < 0 || w > 1) {
34
+		warning("NAs returned, w needs to be between 0 and 1")
35
+		return(rep(NA, length(q)))
36
+	}
37
+	x = 0:5000
38
+	cdf = pzinbinom(x, w, size, prob, mu)
39
+	inv.cdf = stepfun(cdf[-length(cdf)], x)
40
+	q = inv.cdf(p)
41
+
42
+	if (lower.tail==TRUE) {
43
+		return(q)
44
+	} else {
45
+		return(rev(q))
46
+	}
47
+}
48
+
49
+rzinbinom = function(n, w, size, prob, mu) {
50
+	if (w < 0 || w > 1) {
51
+		warning("NAs returned, w needs to be between 0 and 1")
52
+		return(rep(NA, length(q)))
53
+	}
54
+	r = rnbinom(n, size=size, prob=prob, mu=mu)
55
+	if (length(n)==1) {
56
+		i = sample(n, round(w * n))
57
+		if (n < 10) warning("n < 10: Random numbers may not be reliable")
58
+	} else {
59
+		i = sample(n, round(w * length(n)))
60
+		if (length(n) < 10) warning("length(n) < 10: Random numbers may not be reliable")
61
+	}
62
+	r[i] = 0
63
+	
64
+	return(r)
65
+}
66
+
0 67
new file mode 100644
... ...
@@ -0,0 +1,436 @@
1
+10
2
+
3
+dir
4
+821
5
+svn+ssh://maria@141.80.187.138/home/histoneHMM/svn/histoneHMM/branches/groningen/Rpackage/histoneHMM/man
6
+svn+ssh://maria@141.80.187.138/home/histoneHMM/svn
7
+
8
+
9
+
10
+2013-10-07T16:06:14.303223Z
11
+801
12
+heinig
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+72ef0f4a-728c-0410-918a-5b46bb2d7104
28
+
29
+zinbacopula.hmm.posterior.Rd
30
+file
31
+
32
+
33
+
34
+
35
+2013-10-21T16:01:10.000000Z
36
+bee76a313dfbe41bed927bf74a4227bf
37
+2013-10-07T16:06:14.303223Z
38
+801
39
+heinig
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+2424
62
+
63
+em.Rd
64
+file
65
+
66
+
67
+
68
+
69
+2013-10-21T16:01:10.000000Z
70
+53932cc64b0e6b28da2d3f2f02ca9594
71
+2013-10-07T16:06:14.303223Z
72
+801
73
+heinig
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+1631
96
+
97
+callRegions.Rd
98
+file
99
+
100
+
101
+
102
+
103
+2013-10-21T14:01:40.000000Z
104
+a67e330a1954b8ebff32e3eed41f6b58
105
+2013-10-07T16:06:14.303223Z
106
+801
107
+heinig
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+2911
130
+
131
+histoneHMM-package.Rd