Browse code

Add experimental support for bigWig tracks. Import seems to work. Export not so much. Note that rtracklayer now contains a large amount of compiled code -- mostly from the UCSC library.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/rtracklayer@44644 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 10/02/2010 23:56:11
Showing 81 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: rtracklayer
2 2
 Title: R interface to genome browsers and their annotation tracks
3
-Version: 1.7.6
3
+Version: 1.7.7
4 4
 Author: Michael Lawrence, Vince Carey, Robert Gentleman
5 5
 Depends: R (>= 2.7.0), methods, RCurl
6 6
 Imports: XML (>= 1.98-0), IRanges (>= 1.3.50), Biostrings (>= 2.11.14), BSgenome (>= 1.13.10)
... ...
@@ -14,5 +14,5 @@ Description: Extensible framework for interacting with multiple genome
14 14
   viewport.
15 15
 Maintainer: Michael Lawrence <mflawren@fhcrc.org>
16 16
 License: Artistic-2.0
17
-Collate: io.R web.R ranges.R browser.R gff.R ucsc.R bed.R wig.R utils.R
17
+Collate: io.R web.R ranges.R browser.R gff.R ucsc.R bed.R wig.R utils.R bigWig.R
18 18
 biocViews: Annotation,Visualization,DataImport
... ...
@@ -1,4 +1,4 @@
1
-#useDynLib(rtracklayer, .registration = TRUE)
1
+useDynLib(rtracklayer, .registration = TRUE)
2 2
 
3 3
 import(RCurl, XML)
4 4
 
... ...
@@ -16,15 +16,13 @@ importMethodsFrom("IRanges", start, end, "start<-", "end<-",
16 16
                   range, split, lapply, ranges, values, unlist, as.list,
17 17
                   isDisjoint, width, space, as.data.frame, rbind, universe,
18 18
                   "universe<-", score, "score<-", as.vector,
19
-                  "ranges<-", colnames, "colnames<-")
19
+                  "ranges<-", colnames, "colnames<-", queryHits, findOverlaps)
20 20
 
21 21
 exportClasses("BrowserSession", "BrowserView",
22 22
               "UCSCSession", "UCSCView",
23 23
               "UCSCData", "TrackLine", "BasicTrackLine", "GraphTrackLine",
24 24
               "Bed15TrackLine", "UCSCTrackModes",
25
-              "RangedSelection"
26
-              ##"BigWigSelection"
27
-              )
25
+              "RangedSelection", "BigWigSelection")
28 26
 
29 27
 exportMethods("activeView", "activeView<-", "blocks", "browseGenome",
30 28
               "browserSession", "browserSession<-",
... ...
@@ -51,9 +49,8 @@ exportMethods("activeView", "activeView<-", "blocks", "browseGenome",
51 49
 
52 50
 export("genomeBrowsers", "start", "end", "strand", "start<-", "end<-",
53 51
        "ranges", "GenomicRanges", "GenomicData", "GenomicSelection", "IRanges",
54
-       "RangedDataList", score, "score<-", "as.data.frame", "ucscGenomes"
55
-       ##"export.bw", "import.bw"
56
-       )
52
+       "RangedDataList", score, "score<-", "as.data.frame", "ucscGenomes",
53
+       "export.bw", "import.bw")
57 54
 
58 55
 ## the S4-connection compatibility layer
59 56
 exportClasses("connection",
60 57
new file mode 100644
... ...
@@ -0,0 +1,106 @@
1
+### UCSC bigWig format 
2
+
3
+.allowedColNames <- list(bigWig = "score")
4
+
5
+.validateColNames <- function(object, format) {
6
+  allowedColNames <- .allowedColNames[[format]]
7
+  invalidColNames <- setdiff(colnames(object), allowedColNames)
8
+  if (length(invalidColNames))
9
+    paste("Column names",
10
+          paste("'", invalidColNames, "'", sep = "", collapse=", "),
11
+          "are invalid for the", format, "format.")
12
+  else NULL
13
+}
14
+
15
+setClass("BigWigSelection", contains = "RangedSelection")
16
+
17
+setValidity("BigWigSelection",
18
+            function(object) {
19
+              .validateColNames(object, "bigWig")
20
+            })
21
+
22
+
23
+setGeneric("export.bw",
24
+           function(object, con,
25
+                    dataFormat = c("auto", "variableStep", "fixedStep",
26
+                      "bedGraph"),
27
+                    seqlengths = NULL, compress = TRUE, ...)
28
+           standardGeneric("export.bw"))
29
+
30
+setMethod("export.bw", "ANY",
31
+          function(object, con,
32
+                   dataFormat = c("auto", "variableStep", "fixedStep",
33
+                     "bedGraph"),
34
+                   seqlengths, compress, genome, ...)
35
+          {
36
+            rd <- as(object, "RangedData")
37
+            if (!is.null(genome))
38
+              genome(rd) <- genome
39
+            export.bw(rd, con, dataFormat, seqlengths, compress, ...)
40
+          })
41
+
42
+setMethod("export.bw", c("RangedData", "character"),
43
+          function(object, con,
44
+                   dataFormat = c("auto", "variableStep", "fixedStep",
45
+                                  "bedGraph"),
46
+                   seqlengths, compress)
47
+          {
48
+            score <- score(object)
49
+            if (!is.numeric(score) || any(is.na(score)))
50
+              stop("The score must be numeric, without any NA's")
51
+            if (!IRanges:::isTRUEorFALSE(compress))
52
+              stop("'compress' must be TRUE or FALSE")
53
+            if (is.null(seqlengths) && !is.null(genome(object)))
54
+              seqlengths <- seqlengths(.genomeForID(genome(object)))
55
+            if (is.null(seqlengths))
56
+              stop("Unable to determine seqlengths; either specify ",
57
+                   "'seqlengths' or specify a genome on 'object' that ",
58
+                   "is known to BSgenome")
59
+            if (!is.numeric(seqlengths) ||
60
+                !all(names(object) %in% names(seqlengths)))
61
+              stop("seqlengths must be numeric and indicate a length for ",
62
+                   "each sequence in 'object'")
63
+            sectionPtr <- NULL # keep adding to the same linked list
64
+            .bigWigWriter <- function(chromData, con, dataFormat) {
65
+              sectionPtr <<- .Call(BWGSectionList_add, sectionPtr,
66
+                                   names(chromData)[1],
67
+                                   as(ranges(chromData)[[1]], "IRanges"),
68
+                                   as.numeric(score(chromData)), dataFormat)
69
+            }
70
+            dataFormat <- match.arg(dataFormat)
71
+            if (dataFormat == "auto")
72
+              format <- chooseGraphType(object)
73
+            else format <- dataFormat
74
+            on.exit(.Call(BWGSectionList_cleanup, sectionPtr))
75
+            if (format == "bedGraph")
76
+              lapply(object, .bigWigWriter, con, dataFormat)
77
+            else export.wigLines(object, con, dataFormat, .bigWigWriter)
78
+            storage.mode(seqlengths) <- "integer"
79
+            .Call(BWGSectionList_write, sectionPtr, seqlengths,
80
+                  compress, con)
81
+          })
82
+
83
+setGeneric("import.bw",
84
+           function(con, selection = GenomicSelection(...), ...)
85
+           standardGeneric("import.bw"))
86
+
87
+
88
+setMethod("import.bw", "character",
89
+          function(con, selection = GenomicSelection(...), ...)
90
+          {
91
+            if (!IRanges:::isSingleString(con))
92
+              stop("'con' must be a single string, specifying a path")
93
+            selection <- try(as(selection, "RangedSelection"))
94
+            if (is.character(selection))
95
+              stop("'selection' must be coercible to RangedSelection")
96
+            normRanges <- as(ranges(selection), "NormalIRangesList")
97
+            rd <- .Call(BWGFile_query, con, as.list(normRanges),
98
+                        colnames(selection))
99
+            ## Unfortunately, the bigWig query API is such that we can
100
+            ## end up with multiple hits.
101
+            if (any(width(rd) > 2)) {
102
+              hits <- queryHits(findOverlaps(ranges(rd), normRanges))
103
+              rd <- rd[!duplicated(hits),]
104
+            }
105
+            rd
106
+          })
0 107
new file mode 100644
... ...
@@ -0,0 +1 @@
1
+#include <_IRanges_stubs.c>
0 2
new file mode 100644
... ...
@@ -0,0 +1,2 @@
1
+include Makevars.common
2
+OBJECTS = $(PKG_OBJECTS) $(UCSC_OBJECTS:%=ucsc/%)
0 3
\ No newline at end of file
1 4
new file mode 100644
... ...
@@ -0,0 +1,10 @@
1
+PKG_OBJECTS = \
2
+  IRanges_stubs.o R_init_rtracklayer.o bigWig.o
3
+  
4
+UCSC_OBJECTS = \
5
+  bPlusTree.o bbiRead.o bbiWrite.o bwgCreate.o bwgQuery.o \
6
+  cirTree.o common.o errCatch.o errabort.o hash.o linefile.o localmem.o \
7
+  sqlNum.o zlibFace.o dystring.o hmmstats.o obscure.o pipeline.o \
8
+  rangeTree.o rbTree.o memalloc.o dlist.o udc.o net.o bits.o _cheapcgi.o \
9
+  internet.o https.o base64.o verbose.o os.o wildcmp.o _portimpl.o
10
+
0 11
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+#include "rtracklayer.h"
2
+#include "bigWig.h"
3
+
4
+#include <R_ext/Rdynload.h>
5
+
6
+#define CALLMETHOD_DEF(fun, numArgs) {#fun, (DL_FUNC) &fun, numArgs}
7
+
8
+static const R_CallMethodDef callMethods[] = {
9
+/* bigWig.c */
10
+  CALLMETHOD_DEF(BWGSectionList_add, 5),
11
+  CALLMETHOD_DEF(BWGSectionList_write, 4),
12
+  CALLMETHOD_DEF(BWGSectionList_cleanup, 1),
13
+  CALLMETHOD_DEF(BWGFile_query, 3),
14
+  {NULL, NULL, 0}
15
+};
16
+
17
+void R_init_rtracklayer(DllInfo *info)
18
+{
19
+  R_registerRoutines(info, NULL, callMethods, NULL, NULL);
20
+}
0 21
new file mode 100644
... ...
@@ -0,0 +1,217 @@
1
+#include "ucsc/common.h"
2
+#include "ucsc/linefile.h"
3
+#include "ucsc/localmem.h"
4
+#include "ucsc/hash.h"
5
+#include "ucsc/bbiFile.h"
6
+#include "ucsc/bwgInternal.h"
7
+#include "ucsc/_bwgInternal.h"
8
+
9
+#include "bigWig.h"
10
+
11
+static struct bwgBedGraphItem *
12
+createBedGraphItems(int *start, int *width, double *score, int len,
13
+                    struct lm *lm)
14
+{
15
+  struct bwgBedGraphItem *itemList = NULL, *item;
16
+  int i;
17
+  for (i=0; i<len; ++i)
18
+    {
19
+      lmAllocVar(lm, item);
20
+      item->end = start[i] + width[i] - 1;
21
+      item->start = start[i] - 1;
22
+      item->val = score[i];
23
+      slAddHead(&itemList, item);
24
+    }
25
+  slReverse(&itemList);
26
+  return itemList;
27
+}
28
+
29
+static struct bwgVariableStepPacked *
30
+createVariableStepItems(int *start, double *score, int len, struct lm *lm)
31
+{
32
+  struct bwgVariableStepPacked *packed;
33
+  lmAllocArray(lm, packed, len);
34
+  int i;
35
+  for (i=0; i<len; ++i)
36
+    {
37
+      packed[i].start = start[i] - 1;
38
+      packed[i].val = score[i];
39
+    }
40
+  return packed;
41
+}
42
+
43
+static struct bwgFixedStepPacked *
44
+createFixedStepItems(double *score, int len, struct lm *lm)
45
+{
46
+  struct bwgFixedStepPacked *packed;
47
+  lmAllocArray(lm, packed, len);
48
+  int i;
49
+  for (i=0; i<len; ++i)
50
+    {
51
+      packed[i].val = score[i];
52
+    }
53
+  return packed;
54
+}
55
+
56
+static struct bwgSection *
57
+createBWGSection(const char *seq, int *start, int *width, double *score,
58
+                 int len, enum bwgSectionType type, struct lm *lm)
59
+{
60
+  struct bwgSection *section;
61
+  lmAllocVar(lm, section);
62
+  section->chrom = (char *)seq;
63
+  section->start = start[0] - 1;
64
+  section->end = start[len-1] + width[len-1] - 1;
65
+  section->type = type;
66
+  section->itemSpan = width[0];
67
+  if (type == bwgTypeFixedStep) {
68
+    section->items.fixedStepPacked = createFixedStepItems(score, len, lm);
69
+    section->itemStep = len > 1 ? start[1] - start[0] : 0;
70
+  } else if (type == bwgTypeVariableStep) {
71
+    section->items.variableStepPacked =
72
+      createVariableStepItems(start, score, len, lm);
73
+  } else section->items.bedGraphList =
74
+           createBedGraphItems(start, width, score, len, lm);
75
+  section->itemCount = len;
76
+  return section;
77
+}
78
+
79
+static int itemsPerSlot = 512;
80
+static int blockSize = 1024;
81
+
82
+/* --- .Call ENTRY POINT --- */
83
+
84
+SEXP BWGSectionList_add(SEXP r_sections, SEXP r_seq, SEXP r_ranges,
85
+                        SEXP r_score, SEXP r_format)
86
+{
87
+  struct bwgSection *sections = NULL;
88
+  const char *seq = CHAR(asChar(r_seq));
89
+  int *start = INTEGER(get_IRanges_start(r_ranges));
90
+  int *width = INTEGER(get_IRanges_width(r_ranges));
91
+  double *score = REAL(r_score);
92
+  const char *format = CHAR(asChar(r_format));
93
+  int num = get_IRanges_length(r_ranges);
94
+  int numLeft = num;
95
+  SEXP ans;
96
+  struct lm *lm;
97
+
98
+  enum bwgSectionType type = bwgTypeBedGraph;
99
+  if (sameString(format, "fixedStep"))
100
+    type = bwgTypeFixedStep;
101
+  else if (sameString(format, "variableStep"))
102
+    type = bwgTypeVariableStep;
103
+
104
+  if (r_sections != R_NilValue) {
105
+    sections = R_ExternalPtrAddr(r_sections);
106
+    lm = R_ExternalPtrAddr(R_ExternalPtrTag(r_sections));
107
+  } else lm = lmInit(0);
108
+  
109
+  while(numLeft) {
110
+    int numSection = numLeft > itemsPerSlot ? itemsPerSlot : numLeft;
111
+    numLeft -= numSection;
112
+    slAddHead(&sections,
113
+              createBWGSection(seq, start, width, score, numSection, type, lm));
114
+    start += numSection;
115
+    width += numSection;
116
+    score += numSection;
117
+  }
118
+
119
+  PROTECT(ans = R_MakeExternalPtr(sections, R_NilValue, R_NilValue));
120
+  R_SetExternalPtrTag(ans, R_MakeExternalPtr(lm, R_NilValue, R_NilValue));
121
+  UNPROTECT(1);
122
+
123
+  return ans;
124
+}
125
+
126
+static struct hash *createIntHash(SEXP v) {
127
+  struct hash *hash = hashNew(0);
128
+  SEXP names = getAttrib(v, R_NamesSymbol);
129
+  for (int i = 0; i < length(v); i++)
130
+    hashAddInt(hash, (char *)CHAR(STRING_ELT(names, i)), INTEGER(v)[i]); 
131
+  return hash;
132
+}
133
+
134
+/* --- .Call ENTRY POINT --- */
135
+SEXP BWGSectionList_write(SEXP r_sections, SEXP r_seqlengths, SEXP r_compress,
136
+                          SEXP r_file)
137
+{
138
+  struct bwgSection *sections = NULL;
139
+  struct hash *lenHash = createIntHash(r_seqlengths);
140
+  if (r_sections != R_NilValue) {
141
+    sections = R_ExternalPtrAddr(r_sections);
142
+    slReverse(&sections);
143
+  }
144
+  bwgCreate(sections, lenHash, blockSize, itemsPerSlot, asLogical(r_compress),
145
+            (char *)CHAR(asChar(r_file)));
146
+  freeHash(&lenHash);
147
+  return R_NilValue;
148
+}
149
+
150
+/* --- .Call ENTRY POINT --- */
151
+SEXP BWGSectionList_cleanup(SEXP r_sections)
152
+{
153
+  if (r_sections != R_NilValue) {
154
+    struct lm *lm = R_ExternalPtrAddr(R_ExternalPtrTag(r_sections));
155
+    lmCleanup(&lm);
156
+  }
157
+  return R_NilValue;
158
+}
159
+
160
+/* --- .Call ENTRY POINT --- */
161
+SEXP BWGFile_query(SEXP r_filename, SEXP r_ranges, SEXP r_colnames) {
162
+  struct bbiFile * file = bigWigFileOpen((char *)CHAR(asChar(r_filename)));
163
+  SEXP chromNames = getAttrib(r_ranges, R_NamesSymbol);
164
+  int nchroms = length(r_ranges);
165
+  SEXP rangesList, rangesListEls, dataFrameList, dataFrameListEls, ans;
166
+  const char *var_names[] = { "score", "" };
167
+  struct lm *lm = lmInit(0);
168
+  
169
+  struct bbiInterval *hits = NULL;
170
+
171
+  PROTECT(rangesListEls = allocVector(VECSXP, nchroms));
172
+  setAttrib(rangesListEls, R_NamesSymbol, chromNames);
173
+  PROTECT(dataFrameListEls = allocVector(VECSXP, nchroms));
174
+  setAttrib(dataFrameListEls, R_NamesSymbol, chromNames);
175
+  
176
+  for (int i = 0; i < length(r_ranges); i++) {
177
+    SEXP localRanges = VECTOR_ELT(r_ranges, i);
178
+    int nranges = get_IRanges_length(localRanges);
179
+    int *start = INTEGER(get_IRanges_start(localRanges));
180
+    int *width = INTEGER(get_IRanges_width(localRanges));
181
+    for (int j = 0; j < nranges; j++) {
182
+      struct bbiInterval *queryHits =
183
+        bigWigIntervalQuery(file, (char *)CHAR(STRING_ELT(chromNames, i)),
184
+                            start[j] - 1, start[j] - 1 + width[j], lm);
185
+      hits = slCat(queryHits, hits);
186
+    }
187
+    int nhits = slCount(hits);
188
+    SEXP ans_start, ans_width, ans_score, ans_score_l;
189
+    PROTECT(ans_start = allocVector(INTSXP, nhits));
190
+    PROTECT(ans_width = allocVector(INTSXP, nhits));
191
+    PROTECT(ans_score_l = mkNamed(VECSXP, var_names));
192
+    ans_score = allocVector(REALSXP, nhits);
193
+    SET_VECTOR_ELT(ans_score_l, 0, ans_score);
194
+    for (int j = 0; j < nhits; j++, hits = hits->next) {
195
+      INTEGER(ans_start)[j] = hits->start + 1;
196
+      INTEGER(ans_width)[j] = hits->end - hits->start;
197
+      REAL(ans_score)[j] = hits->val;
198
+    }
199
+    SET_VECTOR_ELT(rangesListEls, i,
200
+                   new_IRanges("IRanges", ans_start, ans_width, R_NilValue));
201
+    SET_VECTOR_ELT(dataFrameListEls, i,
202
+                   new_DataFrame("DataFrame", ans_score_l, R_NilValue,
203
+                                 ScalarInteger(nhits)));
204
+    UNPROTECT(3);    
205
+  }
206
+
207
+  bbiFileClose(&file);
208
+  
209
+  PROTECT(dataFrameList =
210
+          new_SimpleList("SimpleSplitDataFrameList", dataFrameListEls));
211
+  PROTECT(rangesList = new_SimpleList("SimpleRangesList", rangesListEls));
212
+  ans = new_RangedData("RangedData", rangesList, dataFrameList);
213
+
214
+  UNPROTECT(4);
215
+  lmCleanup(&lm);
216
+  return ans;
217
+}
0 218
new file mode 100644
... ...
@@ -0,0 +1,15 @@
1
+#ifndef BIG_WIG_H
2
+#define BIG_WIG_H
3
+
4
+#include "rtracklayer.h"
5
+
6
+/* The .Call entry points */
7
+
8
+SEXP BWGSectionList_add(SEXP r_sections, SEXP r_seq, SEXP r_ranges,
9
+                        SEXP r_score, SEXP r_format);
10
+SEXP BWGSectionList_write(SEXP r_sections, SEXP r_seqlengths, SEXP r_compress,
11
+                          SEXP r_file);
12
+SEXP BWGSectionList_cleanup(SEXP r_sections);
13
+SEXP BWGFile_query(SEXP r_filename, SEXP r_ranges, SEXP r_colnames);
14
+
15
+#endif
0 16
new file mode 100644
... ...
@@ -0,0 +1,8 @@
1
+#ifndef RTRACKLAYER_H
2
+#define RTRACKLAYER_H
3
+
4
+#include <Rinternals.h>
5
+
6
+#include <IRanges_interface.h>
7
+
8
+#endif
0 9
new file mode 100644
... ...
@@ -0,0 +1,74 @@
1
+/* Stuff that should be in ucsc/bwgInternal.h, but wasn't, so
2
+   rtracklayer put it here. */
3
+
4
+struct bwgBedGraphItem
5
+/* An bedGraph-type item in a bwgSection. */
6
+{
7
+  struct bwgBedGraphItem *next;	/* Next in list. */
8
+  bits32 start,end;		/* Range of chromosome covered. */
9
+  float val;			/* Value. */
10
+};
11
+
12
+struct bwgVariableStepItem
13
+/* An variableStep type item in a bwgSection. */
14
+{
15
+  struct bwgVariableStepItem *next;	/* Next in list. */
16
+  bits32 start;		/* Start position in chromosome. */
17
+  float val;			/* Value. */
18
+};
19
+
20
+struct bwgVariableStepPacked
21
+/* An variableStep type item in a bwgSection. */
22
+{
23
+  bits32 start;		/* Start position in chromosome. */
24
+  float val;			/* Value. */
25
+};
26
+
27
+struct bwgFixedStepItem
28
+/* An fixedStep type item in a bwgSection. */
29
+{
30
+  struct bwgFixedStepItem *next;	/* Next in list. */
31
+  float val;			/* Value. */
32
+};
33
+
34
+struct bwgFixedStepPacked
35
+/* An fixedStep type item in a bwgSection. */
36
+{
37
+  float val;			/* Value. */
38
+};
39
+
40
+union bwgItem
41
+/* Union of item pointers for all possible section types. */
42
+{
43
+  struct bwgBedGraphItem *bedGraphList;		/* A linked list */
44
+  struct bwgFixedStepPacked *fixedStepPacked;		/* An array */
45
+  struct bwgVariableStepPacked *variableStepPacked;	/* An array */
46
+  /* No packed format for bedGraph... */
47
+};
48
+
49
+struct bwgSection
50
+/* A section of a bigWig file - all on same chrom.  This is a somewhat fat data
51
+ * structure used by the bigWig creation code.  See also bwgSection for the
52
+ * structure returned by the bigWig reading code. */
53
+{
54
+  struct bwgSection *next;		/* Next in list. */
55
+  char *chrom;			/* Chromosome name. */
56
+  bits32 start,end;			/* Range of chromosome covered. */
57
+  enum bwgSectionType type;
58
+  union bwgItem items;		/* List/array of items in this section. */
59
+  bits32 itemStep;			/* Step within item if applicable. */
60
+  bits32 itemSpan;			/* Item span if applicable. */
61
+  bits16 itemCount;			/* Number of items in section. */
62
+  bits32 chromId;			/* Unique small integer value for chromosome. */
63
+  bits64 fileOffset;			/* Offset of section in file. */
64
+};
65
+
66
+void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash, 
67
+               int blockSize, int itemsPerSlot, boolean doCompress,
68
+               char *fileName);
69
+
70
+struct bbiFile *bigWigFileOpen(char *fileName);
71
+
72
+struct bbiInterval *bigWigIntervalQuery(struct bbiFile *bwf, char *chrom,
73
+                                        bits32 start, bits32 end,
74
+                                        struct lm *lm);
0 75
new file mode 100644
... ...
@@ -0,0 +1,30 @@
1
+/* rtracklayer took out this little utility from cheapcgi.c */
2
+
3
+#include "common.h"
4
+
5
+void cgiDecode(char *in, char *out, int inLength)
6
+/* Decode from cgi pluses-for-spaces format to normal.
7
+ * Out will be a little shorter than in typically, and
8
+ * can be the same buffer. */
9
+{
10
+  char c;
11
+  int i;
12
+  for (i=0; i<inLength;++i)
13
+    {
14
+      c = *in++;
15
+      if (c == '+')
16
+	*out++ = ' ';
17
+      else if (c == '%')
18
+	{
19
+          int code;
20
+          if (sscanf(in, "%2x", &code) != 1)
21
+	    code = '?';
22
+          in += 2;
23
+          i += 2;
24
+          *out++ = code;
25
+	}
26
+      else
27
+	*out++ = c;
28
+    }
29
+  *out++ = 0;
30
+}
0 31
new file mode 100644
... ...
@@ -0,0 +1,51 @@
1
+/* rtracklayer took this utility out of portimpl.c */
2
+
3
+#include "common.h"
4
+#include "portable.h"
5
+
6
+void makeDirsOnPath(char *pathName)
7
+/* Create directory specified by pathName.  If pathName contains
8
+ * slashes, create directory at each level of path if it doesn't
9
+ * already exist.  Abort with error message if there's a problem.
10
+ * (It's not considered a problem for the directory to already
11
+ * exist. ) */
12
+{
13
+/* Save a copy of current directory. */
14
+char *curDir = cloneString(getCurrentDir());
15
+
16
+/* Return current directory.  Abort if it fails. */
17
+/* Make local copy of pathName. */
18
+int len = strlen(pathName);
19
+char pathCopy[len+1];
20
+strcpy(pathCopy, pathName);
21
+
22
+/* Start at root if it's an absolute path name. */
23
+char *s = pathCopy, *e;
24
+if (pathCopy[0] == '/')
25
+  {
26
+setCurrentDir("/");
27
+}
28
+
29
+/* Step through it one slash at a time - changing directory if possible
30
+ * else making directory if possible, else dying. */
31
+for (; !isEmpty(s); s = e)
32
+  {
33
+/* Find end of this section and terminate string there. */
34
+e = strchr(s, '/');
35
+if (e != NULL)
36
+  *e++ = 0;
37
+
38
+/* Cd there.  If that fails mkdir there and then cd there. */
39
+if (!maybeSetCurrentDir(s))
40
+  {
41
+if (!makeDir(s))
42
+  {
43
+break;
44
+}
45
+setCurrentDir(s);
46
+}
47
+}
48
+setCurrentDir(curDir);
49
+freeMem(curDir);
50
+}
0 51
new file mode 100644
... ...
@@ -0,0 +1,440 @@
1
+/* bptFile - B+ Trees.  These are a method of indexing data similar to binary trees, but 
2
+ * with many children rather than just two at each node. They work well when stored on disk,
3
+ * since typically only two or three disk accesses are needed to locate any particular
4
+ * piece of data.  This implementation is *just* for disk based storage.  For memory
5
+ * use the rbTree instead. Currently the implementation is just useful for data warehouse
6
+ * type applications.  That is it implements a function to create a b+ tree from bulk data
7
+ * (bptFileCreate) and a function to lookup a value given a key (bptFileFind) but not functions
8
+ * to add or delete individual items.
9
+ *
10
+ * The layout of the file on disk is:
11
+ *    header
12
+ *    root node
13
+ *    (other nodes)
14
+ * In general when the tree is first built the higher level nodes are stored before the
15
+ * lower level nodes.  It is possible if a b+ tree is dynamically updated for this to
16
+ * no longer be strictly true, but actually currently the b+ tree code here doesn't implement
17
+ * dynamic updates - it just creates a b+ tree from a sorted list.
18
+ *
19
+ * Each node can be one of two types - index or leaf.  The index nodes contain pointers
20
+ * to child nodes.  The leaf nodes contain the actual data. 
21
+ *
22
+ * The layout of the file header is:
23
+ *       <magic number>  4 bytes - The value bptSig (0x78CA8C91)
24
+ *       <block size>    4 bytes - Number of children per block (not byte size of block)
25
+ *       <key size>      4 bytes - Number of significant bytes in key
26
+ *       <val size>      4 bytes - Number of bytes in value
27
+ *       <item count>    8 bytes - Number of items in index
28
+ *       <reserved2>     4 bytes - Always 0 for now
29
+ *       <reserved3>     4 bytes - Always 0 for now
30
+ * The magic number may be byte-swapped, in which case all numbers in the file
31
+ * need to be byte-swapped. 
32
+ *
33
+ * The nodes start with a header:
34
+ *       <is leaf>       1 byte  - 1 for leaf nodes, 0 for index nodes.
35
+ *       <reserved>      1 byte  - Always 0 for now.
36
+ *       <count>         2 bytes - The number of children/items in node
37
+ * This is followed by count items.  For the index nodes the items are
38
+ *       <key>           key size bytes - always written most significant byte first
39
+ *       <offset>        8 bytes - Offset of child node in index file.
40
+ * For leaf nodes the items are
41
+ *       <key>           key size bytes - always written most significant byte first
42
+ *       <val>           val sized bytes - the value associated with the key.
43
+ * Note in general the leaf nodes may not be the same size as the index nodes, though in
44
+ * the important case where the values are file offsets they will be.
45
+ */
46
+
47
+#include "common.h"
48
+#include "sig.h"
49
+#include "udc.h"
50
+#include "bPlusTree.h"
51
+
52
+/* This section of code deals with locating a value in a b+ tree. */
53
+
54
+struct bptFile *bptFileAttach(char *fileName, struct udcFile *udc)
55
+/* Open up index file on previously open file, with header at current file position. */
56
+{
57
+/* Open file and allocate structure to hold info from header etc. */
58
+struct bptFile *bpt = needMem(sizeof(*bpt));
59
+bpt->fileName = fileName;
60
+bpt->udc = udc;
61
+
62
+/* Read magic number at head of file and use it to see if we are proper file type, and
63
+ * see if we are byte-swapped. */
64
+bits32 magic;
65
+boolean isSwapped = FALSE;
66
+udcMustReadOne(udc, magic);
67
+if (magic != bptSig)
68
+    {
69
+    magic = byteSwap32(magic);
70
+    isSwapped = bpt->isSwapped = TRUE;
71
+    if (magic != bptSig)
72
+       errAbort("%s is not a bpt b-plus tree index file", fileName);
73
+    }
74
+
75
+/* Read rest of defined bits of header, byte swapping as needed. */
76
+bpt->blockSize = udcReadBits32(udc, isSwapped);
77
+bpt->keySize = udcReadBits32(udc, isSwapped);
78
+bpt->valSize = udcReadBits32(udc, isSwapped);
79
+bpt->itemCount = udcReadBits64(udc, isSwapped);
80
+
81
+/* Skip over reserved bits of header. */
82
+bits32 reserved32;
83
+udcMustReadOne(udc, reserved32);
84
+udcMustReadOne(udc, reserved32);
85
+
86
+/* Save position of root block of b+ tree. */
87
+bpt->rootOffset = udcTell(udc);
88
+
89
+return bpt;
90
+}
91
+
92
+void bptFileDetach(struct bptFile **pBpt)
93
+/* Detach and free up cirTree file opened with cirTreeFileAttach. */
94
+{
95
+freez(pBpt);
96
+}
97
+
98
+struct bptFile *bptFileOpen(char *fileName)
99
+/* Open up index file - reading header and verifying things. */
100
+{
101
+return bptFileAttach(cloneString(fileName), udcFileOpen(fileName, udcDefaultDir()));
102
+}
103
+
104
+void bptFileClose(struct bptFile **pBpt)
105
+/* Close down and deallocate index file. */
106
+{
107
+struct bptFile *bpt = *pBpt;
108
+if (bpt != NULL)
109
+    {
110
+    udcFileClose(&bpt->udc);
111
+    freeMem(bpt->fileName);
112
+    bptFileDetach(pBpt);
113
+    }
114
+}
115
+
116
+static boolean rFind(struct bptFile *bpt, bits64 blockStart, void *key, void *val)
117
+/* Find value corresponding to key.  If found copy value to memory pointed to by val and return 
118
+ * true. Otherwise return false. */
119
+{
120
+/* Seek to start of block. */
121
+udcSeek(bpt->udc, blockStart);
122
+
123
+/* Read block header. */
124
+UBYTE isLeaf;
125
+UBYTE reserved;
126
+bits16 i, childCount;
127
+udcMustReadOne(bpt->udc, isLeaf);
128
+udcMustReadOne(bpt->udc, reserved);
129
+boolean isSwapped = bpt->isSwapped;
130
+childCount = udcReadBits16(bpt->udc, isSwapped);
131
+
132
+UBYTE keyBuf[bpt->keySize];   /* Place to put a key, buffered on stack. */
133
+
134
+if (isLeaf)
135
+    {
136
+    for (i=0; i<childCount; ++i)
137
+        {
138
+	udcMustRead(bpt->udc, keyBuf, bpt->keySize);
139
+	udcMustRead(bpt->udc, val, bpt->valSize);
140
+	if (memcmp(key, keyBuf, bpt->keySize) == 0)
141
+	    return TRUE;
142
+	}
143
+    return FALSE;
144
+    }
145
+else
146
+    {
147
+    /* Read and discard first key. */
148
+    udcMustRead(bpt->udc, keyBuf, bpt->keySize);
149
+
150
+    /* Scan info for first file offset. */
151
+    bits64 fileOffset = udcReadBits64(bpt->udc, isSwapped);
152
+
153
+    /* Loop through remainder. */
154
+    for (i=1; i<childCount; ++i)
155
+	{
156
+	udcMustRead(bpt->udc, keyBuf, bpt->keySize);
157
+	if (memcmp(key, keyBuf, bpt->keySize) < 0)
158
+	    break;
159
+	fileOffset = udcReadBits64(bpt->udc, isSwapped);
160
+	}
161
+    return rFind(bpt, fileOffset, key, val);
162
+    }
163
+}
164
+
165
+void rTraverse(struct bptFile *bpt, bits64 blockStart, void *context, 
166
+    void (*callback)(void *context, void *key, int keySize, void *val, int valSize) )
167
+/* Recursively go across tree, calling callback at leaves. */
168
+{
169
+/* Seek to start of block. */
170
+udcSeek(bpt->udc, blockStart);
171
+
172
+/* Read block header. */
173
+UBYTE isLeaf;
174
+UBYTE reserved;
175
+bits16 i, childCount;
176
+udcMustReadOne(bpt->udc, isLeaf);
177
+udcMustReadOne(bpt->udc, reserved);
178
+boolean isSwapped = bpt->isSwapped;
179
+childCount = udcReadBits16(bpt->udc, isSwapped);
180
+
181
+char keyBuf[bpt->keySize], valBuf[bpt->valSize];
182
+if (isLeaf)
183
+    {
184
+    for (i=0; i<childCount; ++i)
185
+        {
186
+	udcMustRead(bpt->udc, keyBuf, bpt->keySize);
187
+	udcMustRead(bpt->udc, valBuf, bpt->valSize);
188
+	callback(context, keyBuf, bpt->keySize, valBuf, bpt->valSize);
189
+	}
190
+    }
191
+else
192
+    {
193
+    bits64 fileOffsets[childCount];
194
+    /* Loop through to get file offsets of children. */
195
+    for (i=0; i<childCount; ++i)
196
+	{
197
+	udcMustRead(bpt->udc, keyBuf, bpt->keySize);
198
+	fileOffsets[i] = udcReadBits64(bpt->udc, isSwapped);
199
+	}
200
+    /* Loop through recursing on child offsets. */
201
+    for (i=0; i<childCount; ++i)
202
+	rTraverse(bpt, fileOffsets[i], context, callback);
203
+    }
204
+}
205
+
206
+boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize)
207
+/* Find value associated with key.  Return TRUE if it's found. 
208
+*  Parameters:
209
+*     bpt - file handle returned by bptFileOpen
210
+*     key - pointer to key string, which needs to be bpt->keySize long
211
+*     val - pointer to where to put retrieved value
212
+*/
213
+{
214
+/* Check key size vs. file key size, and act appropriately.  If need be copy key to a local
215
+ * buffer and zero-extend it. */
216
+if (keySize > bpt->keySize)
217
+    return FALSE;
218
+char keyBuf[keySize];
219
+if (keySize != bpt->keySize)
220
+    {
221
+    memcpy(keyBuf, key, keySize);
222
+    memset(keyBuf+keySize, 0, bpt->keySize - keySize);
223
+    key = keyBuf;
224
+    }
225
+
226
+/* Make sure the valSize matches what's in file. */
227
+if (valSize != bpt->valSize)
228
+    errAbort("Value size mismatch between bptFileFind (valSize=%d) and %s (valSize=%d)",
229
+    	valSize, bpt->fileName, bpt->valSize);
230
+
231
+/* Call recursive finder. */
232
+return rFind(bpt, bpt->rootOffset, key, val);
233
+}
234
+
235
+void bptFileTraverse(struct bptFile *bpt, void *context,
236
+    void (*callback)(void *context, void *key, int keySize, void *val, int valSize) )
237
+/* Traverse bPlusTree on file, calling supplied callback function at each
238
+ * leaf item. */
239
+{
240
+return rTraverse(bpt, bpt->rootOffset, context, callback);
241
+}
242
+
243
+
244
+/* This section of code deals with making balanced b+ trees given a sorted array as input.
245
+ * The difficult part is mostly just calculating the offsets of various things.  As an example
246
+ * if you had the sorted array:
247
+ *   01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
248
+ * and wanted to create a tree of block size 4, the resulting tree would have three levels
249
+ * as so:
250
+ *  01 17
251
+ *  01 05 09 13   17 21 25
252
+ *  01 02 03 04   05 06 07 08  09 10 11 12   13 14 15 16   17 18 19 20   21 22 23 24  25 26 27
253
+ */
254
+
255
+static int xToY(int x, unsigned y)
256
+/* Return x to the Y power, with y usually small. */
257
+{
258
+int i, val = 1;
259
+for (i=0; i<y; ++i)
260
+    val *= x;
261
+return val;
262
+}
263
+
264
+static int bptCountLevels(int maxBlockSize, int itemCount)
265
+/* Count up number of levels needed in tree of given maximum block size. */
266
+{
267
+int levels = 1;
268
+while (itemCount > maxBlockSize)
269
+    {
270
+    itemCount = (itemCount + maxBlockSize - 1)  / maxBlockSize;
271
+    levels += 1;
272
+    }
273
+return levels;
274
+}
275
+
276
+
277
+static bits32 writeIndexLevel(bits16 blockSize, 
278
+	void *itemArray, int itemSize, int itemCount, 
279
+	bits32 indexOffset, int level, 
280
+	void (*fetchKey)(const void *va, char *keyBuf), bits32 keySize, bits32 valSize,
281
+	FILE *f)
282
+/* Write out a non-leaf level. */
283
+{
284
+char *items = itemArray;
285
+
286
+/* Calculate number of nodes to write at this level. */
287
+int slotSizePer = xToY(blockSize, level);   // Number of items per slot in node
288
+int nodeSizePer = slotSizePer * blockSize;  // Number of items per node
289
+int nodeCount = (itemCount + nodeSizePer - 1)/nodeSizePer;	
290
+
291
+/* Calculate sizes and offsets. */
292
+int bytesInIndexBlock = (bptBlockHeaderSize + blockSize * (keySize+sizeof(bits64)));
293
+int bytesInLeafBlock = (bptBlockHeaderSize + blockSize * (keySize+valSize));
294
+bits64 bytesInNextLevelBlock = (level == 1 ? bytesInLeafBlock : bytesInIndexBlock);
295
+bits64 levelSize = nodeCount * bytesInIndexBlock;
296
+bits64 endLevel = indexOffset + levelSize;
297
+bits64 nextChild = endLevel;
298
+
299
+UBYTE isLeaf = FALSE;
300
+UBYTE reserved = 0;
301
+
302
+int i,j;
303
+char keyBuf[keySize+1];
304
+keyBuf[keySize] = 0;
305
+for (i=0; i<itemCount; i += nodeSizePer)
306
+    {
307
+    /* Calculate size of this block */
308
+    bits16 countOne = (itemCount - i + slotSizePer - 1)/slotSizePer;
309
+    if (countOne > blockSize)
310
+        countOne = blockSize;
311
+
312
+    /* Write block header. */
313
+    writeOne(f, isLeaf);
314
+    writeOne(f, reserved);
315
+    writeOne(f, countOne);
316
+
317
+    /* Write out the slots that are used one by one, and do sanity check. */
318
+    int slotsUsed = 0;
319
+    int endIx = i + nodeSizePer;
320
+    if (endIx > itemCount)
321
+        endIx = itemCount;
322
+    for (j=i; j<endIx; j += slotSizePer)
323
+        {
324
+	void *item = items + j*itemSize;
325
+	memset(keyBuf, 0, keySize);
326
+	(*fetchKey)(item, keyBuf);
327
+	mustWrite(f, keyBuf, keySize);
328
+	writeOne(f, nextChild);
329
+	nextChild += bytesInNextLevelBlock;
330
+	++slotsUsed;
331
+	}
332
+    assert(slotsUsed == countOne);
333
+
334
+    /* Write out empty slots as all zero. */
335
+    int slotSize = keySize + sizeof(bits64);
336
+    for (j=countOne; j<blockSize; ++j)
337
+	repeatCharOut(f, 0, slotSize);
338
+    }
339
+return endLevel;
340
+}
341
+
342
+static void writeLeafLevel(bits16 blockSize, void *itemArray, int itemSize, int itemCount, 
343
+	void (*fetchKey)(const void *va, char *keyBuf), bits32 keySize,
344
+	void* (*fetchVal)(const void *va), bits32 valSize,
345
+	FILE *f)
346
+/* Write out leaf level blocks. */
347
+{
348
+char *items = itemArray;
349
+int i,j;
350
+UBYTE isLeaf = TRUE;
351
+UBYTE reserved = 0;
352
+bits16 countOne;
353
+int countLeft = itemCount;
354
+char keyBuf[keySize+1];
355
+keyBuf[keySize] = 0;
356
+for (i=0; i<itemCount; i += countOne)
357
+    {
358
+    /* Write block header */
359
+    if (countLeft > blockSize)
360
+        countOne = blockSize;
361
+    else
362
+        countOne = countLeft;
363
+    writeOne(f, isLeaf);
364
+    writeOne(f, reserved);
365
+    writeOne(f, countOne);
366
+
367
+    /* Write out position in genome and in file for each item. */
368
+    for (j=0; j<countOne; ++j)
369
+        {
370
+	assert(i+j < itemCount);
371
+	void *item = items + (i+j)*itemSize;
372
+	memset(keyBuf, 0, keySize);
373
+	(*fetchKey)(item, keyBuf);
374
+	mustWrite(f, keyBuf, keySize);
375
+	mustWrite(f, (*fetchVal)(item), valSize);
376
+	}
377
+    
378
+    /* Pad out any unused bits of last block with zeroes. */
379
+    int slotSize = keySize + valSize;
380
+    for (j=countOne; j<blockSize; ++j)
381
+	repeatCharOut(f, 0, slotSize);
382
+
383
+    countLeft -= countOne;
384
+    }
385
+}
386
+
387
+void bptFileBulkIndexToOpenFile(void *itemArray, int itemSize, bits64 itemCount, bits32 blockSize,
388
+	void (*fetchKey)(const void *va, char *keyBuf), bits32 keySize,
389
+	void* (*fetchVal)(const void *va), bits32 valSize, FILE *f)
390
+/* Create a b+ tree index from a sorted array, writing output starting at current position
391
+ * of an already open file.  See bptFileCreate for explanation of parameters. */
392
+{
393
+bits32 magic = bptSig;
394
+bits32 reserved = 0;
395
+writeOne(f, magic);
396
+writeOne(f, blockSize);
397
+writeOne(f, keySize);
398
+writeOne(f, valSize);
399
+writeOne(f, itemCount);
400
+writeOne(f, reserved);
401
+writeOne(f, reserved);
402
+bits64 indexOffset = ftell(f);
403
+
404
+/* Write non-leaf nodes. */
405
+int levels = bptCountLevels(blockSize, itemCount);
406
+int i;
407
+for (i=levels-1; i > 0; --i)
408
+    {
409
+    bits32 endLevelOffset = writeIndexLevel(blockSize, itemArray, itemSize, itemCount, indexOffset, 
410
+    	i, fetchKey, keySize, valSize, f);
411
+    indexOffset = ftell(f);
412
+    if (endLevelOffset != indexOffset)
413
+        internalErr();
414
+    }
415
+
416
+/* Write leaf nodes */
417
+writeLeafLevel(blockSize, itemArray, itemSize, itemCount, 
418
+	fetchKey, keySize, fetchVal, valSize, f);
419
+}
420
+
421
+void bptFileCreate(
422
+	void *itemArray, 	/* Sorted array of things to index. */
423
+	int itemSize, 		/* Size of each element in array. */
424
+	bits64 itemCount, 	/* Number of elements in array. */
425
+	bits32 blockSize,	/* B+ tree block size - # of children for each node. */
426
+	void (*fetchKey)(const void *va, char *keyBuf),  /* Given item, copy key to keyBuf */ 
427
+	bits32 keySize,					 /* Size of key */
428
+	void* (*fetchVal)(const void *va), 		 /* Given item, return pointer to value */
429
+	bits32 valSize, 				 /* Size of value */
430
+	char *fileName)                                  /* Name of output file. */
431
+/* Create a b+ tree index file from a sorted array. */
432
+
433
+{
434
+/* Open file and write header. */
435
+FILE *f = mustOpen(fileName, "wb");
436
+bptFileBulkIndexToOpenFile(itemArray, itemSize, itemCount, blockSize, fetchKey, keySize, 
437
+	fetchVal, valSize, f);
438
+carefulClose(&f);
439
+}
440
+
0 441
new file mode 100644
... ...
@@ -0,0 +1,114 @@
1
+/* bptFile - B+ Trees.  These are a method of indexing data similar to binary trees, but 
2
+ * with many children rather than just two at each node. They work well when stored on disk,
3
+ * since typically only two or three disk accesses are needed to locate any particular
4
+ * piece of data.  This implementation is *just* for disk based storage.  For memory
5
+ * use the rbTree instead. Currently the implementation is just useful for data warehouse
6
+ * type applications.  That is it implements a function to create a b+ tree from bulk data
7
+ * (bptFileCreate) and a function to lookup a value given a key (bptFileFind) but not functions
8
+ * to add or delete individual items.
9
+ *
10
+ *
11
+ * The layout of the file on disk is:
12
+ *    header
13
+ *    root node
14
+ *    (other nodes)
15
+ * In general when the tree is first built the higher level nodes are stored before the
16
+ * lower level nodes.  It is possible if a b+ tree is dynamically updated for this to
17
+ * no longer be strictly true, but actually currently the b+ tree code here doesn't implement
18
+ * dynamic updates - it just creates a b+ tree from a sorted list.
19
+ *
20
+ * Each node can be one of two types - index or leaf.  The index nodes contain pointers
21
+ * to child nodes.  The leaf nodes contain the actual data. 
22
+ *
23
+ * The layout of the file header is:
24
+ *       <magic number>  4 bytes - The value bptSig (0x78CA8C91)
25
+ *       <block size>    4 bytes - Number of children per block (not byte size of block)
26
+ *       <key size>      4 bytes - Number of significant bytes in key
27
+ *       <val size>      4 bytes - Number of bytes in value
28
+ *       <item count>    8 bytes - Number of items in index
29
+ *       <reserved2>     4 bytes - Always 0 for now
30
+ *       <reserved3>     4 bytes - Always 0 for now
31
+ * The magic number may be byte-swapped, in which case all numbers in the file
32
+ * need to be byte-swapped. 
33
+ *
34
+ * The nodes start with a header:
35
+ *       <is leaf>       1 byte  - 1 for leaf nodes, 0 for index nodes.
36
+ *       <reserved>      1 byte  - Always 0 for now.
37
+ *       <count>         2 bytes - The number of children/items in node
38
+ * This is followed by count items.  For the index nodes the items are
39
+ *       <key>           key size bytes - always written most significant byte first
40
+ *       <offset>        8 bytes - Offset of child node in index file.
41
+ * For leaf nodes the items are
42
+ *       <key>           key size bytes - always written most significant byte first
43
+ *       <val>           val sized bytes - the value associated with the key.
44
+ * Note in general the leaf nodes may not be the same size as the index nodes, though in
45
+ * the important case where the values are file offsets they will be.
46
+ */
47
+
48
+#ifndef BPLUSTREE_H
49
+#define BPLUSTREE_H
50
+
51
+struct bptFile
52
+/* B+ tree index file handle. */
53
+    {
54
+    struct bptFile *next;	/* Next in list of index files if any. */
55
+    char *fileName;		/* Name of file - for error reporting. */
56
+    struct udcFile *udc;			/* Open file pointer. */
57
+    bits32 blockSize;		/* Size of block. */
58
+    bits32 keySize;		/* Size of keys. */
59
+    bits32 valSize;		/* Size of values. */
60
+    bits64 itemCount;		/* Number of items indexed. */
61
+    boolean isSwapped;		/* If TRUE need to byte swap everything. */
62
+    bits64 rootOffset;		/* Offset of root block. */
63
+    };
64
+
65
+struct bptFile *bptFileOpen(char *fileName);
66
+/* Open up index file - reading header and verifying things. */
67
+
68
+void bptFileClose(struct bptFile **pBpt);
69
+/* Close down and deallocate index file. */
70
+
71
+struct bptFile *bptFileAttach(char *fileName, struct udcFile *udc);
72
+/* Open up index file on previously open file, with header at current file position. */
73
+
74
+void bptFileDetach(struct bptFile **pBpt);
75
+/* Detach and free up bptFile opened with bptFileAttach. */
76
+
77
+boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize);
78
+/* Find value associated with key.  Return TRUE if it's found. 
79
+*  Parameters:
80
+*     bpt - file handle returned by bptFileOpen
81
+*     key - pointer to key string
82
+*     keySize - size of key.  Normally just strlen(key)
83
+*     val - pointer to where to put retrieved value
84
+*     valSize - size of memory buffer that will hold val.  Should match bpt->valSize.
85
+*/
86
+
87
+void bptFileTraverse(struct bptFile *bpt, void *context,
88
+    void (*callback)(void *context, void *key, int keySize, void *val, int valSize) );
89
+/* Traverse bPlusTree on file, calling supplied callback function at each
90
+ * leaf item. */
91
+
92
+
93
+void bptFileCreate(
94
+	void *itemArray, 	/* Sorted array of things to index. */
95
+	int itemSize, 		/* Size of each element in array. */
96
+	bits64 itemCount, 	/* Number of elements in array. */
97
+	bits32 blockSize,	/* B+ tree block size - # of children for each node. */
98
+	void (*fetchKey)(const void *va, char *keyBuf),  /* Given item, copy key to keyBuf */ 
99
+	bits32 keySize,					 /* Size of key */
100
+	void* (*fetchVal)(const void *va), 		 /* Given item, return pointer to value */
101
+	bits32 valSize, 				 /* Size of value */
102
+	char *fileName);                                 /* Name of output file. */
103
+/* Create a b+ tree index file from a sorted array. */
104
+
105
+void bptFileBulkIndexToOpenFile(void *itemArray, int itemSize, bits64 itemCount, bits32 blockSize,
106
+	void (*fetchKey)(const void *va, char *keyBuf), bits32 keySize,
107
+	void* (*fetchVal)(const void *va), bits32 valSize, FILE *f);
108
+/* Create a b+ tree index from a sorted array, writing output starting at current position
109
+ * of an already open file.  See bptFileCreate for explanation of parameters. */
110
+
111
+#define bptFileHeaderSize 32
112
+#define bptBlockHeaderSize 4
113
+
114
+#endif /* BPLUSTREE_H */
0 115
new file mode 100644
... ...
@@ -0,0 +1,130 @@
1
+#include "common.h"
2
+#include "base64.h"
3
+
4
+static char const rcsid[] = "$Id: base64.c,v 1.6 2008/09/17 18:00:47 galt Exp $";
5
+
6
+char *base64Encode(char *input, size_t inplen)
7
+/* Use base64 to encode a string.  Returns one long encoded
8
+ * string which need to be freeMem'd. Note: big-endian algorithm.
9
+ * For some applications you may need to break the base64 output
10
+ * of this function into lines no longer than 76 chars.
11
+ */
12
+{
13
+char b64[] = B64CHARS;
14
+int words = (inplen+2)/3;
15
+int remains = inplen % 3;
16
+char *result = (char *)needMem(4*words+1);
17
+size_t i=0, j=0;
18
+int word = 0;
19
+unsigned char *p = (unsigned char*) input;  
20
+/* p must be unsigned char*,  because without "unsigned",
21
+sign extend messes up last group outputted
22
+when the value of the chars following last in input
23
+happens to be char 0x80 or higher */
24
+for(i=1; i<=words; i++)
25
+    {
26
+    word = 0;
27
+    word |= *p++;
28
+    word <<= 8;
29
+    word |= *p++;
30
+    word <<= 8;
31
+    word |= *p++;
32
+    if (i==words && remains>0)
33
+	{
34
+	word &= 0x00FFFF00;
35
+    	if (remains==1)
36
+    	    word &= 0x00FF0000;
37
+	}
38
+    result[j++]=b64[word >> 18 & 0x3F];
39
+    result[j++]=b64[word >> 12 & 0x3F];
40
+    result[j++]=b64[word >> 6 & 0x3F];
41
+    result[j++]=b64[word & 0x3F];
42
+    }
43
+result[j] = 0;
44
+if (remains >0) result[j-1] = '=';    
45
+if (remains==1) result[j-2] = '=';    
46
+return result;
47
+}
48
+
49
+
50
+boolean base64Validate(char *input)
51
+/* Return true if input is valid base64.
52
+ * Note that the input string is changed by 
53
+ * eraseWhiteSpace(). */
54
+{
55
+size_t i = 0, l = 0;
56
+char *p = input;
57
+boolean validB64 = TRUE;
58
+
59
+/* remove whitespace which is unnecessary and  */
60
+eraseWhiteSpace(input);  
61
+
62
+l = strlen(p);
63
+for(i=0;i<l;i++)
64
+    {
65
+    char c = ' ';
66
+    if (!strchr(B64CHARS,c=*p++))
67
+        {
68
+        if (c != '=')
69
+            {
70
+            validB64 = FALSE;
71
+            break;
72
+            }
73
+        }
74
+    }
75
+if (l%4)
76
+    validB64 = FALSE;
77
+return validB64;
78
+}
79
+
80
+char *base64Decode(char *input, size_t *returnSize)
81
+/* Use base64 to decode a string.  Return decoded
82
+ * string which will be freeMem'd. Note: big-endian algorithm.
83
+ * Call eraseWhiteSpace() and check for invalid input 
84
+ * before passing in input if needed.  
85
+ * Optionally set return size for use with binary data.
86
+ */
87
+{
88
+static int *map=NULL;
89
+char b64[] = B64CHARS;
90
+size_t inplen = strlen(input);
91
+int words = (inplen+3)/4;
92
+char *result = (char *)needMem(3*words+1);
93
+size_t i=0, j=0;
94
+int word = 0;
95
+char *p = input;
96
+
97
+if (!map)
98
+    {
99
+    int i = 0;
100
+    map = needMem(256*sizeof(int));
101
+    for (i = 0; i < 256; ++i)
102
+	{
103
+	map[i]=0;
104
+	}
105
+    for (i = 0; i < 64; ++i)
106
+	{
107
+	map[(int)b64[i]]=i;
108
+	}
109
+    }
110
+for(i=0; i<words; i++)
111
+    {
112
+    word = 0;
113
+    word |= map[(int)*p++];
114
+    word <<= 6;
115
+    word |= map[(int)*p++];
116
+    word <<= 6;
117
+    word |= map[(int)*p++];
118
+    word <<= 6;
119
+    word |= map[(int)*p++];
120
+    result[j++]=word >> 16 & 0xFF;
121
+    result[j++]=word >> 8 & 0xFF;
122
+    result[j++]=word & 0xFF;
123
+    }
124
+result[j] = 0;
125
+if (returnSize)
126
+    *returnSize = j;
127
+     
128
+return result;
129
+}
130
+
0 131
new file mode 100644
... ...
@@ -0,0 +1,30 @@
1
+/* Base64 encoding and decoding.
2
+ * by Galt Barber */
3
+
4
+#ifndef BASE64_H
5
+#define BASE64_H
6
+
7
+#define B64CHARS "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
8
+
9
+char *base64Encode(char *input, size_t inplen);
10
+/* Use base64 to encode a string.  Returns one long encoded
11
+ * string which need to be freeMem'd. Note: big-endian algorithm.
12
+ * For some applications you may need to break the base64 output
13
+ * of this function into lines no longer than 76 chars.
14
+ */
15
+
16
+boolean base64Validate(char *input);
17
+/* Return true if input is valid base64.
18
+ * Note that the input string is changed by 
19
+ * eraseWhiteSpace(). */
20
+
21
+char *base64Decode(char *input, size_t *returnSize);
22
+/* Use base64 to decode a string.  Return decoded
23
+ * string which will be freeMem'd. Note: big-endian algorithm.
24
+ * Call eraseWhiteSpace() and check for invalid input 
25
+ * before passing in input if needed.  
26
+ * Optionally set retun size for use with binary data.
27
+ */
28
+
29
+#endif /* BASE64_H */
30
+
0 31
new file mode 100644
... ...
@@ -0,0 +1,352 @@
1
+/* bbiFile - Big Binary Indexed file.  Stuff that's common between bigWig and bigBed. */
2
+
3
+#ifndef BBIFILE_H
4
+#define BBIFILE_H
5
+
6
+#include "cirTree.h"
7
+
8
+/* bigWig/bigBed file structure:
9
+ *     fixedWidthHeader
10
+ *         magic# 		4 bytes
11
+ *         version              2 bytes
12
+ *	   zoomLevels		2 bytes
13
+ *         chromosomeTreeOffset	8 bytes
14
+ *         fullDataOffset	8 bytes
15
+ *	   fullIndexOffset	8 bytes
16
+ *         fieldCount           2 bytes (for bigWig 0)
17
+ *         definedFieldCount    2 bytes (for bigWig 0)
18
+ *         autoSqlOffset        8 bytes (for bigWig 0) (0 if no autoSql information)
19
+ *         totalSummaryOffset   8 bytes (0 in earlier versions of file lacking totalSummary)
20
+ *         uncompressBufSize    4 bytes (Size of uncompression buffer.  0 if uncompressed.)
21
+ *         reserved             8 bytes (0 for now)
22
+ *     zoomHeaders		there are zoomLevels number of these
23
+ *         reductionLevel	4 bytes
24
+ *	   reserved		4 bytes
25
+ *	   dataOffset		8 bytes
26
+ *         indexOffset          8 bytes
27
+ *     autoSql string (zero terminated - only present if autoSqlOffset non-zero)
28
+ *     totalSummary - summary of all data in file - only present if totalSummaryOffset non-zero
29
+ *         basesCovered        8 bytes
30
+ *         minVal              8 bytes float (for bigBed minimum depth of coverage)
31
+ *         maxVal              8 bytes float (for bigBed maximum depth of coverage)
32
+ *         sumData             8 bytes float (for bigBed sum of coverage)
33
+ *         sumSquared          8 bytes float (for bigBed sum of coverage squared)
34
+ *     chromosome b+ tree       bPlusTree index
35
+ *     full data
36
+ *         sectionCount		8 bytes (item count for bigBeds)
37
+ *         section data		section count sections, of three types (bed data for bigBeds)
38
+ *     full index               cirTree index
39
+ *     zoom info             one of these for each zoom level
40
+ *         zoom data
41
+ *             zoomCount	4 bytes
42
+ *             zoom data	there are zoomCount of these items
43
+ *                 chromId	4 bytes
44
+ *	           chromStart	4 bytes
45
+ *                 chromEnd     4 bytes
46
+ *                 validCount	4 bytes
47
+ *                 minVal       4 bytes float 
48
+ *                 maxVal       4 bytes float
49
+ *                 sumData      4 bytes float
50
+ *                 sumSquares   4 bytes float