Browse code

update Kent library

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/rtracklayer@98880 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 29/01/2015 23:07:25
Showing 62 changed files

... ...
@@ -4,7 +4,7 @@ PKG_OBJECTS = \
4 4
   
5 5
 UCSC_OBJECTS = \
6 6
   bPlusTree.o bbiRead.o bbiWrite.o bwgCreate.o bwgQuery.o \
7
-  cirTree.o common.o dnaseq.o dnautil.o errabort.o hash.o linefile.o localmem.o\
7
+  cirTree.o common.o dnaseq.o dnautil.o errAbort.o hash.o linefile.o localmem.o\
8 8
   sqlNum.o zlibFace.o dystring.o hmmstats.o obscure.o pipeline.o \
9 9
   rangeTree.o rbTree.o memalloc.o dlist.o udc.o net.o bits.o twoBit.o \
10 10
   _cheapcgi.o internet.o https.o base64.o verbose.o os.o wildcmp.o _portimpl.o
... ...
@@ -5,7 +5,6 @@
5 5
 #include "ucsc/bbiFile.h"
6 6
 #include "ucsc/bigWig.h"
7 7
 #include "ucsc/bwgInternal.h"
8
-#include "ucsc/_bwgInternal.h"
9 8
 
10 9
 #include "bigWig.h"
11 10
 #include "handlers.h"
... ...
@@ -190,6 +189,7 @@ SEXP BWGSectionList_write(SEXP r_sections, SEXP r_seqlengths, SEXP r_compress,
190 189
   }
191 190
   pushRHandlers();
192 191
   bwgCreate(sections, lenHash, blockSize, itemsPerSlot, asLogical(r_compress),
192
+            FALSE /*keepAllChromosomes*/, TRUE /*fixedSummaries*/,
193 193
             (char *)CHAR(asChar(r_file)));
194 194
   freeHash(&lenHash);
195 195
   popRHandlers();
... ...
@@ -385,7 +385,7 @@ SEXP BWGFile_fromWIG(SEXP r_infile, SEXP r_seqlengths, SEXP r_outfile) {
385 385
   struct bwgSection *sections =
386 386
     bwgParseWig((char *)CHAR(asChar(r_infile)), FALSE, lenHash, itemsPerSlot,
387 387
                 lm);
388
-  bwgCreate(sections, lenHash, blockSize, itemsPerSlot, TRUE,
388
+  bwgCreate(sections, lenHash, blockSize, itemsPerSlot, TRUE, TRUE, TRUE,
389 389
             (char *)CHAR(asChar(r_outfile)));
390 390
   lmCleanup(&lm);
391 391
   freeHash(&lenHash);
... ...
@@ -1,5 +1,5 @@
1 1
 #include "ucsc/common.h"
2
-#include "ucsc/errabort.h"
2
+#include "ucsc/errAbort.h"
3 3
 
4 4
 #include "handlers.h"
5 5
 
... ...
@@ -1,3 +1,6 @@
1
+/* Copyright (C) 2013 The Regents of the University of California 
2
+ * See README in this or parent directory for licensing information. */
3
+
1 4
 /* bptFile - B+ Trees.  These are a method of indexing data similar to binary trees, but 
2 5
  * with many children rather than just two at each node. They work well when stored on disk,
3 6
  * since typically only two or three disk accesses are needed to locate any particular
... ...
@@ -162,7 +165,71 @@ else
162 165
     }
163 166
 }
164 167
 
165
-void rTraverse(struct bptFile *bpt, bits64 blockStart, void *context, 
168
+static void rFindMulti(struct bptFile *bpt, bits64 blockStart, void *key, struct slRef **pList)
169
+/* Find values corresponding to key and add them to pList.  You'll need to 
170
+ * Do a slRefFreeListAndVals() on the list when done. */
171
+{
172
+/* Seek to start of block. */
173
+udcSeek(bpt->udc, blockStart);
174
+
175
+/* Read block header. */
176
+UBYTE isLeaf;
177
+UBYTE reserved;
178
+bits16 i, childCount;
179
+udcMustReadOne(bpt->udc, isLeaf);
180
+udcMustReadOne(bpt->udc, reserved);
181
+boolean isSwapped = bpt->isSwapped;
182
+childCount = udcReadBits16(bpt->udc, isSwapped);
183
+
184
+int keySize = bpt->keySize;
185
+UBYTE keyBuf[keySize];   /* Place to put a key, buffered on stack. */
186
+UBYTE valBuf[bpt->valSize];   /* Place to put a value, buffered on stack. */
187
+
188
+if (isLeaf)
189
+    {
190
+    for (i=0; i<childCount; ++i)
191
+        {
192
+	udcMustRead(bpt->udc, keyBuf, keySize);
193
+	udcMustRead(bpt->udc, valBuf, bpt->valSize);
194
+	if (memcmp(key, keyBuf, keySize) == 0)
195
+	    {
196
+	    void *val = cloneMem(valBuf, bpt->valSize);
197
+	    refAdd(pList, val);
198
+	    }
199
+	}
200
+    }
201
+else
202
+    {
203
+    /* Read first key and first file offset. */
204
+    udcMustRead(bpt->udc, keyBuf, keySize);
205
+    bits64 lastFileOffset = udcReadBits64(bpt->udc, isSwapped);
206
+    bits64 fileOffset = lastFileOffset;
207
+    int lastCmp = memcmp(key, keyBuf, keySize);
208
+
209
+    /* Loop through remainder. */
210
+    for (i=1; i<childCount; ++i)
211
+	{
212
+	udcMustRead(bpt->udc, keyBuf, keySize);
213
+	fileOffset = udcReadBits64(bpt->udc, isSwapped);
214
+	int cmp = memcmp(key, keyBuf, keySize);
215
+	if (lastCmp >= 0 && cmp <= 0)
216
+	    {
217
+	    bits64 curPos = udcTell(bpt->udc);
218
+	    rFindMulti(bpt, lastFileOffset, key, pList);
219
+	    udcSeek(bpt->udc, curPos);
220
+	    }
221
+	if (cmp < 0)
222
+	    return;
223
+	lastCmp = cmp;
224
+	lastFileOffset = fileOffset;
225
+	}
226
+    /* If made it all the way to end, do last one too. */
227
+    rFindMulti(bpt, fileOffset, key, pList);
228
+    }
229
+}
230
+
231
+
232
+static void rTraverse(struct bptFile *bpt, bits64 blockStart, void *context, 
166 233
     void (*callback)(void *context, void *key, int keySize, void *val, int valSize) )
167 234
 /* Recursively go across tree, calling callback at leaves. */
168 235
 {
... ...
@@ -203,19 +270,83 @@ else
203 270
     }
204 271
 }
205 272
 
206
-boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize)
207
-/* Find value associated with key.  Return TRUE if it's found. 
208
-*  Parameters:
209
-*     bpt - file handle returned by bptFileOpen
210
-*     key - pointer to key string, which needs to be bpt->keySize long
211
-*     val - pointer to where to put retrieved value
212
-*/
273
+static bits64 bptDataStart(struct bptFile *bpt)
274
+/* Return offset of first bit of data (as opposed to index) in file.  In hind sight I wish
275
+ * this were stored in the header, but fortunately it's not that hard to compute. */
276
+{
277
+bits64 offset = bpt->rootOffset;
278
+for (;;)
279
+    {
280
+    /* Seek to block start */
281
+    udcSeek(bpt->udc, offset);
282
+
283
+    /* Read block header,  break if we are leaf. */
284
+    UBYTE isLeaf;
285
+    UBYTE reserved;
286
+    bits16 childCount;
287
+    udcMustReadOne(bpt->udc, isLeaf);
288
+    if (isLeaf)
289
+         break;
290
+    udcMustReadOne(bpt->udc, reserved);
291
+    boolean isSwapped = bpt->isSwapped;
292
+    childCount = udcReadBits16(bpt->udc, isSwapped);
293
+
294
+    /* Read and discard first key. */
295
+    char keyBuf[bpt->keySize];
296
+    udcMustRead(bpt->udc, keyBuf, bpt->keySize);
297
+
298
+    /* Get file offset of sub-block. */
299
+    offset = udcReadBits64(bpt->udc, isSwapped);
300
+    }
301
+return offset;
302
+}
303
+
304
+static bits64 bptDataOffset(struct bptFile *bpt, bits64 itemPos)
305
+/* Return position of file of data corresponding to given itemPos.  For first piece of
306
+ * data pass in 0. */
307
+{
308
+if (itemPos >= bpt->itemCount)
309
+    errAbort("Item index %lld greater than item count %lld in %s", 
310
+	itemPos, bpt->itemCount, bpt->fileName);
311
+bits64 blockPos = itemPos/bpt->blockSize;
312
+bits32 insidePos = itemPos - blockPos*bpt->blockSize;
313
+int blockHeaderSize = 4;
314
+bits64 itemByteSize = bpt->valSize + bpt->keySize;
315
+bits64 blockByteSize = bpt->blockSize * itemByteSize + blockHeaderSize;
316
+bits64 blockOffset = blockByteSize*blockPos + bptDataStart(bpt);
317
+bits64 itemOffset = blockOffset + blockHeaderSize + itemByteSize * insidePos;
318
+return itemOffset;
319
+}
320
+
321
+void bptKeyAtPos(struct bptFile *bpt, bits64 itemPos, void *result)
322
+/* Fill in result with the key at given itemPos.  For first piece of data itemPos is 0 
323
+ * Result must be at least bpt->keySize.  If result is a string it won't be zero terminated
324
+ * by this routine.  Use bptStringKeyAtPos instead. */
325
+{
326
+bits64 offset = bptDataOffset(bpt, itemPos);
327
+udcSeek(bpt->udc, offset);
328
+udcMustRead(bpt->udc, result, bpt->keySize);
329
+}
330
+
331
+void bptStringKeyAtPos(struct bptFile *bpt, bits64 itemPos, char *result, int maxResultSize)
332
+/* Fill in result with the key at given itemPos.  The maxResultSize should be 1+bpt->keySize
333
+ * to accommodate zero termination of string. */
334
+{
335
+assert(maxResultSize > bpt->keySize);
336
+bptKeyAtPos(bpt, itemPos, result);
337
+result[bpt->keySize] = 0;
338
+}
339
+
340
+static boolean bptFileFindMaybeMulti(struct bptFile *bpt, void *key, int keySize, int valSize,
341
+    boolean multi, void *singleVal, struct slRef **multiVal)
342
+/* Do either a single or multiple find depending in multi parameter.  Only one of singleVal
343
+ * or multiVal should be non-NULL, depending on the same parameter. */
213 344
 {
214 345
 /* Check key size vs. file key size, and act appropriately.  If need be copy key to a local
215 346
  * buffer and zero-extend it. */
216 347
 if (keySize > bpt->keySize)
217 348
     return FALSE;
218
-char keyBuf[keySize];
349
+char keyBuf[bpt->keySize];
219 350
 if (keySize != bpt->keySize)
220 351
     {
221 352
     memcpy(keyBuf, key, keySize);
... ...
@@ -228,8 +359,34 @@ if (valSize != bpt->valSize)
228 359
     errAbort("Value size mismatch between bptFileFind (valSize=%d) and %s (valSize=%d)",
229 360
     	valSize, bpt->fileName, bpt->valSize);
230 361
 
231
-/* Call recursive finder. */
232
-return rFind(bpt, bpt->rootOffset, key, val);
362
+if (multi)
363
+    {
364
+    rFindMulti(bpt, bpt->rootOffset, key, multiVal);
365
+    return *multiVal != NULL;
366
+    }
367
+else
368
+    return rFind(bpt, bpt->rootOffset, key, singleVal);
369
+}
370
+
371
+boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize)
372
+/* Find value associated with key.  Return TRUE if it's found. 
373
+*  Parameters:
374
+*     bpt - file handle returned by bptFileOpen
375
+*     key - pointer to key string, which needs to be bpt->keySize long
376
+*     val - pointer to where to put retrieved value
377
+*/
378
+{
379
+return bptFileFindMaybeMulti(bpt, key, keySize, valSize, FALSE, val, NULL);
380
+}
381
+
382
+struct slRef *bptFileFindMultiple(struct bptFile *bpt, void *key, int keySize, int valSize)
383
+/* Find all values associated with key.  Store this in ->val item of returned list. 
384
+ * Do a slRefFreeListAndVals() on list when done. */
385
+{
386
+struct slRef *list = NULL;
387
+bptFileFindMaybeMulti(bpt, key, keySize, valSize, TRUE, NULL, &list);
388
+slReverse(&list);
389
+return list;
233 390
 }
234 391
 
235 392
 void bptFileTraverse(struct bptFile *bpt, void *context,
... ...
@@ -237,7 +394,7 @@ void bptFileTraverse(struct bptFile *bpt, void *context,
237 394
 /* Traverse bPlusTree on file, calling supplied callback function at each
238 395
  * leaf item. */
239 396
 {
240
-  rTraverse(bpt, bpt->rootOffset, context, callback);
397
+return rTraverse(bpt, bpt->rootOffset, context, callback);
241 398
 }
242 399
 
243 400
 
... ...
@@ -252,10 +409,10 @@ void bptFileTraverse(struct bptFile *bpt, void *context,
252 409
  *  01 02 03 04   05 06 07 08  09 10 11 12   13 14 15 16   17 18 19 20   21 22 23 24  25 26 27
253 410
  */
254 411
 
255
-static int xToY(int x, unsigned y)
412
+static long xToY(int x, unsigned y)
256 413
 /* Return x to the Y power, with y usually small. */
257 414
 {
258
-int i, val = 1;
415
+long i, val = 1;
259 416
 for (i=0; i<y; ++i)
260 417
     val *= x;
261 418
 return val;
... ...
@@ -275,7 +432,7 @@ return levels;
275 432
 
276 433
 
277 434
 static bits32 writeIndexLevel(bits16 blockSize, 
278
-	void *itemArray, int itemSize, int itemCount, 
435
+	void *itemArray, int itemSize, long itemCount, 
279 436
 	bits32 indexOffset, int level, 
280 437
 	void (*fetchKey)(const void *va, char *keyBuf), bits32 keySize, bits32 valSize,
281 438
 	FILE *f)
... ...
@@ -284,39 +441,42 @@ static bits32 writeIndexLevel(bits16 blockSize,
284 441
 char *items = itemArray;
285 442
 
286 443
 /* Calculate number of nodes to write at this level. */
287
-int slotSizePer = xToY(blockSize, level);   // Number of items per slot in node
288
-int nodeSizePer = slotSizePer * blockSize;  // Number of items per node
289
-int nodeCount = (itemCount + nodeSizePer - 1)/nodeSizePer;	
444
+long slotSizePer = xToY(blockSize, level);   // Number of items per slot in node
445
+long nodeSizePer = slotSizePer * blockSize;  // Number of items per node
446
+long nodeCount = (itemCount + nodeSizePer - 1)/nodeSizePer;	
447
+
290 448
 
291 449
 /* Calculate sizes and offsets. */
292
-int bytesInIndexBlock = (bptBlockHeaderSize + blockSize * (keySize+sizeof(bits64)));
293
-int bytesInLeafBlock = (bptBlockHeaderSize + blockSize * (keySize+valSize));
450
+long bytesInIndexBlock = (bptBlockHeaderSize + blockSize * (keySize+sizeof(bits64)));
451
+long bytesInLeafBlock = (bptBlockHeaderSize + blockSize * (keySize+valSize));
294 452
 bits64 bytesInNextLevelBlock = (level == 1 ? bytesInLeafBlock : bytesInIndexBlock);
295 453
 bits64 levelSize = nodeCount * bytesInIndexBlock;
296 454
 bits64 endLevel = indexOffset + levelSize;
297 455
 bits64 nextChild = endLevel;
298 456
 
457
+
299 458
 UBYTE isLeaf = FALSE;
300 459
 UBYTE reserved = 0;
301 460
 
302
-int i,j;
461
+long i,j;
303 462
 char keyBuf[keySize+1];
304 463
 keyBuf[keySize] = 0;
305 464
 for (i=0; i<itemCount; i += nodeSizePer)
306 465
     {
307 466
     /* Calculate size of this block */
308
-    bits16 countOne = (itemCount - i + slotSizePer - 1)/slotSizePer;
467
+    long countOne = (itemCount - i + slotSizePer - 1)/slotSizePer;
309 468
     if (countOne > blockSize)
310 469
         countOne = blockSize;
470
+    bits16 shortCountOne = countOne;
311 471
 
312 472
     /* Write block header. */
313 473
     writeOne(f, isLeaf);
314 474
     writeOne(f, reserved);
315
-    writeOne(f, countOne);
475
+    writeOne(f, shortCountOne);
316 476
 
317 477
     /* Write out the slots that are used one by one, and do sanity check. */
318 478
     int slotsUsed = 0;
319
-    int endIx = i + nodeSizePer;
479
+    long endIx = i + nodeSizePer;
320 480
     if (endIx > itemCount)
321 481
         endIx = itemCount;
322 482
     for (j=i; j<endIx; j += slotSizePer)
... ...
@@ -329,7 +489,7 @@ for (i=0; i<itemCount; i += nodeSizePer)
329 489
 	nextChild += bytesInNextLevelBlock;
330 490
 	++slotsUsed;
331 491
 	}
332
-    assert(slotsUsed == countOne);
492
+    assert(slotsUsed == shortCountOne);
333 493
 
334 494
     /* Write out empty slots as all zero. */
335 495
     int slotSize = keySize + sizeof(bits64);
... ...
@@ -76,19 +76,32 @@ void bptFileDetach(struct bptFile **pBpt);
76 76
 
77 77
 boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize);
78 78
 /* Find value associated with key.  Return TRUE if it's found. 
79
-*  Parameters:
80
-*     bpt - file handle returned by bptFileOpen
81
-*     key - pointer to key string
82
-*     keySize - size of key.  Normally just strlen(key)
83
-*     val - pointer to where to put retrieved value
84
-*     valSize - size of memory buffer that will hold val.  Should match bpt->valSize.
85
-*/
79
+ *  Parameters:
80
+ *     bpt - file handle returned by bptFileOpen
81
+ *     key - pointer to key string
82
+ *     keySize - size of key.  Normally just strlen(key)
83
+ *     val - pointer to where to put retrieved value
84
+ *     valSize - size of memory buffer that will hold val.  Should match bpt->valSize.
85
+ */
86
+
87
+struct slRef *bptFileFindMultiple(struct bptFile *bpt, void *key, int keySize, int valSize);
88
+/* Find all values associated with key.  Store this in ->val item of returned list. 
89
+ * Do a slRefFreeListAndVals() on list when done. */
86 90
 
87 91
 void bptFileTraverse(struct bptFile *bpt, void *context,
88 92
     void (*callback)(void *context, void *key, int keySize, void *val, int valSize) );
89 93
 /* Traverse bPlusTree on file, calling supplied callback function at each
90 94
  * leaf item. */
91 95
 
96
+void bptKeyAtPos(struct bptFile *bpt, bits64 itemPos, void *result);
97
+/* Fill in result with the key at given itemPos.  For first piece of data itemPos is 0 
98
+ * and for last piece is bpt->itemCount - 1.  Result must be at least bpt->keySize.  
99
+ * If result is a string it won't be zero terminated
100
+ * by this routine.  Use bptStringKeyAtPos instead. */
101
+
102
+void bptStringKeyAtPos(struct bptFile *bpt, bits64 itemPos, char *result, int maxResultSize);
103
+/* Fill in result with the key at given itemPos.  The maxResultSize should be 1+bpt->keySize
104
+ * to accommodate zero termination of string. */
92 105
 
93 106
 void bptFileCreate(
94 107
 	void *itemArray, 	/* Sorted array of things to index. */
... ...
@@ -1,7 +1,9 @@
1
+/* Copyright (C) 2011 The Regents of the University of California 
2
+ * See README in this or parent directory for licensing information. */
3
+
1 4
 #include "common.h"
2 5
 #include "base64.h"
3 6
 
4
-static char const rcsid[] = "$Id: base64.c,v 1.6 2008/09/17 18:00:47 galt Exp $";
5 7
 
6 8
 char *base64Encode(char *input, size_t inplen)
7 9
 /* Use base64 to encode a string.  Returns one long encoded
... ...
@@ -4,6 +4,8 @@
4 4
 #define BBIFILE_H
5 5
 
6 6
 #include "cirTree.h"
7
+#include "linefile.h"
8
+#include "localmem.h"
7 9
 
8 10
 /* bigWig/bigBed file structure:
9 11
  *     fixedWidthHeader
... ...
@@ -18,7 +20,7 @@
18 20
  *         autoSqlOffset        8 bytes (for bigWig 0) (0 if no autoSql information)
19 21
  *         totalSummaryOffset   8 bytes (0 in earlier versions of file lacking totalSummary)
20 22
  *         uncompressBufSize    4 bytes (Size of uncompression buffer.  0 if uncompressed.)
21
- *         reserved             8 bytes (0 for now)
23
+ *         extensionOffset      8 bytes (Offset to header extension 0 if no such extension)
22 24
  *     zoomHeaders		there are zoomLevels number of these
23 25
  *         reductionLevel	4 bytes
24 26
  *	   reserved		4 bytes
... ...
@@ -31,6 +33,19 @@
31 33
  *         maxVal              8 bytes float (for bigBed maximum depth of coverage)
32 34
  *         sumData             8 bytes float (for bigBed sum of coverage)
33 35
  *         sumSquared          8 bytes float (for bigBed sum of coverage squared)
36
+ *     extendedHeader
37
+ *         extensionSize       2 size of extended header in bytes - currently 64
38
+ *         extraIndexCount     2 number of extra fields we will be indexing
39
+ *         extraIndexListOffset 8 Offset to list of non-chrom/start/end indexes
40
+ *         reserved            48 All zeroes for now
41
+ *     extraIndexList - one of these for each extraIndex 
42
+ *         type                2 Type of index.  Always 0 for bPlusTree now
43
+ *         fieldCount          2 Number of fields used in this index.  Always 1 for now
44
+ *         indexOffset         8 offset for this index in file
45
+ *         reserved            4 All zeroes for now
46
+ *         fieldList - one of these for each field being used in _this_ index
47
+ *            fieldId          2 index of field within record
48
+ *            reserved         2 All zeroes for now
34 49
  *     chromosome b+ tree       bPlusTree index
35 50
  *     full data
36 51
  *         sectionCount		8 bytes (item count for bigBeds)
... ...
@@ -49,6 +64,7 @@
49 64
  *                 sumData      4 bytes float
50 65
  *                 sumSquares   4 bytes float
51 66
  *         zoom index        	cirTree index
67
+ *     extraIndexes [optional]  bPlusTreeIndex for each extra field that is indexed
52 68
  *     magic# 		4 bytes - same as magic number at start of header
53 69
  */
54 70
 
... ...
@@ -102,10 +118,17 @@ struct bbiFile
102 118
     bits16 fieldCount;		/* Number of columns in bed version. */
103 119
     bits16 definedFieldCount;   /* Number of columns using bed standard definitions. */
104 120
     bits64 asOffset;		/* Offset to embedded null-terminated AutoSQL file. */
105
-    bits64 totalSummaryOffset;	/* Offset to total summary information if any.  (On older files have to calculate) */
121
+    bits64 totalSummaryOffset;	/* Offset to total summary information if any.  
122
+				   (On older files have to calculate) */
106 123
     bits32 uncompressBufSize;	/* Size of uncompression buffer, 0 if uncompressed */
124
+    bits64 extensionOffset;	/* Start of header extension block or 0 if none. */
107 125
     struct cirTreeFile *unzoomedCir;	/* Unzoomed data index in memory - may be NULL. */
108 126
     struct bbiZoomLevel *levelList;	/* List of zoom levels. */
127
+
128
+    /* Fields based on extension block. */
129
+    bits16 extensionSize;   /* Size of extension block */
130
+    bits16 extraIndexCount; /* Number of extra indexes (on fields other than chrom,start,end */ 
131
+    bits64 extraIndexListOffset;    /* Offset to list of extra indexes */
109 132
     };
110 133
 
111 134
 
... ...
@@ -150,6 +173,14 @@ void bbiChromInfoKey(const void *va, char *keyBuf);
150 173
 void *bbiChromInfoVal(const void *va);
151 174
 /* Get val field out of bbiChromInfo. */
152 175
 
176
+char *bbiCachedChromLookup(struct bbiFile *bbi, int chromId, int lastChromId,
177
+    char *chromBuf, int chromBufSize);
178
+/* Return chromosome name corresponding to chromId.  Because this is a bit expensive,
179
+ * if you are doing this repeatedly pass in the chromId used in the previous call to
180
+ * this in lastChromId,  which will save it from doing the lookup again on the same
181
+ * chromosome.  Pass in -1 to lastChromId if this is the first time or if you can't be
182
+ * bothered.  The chromBufSize should be at greater or equal to bbi->keySize+1.  */
183
+
153 184
 struct bbiChromUsage
154 185
 /* Information on how many items per chromosome etc.  Used by multipass bbiFile writers. */
155 186
     {
... ...
@@ -249,7 +280,7 @@ boolean bbiSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 e
249 280
 struct bbiSummaryElement bbiTotalSummary(struct bbiFile *bbi);
250 281
 /* Return summary of entire file! */
251 282
 
252
-/****** Write side of things - implemented in bbiWrite.c ********/
283
+/****** Write side of things - implemented in bbiWrite.c.  Few people need this. ********/
253 284
 
254 285
 struct bbiBoundsArray
255 286
 /* Minimum info needed for r-tree indexer - where a section lives on disk and the
... ...
@@ -287,7 +318,7 @@ void bbiSumOutStreamWrite(struct bbiSumOutStream *stream, struct bbiSummary *sum
287 318
 void bbiOutputOneSummaryFurtherReduce(struct bbiSummary *sum, 
288 319
 	struct bbiSummary **pTwiceReducedList, 
289 320
 	int doubleReductionSize, struct bbiBoundsArray **pBoundsPt, 
290
-	struct bbiBoundsArray *boundsEnd, bits32 chromSize, struct lm *lm, 
321
+	struct bbiBoundsArray *boundsEnd, struct lm *lm, 
291 322
 	struct bbiSumOutStream *stream);
292 323
 /* Write out sum to file, keeping track of minimal info on it in *pBoundsPt, and also adding
293 324
  * it to second level summary. */
... ...
@@ -296,8 +327,6 @@ struct bbiSummary *bbiSummarySimpleReduce(struct bbiSummary *list, int reduction
296 327
 /* Do a simple reduction - where among other things the reduction level is an integral
297 328
  * multiple of the previous reduction level, and the list is sorted. Allocate result out of lm. */
298 329
 
299
-#define bbiMaxZoomLevels 10	/* Maximum zoom levels produced by writers. */
300
-
301 330
 void bbiWriteDummyHeader(FILE *f);
302 331
 /* Write out all-zero header, just to reserve space for it. */
303 332
 
... ...
@@ -325,9 +354,69 @@ void bbiChromUsageFree(struct bbiChromUsage **pUsage);
325 354
 void bbiChromUsageFreeList(struct bbiChromUsage **pList);
326 355
 /* free a list of bbiChromUsage structures */
327 356
 
328
-struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, 
329
-	struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount);
330
-/* Go through bed file and collect chromosomes and statistics. Free with bbiChromUsageFreeList */
357
+struct bbNamedFileChunk 
358
+/* A name associated with an offset into a possibly large file.  Used for extra
359
+ * indexes in bigBed files. */
360
+    {
361
+    char *name;	    /* Name of chunk. */
362
+    bits64 offset;  /* Start in file. */
363
+    bits64 size;    /* Size in file. */
364
+    };
365
+
366
+struct bbExIndexMaker
367
+/* A helper structure to make indexes beyond primary one.  Just used for bigBeds */
368
+    {
369
+    bits16 indexCount;          /* Number of extra indexes. */
370
+        /* Kind of wish next four fields,  all of which are arrays indexed
371
+         * by the same thing,  were a single array of a structure instead. */
372
+    bits16 *indexFields;        /* array of field ids, one for each extra index. */
373
+    int *maxFieldSize;          /* array of maximum sizes seen for this field. */
374
+    struct bbNamedFileChunk **chunkArrayArray; /* where we keep name/start/size triples */
375
+    bits64 *fileOffsets;        /* array of file offsets where indexes starts. */
376
+    int recordCount;            /* number of records in file. */
377
+    };
378
+
379
+struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, 
380
+	struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount);
381
+/* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
382
+ * collect max field sizes there too. */
383
+
384
+#define bbiMaxZoomLevels 10	/* Max number of zoom levels */
385
+#define bbiResIncrement 4	/* Amount to reduce at each zoom level */
386
+
387
+int bbiCalcResScalesAndSizes(int aveSize, 
388
+    int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels]);
389
+/* Fill in resScales with amount to zoom at each level, and zero out resSizes based
390
+ * on average span. Returns the number of zoom levels we actually will use. */
391
+
392
+typedef struct bbiSummary *bbiWriteReducedOnceReturnReducedTwice(
393
+	struct bbiChromUsage *usageList, int fieldCount,
394
+	struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
395
+	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
396
+	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
397
+	struct bbiSummaryElement *totalSum);
398
+/* Typedef for a function that writes out data reduced by factor of initial reduction, and
399
+ * also returns an array of bbiSummaries for the next reduction level. */
400
+
401
+int bbiWriteZoomLevels(
402
+    struct lineFile *lf,    /* Input file. */
403
+    FILE *f,		    /* Output. */
404
+    int blockSize,	    /* Size of index block */
405
+    int itemsPerSlot,	    /* Number of data points bundled at lowest level. */
406
+    bbiWriteReducedOnceReturnReducedTwice writeReducedOnceReturnReducedTwice,   /* callback */
407
+    int fieldCount,	    /* Number of fields in bed (4 for bedGraph) */
408
+    boolean doCompress,	    /* Do we compress.  Answer really should be yes! */
409
+    bits64 dataSize,	    /* Size of data on disk (after compression if any). */
410
+    struct bbiChromUsage *usageList, /* Result from bbiChromUsageFromBedFile */
411
+    int resTryCount, int resScales[], int resSizes[],   /* How much to zoom at each level */
412
+    bits32 zoomAmounts[bbiMaxZoomLevels],      /* Fills in amount zoomed at each level. */
413
+    bits64 zoomDataOffsets[bbiMaxZoomLevels],  /* Fills in where data starts for each zoom level. */
414
+    bits64 zoomIndexOffsets[bbiMaxZoomLevels], /* Fills in where index starts for each level. */
415
+    struct bbiSummaryElement *totalSum);
416
+/* Write out all the zoom levels and return the number of levels written.  Writes 
417
+ * actual zoom amount and the offsets of the zoomed data and index in the last three
418
+ * parameters.  Sorry for all the parameters - it was this or duplicate a big chunk of
419
+ * code between bedToBigBed and bedGraphToBigWig. */
331 420
 
332 421
 int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot);
333 422
 /* Count up number of sections needed for data. */
... ...
@@ -358,4 +447,8 @@ boolean bbiFileCheckSigs(char *fileName, bits32 sig, char *typeName);
358 447
 time_t bbiUpdateTime(struct bbiFile *bbi);
359 448
 /* return bbi->udc->updateTime */
360 449
 
450
+struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi,
451
+        int chromId, bits32 start, bits32 end);
452
+/* Return list of all summaries in region at given zoom level of bbiFile. */
453
+
361 454
 #endif /* BBIFILE_H */
... ...
@@ -1,11 +1,13 @@
1 1
 /* bbiRead - Big Binary Indexed file.  Stuff that's common between bigWig and bigBed on the 
2 2
  * read side. */
3 3
 
4
+/* Copyright (C) 2014 The Regents of the University of California 
5
+ * See README in this or parent directory for licensing information. */
6
+
4 7
 #include "common.h"
5 8
 #include "linefile.h"
6 9
 #include "hash.h"
7 10
 #include "obscure.h"
8
-#include "localmem.h"
9 11
 #include "zlibFace.h"
10 12
 #include "bPlusTree.h"
11 13
 #include "hmmstats.h"
... ...
@@ -115,9 +117,7 @@ bbi->definedFieldCount = udcReadBits16(udc, isSwapped);
115 117
 bbi->asOffset = udcReadBits64(udc, isSwapped);
116 118
 bbi->totalSummaryOffset = udcReadBits64(udc, isSwapped);
117 119
 bbi->uncompressBufSize = udcReadBits32(udc, isSwapped);
118
-
119
-/* Skip over reserved area. */
120
-udcSeek(udc, 64);
120
+bbi->extensionOffset = udcReadBits64(udc, isSwapped);
121 121
 
122 122
 /* Read zoom headers. */
123 123
 int i;
... ...
@@ -134,6 +134,15 @@ for (i=0; i<bbi->zoomLevels; ++i)
134 134
 slReverse(&levelList);
135 135
 bbi->levelList = levelList;
136 136
 
137
+/* Deal with header extension if any. */
138
+if (bbi->extensionOffset != 0)
139
+    {
140
+    udcSeek(udc, bbi->extensionOffset);
141
+    bbi->extensionSize = udcReadBits16(udc, isSwapped);
142
+    bbi->extraIndexCount = udcReadBits16(udc, isSwapped);
143
+    bbi->extraIndexListOffset = udcReadBits64(udc, isSwapped);
144
+    }
145
+
137 146
 /* Attach B+ tree of chromosome names and ids. */
138 147
 udcSeek(udc, bbi->chromTreeOffset);
139 148
 bbi->chromBpt =  bptFileAttach(fileName, udc);
... ...
@@ -157,6 +166,16 @@ if (bwf != NULL)
157 166
     }
158 167
 }
159 168
 
169
+static void chromIdSizeHandleSwapped(boolean isSwapped, struct bbiChromIdSize *idSize)
170
+/* Swap bytes in chromosome Id and Size as needed. */
171
+{
172
+if (isSwapped)
173
+    {
174
+    idSize->chromId = byteSwap32(idSize->chromId);
175
+    idSize->chromSize = byteSwap32(idSize->chromSize);
176
+    }
177
+}
178
+
160 179
 
161 180
 struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf,
162 181
 	char *chrom, bits32 start, bits32 end, bits32 *retChromId)
... ...
@@ -165,8 +184,7 @@ struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeF
165 184
 struct bbiChromIdSize idSize;
166 185
 if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize)))
167 186
     return NULL;
168
-if (bbi->isSwapped)
169
-    idSize.chromId = byteSwap32(idSize.chromId);
187
+chromIdSizeHandleSwapped(bbi->isSwapped, &idSize);
170 188
 if (retChromId != NULL)
171 189
     *retChromId = idSize.chromId;
172 190
 return cirTreeFindOverlappingBlocks(ctf, idSize.chromId, start, end);
... ...
@@ -186,15 +204,11 @@ struct chromNameCallbackContext *c = context;
186 204
 struct bbiChromInfo *info;
187 205
 struct bbiChromIdSize *idSize = val;
188 206
 assert(valSize == sizeof(*idSize));
207
+chromIdSizeHandleSwapped(c->isSwapped, idSize);
189 208
 AllocVar(info);
190 209
 info->name = cloneStringZ(key, keySize);
191 210
 info->id = idSize->chromId;
192 211
 info->size = idSize->chromSize;
193
-if (c->isSwapped)
194
-    {
195
-    info->id = byteSwap32(info->id);
196
-    info->size = byteSwap32(info->size);
197
-    }
198 212
 slAddHead(&c->list, info);
199 213
 }
200 214
 
... ...
@@ -215,6 +229,7 @@ bits32 bbiChromSize(struct bbiFile *bbi, char *chrom)
215 229
 struct bbiChromIdSize idSize;
216 230
 if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize)))
217 231
     return 0;
232
+chromIdSizeHandleSwapped(bbi->isSwapped, &idSize);
218 233
 return idSize.chromSize;
219 234
 }
220 235
 
... ...
@@ -303,6 +318,7 @@ sum->chromId = udcReadBits32(udc, isSwapped);
303 318
 sum->start = udcReadBits32(udc, isSwapped);
304 319
 sum->end = udcReadBits32(udc, isSwapped);
305 320
 sum->validCount = udcReadBits32(udc, isSwapped);
321
+// looks like a bug to me, these should call udcReadFloat() 
306 322
 udcMustReadOne(udc, sum->minVal);
307 323
 udcMustReadOne(udc, sum->maxVal);
308 324
 udcMustReadOne(udc, sum->sumData);
... ...
@@ -310,6 +326,22 @@ udcMustReadOne(udc, sum->sumSquares);
310 326
 }
311 327
 #endif /* UNUSED */
312 328
 
329
+static void bbiSummaryHandleSwapped(struct bbiFile *bbi, struct bbiSummaryOnDisk *in)
330
+/* Swap integer fields in summary as needed. */
331
+{
332
+if (bbi->isSwapped)
333
+    {
334
+    in->chromId = byteSwap32(in->chromId);
335
+    in->start = byteSwap32(in->start);
336
+    in->end = byteSwap32(in->end);
337
+    in->validCount = byteSwap32(in->validCount);
338
+    in->minVal = byteSwapFloat(in->minVal);
339
+    in->maxVal = byteSwapFloat(in->maxVal);
340
+    in->sumData = byteSwapFloat(in->sumData);
341
+    in->sumSquares = byteSwapFloat(in->sumSquares);
342
+    }
343
+}
344
+
313 345
 static struct bbiSummary *bbiSummaryFromOnDisk(struct bbiSummaryOnDisk *in)
314 346
 /* Create a bbiSummary unlinked to anything from input in onDisk format. */
315 347
 {
... ...
@@ -326,7 +358,7 @@ out->sumSquares = in->sumSquares;
326 358
 return out;
327 359
 }
328 360
 
329
-static struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi, 
361
+struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi, 
330 362
 	int chromId, bits32 start, bits32 end)
331 363
 /* Return list of all summaries in region at given zoom level of bbiFile. */
332 364
 {
... ...
@@ -386,6 +418,7 @@ for (block = blockList; block != NULL; )
386 418
 	    {
387 419
 	    dSum = (void *)blockPt;
388 420
 	    blockPt += sizeof(*dSum);
421
+	    bbiSummaryHandleSwapped(bbi, dSum);
389 422
 	    if (dSum->chromId == chromId)
390 423
 		{
391 424
 		int s = max(dSum->start, start);
... ...
@@ -409,6 +442,23 @@ slReverse(&sumList);
409 442
 return sumList;
410 443
 }
411 444
 
445
+static int normalizeCount(struct bbiSummaryElement *el, double countFactor, 
446
+    double minVal, double maxVal, double sumData, double sumSquares)
447
+/* normalize statistics to be based on an integer number of valid bases 
448
+ * Integer value is the smallest integer not less than countFactor */
449
+{
450
+bits32 validCount = ceil(countFactor);
451
+double normFactor = (double)validCount/countFactor;
452
+
453
+el->validCount = validCount;
454
+el->minVal = minVal;
455
+el->maxVal = maxVal;
456
+el->sumData = sumData * normFactor;
457
+el->sumSquares = sumSquares * normFactor;
458
+
459
+return validCount;
460
+}
461
+
412 462
 static bits32 bbiSummarySlice(struct bbiFile *bbi, bits32 baseStart, bits32 baseEnd, 
413 463
 	struct bbiSummary *sumList, struct bbiSummaryElement *el)
414 464
 /* Update retVal with the average value if there is any data in interval.  Return number
... ...
@@ -421,6 +471,7 @@ if (sumList != NULL)
421 471
     double minVal = sumList->minVal;
422 472
     double maxVal = sumList->maxVal;
423 473
     double sumData = 0, sumSquares = 0;
474
+    double countFactor = 0.0;
424 475
 
425 476
     struct bbiSummary *sum;
426 477
     for (sum = sumList; sum != NULL && sum->start < baseEnd; sum = sum->next)
... ...
@@ -429,7 +480,7 @@ if (sumList != NULL)
429 480
 	if (overlap > 0)
430 481
 	    {
431 482
 	    double overlapFactor = (double)overlap / (sum->end - sum->start);
432
-	    validCount += sum->validCount * overlapFactor;
483
+	    countFactor += sum->validCount * overlapFactor;
433 484
 	    sumData += sum->sumData * overlapFactor;
434 485
 	    sumSquares += sum->sumSquares * overlapFactor;
435 486
 	    if (maxVal < sum->maxVal)
... ...
@@ -438,24 +489,20 @@ if (sumList != NULL)
438 489
 		minVal = sum->minVal;
439 490
 	    }
440 491
 	}
441
-    if (validCount > 0)
442
-	{
443
-	el->validCount = validCount;
444
-	el->minVal = minVal;
445
-	el->maxVal = maxVal;
446
-	el->sumData = sumData;
447
-	el->sumSquares = sumSquares;
448
-	}
492
+
493
+    if (countFactor > 0)
494
+	validCount = normalizeCount(el, countFactor, minVal, maxVal, sumData, sumSquares);
449 495
     }
450 496
 return validCount;
451 497
 }
452 498
 
453 499
 static int bbiChromId(struct bbiFile *bbi, char *chrom)
454
-/* Return chromosome size */
500
+/* Return chromosome Id */
455 501
 {
456 502
 struct bbiChromIdSize idSize;
457 503
 if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize)))
458 504
     return -1;
505
+chromIdSizeHandleSwapped(bbi->isSwapped, &idSize);
459 506
 return idSize.chromId;
460 507
 }
461 508
 
... ...
@@ -501,10 +548,11 @@ static bits32 bbiIntervalSlice(struct bbiFile *bbi, bits32 baseStart, bits32 bas
501 548
 /* Update retVal with the average value if there is any data in interval.  Return number
502 549
  * of valid data bases in interval. */
503 550
 {
504
-double validCount = 0;
551
+bits32 validCount = 0;
505 552
 
506 553
 if (intervalList != NULL)
507 554
     {
555
+    double countFactor = 0;
508 556
     struct bbiInterval *interval;
509 557
     double sumData = 0, sumSquares = 0;
510 558
     double minVal = intervalList->val;
... ...
@@ -519,7 +567,7 @@ if (intervalList != NULL)
519 567
 	    int intervalSize = interval->end - interval->start;
520 568
 	    double overlapFactor = (double)overlap / intervalSize;
521 569
 	    double intervalWeight = intervalSize * overlapFactor;
522
-	    validCount += intervalWeight;
570
+	    countFactor += intervalWeight;
523 571
 	    sumData += interval->val * intervalWeight;
524 572
 	    sumSquares += interval->val * interval->val * intervalWeight;
525 573
 	    if (maxVal < interval->val)
... ...
@@ -528,13 +576,10 @@ if (intervalList != NULL)
528 576
 		minVal = interval->val;
529 577
 	    }
530 578
 	}
531
-    el->validCount = round(validCount);
532
-    el->minVal = minVal;
533
-    el->maxVal = maxVal;
534
-    el->sumData = sumData;
535
-    el->sumSquares = sumSquares;
579
+
580
+    validCount = normalizeCount(el, countFactor, minVal, maxVal, sumData, sumSquares);
536 581
     }
537
-return round(validCount);
582
+return validCount;
538 583
 }
539 584
 
540 585
 
... ...
@@ -547,7 +592,7 @@ struct bbiInterval *intervalList = NULL, *interval;
547 592
 struct lm *lm = lmInit(0);
548 593
 intervalList = (*fetchIntervals)(bbi, chrom, start, end, lm);
549 594
 boolean result = FALSE;
550
-if (intervalList != NULL);
595
+if (intervalList != NULL)
551 596
     {
552 597
     int i;
553 598
     bits32 baseStart = start, baseEnd;
... ...
@@ -736,3 +781,17 @@ time_t bbiUpdateTime(struct bbiFile *bbi)
736 781
 struct udcFile *udc = bbi->udc;
737 782
 return udcUpdateTime(udc);
738 783
 }
784
+
785
+char *bbiCachedChromLookup(struct bbiFile *bbi, int chromId, int lastChromId,
786
+    char *chromBuf, int chromBufSize)
787
+/* Return chromosome name corresponding to chromId.  Because this is a bit expensive,
788
+ * if you are doing this repeatedly pass in the chromId used in the previous call to
789
+ * this in lastChromId,  which will save it from doing the lookup again on the same
790
+ * chromosome.  Pass in -1 to lastChromId if this is the first time or if you can't be
791
+ * bothered.  The chromBufSize should be at greater or equal to bbi->keySize+1.  */
792
+{
793
+if (chromId != lastChromId)
794
+    bptStringKeyAtPos(bbi->chromBpt, chromId, chromBuf, chromBufSize);
795
+return chromBuf;
796
+}
797
+
... ...
@@ -1,8 +1,12 @@
1
+/* bbiWrite.c - Routines to help write bigWig and bigBed files. See also bbiFile.h */
2
+
3
+/* Copyright (C) 2014 The Regents of the University of California 
4
+ * See README in this or parent directory for licensing information. */
5
+
1 6
 #include "common.h"
2 7
 #include "hash.h"
3 8
 #include "linefile.h"
4 9
 #include "sqlNum.h"
5
-#include "localmem.h"
6 10
 #include "zlibFace.h"
7 11
 #include "cirTree.h"
8 12
 #include "bPlusTree.h"
... ...
@@ -48,23 +52,26 @@ int chromCount = slCount(usageList);
48 52
 struct bbiChromUsage *usage;
49 53
 
50 54
 /* Allocate and fill in array from list. */
51
-struct bbiChromInfo *chromInfoArray;
52
-AllocArray(chromInfoArray, chromCount);
53
-int i;
55
+struct bbiChromInfo *chromInfoArray = NULL;
54 56
 int maxChromNameSize = 0;
55
-for (i=0, usage = usageList; i<chromCount; ++i, usage = usage->next)
57
+if (chromCount > 0)
56 58
     {
57
-    char *chromName = usage->name;
58
-    int len = strlen(chromName);
59
-    if (len > maxChromNameSize)
60
-        maxChromNameSize = len;
61
-    chromInfoArray[i].name = chromName;
62
-    chromInfoArray[i].id = usage->id;
63
-    chromInfoArray[i].size = usage->size;
64
-    }
59
+    AllocArray(chromInfoArray, chromCount);
60
+    int i;
61
+    for (i=0, usage = usageList; i<chromCount; ++i, usage = usage->next)
62
+	{
63
+	char *chromName = usage->name;
64
+	int len = strlen(chromName);
65
+	if (len > maxChromNameSize)
66
+	    maxChromNameSize = len;
67
+	chromInfoArray[i].name = chromName;
68
+	chromInfoArray[i].id = usage->id;
69
+	chromInfoArray[i].size = usage->size;
70
+	}
65 71
 
66
-/* Sort so the b-Tree actually works. */
67
-qsort(chromInfoArray, chromCount, sizeof(chromInfoArray[0]), bbiChromInfoCmp);
72
+    /* Sort so the b-Tree actually works. */
73
+    qsort(chromInfoArray, chromCount, sizeof(chromInfoArray[0]), bbiChromInfoCmp);
74
+    }
68 75
 
69 76
 /* Write chromosome bPlusTree */
70 77
 int chromBlockSize = min(blockSize, chromCount);
... ...
@@ -133,11 +140,40 @@ for (el = *pList; el != NULL; el = next)
133 140
 *pList = NULL;
134 141
 }
135 142
 
136
-struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, 
137
-	struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
138
-/* Go through bed file and collect chromosomes and statistics. */
143
+int bbExIndexMakerMaxIndexField(struct bbExIndexMaker *eim)
144
+/* Return the maximum field we have to index. */
139 145
 {
140
-char *row[3];
146
+int maxIx = 0;
147
+int i;
148
+for (i=0; i<eim->indexCount; ++i)
149
+    {
150
+    int ix = eim->indexFields[i];
151
+    if (ix > maxIx)
152
+        maxIx = ix;
153
+    }
154
+return maxIx;
155
+}
156
+
157
+void bbExIndexMakerUpdateMaxFieldSize(struct bbExIndexMaker *eim, char **row)
158
+/* Fold in information about row into bbExIndexMaker into eim->maxFieldSize */
159
+{
160
+int i;
161
+for (i=0; i<eim->indexCount; ++i)
162
+    {
163
+    int rowIx = eim->indexFields[i];
164
+    int size = strlen(row[rowIx]);
165
+    if (size > eim->maxFieldSize[i])
166
+        eim->maxFieldSize[i] = size;
167
+    }
168
+}
169
+
170
+struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, 
171
+	struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
172
+/* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
173
+ * collect max field sizes there too. */
174
+{
175
+int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
176
+char *row[maxRowSize];
141 177
 struct hash *uniqHash = hashNew(0);
142 178
 struct bbiChromUsage *usage = NULL, *usageList = NULL;
143 179
 int lastStart = -1;
... ...
@@ -149,20 +185,17 @@ lineFileRemoveInitialCustomTrackLines(lf);
149 185
 
150 186
 for (;;)
151 187
     {
152
-    int rowSize = lineFileChopNext(lf, row, ArraySize(row));
188
+    int rowSize = lineFileChopNext(lf, row, maxRowSize);
153 189
     if (rowSize == 0)
154 190
         break;
155
-    lineFileExpectWords(lf, 3, rowSize);
191
+    lineFileExpectAtLeast(lf, maxRowSize, rowSize);
156 192
     char *chrom = row[0];
157 193
     int start = lineFileNeedNum(lf, row, 1);
158 194
     int end = lineFileNeedNum(lf, row, 2);
159
-    if (start >= end)
195
+    if (eim != NULL)
196
+	bbExIndexMakerUpdateMaxFieldSize(eim, row);
197
+    if (start > end)
160 198
         {
161
-	if (start == end)
162
-	    errAbort("line %d of %s: start and end coordinates the same\n"
163
-	             "They need to be at least one apart"
164
-		     , lf->lineIx, lf->fileName);
165
-	else
166 199
 	    errAbort("end (%d) before start (%d) line %d of %s",
167 200
 	    	end, start, lf->lineIx, lf->fileName);
168 201
 	}
... ...
@@ -204,13 +237,130 @@ for (;;)
204 237
     lastStart = start;
205 238
     }
206 239
 slReverse(&usageList);
240
+double aveSize = 0;
241
+if (bedCount > 0)
242
+    aveSize = (double)totalBases/bedCount;
207 243
 *retMinDiff = minDiff;
208
-*retAveSize = (double)totalBases/bedCount;
244
+*retAveSize = aveSize;
209 245
 *retBedCount = bedCount;
210 246
 freeHash(&uniqHash);
211 247
 return usageList;
212 248
 }
213 249
 
250
+int bbiCalcResScalesAndSizes(int aveSize, 
251
+    int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels])
252
+/* Fill in resScales with amount to zoom at each level, and zero out resSizes based
253
+ * on average span. Returns the number of zoom levels we actually will use. */
254
+{
255
+int resTryCount = bbiMaxZoomLevels, resTry;
256
+int resIncrement = bbiResIncrement;
257
+int minZoom = 10;
258
+int res = aveSize;
259
+if (res < minZoom)
260
+    res = minZoom;
261
+for (resTry = 0; resTry < resTryCount; ++resTry)
262
+    {
263
+    resSizes[resTry] = 0;
264
+    resScales[resTry] = res;
265
+    // if aveSize is large, then the initial value of res is large, and we
266
+    // and we cannot do all 10 levels without overflowing res* integers and other related variables.
267
+    if (res > 1000000000) 
268
+	{
269
+	resTryCount = resTry + 1;  
270
+	verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount);
271
+	break;
272
+	}
273
+    res *= resIncrement;
274
+    }
275
+return resTryCount;
276
+}
277
+
278
+int bbiWriteZoomLevels(
279
+    struct lineFile *lf,    /* Input file. */
280
+    FILE *f,		    /* Output. */
281
+    int blockSize,	    /* Size of index block */
282
+    int itemsPerSlot,	    /* Number of data points bundled at lowest level. */
283
+    bbiWriteReducedOnceReturnReducedTwice writeReducedOnceReturnReducedTwice,   /* callback */
284
+    int fieldCount,	    /* Number of fields in bed (4 for bedGraph) */
285
+    boolean doCompress,	    /* Do we compress.  Answer really should be yes! */
286
+    bits64 dataSize,	    /* Size of data on disk (after compression if any). */
287
+    struct bbiChromUsage *usageList, /* Result from bbiChromUsageFromBedFile */
288
+    int resTryCount, int resScales[], int resSizes[],   /* How much to zoom at each level */
289
+    bits32 zoomAmounts[bbiMaxZoomLevels],      /* Fills in amount zoomed at each level. */
290
+    bits64 zoomDataOffsets[bbiMaxZoomLevels],  /* Fills in where data starts for each zoom level. */
291
+    bits64 zoomIndexOffsets[bbiMaxZoomLevels], /* Fills in where index starts for each level. */
292
+    struct bbiSummaryElement *totalSum)
293
+/* Write out all the zoom levels and return the number of levels written.  Writes 
294
+ * actual zoom amount and the offsets of the zoomed data and index in the last three
295
+ * parameters.  Sorry for all the parameters - it was this or duplicate a big chunk of
296
+ * code between bedToBigBed and bedGraphToBigWig. */
297
+{
298
+/* Write out first zoomed section while storing in memory next zoom level. */
299
+assert(resTryCount > 0);
300
+int maxReducedSize = dataSize/2;
301
+int initialReduction = 0, initialReducedCount = 0;
302
+
303
+/* Figure out initialReduction for zoom - one that is maxReducedSize or less. */
304
+int resTry;
305
+for (resTry = 0; resTry < resTryCount; ++resTry)
306
+    {
307
+    bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
308
+    if (doCompress)
309
+	reducedSize /= 2;	// Estimate!
310
+    if (reducedSize <= maxReducedSize)
311
+	{
312
+	initialReduction = resScales[resTry];
313
+	initialReducedCount = resSizes[resTry];
314
+	break;
315
+	}
316
+    }
317
+verbose(2, "initialReduction %d, initialReducedCount = %d\n", 
318
+    initialReduction, initialReducedCount);
319
+
320
+/* Force there to always be at least one zoom.  It may waste a little space on small
321
+ * files, but it makes files more uniform, and avoids special case code for calculating
322
+ * overall file summary. */
323
+if (initialReduction == 0)
324
+    {
325
+    initialReduction = resScales[0];
326
+    initialReducedCount = resSizes[0];
327
+    }
328
+
329
+/* Call routine to make the initial zoom level and also a bit of work towards further levels. */
330
+struct lm *lm = lmInit(0);
331
+int zoomIncrement = bbiResIncrement;
332
+lineFileRewind(lf);
333
+struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList, fieldCount,
334
+	lf, initialReduction, initialReducedCount,
335
+	zoomIncrement, blockSize, itemsPerSlot, doCompress, lm, 
336
+	f, &zoomDataOffsets[0], &zoomIndexOffsets[0], totalSum);
337
+verboseTime(2, "writeReducedOnceReturnReducedTwice");
338
+zoomAmounts[0] = initialReduction;
339
+int zoomLevels = 1;
340
+
341
+/* Loop around to do any additional levels of zoom. */
342
+int zoomCount = initialReducedCount;
343
+int reduction = initialReduction * zoomIncrement;
344
+while (zoomLevels < bbiMaxZoomLevels)
345
+    {
346
+    int rezoomCount = slCount(rezoomedList);
347
+    if (rezoomCount >= zoomCount)
348
+	break;
349
+    zoomCount = rezoomCount;
350
+    zoomDataOffsets[zoomLevels] = ftell(f);
351
+    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, 
352
+	blockSize, itemsPerSlot, doCompress, f);
353
+    zoomAmounts[zoomLevels] = reduction;
354
+    ++zoomLevels;
355
+    reduction *= zoomIncrement;
356
+    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
357
+    }
358
+lmCleanup(&lm);
359
+verboseTime(2, "further reductions");
360
+return zoomLevels;
361
+}
362
+
363
+
214 364
 int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot)
215 365
 /* Count up number of sections needed for data. */
216 366
 {
... ...
@@ -517,7 +667,7 @@ if (elCount >= stream->allocCount)
517 667
 void bbiOutputOneSummaryFurtherReduce(struct bbiSummary *sum, 
518 668
 	struct bbiSummary **pTwiceReducedList, 
519 669
 	int doubleReductionSize, struct bbiBoundsArray **pBoundsPt, 
520
-	struct bbiBoundsArray *boundsEnd, bits32 chromSize, struct lm *lm, 
670
+	struct bbiBoundsArray *boundsEnd, struct lm *lm, 
521 671
 	struct bbiSumOutStream *stream)
522 672
 /* Write out sum to file, keeping track of minimal info on it in *pBoundsPt, and also adding
523 673
  * it to second level summary. */
... ...
@@ -1,7 +1,39 @@
1 1
 /* bigBed - interface to binary file with bed-style values (that is a bunch of
2 2
  * possibly overlapping regions.
3 3
  *
4
- * This shares a lot with the bigWig module. */
4
+ * This shares a lot with the bigWig module. 
5
+ *
6
+ * Most of the functions here are concerned with reading bigBed files.  There's
7
+ * two common things you want to do with a bigBed,  either stream through everything in it,
8
+ * or just read the parts that overlap a interval within the genome.  The files
9
+ * are optimized for interval queries, but streaming through them is not difficult either.
10
+ * 
11
+ * To query an interval:
12
+ *    struct bbiFile *bbi = bigBedFileOpen(fileName);
13
+ *    struct lm *lm = lmInit(0); // Memory pool to hold returned list
14
+ *    struct bigBedInterval *list = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
15
+ *    struct bigBedInterval *el;
16
+ *    for (el = list; el != NULL; el = el->next)
17
+ *        // do something involving chrom, el->start, el->end
18
+ *    lmCleanup(&lm);         // typically do this after each query
19
+ *    bigBedFileClose(&bbi);  // typically only do this when finished all queries
20
+ *
21
+ * To stream through whole file
22
+ *    struct bbiFile *bbi = bigBedFileOpen(fileName);
23
+ *    struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
24
+ *    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
25
+ *        {
26
+ *        struct lm *lm = lmInit(0);
27
+ *        struct bigBedInterval *list = bigBedIntervalQuery(bbi,chrom->name,0,chrom->size,0,lm);
28
+ *        struct bigBedInterval *el;
29
+ *        for (el = list; el != NULL; el = el->next)
30
+ *            // do something involving chrom, el->start, el->end
31
+ *        lmCleanup(&lm);
32
+ *        }
33
+ *    bigBedFileClose(&bbi);
34
+ *
35
+ * The processes for streaming through or doing interval queries on a bigWig file are very 
36
+ * similar. */
5 37
 
6 38
 #ifndef BIGBED_H
7 39
 #define BIGBED_H
... ...
@@ -11,30 +43,23 @@
11 43
 #endif
12 44
 
13 45
 struct bigBedInterval
14
-/* A partially parsed out bed record plus some extra fields. */
46
+/* A partially parsed out bed record plus some extra fields.  Use this directly
47
+ * or convert it to an array of characters with bigBedIntervalToRow. */
15 48
     {
16 49
     struct bigBedInterval *next;	/* Next in list. */
17 50
     bits32 start, end;		/* Range inside chromosome - half open zero based. */
18 51
     char *rest;			/* Rest of line. May be NULL*/
52
+    bits32 chromId;             /* ID of chromosome.  */
19 53
     };
20 54
 
21
-struct ppBed
22
-/* A partially parsed out bed record plus some extra fields. */
23
-    {
24
-    struct ppBed *next;	/* Next in list. */
25
-    char *chrom;		/* Chromosome name (not allocated here) */
26
-    bits32 start, end;		/* Range inside chromosome - half open zero based. */
27
-    char *rest;			/* The rest of the bed. */
28
-    bits64 fileOffset;		/* File offset. */
29
-    bits32 chromId;		/* Chromosome ID. */
30
-    };
55
+/*** Routines to open & close bigBed files, and to do chromosome range queries on them. ***/
31 56
 
32 57
 struct bbiFile *bigBedFileOpen(char *fileName);
33 58
 /* Open up big bed file.   Free this up with bbiFileClose. */
34 59
 
35 60
 #define bigBedFileClose(a) bbiFileClose(a)
36 61
 
37
-struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom, 
62
+struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom,
38 63
 	bits32 start, bits32 end, int maxItems, struct lm *lm);
39 64
 /* Get data for interval.  Return list allocated out of lm.  Set maxItems to maximum
40 65
  * number of items to return, or to 0 for all items. */
... ...
@@ -43,7 +68,7 @@ int bigBedIntervalToRow(struct bigBedInterval *interval, char *chrom, char *star
43 68
 	char **row, int rowSize);
44 69
 /* Convert bigBedInterval into an array of chars equivalent to what you'd get by
45 70
  * parsing the bed file. The startBuf and endBuf are used to hold the ascii representation of
46
- * start and end.  Note that the interval->rest string will have zeroes inserted as a side effect. 
71
+ * start and end.  Note that the interval->rest string will have zeroes inserted as a side effect.
47 72
  * Returns number of fields in row.  */
48 73
 
49 74
 boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end,
... ...
@@ -59,17 +84,50 @@ boolean bigBedSummaryArrayExtended(struct bbiFile *bbi, char *chrom, bits32 star
59 84
 /* Get extended summary information for summarySize evenely spaced elements into
60 85
  * the summary array. */
61 86
 
87
+/*** Some routines for accessing bigBed items via name. ***/
88
+
89
+struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, struct bptFile *index,
90
+    int fieldIx, char *name, struct lm *lm);
91
+/* Return list of intervals matching file. These intervals will be allocated out of lm. */
92
+
93
+struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, struct bptFile *index,
94
+    int fieldIx, char **names, int nameCount, struct lm *lm);
95
+/* Fetch all records matching any of the names. Using given index on given field.
96
+ * Return list is allocated out of lm. */
97
+
98
+int bigBedIntervalToRowLookupChrom(struct bigBedInterval *interval, 
99
+    struct bigBedInterval *prevInterval, struct bbiFile *bbi,
100
+    char *chromBuf, int chromBufSize, char *startBuf, char *endBuf, char **row, int rowSize);
101
+/* Convert bigBedInterval to array of chars equivalend to what you'd get by parsing the
102
+ * bed file.  If you already know what chromosome the interval is on use the simpler
103
+ * bigBedIntervalToRow.  This one will look up the chromosome based on the chromId field
104
+ * of the interval,  which is relatively time consuming.  To avoid doing this unnecessarily
105
+ * pass in a non-NULL prevInterval,  and if the chromId is the same on prevInterval as this,
106
+ * it will avoid the lookup.  The chromBufSize should be at greater or equal to 
107
+ * bbi->chromBpt->keySize+1.  The startBuf and endBuf are used to hold the ascii representation of
108
+ * start and end, and should be 16 bytes.  Note that the interval->rest string will have zeroes 
109
+ * inserted as a side effect.  Returns number of fields in row.  */
110
+
111
+void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f);
112
+/* Write out big bed interval list to bed file, looking up chromosome. */
113
+
114
+/** Routines to access other data from a bigBed file. */
115
+
62 116
 bits64 bigBedItemCount(struct bbiFile *bbi);
63 117
 /* Return total items in file. */
64 118
 
65 119
 char *bigBedAutoSqlText(struct bbiFile *bbi);
66 120
 /* Get autoSql text if any associated with file.  Do a freeMem of this when done. */
67 121
 
68
-struct asObject *bigBedAs(struct bbiFile *bbi);
69
-/* Get autoSql object definition if any associated with file. */
70
-
71 122
 boolean bigBedFileCheckSigs(char *fileName);
72 123
 /* check file signatures at beginning and end of file */
73 124
 
125
+struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName, int *retFieldIx);
126
+/* Return index associated with fieldName.  Aborts if no such index.  Optionally return
127
+ * index in a row of this field. */
128
+
129
+struct slName *bigBedListExtraIndexes(struct bbiFile *bbi);
130
+/* Return list of names of extra indexes beyond primary chrom:start-end one" */
131
+
74 132
 #endif /* BIGBED_H */
75 133
 
... ...
@@ -27,6 +27,18 @@
27 27
 #include "bits.h"
28 28
 #endif
29 29
 
30
+void bigWigFileCreateEx(
31
+	char *inName, 		/* Input file in ascii wiggle format. */
32
+	char *chromSizes, 	/* Two column tab-separated file: <chromosome> <size>. */
33
+	int blockSize,		/* Number of items to bundle in r-tree.  1024 is good. */
34
+	int itemsPerSlot,	/* Number of items in lowest level of tree.  512 is good. */
35
+	boolean clipDontDie,	/* If TRUE then clip items off end of chrom rather than dying. */
36
+	boolean compress,	/* If TRUE then compress data. */
37
+	boolean keepAllChromosomes,	/* If TRUE then store all chromosomes in chromosomal b-tree. */
38
+	boolean fixedSummaries,	/* If TRUE then impose fixed summary levels. */
39
+	char *outName);
40
+/* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) 
41
+ * to binary big wig format. */
30 42
 void bigWigFileCreate(
31 43
 	char *inName, 		/* Input file in ascii wiggle format. */
32 44
 	char *chromSizes, 	/* Two column tab-separated file: <chromosome> <size>. */
... ...
@@ -6,7 +6,6 @@
6 6
 #include "common.h"
7 7
 #include "bits.h"
8 8
 
9
-static char const rcsid[] = "$Id: bits.c,v 1.20 2008/03/25 16:32:31 angie Exp $";
10 9
 
11 10
 
12 11
 static Bits oneBit[8] = { 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
... ...
@@ -78,6 +77,33 @@ void bitFree(Bits **pB)
78 77
 freez(pB);
79 78
 }
80 79
 
80
+Bits *lmBitAlloc(struct lm *lm,int bitCount)
81
+// Allocate bits.  Must supply local memory.
82
+{
83
+assert(lm != NULL);
84
+int byteCount = ((bitCount+7)>>3);
85
+return lmAlloc(lm,byteCount);
86
+}
87
+
88
+Bits *lmBitRealloc(struct lm *lm,Bits *b, int bitCount, int newBitCount)
89
+// Resize a bit array.  If b is null, allocate a new array.  Must supply local memory.
90
+{
91
+assert(lm != NULL);
92
+int byteCount = ((bitCount+7)>>3);
93
+int newByteCount = ((newBitCount+7)>>3);
94
+return lmAllocMoreMem(lm, b ,byteCount, newByteCount);
95
+}
96
+
97
+Bits *lmBitClone(struct lm *lm,Bits* orig, int bitCount)
98
+// Clone bits.  Must supply local memory.
99
+{
100
+assert(lm != NULL);
101
+int byteCount = ((bitCount+7)>>3);
102
+Bits* bits = lmAlloc(lm,byteCount);
103
+memcpy(bits, orig, byteCount);
104
+return bits;
105
+}
106
+
81 107
 void bitSetOne(Bits *b, int bitIx)
82 108
 /* Set a single bit. */
83 109
 {
... ...
@@ -232,6 +258,19 @@ while (--byteCount >= 0)
232 258
     }
233 259
 }
234 260
 
261
+int bitAndCount(Bits *a, Bits *b, int bitCount)
262
+// Without altering 2 bitmaps, count the AND bits.
263
+{
264
+int byteCount = ((bitCount+7)>>3);
265
+int count = 0;
266
+if (!inittedBitsInByte)
267
+    bitsInByteInit();
268
+while (--byteCount >= 0)
269
+    count += bitsInByte[(*a++ & *b++)];
270
+
271
+return count;
272
+}
273
+
235 274
 void bitOr(Bits *a, Bits *b, int bitCount)
236 275
 /* Or two bitmaps.  Put result in a. */
237 276
 {
... ...
@@ -243,6 +282,19 @@ while (--byteCount >= 0)
243 282
     }
244 283
 }
245 284
 
285
+int bitOrCount(Bits *a, Bits *b, int bitCount)
286
+// Without altering 2 bitmaps, count the OR'd bits.
287
+{
288
+int byteCount = ((bitCount+7)>>3);
289
+int count = 0;
290
+if (!inittedBitsInByte)
291
+    bitsInByteInit();
292
+while (--byteCount >= 0)
293
+    count += bitsInByte[(*a++ | *b++)];
294
+
295
+return count;
296
+}
297
+
246 298
 void bitXor(Bits *a, Bits *b, int bitCount)
247 299
 {
248 300
 int byteCount = ((bitCount+7)>>3);
... ...
@@ -253,6 +305,19 @@ while (--byteCount >= 0)
253 305
     }
254 306
 }
255 307
 
308
+int bitXorCount(Bits *a, Bits *b, int bitCount)
309
+// Without altering 2 bitmaps, count the XOR'd bits.
310
+{
311
+int byteCount = ((bitCount+7)>>3);
312
+int count = 0;
313
+if (!inittedBitsInByte)
314
+    bitsInByteInit();
315
+while (--byteCount >= 0)
316
+    count += bitsInByte[(*a++ ^ *b++)];
317
+
318
+return count;
319
+}
320
+
256 321
 void bitNot(Bits *a, int bitCount)
257 322
 /* Flip all bits in a. */
258 323
 {
... ...
@@ -264,6 +329,31 @@ while (--byteCount >= 0)
264 329
     }
265 330
 }
266 331
 
332
+void bitReverseRange(Bits *bits, int startIx, int bitCount)
333
+// Reverses bits in range (e.g. 110010 becomes 010011)
334
+{
335
+if (bitCount <= 0)
336
+    return;
337
+int ixA = startIx;
338
+int ixB = (startIx + bitCount - 1);