git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/rtracklayer@98880 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -4,7 +4,7 @@ PKG_OBJECTS = \ |
4 | 4 |
|
5 | 5 |
UCSC_OBJECTS = \ |
6 | 6 |
bPlusTree.o bbiRead.o bbiWrite.o bwgCreate.o bwgQuery.o \ |
7 |
- cirTree.o common.o dnaseq.o dnautil.o errabort.o hash.o linefile.o localmem.o\ |
|
7 |
+ cirTree.o common.o dnaseq.o dnautil.o errAbort.o hash.o linefile.o localmem.o\ |
|
8 | 8 |
sqlNum.o zlibFace.o dystring.o hmmstats.o obscure.o pipeline.o \ |
9 | 9 |
rangeTree.o rbTree.o memalloc.o dlist.o udc.o net.o bits.o twoBit.o \ |
10 | 10 |
_cheapcgi.o internet.o https.o base64.o verbose.o os.o wildcmp.o _portimpl.o |
... | ... |
@@ -5,7 +5,6 @@ |
5 | 5 |
#include "ucsc/bbiFile.h" |
6 | 6 |
#include "ucsc/bigWig.h" |
7 | 7 |
#include "ucsc/bwgInternal.h" |
8 |
-#include "ucsc/_bwgInternal.h" |
|
9 | 8 |
|
10 | 9 |
#include "bigWig.h" |
11 | 10 |
#include "handlers.h" |
... | ... |
@@ -190,6 +189,7 @@ SEXP BWGSectionList_write(SEXP r_sections, SEXP r_seqlengths, SEXP r_compress, |
190 | 189 |
} |
191 | 190 |
pushRHandlers(); |
192 | 191 |
bwgCreate(sections, lenHash, blockSize, itemsPerSlot, asLogical(r_compress), |
192 |
+ FALSE /*keepAllChromosomes*/, TRUE /*fixedSummaries*/, |
|
193 | 193 |
(char *)CHAR(asChar(r_file))); |
194 | 194 |
freeHash(&lenHash); |
195 | 195 |
popRHandlers(); |
... | ... |
@@ -385,7 +385,7 @@ SEXP BWGFile_fromWIG(SEXP r_infile, SEXP r_seqlengths, SEXP r_outfile) { |
385 | 385 |
struct bwgSection *sections = |
386 | 386 |
bwgParseWig((char *)CHAR(asChar(r_infile)), FALSE, lenHash, itemsPerSlot, |
387 | 387 |
lm); |
388 |
- bwgCreate(sections, lenHash, blockSize, itemsPerSlot, TRUE, |
|
388 |
+ bwgCreate(sections, lenHash, blockSize, itemsPerSlot, TRUE, TRUE, TRUE, |
|
389 | 389 |
(char *)CHAR(asChar(r_outfile))); |
390 | 390 |
lmCleanup(&lm); |
391 | 391 |
freeHash(&lenHash); |
... | ... |
@@ -1,3 +1,6 @@ |
1 |
+/* Copyright (C) 2013 The Regents of the University of California |
|
2 |
+ * See README in this or parent directory for licensing information. */ |
|
3 |
+ |
|
1 | 4 |
/* bptFile - B+ Trees. These are a method of indexing data similar to binary trees, but |
2 | 5 |
* with many children rather than just two at each node. They work well when stored on disk, |
3 | 6 |
* since typically only two or three disk accesses are needed to locate any particular |
... | ... |
@@ -162,7 +165,71 @@ else |
162 | 165 |
} |
163 | 166 |
} |
164 | 167 |
|
165 |
-void rTraverse(struct bptFile *bpt, bits64 blockStart, void *context, |
|
168 |
+static void rFindMulti(struct bptFile *bpt, bits64 blockStart, void *key, struct slRef **pList) |
|
169 |
+/* Find values corresponding to key and add them to pList. You'll need to |
|
170 |
+ * Do a slRefFreeListAndVals() on the list when done. */ |
|
171 |
+{ |
|
172 |
+/* Seek to start of block. */ |
|
173 |
+udcSeek(bpt->udc, blockStart); |
|
174 |
+ |
|
175 |
+/* Read block header. */ |
|
176 |
+UBYTE isLeaf; |
|
177 |
+UBYTE reserved; |
|
178 |
+bits16 i, childCount; |
|
179 |
+udcMustReadOne(bpt->udc, isLeaf); |
|
180 |
+udcMustReadOne(bpt->udc, reserved); |
|
181 |
+boolean isSwapped = bpt->isSwapped; |
|
182 |
+childCount = udcReadBits16(bpt->udc, isSwapped); |
|
183 |
+ |
|
184 |
+int keySize = bpt->keySize; |
|
185 |
+UBYTE keyBuf[keySize]; /* Place to put a key, buffered on stack. */ |
|
186 |
+UBYTE valBuf[bpt->valSize]; /* Place to put a value, buffered on stack. */ |
|
187 |
+ |
|
188 |
+if (isLeaf) |
|
189 |
+ { |
|
190 |
+ for (i=0; i<childCount; ++i) |
|
191 |
+ { |
|
192 |
+ udcMustRead(bpt->udc, keyBuf, keySize); |
|
193 |
+ udcMustRead(bpt->udc, valBuf, bpt->valSize); |
|
194 |
+ if (memcmp(key, keyBuf, keySize) == 0) |
|
195 |
+ { |
|
196 |
+ void *val = cloneMem(valBuf, bpt->valSize); |
|
197 |
+ refAdd(pList, val); |
|
198 |
+ } |
|
199 |
+ } |
|
200 |
+ } |
|
201 |
+else |
|
202 |
+ { |
|
203 |
+ /* Read first key and first file offset. */ |
|
204 |
+ udcMustRead(bpt->udc, keyBuf, keySize); |
|
205 |
+ bits64 lastFileOffset = udcReadBits64(bpt->udc, isSwapped); |
|
206 |
+ bits64 fileOffset = lastFileOffset; |
|
207 |
+ int lastCmp = memcmp(key, keyBuf, keySize); |
|
208 |
+ |
|
209 |
+ /* Loop through remainder. */ |
|
210 |
+ for (i=1; i<childCount; ++i) |
|
211 |
+ { |
|
212 |
+ udcMustRead(bpt->udc, keyBuf, keySize); |
|
213 |
+ fileOffset = udcReadBits64(bpt->udc, isSwapped); |
|
214 |
+ int cmp = memcmp(key, keyBuf, keySize); |
|
215 |
+ if (lastCmp >= 0 && cmp <= 0) |
|
216 |
+ { |
|
217 |
+ bits64 curPos = udcTell(bpt->udc); |
|
218 |
+ rFindMulti(bpt, lastFileOffset, key, pList); |
|
219 |
+ udcSeek(bpt->udc, curPos); |
|
220 |
+ } |
|
221 |
+ if (cmp < 0) |
|
222 |
+ return; |
|
223 |
+ lastCmp = cmp; |
|
224 |
+ lastFileOffset = fileOffset; |
|
225 |
+ } |
|
226 |
+ /* If made it all the way to end, do last one too. */ |
|
227 |
+ rFindMulti(bpt, fileOffset, key, pList); |
|
228 |
+ } |
|
229 |
+} |
|
230 |
+ |
|
231 |
+ |
|
232 |
+static void rTraverse(struct bptFile *bpt, bits64 blockStart, void *context, |
|
166 | 233 |
void (*callback)(void *context, void *key, int keySize, void *val, int valSize) ) |
167 | 234 |
/* Recursively go across tree, calling callback at leaves. */ |
168 | 235 |
{ |
... | ... |
@@ -203,19 +270,83 @@ else |
203 | 270 |
} |
204 | 271 |
} |
205 | 272 |
|
206 |
-boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize) |
|
207 |
-/* Find value associated with key. Return TRUE if it's found. |
|
208 |
-* Parameters: |
|
209 |
-* bpt - file handle returned by bptFileOpen |
|
210 |
-* key - pointer to key string, which needs to be bpt->keySize long |
|
211 |
-* val - pointer to where to put retrieved value |
|
212 |
-*/ |
|
273 |
+static bits64 bptDataStart(struct bptFile *bpt) |
|
274 |
+/* Return offset of first bit of data (as opposed to index) in file. In hind sight I wish |
|
275 |
+ * this were stored in the header, but fortunately it's not that hard to compute. */ |
|
276 |
+{ |
|
277 |
+bits64 offset = bpt->rootOffset; |
|
278 |
+for (;;) |
|
279 |
+ { |
|
280 |
+ /* Seek to block start */ |
|
281 |
+ udcSeek(bpt->udc, offset); |
|
282 |
+ |
|
283 |
+ /* Read block header, break if we are leaf. */ |
|
284 |
+ UBYTE isLeaf; |
|
285 |
+ UBYTE reserved; |
|
286 |
+ bits16 childCount; |
|
287 |
+ udcMustReadOne(bpt->udc, isLeaf); |
|
288 |
+ if (isLeaf) |
|
289 |
+ break; |
|
290 |
+ udcMustReadOne(bpt->udc, reserved); |
|
291 |
+ boolean isSwapped = bpt->isSwapped; |
|
292 |
+ childCount = udcReadBits16(bpt->udc, isSwapped); |
|
293 |
+ |
|
294 |
+ /* Read and discard first key. */ |
|
295 |
+ char keyBuf[bpt->keySize]; |
|
296 |
+ udcMustRead(bpt->udc, keyBuf, bpt->keySize); |
|
297 |
+ |
|
298 |
+ /* Get file offset of sub-block. */ |
|
299 |
+ offset = udcReadBits64(bpt->udc, isSwapped); |
|
300 |
+ } |
|
301 |
+return offset; |
|
302 |
+} |
|
303 |
+ |
|
304 |
+static bits64 bptDataOffset(struct bptFile *bpt, bits64 itemPos) |
|
305 |
+/* Return position of file of data corresponding to given itemPos. For first piece of |
|
306 |
+ * data pass in 0. */ |
|
307 |
+{ |
|
308 |
+if (itemPos >= bpt->itemCount) |
|
309 |
+ errAbort("Item index %lld greater than item count %lld in %s", |
|
310 |
+ itemPos, bpt->itemCount, bpt->fileName); |
|
311 |
+bits64 blockPos = itemPos/bpt->blockSize; |
|
312 |
+bits32 insidePos = itemPos - blockPos*bpt->blockSize; |
|
313 |
+int blockHeaderSize = 4; |
|
314 |
+bits64 itemByteSize = bpt->valSize + bpt->keySize; |
|
315 |
+bits64 blockByteSize = bpt->blockSize * itemByteSize + blockHeaderSize; |
|
316 |
+bits64 blockOffset = blockByteSize*blockPos + bptDataStart(bpt); |
|
317 |
+bits64 itemOffset = blockOffset + blockHeaderSize + itemByteSize * insidePos; |
|
318 |
+return itemOffset; |
|
319 |
+} |
|
320 |
+ |
|
321 |
+void bptKeyAtPos(struct bptFile *bpt, bits64 itemPos, void *result) |
|
322 |
+/* Fill in result with the key at given itemPos. For first piece of data itemPos is 0 |
|
323 |
+ * Result must be at least bpt->keySize. If result is a string it won't be zero terminated |
|
324 |
+ * by this routine. Use bptStringKeyAtPos instead. */ |
|
325 |
+{ |
|
326 |
+bits64 offset = bptDataOffset(bpt, itemPos); |
|
327 |
+udcSeek(bpt->udc, offset); |
|
328 |
+udcMustRead(bpt->udc, result, bpt->keySize); |
|
329 |
+} |
|
330 |
+ |
|
331 |
+void bptStringKeyAtPos(struct bptFile *bpt, bits64 itemPos, char *result, int maxResultSize) |
|
332 |
+/* Fill in result with the key at given itemPos. The maxResultSize should be 1+bpt->keySize |
|
333 |
+ * to accommodate zero termination of string. */ |
|
334 |
+{ |
|
335 |
+assert(maxResultSize > bpt->keySize); |
|
336 |
+bptKeyAtPos(bpt, itemPos, result); |
|
337 |
+result[bpt->keySize] = 0; |
|
338 |
+} |
|
339 |
+ |
|
340 |
+static boolean bptFileFindMaybeMulti(struct bptFile *bpt, void *key, int keySize, int valSize, |
|
341 |
+ boolean multi, void *singleVal, struct slRef **multiVal) |
|
342 |
+/* Do either a single or multiple find depending in multi parameter. Only one of singleVal |
|
343 |
+ * or multiVal should be non-NULL, depending on the same parameter. */ |
|
213 | 344 |
{ |
214 | 345 |
/* Check key size vs. file key size, and act appropriately. If need be copy key to a local |
215 | 346 |
* buffer and zero-extend it. */ |
216 | 347 |
if (keySize > bpt->keySize) |
217 | 348 |
return FALSE; |
218 |
-char keyBuf[keySize]; |
|
349 |
+char keyBuf[bpt->keySize]; |
|
219 | 350 |
if (keySize != bpt->keySize) |
220 | 351 |
{ |
221 | 352 |
memcpy(keyBuf, key, keySize); |
... | ... |
@@ -228,8 +359,34 @@ if (valSize != bpt->valSize) |
228 | 359 |
errAbort("Value size mismatch between bptFileFind (valSize=%d) and %s (valSize=%d)", |
229 | 360 |
valSize, bpt->fileName, bpt->valSize); |
230 | 361 |
|
231 |
-/* Call recursive finder. */ |
|
232 |
-return rFind(bpt, bpt->rootOffset, key, val); |
|
362 |
+if (multi) |
|
363 |
+ { |
|
364 |
+ rFindMulti(bpt, bpt->rootOffset, key, multiVal); |
|
365 |
+ return *multiVal != NULL; |
|
366 |
+ } |
|
367 |
+else |
|
368 |
+ return rFind(bpt, bpt->rootOffset, key, singleVal); |
|
369 |
+} |
|
370 |
+ |
|
371 |
+boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize) |
|
372 |
+/* Find value associated with key. Return TRUE if it's found. |
|
373 |
+* Parameters: |
|
374 |
+* bpt - file handle returned by bptFileOpen |
|
375 |
+* key - pointer to key string, which needs to be bpt->keySize long |
|
376 |
+* val - pointer to where to put retrieved value |
|
377 |
+*/ |
|
378 |
+{ |
|
379 |
+return bptFileFindMaybeMulti(bpt, key, keySize, valSize, FALSE, val, NULL); |
|
380 |
+} |
|
381 |
+ |
|
382 |
+struct slRef *bptFileFindMultiple(struct bptFile *bpt, void *key, int keySize, int valSize) |
|
383 |
+/* Find all values associated with key. Store this in ->val item of returned list. |
|
384 |
+ * Do a slRefFreeListAndVals() on list when done. */ |
|
385 |
+{ |
|
386 |
+struct slRef *list = NULL; |
|
387 |
+bptFileFindMaybeMulti(bpt, key, keySize, valSize, TRUE, NULL, &list); |
|
388 |
+slReverse(&list); |
|
389 |
+return list; |
|
233 | 390 |
} |
234 | 391 |
|
235 | 392 |
void bptFileTraverse(struct bptFile *bpt, void *context, |
... | ... |
@@ -237,7 +394,7 @@ void bptFileTraverse(struct bptFile *bpt, void *context, |
237 | 394 |
/* Traverse bPlusTree on file, calling supplied callback function at each |
238 | 395 |
* leaf item. */ |
239 | 396 |
{ |
240 |
- rTraverse(bpt, bpt->rootOffset, context, callback); |
|
397 |
+return rTraverse(bpt, bpt->rootOffset, context, callback); |
|
241 | 398 |
} |
242 | 399 |
|
243 | 400 |
|
... | ... |
@@ -252,10 +409,10 @@ void bptFileTraverse(struct bptFile *bpt, void *context, |
252 | 409 |
* 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
253 | 410 |
*/ |
254 | 411 |
|
255 |
-static int xToY(int x, unsigned y) |
|
412 |
+static long xToY(int x, unsigned y) |
|
256 | 413 |
/* Return x to the Y power, with y usually small. */ |
257 | 414 |
{ |
258 |
-int i, val = 1; |
|
415 |
+long i, val = 1; |
|
259 | 416 |
for (i=0; i<y; ++i) |
260 | 417 |
val *= x; |
261 | 418 |
return val; |
... | ... |
@@ -275,7 +432,7 @@ return levels; |
275 | 432 |
|
276 | 433 |
|
277 | 434 |
static bits32 writeIndexLevel(bits16 blockSize, |
278 |
- void *itemArray, int itemSize, int itemCount, |
|
435 |
+ void *itemArray, int itemSize, long itemCount, |
|
279 | 436 |
bits32 indexOffset, int level, |
280 | 437 |
void (*fetchKey)(const void *va, char *keyBuf), bits32 keySize, bits32 valSize, |
281 | 438 |
FILE *f) |
... | ... |
@@ -284,39 +441,42 @@ static bits32 writeIndexLevel(bits16 blockSize, |
284 | 441 |
char *items = itemArray; |
285 | 442 |
|
286 | 443 |
/* Calculate number of nodes to write at this level. */ |
287 |
-int slotSizePer = xToY(blockSize, level); // Number of items per slot in node |
|
288 |
-int nodeSizePer = slotSizePer * blockSize; // Number of items per node |
|
289 |
-int nodeCount = (itemCount + nodeSizePer - 1)/nodeSizePer; |
|
444 |
+long slotSizePer = xToY(blockSize, level); // Number of items per slot in node |
|
445 |
+long nodeSizePer = slotSizePer * blockSize; // Number of items per node |
|
446 |
+long nodeCount = (itemCount + nodeSizePer - 1)/nodeSizePer; |
|
447 |
+ |
|
290 | 448 |
|
291 | 449 |
/* Calculate sizes and offsets. */ |
292 |
-int bytesInIndexBlock = (bptBlockHeaderSize + blockSize * (keySize+sizeof(bits64))); |
|
293 |
-int bytesInLeafBlock = (bptBlockHeaderSize + blockSize * (keySize+valSize)); |
|
450 |
+long bytesInIndexBlock = (bptBlockHeaderSize + blockSize * (keySize+sizeof(bits64))); |
|
451 |
+long bytesInLeafBlock = (bptBlockHeaderSize + blockSize * (keySize+valSize)); |
|
294 | 452 |
bits64 bytesInNextLevelBlock = (level == 1 ? bytesInLeafBlock : bytesInIndexBlock); |
295 | 453 |
bits64 levelSize = nodeCount * bytesInIndexBlock; |
296 | 454 |
bits64 endLevel = indexOffset + levelSize; |
297 | 455 |
bits64 nextChild = endLevel; |
298 | 456 |
|
457 |
+ |
|
299 | 458 |
UBYTE isLeaf = FALSE; |
300 | 459 |
UBYTE reserved = 0; |
301 | 460 |
|
302 |
-int i,j; |
|
461 |
+long i,j; |
|
303 | 462 |
char keyBuf[keySize+1]; |
304 | 463 |
keyBuf[keySize] = 0; |
305 | 464 |
for (i=0; i<itemCount; i += nodeSizePer) |
306 | 465 |
{ |
307 | 466 |
/* Calculate size of this block */ |
308 |
- bits16 countOne = (itemCount - i + slotSizePer - 1)/slotSizePer; |
|
467 |
+ long countOne = (itemCount - i + slotSizePer - 1)/slotSizePer; |
|
309 | 468 |
if (countOne > blockSize) |
310 | 469 |
countOne = blockSize; |
470 |
+ bits16 shortCountOne = countOne; |
|
311 | 471 |
|
312 | 472 |
/* Write block header. */ |
313 | 473 |
writeOne(f, isLeaf); |
314 | 474 |
writeOne(f, reserved); |
315 |
- writeOne(f, countOne); |
|
475 |
+ writeOne(f, shortCountOne); |
|
316 | 476 |
|
317 | 477 |
/* Write out the slots that are used one by one, and do sanity check. */ |
318 | 478 |
int slotsUsed = 0; |
319 |
- int endIx = i + nodeSizePer; |
|
479 |
+ long endIx = i + nodeSizePer; |
|
320 | 480 |
if (endIx > itemCount) |
321 | 481 |
endIx = itemCount; |
322 | 482 |
for (j=i; j<endIx; j += slotSizePer) |
... | ... |
@@ -329,7 +489,7 @@ for (i=0; i<itemCount; i += nodeSizePer) |
329 | 489 |
nextChild += bytesInNextLevelBlock; |
330 | 490 |
++slotsUsed; |
331 | 491 |
} |
332 |
- assert(slotsUsed == countOne); |
|
492 |
+ assert(slotsUsed == shortCountOne); |
|
333 | 493 |
|
334 | 494 |
/* Write out empty slots as all zero. */ |
335 | 495 |
int slotSize = keySize + sizeof(bits64); |
... | ... |
@@ -76,19 +76,32 @@ void bptFileDetach(struct bptFile **pBpt); |
76 | 76 |
|
77 | 77 |
boolean bptFileFind(struct bptFile *bpt, void *key, int keySize, void *val, int valSize); |
78 | 78 |
/* Find value associated with key. Return TRUE if it's found. |
79 |
-* Parameters: |
|
80 |
-* bpt - file handle returned by bptFileOpen |
|
81 |
-* key - pointer to key string |
|
82 |
-* keySize - size of key. Normally just strlen(key) |
|
83 |
-* val - pointer to where to put retrieved value |
|
84 |
-* valSize - size of memory buffer that will hold val. Should match bpt->valSize. |
|
85 |
-*/ |
|
79 |
+ * Parameters: |
|
80 |
+ * bpt - file handle returned by bptFileOpen |
|
81 |
+ * key - pointer to key string |
|
82 |
+ * keySize - size of key. Normally just strlen(key) |
|
83 |
+ * val - pointer to where to put retrieved value |
|
84 |
+ * valSize - size of memory buffer that will hold val. Should match bpt->valSize. |
|
85 |
+ */ |
|
86 |
+ |
|
87 |
+struct slRef *bptFileFindMultiple(struct bptFile *bpt, void *key, int keySize, int valSize); |
|
88 |
+/* Find all values associated with key. Store this in ->val item of returned list. |
|
89 |
+ * Do a slRefFreeListAndVals() on list when done. */ |
|
86 | 90 |
|
87 | 91 |
void bptFileTraverse(struct bptFile *bpt, void *context, |
88 | 92 |
void (*callback)(void *context, void *key, int keySize, void *val, int valSize) ); |
89 | 93 |
/* Traverse bPlusTree on file, calling supplied callback function at each |
90 | 94 |
* leaf item. */ |
91 | 95 |
|
96 |
+void bptKeyAtPos(struct bptFile *bpt, bits64 itemPos, void *result); |
|
97 |
+/* Fill in result with the key at given itemPos. For first piece of data itemPos is 0 |
|
98 |
+ * and for last piece is bpt->itemCount - 1. Result must be at least bpt->keySize. |
|
99 |
+ * If result is a string it won't be zero terminated |
|
100 |
+ * by this routine. Use bptStringKeyAtPos instead. */ |
|
101 |
+ |
|
102 |
+void bptStringKeyAtPos(struct bptFile *bpt, bits64 itemPos, char *result, int maxResultSize); |
|
103 |
+/* Fill in result with the key at given itemPos. The maxResultSize should be 1+bpt->keySize |
|
104 |
+ * to accommodate zero termination of string. */ |
|
92 | 105 |
|
93 | 106 |
void bptFileCreate( |
94 | 107 |
void *itemArray, /* Sorted array of things to index. */ |
... | ... |
@@ -1,7 +1,9 @@ |
1 |
+/* Copyright (C) 2011 The Regents of the University of California |
|
2 |
+ * See README in this or parent directory for licensing information. */ |
|
3 |
+ |
|
1 | 4 |
#include "common.h" |
2 | 5 |
#include "base64.h" |
3 | 6 |
|
4 |
-static char const rcsid[] = "$Id: base64.c,v 1.6 2008/09/17 18:00:47 galt Exp $"; |
|
5 | 7 |
|
6 | 8 |
char *base64Encode(char *input, size_t inplen) |
7 | 9 |
/* Use base64 to encode a string. Returns one long encoded |
... | ... |
@@ -4,6 +4,8 @@ |
4 | 4 |
#define BBIFILE_H |
5 | 5 |
|
6 | 6 |
#include "cirTree.h" |
7 |
+#include "linefile.h" |
|
8 |
+#include "localmem.h" |
|
7 | 9 |
|
8 | 10 |
/* bigWig/bigBed file structure: |
9 | 11 |
* fixedWidthHeader |
... | ... |
@@ -18,7 +20,7 @@ |
18 | 20 |
* autoSqlOffset 8 bytes (for bigWig 0) (0 if no autoSql information) |
19 | 21 |
* totalSummaryOffset 8 bytes (0 in earlier versions of file lacking totalSummary) |
20 | 22 |
* uncompressBufSize 4 bytes (Size of uncompression buffer. 0 if uncompressed.) |
21 |
- * reserved 8 bytes (0 for now) |
|
23 |
+ * extensionOffset 8 bytes (Offset to header extension 0 if no such extension) |
|
22 | 24 |
* zoomHeaders there are zoomLevels number of these |
23 | 25 |
* reductionLevel 4 bytes |
24 | 26 |
* reserved 4 bytes |
... | ... |
@@ -31,6 +33,19 @@ |
31 | 33 |
* maxVal 8 bytes float (for bigBed maximum depth of coverage) |
32 | 34 |
* sumData 8 bytes float (for bigBed sum of coverage) |
33 | 35 |
* sumSquared 8 bytes float (for bigBed sum of coverage squared) |
36 |
+ * extendedHeader |
|
37 |
+ * extensionSize 2 size of extended header in bytes - currently 64 |
|
38 |
+ * extraIndexCount 2 number of extra fields we will be indexing |
|
39 |
+ * extraIndexListOffset 8 Offset to list of non-chrom/start/end indexes |
|
40 |
+ * reserved 48 All zeroes for now |
|
41 |
+ * extraIndexList - one of these for each extraIndex |
|
42 |
+ * type 2 Type of index. Always 0 for bPlusTree now |
|
43 |
+ * fieldCount 2 Number of fields used in this index. Always 1 for now |
|
44 |
+ * indexOffset 8 offset for this index in file |
|
45 |
+ * reserved 4 All zeroes for now |
|
46 |
+ * fieldList - one of these for each field being used in _this_ index |
|
47 |
+ * fieldId 2 index of field within record |
|
48 |
+ * reserved 2 All zeroes for now |
|
34 | 49 |
* chromosome b+ tree bPlusTree index |
35 | 50 |
* full data |
36 | 51 |
* sectionCount 8 bytes (item count for bigBeds) |
... | ... |
@@ -49,6 +64,7 @@ |
49 | 64 |
* sumData 4 bytes float |
50 | 65 |
* sumSquares 4 bytes float |
51 | 66 |
* zoom index cirTree index |
67 |
+ * extraIndexes [optional] bPlusTreeIndex for each extra field that is indexed |
|
52 | 68 |
* magic# 4 bytes - same as magic number at start of header |
53 | 69 |
*/ |
54 | 70 |
|
... | ... |
@@ -102,10 +118,17 @@ struct bbiFile |
102 | 118 |
bits16 fieldCount; /* Number of columns in bed version. */ |
103 | 119 |
bits16 definedFieldCount; /* Number of columns using bed standard definitions. */ |
104 | 120 |
bits64 asOffset; /* Offset to embedded null-terminated AutoSQL file. */ |
105 |
- bits64 totalSummaryOffset; /* Offset to total summary information if any. (On older files have to calculate) */ |
|
121 |
+ bits64 totalSummaryOffset; /* Offset to total summary information if any. |
|
122 |
+ (On older files have to calculate) */ |
|
106 | 123 |
bits32 uncompressBufSize; /* Size of uncompression buffer, 0 if uncompressed */ |
124 |
+ bits64 extensionOffset; /* Start of header extension block or 0 if none. */ |
|
107 | 125 |
struct cirTreeFile *unzoomedCir; /* Unzoomed data index in memory - may be NULL. */ |
108 | 126 |
struct bbiZoomLevel *levelList; /* List of zoom levels. */ |
127 |
+ |
|
128 |
+ /* Fields based on extension block. */ |
|
129 |
+ bits16 extensionSize; /* Size of extension block */ |
|
130 |
+ bits16 extraIndexCount; /* Number of extra indexes (on fields other than chrom,start,end */ |
|
131 |
+ bits64 extraIndexListOffset; /* Offset to list of extra indexes */ |
|
109 | 132 |
}; |
110 | 133 |
|
111 | 134 |
|
... | ... |
@@ -150,6 +173,14 @@ void bbiChromInfoKey(const void *va, char *keyBuf); |
150 | 173 |
void *bbiChromInfoVal(const void *va); |
151 | 174 |
/* Get val field out of bbiChromInfo. */ |
152 | 175 |
|
176 |
+char *bbiCachedChromLookup(struct bbiFile *bbi, int chromId, int lastChromId, |
|
177 |
+ char *chromBuf, int chromBufSize); |
|
178 |
+/* Return chromosome name corresponding to chromId. Because this is a bit expensive, |
|
179 |
+ * if you are doing this repeatedly pass in the chromId used in the previous call to |
|
180 |
+ * this in lastChromId, which will save it from doing the lookup again on the same |
|
181 |
+ * chromosome. Pass in -1 to lastChromId if this is the first time or if you can't be |
|
182 |
+ * bothered. The chromBufSize should be at greater or equal to bbi->keySize+1. */ |
|
183 |
+ |
|
153 | 184 |
struct bbiChromUsage |
154 | 185 |
/* Information on how many items per chromosome etc. Used by multipass bbiFile writers. */ |
155 | 186 |
{ |
... | ... |
@@ -249,7 +280,7 @@ boolean bbiSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 e |
249 | 280 |
struct bbiSummaryElement bbiTotalSummary(struct bbiFile *bbi); |
250 | 281 |
/* Return summary of entire file! */ |
251 | 282 |
|
252 |
-/****** Write side of things - implemented in bbiWrite.c ********/ |
|
283 |
+/****** Write side of things - implemented in bbiWrite.c. Few people need this. ********/ |
|
253 | 284 |
|
254 | 285 |
struct bbiBoundsArray |
255 | 286 |
/* Minimum info needed for r-tree indexer - where a section lives on disk and the |
... | ... |
@@ -287,7 +318,7 @@ void bbiSumOutStreamWrite(struct bbiSumOutStream *stream, struct bbiSummary *sum |
287 | 318 |
void bbiOutputOneSummaryFurtherReduce(struct bbiSummary *sum, |
288 | 319 |
struct bbiSummary **pTwiceReducedList, |
289 | 320 |
int doubleReductionSize, struct bbiBoundsArray **pBoundsPt, |
290 |
- struct bbiBoundsArray *boundsEnd, bits32 chromSize, struct lm *lm, |
|
321 |
+ struct bbiBoundsArray *boundsEnd, struct lm *lm, |
|
291 | 322 |
struct bbiSumOutStream *stream); |
292 | 323 |
/* Write out sum to file, keeping track of minimal info on it in *pBoundsPt, and also adding |
293 | 324 |
* it to second level summary. */ |
... | ... |
@@ -296,8 +327,6 @@ struct bbiSummary *bbiSummarySimpleReduce(struct bbiSummary *list, int reduction |
296 | 327 |
/* Do a simple reduction - where among other things the reduction level is an integral |
297 | 328 |
* multiple of the previous reduction level, and the list is sorted. Allocate result out of lm. */ |
298 | 329 |
|
299 |
-#define bbiMaxZoomLevels 10 /* Maximum zoom levels produced by writers. */ |
|
300 |
- |
|
301 | 330 |
void bbiWriteDummyHeader(FILE *f); |
302 | 331 |
/* Write out all-zero header, just to reserve space for it. */ |
303 | 332 |
|
... | ... |
@@ -325,9 +354,69 @@ void bbiChromUsageFree(struct bbiChromUsage **pUsage); |
325 | 354 |
void bbiChromUsageFreeList(struct bbiChromUsage **pList); |
326 | 355 |
/* free a list of bbiChromUsage structures */ |
327 | 356 |
|
328 |
-struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, |
|
329 |
- struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount); |
|
330 |
-/* Go through bed file and collect chromosomes and statistics. Free with bbiChromUsageFreeList */ |
|
357 |
+struct bbNamedFileChunk |
|
358 |
+/* A name associated with an offset into a possibly large file. Used for extra |
|
359 |
+ * indexes in bigBed files. */ |
|
360 |
+ { |
|
361 |
+ char *name; /* Name of chunk. */ |
|
362 |
+ bits64 offset; /* Start in file. */ |
|
363 |
+ bits64 size; /* Size in file. */ |
|
364 |
+ }; |
|
365 |
+ |
|
366 |
+struct bbExIndexMaker |
|
367 |
+/* A helper structure to make indexes beyond primary one. Just used for bigBeds */ |
|
368 |
+ { |
|
369 |
+ bits16 indexCount; /* Number of extra indexes. */ |
|
370 |
+ /* Kind of wish next four fields, all of which are arrays indexed |
|
371 |
+ * by the same thing, were a single array of a structure instead. */ |
|
372 |
+ bits16 *indexFields; /* array of field ids, one for each extra index. */ |
|
373 |
+ int *maxFieldSize; /* array of maximum sizes seen for this field. */ |
|
374 |
+ struct bbNamedFileChunk **chunkArrayArray; /* where we keep name/start/size triples */ |
|
375 |
+ bits64 *fileOffsets; /* array of file offsets where indexes starts. */ |
|
376 |
+ int recordCount; /* number of records in file. */ |
|
377 |
+ }; |
|
378 |
+ |
|
379 |
+struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, |
|
380 |
+ struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount); |
|
381 |
+/* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL |
|
382 |
+ * collect max field sizes there too. */ |
|
383 |
+ |
|
384 |
+#define bbiMaxZoomLevels 10 /* Max number of zoom levels */ |
|
385 |
+#define bbiResIncrement 4 /* Amount to reduce at each zoom level */ |
|
386 |
+ |
|
387 |
+int bbiCalcResScalesAndSizes(int aveSize, |
|
388 |
+ int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels]); |
|
389 |
+/* Fill in resScales with amount to zoom at each level, and zero out resSizes based |
|
390 |
+ * on average span. Returns the number of zoom levels we actually will use. */ |
|
391 |
+ |
|
392 |
+typedef struct bbiSummary *bbiWriteReducedOnceReturnReducedTwice( |
|
393 |
+ struct bbiChromUsage *usageList, int fieldCount, |
|
394 |
+ struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, |
|
395 |
+ int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress, |
|
396 |
+ struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart, |
|
397 |
+ struct bbiSummaryElement *totalSum); |
|
398 |
+/* Typedef for a function that writes out data reduced by factor of initial reduction, and |
|
399 |
+ * also returns an array of bbiSummaries for the next reduction level. */ |
|
400 |
+ |
|
401 |
+int bbiWriteZoomLevels( |
|
402 |
+ struct lineFile *lf, /* Input file. */ |
|
403 |
+ FILE *f, /* Output. */ |
|
404 |
+ int blockSize, /* Size of index block */ |
|
405 |
+ int itemsPerSlot, /* Number of data points bundled at lowest level. */ |
|
406 |
+ bbiWriteReducedOnceReturnReducedTwice writeReducedOnceReturnReducedTwice, /* callback */ |
|
407 |
+ int fieldCount, /* Number of fields in bed (4 for bedGraph) */ |
|
408 |
+ boolean doCompress, /* Do we compress. Answer really should be yes! */ |
|
409 |
+ bits64 dataSize, /* Size of data on disk (after compression if any). */ |
|
410 |
+ struct bbiChromUsage *usageList, /* Result from bbiChromUsageFromBedFile */ |
|
411 |
+ int resTryCount, int resScales[], int resSizes[], /* How much to zoom at each level */ |
|
412 |
+ bits32 zoomAmounts[bbiMaxZoomLevels], /* Fills in amount zoomed at each level. */ |
|
413 |
+ bits64 zoomDataOffsets[bbiMaxZoomLevels], /* Fills in where data starts for each zoom level. */ |
|
414 |
+ bits64 zoomIndexOffsets[bbiMaxZoomLevels], /* Fills in where index starts for each level. */ |
|
415 |
+ struct bbiSummaryElement *totalSum); |
|
416 |
+/* Write out all the zoom levels and return the number of levels written. Writes |
|
417 |
+ * actual zoom amount and the offsets of the zoomed data and index in the last three |
|
418 |
+ * parameters. Sorry for all the parameters - it was this or duplicate a big chunk of |
|
419 |
+ * code between bedToBigBed and bedGraphToBigWig. */ |
|
331 | 420 |
|
332 | 421 |
int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot); |
333 | 422 |
/* Count up number of sections needed for data. */ |
... | ... |
@@ -358,4 +447,8 @@ boolean bbiFileCheckSigs(char *fileName, bits32 sig, char *typeName); |
358 | 447 |
time_t bbiUpdateTime(struct bbiFile *bbi); |
359 | 448 |
/* return bbi->udc->updateTime */ |
360 | 449 |
|
450 |
+struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi, |
|
451 |
+ int chromId, bits32 start, bits32 end); |
|
452 |
+/* Return list of all summaries in region at given zoom level of bbiFile. */ |
|
453 |
+ |
|
361 | 454 |
#endif /* BBIFILE_H */ |
... | ... |
@@ -1,11 +1,13 @@ |
1 | 1 |
/* bbiRead - Big Binary Indexed file. Stuff that's common between bigWig and bigBed on the |
2 | 2 |
* read side. */ |
3 | 3 |
|
4 |
+/* Copyright (C) 2014 The Regents of the University of California |
|
5 |
+ * See README in this or parent directory for licensing information. */ |
|
6 |
+ |
|
4 | 7 |
#include "common.h" |
5 | 8 |
#include "linefile.h" |
6 | 9 |
#include "hash.h" |
7 | 10 |
#include "obscure.h" |
8 |
-#include "localmem.h" |
|
9 | 11 |
#include "zlibFace.h" |
10 | 12 |
#include "bPlusTree.h" |
11 | 13 |
#include "hmmstats.h" |
... | ... |
@@ -115,9 +117,7 @@ bbi->definedFieldCount = udcReadBits16(udc, isSwapped); |
115 | 117 |
bbi->asOffset = udcReadBits64(udc, isSwapped); |
116 | 118 |
bbi->totalSummaryOffset = udcReadBits64(udc, isSwapped); |
117 | 119 |
bbi->uncompressBufSize = udcReadBits32(udc, isSwapped); |
118 |
- |
|
119 |
-/* Skip over reserved area. */ |
|
120 |
-udcSeek(udc, 64); |
|
120 |
+bbi->extensionOffset = udcReadBits64(udc, isSwapped); |
|
121 | 121 |
|
122 | 122 |
/* Read zoom headers. */ |
123 | 123 |
int i; |
... | ... |
@@ -134,6 +134,15 @@ for (i=0; i<bbi->zoomLevels; ++i) |
134 | 134 |
slReverse(&levelList); |
135 | 135 |
bbi->levelList = levelList; |
136 | 136 |
|
137 |
+/* Deal with header extension if any. */ |
|
138 |
+if (bbi->extensionOffset != 0) |
|
139 |
+ { |
|
140 |
+ udcSeek(udc, bbi->extensionOffset); |
|
141 |
+ bbi->extensionSize = udcReadBits16(udc, isSwapped); |
|
142 |
+ bbi->extraIndexCount = udcReadBits16(udc, isSwapped); |
|
143 |
+ bbi->extraIndexListOffset = udcReadBits64(udc, isSwapped); |
|
144 |
+ } |
|
145 |
+ |
|
137 | 146 |
/* Attach B+ tree of chromosome names and ids. */ |
138 | 147 |
udcSeek(udc, bbi->chromTreeOffset); |
139 | 148 |
bbi->chromBpt = bptFileAttach(fileName, udc); |
... | ... |
@@ -157,6 +166,16 @@ if (bwf != NULL) |
157 | 166 |
} |
158 | 167 |
} |
159 | 168 |
|
169 |
+static void chromIdSizeHandleSwapped(boolean isSwapped, struct bbiChromIdSize *idSize) |
|
170 |
+/* Swap bytes in chromosome Id and Size as needed. */ |
|
171 |
+{ |
|
172 |
+if (isSwapped) |
|
173 |
+ { |
|
174 |
+ idSize->chromId = byteSwap32(idSize->chromId); |
|
175 |
+ idSize->chromSize = byteSwap32(idSize->chromSize); |
|
176 |
+ } |
|
177 |
+} |
|
178 |
+ |
|
160 | 179 |
|
161 | 180 |
struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf, |
162 | 181 |
char *chrom, bits32 start, bits32 end, bits32 *retChromId) |
... | ... |
@@ -165,8 +184,7 @@ struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeF |
165 | 184 |
struct bbiChromIdSize idSize; |
166 | 185 |
if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize))) |
167 | 186 |
return NULL; |
168 |
-if (bbi->isSwapped) |
|
169 |
- idSize.chromId = byteSwap32(idSize.chromId); |
|
187 |
+chromIdSizeHandleSwapped(bbi->isSwapped, &idSize); |
|
170 | 188 |
if (retChromId != NULL) |
171 | 189 |
*retChromId = idSize.chromId; |
172 | 190 |
return cirTreeFindOverlappingBlocks(ctf, idSize.chromId, start, end); |
... | ... |
@@ -186,15 +204,11 @@ struct chromNameCallbackContext *c = context; |
186 | 204 |
struct bbiChromInfo *info; |
187 | 205 |
struct bbiChromIdSize *idSize = val; |
188 | 206 |
assert(valSize == sizeof(*idSize)); |
207 |
+chromIdSizeHandleSwapped(c->isSwapped, idSize); |
|
189 | 208 |
AllocVar(info); |
190 | 209 |
info->name = cloneStringZ(key, keySize); |
191 | 210 |
info->id = idSize->chromId; |
192 | 211 |
info->size = idSize->chromSize; |
193 |
-if (c->isSwapped) |
|
194 |
- { |
|
195 |
- info->id = byteSwap32(info->id); |
|
196 |
- info->size = byteSwap32(info->size); |
|
197 |
- } |
|
198 | 212 |
slAddHead(&c->list, info); |
199 | 213 |
} |
200 | 214 |
|
... | ... |
@@ -215,6 +229,7 @@ bits32 bbiChromSize(struct bbiFile *bbi, char *chrom) |
215 | 229 |
struct bbiChromIdSize idSize; |
216 | 230 |
if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize))) |
217 | 231 |
return 0; |
232 |
+chromIdSizeHandleSwapped(bbi->isSwapped, &idSize); |
|
218 | 233 |
return idSize.chromSize; |
219 | 234 |
} |
220 | 235 |
|
... | ... |
@@ -303,6 +318,7 @@ sum->chromId = udcReadBits32(udc, isSwapped); |
303 | 318 |
sum->start = udcReadBits32(udc, isSwapped); |
304 | 319 |
sum->end = udcReadBits32(udc, isSwapped); |
305 | 320 |
sum->validCount = udcReadBits32(udc, isSwapped); |
321 |
+// looks like a bug to me, these should call udcReadFloat() |
|
306 | 322 |
udcMustReadOne(udc, sum->minVal); |
307 | 323 |
udcMustReadOne(udc, sum->maxVal); |
308 | 324 |
udcMustReadOne(udc, sum->sumData); |
... | ... |
@@ -310,6 +326,22 @@ udcMustReadOne(udc, sum->sumSquares); |
310 | 326 |
} |
311 | 327 |
#endif /* UNUSED */ |
312 | 328 |
|
329 |
+static void bbiSummaryHandleSwapped(struct bbiFile *bbi, struct bbiSummaryOnDisk *in) |
|
330 |
+/* Swap integer fields in summary as needed. */ |
|
331 |
+{ |
|
332 |
+if (bbi->isSwapped) |
|
333 |
+ { |
|
334 |
+ in->chromId = byteSwap32(in->chromId); |
|
335 |
+ in->start = byteSwap32(in->start); |
|
336 |
+ in->end = byteSwap32(in->end); |
|
337 |
+ in->validCount = byteSwap32(in->validCount); |
|
338 |
+ in->minVal = byteSwapFloat(in->minVal); |
|
339 |
+ in->maxVal = byteSwapFloat(in->maxVal); |
|
340 |
+ in->sumData = byteSwapFloat(in->sumData); |
|
341 |
+ in->sumSquares = byteSwapFloat(in->sumSquares); |
|
342 |
+ } |
|
343 |
+} |
|
344 |
+ |
|
313 | 345 |
static struct bbiSummary *bbiSummaryFromOnDisk(struct bbiSummaryOnDisk *in) |
314 | 346 |
/* Create a bbiSummary unlinked to anything from input in onDisk format. */ |
315 | 347 |
{ |
... | ... |
@@ -326,7 +358,7 @@ out->sumSquares = in->sumSquares; |
326 | 358 |
return out; |
327 | 359 |
} |
328 | 360 |
|
329 |
-static struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi, |
|
361 |
+struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi, |
|
330 | 362 |
int chromId, bits32 start, bits32 end) |
331 | 363 |
/* Return list of all summaries in region at given zoom level of bbiFile. */ |
332 | 364 |
{ |
... | ... |
@@ -386,6 +418,7 @@ for (block = blockList; block != NULL; ) |
386 | 418 |
{ |
387 | 419 |
dSum = (void *)blockPt; |
388 | 420 |
blockPt += sizeof(*dSum); |
421 |
+ bbiSummaryHandleSwapped(bbi, dSum); |
|
389 | 422 |
if (dSum->chromId == chromId) |
390 | 423 |
{ |
391 | 424 |
int s = max(dSum->start, start); |
... | ... |
@@ -409,6 +442,23 @@ slReverse(&sumList); |
409 | 442 |
return sumList; |
410 | 443 |
} |
411 | 444 |
|
445 |
+static int normalizeCount(struct bbiSummaryElement *el, double countFactor, |
|
446 |
+ double minVal, double maxVal, double sumData, double sumSquares) |
|
447 |
+/* normalize statistics to be based on an integer number of valid bases |
|
448 |
+ * Integer value is the smallest integer not less than countFactor */ |
|
449 |
+{ |
|
450 |
+bits32 validCount = ceil(countFactor); |
|
451 |
+double normFactor = (double)validCount/countFactor; |
|
452 |
+ |
|
453 |
+el->validCount = validCount; |
|
454 |
+el->minVal = minVal; |
|
455 |
+el->maxVal = maxVal; |
|
456 |
+el->sumData = sumData * normFactor; |
|
457 |
+el->sumSquares = sumSquares * normFactor; |
|
458 |
+ |
|
459 |
+return validCount; |
|
460 |
+} |
|
461 |
+ |
|
412 | 462 |
static bits32 bbiSummarySlice(struct bbiFile *bbi, bits32 baseStart, bits32 baseEnd, |
413 | 463 |
struct bbiSummary *sumList, struct bbiSummaryElement *el) |
414 | 464 |
/* Update retVal with the average value if there is any data in interval. Return number |
... | ... |
@@ -421,6 +471,7 @@ if (sumList != NULL) |
421 | 471 |
double minVal = sumList->minVal; |
422 | 472 |
double maxVal = sumList->maxVal; |
423 | 473 |
double sumData = 0, sumSquares = 0; |
474 |
+ double countFactor = 0.0; |
|
424 | 475 |
|
425 | 476 |
struct bbiSummary *sum; |
426 | 477 |
for (sum = sumList; sum != NULL && sum->start < baseEnd; sum = sum->next) |
... | ... |
@@ -429,7 +480,7 @@ if (sumList != NULL) |
429 | 480 |
if (overlap > 0) |
430 | 481 |
{ |
431 | 482 |
double overlapFactor = (double)overlap / (sum->end - sum->start); |
432 |
- validCount += sum->validCount * overlapFactor; |
|
483 |
+ countFactor += sum->validCount * overlapFactor; |
|
433 | 484 |
sumData += sum->sumData * overlapFactor; |
434 | 485 |
sumSquares += sum->sumSquares * overlapFactor; |
435 | 486 |
if (maxVal < sum->maxVal) |
... | ... |
@@ -438,24 +489,20 @@ if (sumList != NULL) |
438 | 489 |
minVal = sum->minVal; |
439 | 490 |
} |
440 | 491 |
} |
441 |
- if (validCount > 0) |
|
442 |
- { |
|
443 |
- el->validCount = validCount; |
|
444 |
- el->minVal = minVal; |
|
445 |
- el->maxVal = maxVal; |
|
446 |
- el->sumData = sumData; |
|
447 |
- el->sumSquares = sumSquares; |
|
448 |
- } |
|
492 |
+ |
|
493 |
+ if (countFactor > 0) |
|
494 |
+ validCount = normalizeCount(el, countFactor, minVal, maxVal, sumData, sumSquares); |
|
449 | 495 |
} |
450 | 496 |
return validCount; |
451 | 497 |
} |
452 | 498 |
|
453 | 499 |
static int bbiChromId(struct bbiFile *bbi, char *chrom) |
454 |
-/* Return chromosome size */ |
|
500 |
+/* Return chromosome Id */ |
|
455 | 501 |
{ |
456 | 502 |
struct bbiChromIdSize idSize; |
457 | 503 |
if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize))) |
458 | 504 |
return -1; |
505 |
+chromIdSizeHandleSwapped(bbi->isSwapped, &idSize); |
|
459 | 506 |
return idSize.chromId; |
460 | 507 |
} |
461 | 508 |
|
... | ... |
@@ -501,10 +548,11 @@ static bits32 bbiIntervalSlice(struct bbiFile *bbi, bits32 baseStart, bits32 bas |
501 | 548 |
/* Update retVal with the average value if there is any data in interval. Return number |
502 | 549 |
* of valid data bases in interval. */ |
503 | 550 |
{ |
504 |
-double validCount = 0; |
|
551 |
+bits32 validCount = 0; |
|
505 | 552 |
|
506 | 553 |
if (intervalList != NULL) |
507 | 554 |
{ |
555 |
+ double countFactor = 0; |
|
508 | 556 |
struct bbiInterval *interval; |
509 | 557 |
double sumData = 0, sumSquares = 0; |
510 | 558 |
double minVal = intervalList->val; |
... | ... |
@@ -519,7 +567,7 @@ if (intervalList != NULL) |
519 | 567 |
int intervalSize = interval->end - interval->start; |
520 | 568 |
double overlapFactor = (double)overlap / intervalSize; |
521 | 569 |
double intervalWeight = intervalSize * overlapFactor; |
522 |
- validCount += intervalWeight; |
|
570 |
+ countFactor += intervalWeight; |
|
523 | 571 |
sumData += interval->val * intervalWeight; |
524 | 572 |
sumSquares += interval->val * interval->val * intervalWeight; |
525 | 573 |
if (maxVal < interval->val) |
... | ... |
@@ -528,13 +576,10 @@ if (intervalList != NULL) |
528 | 576 |
minVal = interval->val; |
529 | 577 |
} |
530 | 578 |
} |
531 |
- el->validCount = round(validCount); |
|
532 |
- el->minVal = minVal; |
|
533 |
- el->maxVal = maxVal; |
|
534 |
- el->sumData = sumData; |
|
535 |
- el->sumSquares = sumSquares; |
|
579 |
+ |
|
580 |
+ validCount = normalizeCount(el, countFactor, minVal, maxVal, sumData, sumSquares); |
|
536 | 581 |
} |
537 |
-return round(validCount); |
|
582 |
+return validCount; |
|
538 | 583 |
} |
539 | 584 |
|
540 | 585 |
|
... | ... |
@@ -547,7 +592,7 @@ struct bbiInterval *intervalList = NULL, *interval; |
547 | 592 |
struct lm *lm = lmInit(0); |
548 | 593 |
intervalList = (*fetchIntervals)(bbi, chrom, start, end, lm); |
549 | 594 |
boolean result = FALSE; |
550 |
-if (intervalList != NULL); |
|
595 |
+if (intervalList != NULL) |
|
551 | 596 |
{ |
552 | 597 |
int i; |
553 | 598 |
bits32 baseStart = start, baseEnd; |
... | ... |
@@ -736,3 +781,17 @@ time_t bbiUpdateTime(struct bbiFile *bbi) |
736 | 781 |
struct udcFile *udc = bbi->udc; |
737 | 782 |
return udcUpdateTime(udc); |
738 | 783 |
} |
784 |
+ |
|
785 |
+char *bbiCachedChromLookup(struct bbiFile *bbi, int chromId, int lastChromId, |
|
786 |
+ char *chromBuf, int chromBufSize) |
|
787 |
+/* Return chromosome name corresponding to chromId. Because this is a bit expensive, |
|
788 |
+ * if you are doing this repeatedly pass in the chromId used in the previous call to |
|
789 |
+ * this in lastChromId, which will save it from doing the lookup again on the same |
|
790 |
+ * chromosome. Pass in -1 to lastChromId if this is the first time or if you can't be |
|
791 |
+ * bothered. The chromBufSize should be at greater or equal to bbi->keySize+1. */ |
|
792 |
+{ |
|
793 |
+if (chromId != lastChromId) |
|
794 |
+ bptStringKeyAtPos(bbi->chromBpt, chromId, chromBuf, chromBufSize); |
|
795 |
+return chromBuf; |
|
796 |
+} |
|
797 |
+ |
... | ... |
@@ -1,8 +1,12 @@ |
1 |
+/* bbiWrite.c - Routines to help write bigWig and bigBed files. See also bbiFile.h */ |
|
2 |
+ |
|
3 |
+/* Copyright (C) 2014 The Regents of the University of California |
|
4 |
+ * See README in this or parent directory for licensing information. */ |
|
5 |
+ |
|
1 | 6 |
#include "common.h" |
2 | 7 |
#include "hash.h" |
3 | 8 |
#include "linefile.h" |
4 | 9 |
#include "sqlNum.h" |
5 |
-#include "localmem.h" |
|
6 | 10 |
#include "zlibFace.h" |
7 | 11 |
#include "cirTree.h" |
8 | 12 |
#include "bPlusTree.h" |
... | ... |
@@ -48,23 +52,26 @@ int chromCount = slCount(usageList); |
48 | 52 |
struct bbiChromUsage *usage; |
49 | 53 |
|
50 | 54 |
/* Allocate and fill in array from list. */ |
51 |
-struct bbiChromInfo *chromInfoArray; |
|
52 |
-AllocArray(chromInfoArray, chromCount); |
|
53 |
-int i; |
|
55 |
+struct bbiChromInfo *chromInfoArray = NULL; |
|
54 | 56 |
int maxChromNameSize = 0; |
55 |
-for (i=0, usage = usageList; i<chromCount; ++i, usage = usage->next) |
|
57 |
+if (chromCount > 0) |
|
56 | 58 |
{ |
57 |
- char *chromName = usage->name; |
|
58 |
- int len = strlen(chromName); |
|
59 |
- if (len > maxChromNameSize) |
|
60 |
- maxChromNameSize = len; |
|
61 |
- chromInfoArray[i].name = chromName; |
|
62 |
- chromInfoArray[i].id = usage->id; |
|
63 |
- chromInfoArray[i].size = usage->size; |
|
64 |
- } |
|
59 |
+ AllocArray(chromInfoArray, chromCount); |
|
60 |
+ int i; |
|
61 |
+ for (i=0, usage = usageList; i<chromCount; ++i, usage = usage->next) |
|
62 |
+ { |
|
63 |
+ char *chromName = usage->name; |
|
64 |
+ int len = strlen(chromName); |
|
65 |
+ if (len > maxChromNameSize) |
|
66 |
+ maxChromNameSize = len; |
|
67 |
+ chromInfoArray[i].name = chromName; |
|
68 |
+ chromInfoArray[i].id = usage->id; |
|
69 |
+ chromInfoArray[i].size = usage->size; |
|
70 |
+ } |
|
65 | 71 |
|
66 |
-/* Sort so the b-Tree actually works. */ |
|
67 |
-qsort(chromInfoArray, chromCount, sizeof(chromInfoArray[0]), bbiChromInfoCmp); |
|
72 |
+ /* Sort so the b-Tree actually works. */ |
|
73 |
+ qsort(chromInfoArray, chromCount, sizeof(chromInfoArray[0]), bbiChromInfoCmp); |
|
74 |
+ } |
|
68 | 75 |
|
69 | 76 |
/* Write chromosome bPlusTree */ |
70 | 77 |
int chromBlockSize = min(blockSize, chromCount); |
... | ... |
@@ -133,11 +140,40 @@ for (el = *pList; el != NULL; el = next) |
133 | 140 |
*pList = NULL; |
134 | 141 |
} |
135 | 142 |
|
136 |
-struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, |
|
137 |
- struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount) |
|
138 |
-/* Go through bed file and collect chromosomes and statistics. */ |
|
143 |
+int bbExIndexMakerMaxIndexField(struct bbExIndexMaker *eim) |
|
144 |
+/* Return the maximum field we have to index. */ |
|
139 | 145 |
{ |
140 |
-char *row[3]; |
|
146 |
+int maxIx = 0; |
|
147 |
+int i; |
|
148 |
+for (i=0; i<eim->indexCount; ++i) |
|
149 |
+ { |
|
150 |
+ int ix = eim->indexFields[i]; |
|
151 |
+ if (ix > maxIx) |
|
152 |
+ maxIx = ix; |
|
153 |
+ } |
|
154 |
+return maxIx; |
|
155 |
+} |
|
156 |
+ |
|
157 |
+void bbExIndexMakerUpdateMaxFieldSize(struct bbExIndexMaker *eim, char **row) |
|
158 |
+/* Fold in information about row into bbExIndexMaker into eim->maxFieldSize */ |
|
159 |
+{ |
|
160 |
+int i; |
|
161 |
+for (i=0; i<eim->indexCount; ++i) |
|
162 |
+ { |
|
163 |
+ int rowIx = eim->indexFields[i]; |
|
164 |
+ int size = strlen(row[rowIx]); |
|
165 |
+ if (size > eim->maxFieldSize[i]) |
|
166 |
+ eim->maxFieldSize[i] = size; |
|
167 |
+ } |
|
168 |
+} |
|
169 |
+ |
|
170 |
+struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, |
|
171 |
+ struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount) |
|
172 |
+/* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL |
|
173 |
+ * collect max field sizes there too. */ |
|
174 |
+{ |
|
175 |
+int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1); |
|
176 |
+char *row[maxRowSize]; |
|
141 | 177 |
struct hash *uniqHash = hashNew(0); |
142 | 178 |
struct bbiChromUsage *usage = NULL, *usageList = NULL; |
143 | 179 |
int lastStart = -1; |
... | ... |
@@ -149,20 +185,17 @@ lineFileRemoveInitialCustomTrackLines(lf); |
149 | 185 |
|
150 | 186 |
for (;;) |
151 | 187 |
{ |
152 |
- int rowSize = lineFileChopNext(lf, row, ArraySize(row)); |
|
188 |
+ int rowSize = lineFileChopNext(lf, row, maxRowSize); |
|
153 | 189 |
if (rowSize == 0) |
154 | 190 |
break; |
155 |
- lineFileExpectWords(lf, 3, rowSize); |
|
191 |
+ lineFileExpectAtLeast(lf, maxRowSize, rowSize); |
|
156 | 192 |
char *chrom = row[0]; |
157 | 193 |
int start = lineFileNeedNum(lf, row, 1); |
158 | 194 |
int end = lineFileNeedNum(lf, row, 2); |
159 |
- if (start >= end) |
|
195 |
+ if (eim != NULL) |
|
196 |
+ bbExIndexMakerUpdateMaxFieldSize(eim, row); |
|
197 |
+ if (start > end) |
|
160 | 198 |
{ |
161 |
- if (start == end) |
|
162 |
- errAbort("line %d of %s: start and end coordinates the same\n" |
|
163 |
- "They need to be at least one apart" |
|
164 |
- , lf->lineIx, lf->fileName); |
|
165 |
- else |
|
166 | 199 |
errAbort("end (%d) before start (%d) line %d of %s", |
167 | 200 |
end, start, lf->lineIx, lf->fileName); |
168 | 201 |
} |
... | ... |
@@ -204,13 +237,130 @@ for (;;) |
204 | 237 |
lastStart = start; |
205 | 238 |
} |
206 | 239 |
slReverse(&usageList); |
240 |
+double aveSize = 0; |
|
241 |
+if (bedCount > 0) |
|
242 |
+ aveSize = (double)totalBases/bedCount; |
|
207 | 243 |
*retMinDiff = minDiff; |
208 |
-*retAveSize = (double)totalBases/bedCount; |
|
244 |
+*retAveSize = aveSize; |
|
209 | 245 |
*retBedCount = bedCount; |
210 | 246 |
freeHash(&uniqHash); |
211 | 247 |
return usageList; |
212 | 248 |
} |
213 | 249 |
|
250 |
+int bbiCalcResScalesAndSizes(int aveSize, |
|
251 |
+ int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels]) |
|
252 |
+/* Fill in resScales with amount to zoom at each level, and zero out resSizes based |
|
253 |
+ * on average span. Returns the number of zoom levels we actually will use. */ |
|
254 |
+{ |
|
255 |
+int resTryCount = bbiMaxZoomLevels, resTry; |
|
256 |
+int resIncrement = bbiResIncrement; |
|
257 |
+int minZoom = 10; |
|
258 |
+int res = aveSize; |
|
259 |
+if (res < minZoom) |
|
260 |
+ res = minZoom; |
|
261 |
+for (resTry = 0; resTry < resTryCount; ++resTry) |
|
262 |
+ { |
|
263 |
+ resSizes[resTry] = 0; |
|
264 |
+ resScales[resTry] = res; |
|
265 |
+ // if aveSize is large, then the initial value of res is large, and we |
|
266 |
+ // and we cannot do all 10 levels without overflowing res* integers and other related variables. |
|
267 |
+ if (res > 1000000000) |
|
268 |
+ { |
|
269 |
+ resTryCount = resTry + 1; |
|
270 |
+ verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount); |
|
271 |
+ break; |
|
272 |
+ } |
|
273 |
+ res *= resIncrement; |
|
274 |
+ } |
|
275 |
+return resTryCount; |
|
276 |
+} |
|
277 |
+ |
|
278 |
+int bbiWriteZoomLevels( |
|
279 |
+ struct lineFile *lf, /* Input file. */ |
|
280 |
+ FILE *f, /* Output. */ |
|
281 |
+ int blockSize, /* Size of index block */ |
|
282 |
+ int itemsPerSlot, /* Number of data points bundled at lowest level. */ |
|
283 |
+ bbiWriteReducedOnceReturnReducedTwice writeReducedOnceReturnReducedTwice, /* callback */ |
|
284 |
+ int fieldCount, /* Number of fields in bed (4 for bedGraph) */ |
|
285 |
+ boolean doCompress, /* Do we compress. Answer really should be yes! */ |
|
286 |
+ bits64 dataSize, /* Size of data on disk (after compression if any). */ |
|
287 |
+ struct bbiChromUsage *usageList, /* Result from bbiChromUsageFromBedFile */ |
|
288 |
+ int resTryCount, int resScales[], int resSizes[], /* How much to zoom at each level */ |
|
289 |
+ bits32 zoomAmounts[bbiMaxZoomLevels], /* Fills in amount zoomed at each level. */ |
|
290 |
+ bits64 zoomDataOffsets[bbiMaxZoomLevels], /* Fills in where data starts for each zoom level. */ |
|
291 |
+ bits64 zoomIndexOffsets[bbiMaxZoomLevels], /* Fills in where index starts for each level. */ |
|
292 |
+ struct bbiSummaryElement *totalSum) |
|
293 |
+/* Write out all the zoom levels and return the number of levels written. Writes |
|
294 |
+ * actual zoom amount and the offsets of the zoomed data and index in the last three |
|
295 |
+ * parameters. Sorry for all the parameters - it was this or duplicate a big chunk of |
|
296 |
+ * code between bedToBigBed and bedGraphToBigWig. */ |
|
297 |
+{ |
|
298 |
+/* Write out first zoomed section while storing in memory next zoom level. */ |
|
299 |
+assert(resTryCount > 0); |
|
300 |
+int maxReducedSize = dataSize/2; |
|
301 |
+int initialReduction = 0, initialReducedCount = 0; |
|
302 |
+ |
|
303 |
+/* Figure out initialReduction for zoom - one that is maxReducedSize or less. */ |
|
304 |
+int resTry; |
|
305 |
+for (resTry = 0; resTry < resTryCount; ++resTry) |
|
306 |
+ { |
|
307 |
+ bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk); |
|
308 |
+ if (doCompress) |
|
309 |
+ reducedSize /= 2; // Estimate! |
|
310 |
+ if (reducedSize <= maxReducedSize) |
|
311 |
+ { |
|
312 |
+ initialReduction = resScales[resTry]; |
|
313 |
+ initialReducedCount = resSizes[resTry]; |
|
314 |
+ break; |
|
315 |
+ } |
|
316 |
+ } |
|
317 |
+verbose(2, "initialReduction %d, initialReducedCount = %d\n", |
|
318 |
+ initialReduction, initialReducedCount); |
|
319 |
+ |
|
320 |
+/* Force there to always be at least one zoom. It may waste a little space on small |
|
321 |
+ * files, but it makes files more uniform, and avoids special case code for calculating |
|
322 |
+ * overall file summary. */ |
|
323 |
+if (initialReduction == 0) |
|
324 |
+ { |
|
325 |
+ initialReduction = resScales[0]; |
|
326 |
+ initialReducedCount = resSizes[0]; |
|
327 |
+ } |
|
328 |
+ |
|
329 |
+/* Call routine to make the initial zoom level and also a bit of work towards further levels. */ |
|
330 |
+struct lm *lm = lmInit(0); |
|
331 |
+int zoomIncrement = bbiResIncrement; |
|
332 |
+lineFileRewind(lf); |
|
333 |
+struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList, fieldCount, |
|
334 |
+ lf, initialReduction, initialReducedCount, |
|
335 |
+ zoomIncrement, blockSize, itemsPerSlot, doCompress, lm, |
|
336 |
+ f, &zoomDataOffsets[0], &zoomIndexOffsets[0], totalSum); |
|
337 |
+verboseTime(2, "writeReducedOnceReturnReducedTwice"); |
|
338 |
+zoomAmounts[0] = initialReduction; |
|
339 |
+int zoomLevels = 1; |
|
340 |
+ |
|
341 |
+/* Loop around to do any additional levels of zoom. */ |
|
342 |
+int zoomCount = initialReducedCount; |
|
343 |
+int reduction = initialReduction * zoomIncrement; |
|
344 |
+while (zoomLevels < bbiMaxZoomLevels) |
|
345 |
+ { |
|
346 |
+ int rezoomCount = slCount(rezoomedList); |
|
347 |
+ if (rezoomCount >= zoomCount) |
|
348 |
+ break; |
|
349 |
+ zoomCount = rezoomCount; |
|
350 |
+ zoomDataOffsets[zoomLevels] = ftell(f); |
|
351 |
+ zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, |
|
352 |
+ blockSize, itemsPerSlot, doCompress, f); |
|
353 |
+ zoomAmounts[zoomLevels] = reduction; |
|
354 |
+ ++zoomLevels; |
|
355 |
+ reduction *= zoomIncrement; |
|
356 |
+ rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm); |
|
357 |
+ } |
|
358 |
+lmCleanup(&lm); |
|
359 |
+verboseTime(2, "further reductions"); |
|
360 |
+return zoomLevels; |
|
361 |
+} |
|
362 |
+ |
|
363 |
+ |
|
214 | 364 |
int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot) |
215 | 365 |
/* Count up number of sections needed for data. */ |
216 | 366 |
{ |
... | ... |
@@ -517,7 +667,7 @@ if (elCount >= stream->allocCount) |
517 | 667 |
void bbiOutputOneSummaryFurtherReduce(struct bbiSummary *sum, |
518 | 668 |
struct bbiSummary **pTwiceReducedList, |
519 | 669 |
int doubleReductionSize, struct bbiBoundsArray **pBoundsPt, |
520 |
- struct bbiBoundsArray *boundsEnd, bits32 chromSize, struct lm *lm, |
|
670 |
+ struct bbiBoundsArray *boundsEnd, struct lm *lm, |
|
521 | 671 |
struct bbiSumOutStream *stream) |
522 | 672 |
/* Write out sum to file, keeping track of minimal info on it in *pBoundsPt, and also adding |
523 | 673 |
* it to second level summary. */ |
... | ... |
@@ -1,7 +1,39 @@ |
1 | 1 |
/* bigBed - interface to binary file with bed-style values (that is a bunch of |
2 | 2 |
* possibly overlapping regions. |
3 | 3 |
* |
4 |
- * This shares a lot with the bigWig module. */ |
|
4 |
+ * This shares a lot with the bigWig module. |
|
5 |
+ * |
|
6 |
+ * Most of the functions here are concerned with reading bigBed files. There's |
|
7 |
+ * two common things you want to do with a bigBed, either stream through everything in it, |
|
8 |
+ * or just read the parts that overlap a interval within the genome. The files |
|
9 |
+ * are optimized for interval queries, but streaming through them is not difficult either. |
|
10 |
+ * |
|
11 |
+ * To query an interval: |
|
12 |
+ * struct bbiFile *bbi = bigBedFileOpen(fileName); |
|
13 |
+ * struct lm *lm = lmInit(0); // Memory pool to hold returned list |
|
14 |
+ * struct bigBedInterval *list = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); |
|
15 |
+ * struct bigBedInterval *el; |
|
16 |
+ * for (el = list; el != NULL; el = el->next) |
|
17 |
+ * // do something involving chrom, el->start, el->end |
|
18 |
+ * lmCleanup(&lm); // typically do this after each query |
|
19 |
+ * bigBedFileClose(&bbi); // typically only do this when finished all queries |
|
20 |
+ * |
|
21 |
+ * To stream through whole file |
|
22 |
+ * struct bbiFile *bbi = bigBedFileOpen(fileName); |
|
23 |
+ * struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); |
|
24 |
+ * for (chrom = chromList; chrom != NULL; chrom = chrom->next) |
|
25 |
+ * { |
|
26 |
+ * struct lm *lm = lmInit(0); |
|
27 |
+ * struct bigBedInterval *list = bigBedIntervalQuery(bbi,chrom->name,0,chrom->size,0,lm); |
|
28 |
+ * struct bigBedInterval *el; |
|
29 |
+ * for (el = list; el != NULL; el = el->next) |
|
30 |
+ * // do something involving chrom, el->start, el->end |
|
31 |
+ * lmCleanup(&lm); |
|
32 |
+ * } |
|
33 |
+ * bigBedFileClose(&bbi); |
|
34 |
+ * |
|
35 |
+ * The processes for streaming through or doing interval queries on a bigWig file are very |
|
36 |
+ * similar. */ |
|
5 | 37 |
|
6 | 38 |
#ifndef BIGBED_H |
7 | 39 |
#define BIGBED_H |
... | ... |
@@ -11,30 +43,23 @@ |
11 | 43 |
#endif |
12 | 44 |
|
13 | 45 |
struct bigBedInterval |
14 |
-/* A partially parsed out bed record plus some extra fields. */ |
|
46 |
+/* A partially parsed out bed record plus some extra fields. Use this directly |
|
47 |
+ * or convert it to an array of characters with bigBedIntervalToRow. */ |
|
15 | 48 |
{ |
16 | 49 |
struct bigBedInterval *next; /* Next in list. */ |
17 | 50 |
bits32 start, end; /* Range inside chromosome - half open zero based. */ |
18 | 51 |
char *rest; /* Rest of line. May be NULL*/ |
52 |
+ bits32 chromId; /* ID of chromosome. */ |
|
19 | 53 |
}; |
20 | 54 |
|
21 |
-struct ppBed |
|
22 |
-/* A partially parsed out bed record plus some extra fields. */ |
|
23 |
- { |
|
24 |
- struct ppBed *next; /* Next in list. */ |
|
25 |
- char *chrom; /* Chromosome name (not allocated here) */ |
|
26 |
- bits32 start, end; /* Range inside chromosome - half open zero based. */ |
|
27 |
- char *rest; /* The rest of the bed. */ |
|
28 |
- bits64 fileOffset; /* File offset. */ |
|
29 |
- bits32 chromId; /* Chromosome ID. */ |
|
30 |
- }; |
|
55 |
+/*** Routines to open & close bigBed files, and to do chromosome range queries on them. ***/ |
|
31 | 56 |
|
32 | 57 |
struct bbiFile *bigBedFileOpen(char *fileName); |
33 | 58 |
/* Open up big bed file. Free this up with bbiFileClose. */ |
34 | 59 |
|
35 | 60 |
#define bigBedFileClose(a) bbiFileClose(a) |
36 | 61 |
|
37 |
-struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom, |
|
62 |
+struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom, |
|
38 | 63 |
bits32 start, bits32 end, int maxItems, struct lm *lm); |
39 | 64 |
/* Get data for interval. Return list allocated out of lm. Set maxItems to maximum |
40 | 65 |
* number of items to return, or to 0 for all items. */ |
... | ... |
@@ -43,7 +68,7 @@ int bigBedIntervalToRow(struct bigBedInterval *interval, char *chrom, char *star |
43 | 68 |
char **row, int rowSize); |
44 | 69 |
/* Convert bigBedInterval into an array of chars equivalent to what you'd get by |
45 | 70 |
* parsing the bed file. The startBuf and endBuf are used to hold the ascii representation of |
46 |
- * start and end. Note that the interval->rest string will have zeroes inserted as a side effect. |
|
71 |
+ * start and end. Note that the interval->rest string will have zeroes inserted as a side effect. |
|
47 | 72 |
* Returns number of fields in row. */ |
48 | 73 |
|
49 | 74 |
boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, |
... | ... |
@@ -59,17 +84,50 @@ boolean bigBedSummaryArrayExtended(struct bbiFile *bbi, char *chrom, bits32 star |
59 | 84 |
/* Get extended summary information for summarySize evenely spaced elements into |
60 | 85 |
* the summary array. */ |
61 | 86 |
|
87 |
+/*** Some routines for accessing bigBed items via name. ***/ |
|
88 |
+ |
|
89 |
+struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, struct bptFile *index, |
|
90 |
+ int fieldIx, char *name, struct lm *lm); |
|
91 |
+/* Return list of intervals matching file. These intervals will be allocated out of lm. */ |
|
92 |
+ |
|
93 |
+struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, struct bptFile *index, |
|
94 |
+ int fieldIx, char **names, int nameCount, struct lm *lm); |
|
95 |
+/* Fetch all records matching any of the names. Using given index on given field. |
|
96 |
+ * Return list is allocated out of lm. */ |
|
97 |
+ |
|
98 |
+int bigBedIntervalToRowLookupChrom(struct bigBedInterval *interval, |
|
99 |
+ struct bigBedInterval *prevInterval, struct bbiFile *bbi, |
|
100 |
+ char *chromBuf, int chromBufSize, char *startBuf, char *endBuf, char **row, int rowSize); |
|
101 |
+/* Convert bigBedInterval to array of chars equivalend to what you'd get by parsing the |
|
102 |
+ * bed file. If you already know what chromosome the interval is on use the simpler |
|
103 |
+ * bigBedIntervalToRow. This one will look up the chromosome based on the chromId field |
|
104 |
+ * of the interval, which is relatively time consuming. To avoid doing this unnecessarily |
|
105 |
+ * pass in a non-NULL prevInterval, and if the chromId is the same on prevInterval as this, |
|
106 |
+ * it will avoid the lookup. The chromBufSize should be at greater or equal to |
|
107 |
+ * bbi->chromBpt->keySize+1. The startBuf and endBuf are used to hold the ascii representation of |
|
108 |
+ * start and end, and should be 16 bytes. Note that the interval->rest string will have zeroes |
|
109 |
+ * inserted as a side effect. Returns number of fields in row. */ |
|
110 |
+ |
|
111 |
+void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f); |
|
112 |
+/* Write out big bed interval list to bed file, looking up chromosome. */ |
|
113 |
+ |
|
114 |
+/** Routines to access other data from a bigBed file. */ |
|
115 |
+ |
|
62 | 116 |
bits64 bigBedItemCount(struct bbiFile *bbi); |
63 | 117 |
/* Return total items in file. */ |
64 | 118 |
|
65 | 119 |
char *bigBedAutoSqlText(struct bbiFile *bbi); |
66 | 120 |
/* Get autoSql text if any associated with file. Do a freeMem of this when done. */ |
67 | 121 |
|
68 |
-struct asObject *bigBedAs(struct bbiFile *bbi); |
|
69 |
-/* Get autoSql object definition if any associated with file. */ |
|
70 |
- |
|
71 | 122 |
boolean bigBedFileCheckSigs(char *fileName); |
72 | 123 |
/* check file signatures at beginning and end of file */ |
73 | 124 |
|
125 |
+struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName, int *retFieldIx); |
|
126 |
+/* Return index associated with fieldName. Aborts if no such index. Optionally return |
|
127 |
+ * index in a row of this field. */ |
|
128 |
+ |
|
129 |
+struct slName *bigBedListExtraIndexes(struct bbiFile *bbi); |
|
130 |
+/* Return list of names of extra indexes beyond primary chrom:start-end one" */ |
|
131 |
+ |
|
74 | 132 |
#endif /* BIGBED_H */ |
75 | 133 |
|
... | ... |
@@ -27,6 +27,18 @@ |
27 | 27 |
#include "bits.h" |
28 | 28 |
#endif |
29 | 29 |
|
30 |
+void bigWigFileCreateEx( |
|
31 |
+ char *inName, /* Input file in ascii wiggle format. */ |
|
32 |
+ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */ |
|
33 |
+ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */ |
|
34 |
+ int itemsPerSlot, /* Number of items in lowest level of tree. 512 is good. */ |
|
35 |
+ boolean clipDontDie, /* If TRUE then clip items off end of chrom rather than dying. */ |
|
36 |
+ boolean compress, /* If TRUE then compress data. */ |
|
37 |
+ boolean keepAllChromosomes, /* If TRUE then store all chromosomes in chromosomal b-tree. */ |
|
38 |
+ boolean fixedSummaries, /* If TRUE then impose fixed summary levels. */ |
|
39 |
+ char *outName); |
|
40 |
+/* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) |
|
41 |
+ * to binary big wig format. */ |
|
30 | 42 |
void bigWigFileCreate( |
31 | 43 |
char *inName, /* Input file in ascii wiggle format. */ |
32 | 44 |
char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */ |
... | ... |
@@ -6,7 +6,6 @@ |
6 | 6 |
#include "common.h" |
7 | 7 |
#include "bits.h" |
8 | 8 |
|
9 |
-static char const rcsid[] = "$Id: bits.c,v 1.20 2008/03/25 16:32:31 angie Exp $"; |
|
10 | 9 |
|
11 | 10 |
|
12 | 11 |
static Bits oneBit[8] = { 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; |
... | ... |
@@ -78,6 +77,33 @@ void bitFree(Bits **pB) |
78 | 77 |
freez(pB); |
79 | 78 |
} |
80 | 79 |
|
80 |
+Bits *lmBitAlloc(struct lm *lm,int bitCount) |
|
81 |
+// Allocate bits. Must supply local memory. |
|
82 |
+{ |
|
83 |
+assert(lm != NULL); |
|
84 |
+int byteCount = ((bitCount+7)>>3); |
|
85 |
+return lmAlloc(lm,byteCount); |
|
86 |
+} |
|
87 |
+ |
|
88 |
+Bits *lmBitRealloc(struct lm *lm,Bits *b, int bitCount, int newBitCount) |
|
89 |
+// Resize a bit array. If b is null, allocate a new array. Must supply local memory. |
|
90 |
+{ |
|
91 |
+assert(lm != NULL); |
|
92 |
+int byteCount = ((bitCount+7)>>3); |
|
93 |
+int newByteCount = ((newBitCount+7)>>3); |
|
94 |
+return lmAllocMoreMem(lm, b ,byteCount, newByteCount); |
|
95 |
+} |
|
96 |
+ |
|
97 |
+Bits *lmBitClone(struct lm *lm,Bits* orig, int bitCount) |
|
98 |
+// Clone bits. Must supply local memory. |
|
99 |
+{ |
|
100 |
+assert(lm != NULL); |
|
101 |
+int byteCount = ((bitCount+7)>>3); |
|
102 |
+Bits* bits = lmAlloc(lm,byteCount); |
|
103 |
+memcpy(bits, orig, byteCount); |
|
104 |
+return bits; |
|
105 |
+} |
|
106 |
+ |
|
81 | 107 |
void bitSetOne(Bits *b, int bitIx) |
82 | 108 |
/* Set a single bit. */ |
83 | 109 |
{ |
... | ... |
@@ -232,6 +258,19 @@ while (--byteCount >= 0) |
232 | 258 |
} |
233 | 259 |
} |
234 | 260 |
|
261 |
+int bitAndCount(Bits *a, Bits *b, int bitCount) |
|
262 |
+// Without altering 2 bitmaps, count the AND bits. |
|
263 |
+{ |
|
264 |
+int byteCount = ((bitCount+7)>>3); |
|
265 |
+int count = 0; |
|
266 |
+if (!inittedBitsInByte) |
|
267 |
+ bitsInByteInit(); |
|
268 |
+while (--byteCount >= 0) |
|
269 |
+ count += bitsInByte[(*a++ & *b++)]; |
|
270 |
+ |
|
271 |
+return count; |
|
272 |
+} |
|
273 |
+ |
|
235 | 274 |
void bitOr(Bits *a, Bits *b, int bitCount) |
236 | 275 |
/* Or two bitmaps. Put result in a. */ |
237 | 276 |
{ |
... | ... |
@@ -243,6 +282,19 @@ while (--byteCount >= 0) |
243 | 282 |
} |
244 | 283 |
} |
245 | 284 |
|
285 |
+int bitOrCount(Bits *a, Bits *b, int bitCount) |
|
286 |
+// Without altering 2 bitmaps, count the OR'd bits. |
|
287 |
+{ |
|
288 |
+int byteCount = ((bitCount+7)>>3); |
|
289 |
+int count = 0; |
|
290 |
+if (!inittedBitsInByte) |
|
291 |
+ bitsInByteInit(); |
|
292 |
+while (--byteCount >= 0) |
|
293 |
+ count += bitsInByte[(*a++ | *b++)]; |
|
294 |
+ |
|
295 |
+return count; |
|
296 |
+} |
|
297 |
+ |
|
246 | 298 |
void bitXor(Bits *a, Bits *b, int bitCount) |
247 | 299 |
{ |
248 | 300 |
int byteCount = ((bitCount+7)>>3); |
... | ... |
@@ -253,6 +305,19 @@ while (--byteCount >= 0) |
253 | 305 |
} |
254 | 306 |
} |
255 | 307 |
|
308 |
+int bitXorCount(Bits *a, Bits *b, int bitCount) |
|
309 |
+// Without altering 2 bitmaps, count the XOR'd bits. |
|
310 |
+{ |
|
311 |
+int byteCount = ((bitCount+7)>>3); |
|
312 |
+int count = 0; |
|
313 |
+if (!inittedBitsInByte) |
|
314 |
+ bitsInByteInit(); |
|
315 |
+while (--byteCount >= 0) |
|
316 |
+ count += bitsInByte[(*a++ ^ *b++)]; |
|
317 |
+ |
|
318 |
+return count; |
|
319 |
+} |
|
320 |
+ |
|
256 | 321 |
void bitNot(Bits *a, int bitCount) |
257 | 322 |
/* Flip all bits in a. */ |
258 | 323 |
{ |
... | ... |
@@ -264,6 +329,31 @@ while (--byteCount >= 0) |
264 | 329 |
} |
265 | 330 |
} |
266 | 331 |
|
332 |
+void bitReverseRange(Bits *bits, int startIx, int bitCount) |
|
333 |
+// Reverses bits in range (e.g. 110010 becomes 010011) |
|
334 |
+{ |
|
335 |
+if (bitCount <= 0) |
|
336 |
+ return; |
|
337 |
+int ixA = startIx; |
|
338 |
+int ixB = (startIx + bitCount - 1); |
|