/* Count how many of the standard bed fields are defined in the autoSql object. */
int getDefinedFieldCount(struct asObject *as) {
  int definedFieldCount = 0;
  struct asColumn *asCol = as->columnList;
  char *asText = bedAsDef(12, 12);
  struct asObject *bedAs = asParseText(asText);
  freeMem(asText);
  struct asColumn *bedCol = bedAs->columnList;
  while (bedCol && asCol) {
    if (sameString(bedCol->name, asCol->name))
      ++definedFieldCount;
bedCol = bedCol->next;
asCol = asCol->next;
}
asObjectFree(&bedAs);
return definedFieldCount;
}
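/* Return TRUE if the field at 'index' falls within the defined bed fields. */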
bool isPresent(int definedFieldCount, int index) {
return (definedFieldCount - index >= 0) ? TRUE : FALSE;
}
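/* Return TRUE if 'position' is among the selected indices; an empty selection
 * selects everything. */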
bool isSelected(SEXP r_selectedindex, int position) {
if (length(r_selectedindex) == 0)
return TRUE;
for (int i = 0; i < length(r_selectedindex); ++i) {
if (INTEGER(r_selectedindex)[i] == position)
return TRUE;
}
return FALSE;
}
/* The following functions are taken from the Kent library. */
struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
/* Read lines from bed file as long as they match chrom. Return a rangeTree that
* corresponds to the coverage. */
{
struct rbTree *tree = rangeTreeNew();
char *line;
while (lineFileNextReal(lf, &line))
{
if (!startsWithWord(chrom, line))
{
lineFileReuse(lf);
break;
}
char *row[3];
chopLine(line, row);
unsigned start = sqlUnsigned(row[1]);
unsigned end = sqlUnsigned(row[2]);
rangeTreeAddToCoverageDepth(tree, start, end);
}
return tree;
}
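/* Typical use of rangeTreeForBedChrom() (see bedWriteReducedOnceReturnReducedTwice
 * below): with the bed file sorted by chromosome, build one coverage tree per
 * chromosome while walking the chromosome usage list:
 *
 *   struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
 *   struct range *range, *rangeList = rangeTreeList(rangeTree);
 *   ... iterate over rangeList ...
 *   rangeTreeFree(&rangeTree);
 */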
void bbExIndexMakerAddKeysFromRow(struct bbExIndexMaker *eim, char **row, int recordIx)
/* Save the keys that are being indexed by row in eim. */
{
int i;
for (i=0; i < eim->indexCount; ++i)
{
int rowIx = eim->indexFields[i];
eim->chunkArrayArray[i][recordIx].name = cloneString(row[rowIx]);
}
}
void bbExIndexMakerAddOffsetSize(struct bbExIndexMaker *eim, bits64 offset, bits64 size,
long startIx, long endIx)
/* Update offset and size fields of all file chunks from startIx (inclusive) to endIx (exclusive) */
{
int i;
for (i=0; i < eim->indexCount; ++i)
{
struct bbNamedFileChunk *chunks = eim->chunkArrayArray[i];
long j;
for (j = startIx; j < endIx; ++j)
{
struct bbNamedFileChunk *chunk = chunks + j;
chunk->offset = offset;
chunk->size = size;
}
}
}
/* Allocate the big part of the extra index maker - the part that holds which
* chunk is used for each record. */
void bbExIndexMakerAllocChunkArrays(struct bbExIndexMaker *eim, int recordCount) {
eim->recordCount = recordCount;
int i;
for (i=0; i < eim->indexCount; ++i)
AllocArray(eim->chunkArrayArray[i], recordCount);
}
/* Return an index maker corresponding to extraIndexList. Checks that all fields
* mentioned are in autoSql definition, and for now that they are all text fields. */
struct bbExIndexMaker *bbExIndexMakerNew(struct slName *extraIndexList, struct asObject *as) {
/* Fill in scalar fields and return quickly if no extra indexes. */
struct bbExIndexMaker *eim;
AllocVar(eim);
eim->indexCount = slCount(extraIndexList);
if (eim->indexCount == 0)
return eim; // Not much to do in this case
/* Allocate arrays according to field count. */
AllocArray(eim->indexFields, eim->indexCount);
AllocArray(eim->maxFieldSize, eim->indexCount);
AllocArray(eim->chunkArrayArray, eim->indexCount);
AllocArray(eim->fileOffsets, eim->indexCount);
/* Loop through each field, checking that it is indeed something we can index,
 * and if so saving information about it. */
int indexIx = 0;
struct slName *name;
for (name = extraIndexList; name != NULL; name = name->next) {
struct asColumn *col = asColumnFind(as, name->name);
if (col == NULL)
errAbort("extraIndex field %s not a standard bed field or found in autoSql string.",
name->name);
if (!sameString(col->lowType->name, "string"))
errAbort("Sorry for now can only index string fields.");
eim->indexFields[indexIx] = slIxFromElement(as->columnList, col);
++indexIx;
}
return eim;
}
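/* A rough sketch of how the index maker is wired up by a bigBed writer (names
 * follow the Kent library; the exact call sites may differ):
 *
 *   struct bbExIndexMaker *eim = bbExIndexMakerNew(extraIndexList, as);
 *   if (eim->indexCount > 0)
 *       bbExIndexMakerAllocChunkArrays(eim, bedCount);
 *   // writeBlocks() then records keys and block extents through
 *   // bbExIndexMakerAddKeysFromRow() and bbExIndexMakerAddOffsetSize().
 */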
/* Compare two named offset objects to facilitate qsorting by name. */
int bbNamedFileChunkCmpByName(const void *va, const void *vb) {
const struct bbNamedFileChunk *a = va, *b = vb;
return strcmp(a->name, b->name);
}
/* Return pointer to val for bPlusTree maker. */
void *bbNamedFileChunkVal(const void *va) {
const struct bbNamedFileChunk *item = va;
return (void *)&item->offset;
}
/* Copy name to keyBuf for bPlusTree maker */
void bbNamedFileChunkKey(const void *va, char *keyBuf) {
const struct bbNamedFileChunk *item = va;
strncpy(keyBuf, item->name, maxBedNameSize);
}
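/* The comparator and the two fetchers above plug into the Kent bPlusTree
 * builder. A sketch of the expected call pattern, assuming the
 * bptFileBulkIndexToOpenFile() signature from the Kent library:
 *
 *   struct bbNamedFileChunk *chunks = eim->chunkArrayArray[i];
 *   qsort(chunks, eim->recordCount, sizeof(chunks[0]), bbNamedFileChunkCmpByName);
 *   bptFileBulkIndexToOpenFile(chunks, sizeof(chunks[0]), eim->recordCount,
 *       blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal,
 *       sizeof(bits64) + sizeof(bits64), f);
 */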
struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
struct bbiSummaryElement *totalSum)
/* Write out data reduced by a factor of initialReduction. Also calculate and keep in memory
 * the next reduction level. This is more work than some alternatives, but it saves us from
 * having to hold the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;
*retDataStart = ftell(f);
writeOne(f, initialReductionCount);
/* This gets a little complicated I'm afraid. The strategy is to:
* 1) Build up a range tree that represents coverage depth on that chromosome
* This also has the nice side effect of getting rid of overlaps.
* 2) Stream through the range tree, outputting the initial summary level and
* further reducing.
*/
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
{
struct bbiSummary oneSummary, *sum = NULL;
struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
struct range *range, *rangeList = rangeTreeList(rangeTree);
for (range = rangeList; range != NULL; range = range->next)
{
/* Grab values we want from range. */
double val = ptToInt(range->val);
int start = range->start;
int end = range->end;
bits32 size = end - start;
/* Add to total summary. */
if (firstTime)
{
totalSum->validCount = size;
totalSum->minVal = totalSum->maxVal = val;
totalSum->sumData = val*size;
totalSum->sumSquares = val*val*size;
firstTime = FALSE;
}
else
{
totalSum->validCount += size;
if (val < totalSum->minVal) totalSum->minVal = val;
if (val > totalSum->maxVal) totalSum->maxVal = val;
totalSum->sumData += val*size;
totalSum->sumSquares += val*val*size;
}
/* If start past existing block then output it. */
if (sum != NULL && sum->end <= start && sum->end < usage->size)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, lm, stream);
sum = NULL;
}
/* If we don't have a summary we're working on now, make one. */
if (sum == NULL)
{
oneSummary.chromId = usage->id;
oneSummary.start = start;
oneSummary.end = start + initialReduction;
if (oneSummary.end > usage->size) oneSummary.end = usage->size;
oneSummary.minVal = oneSummary.maxVal = val;
oneSummary.sumData = oneSummary.sumSquares = 0.0;
oneSummary.validCount = 0;
sum = &oneSummary;
}
/* Deal with case where might have to split an item between multiple summaries. This
* loop handles all but the final affected summary in that case. */
while (end > sum->end)
{
/* Fold in bits that overlap with existing summary and output. */
int overlap = rangeIntersection(start, end, sum->start, sum->end);
assert(overlap > 0);
sum->validCount += overlap;
if (sum->minVal > val) sum->minVal = val;
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * overlap;
sum->sumSquares += val*val * overlap;
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, lm, stream);
size -= overlap;
/* Move summary to next part. */
sum->start = start = sum->end;
sum->end = start + initialReduction;
if (sum->end > usage->size) sum->end = usage->size;
sum->minVal = sum->maxVal = val;
sum->sumData = sum->sumSquares = 0.0;
sum->validCount = 0;
}
/* Add to summary. */
sum->validCount += size;
if (sum->minVal > val) sum->minVal = val;
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * size;
sum->sumSquares += val*val * size;
}
if (sum != NULL)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, lm, stream);
}
rangeTreeFree(&rangeTree);
}
bbiSumOutStreamClose(&stream);
/* Write out 1st zoom index. */
bits64 indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
indexOffset, f);
freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
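/* The twice-reduced summary list returned above is typically reduced further
 * (e.g. with bbiSummarySimpleReduce from the Kent library) to build the
 * remaining zoom levels before they are written out. */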
void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
int itemsPerSlot, struct bbiBoundsArray *bounds,
int sectionCount, boolean doCompress, FILE *f,
int resTryCount, int resScales[], int resSizes[],
struct bbExIndexMaker *eim, int bedCount,
bits16 fieldCount, int bedN, bits32 *retMaxBlockSize)
/* Read through lf, writing it to f. Save starting points of blocks (every itemsPerSlot items)
 * to bounds. */
{
int maxBlockSize = 0;
struct bbiChromUsage *usage = usageList;
char *line, *row[fieldCount+1];
int lastField = fieldCount-1;
int itemIx = 0, sectionIx = 0;
bits64 blockStartOffset = 0;
int startPos = 0, endPos = 0;
bits32 chromId = 0;
struct dyString *stream = dyStringNew(0);
/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
struct bed *bed;
AllocVar(bed);
/* Keep track of which beds are in the current chunk so as to write out
 * named chunks to eim if need be. */
long sectionStartIx = 0, sectionEndIx = 0;
for (;;)
{
/* Get next line of input if any. */
if (lineFileNextReal(lf, &line))
{
/* Chop up line and make sure the word count is right. */
int wordCount;
wordCount = chopLine(line, row);
lineFileExpectWords(lf, fieldCount, wordCount);
loadAndValidateBedExt(row, bedN, fieldCount, lf, bed, as, FALSE, TRUE);
chrom = bed->chrom;
start = bed->chromStart;
end = bed->chromEnd;
sameChrom = sameString(chrom, usage->name);
}
else /* No next line */
{
atEnd = TRUE;
}
/* Check conditions that would end block and save block info and advance to next if need be. */
if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
{
/* Save stream to file, compressing if need be. */
if (stream->stringSize > maxBlockSize)
maxBlockSize = stream->stringSize;
if (doCompress)
{
size_t maxCompSize = zCompBufSize(stream->stringSize);
// keep around an area of scratch memory
static int compBufSize = 0;
static char *compBuf = NULL;
// check to see if buffer needed for compression is big enough
if (compBufSize < maxCompSize)
{
// free up the old not-big-enough piece
freez(&compBuf); // freez knows bout NULL
// get new scratch area
compBufSize = maxCompSize;
compBuf = needLargeMem(compBufSize);
}
int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
mustWrite(f, compBuf, compSize);
}
else
mustWrite(f, stream->string, stream->stringSize);
dyStringClear(stream);
/* Save block offset and size for all named chunks in this section. */
if (eim != NULL)
{
bits64 blockEndOffset = ftell(f);
bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
sectionStartIx, sectionEndIx);
sectionStartIx = sectionEndIx;
}
/* Save info on existing block. */
struct bbiBoundsArray *b = &bounds[sectionIx];
b->offset = blockStartOffset;
b->range.chromIx = chromId;
b->range.start = startPos;
b->range.end = endPos;
++sectionIx;
itemIx = 0;
if (atEnd)
break;
}
/* Advance to next chromosome if need be and get chromosome id. */
if (!sameChrom)
{
usage = usage->next;
assert(usage != NULL);
assert(sameString(chrom, usage->name));
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
}
chromId = usage->id;
/* At start of block we save a lot of info. */
if (itemIx == 0)
{
blockStartOffset = ftell(f);
startPos = start;
endPos = end;
}
/* Otherwise just update end. */
else
{
if (endPos < end)
endPos = end;
/* No need to update startPos since list is sorted. */
}
/* Save keys into the extra index maker if need be. */
if (eim != NULL)
{
bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx);
sectionEndIx += 1;
}
/* Write out data. */
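/* Each record is the fixed binary triple (chromId, start, end) followed by any
 * fields beyond the first three as tab-separated text and a terminating zero
 * byte, which is the bigBed on-disk record layout. */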
dyStringWriteOne(stream, chromId);
dyStringWriteOne(stream, start);
dyStringWriteOne(stream, end);
if (fieldCount > 3)
{
int i;
/* Write fields 3 through next-to-last (0-based), each followed by a tab separator. */
for (i=3; i<lastField; ++i)
{
char *s = row[i];
dyStringAppend(stream, s);
dyStringAppendC(stream, '\t');
}
/* Write the last field; the terminating zero byte is appended below. */
char *s = row[lastField];
dyStringAppend(stream, s);
}
dyStringAppendC(stream, 0);
itemIx += 1;
/* Do zoom counting. */
for (resTry = 0; resTry < resTryCount; ++resTry)
{
bits32 resEnd = resEnds[resTry];
if (start >= resEnd)
{
resSizes[resTry] += 1;
resEnds[resTry] = resEnd = start + resScales[resTry];
}
while (end > resEnd)
{
resSizes[resTry] += 1;
resEnds[resTry] = resEnd = resEnd + resScales[resTry];
}
}
}
assert(sectionIx == sectionCount);
freez(&bed);
*retMaxBlockSize = maxBlockSize;
}
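/* The bounds array filled in by writeBlocks() is typically passed afterwards to
 * cirTreeFileBulkIndexToOpenFile() (as is done for the zoom index in
 * bedWriteReducedOnceReturnReducedTwice above) to build the main R-tree index
 * over the data blocks. */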