src/bigBedHelper.c
 #include "bigBedHelper.h"
 
 /*
   Most of the functions in this file are taken from ucscGenomeBrowser/kent/src/utils/bedToBigBed.c
 */
 
 int getDefinedFieldCount(struct asObject *as) {
   int definedFieldCount = 0;
   struct asColumn *asCol = as->columnList;
   char *asText = bedAsDef(12, 12);
   struct asObject *bedAs = asParseText(asText);
   freeMem(asText);
   struct asColumn *bedCol = bedAs->columnList;
   while (asCol && bedCol) {
    if (sameString(asCol->name, bedCol->name))
       ++definedFieldCount;
     bedCol = bedCol->next;
     asCol = asCol->next;
   }
   asObjectFree(&bedAs);
   return definedFieldCount;
 }
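
/* A minimal usage sketch, not built by default (the BIGBEDHELPER_EXAMPLES
   guard is hypothetical), assuming bigBedHelper.h already pulls in the kent
   headers this file relies on: for a plain BED6 autoSql every column name
   matches the standard BED definition, so the count comes back as 6. */
#ifdef BIGBEDHELPER_EXAMPLES
static void exampleDefinedFieldCount(void) {
  char *asText = bedAsDef(6, 6);              /* autoSql text for standard BED6 */
  struct asObject *as = asParseText(asText);
  freeMem(asText);
  printf("definedFieldCount = %d\n", getDefinedFieldCount(as));  /* expect 6 */
  asObjectFree(&as);
}
#endif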
 
 bool isPresent(int definedFieldCount, int index) {
   return (definedFieldCount - index >= 0) ? TRUE : FALSE;
 }
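
/* Reading index as a 1-based BED field position (chrom = 1 through
   strand = 6 for BED6), isPresent(6, 5) is TRUE (score is present) while
   isPresent(6, 7) is FALSE (no thickStart in a BED6 file). */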
 
 bool isSelected(SEXP r_selectedindex, int position) {
   if (length(r_selectedindex) == 0)
     return TRUE;
   for (int i = 0; i < length(r_selectedindex); ++i) {
    if (INTEGER(r_selectedindex)[i] == position)
       return TRUE;
   }
   return FALSE;
 }
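
/* An empty r_selectedindex is treated as "select everything"; otherwise only
   the listed positions match, e.g. with c(2L, 4L) passed down from R,
   isSelected() returns TRUE only for positions 2 and 4. */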
 
/* The following functions are taken from the kent library. */
 
 struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
 /* Read lines from bed file as long as they match chrom.  Return a rangeTree that
  * corresponds to the coverage. */
 {
 struct rbTree *tree = rangeTreeNew();
 char *line;
 while (lineFileNextReal(lf, &line))
     {
     if (!startsWithWord(chrom, line))
         {
 	lineFileReuse(lf);
 	break;
 	}
     char *row[3];
     chopLine(line, row);
     unsigned start = sqlUnsigned(row[1]);
     unsigned end = sqlUnsigned(row[2]);
     rangeTreeAddToCoverageDepth(tree, start, end);
     }
 return tree;
 }
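
/* Worked example: two beds on the same chrom covering [0,10) and [5,15) yield
 * ranges [0,5) at depth 1, [5,10) at depth 2, and [10,15) at depth 1, so
 * overlaps collapse into coverage-depth ranges rather than staying as items. */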
 
 void bbExIndexMakerAddKeysFromRow(struct bbExIndexMaker *eim, char **row, int recordIx)
 /* Save the keys that are being indexed by row in eim. */
 {
 int i;
 for (i=0; i < eim->indexCount; ++i)
     {
     int rowIx = eim->indexFields[i];
     eim->chunkArrayArray[i][recordIx].name = cloneString(row[rowIx]);
     }
 }
 
 void bbExIndexMakerAddOffsetSize(struct bbExIndexMaker *eim, bits64 offset, bits64 size,
     long startIx, long endIx)
 /* Update offset and size fields of all file chunks between startIx and endIx */
 {
 int i;
 for (i=0; i < eim->indexCount; ++i)
     {
     struct bbNamedFileChunk *chunks = eim->chunkArrayArray[i];
     long j;
     for (j = startIx; j < endIx; ++j)
         {
 	struct bbNamedFileChunk *chunk = chunks + j;
 	chunk->offset = offset;
 	chunk->size = size;
 	}
     }
 }
 
 /* Allocate the big part of the extra index maker - the part that holds which
  * chunk is used for each record. */
 void bbExIndexMakerAllocChunkArrays(struct bbExIndexMaker *eim, int recordCount) {
   eim->recordCount = recordCount;
   int i;
   for (i=0; i < eim->indexCount; ++i)
     AllocArray(eim->chunkArrayArray[i], recordCount);
 }
 
 /* Return an index maker corresponding to extraIndexList. Checks that all fields
  * mentioned are in autoSql definition, and for now that they are all text fields. */
 struct bbExIndexMaker *bbExIndexMakerNew(struct slName *extraIndexList, struct asObject *as) {
   /* Fill in scalar fields and return quickly if no extra indexes. */
   struct bbExIndexMaker *eim;
   AllocVar(eim);
   eim->indexCount = slCount(extraIndexList);
   if (eim->indexCount == 0)
     return eim;	// Not much to do in this case
 
  /* Allocate arrays according to the number of extra indexes. */
   AllocArray(eim->indexFields, eim->indexCount);
   AllocArray(eim->maxFieldSize, eim->indexCount);
   AllocArray(eim->chunkArrayArray, eim->indexCount);
   AllocArray(eim->fileOffsets, eim->indexCount);
 
  /* Loop through each field checking that it is indeed something we can index
   * and if so saving information about it. */
   int indexIx = 0;
   struct slName *name;
   for (name = extraIndexList; name != NULL; name = name->next) {
       struct asColumn *col = asColumnFind(as, name->name);
       if (col == NULL)
           errAbort("extraIndex field %s not a standard bed field or found in autoSql string.",
         name->name);
       if (!sameString(col->lowType->name, "string"))
           errAbort("Sorry for now can only index string fields.");
       eim->indexFields[indexIx] = slIxFromElement(as->columnList, col);
       ++indexIx;
   }
   return eim;
 }
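
/* A hypothetical sketch, under the same BIGBEDHELPER_EXAMPLES guard, of
   building an index maker for a single "name" extra index (the common case
   for bigBed name lookups); it assumes the autoSql in as declares name as a
   string column, since bbExIndexMakerNew() errAborts otherwise. */
#ifdef BIGBEDHELPER_EXAMPLES
static struct bbExIndexMaker *exampleNameIndexMaker(struct asObject *as) {
  struct slName *extraIndexList = slNameNew("name");
  struct bbExIndexMaker *eim = bbExIndexMakerNew(extraIndexList, as);
  slFreeList(&extraIndexList);
  return eim;
}
#endif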
 
/* Compare two named offset objects to facilitate qsorting by name. */
 int bbNamedFileChunkCmpByName(const void *va, const void *vb) {
   const struct bbNamedFileChunk *a = va, *b = vb;
   return strcmp(a->name, b->name);
 }
 
 /* Return pointer to val for bPlusTree maker. */
 void *bbNamedFileChunkVal(const void *va) {
   const struct bbNamedFileChunk *item = va;
   return (void *)&item->offset;
 }
 
 /* Copy name to keyBuf for bPlusTree maker */
 void bbNamedFileChunkKey(const void *va, char *keyBuf) {
   const struct bbNamedFileChunk *item = va;
   strncpy(keyBuf,item->name, maxBedNameSize);
 }
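
/* Together, the three helpers above form the callback trio used to build each
 * extra index: chunks are sorted with bbNamedFileChunkCmpByName (via qsort),
 * bbNamedFileChunkKey emits fixed-width keys of maxBedNameSize bytes (strncpy
 * zero-pads shorter names), and bbNamedFileChunkVal exposes the offset/size
 * pair, which sit consecutively in struct bbNamedFileChunk. */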
 
 struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
 	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
 	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
 	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
 	struct bbiSummaryElement *totalSum)
 /* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
  * next reduction level.  This is more work than some ways, but it keeps us from having to
  * keep the first reduction entirely in memory. */
 {
 struct bbiSummary *twiceReducedList = NULL;
 bits32 doubleReductionSize = initialReduction * zoomIncrement;
 struct bbiChromUsage *usage = usageList;
 struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
 boundsPt = AllocArray(boundsArray, initialReductionCount);
 boundsEnd = boundsPt + initialReductionCount;
 
 *retDataStart = ftell(f);
 writeOne(f, initialReductionCount);
/* This gets a little complicated, I'm afraid.  The strategy is to:
  *   1) Build up a range tree that represents coverage depth on that chromosome
  *      This also has the nice side effect of getting rid of overlaps.
  *   2) Stream through the range tree, outputting the initial summary level and
  *      further reducing. 
  */
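/* The five accumulated fields are enough for a reader to recover the basic
 * statistics later: mean = sumData/validCount and
 * variance = sumSquares/validCount - mean*mean, alongside the stored
 * minVal/maxVal. */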
 boolean firstTime = TRUE;
 struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
 for (usage = usageList; usage != NULL; usage = usage->next)
     {
     struct bbiSummary oneSummary, *sum = NULL;
     struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
     struct range *range, *rangeList = rangeTreeList(rangeTree);
     for (range = rangeList; range != NULL; range = range->next)
         {
 	/* Grab values we want from range. */
 	double val = ptToInt(range->val);
 	int start = range->start;
 	int end = range->end;
 	bits32 size = end - start;
 
 	/* Add to total summary. */
 	if (firstTime)
 	    {
 	    totalSum->validCount = size;
 	    totalSum->minVal = totalSum->maxVal = val;
 	    totalSum->sumData = val*size;
 	    totalSum->sumSquares = val*val*size;
 	    firstTime = FALSE;
 	    }
 	else
 	    {
 	    totalSum->validCount += size;
 	    if (val < totalSum->minVal) totalSum->minVal = val;
 	    if (val > totalSum->maxVal) totalSum->maxVal = val;
 	    totalSum->sumData += val*size;
 	    totalSum->sumSquares += val*val*size;
 	    }
 
	/* If start is past the existing block then output it. */
 	if (sum != NULL && sum->end <= start && sum->end < usage->size)
 	    {
 	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
 		&boundsPt, boundsEnd, lm, stream);
 	    sum = NULL;
 	    }
	/* If we don't have a summary we're working on now, make one. */
 	if (sum == NULL)
 	    {
 	    oneSummary.chromId = usage->id;
 	    oneSummary.start = start;
 	    oneSummary.end = start + initialReduction;
 	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
 	    oneSummary.minVal = oneSummary.maxVal = val;
 	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
 	    oneSummary.validCount = 0;
 	    sum = &oneSummary;
 	    }
	/* Deal with the case where we might have to split an item across multiple
	 * summaries.  This loop handles all but the final affected summary in that case. */
 	while (end > sum->end)
 	    {
 	    /* Fold in bits that overlap with existing summary and output. */
 	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
 	    assert(overlap > 0);
 	    sum->validCount += overlap;
 	    if (sum->minVal > val) sum->minVal = val;
 	    if (sum->maxVal < val) sum->maxVal = val;
 	    sum->sumData += val * overlap;
 	    sum->sumSquares += val*val * overlap;
 	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
 		    &boundsPt, boundsEnd, lm, stream);
 	    size -= overlap;
 
 	    /* Move summary to next part. */
 	    sum->start = start = sum->end;
 	    sum->end = start + initialReduction;
 	    if (sum->end > usage->size) sum->end = usage->size;
 	    sum->minVal = sum->maxVal = val;
 	    sum->sumData = sum->sumSquares = 0.0;
 	    sum->validCount = 0;
 	    }
 
 	/* Add to summary. */
 	sum->validCount += size;
 	if (sum->minVal > val) sum->minVal = val;
 	if (sum->maxVal < val) sum->maxVal = val;
 	sum->sumData += val * size;
 	sum->sumSquares += val*val * size;
 	}
     if (sum != NULL)
 	{
 	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
 	    &boundsPt, boundsEnd, lm, stream);
 	}
     rangeTreeFree(&rangeTree);
     }
 bbiSumOutStreamClose(&stream);
 
 /* Write out 1st zoom index. */
long long indexOffset = *retIndexStart = ftell(f);
 assert(boundsPt == boundsEnd);
 cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
     blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
     indexOffset, f);
 
 freez(&boundsArray);
 slReverse(&twiceReducedList);
 return twiceReducedList;
 }
 
 void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
 	int itemsPerSlot, struct bbiBoundsArray *bounds,
 	int sectionCount, boolean doCompress, FILE *f,
 	int resTryCount, int resScales[], int resSizes[],
 	struct bbExIndexMaker *eim,  int bedCount,
 	bits16 fieldCount, int bedN, bits32 *retMaxBlockSize)
 /* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
  * to boundsArray */
 {
 int maxBlockSize = 0;
 struct bbiChromUsage *usage = usageList;
 char *line, *row[fieldCount+1];
 int lastField = fieldCount-1;
 int itemIx = 0, sectionIx = 0;
 bits64 blockStartOffset = 0;
 int startPos = 0, endPos = 0;
 bits32 chromId = 0;
 struct dyString *stream = dyStringNew(0);
 
 /* Will keep track of some things that help us determine how much to reduce. */
 bits32 resEnds[resTryCount];
 int resTry;
 for (resTry = 0; resTry < resTryCount; ++resTry)
     resEnds[resTry] = 0;
 boolean atEnd = FALSE, sameChrom = FALSE;
 bits32 start = 0, end = 0;
 char *chrom = NULL;
 struct bed *bed;
 AllocVar(bed);
 
 /* Help keep track of which beds are in current chunk so as to write out
  * namedChunks to eim if need be. */
 long sectionStartIx = 0, sectionEndIx = 0;
 
 for (;;)
     {
     /* Get next line of input if any. */
     if (lineFileNextReal(lf, &line))
 	{
 	/* Chop up line and make sure the word count is right. */
 	int wordCount;
 	wordCount = chopLine(line, row);
 	lineFileExpectWords(lf, fieldCount, wordCount);
 	loadAndValidateBedExt(row, bedN, fieldCount, lf, bed, as, FALSE, TRUE);
 	chrom = bed->chrom;
 	start = bed->chromStart;
 	end = bed->chromEnd;
 
 	sameChrom = sameString(chrom, usage->name);
 	}
     else  /* No next line */
 	{
 	atEnd = TRUE;
 	}
 
     /* Check conditions that would end block and save block info and advance to next if need be. */
     if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
         {
 	/* Save stream to file, compressing if need be. */
 	if (stream->stringSize > maxBlockSize)
 	    maxBlockSize = stream->stringSize;
 	if (doCompress)
             {
 	    size_t maxCompSize = zCompBufSize(stream->stringSize);
 
             // keep around an area of scratch memory
             static int compBufSize = 0;
             static char *compBuf = NULL;
             // check to see if buffer needed for compression is big enough
             if (compBufSize < maxCompSize)
                 {
                 // free up the old not-big-enough piece
                freez(&compBuf); // freez knows about NULL
 
                 // get new scratch area
                 compBufSize = maxCompSize;
                 compBuf = needLargeMem(compBufSize);
                 }
 	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
 	    mustWrite(f, compBuf, compSize);
 	    }
 	else
 	    mustWrite(f, stream->string, stream->stringSize);
 	dyStringClear(stream);
 
 	/* Save block offset and size for all named chunks in this section. */
 	if (eim != NULL)
 	    {
 	    bits64 blockEndOffset = ftell(f);
 	    bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
 		sectionStartIx, sectionEndIx);
 	    sectionStartIx = sectionEndIx;
 	    }
 
 	/* Save info on existing block. */
 	struct bbiBoundsArray *b = &bounds[sectionIx];
 	b->offset = blockStartOffset;
 	b->range.chromIx = chromId;
 	b->range.start = startPos;
 	b->range.end = endPos;
 	++sectionIx;
 	itemIx = 0;
 
 	if (atEnd)
 	    break;
 	}
 
     /* Advance to next chromosome if need be and get chromosome id. */
     if (!sameChrom)
         {
 	usage = usage->next;
 	assert(usage != NULL);
 	assert(sameString(chrom, usage->name));
 	for (resTry = 0; resTry < resTryCount; ++resTry)
 	    resEnds[resTry] = 0;
 	}
     chromId = usage->id;
 
     /* At start of block we save a lot of info. */
     if (itemIx == 0)
         {
 	blockStartOffset = ftell(f);
 	startPos = start;
 	endPos = end;
 	}
    /* Otherwise just update end. */
    else
        {
 	if (endPos < end)
 	    endPos = end;
 	/* No need to update startPos since list is sorted. */
 	}
 
     /* Save name into namedOffset if need be. */
     if (eim != NULL)
 	{
 	bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx);
 	sectionEndIx += 1;
 	}
 
     /* Write out data. */
     dyStringWriteOne(stream, chromId);
     dyStringWriteOne(stream, start);
     dyStringWriteOne(stream, end);
     if (fieldCount > 3)
         {
 	int i;
 	/* Write 3rd through next to last field and a tab separator. */
 	for (i=3; i<lastField; ++i)
 	    {
 	    char *s = row[i];
 	    dyStringAppend(stream, s);
 	    dyStringAppendC(stream, '\t');
 	    }
 	/* Write last field and terminal zero */
 	char *s = row[lastField];
 	dyStringAppend(stream, s);
 	}
     dyStringAppendC(stream, 0);
 
     itemIx += 1;
 
     /* Do zoom counting. */
     for (resTry = 0; resTry < resTryCount; ++resTry)
         {
 	bits32 resEnd = resEnds[resTry];
 	if (start >= resEnd)
 	    {
 	    resSizes[resTry] += 1;
 	    resEnds[resTry] = resEnd = start + resScales[resTry];
 	    }
 	while (end > resEnd)
 	    {
 	    resSizes[resTry] += 1;
 	    resEnds[resTry] = resEnd = resEnd + resScales[resTry];
 	    }
 	}
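    /* Worked example of the counting above: with resScales[resTry] == 100 and
     * a fresh chromosome (resEnd == 0), an item spanning [250,460) opens the
     * window [250,350) in the first branch and the while loop adds [350,450)
     * and [450,550), so resSizes[resTry] grows by 3, one count per 100-base
     * window the item touches. */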
     }
 assert(sectionIx == sectionCount);
 freez(&bed);
 *retMaxBlockSize = maxBlockSize;
 }
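
/* Taken together, these routines reproduce the core of bedToBigBed:
 * writeBlocks() streams sorted bed lines into (optionally compressed) data
 * blocks while collecting r-tree bounds and any extra-index keys, and
 * bedWriteReducedOnceReturnReducedTwice() writes the first zoom level to disk
 * while accumulating the next reduction level in memory. */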