Browse code

@sanchit-saini pretty sure this should be strlen() not sizeof()

Michael Lawrence authored on 07/09/2020 15:29:47
Showing 1 changed file
... ...
@@ -12,7 +12,7 @@ int getDefinedFieldCount(struct asObject *as) {
12 12
   freeMem(asText);
13 13
   struct asColumn *bedCol = bedAs->columnList;
14 14
   while (asCol && bedCol) {
15
-    if (strncmp(asCol->name, bedCol->name, sizeof(asCol->name)) == 0)
15
+    if (strncmp(asCol->name, bedCol->name, strlen(asCol->name)) == 0)
16 16
       ++definedFieldCount;
17 17
     bedCol = bedCol->next;
18 18
     asCol = asCol->next;
Browse code

Add BBDFile_write C interface for writing bigbed file

- Move helper functions and enum from src/bigBed.c to src/bigBedHelper.c
& src/bigBedHelper.h
- Add BBDFile_write function and other helper functions in src/bigBedHelper.c
& src/bigBedHelper.h

Sanchit Saini authored on 23/08/2020 15:54:06 • Michael Lawrence committed on 07/09/2020 15:21:26
Showing 1 changed file
1 1
new file mode 100644
... ...
@@ -0,0 +1,460 @@
1
+#include "bigBedHelper.h"
2
+
3
+/*
4
+  Most of the functions in this file are taken from ucscGenomeBrowser/kent/src/utils/bedToBigBed.c
5
+*/
6
+
7
+int getDefinedFieldCount(struct asObject *as) {
8
+  int definedFieldCount = 0;
9
+  struct asColumn *asCol = as->columnList;
10
+  char *asText = bedAsDef(12, 12);
11
+  struct asObject *bedAs = asParseText(asText);
12
+  freeMem(asText);
13
+  struct asColumn *bedCol = bedAs->columnList;
14
+  while (asCol && bedCol) {
15
+    if (strncmp(asCol->name, bedCol->name, sizeof(asCol->name)) == 0)
16
+      ++definedFieldCount;
17
+    bedCol = bedCol->next;
18
+    asCol = asCol->next;
19
+  }
20
+  asObjectFree(&bedAs);
21
+  return definedFieldCount;
22
+}
23
+
24
+bool isPresent(int definedFieldCount, int index) {
25
+  return (definedFieldCount - index >= 0) ? TRUE : FALSE;
26
+}
27
+
28
+bool isSelected(SEXP r_selectedindex, int position) {
29
+  if (length(r_selectedindex) == 0)
30
+    return TRUE;
31
+  for (int i = 0; i < length(r_selectedindex); ++i) {
32
+    if(INTEGER(r_selectedindex)[i] == position)
33
+      return TRUE;
34
+  }
35
+  return FALSE;
36
+}
37
+
38
+/* following functions are taken from the kent library */
39
+
40
/* Read bed lines from lf for as long as they belong to 'chrom', and build
 * a rangeTree recording coverage depth over the intervals seen.  The first
 * line on a different chromosome is pushed back (lineFileReuse) so the next
 * caller sees it.  Caller owns the returned tree. */
struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
{
  struct rbTree *tree = rangeTreeNew();
  char *line;
  while (lineFileNextReal(lf, &line)) {
    if (!startsWithWord(chrom, line)) {
      /* Line belongs to the next chromosome; hand it back. */
      lineFileReuse(lf);
      break;
    }
    char *row[3];
    chopLine(line, row);
    unsigned start = sqlUnsigned(row[1]);
    unsigned end = sqlUnsigned(row[2]);
    rangeTreeAddToCoverageDepth(tree, start, end);
  }
  return tree;
}
61
+
62
+void bbExIndexMakerAddKeysFromRow(struct bbExIndexMaker *eim, char **row, int recordIx)
63
+/* Save the keys that are being indexed by row in eim. */
64
+{
65
+int i;
66
+for (i=0; i < eim->indexCount; ++i)
67
+    {
68
+    int rowIx = eim->indexFields[i];
69
+    eim->chunkArrayArray[i][recordIx].name = cloneString(row[rowIx]);
70
+    }
71
+}
72
+
73
+void bbExIndexMakerAddOffsetSize(struct bbExIndexMaker *eim, bits64 offset, bits64 size,
74
+    long startIx, long endIx)
75
+/* Update offset and size fields of all file chunks between startIx and endIx */
76
+{
77
+int i;
78
+for (i=0; i < eim->indexCount; ++i)
79
+    {
80
+    struct bbNamedFileChunk *chunks = eim->chunkArrayArray[i];
81
+    long j;
82
+    for (j = startIx; j < endIx; ++j)
83
+        {
84
+	struct bbNamedFileChunk *chunk = chunks + j;
85
+	chunk->offset = offset;
86
+	chunk->size = size;
87
+	}
88
+    }
89
+}
90
+
91
+/* Allocate the big part of the extra index maker - the part that holds which
92
+ * chunk is used for each record. */
93
+void bbExIndexMakerAllocChunkArrays(struct bbExIndexMaker *eim, int recordCount) {
94
+  eim->recordCount = recordCount;
95
+  int i;
96
+  for (i=0; i < eim->indexCount; ++i)
97
+    AllocArray(eim->chunkArrayArray[i], recordCount);
98
+}
99
+
100
+/* Return an index maker corresponding to extraIndexList. Checks that all fields
101
+ * mentioned are in autoSql definition, and for now that they are all text fields. */
102
+struct bbExIndexMaker *bbExIndexMakerNew(struct slName *extraIndexList, struct asObject *as) {
103
+  /* Fill in scalar fields and return quickly if no extra indexes. */
104
+  struct bbExIndexMaker *eim;
105
+  AllocVar(eim);
106
+  eim->indexCount = slCount(extraIndexList);
107
+  if (eim->indexCount == 0)
108
+    return eim;	// Not much to do in this case
109
+
110
+  /* Allocate arrays according field count. */
111
+  AllocArray(eim->indexFields, eim->indexCount);
112
+  AllocArray(eim->maxFieldSize, eim->indexCount);
113
+  AllocArray(eim->chunkArrayArray, eim->indexCount);
114
+  AllocArray(eim->fileOffsets, eim->indexCount);
115
+
116
+  /* Loop through each field checking that it is indeed something we can index
117
+  * and if so saving information about it */
118
+  int indexIx = 0;
119
+  struct slName *name;
120
+  for (name = extraIndexList; name != NULL; name = name->next) {
121
+      struct asColumn *col = asColumnFind(as, name->name);
122
+      if (col == NULL)
123
+          errAbort("extraIndex field %s not a standard bed field or found in autoSql string.",
124
+        name->name);
125
+      if (!sameString(col->lowType->name, "string"))
126
+          errAbort("Sorry for now can only index string fields.");
127
+      eim->indexFields[indexIx] = slIxFromElement(as->columnList, col);
128
+      ++indexIx;
129
+  }
130
+  return eim;
131
+}
132
+
133
+/* Compare two named offset object to facilitate qsorting by name. */
134
+int bbNamedFileChunkCmpByName(const void *va, const void *vb) {
135
+  const struct bbNamedFileChunk *a = va, *b = vb;
136
+  return strcmp(a->name, b->name);
137
+}
138
+
139
+/* Return pointer to val for bPlusTree maker. */
140
+void *bbNamedFileChunkVal(const void *va) {
141
+  const struct bbNamedFileChunk *item = va;
142
+  return (void *)&item->offset;
143
+}
144
+
145
/* bPlusTree maker callback: copy an item's name into the fixed-size key
 * buffer.  strncpy is intentional here: bPlusTree keys are fixed-width,
 * and strncpy zero-pads the remainder of keyBuf.
 * NOTE(review): assumes keyBuf holds at least maxBedNameSize bytes and
 * that the tree code tolerates a non-NUL-terminated key when the name is
 * exactly maxBedNameSize long - confirm against the bptFile caller. */
void bbNamedFileChunkKey(const void *va, char *keyBuf) {
  const struct bbNamedFileChunk *item = va;
  strncpy(keyBuf,item->name, maxBedNameSize);
}
150
+
151
struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory.
 * Returns the second-level reduction list (allocated in lm); fills in
 * *retDataStart / *retIndexStart with file offsets of the zoom data and its
 * cirTree index, and accumulates whole-file stats into *totalSum.
 * NOTE(review): fieldCount is not used in this body - presumably kept for
 * interface parity with kent's bedToBigBed; confirm before removing. */
{
struct bbiSummary *twiceReducedList = NULL;	/* result: next zoom level, built in memory */
bits32 doubleReductionSize = initialReduction * zoomIncrement;	/* bin size of next level */
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);	/* one bound per first-level summary */
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);	/* zoom data section begins with its summary count */
/* This gets a little complicated I'm afraid.  The strategy is to:
 *   1) Build up a range tree that represents coverage depth on that chromosome
 *      This also has the nice side effect of getting rid of overlaps.
 *   2) Stream through the range tree, outputting the initial summary level and
 *      further reducing. 
 */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;	/* stack-local working summary */
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
        {
	/* Grab values we want from range. */
	double val = ptToInt(range->val);	/* coverage depth packed into the val pointer */
	int start = range->start;
	int end = range->end;
	bits32 size = end - start;

	/* Add to total summary. */
	if (firstTime)
	    {
	    /* First range seen anywhere: initialize rather than fold in. */
	    totalSum->validCount = size;
	    totalSum->minVal = totalSum->maxVal = val;
	    totalSum->sumData = val*size;
	    totalSum->sumSquares = val*val*size;
	    firstTime = FALSE;
	    }
	else
	    {
	    totalSum->validCount += size;
	    if (val < totalSum->minVal) totalSum->minVal = val;
	    if (val > totalSum->maxVal) totalSum->maxVal = val;
	    totalSum->sumData += val*size;
	    totalSum->sumSquares += val*val*size;
	    }

	/* If start past existing block then output it. */
	if (sum != NULL && sum->end <= start && sum->end < usage->size)
	    {
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	    sum = NULL;
	    }
	/* If don't have a summary we're working on now, make one. */
	if (sum == NULL)
	    {
	    oneSummary.chromId = usage->id;
	    oneSummary.start = start;
	    oneSummary.end = start + initialReduction;
	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	    oneSummary.minVal = oneSummary.maxVal = val;
	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
	    oneSummary.validCount = 0;
	    sum = &oneSummary;
	    }
	/* Deal with case where might have to split an item between multiple summaries.  This
	 * loop handles all but the final affected summary in that case. */
	while (end > sum->end)
	    {
	    /* Fold in bits that overlap with existing summary and output. */
	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
	    assert(overlap > 0);
	    sum->validCount += overlap;
	    if (sum->minVal > val) sum->minVal = val;
	    if (sum->maxVal < val) sum->maxVal = val;
	    sum->sumData += val * overlap;
	    sum->sumSquares += val*val * overlap;
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		    &boundsPt, boundsEnd, lm, stream);
	    size -= overlap;

	    /* Move summary to next part. */
	    sum->start = start = sum->end;
	    sum->end = start + initialReduction;
	    if (sum->end > usage->size) sum->end = usage->size;
	    sum->minVal = sum->maxVal = val;
	    sum->sumData = sum->sumSquares = 0.0;
	    sum->validCount = 0;
	    }

	/* Add to summary. */
	sum->validCount += size;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * size;
	sum->sumSquares += val*val * size;
	}
    /* Flush the last partial summary of this chromosome, if any. */
    if (sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
	    &boundsPt, boundsEnd, lm, stream);
	}
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
/* NOTE(review): indexOffset is an int holding ftell(f)'s long result -
 * could truncate on files > 2GB; confirm against upstream kent code. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);	/* every allocated bound must have been used */
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);	/* list was built head-first; restore order */
return twiceReducedList;
}
279
+
280
void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
	int itemsPerSlot, struct bbiBoundsArray *bounds,
	int sectionCount, boolean doCompress, FILE *f,
	int resTryCount, int resScales[], int resSizes[],
	struct bbExIndexMaker *eim,  int bedCount,
	bits16 fieldCount, int bedN, bits32 *retMaxBlockSize)
/* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
 * to boundsArray.
 * Also: validates each bed line, fills resSizes[] with zoom-level item
 * counts, records per-record keys/offsets into eim when extra indexing is
 * on, and returns the largest uncompressed block size via *retMaxBlockSize.
 * NOTE(review): bedCount is not used in this body - presumably interface
 * parity with kent's bedToBigBed; confirm before removing. */
{
int maxBlockSize = 0;	/* largest uncompressed block, reported to caller */
struct bbiChromUsage *usage = usageList;
char *line, *row[fieldCount+1];
int lastField = fieldCount-1;
int itemIx = 0, sectionIx = 0;	/* item within block, block within file */
bits64 blockStartOffset = 0;
int startPos = 0, endPos = 0;	/* genomic span covered by current block */
bits32 chromId = 0;
struct dyString *stream = dyStringNew(0);	/* in-memory staging for one block */

/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
struct bed *bed;
AllocVar(bed);

/* Help keep track of which beds are in current chunk so as to write out
 * namedChunks to eim if need be. */
long sectionStartIx = 0, sectionEndIx = 0;

for (;;)
    {
    /* Get next line of input if any. */
    if (lineFileNextReal(lf, &line))
	{
	/* Chop up line and make sure the word count is right. */
	int wordCount;
	wordCount = chopLine(line, row);
	lineFileExpectWords(lf, fieldCount, wordCount);
	loadAndValidateBedExt(row, bedN, fieldCount, lf, bed, as, FALSE, TRUE);
	chrom = bed->chrom;
	start = bed->chromStart;
	end = bed->chromEnd;

	sameChrom = sameString(chrom, usage->name);
	}
    else  /* No next line */
	{
	atEnd = TRUE;
	}

    /* Check conditions that would end block and save block info and advance to next if need be. */
    if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
        {
	/* Save stream to file, compressing if need be. */
	if (stream->stringSize > maxBlockSize)
	    maxBlockSize = stream->stringSize;
	if (doCompress)
            {
	    size_t maxCompSize = zCompBufSize(stream->stringSize);

            // keep around an area of scratch memory
            // NOTE(review): static scratch buffer makes this function
            // non-reentrant/not thread-safe, and compBufSize is a signed
            // int compared against size_t maxCompSize (signed/unsigned
            // comparison) - same as upstream kent code, but worth confirming.
            static int compBufSize = 0;
            static char *compBuf = NULL;
            // check to see if buffer needed for compression is big enough
            if (compBufSize < maxCompSize)
                {
                // free up the old not-big-enough piece
                freez(&compBuf); // freez knows bout NULL

                // get new scratch area
                compBufSize = maxCompSize;
                compBuf = needLargeMem(compBufSize);
                }
	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
	    mustWrite(f, compBuf, compSize);
	    }
	else
	    mustWrite(f, stream->string, stream->stringSize);
	dyStringClear(stream);

	/* Save block offset and size for all named chunks in this section. */
	if (eim != NULL)
	    {
	    bits64 blockEndOffset = ftell(f);
	    bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
		sectionStartIx, sectionEndIx);
	    sectionStartIx = sectionEndIx;
	    }

	/* Save info on existing block. */
	struct bbiBoundsArray *b = &bounds[sectionIx];
	b->offset = blockStartOffset;
	b->range.chromIx = chromId;
	b->range.start = startPos;
	b->range.end = endPos;
	++sectionIx;
	itemIx = 0;

	if (atEnd)
	    break;
	}

    /* Advance to next chromosome if need be and get chromosome id. */
    if (!sameChrom)
        {
	usage = usage->next;
	assert(usage != NULL);	/* input must be sorted to match usageList */
	assert(sameString(chrom, usage->name));
	for (resTry = 0; resTry < resTryCount; ++resTry)
	    resEnds[resTry] = 0;	/* zoom counters restart per chromosome */
	}
    chromId = usage->id;

    /* At start of block we save a lot of info. */
    if (itemIx == 0)
        {
	blockStartOffset = ftell(f);
	startPos = start;
	endPos = end;
	}
    /* Otherwise just update end. */
    /* NOTE(review): there is no 'else' here - this bare block also runs
     * right after the itemIx==0 branch.  Harmless (endPos was just set to
     * end, so the condition is false), and it matches upstream kent code. */
        {
	if (endPos < end)
	    endPos = end;
	/* No need to update startPos since list is sorted. */
	}

    /* Save name into namedOffset if need be. */
    if (eim != NULL)
	{
	bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx);
	sectionEndIx += 1;
	}

    /* Write out data. */
    dyStringWriteOne(stream, chromId);
    dyStringWriteOne(stream, start);
    dyStringWriteOne(stream, end);
    if (fieldCount > 3)
        {
	int i;
	/* Write 3rd through next to last field and a tab separator. */
	for (i=3; i<lastField; ++i)
	    {
	    char *s = row[i];
	    dyStringAppend(stream, s);
	    dyStringAppendC(stream, '\t');
	    }
	/* Write last field and terminal zero */
	char *s = row[lastField];
	dyStringAppend(stream, s);
	}
    dyStringAppendC(stream, 0);	/* NUL terminates each record's rest-of-fields blob */

    itemIx += 1;

    /* Do zoom counting. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
	bits32 resEnd = resEnds[resTry];
	if (start >= resEnd)
	    {
	    /* Item begins a new bin at this resolution. */
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = start + resScales[resTry];
	    }
	while (end > resEnd)
	    {
	    /* Item spills into further bins; count each one. */
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = resEnd + resScales[resTry];
	    }
	}
    }
assert(sectionIx == sectionCount);	/* caller pre-computed the block count */
freez(&bed);
*retMaxBlockSize = maxBlockSize;
}