Browse code

seqan header files

aguang authored on 15/02/2018 18:04:57
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,505 @@
1
+// ==========================================================================
2
+//                 SeqAn - The Library for Sequence Analysis
3
+// ==========================================================================
4
+// Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5
+// All rights reserved.
6
+//
7
+// Redistribution and use in source and binary forms, with or without
8
+// modification, are permitted provided that the following conditions are met:
9
+//
10
+//     * Redistributions of source code must retain the above copyright
11
+//       notice, this list of conditions and the following disclaimer.
12
+//     * Redistributions in binary form must reproduce the above copyright
13
+//       notice, this list of conditions and the following disclaimer in the
14
+//       documentation and/or other materials provided with the distribution.
15
+//     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16
+//       its contributors may be used to endorse or promote products derived
17
+//       from this software without specific prior written permission.
18
+//
19
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
+// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29
+// DAMAGE.
30
+//
31
+// ==========================================================================
32
+// Author: David Weese <david.weese@fu-berlin.de>
33
+// ==========================================================================
34
+
35
+#ifndef SEQAN_STREAM_STREAM_COMPRESSOR_H_
36
+#define SEQAN_STREAM_STREAM_COMPRESSOR_H_
37
+
38
+
39
+#if SEQAN_HAS_ZLIB
40
+// Zlib headers
41
+#include <zlib.h>
42
+#include "iostream_zutil.h"
43
+#endif
44
+
45
+#include <algorithm>    // copy
46
+
47
+namespace seqan {
48
+
49
+// ============================================================================
50
+// Forwards
51
+// ============================================================================
52
+
53
+template <typename TOutPager, typename TSpec>
54
+struct Pager;
55
+
56
+// ============================================================================
57
+// Tags, Enums
58
+// ============================================================================
59
+
60
+// ============================================================================
61
+// Classes
62
+// ============================================================================
63
+
64
+// Special end-of-file marker defined by the BGZF compression format.
65
+// See: https://samtools.github.io/hts-specs/SAMv1.pdf
66
+static constexpr std::array<uint8_t, 28> BGZF_END_OF_FILE_MARKER {{0x1f, 0x8b, 0x08, 0x04,
67
+                                                                  0x00, 0x00, 0x00, 0x00,
68
+                                                                  0x00, 0xff, 0x06, 0x00,
69
+                                                                  0x42, 0x43, 0x02, 0x00,
70
+                                                                  0x1b, 0x00, 0x03, 0x00,
71
+                                                                  0x00, 0x00, 0x00, 0x00,
72
+                                                                  0x00, 0x00, 0x00, 0x00}};
73
+
74
+template <typename TAlgTag>
75
+struct Compress;
76
+
77
+template <typename TAlgTag>
78
+struct CompressionContext {};
79
+
80
+template <typename TAlgTag>
81
+struct DefaultPageSize;
82
+
83
+#if SEQAN_HAS_ZLIB
84
+
85
+template <>
86
+struct CompressionContext<GZFile>
87
+{
88
+    z_stream strm;
89
+
90
+    CompressionContext()
91
+    {
92
+        memset(&strm, 0, sizeof(z_stream));
93
+    }
94
+};
95
+
96
+template <>
97
+struct CompressionContext<BgzfFile>:
98
+    CompressionContext<GZFile>
99
+{
100
+    enum { BLOCK_HEADER_LENGTH = 18 };
101
+    unsigned char headerPos;
102
+};
103
+
104
+template <typename T>
105
+struct MagicHeader<BgzfFile, T>
106
+{
107
+    static char const VALUE[18];
108
+};
109
+
110
+template <typename T>
111
+char const MagicHeader<BgzfFile, T>::VALUE[18] =
112
+{
113
+    MagicHeader<GZFile>::VALUE[0], MagicHeader<GZFile>::VALUE[1], MagicHeader<GZFile>::VALUE[2],
114
+    4, 0, 0, 0, 0, 0, '\xff', 6, 0, 'B', 'C', 2, 0, 0, 0
115
+};
116
+
117
+template <>
118
+struct DefaultPageSize<BgzfFile>
119
+{
120
+    static const unsigned MAX_BLOCK_SIZE = 64 * 1024;
121
+    static const unsigned BLOCK_FOOTER_LENGTH = 8;
122
+    // 5 bytes block overhead (see 3.2.4. at http://www.gzip.org/zlib/rfc-deflate.html)
123
+    static const unsigned ZLIB_BLOCK_OVERHEAD = 5;
124
+
125
+    // Reduce the maximal input size, such that the compressed data
126
+    // always fits in one block even for level Z_NO_COMPRESSION.
127
+    enum { BLOCK_HEADER_LENGTH = CompressionContext<BgzfFile>::BLOCK_HEADER_LENGTH };
128
+    static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
129
+};
130
+
131
+
132
+/*
133
+template <typename TOutPager, typename TAlgTag>
134
+Pager<TOutPager, Compress<TAlgTag> >
135
+{
136
+    TOutPager outPager;             // outbound pager
137
+    PageTable<FixedSize> table;     // our page table
138
+
139
+    Pager():
140
+        table(DefaultPageSize<TAlgTag>::VALUE)
141
+    {}
142
+
143
+    Page & getPage (int64_t position)
144
+    {
145
+        Page *page;
146
+        {
147
+            ScopedReadLock(table.lock);
148
+
149
+            page = table[position];
150
+            if (posInPage(position, page))                      // does the page exist yet?
151
+                return page;
152
+        }
153
+        {
154
+            ScopedWriteLock(table.lock);
155
+
156
+            page = table[position];
157
+            if (posInPage(position, page))                      // does the page exist yet?
158
+                return page;
159
+
160
+            page = new Page(table.rangeForPos(position));       // create new page
161
+            reserve(page.data, table.pageSize);                 // allocate required memory
162
+            table.insertPage(page);                             // insert page
163
+            prevPage = prevPage(position);
164
+        }
165
+        return page;
166
+    }
167
+
168
+    void putPage (Page &page)
169
+    {
170
+        int64_t outPosition = 0;                                // compute start position in outbound pager
171
+        if (page.range.begin != 0)
172
+        {
173
+            PageRange range = getPageRange(beginPosition(page.range) - 1);
174
+            outPosition = endPosition(range);                   // wait for end position of the previous page
175
+        }
176
+
177
+        TCompressionContext ctx;
178
+        initCompressionContext(ctx);
179
+
180
+        Size<Page>::Type leftToCompress = length(page);
181
+        while (leftToCompress != 0)
182
+        {
183
+            Page &outPage = outPager.getPage(outPosition);
184
+
185
+            auto handle = std::async(std::launch::async,
186
+                                  parallel_sum<RAIter>, mid, end);
187
+            compress
188
+        }
189
+    }
190
+};
191
+*/
192
+// ============================================================================
193
+// Functions
194
+// ============================================================================
195
+
196
+inline void
197
+compressInit(CompressionContext<GZFile> & ctx)
198
+{
199
+    const int GZIP_WINDOW_BITS = -15;   // no zlib header
200
+    const int Z_DEFAULT_MEM_LEVEL = 8;
201
+
202
+    ctx.strm.zalloc = NULL;
203
+    ctx.strm.zfree = NULL;
204
+
205
+    // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
206
+    //          to be 2x faster and produces only 7% bigger output
207
+//    int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
208
+//                              GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
209
+    int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED,
210
+                              GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
211
+    if (status != Z_OK)
212
+        throw IOError("GZFile deflateInit2() failed.");
213
+}
214
+
215
+inline void
216
+compressInit(CompressionContext<BgzfFile> & ctx)
217
+{
218
+    compressInit(static_cast<CompressionContext<GZFile> &>(ctx));
219
+    ctx.headerPos = 0;
220
+}
221
+
222
+template <typename TTarget, typename TSourceIterator>
223
+inline typename Size<TTarget>::Type
224
+compress(TTarget & target, TSourceIterator & source, CompressionContext<BgzfFile> & ctx)
225
+{
226
+    typedef typename Chunk<TTarget>::Type           TTargetChunk;
227
+    typedef typename Chunk<TSourceIterator>::Type   TSourceChunk;
228
+    typedef typename Value<TSourceChunk>::Type      TSourceValue;
229
+
230
+    TTargetChunk tChunk;
231
+    TSourceChunk sChunk;
232
+
233
+    if (ctx.headerPos < sizeof(MagicHeader<BgzfFile>::VALUE))
234
+    {
235
+        size_t headerLeft = sizeof(MagicHeader<BgzfFile>::VALUE) - ctx.headerPos;
236
+        reserveChunk(target, headerLeft, Output());
237
+
238
+        tChunk = getChunk(target, Output());
239
+        size_t size = std::min(headerLeft, length(tChunk));
240
+        SEQAN_ASSERT_GT(size, 0u);
241
+
242
+        std::copy(tChunk.begin, sChunk.begin, size);
243
+
244
+        advanceChunk(target, size);
245
+        ctx.headerPos += size;
246
+        return size;
247
+    }
248
+    else
249
+    {
250
+        sChunk = getChunk(source, Input());
251
+        tChunk = getChunk(target, Output());
252
+
253
+        ctx.strm.next_in = static_cast<Bytef *>(sChunk.begin);
254
+        ctx.strm.next_out = static_cast<Bytef *>(tChunk.begin);
255
+        ctx.strm.avail_in = length(sChunk) * sizeof(TSourceValue);
256
+        ctx.strm.avail_out = length(tChunk);
257
+
258
+        SEQAN_ASSERT_GT(ctx.strm.avail_out, 0u);
259
+
260
+        int status = deflate(&ctx.strm, Z_NO_FLUSH);
261
+        if (status != Z_OK)
262
+            throw IOError("BgzfFile deflateInit2() failed.");
263
+
264
+        source += length(sChunk) - ctx.strm.avail_in;
265
+        size_t size = length(tChunk) - ctx.strm.avail_out;
266
+        advanceChunk(target, size);
267
+        return size;
268
+    }
269
+
270
+
271
+//    status = deflate(&zs, Z_FINISH);
272
+//    bool rawDataTooBig = (status != Z_STREAM_END);
273
+//
274
+//    status = deflateEnd(&zs);
275
+//    if (status != Z_OK)
276
+//        throw IOError("BgzfFile deflateEnd() failed.");
277
+//
278
+//    if (!rawDataTooBig)
279
+//    {
280
+//        resize(page.raw, zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
281
+//        break;
282
+//    }
283
+}
284
+
285
+// ----------------------------------------------------------------------------
286
+// Helper Function _bgzfUnpackXX()
287
+// ----------------------------------------------------------------------------
288
+
289
+inline uint16_t
290
+_bgzfUnpack16(char const * buffer)
291
+{
292
+    uint16_t tmp = *reinterpret_cast<uint16_t const *>(buffer);
293
+    enforceLittleEndian(tmp);
294
+    return tmp;
295
+}
296
+
297
+inline uint32_t
298
+_bgzfUnpack32(char const * buffer)
299
+{
300
+    uint32_t tmp = *reinterpret_cast<uint32_t const *>(buffer);
301
+    enforceLittleEndian(tmp);
302
+    return tmp;
303
+}
304
+
305
+// ----------------------------------------------------------------------------
306
+// Helper Function _bgzfPackXX()
307
+// ----------------------------------------------------------------------------
308
+
309
+inline void
310
+_bgzfPack16(char * buffer, uint16_t value)
311
+{
312
+    enforceLittleEndian(value);
313
+    *reinterpret_cast<uint16_t *>(buffer) = value;
314
+}
315
+
316
+inline void
317
+_bgzfPack32(char * buffer, uint32_t value)
318
+{
319
+    enforceLittleEndian(value);
320
+    *reinterpret_cast<uint32_t *>(buffer) = value;
321
+}
322
+
323
+
324
+template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
325
+inline TDestCapacity
326
+_compressBlock(TDestValue *dstBegin,   TDestCapacity dstCapacity,
327
+               TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<BgzfFile> & ctx)
328
+{
329
+    const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_HEADER_LENGTH;
330
+    const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_FOOTER_LENGTH;
331
+
332
+    SEQAN_ASSERT_GT(dstCapacity, BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
333
+    SEQAN_ASSERT_EQ(sizeof(TDestValue), 1u);
334
+    SEQAN_ASSERT_EQ(sizeof(unsigned), 4u);
335
+
336
+    // 1. COPY HEADER
337
+
338
+    std::copy(&MagicHeader<BgzfFile>::VALUE[0], &MagicHeader<BgzfFile>::VALUE[BLOCK_HEADER_LENGTH], dstBegin);
339
+
340
+
341
+    // 2. COMPRESS
342
+
343
+    compressInit(ctx);
344
+    ctx.strm.next_in = (Bytef *)(srcBegin);
345
+    ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
346
+    ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
347
+    ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
348
+
349
+    int status = deflate(&ctx.strm, Z_FINISH);
350
+    if (status != Z_STREAM_END)
351
+    {
352
+        deflateEnd(&ctx.strm);
353
+        throw IOError("Deflation failed. Compressed BGZF data is too big.");
354
+    }
355
+
356
+    status = deflateEnd(&ctx.strm);
357
+    if (status != Z_OK)
358
+        throw IOError("BGZF deflateEnd() failed.");
359
+
360
+
361
+    // 3. APPEND FOOTER
362
+
363
+    // Set compressed length into buffer, compute CRC and write CRC into buffer.
364
+
365
+    size_t len = dstCapacity - ctx.strm.avail_out;
366
+    _bgzfPack16(dstBegin + 16, len - 1);
367
+
368
+    dstBegin += len - BLOCK_FOOTER_LENGTH;
369
+    _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
370
+    _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
371
+
372
+    return dstCapacity - ctx.strm.avail_out;
373
+}
374
+
375
+inline void
376
+decompressInit(CompressionContext<GZFile> & ctx)
377
+{
378
+    const int GZIP_WINDOW_BITS = -15;   // no zlib header
379
+
380
+    ctx.strm.zalloc = NULL;
381
+    ctx.strm.zfree = NULL;
382
+    int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
383
+    if (status != Z_OK)
384
+        throw IOError("GZip inflateInit2() failed.");
385
+}
386
+
387
+inline void
388
+decompressInit(CompressionContext<BgzfFile> & ctx)
389
+{
390
+    decompressInit(static_cast<CompressionContext<GZFile> &>(ctx));
391
+    ctx.headerPos = 0;
392
+}
393
+
394
+inline bool
395
+_bgzfCheckHeader(char const * header)
396
+{
397
+    const char FLG_FEXTRA = 4;
398
+    const char BGZF_ID1 = 'B';
399
+    const char BGZF_ID2 = 'C';
400
+    const char BGZF_LEN = 2;
401
+    const char BGZF_XLEN = 6;  // BGZF_LEN+4
402
+
403
+    return (header[0] == (char)MagicHeader<GZFile>::VALUE[0] &&
404
+            header[1] == (char)MagicHeader<GZFile>::VALUE[1] &&
405
+            header[2] == (char)MagicHeader<GZFile>::VALUE[2] &&
406
+            (header[3] & FLG_FEXTRA) != 0 &&
407
+            _bgzfUnpack16(header + 10) == BGZF_XLEN &&
408
+            header[12] == BGZF_ID1 &&
409
+            header[13] == BGZF_ID2 &&
410
+            _bgzfUnpack16(header + 14) == BGZF_LEN);
411
+}
412
+
413
+// read first bytes of a file/stream and compare with file format's magic header
414
+template <typename TStream>
415
+inline bool
416
+guessFormatFromStream(TStream &istream, BgzfFile)
417
+{
418
+    char putbackBuf[18];
419
+    bool match = false;
420
+
421
+    SEQAN_ASSERT(istream.good());
422
+
423
+    // try to read and check header
424
+    size_t numRead = istream.readsome(&putbackBuf[0], sizeof(putbackBuf));
425
+    if (numRead == sizeof(putbackBuf) && _bgzfCheckHeader(putbackBuf))
426
+        match = true;
427
+
428
+    // unget all read characters
429
+    for (; numRead > 0; --numRead)
430
+        istream.unget();
431
+
432
+    SEQAN_ASSERT(istream.good());
433
+
434
+    return match;
435
+}
436
+
437
+// ----------------------------------------------------------------------------
438
+// Function _preprocessFilePage()
439
+// ----------------------------------------------------------------------------
440
+
441
+template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
442
+inline TDestCapacity
443
+_decompressBlock(TDestValue *dstBegin,   TDestCapacity dstCapacity,
444
+                 TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<BgzfFile> & ctx)
445
+{
446
+    const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_HEADER_LENGTH;
447
+    const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_FOOTER_LENGTH;
448
+
449
+    SEQAN_ASSERT_EQ(sizeof(TSourceValue), 1u);
450
+    SEQAN_ASSERT_EQ(sizeof(unsigned), 4u);
451
+
452
+    // 1. CHECK HEADER
453
+
454
+    if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
455
+        throw IOError("BGZF block too short.");
456
+
457
+    if (!_bgzfCheckHeader(srcBegin))
458
+        throw IOError("Invalid BGZF block header.");
459
+
460
+    size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
461
+    if (compressedLen != srcLength)
462
+        throw IOError("BGZF compressed size mismatch.");
463
+
464
+
465
+    // 2. DECOMPRESS
466
+
467
+    decompressInit(ctx);
468
+    ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
469
+    ctx.strm.next_out = (Bytef *)(dstBegin);
470
+    ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
471
+    ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
472
+
473
+    int status = inflate(&ctx.strm, Z_FINISH);
474
+    if (status != Z_STREAM_END)
475
+    {
476
+        inflateEnd(&ctx.strm);
477
+        throw IOError("Inflation failed. Decompressed BGZF data is too big.");
478
+    }
479
+
480
+    status = inflateEnd(&ctx.strm);
481
+    if (status != Z_OK)
482
+        throw IOError("BGZF inflateEnd() failed.");
483
+
484
+
485
+    // 3. CHECK FOOTER
486
+
487
+    // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
488
+
489
+    unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
490
+
491
+    srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
492
+    if (_bgzfUnpack32(srcBegin) != crc)
493
+        throw IOError("BGZF wrong checksum.");
494
+
495
+    if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
496
+        throw IOError("BGZF size mismatch.");
497
+
498
+    return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
499
+}
500
+
501
+#endif  // #if SEQAN_HAS_ZLIB
502
+
503
+}  // namespace seqan
504
+
505
+#endif  // SEQAN_STREAM_STREAM_COMPRESSOR_H_