1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,505 @@ |
1 |
+// ========================================================================== |
|
2 |
+// SeqAn - The Library for Sequence Analysis |
|
3 |
+// ========================================================================== |
|
4 |
+// Copyright (c) 2006-2018, Knut Reinert, FU Berlin |
|
5 |
+// All rights reserved. |
|
6 |
+// |
|
7 |
+// Redistribution and use in source and binary forms, with or without |
|
8 |
+// modification, are permitted provided that the following conditions are met: |
|
9 |
+// |
|
10 |
+// * Redistributions of source code must retain the above copyright |
|
11 |
+// notice, this list of conditions and the following disclaimer. |
|
12 |
+// * Redistributions in binary form must reproduce the above copyright |
|
13 |
+// notice, this list of conditions and the following disclaimer in the |
|
14 |
+// documentation and/or other materials provided with the distribution. |
|
15 |
+// * Neither the name of Knut Reinert or the FU Berlin nor the names of |
|
16 |
+// its contributors may be used to endorse or promote products derived |
|
17 |
+// from this software without specific prior written permission. |
|
18 |
+// |
|
19 |
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
20 |
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
21 |
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
22 |
+// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE |
|
23 |
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
24 |
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
|
25 |
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
|
26 |
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
27 |
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
28 |
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
|
29 |
+// DAMAGE. |
|
30 |
+// |
|
31 |
+// ========================================================================== |
|
32 |
+// Author: David Weese <david.weese@fu-berlin.de> |
|
33 |
+// ========================================================================== |
|
34 |
+ |
|
35 |
+#ifndef SEQAN_STREAM_STREAM_COMPRESSOR_H_ |
|
36 |
+#define SEQAN_STREAM_STREAM_COMPRESSOR_H_ |
|
37 |
+ |
|
38 |
+ |
|
39 |
+#if SEQAN_HAS_ZLIB |
|
40 |
+// Zlib headers |
|
41 |
+#include <zlib.h> |
|
42 |
+#include "iostream_zutil.h" |
|
43 |
+#endif |
|
44 |
+ |
|
45 |
+#include <algorithm> // copy |
|
46 |
+ |
|
47 |
+namespace seqan { |
|
48 |
+ |
|
49 |
+// ============================================================================ |
|
50 |
+// Forwards |
|
51 |
+// ============================================================================ |
|
52 |
+ |
|
53 |
+template <typename TOutPager, typename TSpec> |
|
54 |
+struct Pager; |
|
55 |
+ |
|
56 |
+// ============================================================================ |
|
57 |
+// Tags, Enums |
|
58 |
+// ============================================================================ |
|
59 |
+ |
|
60 |
+// ============================================================================ |
|
61 |
+// Classes |
|
62 |
+// ============================================================================ |
|
63 |
+ |
|
64 |
+// Special end-of-file marker defined by the BGZF compression format. |
|
65 |
+// See: https://samtools.github.io/hts-specs/SAMv1.pdf |
|
66 |
+static constexpr std::array<uint8_t, 28> BGZF_END_OF_FILE_MARKER {{0x1f, 0x8b, 0x08, 0x04, |
|
67 |
+ 0x00, 0x00, 0x00, 0x00, |
|
68 |
+ 0x00, 0xff, 0x06, 0x00, |
|
69 |
+ 0x42, 0x43, 0x02, 0x00, |
|
70 |
+ 0x1b, 0x00, 0x03, 0x00, |
|
71 |
+ 0x00, 0x00, 0x00, 0x00, |
|
72 |
+ 0x00, 0x00, 0x00, 0x00}}; |
|
73 |
+ |
|
74 |
+template <typename TAlgTag> |
|
75 |
+struct Compress; |
|
76 |
+ |
|
77 |
+template <typename TAlgTag> |
|
78 |
+struct CompressionContext {}; |
|
79 |
+ |
|
80 |
+template <typename TAlgTag> |
|
81 |
+struct DefaultPageSize; |
|
82 |
+ |
|
83 |
+#if SEQAN_HAS_ZLIB |
|
84 |
+ |
|
85 |
+template <> |
|
86 |
+struct CompressionContext<GZFile> |
|
87 |
+{ |
|
88 |
+ z_stream strm; |
|
89 |
+ |
|
90 |
+ CompressionContext() |
|
91 |
+ { |
|
92 |
+ memset(&strm, 0, sizeof(z_stream)); |
|
93 |
+ } |
|
94 |
+}; |
|
95 |
+ |
|
96 |
+template <> |
|
97 |
+struct CompressionContext<BgzfFile>: |
|
98 |
+ CompressionContext<GZFile> |
|
99 |
+{ |
|
100 |
+ enum { BLOCK_HEADER_LENGTH = 18 }; |
|
101 |
+ unsigned char headerPos; |
|
102 |
+}; |
|
103 |
+ |
|
104 |
+template <typename T> |
|
105 |
+struct MagicHeader<BgzfFile, T> |
|
106 |
+{ |
|
107 |
+ static char const VALUE[18]; |
|
108 |
+}; |
|
109 |
+ |
|
110 |
+template <typename T> |
|
111 |
+char const MagicHeader<BgzfFile, T>::VALUE[18] = |
|
112 |
+{ |
|
113 |
+ MagicHeader<GZFile>::VALUE[0], MagicHeader<GZFile>::VALUE[1], MagicHeader<GZFile>::VALUE[2], |
|
114 |
+ 4, 0, 0, 0, 0, 0, '\xff', 6, 0, 'B', 'C', 2, 0, 0, 0 |
|
115 |
+}; |
|
116 |
+ |
|
117 |
+template <> |
|
118 |
+struct DefaultPageSize<BgzfFile> |
|
119 |
+{ |
|
120 |
+ static const unsigned MAX_BLOCK_SIZE = 64 * 1024; |
|
121 |
+ static const unsigned BLOCK_FOOTER_LENGTH = 8; |
|
122 |
+ // 5 bytes block overhead (see 3.2.4. at http://www.gzip.org/zlib/rfc-deflate.html) |
|
123 |
+ static const unsigned ZLIB_BLOCK_OVERHEAD = 5; |
|
124 |
+ |
|
125 |
+ // Reduce the maximal input size, such that the compressed data |
|
126 |
+ // always fits in one block even for level Z_NO_COMPRESSION. |
|
127 |
+ enum { BLOCK_HEADER_LENGTH = CompressionContext<BgzfFile>::BLOCK_HEADER_LENGTH }; |
|
128 |
+ static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD; |
|
129 |
+}; |
|
130 |
+ |
|
131 |
+ |
|
132 |
+/* |
|
133 |
+template <typename TOutPager, typename TAlgTag> |
|
134 |
+Pager<TOutPager, Compress<TAlgTag> > |
|
135 |
+{ |
|
136 |
+ TOutPager outPager; // outbound pager |
|
137 |
+ PageTable<FixedSize> table; // our page table |
|
138 |
+ |
|
139 |
+ Pager(): |
|
140 |
+ table(DefaultPageSize<TAlgTag>::VALUE) |
|
141 |
+ {} |
|
142 |
+ |
|
143 |
+ Page & getPage (int64_t position) |
|
144 |
+ { |
|
145 |
+ Page *page; |
|
146 |
+ { |
|
147 |
+ ScopedReadLock(table.lock); |
|
148 |
+ |
|
149 |
+ page = table[position]; |
|
150 |
+ if (posInPage(position, page)) // does the page exist yet? |
|
151 |
+ return page; |
|
152 |
+ } |
|
153 |
+ { |
|
154 |
+ ScopedWriteLock(table.lock); |
|
155 |
+ |
|
156 |
+ page = table[position]; |
|
157 |
+ if (posInPage(position, page)) // does the page exist yet? |
|
158 |
+ return page; |
|
159 |
+ |
|
160 |
+ page = new Page(table.rangeForPos(position)); // create new page |
|
161 |
+ reserve(page.data, table.pageSize); // allocate required memory |
|
162 |
+ table.insertPage(page); // insert page |
|
163 |
+ prevPage = prevPage(position); |
|
164 |
+ } |
|
165 |
+ return page; |
|
166 |
+ } |
|
167 |
+ |
|
168 |
+ void putPage (Page &page) |
|
169 |
+ { |
|
170 |
+ int64_t outPosition = 0; // compute start position in outbound pager |
|
171 |
+ if (page.range.begin != 0) |
|
172 |
+ { |
|
173 |
+ PageRange range = getPageRange(beginPosition(page.range) - 1); |
|
174 |
+ outPosition = endPosition(range); // wait for end position of the previous page |
|
175 |
+ } |
|
176 |
+ |
|
177 |
+ TCompressionContext ctx; |
|
178 |
+ initCompressionContext(ctx); |
|
179 |
+ |
|
180 |
+ Size<Page>::Type leftToCompress = length(page); |
|
181 |
+ while (leftToCompress != 0) |
|
182 |
+ { |
|
183 |
+ Page &outPage = outPager.getPage(outPosition); |
|
184 |
+ |
|
185 |
+ auto handle = std::async(std::launch::async, |
|
186 |
+ parallel_sum<RAIter>, mid, end); |
|
187 |
+ compress |
|
188 |
+ } |
|
189 |
+ } |
|
190 |
+}; |
|
191 |
+*/ |
|
192 |
+// ============================================================================ |
|
193 |
+// Functions |
|
194 |
+// ============================================================================ |
|
195 |
+ |
|
196 |
+inline void |
|
197 |
+compressInit(CompressionContext<GZFile> & ctx) |
|
198 |
+{ |
|
199 |
+ const int GZIP_WINDOW_BITS = -15; // no zlib header |
|
200 |
+ const int Z_DEFAULT_MEM_LEVEL = 8; |
|
201 |
+ |
|
202 |
+ ctx.strm.zalloc = NULL; |
|
203 |
+ ctx.strm.zfree = NULL; |
|
204 |
+ |
|
205 |
+ // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out |
|
206 |
+ // to be 2x faster and produces only 7% bigger output |
|
207 |
+// int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, |
|
208 |
+// GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); |
|
209 |
+ int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED, |
|
210 |
+ GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); |
|
211 |
+ if (status != Z_OK) |
|
212 |
+ throw IOError("GZFile deflateInit2() failed."); |
|
213 |
+} |
|
214 |
+ |
|
215 |
+inline void |
|
216 |
+compressInit(CompressionContext<BgzfFile> & ctx) |
|
217 |
+{ |
|
218 |
+ compressInit(static_cast<CompressionContext<GZFile> &>(ctx)); |
|
219 |
+ ctx.headerPos = 0; |
|
220 |
+} |
|
221 |
+ |
|
222 |
+template <typename TTarget, typename TSourceIterator> |
|
223 |
+inline typename Size<TTarget>::Type |
|
224 |
+compress(TTarget & target, TSourceIterator & source, CompressionContext<BgzfFile> & ctx) |
|
225 |
+{ |
|
226 |
+ typedef typename Chunk<TTarget>::Type TTargetChunk; |
|
227 |
+ typedef typename Chunk<TSourceIterator>::Type TSourceChunk; |
|
228 |
+ typedef typename Value<TSourceChunk>::Type TSourceValue; |
|
229 |
+ |
|
230 |
+ TTargetChunk tChunk; |
|
231 |
+ TSourceChunk sChunk; |
|
232 |
+ |
|
233 |
+ if (ctx.headerPos < sizeof(MagicHeader<BgzfFile>::VALUE)) |
|
234 |
+ { |
|
235 |
+ size_t headerLeft = sizeof(MagicHeader<BgzfFile>::VALUE) - ctx.headerPos; |
|
236 |
+ reserveChunk(target, headerLeft, Output()); |
|
237 |
+ |
|
238 |
+ tChunk = getChunk(target, Output()); |
|
239 |
+ size_t size = std::min(headerLeft, length(tChunk)); |
|
240 |
+ SEQAN_ASSERT_GT(size, 0u); |
|
241 |
+ |
|
242 |
+ std::copy(tChunk.begin, sChunk.begin, size); |
|
243 |
+ |
|
244 |
+ advanceChunk(target, size); |
|
245 |
+ ctx.headerPos += size; |
|
246 |
+ return size; |
|
247 |
+ } |
|
248 |
+ else |
|
249 |
+ { |
|
250 |
+ sChunk = getChunk(source, Input()); |
|
251 |
+ tChunk = getChunk(target, Output()); |
|
252 |
+ |
|
253 |
+ ctx.strm.next_in = static_cast<Bytef *>(sChunk.begin); |
|
254 |
+ ctx.strm.next_out = static_cast<Bytef *>(tChunk.begin); |
|
255 |
+ ctx.strm.avail_in = length(sChunk) * sizeof(TSourceValue); |
|
256 |
+ ctx.strm.avail_out = length(tChunk); |
|
257 |
+ |
|
258 |
+ SEQAN_ASSERT_GT(ctx.strm.avail_out, 0u); |
|
259 |
+ |
|
260 |
+ int status = deflate(&ctx.strm, Z_NO_FLUSH); |
|
261 |
+ if (status != Z_OK) |
|
262 |
+ throw IOError("BgzfFile deflateInit2() failed."); |
|
263 |
+ |
|
264 |
+ source += length(sChunk) - ctx.strm.avail_in; |
|
265 |
+ size_t size = length(tChunk) - ctx.strm.avail_out; |
|
266 |
+ advanceChunk(target, size); |
|
267 |
+ return size; |
|
268 |
+ } |
|
269 |
+ |
|
270 |
+ |
|
271 |
+// status = deflate(&zs, Z_FINISH); |
|
272 |
+// bool rawDataTooBig = (status != Z_STREAM_END); |
|
273 |
+// |
|
274 |
+// status = deflateEnd(&zs); |
|
275 |
+// if (status != Z_OK) |
|
276 |
+// throw IOError("BgzfFile deflateEnd() failed."); |
|
277 |
+// |
|
278 |
+// if (!rawDataTooBig) |
|
279 |
+// { |
|
280 |
+// resize(page.raw, zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH); |
|
281 |
+// break; |
|
282 |
+// } |
|
283 |
+} |
|
284 |
+ |
|
285 |
+// ---------------------------------------------------------------------------- |
|
286 |
+// Helper Function _bgzfUnpackXX() |
|
287 |
+// ---------------------------------------------------------------------------- |
|
288 |
+ |
|
289 |
+inline uint16_t |
|
290 |
+_bgzfUnpack16(char const * buffer) |
|
291 |
+{ |
|
292 |
+ uint16_t tmp = *reinterpret_cast<uint16_t const *>(buffer); |
|
293 |
+ enforceLittleEndian(tmp); |
|
294 |
+ return tmp; |
|
295 |
+} |
|
296 |
+ |
|
297 |
+inline uint32_t |
|
298 |
+_bgzfUnpack32(char const * buffer) |
|
299 |
+{ |
|
300 |
+ uint32_t tmp = *reinterpret_cast<uint32_t const *>(buffer); |
|
301 |
+ enforceLittleEndian(tmp); |
|
302 |
+ return tmp; |
|
303 |
+} |
|
304 |
+ |
|
305 |
+// ---------------------------------------------------------------------------- |
|
306 |
+// Helper Function _bgzfPackXX() |
|
307 |
+// ---------------------------------------------------------------------------- |
|
308 |
+ |
|
309 |
+inline void |
|
310 |
+_bgzfPack16(char * buffer, uint16_t value) |
|
311 |
+{ |
|
312 |
+ enforceLittleEndian(value); |
|
313 |
+ *reinterpret_cast<uint16_t *>(buffer) = value; |
|
314 |
+} |
|
315 |
+ |
|
316 |
+inline void |
|
317 |
+_bgzfPack32(char * buffer, uint32_t value) |
|
318 |
+{ |
|
319 |
+ enforceLittleEndian(value); |
|
320 |
+ *reinterpret_cast<uint32_t *>(buffer) = value; |
|
321 |
+} |
|
322 |
+ |
|
323 |
+ |
|
324 |
+template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength> |
|
325 |
+inline TDestCapacity |
|
326 |
+_compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity, |
|
327 |
+ TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<BgzfFile> & ctx) |
|
328 |
+{ |
|
329 |
+ const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_HEADER_LENGTH; |
|
330 |
+ const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_FOOTER_LENGTH; |
|
331 |
+ |
|
332 |
+ SEQAN_ASSERT_GT(dstCapacity, BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH); |
|
333 |
+ SEQAN_ASSERT_EQ(sizeof(TDestValue), 1u); |
|
334 |
+ SEQAN_ASSERT_EQ(sizeof(unsigned), 4u); |
|
335 |
+ |
|
336 |
+ // 1. COPY HEADER |
|
337 |
+ |
|
338 |
+ std::copy(&MagicHeader<BgzfFile>::VALUE[0], &MagicHeader<BgzfFile>::VALUE[BLOCK_HEADER_LENGTH], dstBegin); |
|
339 |
+ |
|
340 |
+ |
|
341 |
+ // 2. COMPRESS |
|
342 |
+ |
|
343 |
+ compressInit(ctx); |
|
344 |
+ ctx.strm.next_in = (Bytef *)(srcBegin); |
|
345 |
+ ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH); |
|
346 |
+ ctx.strm.avail_in = srcLength * sizeof(TSourceValue); |
|
347 |
+ ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; |
|
348 |
+ |
|
349 |
+ int status = deflate(&ctx.strm, Z_FINISH); |
|
350 |
+ if (status != Z_STREAM_END) |
|
351 |
+ { |
|
352 |
+ deflateEnd(&ctx.strm); |
|
353 |
+ throw IOError("Deflation failed. Compressed BGZF data is too big."); |
|
354 |
+ } |
|
355 |
+ |
|
356 |
+ status = deflateEnd(&ctx.strm); |
|
357 |
+ if (status != Z_OK) |
|
358 |
+ throw IOError("BGZF deflateEnd() failed."); |
|
359 |
+ |
|
360 |
+ |
|
361 |
+ // 3. APPEND FOOTER |
|
362 |
+ |
|
363 |
+ // Set compressed length into buffer, compute CRC and write CRC into buffer. |
|
364 |
+ |
|
365 |
+ size_t len = dstCapacity - ctx.strm.avail_out; |
|
366 |
+ _bgzfPack16(dstBegin + 16, len - 1); |
|
367 |
+ |
|
368 |
+ dstBegin += len - BLOCK_FOOTER_LENGTH; |
|
369 |
+ _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue))); |
|
370 |
+ _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue)); |
|
371 |
+ |
|
372 |
+ return dstCapacity - ctx.strm.avail_out; |
|
373 |
+} |
|
374 |
+ |
|
375 |
+inline void |
|
376 |
+decompressInit(CompressionContext<GZFile> & ctx) |
|
377 |
+{ |
|
378 |
+ const int GZIP_WINDOW_BITS = -15; // no zlib header |
|
379 |
+ |
|
380 |
+ ctx.strm.zalloc = NULL; |
|
381 |
+ ctx.strm.zfree = NULL; |
|
382 |
+ int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS); |
|
383 |
+ if (status != Z_OK) |
|
384 |
+ throw IOError("GZip inflateInit2() failed."); |
|
385 |
+} |
|
386 |
+ |
|
387 |
+inline void |
|
388 |
+decompressInit(CompressionContext<BgzfFile> & ctx) |
|
389 |
+{ |
|
390 |
+ decompressInit(static_cast<CompressionContext<GZFile> &>(ctx)); |
|
391 |
+ ctx.headerPos = 0; |
|
392 |
+} |
|
393 |
+ |
|
394 |
+inline bool |
|
395 |
+_bgzfCheckHeader(char const * header) |
|
396 |
+{ |
|
397 |
+ const char FLG_FEXTRA = 4; |
|
398 |
+ const char BGZF_ID1 = 'B'; |
|
399 |
+ const char BGZF_ID2 = 'C'; |
|
400 |
+ const char BGZF_LEN = 2; |
|
401 |
+ const char BGZF_XLEN = 6; // BGZF_LEN+4 |
|
402 |
+ |
|
403 |
+ return (header[0] == (char)MagicHeader<GZFile>::VALUE[0] && |
|
404 |
+ header[1] == (char)MagicHeader<GZFile>::VALUE[1] && |
|
405 |
+ header[2] == (char)MagicHeader<GZFile>::VALUE[2] && |
|
406 |
+ (header[3] & FLG_FEXTRA) != 0 && |
|
407 |
+ _bgzfUnpack16(header + 10) == BGZF_XLEN && |
|
408 |
+ header[12] == BGZF_ID1 && |
|
409 |
+ header[13] == BGZF_ID2 && |
|
410 |
+ _bgzfUnpack16(header + 14) == BGZF_LEN); |
|
411 |
+} |
|
412 |
+ |
|
413 |
+// read first bytes of a file/stream and compare with file format's magic header |
|
414 |
+template <typename TStream> |
|
415 |
+inline bool |
|
416 |
+guessFormatFromStream(TStream &istream, BgzfFile) |
|
417 |
+{ |
|
418 |
+ char putbackBuf[18]; |
|
419 |
+ bool match = false; |
|
420 |
+ |
|
421 |
+ SEQAN_ASSERT(istream.good()); |
|
422 |
+ |
|
423 |
+ // try to read and check header |
|
424 |
+ size_t numRead = istream.readsome(&putbackBuf[0], sizeof(putbackBuf)); |
|
425 |
+ if (numRead == sizeof(putbackBuf) && _bgzfCheckHeader(putbackBuf)) |
|
426 |
+ match = true; |
|
427 |
+ |
|
428 |
+ // unget all read characters |
|
429 |
+ for (; numRead > 0; --numRead) |
|
430 |
+ istream.unget(); |
|
431 |
+ |
|
432 |
+ SEQAN_ASSERT(istream.good()); |
|
433 |
+ |
|
434 |
+ return match; |
|
435 |
+} |
|
436 |
+ |
|
437 |
+// ---------------------------------------------------------------------------- |
|
438 |
+// Function _preprocessFilePage() |
|
439 |
+// ---------------------------------------------------------------------------- |
|
440 |
+ |
|
441 |
+template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength> |
|
442 |
+inline TDestCapacity |
|
443 |
+_decompressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity, |
|
444 |
+ TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<BgzfFile> & ctx) |
|
445 |
+{ |
|
446 |
+ const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_HEADER_LENGTH; |
|
447 |
+ const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<BgzfFile>::BLOCK_FOOTER_LENGTH; |
|
448 |
+ |
|
449 |
+ SEQAN_ASSERT_EQ(sizeof(TSourceValue), 1u); |
|
450 |
+ SEQAN_ASSERT_EQ(sizeof(unsigned), 4u); |
|
451 |
+ |
|
452 |
+ // 1. CHECK HEADER |
|
453 |
+ |
|
454 |
+ if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) |
|
455 |
+ throw IOError("BGZF block too short."); |
|
456 |
+ |
|
457 |
+ if (!_bgzfCheckHeader(srcBegin)) |
|
458 |
+ throw IOError("Invalid BGZF block header."); |
|
459 |
+ |
|
460 |
+ size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u; |
|
461 |
+ if (compressedLen != srcLength) |
|
462 |
+ throw IOError("BGZF compressed size mismatch."); |
|
463 |
+ |
|
464 |
+ |
|
465 |
+ // 2. DECOMPRESS |
|
466 |
+ |
|
467 |
+ decompressInit(ctx); |
|
468 |
+ ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH); |
|
469 |
+ ctx.strm.next_out = (Bytef *)(dstBegin); |
|
470 |
+ ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; |
|
471 |
+ ctx.strm.avail_out = dstCapacity * sizeof(TDestValue); |
|
472 |
+ |
|
473 |
+ int status = inflate(&ctx.strm, Z_FINISH); |
|
474 |
+ if (status != Z_STREAM_END) |
|
475 |
+ { |
|
476 |
+ inflateEnd(&ctx.strm); |
|
477 |
+ throw IOError("Inflation failed. Decompressed BGZF data is too big."); |
|
478 |
+ } |
|
479 |
+ |
|
480 |
+ status = inflateEnd(&ctx.strm); |
|
481 |
+ if (status != Z_OK) |
|
482 |
+ throw IOError("BGZF inflateEnd() failed."); |
|
483 |
+ |
|
484 |
+ |
|
485 |
+ // 3. CHECK FOOTER |
|
486 |
+ |
|
487 |
+ // Check compressed length in buffer, compute CRC and compare with CRC in buffer. |
|
488 |
+ |
|
489 |
+ unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out); |
|
490 |
+ |
|
491 |
+ srcBegin += compressedLen - BLOCK_FOOTER_LENGTH; |
|
492 |
+ if (_bgzfUnpack32(srcBegin) != crc) |
|
493 |
+ throw IOError("BGZF wrong checksum."); |
|
494 |
+ |
|
495 |
+ if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out) |
|
496 |
+ throw IOError("BGZF size mismatch."); |
|
497 |
+ |
|
498 |
+ return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue); |
|
499 |
+} |
|
500 |
+ |
|
501 |
+#endif // #if SEQAN_HAS_ZLIB |
|
502 |
+ |
|
503 |
+} // namespace seqan |
|
504 |
+ |
|
505 |
+#endif // SEQAN_STREAM_STREAM_COMPRESSOR_H_ |