Browse code

Add updated blosc

Mike Smith authored on 24/09/2019 16:54:09
Showing 83 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,221 @@
1
# A simple way to detect that we are using CMake.
# NOTE(review): add_definitions() is directory-scoped; kept deliberately so the
# definition reaches every target created below (shared, testing, static).
add_definitions(-DUSING_CMAKE)

# Root of the bundled ("internal") copies of the codec libraries.
set(INTERNAL_LIBS ${PROJECT_SOURCE_DIR}/internal-complibs)

# Hide symbols by default unless they're specifically exported.
# This makes it easier to keep the set of exported symbols the
# same across all compilers/platforms.
set(CMAKE_C_VISIBILITY_PRESET hidden)

# Include directories: the blosc sources themselves plus, for each codec,
# either the system-provided headers (when find_package located them) or
# the bundled fallback copy under internal-complibs.
list(APPEND BLOSC_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR})
if(NOT DEACTIVATE_LZ4)
    if(LZ4_FOUND)
        list(APPEND BLOSC_INCLUDE_DIRS ${LZ4_INCLUDE_DIR})
    else()
        set(LZ4_LOCAL_DIR ${INTERNAL_LIBS}/lz4-1.8.3)
        list(APPEND BLOSC_INCLUDE_DIRS ${LZ4_LOCAL_DIR})
    endif()
endif()

if(NOT DEACTIVATE_SNAPPY)
    if(SNAPPY_FOUND)
        list(APPEND BLOSC_INCLUDE_DIRS ${SNAPPY_INCLUDE_DIR})
    else()
        set(SNAPPY_LOCAL_DIR ${INTERNAL_LIBS}/snappy-1.1.1)
        list(APPEND BLOSC_INCLUDE_DIRS ${SNAPPY_LOCAL_DIR})
    endif()
endif()

if(NOT DEACTIVATE_ZLIB)
    if(ZLIB_FOUND)
        list(APPEND BLOSC_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR})
    else()
        set(ZLIB_LOCAL_DIR ${INTERNAL_LIBS}/zlib-1.2.8)
        list(APPEND BLOSC_INCLUDE_DIRS ${ZLIB_LOCAL_DIR})
    endif()
endif()

if(NOT DEACTIVATE_ZSTD)
    if(ZSTD_FOUND)
        list(APPEND BLOSC_INCLUDE_DIRS ${ZSTD_INCLUDE_DIR})
    else()
        set(ZSTD_LOCAL_DIR ${INTERNAL_LIBS}/zstd-1.3.8)
        # zstd's internal sources also need its common/ subdirectory.
        list(APPEND BLOSC_INCLUDE_DIRS ${ZSTD_LOCAL_DIR} ${ZSTD_LOCAL_DIR}/common)
    endif()
endif()

include_directories(${BLOSC_INCLUDE_DIRS})

# Library sources.
set(SOURCES blosc.c blosclz.c fastcopy.c shuffle-generic.c bitshuffle-generic.c
        blosc-common.h blosc-export.h)
if(COMPILER_SUPPORT_SSE2)
    message(STATUS "Adding run-time support for SSE2")
    list(APPEND SOURCES shuffle-sse2.c bitshuffle-sse2.c)
endif()
if(COMPILER_SUPPORT_AVX2)
    message(STATUS "Adding run-time support for AVX2")
    list(APPEND SOURCES shuffle-avx2.c bitshuffle-avx2.c)
endif()
# shuffle.c is the run-time dispatcher that selects between the generic
# and accelerated implementations.
list(APPEND SOURCES shuffle.c)

# Library install directory and version string.
set(lib_dir lib${LIB_SUFFIX})
set(version_string ${BLOSC_VERSION_MAJOR}.${BLOSC_VERSION_MINOR}.${BLOSC_VERSION_PATCH})

set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
if(WIN32)
    # Try to use the system threads library first.
    find_package(Threads)
    if(NOT Threads_FOUND)
        message(STATUS "using the internal pthread library for win32 systems.")
        list(APPEND SOURCES win32/pthread.c)
    else()
        list(APPEND LIBS ${CMAKE_THREAD_LIBS_INIT})
    endif()
else()
    find_package(Threads REQUIRED)
    list(APPEND LIBS ${CMAKE_THREAD_LIBS_INIT})
endif()

# For each codec: link the system library when found, otherwise compile the
# bundled copy directly into blosc.
# NOTE(review): file(GLOB) misses newly added files until re-configure; kept
# for behavior compatibility with the bundled, fixed-content sources.
if(NOT DEACTIVATE_LZ4)
    if(LZ4_FOUND)
        list(APPEND LIBS ${LZ4_LIBRARY})
    else()
        file(GLOB LZ4_FILES ${LZ4_LOCAL_DIR}/*.c)
        list(APPEND SOURCES ${LZ4_FILES})
    endif()
endif()

if(NOT DEACTIVATE_SNAPPY)
    if(SNAPPY_FOUND)
        list(APPEND LIBS ${SNAPPY_LIBRARY})
    else()
        file(GLOB SNAPPY_FILES ${SNAPPY_LOCAL_DIR}/*.cc)
        list(APPEND SOURCES ${SNAPPY_FILES})
    endif()
endif()

if(NOT DEACTIVATE_ZLIB)
    if(ZLIB_FOUND)
        list(APPEND LIBS ${ZLIB_LIBRARY})
    else()
        file(GLOB ZLIB_FILES ${ZLIB_LOCAL_DIR}/*.c)
        list(APPEND SOURCES ${ZLIB_FILES})
    endif()
endif()

if(NOT DEACTIVATE_ZSTD)
    if(ZSTD_FOUND)
        list(APPEND LIBS ${ZSTD_LIBRARY})
    else()
        file(GLOB ZSTD_FILES
            ${ZSTD_LOCAL_DIR}/common/*.c
            ${ZSTD_LOCAL_DIR}/compress/*.c
            ${ZSTD_LOCAL_DIR}/decompress/*.c)
        list(APPEND SOURCES ${ZSTD_FILES})
    endif()
endif()


# Targets.
if(BUILD_SHARED)
    add_library(blosc_shared SHARED ${SOURCES})
    set_target_properties(blosc_shared PROPERTIES OUTPUT_NAME blosc)
    set_target_properties(blosc_shared PROPERTIES
            VERSION ${version_string}
            SOVERSION 1  # Change this when an ABI change happens
        )
    set_property(
        TARGET blosc_shared
        APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_SHARED_LIBRARY)
endif()

# Based on the target architecture and hardware features supported
# by the C compiler, set hardware architecture optimization flags
# for specific shuffle implementations.
if(COMPILER_SUPPORT_SSE2)
    if(MSVC)
        # MSVC targets SSE2 by default on 64-bit configurations, but not 32-bit configurations.
        if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
            set_source_files_properties(shuffle-sse2.c bitshuffle-sse2.c PROPERTIES COMPILE_FLAGS "/arch:SSE2")
        endif()
    else()
        set_source_files_properties(shuffle-sse2.c bitshuffle-sse2.c PROPERTIES COMPILE_FLAGS -msse2)
    endif()

    # Define a symbol for the shuffle-dispatch implementation
    # so it knows SSE2 is supported even though that file is
    # compiled without SSE2 support (for portability).
    set_property(
        SOURCE shuffle.c
        APPEND PROPERTY COMPILE_DEFINITIONS SHUFFLE_SSE2_ENABLED)
endif()
if(COMPILER_SUPPORT_AVX2)
    if(MSVC)
        set_source_files_properties(shuffle-avx2.c bitshuffle-avx2.c PROPERTIES COMPILE_FLAGS "/arch:AVX2")
    else()
        set_source_files_properties(shuffle-avx2.c bitshuffle-avx2.c PROPERTIES COMPILE_FLAGS -mavx2)
    endif()

    # Define a symbol for the shuffle-dispatch implementation
    # so it knows AVX2 is supported even though that file is
    # compiled without AVX2 support (for portability).
    set_property(
        SOURCE shuffle.c
        APPEND PROPERTY COMPILE_DEFINITIONS SHUFFLE_AVX2_ENABLED)
endif()

# When the option has been selected to compile the test suite,
# compile an additional version of blosc_shared which exports
# some normally-hidden symbols (to facilitate unit testing).
if(BUILD_TESTS)
    add_library(blosc_shared_testing SHARED ${SOURCES})
    set_target_properties(blosc_shared_testing PROPERTIES OUTPUT_NAME blosc_testing)
    set_property(
        TARGET blosc_shared_testing
        APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_SHARED_LIBRARY)
    set_property(
        TARGET blosc_shared_testing
        APPEND PROPERTY COMPILE_DEFINITIONS BLOSC_TESTING)
    # TEMP : CMake doesn't automatically add -lpthread here like it does
    # for the blosc_shared target. Force it for now.
    if(UNIX)
        set_property(
            TARGET blosc_shared_testing
            APPEND PROPERTY LINK_FLAGS "-lpthread")
    endif()
endif()

if(BUILD_SHARED)
    target_link_libraries(blosc_shared ${LIBS})
    target_include_directories(blosc_shared PUBLIC ${BLOSC_INCLUDE_DIRS})
endif()

if(BUILD_TESTS)
    target_link_libraries(blosc_shared_testing ${LIBS})
    target_include_directories(blosc_shared_testing PUBLIC ${BLOSC_INCLUDE_DIRS})
endif()

if(BUILD_STATIC)
    add_library(blosc_static STATIC ${SOURCES})
    set_target_properties(blosc_static PROPERTIES OUTPUT_NAME blosc)
    if(MSVC)
        # MSVC drops the "lib" prefix by default; restore it so the static
        # archive is named libblosc.lib, distinct from the import library.
        set_target_properties(blosc_static PROPERTIES PREFIX lib)
    endif()
    target_link_libraries(blosc_static ${LIBS})
    target_include_directories(blosc_static PUBLIC ${BLOSC_INCLUDE_DIRS})
endif()

# Install rules.
if(BLOSC_INSTALL)
    install(FILES blosc.h blosc-export.h DESTINATION include COMPONENT DEV)
    if(BUILD_SHARED)
        install(TARGETS blosc_shared DESTINATION ${lib_dir} COMPONENT LIB)
    endif()
    if(BUILD_STATIC)
        install(TARGETS blosc_static DESTINATION ${lib_dir} COMPONENT DEV)
    endif()
endif()
0 222
new file mode 100644
... ...
@@ -0,0 +1,245 @@
1
+/*
2
+ * Bitshuffle - Filter for improving compression of typed binary data.
3
+ *
4
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
5
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
6
+ * Created: 2014
7
+ *
8
+ * Note: Adapted for c-blosc by Francesc Alted.
9
+ *
10
+ * See LICENSES/BITSHUFFLE.txt file for details about copyright and
11
+ * rights to use.
12
+ *
13
+ */
14
+
15
+#include "bitshuffle-generic.h"
16
+#include "bitshuffle-sse2.h"
17
+#include "bitshuffle-avx2.h"
18
+
19
+
20
+/* Make sure AVX2 is available for the compilation target and compiler. */
21
+#if !defined(__AVX2__)
22
+  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
23
+#endif
24
+
25
+#include <immintrin.h>
26
+
27
/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

/* Dump the 32 bytes of a ymm register as hex.  Debug-only helper;
 * compiled out by the surrounding #if 0. */
static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  /* Spill the register to memory so the bytes can be printed. */
  ((__m256i *)buf)[0] = ymm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
#endif
48
+
49
+
50
+/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
51
+
52
+
53
/* Transpose bits within bytes. */
/* For each bit position b (7 down to 0) writes a bit-row of nbyte / 8 bytes
 * starting at offset b * nbyte / 8 in `out`.  The main loop consumes 32
 * input bytes per iteration; the tail (nbyte % 32 bytes) is delegated to
 * the generic remainder routine. */
static int64_t bshuf_trans_bit_byte_avx2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    int32_t* out_i32;

    size_t nbyte = elem_size * size;

    int64_t count;

    __m256i ymm;
    int32_t bt;
    size_t ii, kk;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            /* movemask collects the most-significant bit of each of the 32
             * bytes; the left shift then exposes the next lower bit for the
             * following iteration (bit 7 is extracted first, bit 0 last). */
            bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            /* Scatter the 32 collected bits into bit-row (7 - kk). */
            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_i32 = bt;
        }
    }
    /* Scalar fallback for the trailing nbyte % 32 bytes. */
    count = blosc_internal_bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 32);
    return count;
}
82
+
83
/* Transpose bits within elements. */
/* Three-stage pipeline: bytes-within-elements (SSE2 kernel), then
 * bits-within-bytes (AVX2 kernel above), then a transpose of the eight
 * bit-rows.  `tmp_buf` is scratch space holding a full intermediate copy
 * (size * elem_size bytes).  Returns the byte count from the last stage,
 * or the first negative error code encountered. */
int64_t blosc_internal_bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                 const size_t elem_size, void* tmp_buf) {
    int64_t count;

    /* The algorithm requires the element count to be a multiple of eight. */
    CHECK_MULT_EIGHT(size);

    count = blosc_internal_bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
    CHECK_ERR(count);
    count = bshuf_trans_bit_byte_avx2(out, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count = blosc_internal_bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

    return count;
}
98
+
99
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
static int64_t bshuf_trans_byte_bitrow_avx2(void* in, void* out, const size_t size,
                                            const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;
    size_t ii, jj, kk, hh, mm;

    CHECK_MULT_EIGHT(size);

    /* The AVX2 kernel below processes four elements (32 rows) per pass;
     * fall back to SSE2 when elem_size is not a multiple of four. */
    if (elem_size % 4)
      return blosc_internal_bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size);

    __m256i ymm_0[8];
    __m256i ymm_1[8];
    __m256i ymm_storeage[8][4];

    /* Main tiled transpose: 32 columns (jj) by 32 rows (ii * 8) per tile. */
    for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
        for (ii = 0; ii + 3 < elem_size; ii += 4) {
            for (hh = 0; hh < 4; hh ++) {

                /* Load eight consecutive 32-byte rows of the bit-row matrix. */
                for (kk = 0; kk < 8; kk ++){
                    ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
                            (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
                }

                /* Interleave pairs of rows at byte granularity... */
                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                /* ...then at 16-bit granularity... */
                for (kk = 0; kk < 2; kk ++){
                    for (mm = 0; mm < 2; mm ++){
                        ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                        ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                    }
                }

                /* ...and at 32-bit granularity. */
                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                /* Stash this group of partially transposed registers. */
                for (kk = 0; kk < 8; kk ++){
                    ymm_storeage[kk][hh] = ymm_1[kk];
                }
            }

            for (mm = 0; mm < 8; mm ++) {

                for (kk = 0; kk < 4; kk ++){
                    ymm_0[kk] = ymm_storeage[mm][kk];
                }

                /* Finish the transpose with 64-bit interleaves followed by
                 * 128-bit lane combines (0x20 = low halves, 0x31 = high). */
                ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
                ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

                ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
                ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
                ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
                ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
            }
        }
    }
    /* Scalar cleanup for the trailing nbyte_row % 32 columns. */
    for (ii = 0; ii < nrows; ii ++ ) {
        for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
        }
    }
    return size * elem_size;
}
193
+
194
+
195
/* Shuffle bits within the bytes of eight element blocks. */
static int64_t bshuf_shuffle_bit_eightelem_avx2(void* in, void* out, const size_t size,
                                                const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    /*  With a bit of care, this could be written such that it is */
    /*  in_buf == out_buf safe (currently it is not). */
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t nbyte = elem_size * size;
    size_t ii, jj, kk, ind;

    __m256i ymm;
    int32_t bt;

    /* This kernel reads 32-byte vectors; delegate to the SSE2 version when
     * elem_size is not a multiple of four. */
    if (elem_size % 4) {
        return blosc_internal_bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size);
    } else {
        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
                    ii += 8 * elem_size) {
                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    /* Same movemask/shift trick as bshuf_trans_bit_byte_avx2:
                     * extract bit 7 of each byte, then shift the next bit up. */
                    bt = _mm256_movemask_epi8(ymm);
                    ymm = _mm256_slli_epi16(ymm, 1);
                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    * (int32_t *) &out_b[ind] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
230
+
231
+
232
/* Untranspose bits within elements. */
/* Inverse of blosc_internal_bshuf_trans_bit_elem_avx2: first transpose the
 * bytes of the bit-row matrix back, then regroup bits into eight-element
 * blocks.  `tmp_buf` is scratch space for the intermediate result. */
int64_t blosc_internal_bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                   const size_t elem_size, void* tmp_buf) {

    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = bshuf_trans_byte_bitrow_avx2(in, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count =  bshuf_shuffle_bit_eightelem_avx2(tmp_buf, out, size, elem_size);

    return count;
}
0 246
new file mode 100644
... ...
@@ -0,0 +1,38 @@
1
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

/* AVX2-accelerated bitshuffle/bitunshuffle routines. */

#ifndef BITSHUFFLE_AVX2_H
#define BITSHUFFLE_AVX2_H

#include "blosc-common.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
  AVX2-accelerated bitshuffle routine.

  in        : input buffer
  out       : output buffer
  size      : number of elements
  elem_size : element size of typed data
  tmp_buf   : caller-provided scratch buffer
              (NOTE(review): presumably size * elem_size bytes, like the
              scalar variants -- confirm against bitshuffle-avx2.c)

  Returns the number of bytes processed, or a negative error code.
*/
BLOSC_NO_EXPORT int64_t
blosc_internal_bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                         const size_t elem_size, void* tmp_buf);

/**
  AVX2-accelerated bitunshuffle routine.  Inverse of the routine above;
  same parameter contract.
*/
BLOSC_NO_EXPORT int64_t
blosc_internal_bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                           const size_t elem_size, void* tmp_buf);

#ifdef __cplusplus
}
#endif

#endif /* BITSHUFFLE_AVX2_H */
0 39
new file mode 100644
... ...
@@ -0,0 +1,220 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+
6
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
7
+**********************************************************************/
8
+
9
+#include "bitshuffle-generic.h"
10
+
11
+
12
/* Transpose bytes within elements, starting partway through input. */
/* Scalar routine used both standalone and to finish blocks begun by the
 * vectorized kernels.
 *
 * in        : input buffer of size * elem_size bytes (not modified)
 * out       : output buffer of size * elem_size bytes
 * size      : number of elements
 * elem_size : bytes per element
 * start     : element index to resume from; must be a multiple of eight
 *
 * Returns size * elem_size on success, or -80 when `start` is not a
 * multiple of eight. */
int64_t blosc_internal_bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size,
         const size_t elem_size, const size_t start) {

    /* Const-correct view of the input: the original cast silently
     * discarded the `const` qualifier from `in`. */
    const char* in_b = (const char*) in;
    char* out_b = (char*) out;
    size_t ii, jj, kk;

    CHECK_MULT_EIGHT(start);

    if (size > start) {
        /*  ii loop separated into 2 loops so the compiler can unroll */
        /*  the inner one. */
        for (ii = start; ii + 7 < size; ii += 8) {
            for (jj = 0; jj < elem_size; jj++) {
                for (kk = 0; kk < 8; kk++) {
                    out_b[jj * size + ii + kk]
                        = in_b[ii * elem_size + kk * elem_size + jj];
                }
            }
        }
        /* Tail: fewer than eight elements remain. */
        for (ii = size - size % 8; ii < size; ii ++) {
            for (jj = 0; jj < elem_size; jj++) {
                out_b[jj * size + ii] = in_b[ii * elem_size + jj];
            }
        }
    }
    return size * elem_size;
}
41
+
42
+
43
/* Transpose bytes within elements. */
/* Whole-buffer variant: simply runs the remainder routine from element 0. */
int64_t blosc_internal_bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size,
                                                  const size_t elem_size) {
    const size_t start_elem = 0;
    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size,
                                                          elem_size, start_elem);
}
49
+
50
+
51
/* Transpose bits within bytes. */
/* Processes the buffer in 8-byte chunks from `start_byte` on: each chunk is
 * an 8x8 bit matrix that is transposed in a register, after which byte k of
 * the result is scattered into bit-row k of the output.
 * NOTE(review): `in` is read through a uint64_t pointer, which assumes the
 * buffer is suitably aligned for 8-byte loads -- confirm at call sites. */
int64_t blosc_internal_bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size,
                                                      const size_t elem_size, const size_t start_byte) {

    const uint64_t* in_b = (const uint64_t*) in;
    uint8_t* out_b = (uint8_t*) out;

    uint64_t x, t;

    size_t ii, kk;
    size_t nbyte = elem_size * size;
    size_t nbyte_bitrow = nbyte / 8;

    /* Run-time endianness probe: the first byte of a 64-bit 1 is 1 only on
     * little-endian targets. */
    uint64_t e=1;
    const int little_endian = *(uint8_t *) &e == 1;
    /* On big-endian targets the bit-rows are walked in reverse: the negated
     * row stride stored in a size_t relies on well-defined unsigned
     * wrap-around and is compensated by bit_row_offset below. */
    const size_t bit_row_skip = little_endian ? nbyte_bitrow : -nbyte_bitrow;
    const int64_t bit_row_offset = little_endian ? 0 : 7 * nbyte_bitrow;

    CHECK_MULT_EIGHT(nbyte);
    CHECK_MULT_EIGHT(start_byte);

    for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) {
        x = in_b[ii];
        if (little_endian) {
            TRANS_BIT_8X8(x, t);
        } else {
            TRANS_BIT_8X8_BE(x, t);
        }
        /* Scatter one byte of the transposed 8x8 block into each bit-row;
         * the assignment keeps only the low byte of x. */
        for (kk = 0; kk < 8; kk ++) {
            out_b[bit_row_offset + kk * bit_row_skip + ii] = x;
            x = x >> 8;
        }
    }
    return size * elem_size;
}
86
+
87
+
88
/* Transpose bits within bytes. */
/* Whole-buffer wrapper: defer to the remainder routine starting at byte 0. */
static int64_t bshuf_trans_bit_byte_scal(const void* in, void* out, const size_t size,
                                         const size_t elem_size) {
    const size_t start_byte = 0;
    return blosc_internal_bshuf_trans_bit_byte_remainder(in, out, size,
                                                         elem_size, start_byte);
}
94
+
95
/* General transpose of an array, optimized for large element sizes. */
/* Treats `in` as an lda x ldb matrix of elem_size-byte cells and writes its
 * transpose (ldb x lda) to `out`, copying one cell at a time with memcpy.
 *
 * Returns the number of bytes processed: lda * ldb * elem_size. */
int64_t blosc_internal_bshuf_trans_elem(const void* in, void* out, const size_t lda,
        const size_t ldb, const size_t elem_size) {

    /* Const-correct view of the input: the original cast silently
     * discarded the `const` qualifier from `in`. */
    const char* in_b = (const char*) in;
    char* out_b = (char*) out;
    size_t ii, jj;
    for (ii = 0; ii < lda; ii++) {
        for (jj = 0; jj < ldb; jj++) {
            memcpy(&out_b[(jj*lda + ii) * elem_size],
                   &in_b[(ii*ldb + jj) * elem_size], elem_size);
        }
    }
    return lda * ldb * elem_size;
}
110
+
111
+
112
/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */
/* The bit-row matrix has 8 rows per element byte, each size / 8 bytes wide;
 * delegate the actual work to the general cell transpose. */
int64_t blosc_internal_bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size,
         const size_t elem_size) {
    CHECK_MULT_EIGHT(size);

    /* Each bit-row holds size / 8 bytes. */
    return blosc_internal_bshuf_trans_elem(in, out, 8, elem_size, size / 8);
}
122
+
123
+
124
/* Transpose bits within elements. */
/* Scalar three-stage pipeline: bytes-within-elements, then bits-within-bytes,
 * then a transpose of the eight bit-rows.  `tmp_buf` is scratch space between
 * the stages.  Returns size * elem_size, or the first negative error code
 * returned by a stage. */
int64_t blosc_internal_bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size,
                                                 const size_t elem_size, void* tmp_buf) {

    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = blosc_internal_bshuf_trans_byte_elem_scal(in, out, size, elem_size);
    CHECK_ERR(count);
    count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count = blosc_internal_bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

    return count;
}
140
+
141
+
142
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
/* Scalar inverse of the bit-row layout: gathers, for every output element
 * byte, the eight bytes that were scattered across consecutive bit-rows.
 *
 * Returns size * elem_size on success, or -80 when `size` is not a
 * multiple of eight. */
static int64_t bshuf_trans_byte_bitrow_scal(const void* in, void* out, const size_t size,
         const size_t elem_size) {
    /* Const-correct view of the input: the original cast silently
     * discarded the `const` qualifier from `in`. */
    const char* in_b = (const char*) in;
    char* out_b = (char*) out;

    size_t nbyte_row = size / 8;
    size_t ii, jj, kk;

    CHECK_MULT_EIGHT(size);

    for (jj = 0; jj < elem_size; jj++) {
        for (ii = 0; ii < nbyte_row; ii++) {
            for (kk = 0; kk < 8; kk++) {
                out_b[ii * 8 * elem_size + jj * 8 + kk] =
                        in_b[(jj * 8 + kk) * nbyte_row + ii];
            }
        }
    }
    return size * elem_size;
}
164
+
165
+
166
/* Shuffle bits within the bytes of eight element blocks. */
/* For every eight-element block, transposes each 8x8 bit matrix in a
 * register and scatters the resulting bytes one element apart.
 * NOTE(review): reads `in` through uint64_t loads at arbitrary byte
 * offsets -- assumes unaligned 8-byte access is acceptable; confirm for
 * strict-alignment targets. */
int64_t blosc_internal_bshuf_shuffle_bit_eightelem_scal(const void* in, void* out,
        const size_t size, const size_t elem_size) {

    const char *in_b;
    char *out_b;
    uint64_t x, t;
    size_t ii, jj, kk;
    size_t nbyte, out_index;

    /* Run-time endianness probe; on big-endian targets the destination
     * bytes are walked in reverse (elem_skip wraps around as an unsigned
     * value, compensated by elem_offset). */
    uint64_t e=1;
    const int little_endian = *(uint8_t *) &e == 1;
    const size_t elem_skip = little_endian ? elem_size : -elem_size;
    const uint64_t elem_offset = little_endian ? 0 : 7 * elem_size;

    CHECK_MULT_EIGHT(size);

    in_b = (const char*) in;
    out_b = (char*) out;

    nbyte = elem_size * size;

    for (jj = 0; jj < 8 * elem_size; jj += 8) {
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
            x = *((uint64_t*) &in_b[ii + jj]);
            if (little_endian) {
                TRANS_BIT_8X8(x, t);
            } else {
                TRANS_BIT_8X8_BE(x, t);
            }
            /* Scatter the transposed bytes, one per element position;
             * the cast keeps only the low byte of x. */
            for (kk = 0; kk < 8; kk++) {
                out_index = ii + jj / 8 + elem_offset + kk * elem_skip;
                *((uint8_t*) &out_b[out_index]) = x;
                x = x >> 8;
            }
        }
    }
    return size * elem_size;
}
205
+
206
+
207
/* Untranspose bits within elements. */
/* Scalar inverse of blosc_internal_bshuf_trans_bit_elem_scal: transpose the
 * bytes of the bit-row matrix back, then regroup bits into eight-element
 * blocks.  `tmp_buf` is scratch space for the intermediate result. */
int64_t blosc_internal_bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size,
                                                   const size_t elem_size, void* tmp_buf) {

    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count =  blosc_internal_bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size);

    return count;
}
0 221
new file mode 100644
... ...
@@ -0,0 +1,161 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+
6
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
7
+**********************************************************************/
8
+
9
+/* Generic (non-hardware-accelerated) shuffle/unshuffle routines.
10
+   These are used when hardware-accelerated functions aren't available
11
+   for a particular platform; they are also used by the hardware-
12
+   accelerated functions to handle any remaining elements in a block
13
+   which isn't a multiple of the hardware's vector size. */
14
+
15
+#ifndef BITSHUFFLE_GENERIC_H
16
+#define BITSHUFFLE_GENERIC_H
17
+
18
+#include "blosc-common.h"
19
+#include <stdlib.h>
20
+
21
+#ifdef __cplusplus
22
+extern "C" {
23
+#endif
24
+
25
+
26
+/*  Macros. */
27
+#define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
28
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
29
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
30
+#define CHECK_ERR(count) if (count < 0) { return count; }
31
+
32
+
33
+/* ---- Worker code not requiring special instruction sets. ----
34
+ *
35
+ * The following code does not use any x86 specific vectorized instructions
36
+ * and should compile on any machine
37
+ *
38
+ */
39
+
40
+/* Transpose 8x8 bit array packed into a single quadword *x*.
41
+ * *t* is workspace. */
42
+#define TRANS_BIT_8X8(x, t) {                                               \
43
+        t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL;                          \
44
+        x = x ^ t ^ (t << 7);                                               \
45
+        t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL;                         \
46
+        x = x ^ t ^ (t << 14);                                              \
47
+        t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL;                         \
48
+        x = x ^ t ^ (t << 28);                                              \
49
+    }
50
+
51
+/* Transpose 8x8 bit array along the diagonal from upper right
52
+   to lower left */
53
+#define TRANS_BIT_8X8_BE(x, t) {                                            \
54
+        t = (x ^ (x >> 9)) & 0x0055005500550055LL;                          \
55
+        x = x ^ t ^ (t << 9);                                               \
56
+        t = (x ^ (x >> 18)) & 0x0000333300003333LL;                         \
57
+        x = x ^ t ^ (t << 18);                                              \
58
+        t = (x ^ (x >> 36)) & 0x000000000F0F0F0FLL;                         \
59
+        x = x ^ t ^ (t << 36);                                              \
60
+    }
61
+
62
+/* Transpose of an array of arbitrarily typed elements. */
63
+#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) {                        \
64
+        type_t* in_type = (type_t*) in;                                     \
65
+        type_t* out_type = (type_t*) out;                                   \
66
+        size_t ii, jj, kk;                                                  \
67
+        for (ii = 0; ii + 7 < lda; ii += 8) {                               \
68
+            for (jj = 0; jj < ldb; jj++) {                                  \
69
+                for (kk = 0; kk < 8; kk++) {                                \
70
+                    out_type[jj*lda + ii + kk] =                            \
71
+                        in_type[ii*ldb + kk * ldb + jj];                    \
72
+                }                                                           \
73
+            }                                                               \
74
+        }                                                                   \
75
+        for (ii = lda - lda % 8; ii < lda; ii ++) {                         \
76
+            for (jj = 0; jj < ldb; jj++) {                                  \
77
+                out_type[jj*lda + ii] = in_type[ii*ldb + jj];               \
78
+            }                                                               \
79
+        }                                                                   \
80
+    }
81
+
82
+
83
+/* Private functions */
84
+BLOSC_NO_EXPORT int64_t
85
+blosc_internal_bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size,
86
+                                               const size_t elem_size, const size_t start);
87
+
88
+BLOSC_NO_EXPORT int64_t
89
+blosc_internal_bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size,
90
+                                          const size_t elem_size);
91
+
92
+BLOSC_NO_EXPORT int64_t
93
+blosc_internal_bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size,
94
+                                              const size_t elem_size, const size_t start_byte);
95
+
96
+BLOSC_NO_EXPORT int64_t
97
+blosc_internal_bshuf_trans_elem(const void* in, void* out, const size_t lda,
98
+                                const size_t ldb, const size_t elem_size);
99
+
100
+BLOSC_NO_EXPORT int64_t
101
+blosc_internal_bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size,
102
+                                        const size_t elem_size);
103
+
104
+BLOSC_NO_EXPORT int64_t
105
+blosc_internal_bshuf_shuffle_bit_eightelem_scal(const void* in, void* out,
106
+                                                const size_t size, const size_t elem_size);
107
+
108
+
109
+/* Bitshuffle the data.
110
+ *
111
+ * Transpose the bits within elements.
112
+ *
113
+ * Parameters
114
+ * ----------
115
+ *  in : input buffer, must be of size * elem_size bytes
116
+ *  out : output buffer, must be of size * elem_size bytes
117
+ *  size : number of elements in input
118
+ *  elem_size : element size of typed data
119
+ *  tmp_buffer : temporary buffer with the same `size` than `in` and `out`
120
+ *
121
+ * Returns
122
+ * -------
123
+ *  nothing -- this cannot fail
124
+ *
125
+ */
126
+
127
+BLOSC_NO_EXPORT int64_t
128
+blosc_internal_bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size,
129
+                                         const size_t elem_size, void* tmp_buf);
130
+
131
+/* Unshuffle bitshuffled data.
132
+ *
133
+ * Untranspose the bits within elements.
134
+ *
135
+ * To properly unshuffle bitshuffled data, *size* and *elem_size* must
136
+ * match the parameters used to shuffle the data.
137
+ *
138
+ * Parameters
139
+ * ----------
140
+ *  in : input buffer, must be of size * elem_size bytes
141
+ *  out : output buffer, must be of size * elem_size bytes
142
+ *  size : number of elements in input
143
+ *  elem_size : element size of typed data
144
+ *  tmp_buffer : temporary buffer with the same `size` than `in` and `out`
145
+ *
146
+ * Returns
147
+ * -------
148
+ *  nothing -- this cannot fail
149
+ *
150
+ */
151
+
152
+BLOSC_NO_EXPORT int64_t
153
+blosc_internal_bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size,
154
+                                           const size_t elem_size, void* tmp_buf);
155
+
156
+
157
+#ifdef __cplusplus
158
+}
159
+#endif
160
+
161
+#endif /* BITSHUFFLE_GENERIC_H */
0 162
new file mode 100644
... ...
@@ -0,0 +1,467 @@
1
+/*
2
+ * Bitshuffle - Filter for improving compression of typed binary data.
3
+ *
4
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
5
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
6
+ * Created: 2014
7
+ *
8
+ * Note: Adapted for c-blosc by Francesc Alted.
9
+ *
10
+ * See LICENSES/BITSHUFFLE.txt file for details about copyright and
11
+ * rights to use.
12
+ *
13
+ */
14
+
15
+#include "bitshuffle-generic.h"
16
+#include "bitshuffle-sse2.h"
17
+
18
+/* Make sure SSE2 is available for the compilation target and compiler. */
19
+#if !defined(__SSE2__)
20
+  #error SSE2 is not supported by the target architecture/platform and/or this compiler.
21
+#endif
22
+
23
+#include <emmintrin.h>
24
+
25
+/* The next is useful for debugging purposes */
26
+#if 0
27
+#include <stdio.h>
28
+#include <string.h>
29
+
30
+
31
+static void printxmm(__m128i xmm0)
32
+{
33
+  uint8_t buf[32];
34
+
35
+  ((__m128i *)buf)[0] = xmm0;
36
+  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
37
+          buf[0], buf[1], buf[2], buf[3],
38
+          buf[4], buf[5], buf[6], buf[7],
39
+          buf[8], buf[9], buf[10], buf[11],
40
+          buf[12], buf[13], buf[14], buf[15]);
41
+}
42
+#endif
43
+
44
+
45
+/* ---- Worker code that requires SSE2. Intel Petium 4 (2000) and later. ---- */
46
+
47
+/* Transpose bytes within elements for 16 bit elements. */
48
+static int64_t bshuf_trans_byte_elem_SSE_16(void* in, void* out, const size_t size) {
49
+
50
+    char* in_b = (char*) in;
51
+    char* out_b = (char*) out;
52
+    __m128i a0, b0, a1, b1;
53
+    size_t ii;
54
+
55
+    for (ii=0; ii + 15 < size; ii += 16) {
56
+        a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
57
+        b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);
58
+
59
+        a1 = _mm_unpacklo_epi8(a0, b0);
60
+        b1 = _mm_unpackhi_epi8(a0, b0);
61
+
62
+        a0 = _mm_unpacklo_epi8(a1, b1);
63
+        b0 = _mm_unpackhi_epi8(a1, b1);
64
+
65
+        a1 = _mm_unpacklo_epi8(a0, b0);
66
+        b1 = _mm_unpackhi_epi8(a0, b0);
67
+
68
+        a0 = _mm_unpacklo_epi8(a1, b1);
69
+        b0 = _mm_unpackhi_epi8(a1, b1);
70
+
71
+        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
72
+        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
73
+    }
74
+    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size, 2,
75
+            size - size % 16);
76
+}
77
+
78
+
79
+/* Transpose bytes within elements for 32 bit elements. */
80
+static int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) {
81
+
82
+    char* in_b = (char*) in;
83
+    char* out_b = (char*) out;
84
+    __m128i a0, b0, c0, d0, a1, b1, c1, d1;
85
+    size_t ii;
86
+
87
+    for (ii=0; ii + 15 < size; ii += 16) {
88
+        a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
89
+        b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
90
+        c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
91
+        d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);
92
+
93
+        a1 = _mm_unpacklo_epi8(a0, b0);
94
+        b1 = _mm_unpackhi_epi8(a0, b0);
95
+        c1 = _mm_unpacklo_epi8(c0, d0);
96
+        d1 = _mm_unpackhi_epi8(c0, d0);
97
+
98
+        a0 = _mm_unpacklo_epi8(a1, b1);
99
+        b0 = _mm_unpackhi_epi8(a1, b1);
100
+        c0 = _mm_unpacklo_epi8(c1, d1);
101
+        d0 = _mm_unpackhi_epi8(c1, d1);
102
+
103
+        a1 = _mm_unpacklo_epi8(a0, b0);
104
+        b1 = _mm_unpackhi_epi8(a0, b0);
105
+        c1 = _mm_unpacklo_epi8(c0, d0);
106
+        d1 = _mm_unpackhi_epi8(c0, d0);
107
+
108
+        a0 = _mm_unpacklo_epi64(a1, c1);
109
+        b0 = _mm_unpackhi_epi64(a1, c1);
110
+        c0 = _mm_unpacklo_epi64(b1, d1);
111
+        d0 = _mm_unpackhi_epi64(b1, d1);
112
+
113
+        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
114
+        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
115
+        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
116
+        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
117
+    }
118
+    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size, 4,
119
+            size - size % 16);
120
+}
121
+
122
+
123
+/* Transpose bytes within elements for 64 bit elements. */
124
+static int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) {
125
+
126
+    char* in_b = (char*) in;
127
+    char* out_b = (char*) out;
128
+    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
129
+    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
130
+    size_t ii;
131
+
132
+    for (ii=0; ii + 15 < size; ii += 16) {
133
+        a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
134
+        b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
135
+        c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
136
+        d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
137
+        e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
138
+        f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
139
+        g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
140
+        h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);
141
+
142
+        a1 = _mm_unpacklo_epi8(a0, b0);
143
+        b1 = _mm_unpackhi_epi8(a0, b0);
144
+        c1 = _mm_unpacklo_epi8(c0, d0);
145
+        d1 = _mm_unpackhi_epi8(c0, d0);
146
+        e1 = _mm_unpacklo_epi8(e0, f0);
147
+        f1 = _mm_unpackhi_epi8(e0, f0);
148
+        g1 = _mm_unpacklo_epi8(g0, h0);
149
+        h1 = _mm_unpackhi_epi8(g0, h0);
150
+
151
+        a0 = _mm_unpacklo_epi8(a1, b1);
152
+        b0 = _mm_unpackhi_epi8(a1, b1);
153
+        c0 = _mm_unpacklo_epi8(c1, d1);
154
+        d0 = _mm_unpackhi_epi8(c1, d1);
155
+        e0 = _mm_unpacklo_epi8(e1, f1);
156
+        f0 = _mm_unpackhi_epi8(e1, f1);
157
+        g0 = _mm_unpacklo_epi8(g1, h1);
158
+        h0 = _mm_unpackhi_epi8(g1, h1);
159
+
160
+        a1 = _mm_unpacklo_epi32(a0, c0);
161
+        b1 = _mm_unpackhi_epi32(a0, c0);
162
+        c1 = _mm_unpacklo_epi32(b0, d0);
163
+        d1 = _mm_unpackhi_epi32(b0, d0);
164
+        e1 = _mm_unpacklo_epi32(e0, g0);
165
+        f1 = _mm_unpackhi_epi32(e0, g0);
166
+        g1 = _mm_unpacklo_epi32(f0, h0);
167
+        h1 = _mm_unpackhi_epi32(f0, h0);
168
+
169
+        a0 = _mm_unpacklo_epi64(a1, e1);
170
+        b0 = _mm_unpackhi_epi64(a1, e1);
171
+        c0 = _mm_unpacklo_epi64(b1, f1);
172
+        d0 = _mm_unpackhi_epi64(b1, f1);
173
+        e0 = _mm_unpacklo_epi64(c1, g1);
174
+        f0 = _mm_unpackhi_epi64(c1, g1);
175
+        g0 = _mm_unpacklo_epi64(d1, h1);
176
+        h0 = _mm_unpackhi_epi64(d1, h1);
177
+
178
+        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
179
+        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
180
+        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
181
+        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
182
+        _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
183
+        _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
184
+        _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
185
+        _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
186
+    }
187
+    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size, 8,
188
+            size - size % 16);
189
+}
190
+
191
+
192
+/* Memory copy with bshuf call signature. */
193
+static int64_t bshuf_copy(void* in, void* out, const size_t size,
194
+                          const size_t elem_size) {
195
+
196
+    char* in_b = (char*) in;
197
+    char* out_b = (char*) out;
198
+
199
+    memcpy(out_b, in_b, size * elem_size);
200
+    return size * elem_size;
201
+}
202
+
203
+
204
+/* Transpose bytes within elements using best SSE algorithm available. */
205
+int64_t blosc_internal_bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size,
206
+                                                  const size_t elem_size, void* tmp_buf) {
207
+
208
+    int64_t count;
209
+
210
+    /*  Trivial cases: power of 2 bytes. */
211
+    switch (elem_size) {
212
+        case 1:
213
+            count = bshuf_copy(in, out, size, elem_size);
214
+            return count;
215
+        case 2:
216
+            count = bshuf_trans_byte_elem_SSE_16(in, out, size);
217
+            return count;
218
+        case 4:
219
+            count = bshuf_trans_byte_elem_SSE_32(in, out, size);
220
+            return count;
221
+        case 8:
222
+            count = bshuf_trans_byte_elem_SSE_64(in, out, size);
223
+            return count;
224
+    }
225
+
226
+    /*  Worst case: odd number of bytes. Turns out that this is faster for */
227
+    /*  (odd * 2) byte elements as well (hence % 4). */
228
+    if (elem_size % 4) {
229
+        count = blosc_internal_bshuf_trans_byte_elem_scal(in, out, size, elem_size);
230
+        return count;
231
+    }
232
+
233
+    /*  Multiple of power of 2: transpose hierarchically. */
234
+    {
235
+        size_t nchunk_elem;
236
+
237
+        if ((elem_size % 8) == 0) {
238
+            nchunk_elem = elem_size / 8;
239
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
240
+            count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
241
+                    size * nchunk_elem);
242
+            blosc_internal_bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
243
+        } else if ((elem_size % 4) == 0) {
244
+            nchunk_elem = elem_size / 4;
245
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
246
+            count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
247
+                    size * nchunk_elem);
248
+            blosc_internal_bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
249
+        } else {
250
+            /*  Not used since scalar algorithm is faster. */
251
+            nchunk_elem = elem_size / 2;
252
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
253
+            count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
254
+                    size * nchunk_elem);
255
+            blosc_internal_bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
256
+        }
257
+
258
+        return count;
259
+    }
260
+}
261
+
262
+
263
+/* Transpose bits within bytes. */
264
+static int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size,
265
+                                         const size_t elem_size) {
266
+
267
+    char* in_b = (char*) in;
268
+    char* out_b = (char*) out;
269
+    uint16_t* out_ui16;
270
+    int64_t count;
271
+    size_t nbyte = elem_size * size;
272
+    __m128i xmm;
273
+    int32_t bt;
274
+    size_t ii, kk;
275
+
276
+    CHECK_MULT_EIGHT(nbyte);
277
+
278
+    for (ii = 0; ii + 15 < nbyte; ii += 16) {
279
+        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
280
+        for (kk = 0; kk < 8; kk++) {
281
+            bt = _mm_movemask_epi8(xmm);
282
+            xmm = _mm_slli_epi16(xmm, 1);
283
+            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
284
+            *out_ui16 = bt;
285
+        }
286
+    }
287
+    count = blosc_internal_bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
288
+            nbyte - nbyte % 16);
289
+    return count;
290
+}
291
+
292
+
293
+/* Transpose bits within elements. */
294
+int64_t blosc_internal_bshuf_trans_bit_elem_sse2(void* in, void* out, const size_t size,
295
+				  const size_t elem_size, void* tmp_buf) {
296
+
297
+    int64_t count;
298
+
299
+    CHECK_MULT_EIGHT(size);
300
+
301
+    count = blosc_internal_bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
302
+    CHECK_ERR(count);
303
+    count = bshuf_trans_bit_byte_sse2(out, tmp_buf, size, elem_size);
304
+    CHECK_ERR(count);
305
+    count = blosc_internal_bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
306
+
307
+    return count;
308
+}
309
+
310
+
311
+/* For data organized into a row for each bit (8 * elem_size rows), transpose
312
+ * the bytes. */
313
+int64_t blosc_internal_bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size,
314
+				     const size_t elem_size) {
315
+
316
+    char* in_b = (char*) in;
317
+    char* out_b = (char*) out;
318
+    size_t nrows = 8 * elem_size;
319
+    size_t nbyte_row = size / 8;
320
+    size_t ii, jj;
321
+
322
+    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
323
+    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
324
+    __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
325
+
326
+    CHECK_MULT_EIGHT(size);
327
+
328
+    for (ii = 0; ii + 7 < nrows; ii += 8) {
329
+        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
330
+            a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
331
+            b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
332
+            c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
333
+            d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
334
+            e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
335
+            f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
336
+            g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
337
+            h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);
338
+
339
+
340
+            a1 = _mm_unpacklo_epi8(a0, b0);
341
+            b1 = _mm_unpacklo_epi8(c0, d0);
342
+            c1 = _mm_unpacklo_epi8(e0, f0);
343
+            d1 = _mm_unpacklo_epi8(g0, h0);
344
+            e1 = _mm_unpackhi_epi8(a0, b0);
345
+            f1 = _mm_unpackhi_epi8(c0, d0);
346
+            g1 = _mm_unpackhi_epi8(e0, f0);
347
+            h1 = _mm_unpackhi_epi8(g0, h0);
348
+
349
+
350
+            a0 = _mm_unpacklo_epi16(a1, b1);
351
+            b0 = _mm_unpacklo_epi16(c1, d1);
352
+            c0 = _mm_unpackhi_epi16(a1, b1);
353
+            d0 = _mm_unpackhi_epi16(c1, d1);
354
+
355
+            e0 = _mm_unpacklo_epi16(e1, f1);
356
+            f0 = _mm_unpacklo_epi16(g1, h1);
357
+            g0 = _mm_unpackhi_epi16(e1, f1);
358
+            h0 = _mm_unpackhi_epi16(g1, h1);
359
+
360
+
361
+            a1 = _mm_unpacklo_epi32(a0, b0);
362
+            b1 = _mm_unpackhi_epi32(a0, b0);
363
+
364
+            c1 = _mm_unpacklo_epi32(c0, d0);
365
+            d1 = _mm_unpackhi_epi32(c0, d0);
366
+
367
+            e1 = _mm_unpacklo_epi32(e0, f0);
368
+            f1 = _mm_unpackhi_epi32(e0, f0);
369
+
370
+            g1 = _mm_unpacklo_epi32(g0, h0);
371
+            h1 = _mm_unpackhi_epi32(g0, h0);
372
+
373
+            /*  We don't have a storeh instruction for integers, so interpret */
374
+            /*  as a float. Have a storel (_mm_storel_epi64). */
375
+            as = (__m128 *) &a1;
376
+            bs = (__m128 *) &b1;
377
+            cs = (__m128 *) &c1;
378
+            ds = (__m128 *) &d1;
379
+            es = (__m128 *) &e1;
380
+            fs = (__m128 *) &f1;
381
+            gs = (__m128 *) &g1;
382
+            hs = (__m128 *) &h1;
383
+
384
+            _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
385
+            _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
386
+            _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
387
+            _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
388
+            _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
389
+            _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
390
+            _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
391
+            _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);
392
+
393
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
394
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
395
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
396
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
397
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
398
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
399
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
400
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
401
+        }
402
+        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
403
+            out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
404
+            out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
405
+            out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
406
+            out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
407
+            out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
408
+            out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
409
+            out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
410
+            out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
411
+        }
412
+    }
413
+    return size * elem_size;
414
+}
415
+
416
+
417
+/* Shuffle bits within the bytes of eight element blocks. */
418
+int64_t blosc_internal_bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
419
+					 const size_t elem_size) {
420
+    /*  With a bit of care, this could be written such that such that it is */
421
+    /*  in_buf = out_buf safe. */
422
+    char* in_b = (char*) in;
423
+    uint16_t* out_ui16 = (uint16_t*) out;
424
+
425
+    size_t nbyte = elem_size * size;
426
+
427
+    __m128i xmm;
428
+    int32_t bt;
429
+    size_t ii, jj, kk;
430
+    size_t ind;
431
+
432
+    CHECK_MULT_EIGHT(size);
433
+
434
+    if (elem_size % 2) {
435
+        blosc_internal_bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
436
+    } else {
437
+        for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
438
+                ii += 8 * elem_size) {
439
+            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
440
+                xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
441
+                for (kk = 0; kk < 8; kk++) {
442
+                    bt = _mm_movemask_epi8(xmm);
443
+                    xmm = _mm_slli_epi16(xmm, 1);
444
+                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
445
+                    out_ui16[ind / 2] = bt;
446
+                }
447
+            }
448
+        }
449
+    }
450
+    return size * elem_size;
451
+}
452
+
453
+
454
+/* Untranspose bits within elements. */
455
+int64_t blosc_internal_bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size,
456
+				    const size_t elem_size, void* tmp_buf) {
457
+
458
+    int64_t count;
459
+
460
+    CHECK_MULT_EIGHT(size);
461
+
462
+    count = blosc_internal_bshuf_trans_byte_bitrow_sse2(in, tmp_buf, size, elem_size);
463
+    CHECK_ERR(count);
464
+    count = blosc_internal_bshuf_shuffle_bit_eightelem_sse2(tmp_buf, out, size, elem_size);
465
+
466
+    return count;
467
+}
0 468
new file mode 100644
... ...
@@ -0,0 +1,52 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+
6
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
7
+**********************************************************************/
8
+
9
+/* SSE2-accelerated shuffle/unshuffle routines. */
10
+
11
+#ifndef BITSHUFFLE_SSE2_H
12
+#define BITSHUFFLE_SSE2_H
13
+
14
+#include "blosc-common.h"
15
+
16
+#ifdef __cplusplus
17
+extern "C" {
18
+#endif
19
+
20
+
21
+BLOSC_NO_EXPORT int64_t
22
+blosc_internal_bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size,
23
+                                          const size_t elem_size, void* tmp_buf);
24
+
25
+BLOSC_NO_EXPORT int64_t
26
+blosc_internal_bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size,
27
+                                            const size_t elem_size);
28
+
29
+BLOSC_NO_EXPORT int64_t
30
+blosc_internal_bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
31
+                                                const size_t elem_size);
32
+
33
+/**
34
+  SSE2-accelerated bitshuffle routine.
35
+*/
36
+BLOSC_NO_EXPORT int64_t
37
+blosc_internal_bshuf_trans_bit_elem_sse2(void* in, void* out, const size_t size,
38
+                                         const size_t elem_size, void* tmp_buf);
39
+
40
+/**
41
+  SSE2-accelerated bitunshuffle routine.
42
+*/
43
+BLOSC_NO_EXPORT int64_t
44
+blosc_internal_bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size,
45
+                                           const size_t elem_size, void* tmp_buf);
46
+
47
+#ifdef __cplusplus
48
+}
49
+#endif
50
+
51
+
52
+#endif /* BITSHUFFLE_SSE2_H */
0 53
new file mode 100644
... ...
@@ -0,0 +1,74 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+
6
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
7
+**********************************************************************/
8
+
9
+#ifndef SHUFFLE_COMMON_H
10
+#define SHUFFLE_COMMON_H
11
+
12
+#include "blosc-export.h"
13
+#include <string.h>
14
+
15
+/* Import standard integer type definitions */
16
+#if defined(_WIN32) && !defined(__MINGW32__)
17
+
18
+  /* stdint.h only available in VS2010 (VC++ 16.0) and newer */
19
+  #if defined(_MSC_VER) && _MSC_VER < 1600
20
+    #include "win32/stdint-windows.h"
21
+  #else
22
+    #include <stdint.h>
23
+  #endif
24
+#else
25
+  #include <stdint.h>
26
+#endif  /* _WIN32 */
27
+
28
+
29
+/* Define the __SSE2__ symbol if compiling with Visual C++ and
30
+   targeting the minimum architecture level supporting SSE2.
31
+   Other compilers define this as expected and emit warnings
32
+   when it is re-defined. */
33
+#if !defined(__SSE2__) && defined(_MSC_VER) && \
34
+    (defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2))
35
+  #define __SSE2__
36
+#endif
37
+
38
+/*
39
+ * Detect if the architecture is fine with unaligned access.
40
+ */
41
+#if !defined(BLOSC_STRICT_ALIGN)
42
+#define BLOSC_STRICT_ALIGN
43
+#if defined(__i386__) || defined(__386) || defined (__amd64)  /* GNU C, Sun Studio */
44
+#undef BLOSC_STRICT_ALIGN
45
+#elif defined(__i486__) || defined(__i586__) || defined(__i686__)  /* GNU C */
46
+#undef BLOSC_STRICT_ALIGN
47
+#elif defined(_M_IX86) || defined(_M_X64)   /* Intel, MSVC */
48
+#undef BLOSC_STRICT_ALIGN
49
+#elif defined(__386)
50
+#undef BLOSC_STRICT_ALIGN
51
+#elif defined(_X86_) /* MinGW */
52
+#undef BLOSC_STRICT_ALIGN
53
+#elif defined(__I86__) /* Digital Mars */
54
+#undef BLOSC_STRICT_ALIGN
55
+/* Seems like unaligned access in ARM (at least ARMv6) is pretty
56
+   expensive, so we are going to always enforce strict aligment in ARM.
57
+   If anybody suggest that newer ARMs are better, we can revisit this. */
58
+/* #elif defined(__ARM_FEATURE_UNALIGNED) */  /* ARM, GNU C */
59
+/* #undef BLOSC_STRICT_ALIGN */
60
+#elif defined(_ARCH_PPC) || defined(__PPC__)
61
+/* Modern PowerPC systems (like POWER8) should support unaligned access
62
+   quite efficiently. */
63
+#undef BLOSC_STRICT_ALIGN
64
+#endif
65
+#endif
66
+
67
+#if defined(__SSE2__)
68
+  #include <emmintrin.h>
69
+#endif
70
+#if defined(__AVX2__)
71
+  #include <immintrin.h>
72
+#endif
73
+
74
+#endif  /* SHUFFLE_COMMON_H */
0 75
new file mode 100644
... ...
@@ -0,0 +1,21 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+
6
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
7
+**********************************************************************/
8
+
9
+#ifndef BLOSC_COMP_FEATURES_H
10
+#define BLOSC_COMP_FEATURES_H
11
+
12
+/* Use inlined functions for supported systems */
13
+#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
14
+  #define BLOSC_INLINE __inline  /* Visual C is not C99, but supports some kind of inline */
15
+#elif __STDC_VERSION__ >= 199901L
16
+  #define BLOSC_INLINE inline
17
+#else
18
+  #define BLOSC_INLINE
19
+#endif
20
+
21
+#endif /* BLOSC_COMP_FEATURES_H */
0 22
new file mode 100644
... ...
@@ -0,0 +1,45 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+
6
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
7
+**********************************************************************/
8
+#ifndef BLOSC_EXPORT_H
9
+#define BLOSC_EXPORT_H
10
+
11
+/* Macros for specifying exported symbols.
12
+   BLOSC_EXPORT is used to decorate symbols that should be
13
+   exported by the blosc shared library.
14
+   BLOSC_NO_EXPORT is used to decorate symbols that should NOT
15
+   be exported by the blosc shared library.
16
+*/
17
+#if defined(BLOSC_SHARED_LIBRARY)
18
+  #if defined(_MSC_VER)
19
+    #define BLOSC_EXPORT __declspec(dllexport)
20
+  #elif (defined(__GNUC__) && __GNUC__ >= 4) || defined(__clang__)
21
+    #if defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__)
22
+      #define BLOSC_EXPORT __attribute__((dllexport))
23
+    #else
24
+      #define BLOSC_EXPORT __attribute__((visibility("default")))
25
+    #endif  /* defined(_WIN32) || defined(__CYGWIN__) */
26
+  #else
27
+    #error Cannot determine how to define BLOSC_EXPORT for this compiler.
28
+  #endif
29
+#else
30
+  #define BLOSC_EXPORT
31
+#endif  /* defined(BLOSC_SHARED_LIBRARY) */
32
+
33
+#if defined(__GNUC__) || defined(__clang__)
34
+  #define BLOSC_NO_EXPORT __attribute__((visibility("hidden")))
35
+#else
36
+  #define BLOSC_NO_EXPORT
37
+#endif  /* defined(__GNUC__) || defined(__clang__) */
38
+
39
+/* When testing, export everything to make it easier to implement tests. */
40
+#if defined(BLOSC_TESTING)
41
+  #undef BLOSC_NO_EXPORT
42
+  #define BLOSC_NO_EXPORT BLOSC_EXPORT
43
+#endif  /* defined(BLOSC_TESTING) */
44
+
45
+#endif  /* BLOSC_EXPORT_H */
0 46
new file mode 100644
... ...
@@ -0,0 +1,2336 @@
1
+/*********************************************************************
2
+  Blosc - Blocked Shuffling and Compression Library
3
+
4
+  Author: Francesc Alted <francesc@blosc.org>
5
+  Creation date: 2009-05-20
6
+
7
+  See LICENSES/BLOSC.txt for details about copyright and rights to use.
8
+**********************************************************************/
9
+
10
+
11
+#include <stdio.h>
12
+#include <stdlib.h>
13
+#include <errno.h>
14
+#include <string.h>
15
+#include <sys/types.h>
16
+#include <assert.h>
17
+
18
+#include "fastcopy.h"
19
+
20
+#if defined(USING_CMAKE)
21
+  #include "config.h"
22
+#endif /*  USING_CMAKE */
23
+#include "blosc.h"
24
+#include "shuffle.h"
25
+#include "blosclz.h"
26
+#if defined(HAVE_LZ4)
27
+  #include "lz4.h"
28
+  #include "lz4hc.h"
29
+#endif /*  HAVE_LZ4 */
30
+#if defined(HAVE_SNAPPY)
31
+  #include "snappy-c.h"
32
+#endif /*  HAVE_SNAPPY */
33
+#if defined(HAVE_ZLIB)
34
+  #include "zlib.h"
35
+#endif /*  HAVE_ZLIB */
36
+#if defined(HAVE_ZSTD)
37
+  #include "zstd.h"
38
+#endif /*  HAVE_ZSTD */
39
+
40
+#if defined(_WIN32) && !defined(__MINGW32__)
41
+  #include <windows.h>
42
+  #include <malloc.h>
43
+
44
+  /* stdint.h only available in VS2010 (VC++ 16.0) and newer */
45
+  #if defined(_MSC_VER) && _MSC_VER < 1600
46
+    #include "win32/stdint-windows.h"
47
+  #else
48
+    #include <stdint.h>
49
+  #endif
50
+
51
+  #include <process.h>
52
+  #define getpid _getpid
53
+#else
54
+  #include <stdint.h>
55
+  #include <unistd.h>
56
+  #include <inttypes.h>
57
+#endif  /* _WIN32 */
58
+
59
+/* Include the win32/pthread.h library for all the Windows builds. See #224. */
60
+#if defined(_WIN32)
61
+  #include "win32/pthread.h"
62
+  #include "win32/pthread.c"
63
+#else
64
+  #include <pthread.h>
65
+#endif
66
+
67
+
68
/* Some useful units */
#define KB 1024
#define MB (1024 * (KB))

/* Minimum buffer size to be compressed */
#define MIN_BUFFERSIZE 128       /* Cannot be smaller than 66 */

/* The maximum number of splits in a block for compression */
#define MAX_SPLITS 16            /* Cannot be larger than 128 */

/* The size of L1 cache.  32 KB is quite common nowadays. */
#define L1 (32 * (KB))

/* Have problems using posix barriers when symbol value is 200112L */
/* This requires more investigation, but will work for the moment */
/* NOTE(review): `20012L` below looks like a typo for `200112L`, but it is
   the literal this code ships with; combined with the `!= 200112L` test it
   enables barriers for any conforming value except the problematic
   200112L one -- confirm before "fixing". */
#if defined(_POSIX_BARRIERS) && ( (_POSIX_BARRIERS - 20012L) >= 0 && _POSIX_BARRIERS != 200112L)
#define _POSIX_BARRIERS_MINE
#endif
86
+/* Synchronization variables */
87
+
88
+
89
/* All state for one compression or decompression operation, plus the
   worker-thread pool that services it. */
struct blosc_context {
  int32_t compress;               /* 1 if we are doing compression 0 if decompress */

  const uint8_t* src;             /* Source (input) buffer */
  uint8_t* dest;                  /* The current pos in the destination buffer */
  uint8_t* header_flags;          /* Flags for header */
  int compversion;                /* Compressor version byte, only used during decompression */
  int32_t sourcesize;             /* Number of bytes in source buffer (or uncompressed bytes in compressed file) */
  int32_t compressedsize;         /* Number of bytes of compressed data (only used when decompressing) */
  int32_t nblocks;                /* Number of total blocks in buffer */
  int32_t leftover;               /* Extra bytes at end of buffer */
  int32_t blocksize;              /* Length of the block in bytes */
  int32_t typesize;               /* Type size */
  int32_t num_output_bytes;       /* Counter for the number of output bytes */
  int32_t destsize;               /* Maximum size for destination buffer */
  uint8_t* bstarts;               /* Start of the buffer past header info */
  int32_t compcode;               /* Compressor code to use */
  int clevel;                     /* Compression level (1-9) */
  /* Function to use for decompression.  Only used when decompression */
  int (*decompress_func)(const void* input, int compressed_length, void* output,
                         int maxout);

  /* Threading */
  int32_t numthreads;             /* Number of worker threads to use */
  int32_t threads_started;        /* How many threads have been spawned */
  int32_t end_threads;            /* Flag telling workers to exit */
  pthread_t threads[BLOSC_MAX_THREADS];
  int32_t tids[BLOSC_MAX_THREADS];
  pthread_mutex_t count_mutex;
  #ifdef _POSIX_BARRIERS_MINE
  /* Native POSIX barriers for start/finish synchronization */
  pthread_barrier_t barr_init;
  pthread_barrier_t barr_finish;
  #else
  /* Fallback: counter + condvar emulation of barriers (see WAIT_* macros) */
  int32_t count_threads;
  pthread_mutex_t count_threads_mutex;
  pthread_cond_t count_threads_cv;
  #endif
  #if !defined(_WIN32)
  pthread_attr_t ct_attr;            /* creation time attrs for threads */
  #endif
  int32_t thread_giveup_code;               /* error code when give up */
  int32_t thread_nblock;                    /* block counter */
};
132
+
133
/* Per-worker-thread state: each worker owns its scratch buffers so no
   locking is needed while (de)compressing a block. */
struct thread_context {
  struct blosc_context* parent_context;  /* context this worker belongs to */
  int32_t tid;                           /* worker id -- presumably 0-based index; confirm at spawn site */
  uint8_t* tmp;                          /* scratch buffer */
  uint8_t* tmp2;                         /* scratch buffer */
  uint8_t* tmp3;                         /* scratch buffer */
  int32_t tmpblocksize; /* Used to keep track of how big the temporary buffers are */
};
141
+
142
/* Global context for non-contextual API */
static struct blosc_context* g_global_context;
static pthread_mutex_t* global_comp_mutex;    /* serializes use of the global context */
static int32_t g_compressor = BLOSC_BLOSCLZ;  /* the compressor to use by default */
static int32_t g_threads = 1;                 /* default number of threads */
static int32_t g_force_blocksize = 0;         /* 0 = choose blocksize automatically -- confirm at use site */
static int32_t g_initlib = 0;                 /* nonzero once the library is initialized -- init code not visible here */
static int32_t g_atfork_registered = 0;       /* guards one-time pthread_atfork registration -- confirm */
static int32_t g_splitmode = BLOSC_FORWARD_COMPAT_SPLIT;  /* block-splitting policy */
151
+
152
+
153
+
154
/* Wrapped function to adjust the number of threads used by blosc.
   (Definition appears later in this file.) */
int blosc_set_nthreads_(struct blosc_context*);

/* Releases the global threadpool.  (Definition appears later in this file.) */
int blosc_release_threadpool(struct blosc_context* context);
159
+
160
/* Macros for synchronization */

/* Wait until all threads are initialized */
/* NOTE(review): the barrier variants expand to statements that read a local
   `rc` and `return(RET_VAL)` on failure, so every expansion site must
   declare `rc` and be inside a function -- confirm at each use. */
#ifdef _POSIX_BARRIERS_MINE
#define WAIT_INIT(RET_VAL, CONTEXT_PTR)  \
  rc = pthread_barrier_wait(&CONTEXT_PTR->barr_init); \
  if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { \
    printf("Could not wait on barrier (init): %d\n", rc); \
    return((RET_VAL));                            \
  }
#else
/* No POSIX barriers: emulate one with a counter plus condition variable. */
#define WAIT_INIT(RET_VAL, CONTEXT_PTR)   \
  pthread_mutex_lock(&CONTEXT_PTR->count_threads_mutex); \
  if (CONTEXT_PTR->count_threads < CONTEXT_PTR->numthreads) { \
    CONTEXT_PTR->count_threads++;  \
    pthread_cond_wait(&CONTEXT_PTR->count_threads_cv, &CONTEXT_PTR->count_threads_mutex); \
  } \
  else { \
    pthread_cond_broadcast(&CONTEXT_PTR->count_threads_cv); \
  } \
  pthread_mutex_unlock(&CONTEXT_PTR->count_threads_mutex);
#endif

/* Wait for all threads to finish */
#ifdef _POSIX_BARRIERS_MINE
#define WAIT_FINISH(RET_VAL, CONTEXT_PTR)   \
  rc = pthread_barrier_wait(&CONTEXT_PTR->barr_finish); \
  if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { \
    printf("Could not wait on barrier (finish)\n"); \
    return((RET_VAL));                              \
  }
#else
/* Counter/condvar emulation, mirroring WAIT_INIT with the count reversed. */
#define WAIT_FINISH(RET_VAL, CONTEXT_PTR)                           \
  pthread_mutex_lock(&CONTEXT_PTR->count_threads_mutex); \
  if (CONTEXT_PTR->count_threads > 0) { \
    CONTEXT_PTR->count_threads--; \
    pthread_cond_wait(&CONTEXT_PTR->count_threads_cv, &CONTEXT_PTR->count_threads_mutex); \
  } \
  else { \
    pthread_cond_broadcast(&CONTEXT_PTR->count_threads_cv); \
  } \
  pthread_mutex_unlock(&CONTEXT_PTR->count_threads_mutex);
#endif
203
+
204
+
205
/* A function for aligned malloc that is portable.
 * Returns a 32-byte aligned block of `size` bytes (AVX2-friendly), or NULL
 * on failure after printing a diagnostic to stderr.  Pair with my_free(). */
static uint8_t *my_malloc(size_t size)
{
  void *block = NULL;
  int res = 0;

/* Do an alignment to 32 bytes because AVX2 is supported */
#if defined(_WIN32)
  /* A (void *) cast needed for avoiding a warning with MINGW :-/ */
  block = (void *)_aligned_malloc(size, 32);
#elif _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600
  /* Platform does have an implementation of posix_memalign */
  res = posix_memalign(&block, 32, size);
#else
  /* No aligned allocator available; alignment is then not guaranteed. */
  block = malloc(size);
#endif  /* _WIN32 */

  if (block == NULL || res != 0) {
    /* Fix: diagnostics go to stderr (was printf to stdout, no newline). */
    fprintf(stderr, "Error allocating memory!\n");
    return NULL;
  }

  return (uint8_t *)block;
}
229
+
230
+
231
/* Release memory booked by my_malloc */
/* Must mirror the allocator used in my_malloc: on Windows an
   _aligned_malloc'd block cannot be released with plain free(). */
static void my_free(void *block)
{
#if defined(_WIN32)
    _aligned_free(block);
#else
    free(block);
#endif  /* _WIN32 */
}
240
+
241
+
242
/* Load a 32-bit integer stored little-endian at `pa`, independent of the
   host byte order (detected at runtime). */
static int32_t sw32_(const uint8_t *pa)
{
  int32_t value;
  uint8_t *out = (uint8_t *)&value;
  int probe = 1;                          /* runtime endianness probe */
  int host_is_big = (*(char *)&probe == 0);
  int j;

  /* On big-endian hosts reverse the byte order; otherwise copy straight. */
  for (j = 0; j < 4; j++) {
    out[j] = host_is_big ? pa[3 - j] : pa[j];
  }
  return value;
}
266
+
267
+
268
/* Store the 32-bit integer `a` into `dest` in little-endian byte order,
   independent of the host byte order (detected at runtime). */
static void _sw32(uint8_t* dest, int32_t a)
{
  const uint8_t *in = (const uint8_t *)&a;
  int probe = 1;                          /* runtime endianness probe */
  int host_is_big = (*(char *)&probe == 0);
  int j;

  /* On big-endian hosts reverse the byte order; otherwise copy straight. */
  for (j = 0; j < 4; j++) {
    dest[j] = host_is_big ? in[3 - j] : in[j];
  }
}
290
+
291
+/*
292
+ * Conversion routines between compressor and compression libraries
293
+ */
294
+
295
+/* Return the library code associated with the compressor name */
296
+static int compname_to_clibcode(const char *compname)
297
+{
298
+  if (strcmp(compname, BLOSC_BLOSCLZ_COMPNAME) == 0)
299
+    return BLOSC_BLOSCLZ_LIB;
300
+  if (strcmp(compname, BLOSC_LZ4_COMPNAME) == 0)
301
+    return BLOSC_LZ4_LIB;
302
+  if (strcmp(compname, BLOSC_LZ4HC_COMPNAME) == 0)
303
+    return BLOSC_LZ4_LIB;
304
+  if (strcmp(compname, BLOSC_SNAPPY_COMPNAME) == 0)
305
+    return BLOSC_SNAPPY_LIB;
306
+  if (strcmp(compname, BLOSC_ZLIB_COMPNAME) == 0)
307
+    return BLOSC_ZLIB_LIB;
308
+  if (strcmp(compname, BLOSC_ZSTD_COMPNAME) == 0)
309
+    return BLOSC_ZSTD_LIB;
310
+  return -1;
311
+}
312
+
313
+/* Return the library name associated with the compressor code */
314
+static const char *clibcode_to_clibname(int clibcode)
315
+{
316
+  if (clibcode == BLOSC_BLOSCLZ_LIB) return BLOSC_BLOSCLZ_LIBNAME;
317
+  if (clibcode == BLOSC_LZ4_LIB) return BLOSC_LZ4_LIBNAME;
318
+  if (clibcode == BLOSC_SNAPPY_LIB) return BLOSC_SNAPPY_LIBNAME;
319
+  if (clibcode == BLOSC_ZLIB_LIB) return BLOSC_ZLIB_LIBNAME;
320
+  if (clibcode == BLOSC_ZSTD_LIB) return BLOSC_ZSTD_LIBNAME;
321
+  return NULL;                  /* should never happen */
322
+}
323
+
324
+
325
+/*
326
+ * Conversion routines between compressor names and compressor codes
327
+ */
328
+
329
+/* Get the compressor name associated with the compressor code */
330
+int blosc_compcode_to_compname(int compcode, const char **compname)
331
+{
332
+  int code = -1;    /* -1 means non-existent compressor code */
333
+  const char *name = NULL;
334
+
335
+  /* Map the compressor code */
336
+  if (compcode == BLOSC_BLOSCLZ)
337
+    name = BLOSC_BLOSCLZ_COMPNAME;
338
+  else if (compcode == BLOSC_LZ4)
339
+    name = BLOSC_LZ4_COMPNAME;
340
+  else if (compcode == BLOSC_LZ4HC)
341
+    name = BLOSC_LZ4HC_COMPNAME;
342
+  else if (compcode == BLOSC_SNAPPY)
343
+    name = BLOSC_SNAPPY_COMPNAME;
344
+  else if (compcode == BLOSC_ZLIB)
345
+    name = BLOSC_ZLIB_COMPNAME;
346
+  else if (compcode == BLOSC_ZSTD)
347
+    name = BLOSC_ZSTD_COMPNAME;
348
+
349
+  *compname = name;
350
+
351
+  /* Guess if there is support for this code */
352
+  if (compcode == BLOSC_BLOSCLZ)
353
+    code = BLOSC_BLOSCLZ;
354
+#if defined(HAVE_LZ4)
355
+  else if (compcode == BLOSC_LZ4)
356
+    code = BLOSC_LZ4;
357
+  else if (compcode == BLOSC_LZ4HC)
358
+    code = BLOSC_LZ4HC;
359
+#endif /*  HAVE_LZ4 */
360
+#if defined(HAVE_SNAPPY)
361
+  else if (compcode == BLOSC_SNAPPY)
362
+    code = BLOSC_SNAPPY;
363
+#endif /*  HAVE_SNAPPY */
364
+#if defined(HAVE_ZLIB)
365
+  else if (compcode == BLOSC_ZLIB)
366
+    code = BLOSC_ZLIB;
367
+#endif /*  HAVE_ZLIB */
368
+#if defined(HAVE_ZSTD)
369
+  else if (compcode == BLOSC_ZSTD)
370
+    code = BLOSC_ZSTD;
371
+#endif /*  HAVE_ZSTD */
372
+
373
+  return code;
374
+}
375
+
376
+/* Get the compressor code for the compressor name. -1 if it is not available */
377
+int blosc_compname_to_compcode(const char *compname)
378
+{
379
+  int code = -1;  /* -1 means non-existent compressor code */
380
+
381
+  if (strcmp(compname, BLOSC_BLOSCLZ_COMPNAME) == 0) {
382
+    code = BLOSC_BLOSCLZ;
383
+  }
384
+#if defined(HAVE_LZ4)
385
+  else if (strcmp(compname, BLOSC_LZ4_COMPNAME) == 0) {
386
+    code = BLOSC_LZ4;