src/math/SIMD.h
5fd1ddd4
 #ifndef __COGAPS_SIMD_H__
 #define __COGAPS_SIMD_H__
 
 // Record the widest x86 vector extension advertised by the compiler as
 // SSE_INSTR_SET: 8 = AVX2, 7 = AVX, 6 = SSE4.2, 5 = SSE4.1, 0 = none of these.
 // If SSE_INSTR_SET is already defined when this point is reached, that value
 // is left untouched.
 #if !defined(SSE_INSTR_SET)
     #ifdef __AVX2__
         #define SSE_INSTR_SET 8
     #elif defined(__AVX__)
         #define SSE_INSTR_SET 7
     #elif defined(__SSE4_2__)
         #define SSE_INSTR_SET 6
     #elif defined(__SSE4_1__)
         #define SSE_INSTR_SET 5
     #else
         #define SSE_INSTR_SET 0
     #endif
 #endif
 
 // Translate the instruction-set level into the backend macro tested by the
 // rest of this header and pull in the matching intrinsics header.
 //
 // Level 8 (AVX2) is a superset of AVX, so it selects the same 256-bit backend
 // as level 7. Previously only level 7 was accepted, which meant an AVX2 build
 // defined neither macro and silently used the scalar fallback.
 #if SSE_INSTR_SET == 7 || SSE_INSTR_SET == 8
     #define __GAPS_AVX__
     #include <immintrin.h>
 #elif SSE_INSTR_SET == 5 || SSE_INSTR_SET == 6
     #define __GAPS_SSE__
     #include <nmmintrin.h>
 #endif
 
 namespace gaps
 {
 namespace simd
 {

 // Backend vocabulary. Every branch provides the same set of names:
 //   gaps_packed_t    storage type for one pack of floats
 //   index_increment  number of floats held by one pack
 //   SET_SCALAR       broadcast one float to every lane
 //   LOAD_PACKED      read one pack from a float pointer
 //   STORE_PACKED     write one pack to a float pointer
 //   ADD/SUB/MUL/DIV_PACKED  lane-wise arithmetic
 // The AVX and SSE branches use the aligned load/store intrinsics; see the
 // intrinsics documentation for the pointer alignment those require.
 #if defined( __GAPS_AVX__ )
     typedef __m256 gaps_packed_t;
     const unsigned index_increment = 8;
     #define SET_SCALAR(x) _mm256_set1_ps(x)
     #define LOAD_PACKED(x) _mm256_load_ps(x)
     #define STORE_PACKED(p,x) _mm256_store_ps(p,x)
     #define ADD_PACKED(a,b) _mm256_add_ps(a,b)
     #define SUB_PACKED(a,b) _mm256_sub_ps(a,b)
     #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
     #define DIV_PACKED(a,b) _mm256_div_ps(a,b)
 #elif defined( __GAPS_SSE__ )
     typedef __m128 gaps_packed_t;
     const unsigned index_increment = 4;
     #define SET_SCALAR(x) _mm_set1_ps(x)
     #define LOAD_PACKED(x) _mm_load_ps(x)
     #define STORE_PACKED(p,x) _mm_store_ps(p,x)
     #define ADD_PACKED(a,b) _mm_add_ps(a,b)
     #define SUB_PACKED(a,b) _mm_sub_ps(a,b)
     #define MUL_PACKED(a,b) _mm_mul_ps(a,b)
     #define DIV_PACKED(a,b) _mm_div_ps(a,b)
 #else
     // Scalar fallback: a "pack" is a single float. Arguments and whole
     // expansions are parenthesised so each macro behaves like a function
     // call when it appears inside a larger expression.
     typedef float gaps_packed_t;
     const unsigned index_increment = 1;
     #define SET_SCALAR(x) (x)
     #define LOAD_PACKED(x) (*(x))
     #define STORE_PACKED(p,x) (*(p) = (x))
     #define ADD_PACKED(a,b) ((a)+(b))
     #define SUB_PACKED(a,b) ((a)-(b))
     #define MUL_PACKED(a,b) ((a)*(b))
     #define DIV_PACKED(a,b) ((a)/(b))
 #endif

 // Position counter that advances by one pack (index_increment floats) each
 // time it is incremented. Adding an Index to a float pointer offsets the
 // pointer by the stored position.
 class Index
 {
 private:

     unsigned index; // current offset, in floats

 public:

     explicit Index(unsigned i) : index(i) {}

     // Reset the position to an arbitrary offset.
     Index& operator=(unsigned val) { index = val; return *this; }

     // Comparisons against a plain bound; const so they work on const objects.
     bool operator<(unsigned comp) const { return index < comp; }
     bool operator<=(unsigned comp) const { return index <= comp; }

     // Step forward by one pack.
     void operator++() { index += index_increment; }

     unsigned value() const { return index; }
     unsigned increment() const { return index_increment; }

     friend const float* operator+(const float *ptr, Index ndx);
     friend float* operator+(float *ptr, Index ndx);
 };

 // Pointer + Index: returns ptr advanced by the position stored in ndx.
 inline const float* operator+(const float *ptr, Index ndx) { return ptr + ndx.index; }
 inline float* operator+(float *ptr, Index ndx) { return ptr + ndx.index; }

 // Value wrapper around one pack of floats (gaps_packed_t) with lane-wise
 // arithmetic and a horizontal sum.
 class packedFloat
 {
 private:

     gaps_packed_t mData;

 public:

     // Value-initialises the pack.
     packedFloat() : mData() {}

     // Broadcast a single float to every lane.
     explicit packedFloat(float val) : mData(SET_SCALAR(val)) {}

     // Wrap a raw pack. Only declared when gaps_packed_t is a vector type; in
     // the scalar fallback gaps_packed_t is float and this would duplicate the
     // constructor above.
 #if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
     explicit packedFloat(gaps_packed_t val) : mData(val) {}
 #endif

     // Lane-wise arithmetic producing a new pack.
     packedFloat operator+(packedFloat b) const { return packedFloat(ADD_PACKED(mData, b.mData)); }
     packedFloat operator-(packedFloat b) const { return packedFloat(SUB_PACKED(mData, b.mData)); }
     packedFloat operator*(packedFloat b) const { return packedFloat(MUL_PACKED(mData, b.mData)); }
     packedFloat operator/(packedFloat b) const { return packedFloat(DIV_PACKED(mData, b.mData)); }

     // Lane-wise accumulate in place.
     void operator+=(packedFloat val) { mData = ADD_PACKED(mData, val.mData); }

     // Read one pack (index_increment floats) starting at ptr.
     void load(const float *ptr) { mData = LOAD_PACKED(ptr); }

     // Write one pack (index_increment floats) starting at ptr.
     void store(float *ptr) const { STORE_PACKED(ptr, mData); }

     // Horizontal sum: adds the lanes from lane 0 upwards and returns the
     // total. The lane count is taken from index_increment so it always
     // matches the width of gaps_packed_t chosen above.
     float scalar() const
     {
 #if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
         const float* ra = reinterpret_cast<const float*>(&mData); // NOLINT
         float total = ra[0];
         for (unsigned lane = 1; lane < index_increment; ++lane)
         {
             total += ra[lane];
         }
         return total;
 #else
         return mData;
 #endif
     }
 };

 } // namespace simd
 } // namespace gaps
 
1d3f54e6
 // Define __x86_64__ when the toolchain reports 64-bit x86 through
 // _M_AMD64, _M_X64 or __amd64 but has not defined __x86_64__ itself.
 #if !defined(__x86_64__)
     #if defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)
         #define __x86_64__ 1
     #endif
 #endif

 // Expose the compiler's _OPENMP flag under the project-specific name.
 #if defined(_OPENMP)
     #define __GAPS_OPENMP__
 #endif
 
5fd1ddd4
 #endif // __COGAPS_SIMD_H__
f013c0d5