5fd1ddd4 |
#ifndef __COGAPS_SIMD_H__
#define __COGAPS_SIMD_H__
// Derive a numeric instruction-set level from the compiler's predefined
// macros, unless the build system already supplied SSE_INSTR_SET itself.
//   8 = AVX2, 7 = AVX, 6 = SSE4.2, 5 = SSE4.1, 0 = none (scalar fallback)
#ifndef SSE_INSTR_SET
    #if defined ( __AVX2__ )
        #define SSE_INSTR_SET 8
    #elif defined ( __AVX__ )
        #define SSE_INSTR_SET 7
    #elif defined ( __SSE4_2__ )
        #define SSE_INSTR_SET 6
    #elif defined ( __SSE4_1__ )
        #define SSE_INSTR_SET 5
    #else
        #define SSE_INSTR_SET 0
    #endif
#endif

// Select the backend used by the rest of this header. Exactly one of
// __GAPS_AVX__ / __GAPS_SSE__ is defined, or neither for the scalar path.
//
// Every level from 7 upwards takes the AVX backend: an AVX2 build (level 8)
// previously matched neither branch and silently dropped to scalar code even
// though all 256-bit float intrinsics used below are plain AVX.
// NOTE(review): the AVX backend uses aligned loads/stores; confirm that buffer
// allocation elsewhere keys its alignment off __GAPS_AVX__ and not off
// SSE_INSTR_SET == 7.
#if SSE_INSTR_SET >= 7
    #define __GAPS_AVX__
    #include <immintrin.h>
#elif SSE_INSTR_SET == 6 || SSE_INSTR_SET == 5
    #define __GAPS_SSE__
    #include <nmmintrin.h>
#endif
namespace gaps
{
namespace simd
{
|
f013c0d5 |
// Backend configuration. Each branch provides the same vocabulary:
//   gaps_packed_t    - the register type holding index_increment floats
//   index_increment  - number of floats processed per packed operation
//   SET_SCALAR(x)    - broadcast one float to every lane
//   LOAD_PACKED(p)   - read index_increment floats starting at p
//   STORE_PACKED(p,x)- write the lanes of x starting at p
//   ADD/SUB/MUL/DIV_PACKED(a,b) - lane-wise arithmetic
#if defined( __GAPS_AVX__ )

    // 256-bit registers: 8 floats per operation. The load/store intrinsics
    // used here are the aligned variants.
    typedef __m256 gaps_packed_t;
    const unsigned index_increment = 8;
    #define SET_SCALAR(x) _mm256_set1_ps(x)
    #define LOAD_PACKED(x) _mm256_load_ps(x)
    #define STORE_PACKED(p,x) _mm256_store_ps(p,x)
    #define ADD_PACKED(a,b) _mm256_add_ps(a,b)
    #define SUB_PACKED(a,b) _mm256_sub_ps(a,b)
    #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
    #define DIV_PACKED(a,b) _mm256_div_ps(a,b)

#elif defined( __GAPS_SSE__ )

    // 128-bit registers: 4 floats per operation. The load/store intrinsics
    // used here are the aligned variants.
    typedef __m128 gaps_packed_t;
    const unsigned index_increment = 4;
    #define SET_SCALAR(x) _mm_set1_ps(x)
    #define LOAD_PACKED(x) _mm_load_ps(x)
    #define STORE_PACKED(p,x) _mm_store_ps(p,x)
    #define ADD_PACKED(a,b) _mm_add_ps(a,b)
    #define SUB_PACKED(a,b) _mm_sub_ps(a,b)
    #define MUL_PACKED(a,b) _mm_mul_ps(a,b)
    #define DIV_PACKED(a,b) _mm_div_ps(a,b)

#else

    // Scalar fallback: a "register" is a single float. Every expansion is
    // fully parenthesized so the macros compose like function calls, e.g.
    // SET_SCALAR(a + b) * c must not parse as a + (b * c).
    typedef float gaps_packed_t;
    const unsigned index_increment = 1;
    #define SET_SCALAR(x) (x)
    #define LOAD_PACKED(x) (*(x))
    #define STORE_PACKED(p,x) (*(p) = (x))
    #define ADD_PACKED(a,b) ((a)+(b))
    #define SUB_PACKED(a,b) ((a)-(b))
    #define MUL_PACKED(a,b) ((a)*(b))
    #define DIV_PACKED(a,b) ((a)/(b))

#endif
// Loop counter for walking float arrays one packed register at a time.
// It stores an offset measured in floats; ++ advances it by index_increment
// (the lane count of the active backend) rather than by one.
class Index
{
private:

    unsigned index; // current offset, in floats

public:

    explicit Index(unsigned i) : index(i) {}

    // Reposition the counter to an absolute offset.
    Index& operator=(unsigned val) { index = val; return *this; }

    // Bounds checks against a plain element count. These only read the
    // offset, so they are const and usable on a const Index.
    bool operator<(unsigned comp) const { return index < comp; }
    bool operator<=(unsigned comp) const { return index <= comp; }

    // Step forward by one register's worth of floats. Prefix form only,
    // and it deliberately returns nothing.
    void operator++() { index += index_increment; }

    // Current offset and the step size used by operator++.
    unsigned value() const { return index; }
    unsigned increment() const { return index_increment; }

    // ptr + Index yields the address of the element at the current offset.
    friend const float* operator+(const float *ptr, Index ndx);
    friend float* operator+(float *ptr, Index ndx);
};
|
f013c0d5 |
// Address of the element of a read-only float array at the offset held by ndx.
inline const float* operator+(const float *ptr, Index ndx)
{
    return ptr + ndx.value();
}
|
0f279c94 |
// Address of the element of a writable float array at the offset held by ndx.
inline float* operator+(float *ptr, Index ndx)
{
    return ptr + ndx.value();
}
|
5fd1ddd4 |
class packedFloat
{
private:
|
f013c0d5 |
gaps_packed_t mData;
|
5fd1ddd4 |
|
f013c0d5 |
public:
|
5fd1ddd4 |
|
0733d393 |
packedFloat() : mData() {}
explicit packedFloat(float val) : mData(SET_SCALAR(val)) {}
|
1d3f54e6 |
#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ ) || defined( __GAPS_AVX512__ )
|
0733d393 |
explicit packedFloat(gaps_packed_t val) : mData(val) {}
|
f013c0d5 |
#endif
|
5fd1ddd4 |
|
0733d393 |
packedFloat operator+(packedFloat b) const { return packedFloat(ADD_PACKED(mData, b.mData)); }
packedFloat operator-(packedFloat b) const { return packedFloat(SUB_PACKED(mData, b.mData)); }
packedFloat operator*(packedFloat b) const { return packedFloat(MUL_PACKED(mData, b.mData)); }
packedFloat operator/(packedFloat b) const { return packedFloat(DIV_PACKED(mData, b.mData)); }
|
f013c0d5 |
void operator+=(packedFloat val) { mData = ADD_PACKED(mData, val.mData); }
void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
|
0f279c94 |
void store(float *ptr) { STORE_PACKED(ptr, mData); }
|
f013c0d5 |
|
1d3f54e6 |
#if defined( __GAPS_AVX512__ )
float scalar()
{
|
dd3a9441 |
float* ra = reinterpret_cast<float*>(&mData); // NOLINT
|
1d3f54e6 |
return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7] +
ra[8] + ra[9] + ra[10] + ra[11] + ra[12] + ra[13] + ra[14] + ra[15];
}
#elif defined( __GAPS_AVX__ )
|
f013c0d5 |
float scalar()
{
|
dd3a9441 |
float* ra = reinterpret_cast<float*>(&mData); // NOLINT
|
f013c0d5 |
return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7];
}
#elif defined( __GAPS_SSE__ )
float scalar()
{
|
dd3a9441 |
float* ra = reinterpret_cast<float*>(&mData); // NOLINT
|
f013c0d5 |
return ra[0] + ra[1] + ra[2] + ra[3];
}
|
5fd1ddd4 |
#else
|
f013c0d5 |
float scalar()
{
return mData;
}
#endif
|
5fd1ddd4 |
};
} // namespace simd
} // namespace gaps
|
1d3f54e6 |
// Normalize the alternative 64-bit x86 spellings (_M_AMD64, _M_X64, __amd64)
// to __x86_64__ when the compiler has not already defined it.
#if !defined(__x86_64__)
    #if defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)
        #define __x86_64__ 1
    #endif
#endif

// Project-level switch that mirrors the compiler's OpenMP flag.
#if defined(_OPENMP)
    #define __GAPS_OPENMP__
#endif
|
5fd1ddd4 |
#endif // __COGAPS_SIMD_H__
|
f013c0d5 |
|