...
@@ -1,7 +1,11 @@
 #ifndef __COGAPS_SIMD_H__
 #define __COGAPS_SIMD_H__
 
-#if (defined ( __AVX2__ ) || defined ( __AVX__ )) && (!defined(_WIN32) || !defined(WIN32))
+#if defined(_WIN32) || defined(WIN32) || defined(__MINGW32__)
+    #define COGAPS_SIMD_H_DISABLE_SIMD
+#endif
+
+#if (defined ( __AVX2__ ) || defined ( __AVX__ )) && !defined(COGAPS_SIMD_H_DISABLE_SIMD)
 
     #define SIMD_INC 8
     #define __GAPS_AVX__
...
@@ -15,7 +19,7 @@
     #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
     #define DIV_PACKED(a,b) _mm256_div_ps(a,b)
 
-#elif (defined ( __SSE4_2__ ) || defined ( __SSE4_1__ )) && (!defined(_WIN32) || !defined(WIN32))
+#elif (defined ( __SSE4_2__ ) || defined ( __SSE4_1__ )) && !defined(COGAPS_SIMD_H_DISABLE_SIMD)
 
     #define SIMD_INC 4
     #define __GAPS_SSE__
...
@@ -132,5 +136,8 @@ inline float getScalar(gaps_packed_t pf)
 } // namespace simd
 } // namespace gaps
 
-#endif // __COGAPS_SIMD_H__
+#ifdef COGAPS_SIMD_H_DISABLE_SIMD
+    #undef COGAPS_SIMD_H_DISABLE_SIMD
+#endif
 
+#endif // __COGAPS_SIMD_H__
...
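Why this change: the old guard `(!defined(_WIN32) || !defined(WIN32))` is only false when *both* macros are defined, and MSVC predefines `_WIN32` but typically not `WIN32`, so SIMD stayed enabled on most Windows builds. Centralizing the platform test in one flag makes the intent explicit; a minimal sketch of the resulting pattern (the AVX body is elided here):

    #if defined(_WIN32) || defined(WIN32) || defined(__MINGW32__)
        #define COGAPS_SIMD_H_DISABLE_SIMD
    #endif

    #if defined(__AVX__) && !defined(COGAPS_SIMD_H_DISABLE_SIMD)
        // AVX path...
    #endif

The `#undef` before the include guard's closing `#endif` keeps the helper flag from leaking into files that include this header.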
@@ -1,7 +1,7 @@
 #ifndef __COGAPS_SIMD_H__
 #define __COGAPS_SIMD_H__
 
-#if defined ( __AVX2__ ) || defined ( __AVX__ )
+#if (defined ( __AVX2__ ) || defined ( __AVX__ )) && (!defined(_WIN32) || !defined(WIN32))
 
     #define SIMD_INC 8
     #define __GAPS_AVX__
...
@@ -15,7 +15,7 @@
     #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
     #define DIV_PACKED(a,b) _mm256_div_ps(a,b)
 
-#elif defined ( __SSE4_2__ ) || defined ( __SSE4_1__ )
+#elif (defined ( __SSE4_2__ ) || defined ( __SSE4_1__ )) && (!defined(_WIN32) || !defined(WIN32))
 
     #define SIMD_INC 4
     #define __GAPS_SSE__
...
@@ -97,21 +97,15 @@ public:
     float scalar()
     {
     #if defined( __GAPS_AVX__ )
-
         float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         mData = _mm256_hadd_ps(mData, mData);
         mData = _mm256_hadd_ps(mData, mData);
         return ra[0] + ra[4];
-
     #elif defined( __GAPS_SSE__ )
-
         float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         return ra[0] + ra[1] + ra[2] + ra[3];
-
     #else
-
         return mData;
-
     #endif
     }
 
...
@@ -123,21 +117,15 @@ private:
 inline float getScalar(gaps_packed_t pf)
 {
     #if defined( __GAPS_AVX__ )
-
         pf = _mm256_hadd_ps(pf, pf);
         pf = _mm256_hadd_ps(pf, pf);
         float* ra = reinterpret_cast<float*>(&pf); // NOLINT
         return ra[0] + ra[4];
-
     #elif defined( __GAPS_SSE__ )
-
         float* ra = reinterpret_cast<float*>(&pf); // NOLINT
         return ra[0] + ra[1] + ra[2] + ra[3];
-
     #else
-
         return pf;
-
     #endif
 }
 
...
@@ -3,6 +3,7 @@
 
 #if defined ( __AVX2__ ) || defined ( __AVX__ )
 
+    #define SIMD_INC 8
     #define __GAPS_AVX__
     #include <immintrin.h>
     typedef __m256 gaps_packed_t;
...
@@ -16,6 +17,7 @@
 
 #elif defined ( __SSE4_2__ ) || defined ( __SSE4_1__ )
 
+    #define SIMD_INC 4
     #define __GAPS_SSE__
     #include <nmmintrin.h>
     typedef __m128 gaps_packed_t;
...
@@ -30,6 +32,7 @@
 #else
 
     typedef float gaps_packed_t;
+    #define SIMD_INC 1
     #define SET_SCALAR(x) x
     #define LOAD_PACKED(x) *(x)
     #define STORE_PACKED(p,x) *(p) = (x)
...
@@ -58,13 +61,7 @@ public:
 
     static unsigned increment()
     {
-    #if defined( __GAPS_AVX__ )
-        return 8;
-    #elif defined( __GAPS_SSE__ )
-        return 4;
-    #else
-        return 1;
-    #endif
+        return SIMD_INC;
     }
 
     friend const float* operator+(const float *ptr, Index ndx);
...
@@ -82,7 +79,7 @@ class PackedFloat
 {
 public:
 
-    PackedFloat() : mData() {}
+    PackedFloat() : mData(SET_SCALAR(0.f)) {}
     explicit PackedFloat(float val) : mData(SET_SCALAR(val)) {}
 #if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ ) // avoid redefinition when gaps_packed_t == float
     explicit PackedFloat(gaps_packed_t val) : mData(val) {}
...
@@ -123,6 +120,27 @@ private:
     gaps_packed_t mData;
 };
 
+inline float getScalar(gaps_packed_t pf)
+{
+    #if defined( __GAPS_AVX__ )
+
+        pf = _mm256_hadd_ps(pf, pf);
+        pf = _mm256_hadd_ps(pf, pf);
+        float* ra = reinterpret_cast<float*>(&pf); // NOLINT
+        return ra[0] + ra[4];
+
+    #elif defined( __GAPS_SSE__ )
+
+        float* ra = reinterpret_cast<float*>(&pf); // NOLINT
+        return ra[0] + ra[1] + ra[2] + ra[3];
+
+    #else
+
+        return pf;
+
+    #endif
+}
+
 } // namespace simd
 } // namespace gaps
 
...
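With `SIMD_INC` defined once per instruction set, `Index::increment()` collapses to a single return, and the new free function `getScalar` exposes a horizontal sum without constructing a `PackedFloat`. A sketch of the intended consumption pattern — the arrays `data` and `weights`, the length `n >= SIMD_INC`, and the omitted scalar remainder loop are all assumptions, not part of this diff:

    gaps::simd::PackedFloat partial(0.f), a, b;          // per-lane partial sums
    for (gaps::simd::Index i(0); i <= n - SIMD_INC; ++i) // ++ steps by SIMD_INC floats
    {
        a.load(data + i);                                // friend operator+ applies the index
        b.load(weights + i);
        partial += a * b;
    }
    float dot = partial.scalar();                        // horizontal sum of the lanes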
@@ -84,7 +84,7 @@ public:
 
     PackedFloat() : mData() {}
     explicit PackedFloat(float val) : mData(SET_SCALAR(val)) {}
-#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
+#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ ) // avoid redefinition when gaps_packed_t == float
     explicit PackedFloat(gaps_packed_t val) : mData(val) {}
 #endif
 
...
@@ -6,7 +6,6 @@
     #define __GAPS_AVX__
     #include <immintrin.h>
     typedef __m256 gaps_packed_t;
-    const unsigned index_increment = 8;
     #define SET_SCALAR(x) _mm256_set1_ps(x)
     #define LOAD_PACKED(x) _mm256_load_ps(x)
     #define STORE_PACKED(p,x) _mm256_store_ps(p,x)
...
@@ -20,7 +19,6 @@
     #define __GAPS_SSE__
     #include <nmmintrin.h>
     typedef __m128 gaps_packed_t;
-    const unsigned index_increment = 4;
     #define SET_SCALAR(x) _mm_set1_ps(x)
     #define LOAD_PACKED(x) _mm_load_ps(x)
     #define STORE_PACKED(p,x) _mm_store_ps(p,x)
...
@@ -32,7 +30,6 @@
 #else
 
     typedef float gaps_packed_t;
-    const unsigned index_increment = 1;
     #define SET_SCALAR(x) x
     #define LOAD_PACKED(x) *(x)
     #define STORE_PACKED(p,x) *(p) = (x)
...
@@ -56,9 +53,19 @@ public:
     Index& operator=(unsigned val) { index = val; return *this; }
     bool operator<(unsigned comp) const { return index < comp; }
     bool operator<=(unsigned comp) const { return index <= comp; }
-    void operator++() { index += index_increment; }
+    void operator++() { index += gaps::simd::Index::increment(); }
     unsigned value() const { return index; }
-    unsigned increment() const { return index_increment; }
+
+    static unsigned increment()
+    {
+    #if defined( __GAPS_AVX__ )
+        return 8;
+    #elif defined( __GAPS_SSE__ )
+        return 4;
+    #else
+        return 1;
+    #endif
+    }
 
     friend const float* operator+(const float *ptr, Index ndx);
     friend float* operator+(float *ptr, Index ndx);
...
@@ -1,34 +1,10 @@
 #ifndef __COGAPS_SIMD_H__
 #define __COGAPS_SIMD_H__
 
-#ifndef SSE_INSTR_SET
-    #if defined ( __AVX2__ )
-        #define SSE_INSTR_SET 8
-    #elif defined ( __AVX__ )
-        #define SSE_INSTR_SET 7
-    #elif defined ( __SSE4_2__ )
-        #define SSE_INSTR_SET 6
-    #elif defined ( __SSE4_1__ )
-        #define SSE_INSTR_SET 5
-    #else
-        #define SSE_INSTR_SET 0
-    #endif
-#endif
+#if defined ( __AVX2__ ) || defined ( __AVX__ )
 
-#if SSE_INSTR_SET > 6
     #define __GAPS_AVX__
     #include <immintrin.h>
-#elif SSE_INSTR_SET == 6 || SSE_INSTR_SET == 5
-    #define __GAPS_SSE__
-    #include <nmmintrin.h>
-#endif
-
-namespace gaps
-{
-namespace simd
-{
-
-#if defined( __GAPS_AVX__ )
     typedef __m256 gaps_packed_t;
     const unsigned index_increment = 8;
     #define SET_SCALAR(x) _mm256_set1_ps(x)
...
@@ -38,7 +14,11 @@ namespace simd
     #define SUB_PACKED(a,b) _mm256_sub_ps(a,b)
     #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
     #define DIV_PACKED(a,b) _mm256_div_ps(a,b)
-#elif defined( __GAPS_SSE__ )
+
+#elif defined ( __SSE4_2__ ) || defined ( __SSE4_1__ )
+
+    #define __GAPS_SSE__
+    #include <nmmintrin.h>
     typedef __m128 gaps_packed_t;
     const unsigned index_increment = 4;
     #define SET_SCALAR(x) _mm_set1_ps(x)
...
@@ -48,7 +28,9 @@ namespace simd
     #define SUB_PACKED(a,b) _mm_sub_ps(a,b)
     #define MUL_PACKED(a,b) _mm_mul_ps(a,b)
     #define DIV_PACKED(a,b) _mm_div_ps(a,b)
+
 #else
+
     typedef float gaps_packed_t;
     const unsigned index_increment = 1;
     #define SET_SCALAR(x) x
...
@@ -58,73 +40,80 @@ namespace simd
     #define SUB_PACKED(a,b) ((a)-(b))
     #define MUL_PACKED(a,b) ((a)*(b))
     #define DIV_PACKED(a,b) ((a)/(b))
+
 #endif
 
-class Index
+namespace gaps
+{
+namespace simd
 {
-private:
-
-    unsigned index;
 
+class Index
+{
 public:
 
     explicit Index(unsigned i) : index(i) {}
     Index& operator=(unsigned val) { index = val; return *this; }
-    bool operator<(unsigned comp) { return index < comp; }
-    bool operator<=(unsigned comp) { return index <= comp; }
+    bool operator<(unsigned comp) const { return index < comp; }
+    bool operator<=(unsigned comp) const { return index <= comp; }
     void operator++() { index += index_increment; }
     unsigned value() const { return index; }
     unsigned increment() const { return index_increment; }
+
     friend const float* operator+(const float *ptr, Index ndx);
     friend float* operator+(float *ptr, Index ndx);
+
+private:
+
+    unsigned index;
 };
 
 inline const float* operator+(const float *ptr, Index ndx) { return ptr + ndx.index; }
 inline float* operator+(float *ptr, Index ndx) { return ptr + ndx.index; }
 
-class packedFloat
+class PackedFloat
 {
-private:
-
-    gaps_packed_t mData;
-
 public:
 
-    packedFloat() : mData() {}
-    explicit packedFloat(float val) : mData(SET_SCALAR(val)) {}
-#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ ) || defined( __GAPS_AVX512__ )
-    explicit packedFloat(gaps_packed_t val) : mData(val) {}
+    PackedFloat() : mData() {}
+    explicit PackedFloat(float val) : mData(SET_SCALAR(val)) {}
+#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
+    explicit PackedFloat(gaps_packed_t val) : mData(val) {}
 #endif
 
-    packedFloat operator+(packedFloat b) const { return packedFloat(ADD_PACKED(mData, b.mData)); }
-    packedFloat operator-(packedFloat b) const { return packedFloat(SUB_PACKED(mData, b.mData)); }
-    packedFloat operator*(packedFloat b) const { return packedFloat(MUL_PACKED(mData, b.mData)); }
-    packedFloat operator/(packedFloat b) const { return packedFloat(DIV_PACKED(mData, b.mData)); }
+    PackedFloat operator+(PackedFloat b) const { return PackedFloat(ADD_PACKED(mData, b.mData)); }
+    PackedFloat operator-(PackedFloat b) const { return PackedFloat(SUB_PACKED(mData, b.mData)); }
+    PackedFloat operator*(PackedFloat b) const { return PackedFloat(MUL_PACKED(mData, b.mData)); }
+    PackedFloat operator/(PackedFloat b) const { return PackedFloat(DIV_PACKED(mData, b.mData)); }
 
-    void operator+=(packedFloat val) { mData = ADD_PACKED(mData, val.mData); }
+    void operator+=(PackedFloat val) { mData = ADD_PACKED(mData, val.mData); }
     void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
     void store(float *ptr) { STORE_PACKED(ptr, mData); }
 
-#if defined( __GAPS_AVX__ )
     float scalar()
     {
+    #if defined( __GAPS_AVX__ )
+
         float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         mData = _mm256_hadd_ps(mData, mData);
         mData = _mm256_hadd_ps(mData, mData);
         return ra[0] + ra[4];
-    }
-#elif defined( __GAPS_SSE__ )
-    float scalar()
-    {
+
+    #elif defined( __GAPS_SSE__ )
+
         float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         return ra[0] + ra[1] + ra[2] + ra[3];
-    }
-#else
-    float scalar()
-    {
+
+    #else
+
         return mData;
+
+    #endif
     }
-#endif
+
+private:
+
+    gaps_packed_t mData;
 };
 
 } // namespace simd
...
@@ -130,47 +130,5 @@ public:
 } // namespace simd
 } // namespace gaps
 
-#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && ! defined(__x86_64__)
-    #define __x86_64__ 1
-#endif
-
-#ifdef _OPENMP
-    #define __GAPS_OPENMP__
-#endif
-
-// used to convert defined macro values into strings
-#define STR_HELPER(x) #x
-#define STR(x) STR_HELPER(x)
-
-inline std::string buildReport()
-{
-#if defined( __clang__ )
-    std::string compiler = "Compiled with Clang\n";
-#elif defined( __INTEL_COMPILER )
-    std::string compiler = "Compiled with Intel ICC/ICPC\n";
-#elif defined( __GNUC__ )
-    std::string compiler = "Compiled with GCC v" + std::string(STR( __GNUC__ ))
-        + "." + std::string(STR( __GNUC_MINOR__ )) + '\n';
-#elif defined( _MSC_VER )
-    std::string compiler = "Compiled with Microsoft Visual Studio\n";
-#endif
-
-#if defined( __GAPS_AVX__ )
-    std::string simd = "SIMD: AVX instructions enabled\n";
-#elif defined( __GAPS_SSE__ )
-    std::string simd = "SIMD: SSE instructions enabled\n";
-#else
-    std::string simd = "SIMD not enabled\n";
-#endif
-
-#ifdef __GAPS_OPENMP__
-    std::string openmp = "Compiled with OpenMP\n";
-#else
-    std::string openmp = "Compiler did not support OpenMP\n";
-#endif
-
-    return compiler + simd + openmp;
-}
-
 #endif // __COGAPS_SIMD_H__
 
...
@@ -156,9 +156,9 @@ inline std::string buildReport()
 #endif
 
 #if defined( __GAPS_AVX__ )
-    std::string simd = "AVX enabled\n";
+    std::string simd = "SIMD: AVX instructions enabled\n";
 #elif defined( __GAPS_SSE__ )
-    std::string simd = "SSE enabled\n";
+    std::string simd = "SIMD: SSE instructions enabled\n";
 #else
     std::string simd = "SIMD not enabled\n";
 #endif
...
@@ -138,5 +138,39 @@ public:
     #define __GAPS_OPENMP__
 #endif
 
+// used to convert defined macro values into strings
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+inline std::string buildReport()
+{
+#if defined( __clang__ )
+    std::string compiler = "Compiled with Clang\n";
+#elif defined( __INTEL_COMPILER )
+    std::string compiler = "Compiled with Intel ICC/ICPC\n";
+#elif defined( __GNUC__ )
+    std::string compiler = "Compiled with GCC v" + std::string(STR( __GNUC__ ))
+        + "." + std::string(STR( __GNUC_MINOR__ )) + '\n';
+#elif defined( _MSC_VER )
+    std::string compiler = "Compiled with Microsoft Visual Studio\n";
+#endif
+
+#if defined( __GAPS_AVX__ )
+    std::string simd = "AVX enabled\n";
+#elif defined( __GAPS_SSE__ )
+    std::string simd = "SSE enabled\n";
+#else
+    std::string simd = "SIMD not enabled\n";
+#endif
+
+#ifdef __GAPS_OPENMP__
+    std::string openmp = "Compiled with OpenMP\n";
+#else
+    std::string openmp = "Compiler did not support OpenMP\n";
+#endif
+
+    return compiler + simd + openmp;
+}
+
 #endif // __COGAPS_SIMD_H__
 
...
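The `STR_HELPER`/`STR` pair above is the standard two-level stringification idiom: `#` stringizes its argument *before* macro expansion, so a single level would yield the literal macro name. Routing through `STR` expands the argument first:

    #define STR_HELPER(x) #x
    #define STR(x) STR_HELPER(x)
    // With GCC 9: STR(__GNUC__) -> STR_HELPER(9) -> "9"
    // but:        STR_HELPER(__GNUC__) -> "__GNUC__"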
@@ -15,7 +15,7 @@
     #endif
 #endif
 
-#if SSE_INSTR_SET == 7
+#if SSE_INSTR_SET > 6
     #define __GAPS_AVX__
     #include <immintrin.h>
 #elif SSE_INSTR_SET == 6 || SSE_INSTR_SET == 5
...
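The detection table at the top of the header sets `SSE_INSTR_SET` to 8 under `__AVX2__`, so the old `== 7` test silently skipped the AVX block for AVX2 builds; `> 6` covers both levels.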
@@ -105,14 +105,7 @@ public:
     void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
     void store(float *ptr) { STORE_PACKED(ptr, mData); }
 
-#if defined( __GAPS_AVX512__ )
-    float scalar()
-    {
-        float* ra = reinterpret_cast<float*>(&mData); // NOLINT
-        return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7] +
-            ra[8] + ra[9] + ra[10] + ra[11] + ra[12] + ra[13] + ra[14] + ra[15];
-    }
-#elif defined( __GAPS_AVX__ )
+#if defined( __GAPS_AVX__ )
     float scalar()
     {
         float* ra = reinterpret_cast<float*>(&mData);
...
@@ -116,7 +116,9 @@ public:
     float scalar()
     {
         float* ra = reinterpret_cast<float*>(&mData); // NOLINT
-        return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7];
+        mData = _mm256_hadd_ps(mData, mData);
+        mData = _mm256_hadd_ps(mData, mData);
+        return ra[0] + ra[4];
     }
 #elif defined( __GAPS_SSE__ )
     float scalar()
...
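`_mm256_hadd_ps` adds adjacent pairs within each 128-bit lane, so two passes reduce each lane to its own sum and the eight-term serial chain becomes two instructions plus one scalar add. Tracing the lanes (a worked example with arbitrary element names):

    // mData = {a, b, c, d | e, f, g, h}
    // hadd 1: {a+b, c+d, a+b, c+d | e+f, g+h, e+f, g+h}
    // hadd 2: {a+b+c+d, ..., ..., ... | e+f+g+h, ..., ..., ...}
    // so ra[0] + ra[4] == a+b+c+d+e+f+g+h

Note that `ra` is taken before the two `hadd` calls but still observes the updated values, because it aliases the storage of `mData` itself.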
@@ -108,20 +108,20 @@ public:
 #if defined( __GAPS_AVX512__ )
     float scalar()
     {
-        float* ra = reinterpret_cast<float*>(&mData);
+        float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7] +
             ra[8] + ra[9] + ra[10] + ra[11] + ra[12] + ra[13] + ra[14] + ra[15];
     }
 #elif defined( __GAPS_AVX__ )
     float scalar()
     {
-        float* ra = reinterpret_cast<float*>(&mData);
+        float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7];
     }
 #elif defined( __GAPS_SSE__ )
     float scalar()
     {
-        float* ra = reinterpret_cast<float*>(&mData);
+        float* ra = reinterpret_cast<float*>(&mData); // NOLINT
         return ra[0] + ra[1] + ra[2] + ra[3];
     }
 #else
...
@@ -1,14 +1,6 @@
 #ifndef __COGAPS_SIMD_H__
 #define __COGAPS_SIMD_H__
 
-#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && ! defined(__x86_64__)
-    #define __x86_64__ 1
-#endif
-
-#ifdef _OPENMP
-    #define __GAPS_OPENMP__
-#endif
-
 #ifndef SSE_INSTR_SET
     #if defined ( __AVX2__ )
         #define SSE_INSTR_SET 8
...
@@ -26,12 +18,9 @@
 #if SSE_INSTR_SET == 7
     #define __GAPS_AVX__
     #include <immintrin.h>
-#elif SSE_INSTR_SET == 6
+#elif SSE_INSTR_SET == 6 || SSE_INSTR_SET == 5
     #define __GAPS_SSE__
     #include <nmmintrin.h>
-#elif SSE_INSTR_SET == 5
-    #define __GAPS_SSE__
-    #include <smmintrin.h>
 #endif
 
 namespace gaps
...
@@ -44,6 +33,7 @@ namespace simd
     const unsigned index_increment = 8;
     #define SET_SCALAR(x) _mm256_set1_ps(x)
     #define LOAD_PACKED(x) _mm256_load_ps(x)
+    #define STORE_PACKED(p,x) _mm256_store_ps(p,x)
     #define ADD_PACKED(a,b) _mm256_add_ps(a,b)
     #define SUB_PACKED(a,b) _mm256_sub_ps(a,b)
     #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
...
@@ -102,7 +92,7 @@ public:
 
     packedFloat() : mData() {}
     explicit packedFloat(float val) : mData(SET_SCALAR(val)) {}
-#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
+#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ ) || defined( __GAPS_AVX512__ )
     explicit packedFloat(gaps_packed_t val) : mData(val) {}
 #endif
 
...
@@ -115,7 +105,14 @@ public:
     void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
     void store(float *ptr) { STORE_PACKED(ptr, mData); }
 
-#if defined( __GAPS_AVX__ )
+#if defined( __GAPS_AVX512__ )
+    float scalar()
+    {
+        float* ra = reinterpret_cast<float*>(&mData);
+        return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7] +
+            ra[8] + ra[9] + ra[10] + ra[11] + ra[12] + ra[13] + ra[14] + ra[15];
+    }
+#elif defined( __GAPS_AVX__ )
     float scalar()
     {
         float* ra = reinterpret_cast<float*>(&mData);
...
@@ -138,5 +135,13 @@ public:
 } // namespace simd
 } // namespace gaps
 
+#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && ! defined(__x86_64__)
+    #define __x86_64__ 1
+#endif
+
+#ifdef _OPENMP
+    #define __GAPS_OPENMP__
+#endif
+
 #endif // __COGAPS_SIMD_H__
 
...
... | ... |
@@ -5,16 +5,12 @@ |
5 | 5 |
#define __x86_64__ 1 |
6 | 6 |
#endif |
7 | 7 |
|
8 |
-#ifndef _OPENMP |
|
9 |
- #pragma message("Compiler does not support OpenMP") |
|
10 |
-#else |
|
8 |
+#ifdef _OPENMP |
|
11 | 9 |
#define __GAPS_OPENMP__ |
12 | 10 |
#endif |
13 | 11 |
|
14 | 12 |
#ifndef SSE_INSTR_SET |
15 |
- #ifndef SIMD |
|
16 |
- #define SSE_INSTR_SET 0 |
|
17 |
- #elif defined ( __AVX2__ ) |
|
13 |
+ #if defined ( __AVX2__ ) |
|
18 | 14 |
#define SSE_INSTR_SET 8 |
19 | 15 |
#elif defined ( __AVX__ ) |
20 | 16 |
#define SSE_INSTR_SET 7 |
... | ... |
@@ -23,7 +19,6 @@
     #elif defined ( __SSE4_1__ )
         #define SSE_INSTR_SET 5
     #else
-        #error "SIMD not supported"
         #define SSE_INSTR_SET 0
     #endif
 #endif
...
@@ -123,13 +123,13 @@ public:
 #if defined( __GAPS_AVX__ )
     float scalar()
     {
-        float* ra = (float*)&mData;
+        float* ra = reinterpret_cast<float*>(&mData);
         return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7];
     }
 #elif defined( __GAPS_SSE__ )
     float scalar()
     {
-        float* ra = (float*)&mData;
+        float* ra = reinterpret_cast<float*>(&mData);
         return ra[0] + ra[1] + ra[2] + ra[3];
     }
 #else
...
@@ -68,7 +68,7 @@ namespace simd
     const unsigned index_increment = 1;
     #define SET_SCALAR(x) x
     #define LOAD_PACKED(x) *(x)
-    #define STORE_PACKED(p,x) *p = x
+    #define STORE_PACKED(p,x) *(p) = (x)
     #define ADD_PACKED(a,b) ((a)+(b))
     #define SUB_PACKED(a,b) ((a)-(b))
     #define MUL_PACKED(a,b) ((a)*(b))
...
@@ -78,10 +78,13 @@ namespace simd
 class Index
 {
 private:
+
     unsigned index;
+
 public:
-    Index(unsigned i) : index(i) {}
-    void operator=(unsigned val) { index = val; }
+
+    explicit Index(unsigned i) : index(i) {}
+    Index& operator=(unsigned val) { index = val; return *this; }
     bool operator<(unsigned comp) { return index < comp; }
     bool operator<=(unsigned comp) { return index <= comp; }
     void operator++() { index += index_increment; }
...
@@ -102,16 +105,16 @@ private:
 
 public:
 
-    packedFloat() {}
-    packedFloat(float val) : mData(SET_SCALAR(val)) {}
+    packedFloat() : mData() {}
+    explicit packedFloat(float val) : mData(SET_SCALAR(val)) {}
 #if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
-    packedFloat(gaps_packed_t val) : mData(val) {}
+    explicit packedFloat(gaps_packed_t val) : mData(val) {}
 #endif
 
-    packedFloat operator+(packedFloat b) const { return ADD_PACKED(mData, b.mData); }
-    packedFloat operator-(packedFloat b) const { return SUB_PACKED(mData, b.mData); }
-    packedFloat operator*(packedFloat b) const { return MUL_PACKED(mData, b.mData); }
-    packedFloat operator/(packedFloat b) const { return DIV_PACKED(mData, b.mData); }
+    packedFloat operator+(packedFloat b) const { return packedFloat(ADD_PACKED(mData, b.mData)); }
+    packedFloat operator-(packedFloat b) const { return packedFloat(SUB_PACKED(mData, b.mData)); }
+    packedFloat operator*(packedFloat b) const { return packedFloat(MUL_PACKED(mData, b.mData)); }
+    packedFloat operator/(packedFloat b) const { return packedFloat(DIV_PACKED(mData, b.mData)); }
 
     void operator+=(packedFloat val) { mData = ADD_PACKED(mData, val.mData); }
     void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
...
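Making the constructors `explicit` is what forces the wrapped returns in the same hunk: `return ADD_PACKED(mData, b.mData);` compiled only via the implicit `gaps_packed_t`-to-`packedFloat` conversion, which `explicit` now forbids, hence `return packedFloat(ADD_PACKED(...));`.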
@@ -6,7 +6,7 @@
 #endif
 
 #ifndef _OPENMP
-    #warning "Compiler does not support OpenMP"
+    #pragma message("Compiler does not support OpenMP")
 #else
     #define __GAPS_OPENMP__
 #endif
...
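`#warning` is a GCC/Clang extension that MSVC rejects outright, while `#pragma message` is accepted by MSVC and unknown pragmas are ignored by conforming compilers, so the diagnostic degrades gracefully everywhere.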
@@ -69,10 +69,10 @@ namespace simd
     #define SET_SCALAR(x) x
     #define LOAD_PACKED(x) *(x)
     #define STORE_PACKED(p,x) *p = x
-    #define ADD_PACKED(a,b) (a+b)
-    #define SUB_PACKED(a,b) (a-b)
-    #define MUL_PACKED(a,b) (a*b)
-    #define DIV_PACKED(a,b) (a/b)
+    #define ADD_PACKED(a,b) ((a)+(b))
+    #define SUB_PACKED(a,b) ((a)-(b))
+    #define MUL_PACKED(a,b) ((a)*(b))
+    #define DIV_PACKED(a,b) ((a)/(b))
 #endif
 
 class Index
...
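Without the added parentheses the scalar fallbacks can mis-bind under operator precedence once an expression is passed as an argument. A hypothetical call, just to show the expansion:

    // before: MUL_PACKED(x + 1, y)  expands to  (x + 1*y)
    // after:  MUL_PACKED(x + 1, y)  expands to  ((x + 1)*(y))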
@@ -52,6 +52,7 @@ namespace simd
     const unsigned index_increment = 4;
     #define SET_SCALAR(x) _mm_set1_ps(x)
     #define LOAD_PACKED(x) _mm_load_ps(x)
+    #define STORE_PACKED(p,x) _mm_store_ps(p,x)
     #define ADD_PACKED(a,b) _mm_add_ps(a,b)
     #define SUB_PACKED(a,b) _mm_sub_ps(a,b)
     #define MUL_PACKED(a,b) _mm_mul_ps(a,b)
...
@@ -61,6 +62,7 @@ namespace simd
     const unsigned index_increment = 1;
     #define SET_SCALAR(x) x
     #define LOAD_PACKED(x) *(x)
+    #define STORE_PACKED(p,x) *p = x
     #define ADD_PACKED(a,b) (a+b)
     #define SUB_PACKED(a,b) (a-b)
     #define MUL_PACKED(a,b) (a*b)
...
@@ -80,9 +82,11 @@ public:
     unsigned value() const { return index; }
     unsigned increment() const { return index_increment; }
     friend const float* operator+(const float *ptr, Index ndx);
+    friend float* operator+(float *ptr, Index ndx);
 };
 
 inline const float* operator+(const float *ptr, Index ndx) { return ptr + ndx.index; }
+inline float* operator+(float *ptr, Index ndx) { return ptr + ndx.index; }
 
 class packedFloat
 {
...
@@ -105,6 +109,7 @@ public:
 
     void operator+=(packedFloat val) { mData = ADD_PACKED(mData, val.mData); }
     void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
+    void store(float *ptr) { STORE_PACKED(ptr, mData); }
 
 #if defined( __GAPS_AVX__ )
     float scalar()
...
new file mode 100644
@@ -0,0 +1,133 @@
+#ifndef __COGAPS_SIMD_H__
+#define __COGAPS_SIMD_H__
+
+#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && ! defined(__x86_64__)
+    #define __x86_64__ 1
+#endif
+
+#ifndef SSE_INSTR_SET
+    #ifndef SIMD
+        #define SSE_INSTR_SET 0
+    #elif defined ( __AVX2__ )
+        #define SSE_INSTR_SET 8
+    #elif defined ( __AVX__ )
+        #define SSE_INSTR_SET 7
+    #elif defined ( __SSE4_2__ )
+        #define SSE_INSTR_SET 6
+    #elif defined ( __SSE4_1__ )
+        #define SSE_INSTR_SET 5
+    #else
+        #error "SIMD not supported"
+        #define SSE_INSTR_SET 0
+    #endif
+#endif
+
+#if SSE_INSTR_SET == 7
+    #define __GAPS_AVX__
+    #include <immintrin.h>
+#elif SSE_INSTR_SET == 6
+    #define __GAPS_SSE__
+    #include <nmmintrin.h>
+#elif SSE_INSTR_SET == 5
+    #define __GAPS_SSE__
+    #include <smmintrin.h>
+#endif
+
+namespace gaps
+{
+namespace simd
+{
+
+#if defined( __GAPS_AVX__ )
+    typedef __m256 gaps_packed_t;
+    const unsigned index_increment = 8;
+    #define SET_SCALAR(x) _mm256_set1_ps(x)
+    #define LOAD_PACKED(x) _mm256_load_ps(x)
+    #define ADD_PACKED(a,b) _mm256_add_ps(a,b)
+    #define SUB_PACKED(a,b) _mm256_sub_ps(a,b)
+    #define MUL_PACKED(a,b) _mm256_mul_ps(a,b)
+    #define DIV_PACKED(a,b) _mm256_div_ps(a,b)
+#elif defined( __GAPS_SSE__ )
+    typedef __m128 gaps_packed_t;
+    const unsigned index_increment = 4;
+    #define SET_SCALAR(x) _mm_set1_ps(x)
+    #define LOAD_PACKED(x) _mm_load_ps(x)
+    #define ADD_PACKED(a,b) _mm_add_ps(a,b)
+    #define SUB_PACKED(a,b) _mm_sub_ps(a,b)
+    #define MUL_PACKED(a,b) _mm_mul_ps(a,b)
+    #define DIV_PACKED(a,b) _mm_div_ps(a,b)
+#else
+    typedef float gaps_packed_t;
+    const unsigned index_increment = 1;
+    #define SET_SCALAR(x) x
+    #define LOAD_PACKED(x) *(x)
+    #define ADD_PACKED(a,b) (a+b)
+    #define SUB_PACKED(a,b) (a-b)
+    #define MUL_PACKED(a,b) (a*b)
+    #define DIV_PACKED(a,b) (a/b)
+#endif
+
+class Index
+{
+private:
+    unsigned index;
+public:
+    Index(unsigned i) : index(i) {}
+    void operator=(unsigned val) { index = val; }
+    bool operator<(unsigned comp) { return index < comp; }
+    bool operator<=(unsigned comp) { return index <= comp; }
+    void operator++() { index += index_increment; }
+    unsigned value() const { return index; }
+    unsigned increment() const { return index_increment; }
+    friend const float* operator+(const float *ptr, Index ndx);
+};
+
+inline const float* operator+(const float *ptr, Index ndx) { return ptr + ndx.index; }
+
+class packedFloat
+{
+private:
+
+    gaps_packed_t mData;
+
+public:
+
+    packedFloat() {}
+    packedFloat(float val) : mData(SET_SCALAR(val)) {}
+#if defined( __GAPS_SSE__ ) || defined( __GAPS_AVX__ )
+    packedFloat(gaps_packed_t val) : mData(val) {}
+#endif
+
+    packedFloat operator+(packedFloat b) const { return ADD_PACKED(mData, b.mData); }
+    packedFloat operator-(packedFloat b) const { return SUB_PACKED(mData, b.mData); }
+    packedFloat operator*(packedFloat b) const { return MUL_PACKED(mData, b.mData); }
+    packedFloat operator/(packedFloat b) const { return DIV_PACKED(mData, b.mData); }
+
+    void operator+=(packedFloat val) { mData = ADD_PACKED(mData, val.mData); }
+    void load(const float *ptr) { mData = LOAD_PACKED(ptr); }
+
+#if defined( __GAPS_AVX__ )
+    float scalar()
+    {
+        float* ra = (float*)&mData;
+        return ra[0] + ra[1] + ra[2] + ra[3] + ra[4] + ra[5] + ra[6] + ra[7];
+    }
+#elif defined( __GAPS_SSE__ )
+    float scalar()
+    {
+        float* ra = (float*)&mData;
+        return ra[0] + ra[1] + ra[2] + ra[3];
+    }
+#else
+    float scalar()
+    {
+        return mData;
+    }
+#endif
+};
+
+} // namespace simd
+} // namespace gaps
+
+#endif // __COGAPS_SIMD_H__
+
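A caveat that holds across every revision above: `LOAD_PACKED` and `STORE_PACKED` map to the aligned `_mm_load_ps`/`_mm256_load_ps` family, which require 16-byte (SSE) or 32-byte (AVX) aligned addresses, so callers must allocate their float buffers accordingly; the unaligned `_mm_loadu_ps`/`_mm256_loadu_ps` intrinsics would be the drop-in alternative otherwise.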