Uses https://github.com/DLTcollab/sse2neon/
Fixes #4
Signed-off-by: Martin Tzvetanov Grigorov <mgrigorov@apache.org>
new file mode 100644
@@ -0,0 +1,9072 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
+//   Mark Cheng <marktwtn@gmail.com>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
+//   Syoyo Fujita <syoyo@lighttransport.com>
+//   Brecht Van Lommel <brecht@blender.org>
+//   Jonathan Hue <jhue@adobe.com>
+//   Cuda Chen <clh960524@gmail.com>
+//   Aymen Qader <aymen.qader@arm.com>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compiler."
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM) || defined(__arm__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+/* Compiler barrier */
+#define SSE2NEON_BARRIER() \
+    do { \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0; \
+    } while (0)
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
+ * and other Arm microarchtectures use.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write or access to these registers in user mode,
+ * we have to perform syscall instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
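
A worked example of the encoding (illustrative note, not part of the patch): each argument occupies two bits, so _MM_SHUFFLE(3, 2, 1, 0) packs to 0xE4 (identity order) and _MM_SHUFFLE(0, 1, 2, 3) to 0x1B (reversed order). A minimal sketch, assuming the header is included as "sse2neon.h":

    #include "sse2neon.h" /* assumed include path */

    /* Returns the packed selector 0xE4 == (3 << 6) | (2 << 4) | (1 << 2) | 0. */
    static inline unsigned demo_mm_shuffle(void)
    {
        return _MM_SHUFFLE(3, 2, 1, 0);
    }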
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({ \
+        type tmp = {__VA_ARGS__}; \
+        __builtin_shuffle(a, b, tmp); \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
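
Illustrative sketch (not part of the patch): when the compiler provides one of the shuffle builtins, the lane indices 0-3 select from the first vector and 4-7 from the second, so reversing a vector looks like this, assuming "sse2neon.h" as the include path:

    #include <arm_neon.h>
    #include "sse2neon.h" /* assumed include path */

    /* {a3, a2, a1, a0}: indices 0-3 pick lanes of the first operand. */
    static inline int32x4_t reverse_lanes(int32x4_t a)
    {
        return vshuffleq_s32(a, a, 3, 2, 1, 0);
    }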
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
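
For orientation (illustrative, not part of the patch): the _MM_FROUND_* values combine a rounding direction with an exception-suppression bit, e.g. _MM_FROUND_FLOOR is _MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC == 0x01. A typical call through _mm_round_ps, which is declared further down in this header, might look like:

    #include "sse2neon.h" /* assumed include path */

    /* Round each lane to the nearest integer without raising exceptions. */
    static inline __m128 round_nearest(__m128 x)
    {
        return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }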
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
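
Because the SSE vector types are plain NEON typedefs here, SSE-style and native NEON code can be mixed freely; a minimal sketch (not part of the patch, include path assumed):

    #include <arm_neon.h>
    #include "sse2neon.h" /* assumed include path */

    /* A NEON value used directly where an __m128 is expected. */
    static inline __m128 splat_one(void)
    {
        return vdupq_n_f32(1.0f);
    }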
+
+// __int64 is defined in the Intrinsics Guide which maps to different datatype
+// in different data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly. It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it. Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides. This union should really only be used when
+// trying to access the members of the vector as integer values. GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit. If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components. The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
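
A short sketch of the SIMDVec-based lane access (illustrative, not part of the patch); _mm_set_epi32(e3, e2, e1, e0) places e0 in element 0, so lane 0 below reads back 1:

    #include "sse2neon.h" /* assumed include path */

    static inline uint32_t lane0_demo(void)
    {
        __m128i v = _mm_set_epi32(4, 3, 2, 1);
        return vreinterpretq_nth_u32_m128i(v, 0); /* 1 */
    }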
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) && \
+    ((__GNUC__ <= 12 && defined(__arm__)) || \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    uint32x4_t m = vpaddlq_u16(a);
+    uint64x2_t n = vpaddlq_u32(m);
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
+
+    return vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
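
Illustrative usage (not part of the patch): the ARMv7 fallbacks widen pairwise until a single scalar remains, matching AArch64's vaddv/vaddvq, so summing eight 1s yields 8:

    #include <arm_neon.h>
    #include "sse2neon.h" /* assumed include path */

    static inline uint16_t sum_of_ones(void)
    {
        return _sse2neon_vaddvq_u16(vdupq_n_u16(1)); /* 8 */
    }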
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
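
A minimal usage sketch (not part of the patch): the compensation term c starts at zero and carries the running rounding error across iterations:

    #include "sse2neon.h" /* assumed include path */

    static inline float kahan_sum(const float *x, int n)
    {
        float sum = 0.0f, c = 0.0f;
        for (int i = 0; i < n; i++)
            _sse2neon_kadd_f32(&sum, &c, x[i]);
        return sum;
    }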
+
+#if defined(__ARM_FEATURE_CRYPTO) && \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
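
Illustrative check (not part of the patch): carry-less multiplication replaces addition with XOR, so the polynomials 0b11 and 0b101 multiply to 0b1111, i.e. 3 and 5 give 15 in the low half of the 128-bit product:

    #include <arm_neon.h>
    #include "sse2neon.h" /* assumed include path */

    static inline uint64_t clmul_3_by_5(void)
    {
        uint64x2_t r = _sse2neon_vmull_p64(vcreate_u64(3), vcreate_u64(5));
        return vgetq_lane_u64(r, 0); /* 15 */
    }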
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm & 0x3];        ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm) \
+    __extension__({ \
+        int32x4_t ret; \
+        ret = vmovq_n_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
+        ret = vsetq_lane_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1); \
+        ret = vsetq_lane_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2); \
+        ret = vsetq_lane_s32( \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3); \
+        vreinterpretq_m128i_s32(ret); \
+    })
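
Worked example (not part of the patch): with imm == _MM_SHUFFLE(0, 1, 2, 3) == 0x1B the four 2-bit fields select lanes 3, 2, 1, 0, i.e. a full lane reversal:

    #include "sse2neon.h" /* assumed include path */

    static inline __m128i reverse_epi32(__m128i a)
    {
        return _mm_shuffle_epi32_default(a, _MM_SHUFFLE(0, 1, 2, 3));
    }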
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm) \
+    __extension__({ \
+        vreinterpretq_m128i_s32( \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm) \
+    __extension__({ \
+        vreinterpretq_m128i_s32( \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// NEON does not support a general purpose permute intrinsic.
+// Shuffle single-precision (32-bit) floating-point elements in a using the
+// control in imm8, and store the results in dst.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm) \
+    __extension__({ \
+        float32x4_t ret; \
+        ret = vmovq_n_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
+        ret = vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1); \
+        ret = vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2); \
+        ret = vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3); \
+        vreinterpretq_m128_f32(ret); \
+    })
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm) \
+    __extension__({ \
+        int16x8_t ret = vreinterpretq_s16_m128i(a); \
+        int16x4_t lowBits = vget_low_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3); \
+        vreinterpretq_m128i_s16(ret); \
+    })
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm) \
+    __extension__({ \
+        int16x8_t ret = vreinterpretq_s16_m128i(a); \
+        int16x4_t highBits = vget_high_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7); \
+        vreinterpretq_m128i_s16(ret); \
+    })
+
+/* MMX */
+
+//_mm_empty is a no-op on arm
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
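
Illustrative sketch (not part of the patch): only lane 0 is added, lanes 1-3 are carried over from a, so {1, 2, 3, 4} and {10, 20, 30, 40} give {11, 2, 3, 4}. _mm_setr_ps is assumed to be provided later in the full header:

    #include "sse2neon.h" /* assumed include path */

    static inline __m128 add_ss_demo(void)
    {
        __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
        __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
        return _mm_add_ss(a, b); /* {11, 2, 3, 4} */
    }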
+
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
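
Note on the operand swap (illustrative, not part of the patch): _mm_andnot_ps(a, b) computes (~a) & b, which is why the NEON "bit clear" above is written vbicq_s32(b, a). A classic use is clearing the sign bit:

    #include "sse2neon.h" /* assumed include path */

    static inline __m128 abs_ps(__m128 x)
    {
        /* -0.0f is just the sign bit, so this clears it in every lane. */
        return _mm_andnot_ps(_mm_set_ps1(-0.0f), x);
    }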
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
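
Illustrative sketch (not part of the patch): as with their SSE counterparts, these comparisons produce an all-ones or all-zero mask per lane, which composes with and/andnot/or into a branchless select:

    #include "sse2neon.h" /* assumed include path */

    /* Per-lane maximum built from a comparison mask. */
    static inline __m128 select_max(__m128 a, __m128 b)
    {
        __m128 gt = _mm_cmpgt_ps(a, b);
        return _mm_or_ps(_mm_and_ps(gt, a), _mm_andnot_ps(gt, b));
    }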
|
1171 |
+ |
|
1172 |
+// Compare the lower single-precision (32-bit) floating-point elements in a and |
|
1173 |
+// b for equality, store the result in the lower element of dst, and copy the |
|
1174 |
+// upper 3 packed elements from a to the upper elements of dst. |
|
1175 |
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss |
|
1176 |
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) |
|
1177 |
+{ |
|
1178 |
+ return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); |
|
1179 |
+} |
|
1180 |
+ |
|
1181 |
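Note (annotation): these compare wrappers, like their SSE counterparts, return per-lane bit masks (all ones or all zeros), so they compose directly with `_mm_and_ps`/`_mm_andnot_ps` for branchless selection. A hedged sketch of that pattern, using only intrinsics defined in this header; the values are illustrative:

```c
// Hedged sketch: branchless select -- keep a[i] where a[i] == b[i], zero elsewhere.
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 b = _mm_set_ps(9.0f, 3.0f, 9.0f, 1.0f);
    __m128 mask = _mm_cmpeq_ps(a, b); // all-ones lanes where equal
    float out[4];
    _mm_storeu_ps(out, _mm_and_ps(a, mask));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // expected: 1 0 3 0
    return 0;
}
```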
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
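Note (annotation, flagged as my own reading rather than an upstream statement): the not-* variants above are the bitwise NOT of the corresponding NEON compare. This appears to preserve the x86 behaviour for NaN inputs as well: the NGE/NGT/NLE/NLT predicates are true for unordered operands, and NEON compares return all-zero for NaN, which the NOT turns into all-ones. A small sketch to check one case:

```c
// Hedged sketch: _mm_cmpnlt_ps should report true for a NaN lane (unordered => "not less than").
#include <math.h>
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    __m128 a = _mm_set_ps(1.0f, 1.0f, 1.0f, NAN); // NaN in the lowest lane
    __m128 b = _mm_set1_ps(0.0f);
    int mask = _mm_movemask_ps(_mm_cmpnlt_ps(a, b)); // bit i set if lane i is all-ones
    printf("0x%x\n", mask); // expected: 0xf
    return 0;
}
```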
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
+//
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
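Note (annotation): the ordered compare is emulated with the usual self-compare trick (`x == x` is false only when `x` is NaN), then the two per-operand masks are ANDed. An illustrative check, not part of the vendored code:

```c
// Hedged sketch: _mm_cmpord_ps sets a lane only when neither input lane is NaN.
#include <math.h>
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    __m128 a = _mm_set_ps(4.0f, NAN, 2.0f, 1.0f);
    __m128 b = _mm_set_ps(NAN, 3.0f, 2.0f, 1.0f);
    int mask = _mm_movemask_ps(_mm_cmpord_ps(a, b));
    printf("0x%x\n", mask); // expected: 0x3 -- only the two lowest lanes are NaN-free
    return 0;
}
```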
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and