
Use sse2neon.h for SSE support on Linux ARM64

Uses https://github.com/DLTcollab/sse2neon/

Fixes #4

Signed-off-by: Martin Tzvetanov Grigorov <mgrigorov@apache.org>

Martin Tzvetanov Grigorov authored on 17/02/2023 13:58:46 • Michael Lawrence committed on 17/02/2023 21:23:18
Showing 2 changed files
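
For context (an editorial illustration, not part of the commit): with the guarded include added below, the same SSE source compiles on x86_64 via <smmintrin.h> and on Linux ARM64 via sse2neon.h, which maps each SSE intrinsic onto a NEON equivalent. A minimal, hypothetical sketch of the pattern (variable names are illustrative, not from the gmap/bamtally sources):

    /* Hypothetical example only; assumes the sse2neon.h header from
     * https://github.com/DLTcollab/sse2neon/ is on the include path. */
    #ifdef HAVE_SSE4_1
    #include <smmintrin.h>
    #endif
    #ifdef __ARM_NEON
    #include "sse2neon.h"
    #endif

    #include <stdio.h>

    int main(void)
    {
        __m128i a = _mm_set_epi32(1, 2, 3, 4);
        __m128i b = _mm_set1_epi32(10);
        /* On NEON this add is implemented with vaddq_s32 by sse2neon. */
        __m128i sum = _mm_add_epi32(a, b);

        int out[4];
        _mm_storeu_si128((__m128i *) out, sum);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        return 0;
    }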

@@ -21,6 +21,9 @@ static char rcsid[] = "$Id: bamtally.c 219300 2019-05-21 18:32:50Z twu $";
21 21
 #ifdef HAVE_SSE4_1
22 22
 #include <smmintrin.h>
23 23
 #endif
24
+#ifdef __ARM_NEON
25
+#include "sse2neon.h"
26
+#endif
24 27
 
25 28
 #include "except.h"
26 29
 #include "assert.h"
27 30
new file mode 100644
@@ -0,0 +1,9072 @@
1
+#ifndef SSE2NEON_H
2
+#define SSE2NEON_H
3
+
4
+// This header file provides a simple API translation layer
5
+// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
6
+//
7
+// Contributors to this work are:
8
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
9
+//   Brandon Rowlett <browlett@nvidia.com>
10
+//   Ken Fast <kfast@gdeb.com>
11
+//   Eric van Beurden <evanbeurden@nvidia.com>
12
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
13
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
14
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
15
+//   Mark Cheng <marktwtn@gmail.com>
16
+//   Malcolm James MacLeod <malcolm@gulden.com>
17
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
18
+//   Sebastian Pop <spop@amazon.com>
19
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
20
+//   Danila Kutenin <danilak@google.com>
21
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
22
+//   Pei-Hsuan Hung <afcidk@gmail.com>
23
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
24
+//   Syoyo Fujita <syoyo@lighttransport.com>
25
+//   Brecht Van Lommel <brecht@blender.org>
26
+//   Jonathan Hue <jhue@adobe.com>
27
+//   Cuda Chen <clh960524@gmail.com>
28
+//   Aymen Qader <aymen.qader@arm.com>
29
+
30
+/*
31
+ * sse2neon is freely redistributable under the MIT License.
32
+ *
33
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
34
+ * of this software and associated documentation files (the "Software"), to deal
35
+ * in the Software without restriction, including without limitation the rights
36
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
37
+ * copies of the Software, and to permit persons to whom the Software is
38
+ * furnished to do so, subject to the following conditions:
39
+ *
40
+ * The above copyright notice and this permission notice shall be included in
41
+ * all copies or substantial portions of the Software.
42
+ *
43
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
44
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
45
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
48
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
49
+ * SOFTWARE.
50
+ */
51
+
52
+/* Tunable configurations */
53
+
54
+/* Enable precise implementation of math operations
55
+ * This would slow down the computation a bit, but gives consistent result with
56
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
57
+ */
58
+/* _mm_min|max_ps|ss|pd|sd */
59
+#ifndef SSE2NEON_PRECISE_MINMAX
60
+#define SSE2NEON_PRECISE_MINMAX (0)
61
+#endif
62
+/* _mm_rcp_ps and _mm_div_ps */
63
+#ifndef SSE2NEON_PRECISE_DIV
64
+#define SSE2NEON_PRECISE_DIV (0)
65
+#endif
66
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
67
+#ifndef SSE2NEON_PRECISE_SQRT
68
+#define SSE2NEON_PRECISE_SQRT (0)
69
+#endif
70
+/* _mm_dp_pd */
71
+#ifndef SSE2NEON_PRECISE_DP
72
+#define SSE2NEON_PRECISE_DP (0)
73
+#endif
74
+
75
+/* compiler specific definitions */
76
+#if defined(__GNUC__) || defined(__clang__)
77
+#pragma push_macro("FORCE_INLINE")
78
+#pragma push_macro("ALIGN_STRUCT")
79
+#define FORCE_INLINE static inline __attribute__((always_inline))
80
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
81
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
82
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
83
+#else /* non-GNU / non-clang compilers */
84
+#warning "Macro name collisions may happen with unsupported compiler."
85
+#ifndef FORCE_INLINE
86
+#define FORCE_INLINE static inline
87
+#endif
88
+#ifndef ALIGN_STRUCT
89
+#define ALIGN_STRUCT(x) __declspec(align(x))
90
+#endif
91
+#define _sse2neon_likely(x) (x)
92
+#define _sse2neon_unlikely(x) (x)
93
+#endif
94
+
95
+/* C language does not allow initializing a variable with a function call. */
96
+#ifdef __cplusplus
97
+#define _sse2neon_const static const
98
+#else
99
+#define _sse2neon_const const
100
+#endif
101
+
102
+#include <stdint.h>
103
+#include <stdlib.h>
104
+
105
+#if defined(_WIN32)
106
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
107
+ * from both MinGW-w64 and MSVC.
108
+ */
109
+#define SSE2NEON_ALLOC_DEFINED
110
+#endif
111
+
112
+/* If using MSVC */
113
+#ifdef _MSC_VER
114
+#include <intrin.h>
115
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
116
+    (defined(_M_ARM) || defined(__arm__))
117
+#define SSE2NEON_HAS_BITSCAN64
118
+#endif
119
+#endif
120
+
121
+/* Compiler barrier */
122
+#define SSE2NEON_BARRIER()                     \
123
+    do {                                       \
124
+        __asm__ __volatile__("" ::: "memory"); \
125
+        (void) 0;                              \
126
+    } while (0)
127
+
128
+/* Memory barriers
129
+ * __atomic_thread_fence does not include a compiler barrier; instead,
130
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
131
+ * semantics.
132
+ */
133
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
134
+#include <stdatomic.h>
135
+#endif
136
+
137
+FORCE_INLINE void _sse2neon_smp_mb(void)
138
+{
139
+    SSE2NEON_BARRIER();
140
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
141
+    !defined(__STDC_NO_ATOMICS__)
142
+    atomic_thread_fence(memory_order_seq_cst);
143
+#elif defined(__GNUC__) || defined(__clang__)
144
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
145
+#else
146
+    /* FIXME: MSVC support */
147
+#endif
148
+}
149
+
150
+/* Architecture-specific build options */
151
+/* FIXME: #pragma GCC push_options is only available on GCC */
152
+#if defined(__GNUC__)
153
+#if defined(__arm__) && __ARM_ARCH == 7
154
+/* According to ARM C Language Extensions Architecture specification,
155
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
156
+ * architecture supported.
157
+ */
158
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
159
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
160
+#endif
161
+#if !defined(__clang__)
162
+#pragma GCC push_options
163
+#pragma GCC target("fpu=neon")
164
+#endif
165
+#elif defined(__aarch64__)
166
+#if !defined(__clang__)
167
+#pragma GCC push_options
168
+#pragma GCC target("+simd")
169
+#endif
170
+#elif __ARM_ARCH == 8
171
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
172
+#error \
173
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
174
+#endif
175
+#if !defined(__clang__)
176
+#pragma GCC push_options
177
+#endif
178
+#else
179
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
180
+#endif
181
+#endif
182
+
183
+#include <arm_neon.h>
184
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
185
+#if defined __has_include && __has_include(<arm_acle.h>)
186
+#include <arm_acle.h>
187
+#endif
188
+#endif
189
+
190
+/* Apple Silicon cache lines are double the size commonly used by Intel, AMD
191
+ * and other Arm microarchitectures.
192
+ * From sysctl -a on Apple M1:
193
+ * hw.cachelinesize: 128
194
+ */
195
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
196
+#define SSE2NEON_CACHELINE_SIZE 128
197
+#else
198
+#define SSE2NEON_CACHELINE_SIZE 64
199
+#endif
200
+
201
+/* Rounding functions require either AArch64 instructions or a libm fallback */
202
+#if !defined(__aarch64__)
203
+#include <math.h>
204
+#endif
205
+
206
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
207
+ * or even not accessible in user mode.
208
+ * To write to or access these registers in user mode,
209
+ * we have to perform a syscall instead.
210
+ */
211
+#if !defined(__aarch64__)
212
+#include <sys/time.h>
213
+#endif
214
+
215
+/* "__has_builtin" can be used to query support for built-in functions
216
+ * provided by gcc/clang and other compilers that support it.
217
+ */
218
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
219
+/* Compatibility with gcc <= 9 */
220
+#if defined(__GNUC__) && (__GNUC__ <= 9)
221
+#define __has_builtin(x) HAS##x
222
+#define HAS__builtin_popcount 1
223
+#define HAS__builtin_popcountll 1
224
+
225
+// __builtin_shuffle introduced in GCC 4.7.0
226
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
227
+#define HAS__builtin_shuffle 1
228
+#else
229
+#define HAS__builtin_shuffle 0
230
+#endif
231
+
232
+#define HAS__builtin_shufflevector 0
233
+#define HAS__builtin_nontemporal_store 0
234
+#else
235
+#define __has_builtin(x) 0
236
+#endif
237
+#endif
238
+
239
+/**
240
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
241
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
242
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
243
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
244
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
245
+ * fp0 is the same for fp0 of result.
246
+ */
247
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
248
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
249
+
250
+#if __has_builtin(__builtin_shufflevector)
251
+#define _sse2neon_shuffle(type, a, b, ...) \
252
+    __builtin_shufflevector(a, b, __VA_ARGS__)
253
+#elif __has_builtin(__builtin_shuffle)
254
+#define _sse2neon_shuffle(type, a, b, ...) \
255
+    __extension__({                        \
256
+        type tmp = {__VA_ARGS__};          \
257
+        __builtin_shuffle(a, b, tmp);      \
258
+    })
259
+#endif
260
+
261
+#ifdef _sse2neon_shuffle
262
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
263
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
264
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
265
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
266
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
267
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
268
+#endif
269
+
270
+/* Rounding mode macros. */
271
+#define _MM_FROUND_TO_NEAREST_INT 0x00
272
+#define _MM_FROUND_TO_NEG_INF 0x01
273
+#define _MM_FROUND_TO_POS_INF 0x02
274
+#define _MM_FROUND_TO_ZERO 0x03
275
+#define _MM_FROUND_CUR_DIRECTION 0x04
276
+#define _MM_FROUND_NO_EXC 0x08
277
+#define _MM_FROUND_RAISE_EXC 0x00
278
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
279
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
280
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
281
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
282
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
283
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
284
+#define _MM_ROUND_NEAREST 0x0000
285
+#define _MM_ROUND_DOWN 0x2000
286
+#define _MM_ROUND_UP 0x4000
287
+#define _MM_ROUND_TOWARD_ZERO 0x6000
288
+/* Flush zero mode macros. */
289
+#define _MM_FLUSH_ZERO_MASK 0x8000
290
+#define _MM_FLUSH_ZERO_ON 0x8000
291
+#define _MM_FLUSH_ZERO_OFF 0x0000
292
+/* Denormals are zeros mode macros. */
293
+#define _MM_DENORMALS_ZERO_MASK 0x0040
294
+#define _MM_DENORMALS_ZERO_ON 0x0040
295
+#define _MM_DENORMALS_ZERO_OFF 0x0000
296
+
297
+/* indicate immediate constant argument in a given range */
298
+#define __constrange(a, b) const
299
+
300
+/* A few intrinsics accept traditional data types like ints or floats, but
301
+ * most operate on data types that are specific to SSE.
302
+ * If a vector type ends in d, it contains doubles, and if it does not have
303
+ * a suffix, it contains floats. An integer vector type can contain any type
304
+ * of integer, from chars to shorts to unsigned long longs.
305
+ */
306
+typedef int64x1_t __m64;
307
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
308
+// On ARM 32-bit architecture, the float64x2_t is not supported.
309
+// The data type __m128d should be represented in a different way for related
310
+// intrinsic conversion.
311
+#if defined(__aarch64__)
312
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
313
+#else
314
+typedef float32x4_t __m128d;
315
+#endif
316
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
317
+
318
+// __int64 is defined in the Intrinsics Guide and maps to a different data type
319
+// in different data models
320
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
321
+#if (defined(__x86_64__) || defined(__i386__))
322
+#define __int64 long long
323
+#else
324
+#define __int64 int64_t
325
+#endif
326
+#endif
327
+
328
+/* type-safe casting between types */
329
+
330
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
331
+#define vreinterpretq_m128_f32(x) (x)
332
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
333
+
334
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
335
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
336
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
337
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
338
+
339
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
340
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
341
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
342
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
343
+
344
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
345
+#define vreinterpretq_f32_m128(x) (x)
346
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
347
+
348
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
349
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
350
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
351
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
352
+
353
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
354
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
355
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
356
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
357
+
358
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
359
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
360
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
361
+#define vreinterpretq_m128i_s64(x) (x)
362
+
363
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
364
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
365
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
366
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
367
+
368
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
369
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
370
+
371
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
372
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
373
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
374
+#define vreinterpretq_s64_m128i(x) (x)
375
+
376
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
377
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
378
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
379
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
380
+
381
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
382
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
383
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
384
+#define vreinterpret_m64_s64(x) (x)
385
+
386
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
387
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
388
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
389
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
390
+
391
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
392
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
393
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
394
+
395
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
396
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
397
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
398
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
399
+
400
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
401
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
402
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
403
+#define vreinterpret_s64_m64(x) (x)
404
+
405
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
406
+
407
+#if defined(__aarch64__)
408
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
409
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
410
+
411
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
412
+
413
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
414
+#define vreinterpretq_m128d_f64(x) (x)
415
+
416
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
417
+
418
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
419
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
420
+
421
+#define vreinterpretq_f64_m128d(x) (x)
422
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
423
+#else
424
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
425
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
426
+
427
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
428
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
429
+
430
+#define vreinterpretq_m128d_f32(x) (x)
431
+
432
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
433
+
434
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
435
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
436
+
437
+#define vreinterpretq_f32_m128d(x) (x)
438
+#endif
439
+
440
+// A struct is defined in this header file called 'SIMDVec' which can be used
441
+// by applications which attempt to access the contents of an __m128 struct
442
+// directly.  It is important to note that accessing the __m128 struct directly
443
+// is considered bad coding practice by Microsoft: @see:
444
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
445
+//
446
+// However, some legacy source code may try to access the contents of an __m128
447
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
448
+// casting must be done manually by the developer, as you cannot cast or
449
+// otherwise alias the base NEON data type for intrinsic operations.
450
+//
451
+// union intended to allow direct access to an __m128 variable using the names
452
+// that the MSVC compiler provides.  This union should really only be used when
453
+// trying to access the members of the vector as integer values.  GCC/clang
454
+// allow native access to the float members through a simple array access
455
+// operator (in C since 4.6, in C++ since 4.8).
456
+//
457
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
458
+// a performance hit.  If it really is needed however, the original __m128
459
+// variable can be aliased with a pointer to this union and used to access
460
+// individual components.  The use of this union should be hidden behind a macro
461
+// that is used throughout the codebase to access the members instead of always
462
+// declaring this type of variable.
463
+typedef union ALIGN_STRUCT(16) SIMDVec {
464
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
465
+    int8_t m128_i8[16];    // as signed 8-bit integers.
466
+    int16_t m128_i16[8];   // as signed 16-bit integers.
467
+    int32_t m128_i32[4];   // as signed 32-bit integers.
468
+    int64_t m128_i64[2];   // as signed 64-bit integers.
469
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
470
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
471
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
472
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
473
+} SIMDVec;
474
+
475
+// casting using SIMDVec
476
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
477
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
478
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
479
+
480
+/* SSE macros */
481
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
482
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
483
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
484
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
485
+
486
+// Function declaration
487
+// SSE
488
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
489
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
490
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
491
+FORCE_INLINE __m128 _mm_set_ps1(float);
492
+FORCE_INLINE __m128 _mm_setzero_ps(void);
493
+// SSE2
494
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
495
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
496
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
497
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
498
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
499
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
500
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
501
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
502
+FORCE_INLINE __m128d _mm_set_pd(double, double);
503
+FORCE_INLINE __m128i _mm_set1_epi32(int);
504
+FORCE_INLINE __m128i _mm_setzero_si128();
505
+// SSE4.1
506
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
507
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
508
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
509
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
510
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
511
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
512
+// SSE4.2
513
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
514
+
515
+/* Backwards compatibility for compilers with lack of specific type support */
516
+
517
+// Older gcc does not define vld1q_u8_x4 type
518
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
519
+    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
520
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
521
+     (__GNUC__ <= 9 && defined(__aarch64__)))
522
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
523
+{
524
+    uint8x16x4_t ret;
525
+    ret.val[0] = vld1q_u8(p + 0);
526
+    ret.val[1] = vld1q_u8(p + 16);
527
+    ret.val[2] = vld1q_u8(p + 32);
528
+    ret.val[3] = vld1q_u8(p + 48);
529
+    return ret;
530
+}
531
+#else
532
+// Wraps vld1q_u8_x4
533
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
534
+{
535
+    return vld1q_u8_x4(p);
536
+}
537
+#endif
538
+
539
+#if !defined(__aarch64__)
540
+/* emulate vaddv u8 variant */
541
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
542
+{
543
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
544
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
545
+}
546
+#else
547
+// Wraps vaddv_u8
548
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
549
+{
550
+    return vaddv_u8(v8);
551
+}
552
+#endif
553
+
554
+#if !defined(__aarch64__)
555
+/* emulate vaddvq u8 variant */
556
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
557
+{
558
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
559
+    uint8_t res = 0;
560
+    for (int i = 0; i < 8; ++i)
561
+        res += tmp[i];
562
+    return res;
563
+}
564
+#else
565
+// Wraps vaddvq_u8
566
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
567
+{
568
+    return vaddvq_u8(a);
569
+}
570
+#endif
571
+
572
+#if !defined(__aarch64__)
573
+/* emulate vaddvq u16 variant */
574
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
575
+{
576
+    uint32x4_t m = vpaddlq_u16(a);
577
+    uint64x2_t n = vpaddlq_u32(m);
578
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
579
+
580
+    return vget_lane_u32((uint32x2_t) o, 0);
581
+}
582
+#else
583
+// Wraps vaddvq_u16
584
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
585
+{
586
+    return vaddvq_u16(a);
587
+}
588
+#endif
589
+
590
+/* Function Naming Conventions
591
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
592
+ * intrinsic function is given as follows:
593
+ *   _mm_<name>_<data_type>
594
+ *
595
+ * The parts of this format are given as follows:
596
+ * 1. <name> describes the operation performed by the intrinsic
597
+ * 2. <data_type> identifies the data type of the function's primary arguments
598
+ *
599
+ * This last part, <data_type>, is a little complicated. It identifies the
600
+ * content of the input values, and can be set to any of the following values:
601
+ * + ps - vectors contain floats (ps stands for packed single-precision)
602
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
603
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
604
+ *                            signed integers
605
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
606
+ *                            unsigned integers
607
+ * + si128 - unspecified 128-bit vector or 256-bit vector
608
+ * + m128/m128i/m128d - identifies input vector types when they are different
609
+ *                      than the type of the returned vector
610
+ *
611
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
612
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
613
+ * contain floats.
614
+ *
615
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
616
+ *   // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
617
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
618
+ *   // Set packed 8-bit integers
619
+ *   // 128 bits, 16 chars, 8 bits each
620
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
621
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
622
+ *   // Shuffle packed 8-bit integers
623
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
624
+ */
625
+
626
+/* Constants for use with _mm_prefetch. */
627
+enum _mm_hint {
628
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
629
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
630
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
631
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
632
+};
633
+
634
+// The bit field mapping to the FPCR(floating-point control register)
635
+typedef struct {
636
+    uint16_t res0;
637
+    uint8_t res1 : 6;
638
+    uint8_t bit22 : 1;
639
+    uint8_t bit23 : 1;
640
+    uint8_t bit24 : 1;
641
+    uint8_t res2 : 7;
642
+#if defined(__aarch64__)
643
+    uint32_t res3;
644
+#endif
645
+} fpcr_bitfield;
646
+
647
+// Takes the upper 64 bits of a and places it in the low end of the result
648
+// Takes the lower 64 bits of b and places it into the high end of the result.
649
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
650
+{
651
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
652
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
653
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
654
+}
655
+
656
+// takes the lower two 32-bit values from a and swaps them and places in high
657
+// end of result; takes the higher two 32-bit values from b and swaps them and
658
+// places in low end of result.
659
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
660
+{
661
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
662
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
663
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
664
+}
665
+
666
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
667
+{
668
+    float32x2_t a21 = vget_high_f32(
669
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
670
+    float32x2_t b03 = vget_low_f32(
671
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
672
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
673
+}
674
+
675
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
676
+{
677
+    float32x2_t a03 = vget_low_f32(
678
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
679
+    float32x2_t b21 = vget_high_f32(
680
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
681
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
682
+}
683
+
684
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
685
+{
686
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
687
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
688
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
689
+}
690
+
691
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
692
+{
693
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
694
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
695
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
696
+}
697
+
698
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
699
+{
700
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
701
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
702
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
703
+}
704
+
705
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
706
+// high
707
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
708
+{
709
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
710
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
711
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
712
+}
713
+
714
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
715
+{
716
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
717
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
718
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
719
+}
720
+
721
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
722
+{
723
+    float32x2_t a22 =
724
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
725
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
726
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
727
+}
728
+
729
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
730
+{
731
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
732
+    float32x2_t b22 =
733
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
734
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
735
+}
736
+
737
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
738
+{
739
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
740
+    float32x2_t a22 =
741
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
742
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
743
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
744
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
745
+}
746
+
747
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
748
+{
749
+    float32x2_t a33 =
750
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
751
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
752
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
753
+}
754
+
755
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
756
+{
757
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
758
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
759
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
760
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
761
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
762
+}
763
+
764
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
765
+{
766
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
767
+    float32_t b2 = vgetq_lane_f32(b, 2);
768
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
769
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
770
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
771
+}
772
+
773
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
774
+{
775
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
776
+    float32_t b2 = vgetq_lane_f32(b, 2);
777
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
778
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
779
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
780
+}
781
+
782
+// Kahan summation for accurate summation of floating-point numbers.
783
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
784
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
785
+{
786
+    y -= *c;
787
+    float t = *sum + y;
788
+    *c = (t - *sum) - y;
789
+    *sum = t;
790
+}
791
+
792
+#if defined(__ARM_FEATURE_CRYPTO) && \
793
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
794
+// Wraps vmull_p64
795
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
796
+{
797
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
798
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
799
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
800
+}
801
+#else  // ARMv7 polyfill
802
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
803
+//
804
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
805
+// 64-bit->128-bit polynomial multiply.
806
+//
807
+// It needs some work and is somewhat slow, but it is still faster than all
808
+// known scalar methods.
809
+//
810
+// Algorithm adapted to C from
811
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
812
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
813
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
814
+// (https://hal.inria.fr/hal-01506572)
815
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
816
+{
817
+    poly8x8_t a = vreinterpret_p8_u64(_a);
818
+    poly8x8_t b = vreinterpret_p8_u64(_b);
819
+
820
+    // Masks
821
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
822
+                                    vcreate_u8(0x00000000ffffffff));
823
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
824
+                                    vcreate_u8(0x0000000000000000));
825
+
826
+    // Do the multiplies, rotating with vext to get all combinations
827
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
828
+    uint8x16_t e =
829
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
830
+    uint8x16_t f =
831
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
832
+    uint8x16_t g =
833
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
834
+    uint8x16_t h =
835
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
836
+    uint8x16_t i =
837
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
838
+    uint8x16_t j =
839
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
840
+    uint8x16_t k =
841
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
842
+
843
+    // Add cross products
844
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
845
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
846
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
847
+
848
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
849
+    // instructions.
850
+#if defined(__aarch64__)
851
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
852
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
853
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
854
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
855
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
856
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
857
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
858
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
859
+#else
860
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
861
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
862
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
863
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
864
+#endif
865
+    // t0 = (L) (P0 + P1) << 8
866
+    // t1 = (M) (P2 + P3) << 16
867
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
868
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
869
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
870
+
871
+    // t2 = (N) (P4 + P5) << 24
872
+    // t3 = (K) (P6 + P7) << 32
873
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
874
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
875
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
876
+
877
+    // De-interleave
878
+#if defined(__aarch64__)
879
+    uint8x16_t t0 = vreinterpretq_u8_u64(
880
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
881
+    uint8x16_t t1 = vreinterpretq_u8_u64(
882
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
883
+    uint8x16_t t2 = vreinterpretq_u8_u64(
884
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
885
+    uint8x16_t t3 = vreinterpretq_u8_u64(
886
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
887
+#else
888
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
889
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
890
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
891
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
892
+#endif
893
+    // Shift the cross products
894
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
895
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
896
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
897
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
898
+
899
+    // Accumulate the products
900
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
901
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
902
+    uint8x16_t mix = veorq_u8(d, cross1);
903
+    uint8x16_t r = veorq_u8(mix, cross2);
904
+    return vreinterpretq_u64_u8(r);
905
+}
906
+#endif  // ARMv7 polyfill
907
+
908
+// C equivalent:
909
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
910
+//                                     __constrange(0, 255) int imm) {
911
+//       __m128i ret;
912
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
913
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
914
+//       return ret;
915
+//   }
916
+#define _mm_shuffle_epi32_default(a, imm)                                   \
917
+    __extension__({                                                         \
918
+        int32x4_t ret;                                                      \
919
+        ret = vmovq_n_s32(                                                  \
920
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
921
+        ret = vsetq_lane_s32(                                               \
922
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
923
+            ret, 1);                                                        \
924
+        ret = vsetq_lane_s32(                                               \
925
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
926
+            ret, 2);                                                        \
927
+        ret = vsetq_lane_s32(                                               \
928
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
929
+            ret, 3);                                                        \
930
+        vreinterpretq_m128i_s32(ret);                                       \
931
+    })
932
+
933
+// Takes the upper 64 bits of a and places it in the low end of the result
934
+// Takes the lower 64 bits of a and places it into the high end of the result.
935
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
936
+{
937
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
938
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
939
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
940
+}
941
+
942
+// takes the lower two 32-bit values from a and swaps them and places in low end
943
+// of result; takes the higher two 32-bit values from a and swaps them and places
944
+// in high end of result.
945
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
946
+{
947
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
948
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
949
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
950
+}
951
+
952
+// rotates the least significant 32 bits into the most significant 32 bits, and
953
+// shifts the rest down
954
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
955
+{
956
+    return vreinterpretq_m128i_s32(
957
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
958
+}
959
+
960
+// rotates the most significant 32 bits into the least significant 32 bits, and
961
+// shifts the rest up
962
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
963
+{
964
+    return vreinterpretq_m128i_s32(
965
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
966
+}
967
+
968
+// gets the lower 64 bits of a, and places it in the upper 64 bits
969
+// gets the lower 64 bits of a and places it in the lower 64 bits
970
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
971
+{
972
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
973
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
974
+}
975
+
976
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
977
+// lower 64 bits; gets the lower 64 bits of a, and places it in the upper 64 bits
978
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
979
+{
980
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
981
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
982
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
983
+}
984
+
985
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
986
+// upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements, and
987
+// places it in the lower 64 bits
988
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
989
+{
990
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
991
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
992
+}
993
+
994
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
995
+{
996
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
997
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
998
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
999
+}
1000
+
1001
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1002
+{
1003
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1004
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1005
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1006
+}
1007
+
1008
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1009
+{
1010
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1011
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1012
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1013
+}
1014
+
1015
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1016
+// int imm)
1017
+#if defined(__aarch64__)
1018
+#define _mm_shuffle_epi32_splat(a, imm)                          \
1019
+    __extension__({                                              \
1020
+        vreinterpretq_m128i_s32(                                 \
1021
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1022
+    })
1023
+#else
1024
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
1025
+    __extension__({                                                          \
1026
+        vreinterpretq_m128i_s32(                                             \
1027
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1028
+    })
1029
+#endif
1030
+
1031
+// NEON does not support a general purpose permute intrinsic.
1032
+// Shuffle single-precision (32-bit) floating-point elements in a using the
1033
+// control in imm8, and store the results in dst.
1034
+//
1035
+// C equivalent:
1036
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1037
+//                                 __constrange(0, 255) int imm) {
1038
+//       __m128 ret;
1039
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
1040
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
1041
+//       return ret;
1042
+//   }
1043
+//
1044
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
1045
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
1046
+    __extension__({                                                        \
1047
+        float32x4_t ret;                                                   \
1048
+        ret = vmovq_n_f32(                                                 \
1049
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
1050
+        ret = vsetq_lane_f32(                                              \
1051
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1052
+            ret, 1);                                                       \
1053
+        ret = vsetq_lane_f32(                                              \
1054
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1055
+            ret, 2);                                                       \
1056
+        ret = vsetq_lane_f32(                                              \
1057
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1058
+            ret, 3);                                                       \
1059
+        vreinterpretq_m128_f32(ret);                                       \
1060
+    })
1061
+
1062
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
1063
+// Store the results in the low 64 bits of dst, with the high 64 bits being
1064
+// copied from a to dst.
1065
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
1066
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
1067
+    __extension__({                                                           \
1068
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
1069
+        int16x4_t lowBits = vget_low_s16(ret);                                \
1070
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
1071
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1072
+                             1);                                              \
1073
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1074
+                             2);                                              \
1075
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1076
+                             3);                                              \
1077
+        vreinterpretq_m128i_s16(ret);                                         \
1078
+    })
1079
+
1080
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
1081
+// Store the results in the high 64 bits of dst, with the low 64 bits being
1082
+// copied from a to dst.
1083
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
1084
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
1085
+    __extension__({                                                            \
1086
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
1087
+        int16x4_t highBits = vget_high_s16(ret);                               \
1088
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
1089
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1090
+                             5);                                               \
1091
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1092
+                             6);                                               \
1093
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1094
+                             7);                                               \
1095
+        vreinterpretq_m128i_s16(ret);                                          \
1096
+    })
1097
+
1098
+/* MMX */
1099
+
1100
+// _mm_empty is a no-op on ARM.
1101
+FORCE_INLINE void _mm_empty(void) {}
1102
+
1103
+/* SSE */
1104
+
1105
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
1106
+// store the results in dst.
1107
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
1108
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
1109
+{
1110
+    return vreinterpretq_m128_f32(
1111
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1112
+}
1113
+
1114
+// Add the lower single-precision (32-bit) floating-point element in a and b,
1115
+// store the result in the lower element of dst, and copy the upper 3 packed
1116
+// elements from a to the upper elements of dst.
1117
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
1118
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1119
+{
1120
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1121
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1122
+    // the upper values in the result must be the remnants of <a>.
1123
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
1124
+}
1125
+
1126
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
1127
+// elements in a and b, and store the results in dst.
1128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
1129
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1130
+{
1131
+    return vreinterpretq_m128_s32(
1132
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1133
+}
1134
+
1135
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
1136
+// elements in a and then AND with b, and store the results in dst.
1137
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
1138
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1139
+{
1140
+    return vreinterpretq_m128_s32(
1141
+        vbicq_s32(vreinterpretq_s32_m128(b),
1142
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
1143
+}
1144
+
1145
+// Average packed unsigned 16-bit integers in a and b, and store the results in
1146
+// dst.
1147
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
1148
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1149
+{
1150
+    return vreinterpret_m64_u16(
1151
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1152
+}
1153
+
1154
+// Average packed unsigned 8-bit integers in a and b, and store the results in
1155
+// dst.
1156
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
1157
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1158
+{
1159
+    return vreinterpret_m64_u8(
1160
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1161
+}
1162
+
1163
+// Compare packed single-precision (32-bit) floating-point elements in a and b
1164
+// for equality, and store the results in dst.
1165
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
1166
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1167
+{
1168
+    return vreinterpretq_m128_u32(
1169
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1170
+}
1171
+
1172
+// Compare the lower single-precision (32-bit) floating-point elements in a and
1173
+// b for equality, store the result in the lower element of dst, and copy the
1174
+// upper 3 packed elements from a to the upper elements of dst.
1175
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
1176
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1177
+{
1178
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1179
+}
1180
+
1181
+// Compare packed single-precision (32-bit) floating-point elements in a and b
1182
+// for greater-than-or-equal, and store the results in dst.
1183
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
1184
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1185
+{
1186
+    return vreinterpretq_m128_u32(
1187
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1188
+}
1189
+
1190
+// Compare the lower single-precision (32-bit) floating-point elements in a and
1191
+// b for greater-than-or-equal, store the result in the lower element of dst,
1192
+// and copy the upper 3 packed elements from a to the upper elements of dst.
1193
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
1194
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1195
+{
1196
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1197
+}
1198
+
1199
+// Compare packed single-precision (32-bit) floating-point elements in a and b
1200
+// for greater-than, and store the results in dst.
1201
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
1202
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1203
+{
1204
+    return vreinterpretq_m128_u32(
1205
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1206
+}
1207
+
1208
+// Compare the lower single-precision (32-bit) floating-point elements in a and
1209
+// b for greater-than, store the result in the lower element of dst, and copy
1210
+// the upper 3 packed elements from a to the upper elements of dst.
1211
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
1212
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1213
+{
1214
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1215
+}
1216
+
1217
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
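Combining the >= and <= masks gives a branchless per-lane range test, a common use of these compares. A small sketch (helper name is illustrative):

static __m128 in_range_mask(__m128 v, __m128 lo, __m128 hi)
{
    /* all-ones in every lane where lo[i] <= v[i] <= hi[i], all-zeros elsewhere */
    return _mm_and_ps(_mm_cmpge_ps(v, lo), _mm_cmple_ps(v, hi));
}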
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
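NEON has no dedicated floating-point not-equal compare, so the mask is built by bitwise-inverting the vceqq_f32 result with vmvnq_u32; as on x86, a NaN lane therefore compares as not-equal. A minimal usage sketch (helper name is illustrative):

static int any_lane_differs(__m128 a, __m128 b)
{
    /* non-zero iff at least one lane of a differs from the same lane of b */
    return _mm_movemask_ps(_mm_cmpneq_ps(a, b)) != 0;
}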
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
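One subtlety shared by the cmpn* family above: Intel defines these as unordered predicates, so a NaN in either operand yields an all-ones mask, and the NEON emulation matches because vclt/vcle/vcgt/vcge treat NaN as false before the vmvnq_u32 inversion. A minimal sketch (assumes NAN from <math.h>; names are illustrative):

static void cmpn_nan_demo(void)
{
    __m128 has_nan = _mm_set1_ps(NAN);
    __m128 ones = _mm_set1_ps(1.0f);
    /* every lane is "not less than": mask == 0xF even though NaN < 1.0f is false */
    int mask = _mm_movemask_ps(_mm_cmpnlt_ps(has_nan, ones));
    (void) mask;
}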
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
+//
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON has no ordered-compare builtin, so compare a == a and
+    // b == b to detect NaN lanes (a NaN never compares equal to itself),
+    // then AND the two masks to get the final result.
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
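The ordered-compare mask is a handy NaN filter: ANDing a vector with _mm_cmpord_ps(v, v) clears exactly the NaN lanes. A small sketch (helper name is illustrative):

static __m128 zero_out_nan_lanes(__m128 v)
{
    __m128 ordered = _mm_cmpord_ps(v, v);  /* all-ones where v[i] is not NaN */
    return _mm_and_ps(v, ordered);         /* NaN lanes become +0.0f */
}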
+// Compare packed single-precision (32-bit) floating-point elements in a and