diff --git a/Fix-build-error-on-x86_64.patch b/Fix-build-error-on-x86_64.patch deleted file mode 100644 index 58ec1fa..0000000 --- a/Fix-build-error-on-x86_64.patch +++ /dev/null @@ -1,25 +0,0 @@ -From f92e690190b51eb6bada384174887501f4c3f43f Mon Sep 17 00:00:00 2001 -From: wang_yue111 <648774160@qq.com> -Date: Sat, 24 Jul 2021 10:21:03 +0800 -Subject: [PATCH] fix build error on X86 - ---- - cmake/build_wrapper.sh | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh -index 1962813..895610c 100755 ---- a/cmake/build_wrapper.sh -+++ b/cmake/build_wrapper.sh -@@ -17,7 +17,7 @@ KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX) - LIBC_SO=$("$@" --print-file-name=libc.so.6) - cp ${KEEPSYMS_IN} ${KEEPSYMS} - # get all symbols from libc and turn them into patterns --nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS} -+nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS} - # build the object - "$@" - # rename the symbols in the object --- -2.23.0 - diff --git a/backport-Fix-segfaults-on-allocation-failure.patch b/backport-Fix-segfaults-on-allocation-failure.patch deleted file mode 100644 index 041081c..0000000 --- a/backport-Fix-segfaults-on-allocation-failure.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 7d644e7ba27eaadda753febf0b142faa9affbbca Mon Sep 17 00:00:00 2001 -From: hongyang7 -Date: Thu, 16 Dec 2021 19:02:17 +0800 -Subject: [PATCH] Fix segfaults on allocation failure (#4) - -Throw std::bad_alloc instead of returning nullptr from -ue2::AlignedAllocator. Allocators for STL containers are expected never -to return with an invalid pointer, and instead must throw on failure. -Violating this expectation can lead to invalid pointer dereferences. - -Co-authored-by: johanngan - -fixes github issue #317 (PR #320) ---- - src/util/alloc.h | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/src/util/alloc.h b/src/util/alloc.h -index de20c8d02..49b4a824d 100644 ---- a/src/util/alloc.h -+++ b/src/util/alloc.h -@@ -76,7 +76,11 @@ class AlignedAllocator { - - T *allocate(std::size_t size) const { - size_t alloc_size = size * sizeof(T); -- return static_cast(aligned_malloc_internal(alloc_size, N)); -+ T *ptr = static_cast(aligned_malloc_internal(alloc_size, N)); -+ if (!ptr) { -+ throw std::bad_alloc(); -+ } -+ return ptr; - } - - void deallocate(T *x, std::size_t) const noexcept { diff --git a/hyperscan-5.4.0.tar.gz b/hyperscan-5.4.0.tar.gz deleted file mode 100644 index 94a94cf..0000000 Binary files a/hyperscan-5.4.0.tar.gz and /dev/null differ diff --git a/hyperscan-5.4.1.tar.gz b/hyperscan-5.4.1.tar.gz new file mode 100644 index 0000000..d094370 Binary files /dev/null and b/hyperscan-5.4.1.tar.gz differ diff --git a/hyperscan-aarch64-support.patch b/hyperscan-aarch64-support.patch index 381e009..a35c8f1 100644 --- a/hyperscan-aarch64-support.patch +++ b/hyperscan-aarch64-support.patch @@ -1,48 +1,48 @@ -From 5f009c288718095c5cc675bfed12d7ec64237731 Mon Sep 17 00:00:00 2001 +From e95491b3a2261aecdc5576a7e507b4f4ace88cbc Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 20 Jul 2020 17:20:15 +0800 Subject: [PATCH] Add aarch64 support +Signed-off-by: Liu Zixian --- CMakeLists.txt | 108 +- cmake/config.h.in | 9 + cmake/platform.cmake | 13 +- cmake/ragel.cmake | 20 + src/crc32.c | 43 + - src/fdr/fdr.c | 136 +- + src/fdr/fdr.c | 136 ++- src/hs_valid_platform.c | 9 +- src/nfa/limex_exceptional.h | 22 +- src/nfa/limex_internal.h | 2 +- src/nfa/limex_native.c | 10 +- - src/nfa/shufti.c | 
580 ++++---- + src/nfa/shufti.c | 18 +- src/nfa/truffle.c | 10 +- - src/parser/control_verbs.cpp | 340 +++++ + src/parser/control_verbs.cpp | 340 +++++++ src/rose/counting_miracle.h | 2 +- src/util/arch.h | 11 + - src/util/cpuid_flags.c | 9 +- + src/util/cpuid_flags.c | 6 + src/util/cpuid_flags.h | 2 + src/util/cpuid_inline.h | 17 +- src/util/intrinsics.h | 12 + src/util/popcount.h | 6 +- - src/util/simd_arm.h | 1069 +++++++++++++++ - src/util/simd_types.h | 42 +- - src/util/simd_utils.h | 1389 +------------------- - src/util/simd_x86.h | 1334 +++++++++++++++++++ - src/util/state_compress.c | 42 +- + src/util/simd_arm.h | 1069 ++++++++++++++++++++ + src/util/simd_types.h | 17 + + src/util/simd_utils.h | 13 + + src/util/simd_x86.h | 10 + tools/hscollider/CMakeLists.txt | 9 +- - tools/hscollider/ColliderCorporaParser.cpp | 474 +++++++ - unit/internal/simd_utils.cpp | 128 +- + tools/hscollider/ColliderCorporaParser.cpp | 474 +++++++++ + unit/internal/simd_utils.cpp | 2 +- util/CMakeLists.txt | 8 +- - util/ExpressionParser.cpp | 397 ++++++ - 30 files changed, 4464 insertions(+), 1789 deletions(-) + util/ExpressionParser.cpp | 397 ++++++++ + 29 files changed, 2717 insertions(+), 78 deletions(-) create mode 100644 src/parser/control_verbs.cpp create mode 100644 src/util/simd_arm.h - create mode 100644 src/util/simd_x86.h + create mode 100644 src/util/simd_utils.h create mode 100644 tools/hscollider/ColliderCorporaParser.cpp create mode 100644 util/ExpressionParser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8bc6077..12a889c 100644 +index bd6d2de..8dbcb72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ include (${CMAKE_MODULE_PATH}/boost.cmake) @@ -182,7 +182,7 @@ index 8bc6077..12a889c 100644 # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) -@@ -403,12 +438,6 @@ if (CXX_IGNORED_ATTR) +@@ -415,12 +450,6 @@ if (CXX_IGNORED_ATTR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") endif() @@ -195,7 +195,7 @@ index 8bc6077..12a889c 100644 # note this for later # g++ doesn't have this flag but clang does CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES) -@@ -463,6 +492,14 @@ else() +@@ -477,6 +506,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() @@ -210,7 +210,7 @@ index 8bc6077..12a889c 100644 add_subdirectory(util) add_subdirectory(doc/dev-reference) -@@ -559,7 +596,14 @@ set_source_files_properties( +@@ -573,7 +610,14 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") @@ -747,157 +747,17 @@ index f6f5809..8998830 100644 } } diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c -index 09ffc0c..6231e61 100644 +index 09ffc0c..2cb74f0 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c -@@ -42,39 +42,36 @@ - #ifdef DEBUG - #include - --#define DUMP_MSK(_t) \ --static UNUSED \ --void dumpMsk##_t(m##_t msk) { \ -- u8 * mskAsU8 = (u8 *)&msk; \ -- for (unsigned i = 0; i < sizeof(msk); i++) { \ -- u8 c = mskAsU8[i]; \ -- for (int j = 0; j < 8; j++) { \ -- if ((c >> (7-j)) & 0x1) \ -- printf("1"); \ -- else \ -- printf("0"); \ -- } \ -- printf(" "); \ -- } \ --} \ --static UNUSED \ --void dumpMsk##_t##AsChars(m##_t msk) { \ -- u8 * mskAsU8 = (u8 *)&msk; \ -- for (unsigned i = 0; i < sizeof(msk); i++) { \ -- u8 c = mskAsU8[i]; \ -- if (isprint(c)) \ -- printf("%c",c); \ -- else \ -- printf("."); \ -- } \ --} -+#define DUMP_MSK(_t) \ -+ static UNUSED void dumpMsk##_t(m##_t 
msk) { \ -+ u8 *mskAsU8 = (u8 *)&msk; \ -+ for (unsigned i = 0; i < sizeof(msk); i++) { \ -+ u8 c = mskAsU8[i]; \ -+ for (int j = 0; j < 8; j++) { \ -+ if ((c >> (7 - j)) & 0x1) \ -+ printf("1"); \ -+ else \ -+ printf("0"); \ -+ } \ -+ printf(" "); \ -+ } \ -+ } \ -+ static UNUSED void dumpMsk##_t##AsChars(m##_t msk) { \ -+ u8 *mskAsU8 = (u8 *)&msk; \ -+ for (unsigned i = 0; i < sizeof(msk); i++) { \ -+ u8 c = mskAsU8[i]; \ -+ if (isprint(c)) \ -+ printf("%c", c); \ -+ else \ -+ printf("."); \ -+ } \ -+ } - - #endif - - /** \brief Naive byte-by-byte implementation. */ --static really_inline --const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, -- const u8 *buf_end) { -+static really_inline const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, -+ const u8 *buf, const u8 *buf_end) { - assert(buf < buf_end); - - for (; buf < buf_end; ++buf) { -@@ -87,9 +84,8 @@ const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - } - - /** \brief Naive byte-by-byte implementation. */ --static really_inline --const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, -- const u8 *buf_end) { -+static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, -+ const u8 *buf, const u8 *buf_end) { - assert(buf < buf_end); - - for (buf_end--; buf_end >= buf; buf_end--) { -@@ -111,25 +107,33 @@ DUMP_MSK(128) - #define GET_LO_4(chars) and128(chars, low4bits) - #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) - --static really_inline --u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, -- const m128 compare) { -- m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -- m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -- m128 t = and128(c_lo, c_hi); -+static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, -+ const m128 low4bits, const m128 compare) { -+ m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -+ m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -+ m128 t = and128(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk128AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk128(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk128(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk128(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk128(t); -+ printf("\n"); - #endif - return movemask128(eq128(t, compare)); - } - --static really_inline --const u8 *firstMatch(const u8 *buf, u32 z) { -+static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); -@@ -139,9 +143,9 @@ const u8 *firstMatch(const u8 *buf, u32 z) { - } - } - --static really_inline --const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, -- const m128 low4bits, const m128 zeroes) { -+static really_inline const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, -+ const u8 *buf, const m128 low4bits, -+ const m128 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -@@ -153,13 +157,13 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, +@@ -153,13 +153,13 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, 
assert(buf < buf_end); // Slow path for small cases. - if (buf_end - buf < 16) { -- return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); + if (unlikely(buf_end - buf < 16)) { -+ return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); } const m128 zeroes = zeroes128(); @@ -906,7 +766,7 @@ index 09ffc0c..6231e61 100644 const u8 *rv; size_t min = (size_t)buf % 16; -@@ -179,6 +183,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, +@@ -179,6 +179,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); @@ -918,71 +778,7 @@ index 09ffc0c..6231e61 100644 rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; -@@ -198,10 +207,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf_end; - } - --static really_inline --const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { -+static really_inline const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { - #ifdef DEBUG -- DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF("confirming match in:"); -+ dumpMsk128(t); -+ printf("\n"); - #endif - - u32 z = movemask128(eq128(t, compare)); -@@ -215,20 +225,29 @@ const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { - } - } - -- --static really_inline --const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, -- const m128 low4bits, const m128 zeroes) { -- m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -- m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -- m128 t = and128(c_lo, c_hi); -+static really_inline const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, -+ const u8 *buf, const m128 low4bits, -+ const m128 zeroes) { -+ m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -+ m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -+ m128 t = and128(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk128AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk128(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk128(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk128(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk128(t); -+ printf("\n"); - #endif - - return lastMatch(buf, t, zeroes); -@@ -241,12 +260,12 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Slow path for small cases. 
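The hunks above are the functional shufti.c changes this revision keeps: the small-buffer check gains an `unlikely()` branch hint, and the main 16-byte scan loop gains an AArch64 `prfm pldl1keep` prefetch 256 bytes ahead of the current block. A minimal sketch of both idioms, assuming GCC/Clang; `scan` and `my_unlikely` are illustrative names, and the portable `__builtin_prefetch` stands in for the patch's raw inline asm:

#if defined(__GNUC__)
#define my_unlikely(x) __builtin_expect(!!(x), 0)
#else
#define my_unlikely(x) (x)
#endif

static const unsigned char *scan(const unsigned char *buf,
                                 const unsigned char *buf_end) {
    /* branch hint on the rare small-buffer case, as in the hunk above */
    if (my_unlikely(buf_end - buf < 16)) {
        return buf_end; /* the real code falls back to shuftiFwdSlow() here */
    }
    const unsigned char *last_block = buf_end - 16;
    while (buf < last_block) {
#if defined(__aarch64__)
        /* request the cache line 256 bytes ahead, read-only, keep in L1;
         * the patch achieves the same with a raw "prfm pldl1keep" asm */
        __builtin_prefetch(buf + 256, 0, 3);
#endif
        /* ... the 16-byte SIMD block test goes here in shufti ... */
        buf += 16;
    }
    return buf_end;
}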
- if (buf_end - buf < 16) { -- return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); -+ return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); +@@ -246,7 +251,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } const m128 zeroes = zeroes128(); @@ -991,708 +787,27 @@ index 09ffc0c..6231e61 100644 const u8 *rv; assert(buf_end - buf >= 16); -@@ -283,32 +302,48 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf - 1; - } - --static really_inline --const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, -- m128 chars, const u8 *buf, const m128 low4bits, -- const m128 ones) { -+static really_inline const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, -+ m128 mask2_lo, m128 mask2_hi, -+ m128 chars, const u8 *buf, -+ const m128 low4bits, const m128 ones) { - m128 chars_lo = GET_LO_4(chars); - m128 chars_hi = GET_HI_4(chars); -- m128 c_lo = pshufb_m128(mask1_lo, chars_lo); -- m128 c_hi = pshufb_m128(mask1_hi, chars_hi); -- m128 t = or128(c_lo, c_hi); -+ m128 c_lo = pshufb_m128(mask1_lo, chars_lo); -+ m128 c_hi = pshufb_m128(mask1_hi, chars_hi); -+ m128 t = or128(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk128AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk128(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk128(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk128(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk128(t); -+ printf("\n"); - #endif - -- m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); -- m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); -- m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); -+ m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); -+ m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); -+ m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); - - #ifdef DEBUG -- DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); -- DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); -- DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); -+ DEBUG_PRINTF(" c2_lo: "); -+ dumpMsk128(c2_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c2_hi: "); -+ dumpMsk128(c2_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t2: "); -+ dumpMsk128(t2); -+ printf("\n"); - #endif - - u32 z = movemask128(eq128(t2, ones)); -@@ -316,19 +351,18 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, - return firstMatch(buf, z); - } - --const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, -- m128 mask2_lo, m128 mask2_hi, -- const u8 *buf, const u8 *buf_end) { -+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -+ m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +@@ -320,7 +325,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; - - // Preconditioning: most of the time our buffer won't be aligned. 
- m128 chars = loadu128(buf); -- rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, -- chars, buf, low4bits, ones); -+ rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, low4bits, -+ ones); - if (rv) { - return rv; - } -@@ -340,8 +374,13 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, +@@ -340,6 +345,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); -- rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, -- lchars, buf, low4bits, ones); + +#if defined(HAVE_NEON) + __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); +#endif + -+ rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, lchars, buf, -+ low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + lchars, buf, low4bits, ones); if (rv) { - return rv; - } -@@ -351,8 +390,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. - chars = loadu128(buf_end - 16); -- rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, -- chars, buf_end - 16, low4bits, ones); -+ rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf_end - 16, -+ low4bits, ones); - if (rv) { - return rv; - } -@@ -370,26 +409,34 @@ DUMP_MSK(256) - #define GET_LO_4(chars) and256(chars, low4bits) - #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) - --static really_inline --u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, -- const m256 compare) { -- m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -- m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); -+static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, -+ const m256 low4bits, const m256 compare) { -+ m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -+ m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk256AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk256(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk256(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk256(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk256(t); -+ printf("\n"); - #endif - - return movemask256(eq256(t, compare)); - } - --static really_inline --const u8 *firstMatch(const u8 *buf, u32 z) { -+static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { - DEBUG_PRINTF("z 0x%08x\n", z); - if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); -@@ -401,9 +448,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { - } - } - --static really_inline --const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, -- const m256 low4bits) { -+static really_inline const u8 * -+fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); -@@ -415,9 +461,9 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - return firstMatch(buf, z); - } - --static really_inline --const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, 
const u8 *buf, -- const u8 *buf_end, const m256 low4bits) { -+static really_inline const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, -+ const u8 *buf, const u8 *buf_end, -+ const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - m128 chars = loadu128(buf); -@@ -434,9 +480,9 @@ const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf_end; - } - --static really_inline --const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, -- const m256 low4bits, const m256 zeroes) { -+static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, -+ const u8 *buf, const m256 low4bits, -+ const m256 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -@@ -451,8 +497,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Slow path for small cases. - if (buf_end - buf < 16) { -- return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); -+ return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); - } - - const m256 low4bits = set32x8(0xf); -@@ -483,7 +529,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); -- rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); -+ rv = -+ fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); - if (rv) { - return rv; - } -@@ -494,7 +541,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); -- rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); -+ rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, -+ zeroes); - if (rv) { - return rv; - } -@@ -502,8 +550,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf_end; - } - --static really_inline --const u8 *lastMatch(const u8 *buf, u32 z) { -+static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); -@@ -513,28 +560,37 @@ const u8 *lastMatch(const u8 *buf, u32 z) { - } - } - --static really_inline --const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, -- const m256 low4bits, const m256 zeroes) { -- m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -- m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); -- m256 t = and256(c_lo, c_hi); -+static really_inline const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, -+ const u8 *buf, const m256 low4bits, -+ const m256 zeroes) { -+ m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -+ m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); -+ m256 t = and256(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk256AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk256(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk256(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk256(c_hi); -+ printf("\n"); -+ 
DEBUG_PRINTF(" t: "); -+ dumpMsk256(t); -+ printf("\n"); - #endif - - u32 z = movemask256(eq256(t, zeroes)); - return lastMatch(buf, z); - } - --static really_inline --const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, -- const m256 low4bits) { -+static really_inline const u8 * -+revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); -@@ -546,9 +602,9 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, - return lastMatch(buf, z); - } - --static really_inline --const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, -- const u8 *buf_end, const m256 low4bits) { -+static really_inline const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, -+ const u8 *buf, const u8 *buf_end, -+ const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - -@@ -566,7 +622,6 @@ const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf - 1; - } - -- - /* takes 128 bit masks, but operates on 256 bits of data */ - const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { -@@ -575,8 +630,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Slow path for small cases. - if (buf_end - buf < 16) { -- return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); -+ return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); - } - - const m256 low4bits = set32x8(0xf); -@@ -594,7 +649,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Preconditioning: most of the time our buffer won't be aligned. 
- m256 chars = loadu256(buf_end - 32); -- rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); -+ rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, -+ zeroes); - if (rv) { - return rv; - } -@@ -606,7 +662,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - while (buf_end > last_block) { - buf_end -= 32; - m256 lchars = load256(buf_end); -- rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); -+ rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, -+ zeroes); - if (rv) { - return rv; - } -@@ -623,42 +680,58 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf - 1; - } - --static really_inline --const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, -- m256 chars, const u8 *buf, const m256 low4bits, -- const m256 ones) { -+static really_inline const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, -+ m256 mask2_lo, m256 mask2_hi, -+ m256 chars, const u8 *buf, -+ const m256 low4bits, const m256 ones) { - DEBUG_PRINTF("buf %p\n", buf); - m256 chars_lo = GET_LO_4(chars); - m256 chars_hi = GET_HI_4(chars); -- m256 c_lo = pshufb_m256(mask1_lo, chars_lo); -- m256 c_hi = pshufb_m256(mask1_hi, chars_hi); -- m256 t = or256(c_lo, c_hi); -+ m256 c_lo = pshufb_m256(mask1_lo, chars_lo); -+ m256 c_hi = pshufb_m256(mask1_hi, chars_hi); -+ m256 t = or256(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk256AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk256(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk256(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk256(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk256(t); -+ printf("\n"); - #endif - -- m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); -- m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); -+ m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); -+ m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); - m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); - - #ifdef DEBUG -- DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); -- DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); -- DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); -+ DEBUG_PRINTF(" c2_lo: "); -+ dumpMsk256(c2_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c2_hi: "); -+ dumpMsk256(c2_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t2: "); -+ dumpMsk256(t2); -+ printf("\n"); - #endif - u32 z = movemask256(eq256(t2, ones)); - - return firstMatch(buf, z); - } - --static really_inline --const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, -- const m256 low4bits) { -+static really_inline const u8 *fwdBlockShort2(m256 mask1, m256 mask2, -+ m128 chars, const u8 *buf, -+ const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); -@@ -672,9 +745,10 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, - return firstMatch(buf, z); - } - --static really_inline --const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -- m128 mask2_hi, const u8 *buf, const u8 *buf_end) { -+static really_inline const u8 
*shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, -+ m128 mask2_lo, m128 mask2_hi, -+ const u8 *buf, -+ const u8 *buf_end) { - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - const m256 low4bits = set32x8(0xf); - // run shufti over two overlapping 16-byte unaligned reads -@@ -695,9 +769,8 @@ const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - } - - /* takes 128 bit masks, but operates on 256 bits of data */ --const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, -- m128 mask2_lo, m128 mask2_hi, -- const u8 *buf, const u8 *buf_end) { -+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -+ m128 mask2_hi, const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); -@@ -731,8 +804,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); -- rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, -- lchars, buf, low4bits, ones); -+ rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, -+ wide_mask2_hi, lchars, buf, low4bits, ones); - if (rv) { - return rv; - } -@@ -757,26 +830,34 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - DUMP_MSK(512) - #endif - --static really_inline --u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, -- const m512 compare) { -+static really_inline u64a block(m512 mask_lo, m512 mask_hi, m512 chars, -+ const m512 low4bits, const m512 compare) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); -- m512 c_hi = pshufb_m512(mask_hi, -- rshift64_m512(andnot512(low4bits, chars), 4)); -+ m512 c_hi = -+ pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk512AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk512(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk512(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk512(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk512(t); -+ printf("\n"); - #endif - - return eq512mask(t, compare); - } --static really_inline --const u8 *firstMatch64(const u8 *buf, u64a z) { -+static really_inline const u8 *firstMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = ctz64(~z); -@@ -788,18 +869,19 @@ const u8 *firstMatch64(const u8 *buf, u64a z) { - } - } - --static really_inline --const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, -- const m512 low4bits, const m512 zeroes) { -+static really_inline const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, -+ m512 chars, const u8 *buf, -+ const m512 low4bits, -+ const m512 zeroes) { - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch64(buf, z); - } - --static really_inline --const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, -- const u8 *buf_end, const m512 low4bits, -- const m512 zeroes) { -+static really_inline const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, -+ const u8 *buf, const u8 
*buf_end, -+ const m512 low4bits, -+ const m512 zeroes) { - DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); -@@ -877,8 +959,7 @@ done: - return buf_end; - } - --static really_inline --const u8 *lastMatch64(const u8 *buf, u64a z) { -+static really_inline const u8 *lastMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = clz64(~z); -@@ -889,10 +970,10 @@ const u8 *lastMatch64(const u8 *buf, u64a z) { - } - } - --static really_inline --const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, -- const u8 *buf_end, const m512 low4bits, -- const m512 zeroes) { -+static really_inline const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, -+ const u8 *buf, const u8 *buf_end, -+ const m512 low4bits, -+ const m512 zeroes) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); -@@ -909,20 +990,31 @@ const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - return lastMatch64(buf, z | ~k); - } - --static really_inline --const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, -- const m512 low4bits, const m512 zeroes) { -- m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); -- m512 c_hi = pshufb_m512(mask_hi, -- rshift64_m512(andnot512(low4bits, chars), 4)); -- m512 t = and512(c_lo, c_hi); -+static really_inline const u8 *revBlock512(m512 mask_lo, m512 mask_hi, -+ m512 chars, const u8 *buf, -+ const m512 low4bits, -+ const m512 zeroes) { -+ m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); -+ m512 c_hi = -+ pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); -+ m512 t = and512(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk512AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk512(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk512(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk512(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk512(t); -+ printf("\n"); - #endif - - u64a z = eq512mask(t, zeroes); -@@ -985,43 +1077,60 @@ done: - return buf - 1; - } - --static really_inline --const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, -- m512 chars, const u8 *buf, const m512 low4bits, -- const m512 ones, __mmask64 k) { -+static really_inline const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, -+ m512 mask2_lo, m512 mask2_hi, -+ m512 chars, const u8 *buf, -+ const m512 low4bits, const m512 ones, -+ __mmask64 k) { - DEBUG_PRINTF("buf %p %.64s\n", buf, buf); - m512 chars_lo = and512(chars, low4bits); - m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); -- m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); -- m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); -- m512 t = or512(c_lo, c_hi); -+ m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); -+ m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); -+ m512 t = or512(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); -- 
DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk512AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk512(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk512(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk512(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk512(t); -+ printf("\n"); - #endif - -- m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); -- m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); -+ m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); -+ m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); - m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); - - #ifdef DEBUG -- DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); -- DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); -- DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); -+ DEBUG_PRINTF(" c2_lo: "); -+ dumpMsk512(c2_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c2_hi: "); -+ dumpMsk512(c2_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t2: "); -+ dumpMsk512(t2); -+ printf("\n"); - #endif - u64a z = eq512mask(t2, ones); - - return firstMatch64(buf, z | ~k); - } - --static really_inline --const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, -- m512 mask2_hi, const u8 *buf, const u8 *buf_end, -- const m512 low4bits, const m512 ones) { -+static really_inline const u8 * -+shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, -+ const u8 *buf, const u8 *buf_end, const m512 low4bits, -+ const m512 ones) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); -@@ -1038,9 +1147,8 @@ const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, - } - - /* takes 128 bit masks, but operates on 512 bits of data */ --const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, -- m128 mask2_lo, m128 mask2_hi, -- const u8 *buf, const u8 *buf_end) { -+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -+ m128 mask2_hi, const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index be6b312..c05d778 100644 --- a/src/nfa/truffle.c @@ -2122,28 +1237,26 @@ index 985fec6..fe4a910 100644 + #endif // UTIL_ARCH_H_ diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c -index c00ce58..e0f6368 100644 +index c00ce58..96286ee 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c -@@ -39,7 +39,7 @@ - +@@ -40,6 +40,7 @@ u64a cpuid_flags(void) { u64a cap = 0; -- + +#if defined(__X86_64__) if (check_avx2()) { DEBUG_PRINTF("AVX2 enabled\n"); cap |= HS_CPU_FEATURES_AVX2; -@@ -68,7 +68,7 @@ u64a cpuid_flags(void) { +@@ -67,6 +68,7 @@ u64a cpuid_flags(void) { + #if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \ (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) cap &= ~HS_CPU_FEATURES_AVX512VBMI; - #endif -- +#endif - return cap; - } + #endif -@@ -78,6 +78,7 @@ struct family_id { + return cap; +@@ -78,6 +80,7 @@ struct family_id { u32 tune; }; @@ -2151,7 +1264,7 @@ index c00ce58..e0f6368 100644 /* from table 35-1 of the Intel 64 and IA32 Arch. 
Software Developer's Manual * and "Intel Architecture and Processor Identification With CPUID Model and * Family Numbers" */ -@@ -121,6 +122,7 @@ static const struct family_id known_microarch[] = { +@@ -121,6 +124,7 @@ static const struct family_id known_microarch[] = { { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ }; @@ -2159,7 +1272,7 @@ index c00ce58..e0f6368 100644 #ifdef DUMP_SUPPORT static UNUSED -@@ -144,6 +146,7 @@ const char *dumpTune(u32 tune) { +@@ -144,6 +148,7 @@ const char *dumpTune(u32 tune) { #endif u32 cpuid_tune(void) { @@ -2167,12 +1280,12 @@ index c00ce58..e0f6368 100644 unsigned int eax, ebx, ecx, edx; cpuid(1, 0, &eax, &ebx, &ecx, &edx); -@@ -171,6 +174,6 @@ u32 cpuid_tune(void) { +@@ -171,6 +176,7 @@ u32 cpuid_tune(void) { DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); return tune; } -- +#endif + return HS_TUNE_FAMILY_GENERIC; } diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h @@ -3380,17 +2493,11 @@ index 0000000..cce119f + +#endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h -index 962cad6..62d39ec 100644 +index 962cad6..b3f96ea 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h -@@ -30,28 +30,58 @@ - #define SIMD_TYPES_H - - #include "config.h" -+#include "ue2common.h" - #include "util/arch.h" - #include "util/intrinsics.h" --#include "ue2common.h" +@@ -35,6 +35,23 @@ + #include "ue2common.h" #if defined(HAVE_SSE2) +typedef __m128i m128; @@ -3412,2895 +2519,45 @@ index 962cad6..62d39ec 100644 + typedef __m128i m128; #else --typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; -+typedef struct ALIGN_DIRECTIVE { -+ u64a hi; -+ u64a lo; -+} m128; -+ - #endif - - #if defined(HAVE_AVX2) - typedef __m256i m256; - #else --typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; -+typedef struct ALIGN_AVX_DIRECTIVE { -+ m128 lo; -+ m128 hi; -+} m256; - #endif - --typedef struct {m128 lo; m128 mid; m128 hi;} m384; -+typedef struct { -+ m128 lo; -+ m128 mid; -+ m128 hi; -+} m384; - #if defined(HAVE_AVX512) - typedef __m512i m512; - #else --typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; -+typedef struct ALIGN_ATTR(64) { -+ m256 lo; -+ m256 hi; -+} m512; - #endif - - #endif /* SIMD_TYPES_H */ -- + typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h -index d1f060b..7e926b2 100644 ---- a/src/util/simd_utils.h +new file mode 100644 +index 0000000..9588d97 +--- /dev/null +++ b/src/util/simd_utils.h -@@ -26,1395 +26,14 @@ - * POSSIBILITY OF SUCH DAMAGE. - */ - --/** \file -- * \brief SIMD types and primitive operations. -- */ -- - #ifndef SIMD_UTILS - #define SIMD_UTILS - --#if !defined(_WIN32) && !defined(__SSSE3__) --#error SSSE3 instructions must be enabled --#endif -- --#include "config.h" --#include "ue2common.h" --#include "simd_types.h" --#include "unaligned.h" --#include "util/arch.h" --#include "util/intrinsics.h" -- --#include // for memcpy -- --// Define a common assume_aligned using an appropriate compiler built-in, if --// it's available. Note that we need to handle C or C++ compilation. --#ifdef __cplusplus --# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED --# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) --# endif --#else --# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED --# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) --# endif --#endif -- --// Fallback to identity case. 
--#ifndef assume_aligned --#define assume_aligned(x, y) (x) --#endif -- --#ifdef __cplusplus --extern "C" { --#endif --extern const char vbs_mask_data[]; --#ifdef __cplusplus --} --#endif -- --static really_inline m128 ones128(void) { --#if defined(__GNUC__) || defined(__INTEL_COMPILER) -- /* gcc gets this right */ -- return _mm_set1_epi8(0xFF); --#else -- /* trick from Intel's optimization guide to generate all-ones. -- * ICC converts this to the single cmpeq instruction */ -- return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); --#endif --} -- --static really_inline m128 zeroes128(void) { -- return _mm_setzero_si128(); --} -- --/** \brief Bitwise not for m128*/ --static really_inline m128 not128(m128 a) { -- return _mm_xor_si128(a, ones128()); --} -- --/** \brief Return 1 if a and b are different otherwise 0 */ --static really_inline int diff128(m128 a, m128 b) { -- return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); --} -- --static really_inline int isnonzero128(m128 a) { -- return !!diff128(a, zeroes128()); --} -- --/** -- * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline u32 diffrich128(m128 a, m128 b) { -- a = _mm_cmpeq_epi32(a, b); -- return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; --} -- --/** -- * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and -- * returns a 4-bit mask indicating which 64-bit words contain differences. -- */ --static really_inline u32 diffrich64_128(m128 a, m128 b) { --#if defined(HAVE_SSE41) -- a = _mm_cmpeq_epi64(a, b); -- return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; --#else -- u32 d = diffrich128(a, b); -- return (d | (d >> 1)) & 0x5; --#endif --} -- --static really_really_inline --m128 lshift64_m128(m128 a, unsigned b) { --#if defined(HAVE__BUILTIN_CONSTANT_P) -- if (__builtin_constant_p(b)) { -- return _mm_slli_epi64(a, b); -- } --#endif -- m128 x = _mm_cvtsi32_si128(b); -- return _mm_sll_epi64(a, x); --} -- --#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) --#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) --#define movemask128(a) ((u32)_mm_movemask_epi8((a))) -- --#if defined(HAVE_AVX512) --static really_inline m128 cast512to128(const m512 in) { -- return _mm512_castsi512_si128(in); --} --#endif -- --static really_inline m128 set16x8(u8 c) { -- return _mm_set1_epi8(c); --} -- --static really_inline m128 set4x32(u32 c) { -- return _mm_set1_epi32(c); --} -- --static really_inline u32 movd(const m128 in) { -- return _mm_cvtsi128_si32(in); --} -- --#if defined(HAVE_AVX512) --static really_inline u32 movd512(const m512 in) { -- // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), -- // so we use 2-step convertions to work around. -- return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); --} -- --static really_inline u64a movq512(const m512 in) { -- // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), -- // so we use 2-step convertions to work around. 
-- return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); --} --#endif -- --static really_inline u64a movq(const m128 in) { --#if defined(ARCH_X86_64) -- return _mm_cvtsi128_si64(in); --#else // 32-bit - this is horrific -- u32 lo = movd(in); -- u32 hi = movd(_mm_srli_epi64(in, 32)); -- return (u64a)hi << 32 | lo; --#endif --} -- --/* another form of movq */ --static really_inline --m128 load_m128_from_u64a(const u64a *p) { -- return _mm_set_epi64x(0LL, *p); --} -- --#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) --#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) -- --#if defined(HAVE_SSE41) --#define extract32from128(a, imm) _mm_extract_epi32(a, imm) --#define extract64from128(a, imm) _mm_extract_epi64(a, imm) --#else --#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) --#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) --#endif -- --#if !defined(HAVE_AVX2) --// TODO: this entire file needs restructuring - this carveout is awful --#define extractlow64from256(a) movq(a.lo) --#define extractlow32from256(a) movd(a.lo) --#if defined(HAVE_SSE41) --#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) --#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) --#else --#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) --#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) --#endif -- --#endif // !AVX2 -- --static really_inline m128 and128(m128 a, m128 b) { -- return _mm_and_si128(a,b); --} -- --static really_inline m128 xor128(m128 a, m128 b) { -- return _mm_xor_si128(a,b); --} -- --static really_inline m128 or128(m128 a, m128 b) { -- return _mm_or_si128(a,b); --} -- --#if defined(HAVE_AVX512VBMI) --static really_inline m512 expand128(m128 a) { -- return _mm512_broadcast_i32x4(a); --} -- --static really_inline m512 expand256(m256 a) { -- return _mm512_broadcast_i64x4(a); --} -- --static really_inline m512 expand384(m384 a) { -- u64a *lo = (u64a*)&a.lo; -- u64a *mid = (u64a*)&a.mid; -- u64a *hi = (u64a*)&a.hi; -- return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], -- lo[1], lo[0]); --} +@@ -0,0 +1,13 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++// Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
++ ++#ifndef SIMD_UTILS ++#define SIMD_UTILS ++ +#if defined(__x86_64__) +#include "simd_x86.h" +#elif defined(__aarch64__) +#include "simd_arm.h" - #endif - --static really_inline m128 andnot128(m128 a, m128 b) { -- return _mm_andnot_si128(a, b); --} -- --// aligned load --static really_inline m128 load128(const void *ptr) { -- assert(ISALIGNED_N(ptr, alignof(m128))); -- ptr = assume_aligned(ptr, 16); -- return _mm_load_si128((const m128 *)ptr); --} -- --// aligned store --static really_inline void store128(void *ptr, m128 a) { -- assert(ISALIGNED_N(ptr, alignof(m128))); -- ptr = assume_aligned(ptr, 16); -- *(m128 *)ptr = a; --} -- --// unaligned load --static really_inline m128 loadu128(const void *ptr) { -- return _mm_loadu_si128((const m128 *)ptr); --} -- --// unaligned store --static really_inline void storeu128(void *ptr, m128 a) { -- _mm_storeu_si128 ((m128 *)ptr, a); --} -- --// packed unaligned store of first N bytes --static really_inline --void storebytes128(void *ptr, m128 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m128 loadbytes128(const void *ptr, unsigned int n) { -- m128 a = zeroes128(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --#ifdef __cplusplus --extern "C" { --#endif --extern const u8 simd_onebit_masks[]; --#ifdef __cplusplus --} - #endif - --static really_inline --m128 mask1bit128(unsigned int n) { -- assert(n < sizeof(m128) * 8); -- u32 mask_idx = ((n % 8) * 64) + 95; -- mask_idx -= n / 8; -- return loadu128(&simd_onebit_masks[mask_idx]); --} -- --// switches on bit N in the given vector. --static really_inline --void setbit128(m128 *ptr, unsigned int n) { -- *ptr = or128(mask1bit128(n), *ptr); --} -- --// switches off bit N in the given vector. --static really_inline --void clearbit128(m128 *ptr, unsigned int n) { -- *ptr = andnot128(mask1bit128(n), *ptr); --} -- --// tests bit N in the given vector. 
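The new simd_utils.h above is the heart of the port: instead of an x86-only implementation, it becomes a thin dispatch header that selects simd_x86.h or simd_arm.h by target architecture, while call sites switch from raw intrinsics such as `_mm_set1_epi8(0xf)` to backend-provided wrappers like `set16x8(0xf)` (visible in the shufti.c hunks earlier). A sketch of the same idiom under assumed names — `my_m128` and `my_set16x8` are illustrative, not hyperscan's real definitions, which live in simd_types.h and the per-arch headers:

#if defined(__x86_64__)
#include <emmintrin.h>
typedef __m128i my_m128;
static inline my_m128 my_set16x8(unsigned char c) {
    /* SSE2 backend: broadcast one byte across the 128-bit lane */
    return _mm_set1_epi8((char)c);
}
#elif defined(__aarch64__)
#include <arm_neon.h>
typedef uint8x16_t my_m128;
static inline my_m128 my_set16x8(unsigned char c) {
    /* NEON backend: the same broadcast via vdupq */
    return vdupq_n_u8(c);
}
#endif

Each backend header must supply the full wrapper vocabulary (set16x8, pshufb_m128, movemask128, ...), so the scan loops compile unchanged on both architectures.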
--static really_inline --char testbit128(m128 val, unsigned int n) { -- const m128 mask = mask1bit128(n); --#if defined(HAVE_SSE41) -- return !_mm_testz_si128(mask, val); --#else -- return isnonzero128(and128(mask, val)); --#endif --} -- --// offset must be an immediate --#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) -- --static really_inline --m128 pshufb_m128(m128 a, m128 b) { -- m128 result; -- result = _mm_shuffle_epi8(a, b); -- return result; --} -- --static really_inline --m256 pshufb_m256(m256 a, m256 b) { --#if defined(HAVE_AVX2) -- return _mm256_shuffle_epi8(a, b); --#else -- m256 rv; -- rv.lo = pshufb_m128(a.lo, b.lo); -- rv.hi = pshufb_m128(a.hi, b.hi); -- return rv; --#endif --} -- --#if defined(HAVE_AVX512) --static really_inline --m512 pshufb_m512(m512 a, m512 b) { -- return _mm512_shuffle_epi8(a, b); --} -- --static really_inline --m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { -- return _mm512_maskz_shuffle_epi8(k, a, b); --} -- --#if defined(HAVE_AVX512VBMI) --#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) --#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) --#endif -- --#endif -- --static really_inline --m128 variable_byte_shift_m128(m128 in, s32 amount) { -- assert(amount >= -16 && amount <= 16); -- m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); -- return pshufb_m128(in, shift_mask); --} -- --static really_inline --m128 max_u8_m128(m128 a, m128 b) { -- return _mm_max_epu8(a, b); --} -- --static really_inline --m128 min_u8_m128(m128 a, m128 b) { -- return _mm_min_epu8(a, b); --} -- --static really_inline --m128 sadd_u8_m128(m128 a, m128 b) { -- return _mm_adds_epu8(a, b); --} -- --static really_inline --m128 sub_u8_m128(m128 a, m128 b) { -- return _mm_sub_epi8(a, b); --} -- --static really_inline --m128 set64x2(u64a hi, u64a lo) { -- return _mm_set_epi64x(hi, lo); --} -- --/**** -- **** 256-bit Primitives -- ****/ -- --#if defined(HAVE_AVX2) -- --static really_really_inline --m256 lshift64_m256(m256 a, unsigned b) { --#if defined(HAVE__BUILTIN_CONSTANT_P) -- if (__builtin_constant_p(b)) { -- return _mm256_slli_epi64(a, b); -- } --#endif -- m128 x = _mm_cvtsi32_si128(b); -- return _mm256_sll_epi64(a, x); --} -- --#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) -- --static really_inline --m256 set32x8(u32 in) { -- return _mm256_set1_epi8(in); --} -- --#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) --#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) -- --static really_inline --m256 set2x128(m128 a) { -- return _mm256_broadcastsi128_si256(a); --} -- --#else -- --static really_really_inline --m256 lshift64_m256(m256 a, int b) { -- m256 rv = a; -- rv.lo = lshift64_m128(rv.lo, b); -- rv.hi = lshift64_m128(rv.hi, b); -- return rv; --} -- --static really_inline --m256 rshift64_m256(m256 a, int b) { -- m256 rv = a; -- rv.lo = rshift64_m128(rv.lo, b); -- rv.hi = rshift64_m128(rv.hi, b); -- return rv; --} --static really_inline --m256 set32x8(u32 in) { -- m256 rv; -- rv.lo = set16x8((u8) in); -- rv.hi = rv.lo; -- return rv; --} -- --static really_inline --m256 eq256(m256 a, m256 b) { -- m256 rv; -- rv.lo = eq128(a.lo, b.lo); -- rv.hi = eq128(a.hi, b.hi); -- return rv; --} -- --static really_inline --u32 movemask256(m256 a) { -- u32 lo_mask = movemask128(a.lo); -- u32 hi_mask = movemask128(a.hi); -- return lo_mask | (hi_mask << 16); --} -- --static really_inline --m256 set2x128(m128 a) { -- m256 rv = {a, a}; -- return rv; --} --#endif -- --static really_inline m256 zeroes256(void) { --#if 
defined(HAVE_AVX2) -- return _mm256_setzero_si256(); --#else -- m256 rv = {zeroes128(), zeroes128()}; -- return rv; --#endif --} -- --static really_inline m256 ones256(void) { --#if defined(HAVE_AVX2) -- m256 rv = _mm256_set1_epi8(0xFF); --#else -- m256 rv = {ones128(), ones128()}; --#endif -- return rv; --} -- --#if defined(HAVE_AVX2) --static really_inline m256 and256(m256 a, m256 b) { -- return _mm256_and_si256(a, b); --} --#else --static really_inline m256 and256(m256 a, m256 b) { -- m256 rv; -- rv.lo = and128(a.lo, b.lo); -- rv.hi = and128(a.hi, b.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 or256(m256 a, m256 b) { -- return _mm256_or_si256(a, b); --} --#else --static really_inline m256 or256(m256 a, m256 b) { -- m256 rv; -- rv.lo = or128(a.lo, b.lo); -- rv.hi = or128(a.hi, b.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 xor256(m256 a, m256 b) { -- return _mm256_xor_si256(a, b); --} --#else --static really_inline m256 xor256(m256 a, m256 b) { -- m256 rv; -- rv.lo = xor128(a.lo, b.lo); -- rv.hi = xor128(a.hi, b.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 not256(m256 a) { -- return _mm256_xor_si256(a, ones256()); --} --#else --static really_inline m256 not256(m256 a) { -- m256 rv; -- rv.lo = not128(a.lo); -- rv.hi = not128(a.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 andnot256(m256 a, m256 b) { -- return _mm256_andnot_si256(a, b); --} --#else --static really_inline m256 andnot256(m256 a, m256 b) { -- m256 rv; -- rv.lo = andnot128(a.lo, b.lo); -- rv.hi = andnot128(a.hi, b.hi); -- return rv; --} --#endif -- --static really_inline int diff256(m256 a, m256 b) { --#if defined(HAVE_AVX2) -- return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); --#else -- return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); --#endif --} -- --static really_inline int isnonzero256(m256 a) { --#if defined(HAVE_AVX2) -- return !!diff256(a, zeroes256()); --#else -- return isnonzero128(or128(a.lo, a.hi)); --#endif --} -- --/** -- * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline u32 diffrich256(m256 a, m256 b) { --#if defined(HAVE_AVX2) -- a = _mm256_cmpeq_epi32(a, b); -- return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; --#else -- m128 z = zeroes128(); -- a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -- a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); -- return ~(_mm_movemask_epi8(packed)) & 0xff; --#endif --} -- --/** -- * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and -- * returns an 8-bit mask indicating which 64-bit words contain differences. 
-- */ --static really_inline u32 diffrich64_256(m256 a, m256 b) { -- u32 d = diffrich256(a, b); -- return (d | (d >> 1)) & 0x55555555; --} -- --// aligned load --static really_inline m256 load256(const void *ptr) { -- assert(ISALIGNED_N(ptr, alignof(m256))); --#if defined(HAVE_AVX2) -- return _mm256_load_si256((const m256 *)ptr); --#else -- m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; -- return rv; --#endif --} -- --// aligned load of 128-bit value to low and high part of 256-bit value --static really_inline m256 load2x128(const void *ptr) { --#if defined(HAVE_AVX2) -- return set2x128(load128(ptr)); --#else -- assert(ISALIGNED_N(ptr, alignof(m128))); -- m256 rv; -- rv.hi = rv.lo = load128(ptr); -- return rv; --#endif --} -- --static really_inline m256 loadu2x128(const void *ptr) { -- return set2x128(loadu128(ptr)); --} -- --// aligned store --static really_inline void store256(void *ptr, m256 a) { -- assert(ISALIGNED_N(ptr, alignof(m256))); --#if defined(HAVE_AVX2) -- _mm256_store_si256((m256 *)ptr, a); --#else -- ptr = assume_aligned(ptr, 16); -- *(m256 *)ptr = a; --#endif --} -- --// unaligned load --static really_inline m256 loadu256(const void *ptr) { --#if defined(HAVE_AVX2) -- return _mm256_loadu_si256((const m256 *)ptr); --#else -- m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; -- return rv; --#endif --} -- --// unaligned store --static really_inline void storeu256(void *ptr, m256 a) { --#if defined(HAVE_AVX2) -- _mm256_storeu_si256((m256 *)ptr, a); --#else -- storeu128(ptr, a.lo); -- storeu128((char *)ptr + 16, a.hi); --#endif --} -- --// packed unaligned store of first N bytes --static really_inline --void storebytes256(void *ptr, m256 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m256 loadbytes256(const void *ptr, unsigned int n) { -- m256 a = zeroes256(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --static really_inline --m256 mask1bit256(unsigned int n) { -- assert(n < sizeof(m256) * 8); -- u32 mask_idx = ((n % 8) * 64) + 95; -- mask_idx -= n / 8; -- return loadu256(&simd_onebit_masks[mask_idx]); --} -- --static really_inline --m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { --#if defined(HAVE_AVX2) -- return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); --#else -- m256 rv; -- rv.hi = set64x2(hi_1, hi_0); -- rv.lo = set64x2(lo_1, lo_0); -- return rv; --#endif --} -- --#if !defined(HAVE_AVX2) --// switches on bit N in the given vector. --static really_inline --void setbit256(m256 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 128; -- } -- setbit128(sub, n); --} -- --// switches off bit N in the given vector. --static really_inline --void clearbit256(m256 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 128; -- } -- clearbit128(sub, n); --} -- --// tests bit N in the given vector. 
--static really_inline --char testbit256(m256 val, unsigned int n) { -- assert(n < sizeof(val) * 8); -- m128 sub; -- if (n < 128) { -- sub = val.lo; -- } else { -- sub = val.hi; -- n -= 128; -- } -- return testbit128(sub, n); --} -- --static really_really_inline --m128 movdq_hi(m256 x) { -- return x.hi; --} -- --static really_really_inline --m128 movdq_lo(m256 x) { -- return x.lo; --} -- --static really_inline --m256 combine2x128(m128 hi, m128 lo) { -- m256 rv = {lo, hi}; -- return rv; --} -- --#else // AVX2 -- --// switches on bit N in the given vector. --static really_inline --void setbit256(m256 *ptr, unsigned int n) { -- *ptr = or256(mask1bit256(n), *ptr); --} -- --static really_inline --void clearbit256(m256 *ptr, unsigned int n) { -- *ptr = andnot256(mask1bit256(n), *ptr); --} -- --// tests bit N in the given vector. --static really_inline --char testbit256(m256 val, unsigned int n) { -- const m256 mask = mask1bit256(n); -- return !_mm256_testz_si256(mask, val); --} -- --static really_really_inline --m128 movdq_hi(m256 x) { -- return _mm256_extracti128_si256(x, 1); --} -- --static really_really_inline --m128 movdq_lo(m256 x) { -- return _mm256_extracti128_si256(x, 0); --} -- --#define cast256to128(a) _mm256_castsi256_si128(a) --#define cast128to256(a) _mm256_castsi128_si256(a) --#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) --#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) --#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) --#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) --#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) --#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) --#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) --#define extractlow32from256(a) movd(cast256to128(a)) --#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) --#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) --#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) -- --static really_inline --m256 combine2x128(m128 hi, m128 lo) { --#if defined(_mm256_set_m128i) -- return _mm256_set_m128i(hi, lo); --#else -- return insert128to256(cast128to256(lo), hi, 1); --#endif --} --#endif //AVX2 -- --#if defined(HAVE_AVX512) --#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) --#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) --#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) --#define set2x256(a) _mm512_broadcast_i64x4(a) --#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) --#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) --#endif -- --/**** -- **** 384-bit Primitives -- ****/ -- --static really_inline m384 and384(m384 a, m384 b) { -- m384 rv; -- rv.lo = and128(a.lo, b.lo); -- rv.mid = and128(a.mid, b.mid); -- rv.hi = and128(a.hi, b.hi); -- return rv; --} -- --static really_inline m384 or384(m384 a, m384 b) { -- m384 rv; -- rv.lo = or128(a.lo, b.lo); -- rv.mid = or128(a.mid, b.mid); -- rv.hi = or128(a.hi, b.hi); -- return rv; --} -- --static really_inline m384 xor384(m384 a, m384 b) { -- m384 rv; -- rv.lo = xor128(a.lo, b.lo); -- rv.mid = xor128(a.mid, b.mid); -- rv.hi = xor128(a.hi, b.hi); -- return rv; --} --static really_inline m384 not384(m384 a) { -- m384 rv; -- rv.lo = not128(a.lo); -- rv.mid = not128(a.mid); -- rv.hi = not128(a.hi); -- return rv; --} --static really_inline m384 andnot384(m384 a, m384 b) { -- m384 rv; -- rv.lo = 
andnot128(a.lo, b.lo); -- rv.mid = andnot128(a.mid, b.mid); -- rv.hi = andnot128(a.hi, b.hi); -- return rv; --} -- --static really_really_inline --m384 lshift64_m384(m384 a, unsigned b) { -- m384 rv; -- rv.lo = lshift64_m128(a.lo, b); -- rv.mid = lshift64_m128(a.mid, b); -- rv.hi = lshift64_m128(a.hi, b); -- return rv; --} -- --static really_inline m384 zeroes384(void) { -- m384 rv = {zeroes128(), zeroes128(), zeroes128()}; -- return rv; --} -- --static really_inline m384 ones384(void) { -- m384 rv = {ones128(), ones128(), ones128()}; -- return rv; --} -- --static really_inline int diff384(m384 a, m384 b) { -- return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); --} -- --static really_inline int isnonzero384(m384 a) { -- return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); --} -- --/** -- * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline u32 diffrich384(m384 a, m384 b) { -- m128 z = zeroes128(); -- a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -- a.mid = _mm_cmpeq_epi32(a.mid, b.mid); -- a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), -- _mm_packs_epi32(a.hi, z)); -- return ~(_mm_movemask_epi8(packed)) & 0xfff; --} -- --/** -- * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and -- * returns a 12-bit mask indicating which 64-bit words contain differences. -- */ --static really_inline u32 diffrich64_384(m384 a, m384 b) { -- u32 d = diffrich384(a, b); -- return (d | (d >> 1)) & 0x55555555; --} -- --// aligned load --static really_inline m384 load384(const void *ptr) { -- assert(ISALIGNED_16(ptr)); -- m384 rv = { load128(ptr), load128((const char *)ptr + 16), -- load128((const char *)ptr + 32) }; -- return rv; --} -- --// aligned store --static really_inline void store384(void *ptr, m384 a) { -- assert(ISALIGNED_16(ptr)); -- ptr = assume_aligned(ptr, 16); -- *(m384 *)ptr = a; --} -- --// unaligned load --static really_inline m384 loadu384(const void *ptr) { -- m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), -- loadu128((const char *)ptr + 32)}; -- return rv; --} -- --// packed unaligned store of first N bytes --static really_inline --void storebytes384(void *ptr, m384 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m384 loadbytes384(const void *ptr, unsigned int n) { -- m384 a = zeroes384(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --// switches on bit N in the given vector. --static really_inline --void setbit384(m384 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else if (n < 256) { -- sub = &ptr->mid; -- } else { -- sub = &ptr->hi; -- } -- setbit128(sub, n % 128); --} -- --// switches off bit N in the given vector. --static really_inline --void clearbit384(m384 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else if (n < 256) { -- sub = &ptr->mid; -- } else { -- sub = &ptr->hi; -- } -- clearbit128(sub, n % 128); --} -- --// tests bit N in the given vector. 
--static really_inline --char testbit384(m384 val, unsigned int n) { -- assert(n < sizeof(val) * 8); -- m128 sub; -- if (n < 128) { -- sub = val.lo; -- } else if (n < 256) { -- sub = val.mid; -- } else { -- sub = val.hi; -- } -- return testbit128(sub, n % 128); --} -- --/**** -- **** 512-bit Primitives -- ****/ -- --#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) --#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -- --static really_inline --m512 zeroes512(void) { --#if defined(HAVE_AVX512) -- return _mm512_setzero_si512(); --#else -- m512 rv = {zeroes256(), zeroes256()}; -- return rv; --#endif --} -- --static really_inline --m512 ones512(void) { --#if defined(HAVE_AVX512) -- return _mm512_set1_epi8(0xFF); -- //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); --#else -- m512 rv = {ones256(), ones256()}; -- return rv; --#endif --} -- --#if defined(HAVE_AVX512) --static really_inline --m512 set64x8(u8 a) { -- return _mm512_set1_epi8(a); --} -- --static really_inline --m512 set8x64(u64a a) { -- return _mm512_set1_epi64(a); --} -- --static really_inline --m512 set16x32(u32 a) { -- return _mm512_set1_epi32(a); --} -- --static really_inline --m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, -- u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { -- return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, -- lo_3, lo_2, lo_1, lo_0); --} -- --static really_inline --m512 swap256in512(m512 a) { -- m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); -- return vpermq512(idx, a); --} -- --static really_inline --m512 set4x128(m128 a) { -- return _mm512_broadcast_i32x4(a); --} -- --static really_inline --m512 sadd_u8_m512(m512 a, m512 b) { -- return _mm512_adds_epu8(a, b); --} -- --static really_inline --m512 max_u8_m512(m512 a, m512 b) { -- return _mm512_max_epu8(a, b); --} -- --static really_inline --m512 min_u8_m512(m512 a, m512 b) { -- return _mm512_min_epu8(a, b); --} -- --static really_inline --m512 sub_u8_m512(m512 a, m512 b) { -- return _mm512_sub_epi8(a, b); --} --#endif -- --static really_inline --m512 and512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_and_si512(a, b); --#else -- m512 rv; -- rv.lo = and256(a.lo, b.lo); -- rv.hi = and256(a.hi, b.hi); -- return rv; --#endif --} -- --static really_inline --m512 or512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_or_si512(a, b); --#else -- m512 rv; -- rv.lo = or256(a.lo, b.lo); -- rv.hi = or256(a.hi, b.hi); -- return rv; --#endif --} -- --static really_inline --m512 xor512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_xor_si512(a, b); --#else -- m512 rv; -- rv.lo = xor256(a.lo, b.lo); -- rv.hi = xor256(a.hi, b.hi); -- return rv; --#endif --} -- --static really_inline --m512 not512(m512 a) { --#if defined(HAVE_AVX512) -- return _mm512_xor_si512(a, ones512()); --#else -- m512 rv; -- rv.lo = not256(a.lo); -- rv.hi = not256(a.hi); -- return rv; --#endif --} -- --static really_inline --m512 andnot512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_andnot_si512(a, b); --#else -- m512 rv; -- rv.lo = andnot256(a.lo, b.lo); -- rv.hi = andnot256(a.hi, b.hi); -- return rv; --#endif --} -- --#if defined(HAVE_AVX512) --static really_really_inline --m512 lshift64_m512(m512 a, unsigned b) { --#if defined(HAVE__BUILTIN_CONSTANT_P) -- if (__builtin_constant_p(b)) { -- return _mm512_slli_epi64(a, b); -- } --#endif -- m128 x = _mm_cvtsi32_si128(b); -- return _mm512_sll_epi64(a, x); --} --#else --static really_really_inline --m512 
lshift64_m512(m512 a, unsigned b) { -- m512 rv; -- rv.lo = lshift64_m256(a.lo, b); -- rv.hi = lshift64_m256(a.hi, b); -- return rv; --} --#endif -- --#if defined(HAVE_AVX512) --#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) --#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) --#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) --#endif -- --#if !defined(_MM_CMPINT_NE) --#define _MM_CMPINT_NE 0x4 --#endif -- --static really_inline --int diff512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); --#else -- return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); --#endif --} -- --static really_inline --int isnonzero512(m512 a) { --#if defined(HAVE_AVX512) -- return diff512(a, zeroes512()); --#elif defined(HAVE_AVX2) -- m256 x = or256(a.lo, a.hi); -- return !!diff256(x, zeroes256()); --#else -- m128 x = or128(a.lo.lo, a.lo.hi); -- m128 y = or128(a.hi.lo, a.hi.hi); -- return isnonzero128(or128(x, y)); --#endif --} -- --/** -- * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline --u32 diffrich512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); --#elif defined(HAVE_AVX2) -- return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); --#else -- a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); -- a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); -- a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); -- a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); -- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), -- _mm_packs_epi32(a.hi.lo, a.hi.hi)); -- return ~(_mm_movemask_epi8(packed)) & 0xffff; --#endif --} -- --/** -- * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and -- * returns a 16-bit mask indicating which 64-bit words contain differences. -- */ --static really_inline --u32 diffrich64_512(m512 a, m512 b) { -- //TODO: cmp_epi64? 
-- u32 d = diffrich512(a, b); -- return (d | (d >> 1)) & 0x55555555; --} -- --// aligned load --static really_inline --m512 load512(const void *ptr) { --#if defined(HAVE_AVX512) -- return _mm512_load_si512(ptr); --#else -- assert(ISALIGNED_N(ptr, alignof(m256))); -- m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; -- return rv; --#endif --} -- --// aligned store --static really_inline --void store512(void *ptr, m512 a) { -- assert(ISALIGNED_N(ptr, alignof(m512))); --#if defined(HAVE_AVX512) -- return _mm512_store_si512(ptr, a); --#elif defined(HAVE_AVX2) -- m512 *x = (m512 *)ptr; -- store256(&x->lo, a.lo); -- store256(&x->hi, a.hi); --#else -- ptr = assume_aligned(ptr, 16); -- *(m512 *)ptr = a; --#endif --} -- --// unaligned load --static really_inline --m512 loadu512(const void *ptr) { --#if defined(HAVE_AVX512) -- return _mm512_loadu_si512(ptr); --#else -- m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; -- return rv; --#endif --} -- --// unaligned store --static really_inline --void storeu512(void *ptr, m512 a) { --#if defined(HAVE_AVX512) -- _mm512_storeu_si512((m512 *)ptr, a); --#elif defined(HAVE_AVX2) -- storeu256(ptr, a.lo); -- storeu256((char *)ptr + 32, a.hi); --#else -- storeu128(ptr, a.lo.lo); -- storeu128((char *)ptr + 16, a.lo.hi); -- storeu128((char *)ptr + 32, a.hi.lo); -- storeu128((char *)ptr + 48, a.hi.hi); --#endif --} -- --#if defined(HAVE_AVX512) --static really_inline --m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { -- return _mm512_maskz_loadu_epi8(k, ptr); --} -- --static really_inline --m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { -- return _mm512_mask_loadu_epi8(src, k, ptr); --} -- --static really_inline --void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { -- _mm512_mask_storeu_epi8(ptr, k, a); --} -- --static really_inline --m512 set_mask_m512(__mmask64 k) { -- return _mm512_movm_epi8(k); --} -- --static really_inline --m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { -- return _mm256_maskz_loadu_epi8(k, ptr); --} --#endif -- --// packed unaligned store of first N bytes --static really_inline --void storebytes512(void *ptr, m512 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m512 loadbytes512(const void *ptr, unsigned int n) { -- m512 a = zeroes512(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --static really_inline --m512 mask1bit512(unsigned int n) { -- assert(n < sizeof(m512) * 8); -- u32 mask_idx = ((n % 8) * 64) + 95; -- mask_idx -= n / 8; -- return loadu512(&simd_onebit_masks[mask_idx]); --} -- --// switches on bit N in the given vector. --static really_inline --void setbit512(m512 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); --#if !defined(HAVE_AVX2) -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo.lo; -- } else if (n < 256) { -- sub = &ptr->lo.hi; -- } else if (n < 384) { -- sub = &ptr->hi.lo; -- } else { -- sub = &ptr->hi.hi; -- } -- setbit128(sub, n % 128); --#elif defined(HAVE_AVX512) -- *ptr = or512(mask1bit512(n), *ptr); --#else -- m256 *sub; -- if (n < 256) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 256; -- } -- setbit256(sub, n); --#endif --} -- --// switches off bit N in the given vector. 
--static really_inline --void clearbit512(m512 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); --#if !defined(HAVE_AVX2) -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo.lo; -- } else if (n < 256) { -- sub = &ptr->lo.hi; -- } else if (n < 384) { -- sub = &ptr->hi.lo; -- } else { -- sub = &ptr->hi.hi; -- } -- clearbit128(sub, n % 128); --#elif defined(HAVE_AVX512) -- *ptr = andnot512(mask1bit512(n), *ptr); --#else -- m256 *sub; -- if (n < 256) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 256; -- } -- clearbit256(sub, n); --#endif --} -- --// tests bit N in the given vector. --static really_inline --char testbit512(m512 val, unsigned int n) { -- assert(n < sizeof(val) * 8); --#if !defined(HAVE_AVX2) -- m128 sub; -- if (n < 128) { -- sub = val.lo.lo; -- } else if (n < 256) { -- sub = val.lo.hi; -- } else if (n < 384) { -- sub = val.hi.lo; -- } else { -- sub = val.hi.hi; -- } -- return testbit128(sub, n % 128); --#elif defined(HAVE_AVX512) -- const m512 mask = mask1bit512(n); -- return !!_mm512_test_epi8_mask(mask, val); --#else -- m256 sub; -- if (n < 256) { -- sub = val.lo; -- } else { -- sub = val.hi; -- n -= 256; -- } -- return testbit256(sub, n); --#endif --} -- --#endif ++#endif ++ ++#endif diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h -new file mode 100644 -index 0000000..59ac642 ---- /dev/null +index 5fa727e..5daaa74 100644 +--- a/src/util/simd_x86.h +++ b/src/util/simd_x86.h -@@ -0,0 +1,1334 @@ -+/* -+ * Copyright (c) 2015-2017, Intel Corporation -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions are met: -+ * -+ * * Redistributions of source code must retain the above copyright notice, -+ * this list of conditions and the following disclaimer. -+ * * Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * * Neither the name of Intel Corporation nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -+ * POSSIBILITY OF SUCH DAMAGE. -+ */ -+ -+/** \file -+ * \brief SIMD types and primitive operations. -+ */ -+ -+#ifndef SIMD_X86 -+#define SIMD_X86 -+ -+#if !defined(_WIN32) && !defined(__SSSE3__) -+#error SSSE3 instructions must be enabled -+#endif -+ -+#include "config.h" -+#include "ue2common.h" -+#include "simd_types.h" -+#include "unaligned.h" -+#include "util/arch.h" -+#include "util/intrinsics.h" -+ -+#include // for memcpy -+ -+// Define a common assume_aligned using an appropriate compiler built-in, if -+// it's available. 
Note that we need to handle C or C++ compilation. -+#ifdef __cplusplus -+# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED -+# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -+# endif -+#else -+# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED -+# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -+# endif -+#endif -+ -+// Fallback to identity case. -+#ifndef assume_aligned -+#define assume_aligned(x, y) (x) -+#endif -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+extern const char vbs_mask_data[]; -+#ifdef __cplusplus -+} -+#endif -+ -+static really_inline m128 ones128(void) { -+#if defined(__GNUC__) || defined(__INTEL_COMPILER) -+ /* gcc gets this right */ -+ return _mm_set1_epi8(0xFF); -+#else -+ /* trick from Intel's optimization guide to generate all-ones. -+ * ICC converts this to the single cmpeq instruction */ -+ return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -+#endif -+} -+ -+static really_inline m128 zeroes128(void) { -+ return _mm_setzero_si128(); -+} -+ -+/** \brief Bitwise not for m128*/ -+static really_inline m128 not128(m128 a) { -+ return _mm_xor_si128(a, ones128()); -+} -+ -+/** \brief Return 1 if a and b are different otherwise 0 */ -+static really_inline int diff128(m128 a, m128 b) { -+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -+} -+ -+static really_inline int isnonzero128(m128 a) { -+ return !!diff128(a, zeroes128()); -+} -+ -+/** -+ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline u32 diffrich128(m128 a, m128 b) { -+ a = _mm_cmpeq_epi32(a, b); -+ return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -+} -+ -+/** -+ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and -+ * returns a 4-bit mask indicating which 64-bit words contain differences. 
-+ */ -+static really_inline u32 diffrich64_128(m128 a, m128 b) { -+#if defined(HAVE_SSE41) -+ a = _mm_cmpeq_epi64(a, b); -+ return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -+#else -+ u32 d = diffrich128(a, b); -+ return (d | (d >> 1)) & 0x5; -+#endif -+} -+ -+static really_really_inline -+m128 lshift64_m128(m128 a, unsigned b) { -+#if defined(HAVE__BUILTIN_CONSTANT_P) -+ if (__builtin_constant_p(b)) { -+ return _mm_slli_epi64(a, b); -+ } -+#endif -+ m128 x = _mm_cvtsi32_si128(b); -+ return _mm_sll_epi64(a, x); -+} -+ -+#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -+#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -+#define movemask128(a) ((u32)_mm_movemask_epi8((a))) -+ -+static really_inline m128 set16x8(u8 c) { -+ return _mm_set1_epi8(c); -+} -+ -+static really_inline m128 set4x32(u32 c) { -+ return _mm_set1_epi32(c); -+} -+ -+static really_inline m128 set2x64(u64a c) { -+ return _mm_set1_epi32(c); -+} -+ -+static really_inline u32 movd(const m128 in) { -+ return _mm_cvtsi128_si32(in); -+} -+ -+static really_inline u64a movq(const m128 in) { -+#if defined(ARCH_X86_64) -+ return _mm_cvtsi128_si64(in); -+#else // 32-bit - this is horrific -+ u32 lo = movd(in); -+ u32 hi = movd(_mm_srli_epi64(in, 32)); -+ return (u64a)hi << 32 | lo; -+#endif -+} -+ -+/* another form of movq */ -+static really_inline -+m128 load_m128_from_u64a(const u64a *p) { -+ return _mm_set_epi64x(0LL, *p); -+} -+ -+#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -+#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) -+ -+#if defined(HAVE_SSE41) -+#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -+#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -+#else -+#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -+#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -+#endif -+ -+#if !defined(HAVE_AVX2) -+// TODO: this entire file needs restructuring - this carveout is awful -+#define extractlow64from256(a) movq(a.lo) -+#define extractlow32from256(a) movd(a.lo) -+#if defined(HAVE_SSE41) -+#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -+#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -+#else -+#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -+#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? 
a.hi : a.lo, (imm % 2) * 8)) -+#endif -+ -+#endif // !AVX2 -+ -+static really_inline m128 and128(m128 a, m128 b) { -+ return _mm_and_si128(a,b); -+} -+ -+static really_inline m128 xor128(m128 a, m128 b) { -+ return _mm_xor_si128(a,b); -+} -+ -+static really_inline m128 or128(m128 a, m128 b) { -+ return _mm_or_si128(a,b); -+} -+ -+static really_inline m128 andnot128(m128 a, m128 b) { -+ return _mm_andnot_si128(a, b); -+} -+ -+// aligned load -+static really_inline m128 load128(const void *ptr) { -+ assert(ISALIGNED_N(ptr, alignof(m128))); -+ ptr = assume_aligned(ptr, 16); -+ return _mm_load_si128((const m128 *)ptr); -+} -+ -+// aligned store -+static really_inline void store128(void *ptr, m128 a) { -+ assert(ISALIGNED_N(ptr, alignof(m128))); -+ ptr = assume_aligned(ptr, 16); -+ *(m128 *)ptr = a; -+} -+ -+// unaligned load -+static really_inline m128 loadu128(const void *ptr) { -+ return _mm_loadu_si128((const m128 *)ptr); -+} -+ -+// unaligned store -+static really_inline void storeu128(void *ptr, m128 a) { -+ _mm_storeu_si128 ((m128 *)ptr, a); -+} -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes128(void *ptr, m128 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m128 loadbytes128(const void *ptr, unsigned int n) { -+ m128 a = zeroes128(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+extern const u8 simd_onebit_masks[]; -+#ifdef __cplusplus -+} -+#endif -+ -+static really_inline -+m128 mask1bit128(unsigned int n) { -+ assert(n < sizeof(m128) * 8); -+ u32 mask_idx = ((n % 8) * 64) + 95; -+ mask_idx -= n / 8; -+ return loadu128(&simd_onebit_masks[mask_idx]); -+} -+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit128(m128 *ptr, unsigned int n) { -+ *ptr = or128(mask1bit128(n), *ptr); -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit128(m128 *ptr, unsigned int n) { -+ *ptr = andnot128(mask1bit128(n), *ptr); -+} -+ -+// tests bit N in the given vector. 
-+static really_inline -+char testbit128(m128 val, unsigned int n) { -+ const m128 mask = mask1bit128(n); -+#if defined(HAVE_SSE41) -+ return !_mm_testz_si128(mask, val); -+#else -+ return isnonzero128(and128(mask, val)); -+#endif -+} -+ -+// offset must be an immediate -+#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) -+ -+static really_inline -+m128 pshufb_m128(m128 a, m128 b) { -+ m128 result; -+ result = _mm_shuffle_epi8(a, b); -+ return result; -+} -+ -+static really_inline -+m256 pshufb_m256(m256 a, m256 b) { -+#if defined(HAVE_AVX2) -+ return _mm256_shuffle_epi8(a, b); -+#else -+ m256 rv; -+ rv.lo = pshufb_m128(a.lo, b.lo); -+ rv.hi = pshufb_m128(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_inline -+m512 pshufb_m512(m512 a, m512 b) { -+ return _mm512_shuffle_epi8(a, b); -+} -+ -+static really_inline -+m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { -+ return _mm512_maskz_shuffle_epi8(k, a, b); -+} -+#endif -+ -+static really_inline -+m128 variable_byte_shift_m128(m128 in, s32 amount) { -+ assert(amount >= -16 && amount <= 16); -+ m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); -+ return pshufb_m128(in, shift_mask); -+} -+ -+static really_inline -+m128 max_u8_m128(m128 a, m128 b) { -+ return _mm_max_epu8(a, b); -+} -+ -+static really_inline -+m128 min_u8_m128(m128 a, m128 b) { -+ return _mm_min_epu8(a, b); -+} -+ -+static really_inline -+m128 sadd_u8_m128(m128 a, m128 b) { -+ return _mm_adds_epu8(a, b); -+} -+ -+static really_inline -+m128 sub_u8_m128(m128 a, m128 b) { -+ return _mm_sub_epi8(a, b); -+} -+ -+static really_inline -+m128 set64x2(u64a hi, u64a lo) { -+ return _mm_set_epi64x(hi, lo); -+} -+ -+static really_inline -+m128 set32x4(int i3, int i2, int i1, int i0) { -+ return _mm_set_epi32(i3, i2, i1, i0); -+} -+ -+/**** -+ **** 256-bit Primitives -+ ****/ -+ -+#if defined(HAVE_AVX2) -+ -+static really_really_inline -+m256 lshift64_m256(m256 a, unsigned b) { -+#if defined(HAVE__BUILTIN_CONSTANT_P) -+ if (__builtin_constant_p(b)) { -+ return _mm256_slli_epi64(a, b); -+ } -+#endif -+ m128 x = _mm_cvtsi32_si128(b); -+ return _mm256_sll_epi64(a, x); -+} -+ -+#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) -+ -+static really_inline -+m256 set32x8(u32 in) { -+ return _mm256_set1_epi8(in); -+} -+ -+#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -+#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) -+ -+static really_inline -+m256 set2x128(m128 a) { -+ return _mm256_broadcastsi128_si256(a); -+} -+ -+#else -+ -+static really_really_inline -+m256 lshift64_m256(m256 a, int b) { -+ m256 rv = a; -+ rv.lo = lshift64_m128(rv.lo, b); -+ rv.hi = lshift64_m128(rv.hi, b); -+ return rv; -+} -+ -+static really_inline -+m256 rshift64_m256(m256 a, int b) { -+ m256 rv = a; -+ rv.lo = rshift64_m128(rv.lo, b); -+ rv.hi = rshift64_m128(rv.hi, b); -+ return rv; -+} -+static really_inline -+m256 set32x8(u32 in) { -+ m256 rv; -+ rv.lo = set16x8((u8) in); -+ rv.hi = rv.lo; -+ return rv; -+} -+ -+static really_inline -+m256 eq256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = eq128(a.lo, b.lo); -+ rv.hi = eq128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_inline -+u32 movemask256(m256 a) { -+ u32 lo_mask = movemask128(a.lo); -+ u32 hi_mask = movemask128(a.hi); -+ return lo_mask | (hi_mask << 16); -+} -+ -+static really_inline -+m256 set2x128(m128 a) { -+ m256 rv = {a, a}; -+ return rv; -+} -+#endif -+ -+static really_inline m256 zeroes256(void) { -+#if defined(HAVE_AVX2) -+ return _mm256_setzero_si256(); -+#else -+ m256 rv = 
{zeroes128(), zeroes128()}; -+ return rv; -+#endif -+} -+ -+static really_inline m256 ones256(void) { -+#if defined(HAVE_AVX2) -+ m256 rv = _mm256_set1_epi8(0xFF); -+#else -+ m256 rv = {ones128(), ones128()}; -+#endif -+ return rv; -+} -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 and256(m256 a, m256 b) { -+ return _mm256_and_si256(a, b); -+} -+#else -+static really_inline m256 and256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = and128(a.lo, b.lo); -+ rv.hi = and128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 or256(m256 a, m256 b) { -+ return _mm256_or_si256(a, b); -+} -+#else -+static really_inline m256 or256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = or128(a.lo, b.lo); -+ rv.hi = or128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 xor256(m256 a, m256 b) { -+ return _mm256_xor_si256(a, b); -+} -+#else -+static really_inline m256 xor256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = xor128(a.lo, b.lo); -+ rv.hi = xor128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 not256(m256 a) { -+ return _mm256_xor_si256(a, ones256()); -+} -+#else -+static really_inline m256 not256(m256 a) { -+ m256 rv; -+ rv.lo = not128(a.lo); -+ rv.hi = not128(a.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 andnot256(m256 a, m256 b) { -+ return _mm256_andnot_si256(a, b); -+} -+#else -+static really_inline m256 andnot256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = andnot128(a.lo, b.lo); -+ rv.hi = andnot128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+static really_inline int diff256(m256 a, m256 b) { -+#if defined(HAVE_AVX2) -+ return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -+#else -+ return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -+#endif -+} -+ -+static really_inline int isnonzero256(m256 a) { -+#if defined(HAVE_AVX2) -+ return !!diff256(a, zeroes256()); -+#else -+ return isnonzero128(or128(a.lo, a.hi)); -+#endif -+} -+ -+/** -+ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline u32 diffrich256(m256 a, m256 b) { -+#if defined(HAVE_AVX2) -+ a = _mm256_cmpeq_epi32(a, b); -+ return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -+#else -+ m128 z = zeroes128(); -+ a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -+ a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -+ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); -+ return ~(_mm_movemask_epi8(packed)) & 0xff; -+#endif -+} -+ -+/** -+ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and -+ * returns an 8-bit mask indicating which 64-bit words contain differences. 
-+ */ -+static really_inline u32 diffrich64_256(m256 a, m256 b) { -+ u32 d = diffrich256(a, b); -+ return (d | (d >> 1)) & 0x55555555; -+} -+ -+// aligned load -+static really_inline m256 load256(const void *ptr) { -+ assert(ISALIGNED_N(ptr, alignof(m256))); -+#if defined(HAVE_AVX2) -+ return _mm256_load_si256((const m256 *)ptr); -+#else -+ m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; -+ return rv; -+#endif -+} -+ -+// aligned load of 128-bit value to low and high part of 256-bit value -+static really_inline m256 load2x128(const void *ptr) { -+#if defined(HAVE_AVX2) -+ return set2x128(load128(ptr)); -+#else -+ assert(ISALIGNED_N(ptr, alignof(m128))); -+ m256 rv; -+ rv.hi = rv.lo = load128(ptr); -+ return rv; -+#endif -+} -+ -+static really_inline m256 loadu2x128(const void *ptr) { -+ return set2x128(loadu128(ptr)); -+} -+ -+// aligned store -+static really_inline void store256(void *ptr, m256 a) { -+ assert(ISALIGNED_N(ptr, alignof(m256))); -+#if defined(HAVE_AVX2) -+ _mm256_store_si256((m256 *)ptr, a); -+#else -+ ptr = assume_aligned(ptr, 16); -+ *(m256 *)ptr = a; -+#endif -+} -+ -+// unaligned load -+static really_inline m256 loadu256(const void *ptr) { -+#if defined(HAVE_AVX2) -+ return _mm256_loadu_si256((const m256 *)ptr); -+#else -+ m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; -+ return rv; -+#endif -+} -+ -+// unaligned store -+static really_inline void storeu256(void *ptr, m256 a) { -+#if defined(HAVE_AVX2) -+ _mm256_storeu_si256((m256 *)ptr, a); -+#else -+ storeu128(ptr, a.lo); -+ storeu128((char *)ptr + 16, a.hi); -+#endif -+} -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes256(void *ptr, m256 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m256 loadbytes256(const void *ptr, unsigned int n) { -+ m256 a = zeroes256(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+static really_inline -+m256 mask1bit256(unsigned int n) { -+ assert(n < sizeof(m256) * 8); -+ u32 mask_idx = ((n % 8) * 64) + 95; -+ mask_idx -= n / 8; -+ return loadu256(&simd_onebit_masks[mask_idx]); -+} -+ -+static really_inline -+m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -+#if defined(HAVE_AVX2) -+ return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -+#else -+ m256 rv; -+ rv.hi = set64x2(hi_1, hi_0); -+ rv.lo = set64x2(lo_1, lo_0); -+ return rv; -+#endif -+} -+ -+#if !defined(HAVE_AVX2) -+// switches on bit N in the given vector. -+static really_inline -+void setbit256(m256 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 128; -+ } -+ setbit128(sub, n); -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit256(m256 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 128; -+ } -+ clearbit128(sub, n); -+} -+ -+// tests bit N in the given vector. 
-+static really_inline -+char testbit256(m256 val, unsigned int n) { -+ assert(n < sizeof(val) * 8); -+ m128 sub; -+ if (n < 128) { -+ sub = val.lo; -+ } else { -+ sub = val.hi; -+ n -= 128; -+ } -+ return testbit128(sub, n); -+} -+ -+static really_really_inline -+m128 movdq_hi(m256 x) { -+ return x.hi; -+} -+ -+static really_really_inline -+m128 movdq_lo(m256 x) { -+ return x.lo; -+} -+ -+static really_inline -+m256 combine2x128(m128 hi, m128 lo) { -+ m256 rv = {lo, hi}; -+ return rv; -+} -+ -+#else // AVX2 -+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit256(m256 *ptr, unsigned int n) { -+ *ptr = or256(mask1bit256(n), *ptr); -+} -+ -+static really_inline -+void clearbit256(m256 *ptr, unsigned int n) { -+ *ptr = andnot256(mask1bit256(n), *ptr); -+} -+ -+// tests bit N in the given vector. -+static really_inline -+char testbit256(m256 val, unsigned int n) { -+ const m256 mask = mask1bit256(n); -+ return !_mm256_testz_si256(mask, val); -+} -+ -+static really_really_inline -+m128 movdq_hi(m256 x) { -+ return _mm256_extracti128_si256(x, 1); -+} -+ -+static really_really_inline -+m128 movdq_lo(m256 x) { -+ return _mm256_extracti128_si256(x, 0); -+} -+ -+#define cast256to128(a) _mm256_castsi256_si128(a) -+#define cast128to256(a) _mm256_castsi128_si256(a) -+#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -+#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -+#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -+#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -+#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -+#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -+#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -+#define extractlow32from256(a) movd(cast256to128(a)) -+#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -+#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -+#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) -+ -+static really_inline -+m256 combine2x128(m128 hi, m128 lo) { -+#if defined(_mm256_set_m128i) -+ return _mm256_set_m128i(hi, lo); -+#else -+ return insert128to256(cast128to256(lo), hi, 1); -+#endif -+} -+#endif //AVX2 -+ -+#if defined(HAVE_AVX512) -+#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -+#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -+#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -+#define set2x256(a) _mm512_broadcast_i64x4(a) -+#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -+#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -+#endif -+ -+/**** -+ **** 384-bit Primitives -+ ****/ -+ -+static really_inline m384 and384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = and128(a.lo, b.lo); -+ rv.mid = and128(a.mid, b.mid); -+ rv.hi = and128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_inline m384 or384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = or128(a.lo, b.lo); -+ rv.mid = or128(a.mid, b.mid); -+ rv.hi = or128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_inline m384 xor384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = xor128(a.lo, b.lo); -+ rv.mid = xor128(a.mid, b.mid); -+ rv.hi = xor128(a.hi, b.hi); -+ return rv; -+} -+static really_inline m384 not384(m384 a) { -+ m384 rv; -+ rv.lo = not128(a.lo); -+ rv.mid = not128(a.mid); -+ rv.hi = not128(a.hi); -+ return rv; -+} -+static really_inline m384 andnot384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = 
andnot128(a.lo, b.lo); -+ rv.mid = andnot128(a.mid, b.mid); -+ rv.hi = andnot128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_really_inline -+m384 lshift64_m384(m384 a, unsigned b) { -+ m384 rv; -+ rv.lo = lshift64_m128(a.lo, b); -+ rv.mid = lshift64_m128(a.mid, b); -+ rv.hi = lshift64_m128(a.hi, b); -+ return rv; -+} -+ -+static really_inline m384 zeroes384(void) { -+ m384 rv = {zeroes128(), zeroes128(), zeroes128()}; -+ return rv; -+} -+ -+static really_inline m384 ones384(void) { -+ m384 rv = {ones128(), ones128(), ones128()}; -+ return rv; -+} -+ -+static really_inline int diff384(m384 a, m384 b) { -+ return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); -+} -+ -+static really_inline int isnonzero384(m384 a) { -+ return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -+} -+ -+/** -+ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline u32 diffrich384(m384 a, m384 b) { -+ m128 z = zeroes128(); -+ a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -+ a.mid = _mm_cmpeq_epi32(a.mid, b.mid); -+ a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -+ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), -+ _mm_packs_epi32(a.hi, z)); -+ return ~(_mm_movemask_epi8(packed)) & 0xfff; -+} -+ -+/** -+ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and -+ * returns a 12-bit mask indicating which 64-bit words contain differences. -+ */ -+static really_inline u32 diffrich64_384(m384 a, m384 b) { -+ u32 d = diffrich384(a, b); -+ return (d | (d >> 1)) & 0x55555555; -+} -+ -+// aligned load -+static really_inline m384 load384(const void *ptr) { -+ assert(ISALIGNED_16(ptr)); -+ m384 rv = { load128(ptr), load128((const char *)ptr + 16), -+ load128((const char *)ptr + 32) }; -+ return rv; -+} -+ -+// aligned store -+static really_inline void store384(void *ptr, m384 a) { -+ assert(ISALIGNED_16(ptr)); -+ ptr = assume_aligned(ptr, 16); -+ *(m384 *)ptr = a; -+} -+ -+// unaligned load -+static really_inline m384 loadu384(const void *ptr) { -+ m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), -+ loadu128((const char *)ptr + 32)}; -+ return rv; -+} -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes384(void *ptr, m384 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m384 loadbytes384(const void *ptr, unsigned int n) { -+ m384 a = zeroes384(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit384(m384 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else if (n < 256) { -+ sub = &ptr->mid; -+ } else { -+ sub = &ptr->hi; -+ } -+ setbit128(sub, n % 128); -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit384(m384 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else if (n < 256) { -+ sub = &ptr->mid; -+ } else { -+ sub = &ptr->hi; -+ } -+ clearbit128(sub, n % 128); -+} -+ -+// tests bit N in the given vector. 
-+static really_inline -+char testbit384(m384 val, unsigned int n) { -+ assert(n < sizeof(val) * 8); -+ m128 sub; -+ if (n < 128) { -+ sub = val.lo; -+ } else if (n < 256) { -+ sub = val.mid; -+ } else { -+ sub = val.hi; -+ } -+ return testbit128(sub, n % 128); -+} -+ -+/**** -+ **** 512-bit Primitives -+ ****/ -+ -+#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -+#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -+ -+static really_inline -+m512 zeroes512(void) { -+#if defined(HAVE_AVX512) -+ return _mm512_setzero_si512(); -+#else -+ m512 rv = {zeroes256(), zeroes256()}; -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 ones512(void) { -+#if defined(HAVE_AVX512) -+ return _mm512_set1_epi8(0xFF); -+ //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -+#else -+ m512 rv = {ones256(), ones256()}; -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_inline -+m512 set64x8(u8 a) { -+ return _mm512_set1_epi8(a); -+} -+ -+static really_inline -+m512 set8x64(u64a a) { -+ return _mm512_set1_epi64(a); -+} -+ -+static really_inline -+m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, -+ u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { -+ return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, -+ lo_3, lo_2, lo_1, lo_0); -+} -+ -+static really_inline -+m512 swap256in512(m512 a) { -+ m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); -+ return vpermq512(idx, a); -+} -+ -+static really_inline -+m512 set4x128(m128 a) { -+ return _mm512_broadcast_i32x4(a); -+} -+#endif -+ -+static really_inline -+m512 and512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_and_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = and256(a.lo, b.lo); -+ rv.hi = and256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 or512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_or_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = or256(a.lo, b.lo); -+ rv.hi = or256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 xor512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_xor_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = xor256(a.lo, b.lo); -+ rv.hi = xor256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 not512(m512 a) { -+#if defined(HAVE_AVX512) -+ return _mm512_xor_si512(a, ones512()); -+#else -+ m512 rv; -+ rv.lo = not256(a.lo); -+ rv.hi = not256(a.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 andnot512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_andnot_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = andnot256(a.lo, b.lo); -+ rv.hi = andnot256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_really_inline -+m512 lshift64_m512(m512 a, unsigned b) { -+#if defined(HAVE__BUILTIN_CONSTANT_P) -+ if (__builtin_constant_p(b)) { -+ return _mm512_slli_epi64(a, b); -+ } -+#endif -+ m128 x = _mm_cvtsi32_si128(b); -+ return _mm512_sll_epi64(a, x); -+} -+#else -+static really_really_inline -+m512 lshift64_m512(m512 a, unsigned b) { -+ m512 rv; -+ rv.lo = lshift64_m256(a.lo, b); -+ rv.hi = lshift64_m256(a.hi, b); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX512) -+#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -+#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -+#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -+#endif -+ -+#if !defined(_MM_CMPINT_NE) -+#define _MM_CMPINT_NE 0x4 -+#endif -+ -+static really_inline 
-+int diff512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -+#else -+ return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -+#endif -+} -+ -+static really_inline -+int isnonzero512(m512 a) { -+#if defined(HAVE_AVX512) -+ return diff512(a, zeroes512()); -+#elif defined(HAVE_AVX2) -+ m256 x = or256(a.lo, a.hi); -+ return !!diff256(x, zeroes256()); -+#else -+ m128 x = or128(a.lo.lo, a.lo.hi); -+ m128 y = or128(a.hi.lo, a.hi.hi); -+ return isnonzero128(or128(x, y)); -+#endif -+} -+ -+/** -+ * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline -+u32 diffrich512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -+#elif defined(HAVE_AVX2) -+ return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -+#else -+ a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); -+ a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); -+ a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); -+ a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); -+ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), -+ _mm_packs_epi32(a.hi.lo, a.hi.hi)); -+ return ~(_mm_movemask_epi8(packed)) & 0xffff; -+#endif -+} -+ -+/** -+ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and -+ * returns a 16-bit mask indicating which 64-bit words contain differences. -+ */ -+static really_inline -+u32 diffrich64_512(m512 a, m512 b) { -+ //TODO: cmp_epi64? -+ u32 d = diffrich512(a, b); -+ return (d | (d >> 1)) & 0x55555555; -+} -+ -+// aligned load -+static really_inline -+m512 load512(const void *ptr) { -+#if defined(HAVE_AVX512) -+ return _mm512_load_si512(ptr); -+#else -+ assert(ISALIGNED_N(ptr, alignof(m256))); -+ m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; -+ return rv; -+#endif -+} -+ -+// aligned store -+static really_inline -+void store512(void *ptr, m512 a) { -+ assert(ISALIGNED_N(ptr, alignof(m512))); -+#if defined(HAVE_AVX512) -+ return _mm512_store_si512(ptr, a); -+#elif defined(HAVE_AVX2) -+ m512 *x = (m512 *)ptr; -+ store256(&x->lo, a.lo); -+ store256(&x->hi, a.hi); -+#else -+ ptr = assume_aligned(ptr, 16); -+ *(m512 *)ptr = a; -+#endif -+} -+ -+// unaligned load -+static really_inline -+m512 loadu512(const void *ptr) { -+#if defined(HAVE_AVX512) -+ return _mm512_loadu_si512(ptr); -+#else -+ m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_inline -+m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { -+ return _mm512_maskz_loadu_epi8(k, ptr); -+} -+ -+static really_inline -+m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { -+ return _mm512_mask_loadu_epi8(src, k, ptr); -+} -+ -+static really_inline -+m512 set_mask_m512(__mmask64 k) { -+ return _mm512_movm_epi8(k); -+} -+#endif -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes512(void *ptr, m512 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m512 loadbytes512(const void *ptr, unsigned int n) { -+ m512 a = zeroes512(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+static really_inline -+m512 mask1bit512(unsigned int n) { -+ assert(n < sizeof(m512) * 8); -+ u32 mask_idx = ((n % 8) * 64) + 95; -+ mask_idx -= n / 8; -+ return loadu512(&simd_onebit_masks[mask_idx]); -+} 
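For reference, the mask1bit*() helpers above avoid a variable shift by loading from the precomputed simd_onebit_masks table: each bit position 0..7 owns a zero-filled stripe with a single non-zero byte, and the index expression ((n % 8) * 64) + 95 - n / 8 picks a window that lands that byte at lane n / 8. A minimal scalar sketch of the intended semantics (illustration only, not part of the patch; mask1bit128_ref is a name invented here):

#include <stdint.h>
#include <string.h>

/* Scalar reference for mask1bit128(): the result has only bit n set,
 * i.e. byte n / 8 holds (1 << (n % 8)) and every other byte is zero.
 * The 256- and 512-bit versions return the same pattern widened to
 * 32 or 64 bytes; the real code gets it in one unaligned table load. */
static void mask1bit128_ref(uint8_t out[16], unsigned int n) {
    memset(out, 0, 16);
    out[n / 8] = (uint8_t)(1u << (n % 8));
}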
-+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit512(m512 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+#if !defined(HAVE_AVX2) -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo.lo; -+ } else if (n < 256) { -+ sub = &ptr->lo.hi; -+ } else if (n < 384) { -+ sub = &ptr->hi.lo; -+ } else { -+ sub = &ptr->hi.hi; -+ } -+ setbit128(sub, n % 128); -+#elif defined(HAVE_AVX512) -+ *ptr = or512(mask1bit512(n), *ptr); -+#else -+ m256 *sub; -+ if (n < 256) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 256; -+ } -+ setbit256(sub, n); -+#endif -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit512(m512 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+#if !defined(HAVE_AVX2) -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo.lo; -+ } else if (n < 256) { -+ sub = &ptr->lo.hi; -+ } else if (n < 384) { -+ sub = &ptr->hi.lo; -+ } else { -+ sub = &ptr->hi.hi; -+ } -+ clearbit128(sub, n % 128); -+#elif defined(HAVE_AVX512) -+ *ptr = andnot512(mask1bit512(n), *ptr); -+#else -+ m256 *sub; -+ if (n < 256) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 256; -+ } -+ clearbit256(sub, n); -+#endif -+} -+ -+// tests bit N in the given vector. -+static really_inline -+char testbit512(m512 val, unsigned int n) { -+ assert(n < sizeof(val) * 8); -+#if !defined(HAVE_AVX2) -+ m128 sub; -+ if (n < 128) { -+ sub = val.lo.lo; -+ } else if (n < 256) { -+ sub = val.lo.hi; -+ } else if (n < 384) { -+ sub = val.hi.lo; -+ } else { -+ sub = val.hi.hi; -+ } -+ return testbit128(sub, n % 128); -+#elif defined(HAVE_AVX512) -+ const m512 mask = mask1bit512(n); -+ return !!_mm512_test_epi8_mask(mask, val); -+#else -+ m256 sub; -+ if (n < 256) { -+ sub = val.lo; -+ } else { -+ sub = val.hi; -+ n -= 256; -+ } -+ return testbit256(sub, n); -+#endif -+} -+ -+#endif -diff --git a/src/util/state_compress.c -index 7238849..4422403 100644 ---- a/src/util/state_compress.c -+++ b/src/util/state_compress.c -@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { - u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), - expand32(v[2], m[2]), expand32(v[3], m[3]) }; - -- return _mm_set_epi32(x[3], x[2], x[1], x[0]); -+ return set32x4(x[3], x[2], x[1], x[0]); +@@ -1417,4 +1417,14 @@ char testbit512(m512 val, unsigned int n) { + #endif } + ++static really_inline m128 set2x64(u64a c) ++{ ++ return _mm_set1_epi64x(c); ++} ++ ++static really_inline m128 set32x4(int i3, int i2, int i1, int i0) ++{ ++ return _mm_set_epi32(i3, i2, i1, i0); ++} ++ #endif - -@@ -158,7 +158,7 @@ - static really_inline - m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { - // First, decompose our vectors into 64-bit chunks.
-- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; -+ u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; - - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - u64a v[2]; -@@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { - - u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - -- return _mm_set_epi64x(x[1], x[0]); -+ return set64x2(x[1], x[0]); - } - #endif - -@@ -264,8 +264,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { - expand32(v[6], m[6]), expand32(v[7], m[7]) }; - - #if !defined(HAVE_AVX2) -- m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), -- .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; -+ m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), -+ .hi = set32x4(x[7], x[6], x[5], x[4]) }; - #else - m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); -@@ -291,8 +291,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { - expand64(v[2], m[2]), expand64(v[3], m[3]) }; - - #if !defined(HAVE_AVX2) -- m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), -- .hi = _mm_set_epi64x(x[3], x[2]) }; -+ m256 xvec = { .lo = set64x2(x[1], x[0]), -+ .hi = set64x2(x[3], x[2]) }; - #else - m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); - #endif -@@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { - expand32(v[8], m[8]), expand32(v[9], m[9]), - expand32(v[10], m[10]), expand32(v[11], m[11]) }; - -- m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), -- .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), -- .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; -+ m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), -+ .mid = set32x4(x[7], x[6], x[5], x[4]), -+ .hi = set32x4(x[11], x[10], x[9], x[8]) }; - return xvec; - } - #endif -@@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { - expand64(v[2], m[2]), expand64(v[3], m[3]), - expand64(v[4], m[4]), expand64(v[5], m[5]) }; - -- m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), -- .mid = _mm_set_epi64x(x[3], x[2]), -- .hi = _mm_set_epi64x(x[5], x[4]) }; -+ m384 xvec = { .lo = set64x2(x[1], x[0]), -+ .mid = set64x2(x[3], x[2]), -+ .hi = set64x2(x[5], x[4]) }; - return xvec; - } - #endif -@@ -558,10 +558,10 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { - xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8]); - #else -- xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); -- xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); -- xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); -- xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); -+ xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); -+ xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); -+ xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); -+ xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); - #endif - return xvec; - } -@@ -594,10 +594,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { - m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), - .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; - #else -- m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), -- _mm_set_epi64x(x[3], x[2]) }, -- .hi = { _mm_set_epi64x(x[5], x[4]), -- _mm_set_epi64x(x[7], x[6]) } }; -+ m512 xvec = { .lo = { set64x2(x[1], x[0]), -+ set64x2(x[3], x[2]) }, -+ .hi = { set64x2(x[5], x[4]), -+ set64x2(x[7], x[6]) } }; - #endif - return xvec; - } diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index a4d71b2..0c41ab9 100644 --- a/tools/hscollider/CMakeLists.txt @@ -6802,59 
+3059,10 @@ index 0000000..5391473 + return (cs != FileCorporaParser_error) && (p == pe); +} diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp -index 623c2c9..d6d52a2 100644 +index 623c2c9..22945d6 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp -@@ -40,7 +40,7 @@ using namespace ue2; - namespace { - - // Switch one bit on in a bitmask. --template -+template - Mask setbit(unsigned int bit) { - union { - Mask simd; -@@ -148,7 +148,7 @@ m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } - m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } - m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } - --template -+template - class SimdUtilsTest : public testing::Test { - // empty - }; -@@ -260,9 +260,9 @@ TYPED_TEST(SimdUtilsTest, or2) { - - for (unsigned j = 0; j < 8; j++) { - for (unsigned i = 0; i < 32; i++) { -- m256 x = setbit(j*32+i); -+ m256 x = setbit(j * 32 + i); - m256 y = zeroes256(); -- ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j*32+i << " not happy"; -+ ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j * 32 + i << " not happy"; - } - } - -@@ -431,8 +431,8 @@ TYPED_TEST(SimdUtilsTest, testbit) { - for (unsigned int i = 0; i < total_bits; i++) { - TypeParam a = setbit(i); - for (unsigned int j = 0; j < total_bits; j++) { -- ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) << "bit " << i -- << " is wrong"; -+ ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) -+ << "bit " << i << " is wrong"; - } - } - } -@@ -455,7 +455,6 @@ TYPED_TEST(SimdUtilsTest, setbit) { - simd_setbit(&a, i); - } - ASSERT_FALSE(simd_diff(simd_ones(), a)); -- - } - - TYPED_TEST(SimdUtilsTest, diffrich) { -@@ -663,12 +662,11 @@ TEST(SimdUtilsTest, movq) { +@@ -663,7 +663,7 @@ TEST(SimdUtilsTest, movq) { ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); @@ -6863,152 +3071,6 @@ index 623c2c9..d6d52a2 100644 r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } - -- - TEST(SimdUtilsTest, set16x8) { - char cmp[sizeof(m128)]; - -@@ -680,7 +678,7 @@ TEST(SimdUtilsTest, set16x8) { - } - - TEST(SimdUtilsTest, set4x32) { -- u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; -+ u32 cmp[4] = {0x12345678, 0x12345678, 0x12345678, 0x12345678}; - m128 simd = set4x32(cmp[0]); - ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); - } -@@ -714,51 +712,51 @@ TEST(SimdUtilsTest, variableByteShift128) { - char base[] = "0123456789ABCDEF"; - m128 in = loadu128(base); - -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), -- variable_byte_shift_m128(in, 0))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), -- variable_byte_shift_m128(in, -1))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), -- variable_byte_shift_m128(in, -2))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), -- variable_byte_shift_m128(in, -3))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), -- variable_byte_shift_m128(in, -4))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), -- variable_byte_shift_m128(in, -5))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), -- variable_byte_shift_m128(in, -6))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), -- variable_byte_shift_m128(in, -7))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), -- variable_byte_shift_m128(in, -8))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), -- variable_byte_shift_m128(in, -9))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), -- variable_byte_shift_m128(in, -10))); -- -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), -- 
variable_byte_shift_m128(in, 0))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), -- variable_byte_shift_m128(in, 1))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), -- variable_byte_shift_m128(in, 2))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), -- variable_byte_shift_m128(in, 3))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), -- variable_byte_shift_m128(in, 4))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), -- variable_byte_shift_m128(in, 5))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), -- variable_byte_shift_m128(in, 6))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), -- variable_byte_shift_m128(in, 7))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), -- variable_byte_shift_m128(in, 8))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), -- variable_byte_shift_m128(in, 9))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), -- variable_byte_shift_m128(in, 10))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); -+ -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); - - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); -@@ -785,12 +783,12 @@ TEST(SimdUtilsTest, min_u8_m128) { - } - - TEST(SimdUtilsTest, sadd_u8_m128) { -- unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -- '1', '2', '3', '4', '1', '2', '3', '4'}; -- unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, -- 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; -+ unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -+ '1', '2', '3', '4', '1', '2', '3', '4'}; -+ unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, -+ 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; - unsigned char expec[] = 
{'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', -- 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; -+ 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; - m128 in1 = loadu128(base1); - m128 in2 = loadu128(base2); - m128 result = sadd_u8_m128(in1, in2); -@@ -799,11 +797,11 @@ TEST(SimdUtilsTest, sadd_u8_m128) { - - TEST(SimdUtilsTest, sub_u8_m128) { - unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', -- 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; -- unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -- '1', '2', '3', '4', '1', '2', '3', '4'}; -- unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, -- 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; -+ 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; -+ unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -+ '1', '2', '3', '4', '1', '2', '3', '4'}; -+ unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, -+ 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; - m128 in1 = loadu128(base1); - m128 in2 = loadu128(base2); - m128 result = sub_u8_m128(in1, in2); diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index ea942ef..d7bef50 100644 --- a/util/CMakeLists.txt diff --git a/hyperscan.spec b/hyperscan.spec index 3e8190a..5829846 100644 --- a/hyperscan.spec +++ b/hyperscan.spec @@ -1,6 +1,6 @@ Name: hyperscan -Version: 5.4.0 -Release: 3 +Version: 5.4.1 +Release: 1 Summary: High-performance regular expression matching library License: BSD @@ -8,10 +8,7 @@ URL: https://www.hyperscan.io/ Source0: https://github.com/intel/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz Patch0: hyperscan-aarch64-support.patch -Patch1: Fix-build-error-on-x86_64.patch -Patch2: Fix-hyperscan-gcc10.patch -# https://github.com/intel/hyperscan/commit/7d644e7ba27eaadda753febf0b142faa9affbbca -Patch3: backport-Fix-segfaults-on-allocation-failure.patch +Patch1: Fix-hyperscan-gcc10.patch BuildRequires: gcc-c++ BuildRequires: boost-devel @@ -54,7 +51,15 @@ This package provides the libraries, include files and other resources needed for developing Hyperscan applications. %prep -%autosetup -n %{name}-%{version} -p1 +%setup -q -n %{name}-%{version} +cd %{_builddir}/%{name}-%{version} +mv src/util/simd_utils.h src/util/simd_x86.h +sed -i 's/SIMD_UTILS/SIMD_X86/' src/util/simd_x86.h +sed -i 's/_mm_set_epi32/set32x4/' src/util/state_compress.c +sed -i 's/_mm_set_epi64x/set64x2/' src/util/state_compress.c +sed -i 's/_mm_srli_si128/rshiftbyte_m128/' src/util/state_compress.c +cd - +%autopatch -p1 %build %cmake -DBUILD_SHARED_LIBS:BOOL=ON -DBUILD_STATIC_AND_SHARED:BOOL=OFF . @@ -80,6 +85,9 @@ needed for developing Hyperscan applications. %{_includedir}/hs/ %changelog +* Sat Mar 25 2023 Liu Zixian - 5.4.1-1 +- Update to 5.4.1 + * Tue Mar 21 2023 Liu Zixian - 5.4.0-3 - Cleanup aarch64 patch