diff --git a/Fix-build-error-on-x86_64.patch b/Fix-build-error-on-x86_64.patch deleted file mode 100644 index 58ec1fa..0000000 --- a/Fix-build-error-on-x86_64.patch +++ /dev/null @@ -1,25 +0,0 @@ -From f92e690190b51eb6bada384174887501f4c3f43f Mon Sep 17 00:00:00 2001 -From: wang_yue111 <648774160@qq.com> -Date: Sat, 24 Jul 2021 10:21:03 +0800 -Subject: [PATCH] fix build error on X86 - ---- - cmake/build_wrapper.sh | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh -index 1962813..895610c 100755 ---- a/cmake/build_wrapper.sh -+++ b/cmake/build_wrapper.sh -@@ -17,7 +17,7 @@ KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX) - LIBC_SO=$("$@" --print-file-name=libc.so.6) - cp ${KEEPSYMS_IN} ${KEEPSYMS} - # get all symbols from libc and turn them into patterns --nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS} -+nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS} - # build the object - "$@" - # rename the symbols in the object --- -2.23.0 - diff --git a/backport-Fix-segfaults-on-allocation-failure.patch b/backport-Fix-segfaults-on-allocation-failure.patch deleted file mode 100644 index 041081c..0000000 --- a/backport-Fix-segfaults-on-allocation-failure.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 7d644e7ba27eaadda753febf0b142faa9affbbca Mon Sep 17 00:00:00 2001 -From: hongyang7 -Date: Thu, 16 Dec 2021 19:02:17 +0800 -Subject: [PATCH] Fix segfaults on allocation failure (#4) - -Throw std::bad_alloc instead of returning nullptr from -ue2::AlignedAllocator. Allocators for STL containers are expected never -to return with an invalid pointer, and instead must throw on failure. -Violating this expectation can lead to invalid pointer dereferences. - -Co-authored-by: johanngan - -fixes github issue #317 (PR #320) ---- - src/util/alloc.h | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/src/util/alloc.h b/src/util/alloc.h -index de20c8d02..49b4a824d 100644 ---- a/src/util/alloc.h -+++ b/src/util/alloc.h -@@ -76,7 +76,11 @@ class AlignedAllocator { - - T *allocate(std::size_t size) const { - size_t alloc_size = size * sizeof(T); -- return static_cast(aligned_malloc_internal(alloc_size, N)); -+ T *ptr = static_cast(aligned_malloc_internal(alloc_size, N)); -+ if (!ptr) { -+ throw std::bad_alloc(); -+ } -+ return ptr; - } - - void deallocate(T *x, std::size_t) const noexcept { diff --git a/hyperscan-5.4.0.tar.gz b/hyperscan-5.4.0.tar.gz deleted file mode 100644 index 94a94cf..0000000 Binary files a/hyperscan-5.4.0.tar.gz and /dev/null differ diff --git a/hyperscan-5.4.1.tar.gz b/hyperscan-5.4.1.tar.gz new file mode 100644 index 0000000..d094370 Binary files /dev/null and b/hyperscan-5.4.1.tar.gz differ diff --git a/hyperscan-aarch64-support.patch b/hyperscan-aarch64-support.patch index 381e009..a35c8f1 100644 --- a/hyperscan-aarch64-support.patch +++ b/hyperscan-aarch64-support.patch @@ -1,48 +1,48 @@ -From 5f009c288718095c5cc675bfed12d7ec64237731 Mon Sep 17 00:00:00 2001 +From e95491b3a2261aecdc5576a7e507b4f4ace88cbc Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 20 Jul 2020 17:20:15 +0800 Subject: [PATCH] Add aarch64 support +Signed-off-by: Liu Zixian --- CMakeLists.txt | 108 +- cmake/config.h.in | 9 + cmake/platform.cmake | 13 +- cmake/ragel.cmake | 20 + src/crc32.c | 43 + - src/fdr/fdr.c | 136 +- + src/fdr/fdr.c | 136 ++- src/hs_valid_platform.c | 9 +- src/nfa/limex_exceptional.h | 22 +- src/nfa/limex_internal.h | 2 +- src/nfa/limex_native.c | 10 +- - src/nfa/shufti.c | 
580 ++++---- + src/nfa/shufti.c | 18 +- src/nfa/truffle.c | 10 +- - src/parser/control_verbs.cpp | 340 +++++ + src/parser/control_verbs.cpp | 340 +++++++ src/rose/counting_miracle.h | 2 +- src/util/arch.h | 11 + - src/util/cpuid_flags.c | 9 +- + src/util/cpuid_flags.c | 6 + src/util/cpuid_flags.h | 2 + src/util/cpuid_inline.h | 17 +- src/util/intrinsics.h | 12 + src/util/popcount.h | 6 +- - src/util/simd_arm.h | 1069 +++++++++++++++ - src/util/simd_types.h | 42 +- - src/util/simd_utils.h | 1389 +------------------- - src/util/simd_x86.h | 1334 +++++++++++++++++++ - src/util/state_compress.c | 42 +- + src/util/simd_arm.h | 1069 ++++++++++++++++++++ + src/util/simd_types.h | 17 + + src/util/simd_utils.h | 13 + + src/util/simd_x86.h | 10 + tools/hscollider/CMakeLists.txt | 9 +- - tools/hscollider/ColliderCorporaParser.cpp | 474 +++++++ - unit/internal/simd_utils.cpp | 128 +- + tools/hscollider/ColliderCorporaParser.cpp | 474 +++++++++ + unit/internal/simd_utils.cpp | 2 +- util/CMakeLists.txt | 8 +- - util/ExpressionParser.cpp | 397 ++++++ - 30 files changed, 4464 insertions(+), 1789 deletions(-) + util/ExpressionParser.cpp | 397 ++++++++ + 29 files changed, 2717 insertions(+), 78 deletions(-) create mode 100644 src/parser/control_verbs.cpp create mode 100644 src/util/simd_arm.h - create mode 100644 src/util/simd_x86.h + create mode 100644 src/util/simd_utils.h create mode 100644 tools/hscollider/ColliderCorporaParser.cpp create mode 100644 util/ExpressionParser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8bc6077..12a889c 100644 +index bd6d2de..8dbcb72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ include (${CMAKE_MODULE_PATH}/boost.cmake) @@ -182,7 +182,7 @@ index 8bc6077..12a889c 100644 # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) -@@ -403,12 +438,6 @@ if (CXX_IGNORED_ATTR) +@@ -415,12 +450,6 @@ if (CXX_IGNORED_ATTR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") endif() @@ -195,7 +195,7 @@ index 8bc6077..12a889c 100644 # note this for later # g++ doesn't have this flag but clang does CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES) -@@ -463,6 +492,14 @@ else() +@@ -477,6 +506,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() @@ -210,7 +210,7 @@ index 8bc6077..12a889c 100644 add_subdirectory(util) add_subdirectory(doc/dev-reference) -@@ -559,7 +596,14 @@ set_source_files_properties( +@@ -573,7 +610,14 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") @@ -747,157 +747,17 @@ index f6f5809..8998830 100644 } } diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c -index 09ffc0c..6231e61 100644 +index 09ffc0c..2cb74f0 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c -@@ -42,39 +42,36 @@ - #ifdef DEBUG - #include - --#define DUMP_MSK(_t) \ --static UNUSED \ --void dumpMsk##_t(m##_t msk) { \ -- u8 * mskAsU8 = (u8 *)&msk; \ -- for (unsigned i = 0; i < sizeof(msk); i++) { \ -- u8 c = mskAsU8[i]; \ -- for (int j = 0; j < 8; j++) { \ -- if ((c >> (7-j)) & 0x1) \ -- printf("1"); \ -- else \ -- printf("0"); \ -- } \ -- printf(" "); \ -- } \ --} \ --static UNUSED \ --void dumpMsk##_t##AsChars(m##_t msk) { \ -- u8 * mskAsU8 = (u8 *)&msk; \ -- for (unsigned i = 0; i < sizeof(msk); i++) { \ -- u8 c = mskAsU8[i]; \ -- if (isprint(c)) \ -- printf("%c",c); \ -- else \ -- printf("."); \ -- } \ --} -+#define DUMP_MSK(_t) \ -+ static UNUSED void dumpMsk##_t(m##_t 
msk) { \ -+ u8 *mskAsU8 = (u8 *)&msk; \ -+ for (unsigned i = 0; i < sizeof(msk); i++) { \ -+ u8 c = mskAsU8[i]; \ -+ for (int j = 0; j < 8; j++) { \ -+ if ((c >> (7 - j)) & 0x1) \ -+ printf("1"); \ -+ else \ -+ printf("0"); \ -+ } \ -+ printf(" "); \ -+ } \ -+ } \ -+ static UNUSED void dumpMsk##_t##AsChars(m##_t msk) { \ -+ u8 *mskAsU8 = (u8 *)&msk; \ -+ for (unsigned i = 0; i < sizeof(msk); i++) { \ -+ u8 c = mskAsU8[i]; \ -+ if (isprint(c)) \ -+ printf("%c", c); \ -+ else \ -+ printf("."); \ -+ } \ -+ } - - #endif - - /** \brief Naive byte-by-byte implementation. */ --static really_inline --const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, -- const u8 *buf_end) { -+static really_inline const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, -+ const u8 *buf, const u8 *buf_end) { - assert(buf < buf_end); - - for (; buf < buf_end; ++buf) { -@@ -87,9 +84,8 @@ const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - } - - /** \brief Naive byte-by-byte implementation. */ --static really_inline --const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, -- const u8 *buf_end) { -+static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, -+ const u8 *buf, const u8 *buf_end) { - assert(buf < buf_end); - - for (buf_end--; buf_end >= buf; buf_end--) { -@@ -111,25 +107,33 @@ DUMP_MSK(128) - #define GET_LO_4(chars) and128(chars, low4bits) - #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) - --static really_inline --u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, -- const m128 compare) { -- m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -- m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -- m128 t = and128(c_lo, c_hi); -+static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, -+ const m128 low4bits, const m128 compare) { -+ m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -+ m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -+ m128 t = and128(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk128AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk128(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk128(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk128(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk128(t); -+ printf("\n"); - #endif - return movemask128(eq128(t, compare)); - } - --static really_inline --const u8 *firstMatch(const u8 *buf, u32 z) { -+static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); -@@ -139,9 +143,9 @@ const u8 *firstMatch(const u8 *buf, u32 z) { - } - } - --static really_inline --const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, -- const m128 low4bits, const m128 zeroes) { -+static really_inline const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, -+ const u8 *buf, const m128 low4bits, -+ const m128 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -@@ -153,13 +157,13 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, +@@ -153,13 +153,13 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, 
assert(buf < buf_end); // Slow path for small cases. - if (buf_end - buf < 16) { -- return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); + if (unlikely(buf_end - buf < 16)) { -+ return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); } const m128 zeroes = zeroes128(); @@ -906,7 +766,7 @@ index 09ffc0c..6231e61 100644 const u8 *rv; size_t min = (size_t)buf % 16; -@@ -179,6 +183,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, +@@ -179,6 +179,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); @@ -918,71 +778,7 @@ index 09ffc0c..6231e61 100644 rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; -@@ -198,10 +207,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf_end; - } - --static really_inline --const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { -+static really_inline const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { - #ifdef DEBUG -- DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF("confirming match in:"); -+ dumpMsk128(t); -+ printf("\n"); - #endif - - u32 z = movemask128(eq128(t, compare)); -@@ -215,20 +225,29 @@ const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { - } - } - -- --static really_inline --const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, -- const m128 low4bits, const m128 zeroes) { -- m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -- m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -- m128 t = and128(c_lo, c_hi); -+static really_inline const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, -+ const u8 *buf, const m128 low4bits, -+ const m128 zeroes) { -+ m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); -+ m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); -+ m128 t = and128(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk128AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk128(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk128(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk128(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk128(t); -+ printf("\n"); - #endif - - return lastMatch(buf, t, zeroes); -@@ -241,12 +260,12 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Slow path for small cases. 
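The hunks above are the functional shufti.c changes this revision keeps: the small-buffer check gains an `unlikely()` branch hint, and the main 16-byte scan loop gains an AArch64 `prfm pldl1keep` prefetch 256 bytes ahead of the current block. A minimal sketch of both idioms, assuming GCC/Clang; `scan` and `my_unlikely` are illustrative names, and the portable `__builtin_prefetch` stands in for the patch's raw inline asm:

#if defined(__GNUC__)
#define my_unlikely(x) __builtin_expect(!!(x), 0)
#else
#define my_unlikely(x) (x)
#endif

static const unsigned char *scan(const unsigned char *buf,
                                 const unsigned char *buf_end) {
    /* branch hint on the rare small-buffer case, as in the hunk above */
    if (my_unlikely(buf_end - buf < 16)) {
        return buf_end; /* the real code falls back to shuftiFwdSlow() here */
    }
    const unsigned char *last_block = buf_end - 16;
    while (buf < last_block) {
#if defined(__aarch64__)
        /* request the cache line 256 bytes ahead, read-only, keep in L1;
         * the patch achieves the same with a raw "prfm pldl1keep" asm */
        __builtin_prefetch(buf + 256, 0, 3);
#endif
        /* ... the 16-byte SIMD block test goes here in shufti ... */
        buf += 16;
    }
    return buf_end;
}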
- if (buf_end - buf < 16) { -- return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); -+ return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); +@@ -246,7 +251,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } const m128 zeroes = zeroes128(); @@ -991,708 +787,27 @@ index 09ffc0c..6231e61 100644 const u8 *rv; assert(buf_end - buf >= 16); -@@ -283,32 +302,48 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf - 1; - } - --static really_inline --const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, -- m128 chars, const u8 *buf, const m128 low4bits, -- const m128 ones) { -+static really_inline const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, -+ m128 mask2_lo, m128 mask2_hi, -+ m128 chars, const u8 *buf, -+ const m128 low4bits, const m128 ones) { - m128 chars_lo = GET_LO_4(chars); - m128 chars_hi = GET_HI_4(chars); -- m128 c_lo = pshufb_m128(mask1_lo, chars_lo); -- m128 c_hi = pshufb_m128(mask1_hi, chars_hi); -- m128 t = or128(c_lo, c_hi); -+ m128 c_lo = pshufb_m128(mask1_lo, chars_lo); -+ m128 c_hi = pshufb_m128(mask1_hi, chars_hi); -+ m128 t = or128(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk128AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk128(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk128(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk128(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk128(t); -+ printf("\n"); - #endif - -- m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); -- m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); -- m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); -+ m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); -+ m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); -+ m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); - - #ifdef DEBUG -- DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); -- DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); -- DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); -+ DEBUG_PRINTF(" c2_lo: "); -+ dumpMsk128(c2_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c2_hi: "); -+ dumpMsk128(c2_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t2: "); -+ dumpMsk128(t2); -+ printf("\n"); - #endif - - u32 z = movemask128(eq128(t2, ones)); -@@ -316,19 +351,18 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, - return firstMatch(buf, z); - } - --const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, -- m128 mask2_lo, m128 mask2_hi, -- const u8 *buf, const u8 *buf_end) { -+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -+ m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +@@ -320,7 +325,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; - - // Preconditioning: most of the time our buffer won't be aligned. 
- m128 chars = loadu128(buf); -- rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, -- chars, buf, low4bits, ones); -+ rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, low4bits, -+ ones); - if (rv) { - return rv; - } -@@ -340,8 +374,13 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, +@@ -340,6 +345,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); -- rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, -- lchars, buf, low4bits, ones); + +#if defined(HAVE_NEON) + __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); +#endif + -+ rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, lchars, buf, -+ low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + lchars, buf, low4bits, ones); if (rv) { - return rv; - } -@@ -351,8 +390,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. - chars = loadu128(buf_end - 16); -- rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, -- chars, buf_end - 16, low4bits, ones); -+ rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf_end - 16, -+ low4bits, ones); - if (rv) { - return rv; - } -@@ -370,26 +409,34 @@ DUMP_MSK(256) - #define GET_LO_4(chars) and256(chars, low4bits) - #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) - --static really_inline --u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, -- const m256 compare) { -- m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -- m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); -+static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, -+ const m256 low4bits, const m256 compare) { -+ m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -+ m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk256AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk256(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk256(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk256(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk256(t); -+ printf("\n"); - #endif - - return movemask256(eq256(t, compare)); - } - --static really_inline --const u8 *firstMatch(const u8 *buf, u32 z) { -+static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { - DEBUG_PRINTF("z 0x%08x\n", z); - if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); -@@ -401,9 +448,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { - } - } - --static really_inline --const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, -- const m256 low4bits) { -+static really_inline const u8 * -+fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); -@@ -415,9 +461,9 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - return firstMatch(buf, z); - } - --static really_inline --const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, 
const u8 *buf, -- const u8 *buf_end, const m256 low4bits) { -+static really_inline const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, -+ const u8 *buf, const u8 *buf_end, -+ const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - m128 chars = loadu128(buf); -@@ -434,9 +480,9 @@ const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf_end; - } - --static really_inline --const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, -- const m256 low4bits, const m256 zeroes) { -+static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, -+ const u8 *buf, const m256 low4bits, -+ const m256 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -@@ -451,8 +497,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Slow path for small cases. - if (buf_end - buf < 16) { -- return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); -+ return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); - } - - const m256 low4bits = set32x8(0xf); -@@ -483,7 +529,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); -- rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); -+ rv = -+ fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); - if (rv) { - return rv; - } -@@ -494,7 +541,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); -- rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); -+ rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, -+ zeroes); - if (rv) { - return rv; - } -@@ -502,8 +550,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf_end; - } - --static really_inline --const u8 *lastMatch(const u8 *buf, u32 z) { -+static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); -@@ -513,28 +560,37 @@ const u8 *lastMatch(const u8 *buf, u32 z) { - } - } - --static really_inline --const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, -- const m256 low4bits, const m256 zeroes) { -- m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -- m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); -- m256 t = and256(c_lo, c_hi); -+static really_inline const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, -+ const u8 *buf, const m256 low4bits, -+ const m256 zeroes) { -+ m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); -+ m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); -+ m256 t = and256(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk256AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk256(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk256(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk256(c_hi); -+ printf("\n"); -+ 
DEBUG_PRINTF(" t: "); -+ dumpMsk256(t); -+ printf("\n"); - #endif - - u32 z = movemask256(eq256(t, zeroes)); - return lastMatch(buf, z); - } - --static really_inline --const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, -- const m256 low4bits) { -+static really_inline const u8 * -+revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); -@@ -546,9 +602,9 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, - return lastMatch(buf, z); - } - --static really_inline --const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, -- const u8 *buf_end, const m256 low4bits) { -+static really_inline const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, -+ const u8 *buf, const u8 *buf_end, -+ const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - -@@ -566,7 +622,6 @@ const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf - 1; - } - -- - /* takes 128 bit masks, but operates on 256 bits of data */ - const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { -@@ -575,8 +630,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Slow path for small cases. - if (buf_end - buf < 16) { -- return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, -- buf, buf_end); -+ return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, -+ buf_end); - } - - const m256 low4bits = set32x8(0xf); -@@ -594,7 +649,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - - // Preconditioning: most of the time our buffer won't be aligned. 
- m256 chars = loadu256(buf_end - 32); -- rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); -+ rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, -+ zeroes); - if (rv) { - return rv; - } -@@ -606,7 +662,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - while (buf_end > last_block) { - buf_end -= 32; - m256 lchars = load256(buf_end); -- rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); -+ rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, -+ zeroes); - if (rv) { - return rv; - } -@@ -623,42 +680,58 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - return buf - 1; - } - --static really_inline --const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, -- m256 chars, const u8 *buf, const m256 low4bits, -- const m256 ones) { -+static really_inline const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, -+ m256 mask2_lo, m256 mask2_hi, -+ m256 chars, const u8 *buf, -+ const m256 low4bits, const m256 ones) { - DEBUG_PRINTF("buf %p\n", buf); - m256 chars_lo = GET_LO_4(chars); - m256 chars_hi = GET_HI_4(chars); -- m256 c_lo = pshufb_m256(mask1_lo, chars_lo); -- m256 c_hi = pshufb_m256(mask1_hi, chars_hi); -- m256 t = or256(c_lo, c_hi); -+ m256 c_lo = pshufb_m256(mask1_lo, chars_lo); -+ m256 c_hi = pshufb_m256(mask1_hi, chars_hi); -+ m256 t = or256(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk256AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk256(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk256(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk256(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk256(t); -+ printf("\n"); - #endif - -- m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); -- m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); -+ m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); -+ m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); - m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); - - #ifdef DEBUG -- DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); -- DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); -- DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); -+ DEBUG_PRINTF(" c2_lo: "); -+ dumpMsk256(c2_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c2_hi: "); -+ dumpMsk256(c2_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t2: "); -+ dumpMsk256(t2); -+ printf("\n"); - #endif - u32 z = movemask256(eq256(t2, ones)); - - return firstMatch(buf, z); - } - --static really_inline --const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, -- const m256 low4bits) { -+static really_inline const u8 *fwdBlockShort2(m256 mask1, m256 mask2, -+ m128 chars, const u8 *buf, -+ const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); -@@ -672,9 +745,10 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, - return firstMatch(buf, z); - } - --static really_inline --const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -- m128 mask2_hi, const u8 *buf, const u8 *buf_end) { -+static really_inline const u8 
*shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, -+ m128 mask2_lo, m128 mask2_hi, -+ const u8 *buf, -+ const u8 *buf_end) { - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - const m256 low4bits = set32x8(0xf); - // run shufti over two overlapping 16-byte unaligned reads -@@ -695,9 +769,8 @@ const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - } - - /* takes 128 bit masks, but operates on 256 bits of data */ --const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, -- m128 mask2_lo, m128 mask2_hi, -- const u8 *buf, const u8 *buf_end) { -+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -+ m128 mask2_hi, const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); -@@ -731,8 +804,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); -- rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, -- lchars, buf, low4bits, ones); -+ rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, -+ wide_mask2_hi, lchars, buf, low4bits, ones); - if (rv) { - return rv; - } -@@ -757,26 +830,34 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - DUMP_MSK(512) - #endif - --static really_inline --u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, -- const m512 compare) { -+static really_inline u64a block(m512 mask_lo, m512 mask_hi, m512 chars, -+ const m512 low4bits, const m512 compare) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); -- m512 c_hi = pshufb_m512(mask_hi, -- rshift64_m512(andnot512(low4bits, chars), 4)); -+ m512 c_hi = -+ pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk512AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk512(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk512(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk512(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk512(t); -+ printf("\n"); - #endif - - return eq512mask(t, compare); - } --static really_inline --const u8 *firstMatch64(const u8 *buf, u64a z) { -+static really_inline const u8 *firstMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = ctz64(~z); -@@ -788,18 +869,19 @@ const u8 *firstMatch64(const u8 *buf, u64a z) { - } - } - --static really_inline --const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, -- const m512 low4bits, const m512 zeroes) { -+static really_inline const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, -+ m512 chars, const u8 *buf, -+ const m512 low4bits, -+ const m512 zeroes) { - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch64(buf, z); - } - --static really_inline --const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, -- const u8 *buf_end, const m512 low4bits, -- const m512 zeroes) { -+static really_inline const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, -+ const u8 *buf, const u8 
*buf_end, -+ const m512 low4bits, -+ const m512 zeroes) { - DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); -@@ -877,8 +959,7 @@ done: - return buf_end; - } - --static really_inline --const u8 *lastMatch64(const u8 *buf, u64a z) { -+static really_inline const u8 *lastMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = clz64(~z); -@@ -889,10 +970,10 @@ const u8 *lastMatch64(const u8 *buf, u64a z) { - } - } - --static really_inline --const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, -- const u8 *buf_end, const m512 low4bits, -- const m512 zeroes) { -+static really_inline const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, -+ const u8 *buf, const u8 *buf_end, -+ const m512 low4bits, -+ const m512 zeroes) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); -@@ -909,20 +990,31 @@ const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - return lastMatch64(buf, z | ~k); - } - --static really_inline --const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, -- const m512 low4bits, const m512 zeroes) { -- m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); -- m512 c_hi = pshufb_m512(mask_hi, -- rshift64_m512(andnot512(low4bits, chars), 4)); -- m512 t = and512(c_lo, c_hi); -+static really_inline const u8 *revBlock512(m512 mask_lo, m512 mask_hi, -+ m512 chars, const u8 *buf, -+ const m512 low4bits, -+ const m512 zeroes) { -+ m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); -+ m512 c_hi = -+ pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); -+ m512 t = and512(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); -- DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk512AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk512(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk512(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk512(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk512(t); -+ printf("\n"); - #endif - - u64a z = eq512mask(t, zeroes); -@@ -985,43 +1077,60 @@ done: - return buf - 1; - } - --static really_inline --const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, -- m512 chars, const u8 *buf, const m512 low4bits, -- const m512 ones, __mmask64 k) { -+static really_inline const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, -+ m512 mask2_lo, m512 mask2_hi, -+ m512 chars, const u8 *buf, -+ const m512 low4bits, const m512 ones, -+ __mmask64 k) { - DEBUG_PRINTF("buf %p %.64s\n", buf, buf); - m512 chars_lo = and512(chars, low4bits); - m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); -- m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); -- m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); -- m512 t = or512(c_lo, c_hi); -+ m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); -+ m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); -+ m512 t = or512(c_lo, c_hi); - - #ifdef DEBUG -- DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); -- DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); -- DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); -- 
DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); -- DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -+ DEBUG_PRINTF(" chars: "); -+ dumpMsk512AsChars(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" char: "); -+ dumpMsk512(chars); -+ printf("\n"); -+ DEBUG_PRINTF(" c_lo: "); -+ dumpMsk512(c_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c_hi: "); -+ dumpMsk512(c_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t: "); -+ dumpMsk512(t); -+ printf("\n"); - #endif - -- m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); -- m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); -+ m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); -+ m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); - m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); - - #ifdef DEBUG -- DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); -- DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); -- DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); -+ DEBUG_PRINTF(" c2_lo: "); -+ dumpMsk512(c2_lo); -+ printf("\n"); -+ DEBUG_PRINTF(" c2_hi: "); -+ dumpMsk512(c2_hi); -+ printf("\n"); -+ DEBUG_PRINTF(" t2: "); -+ dumpMsk512(t2); -+ printf("\n"); - #endif - u64a z = eq512mask(t2, ones); - - return firstMatch64(buf, z | ~k); - } - --static really_inline --const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, -- m512 mask2_hi, const u8 *buf, const u8 *buf_end, -- const m512 low4bits, const m512 ones) { -+static really_inline const u8 * -+shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, -+ const u8 *buf, const u8 *buf_end, const m512 low4bits, -+ const m512 ones) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); -@@ -1038,9 +1147,8 @@ const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, - } - - /* takes 128 bit masks, but operates on 512 bits of data */ --const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, -- m128 mask2_lo, m128 mask2_hi, -- const u8 *buf, const u8 *buf_end) { -+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, -+ m128 mask2_hi, const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index be6b312..c05d778 100644 --- a/src/nfa/truffle.c @@ -2122,28 +1237,26 @@ index 985fec6..fe4a910 100644 + #endif // UTIL_ARCH_H_ diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c -index c00ce58..e0f6368 100644 +index c00ce58..96286ee 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c -@@ -39,7 +39,7 @@ - +@@ -40,6 +40,7 @@ u64a cpuid_flags(void) { u64a cap = 0; -- + +#if defined(__X86_64__) if (check_avx2()) { DEBUG_PRINTF("AVX2 enabled\n"); cap |= HS_CPU_FEATURES_AVX2; -@@ -68,7 +68,7 @@ u64a cpuid_flags(void) { +@@ -67,6 +68,7 @@ u64a cpuid_flags(void) { + #if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \ (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) cap &= ~HS_CPU_FEATURES_AVX512VBMI; - #endif -- +#endif - return cap; - } + #endif -@@ -78,6 +78,7 @@ struct family_id { + return cap; +@@ -78,6 +80,7 @@ struct family_id { u32 tune; }; @@ -2151,7 +1264,7 @@ index c00ce58..e0f6368 100644 /* from table 35-1 of the Intel 64 and IA32 Arch. 
Software Developer's Manual * and "Intel Architecture and Processor Identification With CPUID Model and * Family Numbers" */ -@@ -121,6 +122,7 @@ static const struct family_id known_microarch[] = { +@@ -121,6 +124,7 @@ static const struct family_id known_microarch[] = { { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ }; @@ -2159,7 +1272,7 @@ index c00ce58..e0f6368 100644 #ifdef DUMP_SUPPORT static UNUSED -@@ -144,6 +146,7 @@ const char *dumpTune(u32 tune) { +@@ -144,6 +148,7 @@ const char *dumpTune(u32 tune) { #endif u32 cpuid_tune(void) { @@ -2167,12 +1280,12 @@ index c00ce58..e0f6368 100644 unsigned int eax, ebx, ecx, edx; cpuid(1, 0, &eax, &ebx, &ecx, &edx); -@@ -171,6 +174,6 @@ u32 cpuid_tune(void) { +@@ -171,6 +176,7 @@ u32 cpuid_tune(void) { DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); return tune; } -- +#endif + return HS_TUNE_FAMILY_GENERIC; } diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h @@ -3380,17 +2493,11 @@ index 0000000..cce119f + +#endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h -index 962cad6..62d39ec 100644 +index 962cad6..b3f96ea 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h -@@ -30,28 +30,58 @@ - #define SIMD_TYPES_H - - #include "config.h" -+#include "ue2common.h" - #include "util/arch.h" - #include "util/intrinsics.h" --#include "ue2common.h" +@@ -35,6 +35,23 @@ + #include "ue2common.h" #if defined(HAVE_SSE2) +typedef __m128i m128; @@ -3412,2895 +2519,45 @@ index 962cad6..62d39ec 100644 + typedef __m128i m128; #else --typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; -+typedef struct ALIGN_DIRECTIVE { -+ u64a hi; -+ u64a lo; -+} m128; -+ - #endif - - #if defined(HAVE_AVX2) - typedef __m256i m256; - #else --typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; -+typedef struct ALIGN_AVX_DIRECTIVE { -+ m128 lo; -+ m128 hi; -+} m256; - #endif - --typedef struct {m128 lo; m128 mid; m128 hi;} m384; -+typedef struct { -+ m128 lo; -+ m128 mid; -+ m128 hi; -+} m384; - #if defined(HAVE_AVX512) - typedef __m512i m512; - #else --typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; -+typedef struct ALIGN_ATTR(64) { -+ m256 lo; -+ m256 hi; -+} m512; - #endif - - #endif /* SIMD_TYPES_H */ -- + typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h -index d1f060b..7e926b2 100644 ---- a/src/util/simd_utils.h +new file mode 100644 +index 0000000..9588d97 +--- /dev/null +++ b/src/util/simd_utils.h -@@ -26,1395 +26,14 @@ - * POSSIBILITY OF SUCH DAMAGE. - */ - --/** \file -- * \brief SIMD types and primitive operations. -- */ -- - #ifndef SIMD_UTILS - #define SIMD_UTILS - --#if !defined(_WIN32) && !defined(__SSSE3__) --#error SSSE3 instructions must be enabled --#endif -- --#include "config.h" --#include "ue2common.h" --#include "simd_types.h" --#include "unaligned.h" --#include "util/arch.h" --#include "util/intrinsics.h" -- --#include // for memcpy -- --// Define a common assume_aligned using an appropriate compiler built-in, if --// it's available. Note that we need to handle C or C++ compilation. --#ifdef __cplusplus --# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED --# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) --# endif --#else --# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED --# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) --# endif --#endif -- --// Fallback to identity case. 
--#ifndef assume_aligned --#define assume_aligned(x, y) (x) --#endif -- --#ifdef __cplusplus --extern "C" { --#endif --extern const char vbs_mask_data[]; --#ifdef __cplusplus --} --#endif -- --static really_inline m128 ones128(void) { --#if defined(__GNUC__) || defined(__INTEL_COMPILER) -- /* gcc gets this right */ -- return _mm_set1_epi8(0xFF); --#else -- /* trick from Intel's optimization guide to generate all-ones. -- * ICC converts this to the single cmpeq instruction */ -- return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); --#endif --} -- --static really_inline m128 zeroes128(void) { -- return _mm_setzero_si128(); --} -- --/** \brief Bitwise not for m128*/ --static really_inline m128 not128(m128 a) { -- return _mm_xor_si128(a, ones128()); --} -- --/** \brief Return 1 if a and b are different otherwise 0 */ --static really_inline int diff128(m128 a, m128 b) { -- return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); --} -- --static really_inline int isnonzero128(m128 a) { -- return !!diff128(a, zeroes128()); --} -- --/** -- * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline u32 diffrich128(m128 a, m128 b) { -- a = _mm_cmpeq_epi32(a, b); -- return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; --} -- --/** -- * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and -- * returns a 4-bit mask indicating which 64-bit words contain differences. -- */ --static really_inline u32 diffrich64_128(m128 a, m128 b) { --#if defined(HAVE_SSE41) -- a = _mm_cmpeq_epi64(a, b); -- return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; --#else -- u32 d = diffrich128(a, b); -- return (d | (d >> 1)) & 0x5; --#endif --} -- --static really_really_inline --m128 lshift64_m128(m128 a, unsigned b) { --#if defined(HAVE__BUILTIN_CONSTANT_P) -- if (__builtin_constant_p(b)) { -- return _mm_slli_epi64(a, b); -- } --#endif -- m128 x = _mm_cvtsi32_si128(b); -- return _mm_sll_epi64(a, x); --} -- --#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) --#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) --#define movemask128(a) ((u32)_mm_movemask_epi8((a))) -- --#if defined(HAVE_AVX512) --static really_inline m128 cast512to128(const m512 in) { -- return _mm512_castsi512_si128(in); --} --#endif -- --static really_inline m128 set16x8(u8 c) { -- return _mm_set1_epi8(c); --} -- --static really_inline m128 set4x32(u32 c) { -- return _mm_set1_epi32(c); --} -- --static really_inline u32 movd(const m128 in) { -- return _mm_cvtsi128_si32(in); --} -- --#if defined(HAVE_AVX512) --static really_inline u32 movd512(const m512 in) { -- // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), -- // so we use 2-step convertions to work around. -- return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); --} -- --static really_inline u64a movq512(const m512 in) { -- // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), -- // so we use 2-step convertions to work around. 
-- return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); --} --#endif -- --static really_inline u64a movq(const m128 in) { --#if defined(ARCH_X86_64) -- return _mm_cvtsi128_si64(in); --#else // 32-bit - this is horrific -- u32 lo = movd(in); -- u32 hi = movd(_mm_srli_epi64(in, 32)); -- return (u64a)hi << 32 | lo; --#endif --} -- --/* another form of movq */ --static really_inline --m128 load_m128_from_u64a(const u64a *p) { -- return _mm_set_epi64x(0LL, *p); --} -- --#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) --#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) -- --#if defined(HAVE_SSE41) --#define extract32from128(a, imm) _mm_extract_epi32(a, imm) --#define extract64from128(a, imm) _mm_extract_epi64(a, imm) --#else --#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) --#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) --#endif -- --#if !defined(HAVE_AVX2) --// TODO: this entire file needs restructuring - this carveout is awful --#define extractlow64from256(a) movq(a.lo) --#define extractlow32from256(a) movd(a.lo) --#if defined(HAVE_SSE41) --#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) --#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) --#else --#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) --#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) --#endif -- --#endif // !AVX2 -- --static really_inline m128 and128(m128 a, m128 b) { -- return _mm_and_si128(a,b); --} -- --static really_inline m128 xor128(m128 a, m128 b) { -- return _mm_xor_si128(a,b); --} -- --static really_inline m128 or128(m128 a, m128 b) { -- return _mm_or_si128(a,b); --} -- --#if defined(HAVE_AVX512VBMI) --static really_inline m512 expand128(m128 a) { -- return _mm512_broadcast_i32x4(a); --} -- --static really_inline m512 expand256(m256 a) { -- return _mm512_broadcast_i64x4(a); --} -- --static really_inline m512 expand384(m384 a) { -- u64a *lo = (u64a*)&a.lo; -- u64a *mid = (u64a*)&a.mid; -- u64a *hi = (u64a*)&a.hi; -- return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], -- lo[1], lo[0]); --} +@@ -0,0 +1,13 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++// Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
++ ++#ifndef SIMD_UTILS ++#define SIMD_UTILS ++ +#if defined(__x86_64__) +#include "simd_x86.h" +#elif defined(__aarch64__) +#include "simd_arm.h" - #endif - --static really_inline m128 andnot128(m128 a, m128 b) { -- return _mm_andnot_si128(a, b); --} -- --// aligned load --static really_inline m128 load128(const void *ptr) { -- assert(ISALIGNED_N(ptr, alignof(m128))); -- ptr = assume_aligned(ptr, 16); -- return _mm_load_si128((const m128 *)ptr); --} -- --// aligned store --static really_inline void store128(void *ptr, m128 a) { -- assert(ISALIGNED_N(ptr, alignof(m128))); -- ptr = assume_aligned(ptr, 16); -- *(m128 *)ptr = a; --} -- --// unaligned load --static really_inline m128 loadu128(const void *ptr) { -- return _mm_loadu_si128((const m128 *)ptr); --} -- --// unaligned store --static really_inline void storeu128(void *ptr, m128 a) { -- _mm_storeu_si128 ((m128 *)ptr, a); --} -- --// packed unaligned store of first N bytes --static really_inline --void storebytes128(void *ptr, m128 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m128 loadbytes128(const void *ptr, unsigned int n) { -- m128 a = zeroes128(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --#ifdef __cplusplus --extern "C" { --#endif --extern const u8 simd_onebit_masks[]; --#ifdef __cplusplus --} - #endif - --static really_inline --m128 mask1bit128(unsigned int n) { -- assert(n < sizeof(m128) * 8); -- u32 mask_idx = ((n % 8) * 64) + 95; -- mask_idx -= n / 8; -- return loadu128(&simd_onebit_masks[mask_idx]); --} -- --// switches on bit N in the given vector. --static really_inline --void setbit128(m128 *ptr, unsigned int n) { -- *ptr = or128(mask1bit128(n), *ptr); --} -- --// switches off bit N in the given vector. --static really_inline --void clearbit128(m128 *ptr, unsigned int n) { -- *ptr = andnot128(mask1bit128(n), *ptr); --} -- --// tests bit N in the given vector. 
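The new simd_utils.h above is the heart of the port: instead of an x86-only implementation, it becomes a thin dispatch header that selects simd_x86.h or simd_arm.h by target architecture, while call sites switch from raw intrinsics such as `_mm_set1_epi8(0xf)` to backend-provided wrappers like `set16x8(0xf)` (visible in the shufti.c hunks earlier). A sketch of the same idiom under assumed names — `my_m128` and `my_set16x8` are illustrative, not hyperscan's real definitions, which live in simd_types.h and the per-arch headers:

#if defined(__x86_64__)
#include <emmintrin.h>
typedef __m128i my_m128;
static inline my_m128 my_set16x8(unsigned char c) {
    /* SSE2 backend: broadcast one byte across the 128-bit lane */
    return _mm_set1_epi8((char)c);
}
#elif defined(__aarch64__)
#include <arm_neon.h>
typedef uint8x16_t my_m128;
static inline my_m128 my_set16x8(unsigned char c) {
    /* NEON backend: the same broadcast via vdupq */
    return vdupq_n_u8(c);
}
#endif

Each backend header must supply the full wrapper vocabulary (set16x8, pshufb_m128, movemask128, ...), so the scan loops compile unchanged on both architectures.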
--static really_inline --char testbit128(m128 val, unsigned int n) { -- const m128 mask = mask1bit128(n); --#if defined(HAVE_SSE41) -- return !_mm_testz_si128(mask, val); --#else -- return isnonzero128(and128(mask, val)); --#endif --} -- --// offset must be an immediate --#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) -- --static really_inline --m128 pshufb_m128(m128 a, m128 b) { -- m128 result; -- result = _mm_shuffle_epi8(a, b); -- return result; --} -- --static really_inline --m256 pshufb_m256(m256 a, m256 b) { --#if defined(HAVE_AVX2) -- return _mm256_shuffle_epi8(a, b); --#else -- m256 rv; -- rv.lo = pshufb_m128(a.lo, b.lo); -- rv.hi = pshufb_m128(a.hi, b.hi); -- return rv; --#endif --} -- --#if defined(HAVE_AVX512) --static really_inline --m512 pshufb_m512(m512 a, m512 b) { -- return _mm512_shuffle_epi8(a, b); --} -- --static really_inline --m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { -- return _mm512_maskz_shuffle_epi8(k, a, b); --} -- --#if defined(HAVE_AVX512VBMI) --#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) --#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) --#endif -- --#endif -- --static really_inline --m128 variable_byte_shift_m128(m128 in, s32 amount) { -- assert(amount >= -16 && amount <= 16); -- m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); -- return pshufb_m128(in, shift_mask); --} -- --static really_inline --m128 max_u8_m128(m128 a, m128 b) { -- return _mm_max_epu8(a, b); --} -- --static really_inline --m128 min_u8_m128(m128 a, m128 b) { -- return _mm_min_epu8(a, b); --} -- --static really_inline --m128 sadd_u8_m128(m128 a, m128 b) { -- return _mm_adds_epu8(a, b); --} -- --static really_inline --m128 sub_u8_m128(m128 a, m128 b) { -- return _mm_sub_epi8(a, b); --} -- --static really_inline --m128 set64x2(u64a hi, u64a lo) { -- return _mm_set_epi64x(hi, lo); --} -- --/**** -- **** 256-bit Primitives -- ****/ -- --#if defined(HAVE_AVX2) -- --static really_really_inline --m256 lshift64_m256(m256 a, unsigned b) { --#if defined(HAVE__BUILTIN_CONSTANT_P) -- if (__builtin_constant_p(b)) { -- return _mm256_slli_epi64(a, b); -- } --#endif -- m128 x = _mm_cvtsi32_si128(b); -- return _mm256_sll_epi64(a, x); --} -- --#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) -- --static really_inline --m256 set32x8(u32 in) { -- return _mm256_set1_epi8(in); --} -- --#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) --#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) -- --static really_inline --m256 set2x128(m128 a) { -- return _mm256_broadcastsi128_si256(a); --} -- --#else -- --static really_really_inline --m256 lshift64_m256(m256 a, int b) { -- m256 rv = a; -- rv.lo = lshift64_m128(rv.lo, b); -- rv.hi = lshift64_m128(rv.hi, b); -- return rv; --} -- --static really_inline --m256 rshift64_m256(m256 a, int b) { -- m256 rv = a; -- rv.lo = rshift64_m128(rv.lo, b); -- rv.hi = rshift64_m128(rv.hi, b); -- return rv; --} --static really_inline --m256 set32x8(u32 in) { -- m256 rv; -- rv.lo = set16x8((u8) in); -- rv.hi = rv.lo; -- return rv; --} -- --static really_inline --m256 eq256(m256 a, m256 b) { -- m256 rv; -- rv.lo = eq128(a.lo, b.lo); -- rv.hi = eq128(a.hi, b.hi); -- return rv; --} -- --static really_inline --u32 movemask256(m256 a) { -- u32 lo_mask = movemask128(a.lo); -- u32 hi_mask = movemask128(a.hi); -- return lo_mask | (hi_mask << 16); --} -- --static really_inline --m256 set2x128(m128 a) { -- m256 rv = {a, a}; -- return rv; --} --#endif -- --static really_inline m256 zeroes256(void) { --#if 
defined(HAVE_AVX2) -- return _mm256_setzero_si256(); --#else -- m256 rv = {zeroes128(), zeroes128()}; -- return rv; --#endif --} -- --static really_inline m256 ones256(void) { --#if defined(HAVE_AVX2) -- m256 rv = _mm256_set1_epi8(0xFF); --#else -- m256 rv = {ones128(), ones128()}; --#endif -- return rv; --} -- --#if defined(HAVE_AVX2) --static really_inline m256 and256(m256 a, m256 b) { -- return _mm256_and_si256(a, b); --} --#else --static really_inline m256 and256(m256 a, m256 b) { -- m256 rv; -- rv.lo = and128(a.lo, b.lo); -- rv.hi = and128(a.hi, b.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 or256(m256 a, m256 b) { -- return _mm256_or_si256(a, b); --} --#else --static really_inline m256 or256(m256 a, m256 b) { -- m256 rv; -- rv.lo = or128(a.lo, b.lo); -- rv.hi = or128(a.hi, b.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 xor256(m256 a, m256 b) { -- return _mm256_xor_si256(a, b); --} --#else --static really_inline m256 xor256(m256 a, m256 b) { -- m256 rv; -- rv.lo = xor128(a.lo, b.lo); -- rv.hi = xor128(a.hi, b.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 not256(m256 a) { -- return _mm256_xor_si256(a, ones256()); --} --#else --static really_inline m256 not256(m256 a) { -- m256 rv; -- rv.lo = not128(a.lo); -- rv.hi = not128(a.hi); -- return rv; --} --#endif -- --#if defined(HAVE_AVX2) --static really_inline m256 andnot256(m256 a, m256 b) { -- return _mm256_andnot_si256(a, b); --} --#else --static really_inline m256 andnot256(m256 a, m256 b) { -- m256 rv; -- rv.lo = andnot128(a.lo, b.lo); -- rv.hi = andnot128(a.hi, b.hi); -- return rv; --} --#endif -- --static really_inline int diff256(m256 a, m256 b) { --#if defined(HAVE_AVX2) -- return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); --#else -- return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); --#endif --} -- --static really_inline int isnonzero256(m256 a) { --#if defined(HAVE_AVX2) -- return !!diff256(a, zeroes256()); --#else -- return isnonzero128(or128(a.lo, a.hi)); --#endif --} -- --/** -- * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline u32 diffrich256(m256 a, m256 b) { --#if defined(HAVE_AVX2) -- a = _mm256_cmpeq_epi32(a, b); -- return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; --#else -- m128 z = zeroes128(); -- a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -- a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); -- return ~(_mm_movemask_epi8(packed)) & 0xff; --#endif --} -- --/** -- * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and -- * returns an 8-bit mask indicating which 64-bit words contain differences. 
-- */ --static really_inline u32 diffrich64_256(m256 a, m256 b) { -- u32 d = diffrich256(a, b); -- return (d | (d >> 1)) & 0x55555555; --} -- --// aligned load --static really_inline m256 load256(const void *ptr) { -- assert(ISALIGNED_N(ptr, alignof(m256))); --#if defined(HAVE_AVX2) -- return _mm256_load_si256((const m256 *)ptr); --#else -- m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; -- return rv; --#endif --} -- --// aligned load of 128-bit value to low and high part of 256-bit value --static really_inline m256 load2x128(const void *ptr) { --#if defined(HAVE_AVX2) -- return set2x128(load128(ptr)); --#else -- assert(ISALIGNED_N(ptr, alignof(m128))); -- m256 rv; -- rv.hi = rv.lo = load128(ptr); -- return rv; --#endif --} -- --static really_inline m256 loadu2x128(const void *ptr) { -- return set2x128(loadu128(ptr)); --} -- --// aligned store --static really_inline void store256(void *ptr, m256 a) { -- assert(ISALIGNED_N(ptr, alignof(m256))); --#if defined(HAVE_AVX2) -- _mm256_store_si256((m256 *)ptr, a); --#else -- ptr = assume_aligned(ptr, 16); -- *(m256 *)ptr = a; --#endif --} -- --// unaligned load --static really_inline m256 loadu256(const void *ptr) { --#if defined(HAVE_AVX2) -- return _mm256_loadu_si256((const m256 *)ptr); --#else -- m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; -- return rv; --#endif --} -- --// unaligned store --static really_inline void storeu256(void *ptr, m256 a) { --#if defined(HAVE_AVX2) -- _mm256_storeu_si256((m256 *)ptr, a); --#else -- storeu128(ptr, a.lo); -- storeu128((char *)ptr + 16, a.hi); --#endif --} -- --// packed unaligned store of first N bytes --static really_inline --void storebytes256(void *ptr, m256 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m256 loadbytes256(const void *ptr, unsigned int n) { -- m256 a = zeroes256(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --static really_inline --m256 mask1bit256(unsigned int n) { -- assert(n < sizeof(m256) * 8); -- u32 mask_idx = ((n % 8) * 64) + 95; -- mask_idx -= n / 8; -- return loadu256(&simd_onebit_masks[mask_idx]); --} -- --static really_inline --m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { --#if defined(HAVE_AVX2) -- return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); --#else -- m256 rv; -- rv.hi = set64x2(hi_1, hi_0); -- rv.lo = set64x2(lo_1, lo_0); -- return rv; --#endif --} -- --#if !defined(HAVE_AVX2) --// switches on bit N in the given vector. --static really_inline --void setbit256(m256 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 128; -- } -- setbit128(sub, n); --} -- --// switches off bit N in the given vector. --static really_inline --void clearbit256(m256 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 128; -- } -- clearbit128(sub, n); --} -- --// tests bit N in the given vector. 
--static really_inline --char testbit256(m256 val, unsigned int n) { -- assert(n < sizeof(val) * 8); -- m128 sub; -- if (n < 128) { -- sub = val.lo; -- } else { -- sub = val.hi; -- n -= 128; -- } -- return testbit128(sub, n); --} -- --static really_really_inline --m128 movdq_hi(m256 x) { -- return x.hi; --} -- --static really_really_inline --m128 movdq_lo(m256 x) { -- return x.lo; --} -- --static really_inline --m256 combine2x128(m128 hi, m128 lo) { -- m256 rv = {lo, hi}; -- return rv; --} -- --#else // AVX2 -- --// switches on bit N in the given vector. --static really_inline --void setbit256(m256 *ptr, unsigned int n) { -- *ptr = or256(mask1bit256(n), *ptr); --} -- --static really_inline --void clearbit256(m256 *ptr, unsigned int n) { -- *ptr = andnot256(mask1bit256(n), *ptr); --} -- --// tests bit N in the given vector. --static really_inline --char testbit256(m256 val, unsigned int n) { -- const m256 mask = mask1bit256(n); -- return !_mm256_testz_si256(mask, val); --} -- --static really_really_inline --m128 movdq_hi(m256 x) { -- return _mm256_extracti128_si256(x, 1); --} -- --static really_really_inline --m128 movdq_lo(m256 x) { -- return _mm256_extracti128_si256(x, 0); --} -- --#define cast256to128(a) _mm256_castsi256_si128(a) --#define cast128to256(a) _mm256_castsi128_si256(a) --#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) --#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) --#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) --#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) --#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) --#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) --#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) --#define extractlow32from256(a) movd(cast256to128(a)) --#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) --#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) --#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) -- --static really_inline --m256 combine2x128(m128 hi, m128 lo) { --#if defined(_mm256_set_m128i) -- return _mm256_set_m128i(hi, lo); --#else -- return insert128to256(cast128to256(lo), hi, 1); --#endif --} --#endif //AVX2 -- --#if defined(HAVE_AVX512) --#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) --#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) --#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) --#define set2x256(a) _mm512_broadcast_i64x4(a) --#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) --#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) --#endif -- --/**** -- **** 384-bit Primitives -- ****/ -- --static really_inline m384 and384(m384 a, m384 b) { -- m384 rv; -- rv.lo = and128(a.lo, b.lo); -- rv.mid = and128(a.mid, b.mid); -- rv.hi = and128(a.hi, b.hi); -- return rv; --} -- --static really_inline m384 or384(m384 a, m384 b) { -- m384 rv; -- rv.lo = or128(a.lo, b.lo); -- rv.mid = or128(a.mid, b.mid); -- rv.hi = or128(a.hi, b.hi); -- return rv; --} -- --static really_inline m384 xor384(m384 a, m384 b) { -- m384 rv; -- rv.lo = xor128(a.lo, b.lo); -- rv.mid = xor128(a.mid, b.mid); -- rv.hi = xor128(a.hi, b.hi); -- return rv; --} --static really_inline m384 not384(m384 a) { -- m384 rv; -- rv.lo = not128(a.lo); -- rv.mid = not128(a.mid); -- rv.hi = not128(a.hi); -- return rv; --} --static really_inline m384 andnot384(m384 a, m384 b) { -- m384 rv; -- rv.lo = 
andnot128(a.lo, b.lo); -- rv.mid = andnot128(a.mid, b.mid); -- rv.hi = andnot128(a.hi, b.hi); -- return rv; --} -- --static really_really_inline --m384 lshift64_m384(m384 a, unsigned b) { -- m384 rv; -- rv.lo = lshift64_m128(a.lo, b); -- rv.mid = lshift64_m128(a.mid, b); -- rv.hi = lshift64_m128(a.hi, b); -- return rv; --} -- --static really_inline m384 zeroes384(void) { -- m384 rv = {zeroes128(), zeroes128(), zeroes128()}; -- return rv; --} -- --static really_inline m384 ones384(void) { -- m384 rv = {ones128(), ones128(), ones128()}; -- return rv; --} -- --static really_inline int diff384(m384 a, m384 b) { -- return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); --} -- --static really_inline int isnonzero384(m384 a) { -- return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); --} -- --/** -- * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline u32 diffrich384(m384 a, m384 b) { -- m128 z = zeroes128(); -- a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -- a.mid = _mm_cmpeq_epi32(a.mid, b.mid); -- a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), -- _mm_packs_epi32(a.hi, z)); -- return ~(_mm_movemask_epi8(packed)) & 0xfff; --} -- --/** -- * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and -- * returns a 12-bit mask indicating which 64-bit words contain differences. -- */ --static really_inline u32 diffrich64_384(m384 a, m384 b) { -- u32 d = diffrich384(a, b); -- return (d | (d >> 1)) & 0x55555555; --} -- --// aligned load --static really_inline m384 load384(const void *ptr) { -- assert(ISALIGNED_16(ptr)); -- m384 rv = { load128(ptr), load128((const char *)ptr + 16), -- load128((const char *)ptr + 32) }; -- return rv; --} -- --// aligned store --static really_inline void store384(void *ptr, m384 a) { -- assert(ISALIGNED_16(ptr)); -- ptr = assume_aligned(ptr, 16); -- *(m384 *)ptr = a; --} -- --// unaligned load --static really_inline m384 loadu384(const void *ptr) { -- m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), -- loadu128((const char *)ptr + 32)}; -- return rv; --} -- --// packed unaligned store of first N bytes --static really_inline --void storebytes384(void *ptr, m384 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m384 loadbytes384(const void *ptr, unsigned int n) { -- m384 a = zeroes384(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --// switches on bit N in the given vector. --static really_inline --void setbit384(m384 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else if (n < 256) { -- sub = &ptr->mid; -- } else { -- sub = &ptr->hi; -- } -- setbit128(sub, n % 128); --} -- --// switches off bit N in the given vector. --static really_inline --void clearbit384(m384 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo; -- } else if (n < 256) { -- sub = &ptr->mid; -- } else { -- sub = &ptr->hi; -- } -- clearbit128(sub, n % 128); --} -- --// tests bit N in the given vector. 
--static really_inline --char testbit384(m384 val, unsigned int n) { -- assert(n < sizeof(val) * 8); -- m128 sub; -- if (n < 128) { -- sub = val.lo; -- } else if (n < 256) { -- sub = val.mid; -- } else { -- sub = val.hi; -- } -- return testbit128(sub, n % 128); --} -- --/**** -- **** 512-bit Primitives -- ****/ -- --#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) --#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -- --static really_inline --m512 zeroes512(void) { --#if defined(HAVE_AVX512) -- return _mm512_setzero_si512(); --#else -- m512 rv = {zeroes256(), zeroes256()}; -- return rv; --#endif --} -- --static really_inline --m512 ones512(void) { --#if defined(HAVE_AVX512) -- return _mm512_set1_epi8(0xFF); -- //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); --#else -- m512 rv = {ones256(), ones256()}; -- return rv; --#endif --} -- --#if defined(HAVE_AVX512) --static really_inline --m512 set64x8(u8 a) { -- return _mm512_set1_epi8(a); --} -- --static really_inline --m512 set8x64(u64a a) { -- return _mm512_set1_epi64(a); --} -- --static really_inline --m512 set16x32(u32 a) { -- return _mm512_set1_epi32(a); --} -- --static really_inline --m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, -- u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { -- return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, -- lo_3, lo_2, lo_1, lo_0); --} -- --static really_inline --m512 swap256in512(m512 a) { -- m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); -- return vpermq512(idx, a); --} -- --static really_inline --m512 set4x128(m128 a) { -- return _mm512_broadcast_i32x4(a); --} -- --static really_inline --m512 sadd_u8_m512(m512 a, m512 b) { -- return _mm512_adds_epu8(a, b); --} -- --static really_inline --m512 max_u8_m512(m512 a, m512 b) { -- return _mm512_max_epu8(a, b); --} -- --static really_inline --m512 min_u8_m512(m512 a, m512 b) { -- return _mm512_min_epu8(a, b); --} -- --static really_inline --m512 sub_u8_m512(m512 a, m512 b) { -- return _mm512_sub_epi8(a, b); --} --#endif -- --static really_inline --m512 and512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_and_si512(a, b); --#else -- m512 rv; -- rv.lo = and256(a.lo, b.lo); -- rv.hi = and256(a.hi, b.hi); -- return rv; --#endif --} -- --static really_inline --m512 or512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_or_si512(a, b); --#else -- m512 rv; -- rv.lo = or256(a.lo, b.lo); -- rv.hi = or256(a.hi, b.hi); -- return rv; --#endif --} -- --static really_inline --m512 xor512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_xor_si512(a, b); --#else -- m512 rv; -- rv.lo = xor256(a.lo, b.lo); -- rv.hi = xor256(a.hi, b.hi); -- return rv; --#endif --} -- --static really_inline --m512 not512(m512 a) { --#if defined(HAVE_AVX512) -- return _mm512_xor_si512(a, ones512()); --#else -- m512 rv; -- rv.lo = not256(a.lo); -- rv.hi = not256(a.hi); -- return rv; --#endif --} -- --static really_inline --m512 andnot512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_andnot_si512(a, b); --#else -- m512 rv; -- rv.lo = andnot256(a.lo, b.lo); -- rv.hi = andnot256(a.hi, b.hi); -- return rv; --#endif --} -- --#if defined(HAVE_AVX512) --static really_really_inline --m512 lshift64_m512(m512 a, unsigned b) { --#if defined(HAVE__BUILTIN_CONSTANT_P) -- if (__builtin_constant_p(b)) { -- return _mm512_slli_epi64(a, b); -- } --#endif -- m128 x = _mm_cvtsi32_si128(b); -- return _mm512_sll_epi64(a, x); --} --#else --static really_really_inline --m512 
lshift64_m512(m512 a, unsigned b) { -- m512 rv; -- rv.lo = lshift64_m256(a.lo, b); -- rv.hi = lshift64_m256(a.hi, b); -- return rv; --} --#endif -- --#if defined(HAVE_AVX512) --#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) --#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) --#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) --#endif -- --#if !defined(_MM_CMPINT_NE) --#define _MM_CMPINT_NE 0x4 --#endif -- --static really_inline --int diff512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); --#else -- return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); --#endif --} -- --static really_inline --int isnonzero512(m512 a) { --#if defined(HAVE_AVX512) -- return diff512(a, zeroes512()); --#elif defined(HAVE_AVX2) -- m256 x = or256(a.lo, a.hi); -- return !!diff256(x, zeroes256()); --#else -- m128 x = or128(a.lo.lo, a.lo.hi); -- m128 y = or128(a.hi.lo, a.hi.hi); -- return isnonzero128(or128(x, y)); --#endif --} -- --/** -- * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit -- * mask indicating which 32-bit words contain differences. -- */ --static really_inline --u32 diffrich512(m512 a, m512 b) { --#if defined(HAVE_AVX512) -- return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); --#elif defined(HAVE_AVX2) -- return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); --#else -- a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); -- a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); -- a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); -- a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); -- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), -- _mm_packs_epi32(a.hi.lo, a.hi.hi)); -- return ~(_mm_movemask_epi8(packed)) & 0xffff; --#endif --} -- --/** -- * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and -- * returns a 16-bit mask indicating which 64-bit words contain differences. -- */ --static really_inline --u32 diffrich64_512(m512 a, m512 b) { -- //TODO: cmp_epi64? 
-- u32 d = diffrich512(a, b); -- return (d | (d >> 1)) & 0x55555555; --} -- --// aligned load --static really_inline --m512 load512(const void *ptr) { --#if defined(HAVE_AVX512) -- return _mm512_load_si512(ptr); --#else -- assert(ISALIGNED_N(ptr, alignof(m256))); -- m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; -- return rv; --#endif --} -- --// aligned store --static really_inline --void store512(void *ptr, m512 a) { -- assert(ISALIGNED_N(ptr, alignof(m512))); --#if defined(HAVE_AVX512) -- return _mm512_store_si512(ptr, a); --#elif defined(HAVE_AVX2) -- m512 *x = (m512 *)ptr; -- store256(&x->lo, a.lo); -- store256(&x->hi, a.hi); --#else -- ptr = assume_aligned(ptr, 16); -- *(m512 *)ptr = a; --#endif --} -- --// unaligned load --static really_inline --m512 loadu512(const void *ptr) { --#if defined(HAVE_AVX512) -- return _mm512_loadu_si512(ptr); --#else -- m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; -- return rv; --#endif --} -- --// unaligned store --static really_inline --void storeu512(void *ptr, m512 a) { --#if defined(HAVE_AVX512) -- _mm512_storeu_si512((m512 *)ptr, a); --#elif defined(HAVE_AVX2) -- storeu256(ptr, a.lo); -- storeu256((char *)ptr + 32, a.hi); --#else -- storeu128(ptr, a.lo.lo); -- storeu128((char *)ptr + 16, a.lo.hi); -- storeu128((char *)ptr + 32, a.hi.lo); -- storeu128((char *)ptr + 48, a.hi.hi); --#endif --} -- --#if defined(HAVE_AVX512) --static really_inline --m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { -- return _mm512_maskz_loadu_epi8(k, ptr); --} -- --static really_inline --m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { -- return _mm512_mask_loadu_epi8(src, k, ptr); --} -- --static really_inline --void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { -- _mm512_mask_storeu_epi8(ptr, k, a); --} -- --static really_inline --m512 set_mask_m512(__mmask64 k) { -- return _mm512_movm_epi8(k); --} -- --static really_inline --m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { -- return _mm256_maskz_loadu_epi8(k, ptr); --} --#endif -- --// packed unaligned store of first N bytes --static really_inline --void storebytes512(void *ptr, m512 a, unsigned int n) { -- assert(n <= sizeof(a)); -- memcpy(ptr, &a, n); --} -- --// packed unaligned load of first N bytes, pad with zero --static really_inline --m512 loadbytes512(const void *ptr, unsigned int n) { -- m512 a = zeroes512(); -- assert(n <= sizeof(a)); -- memcpy(&a, ptr, n); -- return a; --} -- --static really_inline --m512 mask1bit512(unsigned int n) { -- assert(n < sizeof(m512) * 8); -- u32 mask_idx = ((n % 8) * 64) + 95; -- mask_idx -= n / 8; -- return loadu512(&simd_onebit_masks[mask_idx]); --} -- --// switches on bit N in the given vector. --static really_inline --void setbit512(m512 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); --#if !defined(HAVE_AVX2) -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo.lo; -- } else if (n < 256) { -- sub = &ptr->lo.hi; -- } else if (n < 384) { -- sub = &ptr->hi.lo; -- } else { -- sub = &ptr->hi.hi; -- } -- setbit128(sub, n % 128); --#elif defined(HAVE_AVX512) -- *ptr = or512(mask1bit512(n), *ptr); --#else -- m256 *sub; -- if (n < 256) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 256; -- } -- setbit256(sub, n); --#endif --} -- --// switches off bit N in the given vector. 
--static really_inline --void clearbit512(m512 *ptr, unsigned int n) { -- assert(n < sizeof(*ptr) * 8); --#if !defined(HAVE_AVX2) -- m128 *sub; -- if (n < 128) { -- sub = &ptr->lo.lo; -- } else if (n < 256) { -- sub = &ptr->lo.hi; -- } else if (n < 384) { -- sub = &ptr->hi.lo; -- } else { -- sub = &ptr->hi.hi; -- } -- clearbit128(sub, n % 128); --#elif defined(HAVE_AVX512) -- *ptr = andnot512(mask1bit512(n), *ptr); --#else -- m256 *sub; -- if (n < 256) { -- sub = &ptr->lo; -- } else { -- sub = &ptr->hi; -- n -= 256; -- } -- clearbit256(sub, n); --#endif --} -- --// tests bit N in the given vector. --static really_inline --char testbit512(m512 val, unsigned int n) { -- assert(n < sizeof(val) * 8); --#if !defined(HAVE_AVX2) -- m128 sub; -- if (n < 128) { -- sub = val.lo.lo; -- } else if (n < 256) { -- sub = val.lo.hi; -- } else if (n < 384) { -- sub = val.hi.lo; -- } else { -- sub = val.hi.hi; -- } -- return testbit128(sub, n % 128); --#elif defined(HAVE_AVX512) -- const m512 mask = mask1bit512(n); -- return !!_mm512_test_epi8_mask(mask, val); --#else -- m256 sub; -- if (n < 256) { -- sub = val.lo; -- } else { -- sub = val.hi; -- n -= 256; -- } -- return testbit256(sub, n); --#endif --} -- --#endif ++#endif ++ ++#endif diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h -new file mode 100644 -index 0000000..59ac642 ---- /dev/null +index 5fa727e..5daaa74 100644 +--- a/src/util/simd_x86.h +++ b/src/util/simd_x86.h -@@ -0,0 +1,1334 @@ -+/* -+ * Copyright (c) 2015-2017, Intel Corporation -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions are met: -+ * -+ * * Redistributions of source code must retain the above copyright notice, -+ * this list of conditions and the following disclaimer. -+ * * Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * * Neither the name of Intel Corporation nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -+ * POSSIBILITY OF SUCH DAMAGE. -+ */ -+ -+/** \file -+ * \brief SIMD types and primitive operations. -+ */ -+ -+#ifndef SIMD_X86 -+#define SIMD_X86 -+ -+#if !defined(_WIN32) && !defined(__SSSE3__) -+#error SSSE3 instructions must be enabled -+#endif -+ -+#include "config.h" -+#include "ue2common.h" -+#include "simd_types.h" -+#include "unaligned.h" -+#include "util/arch.h" -+#include "util/intrinsics.h" -+ -+#include // for memcpy -+ -+// Define a common assume_aligned using an appropriate compiler built-in, if -+// it's available. 
Note that we need to handle C or C++ compilation. -+#ifdef __cplusplus -+# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED -+# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -+# endif -+#else -+# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED -+# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -+# endif -+#endif -+ -+// Fallback to identity case. -+#ifndef assume_aligned -+#define assume_aligned(x, y) (x) -+#endif -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+extern const char vbs_mask_data[]; -+#ifdef __cplusplus -+} -+#endif -+ -+static really_inline m128 ones128(void) { -+#if defined(__GNUC__) || defined(__INTEL_COMPILER) -+ /* gcc gets this right */ -+ return _mm_set1_epi8(0xFF); -+#else -+ /* trick from Intel's optimization guide to generate all-ones. -+ * ICC converts this to the single cmpeq instruction */ -+ return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -+#endif -+} -+ -+static really_inline m128 zeroes128(void) { -+ return _mm_setzero_si128(); -+} -+ -+/** \brief Bitwise not for m128*/ -+static really_inline m128 not128(m128 a) { -+ return _mm_xor_si128(a, ones128()); -+} -+ -+/** \brief Return 1 if a and b are different otherwise 0 */ -+static really_inline int diff128(m128 a, m128 b) { -+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -+} -+ -+static really_inline int isnonzero128(m128 a) { -+ return !!diff128(a, zeroes128()); -+} -+ -+/** -+ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline u32 diffrich128(m128 a, m128 b) { -+ a = _mm_cmpeq_epi32(a, b); -+ return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -+} -+ -+/** -+ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and -+ * returns a 4-bit mask indicating which 64-bit words contain differences. 
-+ */ -+static really_inline u32 diffrich64_128(m128 a, m128 b) { -+#if defined(HAVE_SSE41) -+ a = _mm_cmpeq_epi64(a, b); -+ return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -+#else -+ u32 d = diffrich128(a, b); -+ return (d | (d >> 1)) & 0x5; -+#endif -+} -+ -+static really_really_inline -+m128 lshift64_m128(m128 a, unsigned b) { -+#if defined(HAVE__BUILTIN_CONSTANT_P) -+ if (__builtin_constant_p(b)) { -+ return _mm_slli_epi64(a, b); -+ } -+#endif -+ m128 x = _mm_cvtsi32_si128(b); -+ return _mm_sll_epi64(a, x); -+} -+ -+#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -+#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -+#define movemask128(a) ((u32)_mm_movemask_epi8((a))) -+ -+static really_inline m128 set16x8(u8 c) { -+ return _mm_set1_epi8(c); -+} -+ -+static really_inline m128 set4x32(u32 c) { -+ return _mm_set1_epi32(c); -+} -+ -+static really_inline m128 set2x64(u64a c) { -+ return _mm_set1_epi32(c); -+} -+ -+static really_inline u32 movd(const m128 in) { -+ return _mm_cvtsi128_si32(in); -+} -+ -+static really_inline u64a movq(const m128 in) { -+#if defined(ARCH_X86_64) -+ return _mm_cvtsi128_si64(in); -+#else // 32-bit - this is horrific -+ u32 lo = movd(in); -+ u32 hi = movd(_mm_srli_epi64(in, 32)); -+ return (u64a)hi << 32 | lo; -+#endif -+} -+ -+/* another form of movq */ -+static really_inline -+m128 load_m128_from_u64a(const u64a *p) { -+ return _mm_set_epi64x(0LL, *p); -+} -+ -+#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -+#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) -+ -+#if defined(HAVE_SSE41) -+#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -+#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -+#else -+#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -+#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -+#endif -+ -+#if !defined(HAVE_AVX2) -+// TODO: this entire file needs restructuring - this carveout is awful -+#define extractlow64from256(a) movq(a.lo) -+#define extractlow32from256(a) movd(a.lo) -+#if defined(HAVE_SSE41) -+#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -+#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -+#else -+#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -+#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? 
a.hi : a.lo, (imm % 2) * 8)) -+#endif -+ -+#endif // !AVX2 -+ -+static really_inline m128 and128(m128 a, m128 b) { -+ return _mm_and_si128(a,b); -+} -+ -+static really_inline m128 xor128(m128 a, m128 b) { -+ return _mm_xor_si128(a,b); -+} -+ -+static really_inline m128 or128(m128 a, m128 b) { -+ return _mm_or_si128(a,b); -+} -+ -+static really_inline m128 andnot128(m128 a, m128 b) { -+ return _mm_andnot_si128(a, b); -+} -+ -+// aligned load -+static really_inline m128 load128(const void *ptr) { -+ assert(ISALIGNED_N(ptr, alignof(m128))); -+ ptr = assume_aligned(ptr, 16); -+ return _mm_load_si128((const m128 *)ptr); -+} -+ -+// aligned store -+static really_inline void store128(void *ptr, m128 a) { -+ assert(ISALIGNED_N(ptr, alignof(m128))); -+ ptr = assume_aligned(ptr, 16); -+ *(m128 *)ptr = a; -+} -+ -+// unaligned load -+static really_inline m128 loadu128(const void *ptr) { -+ return _mm_loadu_si128((const m128 *)ptr); -+} -+ -+// unaligned store -+static really_inline void storeu128(void *ptr, m128 a) { -+ _mm_storeu_si128 ((m128 *)ptr, a); -+} -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes128(void *ptr, m128 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m128 loadbytes128(const void *ptr, unsigned int n) { -+ m128 a = zeroes128(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+extern const u8 simd_onebit_masks[]; -+#ifdef __cplusplus -+} -+#endif -+ -+static really_inline -+m128 mask1bit128(unsigned int n) { -+ assert(n < sizeof(m128) * 8); -+ u32 mask_idx = ((n % 8) * 64) + 95; -+ mask_idx -= n / 8; -+ return loadu128(&simd_onebit_masks[mask_idx]); -+} -+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit128(m128 *ptr, unsigned int n) { -+ *ptr = or128(mask1bit128(n), *ptr); -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit128(m128 *ptr, unsigned int n) { -+ *ptr = andnot128(mask1bit128(n), *ptr); -+} -+ -+// tests bit N in the given vector. 
-+static really_inline -+char testbit128(m128 val, unsigned int n) { -+ const m128 mask = mask1bit128(n); -+#if defined(HAVE_SSE41) -+ return !_mm_testz_si128(mask, val); -+#else -+ return isnonzero128(and128(mask, val)); -+#endif -+} -+ -+// offset must be an immediate -+#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) -+ -+static really_inline -+m128 pshufb_m128(m128 a, m128 b) { -+ m128 result; -+ result = _mm_shuffle_epi8(a, b); -+ return result; -+} -+ -+static really_inline -+m256 pshufb_m256(m256 a, m256 b) { -+#if defined(HAVE_AVX2) -+ return _mm256_shuffle_epi8(a, b); -+#else -+ m256 rv; -+ rv.lo = pshufb_m128(a.lo, b.lo); -+ rv.hi = pshufb_m128(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_inline -+m512 pshufb_m512(m512 a, m512 b) { -+ return _mm512_shuffle_epi8(a, b); -+} -+ -+static really_inline -+m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { -+ return _mm512_maskz_shuffle_epi8(k, a, b); -+} -+#endif -+ -+static really_inline -+m128 variable_byte_shift_m128(m128 in, s32 amount) { -+ assert(amount >= -16 && amount <= 16); -+ m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); -+ return pshufb_m128(in, shift_mask); -+} -+ -+static really_inline -+m128 max_u8_m128(m128 a, m128 b) { -+ return _mm_max_epu8(a, b); -+} -+ -+static really_inline -+m128 min_u8_m128(m128 a, m128 b) { -+ return _mm_min_epu8(a, b); -+} -+ -+static really_inline -+m128 sadd_u8_m128(m128 a, m128 b) { -+ return _mm_adds_epu8(a, b); -+} -+ -+static really_inline -+m128 sub_u8_m128(m128 a, m128 b) { -+ return _mm_sub_epi8(a, b); -+} -+ -+static really_inline -+m128 set64x2(u64a hi, u64a lo) { -+ return _mm_set_epi64x(hi, lo); -+} -+ -+static really_inline -+m128 set32x4(int i3, int i2, int i1, int i0) { -+ return _mm_set_epi32(i3, i2, i1, i0); -+} -+ -+/**** -+ **** 256-bit Primitives -+ ****/ -+ -+#if defined(HAVE_AVX2) -+ -+static really_really_inline -+m256 lshift64_m256(m256 a, unsigned b) { -+#if defined(HAVE__BUILTIN_CONSTANT_P) -+ if (__builtin_constant_p(b)) { -+ return _mm256_slli_epi64(a, b); -+ } -+#endif -+ m128 x = _mm_cvtsi32_si128(b); -+ return _mm256_sll_epi64(a, x); -+} -+ -+#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) -+ -+static really_inline -+m256 set32x8(u32 in) { -+ return _mm256_set1_epi8(in); -+} -+ -+#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -+#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) -+ -+static really_inline -+m256 set2x128(m128 a) { -+ return _mm256_broadcastsi128_si256(a); -+} -+ -+#else -+ -+static really_really_inline -+m256 lshift64_m256(m256 a, int b) { -+ m256 rv = a; -+ rv.lo = lshift64_m128(rv.lo, b); -+ rv.hi = lshift64_m128(rv.hi, b); -+ return rv; -+} -+ -+static really_inline -+m256 rshift64_m256(m256 a, int b) { -+ m256 rv = a; -+ rv.lo = rshift64_m128(rv.lo, b); -+ rv.hi = rshift64_m128(rv.hi, b); -+ return rv; -+} -+static really_inline -+m256 set32x8(u32 in) { -+ m256 rv; -+ rv.lo = set16x8((u8) in); -+ rv.hi = rv.lo; -+ return rv; -+} -+ -+static really_inline -+m256 eq256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = eq128(a.lo, b.lo); -+ rv.hi = eq128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_inline -+u32 movemask256(m256 a) { -+ u32 lo_mask = movemask128(a.lo); -+ u32 hi_mask = movemask128(a.hi); -+ return lo_mask | (hi_mask << 16); -+} -+ -+static really_inline -+m256 set2x128(m128 a) { -+ m256 rv = {a, a}; -+ return rv; -+} -+#endif -+ -+static really_inline m256 zeroes256(void) { -+#if defined(HAVE_AVX2) -+ return _mm256_setzero_si256(); -+#else -+ m256 rv = 
{zeroes128(), zeroes128()}; -+ return rv; -+#endif -+} -+ -+static really_inline m256 ones256(void) { -+#if defined(HAVE_AVX2) -+ m256 rv = _mm256_set1_epi8(0xFF); -+#else -+ m256 rv = {ones128(), ones128()}; -+#endif -+ return rv; -+} -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 and256(m256 a, m256 b) { -+ return _mm256_and_si256(a, b); -+} -+#else -+static really_inline m256 and256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = and128(a.lo, b.lo); -+ rv.hi = and128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 or256(m256 a, m256 b) { -+ return _mm256_or_si256(a, b); -+} -+#else -+static really_inline m256 or256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = or128(a.lo, b.lo); -+ rv.hi = or128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 xor256(m256 a, m256 b) { -+ return _mm256_xor_si256(a, b); -+} -+#else -+static really_inline m256 xor256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = xor128(a.lo, b.lo); -+ rv.hi = xor128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 not256(m256 a) { -+ return _mm256_xor_si256(a, ones256()); -+} -+#else -+static really_inline m256 not256(m256 a) { -+ m256 rv; -+ rv.lo = not128(a.lo); -+ rv.hi = not128(a.hi); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX2) -+static really_inline m256 andnot256(m256 a, m256 b) { -+ return _mm256_andnot_si256(a, b); -+} -+#else -+static really_inline m256 andnot256(m256 a, m256 b) { -+ m256 rv; -+ rv.lo = andnot128(a.lo, b.lo); -+ rv.hi = andnot128(a.hi, b.hi); -+ return rv; -+} -+#endif -+ -+static really_inline int diff256(m256 a, m256 b) { -+#if defined(HAVE_AVX2) -+ return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -+#else -+ return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -+#endif -+} -+ -+static really_inline int isnonzero256(m256 a) { -+#if defined(HAVE_AVX2) -+ return !!diff256(a, zeroes256()); -+#else -+ return isnonzero128(or128(a.lo, a.hi)); -+#endif -+} -+ -+/** -+ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline u32 diffrich256(m256 a, m256 b) { -+#if defined(HAVE_AVX2) -+ a = _mm256_cmpeq_epi32(a, b); -+ return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -+#else -+ m128 z = zeroes128(); -+ a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -+ a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -+ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); -+ return ~(_mm_movemask_epi8(packed)) & 0xff; -+#endif -+} -+ -+/** -+ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and -+ * returns an 8-bit mask indicating which 64-bit words contain differences. 
-+ */ -+static really_inline u32 diffrich64_256(m256 a, m256 b) { -+ u32 d = diffrich256(a, b); -+ return (d | (d >> 1)) & 0x55555555; -+} -+ -+// aligned load -+static really_inline m256 load256(const void *ptr) { -+ assert(ISALIGNED_N(ptr, alignof(m256))); -+#if defined(HAVE_AVX2) -+ return _mm256_load_si256((const m256 *)ptr); -+#else -+ m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; -+ return rv; -+#endif -+} -+ -+// aligned load of 128-bit value to low and high part of 256-bit value -+static really_inline m256 load2x128(const void *ptr) { -+#if defined(HAVE_AVX2) -+ return set2x128(load128(ptr)); -+#else -+ assert(ISALIGNED_N(ptr, alignof(m128))); -+ m256 rv; -+ rv.hi = rv.lo = load128(ptr); -+ return rv; -+#endif -+} -+ -+static really_inline m256 loadu2x128(const void *ptr) { -+ return set2x128(loadu128(ptr)); -+} -+ -+// aligned store -+static really_inline void store256(void *ptr, m256 a) { -+ assert(ISALIGNED_N(ptr, alignof(m256))); -+#if defined(HAVE_AVX2) -+ _mm256_store_si256((m256 *)ptr, a); -+#else -+ ptr = assume_aligned(ptr, 16); -+ *(m256 *)ptr = a; -+#endif -+} -+ -+// unaligned load -+static really_inline m256 loadu256(const void *ptr) { -+#if defined(HAVE_AVX2) -+ return _mm256_loadu_si256((const m256 *)ptr); -+#else -+ m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; -+ return rv; -+#endif -+} -+ -+// unaligned store -+static really_inline void storeu256(void *ptr, m256 a) { -+#if defined(HAVE_AVX2) -+ _mm256_storeu_si256((m256 *)ptr, a); -+#else -+ storeu128(ptr, a.lo); -+ storeu128((char *)ptr + 16, a.hi); -+#endif -+} -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes256(void *ptr, m256 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m256 loadbytes256(const void *ptr, unsigned int n) { -+ m256 a = zeroes256(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+static really_inline -+m256 mask1bit256(unsigned int n) { -+ assert(n < sizeof(m256) * 8); -+ u32 mask_idx = ((n % 8) * 64) + 95; -+ mask_idx -= n / 8; -+ return loadu256(&simd_onebit_masks[mask_idx]); -+} -+ -+static really_inline -+m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -+#if defined(HAVE_AVX2) -+ return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -+#else -+ m256 rv; -+ rv.hi = set64x2(hi_1, hi_0); -+ rv.lo = set64x2(lo_1, lo_0); -+ return rv; -+#endif -+} -+ -+#if !defined(HAVE_AVX2) -+// switches on bit N in the given vector. -+static really_inline -+void setbit256(m256 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 128; -+ } -+ setbit128(sub, n); -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit256(m256 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 128; -+ } -+ clearbit128(sub, n); -+} -+ -+// tests bit N in the given vector. 
-+static really_inline -+char testbit256(m256 val, unsigned int n) { -+ assert(n < sizeof(val) * 8); -+ m128 sub; -+ if (n < 128) { -+ sub = val.lo; -+ } else { -+ sub = val.hi; -+ n -= 128; -+ } -+ return testbit128(sub, n); -+} -+ -+static really_really_inline -+m128 movdq_hi(m256 x) { -+ return x.hi; -+} -+ -+static really_really_inline -+m128 movdq_lo(m256 x) { -+ return x.lo; -+} -+ -+static really_inline -+m256 combine2x128(m128 hi, m128 lo) { -+ m256 rv = {lo, hi}; -+ return rv; -+} -+ -+#else // AVX2 -+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit256(m256 *ptr, unsigned int n) { -+ *ptr = or256(mask1bit256(n), *ptr); -+} -+ -+static really_inline -+void clearbit256(m256 *ptr, unsigned int n) { -+ *ptr = andnot256(mask1bit256(n), *ptr); -+} -+ -+// tests bit N in the given vector. -+static really_inline -+char testbit256(m256 val, unsigned int n) { -+ const m256 mask = mask1bit256(n); -+ return !_mm256_testz_si256(mask, val); -+} -+ -+static really_really_inline -+m128 movdq_hi(m256 x) { -+ return _mm256_extracti128_si256(x, 1); -+} -+ -+static really_really_inline -+m128 movdq_lo(m256 x) { -+ return _mm256_extracti128_si256(x, 0); -+} -+ -+#define cast256to128(a) _mm256_castsi256_si128(a) -+#define cast128to256(a) _mm256_castsi128_si256(a) -+#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -+#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -+#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -+#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -+#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -+#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -+#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -+#define extractlow32from256(a) movd(cast256to128(a)) -+#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -+#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -+#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) -+ -+static really_inline -+m256 combine2x128(m128 hi, m128 lo) { -+#if defined(_mm256_set_m128i) -+ return _mm256_set_m128i(hi, lo); -+#else -+ return insert128to256(cast128to256(lo), hi, 1); -+#endif -+} -+#endif //AVX2 -+ -+#if defined(HAVE_AVX512) -+#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -+#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -+#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -+#define set2x256(a) _mm512_broadcast_i64x4(a) -+#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -+#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -+#endif -+ -+/**** -+ **** 384-bit Primitives -+ ****/ -+ -+static really_inline m384 and384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = and128(a.lo, b.lo); -+ rv.mid = and128(a.mid, b.mid); -+ rv.hi = and128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_inline m384 or384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = or128(a.lo, b.lo); -+ rv.mid = or128(a.mid, b.mid); -+ rv.hi = or128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_inline m384 xor384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = xor128(a.lo, b.lo); -+ rv.mid = xor128(a.mid, b.mid); -+ rv.hi = xor128(a.hi, b.hi); -+ return rv; -+} -+static really_inline m384 not384(m384 a) { -+ m384 rv; -+ rv.lo = not128(a.lo); -+ rv.mid = not128(a.mid); -+ rv.hi = not128(a.hi); -+ return rv; -+} -+static really_inline m384 andnot384(m384 a, m384 b) { -+ m384 rv; -+ rv.lo = 
andnot128(a.lo, b.lo); -+ rv.mid = andnot128(a.mid, b.mid); -+ rv.hi = andnot128(a.hi, b.hi); -+ return rv; -+} -+ -+static really_really_inline -+m384 lshift64_m384(m384 a, unsigned b) { -+ m384 rv; -+ rv.lo = lshift64_m128(a.lo, b); -+ rv.mid = lshift64_m128(a.mid, b); -+ rv.hi = lshift64_m128(a.hi, b); -+ return rv; -+} -+ -+static really_inline m384 zeroes384(void) { -+ m384 rv = {zeroes128(), zeroes128(), zeroes128()}; -+ return rv; -+} -+ -+static really_inline m384 ones384(void) { -+ m384 rv = {ones128(), ones128(), ones128()}; -+ return rv; -+} -+ -+static really_inline int diff384(m384 a, m384 b) { -+ return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); -+} -+ -+static really_inline int isnonzero384(m384 a) { -+ return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -+} -+ -+/** -+ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline u32 diffrich384(m384 a, m384 b) { -+ m128 z = zeroes128(); -+ a.lo = _mm_cmpeq_epi32(a.lo, b.lo); -+ a.mid = _mm_cmpeq_epi32(a.mid, b.mid); -+ a.hi = _mm_cmpeq_epi32(a.hi, b.hi); -+ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), -+ _mm_packs_epi32(a.hi, z)); -+ return ~(_mm_movemask_epi8(packed)) & 0xfff; -+} -+ -+/** -+ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and -+ * returns a 12-bit mask indicating which 64-bit words contain differences. -+ */ -+static really_inline u32 diffrich64_384(m384 a, m384 b) { -+ u32 d = diffrich384(a, b); -+ return (d | (d >> 1)) & 0x55555555; -+} -+ -+// aligned load -+static really_inline m384 load384(const void *ptr) { -+ assert(ISALIGNED_16(ptr)); -+ m384 rv = { load128(ptr), load128((const char *)ptr + 16), -+ load128((const char *)ptr + 32) }; -+ return rv; -+} -+ -+// aligned store -+static really_inline void store384(void *ptr, m384 a) { -+ assert(ISALIGNED_16(ptr)); -+ ptr = assume_aligned(ptr, 16); -+ *(m384 *)ptr = a; -+} -+ -+// unaligned load -+static really_inline m384 loadu384(const void *ptr) { -+ m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), -+ loadu128((const char *)ptr + 32)}; -+ return rv; -+} -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes384(void *ptr, m384 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m384 loadbytes384(const void *ptr, unsigned int n) { -+ m384 a = zeroes384(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit384(m384 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else if (n < 256) { -+ sub = &ptr->mid; -+ } else { -+ sub = &ptr->hi; -+ } -+ setbit128(sub, n % 128); -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit384(m384 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo; -+ } else if (n < 256) { -+ sub = &ptr->mid; -+ } else { -+ sub = &ptr->hi; -+ } -+ clearbit128(sub, n % 128); -+} -+ -+// tests bit N in the given vector. 
-+static really_inline -+char testbit384(m384 val, unsigned int n) { -+ assert(n < sizeof(val) * 8); -+ m128 sub; -+ if (n < 128) { -+ sub = val.lo; -+ } else if (n < 256) { -+ sub = val.mid; -+ } else { -+ sub = val.hi; -+ } -+ return testbit128(sub, n % 128); -+} -+ -+/**** -+ **** 512-bit Primitives -+ ****/ -+ -+#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -+#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -+ -+static really_inline -+m512 zeroes512(void) { -+#if defined(HAVE_AVX512) -+ return _mm512_setzero_si512(); -+#else -+ m512 rv = {zeroes256(), zeroes256()}; -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 ones512(void) { -+#if defined(HAVE_AVX512) -+ return _mm512_set1_epi8(0xFF); -+ //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -+#else -+ m512 rv = {ones256(), ones256()}; -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_inline -+m512 set64x8(u8 a) { -+ return _mm512_set1_epi8(a); -+} -+ -+static really_inline -+m512 set8x64(u64a a) { -+ return _mm512_set1_epi64(a); -+} -+ -+static really_inline -+m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, -+ u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { -+ return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, -+ lo_3, lo_2, lo_1, lo_0); -+} -+ -+static really_inline -+m512 swap256in512(m512 a) { -+ m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); -+ return vpermq512(idx, a); -+} -+ -+static really_inline -+m512 set4x128(m128 a) { -+ return _mm512_broadcast_i32x4(a); -+} -+#endif -+ -+static really_inline -+m512 and512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_and_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = and256(a.lo, b.lo); -+ rv.hi = and256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 or512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_or_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = or256(a.lo, b.lo); -+ rv.hi = or256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 xor512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_xor_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = xor256(a.lo, b.lo); -+ rv.hi = xor256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 not512(m512 a) { -+#if defined(HAVE_AVX512) -+ return _mm512_xor_si512(a, ones512()); -+#else -+ m512 rv; -+ rv.lo = not256(a.lo); -+ rv.hi = not256(a.hi); -+ return rv; -+#endif -+} -+ -+static really_inline -+m512 andnot512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_andnot_si512(a, b); -+#else -+ m512 rv; -+ rv.lo = andnot256(a.lo, b.lo); -+ rv.hi = andnot256(a.hi, b.hi); -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_really_inline -+m512 lshift64_m512(m512 a, unsigned b) { -+#if defined(HAVE__BUILTIN_CONSTANT_P) -+ if (__builtin_constant_p(b)) { -+ return _mm512_slli_epi64(a, b); -+ } -+#endif -+ m128 x = _mm_cvtsi32_si128(b); -+ return _mm512_sll_epi64(a, x); -+} -+#else -+static really_really_inline -+m512 lshift64_m512(m512 a, unsigned b) { -+ m512 rv; -+ rv.lo = lshift64_m256(a.lo, b); -+ rv.hi = lshift64_m256(a.hi, b); -+ return rv; -+} -+#endif -+ -+#if defined(HAVE_AVX512) -+#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -+#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -+#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -+#endif -+ -+#if !defined(_MM_CMPINT_NE) -+#define _MM_CMPINT_NE 0x4 -+#endif -+ -+static really_inline 
-+int diff512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -+#else -+ return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -+#endif -+} -+ -+static really_inline -+int isnonzero512(m512 a) { -+#if defined(HAVE_AVX512) -+ return diff512(a, zeroes512()); -+#elif defined(HAVE_AVX2) -+ m256 x = or256(a.lo, a.hi); -+ return !!diff256(x, zeroes256()); -+#else -+ m128 x = or128(a.lo.lo, a.lo.hi); -+ m128 y = or128(a.hi.lo, a.hi.hi); -+ return isnonzero128(or128(x, y)); -+#endif -+} -+ -+/** -+ * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit -+ * mask indicating which 32-bit words contain differences. -+ */ -+static really_inline -+u32 diffrich512(m512 a, m512 b) { -+#if defined(HAVE_AVX512) -+ return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -+#elif defined(HAVE_AVX2) -+ return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -+#else -+ a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); -+ a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); -+ a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); -+ a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); -+ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), -+ _mm_packs_epi32(a.hi.lo, a.hi.hi)); -+ return ~(_mm_movemask_epi8(packed)) & 0xffff; -+#endif -+} -+ -+/** -+ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and -+ * returns a 16-bit mask indicating which 64-bit words contain differences. -+ */ -+static really_inline -+u32 diffrich64_512(m512 a, m512 b) { -+ //TODO: cmp_epi64? -+ u32 d = diffrich512(a, b); -+ return (d | (d >> 1)) & 0x55555555; -+} -+ -+// aligned load -+static really_inline -+m512 load512(const void *ptr) { -+#if defined(HAVE_AVX512) -+ return _mm512_load_si512(ptr); -+#else -+ assert(ISALIGNED_N(ptr, alignof(m256))); -+ m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; -+ return rv; -+#endif -+} -+ -+// aligned store -+static really_inline -+void store512(void *ptr, m512 a) { -+ assert(ISALIGNED_N(ptr, alignof(m512))); -+#if defined(HAVE_AVX512) -+ return _mm512_store_si512(ptr, a); -+#elif defined(HAVE_AVX2) -+ m512 *x = (m512 *)ptr; -+ store256(&x->lo, a.lo); -+ store256(&x->hi, a.hi); -+#else -+ ptr = assume_aligned(ptr, 16); -+ *(m512 *)ptr = a; -+#endif -+} -+ -+// unaligned load -+static really_inline -+m512 loadu512(const void *ptr) { -+#if defined(HAVE_AVX512) -+ return _mm512_loadu_si512(ptr); -+#else -+ m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; -+ return rv; -+#endif -+} -+ -+#if defined(HAVE_AVX512) -+static really_inline -+m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { -+ return _mm512_maskz_loadu_epi8(k, ptr); -+} -+ -+static really_inline -+m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { -+ return _mm512_mask_loadu_epi8(src, k, ptr); -+} -+ -+static really_inline -+m512 set_mask_m512(__mmask64 k) { -+ return _mm512_movm_epi8(k); -+} -+#endif -+ -+// packed unaligned store of first N bytes -+static really_inline -+void storebytes512(void *ptr, m512 a, unsigned int n) { -+ assert(n <= sizeof(a)); -+ memcpy(ptr, &a, n); -+} -+ -+// packed unaligned load of first N bytes, pad with zero -+static really_inline -+m512 loadbytes512(const void *ptr, unsigned int n) { -+ m512 a = zeroes512(); -+ assert(n <= sizeof(a)); -+ memcpy(&a, ptr, n); -+ return a; -+} -+ -+static really_inline -+m512 mask1bit512(unsigned int n) { -+ assert(n < sizeof(m512) * 8); -+ u32 mask_idx = ((n % 8) * 64) + 95; -+ mask_idx -= n / 8; -+ return loadu512(&simd_onebit_masks[mask_idx]); -+} 
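For reference, the mask1bit*() helpers above avoid a variable shift by loading from the precomputed simd_onebit_masks table: each bit position 0..7 owns a zero-filled stripe with a single non-zero byte, and the index expression ((n % 8) * 64) + 95 - n / 8 picks a window that lands that byte at lane n / 8. A minimal scalar sketch of the intended semantics (illustration only, not part of the patch; mask1bit128_ref is a name invented here):

#include <stdint.h>
#include <string.h>

/* Scalar reference for mask1bit128(): the result has only bit n set,
 * i.e. byte n / 8 holds (1 << (n % 8)) and every other byte is zero.
 * The 256- and 512-bit versions return the same pattern widened to
 * 32 or 64 bytes; the real code gets it in one unaligned table load. */
static void mask1bit128_ref(uint8_t out[16], unsigned int n) {
    memset(out, 0, 16);
    out[n / 8] = (uint8_t)(1u << (n % 8));
}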
-+ -+// switches on bit N in the given vector. -+static really_inline -+void setbit512(m512 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+#if !defined(HAVE_AVX2) -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo.lo; -+ } else if (n < 256) { -+ sub = &ptr->lo.hi; -+ } else if (n < 384) { -+ sub = &ptr->hi.lo; -+ } else { -+ sub = &ptr->hi.hi; -+ } -+ setbit128(sub, n % 128); -+#elif defined(HAVE_AVX512) -+ *ptr = or512(mask1bit512(n), *ptr); -+#else -+ m256 *sub; -+ if (n < 256) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 256; -+ } -+ setbit256(sub, n); -+#endif -+} -+ -+// switches off bit N in the given vector. -+static really_inline -+void clearbit512(m512 *ptr, unsigned int n) { -+ assert(n < sizeof(*ptr) * 8); -+#if !defined(HAVE_AVX2) -+ m128 *sub; -+ if (n < 128) { -+ sub = &ptr->lo.lo; -+ } else if (n < 256) { -+ sub = &ptr->lo.hi; -+ } else if (n < 384) { -+ sub = &ptr->hi.lo; -+ } else { -+ sub = &ptr->hi.hi; -+ } -+ clearbit128(sub, n % 128); -+#elif defined(HAVE_AVX512) -+ *ptr = andnot512(mask1bit512(n), *ptr); -+#else -+ m256 *sub; -+ if (n < 256) { -+ sub = &ptr->lo; -+ } else { -+ sub = &ptr->hi; -+ n -= 256; -+ } -+ clearbit256(sub, n); -+#endif -+} -+ -+// tests bit N in the given vector. -+static really_inline -+char testbit512(m512 val, unsigned int n) { -+ assert(n < sizeof(val) * 8); -+#if !defined(HAVE_AVX2) -+ m128 sub; -+ if (n < 128) { -+ sub = val.lo.lo; -+ } else if (n < 256) { -+ sub = val.lo.hi; -+ } else if (n < 384) { -+ sub = val.hi.lo; -+ } else { -+ sub = val.hi.hi; -+ } -+ return testbit128(sub, n % 128); -+#elif defined(HAVE_AVX512) -+ const m512 mask = mask1bit512(n); -+ return !!_mm512_test_epi8_mask(mask, val); -+#else -+ m256 sub; -+ if (n < 256) { -+ sub = val.lo; -+ } else { -+ sub = val.hi; -+ n -= 256; -+ } -+ return testbit256(sub, n); -+#endif -+} -+ -+#endif -diff --git a/src/util/state_compress.c -index 7238849..4422403 100644 ---- a/src/util/state_compress.c -+++ b/src/util/state_compress.c -@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { - u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), - expand32(v[2], m[2]), expand32(v[3], m[3]) }; - -- return _mm_set_epi32(x[3], x[2], x[1], x[0]); -+ return set32x4(x[3], x[2], x[1], x[0]); +@@ -1417,4 +1417,14 @@ char testbit512(m512 val, unsigned int n) { + #endif } + ++static really_inline m128 set2x64(u64a c) ++{ ++ return _mm_set1_epi64x(c); ++} ++ ++static really_inline m128 set32x4(int i3, int i2, int i1, int i0) ++{ ++ return _mm_set_epi32(i3, i2, i1, i0); ++} ++ #endif - -@@ -158,7 +158,7 @@ - static really_inline - m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { - // First, decompose our vectors into 64-bit chunks.
-- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; -+ u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; - - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - u64a v[2]; -@@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { - - u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - -- return _mm_set_epi64x(x[1], x[0]); -+ return set64x2(x[1], x[0]); - } - #endif - -@@ -264,8 +264,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { - expand32(v[6], m[6]), expand32(v[7], m[7]) }; - - #if !defined(HAVE_AVX2) -- m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), -- .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; -+ m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), -+ .hi = set32x4(x[7], x[6], x[5], x[4]) }; - #else - m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); -@@ -291,8 +291,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { - expand64(v[2], m[2]), expand64(v[3], m[3]) }; - - #if !defined(HAVE_AVX2) -- m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), -- .hi = _mm_set_epi64x(x[3], x[2]) }; -+ m256 xvec = { .lo = set64x2(x[1], x[0]), -+ .hi = set64x2(x[3], x[2]) }; - #else - m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); - #endif -@@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { - expand32(v[8], m[8]), expand32(v[9], m[9]), - expand32(v[10], m[10]), expand32(v[11], m[11]) }; - -- m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), -- .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), -- .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; -+ m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), -+ .mid = set32x4(x[7], x[6], x[5], x[4]), -+ .hi = set32x4(x[11], x[10], x[9], x[8]) }; - return xvec; - } - #endif -@@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { - expand64(v[2], m[2]), expand64(v[3], m[3]), - expand64(v[4], m[4]), expand64(v[5], m[5]) }; - -- m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), -- .mid = _mm_set_epi64x(x[3], x[2]), -- .hi = _mm_set_epi64x(x[5], x[4]) }; -+ m384 xvec = { .lo = set64x2(x[1], x[0]), -+ .mid = set64x2(x[3], x[2]), -+ .hi = set64x2(x[5], x[4]) }; - return xvec; - } - #endif -@@ -558,10 +558,10 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { - xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8]); - #else -- xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); -- xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); -- xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); -- xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); -+ xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); -+ xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); -+ xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); -+ xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); - #endif - return xvec; - } -@@ -594,10 +594,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { - m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), - .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; - #else -- m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), -- _mm_set_epi64x(x[3], x[2]) }, -- .hi = { _mm_set_epi64x(x[5], x[4]), -- _mm_set_epi64x(x[7], x[6]) } }; -+ m512 xvec = { .lo = { set64x2(x[1], x[0]), -+ set64x2(x[3], x[2]) }, -+ .hi = { set64x2(x[5], x[4]), -+ set64x2(x[7], x[6]) } }; - #endif - return xvec; - } diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index a4d71b2..0c41ab9 100644 --- a/tools/hscollider/CMakeLists.txt @@ -6802,59 
+3059,10 @@ index 0000000..5391473 + return (cs != FileCorporaParser_error) && (p == pe); +} diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp -index 623c2c9..d6d52a2 100644 +index 623c2c9..22945d6 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp -@@ -40,7 +40,7 @@ using namespace ue2; - namespace { - - // Switch one bit on in a bitmask. --template -+template - Mask setbit(unsigned int bit) { - union { - Mask simd; -@@ -148,7 +148,7 @@ m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } - m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } - m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } - --template -+template - class SimdUtilsTest : public testing::Test { - // empty - }; -@@ -260,9 +260,9 @@ TYPED_TEST(SimdUtilsTest, or2) { - - for (unsigned j = 0; j < 8; j++) { - for (unsigned i = 0; i < 32; i++) { -- m256 x = setbit(j*32+i); -+ m256 x = setbit(j * 32 + i); - m256 y = zeroes256(); -- ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j*32+i << " not happy"; -+ ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j * 32 + i << " not happy"; - } - } - -@@ -431,8 +431,8 @@ TYPED_TEST(SimdUtilsTest, testbit) { - for (unsigned int i = 0; i < total_bits; i++) { - TypeParam a = setbit(i); - for (unsigned int j = 0; j < total_bits; j++) { -- ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) << "bit " << i -- << " is wrong"; -+ ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) -+ << "bit " << i << " is wrong"; - } - } - } -@@ -455,7 +455,6 @@ TYPED_TEST(SimdUtilsTest, setbit) { - simd_setbit(&a, i); - } - ASSERT_FALSE(simd_diff(simd_ones(), a)); -- - } - - TYPED_TEST(SimdUtilsTest, diffrich) { -@@ -663,12 +662,11 @@ TEST(SimdUtilsTest, movq) { +@@ -663,7 +663,7 @@ TEST(SimdUtilsTest, movq) { ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); @@ -6863,152 +3071,6 @@ index 623c2c9..d6d52a2 100644 r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } - -- - TEST(SimdUtilsTest, set16x8) { - char cmp[sizeof(m128)]; - -@@ -680,7 +678,7 @@ TEST(SimdUtilsTest, set16x8) { - } - - TEST(SimdUtilsTest, set4x32) { -- u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; -+ u32 cmp[4] = {0x12345678, 0x12345678, 0x12345678, 0x12345678}; - m128 simd = set4x32(cmp[0]); - ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); - } -@@ -714,51 +712,51 @@ TEST(SimdUtilsTest, variableByteShift128) { - char base[] = "0123456789ABCDEF"; - m128 in = loadu128(base); - -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), -- variable_byte_shift_m128(in, 0))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), -- variable_byte_shift_m128(in, -1))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), -- variable_byte_shift_m128(in, -2))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), -- variable_byte_shift_m128(in, -3))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), -- variable_byte_shift_m128(in, -4))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), -- variable_byte_shift_m128(in, -5))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), -- variable_byte_shift_m128(in, -6))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), -- variable_byte_shift_m128(in, -7))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), -- variable_byte_shift_m128(in, -8))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), -- variable_byte_shift_m128(in, -9))); -- EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), -- variable_byte_shift_m128(in, -10))); -- -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), -- 
variable_byte_shift_m128(in, 0))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), -- variable_byte_shift_m128(in, 1))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), -- variable_byte_shift_m128(in, 2))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), -- variable_byte_shift_m128(in, 3))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), -- variable_byte_shift_m128(in, 4))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), -- variable_byte_shift_m128(in, 5))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), -- variable_byte_shift_m128(in, 6))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), -- variable_byte_shift_m128(in, 7))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), -- variable_byte_shift_m128(in, 8))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), -- variable_byte_shift_m128(in, 9))); -- EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), -- variable_byte_shift_m128(in, 10))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); -+ EXPECT_TRUE( -+ !diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); -+ -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); -+ EXPECT_TRUE( -+ !diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); - - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); -@@ -785,12 +783,12 @@ TEST(SimdUtilsTest, min_u8_m128) { - } - - TEST(SimdUtilsTest, sadd_u8_m128) { -- unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -- '1', '2', '3', '4', '1', '2', '3', '4'}; -- unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, -- 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; -+ unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -+ '1', '2', '3', '4', '1', '2', '3', '4'}; -+ unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, -+ 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; - unsigned char expec[] = 
{'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', -- 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; -+ 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; - m128 in1 = loadu128(base1); - m128 in2 = loadu128(base2); - m128 result = sadd_u8_m128(in1, in2); -@@ -799,11 +797,11 @@ TEST(SimdUtilsTest, sadd_u8_m128) { - - TEST(SimdUtilsTest, sub_u8_m128) { - unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', -- 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; -- unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -- '1', '2', '3', '4', '1', '2', '3', '4'}; -- unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, -- 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; -+ 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; -+ unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', -+ '1', '2', '3', '4', '1', '2', '3', '4'}; -+ unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, -+ 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; - m128 in1 = loadu128(base1); - m128 in2 = loadu128(base2); - m128 result = sub_u8_m128(in1, in2); diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index ea942ef..d7bef50 100644 --- a/util/CMakeLists.txt diff --git a/hyperscan.spec b/hyperscan.spec index 3e8190a..5829846 100644 --- a/hyperscan.spec +++ b/hyperscan.spec @@ -1,6 +1,6 @@ Name: hyperscan -Version: 5.4.0 -Release: 3 +Version: 5.4.1 +Release: 1 Summary: High-performance regular expression matching library License: BSD @@ -8,10 +8,7 @@ URL: https://www.hyperscan.io/ Source0: https://github.com/intel/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz Patch0: hyperscan-aarch64-support.patch -Patch1: Fix-build-error-on-x86_64.patch -Patch2: Fix-hyperscan-gcc10.patch -# https://github.com/intel/hyperscan/commit/7d644e7ba27eaadda753febf0b142faa9affbbca -Patch3: backport-Fix-segfaults-on-allocation-failure.patch +Patch1: Fix-hyperscan-gcc10.patch BuildRequires: gcc-c++ BuildRequires: boost-devel @@ -54,7 +51,15 @@ This package provides the libraries, include files and other resources needed for developing Hyperscan applications. %prep -%autosetup -n %{name}-%{version} -p1 +%setup -q -n %{name}-%{version} +cd %{_builddir}/%{name}-%{version} +mv src/util/simd_utils.h src/util/simd_x86.h +sed -i 's/SIMD_UTILS/SIMD_X86/' src/util/simd_x86.h +sed -i 's/_mm_set_epi32/set32x4/' src/util/state_compress.c +sed -i 's/_mm_set_epi64x/set64x2/' src/util/state_compress.c +sed -i 's/_mm_srli_si128/rshiftbyte_m128/' src/util/state_compress.c +cd - +%autopatch -p1 %build %cmake -DBUILD_SHARED_LIBS:BOOL=ON -DBUILD_STATIC_AND_SHARED:BOOL=OFF . @@ -80,6 +85,9 @@ needed for developing Hyperscan applications. %{_includedir}/hs/ %changelog +* Sat Mar 25 2023 Liu Zixian - 5.4.1-1 +- Update to 5.4.1 + * Tue Mar 21 2023 Liu Zixian - 5.4.0-3 - Cleanup aarch64 patch