From 5f009c288718095c5cc675bfed12d7ec64237731 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 20 Jul 2020 17:20:15 +0800 Subject: [PATCH] Add aarch64 support --- CMakeLists.txt | 108 +- cmake/config.h.in | 9 + cmake/platform.cmake | 13 +- cmake/ragel.cmake | 20 + src/crc32.c | 43 + src/fdr/fdr.c | 136 +- src/hs_valid_platform.c | 9 +- src/nfa/limex_exceptional.h | 22 +- src/nfa/limex_internal.h | 2 +- src/nfa/limex_native.c | 10 +- src/nfa/shufti.c | 580 ++++---- src/nfa/truffle.c | 10 +- src/parser/control_verbs.cpp | 340 +++++ src/rose/counting_miracle.h | 2 +- src/util/arch.h | 11 + src/util/cpuid_flags.c | 9 +- src/util/cpuid_flags.h | 2 + src/util/cpuid_inline.h | 17 +- src/util/intrinsics.h | 12 + src/util/popcount.h | 6 +- src/util/simd_arm.h | 1069 +++++++++++++++ src/util/simd_types.h | 42 +- src/util/simd_utils.h | 1389 +------------------- src/util/simd_x86.h | 1334 +++++++++++++++++++ src/util/state_compress.c | 42 +- tools/hscollider/CMakeLists.txt | 9 +- tools/hscollider/ColliderCorporaParser.cpp | 474 +++++++ unit/internal/simd_utils.cpp | 128 +- util/CMakeLists.txt | 8 +- util/ExpressionParser.cpp | 397 ++++++ 30 files changed, 4464 insertions(+), 1789 deletions(-) create mode 100644 src/parser/control_verbs.cpp create mode 100644 src/util/simd_arm.h create mode 100644 src/util/simd_x86.h create mode 100644 tools/hscollider/ColliderCorporaParser.cpp create mode 100644 util/ExpressionParser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8bc6077..12a889c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ include (${CMAKE_MODULE_PATH}/boost.cmake) # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) find_package(PythonInterp) find_program(RAGEL ragel) +find_program(COPY cp) if(PYTHONINTERP_FOUND) set(PYTHON ${PYTHON_EXECUTABLE}) @@ -189,24 +190,30 @@ else() # cpuid info and then chooses the best microarch it can (and replaces # the flag), so use that for tune. 
- # arg1 might exist if using ccache - string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) - set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "march" POS) - string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" - GNUCC_ARCH "${_GCC_OUTPUT}") - - # test the parsed flag - set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_QUIET ERROR_QUIET - INPUT_FILE /dev/null - RESULT_VARIABLE GNUCC_TUNE_TEST) - if (NOT GNUCC_TUNE_TEST EQUAL 0) - message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + # arg1 might exist if using ccache + string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_VARIABLE _GCC_OUTPUT) + string(FIND "${_GCC_OUTPUT}" "march" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" + GNUCC_ARCH "${_GCC_OUTPUT}") + + # test the parsed flag + set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_QUIET ERROR_QUIET + INPUT_FILE /dev/null + RESULT_VARIABLE GNUCC_TUNE_TEST) + if (NOT GNUCC_TUNE_TEST EQUAL 0) + message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") + endif() + endif() + + if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=armv8-a -mtune=armv8-a) endif() set(TUNE_FLAG ${GNUCC_ARCH}) else () @@ -239,6 +246,13 @@ else() set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsigned-char") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fsigned-char") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc") + endif() + if (NOT RELEASE_BUILD) # -Werror is most useful during development, don't potentially break # release builds @@ -252,11 +266,19 @@ else() endif() if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set(ARCH_C_FLAGS "-march=armv8-a -mtune=${TUNE_FLAG}") + endif () endif() if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set(ARCH_CXX_FLAGS "-march=armv8-a -mtune=${TUNE_FLAG}") + endif() endif() if(CMAKE_COMPILER_IS_GNUCC) @@ -289,10 +311,18 @@ else() endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) 
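The -fsigned-char flag added above exists because plain char is signed by default on x86-64 but unsigned by default on AArch64, so code that relies on negative char values would silently change behaviour when ported. A minimal probe illustrating the difference (not part of the patch, for illustration only):

```c
#include <limits.h>
#include <stdio.h>

/* On x86-64 (or on AArch64 built with -fsigned-char) this prints
 * CHAR_MIN=-128 and "negative"; on AArch64 without the flag it prints
 * CHAR_MIN=0 and "non-negative". */
int main(void) {
    char c = (char)0xff;
    printf("CHAR_MIN=%d CHAR_MAX=%d\n", CHAR_MIN, CHAR_MAX);
    printf("(char)0xff is %s\n", (c < 0) ? "negative" : "non-negative");
    return 0;
}
```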
-CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) -CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) -CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) + CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) + CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) + CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + CHECK_INCLUDE_FILES(arm_neon.h HAVE_C_ARM_NEON_H) + CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_CXX_ARM_NEON_H) +endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) @@ -325,6 +355,9 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) message (STATUS "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") set (FAT_RUNTIME_REQUISITES FALSE) + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + message(STATUS "AARCH64 platform doesn't support the fat runtime") + set (FAT_RUNTIME_REQUISITES FALSE) else() include (${CMAKE_MODULE_PATH}/attrib.cmake) if (NOT HAS_C_ATTR_IFUNC) @@ -337,7 +370,9 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) endif () -include (${CMAKE_MODULE_PATH}/arch.cmake) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + include (${CMAKE_MODULE_PATH}/arch.cmake) +endif() # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) @@ -403,12 +438,6 @@ if (CXX_IGNORED_ATTR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") endif() -# gcc 9 complains about redundant move for returned variable -CHECK_CXX_COMPILER_FLAG("-Wredundant-move" CXX_REDUNDANT_MOVE) -if (CXX_REDUNDANT_MOVE) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-redundant-move") -endif() - # note this for later # g++ doesn't have this flag but clang does CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES) @@ -463,6 +492,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() +# Test cases for the NEON SIMD functions. +option(UNIT_SIMD "SIMD function test cases, default is OFF" OFF) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + if (UNIT_SIMD) + add_subdirectory(unit-simd) + endif() +endif() + add_subdirectory(util) add_subdirectory(doc/dev-reference) @@ -559,7 +596,14 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") -ragelmaker(src/parser/control_verbs.rl) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + ragelmaker(src/parser/control_verbs.rl) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + ragelcopyer(src/parser/control_verbs.rl) +endif() SET(hs_HEADERS src/hs.h diff --git a/cmake/config.h.in b/cmake/config.h.in index 5454643..336cf19 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -15,6 +15,9 @@ /* "Define if building for EM64T" */ #cmakedefine ARCH_X86_64 +/* "Define if building for aarch64" */ +#cmakedefine ARCH_AARCH64 + /* internal build, switch on dump support.
*/ #cmakedefine DUMP_SUPPORT @@ -48,6 +51,12 @@ /* C compiler has intrin.h */ #cmakedefine HAVE_C_INTRIN_H +/* C++ compiler has arm_neon.h */ +#cmakedefine HAVE_CXX_ARM_NEON_H + +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_ARM_NEON_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 593c544..213dcc5 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,9 +1,14 @@ # determine the target arch # really only interested in the preprocessor here -CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) -CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -set(ARCH_X86_64 ${ARCH_64_BIT}) -set(ARCH_IA32 ${ARCH_32_BIT}) +CHECK_C_SOURCE_COMPILES("#if !(defined(__aarch64__))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) + +if (ARCH_X86_64 OR ARCH_AARCH64) + set(ARCH_64_BIT 1) +elseif (ARCH_IA32) + set(ARCH_32_BIT 1) +endif() \ No newline at end of file diff --git a/cmake/ragel.cmake b/cmake/ragel.cmake index d3f0b92..3356cb9 100644 --- a/cmake/ragel.cmake +++ b/cmake/ragel.cmake @@ -14,3 +14,23 @@ function(ragelmaker src_rl) set_source_files_properties(${rl_out} PROPERTIES GENERATED TRUE) endfunction(ragelmaker) + # On the aarch64 platform, char is unsigned by default, so in order to be consistent with + # the x86 platform, we will add -fsigned-char to the compile option to force the char type. + # However, when the ragel generates c++ code, the char variable used will still be considered + # unsigned, resulting in the overflow of the char variable value in the generated code, + # resulting in some errors. 
+ # Function for copying the previously modified code to the specified path + + function(ragelcopyer src_rl) + get_filename_component(src_dir ${src_rl} PATH) # old cmake needs PATH + get_filename_component(src_file ${src_rl} NAME_WE) + set(rl_out ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${src_dir} + COMMAND ${COPY} -f ${CMAKE_CURRENT_SOURCE_DIR}/${src_dir}/${src_file}.cpp ${rl_out} 2>/dev/null ||: + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src_dir}/${src_file}.cpp + ) + add_custom_target(ragel_${src_file} DEPENDS ${rl_out}) + set_source_files_properties(${rl_out} PROPERTIES GENERATED TRUE) + endfunction(ragelcopyer) \ No newline at end of file diff --git a/src/crc32.c b/src/crc32.c index 1dae47b..4609c5d 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -32,6 +32,47 @@ #include "util/arch.h" #include "util/intrinsics.h" +#if defined(HAVE_NEON) + +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC_WORD 8 +#define CRC_TYPE u64a +static really_inline +u32 crc32c_neon(u32 running_crc, const unsigned char * p_buf, const size_t length) +{ + u32 crc=running_crc; + + // Process byte-by-byte until p_buf is aligned + const unsigned char * aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); + size_t init_bytes = aligned_buf - p_buf; + size_t running_length = ((length - init_bytes) / CRC_WORD) * CRC_WORD; + size_t end_bytes = length - init_bytes - running_length; + + while(p_buf < aligned_buf){ + CRC32CB(crc, *p_buf); + p_buf++; + } + + // Main aligned loop, processes a word at a time.
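crc32c_neon() above drives the ARMv8 CRC32C instructions through inline assembly (CRC32CB for the unaligned head and tail, CRC32CX for the aligned 64-bit body). The same instructions are also exposed as ACLE intrinsics in <arm_acle.h>; the following sketch shows the equivalent head/body/tail structure using those intrinsics, assuming a compiler that defines __ARM_FEATURE_CRC32 (illustrative only, not the code this patch adds):

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#if defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>

/* Head/body/tail split mirroring crc32c_neon(): bytes until 8-byte
 * alignment, then one 64-bit word per iteration, then the remainder. */
static uint32_t crc32c_acle(uint32_t crc, const unsigned char *buf,
                            size_t len) {
    while (len && ((uintptr_t)buf & 7)) {
        crc = __crc32cb(crc, *buf++);
        len--;
    }
    while (len >= 8) {
        uint64_t word;
        memcpy(&word, buf, sizeof(word));
        crc = __crc32cd(crc, word);
        buf += 8;
        len -= 8;
    }
    while (len--) {
        crc = __crc32cb(crc, *buf++);
    }
    return crc;
}
#endif
```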
+ for(size_t li = 0; li < running_length / CRC_WORD; li++){ + CRC_TYPE block = *(const CRC_TYPE *)p_buf; + CRC32CX(crc,block); + p_buf += CRC_WORD; + } + + //Remainingbytes + for(size_t li = 0; li < end_bytes; li++){ + CRC32CB(crc,*p_buf); + p_buf++; + } + return crc; +} +#endif + + #if !defined(HAVE_SSE42) /*** @@ -636,6 +677,8 @@ u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) { #if defined(HAVE_SSE42) u32 crc = crc32c_sse42(inCrc32, (const unsigned char *)buf, bufLen); +#elif defined(HAVE_NEON) + u32 crc = crc32c_neon(inCrc32, (const unsigned char *)buf, bufLen); #else u32 crc = crc32c_sb8_64_bit(inCrc32, (const unsigned char *)buf, bufLen); #endif diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d33756d..718f169 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -127,6 +127,13 @@ u64a andn(const u32 a, const u8 *b) { u64a r; #if defined(HAVE_BMI) && !defined(NO_ASM) __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); +#elif defined(HAVE_NEON) + __asm__ __volatile__("ldr w0, %w2 \n\t" + "bic %w0,w0,%w1 \n\t" + : "=r"(r) + : "r"(a), "m"(*(const u32 *)b) + : "w0" + ); #else r = unaligned_load_u32(b) & ~a; #endif @@ -159,7 +166,104 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ - assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + assert(itPtr >= start_ptr && itPtr <= end_ptr); +#if defined(HAVE_NEON) + domain_mask_flipped = ~domain_mask_flipped; + + u32 reach0, reach1, reach2, reach3; + u64a ptr = unaligned_load_u64a(itPtr); + + reach0 = ptr & domain_mask_flipped; + reach1 = ptr >> 8 & domain_mask_flipped; + reach2 = ptr >> 16 & domain_mask_flipped; + reach3 = ptr >> 24 & domain_mask_flipped; + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st3 = load_m128_from_u64a(ft + reach3); + + u32 reach4, reach5, reach6, reach7; + ptr = unaligned_load_u64a(itPtr + 4); + reach4 = ptr & domain_mask_flipped; + reach5 = ptr >> 8 & domain_mask_flipped; + reach6 = ptr >> 16 & domain_mask_flipped; + reach7 = ptr >> 24 & domain_mask_flipped; + + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st7 = load_m128_from_u64a(ft + reach7); + + m128 zero = zeroes128(); + + st1.vect_s8 = vextq_s8(zero.vect_s8, st1.vect_s8, 15); + st2.vect_s8 = vextq_s8(zero.vect_s8, st2.vect_s8, 14); + st3.vect_s8 = vextq_s8(zero.vect_s8, st3.vect_s8, 13); + st4.vect_s8 = vextq_s8(zero.vect_s8, st4.vect_s8, 12); + st5.vect_s8 = vextq_s8(zero.vect_s8, st5.vect_s8, 11); + st6.vect_s8 = vextq_s8(zero.vect_s8, st6.vect_s8, 10); + st7.vect_s8 = vextq_s8(zero.vect_s8, st7.vect_s8, 9); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + *s = or128(*s, st0); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 = ~(*conf0); + + u32 reach8, reach9, reach10, reach11; + ptr = unaligned_load_u64a(itPtr + 8); + reach8 = ptr & domain_mask_flipped; + reach9 = ptr >> 8 & domain_mask_flipped; + reach10 = ptr >> 16 & domain_mask_flipped; + reach11 = ptr >> 24 & domain_mask_flipped; + + m128 st8 = load_m128_from_u64a(ft + 
reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st11 = load_m128_from_u64a(ft + reach11); + + u32 reach12, reach13, reach14, reach15; + ptr = unaligned_load_u64a(itPtr + 12); + reach12 = ptr & domain_mask_flipped; + reach13 = ptr >> 8 & domain_mask_flipped; + reach14 = ptr >> 16 & domain_mask_flipped; + reach15 = ptr >> 24 & domain_mask_flipped; + + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st14 = load_m128_from_u64a(ft + reach14); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st9.vect_s8 = vextq_s8(zero.vect_s8, st9.vect_s8, 15); + st10.vect_s8 = vextq_s8(zero.vect_s8, st10.vect_s8, 14); + st11.vect_s8 = vextq_s8(zero.vect_s8, st11.vect_s8, 13); + st12.vect_s8 = vextq_s8(zero.vect_s8, st12.vect_s8, 12); + st13.vect_s8 = vextq_s8(zero.vect_s8, st13.vect_s8, 11); + st14.vect_s8 = vextq_s8(zero.vect_s8, st14.vect_s8, 10); + st15.vect_s8 = vextq_s8(zero.vect_s8, st15.vect_s8, 9); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + *s = or128(*s, st8); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 = ~(*conf8); + +#else u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach1 = andn(domain_mask_flipped, itPtr + 1); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); @@ -241,6 +345,8 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, *conf8 = movq(*s); *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; + +#endif } static really_inline @@ -349,12 +455,12 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, u32 bitRem = bit % bucket; u32 idx = bitRem; u32 cf = confBase[idx]; - if (!cf) { + if (unlikely(!cf)) { continue; } const struct FDRConfirm *fdrc = (const struct FDRConfirm *) ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { + if (unlikely(!(fdrc->groups & *control))) { continue; } u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); @@ -603,7 +709,7 @@ void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, assert(z_len > 0); size_t iter_bytes_second = 0; size_t z_len_first = z_len; - if (z_len > ITER_BYTES) { + if (unlikely(z_len > ITER_BYTES)) { z_len_first = z_len - ITER_BYTES; iter_bytes_second = ITER_BYTES; } @@ -637,7 +743,7 @@ void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, /* copy the last 16 bytes, may overlap with the previous 8 byte write */ storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); - if (iter_bytes_second) { + if (unlikely(iter_bytes_second)) { storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); } @@ -658,7 +764,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, const u8 *ptr = buf + start; size_t remaining = len - start; - if (remaining <= ITER_BYTES) { + if (unlikely(remaining <= ITER_BYTES)) { /* enough bytes to make only one zone */ createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); return 1; @@ -691,13 +797,25 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, #define INVALID_MATCH_ID (~0U) +/* add prefetch for aarch64, + *- due to gcc4.8.5 do not support builtin_prefetch. 
+ */ +#if defined(HAVE_NEON) +#define PREFETCH __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(itPtr + 256))) +#define P2ALIGN __asm__ __volatile__(".p2align 6") +#else +#define PREFETCH __builtin_prefetch(itPtr + ITER_BYTES) +#define P2ALIGN +#endif + #define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ do { \ + P2ALIGN; \ const u8 *tryFloodDetect = zz->floodPtr; \ const u8 *start_ptr = zz->start; \ - const u8 *end_ptr = zz->end; \ + const u8 *end_ptr = zz->end - ITER_BYTES; \ \ - for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ + for (const u8 *itPtr = start_ptr; itPtr <= end_ptr; \ itPtr += ITER_BYTES) { \ if (unlikely(itPtr > tryFloodDetect)) { \ tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ @@ -707,7 +825,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, return HWLM_TERMINATED; \ } \ } \ - __builtin_prefetch(itPtr + ITER_BYTES); \ + PREFETCH; \ u64a conf0; \ u64a conf8; \ get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 59ad3f3..035d3ff 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -33,9 +33,16 @@ HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ +#if defined(__x86_64__) if (check_ssse3()) { return HS_SUCCESS; - } else { + } +#else + if (check_neon()) { + return HS_SUCCESS; + } +#endif + else { return HS_ARCH_ERROR; } } diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index 6c7335f..8304215 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -131,7 +131,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, union RepeatControl *repeat_ctrl = ctx->repeat_ctrl + info->ctrlIndex; char *repeat_state = ctx->repeat_state + info->stateOffset; - if (e->trigger == LIMEX_TRIGGER_POS) { + if (unlikely(e->trigger == LIMEX_TRIGGER_POS)) { char cyclic_on = TESTBIT_STATE(*STATE_ARG_P, info->cyclicState); processPosTrigger(repeat, repeat_ctrl, repeat_state, offset, cyclic_on); @@ -140,7 +140,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, assert(e->trigger == LIMEX_TRIGGER_TUG); enum TriggerResult rv = processTugTrigger(repeat, repeat_ctrl, repeat_state, offset); - if (rv == TRIGGER_FAIL) { + if (likely(rv == TRIGGER_FAIL)) { *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; DEBUG_PRINTF("tug found no valid matches in repeat state\n"); return 1; // continue @@ -150,7 +150,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, assert(e->hasSquash == LIMEX_SQUASH_TUG); *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); return 1; // continue - } else if (rv == TRIGGER_SUCCESS_CACHE) { + } else if (unlikely(rv == TRIGGER_SUCCESS_CACHE)) { new_cache->br = 1; } else { assert(rv == TRIGGER_SUCCESS); @@ -160,7 +160,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, } // Some exceptions fire accepts. 
- if (e->reports != MO_INVALID_IDX) { + if (unlikely(e->reports != MO_INVALID_IDX)) { if (flags & CALLBACK_OUTPUT) { const ReportID *reports = (const ReportID *)((const char *)limex + e->reports); @@ -171,7 +171,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, return 0; // halt } if (*cacheable == CACHE_RESULT) { - if (!new_cache->reports || new_cache->reports == reports) { + if (likely(!new_cache->reports || new_cache->reports == reports)) { new_cache->reports = reports; } else { *cacheable = DO_NOT_CACHE_RESULT; @@ -194,8 +194,8 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, // Some exceptions squash states behind them. Note that we squash states in // 'succ', not local_succ. - if (e->hasSquash == LIMEX_SQUASH_CYCLIC - || e->hasSquash == LIMEX_SQUASH_REPORT) { + if (unlikely(e->hasSquash == LIMEX_SQUASH_CYCLIC + || e->hasSquash == LIMEX_SQUASH_REPORT)) { *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); if (*cacheable == CACHE_RESULT) { *cacheable = DO_NOT_CACHE_RESULT; @@ -331,12 +331,12 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, u32 idx = local_index + base_index[t]; const EXCEPTION_T *e = &exceptions[idx]; - if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, + if (unlikely(!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, #ifndef BIG_MODEL &local_succ, #endif limex, offset, ctx, &new_cache, &cacheable, - in_rev, flags)) { + in_rev, flags))) { return PE_RV_HALT; } } while (word); @@ -349,7 +349,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, *succ = OR_STATE(*succ, ctx->local_succ); #endif - if (cacheable == CACHE_RESULT) { + if (likely(cacheable == CACHE_RESULT)) { ctx->cached_estate = estate; #ifndef BIG_MODEL ctx->cached_esucc = local_succ; @@ -359,7 +359,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, ctx->cached_reports = new_cache.reports; ctx->cached_br = new_cache.br; } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { - if (ctx->cached_br) { + if (unlikely(ctx->cached_br)) { ctx->cached_estate = ZERO_STATE; } } diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index 23b1bd9..0e27c79 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -119,7 +119,7 @@ struct NFAException##size { \ u32 repeatOffset; /**< offset to NFARepeatInfo, or MO_INVALID_IDX */ \ u8 hasSquash; /**< from enum LimExSquash */ \ u8 trigger; /**< from enum LimExTrigger */ \ -}; \ +}__attribute__ ((aligned (16))); \ \ struct LimExNFA##size { \ u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index f6f5809..8998830 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -77,7 +77,7 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, struct NFAContext32 *ctx, char in_rev, char flags) { assert(estate != 0); // guaranteed by calling macro - if (estate == ctx->cached_estate) { + if (unlikely(estate == ctx->cached_estate)) { DEBUG_PRINTF("using cached succ from previous state\n"); *succ |= ctx->cached_esucc; if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { @@ -103,21 +103,21 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, u32 bit = findAndClearLSB_32(&estate); u32 idx = rank_in_mask32(limex->exceptionMask, bit); const struct NFAException32 *e = &exceptions[idx]; - if (!runException32(e, s, succ, &local_succ, limex, offset, ctx, - &new_cache, &cacheable, in_rev, flags)) { + if (unlikely(!runException32(e, s, succ, &local_succ, 
limex, offset, ctx, + &new_cache, &cacheable, in_rev, flags))) { return PE_RV_HALT; } } while (estate != 0); *succ |= local_succ; - if (cacheable == CACHE_RESULT) { + if (unlikely(cacheable == CACHE_RESULT)) { ctx->cached_estate = orig_estate; ctx->cached_esucc = local_succ; ctx->cached_reports = new_cache.reports; ctx->cached_br = new_cache.br; } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { - if (ctx->cached_br) { + if (unlikely(ctx->cached_br)) { ctx->cached_estate = 0U; } } diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 09ffc0c..6231e61 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -42,39 +42,36 @@ #ifdef DEBUG #include -#define DUMP_MSK(_t) \ -static UNUSED \ -void dumpMsk##_t(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7-j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ -} \ -static UNUSED \ -void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c",c); \ - else \ - printf("."); \ - } \ -} +#define DUMP_MSK(_t) \ + static UNUSED void dumpMsk##_t(m##_t msk) { \ + u8 *mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7 - j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ + } \ + static UNUSED void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 *mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c", c); \ + else \ + printf("."); \ + } \ + } #endif /** \brief Naive byte-by-byte implementation. */ -static really_inline -const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { +static really_inline const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, + const u8 *buf, const u8 *buf_end) { assert(buf < buf_end); for (; buf < buf_end; ++buf) { @@ -87,9 +84,8 @@ const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, } /** \brief Naive byte-by-byte implementation. 
*/ -static really_inline -const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { +static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, + const u8 *buf, const u8 *buf_end) { assert(buf < buf_end); for (buf_end--; buf_end >= buf; buf_end--) { @@ -111,25 +107,33 @@ DUMP_MSK(128) #define GET_LO_4(chars) and128(chars, low4bits) #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) -static really_inline -u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, - const m128 compare) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); +static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, + const m128 low4bits, const m128 compare) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk128AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk128(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk128(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk128(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk128(t); + printf("\n"); #endif return movemask128(eq128(t, compare)); } -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { +static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); assert(pos < 16); @@ -139,9 +143,9 @@ const u8 *firstMatch(const u8 *buf, u32 z) { } } -static really_inline -const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { +static really_inline const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, + const u8 *buf, const m128 low4bits, + const m128 zeroes) { u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch(buf, z); @@ -153,13 +157,13 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, assert(buf < buf_end); // Slow path for small cases. 
- if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + if (unlikely(buf_end - buf < 16)) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; @@ -179,6 +183,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); + +#if defined(HAVE_NEON) + __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); +#endif + rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; @@ -198,10 +207,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline -const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { +static really_inline const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { #ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF("confirming match in:"); + dumpMsk128(t); + printf("\n"); #endif u32 z = movemask128(eq128(t, compare)); @@ -215,20 +225,29 @@ const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { } } - -static really_inline -const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); +static really_inline const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, + const u8 *buf, const m128 low4bits, + const m128 zeroes) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk128AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk128(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk128(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk128(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk128(t); + printf("\n"); #endif return lastMatch(buf, t, zeroes); @@ -241,12 +260,12 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. 
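The scan loops above (and the FDR loop earlier in this patch) add an explicit prfm pldl1keep inline-assembly prefetch of the data 256 bytes ahead. Where __builtin_prefetch is usable, the same hint can be expressed portably; a small sketch under that assumption (the helper name and the NO_BUILTIN_PREFETCH switch are illustrative, not part of the patch):

```c
/* Both forms ask the core to pull buf + 256 into L1 ahead of use;
 * neither changes program semantics. */
static inline void prefetch_ahead(const unsigned char *buf) {
#if defined(__aarch64__) && defined(NO_BUILTIN_PREFETCH) /* hypothetical */
    __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256)));
#else
    __builtin_prefetch(buf + 256, 0 /* read */, 3 /* keep in cache */);
#endif
}
```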
if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; assert(buf_end - buf >= 16); @@ -283,32 +302,48 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } -static really_inline -const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, - m128 chars, const u8 *buf, const m128 low4bits, - const m128 ones) { +static really_inline const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + m128 chars, const u8 *buf, + const m128 low4bits, const m128 ones) { m128 chars_lo = GET_LO_4(chars); m128 chars_hi = GET_HI_4(chars); - m128 c_lo = pshufb_m128(mask1_lo, chars_lo); - m128 c_hi = pshufb_m128(mask1_hi, chars_hi); - m128 t = or128(c_lo, c_hi); + m128 c_lo = pshufb_m128(mask1_lo, chars_lo); + m128 c_hi = pshufb_m128(mask1_hi, chars_hi); + m128 t = or128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk128AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk128(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk128(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk128(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk128(t); + printf("\n"); #endif - m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); - m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); - m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); + m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); + m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); + DEBUG_PRINTF(" c2_lo: "); + dumpMsk128(c2_lo); + printf("\n"); + DEBUG_PRINTF(" c2_hi: "); + dumpMsk128(c2_hi); + printf("\n"); + DEBUG_PRINTF(" t2: "); + dumpMsk128(t2); + printf("\n"); #endif u32 z = movemask128(eq128(t2, ones)); @@ -316,19 +351,18 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, return firstMatch(buf, z); } -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; // Preconditioning: most of the time our buffer won't be aligned. 
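Here and in truffle.c below, raw SSE constructors such as _mm_set1_epi8(0xf) are replaced by wrappers like set16x8(0xf), so the same scanner source can sit on top of either the x86 backend (src/util/simd_x86.h) or the NEON backend (src/util/simd_arm.h), both added by this patch but not shown in this excerpt. A minimal sketch of what such a wrapper pair could look like; the NEON union layout is an assumption inferred from the .vect_s8 accesses in fdr.c above:

```c
#if defined(__x86_64__) || defined(__i386__)
#include <emmintrin.h>
typedef __m128i m128;
/* set16x8: broadcast one byte across a 128-bit vector. */
static inline m128 set16x8(char c) { return _mm_set1_epi8(c); }
#elif defined(__aarch64__)
#include <arm_neon.h>
typedef union {
    int8x16_t vect_s8;   /* matches the .vect_s8 member used in fdr.c */
    int64x2_t vect_s64;
} m128;
static inline m128 set16x8(char c) {
    m128 r;
    r.vect_s8 = vdupq_n_s8((int8_t)c);
    return r;
}
#endif
```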
m128 chars = loadu128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf, low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, low4bits, + ones); if (rv) { return rv; } @@ -340,8 +374,13 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - lchars, buf, low4bits, ones); + +#if defined(HAVE_NEON) + __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); +#endif + + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, lchars, buf, + low4bits, ones); if (rv) { return rv; } @@ -351,8 +390,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, // Use an unaligned load to mop up the last 16 bytes and get an accurate // picture to buf_end. chars = loadu128(buf_end - 16); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf_end - 16, low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf_end - 16, + low4bits, ones); if (rv) { return rv; } @@ -370,26 +409,34 @@ DUMP_MSK(256) #define GET_LO_4(chars) and256(chars, low4bits) #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) -static really_inline -u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, - const m256 compare) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); +static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, + const m256 low4bits, const m256 compare) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); m256 t = and256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk256AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk256(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk256(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk256(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk256(t); + printf("\n"); #endif return movemask256(eq256(t, compare)); } -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { +static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { DEBUG_PRINTF("z 0x%08x\n", z); if (unlikely(z != 0xffffffff)) { u32 pos = ctz32(~z); @@ -401,9 +448,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { } } -static really_inline -const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline const u8 * +fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -415,9 +461,9 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, return firstMatch(buf, z); } -static really_inline -const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { +static really_inline const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, + const u8 *buf, const u8 *buf_end, + const m256 low4bits) { // run shufti over two overlapping 16-byte unaligned reads const m256 mask = combine2x128(mask_hi, 
mask_lo); m128 chars = loadu128(buf); @@ -434,9 +480,9 @@ const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline -const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { +static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, + const u8 *buf, const m256 low4bits, + const m256 zeroes) { u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch(buf, z); @@ -451,8 +497,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m256 low4bits = set32x8(0xf); @@ -483,7 +529,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 32; while (buf < last_block) { m256 lchars = load256(buf); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); + rv = + fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; } @@ -494,7 +541,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // picture to buf_end. assert(buf <= buf_end && buf >= buf_end - 32); chars = loadu256(buf_end - 32); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, + zeroes); if (rv) { return rv; } @@ -502,8 +550,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline -const u8 *lastMatch(const u8 *buf, u32 z) { +static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffffffff)) { u32 pos = clz32(~z); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); @@ -513,28 +560,37 @@ const u8 *lastMatch(const u8 *buf, u32 z) { } } -static really_inline -const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); +static really_inline const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, + const u8 *buf, const m256 low4bits, + const m256 zeroes) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk256AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk256(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk256(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk256(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk256(t); + printf("\n"); #endif u32 z = movemask256(eq256(t, zeroes)); return lastMatch(buf, z); } -static really_inline -const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline const u8 * +revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = 
combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -546,9 +602,9 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, return lastMatch(buf, z); } -static really_inline -const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { +static really_inline const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, + const u8 *buf, const u8 *buf_end, + const m256 low4bits) { // run shufti over two overlapping 16-byte unaligned reads const m256 mask = combine2x128(mask_hi, mask_lo); @@ -566,7 +622,6 @@ const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } - /* takes 128 bit masks, but operates on 256 bits of data */ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { @@ -575,8 +630,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m256 low4bits = set32x8(0xf); @@ -594,7 +649,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Preconditioning: most of the time our buffer won't be aligned. m256 chars = loadu256(buf_end - 32); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, + zeroes); if (rv) { return rv; } @@ -606,7 +662,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, while (buf_end > last_block) { buf_end -= 32; m256 lchars = load256(buf_end); - rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); + rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, + zeroes); if (rv) { return rv; } @@ -623,42 +680,58 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } -static really_inline -const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, - m256 chars, const u8 *buf, const m256 low4bits, - const m256 ones) { +static really_inline const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, + m256 mask2_lo, m256 mask2_hi, + m256 chars, const u8 *buf, + const m256 low4bits, const m256 ones) { DEBUG_PRINTF("buf %p\n", buf); m256 chars_lo = GET_LO_4(chars); m256 chars_hi = GET_HI_4(chars); - m256 c_lo = pshufb_m256(mask1_lo, chars_lo); - m256 c_hi = pshufb_m256(mask1_hi, chars_hi); - m256 t = or256(c_lo, c_hi); + m256 c_lo = pshufb_m256(mask1_lo, chars_lo); + m256 c_hi = pshufb_m256(mask1_hi, chars_hi); + m256 t = or256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk256AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk256(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk256(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk256(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk256(t); + printf("\n"); #endif - m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); - m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); + m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); + m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); m256 t2 = or256(t, 
rshift128_m256(or256(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); + DEBUG_PRINTF(" c2_lo: "); + dumpMsk256(c2_lo); + printf("\n"); + DEBUG_PRINTF(" c2_hi: "); + dumpMsk256(c2_hi); + printf("\n"); + DEBUG_PRINTF(" t2: "); + dumpMsk256(t2); + printf("\n"); #endif u32 z = movemask256(eq256(t2, ones)); return firstMatch(buf, z); } -static really_inline -const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline const u8 *fwdBlockShort2(m256 mask1, m256 mask2, + m128 chars, const u8 *buf, + const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -672,9 +745,10 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, return firstMatch(buf, z); } -static really_inline -const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +static really_inline const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, + const u8 *buf_end) { DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); const m256 low4bits = set32x8(0xf); // run shufti over two overlapping 16-byte unaligned reads @@ -695,9 +769,8 @@ const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, } /* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); @@ -731,8 +804,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *last_block = buf_end - 32; while (buf < last_block) { m256 lchars = load256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - lchars, buf, low4bits, ones); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, lchars, buf, low4bits, ones); if (rv) { return rv; } @@ -757,26 +830,34 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, DUMP_MSK(512) #endif -static really_inline -u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, - const m512 compare) { +static really_inline u64a block(m512 mask_lo, m512 mask_hi, m512 chars, + const m512 low4bits, const m512 compare) { m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); + m512 c_hi = + pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); m512 t = and512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk512AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk512(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk512(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk512(c_hi); + 
printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk512(t); + printf("\n"); #endif return eq512mask(t, compare); } -static really_inline -const u8 *firstMatch64(const u8 *buf, u64a z) { +static really_inline const u8 *firstMatch64(const u8 *buf, u64a z) { DEBUG_PRINTF("z 0x%016llx\n", z); if (unlikely(z != ~0ULL)) { u32 pos = ctz64(~z); @@ -788,18 +869,19 @@ const u8 *firstMatch64(const u8 *buf, u64a z) { } } -static really_inline -const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { +static really_inline const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, + m512 chars, const u8 *buf, + const m512 low4bits, + const m512 zeroes) { u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch64(buf, z); } -static really_inline -const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { +static really_inline const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, + const u8 *buf, const u8 *buf_end, + const m512 low4bits, + const m512 zeroes) { DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -877,8 +959,7 @@ done: return buf_end; } -static really_inline -const u8 *lastMatch64(const u8 *buf, u64a z) { +static really_inline const u8 *lastMatch64(const u8 *buf, u64a z) { DEBUG_PRINTF("z 0x%016llx\n", z); if (unlikely(z != ~0ULL)) { u32 pos = clz64(~z); @@ -889,10 +970,10 @@ const u8 *lastMatch64(const u8 *buf, u64a z) { } } -static really_inline -const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { +static really_inline const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, + const u8 *buf, const u8 *buf_end, + const m512 low4bits, + const m512 zeroes) { DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -909,20 +990,31 @@ const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, return lastMatch64(buf, z | ~k); } -static really_inline -const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); +static really_inline const u8 *revBlock512(m512 mask_lo, m512 mask_hi, + m512 chars, const u8 *buf, + const m512 low4bits, + const m512 zeroes) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = + pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk512AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk512(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk512(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk512(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk512(t); + printf("\n"); #endif u64a z = eq512mask(t, zeroes); @@ -985,43 +1077,60 @@ done: return buf - 1; } -static really_inline -const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 
mask2_hi, - m512 chars, const u8 *buf, const m512 low4bits, - const m512 ones, __mmask64 k) { +static really_inline const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, + m512 mask2_lo, m512 mask2_hi, + m512 chars, const u8 *buf, + const m512 low4bits, const m512 ones, + __mmask64 k) { DEBUG_PRINTF("buf %p %.64s\n", buf, buf); m512 chars_lo = and512(chars, low4bits); m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); - m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); - m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); - m512 t = or512(c_lo, c_hi); + m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); + m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); + m512 t = or512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk512AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk512(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk512(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk512(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk512(t); + printf("\n"); #endif - m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); - m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); + m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); + m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); + DEBUG_PRINTF(" c2_lo: "); + dumpMsk512(c2_lo); + printf("\n"); + DEBUG_PRINTF(" c2_hi: "); + dumpMsk512(c2_hi); + printf("\n"); + DEBUG_PRINTF(" t2: "); + dumpMsk512(t2); + printf("\n"); #endif u64a z = eq512mask(t2, ones); return firstMatch64(buf, z | ~k); } -static really_inline -const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, - m512 mask2_hi, const u8 *buf, const u8 *buf_end, - const m512 low4bits, const m512 ones) { +static really_inline const u8 * +shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, + const u8 *buf, const u8 *buf_end, const m512 low4bits, + const m512 ones) { DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -1038,9 +1147,8 @@ const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, } /* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index be6b312..c05d778 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -41,7 +41,7 @@ static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { + if (z != 0xffff) { u32 pos = clz32(~z & 0xffff); assert(pos >= 16 && pos < 32); return buf + (31 - pos); @@ -52,7 +52,7 @@ const u8 *lastMatch(const u8 *buf, u32 z) { static 
really_inline const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { + if (likely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); assert(pos < 16); return buf + pos; @@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { static really_inline u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - m128 highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + m128 highconst = set16x8(0x80); + m128 shuf_mask_hi = set2x64(0x8040201008040201); // and now do the real work m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); @@ -124,7 +124,7 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear, assert(buf < buf_end); const u8 *rv; - if (buf_end - buf < 16) { + if (unlikely(buf_end - buf < 16)) { return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, buf_end); } diff --git a/src/parser/control_verbs.cpp b/src/parser/control_verbs.cpp new file mode 100644 index 0000000..482004d --- /dev/null +++ b/src/parser/control_verbs.cpp @@ -0,0 +1,340 @@ + +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Parser for control verbs that can occur at the beginning of a pattern. 
+ */ + +#include "parser/control_verbs.h" + +#include "parser/Parser.h" +#include "parser/parse_error.h" + +#include +#include + +using namespace std; + +namespace ue2 { + +const char *read_control_verbs(const char *ptr, const char *end, size_t start, + ParseMode &mode) { + const char *p = ptr; + const char *pe = end; + const char *eof = pe; + const char *ts, *te; + int cs; + UNUSED int act; + + static const char _ControlVerbs_actions[] = { + 0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9}; + + static const unsigned char _ControlVerbs_key_offsets[] = { + 0, 7, 8, 10, 12, 14, 16, 18, 20, 21, 23, 25, 27, + 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 55, + 57, 59, 61, 63, 66, 68, 70, 72, 74, 76, 79, 82, 84, + 86, 88, 90, 92, 94, 96, 98, 100, 102, 105, 107, 109, 111, + 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, + 139, 141, 143, 146, 148, 149, 151, 155, 157, 159, 160, 161}; + + static const char _ControlVerbs_trans_keys[] = { + 41, 65, 66, 67, 76, 78, 85, 41, 41, 78, 41, 89, 41, 67, 41, 82, 41, + 76, 41, 70, 41, 41, 83, 41, 82, 41, 95, 41, 65, 85, 41, 78, 41, 89, + 41, 67, 41, 78, 41, 73, 41, 67, 41, 79, 41, 68, 41, 69, 41, 82, 41, + 76, 41, 70, 73, 41, 77, 41, 73, 41, 84, 41, 95, 41, 77, 82, 41, 65, + 41, 84, 41, 67, 41, 72, 41, 61, 41, 48, 57, 41, 48, 57, 41, 69, 41, + 67, 41, 85, 41, 82, 41, 83, 41, 73, 41, 79, 41, 78, 41, 79, 41, 95, + 41, 65, 83, 41, 85, 41, 84, 41, 79, 41, 95, 41, 80, 41, 79, 41, 83, + 41, 83, 41, 69, 41, 83, 41, 83, 41, 84, 41, 65, 41, 82, 41, 84, 41, + 95, 41, 79, 41, 80, 41, 84, 41, 67, 84, 41, 80, 41, 41, 70, 41, 49, + 51, 56, 41, 54, 41, 50, 41, 40, 42, 0}; + + static const char _ControlVerbs_single_lengths[] = { + 7, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 4, 2, 2, 1, 1, 1}; + + static const char _ControlVerbs_range_lengths[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + static const short _ControlVerbs_index_offsets[] = { + 0, 8, 10, 13, 16, 19, 22, 25, 28, 30, 33, 36, 39, + 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 80, + 83, 86, 89, 92, 96, 99, 102, 105, 108, 111, 114, 117, 120, + 123, 126, 129, 132, 135, 138, 141, 144, 147, 151, 154, 157, 160, + 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193, 196, 199, + 202, 205, 208, 212, 215, 217, 220, 225, 228, 231, 233, 235}; + + static const char _ControlVerbs_indicies[] = { + 0, 2, 3, 4, 5, 6, 7, 1, 8, 1, 8, 9, 1, 8, 10, 1, 11, + 12, 1, 8, 13, 1, 8, 14, 1, 8, 15, 1, 11, 1, 8, 16, 1, 8, + 17, 1, 8, 18, 1, 8, 19, 20, 1, 8, 21, 1, 8, 22, 1, 8, 12, + 1, 8, 23, 1, 8, 24, 1, 8, 25, 1, 8, 26, 1, 8, 27, 1, 8, + 15, 1, 8, 28, 1, 11, 14, 1, 8, 15, 29, 1, 8, 30, 1, 8, 31, + 1, 8, 32, 1, 8, 33, 1, 8, 34, 35, 1, 8, 36, 1, 8, 37, 1, + 8, 38, 1, 8, 39, 1, 8, 40, 1, 8, 41, 1, 11, 41, 1, 8, 42, + 1, 8, 43, 1, 8, 44, 1, 8, 45, 1, 8, 46, 1, 8, 47, 1, 8, + 48, 1, 8, 39, 1, 8, 49, 1, 8, 50, 1, 8, 51, 52, 1, 8, 53, + 1, 8, 54, 1, 8, 55, 1, 8, 56, 1, 8, 57, 1, 8, 58, 1, 8, + 59, 1, 8, 60, 1, 8, 61, 1, 8, 62, 1, 8, 15, 1, 8, 63, 1, + 8, 64, 1, 8, 65, 1, 8, 66, 1, 8, 67, 1, 8, 68, 1, 8, 69, + 1, 8, 15, 1, 8, 70, 71, 1, 8, 72, 1, 73, 1, 8, 74, 1, 75, + 76, 77, 78, 1, 8, 15, 1, 8, 15, 1, 75, 1, 80, 79, 82, 81, 0}; + + static 
const char _ControlVerbs_trans_targs[] = { + 75, 1, 2, 9, 22, 24, 45, 67, 75, 3, 4, 75, 5, 6, 7, 8, 10, + 11, 12, 13, 16, 14, 15, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28, 29, + 30, 37, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 46, 47, + 48, 59, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, + 65, 66, 68, 70, 69, 75, 71, 75, 72, 73, 74, 75, 76, 75, 0}; + + static const char _ControlVerbs_trans_actions[] = { + 19, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 7, 0, 0, 0, 15, 5, 17, 0}; + + static const char _ControlVerbs_to_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; + + static const char _ControlVerbs_from_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0}; + + static const short _ControlVerbs_eof_trans[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 82}; + + static const int ControlVerbs_start = 75; + static const int ControlVerbs_first_final = 75; + static const int ControlVerbs_error = -1; + + static const int ControlVerbs_en_main = 75; + + { + cs = ControlVerbs_start; + ts = 0; + te = 0; + act = 0; + } + + try { + + { + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if (p == pe) + goto _test_eof; + _resume: + _acts = + _ControlVerbs_actions + _ControlVerbs_from_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 1: { + ts = p; + } break; + } + } + + _keys = _ControlVerbs_trans_keys + _ControlVerbs_key_offsets[cs]; + _trans = _ControlVerbs_index_offsets[cs]; + + _klen = _ControlVerbs_single_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + ((_upper - _lower) >> 1); + if ((*p) < *_mid) + _upper = _mid - 1; + else if ((*p) > *_mid) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += _klen; + } + + _klen = _ControlVerbs_range_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen << 1) - 2; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + (((_upper - _lower) >> 1) & ~1); + if ((*p) < _mid[0]) + _upper = _mid - 2; + else if ((*p) > _mid[1]) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys) >> 1); + goto _match; + } + } + _trans += _klen; + } + + _match: + _trans = _ControlVerbs_indicies[_trans]; + _eof_trans: + cs = _ControlVerbs_trans_targs[_trans]; + + if (_ControlVerbs_trans_actions[_trans] == 0) + goto _again; + + _acts = _ControlVerbs_actions + _ControlVerbs_trans_actions[_trans]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch 
(*_acts++) { + case 2: { + te = p + 1; + } break; + case 3: { + te = p + 1; + { mode.utf8 = true; } + } break; + case 4: { + te = p + 1; + { mode.ucp = true; } + } break; + case 5: { + te = p + 1; + { + ostringstream str; + str << "Unsupported control verb " + << string(ts, te - ts); + throw LocatedParseError(str.str()); + } + } break; + case 6: { + te = p + 1; + { + ostringstream str; + str << "Unknown control verb " << string(ts, te - ts); + throw LocatedParseError(str.str()); + } + } break; + case 7: { + te = p + 1; + { + p--; + { + p++; + goto _out; + } + } + } break; + case 8: { + te = p; + p--; + { + p--; + { + p++; + goto _out; + } + } + } break; + case 9: { + { p = ((te)) - 1; } + { + p--; + { + p++; + goto _out; + } + } + } break; + } + } + + _again: + _acts = _ControlVerbs_actions + _ControlVerbs_to_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 0: { + ts = 0; + } break; + } + } + + if (++p != pe) + goto _resume; + _test_eof : {} + if (p == eof) { + if (_ControlVerbs_eof_trans[cs] > 0) { + _trans = _ControlVerbs_eof_trans[cs] - 1; + goto _eof_trans; + } + } + + _out : {} + } + + } catch (LocatedParseError &error) { + if (ts >= ptr && ts <= pe) { + error.locate(ts - ptr + start); + } else { + error.locate(0); + } + throw; + } + + return p; +} + +} // namespace ue2 diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index 976208b..4456679 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, u32 count = *count_inout; const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); diff --git a/src/util/arch.h b/src/util/arch.h index 985fec6..fe4a910 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -61,6 +61,10 @@ #define HAVE_AVX512VBMI #endif +#if defined(__aarch64__) +#define HAVE_NEON +#endif + /* * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros */ @@ -87,4 +91,11 @@ #define NO_ASM #endif +/* + * AARCH64 uses a different form of inline asm + */ +#if defined(__aarch64__) +#define NO_ASM +#endif + #endif // UTIL_ARCH_H_ diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c index c00ce58..e0f6368 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c @@ -39,7 +39,7 @@ u64a cpuid_flags(void) { u64a cap = 0; - +#if defined(__X86_64__) if (check_avx2()) { DEBUG_PRINTF("AVX2 enabled\n"); cap |= HS_CPU_FEATURES_AVX2; @@ -68,7 +68,7 @@ u64a cpuid_flags(void) { (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) cap &= ~HS_CPU_FEATURES_AVX512VBMI; #endif - +#endif return cap; } @@ -78,6 +78,7 @@ struct family_id { u32 tune; }; +#if defined(__X86_64__) /* from table 35-1 of the Intel 64 and IA32 Arch. 
Software Developer's Manual * and "Intel Architecture and Processor Identification With CPUID Model and * Family Numbers" */ @@ -121,6 +122,7 @@ static const struct family_id known_microarch[] = { { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ }; +#endif #ifdef DUMP_SUPPORT static UNUSED @@ -144,6 +146,7 @@ const char *dumpTune(u32 tune) { #endif u32 cpuid_tune(void) { +#if defined(__X86_64__) unsigned int eax, ebx, ecx, edx; cpuid(1, 0, &eax, &ebx, &ecx, &edx); @@ -171,6 +174,6 @@ u32 cpuid_tune(void) { DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); return tune; } - +#endif return HS_TUNE_FAMILY_GENERIC; } diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h index 527c6d5..3125bd1 100644 --- a/src/util/cpuid_flags.h +++ b/src/util/cpuid_flags.h @@ -32,7 +32,9 @@ #include "ue2common.h" #if !defined(_WIN32) && !defined(CPUID_H_) +#if defined(__x86_64__) #include +#endif /* system header doesn't have a header guard */ #define CPUID_H_ #endif diff --git a/src/util/cpuid_inline.h b/src/util/cpuid_inline.h index b7b4245..b228c1d 100644 --- a/src/util/cpuid_inline.h +++ b/src/util/cpuid_inline.h @@ -32,17 +32,20 @@ #include "ue2common.h" #include "cpuid_flags.h" +#if defined(__x86_64__) || defined(_M_X64) #if !defined(_WIN32) && !defined(CPUID_H_) #include /* system header doesn't have a header guard */ #define CPUID_H_ #endif +#endif #ifdef __cplusplus extern "C" { #endif +#if defined(__x86_64__) || defined(_M_X64) static inline void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { @@ -57,6 +60,7 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, *edx = a[3]; #endif } +#endif // ECX #define CPUID_SSE3 (1 << 0) @@ -93,11 +97,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, #define CPUID_XCR0_AVX512 \ (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM) +#if defined(__x86_64__) static inline u64a xgetbv(u32 op) { #if defined(_WIN32) || defined(__INTEL_COMPILER) return _xgetbv(op); -#else +#elif defined(__x86_64__) u32 a, d; __asm__ volatile ( "xgetbv\n" @@ -252,6 +257,16 @@ int check_popcnt(void) { cpuid(1, 0, &eax, &ebx, &ecx, &edx); return !!(ecx & CPUID_POPCNT); } +#endif //__x86_64__ + +static inline +int check_neon(void) { +#if defined(__aarch64__) + return 1; +#else + return 0; +#endif +} #ifdef __cplusplus } /* extern "C" */ diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index edc4f6e..ece3b1a 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -55,10 +55,22 @@ # endif #endif +#ifdef __cplusplus +# if defined(HAVE_CXX_ARM_NEON_H) +# define USE_ARM_NEON_H +# endif +#else // C +# if defined(HAVE_C_ARM_NEON_H) +# define USE_ARM_NEON_H +# endif +#endif + #if defined(USE_X86INTRIN_H) #include #elif defined(USE_INTRIN_H) #include +#elif defined(USE_ARM_NEON_H) +#include #else #error no intrinsics file #endif diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b..7d794d1 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -41,6 +41,8 @@ u32 popcount32(u32 x) { #if defined(HAVE_POPCOUNT_INSTR) // Single-instruction builtin. return _mm_popcnt_u32(x); +#elif defined(HAVE_NEON) + return (u32)vaddlv_u8(vcnt_u8(vcreate_u8((u64a)x))); #else // Fast branch-free version from bit-twiddling hacks as older Intel // processors do not have a POPCNT instruction. 
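For reference, the HAVE_NEON branch added in the hunk above computes the population count in three steps: vcreate_u8 reinterprets the 64-bit input as eight bytes, vcnt_u8 counts the set bits in each byte, and vaddlv_u8 performs a widening sum of the eight lane counts into a scalar. A minimal standalone sketch of the same chain follows (assuming an AArch64 toolchain; the helper name popcount64_neon is illustrative and not part of the patch):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: mirrors the intrinsic chain used by the patch's
 * HAVE_NEON popcount fallback. */
static inline uint32_t popcount64_neon(uint64_t x) {
    uint8x8_t bytes    = vcreate_u8(x);      /* view the 64 bits as 8 bytes */
    uint8x8_t per_byte = vcnt_u8(bytes);     /* per-byte bit counts */
    return (uint32_t)vaddlv_u8(per_byte);    /* widening sum across lanes */
}

int main(void) {
    printf("%u\n", popcount64_neon(0xf0f0f0f0f0f0f0f0ULL)); /* prints 32 */
    return 0;
}

The same chain covers the 32-bit case as well, since casting the u32 argument to u64a zero-extends it and the upper bytes contribute nothing to the sum.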
@@ -63,7 +65,9 @@ u32 popcount64(u64a x) { x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; return (x * 0x0101010101010101) >> 56; -# endif +#endif +#elif defined(HAVE_NEON) + return (u32)vaddlv_u8(vcnt_u8(vcreate_u8((u64a)x))); #else // Synthesise from two 32-bit cases. return popcount32(x >> 32) + popcount32(x); diff --git a/src/util/simd_arm.h b/src/util/simd_arm.h new file mode 100644 index 0000000..cce119f --- /dev/null +++ b/src/util/simd_arm.h @@ -0,0 +1,1069 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * 2020.01 - Use the neon instruction to implement the function of 128-bit operation. + * Huawei Technologies Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef SIMD_ARM +#define SIMD_ARM + +#include "config.h" +#include "simd_types.h" +#include "ue2common.h" +#include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include // for memcpy + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. +#ifdef __cplusplus +#ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +#define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +#endif +#else +#ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +#define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +#endif +#endif + +// Fallback to identity case. 
+#ifndef assume_aligned +#define assume_aligned(x, y) (x) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + +/* +** extend 4.8.5 neon inline assembly functions +*/ +__extension__ static __inline uint64x2_t __attribute__((__always_inline__)) +vmvnq_u64(uint64x2_t a) { + uint64x2_t result; + __asm__("mvn %0.16b,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); + return result; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +static really_inline m128 ones128(void) { + m128 result; + result.vect_s32 = vdupq_n_s32(0xFFFFFFFF); + return result; +} + +static really_inline m128 zeroes128(void) { + m128 result; + result.vect_s32 = vdupq_n_s32(0x0); + return result; +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return !!vaddlvq_s16(veorq_s16(a.vect_s16, b.vect_s16)); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + m128 tmp; + tmp.vect_u32 = vmvnq_u32(vceqq_u32(a.vect_u32, b.vect_u32)); + return ((vgetq_lane_u32(tmp.vect_u32, 3) & 0x8) | + (vgetq_lane_u32(tmp.vect_u32, 2) & 0x4) | + (vgetq_lane_u32(tmp.vect_u32, 1) & 0x2) | + (vgetq_lane_u32(tmp.vect_u32, 0) & 0x1)); +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + m128 tmp; + tmp.vect_u64 = vmvnq_u64(vceqq_u64(a.vect_u64, b.vect_u64)); + return (u32)((vgetq_lane_u64(tmp.vect_u64, 1) & 0x4) | + (vgetq_lane_u64(tmp.vect_u64, 0) & 0x1)); +} + +static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { + assert(b <= 63); + m128 result; + result.vect_s64 = vshlq_n_s64(a.vect_s64, b); + return result; +} + +static really_really_inline m128 rshift64_m128(m128 a, int imm8) { + assert(imm8 >= 0 && imm8 <= 63); + if (unlikely(imm8 == 0)) { + return a; + } + m128 result; + result.vect_u64 = vshrq_n_u64(a.vect_u64, imm8); + return result; +} + +static really_really_inline m128 eq128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vceqq_s8(a.vect_s8, b.vect_s8); + return result; +} + +static really_really_inline u32 movemask128(m128 a) { + m128 result; + result.vect_u8 = vshrq_n_u8(a.vect_u8, 7); + result.vect_u16 = vsraq_n_u16(result.vect_u16, result.vect_u16, 7); + result.vect_u32 = vsraq_n_u32(result.vect_u32, result.vect_u32, 14); + result.vect_u64 = vsraq_n_u64(result.vect_u64, result.vect_u64, 28); + return (u32)(vgetq_lane_u8(result.vect_u8, 0) | + ((u32)vgetq_lane_u8(result.vect_u8, 8) << 8)); +} + +static really_really_inline m128 rshiftbyte_m128(m128 a, int imm8) { + assert(imm8 >= 0 && imm8 <= 15); + m128 result; + result.vect_s8 = vextq_s8(a.vect_s8, vdupq_n_s8(0), imm8); + return result; +} + +static really_really_inline m128 lshiftbyte_m128(m128 a, int imm8) { + assert(imm8 >= 0 && imm8 <= 15); + m128 result; + if (unlikely(imm8 == 0)) { + return a; + } + result.vect_s8 = vextq_s8(vdupq_n_s8(0), a.vect_s8, (16 - imm8)); + return result; +} + +static really_inline m128 set16x8(u8 c) { + m128 result; + result.vect_s8 = vdupq_n_s8(c); + return result; +} + +static really_inline m128 set4x32(u32 c) { + m128 result; + result.vect_s32 
= vdupq_n_s32(c); + return result; +} + +static really_inline m128 set2x64(u64a c) { + m128 result; + result.vect_u64 = vdupq_n_u64(c); + return result; +} + +static really_inline u32 movd(const m128 in) { + u32 result; + result = vgetq_lane_u32(in.vect_u32, 0); + return result; +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64(in.vect_u64, 0); +} + +/* another form of movq */ +static really_inline m128 load_m128_from_u64a(const u64a *p) { + m128 result; + __asm__ __volatile__("ldr %d0, %1 \n\t" + : "=w"(result) + : "Utv"(*p) + : /* No clobbers */ + ); + return result; +} + +/*The x86 platform does not perform the lower 2 bit operation. +If the value of imm exceeds 2 bit, a compilation error occurs.*/ +static really_inline u32 extract32from128(m128 a, int imm) { + return vgetq_lane_s32(a.vect_s32, imm & 0x0003); +} + +/*The x86 platform does not perform the lower 1 bit operation. +If the value of imm exceeds 1 bit, a compilation error occurs.*/ +static really_inline u64a extract64from128(m128 a, int imm) { + return vgetq_lane_s64(a.vect_s64, imm & 0x0001); +} + +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) + +/*The x86 platform does not perform the lower 2 bit operation. +If the value of imm exceeds 2 bit, a compilation error occurs.*/ +static really_inline u32 extract32from256(m256 a, int imm) { + return vgetq_lane_s32((imm >> 2) ? a.hi.vect_s32 : a.lo.vect_s32, + imm & 0x0003); +} + +/*The x86 platform does not perform the lower 1 bit operation. +If the value of imm exceeds 1 bit, a compilation error occurs.*/ +static really_inline u64a extract64from256(m256 a, int imm) { + return vgetq_lane_s64((imm >> 1) ? a.hi.vect_s64 : a.lo.vect_s64, + imm & 0x0001); +} + +static really_inline m128 and128(m128 a, m128 b) { + m128 result; + result.vect_s32 = vandq_s32(a.vect_s32, b.vect_s32); + return result; +} + +static really_inline m128 not128(m128 a) { + m128 result; + result.vect_s32 = vmvnq_s32(a.vect_s32); + return result; +} + +static really_inline m128 xor128(m128 a, m128 b) { + m128 result; + result.vect_s32 = veorq_s32(a.vect_s32, b.vect_s32); + return result; +} + +static really_inline m128 or128(m128 a, m128 b) { + m128 result; + result.vect_s32 = vorrq_s32(a.vect_s32, b.vect_s32); + return result; +} + +static really_inline m128 andnot128(m128 a, m128 b) { + m128 result; + result.vect_s32 = vbicq_s32(b.vect_s32, a.vect_s32); + return result; +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + m128 result; + result.vect_s32 = vld1q_s32((const int32_t *)ptr); + return result; +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + m128 result; + result.vect_s32 = vld1q_s32((const int32_t *)ptr); + return result; +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a.vect_s32); +} + +// packed unaligned store of first N bytes +static really_inline void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; 
+} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + return isnonzero128(and128(mask, val)); +} + +// offset must be an immediate +/*The x86 platform does not perform the lower 8 bit operation. +If the value of imm exceeds 8 bit, a compilation error occurs.*/ +static really_inline m128 palignr(m128 a, m128 b, int count) { + m128 result; + count = count & 0xff; + if (likely(count < 16)) { + result.vect_s8 = vextq_s8(b.vect_s8, a.vect_s8, count); + } else if (count < 32) { + result.vect_s8 = vextq_s8(a.vect_s8, vdupq_n_s8(0x0), count - 16); + } else { + result.vect_s32 = vdupq_n_s32(0); + } + return result; +} + +static really_inline m128 pshufb_m128(m128 a, m128 b) { + m128 result; + __asm__ __volatile__("movi v3.16b, 0x8f \n\t" + "and v3.16b, v3.16b, %2.16b \n\t" + "tbl %0.16b, {%1.16b}, v3.16b \n\t" + : "=w"(result) + : "w"(a), "w"(b) + : "v3"); + return result; +} + +static really_inline m256 pshufb_m256(m256 a, m256 b) { + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +} + +static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline m128 max_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vmaxq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 min_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vminq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 sadd_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vqaddq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 sub_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vsubq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 set64x2(int64_t hi, int64_t lo) { + m128 result; + result.vect_s64 = vsetq_lane_s64(hi, vdupq_n_s64(lo), 1); + return result; +} + +static really_inline m128 set32x4(int i3, int i2, int i1, int i0) { + m128 result; + result.vect_s32 = vsetq_lane_s32( + i3, vsetq_lane_s32(i2, vsetq_lane_s32(i1, vdupq_n_s32(i0), 1), 2), 3); + return result; +} + +/**** + **** 256-bit Primitives + ****/ + +static really_really_inline m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8)in); + rv.hi = rv.lo; + return rv; +} + +static really_inline m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + 
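The pshufb_m128 emulation defined above relies on the AArch64 TBL instruction, which returns zero for any index outside the 16-byte table. Masking the control vector with 0x8f keeps the low four index bits but preserves bit 7, so control bytes with the top bit set index out of range and produce zero, matching the SSSE3 _mm_shuffle_epi8 contract. A small intrinsics-only sketch of the same idea (the patch itself uses inline assembly; pshufb_neon is an illustrative name, not part of the change):

#include <arm_neon.h>

/* Sketch only: intrinsics equivalent of the patch's inline-asm pshufb_m128.
 * TBL yields zero for out-of-range indices, so keeping bit 7 of each control
 * byte reproduces pshufb's "high bit set => zero the result byte" rule. */
static inline uint8x16_t pshufb_neon(uint8x16_t table, uint8x16_t control) {
    uint8x16_t idx = vandq_u8(control, vdupq_n_u8(0x8f));
    return vqtbl1q_u8(table, idx);
}

The 256-bit variant then simply applies this per 128-bit half, the same lo/hi composition used for the rest of the m256 primitives in this header.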
+static really_inline u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} + +static really_inline m256 zeroes256(void) { + m256 rv = {zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m256 ones256(void) { + m256 rv = {ones128(), ones128()}; + return rv; +} + +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} + +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} + +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_inline int diff256(m256 a, m256 b) { + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero256(m256 a) { + return isnonzero128(or128(a.lo, a.hi)); +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { + uint32x4_t x = vceqq_s32(a.lo.vect_s32, b.lo.vect_s32); + uint32x4_t y = vceqq_s32(a.hi.vect_s32, b.hi.vect_s32); + uint8x8_t lo = vqmovn_u16(vcombine_u16(vqmovn_u32(x), vqmovn_u32(y))); + + static const int8_t __attribute__((aligned(16))) + xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + return ~(lo[0] & 0xFF) & 0xff; +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m256 rv = {load128(ptr), load128((const char *)ptr + 16)}; + return rv; +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { + m256 rv = {loadu128(ptr), loadu128((const char *)ptr + 16)}; + return rv; +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +} + +// packed unaligned store of first N bytes +static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +} + +// switches on bit N in the given vector. +static really_inline void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. 
+static really_inline char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline m128 movdq_hi(m256 x) { return x.hi; } + +static really_really_inline m128 movdq_lo(m256 x) { return x.lo; } + +static really_inline m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + uint32x4_t x = vceqq_s32(a.lo.vect_s32, b.lo.vect_s32); + uint32x4_t y = vceqq_s32(a.mid.vect_s32, b.mid.vect_s32); + uint32x4_t w = vceqq_s32(a.hi.vect_s32, b.hi.vect_s32); + + uint16x8_t q = vcombine_u16(vqmovn_u32(x), vqmovn_u32(y)); + uint16x8_t p = vcombine_u16(vqmovn_u32(w), vqmovn_u32(z.vect_u32)); + + uint8x16_t input = vcombine_u8(vqmovn_u16(q), vqmovn_u16(p)); + + static const int8_t __attribute__((aligned(16))) + xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ~((hi[0] << 8) | (lo[0] & 0xFF)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = {load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32)}; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = {loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. +static really_inline char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +static really_inline m512 zeroes512(void) { + m512 rv = {zeroes256(), zeroes256()}; + return rv; +} + +static really_inline m512 ones512(void) { + m512 rv = {ones256(), ones256()}; + return rv; +} + +static really_inline m512 and512(m512 a, m512 b) { + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +} + +static really_inline m512 or512(m512 a, m512 b) { + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +} + +static really_inline m512 xor512(m512 a, m512 b) { + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +} + +static really_inline m512 not512(m512 a) { + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +} + +static really_inline m512 andnot512(m512 a, m512 b) { + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +} + +static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} + +static really_inline int diff512(m512 a, m512 b) { + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +} + +static really_inline int isnonzero512(m512 a) { + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. 
+ */ +static really_inline u32 diffrich512(m512 a, m512 b) { + uint32x4_t x = vceqq_s32(a.lo.lo.vect_s32, b.lo.lo.vect_s32); + uint32x4_t y = vceqq_s32(a.lo.hi.vect_s32, b.lo.hi.vect_s32); + uint32x4_t z = vceqq_s32(a.hi.lo.vect_s32, b.hi.lo.vect_s32); + uint32x4_t w = vceqq_s32(a.hi.hi.vect_s32, b.hi.hi.vect_s32); + uint16x8_t p = vcombine_u16(vqmovn_u32(x), vqmovn_u32(y)); + uint16x8_t q = vcombine_u16(vqmovn_u32(z), vqmovn_u32(w)); + + uint8x16_t input = vcombine_u8(vqmovn_u16(p), vqmovn_u16(q)); + + static const int8_t __attribute__((aligned(16))) + xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ~((hi[0] << 8) | (lo[0] & 0xFF)) & 0xffff; +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_512(m512 a, m512 b) { + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m512 load512(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = {load256(ptr), load256((const char *)ptr + 32)}; + return rv; +} + +// aligned store +static really_inline void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +} + +// unaligned load +static really_inline m512 loadu512(const void *ptr) { + m512 rv = {loadu256(ptr), loadu256((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
+static really_inline char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +} +#pragma GCC diagnostic pop + +#endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 962cad6..62d39ec 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -30,28 +30,58 @@ #define SIMD_TYPES_H #include "config.h" +#include "ue2common.h" #include "util/arch.h" #include "util/intrinsics.h" -#include "ue2common.h" #if defined(HAVE_SSE2) +typedef __m128i m128; +#elif defined(HAVE_NEON) +#include "arm_neon.h" + +typedef union { + int8x16_t vect_s8; + int16x8_t vect_s16; + int32x4_t vect_s32; + int64x2_t vect_s64; + uint8x16_t vect_u8; + uint16x8_t vect_u16; + uint32x4_t vect_u32; + uint64x2_t vect_u64; +} __m128i; +typedef float32x4_t __m128; +typedef float64x2_t __m128d; + typedef __m128i m128; #else -typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; +typedef struct ALIGN_DIRECTIVE { + u64a hi; + u64a lo; +} m128; + #endif #if defined(HAVE_AVX2) typedef __m256i m256; #else -typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; +typedef struct ALIGN_AVX_DIRECTIVE { + m128 lo; + m128 hi; +} m256; #endif -typedef struct {m128 lo; m128 mid; m128 hi;} m384; +typedef struct { + m128 lo; + m128 mid; + m128 hi; +} m384; #if defined(HAVE_AVX512) typedef __m512i m512; #else -typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; +typedef struct ALIGN_ATTR(64) { + m256 lo; + m256 hi; +} m512; #endif #endif /* SIMD_TYPES_H */ - diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index d1f060b..7e926b2 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -26,1395 +26,14 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file - * \brief SIMD types and primitive operations. - */ - #ifndef SIMD_UTILS #define SIMD_UTILS -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif - -#include "config.h" -#include "ue2common.h" -#include "simd_types.h" -#include "unaligned.h" -#include "util/arch.h" -#include "util/intrinsics.h" - -#include // for memcpy - -// Define a common assume_aligned using an appropriate compiler built-in, if -// it's available. Note that we need to handle C or C++ compilation. -#ifdef __cplusplus -# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED -# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -# endif -#else -# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED -# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -# endif -#endif - -// Fallback to identity case. -#ifndef assume_aligned -#define assume_aligned(x, y) (x) -#endif - -#ifdef __cplusplus -extern "C" { -#endif -extern const char vbs_mask_data[]; -#ifdef __cplusplus -} -#endif - -static really_inline m128 ones128(void) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) - /* gcc gets this right */ - return _mm_set1_epi8(0xFF); -#else - /* trick from Intel's optimization guide to generate all-ones. 
- * ICC converts this to the single cmpeq instruction */ - return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -#endif -} - -static really_inline m128 zeroes128(void) { - return _mm_setzero_si128(); -} - -/** \brief Bitwise not for m128*/ -static really_inline m128 not128(m128 a) { - return _mm_xor_si128(a, ones128()); -} - -/** \brief Return 1 if a and b are different otherwise 0 */ -static really_inline int diff128(m128 a, m128 b) { - return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -} - -static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); -} - -/** - * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich128(m128 a, m128 b) { - a = _mm_cmpeq_epi32(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -} - -/** - * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and - * returns a 4-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_128(m128 a, m128 b) { -#if defined(HAVE_SSE41) - a = _mm_cmpeq_epi64(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -#else - u32 d = diffrich128(a, b); - return (d | (d >> 1)) & 0x5; -#endif -} - -static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm_sll_epi64(a, x); -} - -#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -#if defined(HAVE_AVX512) -static really_inline m128 cast512to128(const m512 in) { - return _mm512_castsi512_si128(in); -} -#endif - -static really_inline m128 set16x8(u8 c) { - return _mm_set1_epi8(c); -} - -static really_inline m128 set4x32(u32 c) { - return _mm_set1_epi32(c); -} - -static really_inline u32 movd(const m128 in) { - return _mm_cvtsi128_si32(in); -} - -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. - return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} - -static really_inline u64a movq512(const m512 in) { - // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), - // so we use 2-step convertions to work around. 
- return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); -} -#endif - -static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) - return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif -} - -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return _mm_set_epi64x(0LL, *p); -} - -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) - -#if defined(HAVE_SSE41) -#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -#else -#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -#endif - -#if !defined(HAVE_AVX2) -// TODO: this entire file needs restructuring - this carveout is awful -#define extractlow64from256(a) movq(a.lo) -#define extractlow32from256(a) movd(a.lo) -#if defined(HAVE_SSE41) -#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -#else -#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) -#endif - -#endif // !AVX2 - -static really_inline m128 and128(m128 a, m128 b) { - return _mm_and_si128(a,b); -} - -static really_inline m128 xor128(m128 a, m128 b) { - return _mm_xor_si128(a,b); -} - -static really_inline m128 or128(m128 a, m128 b) { - return _mm_or_si128(a,b); -} - -#if defined(HAVE_AVX512VBMI) -static really_inline m512 expand128(m128 a) { - return _mm512_broadcast_i32x4(a); -} - -static really_inline m512 expand256(m256 a) { - return _mm512_broadcast_i64x4(a); -} - -static really_inline m512 expand384(m384 a) { - u64a *lo = (u64a*)&a.lo; - u64a *mid = (u64a*)&a.mid; - u64a *hi = (u64a*)&a.hi; - return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], - lo[1], lo[0]); -} +#if defined(__x86_64__) +#include "simd_x86.h" +#elif defined(__aarch64__) +#include "simd_arm.h" #endif -static really_inline m128 andnot128(m128 a, m128 b) { - return _mm_andnot_si128(a, b); -} - -// aligned load -static really_inline m128 load128(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - return _mm_load_si128((const m128 *)ptr); -} - -// aligned store -static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - *(m128 *)ptr = a; -} - -// unaligned load -static really_inline m128 loadu128(const void *ptr) { - return _mm_loadu_si128((const m128 *)ptr); -} - -// unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - _mm_storeu_si128 ((m128 *)ptr, a); -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes128(void *ptr, m128 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m128 loadbytes128(const void *ptr, unsigned int n) { - m128 a = zeroes128(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} #endif -static really_inline -m128 
mask1bit128(unsigned int n) { - assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit128(m128 *ptr, unsigned int n) { - *ptr = or128(mask1bit128(n), *ptr); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit128(m128 *ptr, unsigned int n) { - *ptr = andnot128(mask1bit128(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit128(m128 val, unsigned int n) { - const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else - return isnonzero128(and128(mask, val)); -#endif -} - -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) - -static really_inline -m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; -} - -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - -static really_inline -m128 max_u8_m128(m128 a, m128 b) { - return _mm_max_epu8(a, b); -} - -static really_inline -m128 min_u8_m128(m128 a, m128 b) { - return _mm_min_epu8(a, b); -} - -static really_inline -m128 sadd_u8_m128(m128 a, m128 b) { - return _mm_adds_epu8(a, b); -} - -static really_inline -m128 sub_u8_m128(m128 a, m128 b) { - return _mm_sub_epi8(a, b); -} - -static really_inline -m128 set64x2(u64a hi, u64a lo) { - return _mm_set_epi64x(hi, lo); -} - -/**** - **** 256-bit Primitives - ****/ - -#if defined(HAVE_AVX2) - -static really_really_inline -m256 lshift64_m256(m256 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm256_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm256_sll_epi64(a, x); -} - -#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) - -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); -} - -#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) - -static really_inline -m256 set2x128(m128 a) { - return _mm256_broadcastsi128_si256(a); -} - -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - 
rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - -static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) - return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif -} - -static really_inline m256 ones256(void) { -#if defined(HAVE_AVX2) - m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif - return rv; -} - -#if defined(HAVE_AVX2) -static really_inline m256 and256(m256 a, m256 b) { - return _mm256_and_si256(a, b); -} -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 or256(m256 a, m256 b) { - return _mm256_or_si256(a, b); -} -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 xor256(m256 a, m256 b) { - return _mm256_xor_si256(a, b); -} -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 not256(m256 a) { - return _mm256_xor_si256(a, ones256()); -} -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 andnot256(m256 a, m256 b) { - return _mm256_andnot_si256(a, b); -} -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif - -static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif -} - -static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) - return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif -} - -/** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - a = _mm256_cmpeq_epi32(a, b); - return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif -} - -/** - * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and - * returns an 8-bit mask indicating which 64-bit words contain differences. 
- */ -static really_inline u32 diffrich64_256(m256 a, m256 b) { - u32 d = diffrich256(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m256 load256(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif -} - -// aligned load of 128-bit value to low and high part of 256-bit value -static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) - return set2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif -} - -static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); -} - -// aligned store -static really_inline void store256(void *ptr, m256 a) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif -} - -// unaligned load -static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) - return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif -} - -// unaligned store -static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) - _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes256(void *ptr, m256 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m256 loadbytes256(const void *ptr, unsigned int n) { - m256 a = zeroes256(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m256 mask1bit256(unsigned int n) { - assert(n < sizeof(m256) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu256(&simd_onebit_masks[mask_idx]); -} - -static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -#if defined(HAVE_AVX2) - return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); - return rv; -#endif -} - -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - -// switches on bit N in the given vector. 
-static really_inline -void setbit256(m256 *ptr, unsigned int n) { - *ptr = or256(mask1bit256(n), *ptr); -} - -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - *ptr = andnot256(mask1bit256(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - const m256 mask = mask1bit256(n); - return !_mm256_testz_si256(mask, val); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return _mm256_extracti128_si256(x, 1); -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return _mm256_extracti128_si256(x, 0); -} - -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { -#if defined(_mm256_set_m128i) - return _mm256_set_m128i(hi, lo); -#else - return insert128to256(cast128to256(lo), hi, 1); -#endif -} -#endif //AVX2 - -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.mid = _mm_cmpeq_epi32(a.mid, b.mid); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), - _mm_packs_epi32(a.hi, z)); - return ~(_mm_movemask_epi8(packed)) & 0xfff; -} - -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. 
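
For reference, the 12-bit result of the diffrich384() helper above has a simple scalar meaning; the SIMD code obtains it with three _mm_cmpeq_epi32 compares, two packs and a movemask. The following reference function (diffrich384_ref is a hypothetical name used only for illustration, not part of the patch) spells that meaning out:

#include <stdint.h>

/* Scalar reference for diffrich384(a, b): bit i of the result is set when
 * the i-th 32-bit word of the two 384-bit vectors differs (12 words total,
 * lo words in the low bits). */
static uint32_t diffrich384_ref(const uint32_t a[12], const uint32_t b[12]) {
    uint32_t out = 0;
    for (unsigned i = 0; i < 12; i++) {
        if (a[i] != b[i]) {
            out |= 1u << i;
        }
    }
    return out;
}
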
-static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} - -/**** - **** 512-bit Primitives - ****/ - -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) - -static really_inline -m512 zeroes512(void) { -#if defined(HAVE_AVX512) - return _mm512_setzero_si512(); -#else - m512 rv = {zeroes256(), zeroes256()}; - return rv; -#endif -} - -static really_inline -m512 ones512(void) { -#if defined(HAVE_AVX512) - return _mm512_set1_epi8(0xFF); - //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 set64x8(u8 a) { - return _mm512_set1_epi8(a); -} - -static really_inline -m512 set8x64(u64a a) { - return _mm512_set1_epi64(a); -} - -static really_inline -m512 set16x32(u32 a) { - return _mm512_set1_epi32(a); -} - -static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, - u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { - return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, - lo_3, lo_2, lo_1, lo_0); -} - -static really_inline -m512 swap256in512(m512 a) { - m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); - return vpermq512(idx, a); -} - -static really_inline -m512 set4x128(m128 a) { - return _mm512_broadcast_i32x4(a); -} - -static really_inline -m512 sadd_u8_m512(m512 a, m512 b) { - return _mm512_adds_epu8(a, b); -} - -static really_inline -m512 max_u8_m512(m512 a, m512 b) { - return _mm512_max_epu8(a, b); -} - -static really_inline -m512 min_u8_m512(m512 a, m512 b) { - return _mm512_min_epu8(a, b); -} - -static really_inline -m512 sub_u8_m512(m512 a, m512 b) { - return _mm512_sub_epi8(a, b); -} -#endif - -static really_inline -m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 not512(m512 a) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif -} - -static really_inline -m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm512_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm512_sll_epi64(a, x); -} -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif - -#if defined(HAVE_AVX512) 
-#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - -#if !defined(_MM_CMPINT_NE) -#define _MM_CMPINT_NE 0x4 -#endif - -static really_inline -int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -#endif -} - -static really_inline -int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) - return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif -} - -/** - * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline -u32 diffrich512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif -} - -/** - * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and - * returns a 16-bit mask indicating which 64-bit words contain differences. - */ -static really_inline -u32 diffrich64_512(m512 a, m512 b) { - //TODO: cmp_epi64? 
- u32 d = diffrich512(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline -m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif -} - -// aligned store -static really_inline -void store512(void *ptr, m512 a) { - assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) - return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif -} - -// unaligned load -static really_inline -m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif -} - -// unaligned store -static really_inline -void storeu512(void *ptr, m512 a) { -#if defined(HAVE_AVX512) - _mm512_storeu_si512((m512 *)ptr, a); -#elif defined(HAVE_AVX2) - storeu256(ptr, a.lo); - storeu256((char *)ptr + 32, a.hi); -#else - storeu128(ptr, a.lo.lo); - storeu128((char *)ptr + 16, a.lo.hi); - storeu128((char *)ptr + 32, a.hi.lo); - storeu128((char *)ptr + 48, a.hi.hi); -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { - return _mm512_maskz_loadu_epi8(k, ptr); -} - -static really_inline -m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { - return _mm512_mask_loadu_epi8(src, k, ptr); -} - -static really_inline -void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { - _mm512_mask_storeu_epi8(ptr, k, a); -} - -static really_inline -m512 set_mask_m512(__mmask64 k) { - return _mm512_movm_epi8(k); -} - -static really_inline -m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { - return _mm256_maskz_loadu_epi8(k, ptr); -} -#endif - -// packed unaligned store of first N bytes -static really_inline -void storebytes512(void *ptr, m512 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m512 loadbytes512(const void *ptr, unsigned int n) { - m512 a = zeroes512(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m512 mask1bit512(unsigned int n) { - assert(n < sizeof(m512) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu512(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif -} - -// switches off bit N in the given vector. 
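
The storebytes*/loadbytes* pairs above use plain memcpy for partial-vector moves, so the same pattern works unchanged at every vector width. A minimal sketch of that pattern, shown here for a 16-byte case with hypothetical names and a plain byte buffer in place of an m128:

#include <assert.h>
#include <string.h>

/* Packed unaligned load of the first n bytes, zero padding the rest. */
static void loadbytes16_ref(unsigned char out[16], const void *ptr, unsigned int n) {
    assert(n <= 16);
    memset(out, 0, 16);   /* pad with zero */
    memcpy(out, ptr, n);  /* copy only the valid prefix */
}

/* Packed unaligned store of the first n bytes. */
static void storebytes16_ref(void *ptr, const unsigned char in[16], unsigned int n) {
    assert(n <= 16);
    memcpy(ptr, in, n);
}
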
-static really_inline -void clearbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif -} - -// tests bit N in the given vector. -static really_inline -char testbit512(m512 val, unsigned int n) { - assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - const m512 mask = mask1bit512(n); - return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif -} - -#endif diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h new file mode 100644 index 0000000..59ac642 --- /dev/null +++ b/src/util/simd_x86.h @@ -0,0 +1,1334 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef SIMD_X86 +#define SIMD_X86 + +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + +#include "config.h" +#include "ue2common.h" +#include "simd_types.h" +#include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include // for memcpy + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. 
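
As context for the macro defined next: __builtin_assume_aligned() (GCC 4.7+/Clang) returns a copy of the pointer that the compiler may treat as aligned, and the fallback case is the identity. A stand-alone sketch of how callers such as load128() or store384() benefit from the hint; my_assume_aligned and sum4 are hypothetical names used only for illustration:

#include <stdint.h>

/* Hypothetical stand-in for the assume_aligned macro defined below: use the
 * GCC/Clang built-in when available, otherwise leave the pointer untouched. */
#if defined(__GNUC__) || defined(__clang__)
#define my_assume_aligned(x, y) __builtin_assume_aligned((x), (y))
#else
#define my_assume_aligned(x, y) (x)
#endif

/* Example caller: the alignment hint lets the compiler emit aligned loads. */
static uint32_t sum4(const void *ptr) {
    const uint32_t *p = (const uint32_t *)my_assume_aligned(ptr, 16);
    return p[0] + p[1] + p[2] + p[3];
}
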
+#ifdef __cplusplus +# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#else +# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#endif + +// Fallback to identity case. +#ifndef assume_aligned +#define assume_aligned(x, y) (x) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. + * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline m128 set2x64(u64a c) { + return _mm_set1_epi32(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 
3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. 
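
The mask1bit128()/setbit128()/clearbit128() trio above treats the vector as a 128-bit little-endian bitfield; the lookup-table load from simd_onebit_masks simply avoids a variable shift. A scalar reference of what mask1bit128(n) is expected to produce (mask1bit128_ref is a hypothetical illustration, not part of the patch):

#include <assert.h>
#include <string.h>

/* Reference semantics for mask1bit128(n): a 16-byte value in which only bit
 * n is set, i.e. byte n/8 holds (1 << (n % 8)) and every other byte is 0.
 * The real helper obtains the same value with one unaligned load from a
 * sliding window inside the precomputed simd_onebit_masks table. */
static void mask1bit128_ref(unsigned int n, unsigned char out[16]) {
    assert(n < 128);
    memset(out, 0, 16);
    out[n / 8] = (unsigned char)(1u << (n % 8));
}
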
+static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} +#endif + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +static really_inline +m128 set32x4(int i3, int i2, int i1, int i0) { + return _mm_set_epi32(i3, i2, i1, i0); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline +m256 set32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +#else + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline +m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8) in); + rv.hi = rv.lo; + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} +#endif + +static really_inline m256 zeroes256(void) { +#if defined(HAVE_AVX2) + return _mm256_setzero_si256(); +#else + m256 rv = {zeroes128(), zeroes128()}; + return rv; +#endif +} + +static really_inline m256 ones256(void) { +#if defined(HAVE_AVX2) + m256 rv = _mm256_set1_epi8(0xFF); +#else + 
m256 rv = {ones128(), ones128()}; +#endif + return rv; +} + +#if defined(HAVE_AVX2) +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} +#else +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} +#else +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} +#else +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} +#else +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} +#else +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} +#endif + +static really_inline int diff256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +#else + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +#endif +} + +static really_inline int isnonzero256(m256 a) { +#if defined(HAVE_AVX2) + return !!diff256(a, zeroes256()); +#else + return isnonzero128(or128(a.lo, a.hi)); +#endif +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +#else + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); + return ~(_mm_movemask_epi8(packed)) & 0xff; +#endif +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + return _mm256_load_si256((const m256 *)ptr); +#else + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +#endif +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { +#if defined(HAVE_AVX2) + return set2x128(load128(ptr)); +#else + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +#endif +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + _mm256_store_si256((m256 *)ptr, a); +#else + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +#endif +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { +#if defined(HAVE_AVX2) + return _mm256_loadu_si256((const m256 *)ptr); +#else + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +#endif +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(HAVE_AVX2) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + +#if !defined(HAVE_AVX2) +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +#else // AVX2 + +// switches on bit N in the given vector. 
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.mid = _mm_cmpeq_epi32(a.mid, b.mid); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), + _mm_packs_epi32(a.hi, z)); + return ~(_mm_movemask_epi8(packed)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
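
The 384-bit bit operations above first route the bit index into one of the three 128-bit lanes (lo, mid, hi) and then defer to the 128-bit helpers. A scalar analogue of that routing, using a hypothetical helper that works on a plain 48-byte buffer instead of an m384:

#include <assert.h>

/* Scalar analogue of setbit384(): choose the 128-bit lane that holds bit n
 * (lane 0 = lo, 1 = mid, 2 = hi), then set bit n % 128 within that lane. */
static void setbit384_ref(unsigned char vec[48], unsigned int n) {
    assert(n < 384);
    unsigned char *lane = vec + (n / 128) * 16;
    unsigned int bit = n % 128;
    lane[bit / 8] |= (unsigned char)(1u << (bit % 8));
}
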
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +#endif +} + +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} +#endif + +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else + return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); +#endif +} + +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +#endif +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +#else + a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); + a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); + a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); + a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), + _mm_packs_epi32(a.hi.lo, a.hi.hi)); + return ~(_mm_movemask_epi8(packed)) & 0xffff; +#endif +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +#endif +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +#else + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +#endif +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} +#endif + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
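
diffrich64_512() above derives its per-64-bit-word mask from the 32-bit-word mask: each 64-bit word covers two adjacent bits of diffrich512(), so OR-ing each bit pair and keeping the even positions gives the answer. A scalar restatement of that fold (fold_diffrich32_to_64 is a hypothetical name for illustration):

#include <stdint.h>

/* Fold a per-32-bit-word difference mask into a per-64-bit-word mask.
 * Example: d = 0x000C (32-bit words 2 and 3 differ) folds to 0x0004,
 * i.e. the even-position bit for 64-bit word 1. */
static uint32_t fold_diffrich32_to_64(uint32_t d) {
    return (d | (d >> 1)) & 0x55555555;
}
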
+static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +#endif +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +#endif +} + +// tests bit N in the given vector. +static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); +#if !defined(HAVE_AVX2) + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +#else + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +#endif +} + +#endif diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 7238849..4422403 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return _mm_set_epi32(x[3], x[2], x[1], x[0]); + return set32x4(x[3], x[2], x[1], x[0]); } #endif @@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. 
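
The loadcompressed*() hunks in this area rebuild full vectors from compressed state with expand32()/expand64() and then repack the words with the new set32x4()/set64x2() wrappers. Assuming expand64() has the usual bit-deposit (PDEP-style) semantics of the bit utilities used elsewhere in this code base, a portable reference looks like the following; expand64_ref is an illustrative name, not the function the patch calls:

#include <stdint.h>

/* PDEP-style expand: deposit the low bits of x, in order, into the
 * positions of the set bits of m. Illustration only, written under the
 * assumption that this matches the expand64() used above. */
static uint64_t expand64_ref(uint64_t x, uint64_t m) {
    uint64_t out = 0;
    for (uint64_t bb = 1; m != 0; bb += bb) {
        if (x & bb) {
            out |= m & (~m + 1); /* lowest set bit of m */
        }
        m &= m - 1;              /* clear lowest set bit */
    }
    return out;
}
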
- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u64a v[2]; @@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - return _mm_set_epi64x(x[1], x[0]); + return set64x2(x[1], x[0]); } #endif @@ -264,8 +264,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .hi = set32x4(x[7], x[6], x[5], x[4]) }; #else m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); @@ -291,8 +291,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .hi = _mm_set_epi64x(x[3], x[2]) }; + m256 xvec = { .lo = set64x2(x[1], x[0]), + .hi = set64x2(x[3], x[2]) }; #else m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); #endif @@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), - .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .mid = set32x4(x[7], x[6], x[5], x[4]), + .hi = set32x4(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]) }; - m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .mid = _mm_set_epi64x(x[3], x[2]), - .hi = _mm_set_epi64x(x[5], x[4]) }; + m384 xvec = { .lo = set64x2(x[1], x[0]), + .mid = set64x2(x[3], x[2]), + .hi = set64x2(x[5], x[4]) }; return xvec; } #endif @@ -558,10 +558,10 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); - xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); - xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); - xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); #endif return xvec; } @@ -594,10 +594,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; #else - m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), - _mm_set_epi64x(x[3], x[2]) }, - .hi = { _mm_set_epi64x(x[5], x[4]), - _mm_set_epi64x(x[7], x[6]) } }; + m512 xvec = { .lo = { set64x2(x[1], x[0]), + set64x2(x[3], x[2]) }, + .hi = { set64x2(x[5], x[4]), + set64x2(x[7], x[6]) } }; #endif return xvec; } diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index a4d71b2..0c41ab9 100644 --- a/tools/hscollider/CMakeLists.txt +++ b/tools/hscollider/CMakeLists.txt @@ -21,7 +21,14 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS} 
-I${CMAKE_CURRENT_SOURCE_DIR}") -ragelmaker(ColliderCorporaParser.rl) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + ragelmaker(ColliderCorporaParser.rl) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + ragelcopyer(ColliderCorporaParser.rl) +endif() if (BUILD_CHIMERA) add_definitions(-DHS_HYBRID) diff --git a/tools/hscollider/ColliderCorporaParser.cpp b/tools/hscollider/ColliderCorporaParser.cpp new file mode 100644 index 0000000..5391473 --- /dev/null +++ b/tools/hscollider/ColliderCorporaParser.cpp @@ -0,0 +1,474 @@ + + +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ColliderCorporaParser.h" +#include "Corpora.h" + +#include "ue2common.h" + +#include +#include +#include +#include + +using namespace std; + +namespace /* anonymous */ { + +// Take a string like '\xFF' and convert it to the character it represents +char unhex(const char *start, UNUSED const char *end) { + assert(start + 4 == end); + assert(start[0] == '\\'); + assert(start[1] == 'x'); + assert(isxdigit(start[2])); + assert(isxdigit(start[2])); + + char temp[3] = {start[2], start[3], 0}; + + return strtol(temp, nullptr, 16); +} + +static const char _FileCorporaParser_actions[] = { + 0, 1, 0, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, + 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, + 20, 1, 21, 1, 22, 1, 23, 1, 24, 2, 0, 2, 2, 3, 0, 3, 1, 0, 2}; + +static const char _FileCorporaParser_key_offsets[] = { + 0, 0, 2, 6, 7, 13, 19, 25, 31, 34, 34, 35, 52, 54, 71, 72, 75, 79}; + +static const char _FileCorporaParser_trans_keys[] = { + 48, 57, 58, 61, 48, 57, 34, 48, 57, 65, 70, 97, 102, 48, + 57, 65, 70, 97, 102, 48, 57, 65, 70, 97, 102, 48, 57, 65, + 70, 97, 102, 32, 48, 57, 92, 48, 97, 110, 114, 116, 118, 120, + 49, 57, 65, 90, 98, 100, 101, 102, 103, 122, 34, 92, 48, 97, + 110, 114, 116, 118, 120, 49, 57, 65, 90, 98, 100, 101, 102, 103, + 122, 58, 32, 48, 57, 32, 44, 48, 57, 32, 44, 0}; + +static const char _FileCorporaParser_single_lengths[] = { + 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 7, 2, 7, 1, 1, 2, 2}; + +static const char _FileCorporaParser_range_lengths[] = { + 0, 1, 1, 0, 3, 3, 3, 3, 1, 0, 0, 5, 0, 5, 0, 1, 1, 0}; + +static const char _FileCorporaParser_index_offsets[] = { + 0, 0, 2, 6, 8, 12, 16, 20, 24, 27, 28, 30, 43, 46, 59, 61, 64, 68}; + +static const char _FileCorporaParser_indicies[] = { + 0, 1, 3, 4, 2, 1, 5, 1, 7, 7, 7, 6, 8, 8, 8, 6, 10, 10, + 10, 9, 11, 11, 11, 9, 12, 13, 1, 1, 15, 14, 18, 18, 18, 18, 18, 18, + 19, 16, 16, 16, 18, 16, 17, 21, 22, 20, 25, 25, 25, 25, 25, 25, 26, 23, + 23, 23, 25, 23, 24, 27, 1, 28, 29, 1, 31, 32, 13, 30, 31, 32, 30, 0}; + +static const char _FileCorporaParser_trans_targs[] = { + 2, 0, 2, 9, 3, 9, 10, 5, 10, 12, 7, 12, 8, 16, 10, 11, 10, + 10, 10, 4, 12, 12, 13, 12, 12, 12, 6, 14, 8, 16, 15, 17, 15}; + +static const char _FileCorporaParser_trans_actions[] = { + 53, 0, 47, 5, 0, 7, 25, 0, 15, 39, 0, 27, 0, 1, 21, 13, 23, + 19, 17, 0, 33, 35, 13, 37, 31, 29, 0, 41, 3, 50, 45, 0, 43}; + +static const char _FileCorporaParser_to_state_actions[] = { + 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 9, 0, 9, 9, 0, 0}; + +static const char _FileCorporaParser_from_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 11, 0, 11, 11, 0, 0}; + +static const char _FileCorporaParser_eof_trans[] = { + 0, 0, 0, 0, 7, 7, 10, 10, 0, 0, 0, 17, 0, 24, 0, 0, 31, 31}; + +static const int FileCorporaParser_start = 1; +static const int FileCorporaParser_first_final = 9; +static const int FileCorporaParser_error = 0; + +static const int FileCorporaParser_en_corpus_old = 10; +static const int FileCorporaParser_en_corpus_new = 12; +static const int FileCorporaParser_en_colon_sep = 14; +static const int FileCorporaParser_en_match_list = 15; +static const int FileCorporaParser_en_main = 1; + +} // namespace + +bool parseCorpus(const string &line, Corpus &c, unsigned int &id) { + const char *p = line.c_str(); + const char *pe = p + line.size(); + const char *eof = pe; + const char *ts; + const char *te; + int cs; + UNUSED int act; + + // For storing integers as they're scanned + unsigned int num = 0; + + string &sout = 
c.data; + + { + cs = FileCorporaParser_start; + ts = 0; + te = 0; + act = 0; + } + + { + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if (p == pe) + goto _test_eof; + if (cs == 0) + goto _out; + _resume: + _acts = _FileCorporaParser_actions + + _FileCorporaParser_from_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 7: + + { + ts = p; + } break; + } + } + + _keys = + _FileCorporaParser_trans_keys + _FileCorporaParser_key_offsets[cs]; + _trans = _FileCorporaParser_index_offsets[cs]; + + _klen = _FileCorporaParser_single_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + ((_upper - _lower) >> 1); + if ((*p) < *_mid) + _upper = _mid - 1; + else if ((*p) > *_mid) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += _klen; + } + + _klen = _FileCorporaParser_range_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen << 1) - 2; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + (((_upper - _lower) >> 1) & ~1); + if ((*p) < _mid[0]) + _upper = _mid - 2; + else if ((*p) > _mid[1]) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys) >> 1); + goto _match; + } + } + _trans += _klen; + } + + _match: + _trans = _FileCorporaParser_indicies[_trans]; + _eof_trans: + cs = _FileCorporaParser_trans_targs[_trans]; + + if (_FileCorporaParser_trans_actions[_trans] == 0) + goto _again; + + _acts = _FileCorporaParser_actions + + _FileCorporaParser_trans_actions[_trans]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 0: + + { + num = (num * 10) + ((*p) - '0'); + } break; + case 1: + + { + num = 0; + } break; + case 2: + + { + id = num; + } break; + case 3: + + { + num = 0; + } break; + case 4: + + { + { + cs = 10; + goto _again; + } + } break; + case 5: + + { + c.hasMatches = true; + { + cs = 12; + goto _again; + } + } break; + case 8: + + { + te = p + 1; + } break; + case 9: + + { + te = p + 1; + { sout.push_back(unhex(ts, te)); } + } break; + case 10: + + { + te = p + 1; + { + switch (*(ts + 1)) { + case '0': + sout.push_back('\x00'); + break; + case 'a': + sout.push_back('\x07'); + break; + case 'e': + sout.push_back('\x1b'); + break; + case 'f': + sout.push_back('\x0c'); + break; + case 'n': + sout.push_back('\x0a'); + break; + case 'v': + sout.push_back('\x0b'); + break; + case 'r': + sout.push_back('\x0d'); + break; + case 't': + sout.push_back('\x09'); + break; + default: { + p++; + goto _out; + } + } + } + } break; + case 11: + + { + te = p + 1; + { sout.push_back(*(ts + 1)); } + } break; + case 12: + + { + te = p + 1; + { sout.push_back(*ts); } + } break; + case 13: + + { + te = p; + p--; + { sout.push_back(*ts); } + } break; + case 14: + + { + { p = ((te)) - 1; } + { sout.push_back(*ts); } + } break; + case 15: + + { + te = p + 1; + { sout.push_back(unhex(ts, te)); } + } break; + case 16: + + { + te = p + 1; + { + switch (*(ts + 1)) { + case '0': + sout.push_back('\x00'); + break; + case 'a': + sout.push_back('\x07'); + break; + case 'e': + sout.push_back('\x1b'); + break; + case 'f': + sout.push_back('\x0c'); + break; + case 'n': + sout.push_back('\x0a'); + break; + case 'v': + sout.push_back('\x0b'); + break; + case 'r': + 
sout.push_back('\x0d'); + break; + case 't': + sout.push_back('\x09'); + break; + default: { + p++; + goto _out; + } + } + } + } break; + case 17: + + { + te = p + 1; + { sout.push_back(*(ts + 1)); } + } break; + case 18: + + { + te = p + 1; + { sout.push_back(*ts); } + } break; + case 19: + + { + te = p + 1; + { + { + cs = 14; + goto _again; + } + } + } break; + case 20: + + { + te = p; + p--; + { sout.push_back(*ts); } + } break; + case 21: + + { + { p = ((te)) - 1; } + { sout.push_back(*ts); } + } break; + case 22: + + { + te = p + 1; + { + { + cs = 15; + goto _again; + } + } + } break; + case 23: + + { + te = p + 1; + { c.matches.insert(num); } + } break; + case 24: + + { + te = p; + p--; + { c.matches.insert(num); } + } break; + } + } + + _again: + _acts = _FileCorporaParser_actions + + _FileCorporaParser_to_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 6: + + { + ts = 0; + } break; + } + } + + if (cs == 0) + goto _out; + if (++p != pe) + goto _resume; + _test_eof : {} + if (p == eof) { + if (_FileCorporaParser_eof_trans[cs] > 0) { + _trans = _FileCorporaParser_eof_trans[cs] - 1; + goto _eof_trans; + } + } + + _out : {} + } + + return (cs != FileCorporaParser_error) && (p == pe); +} diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 623c2c9..d6d52a2 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -40,7 +40,7 @@ using namespace ue2; namespace { // Switch one bit on in a bitmask. -template +template Mask setbit(unsigned int bit) { union { Mask simd; @@ -148,7 +148,7 @@ m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } -template +template class SimdUtilsTest : public testing::Test { // empty }; @@ -260,9 +260,9 @@ TYPED_TEST(SimdUtilsTest, or2) { for (unsigned j = 0; j < 8; j++) { for (unsigned i = 0; i < 32; i++) { - m256 x = setbit(j*32+i); + m256 x = setbit(j * 32 + i); m256 y = zeroes256(); - ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j*32+i << " not happy"; + ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j * 32 + i << " not happy"; } } @@ -431,8 +431,8 @@ TYPED_TEST(SimdUtilsTest, testbit) { for (unsigned int i = 0; i < total_bits; i++) { TypeParam a = setbit(i); for (unsigned int j = 0; j < total_bits; j++) { - ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) << "bit " << i - << " is wrong"; + ASSERT_EQ(i == j ? 
1 : 0, simd_testbit(a, j)) + << "bit " << i << " is wrong"; } } } @@ -455,7 +455,6 @@ TYPED_TEST(SimdUtilsTest, setbit) { simd_setbit(&a, i); } ASSERT_FALSE(simd_diff(simd_ones(), a)); - } TYPED_TEST(SimdUtilsTest, diffrich) { @@ -663,12 +662,11 @@ TEST(SimdUtilsTest, movq) { ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); - simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); + simd = set64x2(~0LL, 0x123456789abcdef); r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } - TEST(SimdUtilsTest, set16x8) { char cmp[sizeof(m128)]; @@ -680,7 +678,7 @@ TEST(SimdUtilsTest, set16x8) { } TEST(SimdUtilsTest, set4x32) { - u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; + u32 cmp[4] = {0x12345678, 0x12345678, 0x12345678, 0x12345678}; m128 simd = set4x32(cmp[0]); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } @@ -714,51 +712,51 @@ TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), - variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), - variable_byte_shift_m128(in, -1))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), - variable_byte_shift_m128(in, -2))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), - variable_byte_shift_m128(in, -3))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), - variable_byte_shift_m128(in, -4))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), - variable_byte_shift_m128(in, -5))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), - variable_byte_shift_m128(in, -6))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), - variable_byte_shift_m128(in, -7))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), - variable_byte_shift_m128(in, -8))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), - variable_byte_shift_m128(in, -9))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), - variable_byte_shift_m128(in, -10))); - - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), - variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), - variable_byte_shift_m128(in, 1))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), - variable_byte_shift_m128(in, 2))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), - variable_byte_shift_m128(in, 3))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), - variable_byte_shift_m128(in, 4))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), - variable_byte_shift_m128(in, 5))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), - variable_byte_shift_m128(in, 6))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), - variable_byte_shift_m128(in, 7))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), - variable_byte_shift_m128(in, 8))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), - variable_byte_shift_m128(in, 9))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), - variable_byte_shift_m128(in, 10))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); + EXPECT_TRUE( 
+ !diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); + + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); @@ -785,12 +783,12 @@ TEST(SimdUtilsTest, min_u8_m128) { } TEST(SimdUtilsTest, sadd_u8_m128) { - unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', - '1', '2', '3', '4', '1', '2', '3', '4'}; - unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, - 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; unsigned char expec[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', - 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; m128 in1 = loadu128(base1); m128 in2 = loadu128(base2); m128 result = sadd_u8_m128(in1, in2); @@ -799,11 +797,11 @@ TEST(SimdUtilsTest, sadd_u8_m128) { TEST(SimdUtilsTest, sub_u8_m128) { unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', - 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; - unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', - '1', '2', '3', '4', '1', '2', '3', '4'}; - unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, - 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; m128 in1 = loadu128(base1); m128 in2 = loadu128(base2); m128 result = sub_u8_m128(in1, in2); diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index ea942ef..d7bef50 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -11,7 +11,13 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") -ragelmaker(ExpressionParser.rl) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + ragelmaker(ExpressionParser.rl) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + ragelcopyer(ExpressionParser.rl) +endif() set(expressionutil_SRCS expressions.cpp diff --git a/util/ExpressionParser.cpp b/util/ExpressionParser.cpp new file mode 100644 index 0000000..687fc39 --- /dev/null +++ b/util/ExpressionParser.cpp @@ -0,0 
+1,397 @@ + + +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ExpressionParser.h" + +#include +#include +#include +#include +#include + +#include "hs_compile.h" +#include "ue2common.h" + +using std::string; + +namespace { // anon + +enum ParamKey { + PARAM_NONE, + PARAM_MIN_OFFSET, + PARAM_MAX_OFFSET, + PARAM_MIN_LENGTH, + PARAM_EDIT_DISTANCE, + PARAM_HAMM_DISTANCE +}; + +static const char _ExpressionParser_actions[] = {0, 1, 0, 1, 1, 1, 2, 1, 3, + 1, 4, 1, 5, 1, 6, 1, 7, 1, + 9, 1, 10, 2, 8, 0 + +}; + +static const char _ExpressionParser_key_offsets[] = { + 0, 0, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 23, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 82}; + +static const char _ExpressionParser_trans_keys[] = { + 32, 101, 104, 109, 32, 101, 104, 109, 100, 105, 116, 95, 100, 105, + 115, 116, 97, 110, 99, 101, 61, 48, 57, 32, 44, 125, 48, 57, + 32, 44, 125, 97, 109, 109, 105, 110, 103, 95, 100, 105, 115, 116, + 97, 110, 99, 101, 97, 105, 120, 95, 111, 102, 102, 115, 101, 116, + 110, 95, 108, 111, 101, 110, 103, 116, 104, 102, 102, 115, 101, 116, + 56, 67, 72, 76, 105, 109, 115, 123, 79, 81, 86, 87, 0}; + +static const char _ExpressionParser_single_lengths[] = { + 0, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 3, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 0}; + +static const char _ExpressionParser_range_lengths[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0}; + +static const unsigned char _ExpressionParser_index_offsets[] = { + 0, 0, 5, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 43, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, + 69, 71, 73, 75, 77, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, + 100, 103, 
105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 134}; + +static const char _ExpressionParser_trans_targs[] = { + 2, 3, 19, 34, 0, 2, 3, 19, 34, 0, 4, 0, 5, 0, 6, 0, 7, + 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 1, 57, 17, 0, 18, 1, 57, 0, 20, 0, 21, 0, + 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, + 0, 31, 0, 32, 0, 33, 0, 15, 0, 35, 43, 0, 36, 0, 37, 0, 38, + 0, 39, 0, 40, 0, 41, 0, 42, 0, 15, 0, 44, 0, 45, 0, 46, 51, + 0, 47, 0, 48, 0, 49, 0, 50, 0, 15, 0, 52, 0, 53, 0, 54, 0, + 55, 0, 15, 0, 56, 56, 56, 56, 56, 56, 56, 1, 56, 56, 0, 0, 0}; + +static const char _ExpressionParser_trans_actions[] = { + 17, 17, 17, 17, 19, 0, 0, 0, 0, 19, 0, 19, 0, 19, 0, 19, 0, + 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 13, 19, + 0, 19, 21, 19, 0, 5, 5, 1, 19, 0, 5, 5, 19, 0, 19, 0, 19, + 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, + 19, 0, 19, 0, 19, 0, 19, 15, 19, 0, 0, 19, 0, 19, 0, 19, 0, + 19, 0, 19, 0, 19, 0, 19, 0, 19, 9, 19, 0, 19, 0, 19, 0, 0, + 19, 0, 19, 0, 19, 0, 19, 0, 19, 11, 19, 0, 19, 0, 19, 0, 19, + 0, 19, 7, 19, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 19, 19, 0}; + +static const char _ExpressionParser_eof_actions[] = { + 0, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0}; + +static const int ExpressionParser_start = 56; +static const int ExpressionParser_first_final = 56; +static const int ExpressionParser_error = 0; + +static const int ExpressionParser_en_main = 56; + +} // namespace + +static void initExt(hs_expr_ext *ext) { + memset(ext, 0, sizeof(*ext)); + ext->max_offset = MAX_OFFSET; +} + +bool HS_CDECL readExpression(const std::string &input, std::string &expr, + unsigned int *flags, hs_expr_ext *ext, + bool *must_be_ordered) { + assert(flags); + assert(ext); + + // Init flags and ext params. + *flags = 0; + initExt(ext); + if (must_be_ordered) { + *must_be_ordered = false; + } + + // Extract expr, which is easier to do in straight C++ than with Ragel. + if (input.empty() || input[0] != '/') { + return false; + } + size_t end = input.find_last_of('/'); + if (end == string::npos || end == 0) { + return false; + } + expr = input.substr(1, end - 1); + + // Use a Ragel scanner to handle flags and params. + const char *p = input.c_str() + end + 1; + const char *pe = input.c_str() + input.size(); + UNUSED const char *eof = pe; + UNUSED const char *ts = p, *te = p; + int cs; + UNUSED int act; + + assert(p); + assert(pe); + + // For storing integers as they're scanned. 
+ u64a num = 0; + enum ParamKey key = PARAM_NONE; + + { cs = ExpressionParser_start; } + + { + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if (p == pe) + goto _test_eof; + if (cs == 0) + goto _out; + _resume: + _keys = + _ExpressionParser_trans_keys + _ExpressionParser_key_offsets[cs]; + _trans = _ExpressionParser_index_offsets[cs]; + + _klen = _ExpressionParser_single_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + ((_upper - _lower) >> 1); + if ((*p) < *_mid) + _upper = _mid - 1; + else if ((*p) > *_mid) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += _klen; + } + + _klen = _ExpressionParser_range_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen << 1) - 2; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + (((_upper - _lower) >> 1) & ~1); + if ((*p) < _mid[0]) + _upper = _mid - 2; + else if ((*p) > _mid[1]) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys) >> 1); + goto _match; + } + } + _trans += _klen; + } + + _match: + cs = _ExpressionParser_trans_targs[_trans]; + + if (_ExpressionParser_trans_actions[_trans] == 0) + goto _again; + + _acts = + _ExpressionParser_actions + _ExpressionParser_trans_actions[_trans]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 0: + + { + num = (num * 10) + ((*p) - '0'); + } break; + case 1: + + { + switch ((*p)) { + case 'i': + *flags |= HS_FLAG_CASELESS; + break; + case 's': + *flags |= HS_FLAG_DOTALL; + break; + case 'm': + *flags |= HS_FLAG_MULTILINE; + break; + case 'H': + *flags |= HS_FLAG_SINGLEMATCH; + break; + case 'O': + if (must_be_ordered) { + *must_be_ordered = true; + } + break; + case 'V': + *flags |= HS_FLAG_ALLOWEMPTY; + break; + case 'W': + *flags |= HS_FLAG_UCP; + break; + case '8': + *flags |= HS_FLAG_UTF8; + break; + case 'P': + *flags |= HS_FLAG_PREFILTER; + break; + case 'L': + *flags |= HS_FLAG_SOM_LEFTMOST; + break; + case 'C': + *flags |= HS_FLAG_COMBINATION; + break; + case 'Q': + *flags |= HS_FLAG_QUIET; + break; + default: { + p++; + goto _out; + } + } + } break; + case 2: + + { + switch (key) { + case PARAM_MIN_OFFSET: + ext->flags |= HS_EXT_FLAG_MIN_OFFSET; + ext->min_offset = num; + break; + case PARAM_MAX_OFFSET: + ext->flags |= HS_EXT_FLAG_MAX_OFFSET; + ext->max_offset = num; + break; + case PARAM_MIN_LENGTH: + ext->flags |= HS_EXT_FLAG_MIN_LENGTH; + ext->min_length = num; + break; + case PARAM_EDIT_DISTANCE: + ext->flags |= HS_EXT_FLAG_EDIT_DISTANCE; + ext->edit_distance = num; + break; + case PARAM_HAMM_DISTANCE: + ext->flags |= HS_EXT_FLAG_HAMMING_DISTANCE; + ext->hamming_distance = num; + break; + case PARAM_NONE: + default: + // No key specified, syntax invalid. 
+ return false; + } + } break; + case 3: + + { + key = PARAM_MIN_OFFSET; + } break; + case 4: + + { + key = PARAM_MAX_OFFSET; + } break; + case 5: + + { + key = PARAM_MIN_LENGTH; + } break; + case 6: + + { + key = PARAM_EDIT_DISTANCE; + } break; + case 7: + + { + key = PARAM_HAMM_DISTANCE; + } break; + case 8: + + { + num = 0; + } break; + case 9: + + { + key = PARAM_NONE; + } break; + case 10: + + { + return false; + } break; + } + } + + _again: + if (cs == 0) + goto _out; + if (++p != pe) + goto _resume; + _test_eof : {} + if (p == eof) { + const char *__acts = + _ExpressionParser_actions + _ExpressionParser_eof_actions[cs]; + unsigned int __nacts = (unsigned int)*__acts++; + while (__nacts-- > 0) { + switch (*__acts++) { + case 10: + + { + return false; + } break; + } + } + } + + _out : {} + } + + DEBUG_PRINTF("expr='%s', flags=%u\n", expr.c_str(), *flags); + + return (cs != ExpressionParser_error) && (p == pe); +} -- 2.39.0
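For readers skimming the state_compress.c hunks above: the change there is a mechanical substitution of direct SSE intrinsics (_mm_set_epi32, _mm_set_epi64x, _mm_srli_si128) with arch-neutral helpers (set32x4, set64x2, rshiftbyte_m128) so the same file can build on both x86_64 and aarch64. The snippet below is a minimal, self-contained sketch of how such 128-bit "set" wrappers could be written for the two targets; the vec128 typedef, the exact signatures, and the NEON array-load approach are illustrative assumptions for this example, not the wrapper definitions the patch itself provides.

/* Illustrative sketch only: arch-neutral 128-bit "set" wrappers in the spirit
 * of set32x4()/set64x2(). The patch's real wrappers may differ; vec128 is a
 * placeholder type used just for this example. */
#if defined(__x86_64__)
#include <emmintrin.h>

typedef __m128i vec128;

static inline vec128 set32x4(int e3, int e2, int e1, int e0) {
    /* SSE takes the highest lane first, matching the old _mm_set_epi32 calls */
    return _mm_set_epi32(e3, e2, e1, e0);
}

static inline vec128 set64x2(long long e1, long long e0) {
    return _mm_set_epi64x(e1, e0);
}

#elif defined(__aarch64__)
#include <stdint.h>
#include <arm_neon.h>

typedef int32x4_t vec128;

static inline vec128 set32x4(int e3, int e2, int e1, int e0) {
    /* NEON loads lane 0 from the lowest address, so store e0 first */
    int32_t tmp[4] = { e0, e1, e2, e3 };
    return vld1q_s32(tmp);
}

static inline vec128 set64x2(long long e1, long long e0) {
    int64_t tmp[2] = { e0, e1 };
    return vreinterpretq_s32_s64(vld1q_s64(tmp));
}
#endif

Under that assumption, a call such as set32x4(x[3], x[2], x[1], x[0]) yields the same lane layout on both targets, which is why loadcompressed128_32bit() and the other decompression helpers need nothing beyond the renamed calls shown in the hunks above.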