From 5f009c288718095c5cc675bfed12d7ec64237731 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 20 Jul 2020 17:20:15 +0800 Subject: [PATCH] Add aarch64 support --- CMakeLists.txt | 108 +- cmake/config.h.in | 9 + cmake/platform.cmake | 13 +- cmake/ragel.cmake | 20 + src/crc32.c | 43 + src/fdr/fdr.c | 136 +- src/hs_valid_platform.c | 9 +- src/nfa/limex_exceptional.h | 22 +- src/nfa/limex_internal.h | 2 +- src/nfa/limex_native.c | 10 +- src/nfa/shufti.c | 580 ++++---- src/nfa/truffle.c | 10 +- src/parser/control_verbs.cpp | 340 +++++ src/rose/counting_miracle.h | 2 +- src/util/arch.h | 11 + src/util/cpuid_flags.c | 9 +- src/util/cpuid_flags.h | 2 + src/util/cpuid_inline.h | 17 +- src/util/intrinsics.h | 12 + src/util/popcount.h | 6 +- src/util/simd_arm.h | 1069 +++++++++++++++ src/util/simd_types.h | 42 +- src/util/simd_utils.h | 1389 +------------------- src/util/simd_x86.h | 1334 +++++++++++++++++++ src/util/state_compress.c | 42 +- tools/hscollider/CMakeLists.txt | 9 +- tools/hscollider/ColliderCorporaParser.cpp | 474 +++++++ unit/internal/simd_utils.cpp | 128 +- util/CMakeLists.txt | 8 +- util/ExpressionParser.cpp | 397 ++++++ 30 files changed, 4464 insertions(+), 1789 deletions(-) create mode 100644 src/parser/control_verbs.cpp create mode 100644 src/util/simd_arm.h create mode 100644 src/util/simd_x86.h create mode 100644 tools/hscollider/ColliderCorporaParser.cpp create mode 100644 util/ExpressionParser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8bc6077..12a889c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ include (${CMAKE_MODULE_PATH}/boost.cmake) # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) find_package(PythonInterp) find_program(RAGEL ragel) +find_program(COPY cp) if(PYTHONINTERP_FOUND) set(PYTHON ${PYTHON_EXECUTABLE}) @@ -189,24 +190,30 @@ else() # cpuid info and then chooses the best microarch it can (and replaces # the flag), so use that for tune. 
- # arg1 might exist if using ccache - string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) - set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "march" POS) - string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" - GNUCC_ARCH "${_GCC_OUTPUT}") - - # test the parsed flag - set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_QUIET ERROR_QUIET - INPUT_FILE /dev/null - RESULT_VARIABLE GNUCC_TUNE_TEST) - if (NOT GNUCC_TUNE_TEST EQUAL 0) - message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + # arg1 might exist if using ccache + string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_VARIABLE _GCC_OUTPUT) + string(FIND "${_GCC_OUTPUT}" "march" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" + GNUCC_ARCH "${_GCC_OUTPUT}") + + # test the parsed flag + set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_QUIET ERROR_QUIET + INPUT_FILE /dev/null + RESULT_VARIABLE GNUCC_TUNE_TEST) + if (NOT GNUCC_TUNE_TEST EQUAL 0) + message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") + endif() + endif() + + if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=armv8-a -mtune=armv8-a) endif() set(TUNE_FLAG ${GNUCC_ARCH}) else () @@ -239,6 +246,13 @@ else() set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsigned-char") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fsigned-char") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc") + endif() + if (NOT RELEASE_BUILD) # -Werror is most useful during development, don't potentially break # release builds @@ -252,11 +266,19 @@ else() endif() if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set(ARCH_C_FLAGS "-march=armv8-a -mtune=${TUNE_FLAG}") + endif () endif() if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") + if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + set(ARCH_CXX_FLAGS "-march=armv8-a -mtune=${TUNE_FLAG}") + endif() endif() if(CMAKE_COMPILER_IS_GNUCC) @@ -289,10 +311,18 @@ else() endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) 
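The -fsigned-char flag added above exists because plain char is signed by default on x86-64 but unsigned by default on AArch64, so code that relies on negative char values would silently change behaviour when ported. A minimal probe illustrating the difference (not part of the patch, for illustration only):

```c
#include <limits.h>
#include <stdio.h>

/* On x86-64 (or on AArch64 built with -fsigned-char) this prints
 * CHAR_MIN=-128 and "negative"; on AArch64 without the flag it prints
 * CHAR_MIN=0 and "non-negative". */
int main(void) {
    char c = (char)0xff;
    printf("CHAR_MIN=%d CHAR_MAX=%d\n", CHAR_MIN, CHAR_MAX);
    printf("(char)0xff is %s\n", (c < 0) ? "negative" : "non-negative");
    return 0;
}
```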
-CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) -CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) -CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) + CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) + CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) + CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + CHECK_INCLUDE_FILES(arm_neon.h HAVE_C_ARM_NEON_H) + CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_CXX_ARM_NEON_H) +endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) @@ -325,6 +355,9 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) message (STATUS "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") set (FAT_RUNTIME_REQUISITES FALSE) + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + message(STATUS "AARCH64 platform doesn't support the fat runtime") + set (FAT_RUNTIME_REQUISITES FALSE) else() include (${CMAKE_MODULE_PATH}/attrib.cmake) if (NOT HAS_C_ATTR_IFUNC) @@ -337,7 +370,9 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) endif () -include (${CMAKE_MODULE_PATH}/arch.cmake) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + include (${CMAKE_MODULE_PATH}/arch.cmake) +endif() # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) @@ -403,12 +438,6 @@ if (CXX_IGNORED_ATTR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") endif() -# gcc 9 complains about redundant move for returned variable -CHECK_CXX_COMPILER_FLAG("-Wredundant-move" CXX_REDUNDANT_MOVE) -if (CXX_REDUNDANT_MOVE) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-redundant-move") -endif() - # note this for later # g++ doesn't have this flag but clang does CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES) @@ -463,6 +492,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() +# Test cases for the NEON SIMD functions. +option(UNIT_SIMD "SIMD function test cases, default is OFF" OFF) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + if (UNIT_SIMD) + add_subdirectory(unit-simd) + endif() +endif() + add_subdirectory(util) add_subdirectory(doc/dev-reference) @@ -559,7 +596,14 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") -ragelmaker(src/parser/control_verbs.rl) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + ragelmaker(src/parser/control_verbs.rl) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + ragelcopyer(src/parser/control_verbs.rl) +endif() SET(hs_HEADERS src/hs.h diff --git a/cmake/config.h.in b/cmake/config.h.in index 5454643..336cf19 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -15,6 +15,9 @@ /* "Define if building for EM64T" */ #cmakedefine ARCH_X86_64 +/* "Define if building for aarch64" */ +#cmakedefine ARCH_AARCH64 + /* internal build, switch on dump support.
*/ #cmakedefine DUMP_SUPPORT @@ -48,6 +51,12 @@ /* C compiler has intrin.h */ #cmakedefine HAVE_C_INTRIN_H +/* C++ compiler has arm_neon.h */ +#cmakedefine HAVE_CXX_ARM_NEON_H + +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_ARM_NEON_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 593c544..213dcc5 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,9 +1,14 @@ # determine the target arch # really only interested in the preprocessor here -CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) -CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -set(ARCH_X86_64 ${ARCH_64_BIT}) -set(ARCH_IA32 ${ARCH_32_BIT}) +CHECK_C_SOURCE_COMPILES("#if !(defined(__aarch64__))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) + +if (ARCH_X86_64 OR ARCH_AARCH64) + set(ARCH_64_BIT 1) +elseif (ARCH_IA32) + set(ARCH_32_BIT 1) +endif() \ No newline at end of file diff --git a/cmake/ragel.cmake b/cmake/ragel.cmake index d3f0b92..3356cb9 100644 --- a/cmake/ragel.cmake +++ b/cmake/ragel.cmake @@ -14,3 +14,23 @@ function(ragelmaker src_rl) set_source_files_properties(${rl_out} PROPERTIES GENERATED TRUE) endfunction(ragelmaker) + # On the aarch64 platform, char is unsigned by default, so in order to be consistent with + # the x86 platform, we will add -fsigned-char to the compile option to force the char type. + # However, when the ragel generates c++ code, the char variable used will still be considered + # unsigned, resulting in the overflow of the char variable value in the generated code, + # resulting in some errors. 
+ # Function for copying the previously modified code to the specified path + + function(ragelcopyer src_rl) + get_filename_component(src_dir ${src_rl} PATH) # old cmake needs PATH + get_filename_component(src_file ${src_rl} NAME_WE) + set(rl_out ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${src_dir} + COMMAND ${COPY} -f ${CMAKE_CURRENT_SOURCE_DIR}/${src_dir}/${src_file}.cpp ${rl_out} 2>/dev/null ||: + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src_dir}/${src_file}.cpp + ) + add_custom_target(ragel_${src_file} DEPENDS ${rl_out}) + set_source_files_properties(${rl_out} PROPERTIES GENERATED TRUE) + endfunction(ragelcopyer) \ No newline at end of file diff --git a/src/crc32.c b/src/crc32.c index 1dae47b..4609c5d 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -32,6 +32,47 @@ #include "util/arch.h" #include "util/intrinsics.h" +#if defined(HAVE_NEON) + +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC_WORD 8 +#define CRC_TYPE u64a +static really_inline +u32 crc32c_neon(u32 running_crc, const unsigned char * p_buf, const size_t length) +{ + u32 crc=running_crc; + + // Process byte-by-byte until p_buf is aligned + const unsigned char * aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); + size_t init_bytes = aligned_buf - p_buf; + size_t running_length = ((length - init_bytes) / CRC_WORD) * CRC_WORD; + size_t end_bytes = length - init_bytes - running_length; + + while(p_buf < aligned_buf){ + CRC32CB(crc, *p_buf); + p_buf++; + } + + // Main aligned loop, processes a word at a time.
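crc32c_neon() above drives the ARMv8 CRC32C instructions through inline assembly (CRC32CB for the unaligned head and tail, CRC32CX for the aligned 64-bit body). The same instructions are also exposed as ACLE intrinsics in <arm_acle.h>; the following sketch shows the equivalent head/body/tail structure using those intrinsics, assuming a compiler that defines __ARM_FEATURE_CRC32 (illustrative only, not the code this patch adds):

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#if defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>

/* Head/body/tail split mirroring crc32c_neon(): bytes until 8-byte
 * alignment, then one 64-bit word per iteration, then the remainder. */
static uint32_t crc32c_acle(uint32_t crc, const unsigned char *buf,
                            size_t len) {
    while (len && ((uintptr_t)buf & 7)) {
        crc = __crc32cb(crc, *buf++);
        len--;
    }
    while (len >= 8) {
        uint64_t word;
        memcpy(&word, buf, sizeof(word));
        crc = __crc32cd(crc, word);
        buf += 8;
        len -= 8;
    }
    while (len--) {
        crc = __crc32cb(crc, *buf++);
    }
    return crc;
}
#endif
```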
+ for(size_t li = 0; li < running_length / CRC_WORD; li++){ + CRC_TYPE block = *(const CRC_TYPE *)p_buf; + CRC32CX(crc,block); + p_buf += CRC_WORD; + } + + //Remainingbytes + for(size_t li = 0; li < end_bytes; li++){ + CRC32CB(crc,*p_buf); + p_buf++; + } + return crc; +} +#endif + + #if !defined(HAVE_SSE42) /*** @@ -636,6 +677,8 @@ u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) { #if defined(HAVE_SSE42) u32 crc = crc32c_sse42(inCrc32, (const unsigned char *)buf, bufLen); +#elif defined(HAVE_NEON) + u32 crc = crc32c_neon(inCrc32, (const unsigned char *)buf, bufLen); #else u32 crc = crc32c_sb8_64_bit(inCrc32, (const unsigned char *)buf, bufLen); #endif diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d33756d..718f169 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -127,6 +127,13 @@ u64a andn(const u32 a, const u8 *b) { u64a r; #if defined(HAVE_BMI) && !defined(NO_ASM) __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); +#elif defined(HAVE_NEON) + __asm__ __volatile__("ldr w0, %w2 \n\t" + "bic %w0,w0,%w1 \n\t" + : "=r"(r) + : "r"(a), "m"(*(const u32 *)b) + : "w0" + ); #else r = unaligned_load_u32(b) & ~a; #endif @@ -159,7 +166,104 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ - assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + assert(itPtr >= start_ptr && itPtr <= end_ptr); +#if defined(HAVE_NEON) + domain_mask_flipped = ~domain_mask_flipped; + + u32 reach0, reach1, reach2, reach3; + u64a ptr = unaligned_load_u64a(itPtr); + + reach0 = ptr & domain_mask_flipped; + reach1 = ptr >> 8 & domain_mask_flipped; + reach2 = ptr >> 16 & domain_mask_flipped; + reach3 = ptr >> 24 & domain_mask_flipped; + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st3 = load_m128_from_u64a(ft + reach3); + + u32 reach4, reach5, reach6, reach7; + ptr = unaligned_load_u64a(itPtr + 4); + reach4 = ptr & domain_mask_flipped; + reach5 = ptr >> 8 & domain_mask_flipped; + reach6 = ptr >> 16 & domain_mask_flipped; + reach7 = ptr >> 24 & domain_mask_flipped; + + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st7 = load_m128_from_u64a(ft + reach7); + + m128 zero = zeroes128(); + + st1.vect_s8 = vextq_s8(zero.vect_s8, st1.vect_s8, 15); + st2.vect_s8 = vextq_s8(zero.vect_s8, st2.vect_s8, 14); + st3.vect_s8 = vextq_s8(zero.vect_s8, st3.vect_s8, 13); + st4.vect_s8 = vextq_s8(zero.vect_s8, st4.vect_s8, 12); + st5.vect_s8 = vextq_s8(zero.vect_s8, st5.vect_s8, 11); + st6.vect_s8 = vextq_s8(zero.vect_s8, st6.vect_s8, 10); + st7.vect_s8 = vextq_s8(zero.vect_s8, st7.vect_s8, 9); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + *s = or128(*s, st0); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 = ~(*conf0); + + u32 reach8, reach9, reach10, reach11; + ptr = unaligned_load_u64a(itPtr + 8); + reach8 = ptr & domain_mask_flipped; + reach9 = ptr >> 8 & domain_mask_flipped; + reach10 = ptr >> 16 & domain_mask_flipped; + reach11 = ptr >> 24 & domain_mask_flipped; + + m128 st8 = load_m128_from_u64a(ft + 
reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st11 = load_m128_from_u64a(ft + reach11); + + u32 reach12, reach13, reach14, reach15; + ptr = unaligned_load_u64a(itPtr + 12); + reach12 = ptr & domain_mask_flipped; + reach13 = ptr >> 8 & domain_mask_flipped; + reach14 = ptr >> 16 & domain_mask_flipped; + reach15 = ptr >> 24 & domain_mask_flipped; + + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st14 = load_m128_from_u64a(ft + reach14); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st9.vect_s8 = vextq_s8(zero.vect_s8, st9.vect_s8, 15); + st10.vect_s8 = vextq_s8(zero.vect_s8, st10.vect_s8, 14); + st11.vect_s8 = vextq_s8(zero.vect_s8, st11.vect_s8, 13); + st12.vect_s8 = vextq_s8(zero.vect_s8, st12.vect_s8, 12); + st13.vect_s8 = vextq_s8(zero.vect_s8, st13.vect_s8, 11); + st14.vect_s8 = vextq_s8(zero.vect_s8, st14.vect_s8, 10); + st15.vect_s8 = vextq_s8(zero.vect_s8, st15.vect_s8, 9); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + *s = or128(*s, st8); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 = ~(*conf8); + +#else u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach1 = andn(domain_mask_flipped, itPtr + 1); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); @@ -241,6 +345,8 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, *conf8 = movq(*s); *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; + +#endif } static really_inline @@ -349,12 +455,12 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, u32 bitRem = bit % bucket; u32 idx = bitRem; u32 cf = confBase[idx]; - if (!cf) { + if (unlikely(!cf)) { continue; } const struct FDRConfirm *fdrc = (const struct FDRConfirm *) ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { + if (unlikely(!(fdrc->groups & *control))) { continue; } u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); @@ -603,7 +709,7 @@ void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, assert(z_len > 0); size_t iter_bytes_second = 0; size_t z_len_first = z_len; - if (z_len > ITER_BYTES) { + if (unlikely(z_len > ITER_BYTES)) { z_len_first = z_len - ITER_BYTES; iter_bytes_second = ITER_BYTES; } @@ -637,7 +743,7 @@ void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, /* copy the last 16 bytes, may overlap with the previous 8 byte write */ storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); - if (iter_bytes_second) { + if (unlikely(iter_bytes_second)) { storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); } @@ -658,7 +764,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, const u8 *ptr = buf + start; size_t remaining = len - start; - if (remaining <= ITER_BYTES) { + if (unlikely(remaining <= ITER_BYTES)) { /* enough bytes to make only one zone */ createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); return 1; @@ -691,13 +797,25 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, #define INVALID_MATCH_ID (~0U) +/* add prefetch for aarch64, + *- due to gcc4.8.5 do not support builtin_prefetch. 
+ */ +#if defined(HAVE_NEON) +#define PREFETCH __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(itPtr + 256))) +#define P2ALIGN __asm__ __volatile__(".p2align 6") +#else +#define PREFETCH __builtin_prefetch(itPtr + ITER_BYTES) +#define P2ALIGN +#endif + #define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ do { \ + P2ALIGN; \ const u8 *tryFloodDetect = zz->floodPtr; \ const u8 *start_ptr = zz->start; \ - const u8 *end_ptr = zz->end; \ + const u8 *end_ptr = zz->end - ITER_BYTES; \ \ - for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ + for (const u8 *itPtr = start_ptr; itPtr <= end_ptr; \ itPtr += ITER_BYTES) { \ if (unlikely(itPtr > tryFloodDetect)) { \ tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ @@ -707,7 +825,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, return HWLM_TERMINATED; \ } \ } \ - __builtin_prefetch(itPtr + ITER_BYTES); \ + PREFETCH; \ u64a conf0; \ u64a conf8; \ get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 59ad3f3..035d3ff 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -33,9 +33,16 @@ HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ +#if defined(__x86_64__) if (check_ssse3()) { return HS_SUCCESS; - } else { + } +#else + if (check_neon()) { + return HS_SUCCESS; + } +#endif + else { return HS_ARCH_ERROR; } } diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index 6c7335f..8304215 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -131,7 +131,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, union RepeatControl *repeat_ctrl = ctx->repeat_ctrl + info->ctrlIndex; char *repeat_state = ctx->repeat_state + info->stateOffset; - if (e->trigger == LIMEX_TRIGGER_POS) { + if (unlikely(e->trigger == LIMEX_TRIGGER_POS)) { char cyclic_on = TESTBIT_STATE(*STATE_ARG_P, info->cyclicState); processPosTrigger(repeat, repeat_ctrl, repeat_state, offset, cyclic_on); @@ -140,7 +140,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, assert(e->trigger == LIMEX_TRIGGER_TUG); enum TriggerResult rv = processTugTrigger(repeat, repeat_ctrl, repeat_state, offset); - if (rv == TRIGGER_FAIL) { + if (likely(rv == TRIGGER_FAIL)) { *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; DEBUG_PRINTF("tug found no valid matches in repeat state\n"); return 1; // continue @@ -150,7 +150,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, assert(e->hasSquash == LIMEX_SQUASH_TUG); *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); return 1; // continue - } else if (rv == TRIGGER_SUCCESS_CACHE) { + } else if (unlikely(rv == TRIGGER_SUCCESS_CACHE)) { new_cache->br = 1; } else { assert(rv == TRIGGER_SUCCESS); @@ -160,7 +160,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, } // Some exceptions fire accepts. 
- if (e->reports != MO_INVALID_IDX) { + if (unlikely(e->reports != MO_INVALID_IDX)) { if (flags & CALLBACK_OUTPUT) { const ReportID *reports = (const ReportID *)((const char *)limex + e->reports); @@ -171,7 +171,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, return 0; // halt } if (*cacheable == CACHE_RESULT) { - if (!new_cache->reports || new_cache->reports == reports) { + if (likely(!new_cache->reports || new_cache->reports == reports)) { new_cache->reports = reports; } else { *cacheable = DO_NOT_CACHE_RESULT; @@ -194,8 +194,8 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, // Some exceptions squash states behind them. Note that we squash states in // 'succ', not local_succ. - if (e->hasSquash == LIMEX_SQUASH_CYCLIC - || e->hasSquash == LIMEX_SQUASH_REPORT) { + if (unlikely(e->hasSquash == LIMEX_SQUASH_CYCLIC + || e->hasSquash == LIMEX_SQUASH_REPORT)) { *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); if (*cacheable == CACHE_RESULT) { *cacheable = DO_NOT_CACHE_RESULT; @@ -331,12 +331,12 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, u32 idx = local_index + base_index[t]; const EXCEPTION_T *e = &exceptions[idx]; - if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, + if (unlikely(!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, #ifndef BIG_MODEL &local_succ, #endif limex, offset, ctx, &new_cache, &cacheable, - in_rev, flags)) { + in_rev, flags))) { return PE_RV_HALT; } } while (word); @@ -349,7 +349,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, *succ = OR_STATE(*succ, ctx->local_succ); #endif - if (cacheable == CACHE_RESULT) { + if (likely(cacheable == CACHE_RESULT)) { ctx->cached_estate = estate; #ifndef BIG_MODEL ctx->cached_esucc = local_succ; @@ -359,7 +359,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, ctx->cached_reports = new_cache.reports; ctx->cached_br = new_cache.br; } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { - if (ctx->cached_br) { + if (unlikely(ctx->cached_br)) { ctx->cached_estate = ZERO_STATE; } } diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index 23b1bd9..0e27c79 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -119,7 +119,7 @@ struct NFAException##size { \ u32 repeatOffset; /**< offset to NFARepeatInfo, or MO_INVALID_IDX */ \ u8 hasSquash; /**< from enum LimExSquash */ \ u8 trigger; /**< from enum LimExTrigger */ \ -}; \ +}__attribute__ ((aligned (16))); \ \ struct LimExNFA##size { \ u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index f6f5809..8998830 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -77,7 +77,7 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, struct NFAContext32 *ctx, char in_rev, char flags) { assert(estate != 0); // guaranteed by calling macro - if (estate == ctx->cached_estate) { + if (unlikely(estate == ctx->cached_estate)) { DEBUG_PRINTF("using cached succ from previous state\n"); *succ |= ctx->cached_esucc; if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { @@ -103,21 +103,21 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, u32 bit = findAndClearLSB_32(&estate); u32 idx = rank_in_mask32(limex->exceptionMask, bit); const struct NFAException32 *e = &exceptions[idx]; - if (!runException32(e, s, succ, &local_succ, limex, offset, ctx, - &new_cache, &cacheable, in_rev, flags)) { + if (unlikely(!runException32(e, s, succ, &local_succ, 
limex, offset, ctx, + &new_cache, &cacheable, in_rev, flags))) { return PE_RV_HALT; } } while (estate != 0); *succ |= local_succ; - if (cacheable == CACHE_RESULT) { + if (unlikely(cacheable == CACHE_RESULT)) { ctx->cached_estate = orig_estate; ctx->cached_esucc = local_succ; ctx->cached_reports = new_cache.reports; ctx->cached_br = new_cache.br; } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { - if (ctx->cached_br) { + if (unlikely(ctx->cached_br)) { ctx->cached_estate = 0U; } } diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 09ffc0c..6231e61 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -42,39 +42,36 @@ #ifdef DEBUG #include -#define DUMP_MSK(_t) \ -static UNUSED \ -void dumpMsk##_t(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7-j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ -} \ -static UNUSED \ -void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c",c); \ - else \ - printf("."); \ - } \ -} +#define DUMP_MSK(_t) \ + static UNUSED void dumpMsk##_t(m##_t msk) { \ + u8 *mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7 - j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ + } \ + static UNUSED void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 *mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c", c); \ + else \ + printf("."); \ + } \ + } #endif /** \brief Naive byte-by-byte implementation. */ -static really_inline -const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { +static really_inline const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, + const u8 *buf, const u8 *buf_end) { assert(buf < buf_end); for (; buf < buf_end; ++buf) { @@ -87,9 +84,8 @@ const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, } /** \brief Naive byte-by-byte implementation. 
*/ -static really_inline -const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { +static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, + const u8 *buf, const u8 *buf_end) { assert(buf < buf_end); for (buf_end--; buf_end >= buf; buf_end--) { @@ -111,25 +107,33 @@ DUMP_MSK(128) #define GET_LO_4(chars) and128(chars, low4bits) #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) -static really_inline -u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, - const m128 compare) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); +static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, + const m128 low4bits, const m128 compare) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk128AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk128(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk128(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk128(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk128(t); + printf("\n"); #endif return movemask128(eq128(t, compare)); } -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { +static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); assert(pos < 16); @@ -139,9 +143,9 @@ const u8 *firstMatch(const u8 *buf, u32 z) { } } -static really_inline -const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { +static really_inline const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, + const u8 *buf, const m128 low4bits, + const m128 zeroes) { u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch(buf, z); @@ -153,13 +157,13 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, assert(buf < buf_end); // Slow path for small cases. 
- if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + if (unlikely(buf_end - buf < 16)) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; @@ -179,6 +183,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); + +#if defined(HAVE_NEON) + __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); +#endif + rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; @@ -198,10 +207,11 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline -const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { +static really_inline const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { #ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF("confirming match in:"); + dumpMsk128(t); + printf("\n"); #endif u32 z = movemask128(eq128(t, compare)); @@ -215,20 +225,29 @@ const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { } } - -static really_inline -const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); +static really_inline const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, + const u8 *buf, const m128 low4bits, + const m128 zeroes) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk128AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk128(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk128(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk128(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk128(t); + printf("\n"); #endif return lastMatch(buf, t, zeroes); @@ -241,12 +260,12 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. 
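The scan loops above (and the FDR loop earlier in this patch) add an explicit prfm pldl1keep inline-assembly prefetch of the data 256 bytes ahead. Where __builtin_prefetch is usable, the same hint can be expressed portably; a small sketch under that assumption (the helper name and the NO_BUILTIN_PREFETCH switch are illustrative, not part of the patch):

```c
/* Both forms ask the core to pull buf + 256 into L1 ahead of use;
 * neither changes program semantics. */
static inline void prefetch_ahead(const unsigned char *buf) {
#if defined(__aarch64__) && defined(NO_BUILTIN_PREFETCH) /* hypothetical */
    __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256)));
#else
    __builtin_prefetch(buf + 256, 0 /* read */, 3 /* keep in cache */);
#endif
}
```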
if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; assert(buf_end - buf >= 16); @@ -283,32 +302,48 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } -static really_inline -const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, - m128 chars, const u8 *buf, const m128 low4bits, - const m128 ones) { +static really_inline const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + m128 chars, const u8 *buf, + const m128 low4bits, const m128 ones) { m128 chars_lo = GET_LO_4(chars); m128 chars_hi = GET_HI_4(chars); - m128 c_lo = pshufb_m128(mask1_lo, chars_lo); - m128 c_hi = pshufb_m128(mask1_hi, chars_hi); - m128 t = or128(c_lo, c_hi); + m128 c_lo = pshufb_m128(mask1_lo, chars_lo); + m128 c_hi = pshufb_m128(mask1_hi, chars_hi); + m128 t = or128(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk128AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk128(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk128(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk128(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk128(t); + printf("\n"); #endif - m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); - m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); - m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); + m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); + m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); + DEBUG_PRINTF(" c2_lo: "); + dumpMsk128(c2_lo); + printf("\n"); + DEBUG_PRINTF(" c2_hi: "); + dumpMsk128(c2_hi); + printf("\n"); + DEBUG_PRINTF(" t2: "); + dumpMsk128(t2); + printf("\n"); #endif u32 z = movemask128(eq128(t2, ones)); @@ -316,19 +351,18 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, return firstMatch(buf, z); } -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; // Preconditioning: most of the time our buffer won't be aligned. 
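Here and in truffle.c below, raw SSE constructors such as _mm_set1_epi8(0xf) are replaced by wrappers like set16x8(0xf), so the same scanner source can sit on top of either the x86 backend (src/util/simd_x86.h) or the NEON backend (src/util/simd_arm.h), both added by this patch but not shown in this excerpt. A minimal sketch of what such a wrapper pair could look like; the NEON union layout is an assumption inferred from the .vect_s8 accesses in fdr.c above:

```c
#if defined(__x86_64__) || defined(__i386__)
#include <emmintrin.h>
typedef __m128i m128;
/* set16x8: broadcast one byte across a 128-bit vector. */
static inline m128 set16x8(char c) { return _mm_set1_epi8(c); }
#elif defined(__aarch64__)
#include <arm_neon.h>
typedef union {
    int8x16_t vect_s8;   /* matches the .vect_s8 member used in fdr.c */
    int64x2_t vect_s64;
} m128;
static inline m128 set16x8(char c) {
    m128 r;
    r.vect_s8 = vdupq_n_s8((int8_t)c);
    return r;
}
#endif
```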
m128 chars = loadu128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf, low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, low4bits, + ones); if (rv) { return rv; } @@ -340,8 +374,13 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *last_block = buf_end - 16; while (buf < last_block) { m128 lchars = load128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - lchars, buf, low4bits, ones); + +#if defined(HAVE_NEON) + __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(buf + 256))); +#endif + + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, lchars, buf, + low4bits, ones); if (rv) { return rv; } @@ -351,8 +390,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, // Use an unaligned load to mop up the last 16 bytes and get an accurate // picture to buf_end. chars = loadu128(buf_end - 16); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf_end - 16, low4bits, ones); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf_end - 16, + low4bits, ones); if (rv) { return rv; } @@ -370,26 +409,34 @@ DUMP_MSK(256) #define GET_LO_4(chars) and256(chars, low4bits) #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) -static really_inline -u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, - const m256 compare) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); +static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, + const m256 low4bits, const m256 compare) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); m256 t = and256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk256AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk256(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk256(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk256(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk256(t); + printf("\n"); #endif return movemask256(eq256(t, compare)); } -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { +static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { DEBUG_PRINTF("z 0x%08x\n", z); if (unlikely(z != 0xffffffff)) { u32 pos = ctz32(~z); @@ -401,9 +448,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { } } -static really_inline -const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline const u8 * +fwdBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -415,9 +461,9 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, return firstMatch(buf, z); } -static really_inline -const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { +static really_inline const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, + const u8 *buf, const u8 *buf_end, + const m256 low4bits) { // run shufti over two overlapping 16-byte unaligned reads const m256 mask = combine2x128(mask_hi, 
mask_lo); m128 chars = loadu128(buf); @@ -434,9 +480,9 @@ const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline -const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { +static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, + const u8 *buf, const m256 low4bits, + const m256 zeroes) { u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch(buf, z); @@ -451,8 +497,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m256 low4bits = set32x8(0xf); @@ -483,7 +529,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *last_block = buf_end - 32; while (buf < last_block) { m256 lchars = load256(buf); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); + rv = + fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); if (rv) { return rv; } @@ -494,7 +541,8 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // picture to buf_end. assert(buf <= buf_end && buf >= buf_end - 32); chars = loadu256(buf_end - 32); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, + zeroes); if (rv) { return rv; } @@ -502,8 +550,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf_end; } -static really_inline -const u8 *lastMatch(const u8 *buf, u32 z) { +static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffffffff)) { u32 pos = clz32(~z); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); @@ -513,28 +560,37 @@ const u8 *lastMatch(const u8 *buf, u32 z) { } } -static really_inline -const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); +static really_inline const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, + const u8 *buf, const m256 low4bits, + const m256 zeroes) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk256AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk256(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk256(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk256(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk256(t); + printf("\n"); #endif u32 z = movemask256(eq256(t, zeroes)); return lastMatch(buf, z); } -static really_inline -const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline const u8 * +revBlockShort(m256 mask, m128 chars, const u8 *buf, const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = 
combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -546,9 +602,9 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, return lastMatch(buf, z); } -static really_inline -const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { +static really_inline const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, + const u8 *buf, const u8 *buf_end, + const m256 low4bits) { // run shufti over two overlapping 16-byte unaligned reads const m256 mask = combine2x128(mask_hi, mask_lo); @@ -566,7 +622,6 @@ const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } - /* takes 128 bit masks, but operates on 256 bits of data */ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { @@ -575,8 +630,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Slow path for small cases. if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, + buf_end); } const m256 low4bits = set32x8(0xf); @@ -594,7 +649,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Preconditioning: most of the time our buffer won't be aligned. m256 chars = loadu256(buf_end - 32); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, + zeroes); if (rv) { return rv; } @@ -606,7 +662,8 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, while (buf_end > last_block) { buf_end -= 32; m256 lchars = load256(buf_end); - rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); + rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, + zeroes); if (rv) { return rv; } @@ -623,42 +680,58 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, return buf - 1; } -static really_inline -const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, - m256 chars, const u8 *buf, const m256 low4bits, - const m256 ones) { +static really_inline const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, + m256 mask2_lo, m256 mask2_hi, + m256 chars, const u8 *buf, + const m256 low4bits, const m256 ones) { DEBUG_PRINTF("buf %p\n", buf); m256 chars_lo = GET_LO_4(chars); m256 chars_hi = GET_HI_4(chars); - m256 c_lo = pshufb_m256(mask1_lo, chars_lo); - m256 c_hi = pshufb_m256(mask1_hi, chars_hi); - m256 t = or256(c_lo, c_hi); + m256 c_lo = pshufb_m256(mask1_lo, chars_lo); + m256 c_hi = pshufb_m256(mask1_hi, chars_hi); + m256 t = or256(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk256AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk256(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk256(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk256(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk256(t); + printf("\n"); #endif - m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); - m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); + m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); + m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); m256 t2 = or256(t, 
rshift128_m256(or256(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); + DEBUG_PRINTF(" c2_lo: "); + dumpMsk256(c2_lo); + printf("\n"); + DEBUG_PRINTF(" c2_hi: "); + dumpMsk256(c2_hi); + printf("\n"); + DEBUG_PRINTF(" t2: "); + dumpMsk256(t2); + printf("\n"); #endif u32 z = movemask256(eq256(t2, ones)); return firstMatch(buf, z); } -static really_inline -const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, - const m256 low4bits) { +static really_inline const u8 *fwdBlockShort2(m256 mask1, m256 mask2, + m128 chars, const u8 *buf, + const m256 low4bits) { // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); @@ -672,9 +745,10 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, return firstMatch(buf, z); } -static really_inline -const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { +static really_inline const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, + const u8 *buf_end) { DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); const m256 low4bits = set32x8(0xf); // run shufti over two overlapping 16-byte unaligned reads @@ -695,9 +769,8 @@ const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, } /* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); @@ -731,8 +804,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *last_block = buf_end - 32; while (buf < last_block) { m256 lchars = load256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - lchars, buf, low4bits, ones); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, lchars, buf, low4bits, ones); if (rv) { return rv; } @@ -757,26 +830,34 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, DUMP_MSK(512) #endif -static really_inline -u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, - const m512 compare) { +static really_inline u64a block(m512 mask_lo, m512 mask_hi, m512 chars, + const m512 low4bits, const m512 compare) { m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); + m512 c_hi = + pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); m512 t = and512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk512AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk512(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk512(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk512(c_hi); + 
printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk512(t); + printf("\n"); #endif return eq512mask(t, compare); } -static really_inline -const u8 *firstMatch64(const u8 *buf, u64a z) { +static really_inline const u8 *firstMatch64(const u8 *buf, u64a z) { DEBUG_PRINTF("z 0x%016llx\n", z); if (unlikely(z != ~0ULL)) { u32 pos = ctz64(~z); @@ -788,18 +869,19 @@ const u8 *firstMatch64(const u8 *buf, u64a z) { } } -static really_inline -const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { +static really_inline const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, + m512 chars, const u8 *buf, + const m512 low4bits, + const m512 zeroes) { u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); return firstMatch64(buf, z); } -static really_inline -const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { +static really_inline const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, + const u8 *buf, const u8 *buf_end, + const m512 low4bits, + const m512 zeroes) { DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -877,8 +959,7 @@ done: return buf_end; } -static really_inline -const u8 *lastMatch64(const u8 *buf, u64a z) { +static really_inline const u8 *lastMatch64(const u8 *buf, u64a z) { DEBUG_PRINTF("z 0x%016llx\n", z); if (unlikely(z != ~0ULL)) { u32 pos = clz64(~z); @@ -889,10 +970,10 @@ const u8 *lastMatch64(const u8 *buf, u64a z) { } } -static really_inline -const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { +static really_inline const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, + const u8 *buf, const u8 *buf_end, + const m512 low4bits, + const m512 zeroes) { DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -909,20 +990,31 @@ const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, return lastMatch64(buf, z | ~k); } -static really_inline -const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); +static really_inline const u8 *revBlock512(m512 mask_lo, m512 mask_hi, + m512 chars, const u8 *buf, + const m512 low4bits, + const m512 zeroes) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = + pshufb_m512(mask_hi, rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk512AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk512(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk512(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk512(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk512(t); + printf("\n"); #endif u64a z = eq512mask(t, zeroes); @@ -985,43 +1077,60 @@ done: return buf - 1; } -static really_inline -const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 
mask2_hi, - m512 chars, const u8 *buf, const m512 low4bits, - const m512 ones, __mmask64 k) { +static really_inline const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, + m512 mask2_lo, m512 mask2_hi, + m512 chars, const u8 *buf, + const m512 low4bits, const m512 ones, + __mmask64 k) { DEBUG_PRINTF("buf %p %.64s\n", buf, buf); m512 chars_lo = and512(chars, low4bits); m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); - m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); - m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); - m512 t = or512(c_lo, c_hi); + m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); + m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); + m512 t = or512(c_lo, c_hi); #ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); + DEBUG_PRINTF(" chars: "); + dumpMsk512AsChars(chars); + printf("\n"); + DEBUG_PRINTF(" char: "); + dumpMsk512(chars); + printf("\n"); + DEBUG_PRINTF(" c_lo: "); + dumpMsk512(c_lo); + printf("\n"); + DEBUG_PRINTF(" c_hi: "); + dumpMsk512(c_hi); + printf("\n"); + DEBUG_PRINTF(" t: "); + dumpMsk512(t); + printf("\n"); #endif - m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); - m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); + m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); + m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); #ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); + DEBUG_PRINTF(" c2_lo: "); + dumpMsk512(c2_lo); + printf("\n"); + DEBUG_PRINTF(" c2_hi: "); + dumpMsk512(c2_hi); + printf("\n"); + DEBUG_PRINTF(" t2: "); + dumpMsk512(t2); + printf("\n"); #endif u64a z = eq512mask(t2, ones); return firstMatch64(buf, z | ~k); } -static really_inline -const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, - m512 mask2_hi, const u8 *buf, const u8 *buf_end, - const m512 low4bits, const m512 ones) { +static really_inline const u8 * +shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, + const u8 *buf, const u8 *buf_end, const m512 low4bits, + const m512 ones) { DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); uintptr_t len = buf_end - buf; assert(len <= 64); @@ -1038,9 +1147,8 @@ const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, } /* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index be6b312..c05d778 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -41,7 +41,7 @@ static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { + if (z != 0xffff) { u32 pos = clz32(~z & 0xffff); assert(pos >= 16 && pos < 32); return buf + (31 - pos); @@ -52,7 +52,7 @@ const u8 *lastMatch(const u8 *buf, u32 z) { static 
really_inline const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { + if (likely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); assert(pos < 16); return buf + pos; @@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { static really_inline u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - m128 highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + m128 highconst = set16x8(0x80); + m128 shuf_mask_hi = set2x64(0x8040201008040201); // and now do the real work m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); @@ -124,7 +124,7 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear, assert(buf < buf_end); const u8 *rv; - if (buf_end - buf < 16) { + if (unlikely(buf_end - buf < 16)) { return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, buf_end); } diff --git a/src/parser/control_verbs.cpp b/src/parser/control_verbs.cpp new file mode 100644 index 0000000..482004d --- /dev/null +++ b/src/parser/control_verbs.cpp @@ -0,0 +1,340 @@ + +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Parser for control verbs that can occur at the beginning of a pattern. 
+ */ + +#include "parser/control_verbs.h" + +#include "parser/Parser.h" +#include "parser/parse_error.h" + +#include +#include + +using namespace std; + +namespace ue2 { + +const char *read_control_verbs(const char *ptr, const char *end, size_t start, + ParseMode &mode) { + const char *p = ptr; + const char *pe = end; + const char *eof = pe; + const char *ts, *te; + int cs; + UNUSED int act; + + static const char _ControlVerbs_actions[] = { + 0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9}; + + static const unsigned char _ControlVerbs_key_offsets[] = { + 0, 7, 8, 10, 12, 14, 16, 18, 20, 21, 23, 25, 27, + 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 55, + 57, 59, 61, 63, 66, 68, 70, 72, 74, 76, 79, 82, 84, + 86, 88, 90, 92, 94, 96, 98, 100, 102, 105, 107, 109, 111, + 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, + 139, 141, 143, 146, 148, 149, 151, 155, 157, 159, 160, 161}; + + static const char _ControlVerbs_trans_keys[] = { + 41, 65, 66, 67, 76, 78, 85, 41, 41, 78, 41, 89, 41, 67, 41, 82, 41, + 76, 41, 70, 41, 41, 83, 41, 82, 41, 95, 41, 65, 85, 41, 78, 41, 89, + 41, 67, 41, 78, 41, 73, 41, 67, 41, 79, 41, 68, 41, 69, 41, 82, 41, + 76, 41, 70, 73, 41, 77, 41, 73, 41, 84, 41, 95, 41, 77, 82, 41, 65, + 41, 84, 41, 67, 41, 72, 41, 61, 41, 48, 57, 41, 48, 57, 41, 69, 41, + 67, 41, 85, 41, 82, 41, 83, 41, 73, 41, 79, 41, 78, 41, 79, 41, 95, + 41, 65, 83, 41, 85, 41, 84, 41, 79, 41, 95, 41, 80, 41, 79, 41, 83, + 41, 83, 41, 69, 41, 83, 41, 83, 41, 84, 41, 65, 41, 82, 41, 84, 41, + 95, 41, 79, 41, 80, 41, 84, 41, 67, 84, 41, 80, 41, 41, 70, 41, 49, + 51, 56, 41, 54, 41, 50, 41, 40, 42, 0}; + + static const char _ControlVerbs_single_lengths[] = { + 7, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 4, 2, 2, 1, 1, 1}; + + static const char _ControlVerbs_range_lengths[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + static const short _ControlVerbs_index_offsets[] = { + 0, 8, 10, 13, 16, 19, 22, 25, 28, 30, 33, 36, 39, + 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 80, + 83, 86, 89, 92, 96, 99, 102, 105, 108, 111, 114, 117, 120, + 123, 126, 129, 132, 135, 138, 141, 144, 147, 151, 154, 157, 160, + 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193, 196, 199, + 202, 205, 208, 212, 215, 217, 220, 225, 228, 231, 233, 235}; + + static const char _ControlVerbs_indicies[] = { + 0, 2, 3, 4, 5, 6, 7, 1, 8, 1, 8, 9, 1, 8, 10, 1, 11, + 12, 1, 8, 13, 1, 8, 14, 1, 8, 15, 1, 11, 1, 8, 16, 1, 8, + 17, 1, 8, 18, 1, 8, 19, 20, 1, 8, 21, 1, 8, 22, 1, 8, 12, + 1, 8, 23, 1, 8, 24, 1, 8, 25, 1, 8, 26, 1, 8, 27, 1, 8, + 15, 1, 8, 28, 1, 11, 14, 1, 8, 15, 29, 1, 8, 30, 1, 8, 31, + 1, 8, 32, 1, 8, 33, 1, 8, 34, 35, 1, 8, 36, 1, 8, 37, 1, + 8, 38, 1, 8, 39, 1, 8, 40, 1, 8, 41, 1, 11, 41, 1, 8, 42, + 1, 8, 43, 1, 8, 44, 1, 8, 45, 1, 8, 46, 1, 8, 47, 1, 8, + 48, 1, 8, 39, 1, 8, 49, 1, 8, 50, 1, 8, 51, 52, 1, 8, 53, + 1, 8, 54, 1, 8, 55, 1, 8, 56, 1, 8, 57, 1, 8, 58, 1, 8, + 59, 1, 8, 60, 1, 8, 61, 1, 8, 62, 1, 8, 15, 1, 8, 63, 1, + 8, 64, 1, 8, 65, 1, 8, 66, 1, 8, 67, 1, 8, 68, 1, 8, 69, + 1, 8, 15, 1, 8, 70, 71, 1, 8, 72, 1, 73, 1, 8, 74, 1, 75, + 76, 77, 78, 1, 8, 15, 1, 8, 15, 1, 75, 1, 80, 79, 82, 81, 0}; + + static 
const char _ControlVerbs_trans_targs[] = { + 75, 1, 2, 9, 22, 24, 45, 67, 75, 3, 4, 75, 5, 6, 7, 8, 10, + 11, 12, 13, 16, 14, 15, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28, 29, + 30, 37, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 46, 47, + 48, 59, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, + 65, 66, 68, 70, 69, 75, 71, 75, 72, 73, 74, 75, 76, 75, 0}; + + static const char _ControlVerbs_trans_actions[] = { + 19, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 7, 0, 0, 0, 15, 5, 17, 0}; + + static const char _ControlVerbs_to_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; + + static const char _ControlVerbs_from_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0}; + + static const short _ControlVerbs_eof_trans[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 82}; + + static const int ControlVerbs_start = 75; + static const int ControlVerbs_first_final = 75; + static const int ControlVerbs_error = -1; + + static const int ControlVerbs_en_main = 75; + + { + cs = ControlVerbs_start; + ts = 0; + te = 0; + act = 0; + } + + try { + + { + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if (p == pe) + goto _test_eof; + _resume: + _acts = + _ControlVerbs_actions + _ControlVerbs_from_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 1: { + ts = p; + } break; + } + } + + _keys = _ControlVerbs_trans_keys + _ControlVerbs_key_offsets[cs]; + _trans = _ControlVerbs_index_offsets[cs]; + + _klen = _ControlVerbs_single_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + ((_upper - _lower) >> 1); + if ((*p) < *_mid) + _upper = _mid - 1; + else if ((*p) > *_mid) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += _klen; + } + + _klen = _ControlVerbs_range_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen << 1) - 2; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + (((_upper - _lower) >> 1) & ~1); + if ((*p) < _mid[0]) + _upper = _mid - 2; + else if ((*p) > _mid[1]) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys) >> 1); + goto _match; + } + } + _trans += _klen; + } + + _match: + _trans = _ControlVerbs_indicies[_trans]; + _eof_trans: + cs = _ControlVerbs_trans_targs[_trans]; + + if (_ControlVerbs_trans_actions[_trans] == 0) + goto _again; + + _acts = _ControlVerbs_actions + _ControlVerbs_trans_actions[_trans]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch 
(*_acts++) { + case 2: { + te = p + 1; + } break; + case 3: { + te = p + 1; + { mode.utf8 = true; } + } break; + case 4: { + te = p + 1; + { mode.ucp = true; } + } break; + case 5: { + te = p + 1; + { + ostringstream str; + str << "Unsupported control verb " + << string(ts, te - ts); + throw LocatedParseError(str.str()); + } + } break; + case 6: { + te = p + 1; + { + ostringstream str; + str << "Unknown control verb " << string(ts, te - ts); + throw LocatedParseError(str.str()); + } + } break; + case 7: { + te = p + 1; + { + p--; + { + p++; + goto _out; + } + } + } break; + case 8: { + te = p; + p--; + { + p--; + { + p++; + goto _out; + } + } + } break; + case 9: { + { p = ((te)) - 1; } + { + p--; + { + p++; + goto _out; + } + } + } break; + } + } + + _again: + _acts = _ControlVerbs_actions + _ControlVerbs_to_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 0: { + ts = 0; + } break; + } + } + + if (++p != pe) + goto _resume; + _test_eof : {} + if (p == eof) { + if (_ControlVerbs_eof_trans[cs] > 0) { + _trans = _ControlVerbs_eof_trans[cs] - 1; + goto _eof_trans; + } + } + + _out : {} + } + + } catch (LocatedParseError &error) { + if (ts >= ptr && ts <= pe) { + error.locate(ts - ptr + start); + } else { + error.locate(0); + } + throw; + } + + return p; +} + +} // namespace ue2 diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index 976208b..4456679 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, u32 count = *count_inout; const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set16x8(0xf); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); diff --git a/src/util/arch.h b/src/util/arch.h index 985fec6..fe4a910 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -61,6 +61,10 @@ #define HAVE_AVX512VBMI #endif +#if defined(__aarch64__) +#define HAVE_NEON +#endif + /* * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros */ @@ -87,4 +91,11 @@ #define NO_ASM #endif +/* + * AARCH64 uses a different form of inline asm + */ +#if defined(__aarch64__) +#define NO_ASM +#endif + #endif // UTIL_ARCH_H_ diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c index c00ce58..e0f6368 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c @@ -39,7 +39,7 @@ u64a cpuid_flags(void) { u64a cap = 0; - +#if defined(__X86_64__) if (check_avx2()) { DEBUG_PRINTF("AVX2 enabled\n"); cap |= HS_CPU_FEATURES_AVX2; @@ -68,7 +68,7 @@ u64a cpuid_flags(void) { (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) cap &= ~HS_CPU_FEATURES_AVX512VBMI; #endif - +#endif return cap; } @@ -78,6 +78,7 @@ struct family_id { u32 tune; }; +#if defined(__X86_64__) /* from table 35-1 of the Intel 64 and IA32 Arch. 
Software Developer's Manual * and "Intel Architecture and Processor Identification With CPUID Model and * Family Numbers" */ @@ -121,6 +122,7 @@ static const struct family_id known_microarch[] = { { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ }; +#endif #ifdef DUMP_SUPPORT static UNUSED @@ -144,6 +146,7 @@ const char *dumpTune(u32 tune) { #endif u32 cpuid_tune(void) { +#if defined(__X86_64__) unsigned int eax, ebx, ecx, edx; cpuid(1, 0, &eax, &ebx, &ecx, &edx); @@ -171,6 +174,6 @@ u32 cpuid_tune(void) { DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); return tune; } - +#endif return HS_TUNE_FAMILY_GENERIC; } diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h index 527c6d5..3125bd1 100644 --- a/src/util/cpuid_flags.h +++ b/src/util/cpuid_flags.h @@ -32,7 +32,9 @@ #include "ue2common.h" #if !defined(_WIN32) && !defined(CPUID_H_) +#if defined(__x86_64__) #include +#endif /* system header doesn't have a header guard */ #define CPUID_H_ #endif diff --git a/src/util/cpuid_inline.h b/src/util/cpuid_inline.h index b7b4245..b228c1d 100644 --- a/src/util/cpuid_inline.h +++ b/src/util/cpuid_inline.h @@ -32,17 +32,20 @@ #include "ue2common.h" #include "cpuid_flags.h" +#if defined(__x86_64__) || defined(_M_X64) #if !defined(_WIN32) && !defined(CPUID_H_) #include /* system header doesn't have a header guard */ #define CPUID_H_ #endif +#endif #ifdef __cplusplus extern "C" { #endif +#if defined(__x86_64__) || defined(_M_X64) static inline void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { @@ -57,6 +60,7 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, *edx = a[3]; #endif } +#endif // ECX #define CPUID_SSE3 (1 << 0) @@ -93,11 +97,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, #define CPUID_XCR0_AVX512 \ (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM) +#if defined(__x86_64__) static inline u64a xgetbv(u32 op) { #if defined(_WIN32) || defined(__INTEL_COMPILER) return _xgetbv(op); -#else +#elif defined(__x86_64__) u32 a, d; __asm__ volatile ( "xgetbv\n" @@ -252,6 +257,16 @@ int check_popcnt(void) { cpuid(1, 0, &eax, &ebx, &ecx, &edx); return !!(ecx & CPUID_POPCNT); } +#endif //__x86_64__ + +static inline +int check_neon(void) { +#if defined(__aarch64__) + return 1; +#else + return 0; +#endif +} #ifdef __cplusplus } /* extern "C" */ diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index edc4f6e..ece3b1a 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -55,10 +55,22 @@ # endif #endif +#ifdef __cplusplus +# if defined(HAVE_CXX_ARM_NEON_H) +# define USE_ARM_NEON_H +# endif +#else // C +# if defined(HAVE_C_ARM_NEON_H) +# define USE_ARM_NEON_H +# endif +#endif + #if defined(USE_X86INTRIN_H) #include #elif defined(USE_INTRIN_H) #include +#elif defined(USE_ARM_NEON_H) +#include #else #error no intrinsics file #endif diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b..7d794d1 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -41,6 +41,8 @@ u32 popcount32(u32 x) { #if defined(HAVE_POPCOUNT_INSTR) // Single-instruction builtin. return _mm_popcnt_u32(x); +#elif defined(HAVE_NEON) + return (u32)vaddlv_u8(vcnt_u8(vcreate_u8((u64a)x))); #else // Fast branch-free version from bit-twiddling hacks as older Intel // processors do not have a POPCNT instruction. 
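For reference, the HAVE_NEON branch added in the hunk above computes the population count in three steps: vcreate_u8 reinterprets the 64-bit input as eight bytes, vcnt_u8 counts the set bits in each byte, and vaddlv_u8 performs a widening sum of the eight lane counts into a scalar. A minimal standalone sketch of the same chain follows (assuming an AArch64 toolchain; the helper name popcount64_neon is illustrative and not part of the patch):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: mirrors the intrinsic chain used by the patch's
 * HAVE_NEON popcount fallback. */
static inline uint32_t popcount64_neon(uint64_t x) {
    uint8x8_t bytes    = vcreate_u8(x);      /* view the 64 bits as 8 bytes */
    uint8x8_t per_byte = vcnt_u8(bytes);     /* per-byte bit counts */
    return (uint32_t)vaddlv_u8(per_byte);    /* widening sum across lanes */
}

int main(void) {
    printf("%u\n", popcount64_neon(0xf0f0f0f0f0f0f0f0ULL)); /* prints 32 */
    return 0;
}

The same chain covers the 32-bit case as well, since casting the u32 argument to u64a zero-extends it and the upper bytes contribute nothing to the sum.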
@@ -63,7 +65,9 @@ u32 popcount64(u64a x) { x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; return (x * 0x0101010101010101) >> 56; -# endif +#endif +#elif defined(HAVE_NEON) + return (u32)vaddlv_u8(vcnt_u8(vcreate_u8((u64a)x))); #else // Synthesise from two 32-bit cases. return popcount32(x >> 32) + popcount32(x); diff --git a/src/util/simd_arm.h b/src/util/simd_arm.h new file mode 100644 index 0000000..cce119f --- /dev/null +++ b/src/util/simd_arm.h @@ -0,0 +1,1069 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * 2020.01 - Use the neon instruction to implement the function of 128-bit operation. + * Huawei Technologies Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef SIMD_ARM +#define SIMD_ARM + +#include "config.h" +#include "simd_types.h" +#include "ue2common.h" +#include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include // for memcpy + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. +#ifdef __cplusplus +#ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +#define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +#endif +#else +#ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +#define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +#endif +#endif + +// Fallback to identity case. 
+#ifndef assume_aligned +#define assume_aligned(x, y) (x) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + +/* +** extend 4.8.5 neon inline assembly functions +*/ +__extension__ static __inline uint64x2_t __attribute__((__always_inline__)) +vmvnq_u64(uint64x2_t a) { + uint64x2_t result; + __asm__("mvn %0.16b,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); + return result; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +static really_inline m128 ones128(void) { + m128 result; + result.vect_s32 = vdupq_n_s32(0xFFFFFFFF); + return result; +} + +static really_inline m128 zeroes128(void) { + m128 result; + result.vect_s32 = vdupq_n_s32(0x0); + return result; +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return !!vaddlvq_s16(veorq_s16(a.vect_s16, b.vect_s16)); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + m128 tmp; + tmp.vect_u32 = vmvnq_u32(vceqq_u32(a.vect_u32, b.vect_u32)); + return ((vgetq_lane_u32(tmp.vect_u32, 3) & 0x8) | + (vgetq_lane_u32(tmp.vect_u32, 2) & 0x4) | + (vgetq_lane_u32(tmp.vect_u32, 1) & 0x2) | + (vgetq_lane_u32(tmp.vect_u32, 0) & 0x1)); +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + m128 tmp; + tmp.vect_u64 = vmvnq_u64(vceqq_u64(a.vect_u64, b.vect_u64)); + return (u32)((vgetq_lane_u64(tmp.vect_u64, 1) & 0x4) | + (vgetq_lane_u64(tmp.vect_u64, 0) & 0x1)); +} + +static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { + assert(b <= 63); + m128 result; + result.vect_s64 = vshlq_n_s64(a.vect_s64, b); + return result; +} + +static really_really_inline m128 rshift64_m128(m128 a, int imm8) { + assert(imm8 >= 0 && imm8 <= 63); + if (unlikely(imm8 == 0)) { + return a; + } + m128 result; + result.vect_u64 = vshrq_n_u64(a.vect_u64, imm8); + return result; +} + +static really_really_inline m128 eq128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vceqq_s8(a.vect_s8, b.vect_s8); + return result; +} + +static really_really_inline u32 movemask128(m128 a) { + m128 result; + result.vect_u8 = vshrq_n_u8(a.vect_u8, 7); + result.vect_u16 = vsraq_n_u16(result.vect_u16, result.vect_u16, 7); + result.vect_u32 = vsraq_n_u32(result.vect_u32, result.vect_u32, 14); + result.vect_u64 = vsraq_n_u64(result.vect_u64, result.vect_u64, 28); + return (u32)(vgetq_lane_u8(result.vect_u8, 0) | + ((u32)vgetq_lane_u8(result.vect_u8, 8) << 8)); +} + +static really_really_inline m128 rshiftbyte_m128(m128 a, int imm8) { + assert(imm8 >= 0 && imm8 <= 15); + m128 result; + result.vect_s8 = vextq_s8(a.vect_s8, vdupq_n_s8(0), imm8); + return result; +} + +static really_really_inline m128 lshiftbyte_m128(m128 a, int imm8) { + assert(imm8 >= 0 && imm8 <= 15); + m128 result; + if (unlikely(imm8 == 0)) { + return a; + } + result.vect_s8 = vextq_s8(vdupq_n_s8(0), a.vect_s8, (16 - imm8)); + return result; +} + +static really_inline m128 set16x8(u8 c) { + m128 result; + result.vect_s8 = vdupq_n_s8(c); + return result; +} + +static really_inline m128 set4x32(u32 c) { + m128 result; + result.vect_s32 
= vdupq_n_s32(c); + return result; +} + +static really_inline m128 set2x64(u64a c) { + m128 result; + result.vect_u64 = vdupq_n_u64(c); + return result; +} + +static really_inline u32 movd(const m128 in) { + u32 result; + result = vgetq_lane_u32(in.vect_u32, 0); + return result; +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64(in.vect_u64, 0); +} + +/* another form of movq */ +static really_inline m128 load_m128_from_u64a(const u64a *p) { + m128 result; + __asm__ __volatile__("ldr %d0, %1 \n\t" + : "=w"(result) + : "Utv"(*p) + : /* No clobbers */ + ); + return result; +} + +/*The x86 platform does not perform the lower 2 bit operation. +If the value of imm exceeds 2 bit, a compilation error occurs.*/ +static really_inline u32 extract32from128(m128 a, int imm) { + return vgetq_lane_s32(a.vect_s32, imm & 0x0003); +} + +/*The x86 platform does not perform the lower 1 bit operation. +If the value of imm exceeds 1 bit, a compilation error occurs.*/ +static really_inline u64a extract64from128(m128 a, int imm) { + return vgetq_lane_s64(a.vect_s64, imm & 0x0001); +} + +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) + +/*The x86 platform does not perform the lower 2 bit operation. +If the value of imm exceeds 2 bit, a compilation error occurs.*/ +static really_inline u32 extract32from256(m256 a, int imm) { + return vgetq_lane_s32((imm >> 2) ? a.hi.vect_s32 : a.lo.vect_s32, + imm & 0x0003); +} + +/*The x86 platform does not perform the lower 1 bit operation. +If the value of imm exceeds 1 bit, a compilation error occurs.*/ +static really_inline u64a extract64from256(m256 a, int imm) { + return vgetq_lane_s64((imm >> 1) ? a.hi.vect_s64 : a.lo.vect_s64, + imm & 0x0001); +} + +static really_inline m128 and128(m128 a, m128 b) { + m128 result; + result.vect_s32 = vandq_s32(a.vect_s32, b.vect_s32); + return result; +} + +static really_inline m128 not128(m128 a) { + m128 result; + result.vect_s32 = vmvnq_s32(a.vect_s32); + return result; +} + +static really_inline m128 xor128(m128 a, m128 b) { + m128 result; + result.vect_s32 = veorq_s32(a.vect_s32, b.vect_s32); + return result; +} + +static really_inline m128 or128(m128 a, m128 b) { + m128 result; + result.vect_s32 = vorrq_s32(a.vect_s32, b.vect_s32); + return result; +} + +static really_inline m128 andnot128(m128 a, m128 b) { + m128 result; + result.vect_s32 = vbicq_s32(b.vect_s32, a.vect_s32); + return result; +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + m128 result; + result.vect_s32 = vld1q_s32((const int32_t *)ptr); + return result; +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + m128 result; + result.vect_s32 = vld1q_s32((const int32_t *)ptr); + return result; +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a.vect_s32); +} + +// packed unaligned store of first N bytes +static really_inline void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; 
+} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + return isnonzero128(and128(mask, val)); +} + +// offset must be an immediate +/*The x86 platform does not perform the lower 8 bit operation. +If the value of imm exceeds 8 bit, a compilation error occurs.*/ +static really_inline m128 palignr(m128 a, m128 b, int count) { + m128 result; + count = count & 0xff; + if (likely(count < 16)) { + result.vect_s8 = vextq_s8(b.vect_s8, a.vect_s8, count); + } else if (count < 32) { + result.vect_s8 = vextq_s8(a.vect_s8, vdupq_n_s8(0x0), count - 16); + } else { + result.vect_s32 = vdupq_n_s32(0); + } + return result; +} + +static really_inline m128 pshufb_m128(m128 a, m128 b) { + m128 result; + __asm__ __volatile__("movi v3.16b, 0x8f \n\t" + "and v3.16b, v3.16b, %2.16b \n\t" + "tbl %0.16b, {%1.16b}, v3.16b \n\t" + : "=w"(result) + : "w"(a), "w"(b) + : "v3"); + return result; +} + +static really_inline m256 pshufb_m256(m256 a, m256 b) { + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +} + +static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline m128 max_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vmaxq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 min_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vminq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 sadd_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vqaddq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 sub_u8_m128(m128 a, m128 b) { + m128 result; + result.vect_u8 = vsubq_u8(a.vect_u8, b.vect_u8); + return result; +} + +static really_inline m128 set64x2(int64_t hi, int64_t lo) { + m128 result; + result.vect_s64 = vsetq_lane_s64(hi, vdupq_n_s64(lo), 1); + return result; +} + +static really_inline m128 set32x4(int i3, int i2, int i1, int i0) { + m128 result; + result.vect_s32 = vsetq_lane_s32( + i3, vsetq_lane_s32(i2, vsetq_lane_s32(i1, vdupq_n_s32(i0), 1), 2), 3); + return result; +} + +/**** + **** 256-bit Primitives + ****/ + +static really_really_inline m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8)in); + rv.hi = rv.lo; + return rv; +} + +static really_inline m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + 
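The pshufb_m128 emulation defined above relies on the AArch64 TBL instruction, which returns zero for any index outside the 16-byte table. Masking the control vector with 0x8f keeps the low four index bits but preserves bit 7, so control bytes with the top bit set index out of range and produce zero, matching the SSSE3 _mm_shuffle_epi8 contract. A small intrinsics-only sketch of the same idea (the patch itself uses inline assembly; pshufb_neon is an illustrative name, not part of the change):

#include <arm_neon.h>

/* Sketch only: intrinsics equivalent of the patch's inline-asm pshufb_m128.
 * TBL yields zero for out-of-range indices, so keeping bit 7 of each control
 * byte reproduces pshufb's "high bit set => zero the result byte" rule. */
static inline uint8x16_t pshufb_neon(uint8x16_t table, uint8x16_t control) {
    uint8x16_t idx = vandq_u8(control, vdupq_n_u8(0x8f));
    return vqtbl1q_u8(table, idx);
}

The 256-bit variant then simply applies this per 128-bit half, the same lo/hi composition used for the rest of the m256 primitives in this header.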
+static really_inline u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} + +static really_inline m256 zeroes256(void) { + m256 rv = {zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m256 ones256(void) { + m256 rv = {ones128(), ones128()}; + return rv; +} + +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} + +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} + +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_inline int diff256(m256 a, m256 b) { + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero256(m256 a) { + return isnonzero128(or128(a.lo, a.hi)); +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { + uint32x4_t x = vceqq_s32(a.lo.vect_s32, b.lo.vect_s32); + uint32x4_t y = vceqq_s32(a.hi.vect_s32, b.hi.vect_s32); + uint8x8_t lo = vqmovn_u16(vcombine_u16(vqmovn_u32(x), vqmovn_u32(y))); + + static const int8_t __attribute__((aligned(16))) + xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + return ~(lo[0] & 0xFF) & 0xff; +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m256 rv = {load128(ptr), load128((const char *)ptr + 16)}; + return rv; +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { + m256 rv = {loadu128(ptr), loadu128((const char *)ptr + 16)}; + return rv; +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +} + +// packed unaligned store of first N bytes +static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +} + +// switches on bit N in the given vector. +static really_inline void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. 
+static really_inline char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline m128 movdq_hi(m256 x) { return x.hi; } + +static really_really_inline m128 movdq_lo(m256 x) { return x.lo; } + +static really_inline m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + uint32x4_t x = vceqq_s32(a.lo.vect_s32, b.lo.vect_s32); + uint32x4_t y = vceqq_s32(a.mid.vect_s32, b.mid.vect_s32); + uint32x4_t w = vceqq_s32(a.hi.vect_s32, b.hi.vect_s32); + + uint16x8_t q = vcombine_u16(vqmovn_u32(x), vqmovn_u32(y)); + uint16x8_t p = vcombine_u16(vqmovn_u32(w), vqmovn_u32(z.vect_u32)); + + uint8x16_t input = vcombine_u8(vqmovn_u16(q), vqmovn_u16(p)); + + static const int8_t __attribute__((aligned(16))) + xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ~((hi[0] << 8) | (lo[0] & 0xFF)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = {load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32)}; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = {loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. +static really_inline char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +static really_inline m512 zeroes512(void) { + m512 rv = {zeroes256(), zeroes256()}; + return rv; +} + +static really_inline m512 ones512(void) { + m512 rv = {ones256(), ones256()}; + return rv; +} + +static really_inline m512 and512(m512 a, m512 b) { + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +} + +static really_inline m512 or512(m512 a, m512 b) { + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +} + +static really_inline m512 xor512(m512 a, m512 b) { + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +} + +static really_inline m512 not512(m512 a) { + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +} + +static really_inline m512 andnot512(m512 a, m512 b) { + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +} + +static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} + +static really_inline int diff512(m512 a, m512 b) { + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +} + +static really_inline int isnonzero512(m512 a) { + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. 
+ */ +static really_inline u32 diffrich512(m512 a, m512 b) { + uint32x4_t x = vceqq_s32(a.lo.lo.vect_s32, b.lo.lo.vect_s32); + uint32x4_t y = vceqq_s32(a.lo.hi.vect_s32, b.lo.hi.vect_s32); + uint32x4_t z = vceqq_s32(a.hi.lo.vect_s32, b.hi.lo.vect_s32); + uint32x4_t w = vceqq_s32(a.hi.hi.vect_s32, b.hi.hi.vect_s32); + uint16x8_t p = vcombine_u16(vqmovn_u32(x), vqmovn_u32(y)); + uint16x8_t q = vcombine_u16(vqmovn_u32(z), vqmovn_u32(w)); + + uint8x16_t input = vcombine_u8(vqmovn_u16(p), vqmovn_u16(q)); + + static const int8_t __attribute__((aligned(16))) + xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ~((hi[0] << 8) | (lo[0] & 0xFF)) & 0xffff; +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_512(m512 a, m512 b) { + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m512 load512(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = {load256(ptr), load256((const char *)ptr + 32)}; + return rv; +} + +// aligned store +static really_inline void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +} + +// unaligned load +static really_inline m512 loadu512(const void *ptr) { + m512 rv = {loadu256(ptr), loadu256((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
+static really_inline char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +} +#pragma GCC diagnostic pop + +#endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 962cad6..62d39ec 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -30,28 +30,58 @@ #define SIMD_TYPES_H #include "config.h" +#include "ue2common.h" #include "util/arch.h" #include "util/intrinsics.h" -#include "ue2common.h" #if defined(HAVE_SSE2) +typedef __m128i m128; +#elif defined(HAVE_NEON) +#include "arm_neon.h" + +typedef union { + int8x16_t vect_s8; + int16x8_t vect_s16; + int32x4_t vect_s32; + int64x2_t vect_s64; + uint8x16_t vect_u8; + uint16x8_t vect_u16; + uint32x4_t vect_u32; + uint64x2_t vect_u64; +} __m128i; +typedef float32x4_t __m128; +typedef float64x2_t __m128d; + typedef __m128i m128; #else -typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; +typedef struct ALIGN_DIRECTIVE { + u64a hi; + u64a lo; +} m128; + #endif #if defined(HAVE_AVX2) typedef __m256i m256; #else -typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; +typedef struct ALIGN_AVX_DIRECTIVE { + m128 lo; + m128 hi; +} m256; #endif -typedef struct {m128 lo; m128 mid; m128 hi;} m384; +typedef struct { + m128 lo; + m128 mid; + m128 hi; +} m384; #if defined(HAVE_AVX512) typedef __m512i m512; #else -typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; +typedef struct ALIGN_ATTR(64) { + m256 lo; + m256 hi; +} m512; #endif #endif /* SIMD_TYPES_H */ - diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index d1f060b..7e926b2 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -26,1395 +26,14 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file - * \brief SIMD types and primitive operations. - */ - #ifndef SIMD_UTILS #define SIMD_UTILS -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif - -#include "config.h" -#include "ue2common.h" -#include "simd_types.h" -#include "unaligned.h" -#include "util/arch.h" -#include "util/intrinsics.h" - -#include // for memcpy - -// Define a common assume_aligned using an appropriate compiler built-in, if -// it's available. Note that we need to handle C or C++ compilation. -#ifdef __cplusplus -# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED -# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -# endif -#else -# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED -# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) -# endif -#endif - -// Fallback to identity case. -#ifndef assume_aligned -#define assume_aligned(x, y) (x) -#endif - -#ifdef __cplusplus -extern "C" { -#endif -extern const char vbs_mask_data[]; -#ifdef __cplusplus -} -#endif - -static really_inline m128 ones128(void) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) - /* gcc gets this right */ - return _mm_set1_epi8(0xFF); -#else - /* trick from Intel's optimization guide to generate all-ones. 
- * ICC converts this to the single cmpeq instruction */ - return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -#endif -} - -static really_inline m128 zeroes128(void) { - return _mm_setzero_si128(); -} - -/** \brief Bitwise not for m128*/ -static really_inline m128 not128(m128 a) { - return _mm_xor_si128(a, ones128()); -} - -/** \brief Return 1 if a and b are different otherwise 0 */ -static really_inline int diff128(m128 a, m128 b) { - return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -} - -static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); -} - -/** - * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich128(m128 a, m128 b) { - a = _mm_cmpeq_epi32(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -} - -/** - * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and - * returns a 4-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_128(m128 a, m128 b) { -#if defined(HAVE_SSE41) - a = _mm_cmpeq_epi64(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -#else - u32 d = diffrich128(a, b); - return (d | (d >> 1)) & 0x5; -#endif -} - -static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm_sll_epi64(a, x); -} - -#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -#if defined(HAVE_AVX512) -static really_inline m128 cast512to128(const m512 in) { - return _mm512_castsi512_si128(in); -} -#endif - -static really_inline m128 set16x8(u8 c) { - return _mm_set1_epi8(c); -} - -static really_inline m128 set4x32(u32 c) { - return _mm_set1_epi32(c); -} - -static really_inline u32 movd(const m128 in) { - return _mm_cvtsi128_si32(in); -} - -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. - return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} - -static really_inline u64a movq512(const m512 in) { - // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), - // so we use 2-step convertions to work around. 
- return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); -} -#endif - -static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) - return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif -} - -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return _mm_set_epi64x(0LL, *p); -} - -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) - -#if defined(HAVE_SSE41) -#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -#else -#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -#endif - -#if !defined(HAVE_AVX2) -// TODO: this entire file needs restructuring - this carveout is awful -#define extractlow64from256(a) movq(a.lo) -#define extractlow32from256(a) movd(a.lo) -#if defined(HAVE_SSE41) -#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -#else -#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) -#endif - -#endif // !AVX2 - -static really_inline m128 and128(m128 a, m128 b) { - return _mm_and_si128(a,b); -} - -static really_inline m128 xor128(m128 a, m128 b) { - return _mm_xor_si128(a,b); -} - -static really_inline m128 or128(m128 a, m128 b) { - return _mm_or_si128(a,b); -} - -#if defined(HAVE_AVX512VBMI) -static really_inline m512 expand128(m128 a) { - return _mm512_broadcast_i32x4(a); -} - -static really_inline m512 expand256(m256 a) { - return _mm512_broadcast_i64x4(a); -} - -static really_inline m512 expand384(m384 a) { - u64a *lo = (u64a*)&a.lo; - u64a *mid = (u64a*)&a.mid; - u64a *hi = (u64a*)&a.hi; - return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], - lo[1], lo[0]); -} +#if defined(__x86_64__) +#include "simd_x86.h" +#elif defined(__aarch64__) +#include "simd_arm.h" #endif -static really_inline m128 andnot128(m128 a, m128 b) { - return _mm_andnot_si128(a, b); -} - -// aligned load -static really_inline m128 load128(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - return _mm_load_si128((const m128 *)ptr); -} - -// aligned store -static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - *(m128 *)ptr = a; -} - -// unaligned load -static really_inline m128 loadu128(const void *ptr) { - return _mm_loadu_si128((const m128 *)ptr); -} - -// unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - _mm_storeu_si128 ((m128 *)ptr, a); -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes128(void *ptr, m128 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m128 loadbytes128(const void *ptr, unsigned int n) { - m128 a = zeroes128(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} #endif -static really_inline -m128 
mask1bit128(unsigned int n) { - assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit128(m128 *ptr, unsigned int n) { - *ptr = or128(mask1bit128(n), *ptr); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit128(m128 *ptr, unsigned int n) { - *ptr = andnot128(mask1bit128(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit128(m128 val, unsigned int n) { - const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else - return isnonzero128(and128(mask, val)); -#endif -} - -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) - -static really_inline -m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; -} - -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - -static really_inline -m128 max_u8_m128(m128 a, m128 b) { - return _mm_max_epu8(a, b); -} - -static really_inline -m128 min_u8_m128(m128 a, m128 b) { - return _mm_min_epu8(a, b); -} - -static really_inline -m128 sadd_u8_m128(m128 a, m128 b) { - return _mm_adds_epu8(a, b); -} - -static really_inline -m128 sub_u8_m128(m128 a, m128 b) { - return _mm_sub_epi8(a, b); -} - -static really_inline -m128 set64x2(u64a hi, u64a lo) { - return _mm_set_epi64x(hi, lo); -} - -/**** - **** 256-bit Primitives - ****/ - -#if defined(HAVE_AVX2) - -static really_really_inline -m256 lshift64_m256(m256 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm256_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm256_sll_epi64(a, x); -} - -#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) - -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); -} - -#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) - -static really_inline -m256 set2x128(m128 a) { - return _mm256_broadcastsi128_si256(a); -} - -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - 
rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - -static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) - return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif -} - -static really_inline m256 ones256(void) { -#if defined(HAVE_AVX2) - m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif - return rv; -} - -#if defined(HAVE_AVX2) -static really_inline m256 and256(m256 a, m256 b) { - return _mm256_and_si256(a, b); -} -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 or256(m256 a, m256 b) { - return _mm256_or_si256(a, b); -} -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 xor256(m256 a, m256 b) { - return _mm256_xor_si256(a, b); -} -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 not256(m256 a) { - return _mm256_xor_si256(a, ones256()); -} -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 andnot256(m256 a, m256 b) { - return _mm256_andnot_si256(a, b); -} -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif - -static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif -} - -static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) - return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif -} - -/** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - a = _mm256_cmpeq_epi32(a, b); - return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif -} - -/** - * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and - * returns an 8-bit mask indicating which 64-bit words contain differences. 
- */ -static really_inline u32 diffrich64_256(m256 a, m256 b) { - u32 d = diffrich256(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m256 load256(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif -} - -// aligned load of 128-bit value to low and high part of 256-bit value -static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) - return set2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif -} - -static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); -} - -// aligned store -static really_inline void store256(void *ptr, m256 a) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif -} - -// unaligned load -static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) - return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif -} - -// unaligned store -static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) - _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes256(void *ptr, m256 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m256 loadbytes256(const void *ptr, unsigned int n) { - m256 a = zeroes256(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m256 mask1bit256(unsigned int n) { - assert(n < sizeof(m256) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu256(&simd_onebit_masks[mask_idx]); -} - -static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -#if defined(HAVE_AVX2) - return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); - return rv; -#endif -} - -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - -// switches on bit N in the given vector. 
-static really_inline -void setbit256(m256 *ptr, unsigned int n) { - *ptr = or256(mask1bit256(n), *ptr); -} - -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - *ptr = andnot256(mask1bit256(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - const m256 mask = mask1bit256(n); - return !_mm256_testz_si256(mask, val); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return _mm256_extracti128_si256(x, 1); -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return _mm256_extracti128_si256(x, 0); -} - -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { -#if defined(_mm256_set_m128i) - return _mm256_set_m128i(hi, lo); -#else - return insert128to256(cast128to256(lo), hi, 1); -#endif -} -#endif //AVX2 - -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.mid = _mm_cmpeq_epi32(a.mid, b.mid); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), - _mm_packs_epi32(a.hi, z)); - return ~(_mm_movemask_epi8(packed)) & 0xfff; -} - -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. 
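
For reference, the 12-bit result of the diffrich384() helper above has a simple scalar meaning; the SIMD code obtains it with three _mm_cmpeq_epi32 compares, two packs and a movemask. The following reference function (diffrich384_ref is a hypothetical name used only for illustration, not part of the patch) spells that meaning out:

#include <stdint.h>

/* Scalar reference for diffrich384(a, b): bit i of the result is set when
 * the i-th 32-bit word of the two 384-bit vectors differs (12 words total,
 * lo words in the low bits). */
static uint32_t diffrich384_ref(const uint32_t a[12], const uint32_t b[12]) {
    uint32_t out = 0;
    for (unsigned i = 0; i < 12; i++) {
        if (a[i] != b[i]) {
            out |= 1u << i;
        }
    }
    return out;
}
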
-static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} - -/**** - **** 512-bit Primitives - ****/ - -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) - -static really_inline -m512 zeroes512(void) { -#if defined(HAVE_AVX512) - return _mm512_setzero_si512(); -#else - m512 rv = {zeroes256(), zeroes256()}; - return rv; -#endif -} - -static really_inline -m512 ones512(void) { -#if defined(HAVE_AVX512) - return _mm512_set1_epi8(0xFF); - //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 set64x8(u8 a) { - return _mm512_set1_epi8(a); -} - -static really_inline -m512 set8x64(u64a a) { - return _mm512_set1_epi64(a); -} - -static really_inline -m512 set16x32(u32 a) { - return _mm512_set1_epi32(a); -} - -static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, - u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { - return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, - lo_3, lo_2, lo_1, lo_0); -} - -static really_inline -m512 swap256in512(m512 a) { - m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); - return vpermq512(idx, a); -} - -static really_inline -m512 set4x128(m128 a) { - return _mm512_broadcast_i32x4(a); -} - -static really_inline -m512 sadd_u8_m512(m512 a, m512 b) { - return _mm512_adds_epu8(a, b); -} - -static really_inline -m512 max_u8_m512(m512 a, m512 b) { - return _mm512_max_epu8(a, b); -} - -static really_inline -m512 min_u8_m512(m512 a, m512 b) { - return _mm512_min_epu8(a, b); -} - -static really_inline -m512 sub_u8_m512(m512 a, m512 b) { - return _mm512_sub_epi8(a, b); -} -#endif - -static really_inline -m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 not512(m512 a) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif -} - -static really_inline -m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm512_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm512_sll_epi64(a, x); -} -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif - -#if defined(HAVE_AVX512) 
-#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - -#if !defined(_MM_CMPINT_NE) -#define _MM_CMPINT_NE 0x4 -#endif - -static really_inline -int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -#endif -} - -static really_inline -int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) - return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif -} - -/** - * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline -u32 diffrich512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif -} - -/** - * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and - * returns a 16-bit mask indicating which 64-bit words contain differences. - */ -static really_inline -u32 diffrich64_512(m512 a, m512 b) { - //TODO: cmp_epi64? 
- u32 d = diffrich512(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline -m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif -} - -// aligned store -static really_inline -void store512(void *ptr, m512 a) { - assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) - return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif -} - -// unaligned load -static really_inline -m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif -} - -// unaligned store -static really_inline -void storeu512(void *ptr, m512 a) { -#if defined(HAVE_AVX512) - _mm512_storeu_si512((m512 *)ptr, a); -#elif defined(HAVE_AVX2) - storeu256(ptr, a.lo); - storeu256((char *)ptr + 32, a.hi); -#else - storeu128(ptr, a.lo.lo); - storeu128((char *)ptr + 16, a.lo.hi); - storeu128((char *)ptr + 32, a.hi.lo); - storeu128((char *)ptr + 48, a.hi.hi); -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { - return _mm512_maskz_loadu_epi8(k, ptr); -} - -static really_inline -m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { - return _mm512_mask_loadu_epi8(src, k, ptr); -} - -static really_inline -void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { - _mm512_mask_storeu_epi8(ptr, k, a); -} - -static really_inline -m512 set_mask_m512(__mmask64 k) { - return _mm512_movm_epi8(k); -} - -static really_inline -m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { - return _mm256_maskz_loadu_epi8(k, ptr); -} -#endif - -// packed unaligned store of first N bytes -static really_inline -void storebytes512(void *ptr, m512 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m512 loadbytes512(const void *ptr, unsigned int n) { - m512 a = zeroes512(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m512 mask1bit512(unsigned int n) { - assert(n < sizeof(m512) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu512(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif -} - -// switches off bit N in the given vector. 
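
The storebytes*/loadbytes* pairs above use plain memcpy for partial-vector moves, so the same pattern works unchanged at every vector width. A minimal sketch of that pattern, shown here for a 16-byte case with hypothetical names and a plain byte buffer in place of an m128:

#include <assert.h>
#include <string.h>

/* Packed unaligned load of the first n bytes, zero padding the rest. */
static void loadbytes16_ref(unsigned char out[16], const void *ptr, unsigned int n) {
    assert(n <= 16);
    memset(out, 0, 16);   /* pad with zero */
    memcpy(out, ptr, n);  /* copy only the valid prefix */
}

/* Packed unaligned store of the first n bytes. */
static void storebytes16_ref(void *ptr, const unsigned char in[16], unsigned int n) {
    assert(n <= 16);
    memcpy(ptr, in, n);
}
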
-static really_inline -void clearbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif -} - -// tests bit N in the given vector. -static really_inline -char testbit512(m512 val, unsigned int n) { - assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - const m512 mask = mask1bit512(n); - return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif -} - -#endif diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h new file mode 100644 index 0000000..59ac642 --- /dev/null +++ b/src/util/simd_x86.h @@ -0,0 +1,1334 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef SIMD_X86 +#define SIMD_X86 + +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + +#include "config.h" +#include "ue2common.h" +#include "simd_types.h" +#include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include // for memcpy + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. 
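
As context for the macro defined next: __builtin_assume_aligned() (GCC 4.7+/Clang) returns a copy of the pointer that the compiler may treat as aligned, and the fallback case is the identity. A stand-alone sketch of how callers such as load128() or store384() benefit from the hint; my_assume_aligned and sum4 are hypothetical names used only for illustration:

#include <stdint.h>

/* Hypothetical stand-in for the assume_aligned macro defined below: use the
 * GCC/Clang built-in when available, otherwise leave the pointer untouched. */
#if defined(__GNUC__) || defined(__clang__)
#define my_assume_aligned(x, y) __builtin_assume_aligned((x), (y))
#else
#define my_assume_aligned(x, y) (x)
#endif

/* Example caller: the alignment hint lets the compiler emit aligned loads. */
static uint32_t sum4(const void *ptr) {
    const uint32_t *p = (const uint32_t *)my_assume_aligned(ptr, 16);
    return p[0] + p[1] + p[2] + p[3];
}
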
+#ifdef __cplusplus +# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#else +# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#endif + +// Fallback to identity case. +#ifndef assume_aligned +#define assume_aligned(x, y) (x) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. + * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline m128 set2x64(u64a c) { + return _mm_set1_epi32(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 
3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. 
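
The mask1bit128()/setbit128()/clearbit128() trio above treats the vector as a 128-bit little-endian bitfield; the lookup-table load from simd_onebit_masks simply avoids a variable shift. A scalar reference of what mask1bit128(n) is expected to produce (mask1bit128_ref is a hypothetical illustration, not part of the patch):

#include <assert.h>
#include <string.h>

/* Reference semantics for mask1bit128(n): a 16-byte value in which only bit
 * n is set, i.e. byte n/8 holds (1 << (n % 8)) and every other byte is 0.
 * The real helper obtains the same value with one unaligned load from a
 * sliding window inside the precomputed simd_onebit_masks table. */
static void mask1bit128_ref(unsigned int n, unsigned char out[16]) {
    assert(n < 128);
    memset(out, 0, 16);
    out[n / 8] = (unsigned char)(1u << (n % 8));
}
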
+static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} +#endif + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +static really_inline +m128 set32x4(int i3, int i2, int i1, int i0) { + return _mm_set_epi32(i3, i2, i1, i0); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline +m256 set32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +#else + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline +m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8) in); + rv.hi = rv.lo; + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} +#endif + +static really_inline m256 zeroes256(void) { +#if defined(HAVE_AVX2) + return _mm256_setzero_si256(); +#else + m256 rv = {zeroes128(), zeroes128()}; + return rv; +#endif +} + +static really_inline m256 ones256(void) { +#if defined(HAVE_AVX2) + m256 rv = _mm256_set1_epi8(0xFF); +#else + 
m256 rv = {ones128(), ones128()}; +#endif + return rv; +} + +#if defined(HAVE_AVX2) +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} +#else +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} +#else +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} +#else +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} +#else +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} +#else +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} +#endif + +static really_inline int diff256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +#else + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +#endif +} + +static really_inline int isnonzero256(m256 a) { +#if defined(HAVE_AVX2) + return !!diff256(a, zeroes256()); +#else + return isnonzero128(or128(a.lo, a.hi)); +#endif +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +#else + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); + return ~(_mm_movemask_epi8(packed)) & 0xff; +#endif +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + return _mm256_load_si256((const m256 *)ptr); +#else + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +#endif +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { +#if defined(HAVE_AVX2) + return set2x128(load128(ptr)); +#else + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +#endif +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + _mm256_store_si256((m256 *)ptr, a); +#else + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +#endif +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { +#if defined(HAVE_AVX2) + return _mm256_loadu_si256((const m256 *)ptr); +#else + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +#endif +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(HAVE_AVX2) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + +#if !defined(HAVE_AVX2) +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +#else // AVX2 + +// switches on bit N in the given vector. 
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.mid = _mm_cmpeq_epi32(a.mid, b.mid); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), + _mm_packs_epi32(a.hi, z)); + return ~(_mm_movemask_epi8(packed)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
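
The 384-bit bit operations above first route the bit index into one of the three 128-bit lanes (lo, mid, hi) and then defer to the 128-bit helpers. A scalar analogue of that routing, using a hypothetical helper that works on a plain 48-byte buffer instead of an m384:

#include <assert.h>

/* Scalar analogue of setbit384(): choose the 128-bit lane that holds bit n
 * (lane 0 = lo, 1 = mid, 2 = hi), then set bit n % 128 within that lane. */
static void setbit384_ref(unsigned char vec[48], unsigned int n) {
    assert(n < 384);
    unsigned char *lane = vec + (n / 128) * 16;
    unsigned int bit = n % 128;
    lane[bit / 8] |= (unsigned char)(1u << (bit % 8));
}
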
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +#endif +} + +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} +#endif + +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else + return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); +#endif +} + +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +#endif +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +#else + a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); + a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); + a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); + a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), + _mm_packs_epi32(a.hi.lo, a.hi.hi)); + return ~(_mm_movemask_epi8(packed)) & 0xffff; +#endif +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +#endif +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +#else + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +#endif +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} +#endif + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
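
diffrich64_512() above derives its per-64-bit-word mask from the 32-bit-word mask: each 64-bit word covers two adjacent bits of diffrich512(), so OR-ing each bit pair and keeping the even positions gives the answer. A scalar restatement of that fold (fold_diffrich32_to_64 is a hypothetical name for illustration):

#include <stdint.h>

/* Fold a per-32-bit-word difference mask into a per-64-bit-word mask.
 * Example: d = 0x000C (32-bit words 2 and 3 differ) folds to 0x0004,
 * i.e. the even-position bit for 64-bit word 1. */
static uint32_t fold_diffrich32_to_64(uint32_t d) {
    return (d | (d >> 1)) & 0x55555555;
}
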
+static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +#endif +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +#endif +} + +// tests bit N in the given vector. +static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); +#if !defined(HAVE_AVX2) + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +#else + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +#endif +} + +#endif diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 7238849..4422403 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return _mm_set_epi32(x[3], x[2], x[1], x[0]); + return set32x4(x[3], x[2], x[1], x[0]); } #endif @@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. 
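
The loadcompressed*() hunks in this area rebuild full vectors from compressed state with expand32()/expand64() and then repack the words with the new set32x4()/set64x2() wrappers. Assuming expand64() has the usual bit-deposit (PDEP-style) semantics of the bit utilities used elsewhere in this code base, a portable reference looks like the following; expand64_ref is an illustrative name, not the function the patch calls:

#include <stdint.h>

/* PDEP-style expand: deposit the low bits of x, in order, into the
 * positions of the set bits of m. Illustration only, written under the
 * assumption that this matches the expand64() used above. */
static uint64_t expand64_ref(uint64_t x, uint64_t m) {
    uint64_t out = 0;
    for (uint64_t bb = 1; m != 0; bb += bb) {
        if (x & bb) {
            out |= m & (~m + 1); /* lowest set bit of m */
        }
        m &= m - 1;              /* clear lowest set bit */
    }
    return out;
}
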
- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u64a v[2]; @@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - return _mm_set_epi64x(x[1], x[0]); + return set64x2(x[1], x[0]); } #endif @@ -264,8 +264,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .hi = set32x4(x[7], x[6], x[5], x[4]) }; #else m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); @@ -291,8 +291,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .hi = _mm_set_epi64x(x[3], x[2]) }; + m256 xvec = { .lo = set64x2(x[1], x[0]), + .hi = set64x2(x[3], x[2]) }; #else m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); #endif @@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), - .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .mid = set32x4(x[7], x[6], x[5], x[4]), + .hi = set32x4(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]) }; - m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .mid = _mm_set_epi64x(x[3], x[2]), - .hi = _mm_set_epi64x(x[5], x[4]) }; + m384 xvec = { .lo = set64x2(x[1], x[0]), + .mid = set64x2(x[3], x[2]), + .hi = set64x2(x[5], x[4]) }; return xvec; } #endif @@ -558,10 +558,10 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); - xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); - xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); - xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); #endif return xvec; } @@ -594,10 +594,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; #else - m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), - _mm_set_epi64x(x[3], x[2]) }, - .hi = { _mm_set_epi64x(x[5], x[4]), - _mm_set_epi64x(x[7], x[6]) } }; + m512 xvec = { .lo = { set64x2(x[1], x[0]), + set64x2(x[3], x[2]) }, + .hi = { set64x2(x[5], x[4]), + set64x2(x[7], x[6]) } }; #endif return xvec; } diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index a4d71b2..0c41ab9 100644 --- a/tools/hscollider/CMakeLists.txt +++ b/tools/hscollider/CMakeLists.txt @@ -21,7 +21,14 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS} 
-I${CMAKE_CURRENT_SOURCE_DIR}") -ragelmaker(ColliderCorporaParser.rl) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + ragelmaker(ColliderCorporaParser.rl) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + ragelcopyer(ColliderCorporaParser.rl) +endif() if (BUILD_CHIMERA) add_definitions(-DHS_HYBRID) diff --git a/tools/hscollider/ColliderCorporaParser.cpp b/tools/hscollider/ColliderCorporaParser.cpp new file mode 100644 index 0000000..5391473 --- /dev/null +++ b/tools/hscollider/ColliderCorporaParser.cpp @@ -0,0 +1,474 @@ + + +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ColliderCorporaParser.h" +#include "Corpora.h" + +#include "ue2common.h" + +#include +#include +#include +#include + +using namespace std; + +namespace /* anonymous */ { + +// Take a string like '\xFF' and convert it to the character it represents +char unhex(const char *start, UNUSED const char *end) { + assert(start + 4 == end); + assert(start[0] == '\\'); + assert(start[1] == 'x'); + assert(isxdigit(start[2])); + assert(isxdigit(start[2])); + + char temp[3] = {start[2], start[3], 0}; + + return strtol(temp, nullptr, 16); +} + +static const char _FileCorporaParser_actions[] = { + 0, 1, 0, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, + 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, + 20, 1, 21, 1, 22, 1, 23, 1, 24, 2, 0, 2, 2, 3, 0, 3, 1, 0, 2}; + +static const char _FileCorporaParser_key_offsets[] = { + 0, 0, 2, 6, 7, 13, 19, 25, 31, 34, 34, 35, 52, 54, 71, 72, 75, 79}; + +static const char _FileCorporaParser_trans_keys[] = { + 48, 57, 58, 61, 48, 57, 34, 48, 57, 65, 70, 97, 102, 48, + 57, 65, 70, 97, 102, 48, 57, 65, 70, 97, 102, 48, 57, 65, + 70, 97, 102, 32, 48, 57, 92, 48, 97, 110, 114, 116, 118, 120, + 49, 57, 65, 90, 98, 100, 101, 102, 103, 122, 34, 92, 48, 97, + 110, 114, 116, 118, 120, 49, 57, 65, 90, 98, 100, 101, 102, 103, + 122, 58, 32, 48, 57, 32, 44, 48, 57, 32, 44, 0}; + +static const char _FileCorporaParser_single_lengths[] = { + 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 7, 2, 7, 1, 1, 2, 2}; + +static const char _FileCorporaParser_range_lengths[] = { + 0, 1, 1, 0, 3, 3, 3, 3, 1, 0, 0, 5, 0, 5, 0, 1, 1, 0}; + +static const char _FileCorporaParser_index_offsets[] = { + 0, 0, 2, 6, 8, 12, 16, 20, 24, 27, 28, 30, 43, 46, 59, 61, 64, 68}; + +static const char _FileCorporaParser_indicies[] = { + 0, 1, 3, 4, 2, 1, 5, 1, 7, 7, 7, 6, 8, 8, 8, 6, 10, 10, + 10, 9, 11, 11, 11, 9, 12, 13, 1, 1, 15, 14, 18, 18, 18, 18, 18, 18, + 19, 16, 16, 16, 18, 16, 17, 21, 22, 20, 25, 25, 25, 25, 25, 25, 26, 23, + 23, 23, 25, 23, 24, 27, 1, 28, 29, 1, 31, 32, 13, 30, 31, 32, 30, 0}; + +static const char _FileCorporaParser_trans_targs[] = { + 2, 0, 2, 9, 3, 9, 10, 5, 10, 12, 7, 12, 8, 16, 10, 11, 10, + 10, 10, 4, 12, 12, 13, 12, 12, 12, 6, 14, 8, 16, 15, 17, 15}; + +static const char _FileCorporaParser_trans_actions[] = { + 53, 0, 47, 5, 0, 7, 25, 0, 15, 39, 0, 27, 0, 1, 21, 13, 23, + 19, 17, 0, 33, 35, 13, 37, 31, 29, 0, 41, 3, 50, 45, 0, 43}; + +static const char _FileCorporaParser_to_state_actions[] = { + 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 9, 0, 9, 9, 0, 0}; + +static const char _FileCorporaParser_from_state_actions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 11, 0, 11, 11, 0, 0}; + +static const char _FileCorporaParser_eof_trans[] = { + 0, 0, 0, 0, 7, 7, 10, 10, 0, 0, 0, 17, 0, 24, 0, 0, 31, 31}; + +static const int FileCorporaParser_start = 1; +static const int FileCorporaParser_first_final = 9; +static const int FileCorporaParser_error = 0; + +static const int FileCorporaParser_en_corpus_old = 10; +static const int FileCorporaParser_en_corpus_new = 12; +static const int FileCorporaParser_en_colon_sep = 14; +static const int FileCorporaParser_en_match_list = 15; +static const int FileCorporaParser_en_main = 1; + +} // namespace + +bool parseCorpus(const string &line, Corpus &c, unsigned int &id) { + const char *p = line.c_str(); + const char *pe = p + line.size(); + const char *eof = pe; + const char *ts; + const char *te; + int cs; + UNUSED int act; + + // For storing integers as they're scanned + unsigned int num = 0; + + string &sout = 
c.data; + + { + cs = FileCorporaParser_start; + ts = 0; + te = 0; + act = 0; + } + + { + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if (p == pe) + goto _test_eof; + if (cs == 0) + goto _out; + _resume: + _acts = _FileCorporaParser_actions + + _FileCorporaParser_from_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 7: + + { + ts = p; + } break; + } + } + + _keys = + _FileCorporaParser_trans_keys + _FileCorporaParser_key_offsets[cs]; + _trans = _FileCorporaParser_index_offsets[cs]; + + _klen = _FileCorporaParser_single_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + ((_upper - _lower) >> 1); + if ((*p) < *_mid) + _upper = _mid - 1; + else if ((*p) > *_mid) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += _klen; + } + + _klen = _FileCorporaParser_range_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen << 1) - 2; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + (((_upper - _lower) >> 1) & ~1); + if ((*p) < _mid[0]) + _upper = _mid - 2; + else if ((*p) > _mid[1]) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys) >> 1); + goto _match; + } + } + _trans += _klen; + } + + _match: + _trans = _FileCorporaParser_indicies[_trans]; + _eof_trans: + cs = _FileCorporaParser_trans_targs[_trans]; + + if (_FileCorporaParser_trans_actions[_trans] == 0) + goto _again; + + _acts = _FileCorporaParser_actions + + _FileCorporaParser_trans_actions[_trans]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 0: + + { + num = (num * 10) + ((*p) - '0'); + } break; + case 1: + + { + num = 0; + } break; + case 2: + + { + id = num; + } break; + case 3: + + { + num = 0; + } break; + case 4: + + { + { + cs = 10; + goto _again; + } + } break; + case 5: + + { + c.hasMatches = true; + { + cs = 12; + goto _again; + } + } break; + case 8: + + { + te = p + 1; + } break; + case 9: + + { + te = p + 1; + { sout.push_back(unhex(ts, te)); } + } break; + case 10: + + { + te = p + 1; + { + switch (*(ts + 1)) { + case '0': + sout.push_back('\x00'); + break; + case 'a': + sout.push_back('\x07'); + break; + case 'e': + sout.push_back('\x1b'); + break; + case 'f': + sout.push_back('\x0c'); + break; + case 'n': + sout.push_back('\x0a'); + break; + case 'v': + sout.push_back('\x0b'); + break; + case 'r': + sout.push_back('\x0d'); + break; + case 't': + sout.push_back('\x09'); + break; + default: { + p++; + goto _out; + } + } + } + } break; + case 11: + + { + te = p + 1; + { sout.push_back(*(ts + 1)); } + } break; + case 12: + + { + te = p + 1; + { sout.push_back(*ts); } + } break; + case 13: + + { + te = p; + p--; + { sout.push_back(*ts); } + } break; + case 14: + + { + { p = ((te)) - 1; } + { sout.push_back(*ts); } + } break; + case 15: + + { + te = p + 1; + { sout.push_back(unhex(ts, te)); } + } break; + case 16: + + { + te = p + 1; + { + switch (*(ts + 1)) { + case '0': + sout.push_back('\x00'); + break; + case 'a': + sout.push_back('\x07'); + break; + case 'e': + sout.push_back('\x1b'); + break; + case 'f': + sout.push_back('\x0c'); + break; + case 'n': + sout.push_back('\x0a'); + break; + case 'v': + sout.push_back('\x0b'); + break; + case 'r': + 
sout.push_back('\x0d'); + break; + case 't': + sout.push_back('\x09'); + break; + default: { + p++; + goto _out; + } + } + } + } break; + case 17: + + { + te = p + 1; + { sout.push_back(*(ts + 1)); } + } break; + case 18: + + { + te = p + 1; + { sout.push_back(*ts); } + } break; + case 19: + + { + te = p + 1; + { + { + cs = 14; + goto _again; + } + } + } break; + case 20: + + { + te = p; + p--; + { sout.push_back(*ts); } + } break; + case 21: + + { + { p = ((te)) - 1; } + { sout.push_back(*ts); } + } break; + case 22: + + { + te = p + 1; + { + { + cs = 15; + goto _again; + } + } + } break; + case 23: + + { + te = p + 1; + { c.matches.insert(num); } + } break; + case 24: + + { + te = p; + p--; + { c.matches.insert(num); } + } break; + } + } + + _again: + _acts = _FileCorporaParser_actions + + _FileCorporaParser_to_state_actions[cs]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 6: + + { + ts = 0; + } break; + } + } + + if (cs == 0) + goto _out; + if (++p != pe) + goto _resume; + _test_eof : {} + if (p == eof) { + if (_FileCorporaParser_eof_trans[cs] > 0) { + _trans = _FileCorporaParser_eof_trans[cs] - 1; + goto _eof_trans; + } + } + + _out : {} + } + + return (cs != FileCorporaParser_error) && (p == pe); +} diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 623c2c9..d6d52a2 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -40,7 +40,7 @@ using namespace ue2; namespace { // Switch one bit on in a bitmask. -template +template Mask setbit(unsigned int bit) { union { Mask simd; @@ -148,7 +148,7 @@ m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } -template +template class SimdUtilsTest : public testing::Test { // empty }; @@ -260,9 +260,9 @@ TYPED_TEST(SimdUtilsTest, or2) { for (unsigned j = 0; j < 8; j++) { for (unsigned i = 0; i < 32; i++) { - m256 x = setbit(j*32+i); + m256 x = setbit(j * 32 + i); m256 y = zeroes256(); - ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j*32+i << " not happy"; + ASSERT_EQ(1U << j, diffrich256(x, y)) << "bit " << j * 32 + i << " not happy"; } } @@ -431,8 +431,8 @@ TYPED_TEST(SimdUtilsTest, testbit) { for (unsigned int i = 0; i < total_bits; i++) { TypeParam a = setbit(i); for (unsigned int j = 0; j < total_bits; j++) { - ASSERT_EQ(i == j ? 1 : 0, simd_testbit(a, j)) << "bit " << i - << " is wrong"; + ASSERT_EQ(i == j ? 
1 : 0, simd_testbit(a, j)) + << "bit " << i << " is wrong"; } } } @@ -455,7 +455,6 @@ TYPED_TEST(SimdUtilsTest, setbit) { simd_setbit(&a, i); } ASSERT_FALSE(simd_diff(simd_ones(), a)); - } TYPED_TEST(SimdUtilsTest, diffrich) { @@ -663,12 +662,11 @@ TEST(SimdUtilsTest, movq) { ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); - simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); + simd = set64x2(~0LL, 0x123456789abcdef); r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } - TEST(SimdUtilsTest, set16x8) { char cmp[sizeof(m128)]; @@ -680,7 +678,7 @@ TEST(SimdUtilsTest, set16x8) { } TEST(SimdUtilsTest, set4x32) { - u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; + u32 cmp[4] = {0x12345678, 0x12345678, 0x12345678, 0x12345678}; m128 simd = set4x32(cmp[0]); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } @@ -714,51 +712,51 @@ TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), - variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), - variable_byte_shift_m128(in, -1))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), - variable_byte_shift_m128(in, -2))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), - variable_byte_shift_m128(in, -3))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), - variable_byte_shift_m128(in, -4))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), - variable_byte_shift_m128(in, -5))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), - variable_byte_shift_m128(in, -6))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), - variable_byte_shift_m128(in, -7))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), - variable_byte_shift_m128(in, -8))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), - variable_byte_shift_m128(in, -9))); - EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), - variable_byte_shift_m128(in, -10))); - - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), - variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), - variable_byte_shift_m128(in, 1))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), - variable_byte_shift_m128(in, 2))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), - variable_byte_shift_m128(in, 3))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), - variable_byte_shift_m128(in, 4))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), - variable_byte_shift_m128(in, 5))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), - variable_byte_shift_m128(in, 6))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), - variable_byte_shift_m128(in, 7))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), - variable_byte_shift_m128(in, 8))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), - variable_byte_shift_m128(in, 9))); - EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), - variable_byte_shift_m128(in, 10))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); + EXPECT_TRUE( 
+ !diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); + EXPECT_TRUE( + !diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); + + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); + EXPECT_TRUE( + !diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); @@ -785,12 +783,12 @@ TEST(SimdUtilsTest, min_u8_m128) { } TEST(SimdUtilsTest, sadd_u8_m128) { - unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', - '1', '2', '3', '4', '1', '2', '3', '4'}; - unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, - 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; unsigned char expec[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', - 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; m128 in1 = loadu128(base1); m128 in2 = loadu128(base2); m128 result = sadd_u8_m128(in1, in2); @@ -799,11 +797,11 @@ TEST(SimdUtilsTest, sadd_u8_m128) { TEST(SimdUtilsTest, sub_u8_m128) { unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D', - 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; - unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', - '1', '2', '3', '4', '1', '2', '3', '4'}; - unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, - 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; + 'a', 'b', 'c', 'd', '1', '2', '3', '4'}; + unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4', + '1', '2', '3', '4', '1', '2', '3', '4'}; + unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10, + 0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0}; m128 in1 = loadu128(base1); m128 in2 = loadu128(base2); m128 result = sub_u8_m128(in1, in2); diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index ea942ef..d7bef50 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -11,7 +11,13 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") -ragelmaker(ExpressionParser.rl) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|i386") + ragelmaker(ExpressionParser.rl) +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + ragelcopyer(ExpressionParser.rl) +endif() set(expressionutil_SRCS expressions.cpp diff --git a/util/ExpressionParser.cpp b/util/ExpressionParser.cpp new file mode 100644 index 0000000..687fc39 --- /dev/null +++ b/util/ExpressionParser.cpp @@ -0,0 
+1,397 @@ + + +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ExpressionParser.h" + +#include +#include +#include +#include +#include + +#include "hs_compile.h" +#include "ue2common.h" + +using std::string; + +namespace { // anon + +enum ParamKey { + PARAM_NONE, + PARAM_MIN_OFFSET, + PARAM_MAX_OFFSET, + PARAM_MIN_LENGTH, + PARAM_EDIT_DISTANCE, + PARAM_HAMM_DISTANCE +}; + +static const char _ExpressionParser_actions[] = {0, 1, 0, 1, 1, 1, 2, 1, 3, + 1, 4, 1, 5, 1, 6, 1, 7, 1, + 9, 1, 10, 2, 8, 0 + +}; + +static const char _ExpressionParser_key_offsets[] = { + 0, 0, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 23, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 82}; + +static const char _ExpressionParser_trans_keys[] = { + 32, 101, 104, 109, 32, 101, 104, 109, 100, 105, 116, 95, 100, 105, + 115, 116, 97, 110, 99, 101, 61, 48, 57, 32, 44, 125, 48, 57, + 32, 44, 125, 97, 109, 109, 105, 110, 103, 95, 100, 105, 115, 116, + 97, 110, 99, 101, 97, 105, 120, 95, 111, 102, 102, 115, 101, 116, + 110, 95, 108, 111, 101, 110, 103, 116, 104, 102, 102, 115, 101, 116, + 56, 67, 72, 76, 105, 109, 115, 123, 79, 81, 86, 87, 0}; + +static const char _ExpressionParser_single_lengths[] = { + 0, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 3, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 0}; + +static const char _ExpressionParser_range_lengths[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0}; + +static const unsigned char _ExpressionParser_index_offsets[] = { + 0, 0, 5, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 43, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, + 69, 71, 73, 75, 77, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, + 100, 103, 
105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 134}; + +static const char _ExpressionParser_trans_targs[] = { + 2, 3, 19, 34, 0, 2, 3, 19, 34, 0, 4, 0, 5, 0, 6, 0, 7, + 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 1, 57, 17, 0, 18, 1, 57, 0, 20, 0, 21, 0, + 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, + 0, 31, 0, 32, 0, 33, 0, 15, 0, 35, 43, 0, 36, 0, 37, 0, 38, + 0, 39, 0, 40, 0, 41, 0, 42, 0, 15, 0, 44, 0, 45, 0, 46, 51, + 0, 47, 0, 48, 0, 49, 0, 50, 0, 15, 0, 52, 0, 53, 0, 54, 0, + 55, 0, 15, 0, 56, 56, 56, 56, 56, 56, 56, 1, 56, 56, 0, 0, 0}; + +static const char _ExpressionParser_trans_actions[] = { + 17, 17, 17, 17, 19, 0, 0, 0, 0, 19, 0, 19, 0, 19, 0, 19, 0, + 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 13, 19, + 0, 19, 21, 19, 0, 5, 5, 1, 19, 0, 5, 5, 19, 0, 19, 0, 19, + 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, + 19, 0, 19, 0, 19, 0, 19, 15, 19, 0, 0, 19, 0, 19, 0, 19, 0, + 19, 0, 19, 0, 19, 0, 19, 0, 19, 9, 19, 0, 19, 0, 19, 0, 0, + 19, 0, 19, 0, 19, 0, 19, 0, 19, 11, 19, 0, 19, 0, 19, 0, 19, + 0, 19, 7, 19, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 19, 19, 0}; + +static const char _ExpressionParser_eof_actions[] = { + 0, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0}; + +static const int ExpressionParser_start = 56; +static const int ExpressionParser_first_final = 56; +static const int ExpressionParser_error = 0; + +static const int ExpressionParser_en_main = 56; + +} // namespace + +static void initExt(hs_expr_ext *ext) { + memset(ext, 0, sizeof(*ext)); + ext->max_offset = MAX_OFFSET; +} + +bool HS_CDECL readExpression(const std::string &input, std::string &expr, + unsigned int *flags, hs_expr_ext *ext, + bool *must_be_ordered) { + assert(flags); + assert(ext); + + // Init flags and ext params. + *flags = 0; + initExt(ext); + if (must_be_ordered) { + *must_be_ordered = false; + } + + // Extract expr, which is easier to do in straight C++ than with Ragel. + if (input.empty() || input[0] != '/') { + return false; + } + size_t end = input.find_last_of('/'); + if (end == string::npos || end == 0) { + return false; + } + expr = input.substr(1, end - 1); + + // Use a Ragel scanner to handle flags and params. + const char *p = input.c_str() + end + 1; + const char *pe = input.c_str() + input.size(); + UNUSED const char *eof = pe; + UNUSED const char *ts = p, *te = p; + int cs; + UNUSED int act; + + assert(p); + assert(pe); + + // For storing integers as they're scanned. 
+ u64a num = 0; + enum ParamKey key = PARAM_NONE; + + { cs = ExpressionParser_start; } + + { + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if (p == pe) + goto _test_eof; + if (cs == 0) + goto _out; + _resume: + _keys = + _ExpressionParser_trans_keys + _ExpressionParser_key_offsets[cs]; + _trans = _ExpressionParser_index_offsets[cs]; + + _klen = _ExpressionParser_single_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + ((_upper - _lower) >> 1); + if ((*p) < *_mid) + _upper = _mid - 1; + else if ((*p) > *_mid) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; + } + } + _keys += _klen; + _trans += _klen; + } + + _klen = _ExpressionParser_range_lengths[cs]; + if (_klen > 0) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen << 1) - 2; + while (1) { + if (_upper < _lower) + break; + + _mid = _lower + (((_upper - _lower) >> 1) & ~1); + if ((*p) < _mid[0]) + _upper = _mid - 2; + else if ((*p) > _mid[1]) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys) >> 1); + goto _match; + } + } + _trans += _klen; + } + + _match: + cs = _ExpressionParser_trans_targs[_trans]; + + if (_ExpressionParser_trans_actions[_trans] == 0) + goto _again; + + _acts = + _ExpressionParser_actions + _ExpressionParser_trans_actions[_trans]; + _nacts = (unsigned int)*_acts++; + while (_nacts-- > 0) { + switch (*_acts++) { + case 0: + + { + num = (num * 10) + ((*p) - '0'); + } break; + case 1: + + { + switch ((*p)) { + case 'i': + *flags |= HS_FLAG_CASELESS; + break; + case 's': + *flags |= HS_FLAG_DOTALL; + break; + case 'm': + *flags |= HS_FLAG_MULTILINE; + break; + case 'H': + *flags |= HS_FLAG_SINGLEMATCH; + break; + case 'O': + if (must_be_ordered) { + *must_be_ordered = true; + } + break; + case 'V': + *flags |= HS_FLAG_ALLOWEMPTY; + break; + case 'W': + *flags |= HS_FLAG_UCP; + break; + case '8': + *flags |= HS_FLAG_UTF8; + break; + case 'P': + *flags |= HS_FLAG_PREFILTER; + break; + case 'L': + *flags |= HS_FLAG_SOM_LEFTMOST; + break; + case 'C': + *flags |= HS_FLAG_COMBINATION; + break; + case 'Q': + *flags |= HS_FLAG_QUIET; + break; + default: { + p++; + goto _out; + } + } + } break; + case 2: + + { + switch (key) { + case PARAM_MIN_OFFSET: + ext->flags |= HS_EXT_FLAG_MIN_OFFSET; + ext->min_offset = num; + break; + case PARAM_MAX_OFFSET: + ext->flags |= HS_EXT_FLAG_MAX_OFFSET; + ext->max_offset = num; + break; + case PARAM_MIN_LENGTH: + ext->flags |= HS_EXT_FLAG_MIN_LENGTH; + ext->min_length = num; + break; + case PARAM_EDIT_DISTANCE: + ext->flags |= HS_EXT_FLAG_EDIT_DISTANCE; + ext->edit_distance = num; + break; + case PARAM_HAMM_DISTANCE: + ext->flags |= HS_EXT_FLAG_HAMMING_DISTANCE; + ext->hamming_distance = num; + break; + case PARAM_NONE: + default: + // No key specified, syntax invalid. 
+ return false; + } + } break; + case 3: + + { + key = PARAM_MIN_OFFSET; + } break; + case 4: + + { + key = PARAM_MAX_OFFSET; + } break; + case 5: + + { + key = PARAM_MIN_LENGTH; + } break; + case 6: + + { + key = PARAM_EDIT_DISTANCE; + } break; + case 7: + + { + key = PARAM_HAMM_DISTANCE; + } break; + case 8: + + { + num = 0; + } break; + case 9: + + { + key = PARAM_NONE; + } break; + case 10: + + { + return false; + } break; + } + } + + _again: + if (cs == 0) + goto _out; + if (++p != pe) + goto _resume; + _test_eof : {} + if (p == eof) { + const char *__acts = + _ExpressionParser_actions + _ExpressionParser_eof_actions[cs]; + unsigned int __nacts = (unsigned int)*__acts++; + while (__nacts-- > 0) { + switch (*__acts++) { + case 10: + + { + return false; + } break; + } + } + } + + _out : {} + } + + DEBUG_PRINTF("expr='%s', flags=%u\n", expr.c_str(), *flags); + + return (cs != ExpressionParser_error) && (p == pe); +} -- 2.39.0
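For readers skimming the state_compress.c hunks above: the change there is a mechanical substitution of direct SSE intrinsics (_mm_set_epi32, _mm_set_epi64x, _mm_srli_si128) with arch-neutral helpers (set32x4, set64x2, rshiftbyte_m128) so the same file can build on both x86_64 and aarch64. The snippet below is a minimal, self-contained sketch of how such 128-bit "set" wrappers could be written for the two targets; the vec128 typedef, the exact signatures, and the NEON array-load approach are illustrative assumptions for this example, not the wrapper definitions the patch itself provides.

/* Illustrative sketch only: arch-neutral 128-bit "set" wrappers in the spirit
 * of set32x4()/set64x2(). The patch's real wrappers may differ; vec128 is a
 * placeholder type used just for this example. */
#if defined(__x86_64__)
#include <emmintrin.h>

typedef __m128i vec128;

static inline vec128 set32x4(int e3, int e2, int e1, int e0) {
    /* SSE takes the highest lane first, matching the old _mm_set_epi32 calls */
    return _mm_set_epi32(e3, e2, e1, e0);
}

static inline vec128 set64x2(long long e1, long long e0) {
    return _mm_set_epi64x(e1, e0);
}

#elif defined(__aarch64__)
#include <stdint.h>
#include <arm_neon.h>

typedef int32x4_t vec128;

static inline vec128 set32x4(int e3, int e2, int e1, int e0) {
    /* NEON loads lane 0 from the lowest address, so store e0 first */
    int32_t tmp[4] = { e0, e1, e2, e3 };
    return vld1q_s32(tmp);
}

static inline vec128 set64x2(long long e1, long long e0) {
    int64_t tmp[2] = { e0, e1 };
    return vreinterpretq_s32_s64(vld1q_s64(tmp));
}
#endif

Under that assumption, a call such as set32x4(x[3], x[2], x[1], x[0]) yields the same lane layout on both targets, which is why loadcompressed128_32bit() and the other decompression helpers need nothing beyond the renamed calls shown in the hunks above.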