diff --git a/include/bitops.hpp b/include/bitops.hpp
index ec6137f..8037696 100644
--- a/include/bitops.hpp
+++ b/include/bitops.hpp
@@ -2,87 +2,111 @@
 #if __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
 #include <bit>
-namespace json::__bitops {
-    using std::countl_one;
-    using std::countr_one;
-    using std::countl_zero;
-    using std::countr_zero;
-    inline constexpr bool is_little_endian() {
-        return std::endian::native == std::endian::little;
-    }
+namespace json::__bitops
+{
+using std::countl_one;
+using std::countl_zero;
+using std::countr_one;
+using std::countr_zero;
+inline constexpr bool is_little_endian()
+{
+    return std::endian::native == std::endian::little;
+}
 }
 #else
 #include <cstdint>
-namespace json::__bitops {
+namespace json::__bitops
+{
 #if defined(__GNUC__) || defined(__clang__)
-    inline constexpr int countl_zero(uint32_t x) {
-        if constexpr (sizeof(uint32_t) == sizeof(unsigned int))
-            return x == 0 ? 32 : __builtin_clz(x);
-        if constexpr (sizeof(uint32_t) == sizeof(unsigned long))
-            return x == 0 ? 32 : __builtin_clzl(x);
-        return x == 0 ? 32 : __builtin_clzll(x);
-    }
-    inline constexpr int countr_zero(uint32_t x) {
-        if constexpr (sizeof(uint32_t) == sizeof(unsigned int))
-            return x == 0 ? 32 : __builtin_ctz(x);
-        if constexpr (sizeof(uint32_t) == sizeof(unsigned long))
-            return x == 0 ? 32 : __builtin_ctzl(x);
-        return x == 0 ? 32 : __builtin_ctzll(x);
-    }
-    inline constexpr int countl_zero(uint64_t x) {
-        return x == 0 ? 64 : __builtin_clzll(x);
-    }
-    inline constexpr int countr_zero(uint64_t x) {
-        return x == 0 ? 64 : __builtin_ctzll(x);
-    }
+inline constexpr int countl_zero(uint32_t x)
+{
+    if constexpr (sizeof(uint32_t) == sizeof(unsigned int)) return x == 0 ? 32 : __builtin_clz(x);
+    if constexpr (sizeof(uint32_t) == sizeof(unsigned long)) return x == 0 ? 32 : __builtin_clzl(x);
+    return x == 0 ? 32 : __builtin_clzll(x);
+}
+inline constexpr int countr_zero(uint32_t x)
+{
+    if constexpr (sizeof(uint32_t) == sizeof(unsigned int)) return x == 0 ? 32 : __builtin_ctz(x);
+    if constexpr (sizeof(uint32_t) == sizeof(unsigned long)) return x == 0 ? 32 : __builtin_ctzl(x);
+    return x == 0 ? 32 : __builtin_ctzll(x);
+}
+inline constexpr int countl_zero(uint64_t x)
+{
+    return x == 0 ? 64 : __builtin_clzll(x);
+}
+inline constexpr int countr_zero(uint64_t x)
+{
+    return x == 0 ? 64 : __builtin_ctzll(x);
+}
 #elif defined(_MSC_VER)
 #ifdef __AVX2__
-    // lzcnt intrinsics is not constexpr
-    inline int countl_zero(uint32_t x) {
-        return __lzcnt(x);
-    }
-    inline int countr_zero(uint32_t x) {
-        return _tzcnt_u32(x);
-    }
-    inline int countl_zero(uint64_t x) {
-        return (int)__lzcnt64(x);
-    }
-    inline int countr_zero(uint64_t x) {
-        return (int)_tzcnt_u64(x);
-    }
+// lzcnt intrinsics are not constexpr
+inline int countl_zero(uint32_t x)
+{
+    return __lzcnt(x);
+}
+inline int countr_zero(uint32_t x)
+{
+    return _tzcnt_u32(x);
+}
+inline int countl_zero(uint64_t x)
+{
+    return (int)__lzcnt64(x);
+}
+inline int countr_zero(uint64_t x)
+{
+    return (int)_tzcnt_u64(x);
+}
 #else
-    inline constexpr int countl_zero(uint32_t x) {
-        unsigned long index = 0;
-        return _BitScanReverse(&index, x) ? 31 - index : 32;
-    }
-    inline constexpr int countr_zero(uint32_t x) {
-        unsigned long index = 0;
-        return _BitScanForward(&index, x) ? index : 32;
-    }
-    inline constexpr int countl_zero(uint64_t x) {
-        unsigned long index = 0;
-        return _BitScanReverse64(&index, x) ? 63 - index : 64;
-    }
-    inline constexpr int countr_zero(uint64_t x) {
-        unsigned long index = 0;
-        return _BitScanForward64(&index, x) ? index : 64;
-    }
+inline constexpr int countl_zero(uint32_t x)
+{
+    unsigned long index = 0;
+    return _BitScanReverse(&index, x) ? 31 - index : 32;
+}
+inline constexpr int countr_zero(uint32_t x)
+{
+    unsigned long index = 0;
+    return _BitScanForward(&index, x) ? index : 32;
+}
+inline constexpr int countl_zero(uint64_t x)
+{
+    unsigned long index = 0;
+    return _BitScanReverse64(&index, x) ? 63 - index : 64;
+}
+inline constexpr int countr_zero(uint64_t x)
+{
+    unsigned long index = 0;
+    return _BitScanForward64(&index, x) ? index : 64;
+}
 #endif // __AVX2__
-#else // compiler
+#else  // compiler
 #error "bring your own bit counting implementation"
 #endif
-    inline int countl_one(uint32_t x) { return countl_zero(~x); }
-    inline int countr_one(uint32_t x) { return countr_zero(~x); }
-    inline int countl_one(uint64_t x) { return countl_zero(~x); }
-    inline int countr_one(uint64_t x) { return countr_zero(~x); }
+inline int countl_one(uint32_t x)
+{
+    return countl_zero(~x);
+}
+inline int countr_one(uint32_t x)
+{
+    return countr_zero(~x);
+}
+inline int countl_one(uint64_t x)
+{
+    return countl_zero(~x);
+}
+inline int countr_one(uint64_t x)
+{
+    return countr_zero(~x);
+}
 
-    // no constexpr endian awareness before C++20
-    inline bool is_little_endian() {
-        union {
-            uint32_t u32;
-            uint8_t u8;
-        } u = { 0x01020304 };
-        return u.u8 == 4;
-    }
+// no constexpr endian awareness before C++20
+inline bool is_little_endian()
+{
+    union {
+        uint32_t u32;
+        uint8_t u8;
+    } u = { 0x01020304 };
+    return u.u8 == 4;
 }
+} // namespace json::__bitops
 #endif // C++20
diff --git a/include/packed_bytes.hpp b/include/packed_bytes.hpp
index c471777..56e4392 100644
--- a/include/packed_bytes.hpp
+++ b/include/packed_bytes.hpp
@@ -1,23 +1,27 @@
 #pragma once
+
 #include <cstdint>
 #include <cstring>
 #include <type_traits>
+
 #include "bitops.hpp"
 
 #if defined(__GNUC__) || defined(__clang__)
-#define __packed_bytes_strong_inline __attribute__((always_inline))
+#define __packed_bytes_strong_inline __attribute__((always_inline))
 #elif defined(_MSC_VER)
 #define __packed_bytes_strong_inline __forceinline
 #else
 #define __packed_bytes_strong_inline inline
 #endif
 
-struct packed_bytes_trait_none {
+struct packed_bytes_trait_none
+{
     static constexpr bool available = false;
 };
 
 template <size_t n>
-struct packed_bytes {
+struct packed_bytes
+{
     using traits = packed_bytes_trait_none;
 };
 
@@ -27,39 +31,40 @@ struct packed_bytes {
 #include "packed_bytes_arm.hpp"
 #endif
 
-struct packed_bytes_trait_uint64 {
+struct packed_bytes_trait_uint64
+{
     static constexpr bool available = sizeof(void*) >= 8;
     static constexpr auto step = 8;
     using value_type = std::enable_if_t<sizeof(void*) >= 8, uint64_t>;
 
-    __packed_bytes_strong_inline static value_type load_unaligned(const void *ptr) {
+    __packed_bytes_strong_inline static value_type load_unaligned(const void* ptr)
+    {
         value_type result;
         memcpy((void*)&result, ptr, 8);
         return result;
     }
 
-    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n) {
-        return (((x) - UINT64_C(0x0101010101010101) * (n)) & ~(x) & UINT64_C(0x8080808080808080));
-    }
-
-    __packed_bytes_strong_inline static value_type is_zero_memberwise(value_type v) {
-        return (((v) - UINT64_C(0x0101010101010101)) & ~(v) & UINT64_C(0x8080808080808080));
+    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n)
+    {
+        return (((x)-UINT64_C(0x0101010101010101) * (n)) & ~(x)&UINT64_C(0x8080808080808080));
     }
 
-    __packed_bytes_strong_inline static bool is_all_zero(value_type v)
+    __packed_bytes_strong_inline static value_type is_zero_memberwise(value_type v)
     {
-        return v == UINT64_C(0);
+        return (((v)-UINT64_C(0x0101010101010101)) & ~(v)&UINT64_C(0x8080808080808080));
     }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n) {
+    __packed_bytes_strong_inline static bool is_all_zero(value_type v) { return v == UINT64_C(0); }
+
+    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n)
+    {
         return is_zero_memberwise((x) ^ (UINT64_C(0x0101010101010101) * (n)));
     }
 
-    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) {
-        return a | b;
-    }
+    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) { return a | b; }
 
-    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x) {
+    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x)
+    {
         if (json::__bitops::is_little_endian())
             return json::__bitops::countr_zero(x) / 8;
         else
@@ -67,38 +72,41 @@ struct packed_bytes_trait_uint64 {
     }
 };
 
-struct packed_bytes_trait_uint32 {
+struct packed_bytes_trait_uint32
+{
     static constexpr bool available = true;
     static constexpr auto step = 4;
     using value_type = uint32_t;
 
-    __packed_bytes_strong_inline static value_type load_unaligned(const void *ptr) {
+    __packed_bytes_strong_inline static value_type load_unaligned(const void* ptr)
+    {
         value_type result;
         memcpy((void*)&result, ptr, 4);
         return result;
     }
 
-    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n) {
+    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n)
+    {
         return (((x) - ~UINT32_C(0) / 255 * (n)) & ~(x) & ~UINT32_C(0) / 255 * 128);
     }
 
-    __packed_bytes_strong_inline static value_type is_zero_memberwise(value_type v) {
-        return (((v) - UINT32_C(0x01010101)) & ~(v) & UINT32_C(0x80808080));;
+    __packed_bytes_strong_inline static value_type is_zero_memberwise(value_type v)
+    {
+        return (((v)-UINT32_C(0x01010101)) & ~(v)&UINT32_C(0x80808080));
+        ;
     }
 
-    __packed_bytes_strong_inline static bool is_all_zero(value_type v) {
-        return v == UINT32_C(0);
-    }
+    __packed_bytes_strong_inline static bool is_all_zero(value_type v) { return v == UINT32_C(0); }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n) {
+    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n)
+    {
         return is_zero_memberwise((x) ^ (~UINT32_C(0) / 255 * (n)));
     }
 
-    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) {
-        return a | b;
-    }
+    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) { return a | b; }
 
-    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x) {
+    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x)
+    {
         if (json::__bitops::is_little_endian())
             return json::__bitops::countr_zero(x) / 8;
         else
@@ -106,21 +114,22 @@ struct packed_bytes_trait_uint32 {
     }
 };
 
 template <>
-struct packed_bytes<8> {
+struct packed_bytes<8>
+{
     using traits = std::enable_if_t<packed_bytes_trait_uint64::available, packed_bytes_trait_uint64>;
 };
 
 template <>
-struct packed_bytes<4> {
+struct packed_bytes<4>
+{
     using traits = packed_bytes_trait_uint32;
 };
 
 template <size_t n>
 using packed_bytes_trait = typename packed_bytes<n>::traits;
 
-using packed_bytes_trait_max = std::conditional_t<packed_bytes_trait<32>::available, packed_bytes_trait<32>,
-    std::conditional_t<packed_bytes_trait<16>::available, packed_bytes_trait<16>,
-    std::conditional_t<packed_bytes_trait<8>::available, packed_bytes_trait<8>,
-    packed_bytes_trait<4>
-    >>>;
-
+using packed_bytes_trait_max =
+    std::conditional_t<packed_bytes_trait<32>::available, packed_bytes_trait<32>,
+                       std::conditional_t<packed_bytes_trait<16>::available, packed_bytes_trait<16>,
+                                          std::conditional_t<packed_bytes_trait<8>::available, packed_bytes_trait<8>,
+                                                             packed_bytes_trait<4>>>>;
diff --git a/include/packed_bytes_arm.hpp b/include/packed_bytes_arm.hpp
index 86fc07f..250e8f5 100644
--- a/include/packed_bytes_arm.hpp
+++ b/include/packed_bytes_arm.hpp
@@ -9,34 +9,29 @@
 #define __packed_bytes_trait_arm64
 #endif
 
-struct packed_bytes_trait_neon {
+struct packed_bytes_trait_neon
+{
     static constexpr bool available = true;
     static constexpr auto step = 16;
     using value_type = uint8x16_t;
 
-    __packed_bytes_strong_inline static value_type load_unaligned(const void *ptr) {
-        return vld1q_u8((uint8_t*)ptr);
-    }
+    __packed_bytes_strong_inline static value_type load_unaligned(const void* ptr) { return vld1q_u8((uint8_t*)ptr); }
 
-    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n) {
+    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n)
+    {
         auto bcast = vdupq_n_u8(n);
         auto is_less = vcltq_u8(x, bcast);
         return is_less;
     }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n) {
-        return vceqq_u8(x, vdupq_n_u8(n));
-    }
+    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n) { return vceqq_u8(x, vdupq_n_u8(n)); }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, value_type y) {
-        return vceqq_u8(x, y);
-    }
+    __packed_bytes_strong_inline static value_type equal(value_type x, value_type y) { return vceqq_u8(x, y); }
 
-    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) {
-        return vorrq_u8(a, b);
-    }
+    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) { return vorrq_u8(a, b); }
 
-    __packed_bytes_strong_inline static bool is_all_zero(value_type x) {
+    __packed_bytes_strong_inline static bool is_all_zero(value_type x)
+    {
 #ifdef __packed_bytes_trait_arm64
         return vmaxvq_u8(x) == 0;
 #else
@@ -46,7 +41,8 @@ struct packed_bytes_trait_neon {
 #endif
     }
 
-    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x) {
+    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x)
+    {
         // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
         auto cmp = equal(x, 0);
         auto res = vshrn_n_u16(cmp, 4);
@@ -56,7 +52,8 @@
 };
 
 template <>
-struct packed_bytes<16> {
+struct packed_bytes<16>
+{
     using traits = packed_bytes_trait_neon;
 };
diff --git a/include/packed_bytes_x86.hpp b/include/packed_bytes_x86.hpp
index afa6951..8fe6228 100644
--- a/include/packed_bytes_x86.hpp
+++ b/include/packed_bytes_x86.hpp
@@ -2,21 +2,24 @@
 #include "packed_bytes.hpp"
 #include <emmintrin.h>
 
-#if defined(__SSE4_1__) || defined(__AVX2__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || defined(__AVX2__) || defined(_MSC_VER) // MSVC enables all SSE4.1 intrinsics by default
 #include <smmintrin.h>
 #endif
 
-struct packed_bytes_trait_sse {
+struct packed_bytes_trait_sse
+{
     static constexpr bool available = true;
     static constexpr auto step = 16;
     using value_type = __m128i;
 
-    __packed_bytes_strong_inline static value_type load_unaligned(const void *ptr) {
+    __packed_bytes_strong_inline static value_type load_unaligned(const void* ptr)
+    {
         return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
     }
 
-    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n) {
+    __packed_bytes_strong_inline static value_type less(value_type x, uint8_t n)
+    {
         auto bcast = _mm_set1_epi8(static_cast<char>(n));
         auto all1 = _mm_set1_epi8(-1);
         auto max_with_n = _mm_max_epu8(x, bcast);
@@ -25,19 +28,17 @@ struct packed_bytes_trait_sse {
         return is_less;
     }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n) {
+    __packed_bytes_strong_inline static value_type equal(value_type x, uint8_t n)
+    {
         return _mm_cmpeq_epi8(x, _mm_set1_epi8(static_cast<char>(n)));
     }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, value_type y) {
-        return _mm_cmpeq_epi8(x, y);
-    }
+    __packed_bytes_strong_inline static value_type equal(value_type x, value_type y) { return _mm_cmpeq_epi8(x, y); }
 
-    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) {
-        return _mm_or_si128(a, b);
-    }
+    __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b) { return _mm_or_si128(a, b); }
 
-    __packed_bytes_strong_inline static bool is_all_zero(value_type x) {
+    __packed_bytes_strong_inline static bool is_all_zero(value_type x)
+    {
 #if defined(__SSE4_1__) || defined(__AVX2__) || defined(_MSC_VER)
         // SSE4.1 path
         return !!_mm_testz_si128(x, x);
@@ -49,7 +50,8 @@ struct packed_bytes_trait_sse {
 #endif
     }
 
-    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x) {
+    __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x)
+    {
         auto cmp = _mm_cmpeq_epi8(x, _mm_set1_epi8(0));
         auto mask = (uint16_t)_mm_movemask_epi8(cmp);
         return json::__bitops::countr_one((uint32_t)mask);
@@ -57,11 +59,11 @@ struct packed_bytes_trait_sse {
 };
 
 template <>
-struct packed_bytes<16> {
+struct packed_bytes<16>
+{
     using traits = packed_bytes_trait_sse;
 };
 
-
 #ifdef __AVX2__
 
 #include <immintrin.h>
@@ -91,20 +93,14 @@ struct packed_bytes_trait_avx2
     {
         return _mm256_cmpeq_epi8(x, _mm256_set1_epi8(static_cast<char>(n)));
     }
 
-    __packed_bytes_strong_inline static value_type equal(value_type x, value_type y)
-    {
-        return _mm256_cmpeq_epi8(x, y);
-    }
+    __packed_bytes_strong_inline static value_type equal(value_type x, value_type y) { return _mm256_cmpeq_epi8(x, y); }
 
     __packed_bytes_strong_inline static value_type bitwise_or(value_type a, value_type b)
     {
         return _mm256_or_si256(a, b);
     }
 
-    __packed_bytes_strong_inline static bool is_all_zero(value_type x)
-    {
-        return (bool)_mm256_testz_si256(x, x);
-    }
+    __packed_bytes_strong_inline static bool is_all_zero(value_type x) { return (bool)_mm256_testz_si256(x, x); }
 
     __packed_bytes_strong_inline static size_t first_nonzero_byte(value_type x)
     {
@@ -116,7 +112,8 @@ struct packed_bytes_trait_avx2
 };
 
 template <>
-struct packed_bytes<32> {
+struct packed_bytes<32>
+{
     using traits = packed_bytes_trait_avx2;
 };
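
Note (reviewer sketch, not part of the patch): every backend touched above (packed_bytes_trait_uint32/uint64, packed_bytes_trait_sse/avx2, packed_bytes_trait_neon) exposes the same static interface — step, load_unaligned, less, equal, bitwise_or, is_all_zero, first_nonzero_byte. In the scalar traits the SWAR identity (x - 0x01..01) & ~x & 0x80..80 sets the high bit of every zero byte, which is what less/is_zero_memberwise rely on. The sketch below only illustrates how a caller might drive whichever backend packed_bytes_trait_max resolves to; the find_quote_or_control helper is hypothetical and the scalar handling of a tail shorter than traits::step is omitted.

// Illustrative only: scan a buffer step bytes at a time for the first '"' or
// control character (< 0x20), using the trait interface defined in the headers above.
#include <cstddef>
#include <cstdint>

#include "packed_bytes.hpp"

inline size_t find_quote_or_control(const char* data, size_t len)
{
    using traits = packed_bytes_trait_max;
    size_t i = 0;
    for (; i + traits::step <= len; i += traits::step) {
        auto block = traits::load_unaligned(data + i);
        // mark bytes equal to '"' or below 0x20, then test the whole block at once
        auto hits = traits::bitwise_or(traits::equal(block, '"'),
                                       traits::less(block, 0x20));
        if (!traits::is_all_zero(hits))
            return i + traits::first_nonzero_byte(hits);
    }
    return len; // no hit in the vectorized prefix; a real caller would scan the tail byte by byte
}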