Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX2 optimization of group lookup. #167

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ Copy the parallel_hashmap directory to your project. Update your include path. T

If you are using Visual Studio, you probably want to add `phmap.natvis` to your projects. This will allow for a clear display of the hash table contents in the debugger.

> A cmake configuration files (CMakeLists.txt) is provided for building the tests and examples. Command for building and running the tests is: `mkdir build && cd build && cmake -DPHMAP_BUILD_TESTS=ON -DPHMAP_BUILD_EXAMPLES=ON .. && cmake --build . && make test`
> A cmake configuration file (CMakeLists.txt) is provided for building the tests and examples.
> * The command for building and running the tests is: `mkdir build && cd build && cmake -DPHMAP_BUILD_TESTS=ON -DPHMAP_BUILD_EXAMPLES=ON .. && cmake --build . && make test`
> * Use this command to build tests with AVX2 support enabled: `mkdir build && cd build && cmake -DPHMAP_BUILD_TESTS=ON -DPHMAP_BUILD_EXAMPLES=ON -DCMAKE_CXX_FLAGS=-mavx2 .. && cmake --build . && make test`

## Example

Expand Down
103 changes: 101 additions & 2 deletions parallel_hashmap/phmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,10 @@ static_assert(kDeleted == -2,
// This enables removing a branch in the hot path of find().
// --------------------------------------------------------------------------
inline ctrl_t* EmptyGroup() {
alignas(16) static constexpr ctrl_t empty_group[] = {
alignas(32) static constexpr ctrl_t empty_group[] = {
kSentinel, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty,
kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty,
kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty,
kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty, kEmpty};
return const_cast<ctrl_t*>(empty_group);
}
Expand Down Expand Up @@ -464,6 +466,101 @@ struct GroupSse2Impl

#endif // PHMAP_HAVE_SSE2

#if PHMAP_HAVE_AVX2

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4365) // conversion from 'int' to 'T', signed/unsigned mismatch
#endif

// --------------------------------------------------------------------------
// https://github.com/abseil/abseil-cpp/issues/209
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87853
// _mm_cmpgt_epi8 is broken under GCC with -funsigned-char
// Work around this by using the portable implementation of Group
// when using -funsigned-char under GCC.
// --------------------------------------------------------------------------
inline __m256i _mm256_cmpgt_epi8_fixed(__m256i a, __m256i b) {
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverflow"

if (std::is_unsigned<char>::value) {
const __m256i mask = _mm256_set1_epi8(static_cast<char>(0x80));
const __m256i diff = _mm256_subs_epi8(b, a);
return _mm256_cmpeq_epi8(_mm256_and_si256(diff, mask), mask);
}

#pragma GCC diagnostic pop
#endif
return _mm256_cmpgt_epi8(a, b);
}

// --------------------------------------------------------------------------
// --------------------------------------------------------------------------
struct GroupAvx2Impl {
enum { kWidth = sizeof(__m256i) }; // the number of slots per group

explicit GroupAvx2Impl(const ctrl_t* pos) {
ctrl = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(pos));
}

// Returns a bitmask representing the positions of slots that match hash.
// ----------------------------------------------------------------------
BitMask<uint64_t, kWidth> Match(h2_t hash) const {
auto match = _mm256_set1_epi8(static_cast<char>(hash));
return BitMask<uint64_t, kWidth>(
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(match, ctrl))));
}

// Returns a bitmask representing the positions of empty slots.
// ------------------------------------------------------------
BitMask<uint64_t, kWidth> MatchEmpty() const {
// This only works because kEmpty is -128.
return BitMask<uint64_t, kWidth>(
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_sign_epi8(ctrl, ctrl))));
}

#ifdef __INTEL_COMPILER
#pragma warning push
#pragma warning disable 68
#endif
// Returns a bitmask representing the positions of empty or deleted slots.
// -----------------------------------------------------------------------
BitMask<uint64_t, kWidth> MatchEmptyOrDeleted() const {
auto special = _mm256_set1_epi8(static_cast<char>(kSentinel));
return BitMask<uint64_t, kWidth>(
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpgt_epi8_fixed(special, ctrl))));
}

// Returns the number of trailing empty or deleted elements in the group.
// ----------------------------------------------------------------------
uint32_t CountLeadingEmptyOrDeleted() const {
auto special = _mm256_set1_epi8(static_cast<char>(kSentinel));
return TrailingZeros(
static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpgt_epi8_fixed(special, ctrl)) + 1));
}
#ifdef __INTEL_COMPILER
#pragma warning pop
#endif

// ----------------------------------------------------------------------
void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const {
auto msbs = _mm256_set1_epi8(static_cast<char>(-128));
auto x126 = _mm256_set1_epi8(126);
auto res = _mm256_or_si256(_mm256_shuffle_epi8(x126, ctrl), msbs);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), res);
}

__m256i ctrl;
};

#ifdef _MSC_VER
#pragma warning(pop)
#endif

#endif // PHMAP_HAVE_AVX2

// --------------------------------------------------------------------------
// --------------------------------------------------------------------------
struct GroupPortableImpl
Expand Down Expand Up @@ -519,7 +616,9 @@ struct GroupPortableImpl
uint64_t ctrl;
};

#if PHMAP_HAVE_SSE2
#if PHMAP_HAVE_AVX2
using Group = GroupAvx2Impl;
#elif PHMAP_HAVE_SSE2
using Group = GroupSse2Impl;
#else
using Group = GroupPortableImpl;
Expand Down
13 changes: 12 additions & 1 deletion parallel_hashmap/phmap_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@
#endif

// ----------------------------------------------------------------------
// Figure out SSE support
// Figure out SSE/AVX support
// ----------------------------------------------------------------------
#ifndef PHMAP_HAVE_SSE2
#if defined(__SSE2__) || \
Expand All @@ -650,6 +650,14 @@
#endif
#endif

#ifndef PHMAP_HAVE_AVX2
#if defined(__AVX2__)
#define PHMAP_HAVE_AVX2 1
#else
#define PHMAP_HAVE_AVX2 0
#endif
#endif

#if PHMAP_HAVE_SSSE3 && !PHMAP_HAVE_SSE2
#error "Bad configuration!"
#endif
Expand All @@ -662,6 +670,9 @@
#include <tmmintrin.h>
#endif

#if PHMAP_HAVE_AVX2
#include <immintrin.h>
#endif

// ----------------------------------------------------------------------
// constexpr if
Expand Down
32 changes: 29 additions & 3 deletions tests/raw_hash_set_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,21 @@ TEST(Group, EmptyGroup) {
}

TEST(Group, Match) {
PHMAP_IF_CONSTEXPR (Group::kWidth == 16) {
PHMAP_IF_CONSTEXPR (Group::kWidth == 32) {
// 0 1 2 3 4 5 6 7
ctrl_t group[] = {kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1,
7, 5, 3, 1, 1, 1, 1, 1,
7, 5, 3, 1, 1, 1, 1, 1};
EXPECT_THAT(Group{group}.Match(0), ElementsAre());
EXPECT_THAT(Group{group}.Match(1), ElementsAre(1,
11, 12, 13, 14, 15,
19, 20, 21, 22, 23,
27, 28, 29, 30, 31));
EXPECT_THAT(Group{group}.Match(3), ElementsAre(3, 10, 18, 26));
EXPECT_THAT(Group{group}.Match(5), ElementsAre(5, 9, 17, 25));
EXPECT_THAT(Group{group}.Match(7), ElementsAre(7, 8, 16, 24));
} else PHMAP_IF_CONSTEXPR (Group::kWidth == 16) {
ctrl_t group[] = {kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1};
EXPECT_THAT(Group{group}.Match(0), ElementsAre());
Expand All @@ -185,7 +199,13 @@ TEST(Group, Match) {
}

TEST(Group, MatchEmpty) {
PHMAP_IF_CONSTEXPR (Group::kWidth == 16) {
PHMAP_IF_CONSTEXPR (Group::kWidth == 32) {
ctrl_t group[] = {kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1,
kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1};
EXPECT_THAT(Group{group}.MatchEmpty(), ElementsAre(0, 4, 16, 20));
} else PHMAP_IF_CONSTEXPR (Group::kWidth == 16) {
ctrl_t group[] = {kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1};
EXPECT_THAT(Group{group}.MatchEmpty(), ElementsAre(0, 4));
Expand All @@ -198,7 +218,13 @@ TEST(Group, MatchEmpty) {
}

TEST(Group, MatchEmptyOrDeleted) {
PHMAP_IF_CONSTEXPR (Group::kWidth == 16) {
PHMAP_IF_CONSTEXPR (Group::kWidth == 32) {
ctrl_t group[] = {kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1,
kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1};
EXPECT_THAT(Group{group}.MatchEmptyOrDeleted(), ElementsAre(0, 2, 4, 16, 18, 20));
} else PHMAP_IF_CONSTEXPR (Group::kWidth == 16) {
ctrl_t group[] = {kEmpty, 1, kDeleted, 3, kEmpty, 5, kSentinel, 7,
7, 5, 3, 1, 1, 1, 1, 1};
EXPECT_THAT(Group{group}.MatchEmptyOrDeleted(), ElementsAre(0, 2, 4));
Expand Down