Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revise perfect hash to align with libgrape-lite's pthash #1992

Merged
merged 7 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
path = modules/graph/thirdparty/GraphAr
url = https://github.com/alibaba/GraphAr.git
shallow = true
[submodule "modules/graph/thirdparty/libgrape-lite"]
path = modules/graph/thirdparty/libgrape-lite
[submodule "thirdparty/libgrape-lite"]
path = thirdparty/libgrape-lite
url = https://github.com/alibaba/libgrape-lite.git
shallow = true
[submodule "modules/graph/thirdparty/powturbo"]
Expand Down
4 changes: 0 additions & 4 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,6 @@ This product includes software from the ClickHouse project
* Copyright 2016-2022 ClickHouse, Inc.
* https://github.com/ClickHouse/ClickHouse

This product includes software from the BBHash project
* Copyright (c) 2015 Guillaume Rizk
* https://github.com/rizkg/BBHash

This product includes software from the rax project (BSD, 2-clause)
* Copyright (c) 2017-2019, Salvatore Sanfilippo <antirez at gmail dot com>
* https://github.com/antirez/rax
1 change: 0 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ We thank the following excellent open-source projects:
- `skywalking-infra-e2e <https://github.com/apache/skywalking-infra-e2e>`_, a next-generation End-to-End Testing framework.
- `skywalking-swck <https://github.com/apache/skywalking-swck>`_, a Kubernetes operator for Apache SkyWalking.
- `wyhash <https://github.com/alainesp/wy>`_, C++ wrapper around wyhash and wyrand.
- `BBHash <https://github.com/rizkg/BBHash>`_, a fast, minimal-memory perfect hash function.
- `rax <https://github.com/antirez/rax>`_, an ANSI C radix tree implementation.
- `MurmurHash3 <https://github.com/aappleby/smhasher>`_, a fast non-cryptographic hash function.

Expand Down
32 changes: 24 additions & 8 deletions modules/basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,34 @@ file(GLOB_RECURSE BASIC_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")

add_library(vineyard_basic ${BASIC_SRC_FILES})
target_add_debuginfo(vineyard_basic)
find_package(MPI REQUIRED)
target_link_libraries(vineyard_basic PUBLIC vineyard_client
${ARROW_SHARED_LIB}
${GLOG_LIBRARIES}
${MPI_CXX_LIBRARIES}
)
target_include_directories(vineyard_basic PUBLIC ${ARROW_INCLUDE_DIR})
target_include_directories(vineyard_basic PUBLIC ${ARROW_INCLUDE_DIR} ${MPI_CXX_INCLUDE_PATH})

find_package(libgrapelite 0.3.4 QUIET)
if(LIBGRAPELITE_INCLUDE_DIRS)
message(STATUS "-- Found libgrape-lite: ${LIBGRAPELITE_INCLUDE_DIRS}")
target_include_directories(vineyard_basic PUBLIC ${LIBGRAPELITE_INCLUDE_DIRS})
else()
# use bundled libgrape-lite
message(STATUS "-- Building libgrape-lite from submodule: ${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite")
set(BUILD_LIBGRAPELITE_DOCS OFF CACHE BOOL "no libgrape-lite docs")
set(BUILD_LIBGRAPELITE_TESTS OFF CACHE BOOL "no libgrape-lite tests")
# use `add_subdirectory` so that libgrape-lite is built with the same CMAKE_BUILD_TYPE as vineyard itself, and
# to ensure that libgrapelite-targets-{debug/release}.cmake is generated during installation.
add_subdirectory("${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite"
"${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite"
)
target_include_directories(vineyard_basic PUBLIC
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite>
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite/thirdparty>
$<INSTALL_INTERFACE:include>
)
endif()

# install bundled thirdparty: flat_hash_map
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/flat_hash_map
Expand All @@ -64,13 +87,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/wyhash
PATTERN "*.hpp" # select C++ template header files
)

install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/BBHash
DESTINATION include/vineyard/contrib # target directory
FILES_MATCHING # install only matched files
PATTERN "*.h" # select header files
PATTERN "*.hpp" # select C++ template header files
)

# install bundled thirdparty: cityhash
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cityhash
DESTINATION include/vineyard/contrib # target directory
Expand Down
106 changes: 69 additions & 37 deletions modules/basic/ds/hashmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,7 @@ limitations under the License.
#include "client/ds/blob.h"
#include "client/ds/i_object.h"
#include "common/util/arrow.h" // IWYU pragma: keep

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#include "BBHash/BooPHF.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#include "grape/vertex_map/idxers/pthash_idxer.h"

namespace vineyard {

Expand Down Expand Up @@ -229,8 +220,6 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
public:
static_assert(std::is_pod<V>::value, "V in perfect hashmap must be POD type");

typedef boomphf::SingleHashFunctor<K> hasher_t;

explicit PerfectHashmapBuilder(Client& client)
: PerfectHashmapBaseBuilder<K, V>(client) {}

Expand All @@ -248,12 +237,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V* values, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
for (size_t i = 0; i < n_elements; ++i) {
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
return detail::perfect_hash::build_values(
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
values, shuffled_values);
});
}
Expand All @@ -266,11 +264,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V* values, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
for (auto iter =
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->begin());
iter !=
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->end());
iter++) {
this->builder_.add(*iter);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(bphf_, keys->GetArray(), values,
shuffled_values);
return detail::perfect_hash::build_values(idxer_, keys->GetArray(),
values, shuffled_values);
});
return Status::OK();
}
Expand All @@ -289,12 +303,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V begin_value, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
for (size_t i = 0; i < n_elements; ++i) {
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
return detail::perfect_hash::build_values(
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
begin_value, shuffled_values);
});
}
Expand All @@ -307,11 +330,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V begin_value, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
for (auto iter =
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->begin());
iter !=
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->end());
iter++) {
this->builder_.add(*iter);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(bphf_, keys->GetArray(),
begin_value, shuffled_values);
return detail::perfect_hash::build_values(
idxer_, keys->GetArray(), begin_value, shuffled_values);
});
return Status::OK();
}
Expand All @@ -323,15 +362,7 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
*
*/
Status Build(Client& client) override {
size_t size = detail::boomphf::bphf_serde::compute_size(bphf_);
std::unique_ptr<BlobWriter> blob_writer;
RETURN_ON_ERROR(client.CreateBlob(size, blob_writer));
char* dst = detail::boomphf::bphf_serde::ser(blob_writer->data(), bphf_);
RETURN_ON_ASSERT(dst == blob_writer->data() + size,
"boomphf serialization error: buffer size mismatched");
std::shared_ptr<Object> blob;
RETURN_ON_ERROR(blob_writer->Seal(client, blob));
this->set_ph_(std::dynamic_pointer_cast<Blob>(blob));
this->set_ph_(buf);
return Status::OK();
}

Expand Down Expand Up @@ -359,10 +390,11 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
return Status::OK();
}

boomphf::mphf<K, hasher_t> bphf_;
grape::PTHashIdxerBuilder<K, uint64_t> builder_;
grape::PTHashIdxer<K, uint64_t> idxer_;
std::shared_ptr<Object> buf;

const int concurrency_ = std::thread::hardware_concurrency();
const double gamma_ = 2.5f;
};

} // namespace vineyard
Expand Down
Loading
Loading