Skip to content

Commit

Permalink
Merge branch 'main' into zsy/vy-0182
Browse files Browse the repository at this point in the history
  • Loading branch information
siyuan0322 authored Dec 12, 2023
2 parents 57cd97b + 9454f9b commit b0c9453
Show file tree
Hide file tree
Showing 128 changed files with 13,657 additions and 2,707 deletions.
35 changes: 30 additions & 5 deletions .github/workflows/flex.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,46 @@ jobs:
mkdir build && cd build
cmake .. && sudo make -j$(nproc)
- name: GRIN on mutable csr test
- name: Test GRIN on mutable csr
run: |
git submodule update --init
cd flex/engines/graph_db/grin
mkdir build && cd build
cmake .. && sudo make -j$(nproc)
export FLEX_DATA_DIR=../../../../interactive/examples/modern_graph/
./run_grin_test 'flex://schema_file=../../../../interactive/examples/modern_graph/modern_graph.yaml&bulk_load_file=../../../../interactive/examples/modern_graph/bulk_load.yaml'
${GITHUB_WORKSPACE}/flex/build/bin/bulk_loader -g ../../../../interactive/examples/modern_graph/modern_graph.yaml -l ../../../../interactive/examples/modern_graph/bulk_load.yaml -d ./data/
rm -r ./data/wal
rm -r ./data/runtime/*
./run_grin_test 'flex://schema_file=../../../../interactive/examples/modern_graph/modern_graph.yaml&data_dir=./data/'
- name: Prepare test dataset
env:
GS_TEST_DIR: ${{ github.workspace }}/gstest/
run: |
git clone -b master --single-branch --depth=1 https://github.com/GraphScope/gstest.git ${GS_TEST_DIR}
- name: Test String edge property on modern graph
env:
FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/modern_graph/
run: |
rm -rf /tmp/csr-data-dir/
cd ${GITHUB_WORKSPACE}/flex/build/
SCHEMA_FILE=../tests/rt_mutable_graph/modern_graph_string_edge.yaml
BULK_LOAD_FILE=../interactive/examples/modern_graph/bulk_load.yaml
GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/
GLOG_v=10 ./tests/rt_mutable_graph/string_edge_property_test ${SCHEMA_FILE} /tmp/csr-data-dir/
- name: Test build empty graph
run: |
rm -rf /tmp/csr-data-dir/
cd ${GITHUB_WORKSPACE}/flex/build/
GLOG_v=10 ./tests/rt_mutable_graph/test_empty_graph /tmp/csr-data-dir/
- name: Test ACID
run: |
rm -rf /tmp/csr-data-dir/
cd ${GITHUB_WORKSPACE}/flex/build/
GLOG_v=10 ./tests/rt_mutable_graph/test_acid 8 /tmp/csr-data-dir/
- name: Test Graph Loading on modern graph
env:
FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/modern_graph/
Expand All @@ -73,7 +98,7 @@ jobs:
cd ${GITHUB_WORKSPACE}/flex/build/
SCHEMA_FILE=../interactive/examples/modern_graph/modern_graph.yaml
BULK_LOAD_FILE=../interactive/examples/modern_graph/bulk_load.yaml
GLOG_v=10 ./bin/graph_loader ${SCHEMA_FILE} ${BULK_LOAD_FILE} /tmp/csr-data-dir/
GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/
- name: Test Graph Loading on type_test graph
env:
Expand All @@ -86,7 +111,7 @@ jobs:
cd ${GITHUB_WORKSPACE}/flex/build/
SCHEMA_FILE=${GS_TEST_DIR}/type_test/graph.yaml
BULK_LOAD_FILE=${GS_TEST_DIR}/type_test/import.yaml
GLOG_v=10 ./bin/graph_loader ${SCHEMA_FILE} ${BULK_LOAD_FILE} /tmp/csr-data-dir/ 2
GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2
- name: Test Graph Loading on LDBC SNB sf0.1
env:
Expand All @@ -99,4 +124,4 @@ jobs:
cd ${GITHUB_WORKSPACE}/flex/build/
SCHEMA_FILE=${FLEX_DATA_DIR}/audit_graph_schema.yaml
BULK_LOAD_FILE=${FLEX_DATA_DIR}/audit_bulk_load.yaml
GLOG_v=10 ./bin/graph_loader ${SCHEMA_FILE} ${BULK_LOAD_FILE} /tmp/csr-data-dir/ 2
GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2
9 changes: 6 additions & 3 deletions .github/workflows/hqps-db-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ jobs:
# load graph
cd ${GITHUB_WORKSPACE}/flex/build
export FLEX_DATA_DIR=${GS_TEST_DIR}/flex/ldbc-sf01-long-date
GLOG_v=10 ./bin/graph_loader ${INTERACTIVE_WORKSPACE}/data/ldbc/graph.yaml ${INTERACTIVE_WORKSPACE}/data/ldbc/import.yaml ${INTERACTIVE_WORKSPACE}/data/ldbc/indices/
GLOG_v=10 ./bin/bulk_loader -g ${INTERACTIVE_WORKSPACE}/data/ldbc/graph.yaml -l ${INTERACTIVE_WORKSPACE}/data/ldbc/import.yaml -d ${INTERACTIVE_WORKSPACE}/data/ldbc/indices/
export FLEX_DATA_DIR=../interactive/examples/movies
GLOG_v=10 ./bin/graph_loader ${INTERACTIVE_WORKSPACE}/data/movies/graph.yaml ${INTERACTIVE_WORKSPACE}/data/movies/import.yaml ${INTERACTIVE_WORKSPACE}/data/movies/indices/
GLOG_v=10 ./bin/bulk_loader -g ${INTERACTIVE_WORKSPACE}/data/movies/graph.yaml -l ${INTERACTIVE_WORKSPACE}/data/movies/import.yaml -d ${INTERACTIVE_WORKSPACE}/data/movies/indices/
- name: Test HQPS admin http service
env:
Expand All @@ -120,8 +120,11 @@ jobs:
run: |
cd ${GITHUB_WORKSPACE}/flex/build
export FLEX_DATA_DIR=${GS_TEST_DIR}/flex/ldbc-sf01-long-date
./bin/bulk_loader -g ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_graph_schema.yaml \
-l ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_bulk_load.yaml -d /tmp/csr-data-dir/
rm -r /tmp/csr-data-dir/runtime/*
rm -r /tmp/csr-data-dir/wal
./tests/hqps/query_test ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_graph_schema.yaml \
${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_bulk_load.yaml \
/tmp/csr-data-dir/
- name: Run codegen test.
Expand Down
8 changes: 7 additions & 1 deletion docs/flex/interactive/data_import.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@

In our guide on [using custom graph data](./custom_graph_data.md), we introduced the basics of importing graph data using a simple YAML configuration. This section delves deeper, providing a thorough exploration of the extensive configuration options available for data import.

## Supported data source

Currently, we only support importing data into the graph from local `csv` files or `odps` tables. See the `loading_config.data_source.scheme` configuration item.

## Sample Configuration for the "Modern" Graph

To illustrate, let's examine the `examples/modern_import_full.yaml` file. This configuration is designed for importing the "modern" graph and showcases the full range of configuration possibilities. We'll dissect each configuration item in the sections that follow.

``` yaml
loading_config:
data_source:
scheme: file
location: /home/modern_graph/
format:
metadata:
Expand Down Expand Up @@ -101,7 +106,8 @@ The table below offers a detailed breakdown of each configuration item. In this
| -------- | -------- | -------- |-------- |
| **loading_config** | N/A | Loading configurations | Yes |
| loading_config.data_source | N/A | Place that maintains the raw data | Yes |
|loading_config.data_source.location | N/A | Path to the data source in the container, which must be mapped from the host machine while intializing the service | Yes
| loading_config.data_source.location | N/A | Path to the data source in the container, which must be mapped from the host machine while initializing the service | Yes |
| loading_config.data_source.scheme | file | The source of input data. Currently only `file` and `odps` are supported | No |
| loading_config.format | N/A | The format of the raw data in CSV | Yes |
| loading_config.format.metadata | N/A | Mainly for configuring the options for reading CSV | Yes |
| loading_config.format.metadata.delimiter | '\|' | Delimiter used to split a row of data | Yes |
Expand Down
15 changes: 14 additions & 1 deletion docs/flex/interactive/development/admin_service.md
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,20 @@ curl -X DELETE -H "Content-Type: application/json" "http://[host]/v1/graph/{grap

#### Description

Start the query service on a graph.
Start the query service on a graph. The `graph_name` parameter can be empty, in which case the service is restarted on the currently running graph.

1. After the AdminService receives this request, the current actor scope for query actors will be cancelled.
2. During the scope cancellation process of the query actors or after scope cancellation is completed, all requests sent to the query_service will fail and be rejected.
The response to such an HTTP request will look like:
```json
{
"code": 500,
"message" : "Unable to send message, the target actor has been canceled!"
}
```
3. After the previous graph is closed and new graph is opened, the new query actors will be available in a new scope.
4. The query service is now ready to serve requests on the new graph.


#### HTTP Request
- **Method**: POST
Expand Down
1 change: 1 addition & 0 deletions docs/flex/interactive/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ GraphScope Interactive boasts several key features:
3. **Future-Ready Expansion Capabilities**: Drawing from the prowess of GraphScope Flex, GraphScope Interactive is primed for adaptability:
* Support for Multiple Query Languages: In the near future, GraphScope Interactive will extend its language support to include [Gremlin](https://tinkerpop.apache.org/gremlin.html), and [GQL](https://www.gqlstandards.org/), further enhancing its versatility.
* Scalability: GraphScope Interactive possesses the potential for distributed processing. This means it can be expanded with little effort to handle larger-scale graphs, ensuring it remains effective as your data grows.
4. **Massive Graph Support**: To enhance system throughput and query performance, we store all graph data in memory by default. However, when encountering graph data too large to fit entirely in memory, we offload excess data to disk storage. This approach, while effectively handling massive-scale graph data, may result in reduced throughput. Moving forward, we aim to adopt a master-slave architecture designed to enhance concurrent querying performance. This will be achieved by distributing the workload across multiple machines, thereby optimizing resource utilization to improve throughput.

## Property Graph Model and Graph Queries

Expand Down
9 changes: 8 additions & 1 deletion docs/interactive_engine/tinkerpop/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,14 @@ For example, in the LDBC schema, we define the property ID as the primary key fo
```scss
g.V().hasLabel('PERSON').has('id', propertyIdValue)
```
Where 'id' is the property ID, and 'propertyIdValue' is the value of the property key. By directly using the primary key index, query performance can be significantly improved, avoiding full table scans and property value filtering, thereby optimizing query performance.
Where 'id' is the property ID, and 'propertyIdValue' is the value of the property key.

Moreover, we support the `within` operator to query multiple values of the same property key, which can also be optimized by the primary key index. For example:
```scss
g.V().hasLabel('PERSON').has('id', within(propertyIdValue1, propertyIdValue2))
```

By using the primary key index directly, queries avoid full table scans and property value filtering, which can significantly improve query performance.

## How to use subgraph in GIE Gremlin ?

Expand Down
20 changes: 19 additions & 1 deletion flex/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ project (
option(BUILD_HQPS "Whether to build HighQPS Engine" ON)
option(BUILD_TEST "Whether to build test" ON)
option(BUILD_DOC "Whether to build doc" ON)
option(USE_MMAPALLOC "Whether to use mmap allocator" OFF)
option(BUILD_ODPS_FRAGMENT_LOADER "Whether to build odps fragment loader" ON)

# Probe the kernel release of the build host with `uname -r` and, when it is
# at least 4.5, compile with -DUSE_COPY_FILE_RANGE.
#
# The output is stripped by execute_process itself and every expansion of the
# variable is quoted, so an empty result (e.g. `uname` missing or failing) no
# longer turns into a "wrong number of arguments" configure error; the
# version comparison below simply evaluates to false in that case.
execute_process(COMMAND uname -r
                OUTPUT_VARIABLE LINUX_KERNEL_VERSION
                OUTPUT_STRIP_TRAILING_WHITESPACE)
message("${LINUX_KERNEL_VERSION}")
if(LINUX_KERNEL_VERSION VERSION_GREATER_EQUAL "4.5")
    message("Use copy file range")
    add_definitions(-DUSE_COPY_FILE_RANGE)
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)

Expand Down Expand Up @@ -101,7 +111,11 @@ if (NOT EXISTS ${CMAKE_SOURCE_DIR}/third_party/nlohmann-json/single_include/nloh
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/third_party/nlohmann-json/single_include/nlohmann/json.hpp not found, "
"please run `git submodule update --init` to download third_party")
endif()
include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/single_include)
include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/nlohmann-json/single_include)

if (BUILD_ODPS_FRAGMENT_LOADER)
include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/odps/include)
endif()

add_subdirectory(utils)
add_subdirectory(codegen)
Expand All @@ -112,6 +126,10 @@ if (BUILD_TEST)
add_subdirectory(tests)
endif()

# When the USE_MMAPALLOC option is ON, add -DUSE_MMAPALLOC to the compile
# definitions and print a notice.
# NOTE(review): this block is placed after the add_subdirectory() calls of
# this file. add_definitions() only propagates to sub-directories that are
# added after it is invoked, so the define may not reach targets declared in
# those sub-directories -- confirm, and consider moving this block above them.
if (USE_MMAPALLOC)
add_definitions(-DUSE_MMAPALLOC)
message("mmap allocator")
endif()

file(GLOB_RECURSE FILES_NEED_LINT
"engines/*.cc"
Expand Down
18 changes: 10 additions & 8 deletions flex/bin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,6 @@ install(TARGETS flex_analytical_engine
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)

add_executable(graph_loader graph_loader.cc)
target_link_libraries(graph_loader flex_rt_mutable_graph flex_graph_db ${GLOG_LIBRARIES})
install(TARGETS graph_loader
RUNTIME DESTINATION bin
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)

if(BUILD_HQPS)
if(Hiactor_FOUND)
add_executable(interactive_server interactive_server.cc)
Expand All @@ -56,4 +49,13 @@ if(BUILD_HQPS)
endif()
# install the script
install(PROGRAMS load_plan_and_gen.sh DESTINATION bin)
endif()
endif()

# bulk_loader: command-line tool built from bulk_loader.cc.
# Boost headers are put on the include path because the tool parses its
# command line with boost::program_options.
include_directories(${Boost_INCLUDE_DIRS})
add_executable(bulk_loader bulk_loader.cc)
# Link against the mutable-graph storage library, the Flex utilities, and the
# logging / flags / Boost libraries.
target_link_libraries(bulk_loader flex_rt_mutable_graph flex_utils ${GLOG_LIBRARIES} ${GFLAGS_LIBRARIES} ${Boost_LIBRARIES})

# Install the executable into <prefix>/bin.
install(TARGETS bulk_loader
RUNTIME DESTINATION bin
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib)
97 changes: 97 additions & 0 deletions flex/bin/bulk_loader.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/** Copyright 2020 Alibaba Group Holding Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <filesystem>
#include <iostream>
#include <system_error>

#include <glog/logging.h>

#include <boost/program_options.hpp>

#include "flex/engines/graph_db/database/graph_db.h"
#include "flex/engines/http_server/options.h"

namespace bpo = boost::program_options;

int main(int argc, char** argv) {
bpo::options_description desc("Usage:");
desc.add_options()("help", "Display help message")(
"version,v", "Display version")("parallelism,p",
bpo::value<uint32_t>()->default_value(1),
"parallelism of bulk loader")(
"data-path,d", bpo::value<std::string>(), "data directory path")(
"graph-config,g", bpo::value<std::string>(), "graph schema config file")(
"bulk-load,l", bpo::value<std::string>(), "bulk-load config file");
google::InitGoogleLogging(argv[0]);
FLAGS_logtostderr = true;

bpo::variables_map vm;
bpo::store(bpo::command_line_parser(argc, argv).options(desc).run(), vm);
bpo::notify(vm);

if (vm.count("help")) {
std::cout << desc << std::endl;
return 0;
}

if (vm.count("version")) {
std::cout << "GraphScope/Flex version " << FLEX_VERSION << std::endl;
return 0;
}

uint32_t parallelism = vm["parallelism"].as<uint32_t>();
std::string data_path = "";
std::string bulk_load_config_path = "";
std::string graph_schema_path = "";

if (!vm.count("graph-config")) {
LOG(ERROR) << "graph-config is required";
return -1;
}
graph_schema_path = vm["graph-config"].as<std::string>();
if (!vm.count("data-path")) {
LOG(ERROR) << "data-path is required";
return -1;
}
data_path = vm["data-path"].as<std::string>();
if (!vm.count("bulk-load")) {
LOG(ERROR) << "bulk-load-config is required";
return -1;
}
bulk_load_config_path = vm["bulk-load"].as<std::string>();

setenv("TZ", "Asia/Shanghai", 1);
tzset();

auto schema = gs::Schema::LoadFromYaml(graph_schema_path);
auto loading_config =
gs::LoadingConfig::ParseFromYamlFile(schema, bulk_load_config_path);

std::filesystem::path data_dir_path(data_path);
if (!std::filesystem::exists(data_dir_path)) {
std::filesystem::create_directory(data_dir_path);
}
std::filesystem::path serial_path = data_dir_path / "schema";
if (std::filesystem::exists(serial_path)) {
LOG(WARNING) << "data directory is not empty";
return 0;
}

auto loader = gs::LoaderFactory::CreateFragmentLoader(
data_dir_path.string(), schema, loading_config, parallelism);
loader->LoadFragment();

return 0;
}
Loading

0 comments on commit b0c9453

Please sign in to comment.