Merge branch 'main' into zsy/vy-0182

alibaba · Dec 12, 2023 · b0c9453 · b0c9453
2 parents 57cd97b + 9454f9b
commit b0c9453
Show file tree

Hide file tree

Showing 128 changed files with 13,657 additions and 2,707 deletions.
diff --git a/.github/workflows/flex.yml b/.github/workflows/flex.yml
@@ -50,21 +50,46 @@ jobs:
         mkdir build && cd build
         cmake .. && sudo make -j$(nproc)
 
-    - name: GRIN on mutable csr test
+    - name: Test GRIN on mutable csr 
       run: |
         git submodule update --init
         cd flex/engines/graph_db/grin
         mkdir build && cd build
         cmake .. && sudo make -j$(nproc)
         export FLEX_DATA_DIR=../../../../interactive/examples/modern_graph/
-        ./run_grin_test 'flex://schema_file=../../../../interactive/examples/modern_graph/modern_graph.yaml&bulk_load_file=../../../../interactive/examples/modern_graph/bulk_load.yaml'
+        ${GITHUB_WORKSPACE}/flex/build/bin/bulk_loader -g ../../../../interactive/examples/modern_graph/modern_graph.yaml -l ../../../../interactive/examples/modern_graph/bulk_load.yaml -d ./data/
+        rm -r ./data/wal
+        rm -r ./data/runtime/*
+        ./run_grin_test 'flex://schema_file=../../../../interactive/examples/modern_graph/modern_graph.yaml&data_dir=./data/'
 
     - name: Prepare test dataset
       env:
         GS_TEST_DIR: ${{ github.workspace }}/gstest/
       run: |
         git clone -b master --single-branch --depth=1 https://github.com/GraphScope/gstest.git ${GS_TEST_DIR}
 
+    - name: Test String edge property on modern graph
+      env:
+        FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/modern_graph/
+      run: |
+        rm -rf /tmp/csr-data-dir/
+        cd ${GITHUB_WORKSPACE}/flex/build/
+        SCHEMA_FILE=../tests/rt_mutable_graph/modern_graph_string_edge.yaml 
+        BULK_LOAD_FILE=../interactive/examples/modern_graph/bulk_load.yaml
+        GLOG_v=10 ./bin/bulk_loader  -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/
+        GLOG_v=10 ./tests/rt_mutable_graph/string_edge_property_test ${SCHEMA_FILE} /tmp/csr-data-dir/
+
+    - name: Test build empty graph
+      run: |
+        rm -rf /tmp/csr-data-dir/
+        cd ${GITHUB_WORKSPACE}/flex/build/
+        GLOG_v=10 ./tests/rt_mutable_graph/test_empty_graph /tmp/csr-data-dir/
+    - name: Test ACID
+      run: |
+        rm -rf /tmp/csr-data-dir/
+        cd ${GITHUB_WORKSPACE}/flex/build/
+        GLOG_v=10 ./tests/rt_mutable_graph/test_acid 8 /tmp/csr-data-dir/
+
     - name: Test Graph Loading on modern graph
       env:
         FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/modern_graph/
@@ -73,7 +98,7 @@ jobs:
         cd ${GITHUB_WORKSPACE}/flex/build/
         SCHEMA_FILE=../interactive/examples/modern_graph/modern_graph.yaml
         BULK_LOAD_FILE=../interactive/examples/modern_graph/bulk_load.yaml
-        GLOG_v=10 ./bin/graph_loader  ${SCHEMA_FILE} ${BULK_LOAD_FILE} /tmp/csr-data-dir/
+        GLOG_v=10 ./bin/bulk_loader  -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/
 
     - name: Test Graph Loading on type_test graph
       env:
@@ -86,7 +111,7 @@ jobs:
         cd ${GITHUB_WORKSPACE}/flex/build/
         SCHEMA_FILE=${GS_TEST_DIR}/type_test/graph.yaml
         BULK_LOAD_FILE=${GS_TEST_DIR}/type_test/import.yaml
-        GLOG_v=10 ./bin/graph_loader  ${SCHEMA_FILE} ${BULK_LOAD_FILE} /tmp/csr-data-dir/ 2
+        GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2
 
     - name: Test Graph Loading on LDBC SNB sf0.1
       env:
@@ -99,4 +124,4 @@ jobs:
         cd ${GITHUB_WORKSPACE}/flex/build/
         SCHEMA_FILE=${FLEX_DATA_DIR}/audit_graph_schema.yaml
         BULK_LOAD_FILE=${FLEX_DATA_DIR}/audit_bulk_load.yaml
-        GLOG_v=10 ./bin/graph_loader  ${SCHEMA_FILE} ${BULK_LOAD_FILE} /tmp/csr-data-dir/ 2
+        GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2
diff --git a/.github/workflows/hqps-db-ci.yml b/.github/workflows/hqps-db-ci.yml
@@ -102,9 +102,9 @@ jobs:
         # load graph
         cd ${GITHUB_WORKSPACE}/flex/build
         export FLEX_DATA_DIR=${GS_TEST_DIR}/flex/ldbc-sf01-long-date
-        GLOG_v=10 ./bin/graph_loader ${INTERACTIVE_WORKSPACE}/data/ldbc/graph.yaml ${INTERACTIVE_WORKSPACE}/data/ldbc/import.yaml ${INTERACTIVE_WORKSPACE}/data/ldbc/indices/
+        GLOG_v=10 ./bin/bulk_loader -g ${INTERACTIVE_WORKSPACE}/data/ldbc/graph.yaml -l ${INTERACTIVE_WORKSPACE}/data/ldbc/import.yaml -d ${INTERACTIVE_WORKSPACE}/data/ldbc/indices/
         export FLEX_DATA_DIR=../interactive/examples/movies
-        GLOG_v=10 ./bin/graph_loader ${INTERACTIVE_WORKSPACE}/data/movies/graph.yaml ${INTERACTIVE_WORKSPACE}/data/movies/import.yaml ${INTERACTIVE_WORKSPACE}/data/movies/indices/
+        GLOG_v=10 ./bin/bulk_loader -g ${INTERACTIVE_WORKSPACE}/data/movies/graph.yaml -l ${INTERACTIVE_WORKSPACE}/data/movies/import.yaml -d ${INTERACTIVE_WORKSPACE}/data/movies/indices/
 
     - name: Test HQPS admin http service
       env: 
@@ -120,8 +120,11 @@ jobs:
       run: |
         cd ${GITHUB_WORKSPACE}/flex/build
         export FLEX_DATA_DIR=${GS_TEST_DIR}/flex/ldbc-sf01-long-date
+        ./bin/bulk_loader -g ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_graph_schema.yaml \
+        -l ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_bulk_load.yaml -d /tmp/csr-data-dir/
+        rm -r /tmp/csr-data-dir/runtime/* 
+        rm -r /tmp/csr-data-dir/wal
         ./tests/hqps/query_test ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_graph_schema.yaml \
-        ${GS_TEST_DIR}/flex/ldbc-sf01-long-date/audit_bulk_load.yaml \
         /tmp/csr-data-dir/
 
     - name: Run codegen test.

diff --git a/docs/flex/interactive/data_import.md b/docs/flex/interactive/data_import.md
@@ -2,13 +2,18 @@
 
 In our guide on [using custom graph data](./custom_graph_data.md), we introduced the basics of importing graph data using a simple YAML configuration. This section delves deeper, providing a thorough exploration of the extensive configuration options available for data import.
 
+## Supported data source
+
+Currently, we only support importing data into the graph from local `csv` files or `odps` tables. See the configuration item `loading_config.data_source.scheme`.
+
 ## Sample Configuration for the "Modern" Graph
 
 To illustrate, let's examine the `examples/modern_import_full.yaml` file. This configuration is designed for importing the "modern" graph and showcases the full range of configuration possibilities. We'll dissect each configuration item in the sections that follow.
 
 ``` yaml
 loading_config: 
   data_source:
+    scheme: file
     location: /home/modern_graph/ 
   format: 
     metadata: 
@@ -101,7 +106,8 @@ The table below offers a detailed breakdown of each configuration item. In this
 | -------- | -------- | -------- |-------- |
 | **loading_config**    | N/A     | Loading configurations     |  Yes   |
 | loading_config.data_source    | N/A     | Place that maintains the raw data     |  Yes   |
-|loading_config.data_source.location |	N/A | Path to the data source in the container, which must be mapped from the host machine while intializing the service |	Yes
+| loading_config.data_source.location | N/A | Path to the data source in the container, which must be mapped from the host machine while initializing the service | Yes |
+| loading_config.data_source.scheme | file | The source of input data. Currently only `file` and `odps` are supported | No |
 | loading_config.format    | N/A     | The format of the raw data in CSV    |  Yes   |
 | loading_config.format.metadata    | N/A    | Mainly for configuring the options for reading CSV   |  Yes   |
 | loading_config.format.metadata.delimiter | '\|' | Delimiter used to split a row of data | Yes | 

diff --git a/docs/flex/interactive/development/admin_service.md b/docs/flex/interactive/development/admin_service.md
@@ -942,7 +942,20 @@ curl -X DELETE -H "Content-Type: application/json" "http://[host]/v1/graph/{grap
 
 #### Description 
 
-Start the query service on a graph.
+Start the query service on a graph. The `graph_name` parameter may be empty, in which case the service is restarted on the currently running graph.
+
+1. After the AdminService receives this request, the current actor scope for query actors will be cancelled.
+2. During the scope cancellation process of the query actors or after scope cancellation is completed, all requests sent to the query_service will fail and be rejected. 
+The HTTP response will look like this:
+```json
+{
+  "code": 500,
+  "message" : "Unable to send message, the target actor has been canceled!"
+}
+```
+3. After the previous graph is closed and the new graph is opened, the new query actors will be available in a new scope. 
+4. The query service is now ready to serve requests on the new graph.
+
 
 #### HTTP Request
 - **Method**: POST

diff --git a/docs/flex/interactive/overview.md b/docs/flex/interactive/overview.md
@@ -27,6 +27,7 @@ GraphScope Interactive boasts several key features:
 3. **Future-Ready Expansion Capabilities**: Drawing from the prowess of GraphScope Flex, GraphScope Interactive is primed for adaptability:
     * Support for Multiple Query Languages: In the near future, GraphScope Interactive will extend its language support to include [Gremlin](https://tinkerpop.apache.org/gremlin.html), and [GQL](https://www.gqlstandards.org/), further enhancing its versatility.
     * Scalability: GraphScope Interactive possesses the potential for distributed processing. This means it can be expanded with few effort to handle larger-scale graphs, ensuring it remains effective as your data grows.
+4. **Massive Graph Support**: To enhance system throughput and query performance, we store all graph data in memory by default. However, when encountering graph data too large to fit entirely in memory, we offload excess data to disk storage. This approach, while effectively handling massive-scale graph data, may result in reduced throughput. Moving forward, we aim to adopt a master-slave architecture designed to enhance concurrent querying performance. This will be achieved by distributing the workload across multiple machines, thereby optimizing resource utilization to improve throughput. 
 
 ## Property Graph Model and Graph Queries
 

diff --git a/docs/interactive_engine/tinkerpop/faq.md b/docs/interactive_engine/tinkerpop/faq.md
@@ -131,7 +131,14 @@ For example, in the LDBC schema, we define the property ID as the primary key fo
 ```scss
 g.V().hasLabel('PERSON').has('id', propertyIdValue)
 ```
-Where 'id' is the property ID, and 'propertyIdValue' is the value of the property key. By directly using the primary key index, query performance can be significantly improved, avoiding full table scans and property value filtering, thereby optimizing query performance.
+Where 'id' is the property ID, and 'propertyIdValue' is the value of the property key. 
+
+Moreover, we support the `within` operator to query multiple values of the same property key, which can also be optimized by the primary key index. For example:
+```scss
+g.V().hasLabel('PERSON').has('id', within(propertyIdValue1, propertyIdValue2))
+```
+
+By directly using the primary key index, query performance can be significantly improved, avoiding full table scans and property value filtering, thereby optimizing query performance.
 
 ## How to use subgraph in GIE Gremlin ?
 

diff --git a/flex/CMakeLists.txt b/flex/CMakeLists.txt
@@ -13,6 +13,16 @@ project (
 option(BUILD_HQPS "Whether to build HighQPS Engine" ON)
 option(BUILD_TEST "Whether to build test" ON)
 option(BUILD_DOC "Whether to build doc" ON)
+option(USE_MMAPALLOC "Whether to use mmap allocator" OFF)
+option(BUILD_ODPS_FRAGMENT_LOADER "Whether to build odps fragment loader" ON)
+
+execute_process(COMMAND uname -r OUTPUT_VARIABLE LINUX_KERNEL_VERSION)
+string(STRIP ${LINUX_KERNEL_VERSION} LINUX_KERNEL_VERSION)
+message(${LINUX_KERNEL_VERSION})
+if(LINUX_KERNEL_VERSION VERSION_GREATER_EQUAL "4.5")
+    message("Use copy file range")
+    add_definitions(-DUSE_COPY_FILE_RANGE)
+endif()
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
 
@@ -101,7 +111,11 @@ if (NOT EXISTS ${CMAKE_SOURCE_DIR}/third_party/nlohmann-json/single_include/nloh
     message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/third_party/nlohmann-json/single_include/nlohmann/json.hpp not found, "
                          "please run `git submodule update --init` to download third_party")
 endif()
-include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/single_include)
+include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/nlohmann-json/single_include)
+
+if (BUILD_ODPS_FRAGMENT_LOADER)
+    include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/third_party/odps/include)
+endif()
 
 add_subdirectory(utils)
 add_subdirectory(codegen)
@@ -112,6 +126,10 @@ if (BUILD_TEST)
    add_subdirectory(tests)
 endif()
 
+if (USE_MMAPALLOC)
+    add_definitions(-DUSE_MMAPALLOC)
+    message("mmap allocator")
+endif()
 
 file(GLOB_RECURSE FILES_NEED_LINT
         "engines/*.cc"

diff --git a/flex/bin/CMakeLists.txt b/flex/bin/CMakeLists.txt
@@ -37,13 +37,6 @@ install(TARGETS flex_analytical_engine
         ARCHIVE DESTINATION lib
         LIBRARY DESTINATION lib)
 
-add_executable(graph_loader graph_loader.cc)
-target_link_libraries(graph_loader flex_rt_mutable_graph flex_graph_db ${GLOG_LIBRARIES})
-install(TARGETS graph_loader
-        RUNTIME DESTINATION bin
-        ARCHIVE DESTINATION lib
-        LIBRARY DESTINATION lib)
-
 if(BUILD_HQPS)
         if(Hiactor_FOUND)
                 add_executable(interactive_server interactive_server.cc)
@@ -56,4 +49,13 @@ if(BUILD_HQPS)
         endif()
         # install the script
         install(PROGRAMS load_plan_and_gen.sh DESTINATION bin)
-endif()
+endif()
+
+include_directories(${Boost_INCLUDE_DIRS})
+add_executable(bulk_loader bulk_loader.cc)
+target_link_libraries(bulk_loader flex_rt_mutable_graph flex_utils ${GLOG_LIBRARIES} ${GFLAGS_LIBRARIES} ${Boost_LIBRARIES})
+
+install(TARGETS bulk_loader
+        RUNTIME DESTINATION bin
+        ARCHIVE DESTINATION lib
+        LIBRARY DESTINATION lib)
diff --git a/flex/bin/bulk_loader.cc b/flex/bin/bulk_loader.cc
@@ -0,0 +1,97 @@
+/** Copyright 2020 Alibaba Group Holding Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <filesystem>
+#include <iostream>
+
+#include <glog/logging.h>
+
+#include <boost/program_options.hpp>
+
+#include "flex/engines/graph_db/database/graph_db.h"
+#include "flex/engines/http_server/options.h"
+
+namespace bpo = boost::program_options;
+
+/**
+ * Entry point of the bulk loader command-line tool.
+ *
+ * Parses the command line, loads the graph schema (-g) and the bulk-load
+ * configuration (-l), and then runs the fragment loader created by
+ * gs::LoaderFactory against the data directory (-d) with the requested
+ * parallelism (-p, default 1).
+ *
+ * @param argc Number of command-line arguments.
+ * @param argv Command-line arguments; argv[0] is used to initialize glog.
+ * @return 0 on success, after --help/--version, or when the data directory
+ *         already contains a "schema" entry (nothing is loaded in that case);
+ *         -1 when the command line cannot be parsed or a required option is
+ *         missing.
+ */
+int main(int argc, char** argv) {
+  bpo::options_description desc("Usage:");
+  desc.add_options()("help", "Display help message")(
+      "version,v", "Display version")("parallelism,p",
+                                      bpo::value<uint32_t>()->default_value(1),
+                                      "parallelism of bulk loader")(
+      "data-path,d", bpo::value<std::string>(), "data directory path")(
+      "graph-config,g", bpo::value<std::string>(), "graph schema config file")(
+      "bulk-load,l", bpo::value<std::string>(), "bulk-load config file");
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_logtostderr = true;
+
+  bpo::variables_map vm;
+  try {
+    bpo::store(bpo::command_line_parser(argc, argv).options(desc).run(), vm);
+    bpo::notify(vm);
+  } catch (const bpo::error& e) {
+    // Unknown options or malformed values (e.g. a non-numeric -p) throw;
+    // report them with the usage text instead of terminating the process.
+    LOG(ERROR) << "Failed to parse command line: " << e.what();
+    std::cerr << desc << std::endl;
+    return -1;
+  }
+
+  if (vm.count("help")) {
+    std::cout << desc << std::endl;
+    return 0;
+  }
+
+  if (vm.count("version")) {
+    std::cout << "GraphScope/Flex version " << FLEX_VERSION << std::endl;
+    return 0;
+  }
+
+  // Options without a default value must be supplied explicitly. They are
+  // checked in a fixed order so the first missing one is the one reported.
+  for (const char* required : {"graph-config", "data-path", "bulk-load"}) {
+    if (!vm.count(required)) {
+      LOG(ERROR) << required << " is required";
+      return -1;
+    }
+  }
+
+  const uint32_t parallelism = vm["parallelism"].as<uint32_t>();
+  const std::string graph_schema_path = vm["graph-config"].as<std::string>();
+  const std::string data_path = vm["data-path"].as<std::string>();
+  const std::string bulk_load_config_path = vm["bulk-load"].as<std::string>();
+
+  // NOTE(review): the time zone is pinned for the whole process; presumably
+  // this keeps date/time handling during loading independent of the host
+  // setting - confirm against the loaders.
+  setenv("TZ", "Asia/Shanghai", 1);
+  tzset();
+
+  auto schema = gs::Schema::LoadFromYaml(graph_schema_path);
+  auto loading_config =
+      gs::LoadingConfig::ParseFromYamlFile(schema, bulk_load_config_path);
+
+  const std::filesystem::path data_dir_path(data_path);
+  if (!std::filesystem::exists(data_dir_path)) {
+    // create_directories also creates missing parent directories, so a
+    // nested data path works on a fresh machine.
+    std::filesystem::create_directories(data_dir_path);
+  }
+  // An existing "schema" entry means the directory already holds a graph;
+  // leave it untouched rather than loading over it.
+  const std::filesystem::path serial_path = data_dir_path / "schema";
+  if (std::filesystem::exists(serial_path)) {
+    LOG(WARNING) << "data directory is not empty";
+    return 0;
+  }
+
+  auto loader = gs::LoaderFactory::CreateFragmentLoader(
+      data_dir_path.string(), schema, loading_config, parallelism);
+  loader->LoadFragment();
+
+  return 0;
+}