From 7c1dfe7e72ff3f62432face6db26ff4ee88e91cd Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Tue, 25 Jun 2024 20:36:09 +0800 Subject: [PATCH] refactor(interactive): Support timezone info for timestamp type. (#3975) In this PR, we expand our `timestamp` format parsing to include timezone information such as `2010-02-14T15:32:10.447+00:00`. `out_zone_offset_present` are not set as we don't expect timezone info without giving a timezone in type definition. Otherwise will cause parsing failure. https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373 https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/type.h#L1635 --- .github/workflows/interactive.yml | 11 ++ .../examples/movies/FOLLOWS_TIMESTAMP.csv | 9 ++ .../rt_mutable_graph/movie_import_test.yaml | 122 ++++++++++++++++++ .../rt_mutable_graph/movie_schema_test.yaml | 110 ++++++++++++++++ flex/utils/arrow_utils.h | 63 ++++++++- 5 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv create mode 100644 flex/tests/rt_mutable_graph/movie_import_test.yaml create mode 100644 flex/tests/rt_mutable_graph/movie_schema_test.yaml diff --git a/.github/workflows/interactive.yml b/.github/workflows/interactive.yml index 75b69039ed8f..4e5785b30484 100644 --- a/.github/workflows/interactive.yml +++ b/.github/workflows/interactive.yml @@ -512,3 +512,14 @@ jobs: SCHEMA_FILE=${FLEX_DATA_DIR}/audit_graph_schema.yaml BULK_LOAD_FILE=${FLEX_DATA_DIR}/audit_bulk_load.yaml GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2 + + - name: Test graph load on movie graph + env: + GS_TEST_DIR: ${{ github.workspace }}/gstest/ + FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/movies/ + run: | + rm -rf /tmp/csr-data-dir/ + cd ${GITHUB_WORKSPACE}/flex/build/ + SCHEMA_FILE=${GITHUB_WORKSPACE}/flex/tests/rt_mutable_graph/movie_schema_test.yaml + BULK_LOAD_FILE=${GITHUB_WORKSPACE}/flex/tests/rt_mutable_graph/movie_import_test.yaml + GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ \ No newline at end of file diff --git a/flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv b/flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv new file mode 100644 index 000000000000..b2c218731482 --- /dev/null +++ b/flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv @@ -0,0 +1,9 @@ +start|end|timestamp +169|71|2010-02-14T15:32:10.447+00:00 +171|71|2011-12-27T20:11:20.921+00:00 +172|71|2011-12-27T20:11:20.922+00:00 +173|71|2011-12-27T20:11:20.923+00:00 +174|71|2011-12-27T20:11:20.924+08:00 +169|16|2011-12-27T20:11:20.925+08:00 +172|16|2011-12-27T20:11:20.926+08:00 +174|16|2011-12-27T20:11:20.927+08:00 diff --git a/flex/tests/rt_mutable_graph/movie_import_test.yaml b/flex/tests/rt_mutable_graph/movie_import_test.yaml new file mode 100644 index 000000000000..f9d71340372c --- /dev/null +++ b/flex/tests/rt_mutable_graph/movie_import_test.yaml @@ -0,0 +1,122 @@ +graph: movies +loading_config: + data_source: + scheme: file # file, oss, s3, hdfs; only file is supported now + import_option: init # append, overwrite, only init is supported now + format: + type: csv + metadata: + delimiter: "|" # other loading configuration places here + header_row: true # whether to use the first row as the header + quoting: false + quote_char: '"' + double_quote: true + escape_char: '\' + escaping: false + block_size: 4MB + batch_reader: false +vertex_mappings: + - type_name: Person # must align with the schema + inputs: + - Person.csv + - type_name: User + inputs: + - User.csv + - type_name: Movie + inputs: + - Movie.csv +edge_mappings: + - type_triplet: + edge: ACTED_IN + source_vertex: Person + destination_vertex: Movie + source_vertex_mappings: + - column: + index: 0 + name: id + destination_vertex_mappings: + - column: + index: 1 + name: id + inputs: + - ACTED_IN.csv + - type_triplet: + edge: DIRECTED + source_vertex: Person + destination_vertex: Movie + source_vertex_mappings: + - column: + index: 0 + name: id + destination_vertex_mappings: + - column: + index: 1 + name: id + inputs: + - DIRECTED.csv + - type_triplet: + edge: FOLLOWS + source_vertex: User + destination_vertex: Person + source_vertex_mappings: + - column: + index: 0 + name: id + destination_vertex_mappings: + - column: + index: 1 + name: id + column_mappings: + - column: + index: 2 + name: timestamp + property: timestamp + inputs: + - FOLLOWS_TIMESTAMP.csv + - type_triplet: + edge: PRODUCED + source_vertex: Person + destination_vertex: Movie + source_vertex_mappings: + - column: + index: 0 + name: id + destination_vertex_mappings: + - column: + index: 1 + name: id + inputs: + - PRODUCED.csv + - type_triplet: + edge: REVIEW + source_vertex: User + destination_vertex: Movie + source_vertex_mappings: + - column: + index: 0 + name: id + destination_vertex_mappings: + - column: + index: 1 + name: id + column_mappings: + - column: + index: 2 + name: rating + property: rating + inputs: + - REVIEWED.csv + - type_triplet: + edge: WROTE + source_vertex: Person + destination_vertex: Movie + source_vertex_mappings: + - column: + index: 0 + name: id + destination_vertex_mappings: + - column: + index: 1 + name: id + inputs: + - WROTE.csv diff --git a/flex/tests/rt_mutable_graph/movie_schema_test.yaml b/flex/tests/rt_mutable_graph/movie_schema_test.yaml new file mode 100644 index 000000000000..9c042f18983f --- /dev/null +++ b/flex/tests/rt_mutable_graph/movie_schema_test.yaml @@ -0,0 +1,110 @@ +name: movies +schema: + vertex_types: + - type_id: 0 + type_name: Movie + properties: + - property_id: 0 + property_name: id + property_type: + primitive_type: DT_SIGNED_INT64 + - property_id: 1 + property_name: released + property_type: + primitive_type: DT_SIGNED_INT32 + - property_id: 2 + property_name: tagline + property_type: + string: + long_text: + - property_id: 3 + property_name: title + property_type: + string: + long_text: + primary_keys: + - id + - type_id: 1 + type_name: Person + properties: + - property_id: 0 + property_name: id + property_type: + primitive_type: DT_SIGNED_INT64 + - property_id: 1 + property_name: born + property_type: + primitive_type: DT_SIGNED_INT32 + - property_id: 2 + property_name: name + property_type: + string: + long_text: + primary_keys: + - id + - type_id: 2 + type_name: User + properties: + - property_id: 0 + property_name: id + property_type: + primitive_type: DT_SIGNED_INT64 + - property_id: 1 + property_name: born + property_type: + primitive_type: DT_SIGNED_INT32 + - property_id: 2 + property_name: name + property_type: + string: + long_text: + primary_keys: + - id + edge_types: + - type_id: 0 + type_name: ACTED_IN + vertex_type_pair_relations: + - source_vertex: Person + destination_vertex: Movie + relation: MANY_TO_MANY + - type_id: 1 + type_name: DIRECTED + vertex_type_pair_relations: + - source_vertex: Person + destination_vertex: Movie + relation: MANY_TO_MANY + - type_id: 2 + type_name: REVIEW + vertex_type_pair_relations: + - source_vertex: User + destination_vertex: Movie + relation: MANY_TO_MANY + properties: + - property_id: 0 + property_name: rating + property_type: + primitive_type: DT_SIGNED_INT32 + - type_id: 3 + type_name: FOLLOWS + vertex_type_pair_relations: + - source_vertex: User + destination_vertex: Person + relation: MANY_TO_MANY + properties: + - property_id: 0 + property_name: timestamp + property_type: + temporal: + timestamp: + - type_id: 4 + type_name: WROTE + vertex_type_pair_relations: + - source_vertex: Person + destination_vertex: Movie + relation: MANY_TO_MANY + - type_id: 5 + type_name: PRODUCED + vertex_type_pair_relations: + - source_vertex: Person + destination_vertex: Movie + relation: MANY_TO_MANY diff --git a/flex/utils/arrow_utils.h b/flex/utils/arrow_utils.h index e6436e7cad06..ea9b6941854c 100644 --- a/flex/utils/arrow_utils.h +++ b/flex/utils/arrow_utils.h @@ -36,6 +36,11 @@ class LDBCTimeStampParser : public arrow::TimestampParser { bool* out_zone_offset_present = NULLPTR) const override { using seconds_type = std::chrono::duration; + // We allow the following zone offset formats: + // - (none) + // - Z + // - [+-]HH(:?MM)? + // // We allow the following formats for all units: // - "YYYY-MM-DD" // - "YYYY-MM-DD[ T]hhZ?" @@ -79,14 +84,61 @@ class LDBCTimeStampParser : public arrow::TimestampParser { return false; } + // In the implementation of arrow ISO8601 timestamp parser, the zone offset + // is set to true if the input string contains a zone offset. However, we + // parse the zone offset here but don't set the boolean flag. + // https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373 + // The reason is that, if we want the zone offset to be set, we need to + // to declare the zone offset in the schema and construct TimeStampType with + // that offset. However, we just want to parse the timestamp string and + // convert it to a timestamp value, we have no assumption of the local time + // zone, and we don't require the zone offset to be set in the schema. + // Same for following commented code. + //------------------------------------------------------------------------- + // if (out_zone_offset_present) { + // *out_zone_offset_present = false; + // } + //------------------------------------------------------------------------- + + seconds_type zone_offset(0); if (s[length - 1] == 'Z') { --length; - } - - // if the back the s is '+0000', we should ignore it - auto time_zones = std::string_view(s + length - 5, 5); - if (time_zones == "+0000") { + // if (out_zone_offset_present) + // *out_zone_offset_present = true; + } else if (s[length - 3] == '+' || s[length - 3] == '-') { + // [+-]HH + length -= 3; + if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH( + s + length + 1, &zone_offset))) { + return false; + } + if (s[length] == '+') + zone_offset *= -1; + // if (out_zone_offset_present) + // *out_zone_offset_present = true; + } else if (s[length - 5] == '+' || s[length - 5] == '-') { + // [+-]HHMM length -= 5; + if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHHMM( + s + length + 1, &zone_offset))) { + return false; + } + if (s[length] == '+') + zone_offset *= -1; + // if (out_zone_offset_present) + // *out_zone_offset_present = true; + } else if ((s[length - 6] == '+' || s[length - 6] == '-') && + (s[length - 3] == ':')) { + // [+-]HH:MM + length -= 6; + if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM( + s + length + 1, &zone_offset))) { + return false; + } + if (s[length] == '+') + zone_offset *= -1; + // if (out_zone_offset_present) + // *out_zone_offset_present = true; } seconds_type seconds_since_midnight; @@ -124,6 +176,7 @@ class LDBCTimeStampParser : public arrow::TimestampParser { } seconds_since_epoch += seconds_since_midnight; + seconds_since_epoch += zone_offset; if (length <= 19) { *out =