refactor(interactive): Support timezone info for timestamp type. (#3975)

In this PR, we expand our `timestamp` format parsing to include timezone information such as `2010-02-14T15:32:10.447+00:00`. `out_zone_offset_present` are not set as we don't expect timezone info without giving a timezone in type definition. Otherwise will cause parsing failure. https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373 https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/type.h#L1635
alibaba · Jun 25, 2024 · 7c1dfe7 · 7c1dfe7
1 parent ff04409
commit 7c1dfe7
Show file tree

Hide file tree

Showing 5 changed files with 310 additions and 5 deletions.
diff --git a/.github/workflows/interactive.yml b/.github/workflows/interactive.yml
@@ -512,3 +512,14 @@ jobs:
         SCHEMA_FILE=${FLEX_DATA_DIR}/audit_graph_schema.yaml
         BULK_LOAD_FILE=${FLEX_DATA_DIR}/audit_bulk_load.yaml
         GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2
+
+    - name: Test graph load on movie graph
+      env:
+        GS_TEST_DIR: ${{ github.workspace }}/gstest/
+        FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/movies/
+      run: |
+        rm -rf /tmp/csr-data-dir/
+        cd ${GITHUB_WORKSPACE}/flex/build/
+        SCHEMA_FILE=${GITHUB_WORKSPACE}/flex/tests/rt_mutable_graph/movie_schema_test.yaml
+        BULK_LOAD_FILE=${GITHUB_WORKSPACE}/flex/tests/rt_mutable_graph/movie_import_test.yaml
+        GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/
diff --git a/flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv b/flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv
@@ -0,0 +1,9 @@
+start|end|timestamp
+169|71|2010-02-14T15:32:10.447+00:00
+171|71|2011-12-27T20:11:20.921+00:00
+172|71|2011-12-27T20:11:20.922+00:00
+173|71|2011-12-27T20:11:20.923+00:00
+174|71|2011-12-27T20:11:20.924+08:00
+169|16|2011-12-27T20:11:20.925+08:00
+172|16|2011-12-27T20:11:20.926+08:00
+174|16|2011-12-27T20:11:20.927+08:00
diff --git a/flex/tests/rt_mutable_graph/movie_import_test.yaml b/flex/tests/rt_mutable_graph/movie_import_test.yaml
@@ -0,0 +1,122 @@
+graph: movies
+loading_config:
+  data_source:
+    scheme: file  # file, oss, s3, hdfs; only file is supported now
+  import_option: init # append, overwrite, only init is supported now
+  format:
+    type: csv
+    metadata:
+      delimiter: "|"  # other loading configuration places here
+      header_row: true # whether to use the first row as the header
+      quoting: false
+      quote_char: '"'
+      double_quote: true
+      escape_char: '\'
+      escaping: false
+      block_size: 4MB
+      batch_reader: false
+vertex_mappings:
+  - type_name: Person  # must align with the schema
+    inputs:
+      - Person.csv
+  - type_name: User
+    inputs:
+      - User.csv
+  - type_name: Movie
+    inputs:
+      - Movie.csv
+edge_mappings:
+  - type_triplet:
+      edge: ACTED_IN
+      source_vertex:  Person
+      destination_vertex:  Movie
+    source_vertex_mappings:
+      - column:
+          index: 0
+          name: id
+    destination_vertex_mappings:
+      - column:
+          index: 1
+          name: id
+    inputs:
+      - ACTED_IN.csv
+  - type_triplet:
+      edge: DIRECTED
+      source_vertex:  Person
+      destination_vertex:  Movie
+    source_vertex_mappings:
+      - column:
+          index: 0
+          name: id
+    destination_vertex_mappings:
+      - column:
+          index: 1
+          name: id
+    inputs:
+      - DIRECTED.csv
+  - type_triplet:
+      edge: FOLLOWS
+      source_vertex:  User
+      destination_vertex:  Person
+    source_vertex_mappings:
+      - column:
+          index: 0
+          name: id
+    destination_vertex_mappings:
+      - column:
+          index: 1
+          name: id
+    column_mappings:
+      - column:
+          index: 2
+          name: timestamp
+        property: timestamp
+    inputs:
+      - FOLLOWS_TIMESTAMP.csv
+  - type_triplet:
+      edge: PRODUCED
+      source_vertex:  Person
+      destination_vertex:  Movie
+    source_vertex_mappings:
+      - column:
+          index: 0
+          name: id
+    destination_vertex_mappings:
+      - column:
+          index: 1
+          name: id
+    inputs:
+      - PRODUCED.csv
+  - type_triplet:
+      edge: REVIEW
+      source_vertex:  User
+      destination_vertex:  Movie
+    source_vertex_mappings:
+      - column:
+          index: 0
+          name: id
+    destination_vertex_mappings:
+      - column:
+          index: 1
+          name: id
+    column_mappings:
+      - column:
+          index: 2
+          name: rating
+        property: rating
+    inputs:
+      - REVIEWED.csv
+  - type_triplet:
+      edge: WROTE
+      source_vertex:  Person
+      destination_vertex:  Movie
+    source_vertex_mappings:
+      - column:
+          index: 0
+          name: id
+    destination_vertex_mappings:
+      - column:
+          index: 1
+          name: id
+    inputs:
+      - WROTE.csv
diff --git a/flex/tests/rt_mutable_graph/movie_schema_test.yaml b/flex/tests/rt_mutable_graph/movie_schema_test.yaml
@@ -0,0 +1,110 @@
+name: movies
+schema:
+  vertex_types:
+    - type_id: 0
+      type_name: Movie
+      properties:
+        - property_id: 0
+          property_name: id
+          property_type:
+            primitive_type: DT_SIGNED_INT64
+        - property_id: 1
+          property_name: released
+          property_type:
+            primitive_type: DT_SIGNED_INT32
+        - property_id: 2
+          property_name: tagline
+          property_type:
+            string:
+              long_text:
+        - property_id: 3
+          property_name: title
+          property_type:
+            string:
+              long_text:
+      primary_keys:
+        - id
+    - type_id: 1
+      type_name: Person
+      properties:
+        - property_id: 0
+          property_name: id
+          property_type:
+            primitive_type: DT_SIGNED_INT64
+        - property_id: 1
+          property_name: born
+          property_type:
+            primitive_type: DT_SIGNED_INT32
+        - property_id: 2
+          property_name: name
+          property_type:
+            string:
+              long_text:
+      primary_keys:
+        - id
+    - type_id: 2
+      type_name: User
+      properties:
+        - property_id: 0
+          property_name: id
+          property_type:
+            primitive_type: DT_SIGNED_INT64
+        - property_id: 1
+          property_name: born
+          property_type:
+            primitive_type: DT_SIGNED_INT32
+        - property_id: 2
+          property_name: name
+          property_type:
+            string:
+              long_text:
+      primary_keys:
+        - id
+  edge_types:
+    - type_id: 0
+      type_name: ACTED_IN
+      vertex_type_pair_relations:
+        - source_vertex: Person
+          destination_vertex: Movie
+          relation: MANY_TO_MANY
+    - type_id: 1
+      type_name: DIRECTED
+      vertex_type_pair_relations:
+        - source_vertex: Person
+          destination_vertex: Movie
+          relation: MANY_TO_MANY
+    - type_id: 2
+      type_name: REVIEW
+      vertex_type_pair_relations:
+        - source_vertex: User
+          destination_vertex: Movie
+          relation: MANY_TO_MANY
+      properties:
+        - property_id: 0
+          property_name: rating
+          property_type:
+            primitive_type: DT_SIGNED_INT32
+    - type_id: 3
+      type_name: FOLLOWS
+      vertex_type_pair_relations:
+        - source_vertex: User
+          destination_vertex: Person
+          relation: MANY_TO_MANY
+      properties:
+        - property_id: 0
+          property_name: timestamp
+          property_type:
+            temporal: 
+              timestamp: 
+    - type_id: 4
+      type_name: WROTE
+      vertex_type_pair_relations:
+        - source_vertex: Person
+          destination_vertex: Movie
+          relation: MANY_TO_MANY
+    - type_id: 5
+      type_name: PRODUCED
+      vertex_type_pair_relations:
+        - source_vertex: Person
+          destination_vertex: Movie
+          relation: MANY_TO_MANY
diff --git a/flex/utils/arrow_utils.h b/flex/utils/arrow_utils.h
@@ -36,6 +36,11 @@ class LDBCTimeStampParser : public arrow::TimestampParser {
                   bool* out_zone_offset_present = NULLPTR) const override {
     using seconds_type = std::chrono::duration<arrow::TimestampType::c_type>;
 
+    // We allow the following zone offset formats:
+    // - (none)
+    // - Z
+    // - [+-]HH(:?MM)?
+    //
     // We allow the following formats for all units:
     // - "YYYY-MM-DD"
     // - "YYYY-MM-DD[ T]hhZ?"
@@ -79,14 +84,61 @@ class LDBCTimeStampParser : public arrow::TimestampParser {
       return false;
     }
 
+    // In the implementation of arrow ISO8601 timestamp parser, the zone offset
+    // is set to true if the input string contains a zone offset. However, we
+    // parse the zone offset here but don't set the boolean flag.
+    // https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373
+    // The reason is that, if we want the zone offset to be set, we need to
+    // to declare the zone offset in the schema and construct TimeStampType with
+    // that offset. However, we just want to parse the timestamp string and
+    // convert it to a timestamp value, we have no assumption of the local time
+    // zone, and we don't require the zone offset to be set in the schema.
+    // Same for following commented code.
+    //-------------------------------------------------------------------------
+    // if (out_zone_offset_present) {
+    //   *out_zone_offset_present = false;
+    // }
+    //-------------------------------------------------------------------------
+
+    seconds_type zone_offset(0);
     if (s[length - 1] == 'Z') {
       --length;
-    }
-
-    // if the back the s is '+0000', we should ignore it
-    auto time_zones = std::string_view(s + length - 5, 5);
-    if (time_zones == "+0000") {
+      // if (out_zone_offset_present)
+      //   *out_zone_offset_present = true;
+    } else if (s[length - 3] == '+' || s[length - 3] == '-') {
+      // [+-]HH
+      length -= 3;
+      if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH(
+              s + length + 1, &zone_offset))) {
+        return false;
+      }
+      if (s[length] == '+')
+        zone_offset *= -1;
+      // if (out_zone_offset_present)
+      //   *out_zone_offset_present = true;
+    } else if (s[length - 5] == '+' || s[length - 5] == '-') {
+      // [+-]HHMM
       length -= 5;
+      if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHHMM(
+              s + length + 1, &zone_offset))) {
+        return false;
+      }
+      if (s[length] == '+')
+        zone_offset *= -1;
+      // if (out_zone_offset_present)
+      //   *out_zone_offset_present = true;
+    } else if ((s[length - 6] == '+' || s[length - 6] == '-') &&
+               (s[length - 3] == ':')) {
+      // [+-]HH:MM
+      length -= 6;
+      if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM(
+              s + length + 1, &zone_offset))) {
+        return false;
+      }
+      if (s[length] == '+')
+        zone_offset *= -1;
+      // if (out_zone_offset_present)
+      //   *out_zone_offset_present = true;
     }
 
     seconds_type seconds_since_midnight;
@@ -124,6 +176,7 @@ class LDBCTimeStampParser : public arrow::TimestampParser {
     }
 
     seconds_since_epoch += seconds_since_midnight;
+    seconds_since_epoch += zone_offset;
 
     if (length <= 19) {
       *out =