Skip to content

Commit

Permalink
refactor(interactive): Support timezone info for timestamp type. (#3975)
Browse files Browse the repository at this point in the history
In this PR, we expand our `timestamp` format parsing to include timezone
information such as `2010-02-14T15:32:10.447+00:00`.

`out_zone_offset_present` are not set as we don't expect timezone info
without giving a timezone in type definition. Otherwise will cause
parsing failure.

https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373

https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/type.h#L1635
  • Loading branch information
zhanglei1949 authored Jun 25, 2024
1 parent ff04409 commit 7c1dfe7
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 5 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/interactive.yml
Original file line number Diff line number Diff line change
Expand Up @@ -512,3 +512,14 @@ jobs:
SCHEMA_FILE=${FLEX_DATA_DIR}/audit_graph_schema.yaml
BULK_LOAD_FILE=${FLEX_DATA_DIR}/audit_bulk_load.yaml
GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/ -p 2
- name: Test graph load on movie graph
env:
GS_TEST_DIR: ${{ github.workspace }}/gstest/
FLEX_DATA_DIR: ${{ github.workspace }}/flex/interactive/examples/movies/
run: |
rm -rf /tmp/csr-data-dir/
cd ${GITHUB_WORKSPACE}/flex/build/
SCHEMA_FILE=${GITHUB_WORKSPACE}/flex/tests/rt_mutable_graph/movie_schema_test.yaml
BULK_LOAD_FILE=${GITHUB_WORKSPACE}/flex/tests/rt_mutable_graph/movie_import_test.yaml
GLOG_v=10 ./bin/bulk_loader -g ${SCHEMA_FILE} -l ${BULK_LOAD_FILE} -d /tmp/csr-data-dir/
9 changes: 9 additions & 0 deletions flex/interactive/examples/movies/FOLLOWS_TIMESTAMP.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
start|end|timestamp
169|71|2010-02-14T15:32:10.447+00:00
171|71|2011-12-27T20:11:20.921+00:00
172|71|2011-12-27T20:11:20.922+00:00
173|71|2011-12-27T20:11:20.923+00:00
174|71|2011-12-27T20:11:20.924+08:00
169|16|2011-12-27T20:11:20.925+08:00
172|16|2011-12-27T20:11:20.926+08:00
174|16|2011-12-27T20:11:20.927+08:00
122 changes: 122 additions & 0 deletions flex/tests/rt_mutable_graph/movie_import_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
graph: movies
loading_config:
data_source:
scheme: file # file, oss, s3, hdfs; only file is supported now
import_option: init # append, overwrite, only init is supported now
format:
type: csv
metadata:
delimiter: "|" # other loading configuration places here
header_row: true # whether to use the first row as the header
quoting: false
quote_char: '"'
double_quote: true
escape_char: '\'
escaping: false
block_size: 4MB
batch_reader: false
vertex_mappings:
- type_name: Person # must align with the schema
inputs:
- Person.csv
- type_name: User
inputs:
- User.csv
- type_name: Movie
inputs:
- Movie.csv
edge_mappings:
- type_triplet:
edge: ACTED_IN
source_vertex: Person
destination_vertex: Movie
source_vertex_mappings:
- column:
index: 0
name: id
destination_vertex_mappings:
- column:
index: 1
name: id
inputs:
- ACTED_IN.csv
- type_triplet:
edge: DIRECTED
source_vertex: Person
destination_vertex: Movie
source_vertex_mappings:
- column:
index: 0
name: id
destination_vertex_mappings:
- column:
index: 1
name: id
inputs:
- DIRECTED.csv
- type_triplet:
edge: FOLLOWS
source_vertex: User
destination_vertex: Person
source_vertex_mappings:
- column:
index: 0
name: id
destination_vertex_mappings:
- column:
index: 1
name: id
column_mappings:
- column:
index: 2
name: timestamp
property: timestamp
inputs:
- FOLLOWS_TIMESTAMP.csv
- type_triplet:
edge: PRODUCED
source_vertex: Person
destination_vertex: Movie
source_vertex_mappings:
- column:
index: 0
name: id
destination_vertex_mappings:
- column:
index: 1
name: id
inputs:
- PRODUCED.csv
- type_triplet:
edge: REVIEW
source_vertex: User
destination_vertex: Movie
source_vertex_mappings:
- column:
index: 0
name: id
destination_vertex_mappings:
- column:
index: 1
name: id
column_mappings:
- column:
index: 2
name: rating
property: rating
inputs:
- REVIEWED.csv
- type_triplet:
edge: WROTE
source_vertex: Person
destination_vertex: Movie
source_vertex_mappings:
- column:
index: 0
name: id
destination_vertex_mappings:
- column:
index: 1
name: id
inputs:
- WROTE.csv
110 changes: 110 additions & 0 deletions flex/tests/rt_mutable_graph/movie_schema_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
name: movies
schema:
vertex_types:
- type_id: 0
type_name: Movie
properties:
- property_id: 0
property_name: id
property_type:
primitive_type: DT_SIGNED_INT64
- property_id: 1
property_name: released
property_type:
primitive_type: DT_SIGNED_INT32
- property_id: 2
property_name: tagline
property_type:
string:
long_text:
- property_id: 3
property_name: title
property_type:
string:
long_text:
primary_keys:
- id
- type_id: 1
type_name: Person
properties:
- property_id: 0
property_name: id
property_type:
primitive_type: DT_SIGNED_INT64
- property_id: 1
property_name: born
property_type:
primitive_type: DT_SIGNED_INT32
- property_id: 2
property_name: name
property_type:
string:
long_text:
primary_keys:
- id
- type_id: 2
type_name: User
properties:
- property_id: 0
property_name: id
property_type:
primitive_type: DT_SIGNED_INT64
- property_id: 1
property_name: born
property_type:
primitive_type: DT_SIGNED_INT32
- property_id: 2
property_name: name
property_type:
string:
long_text:
primary_keys:
- id
edge_types:
- type_id: 0
type_name: ACTED_IN
vertex_type_pair_relations:
- source_vertex: Person
destination_vertex: Movie
relation: MANY_TO_MANY
- type_id: 1
type_name: DIRECTED
vertex_type_pair_relations:
- source_vertex: Person
destination_vertex: Movie
relation: MANY_TO_MANY
- type_id: 2
type_name: REVIEW
vertex_type_pair_relations:
- source_vertex: User
destination_vertex: Movie
relation: MANY_TO_MANY
properties:
- property_id: 0
property_name: rating
property_type:
primitive_type: DT_SIGNED_INT32
- type_id: 3
type_name: FOLLOWS
vertex_type_pair_relations:
- source_vertex: User
destination_vertex: Person
relation: MANY_TO_MANY
properties:
- property_id: 0
property_name: timestamp
property_type:
temporal:
timestamp:
- type_id: 4
type_name: WROTE
vertex_type_pair_relations:
- source_vertex: Person
destination_vertex: Movie
relation: MANY_TO_MANY
- type_id: 5
type_name: PRODUCED
vertex_type_pair_relations:
- source_vertex: Person
destination_vertex: Movie
relation: MANY_TO_MANY
63 changes: 58 additions & 5 deletions flex/utils/arrow_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ class LDBCTimeStampParser : public arrow::TimestampParser {
bool* out_zone_offset_present = NULLPTR) const override {
using seconds_type = std::chrono::duration<arrow::TimestampType::c_type>;

// We allow the following zone offset formats:
// - (none)
// - Z
// - [+-]HH(:?MM)?
//
// We allow the following formats for all units:
// - "YYYY-MM-DD"
// - "YYYY-MM-DD[ T]hhZ?"
Expand Down Expand Up @@ -79,14 +84,61 @@ class LDBCTimeStampParser : public arrow::TimestampParser {
return false;
}

// In the implementation of arrow ISO8601 timestamp parser, the zone offset
// is set to true if the input string contains a zone offset. However, we
// parse the zone offset here but don't set the boolean flag.
// https://github.com/apache/arrow/blob/3e7ae5340a123c1040f98f1c36687b81362fab52/cpp/src/arrow/csv/converter.cc#L373
// The reason is that, if we want the zone offset to be set, we need to
// to declare the zone offset in the schema and construct TimeStampType with
// that offset. However, we just want to parse the timestamp string and
// convert it to a timestamp value, we have no assumption of the local time
// zone, and we don't require the zone offset to be set in the schema.
// Same for following commented code.
//-------------------------------------------------------------------------
// if (out_zone_offset_present) {
// *out_zone_offset_present = false;
// }
//-------------------------------------------------------------------------

seconds_type zone_offset(0);
if (s[length - 1] == 'Z') {
--length;
}

// if the back the s is '+0000', we should ignore it
auto time_zones = std::string_view(s + length - 5, 5);
if (time_zones == "+0000") {
// if (out_zone_offset_present)
// *out_zone_offset_present = true;
} else if (s[length - 3] == '+' || s[length - 3] == '-') {
// [+-]HH
length -= 3;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH(
s + length + 1, &zone_offset))) {
return false;
}
if (s[length] == '+')
zone_offset *= -1;
// if (out_zone_offset_present)
// *out_zone_offset_present = true;
} else if (s[length - 5] == '+' || s[length - 5] == '-') {
// [+-]HHMM
length -= 5;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHHMM(
s + length + 1, &zone_offset))) {
return false;
}
if (s[length] == '+')
zone_offset *= -1;
// if (out_zone_offset_present)
// *out_zone_offset_present = true;
} else if ((s[length - 6] == '+' || s[length - 6] == '-') &&
(s[length - 3] == ':')) {
// [+-]HH:MM
length -= 6;
if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM(
s + length + 1, &zone_offset))) {
return false;
}
if (s[length] == '+')
zone_offset *= -1;
// if (out_zone_offset_present)
// *out_zone_offset_present = true;
}

seconds_type seconds_since_midnight;
Expand Down Expand Up @@ -124,6 +176,7 @@ class LDBCTimeStampParser : public arrow::TimestampParser {
}

seconds_since_epoch += seconds_since_midnight;
seconds_since_epoch += zone_offset;

if (length <= 19) {
*out =
Expand Down

0 comments on commit 7c1dfe7

Please sign in to comment.