From 63d3a170efb1eb48971652a0ee7cb3d15edbdee4 Mon Sep 17 00:00:00 2001 From: wyl4pd <164864310+wyl4pd@users.noreply.github.com> Date: Tue, 14 May 2024 16:15:08 +0800 Subject: [PATCH] fix: gcformat space and continuous sign (#3921) * fix: gcformat space * fix: gcformat continuous sign use hash * fix: delete incorrect comments --- cases/query/feature_signature_query.yaml | 80 +++++++++++-------- .../udf/default_defs/feature_signature_def.cc | 17 +++- 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/cases/query/feature_signature_query.yaml b/cases/query/feature_signature_query.yaml index 1cfbd9b229a..c0763d320e7 100644 --- a/cases/query/feature_signature_query.yaml +++ b/cases/query/feature_signature_query.yaml @@ -43,7 +43,7 @@ cases: mode: procedure-unsupport db: db1 sql: | - select gcformat( + select concat("#", gcformat( discrete(3, -1), discrete(3, 0), discrete(3, int("null")), @@ -57,31 +57,31 @@ cases: discrete(-1, 5), discrete(-2, 5), discrete(-3, 5), - discrete(-4, 5)) as instance, + discrete(-4, 5))) as instance; expect: schema: instance:string data: | - | 4:628 5:491882390849628 6:0 7:4 8:1 9:3 10:1 11:1 12:0 13:0 14:4 + # | 4:628 5:491882390849628 6:0 7:4 8:1 9:3 10:1 11:1 12:0 13:0 14:4 - id: 2 desc: feature signature select GCFormat no label mode: procedure-unsupport db: db1 sql: | - select gcformat( + select concat("#", gcformat( discrete(hash64("x"), 1), continuous(pow(10, 30)), continuous(-pow(10, 1000)), - continuous(abs(sqrt(-1)))) as instance; + continuous(abs(sqrt(-1))))) as instance; expect: schema: instance:string data: | - | 1:0 2:0:1000000000000000019884624838656.000000 3:0:-inf 4:0:nan + # | 1:0 2:3353244675891348105:1000000000000000019884624838656.000000 3:7262150054277104024:-inf 4:3255232038643208583:nan - id: 3 desc: feature signature GCFormat null mode: procedure-unsupport db: db1 sql: | - select gcformat( + select concat("#", gcformat( regression_label(2), regression_label(int("null")), continuous(int("null")), @@ -98,31 
+98,31 @@ cases: discrete(3, -100), discrete(3), continuous(0.0), - continuous(int("null"))) as instance; + continuous(int("null")))) as instance; expect: schema: instance:string data: | - | 3:0:-1 4:0:2681491882390849628 5:28 8:2681491882390849628 9:0:-1 10:28 13:2681491882390849628 14:0:0.000000 + # | 3:7262150054277104024:-1 4:3255232038643208583:2681491882390849628 5:28 8:2681491882390849628 9:-7745589761753622095:-1 10:28 13:2681491882390849628 14:398281081943027035:0.000000 - id: 4 desc: feature signature GCFormat no feature mode: procedure-unsupport db: db1 sql: | - select gcformat(binary_label(false)); + select concat(gcformat(binary_label(false)), "#") as instance; expect: - schema: gcformat(binary_label(false)):string + schema: instance:string data: | - 0| + 0 | # - id: 5 desc: feature signature GCFormat nothing mode: procedure-unsupport db: db1 sql: | - select gcformat(); + select concat(concat("#", gcformat()), "#") as instance; expect: - schema: gcformat():string + schema: instance:string data: | - | + # | # - id: 6 desc: feature signature CSV no label mode: procedure-unsupport @@ -136,7 +136,7 @@ cases: expect: columns: [instance:string] rows: - - [",,,628"] + - [ ",,,628" ] - id: 7 desc: feature signature CSV null mode: procedure-unsupport @@ -163,7 +163,7 @@ cases: expect: columns: [ "instance:string "] rows: - - ["2,,,,-1,2681491882390849628,28,,,2681491882390849628,-1,28,,,2681491882390849628,0.000000,"] + - [ "2,,,,-1,2681491882390849628,28,,,2681491882390849628,-1,28,,,2681491882390849628,0.000000," ] - id: 8 desc: feature signature CSV no feature mode: procedure-unsupport @@ -263,7 +263,7 @@ cases: expect: schema: instance:string data: | - 1| 1:0:0 2:0:1 3:0 + 1 | 1:5925585971146611297:0 2:3353244675891348105:1 3:0 - id: 15 desc: feature signature select GCFormat from mode: request-unsupport @@ -289,11 +289,11 @@ cases: schema: instance:string order: instance data: | - 1| 1:0:0 2:0:1 3:0 - 2| 1:0:0 2:0:2 3:0 - 3| 1:0:1 2:0:3 3:0 - 4| 1:0:1 
2:0:4 3:0 - 5| 1:0:2 2:0:5 3:0 + 1 | 1:5925585971146611297:0 2:3353244675891348105:1 3:0 + 2 | 1:5925585971146611297:0 2:3353244675891348105:2 3:0 + 3 | 1:5925585971146611297:1 2:3353244675891348105:3 3:0 + 4 | 1:5925585971146611297:1 2:3353244675891348105:4 3:0 + 5 | 1:5925585971146611297:2 2:3353244675891348105:5 3:0 - id: 16 desc: feature signature select CSV from mode: request-unsupport @@ -360,7 +360,7 @@ cases: mode: request-unsupport db: db1 sql: | - SELECT gcformat(regression_label(col1)) as col1, + SELECT gcformat(regression_label(col1), discrete(col1, 1)) as col1, csv(regression_label(col1)) as col2, libsvm(regression_label(col1)) as col3 FROM t1; @@ -375,14 +375,14 @@ cases: 1, 4, 55, 4.4, 44.4, 2, 4444 2, 5, 55, 5.5, 55.5, 3, aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa expect: - schema: col1:string, col2:string, col3:string - order: col1 - data: | - 1|, 1, 1 - 2|, 2, 2 - 3|, 3, 3 - 4|, 4, 4 - 5|, 5, 5 + columns: [ "col1:string", "col2:string", "col3:string" ] + order: "col1" + rows: + - [ "1 | 1:0", "1", "1" ] + - [ "2 | 1:0", "2", "2" ] + - [ "3 | 1:0", "3", "3" ] + - [ "4 | 1:0", "4", "4" ] + - [ "5 | 1:0", "5", "5" ] - id: 19 desc: feature signature select from join mode: request-unsupport @@ -471,15 +471,25 @@ cases: mode: procedure-unsupport db: db1 sql: | - select gcformat( + select concat("#", gcformat( regression_label(2), continuous(1), continuous(int("notint")), continuous(0), continuous(0.0), discrete(3), - regression_label(int("notint"))) as instance; + regression_label(int("notint")))) as instance; expect: schema: instance:string data: | - | 1:0:1 3:0:0 4:0:0.000000 5:2681491882390849628 + # | 1:5925585971146611297:1 3:7262150054277104024:0 4:3255232038643208583:0.000000 5:2681491882390849628 + - id: 23 + desc: hash64 + mode: procedure-unsupport + db: db1 + sql: | + select hash64(3) as col1, hash64(bigint(3)) as col2; + expect: + schema: col1:int64, col2:int64 + data: | + 2681491882390849628, 7262150054277104024 diff 
--git a/hybridse/src/udf/default_defs/feature_signature_def.cc b/hybridse/src/udf/default_defs/feature_signature_def.cc index 3f9586c7f61..b407d513bb4 100644 --- a/hybridse/src/udf/default_defs/feature_signature_def.cc +++ b/hybridse/src/udf/default_defs/feature_signature_def.cc @@ -204,14 +204,23 @@ struct GCFormat { switch (feature_signature) { case kFeatureSignatureContinuous: { if (!is_null) { - instance_feature += " " + std::to_string(slot_number) + ":0:" + format_continuous(input); + if (!instance_feature.empty()) { + instance_feature += " "; + } + int64_t hash = FarmFingerprint(CCallDataTypeTrait::to_bytes_ref(&slot_number)); + instance_feature += std::to_string(slot_number) + ":"; + instance_feature += format_discrete(hash); + instance_feature += ":" + format_continuous(input); } ++slot_number; break; } case kFeatureSignatureDiscrete: { if (!is_null) { - instance_feature += " " + std::to_string(slot_number) + ":" + format_discrete(input); + if (!instance_feature.empty()) { + instance_feature += " "; + } + instance_feature += std::to_string(slot_number) + ":" + format_discrete(input); } ++slot_number; break; @@ -249,7 +258,7 @@ struct GCFormat { } std::string Output() { - return instance_label + "|" + instance_feature; + return instance_label + " | " + instance_feature; } size_t slot_number = 1; @@ -482,7 +491,7 @@ void DefaultUdfLibrary::InitFeatureSignature() { Example: @code{.sql} select gcformat(multiclass_label(6), continuous(1.5), category(3)); - -- output 6| 1:0:1.500000 2:2681491882390849628 + -- output 6 | 1:5925585971146611297:1.500000 2:2681491882390849628 @endcode @since 0.9.0