Merge branch 'dev' into paimon_catalog_basetype
# Conflicts:
#	seatunnel-common/src/main/java/org/apache/seatunnel/common/exception/CommonErrorCode.java
#	seatunnel-connectors-v2/connector-paimon/src/main/java/org/apache/seatunnel/connectors/seatunnel/paimon/catalog/PaimonCatalog.java
xiaochen-zhou committed May 22, 2024
2 parents e57182c + 6010460 commit b068137
Showing 209 changed files with 7,491 additions and 918 deletions.
6 changes: 6 additions & 0 deletions docs/en/connector-v2/Error-Quick-Reference-Manual.md
@@ -256,6 +256,12 @@ problems encountered by users.
|--------------|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
| FIRESTORE-01 | Close Firestore client failed | When users encounter this error code, closing the Firestore client has usually failed. Please check that the Firestore service is working |

## Hbase Connector Error Codes

| code | description | solution |
|----------|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------|
| Hbase-01 | Build hbase connection failed | When users encounter this error code, creating the Hbase connection has failed. Check the Hbase configuration parameters used and try again |

## FilterFieldTransform Error Codes

| code | description | solution |
105 changes: 105 additions & 0 deletions docs/en/connector-v2/sink/Paimon.md
@@ -6,6 +6,24 @@

Sink connector for Apache Paimon. It supports CDC mode and automatic table creation.

## Supported DataSource Info

| Datasource | Dependency | Maven |
|------------|-----------|---------------------------------------------------------------------------|
| Paimon | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) |
| Paimon | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) |

## Database Dependency

> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file is provided. Therefore, if you use the Flink engine, you may first need to add the following Jar packages to the <FLINK_HOME>/lib directory. If you use the Spark engine and it is already integrated with Hadoop, you do not need to add them.

```
hive-exec-xxx.jar
libfb303-xxx.jar
```

> Some versions of the hive-exec package do not include libfb303-xxx.jar, so you may also need to import that Jar package manually.

## Key features

- [x] [exactly-once](../../concept/connector-v2-features.md)
@@ -15,6 +33,8 @@ Sink connector for Apache Paimon. It supports CDC mode and automatic table creation.
| name | type | required | default value | Description |
|-----------------------------|--------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| warehouse | String | Yes | - | Paimon warehouse path |
| catalog_type | String | No | filesystem | Catalog type of Paimon, supports filesystem and hive |
| catalog_uri | String | No | - | Catalog uri of Paimon, only needed when catalog_type is hive |
| database | String | Yes | - | The database you want to access |
| table | String | Yes | - | The table you want to access |
| hdfs_site_path | String | No | - | The path of hdfs-site.xml |
@@ -101,6 +121,91 @@ sink {
}
```

### Single table (Hive catalog)

```hocon
env {
  parallelism = 1
  job.mode = "BATCH"
}

source {
  FakeSource {
    schema = {
      fields {
        pk_id = bigint
        name = string
        score = int
      }
      primaryKey {
        name = "pk_id"
        columnNames = [pk_id]
      }
    }
    rows = [
      {
        kind = INSERT
        fields = [1, "A", 100]
      },
      {
        kind = INSERT
        fields = [2, "B", 100]
      },
      {
        kind = INSERT
        fields = [3, "C", 100]
      },
      {
        kind = INSERT
        fields = [3, "C", 100]
      },
      {
        kind = INSERT
        fields = [3, "C", 100]
      },
      {
        kind = INSERT
        fields = [3, "C", 100]
      },
      {
        kind = UPDATE_BEFORE
        fields = [1, "A", 100]
      },
      {
        kind = UPDATE_AFTER
        fields = [1, "A_1", 100]
      },
      {
        kind = DELETE
        fields = [2, "B", 100]
      }
    ]
  }
}

sink {
  Paimon {
    schema_save_mode = "RECREATE_SCHEMA"
    catalog_name = "seatunnel_test"
    catalog_type = "hive"
    catalog_uri = "thrift://hadoop04:9083"
    warehouse = "hdfs:///tmp/seatunnel"
    database = "seatunnel_test"
    table = "st_test3"
    paimon.hadoop.conf = {
      fs.defaultFS = "hdfs://nameservice1"
      dfs.nameservices = "nameservice1"
      dfs.ha.namenodes.nameservice1 = "nn1,nn2"
      dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020"
      dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020"
      dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
      dfs.client.use.datanode.hostname = "true"
    }
  }
}
```

### Single table with write props of paimon

```hocon
2 changes: 1 addition & 1 deletion docs/en/connector-v2/sink/S3File.md
@@ -474,7 +474,7 @@ transform {
sink {
S3File {
bucket = "s3a://seatunnel-test"
tmp_path = "/tmp/seatunnel"
tmp_path = "/tmp/seatunnel/${table_name}"
path="/test/${table_name}"
fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
91 changes: 91 additions & 0 deletions docs/en/connector-v2/source/Hbase.md
@@ -0,0 +1,91 @@
# Hbase

> Hbase source connector

## Description

Read data from Apache Hbase.

## Key features

- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
- [ ] [exactly-once](../../concept/connector-v2-features.md)
- [x] [schema projection](../../concept/connector-v2-features.md)
- [x] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)

## Options

| name | type | required | default value |
|--------------------|--------|----------|---------------|
| zookeeper_quorum | string | yes | - |
| table | string | yes | - |
| query_columns | list | yes | - |
| schema | config | yes | - |
| hbase_extra_config | string | no | - |
| common-options | | no | - |

### zookeeper_quorum [string]

The zookeeper cluster host of hbase, example: "hadoop001:2181,hadoop002:2181,hadoop003:2181"

### table [string]

The table name you want to read, example: "seatunnel"

### query_columns [list]

The column names which you want to query in the table. If you want to query the rowkey column, please set "rowkey" in query_columns.
Other columns should be in the format columnFamily:columnName, example: ["rowkey", "columnFamily1:column1", "columnFamily1:column2", "columnFamily2:column1"]

### schema [config]

Hbase uses byte arrays for storage. Therefore, you need to configure data types for each column in a table. For more information, see: [guide](../../concept/schema-feature.md#how-to-declare-type-supported).

### hbase_extra_config [config]

The extra configuration of hbase
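
Below is a minimal sketch of how such extra settings might be passed. It assumes the option accepts a nested block, as the `[config]` heading above suggests, and the property keys shown are ordinary HBase client settings used purely for illustration, not options defined by this connector (the required `schema` block is omitted for brevity; see the full example below).

```hocon
source {
  Hbase {
    zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181"
    table = "seatunnel_test"
    query_columns = ["rowkey", "columnFamily1:column1"]
    # Assumption: pass-through HBase client properties; the keys are standard
    # HBase configuration names and the values are examples only.
    hbase_extra_config = {
      "hbase.rpc.timeout" = "10000"
      "hbase.client.retries.number" = "3"
    }
  }
}
```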

### common options

Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details

## Examples

```hocon
source {
  Hbase {
    zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181"
    table = "seatunnel_test"
    query_columns = ["rowkey", "columnFamily1:column1", "columnFamily1:column2", "columnFamily2:column1"]
    schema = {
      columns = [
        {
          name = rowkey
          type = string
        },
        {
          name = "columnFamily1:column1"
          type = boolean
        },
        {
          name = "columnFamily1:column2"
          type = double
        },
        {
          name = "columnFamily2:column1"
          type = bigint
        }
      ]
    }
  }
}
```

## Changelog

### next version

- Add Hbase Source Connector

62 changes: 62 additions & 0 deletions docs/en/connector-v2/source/kafka.md
@@ -35,6 +35,7 @@ They can be downloaded via install-plugin.sh or from the Maven central repository.
| Name | Type | Required | Default | Description |
|-------------------------------------|-----------------------------------------------------------------------------|----------|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| topic | String | Yes | - | Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by comma like 'topic-1,topic-2'. |
| table_list | Map | No | - | Topic list config. You can configure only one of `table_list` and `topic` at the same time. |
| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. |
| pattern | Boolean | No | false | If `pattern` is set to `true`, `topic` is treated as a regular expression for topic names; all topics in the cluster with names matching the expression will be subscribed by the consumer. |
| consumer.group | String | No | SeaTunnel-Consumer-Group | `Kafka consumer group id`, used to distinguish different consumer groups. |
@@ -180,3 +181,64 @@
}
```

### Multiple Kafka Source

> This example parses data in different formats from different Kafka topics, writes it to the same PostgreSQL table, and performs upsert operations based on the id.

```hocon
env {
  execution.parallelism = 1
  job.mode = "BATCH"
}

source {
  Kafka {
    bootstrap.servers = "kafka_e2e:9092"
    table_list = [
      {
        topic = "^test-ogg-sou.*"
        pattern = "true"
        consumer.group = "ogg_multi_group"
        start_mode = earliest
        schema = {
          fields {
            id = "int"
            name = "string"
            description = "string"
            weight = "string"
          }
        },
        format = ogg_json
      },
      {
        topic = "test-cdc_mds"
        start_mode = earliest
        schema = {
          fields {
            id = "int"
            name = "string"
            description = "string"
            weight = "string"
          }
        },
        format = canal_json
      }
    ]
  }
}

sink {
  Jdbc {
    driver = org.postgresql.Driver
    url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF"
    user = test
    password = test
    generate_sink_sql = true
    database = test
    table = public.sink
    primary_keys = ["id"]
  }
}
```

1 change: 1 addition & 0 deletions docs/sidebars.js
@@ -90,6 +90,7 @@ const sidebars = {
"concept/connector-v2-features",
'concept/schema-feature',
'concept/JobEnvConfig',
'concept/sql-config',
'concept/speed-limit',
'concept/event-listener'
]