From 91508da0d21f44a7baf0df3b129b28d3087e4ee7 Mon Sep 17 00:00:00 2001 From: ChunFuWu <319355703@qq.com> Date: Mon, 19 Aug 2024 14:25:43 +0800 Subject: [PATCH] Add release docs 2.3.7 (#325) --- .../Connector-v2-release-state.md | 85 ++ .../version-2.3.7/about.md | 70 ++ .../version-2.3.7/command/connector-check.md | 35 + .../version-2.3.7/concept/JobEnvConfig.md | 67 ++ .../version-2.3.7/concept/config.md | 310 ++++++ .../concept/connector-v2-features.md | 70 ++ .../version-2.3.7/concept/event-listener.md | 114 ++ .../version-2.3.7/concept/schema-feature.md | 264 +++++ .../concept/sink-options-placeholders.md | 110 ++ .../version-2.3.7/concept/speed-limit.md | 43 + .../version-2.3.7/concept/sql-config.md | 189 ++++ .../Config-Encryption-Decryption.md | 181 ++++ .../connector-v2/formats/avro.md | 111 ++ .../connector-v2/formats/canal-json.md | 115 +++ .../formats/cdc-compatible-debezium-json.md | 55 + .../connector-v2/formats/debezium-json.md | 115 +++ .../kafka-compatible-kafkaconnect-json.md | 47 + .../connector-v2/formats/ogg-json.md | 93 ++ .../version-2.3.7/connector-v2/sink.md | 0 .../connector-v2/sink/Clickhouse.md | 179 ++++ .../connector-v2/sink/ClickhouseFile.md | 138 +++ .../connector-v2/sink/Console.md | 124 +++ .../connector-v2/sink/DingTalk.md | 55 + .../version-2.3.7/connector-v2/sink/Doris.md | 340 ++++++ .../connector-v2/sink/Elasticsearch.md | 218 ++++ .../version-2.3.7/connector-v2/sink/Email.md | 89 ++ .../version-2.3.7/connector-v2/sink/Feishu.md | 66 ++ .../version-2.3.7/connector-v2/sink/Hbase.md | 141 +++ .../connector-v2/sink/HdfsFile.md | 202 ++++ .../version-2.3.7/connector-v2/sink/Http.md | 63 ++ .../version-2.3.7/connector-v2/sink/Hudi.md | 92 ++ .../version-2.3.7/connector-v2/sink/Jdbc.md | 357 +++++++ .../version-2.3.7/connector-v2/sink/Kafka.md | 196 ++++ .../connector-v2/sink/LocalFile.md | 316 ++++++ .../version-2.3.7/connector-v2/sink/Paimon.md | 273 +++++ .../connector-v2/sink/Phoenix.md | 63 ++ .../version-2.3.7/connector-v2/sink/Pulsar.md | 168 +++ .../connector-v2/sink/Rabbitmq.md | 122 +++ .../version-2.3.7/connector-v2/sink/Redis.md | 157 +++ .../connector-v2/sink/StarRocks.md | 288 ++++++ .../connector-v2/sink/common-options.md | 58 ++ .../version-2.3.7/connector-v2/source.md | 0 .../connector-v2/source/Hbase.md | 96 ++ .../connector-v2/source/HdfsFile.md | 127 +++ .../version-2.3.7/connector-v2/source/Sls.md | 87 ++ .../connector-v2/source/common-options.md | 81 ++ .../contribution/coding-guide.md | 111 ++ .../contribution/contribute-plugin.md | 5 + .../contribute-transform-v2-guide.md | 321 ++++++ .../how-to-create-your-connector.md | 4 + .../version-2.3.7/contribution/new-license.md | 53 + .../version-2.3.7/contribution/setup.md | 121 +++ .../version-2.3.7/faq.md | 354 +++++++ .../version-2.3.7/other-engine/flink.md | 83 ++ .../version-2.3.7/seatunnel-engine/about.md | 44 + .../seatunnel-engine/checkpoint-storage.md | 220 ++++ .../seatunnel-engine/deployment.md | 24 + .../seatunnel-engine/download-seatunnel.md | 70 ++ .../engine-jar-storage-mode.md | 95 ++ .../hybrid-cluster-deployment.md | 315 ++++++ .../seatunnel-engine/local-mode-deployment.md | 35 + .../seatunnel-engine/resource-isolation.md | 83 ++ .../seatunnel-engine/rest-api.md | 490 +++++++++ .../seatunnel-engine/savepoint.md | 26 + .../separated-cluster-deployment.md | 433 ++++++++ .../version-2.3.7/seatunnel-engine/tcp.md | 37 + .../seatunnel-engine/user-command.md | 139 +++ .../start-v2/locally/deployment.md | 68 ++ .../start-v2/locally/quick-start-flink.md | 111 ++ 
.../locally/quick-start-seatunnel-engine.md | 100 ++ .../start-v2/locally/quick-start-spark.md | 118 +++ .../transform-v2/common-options.md | 23 + .../version-2.3.7/transform-v2/copy.md | 65 ++ .../transform-v2/dynamic-compile.md | 171 +++ .../transform-v2/field-mapper.md | 64 ++ .../transform-v2/filter-rowkind.md | 68 ++ .../version-2.3.7/transform-v2/filter.md | 79 ++ .../version-2.3.7/transform-v2/jsonpath.md | 190 ++++ .../version-2.3.7/transform-v2/llm.md | 120 +++ .../version-2.3.7/transform-v2/replace.md | 121 +++ .../version-2.3.7/transform-v2/split.md | 72 ++ .../transform-v2/sql-functions.md | 966 +++++++++++++++++ .../version-2.3.7/transform-v2/sql-udf.md | 133 +++ .../version-2.3.7/transform-v2/sql.md | 158 +++ src/pages/download/st_data.json | 14 + src/pages/versions/config.json | 28 +- .../Connector-v2-release-state.md | 85 ++ versioned_docs/version-2.3.7/about.md | 72 ++ .../version-2.3.7/command/connector-check.md | 35 + .../version-2.3.7/command/usage.mdx | 176 ++++ .../version-2.3.7/concept/JobEnvConfig.md | 65 ++ .../version-2.3.7/concept/config.md | 323 ++++++ .../concept/connector-v2-features.md | 75 ++ .../version-2.3.7/concept/event-listener.md | 116 +++ .../version-2.3.7/concept/schema-feature.md | 264 +++++ .../concept/sink-options-placeholders.md | 110 ++ .../version-2.3.7/concept/speed-limit.md | 44 + .../version-2.3.7/concept/sql-config.md | 189 ++++ .../Config-Encryption-Decryption.md | 180 ++++ .../Error-Quick-Reference-Manual.md | 286 +++++ .../connector-v2/formats/avro.md | 111 ++ .../connector-v2/formats/canal-json.md | 114 ++ .../formats/cdc-compatible-debezium-json.md | 55 + .../connector-v2/formats/debezium-json.md | 114 ++ .../kafka-compatible-kafkaconnect-json.md | 47 + .../connector-v2/formats/maxwell-json.md | 91 ++ .../connector-v2/formats/ogg-json.md | 93 ++ .../connector-v2/sink/Activemq.md | 123 +++ .../connector-v2/sink/AmazonDynamoDB.md | 66 ++ .../connector-v2/sink/AmazonSqs.md | 87 ++ .../version-2.3.7/connector-v2/sink/Assert.md | 498 +++++++++ .../connector-v2/sink/Cassandra.md | 95 ++ .../connector-v2/sink/Clickhouse.md | 180 ++++ .../connector-v2/sink/ClickhouseFile.md | 147 +++ .../connector-v2/sink/Console.md | 124 +++ .../connector-v2/sink/CosFile.md | 293 ++++++ .../version-2.3.7/connector-v2/sink/DB2.md | 175 ++++ .../connector-v2/sink/Datahub.md | 79 ++ .../connector-v2/sink/DingTalk.md | 55 + .../version-2.3.7/connector-v2/sink/Doris.md | 432 ++++++++ .../version-2.3.7/connector-v2/sink/Druid.md | 83 ++ .../connector-v2/sink/Easysearch.md | 202 ++++ .../connector-v2/sink/Elasticsearch.md | 219 ++++ .../version-2.3.7/connector-v2/sink/Email.md | 87 ++ .../connector-v2/sink/Enterprise-WeChat.md | 75 ++ .../version-2.3.7/connector-v2/sink/Feishu.md | 66 ++ .../connector-v2/sink/FtpFile.md | 296 ++++++ .../connector-v2/sink/GoogleFirestore.md | 52 + .../connector-v2/sink/Greenplum.md | 42 + .../version-2.3.7/connector-v2/sink/Hbase.md | 140 +++ .../connector-v2/sink/HdfsFile.md | 208 ++++ .../version-2.3.7/connector-v2/sink/Hive.md | 420 ++++++++ .../version-2.3.7/connector-v2/sink/Http.md | 134 +++ .../version-2.3.7/connector-v2/sink/Hudi.md | 131 +++ .../connector-v2/sink/Iceberg.md | 258 +++++ .../connector-v2/sink/InfluxDB.md | 142 +++ .../version-2.3.7/connector-v2/sink/IoTDB.md | 221 ++++ .../version-2.3.7/connector-v2/sink/Jdbc.md | 456 ++++++++ .../version-2.3.7/connector-v2/sink/Kafka.md | 215 ++++ .../connector-v2/sink/Kingbase.md | 168 +++ .../version-2.3.7/connector-v2/sink/Kudu.md | 209 ++++ 
.../connector-v2/sink/LocalFile.md | 336 ++++++ .../connector-v2/sink/Maxcompute.md | 79 ++ .../version-2.3.7/connector-v2/sink/Mivlus.md | 59 ++ .../connector-v2/sink/MongoDB.md | 235 +++++ .../version-2.3.7/connector-v2/sink/Mysql.md | 210 ++++ .../version-2.3.7/connector-v2/sink/Neo4j.md | 147 +++ .../connector-v2/sink/ObsFile.md | 287 ++++++ .../connector-v2/sink/OceanBase.md | 186 ++++ .../version-2.3.7/connector-v2/sink/Oracle.md | 207 ++++ .../connector-v2/sink/OssFile.md | 539 ++++++++++ .../connector-v2/sink/OssJindoFile.md | 297 ++++++ .../version-2.3.7/connector-v2/sink/Paimon.md | 316 ++++++ .../connector-v2/sink/Phoenix.md | 62 ++ .../connector-v2/sink/PostgreSql.md | 273 +++++ .../version-2.3.7/connector-v2/sink/Pulsar.md | 177 ++++ .../connector-v2/sink/Rabbitmq.md | 121 +++ .../version-2.3.7/connector-v2/sink/Redis.md | 164 +++ .../connector-v2/sink/Redshift.md | 99 ++ .../connector-v2/sink/RocketMQ.md | 203 ++++ .../connector-v2/sink/S3-Redshift.md | 278 +++++ .../version-2.3.7/connector-v2/sink/S3File.md | 513 +++++++++ .../connector-v2/sink/SelectDB-Cloud.md | 173 ++++ .../version-2.3.7/connector-v2/sink/Sentry.md | 78 ++ .../connector-v2/sink/SftpFile.md | 266 +++++ .../version-2.3.7/connector-v2/sink/Slack.md | 54 + .../connector-v2/sink/Snowflake.md | 142 +++ .../version-2.3.7/connector-v2/sink/Socket.md | 79 ++ .../connector-v2/sink/SqlServer.md | 182 ++++ .../connector-v2/sink/StarRocks.md | 377 +++++++ .../connector-v2/sink/TDengine.md | 71 ++ .../connector-v2/sink/Tablestore.md | 72 ++ .../connector-v2/sink/Vertica.md | 183 ++++ .../connector-v2/sink/common-options.md | 53 + .../connector-v2/source/AmazonDynamoDB.md | 120 +++ .../connector-v2/source/AmazonSqs.md | 81 ++ .../connector-v2/source/Cassandra.md | 80 ++ .../connector-v2/source/Clickhouse.md | 101 ++ .../connector-v2/source/CosFile.md | 368 +++++++ .../version-2.3.7/connector-v2/source/DB2.md | 165 +++ .../connector-v2/source/Doris.md | 162 +++ .../connector-v2/source/Easysearch.md | 209 ++++ .../connector-v2/source/Elasticsearch.md | 200 ++++ .../connector-v2/source/FakeSource.md | 421 ++++++++ .../connector-v2/source/FtpFile.md | 341 ++++++ .../connector-v2/source/Github.md | 296 ++++++ .../connector-v2/source/Gitlab.md | 299 ++++++ .../connector-v2/source/GoogleSheets.md | 79 ++ .../connector-v2/source/Greenplum.md | 42 + .../connector-v2/source/Hbase.md | 96 ++ .../connector-v2/source/HdfsFile.md | 136 +++ .../version-2.3.7/connector-v2/source/Hive.md | 279 +++++ .../connector-v2/source/HiveJdbc.md | 163 +++ .../version-2.3.7/connector-v2/source/Http.md | 357 +++++++ .../connector-v2/source/Iceberg.md | 222 ++++ .../connector-v2/source/InfluxDB.md | 195 ++++ .../connector-v2/source/IoTDB.md | 187 ++++ .../version-2.3.7/connector-v2/source/Jdbc.md | 304 ++++++ .../version-2.3.7/connector-v2/source/Jira.md | 305 ++++++ .../connector-v2/source/Kingbase.md | 148 +++ .../connector-v2/source/Klaviyo.md | 312 ++++++ .../version-2.3.7/connector-v2/source/Kudu.md | 153 +++ .../connector-v2/source/Lemlist.md | 297 ++++++ .../connector-v2/source/LocalFile.md | 408 ++++++++ .../connector-v2/source/Maxcompute.md | 98 ++ .../connector-v2/source/Mivlus.md | 55 + .../connector-v2/source/MongoDB-CDC.md | 312 ++++++ .../connector-v2/source/MongoDB.md | 458 ++++++++ .../connector-v2/source/MyHours.md | 310 ++++++ .../connector-v2/source/MySQL-CDC.md | 272 +++++ .../connector-v2/source/Mysql.md | 319 ++++++ .../connector-v2/source/Neo4j.md | 107 ++ .../connector-v2/source/Notion.md | 308 ++++++ 
.../connector-v2/source/ObsFile.md | 350 +++++++ .../connector-v2/source/OceanBase.md | 180 ++++ .../connector-v2/source/OneSignal.md | 327 ++++++ .../connector-v2/source/OpenMldb.md | 86 ++ .../connector-v2/source/Oracle-CDC.md | 349 +++++++ .../connector-v2/source/Oracle.md | 324 ++++++ .../connector-v2/source/OssFile.md | 492 +++++++++ .../connector-v2/source/OssJindoFile.md | 360 +++++++ .../connector-v2/source/Paimon.md | 165 +++ .../connector-v2/source/Persistiq.md | 300 ++++++ .../connector-v2/source/Phoenix.md | 68 ++ .../connector-v2/source/PostgreSQL-CDC.md | 196 ++++ .../connector-v2/source/PostgreSQL.md | 323 ++++++ .../connector-v2/source/Pulsar.md | 164 +++ .../connector-v2/source/Rabbitmq.md | 162 +++ .../connector-v2/source/Redis.md | 273 +++++ .../connector-v2/source/Redshift.md | 133 +++ .../connector-v2/source/RocketMQ.md | 219 ++++ .../connector-v2/source/S3File.md | 353 +++++++ .../connector-v2/source/SftpFile.md | 255 +++++ .../version-2.3.7/connector-v2/source/Sls.md | 87 ++ .../connector-v2/source/Snowflake.md | 153 +++ .../connector-v2/source/Socket.md | 108 ++ .../connector-v2/source/SqlServer-CDC.md | 229 ++++ .../connector-v2/source/SqlServer.md | 266 +++++ .../connector-v2/source/StarRocks.md | 185 ++++ .../connector-v2/source/TDengine.md | 85 ++ .../connector-v2/source/Vertica.md | 162 +++ .../connector-v2/source/Web3j.md | 61 ++ .../connector-v2/source/common-options.md | 81 ++ .../connector-v2/source/kafka.md | 244 +++++ .../contribution/coding-guide.md | 111 ++ .../contribution/contribute-plugin.md | 5 + .../contribute-transform-v2-guide.md | 329 ++++++ .../version-2.3.7/contribution/new-license.md | 53 + .../version-2.3.7/contribution/setup.md | 127 +++ versioned_docs/version-2.3.7/faq.md | 353 +++++++ .../version-2.3.7/other-engine/flink.md | 84 ++ .../version-2.3.7/other-engine/spark.md | 0 .../version-2.3.7/seatunnel-engine/about.md | 44 + .../seatunnel-engine/checkpoint-storage.md | 247 +++++ .../seatunnel-engine/deployment.md | 24 + .../seatunnel-engine/download-seatunnel.md | 70 ++ .../engine-jar-storage-mode.md | 95 ++ .../hybrid-cluster-deployment.md | 315 ++++++ .../seatunnel-engine/local-mode-deployment.md | 35 + .../seatunnel-engine/resource-isolation.md | 83 ++ .../seatunnel-engine/rest-api.md | 491 +++++++++ .../seatunnel-engine/savepoint.md | 24 + .../separated-cluster-deployment.md | 427 ++++++++ .../version-2.3.7/seatunnel-engine/tcp.md | 37 + .../seatunnel-engine/user-command.md | 123 +++ .../version-2.3.7/start-v2/docker/docker.md | 9 + .../start-v2/kubernetes/kubernetes.mdx | 772 ++++++++++++++ .../start-v2/locally/deployment.md | 74 ++ .../start-v2/locally/quick-start-flink.md | 112 ++ .../locally/quick-start-seatunnel-engine.md | 101 ++ .../start-v2/locally/quick-start-spark.md | 119 +++ .../transform-v2/common-options.md | 65 ++ .../version-2.3.7/transform-v2/copy.md | 65 ++ .../transform-v2/dynamic-compile.md | 171 +++ .../transform-v2/field-mapper.md | 64 ++ .../transform-v2/filter-rowkind.md | 68 ++ .../version-2.3.7/transform-v2/filter.md | 81 ++ .../version-2.3.7/transform-v2/jsonpath.md | 190 ++++ .../version-2.3.7/transform-v2/llm.md | 122 +++ .../version-2.3.7/transform-v2/replace.md | 121 +++ .../version-2.3.7/transform-v2/split.md | 72 ++ .../transform-v2/sql-functions.md | 975 ++++++++++++++++++ .../version-2.3.7/transform-v2/sql-udf.md | 134 +++ .../version-2.3.7/transform-v2/sql.md | 160 +++ .../version-2.3.7-sidebars.json | 170 +++ versions.json | 1 + 286 files changed, 49463 insertions(+), 8 deletions(-) create mode 
100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/Connector-v2-release-state.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/about.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/command/connector-check.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/JobEnvConfig.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/config.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/connector-v2-features.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/event-listener.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/schema-feature.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sink-options-placeholders.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/speed-limit.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sql-config.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/avro.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/canal-json.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/debezium-json.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/ogg-json.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Clickhouse.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Console.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/DingTalk.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Doris.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Elasticsearch.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Email.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Feishu.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hbase.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/HdfsFile.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Http.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hudi.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Jdbc.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Kafka.md create mode 100644 
i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/LocalFile.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Paimon.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Phoenix.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Pulsar.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Rabbitmq.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Redis.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/StarRocks.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/common-options.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Hbase.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/HdfsFile.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Sls.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/common-options.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/coding-guide.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-plugin.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-transform-v2-guide.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/how-to-create-your-connector.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/new-license.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/setup.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/faq.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/other-engine/flink.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/about.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/deployment.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/download-seatunnel.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/resource-isolation.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/rest-api.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/savepoint.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/tcp.md create mode 100644 
i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/user-command.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/deployment.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-flink.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-spark.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/common-options.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/copy.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/dynamic-compile.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/field-mapper.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter-rowkind.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/jsonpath.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/llm.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/replace.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/split.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-functions.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-udf.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql.md create mode 100644 versioned_docs/version-2.3.7/Connector-v2-release-state.md create mode 100644 versioned_docs/version-2.3.7/about.md create mode 100644 versioned_docs/version-2.3.7/command/connector-check.md create mode 100644 versioned_docs/version-2.3.7/command/usage.mdx create mode 100644 versioned_docs/version-2.3.7/concept/JobEnvConfig.md create mode 100644 versioned_docs/version-2.3.7/concept/config.md create mode 100644 versioned_docs/version-2.3.7/concept/connector-v2-features.md create mode 100644 versioned_docs/version-2.3.7/concept/event-listener.md create mode 100644 versioned_docs/version-2.3.7/concept/schema-feature.md create mode 100644 versioned_docs/version-2.3.7/concept/sink-options-placeholders.md create mode 100644 versioned_docs/version-2.3.7/concept/speed-limit.md create mode 100644 versioned_docs/version-2.3.7/concept/sql-config.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/Error-Quick-Reference-Manual.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/avro.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/canal-json.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/debezium-json.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/maxwell-json.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/formats/ogg-json.md create mode 100644 
versioned_docs/version-2.3.7/connector-v2/sink/Activemq.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/AmazonDynamoDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/AmazonSqs.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Assert.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Cassandra.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Clickhouse.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Console.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/CosFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/DB2.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Datahub.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/DingTalk.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Doris.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Druid.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Easysearch.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Elasticsearch.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Email.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Enterprise-WeChat.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Feishu.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/FtpFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/GoogleFirestore.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Greenplum.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Hbase.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/HdfsFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Hive.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Http.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Hudi.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Iceberg.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/InfluxDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/IoTDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Jdbc.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Kafka.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Kingbase.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Kudu.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/LocalFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Maxcompute.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Mivlus.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/MongoDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Mysql.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Neo4j.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/ObsFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/OceanBase.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Oracle.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/OssFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/OssJindoFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Paimon.md 
create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Phoenix.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/PostgreSql.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Pulsar.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Rabbitmq.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Redis.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Redshift.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/RocketMQ.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/S3-Redshift.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/S3File.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/SelectDB-Cloud.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Sentry.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/SftpFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Slack.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Snowflake.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Socket.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/SqlServer.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/StarRocks.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/TDengine.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Tablestore.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/Vertica.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/sink/common-options.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/AmazonDynamoDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/AmazonSqs.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Cassandra.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Clickhouse.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/CosFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/DB2.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Doris.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Easysearch.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Elasticsearch.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/FakeSource.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/FtpFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Github.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Gitlab.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/GoogleSheets.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Greenplum.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Hbase.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/HdfsFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Hive.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/HiveJdbc.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Http.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Iceberg.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/InfluxDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/IoTDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Jdbc.md 
create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Jira.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Kingbase.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Klaviyo.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Kudu.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Lemlist.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/LocalFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Maxcompute.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Mivlus.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/MongoDB-CDC.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/MongoDB.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/MyHours.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/MySQL-CDC.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Mysql.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Neo4j.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Notion.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/ObsFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/OceanBase.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/OneSignal.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/OpenMldb.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Oracle-CDC.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Oracle.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/OssFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/OssJindoFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Paimon.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Persistiq.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Phoenix.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL-CDC.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Pulsar.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Rabbitmq.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Redis.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Redshift.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/RocketMQ.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/S3File.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/SftpFile.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Sls.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Snowflake.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Socket.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/SqlServer-CDC.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/SqlServer.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/StarRocks.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/TDengine.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Vertica.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/Web3j.md create mode 100644 
versioned_docs/version-2.3.7/connector-v2/source/common-options.md create mode 100644 versioned_docs/version-2.3.7/connector-v2/source/kafka.md create mode 100644 versioned_docs/version-2.3.7/contribution/coding-guide.md create mode 100644 versioned_docs/version-2.3.7/contribution/contribute-plugin.md create mode 100644 versioned_docs/version-2.3.7/contribution/contribute-transform-v2-guide.md create mode 100644 versioned_docs/version-2.3.7/contribution/new-license.md create mode 100644 versioned_docs/version-2.3.7/contribution/setup.md create mode 100644 versioned_docs/version-2.3.7/faq.md create mode 100644 versioned_docs/version-2.3.7/other-engine/flink.md create mode 100644 versioned_docs/version-2.3.7/other-engine/spark.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/about.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/deployment.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/download-seatunnel.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/resource-isolation.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/rest-api.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/savepoint.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/tcp.md create mode 100644 versioned_docs/version-2.3.7/seatunnel-engine/user-command.md create mode 100644 versioned_docs/version-2.3.7/start-v2/docker/docker.md create mode 100644 versioned_docs/version-2.3.7/start-v2/kubernetes/kubernetes.mdx create mode 100644 versioned_docs/version-2.3.7/start-v2/locally/deployment.md create mode 100644 versioned_docs/version-2.3.7/start-v2/locally/quick-start-flink.md create mode 100644 versioned_docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md create mode 100644 versioned_docs/version-2.3.7/start-v2/locally/quick-start-spark.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/common-options.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/copy.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/dynamic-compile.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/field-mapper.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/filter-rowkind.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/filter.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/jsonpath.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/llm.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/replace.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/split.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/sql-functions.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/sql-udf.md create mode 100644 versioned_docs/version-2.3.7/transform-v2/sql.md create mode 100644 versioned_sidebars/version-2.3.7-sidebars.json diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/Connector-v2-release-state.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/Connector-v2-release-state.md new 
file mode 100644 index 000000000000..779394b70356 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/Connector-v2-release-state.md @@ -0,0 +1,85 @@ +# 连接器发布状态 + +SeaTunnel 使用连接器分级系统来帮助您了解连接器的期望: + +| | Alpha | Beta | General Availability (GA) | +|----------------------|------------------------------------------------------------------------------|----------------------------------------------------------------------------|--------------------------------------------------------------| +| Expectations | alpha 连接器表示正在开发的连接器,可帮助 SeaTunnel 收集早期采用者报告的早期反馈和问题。 我们强烈反对在生产用例中使用 alpha 版本 | Beta 连接器被认为稳定可靠,没有向后不兼容的更改,但尚未得到更广泛的用户群体的验证。 我们希望在正式发布之前找到并修复该版本中的一些问题和错误。 | 普遍可用的连接器已被认为可以在生产环境中使用,并得到 SeaTunnel 的正式支持。 它的文档被认为足以支持广泛采用。 | +| | | | | +| Production Readiness | No | Yes | Yes | + +## Connector V2 Health + +| Connector Name | Type | Status | Support Version | +|-------------------------------------------------------------------|--------|--------|-----------------| +| [AmazonDynamoDB](../en/connector-v2/sink/AmazonDynamoDB.md) | Sink | Beta | 2.3.0 | +| [AmazonDynamoDB](../en/connector-v2/source/AmazonDynamoDB.md) | Source | Beta | 2.3.0 | +| [Asset](../en/connector-v2/sink/Assert.md) | Sink | Beta | 2.2.0-beta | +| [Cassandra](../en/connector-v2/sink/Cassandra.md) | Sink | Beta | 2.3.0 | +| [Cassandra](../en/connector-v2/source/Cassandra.md) | Source | Beta | 2.3.0 | +| [ClickHouse](../en/connector-v2/source/Clickhouse.md) | Source | GA | 2.2.0-beta | +| [ClickHouse](../en/connector-v2/sink/Clickhouse.md) | Sink | GA | 2.2.0-beta | +| [ClickHouseFile](../en/connector-v2/sink/ClickhouseFile.md) | Sink | GA | 2.2.0-beta | +| [Console](connector-v2/sink/Console.md) | Sink | GA | 2.2.0-beta | +| [DataHub](../en/connector-v2/sink/Datahub.md) | Sink | Alpha | 2.2.0-beta | +| [Doris](../en/connector-v2/sink/Doris.md) | Sink | Beta | 2.3.0 | +| [DingTalk](../en/connector-v2/sink/DingTalk.md) | Sink | Alpha | 2.2.0-beta | +| [Elasticsearch](connector-v2/sink/Elasticsearch.md) | Sink | GA | 2.2.0-beta | +| [Email](connector-v2/sink/Email.md) | Sink | Alpha | 2.2.0-beta | +| [Enterprise WeChat](../en/connector-v2/sink/Enterprise-WeChat.md) | Sink | Alpha | 2.2.0-beta | +| [FeiShu](connector-v2/sink/Feishu.md) | Sink | Alpha | 2.2.0-beta | +| [Fake](../en/connector-v2/source/FakeSource.md) | Source | GA | 2.2.0-beta | +| [FtpFile](../en/connector-v2/sink/FtpFile.md) | Sink | Beta | 2.2.0-beta | +| [Greenplum](../en/connector-v2/sink/Greenplum.md) | Sink | Beta | 2.2.0-beta | +| [Greenplum](../en/connector-v2/source/Greenplum.md) | Source | Beta | 2.2.0-beta | +| [HdfsFile](connector-v2/sink/HdfsFile.md) | Sink | GA | 2.2.0-beta | +| [HdfsFile](connector-v2/source/HdfsFile.md) | Source | GA | 2.2.0-beta | +| [Hive](../en/connector-v2/sink/Hive.md) | Sink | GA | 2.2.0-beta | +| [Hive](../en/connector-v2/source/Hive.md) | Source | GA | 2.2.0-beta | +| [Http](connector-v2/sink/Http.md) | Sink | Beta | 2.2.0-beta | +| [Http](../en/connector-v2/source/Http.md) | Source | Beta | 2.2.0-beta | +| [Iceberg](../en/connector-v2/source/Iceberg.md) | Source | Beta | 2.2.0-beta | +| [InfluxDB](../en/connector-v2/sink/InfluxDB.md) | Sink | Beta | 2.3.0 | +| [InfluxDB](../en/connector-v2/source/InfluxDB.md) | Source | Beta | 2.3.0-beta | +| [IoTDB](../en/connector-v2/source/IoTDB.md) | Source | GA | 2.2.0-beta | +| [IoTDB](../en/connector-v2/sink/IoTDB.md) | Sink | GA | 2.2.0-beta | +| [Jdbc](../en/connector-v2/source/Jdbc.md) | Source | GA | 2.2.0-beta | +| [Jdbc](connector-v2/sink/Jdbc.md) | Sink | 
GA | 2.2.0-beta | +| [Kafka](../en/connector-v2/source/kafka.md) | Source | GA | 2.3.0 | +| [Kafka](connector-v2/sink/Kafka.md) | Sink | GA | 2.2.0-beta | +| [Kudu](../en/connector-v2/source/Kudu.md) | Source | Beta | 2.2.0-beta | +| [Kudu](../en/connector-v2/sink/Kudu.md) | Sink | Beta | 2.2.0-beta | +| [Lemlist](../en/connector-v2/source/Lemlist.md) | Source | Beta | 2.3.0 | +| [LocalFile](../en/connector-v2/sink/LocalFile.md) | Sink | GA | 2.2.0-beta | +| [LocalFile](../en/connector-v2/source/LocalFile.md) | Source | GA | 2.2.0-beta | +| [Maxcompute]../en/(connector-v2/source/Maxcompute.md) | Source | Alpha | 2.3.0 | +| [Maxcompute](../en/connector-v2/sink/Maxcompute.md) | Sink | Alpha | 2.3.0 | +| [MongoDB](../en/connector-v2/source/MongoDB.md) | Source | Beta | 2.2.0-beta | +| [MongoDB](../en/connector-v2/sink/MongoDB.md) | Sink | Beta | 2.2.0-beta | +| [MyHours](../en/connector-v2/source/MyHours.md) | Source | Alpha | 2.2.0-beta | +| [MySqlCDC](../en/connector-v2/source/MySQL-CDC.md) | Source | GA | 2.3.0 | +| [Neo4j](../en/connector-v2/sink/Neo4j.md) | Sink | Beta | 2.2.0-beta | +| [Notion](../en/connector-v2/source/Notion.md) | Source | Alpha | 2.3.0 | +| [OneSignal](../en/connector-v2/source/OneSignal.md) | Source | Beta | 2.3.0 | +| [OpenMldb](../en/connector-v2/source/OpenMldb.md) | Source | Beta | 2.3.0 | +| [OssFile](../en/connector-v2/sink/OssFile.md) | Sink | Beta | 2.2.0-beta | +| [OssFile](../en/connector-v2/source/OssFile.md) | Source | Beta | 2.2.0-beta | +| [Phoenix](../en/connector-v2/sink/Phoenix.md) | Sink | Beta | 2.2.0-beta | +| [Phoenix](../en/connector-v2/source/Phoenix.md) | Source | Beta | 2.2.0-beta | +| [Pulsar](../en/connector-v2/source/Pulsar.md) | Source | Beta | 2.2.0-beta | +| [RabbitMQ](../en/connector-v2/sink/Rabbitmq.md) | Sink | Beta | 2.3.0 | +| [RabbitMQ](../en/connector-v2/source/Rabbitmq.md) | Source | Beta | 2.3.0 | +| [Redis](../en/connector-v2/sink/Redis.md) | Sink | Beta | 2.2.0-beta | +| [Redis](../en/connector-v2/source/Redis.md) | Source | Beta | 2.2.0-beta | +| [S3Redshift](../en/connector-v2/sink/S3-Redshift.md) | Sink | GA | 2.3.0-beta | +| [S3File](../en/connector-v2/source/S3File.md) | Source | GA | 2.3.0-beta | +| [S3File](../en/connector-v2/sink/S3File.md) | Sink | GA | 2.3.0-beta | +| [Sentry](../en/connector-v2/sink/Sentry.md) | Sink | Alpha | 2.2.0-beta | +| [SFtpFile](../en/connector-v2/sink/SftpFile.md) | Sink | Beta | 2.3.0 | +| [SFtpFile](../en/connector-v2/source/SftpFile.md) | Source | Beta | 2.3.0 | +| [Slack](../en/connector-v2/sink/Slack.md) | Sink | Beta | 2.3.0 | +| [Socket](../en/connector-v2/sink/Socket.md) | Sink | Beta | 2.2.0-beta | +| [Socket](../en/connector-v2/source/Socket.md) | Source | Beta | 2.2.0-beta | +| [StarRocks](../en/connector-v2/sink/StarRocks.md) | Sink | Alpha | 2.3.0 | +| [Tablestore](../en/connector-v2/sink/Tablestore.md) | Sink | Alpha | 2.3.0 | + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/about.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/about.md new file mode 100644 index 000000000000..9af0bfaaea3b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/about.md @@ -0,0 +1,70 @@ +# 关于 SeaTunnel + +seatunnel logo + +[![Slack](https://img.shields.io/badge/slack-%23seatunnel-4f8eba?logo=slack)](https://s.apache.org/seatunnel-slack) +[![Twitter Follow](https://img.shields.io/twitter/follow/ASFSeaTunnel.svg?label=Follow&logo=twitter)](https://twitter.com/ASFSeaTunnel) + +SeaTunnel是一个非常易用、超高性能的分布式数据集成平台,支持实时海量数据同步。 
每天可稳定高效同步数百亿数据,已被近百家企业应用于生产。 + +## 为什么需要 SeaTunnel + +SeaTunnel专注于数据集成和数据同步,主要旨在解决数据集成领域的常见问题: + +- 数据源多样:常用数据源有数百种,版本不兼容。 随着新技术的出现,更多的数据源不断出现。 用户很难找到一个能够全面、快速支持这些数据源的工具。 +- 同步场景复杂:数据同步需要支持离线全量同步、离线增量同步、CDC、实时同步、全库同步等多种同步场景。 +- 资源需求高:现有的数据集成和数据同步工具往往需要大量的计算资源或JDBC连接资源来完成海量小表的实时同步。 这增加了企业的负担。 +- 缺乏质量和监控:数据集成和同步过程经常会出现数据丢失或重复的情况。 同步过程缺乏监控,无法直观了解任务过程中数据的真实情况。 +- 技术栈复杂:企业使用的技术组件不同,用户需要针对不同组件开发相应的同步程序来完成数据集成。 +- 管理和维护困难:受限于底层技术组件(Flink/Spark)不同,离线同步和实时同步往往需要分开开发和管理,增加了管理和维护的难度。 + +## SeaTunnel 相关特性 + +- 丰富且可扩展的Connector:SeaTunnel提供了不依赖于特定执行引擎的Connector API。 基于该API开发的Connector(Source、Transform、Sink)可以运行在很多不同的引擎上,例如目前支持的SeaTunnel引擎(Zeta)、Flink、Spark等。 +- Connector插件:插件式设计让用户可以轻松开发自己的Connector并将其集成到SeaTunnel项目中。 目前,SeaTunnel 支持超过 100 个连接器,并且数量正在激增。 这是[当前支持的连接器]的列表(Connector-v2-release-state.md) +- 批流集成:基于SeaTunnel Connector API开发的Connector完美兼容离线同步、实时同步、全量同步、增量同步等场景。 它们大大降低了管理数据集成任务的难度。 +- 支持分布式快照算法,保证数据一致性。 +- 多引擎支持:SeaTunnel默认使用SeaTunnel引擎(Zeta)进行数据同步。 SeaTunnel还支持使用Flink或Spark作为Connector的执行引擎,以适应企业现有的技术组件。 SeaTunnel 支持 Spark 和 Flink 的多个版本。 +- JDBC复用、数据库日志多表解析:SeaTunnel支持多表或全库同步,解决了过度JDBC连接的问题; 支持多表或全库日志读取解析,解决了CDC多表同步场景下需要处理日志重复读取解析的问题。 +- 高吞吐量、低延迟:SeaTunnel支持并行读写,提供稳定可靠、高吞吐量、低延迟的数据同步能力。 +- 完善的实时监控:SeaTunnel支持数据同步过程中每一步的详细监控信息,让用户轻松了解同步任务读写的数据数量、数据大小、QPS等信息。 +- 支持两种作业开发方法:编码和画布设计。 SeaTunnel Web 项目 https://github.com/apache/seatunnel-web 提供作业、调度、运行和监控功能的可视化管理。 + +## SeaTunnel 工作流图 + +![SeaTunnel Work Flowchart](/image_zh/architecture_diagram.png) + +SeaTunnel的运行流程如上图所示。 + +用户配置作业信息并选择提交作业的执行引擎。 + +Source Connector负责并行读取数据并将数据发送到下游Transform或直接发送到Sink,Sink将数据写入目的地。 值得注意的是,Source、Transform 和 Sink 可以很容易地自行开发和扩展。 + +SeaTunnel 是一个 EL(T) 数据集成平台。 因此,在SeaTunnel中,Transform只能用于对数据进行一些简单的转换,例如将一列的数据转换为大写或小写,更改列名,或者将一列拆分为多列。 + +SeaTunnel 使用的默认引擎是 [SeaTunnel Engine](seatunnel-engine/about.md)。 如果您选择使用Flink或Spark引擎,SeaTunnel会将Connector打包成Flink或Spark程序并提交给Flink或Spark运行。 + +## 连接器 + +- **源连接器** SeaTunnel 支持从各种关系、图形、NoSQL、文档和内存数据库读取数据; 分布式文件系统,例如HDFS; 以及各种云存储解决方案,例如S3和OSS。 我们还支持很多常见SaaS服务的数据读取。 您可以在[此处] 访问详细列表。 如果您愿意,您可以开发自己的源连接器并将其轻松集成到 SeaTunnel 中。 + +- **转换连接器** 如果源和接收器之间的架构不同,您可以使用转换连接器更改从源读取的架构,使其与接收器架构相同。 + +- **Sink Connector** SeaTunnel 支持将数据写入各种关系型、图形、NoSQL、文档和内存数据库; 分布式文件系统,例如HDFS; 以及各种云存储解决方案,例如S3和OSS。 我们还支持将数据写入许多常见的 SaaS 服务。 您可以在[此处]访问详细列表。 如果您愿意,您可以开发自己的 Sink 连接器并轻松将其集成到 SeaTunnel 中。 + +## 谁在使用 SeaTunnel + +SeaTunnel 拥有大量用户。 您可以在[用户](https://seatunnel.apache.org/user)中找到有关他们的更多信息. + +## 展望 + +
+SeaTunnel 丰富了 CNCF 云原生景观。 +

+ +## 了解更多 + +您可以参阅[快速入门](/docs/category/start-v2/locally/deployment) 了解后续相关步骤。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/command/connector-check.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/command/connector-check.md new file mode 100644 index 000000000000..5df7c54611e6 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/command/connector-check.md @@ -0,0 +1,35 @@ +# 连接器检查命令用法 + +## 命令入口 + +```shell +bin/seatunnel-connector.sh +``` + +## 命令选项 + +```text +Usage: seatunnel-connector.sh [options] + Options: + -h, --help Show the usage message + -l, --list List all supported plugins(sources, sinks, transforms) + (default: false) + -o, --option-rule Get option rule of the plugin by the plugin + identifier(connector name or transform name) + -pt, --plugin-type SeaTunnel plugin type, support [source, sink, + transform] +``` + +## 例子 + +```shell +# List all supported connectors(sources and sinks) and transforms +bin/seatunnel-connector.sh -l +# List all supported sinks +bin/seatunnel-connector.sh -l -pt sink +# Get option rule of the connector or transform by the name +bin/seatunnel-connector.sh -o Paimon +# Get option rule of paimon sink +bin/seatunnel-connector.sh -o Paimon -pt sink +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/JobEnvConfig.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/JobEnvConfig.md new file mode 100644 index 000000000000..c20797604f34 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/JobEnvConfig.md @@ -0,0 +1,67 @@ +# JobEnvConfig + +本文档描述了env的配置信息,公共参数可以在所有引擎中使用。为了更好的区分引擎参数,其他引擎的附加参数需要携带前缀。 +在flink引擎中,我们使用`flink.`作为前缀。在spark引擎中,我们不使用任何前缀来修改参数,因为官方的spark参数本身就是以`spark.`开头。 + +## 公共参数 + +以下配置参数对所有引擎通用: + +### job.name + +该参数配置任务名称。 + +### jars + +第三方包可以通过`jars`加载,例如:`jars="file://local/jar1.jar;file://local/jar2.jar"` + +### job.mode + +通过`job.mode`你可以配置任务是在批处理模式还是流处理模式。例如:`job.mode = "BATCH"` 或者 `job.mode = "STREAMING"` + +### checkpoint.interval + +获取定时调度检查点的时间间隔。 + +在`STREAMING`模式下,检查点是必须的,如果不设置,将从应用程序配置文件`seatunnel.yaml`中获取。 在`BATCH`模式下,您可以通过不设置此参数来禁用检查点。 + +### parallelism + +该参数配置source和sink的并行度。 + +### job.retry.times + +用于控制作业失败时的默认重试次数。默认值为3,并且仅适用于Zeta引擎。 + +### job.retry.interval.seconds + +用于控制作业失败时的默认重试间隔。默认值为3秒,并且仅适用于Zeta引擎。 + +### savemode.execute.location + +此参数用于指定在Zeta引擎中执行作业时SaveMode执行的时机。 +默认值为`CLUSTER`,这意味着SaveMode在作业提交到集群上之后在集群上执行。 +当值为`CLIENT`时,SaveMode操作在作业提交的过程中执行,使用shell脚本提交作业时,该过程在提交作业的shell进程中执行。使用rest api提交作业时,该过程在http请求的处理线程中执行。 +请尽量使用`CLUSTER`模式,因为当`CLUSTER`模式没有问题时,我们将删除`CLIENT`模式。 + +### shade.identifier + +指定加密方式,如果您没有加密或解密配置文件的需求,此选项可以忽略。 + +更多详细信息,您可以参考文档 [Config Encryption Decryption](../../en/connector-v2/Config-Encryption-Decryption.md) + +## Flink 引擎参数 + +这里列出了一些与 Flink 中名称相对应的 SeaTunnel 参数名称,并非全部,更多内容请参考官方 [Flink Documentation](https://flink.apache.org/) for more. + +| Flink 配置名称 | SeaTunnel 配置名称 | +|---------------------------------|---------------------------------------| +| pipeline.max-parallelism | flink.pipeline.max-parallelism | +| execution.checkpointing.mode | flink.execution.checkpointing.mode | +| execution.checkpointing.timeout | flink.execution.checkpointing.timeout | +| ... | ... | + +## Spark 引擎参数 + +由于Spark配置项并无调整,这里就不列出来了,请参考官方 [Spark Documentation](https://spark.apache.org/). 
+ diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/config.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/config.md new file mode 100644 index 000000000000..72c14bafcec7 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/config.md @@ -0,0 +1,310 @@ +--- + +sidebar_position: 2 +------------------- + +# 配置文件简介 + +在SeaTunnel中,最重要的事情就是配置文件,尽管用户可以自定义他们自己的数据同步需求以发挥SeaTunnel最大的潜力。那么接下来我将会向你介绍如何设置配置文件。 + +配置文件的主要格式是 `hocon`, 有关该格式类型的更多信息你可以参考[HOCON-GUIDE](https://github.com/lightbend/config/blob/main/HOCON.md), +顺便提一下,我们也支持 `json`格式,但你应该知道配置文件的名称应该是以 `.json`结尾。 + +我们同时提供了以 `SQL` 格式,详细可以参考[SQL配置文件](sql-config.md)。 + +## 例子 + +在你阅读之前,你可以在发布包中的config目录[这里](https://github.com/apache/seatunnel/tree/dev/config)找到配置文件的例子。 + +## 配置文件结构 + +配置文件类似下面这个例子: + +### hocon + +```hocon +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + name = "string" + age = "int" + card = "int" + } + } + } +} + +transform { + Filter { + source_table_name = "fake" + result_table_name = "fake1" + fields = [name, card] + } +} + +sink { + Clickhouse { + host = "clickhouse:8123" + database = "default" + table = "seatunnel_console" + fields = ["name", "card"] + username = "default" + password = "" + source_table_name = "fake1" + } +} +``` + +#### 多行文本支持 + +`hocon`支持多行字符串,这样就可以包含较长的文本段落,而不必担心换行符或特殊格式。这可以通过将文本括在三层引号 **`"""`** 中来实现。例如: + +``` +var = """ +Apache SeaTunnel is a +next-generation high-performance, +distributed, massive data integration tool. +""" +sql = """ select * from "table" """ +``` + +### json + +```json + +{ + "env": { + "job.mode": "batch" + }, + "source": [ + { + "plugin_name": "FakeSource", + "result_table_name": "fake", + "row.num": 100, + "schema": { + "fields": { + "name": "string", + "age": "int", + "card": "int" + } + } + } + ], + "transform": [ + { + "plugin_name": "Filter", + "source_table_name": "fake", + "result_table_name": "fake1", + "fields": ["name", "card"] + } + ], + "sink": [ + { + "plugin_name": "Clickhouse", + "host": "clickhouse:8123", + "database": "default", + "table": "seatunnel_console", + "fields": ["name", "card"], + "username": "default", + "password": "", + "source_table_name": "fake1" + } + ] +} + +``` + +正如你看到的,配置文件包括几个部分:env, source, transform, sink。不同的模块具有不同的功能。 +当你了解了这些模块后,你就会懂得SeaTunnel到底是如何工作的。 + +### env + +用于添加引擎可选的参数,不管是什么引擎(Zeta、Spark 或者 Flink),对应的可选参数应该在这里填写。 + +注意,我们按照引擎分离了参数,对于公共参数我们可以像以前一样配置。对于Flink和Spark引擎,其参数的具体配置规则可以参考[JobEnvConfig](./JobEnvConfig.md)。 + + + +### source + +source用于定义SeaTunnel在哪儿检索数据,并将检索的数据用于下一步。 +可以同时定义多个source。目前支持的source请看[Source of SeaTunnel](../../en/connector-v2/source)。每种source都有自己特定的参数用来 +定义如何检索数据,SeaTunnel也抽象了每种source所使用的参数,例如 `result_table_name` 参数,用于指定当前source生成的数据的名称, +方便后续其他模块使用。 + +### transform + +当我们有了数据源之后,我们可能需要对数据进行进一步的处理,所以我们就有了transform模块。当然,这里使用了“可能”这个词, +这意味着我们也可以直接将transform视为不存在,直接从source到sink,像下面这样: + +```hocon +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + name = "string" + age = "int" + card = "int" + } + } + } +} + +sink { + Clickhouse { + host = "clickhouse:8123" + database = "default" + table = "seatunnel_console" + fields = ["name", "age", "card"] + username = "default" + password = "" + source_table_name = "fake1" + } +} +``` + +与source类似, transform也有属于每个模块的特定参数。目前支持的source请看。目前支持的transform请看 [Transform V2 of SeaTunnel](../../en/transform-v2) + + + +### sink + 
+我们使用SeaTunnel的作用是将数据从一个地方同步到其它地方,所以定义数据如何写入,写入到哪里是至关重要的。通过SeaTunnel提供的 +sink模块,你可以快速高效地完成这个操作。Sink和source非常相似,区别在于读取和写入。所以去看看我们[Sink of SeaTunnel](../../en/connector-v2/sink)吧。 + +### 其它 + +你会疑惑当定义了多个source和多个sink时,每个sink读取哪些数据,每个transform读取哪些数据?我们使用`result_table_name` 和 +`source_table_name` 两个配置。每个source模块都会配置一个`result_table_name`来指示数据源生成的数据源名称,其它transform和sink +模块可以使用`source_table_name` 引用相应的数据源名称,表示要读取数据进行处理。然后transform,作为一个中间的处理模块,可以同时使用 +`result_table_name` 和 `source_table_name` 配置。但你会发现在上面的配置例子中,不是每个模块都配置了这些参数,因为在SeaTunnel中, +有一个默认的约定,如果这两个参数没有配置,则使用上一个节点的最后一个模块生成的数据。当只有一个source时这是非常方便的。 + +## 配置变量替换 + +在配置文件中,我们可以定义一些变量并在运行时替换它们。但是注意仅支持 hocon 格式的文件。 + +```hocon +env { + job.mode = "BATCH" + job.name = ${jobName} + parallelism = 2 +} + +source { + FakeSource { + result_table_name = ${resName} + row.num = ${rowNum} + string.template = ${strTemplate} + int.template = [20, 21] + schema = { + fields { + name = ${nameType} + age = "int" + } + } + } +} + +transform { + sql { + source_table_name = "fake" + result_table_name = "sql" + query = "select * from "${resName}" where name = '"${nameVal}"' " + } + +} + +sink { + Console { + source_table_name = "sql" + username = ${username} + password = ${password} + } +} + +``` + +在上述配置中,我们定义了一些变量,如 ${rowNum}、${resName}。 +我们可以使用以下 shell 命令替换这些参数: + +```shell +./bin/seatunnel.sh -c +-i jobName='this_is_a_job_name' +-i resName=fake +-i rowNum=10 +-i strTemplate=['abc','d~f','hi'] +-i nameType=string +-i nameVal=abc +-i username=seatunnel=2.3.1 +-i password='$a^b%c.d~e0*9(' +-e local +``` + +然后最终提交的配置是: + +```hocon +env { + job.mode = "BATCH" + job.name = "this_is_a_job_name" + parallelism = 2 +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 10 + string.template = ['abc','d~f','hi'] + int.template = [20, 21] + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + sql { + source_table_name = "fake" + result_table_name = "sql" + query = "select * from "fake" where name = 'abc' " + } + +} + +sink { + Console { + source_table_name = "sql" + username = "seatunnel=2.3.1" + password = "$a^b%c.d~e0*9(" + } +} + +``` + +一些注意事项: + +- 如果值包含特殊字符,如`(`,请使用`'`引号将其括起来。 +- 如果替换变量包含`"`或`'`(如`"resName"`和`"nameVal"`),需要添加`"`。 +- 值不能包含空格`' '`。例如, `-i jobName='this is a job name'`将被替换为`job.name = "this"`。 +- 如果要使用动态参数,可以使用以下格式: `-i date=$(date +"%Y%m%d")`。 + +## 此外 + +如果你想了解更多关于格式配置的详细信息,请查看 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md)。 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/connector-v2-features.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/connector-v2-features.md new file mode 100644 index 000000000000..77041e953250 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/connector-v2-features.md @@ -0,0 +1,70 @@ +# Connector V2 功能简介 + +## Connector V2 和 V1 之间的不同 + +从 https://github.com/apache/seatunnel/issues/1608 我们添加了 Connector V2 特性。 +Connector V2 是基于SeaTunnel Connector API接口定义的连接器。不像Connector V1, V2 支持如下特性: + +* **多引擎支持** SeaTunnel Connector API 是引擎独立的API。基于这个API开发的连接器可以在多个引擎上运行。目前支持Flink和Spark引擎,后续我们会支持其它的引擎。 +* **多引擎版本支持** 通过翻译层将连接器与引擎解耦,解决了大多数连接器需要修改代码才能支持新版本底层引擎的问题。 +* **流批一体** Connector V2 可以支持批处理和流处理。我们不需要为批和流分别开发连接器。 +* **多路复用JDBC/Log连接。** Connector V2支持JDBC资源复用和共享数据库日志解析。 + +## Source Connector 特性 + +Source connector有一些公共的核心特性,每个source connector在不同程度上支持它们。 + +### 精确一次(exactly-once) + +如果数据源中的每条数据仅由源向下游发送一次,我们认为该source connector支持精确一次(exactly-once)。 + +在SeaTunnel中, 我们可以保存读取的 **Split** 和它的 
**offset**(当时读取的数据被分割时的位置,例如行号, 字节大小, 偏移量等) 作为检查点时的 **StateSnapshot** 。 如果任务重新启动, 我们会得到最后的 **StateSnapshot** +然后定位到上次读取的 **Split** 和 **offset**,继续向下游发送数据。 + +例如 `File`, `Kafka`。 + +### 列投影(column projection) + +如果连接器支持仅从数据源读取指定列(请注意,如果先读取所有列,然后通过元数据(schema)过滤不需要的列,则此方法不是真正的列投影) + +例如 `JDBCSource` 可以使用sql定义读取列。 + +`KafkaSource` 从主题中读取所有内容然后使用`schema`过滤不必要的列, 这不是真正的`列投影`。 + +### 批(batch) + +批处理作业模式,读取的数据是有界的,当所有数据读取完成后作业将停止。 + +### 流(stream) + +流式作业模式,数据读取无界,作业永不停止。 + +### 并行性(parallelism) + +并行执行的Source Connector支持配置 `parallelism`,每个并发会创建一个任务来读取数据。 +在**Parallelism Source Connector**中,source会被分割成多个split,然后枚举器会将 split 分配给 SourceReader 进行处理。 + +### 支持用户自定义split + +用户可以配置分割规则。 + +### 支持多表读取 + +支持在一个 SeaTunnel 作业中读取多个表。 + +## Sink Connector 的特性 + +Sink connector有一些公共的核心特性,每个sink connector在不同程度上支持它们。 + +### 精确一次(exactly-once) + +当任意一条数据流入分布式系统时,如果系统在整个处理过程中仅准确处理任意一条数据一次,且处理结果正确,则认为系统满足精确一次一致性。 + +对于sink connector,如果任何数据只写入目标一次,则sink connector支持精确一次。 通常有两种方法可以实现这一目标: + +* 目标数据库支持key去重。例如 `MySQL`, `Kudu`。 +* 目标支持 **XA 事务**(事务可以跨会话使用,即使创建事务的程序已经结束,新启动的程序也只需要知道最后一个事务的ID就可以重新提交或回滚事务)。 然后我们可以使用 **两阶段提交** 来确保 * 精确一次**。 例如:`File`, `MySQL`. + +### cdc(更改数据捕获,change data capture) + +如果sink connector支持基于主键写入行类型(INSERT/UPDATE_BEFORE/UPDATE_AFTER/DELETE),我们认为它支持cdc(更改数据捕获,change data capture)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/event-listener.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/event-listener.md new file mode 100644 index 000000000000..69972cbfc56b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/event-listener.md @@ -0,0 +1,114 @@ +# 事件监听器 + +## 介绍 + +SeaTunnel提供了丰富的事件监听器功能,用于管理数据同步时的状态。此功能在需要监听任务运行状态时十分重要(`org.apache.seatunnel.api.event`)。本文档将指导您如何使用这些参数并有效地利用他们。 + +## 支持的引擎 + +> SeaTunnel Zeta
+> Flink
+> Spark
+ +## API + +事件(event)API的定义在 `org.apache.seatunnel.api.event`包中。 + +### Event Data API + +- `org.apache.seatunnel.api.event.Event` - 事件数据的接口。 +- `org.apache.seatunnel.api.event.EventType` - 事件数据的枚举值。 + +### Event Listener API + +您可以自定义事件处理器,例如将事件发送到外部系统。 + +- `org.apache.seatunnel.api.event.EventHandler` - 事件处理器的接口,SPI将会自动从类路径中加载子类。 + +### Event Collect API + +- `org.apache.seatunnel.api.source.SourceSplitEnumerator` - 在`SourceSplitEnumerator`加载事件监听器。 + +```java +package org.apache.seatunnel.api.source; + +public interface SourceSplitEnumerator { + + interface Context { + + /** + * Get the {@link org.apache.seatunnel.api.event.EventListener} of this enumerator. + * + * @return + */ + EventListener getEventListener(); + } +} +``` + +- `org.apache.seatunnel.api.source.SourceReader` - 在`SourceReader`加载事件监听器。 + +```java +package org.apache.seatunnel.api.source; + +public interface SourceReader { + + interface Context { + + /** + * Get the {@link org.apache.seatunnel.api.event.EventListener} of this reader. + * + * @return + */ + EventListener getEventListener(); + } +} +``` + +- `org.apache.seatunnel.api.sink.SinkWriter` - 在`SinkWriter`加载事件监听器。 + +```java +package org.apache.seatunnel.api.sink; + +public interface SinkWriter { + + interface Context { + + /** + * Get the {@link org.apache.seatunnel.api.event.EventListener} of this writer. + * + * @return + */ + EventListener getEventListener(); + } +} +``` + +## 设置监听器 + +您需要设置引擎配置以使用事件监听器功能。 + +### Zeta 引擎 + +配置样例(seatunnel.yaml): + +``` +seatunnel: + engine: + event-report-http: + url: "http://example.com:1024/event/report" + headers: + Content-Type: application/json +``` + +### Flink 引擎 + +您可以定义 `org.apache.seatunnel.api.event.EventHandler` 接口并添加到类路径,SPI会自动加载。 + +支持的flink版本: 1.14.0+ + +样例: `org.apache.seatunnel.api.event.LoggingEventHandler` + +### Spark 引擎 + +您可以定义 `org.apache.seatunnel.api.event.EventHandler` 接口并添加到类路径,SPI会自动加载。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/schema-feature.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/schema-feature.md new file mode 100644 index 000000000000..d719a7953e58 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/schema-feature.md @@ -0,0 +1,264 @@ +# Schema 特性简介 + +## 为什么我们需要Schema + +某些NoSQL数据库或消息队列没有严格限制schema,因此无法通过api获取schema。 +这时需要定义一个schema来转换为TableSchema并获取数据。 + +## SchemaOptions + +我们可以使用SchemaOptions定义schema, SchemaOptions包含了一些定义schema的配置。 例如:columns, primaryKey, constraintKeys。 + +``` +schema = { + table = "database.schema.table" + schema_first = false + comment = "comment" + columns = [ + ... + ] + primaryKey { + ... + } + + constraintKeys { + ... + } +} +``` + +### table + +schema所属的表标识符的表全名,包含数据库、schema、表名。 例如 `database.schema.table`、`database.table`、`table`。 + +### schema_first + +默认是false。 + +如果schema_first是true, schema会优先使用, 这意味着如果我们设置 `table = "a.b"`, `a` 会被解析为schema而不是数据库, 那么我们可以支持写入 `table = "schema.table"`. 
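下面给出一个假设的示例片段来演示 `schema_first` 的用法,其中的表名和列仅为示意,请按实际数据源调整:

```hocon
schema = {
  # schema_first 为 true 时,"dbo.products" 中的 "dbo" 会被解析为 schema 而不是数据库
  schema_first = true
  table = "dbo.products"
  columns = [
    {
      name = id
      type = bigint
      nullable = false
      comment = "primary key id"
    }
  ]
}
```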
+ +### comment + +schema所属的 CatalogTable 的注释。 + +### Columns + +Columns 是用于定义模式中的列的配置列表,每列可以包含名称(name)、类型(type)、是否可空(nullable)、默认值(defaultValue)、注释(comment)字段。 + +``` +columns = [ + { + name = id + type = bigint + nullable = false + columnLength = 20 + defaultValue = 0 + comment = "primary key id" + } +] +``` + +| 字段 | 是否必须 | 默认值 | 描述 | +|:-------------|:-----|:-----|--------------------| +| name | Yes | - | 列的名称 | +| type | Yes | - | 列的数据类型 | +| nullable | No | true | 列是否可空 | +| columnLength | No | 0 | 列的长度,当您需要定义长度时将很有用 | +| columnScale | No | - | 列的精度,当您需要定义精度时将很有用 | +| defaultValue | No | null | 列的默认值 | +| comment | No | null | 列的注释 | + +#### 目前支持哪些类型 + +| 数据类型 | Java中的值类型 | 描述 | +|:----------|:---------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| string | `java.lang.String` | 字符串 | +| boolean | `java.lang.Boolean` | 布尔 | +| tinyint | `java.lang.Byte` | 常规-128 至 127 。 0 到 255 无符号*。 指定括号中的最大位数。 | +| smallint | `java.lang.Short` | 常规-32768 至 32767。 0 到 65535 无符号*。 指定括号中的最大位数。 | +| int | `java.lang.Integer` | 允许从 -2,147,483,648 到 2,147,483,647 的所有数字。 | +| bigint | `java.lang.Long` | 允许 -9,223,372,036,854,775,808 和 9,223,372,036,854,775,807 之间的所有数字。 | +| float | `java.lang.Float` | 从-1.79E+308 到 1.79E+308浮点精度数值数据。 | +| double | `java.lang.Double` | 双精度浮点。 处理大多数小数。 | +| decimal | `java.math.BigDecimal` | Double 类型存储为字符串,允许固定小数点。 | +| null | `java.lang.Void` | null | +| bytes | `byte[]` | 字节。 | +| date | `java.time.LocalDate` | 仅存储日期。从0001年1月1日到9999 年 12 月 31 日。 | +| time | `java.time.LocalTime` | 仅存储时间。精度为 100 纳秒。 | +| timestamp | `java.time.LocalDateTime` | 存储一个唯一的编号,每当创建或修改行时都会更新该编号。 时间戳基于内部时钟,与实际时间不对应。 每个表只能有一个时间戳变量。 | +| row | `org.apache.seatunnel.api.table.type.SeaTunnelRow` | 行类型,可以嵌套。 | +| map | `java.util.Map` | Map 是将键映射到值的对象。 键类型包括: `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` , and the value type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` `array` `map` `row`. | +| array | `ValueType[]` | 数组是一种表示元素集合的数据类型。 元素类型包括: `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double`. 
| + +#### 如何声明支持的类型 + +SeaTunnel 提供了一种简单直接的方式来声明基本类型。基本类型的关键字包括:`string`, `boolean`, `tinyint`, `smallint`, `int`, `bigint`, `float`, `double`, `date`, `time`, `timestamp`, 和 `null`。基本类型的关键字名称可以直接用作类型声明,并且SeaTunnel对类型关键字不区分大小写。 例如,如果您需要声明一个整数类型的字段,您可以简单地将字段定义为`int`或`"int"`。 + +> null 类型声明必须用双引号引起来, 例如:`"null"`。 这种方法有助于避免与 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) 中表示未定义的对象的 `null` 类型混淆。 + +声明复杂类型(例如 **decimal**、**array**、**map** 和 **row**)时,请注意具体注意事项。 +- 声明decimal类型时,需要设置精度(precision)和小数位数(scale),类型定义遵循“decimal(precision, scale)”格式。 需要强调的是,十进制类型的声明必须用 `"` 括起来;不能像基本类型一样直接使用类型名称。例如,当声明精度为 10、小数位数为 2 的十进制字段时,您可以指定字段类型为`"decimal(10,2)"`。 +- 声明array类型时,需要指定元素类型,类型定义遵循 `array` 格式,其中 `T` 代表元素类型。元素类型包括`int`,`string`,`boolean`,`tinyint`,`smallint`,`bigint`,`float` 和 `double`。与十进制类型声明类似,它也用 `"` 括起来。例如,在声明具有整数数组的字段时,将字段类型指定为 `"array"`。 +- 声明map类型时,需要指定键和值类型。map类型定义遵循`map`格式,其中`K`表示键类型,`V`表示值类型。 `K`可以是任何基本类型和十进制类型,`V`可以是 SeaTunnel 支持的任何类型。 与之前的类型声明类似,map类型声明必须用双引号引起来。 例如,当声明一个map类型的字段时,键类型为字符串,值类型为整数,则可以将该字段声明为`"map"`。 +- 声明row类型时,需要定义一个 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) 对象来描述字段及其类型。 字段类型可以是 SeaTunnel 支持的任何类型。 例如,当声明包含整数字段“a”和字符串字段“b”的行类型时,可以将其声明为“{a = int, b = string}”。 将定义作为字符串括在 `"` 中也是可以接受的,因此 `"{a = int, b = string}"` 相当于 `{a = int, c = string}`。由于 HOCON 与 JSON 兼容, `"{\"a\":\"int\", \"b\":\"string\"}"` 等价于 `"{a = int, b = string}"`。 + +以下是复杂类型声明的示例: + +```hocon +schema { + fields { + c_decimal = "decimal(10, 2)" + c_array = "array" + c_row = { + c_int = int + c_string = string + c_row = { + c_int = int + } + } + # 在泛型中Hocon风格声明行类型 + map0 = "map" + # 在泛型中Json风格声明行类型 + map1 = "map" + } +} +``` + +### 主键(PrimaryKey) + +主键是用于定义模式中主键的配置,它包含name、columns字段。 + +``` +primaryKey { + name = id + columns = [id] +} +``` + +| 字段 | 是否必须 | 默认值 | 描述 | +|:--------|:-----|:----|---------| +| name | 是 | - | 主键名称 | +| columns | 是 | - | 主键中的列列表 | + +### 约束键(constraintKeys) + +约束键是用于定义模式中约束键的配置列表,它包含constraintName,constraintType,constraintColumns字段。 + +``` +constraintKeys = [ + { + constraintName = "id_index" + constraintType = KEY + constraintColumns = [ + { + columnName = "id" + sortType = ASC + } + ] + }, + ] +``` + +| 字段 | 是否必须 | 默认值 | 描述 | +|:------------------|:-----|:----|------------------------------------------------------------------------| +| constraintName | 是 | - | 约束键的名称 | +| constraintType | 否 | KEY | 约束键的类型 | +| constraintColumns | 是 | - | PrimaryKey中的列列表,每列应包含constraintType和sortType,sortType支持ASC和DESC,默认为ASC | + +#### 目前支持哪些约束类型 + +| 约束类型 | 描述 | +|:-----------|:----| +| INDEX_KEY | 键 | +| UNIQUE_KEY | 唯一键 | + +## 如何使用schema + +### 推荐 + +``` +source { + FakeSource { + parallelism = 2 + result_table_name = "fake" + row.num = 16 + schema { + table = "FakeDatabase.FakeTable" + columns = [ + { + name = id + type = bigint + nullable = false + defaultValue = 0 + comment = "primary key id" + }, + { + name = name + type = "string" + nullable = true + comment = "name" + }, + { + name = age + type = int + nullable = true + comment = "age" + } + ] + primaryKey { + name = "id" + columnNames = [id] + } + constraintKeys = [ + { + constraintName = "unique_name" + constraintType = UNIQUE_KEY + constraintColumns = [ + { + columnName = "name" + sortType = ASC + } + ] + }, + ] + } + } +} +``` + +### 已弃用 + +如果你只需要定义列,你可以使用字段来定义列,这是一种简单的方式,但将来会被删除。 + +``` +source { + FakeSource { + parallelism = 2 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + id = bigint + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = 
tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} +``` + +## 我们什么时候应该使用它,什么时候不应该使用它 + +如果选项中有`schema`配置项目,则连接器可以自定义schema。 比如 `Fake` `Pulsar` `Http` 源连接器等。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sink-options-placeholders.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sink-options-placeholders.md new file mode 100644 index 000000000000..2553feb549fc --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sink-options-placeholders.md @@ -0,0 +1,110 @@ +# Sink 参数占位符 + +## 介绍 + +SeaTunnel 提供了 Sink 参数占位符自动替换功能,可让您通过占位符获取上游表元数据。 + +当您需要动态获取上游表元数据(例如多表写入)时,此功能至关重要。 + +本文档将指导您如何使用这些占位符以及如何有效地利用它们。 + +## 支持的引擎 + +> SeaTunnel Zeta
+> Flink
+> Spark
+ +## 占位符变量 + +占位符主要通过以下表达式实现: + +- `${database_name}` + - 用于获取上游表中的数据库名称 + - 也可以通过表达式指定默认值:`${database_name:default_my_db}` +- `${schema_name}` + - 用于获取上游表中的 schema 名称 + - 也可以通过表达式指定默认值:`${schema_name:default_my_schema}` +- `${table_name}` + - 用于获取上游表中的 table 名称 + - 也可以通过表达式指定默认值:`${table_name:default_my_table}` +- `${schema_full_name}` + - 用于获取上游表中的 schema 全路径名称,包含 database/schema 名称 +- `${table_full_name}` + - 用于获取上游表中的 table 全路径名称,包含 database/schema/table 名称 +- `${primary_key}` + - 用于获取上游表中的主键字段名称列表 +- `${unique_key}` + - 用于获取上游表中的唯一键字段名称列表 +- `${field_names}` + - 用于获取上游表中的所有字段名称列表 + +## 配置 + +*先决条件*: +- 确认 Sink 连接器已经支持了 `TableSinkFactory` API + +### 配置示例 1 + +```hocon +env { + // ignore... +} +source { + MySQL-CDC { + // ignore... + } +} + +transform { + // ignore... +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "${database_name}_test" + table = "${table_name}_test" + primary_keys = ["${primary_key}"] + } +} +``` + +### 配置示例 2 + +```hocon +env { + // ignore... +} +source { + Oracle-CDC { + // ignore... + } +} + +transform { + // ignore... +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "${schema_name}_test" + table = "${table_name}_test" + primary_keys = ["${primary_key}"] + } +} +``` + +占位符的替换将在连接器启动之前完成,确保 Sink 参数在使用前已准备就绪。 +若该占位符变量没有被替换,则可能是上游表元数据缺少该选项,例如: +- `mysql` source 连接器不包含 `${schema_name}` 元数据 +- `oracle` source 连接器不包含 `${databse_name}` 元数据 +- ... diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/speed-limit.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/speed-limit.md new file mode 100644 index 000000000000..51007269dd05 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/speed-limit.md @@ -0,0 +1,43 @@ +# 速度控制 + +## 介绍 + +SeaTunnel提供了强大的速度控制功能允许你管理数据同步的速率。当你需要确保在系统之间数据传输的高效和可控这个功能是至关重要的。 +速度控制主要由两个关键参数控制:`read_limit.rows_per_second` 和 `read_limit.bytes_per_second`。 +本文档将指导您如何使用这些参数以及如何有效地利用它们。 + +## 支持这些引擎 + +> SeaTunnel Zeta
+> Flink
+> Spark
+ +## 配置 + +要使用速度控制功能,你需要在job配置中设置`read_limit.rows_per_second` 或 `read_limit.bytes_per_second`参数。 + +配置文件中env配置示例: + +```hocon +env { + job.mode=STREAMING + job.name=SeaTunnel_Job + read_limit.bytes_per_second=7000000 + read_limit.rows_per_second=400 +} +source { + MySQL-CDC { + // ignore... + } +} +transform { +} +sink { + Console { + } +} +``` + +我们在`env`参数中放了`read_limit.bytes_per_second` 和 `read_limit.rows_per_second`来完成速度控制的配置。 +你可以同时配置这两个参数,或者只配置其中一个。每个`value`的值代表每个线程被限制的最大速率。 +因此,在配置各个值时,还需要同时考虑你任务的并行性。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sql-config.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sql-config.md new file mode 100644 index 000000000000..7defa0010b24 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/concept/sql-config.md @@ -0,0 +1,189 @@ +# SQL配置文件 + +## SQL配置文件结构 + +`SQL`配置文件类似下面这样: + +### SQL + +```sql +/* config +env { + parallelism = 1 + job.mode = "BATCH" +} +*/ + +CREATE TABLE source_table WITH ( + 'connector'='jdbc', + 'type'='source', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'query' = 'select * from source', + 'properties'= '{ + useSSL = false, + rewriteBatchedStatements = true + }' +); + +CREATE TABLE sink_table WITH ( + 'connector'='jdbc', + 'type'='sink', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'generate_sink_sql' = 'true', + 'database' = 'seatunnel', + 'table' = 'sink' +); + +INSERT INTO sink_table SELECT id, name, age, email FROM source_table; +``` + +## `SQL`配置文件说明 + +### 通用配置 + +```sql +/* config +env { + parallelism = 1 + job.mode = "BATCH" +} +*/ +``` + +在`SQL`文件中通过 `/* config */` 注释定义通用配置部分,内部可以使用`hocon`格式定义通用的配置,如`env`等。 + +### SOURCE SQL语法 + +```sql +CREATE TABLE source_table WITH ( + 'connector'='jdbc', + 'type'='source', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'query' = 'select * from source', + 'properties' = '{ + useSSL = false, + rewriteBatchedStatements = true + }' +); +``` + +* 使用 `CREATE TABLE ... WITH (...)` 语法可创建源端表映射, `TABLE`表名为源端映射的表名,`WITH`语法中为源端相关的配置参数 +* 在WITH语法中有两个固定参数:`connector` 和 `type`,分别表示连接器插件名(如:`jdbc`、`FakeSource`等)和源端类型(固定为:`source`) +* 其它参数名可以参考对应连接器插件的相关配置参数,但是格式需要改为`'key' = 'value',`的形式 +* 如果`'value'`为一个子配置,可以直接使用`hocon`格式的字符串,注意:如果使用`hocon`格式的子配置,内部的属性项之间必须用`,`分隔!如: + +```sql +'properties' = '{ + useSSL = false, + rewriteBatchedStatements = true +}' +``` + +* 如果在`'value'`中使用到`'`,需要用`''`进行转义,如: + +```sql +'query' = 'select * from source where name = ''Joy Ding''' +``` + +### SINK SQL语法 + +```sql +CREATE TABLE sink_table WITH ( + 'connector'='jdbc', + 'type'='sink', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'generate_sink_sql' = 'true', + 'database' = 'seatunnel', + 'table' = 'sink' +); +``` + +* 使用 `CREATE TABLE ... 
WITH (...)` 语法可创建目标端表映射, `TABLE`表名为目标端映射的表名,`WITH`语法中为目标端相关的配置参数 +* 在WITH语法中有两个固定参数:`connector` 和 `type`,分别表示连接器插件名(如:`jdbc`、`console`等)和目标端类型(固定为:`sink`) +* 其它参数名可以参考对应连接器插件的相关配置参数,但是格式需要改为`'key' = 'value',`的形式 + +### INSERT INTO SELECT语法 + +```sql +INSERT INTO sink_table SELECT id, name, age, email FROM source_table; +``` + +* `SELECT FROM` 部分为源端映射表的表名,`SELECT` 部分的语法参考:[SQL-transform](../transform-v2/sql.md) `query` 配置项 +* `INSERT INTO` 部分为目标端映射表的表名 +* 注意:该语法**不支持**在 `INSERT` 中指定字段,如:`INSERT INTO sink_table (id, name, age, email) SELECT id, name, age, email FROM source_table;` + +### INSERT INTO SELECT TABLE语法 + +```sql +INSERT INTO sink_table SELECT source_table; +``` + +* `SELECT` 部分直接使用源端映射表的表名,表示将源端表的所有数据插入到目标端表中 +* 使用该语法不会生成`trasform`的相关配置,这种语法一般用在多表同步的场景,示例: + +```sql +CREATE TABLE source_table WITH ( + 'connector'='jdbc', + 'type' = 'source', + 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'table_list' = '[ + { + table_path = "source.table1" + }, + { + table_path = "source.table2", + query = "select * from source.table2" + } + ]' +); + +CREATE TABLE sink_table WITH ( + 'connector'='jdbc', + 'type' = 'sink', + 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'generate_sink_sql' = 'true', + 'database' = 'sink' +); + +INSERT INTO sink_table SELECT source_table; +``` + +### CREATE TABLE AS语法 + +```sql +CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; +``` + +* 该语法可以将一个`SELECT`查询结果作为一个临时表,用于的`INSERT INTO`操作 +* `SELECT` 部分的语法参考:[SQL Transform](../transform-v2/sql.md) `query` 配置项 + +```sql +CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; + +INSERT INTO sink_table SELECT * FROM temp1; +``` + +## SQL配置文件任务提交示例 + +```bash +./bin/seatunnel.sh --config ./config/sample.sql +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md new file mode 100644 index 000000000000..e7b13aea86a7 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md @@ -0,0 +1,181 @@ +# 配置文件加密和解密 + +## 介绍 + +在大多数生产环境中,需要对敏感的配置项(如密码)进行加密,不能以明文形式存储。SeaTunnel 为此提供了一个方便的一站式解决方案。 + +## 如何使用 + +SeaTunnel 具备Base64编码和解码的功能,但不建议在生产环境中使用,SeaTunnel 建议用户根据自身需求,实现个性化的加密和解密逻辑。您可以参考本章节[如何实现用户自定义的加密和解密](#如何实现用户自定义的加密和解密)以获取更多相关细节。 + +Base64编码支持加密以下参数: +- username +- password +- auth + +接下来,将展示如何快速使用 SeaTunnel 自带的 `base64` 加密功能: + +1. 在配置文件的环境变量(env)部分新增了一个选项 `shade.identifier`。此选项用于表示您想要使用的加密方法。 +2. 在这个示例中,我们在配置文件中添加了 `shade.identifier = base64`,如下所示: + + ```hocon + # + # Licensed to the Apache Software Foundation (ASF) under one or more + # contributor license agreements. See the NOTICE file distributed with + # this work for additional information regarding copyright ownership. + # The ASF licenses this file to You under the Apache License, Version 2.0 + # (the "License"); you may not use this file except in compliance with + # the License. You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ # See the License for the specific language governing permissions and + # limitations under the License. + # + + env { + parallelism = 1 + shade.identifier = "base64" + } + + source { + MySQL-CDC { + result_table_name = "fake" + parallelism = 1 + server-id = 5656 + port = 56725 + hostname = "127.0.0.1" + username = "seatunnel" + password = "seatunnel_password" + database-name = "inventory_vwyw0n" + table-name = "products" + base-url = "jdbc:mysql://localhost:56725" + } + } + + transform { + } + + sink { + # 将数据输出到 Clickhouse。 + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "seatunnel" + password = "seatunnel_password" + + # cdc options + primary_key = "id" + support_upsert = true + } + } + ``` +3. 通过Shell脚本调用不同的计算引擎来对配置文件进行加密操作。在本示例中,我们使用 Zeta 引擎对配置文件进行加密。 + + ```shell + ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --encrypt + ``` + + 然后,您可以在终端中看到加密后的配置文件。 + + ```log + 2023-02-20 17:50:58,319 INFO org.apache.seatunnel.core.starter.command.ConfEncryptCommand - Encrypt config: + { + "env" : { + "parallelism" : 1, + "shade.identifier" : "base64" + }, + "source" : [ + { + "base-url" : "jdbc:mysql://localhost:56725", + "hostname" : "127.0.0.1", + "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", + "port" : 56725, + "database-name" : "inventory_vwyw0n", + "parallelism" : 1, + "result_table_name" : "fake", + "table-name" : "products", + "plugin_name" : "MySQL-CDC", + "server-id" : 5656, + "username" : "c2VhdHVubmVs" + } + ], + "transform" : [], + "sink" : [ + { + "database" : "default", + "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", + "support_upsert" : true, + "host" : "localhost:8123", + "plugin_name" : "Clickhouse", + "primary_key" : "id", + "table" : "fake_all", + "username" : "c2VhdHVubmVs" + } + ] + } + ``` +4. 当然,不仅支持加密配置文件,还支持对配置文件的解密。如果用户想要查看解密后的配置文件,可以执行以下命令: + + ```shell + ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --decrypt + ``` + +## 如何实现用户自定义的加密和解密 + +如果您希望自定义加密方法和加密配置,本章节将帮助您解决问题。 + +1. 创建一个 java maven 项目 + +2. 在 maven 依赖中添加 `seatunnel-api` 模块,如下所示: + + ```xml + + org.apache.seatunnel + seatunnel-api + ${seatunnel.version} + + ``` +3. 创建一个 java 类并实现 `ConfigShade` 接口,该接口包含以下方法: + + ```java + /** + * The interface that provides the ability to encrypt and decrypt {@link + * org.apache.seatunnel.shade.com.typesafe.config.Config} + */ + public interface ConfigShade { + + /** + * The unique identifier of the current interface, used it to select the correct {@link + * ConfigShade} + */ + String getIdentifier(); + + /** + * Encrypt the content + * + * @param content The content to encrypt + */ + String encrypt(String content); + + /** + * Decrypt the content + * + * @param content The content to decrypt + */ + String decrypt(String content); + + /** To expand the options that user want to encrypt */ + default String[] sensitiveOptions() { + return new String[0]; + } + } + ``` +4. 在 `resources/META-INF/services` 目录下添加 `org.apache.seatunnel.api.configuration.ConfigShade` +5. 将其打成 jar 包, 并添加到 `${SEATUNNEL_HOME}/lib` 目录下。 +6. 
将选项 `shade.identifier` 的值更改为上面定义在配置文件中的 `ConfigShade#getIdentifier` 的值。 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/avro.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/avro.md new file mode 100644 index 000000000000..7176f4e507fb --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/avro.md @@ -0,0 +1,111 @@ +# Avro 格式 + +Avro 在流式数据处理管道中非常流行。现在seatunnel在kafka连接器中支持Avro格式 + +# 怎样用 + +## Kafka 使用示例 + +- 模拟随机生成数据源,并以 Avro 的格式 写入 Kafka 的实例 + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + row.num = 90 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "test_avro_topic_fake_source" + format = avro + } +} +``` + +- 从 kafka 读取 avro 格式的数据并打印到控制台的示例 + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "test_avro_topic" + result_table_name = "kafka_table" + start_mode = "earliest" + format = avro + format_error_handle_way = skip + schema = { + fields { + id = bigint + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + Console { + source_table_name = "kafka_table" + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/canal-json.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/canal-json.md new file mode 100644 index 000000000000..92c4338eb564 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/canal-json.md @@ -0,0 +1,115 @@ +# Canal 格式 + +变更数据捕获格式: +序列化模式、反序列化模式 + +Canal是一款CDC(变更数据捕获)工具,能够实时捕获MySQL的数据变化并将其流式传输到其他系统中。Canal为变更日志提供了一种统一的格式,并支持使用 JSON 和 protobuf(Canal默认使用protobuf)进行消息的序列化 + +SeaTunnel 能够解析 Canal 的 JSON 消息,并将其转化为 INSERT/UPDATE/DELETE 消息,进而输入到 SeaTunnel 系统中。这个特性在很多场景下都显得非常有用,例如: + + 将增量数据从数据库同步到其他系统 + 审计日志 + 数据库的实时物化视图 + 关联维度数据库的变更历史,等等。 + +SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息编码为 Canal JSON 消息,并将其发送到类似 Kafka 这样的存储中。然而,目前 SeaTunnel 无法将 UPDATE_BEFORE 和 UPDATE_AFTER 合并为一个单一的UPDATE消息。因此,SeaTunnel将 UPDATE_BEFORE 和 UPDATE_AFTER 编码为 Canal的 DELETE 和 INSERT 消息来进行 + +# 格式选项 + +| 选项 | 默认值 | 是否需要 | 描述 | +|--------------------------------|--------|------|------------------------------------------------------------------------------------| +| format | (none) | 是 | 指定要使用的格式,这里应该是 `canal_json` | +| canal_json.ignore-parse-errors | false | 否 | 跳过解析错误的字段和行,而不是失败。出现错误的字段将被设置为null | +| canal_json.database.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Canal 
记录中的`database`元字段来仅读取特定数据库变更日志行。此字符串Pattern模式与Java的Pattern兼容 | +| canal_json.table.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Canal 记录中的`table`元字段来仅读取特定数据库变更日志行。此字符串Pattern模式与Java的Pattern兼容 | + +# 如何使用 + +## Kafka 使用示例 + +Canal为变更日志提供了一种统一的格式,以下是一个从MySQL products 表捕获的变更操作的简单示例 + +```bash +{ + "data": [ + { + "id": "111", + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": "5.18" + } + ], + "database": "inventory", + "es": 1589373560000, + "id": 9, + "isDdl": false, + "mysqlType": { + "id": "INTEGER", + "name": "VARCHAR(255)", + "description": "VARCHAR(512)", + "weight": "FLOAT" + }, + "old": [ + { + "weight": "5.15" + } + ], + "pkNames": [ + "id" + ], + "sql": "", + "sqlType": { + "id": 4, + "name": 12, + "description": 12, + "weight": 7 + }, + "table": "products", + "ts": 1589373560798, + "type": "UPDATE" +} +``` + +注:请参考 [Canal 文档](https://github.com/alibaba/canal/wiki) 以了解每个字段的含义 + +MySQL 的 products 表有 4 列(id、name、description 和 weight) +上述 JSON 消息是产品表的一个更新变更事件,其中 id = 111 的行的 weight 值从 5.15 变为 5.18 +假设此表的 binlog 的消息已经同步到 Kafka topic,那么我们可以使用下面的 SeaTunnel 示例来消费这个主题并体现变更事件 + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "products_binlog" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = canal_json + } + +} + +transform { +} + +sink { + Kafka { + bootstrap.servers = "localhost:9092" + topic = "consume-binlog" + format = canal_json + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md new file mode 100644 index 000000000000..e34a5b39a223 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md @@ -0,0 +1,55 @@ +# CDC 兼容 Debezium-json + +SeaTunnel 支持将 cdc 记录解析为 Debezium-JSON 消息,并发布到 MQ (kafka) 等消息系统中 + +这个特性在很多场景下都非常实用,例如,它可以实现与 Debezium 生态系统的兼容性 + +# 如何使用 + +## MySQL-CDC 流入 Kafka + +```bash +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 15000 +} + +source { + MySQL-CDC { + result_table_name = "table1" + + base-url="jdbc:mysql://localhost:3306/test" + "startup.mode"=INITIAL + table-names=[ + "database1.t1", + "database1.t2", + "database2.t1" + ] + + # compatible_debezium_json options + format = compatible_debezium_json + debezium = { + # include schema into kafka message + key.converter.schemas.enable = false + value.converter.schemas.enable = false + # include ddl + include.schema.changes = true + # topic prefix + database.server.name = "mysql_cdc_1" + } + } +} + +sink { + Kafka { + source_table_name = "table1" + + bootstrap.servers = "localhost:9092" + + # compatible_debezium_json options + format = compatible_debezium_json + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/debezium-json.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/debezium-json.md new file mode 100644 index 000000000000..3e70a5d31ed6 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/debezium-json.md @@ -0,0 +1,115 @@ +# Debezium 格式 + +变更数据捕获格式: +序列化模式、反序列化模式 + +Debezium 是一套分布式服务,用于捕获数据库中的变化,以便您的应用程序可以看到这些变化并对其做出响应。Debezium 
在变更事件流中记录每个数据库表中的所有行级变化,应用程序只需读取这些流,就可以按照它们发生的顺序看到变更事件 + +SeaTunnel 支持将 Debezium JSON 消息解析为 INSERT/UPDATE/DELETE 消息并导入到 seatunnel 系统中。在许多情况下,利用这个特性是非常有用的,例如: + + 将增量数据从数据库同步到其他系统 + 审计日志 + 数据库的实时物化视图 + 关联维度数据库的变更历史,等等。 + +SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息解析为 Debezium JSON 消息,并将其发送到类似 Kafka 这样的存储中 + +# 格式选项 + +| 选项 | 默认值 | 是否需要 | 描述 | +|-----------------------------------|--------|------|--------------------------------------| +| format | (none) | 是 | 指定要使用的格式,这里应该是 'debezium_json'. | +| debezium-json.ignore-parse-errors | false | 否 | 跳过有解析错误的字段和行而不是失败。如果出现错误,字段将设置为 null | + +# 如何使用 + +## Kafka 使用示例 + +Debezium 提供了一个统一的变更日志格式,下面是一个 MySQL products 表捕获的变更操作的简单示例 + +```bash +{ + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter ", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter ", + "weight": 5.17 + }, + "source": { + "version": "1.1.1.Final", + "connector": "mysql", + "name": "dbserver1", + "ts_ms": 1589362330000, + "snapshot": "false", + "db": "inventory", + "table": "products", + "server_id": 223344, + "gtid": null, + "file": "mysql-bin.000003", + "pos": 2090, + "row": 0, + "thread": 2, + "query": null + }, + "op": "u", + "ts_ms": 1589362330904, + "transaction": null +} +``` + +注:请参考 [Debezium 文档](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#data-change-events) 以了解每个字段的含义 + +MySQL 的 products 表有 4 列(id、name、description 和 weight) +上述 JSON 消息是产品表的一个更新变更事件,其中 id = 111 的行的 weight 值从 5.18 变为 5.17 +假设消息已经同步到 Kafka 主题 products_binlog,那么我们可以使用以下的 SeaTunnel 配置来消费这个主题并通过 Debezium 格式解释变更事件。 + +在此配置中,您必须指定 `schema` 和 `debezium_record_include_schema` 选项: +- `schema` 应与您的表格式相同 +- 如果您的 json 数据包含 `schema` 字段,`debezium_record_include_schema` 应为 true,如果您的 json 数据不包含 `schema` 字段,`debezium_record_include_schema` 应为 false +- `{"schema" : {}, "payload": { "before" : {}, "after": {} ... } }` --> `true` +- `{"before" : {}, "after": {} ... 
}` --> `false`" + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "products_binlog" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + } + debezium_record_include_schema = false + format = debezium_json + } + +} + +transform { +} + +sink { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "consume-binlog" + format = debezium_json + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md new file mode 100644 index 000000000000..d0ceb58ac6c9 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md @@ -0,0 +1,47 @@ +# Kafka source 兼容 kafka-connect-json + +Seatunnel 的 Kafka 连接器支持解析通过 Kafka Connect Source 抽取的数据,特别是从 Kafka Connect JDBC 和 Kafka Connect Debezium 抽取的数据 + +# 如何使用 + +## Kafka 流入 Mysql + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "localhost:9092" + topic = "jdbc_source_record" + result_table_name = "kafka_table" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = COMPATIBLE_KAFKA_CONNECT_JSON + } +} + + +sink { + Jdbc { + driver = com.mysql.cj.jdbc.Driver + url = "jdbc:mysql://localhost:3306/seatunnel" + user = st_user + password = seatunnel + generate_sink_sql = true + database = seatunnel + table = jdbc_sink + primary_keys = ["id"] + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/ogg-json.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/ogg-json.md new file mode 100644 index 000000000000..7b64f5b5e41a --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/formats/ogg-json.md @@ -0,0 +1,93 @@ +# Ogg 格式 + +[Oracle GoldenGate](https://www.oracle.com/integration/goldengate/) (a.k.a ogg) 是一项托管服务,提供实时数据网格平台,该平台使用复制来保持数据高度可用,并支持实时分析。客户可以设计、执行和监控其数据复制和流数据处理解决方案,而无需分配或管理计算环境。 Ogg 为变更日志提供了统一的格式结构,并支持使用 JSON 序列化消息。 + +SeaTunnel 支持将 Ogg JSON 消息解释为 Seatunnel 系统中的 INSERT/UPDATE/DELETE 消息。在许多情况下,这个特性带来了很多便利,例如 + + 将增量数据从数据库同步到其他系统 + 审计日志 + 数据库的实时物化视图 + 关联维度数据库的变更历史,等等。 + +SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息转化为 Ogg JSON 消息,并将其发送到类似 Kafka 这样的存储中。然而,目前 SeaTunnel 无法将 UPDATE_BEFORE 和 UPDATE_AFTER 组合成单个 UPDATE 消息。因此,Seatunnel 将 UPDATE_BEFORE 和 UPDATE_AFTER 转化为 DELETE 和 INSERT Ogg 消息来实现 + +# 格式选项 + +| 选项 | 默认值 | 是否需要 | 描述 | +|------------------------------|--------|------|------------------------------------------------------------------------------------| +| format | (none) | 是 | 指定要使用的格式,这里应该是`-json` | +| ogg_json.ignore-parse-errors | false | 否 | 跳过有解析错误的字段和行而不是失败。如果出现错误,字段将设置为 null | +| ogg_json.database.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Canal 记录中的`database`元字段来仅读取特定数据库变更日志行。此字符串Pattern模式与Java的Pattern兼容 | +| ogg_json.table.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Canal 记录中的 `table` 元字段来仅读取特定表的更改日志行。此字符串Pattern模式与Java的Pattern兼容 | + +# 如何使用 Ogg 格式 + +## Kafka 使用示例 + +Ogg 为变更日志提供了统一的格式,下面是从 Oracle PRODUCTS 表捕获变更操作的简单示例: + +```bash +{ + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.18 
+ }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.15 + }, + "op_type": "U", + "op_ts": "2020-05-13 15:40:06.000000", + "current_ts": "2020-05-13 15:40:07.000000", + "primary_keys": [ + "id" + ], + "pos": "00000000000000000000143", + "table": "PRODUCTS" +} +``` + +注:各字段含义请参考 [Debezium 文档](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#data-change-events) + +此 Oracle PRODUCTS 表有 4 列 (id, name, description 和 weight) +上面的 JSON 消息是 products 表上的更新更改事件,其中 id = 111 的行的字段 `weight` 的值从 5.18 更改为 5.15。 +假设此表的 binlog 的消息已经同步到 Kafka topic,那么我们可以使用下面的 SeaTunnel 示例来消费这个 topic 并体现变更事件。 + +```bash +env { + parallelism = 1 + job.mode = "STREAMING" +} +source { + Kafka { + bootstrap.servers = "127.0.0.1:9092" + topic = "ogg" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "double" + } + }, + format = ogg_json + } +} +sink { + jdbc { + url = "jdbc:mysql://127.0.0.1/test" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "12345678" + table = "ogg" + primary_keys = ["id"] + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink.md new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Clickhouse.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Clickhouse.md new file mode 100644 index 000000000000..2b5e23d55688 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Clickhouse.md @@ -0,0 +1,179 @@ +# Clickhouse + +> Clickhouse 数据连接器 + +## 支持引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 核心特性 + +- [ ] [精准一次](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Clickhouse sink 插件通过实现幂等写入可以达到精准一次,需要配合 aggregating merge tree 支持重复数据删除的引擎。 + +## 描述 + +用于将数据写入 Clickhouse。 + +## 支持的数据源信息 + +为了使用 Clickhouse 连接器,需要以下依赖项。它们可以通过 install-plugin.sh 或从 Maven 中央存储库下载。 + +| 数据源 | 支持的版本 | 依赖 | +|------------|-----------|------------------------------------------------------------------------------------------------------------| +| Clickhouse | universal | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) | + +## 数据类型映射 + +| SeaTunnel 数据类型 | Clickhouse 数据类型 | +|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| STRING | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | +| INT | Int8 / UInt8 / Int16 / UInt16 / Int32 | +| BIGINT | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | +| DOUBLE | Float64 | +| DECIMAL | Decimal | +| FLOAT | Float32 | +| DATE | Date | +| TIME | DateTime | +| ARRAY | Array | +| MAP | Map | + +## 输出选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|---------------------------------------|---------|------|-------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | Yes | - | `ClickHouse` 集群地址, 格式是`host:port` , 允许多个`hosts`配置. 例如 `"host1:8123,host2:8123"`. | +| database | String | Yes | - | `ClickHouse` 数据库名称. | +| table | String | Yes | - | 表名称. | +| username | String | Yes | - | `ClickHouse` 用户账号. | +| password | String | Yes | - | `ClickHouse` 用户密码. | +| clickhouse.config | Map | No | | 除了上述必须由 `clickhouse-jdbc` 指定的必填参数外,用户还可以指定多个可选参数,这些参数涵盖了 `clickhouse-jdbc` 提供的所有[参数](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration). | +| bulk_size | String | No | 20000 | 每次通过[Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) 写入的行数,即默认是20000. | +| split_mode | String | No | false | 此模式仅支持引擎为`Distributed`的 `clickhouse` 表。选项 `internal_replication` 应该是 `true` 。他们将在 seatunnel 中拆分分布式表数据,并直接对每个分片进行写入。分片权重定义为 `clickhouse` 将计算在内。 | +| sharding_key | String | No | - | 使用 `split_mode` 时,将数据发送到哪个节点是个问题,默认为随机选择,但可以使用`sharding_key`参数来指定分片算法的字段。此选项仅在`split_mode`为 `true` 时有效. | +| primary_key | String | No | - | 标记`clickhouse`表中的主键列,并根据主键执行INSERT/UPDATE/DELETE到`clickhouse`表. | +| support_upsert | Boolean | No | false | 支持按查询主键更新插入行. | +| allow_experimental_lightweight_delete | Boolean | No | false | 允许基于`MergeTree`表引擎实验性轻量级删除. | +| common-options | | No | - | Sink插件查用参数,详见[Sink常用选项](common-options.md). | + +## 如何创建一个clickhouse 同步任务 + +以下示例演示如何创建将随机生成的数据写入Clickhouse数据库的数据同步作业。 + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 1000 +} + +source { + FakeSource { + row.num = 2 + bigint.min = 0 + bigint.max = 10000000 + split.num = 1 + split.read-interval = 300 + schema { + fields { + c_bigint = bigint + } + } + } +} + +sink { + Clickhouse { + host = "127.0.0.1:9092" + database = "default" + table = "test" + username = "xxxxx" + password = "xxxxx" + } +} +``` + +### 小提示 + +> 1.[SeaTunnel 部署文档](../../start-v2/locally/deployment.md).
+> 2.需要在同步前创建要写入的表,建表语句可参考下文的示例.
+> 3.写入 ClickHouse 表时,无需设置其结构,因为连接器会在写入前向 ClickHouse 查询当前表的结构信息.
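例如,针对上文示例作业中写入的 `default.test` 表,可以参考如下建表语句(仅为示意,实际的表引擎、字段类型和排序键请按业务需求调整):

```sql
-- 与示例作业中的 c_bigint 字段对应,bigint 映射为 ClickHouse 的 Int64
CREATE TABLE IF NOT EXISTS default.test
(
    c_bigint Int64
)
ENGINE = MergeTree()
ORDER BY c_bigint;
```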
+ +## Clickhouse 接收器配置 + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + clickhouse.config = { + max_rows_to_read = "100" + read_overflow_mode = "throw" + } + } +} +``` + +## 切分模式 + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + + # split mode options + split_mode = true + sharding_key = "age" + } +} +``` + +## CDC(Change data capture) Sink + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + + # cdc options + primary_key = "id" + support_upsert = true + } +} +``` + +## CDC(Change data capture) for *MergeTree engine + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + + # cdc options + primary_key = "id" + support_upsert = true + allow_experimental_lightweight_delete = true + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md new file mode 100644 index 000000000000..b36a2982f53f --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md @@ -0,0 +1,138 @@ +# ClickhouseFile + +> Clickhouse文件数据接收器 + +## 描述 + +该接收器使用clickhouse-local程序生成clickhouse数据文件,随后将其发送至clickhouse服务器,这个过程也称为bulkload。该接收器仅支持表引擎为 'Distributed'的表,且`internal_replication`选项需要设置为`true`。支持批和流两种模式。 + +## 主要特性 + +- [ ] [精准一次](../../concept/connector-v2-features.md) + +:::小提示 + +你也可以采用JDBC的方式将数据写入Clickhouse。 + +::: + +## 接收器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|------------------------|---------|------|----------------------------------------| +| host | string | yes | - | +| database | string | yes | - | +| table | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| clickhouse_local_path | string | yes | - | +| sharding_key | string | no | - | +| copy_method | string | no | scp | +| node_free_password | boolean | no | false | +| node_pass | list | no | - | +| node_pass.node_address | string | no | - | +| node_pass.username | string | no | "root" | +| node_pass.password | string | no | - | +| compatible_mode | boolean | no | false | +| file_fields_delimiter | string | no | "\t" | +| file_temp_path | string | no | "/tmp/seatunnel/clickhouse-local/file" | +| common-options | | no | - | + +### host [string] + +`ClickHouse`集群地址,格式为`host:port`,允许同时指定多个`hosts`。例如`"host1:8123,host2:8123"`。 + +### database [string] + +`ClickHouse`数据库名。 + +### table [string] + +表名称。 + +### username [string] + +连接`ClickHouse`的用户名。 + +### password [string] + +连接`ClickHouse`的用户密码。 + +### sharding_key [string] + +当ClickhouseFile需要拆分数据时,需要考虑的问题是当前数据需要发往哪个节点,默认情况下采用的是随机算法,我们也可以使用'sharding_key'参数为某字段指定对应的分片算法。 + +### clickhouse_local_path [string] + +在spark节点上的clickhouse-local程序路径。由于每个任务都会被调用,所以每个spark节点上的clickhouse-local程序路径必须相同。 + +### copy_method [string] + +为文件传输指定方法,默认为scp,可选值为scp和rsync。 + +### node_free_password [boolean] + +由于seatunnel需要使用scp或者rsync进行文件传输,因此seatunnel需要clickhouse服务端访问权限。如果每个spark节点与clickhouse服务端都配置了免密登录,则可以将此选项配置为true,否则需要在node_pass参数中配置对应节点的密码。 + +### node_pass [list] + +用来保存所有clickhouse服务器地址及其对应的访问密码。 + +### node_pass.node_address [string] + +clickhouse服务器节点地址。 + +### node_pass.username [string] + 
+clickhouse服务器节点用户名,默认为root。 + +### node_pass.password [string] + +clickhouse服务器节点的访问密码。 + +### compatible_mode [boolean] + +在低版本的Clickhouse中,clickhouse-local程序不支持`--path`参数,需要设置该参数来采用其他方式实现`--path`参数功能。 + +### file_fields_delimiter [string] + +ClickHouseFile使用CSV格式来临时保存数据。但如果数据中包含CSV的分隔符,可能会导致程序异常。使用此配置可以避免该情况。配置的值必须正好为一个字符的长度。 + +### file_temp_path [string] + +ClickhouseFile本地存储临时文件的目录。 + +### common options + +Sink插件常用参数,请参考[Sink常用选项](common-options.md)获取更多细节信息。 + +## 示例 + +```hocon +ClickhouseFile { + host = "192.168.0.1:8123" + database = "default" + table = "fake_all" + username = "default" + password = "" + clickhouse_local_path = "/Users/seatunnel/Tool/clickhouse local" + sharding_key = "age" + node_free_password = false + node_pass = [{ + node_address = "192.168.0.1" + password = "seatunnel" + }] +} +``` + +## 变更日志 + +### 2.2.0-beta 2022-09-26 + +- 支持将数据写入ClickHouse文件并迁移到ClickHouse数据目录 + +### 随后版本 + +- [BugFix] 修复生成的数据部分名称冲突BUG并改进文件提交逻辑 [3416](https://github.com/apache/seatunnel/pull/3416) +- [Feature] 支持compatible_mode来兼容低版本的Clickhouse [3416](https://github.com/apache/seatunnel/pull/3416) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Console.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Console.md new file mode 100644 index 000000000000..43dff335132d --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Console.md @@ -0,0 +1,124 @@ +# Console + +> Console 数据接收器 + +## 支持连接器版本 + +- 所有版本 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 描述 + +接收Source端传入的数据并打印到控制台。支持批同步和流同步两种模式。 + +> 例如,来自上游的数据为 [`age: 12, name: jared`] ,则发送到控制台的内容为: `{"name":"jared","age":17}` + +## 主要特性 + +- [ ] [精确一次](../../concept/connector-v2-features.md) + +## 接收器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|--------------------|---------|------|-----|---------------------------------------------------| +| common-options | | 否 | - | Sink插件常用参数,请参考 [Sink常用选项](common-options.md) 了解详情 | +| log.print.data | boolean | 否 | - | 确定是否应在日志中打印数据的标志。默认值为`true` | +| log.print.delay.ms | int | 否 | - | 将每个数据项打印到日志之间的延迟(以毫秒为单位)。默认值为`0` | + +## 任务示例 + +### 简单示例: + +> 随机生成的数据,包含两个字段,即 `name`(字符串类型)和 `age`(整型),写入控制台,并行度为 `1` + +``` +env { + parallelism = 1 + job.mode = "STREAMING" +} + +source { + FakeSource { + result_table_name = "fake" + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Console { + source_table_name = "fake" + } +} +``` + +### 多数据源示例: + +> 多数据源示例,通过配置可以指定数据源写入指定接收器 + +``` +env { + parallelism = 1 + job.mode = "STREAMING" +} + +source { + FakeSource { + result_table_name = "fake1" + schema = { + fields { + id = "int" + name = "string" + age = "int" + sex = "string" + } + } + } + FakeSource { + result_table_name = "fake2" + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Console { + source_table_name = "fake1" + } + Console { + source_table_name = "fake2" + } +} +``` + +## 控制台示例数据 + +控制台打印的输出: + +``` +2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age +2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 +``` 
+ diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/DingTalk.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/DingTalk.md new file mode 100644 index 000000000000..8f584fc7ad78 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/DingTalk.md @@ -0,0 +1,55 @@ +# 钉钉 + +> 钉钉 数据接收器 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [ ] [精确一次](../../concept/connector-v2-features.md) + +## 描述 + +一个使用钉钉机器人发送消息的Sink插件。 + +## Options + +| 名称 | 类型 | 是否必须 | 默认值 | +|----------------|--------|------|-----| +| url | String | 是 | - | +| secret | String | 是 | - | +| common-options | | 否 | - | + +### url [String] + +钉钉机器人地址格式为 https://oapi.dingtalk.com/robot/send?access_token=XXXXXX(String) + +### secret [String] + +钉钉机器人的密钥 (String) + +### common options + +Sink插件的通用参数,请参考 [Sink Common Options](common-options.md) 了解详情 + +## 任务示例 + +```hocon +sink { + DingTalk { + url="https://oapi.dingtalk.com/robot/send?access_token=ec646cccd028d978a7156ceeac5b625ebd94f586ea0743fa501c100007890" + secret="SEC093249eef7aa57d4388aa635f678930c63db3d28b2829d5b2903fc1e5c10000" + } +} +``` + +## 更新日志 + +### 2.2.0-beta 2022-09-26 + +- 添加钉钉接收器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Doris.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Doris.md new file mode 100644 index 000000000000..afc470326f56 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Doris.md @@ -0,0 +1,340 @@ +# Doris + +> Doris sink 连接器 + +## 支持的doris版本 + +- exactly-once & cdc 支持 `Doris version is >= 1.1.x` +- 支持数组数据类型 `Doris version is >= 1.2.x` +- 将支持Map数据类型 `Doris version is 2.x` + +## 引擎支持 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [x] [精确一次](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## 描述 + +用于发送数据到doris. 同时支持流模式和批模式处理. +Doris Sink连接器的内部实现是通过stream load批量缓存和导入的。 + +## Sink 选项 + +| Name | Type | Required | Default | Description | +|--------------------------------|---------|----------|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------| +| fenodes | String | Yes | - | `Doris` 集群 fenodes 地址, 格式是 `"fe_ip:fe_http_port, ..."` | +| query-port | int | No | 9030 | `Doris` Fenodes mysql协议查询端口 | +| username | String | Yes | - | `Doris` 用户名 | +| password | String | Yes | - | `Doris` 密码 | +| database | String | Yes | - | `Doris`数据库名称 , 使用 `${database_name}` 表示上游数据库名称。 | +| table | String | Yes | - | `Doris` 表名, 使用 `${table_name}` 表示上游表名。 | +| table.identifier | String | Yes | - | `Doris` 表的名称,2.3.5 版本后将弃用,请使用 `database` 和 `table` 代替。 | +| sink.label-prefix | String | Yes | - | stream load导入使用的标签前缀。 在2pc场景下,需要全局唯一性来保证SeaTunnel的EOS语义。 | +| sink.enable-2pc | bool | No | false | 是否启用两阶段提交(2pc),默认为 false。 对于两阶段提交,请参考[此处](https://doris.apache.org/docs/dev/sql-manual/sql-statements/Data-Manipulation-Statements/Load/STREAM-LOAD/)。 | +| sink.enable-delete | bool | No | - | 是否启用删除。 该选项需要Doris表开启批量删除功能(0.15+版本默认开启),且仅支持Unique模型。 您可以在此[link](https://doris.apache.org/docs/dev/data-operate/delete/batch-delete-manual/)获得更多详细信息 | +| sink.check-interval | int | No | 10000 | 加载过程中检查异常时间间隔。 | +| sink.max-retries | int | No | 3 | 向数据库写入记录失败时的最大重试次数。 | +| sink.buffer-size | int | No | 256 * 1024 | 用于缓存stream load数据的缓冲区大小。 | +| sink.buffer-count | int | No | 3 | 用于缓存stream load数据的缓冲区计数。 | +| doris.batch.size | int | No | 1024 | 每次http请求写入doris的批量大小,当row达到该大小或者执行checkpoint时,缓存的数据就会写入服务器。 | +| needs_unsupported_type_casting | boolean | No | false | 是否启用不支持的类型转换,例如 Decimal64 到 Double。 | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | schema保存模式,请参考下面的`schema_save_mode` | +| data_save_mode | Enum | no | APPEND_DATA | 数据保存模式,请参考下面的`data_save_mode`。 | +| save_mode_create_template | string | no | see below | 见下文。 | +| custom_sql | String | no | - | 当data_save_mode选择CUSTOM_PROCESSING时,需要填写CUSTOM_SQL参数。 该参数通常填写一条可以执行的SQL。 SQL将在同步任务之前执行。 | +| doris.config | map | yes | - | 该选项用于支持自动生成sql时的insert、delete、update等操作,以及支持的格式。 | + +### schema_save_mode[Enum] + +在开启同步任务之前,针对现有的表结构选择不同的处理方案。 +选项介绍: +`RECREATE_SCHEMA` :表不存在时创建,表保存时删除并重建。 +`CREATE_SCHEMA_WHEN_NOT_EXIST` :表不存在时会创建,表存在时跳过。 +`ERROR_WHEN_SCHEMA_NOT_EXIST` :表不存在时会报错。 + +### data_save_mode[Enum] + +在开启同步任务之前,针对目标端已有的数据选择不同的处理方案。 +选项介绍: +`DROP_DATA`: 保留数据库结构并删除数据。 +`APPEND_DATA`:保留数据库结构,保留数据。 +`CUSTOM_PROCESSING`:用户自定义处理。 +`ERROR_WHEN_DATA_EXISTS`:有数据时报错。 + +### save_mode_create_template + +使用模板自动创建Doris表, +会根据上游数据类型和schema类型创建相应的建表语句, +默认模板可以根据情况进行修改。 + +默认模板: + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` ( +${rowtype_primary_key}, +${rowtype_fields} +) ENGINE=OLAP + UNIQUE KEY (${rowtype_primary_key}) +DISTRIBUTED BY HASH (${rowtype_primary_key}) + PROPERTIES ( +"replication_allocation" = "tag.location.default: 1", +"in_memory" = "false", +"storage_format" = "V2", +"disable_auto_compaction" = "false" +) +``` + +如果模板中填写了自定义字段,例如添加 id 字段 + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` +( + id, + ${rowtype_fields} +) ENGINE = OLAP UNIQUE KEY (${rowtype_primary_key}) + DISTRIBUTED BY HASH (${rowtype_primary_key}) + PROPERTIES +( + "replication_num" = "1" 
+); +``` + +连接器会自动从上游获取对应类型完成填充, +并从“rowtype_fields”中删除 id 字段。 该方法可用于自定义字段类型和属性的修改。 + +可以使用以下占位符: + +- database:用于获取上游schema中的数据库。 +- table_name:用于获取上游schema中的表名。 +- rowtype_fields:用于获取上游schema中的所有字段,自动映射到Doris的字段描述。 +- rowtype_primary_key:用于获取上游模式中的主键(可能是列表) +- rowtype_unique_key:用于获取上游模式中的唯一键(可能是列表)。 + +## 数据类型映射 + +| Doris 数据类型 | SeaTunnel 数据类型 | +|----------------|-----------------------------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT
TINYINT | +| INT | INT
SMALLINT
TINYINT | +| BIGINT | BIGINT
INT
SMALLINT
TINYINT | +| LARGEINT | BIGINT
INT
SMALLINT
TINYINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE
FLOAT | +| DECIMAL | DECIMAL
DOUBLE
FLOAT | +| DATE | DATE | +| DATETIME | TIMESTAMP | +| CHAR | STRING | +| VARCHAR | STRING | +| STRING | STRING | +| ARRAY | ARRAY | +| MAP | MAP | +| JSON | STRING | +| HLL | 尚不支持 | +| BITMAP | 尚不支持 | +| QUANTILE_STATE | 尚不支持 | +| STRUCT | 尚不支持 | + +#### 支持的导入数据格式 + +支持的格式包括 CSV 和 JSON。 + +## 任务示例 + +### 简单示例: + +> 下面的例子描述了向Doris写入多种数据类型,用户需要在下游创建对应的表。 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} +``` + +### CDC(监听数据变更捕获)事件: + +> 本示例定义了一个SeaTunnel同步任务,通过FakeSource自动生成数据并发送给Doris Sink,FakeSource使用schema、score(int类型)模拟CDC数据,Doris需要创建一个名为test.e2e_table_sink的sink任务及其对应的表 。 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + score = int + sex = boolean + number = tinyint + height = float + sight = double + create_time = date + update_time = timestamp + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [3, "C", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_BEFORE + fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_AFTER + fields = [1, "A_1", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = DELETE + fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + } + ] + } +} + +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} + +``` + +### 使用JSON格式导入数据 + +``` +sink { + Doris { + fenodes = "e2e_dorisdb:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.enable-2pc = "true" + sink.label-prefix = "test_json" + doris.config = { + format="json" + read_json_by_line="true" + } + } +} + +``` + +### 使用CSV格式导入数据 + +``` +sink { + Doris { + fenodes = "e2e_dorisdb:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.enable-2pc = "true" + sink.label-prefix = "test_csv" + doris.config = { + format = "csv" + column_separator = "," + } + } +} +``` + +## 变更日志 + +### 2.3.0-beta 2022-10-20 + +- 添加 Doris sink连接器 + +### Next version + +- [Improve] Change Doris Config Prefix [3856](https://github.com/apache/seatunnel/pull/3856) + +- [Improve] Refactor some Doris Sink code as well as support 2pc and cdc 
[4235](https://github.com/apache/seatunnel/pull/4235) + +:::tip + +PR 4235 is an incompatible modification to PR 3856. Please refer to PR 4235 to use the new Doris connector + +::: diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Elasticsearch.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Elasticsearch.md new file mode 100644 index 000000000000..edf974d8fbad --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Elasticsearch.md @@ -0,0 +1,218 @@ +# Elasticsearch + +## 描述 + +输出数据到 `Elasticsearch` + +## 主要特性 + +- [ ] [精确一次](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +:::tip + +引擎支持 + +* 支持 `ElasticSearch 版本 >= 2.x 并且 <= 8.x` + +::: + +## 选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|-------------------------|---------|------|------------------------------| +| hosts | array | 是 | - | +| index | string | 是 | - | +| schema_save_mode | string | 是 | CREATE_SCHEMA_WHEN_NOT_EXIST | +| data_save_mode | string | 是 | APPEND_DATA | +| index_type | string | 否 | | +| primary_keys | list | 否 | | +| key_delimiter | string | 否 | `_` | +| username | string | 否 | | +| password | string | 否 | | +| max_retry_count | int | 否 | 3 | +| max_batch_size | int | 否 | 10 | +| tls_verify_certificate | boolean | 否 | true | +| tls_verify_hostnames | boolean | 否 | true | +| tls_keystore_path | string | 否 | - | +| tls_keystore_password | string | 否 | - | +| tls_truststore_path | string | 否 | - | +| tls_truststore_password | string | 否 | - | +| common-options | | 否 | - | + +### hosts [array] + +`Elasticsearch` 集群http地址,格式为 `host:port` ,允许指定多个主机。例如 `["host1:9200", "host2:9200"]` + +### index [string] + +`Elasticsearch` 的 `index` 名称。索引支持包含字段名变量,例如 `seatunnel_${age}`,并且该字段必须出现在 seatunnel Row 中。如果没有,我们将把它视为普通索引 + +### index_type [string] + +`Elasticsearch` 索引类型,elasticsearch 6及以上版本建议不要指定 + +### primary_keys [list] + +主键字段用于生成文档 `_id` ,这是 CDC 必需的选项。 + +### key_delimiter [string] + +设定复合键的分隔符(默认为 `_`),例如,如果使用 `$` 作为分隔符,那么文档的 `_id` 将呈现为 `KEY1$KEY2$KEY3` 的格式 + +### username [string] + +x-pack 用户名 + +### password [string] + +x-pack 密码 + +### max_retry_count [int] + +批次批量请求最大尝试大小 + +### max_batch_size [int] + +批次批量文档最大大小 + +### tls_verify_certificate [boolean] + +为 HTTPS 端点启用证书验证 + +### tls_verify_hostname [boolean] + +为 HTTPS 端点启用主机名验证 + +### tls_keystore_path [string] + +指向 PEM 或 JKS 密钥存储的路径。运行 SeaTunnel 的操作系统用户必须能够读取此文件 + +### tls_keystore_password [string] + +指定的密钥存储的密钥密码 + +### tls_truststore_path [string] + +指向 PEM 或 JKS 信任存储的路径。运行 SeaTunnel 的操作系统用户必须能够读取此文件 + +### tls_truststore_password [string] + +指定的信任存储的密钥密码 + +### common options + +Sink插件常用参数,请参考 [Sink常用选项](common-options.md) 了解详情 + +### schema_save_mode + +在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案
+选项介绍:
+`RECREATE_SCHEMA`:当表不存在时会创建,当表已存在时会删除并重建<br/>
+`CREATE_SCHEMA_WHEN_NOT_EXIST`:当表不存在时会创建,当表已存在时则跳过创建<br/>
+`ERROR_WHEN_SCHEMA_NOT_EXIST`:当表不存在时将抛出错误<br/>
+ +### data_save_mode + +在启动同步任务之前,针对目标侧已存在的数据选择不同的处理方案
+选项介绍:
+`DROP_DATA`:保留数据库结构,删除数据<br/>
+`APPEND_DATA`:保留数据库结构,保留数据
+`ERROR_WHEN_DATA_EXISTS`:当有数据时抛出错误
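+下面是一个示意配置(`hosts`、`index` 等取值均为假设值),演示在保留索引结构的前提下先清空已有数据再写入,即 `CREATE_SCHEMA_WHEN_NOT_EXIST` 搭配 `DROP_DATA`:
+
+```hocon
+sink {
+  Elasticsearch {
+    hosts = ["localhost:9200"]
+    index = "seatunnel"
+
+    # 索引不存在时自动创建;已存在则沿用原有结构
+    schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST"
+    # 写入前清空目标索引中已有的数据
+    data_save_mode = "DROP_DATA"
+  }
+}
+```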
+ +## 示例 + +简单示例 + +```bash +sink { + Elasticsearch { + hosts = ["localhost:9200"] + index = "seatunnel-${age}" + } +} +``` + +变更数据捕获 (Change data capture) 事件 + +```bash +sink { + Elasticsearch { + hosts = ["localhost:9200"] + index = "seatunnel-${age}" + + # CDC required options + primary_keys = ["key1", "key2", ...] + } +} +``` + +SSL 禁用证书验证 + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_verify_certificate = false + } +} +``` + +SSL 禁用主机名验证 + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_verify_hostname = false + } +} +``` + +SSL 启用证书验证 + +通过设置 `tls_keystore_path` 与 `tls_keystore_password` 指定证书路径及密码 + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" + tls_keystore_password = "${your password}" + } +} +``` + +配置表生成策略 (schema_save_mode) + +通过设置 `schema_save_mode` 配置为 `CREATE_SCHEMA_WHEN_NOT_EXIST` 来支持不存在表时创建表 + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode = "APPEND_DATA" + } +} +``` + +## 变更日志 + +### 下一版本 + +- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3673](https://github.com/apache/seatunnel/pull/3673)) +- [Feature] Support https protocol & compatible with opensearch ([3997](https://github.com/apache/seatunnel/pull/3997)) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Email.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Email.md new file mode 100644 index 000000000000..cc3999c580c9 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Email.md @@ -0,0 +1,89 @@ +# Email + +> Email 数据接收器 + +## 描述 + +将接收的数据作为文件发送到电子邮件 + +## 支持版本 + +测试版本:1.5.6(供参考) + +## 主要特性 + +- [ ] [精确一次](../../concept/connector-v2-features.md) + +## 选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|--------------------------|--------|------|-----| +| email_from_address | string | 是 | - | +| email_to_address | string | 是 | - | +| email_host | string | 是 | - | +| email_transport_protocol | string | 是 | - | +| email_smtp_auth | string | 是 | - | +| email_authorization_code | string | 是 | - | +| email_message_headline | string | 是 | - | +| email_message_content | string | 是 | - | +| common-options | | 否 | - | + +### email_from_address [string] + +发件人邮箱地址 + +### email_to_address [string] + +接收邮件的地址 + +### email_host [string] + +连接的SMTP服务器地址 + +### email_transport_protocol [string] + +加载会话的协议 + +### email_smtp_auth [string] + +是否对客户进行认证 + +### email_authorization_code [string] + +授权码,您可以从邮箱设置中获取授权码 + +### email_message_headline [string] + +邮件的标题 + +### email_message_content [string] + +邮件消息的正文 + +### common options + +Sink插件常用参数,请参考 [Sink常用选项](common-options.md) 了解详情. 
+ +## 示例 + +```bash + + EmailSink { + email_from_address = "xxxxxx@qq.com" + email_to_address = "xxxxxx@163.com" + email_host="smtp.qq.com" + email_transport_protocol="smtp" + email_smtp_auth="true" + email_authorization_code="" + email_message_headline="" + email_message_content="" + } + +``` + +## 变更日志 + +### 2.2.0-beta 2022-09-26 + +- 添加 Email 接收器连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Feishu.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Feishu.md new file mode 100644 index 000000000000..c561e50a9714 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Feishu.md @@ -0,0 +1,66 @@ +# 飞书 + +> 飞书 数据接收器 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [ ] [精确一次](../../concept/connector-v2-features.md) +- [ ] [变更数据捕获](../../concept/connector-v2-features.md) + +## 描述 + +用于通过数据调用飞书的web hooks。 + +> 例如,如果来自上游的数据是 [`年龄: 12, 姓名: tyrantlucifer`],则 body 内容如下:`{"年龄": 12, "姓名": "tyrantlucifer"}` + +**提示:飞书接收器仅支持 `post json`类型的web hook,并且源数据将被视为web hook的正文内容。** + +## 数据类型映射 + +| SeaTunnel 数据类型 | 飞书数据类型 | +|-----------------------------|------------| +| ROW
MAP | Json | +| NULL | null | +| BOOLEAN | boolean | +| TINYINT | byte | +| SMALLINT | short | +| INT | int | +| BIGINT | long | +| FLOAT | float | +| DOUBLE | double | +| DECIMAL | BigDecimal | +| BYTES | byte[] | +| STRING | String | +| TIME
TIMESTAMP
TIME | String | +| ARRAY | JsonArray | + +## 接收器选项 + +| 名称 | 类型 | 是否必需 | 默认值 | 描述 | +|----------------|--------|------|-----|----------------------------------------------------| +| url | String | 是 | - | 飞书web hook URL | +| headers | Map | 否 | - | HTTP 请求头 | +| common-options | | 否 | - | 接收器插件常见参数,请参阅 [接收器通用选项](common-options.md) 以获取详细信息 | + +## 任务示例 + +### 简单示例: + +```hocon +Feishu { + url = "https://www.feishu.cn/flow/api/trigger-webhook/108bb8f208d9b2378c8c7aedad715c19" + } +``` + +## 更新日志 + +### 2.2.0-beta 2022-09-26 + +- 添加飞书接收器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hbase.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hbase.md new file mode 100644 index 000000000000..871cad206c64 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hbase.md @@ -0,0 +1,141 @@ +# Hbase + +> Hbase 数据连接器 + +## 描述 + +将数据输出到hbase + +## 主要特性 + +- [ ] [精准一次](../../concept/connector-v2-features.md) + +## 选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|--------------------|---------|------|-----------------| +| zookeeper_quorum | string | yes | - | +| table | string | yes | - | +| rowkey_column | list | yes | - | +| family_name | config | yes | - | +| rowkey_delimiter | string | no | "" | +| version_column | string | no | - | +| null_mode | string | no | skip | +| wal_write | boolean | yes | false | +| write_buffer_size | string | no | 8 * 1024 * 1024 | +| encoding | string | no | utf8 | +| hbase_extra_config | string | no | - | +| common-options | | no | - | +| ttl | long | no | - | + +### zookeeper_quorum [string] + +hbase的zookeeper集群主机, 示例: "hadoop001:2181,hadoop002:2181,hadoop003:2181" + +### table [string] + +要写入的表名, 例如: "seatunnel" + +### rowkey_column [list] + +行键的列名列表, 例如: ["id", "uuid"] + +### family_name [config] + +字段的列簇名称映射。例如,上游的行如下所示: + +| id | name | age | +|----|---------------|-----| +| 1 | tyrantlucifer | 27 | + +id作为行键和其他写入不同列簇的字段,可以分配 + +family_name { +name = "info1" +age = "info2" +} + +这主要是name写入列簇info1,age写入将写给列簇 info2 + +如果要将其他字段写入同一列簇,可以分配 + +family_name { +all_columns = "info" +} + +这意味着所有字段都将写入该列簇 info + +### rowkey_delimiter [string] + +连接多行键的分隔符,默认 "" + +### version_column [string] + +版本列名称,您可以使用它来分配 hbase 记录的时间戳 + +### null_mode [double] + +写入 null 值的模式,支持 [ skip , empty], 默认 skip + +- skip: 当字段为 null ,连接器不会将此字段写入 hbase +- empty: 当字段为null时,连接器将写入并为此字段生成空值 + +### wal_write [boolean] + +wal log 写入标志,默认值 false + +### write_buffer_size [int] + +hbase 客户端的写入缓冲区大小,默认 8 * 1024 * 1024 + +### encoding [string] + +字符串字段的编码,支持[ utf8 , gbk],默认 utf8 + +### hbase_extra_config [config] + +hbase扩展配置 + +### ttl [long] + +hbase 写入数据 TTL 时间,默认以表设置的TTL为准,单位毫秒 + +### 常见选项 + +Sink 插件常用参数,详见 Sink 常用选项 [Sink Common Options](common-options.md) + +## 案例 + +```hocon + +Hbase { + zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" + table = "seatunnel_test" + rowkey_column = ["name"] + family_name { + all_columns = seatunnel + } +} + +``` + +## 写入指定列族 + +```hocon +Hbase { + zookeeper_quorum = "hbase_e2e:2181" + table = "assign_cf_table" + rowkey_column = ["id"] + family_name { + c_double = "cf1" + c_bigint = "cf2" + } +} +``` + +## 更改日志 + +### 下一个版本 + +- 添加 hbase 输出连接器 ([4049](https://github.com/apache/seatunnel/pull/4049)) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/HdfsFile.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/HdfsFile.md new file mode 100644 index 000000000000..dee466770e60 --- /dev/null +++ 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/HdfsFile.md @@ -0,0 +1,202 @@ +# Hdfs文件 + +> Hdfs文件 数据接收器 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [x] [精确一次](../../../en/concept/connector-v2-features.md) + +默认情况下,我们使用2PC提交来确保"精确一次" + +- [x] 文件格式类型 + - [x] 文本 + - [x] CSV + - [x] Parquet + - [x] ORC + - [x] JSON + - [x] Excel +- [x] 压缩编解码器 + - [x] lzo + +## 描述 + +将数据输出到Hdfs文件 + +## 支持的数据源信息 + +| 数据源 | 支持的版本 | +|--------|------------------| +| Hdfs文件 | hadoop 2.x 和 3.x | + +## 接收器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|----------------------------------|---------|------|--------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| fs.defaultFS | string | 是 | - | 以 `hdfs://` 开头的 Hadoop 集群地址,例如:`hdfs://hadoopcluster` | +| path | string | 是 | - | 目标目录路径是必需的。 | +| tmp_path | string | 是 | /tmp/seatunnel | 结果文件将首先写入临时路径,然后使用 `mv` 命令将临时目录提交到目标目录。需要一个Hdfs路径。 | +| hdfs_site_path | string | 否 | - | `hdfs-site.xml` 的路径,用于加载 namenodes 的 ha 配置。 | +| custom_filename | boolean | 否 | false | 是否需要自定义文件名 | +| file_name_expression | string | 否 | "${transactionId}" | 仅在 `custom_filename` 为 `true` 时使用。`file_name_expression` 描述将创建到 `path` 中的文件表达式。我们可以在 `file_name_expression` 中添加变量 `${now}` 或 `${uuid}`,例如 `test_${uuid}_${now}`,`${now}` 表示当前时间,其格式可以通过指定选项 `filename_time_format` 来定义。请注意,如果 `is_enable_transaction` 为 `true`,我们将在文件头部自动添加 `${transactionId}_`。 | +| filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在 `custom_filename` 为 `true` 时使用。当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,默认值为 `yyyy.MM.dd`。常用的时间格式如下所示:[y:年,M:月,d:月中的一天,H:一天中的小时(0-23),m:小时中的分钟,s:分钟中的秒] | +| file_format_type | string | 否 | "csv" | 我们支持以下文件类型:`text` `json` `csv` `orc` `parquet` `excel`。请注意,最终文件名将以文件格式的后缀结束,文本文件的后缀是 `txt`。 | +| field_delimiter | string | 否 | '\001' | 仅在 file_format 为 text 时使用,数据行中列之间的分隔符。仅需要 `text` 文件格式。 | +| row_delimiter | string | 否 | "\n" | 仅在 file_format 为 text 时使用,文件中行之间的分隔符。仅需要 `text` 文件格式。 | +| have_partition | boolean | 否 | false | 是否需要处理分区。 | +| partition_by | array | 否 | - | 仅在 have_partition 为 true 时使用,根据选定的字段对数据进行分区。 | +| partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 仅在 have_partition 为 true 时使用,如果指定了 `partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。默认 `partition_dir_expression` 为 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0` 是第一个分区字段,`v0` 是第一个分区字段的值。 | +| is_partition_field_write_in_file | boolean | 否 | false | 仅当 `have_partition` 为 `true` 时使用。如果 `is_partition_field_write_in_file` 为 `true`,则分区字段及其值将写入数据文件中。例如,如果要写入Hive数据文件,则其值应为 `false`。 | +| sink_columns | array | 否 | | 当此参数为空时,所有字段都是接收器列。需要写入文件的列,默认值是从 `Transform` 或 `Source` 获取的所有列。字段的顺序确定了实际写入文件时的顺序。 | +| is_enable_transaction | boolean | 否 | true | 如果 `is_enable_transaction` 为 true,则在将数据写入目标目录时,我们将确保数据不会丢失或重复。请注意,如果 `is_enable_transaction` 为 `true`,我们将在文件头部自动添加 `${transactionId}_`。目前仅支持 `true`。 | +| batch_size | int | 否 | 1000000 | 文件中的最大行数。对于 SeaTunnel Engine,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,则接收器写入器将在文件中写入行,直到文件中的行大于 `batch_size`。如果 `checkpoint.interval` 很小,则接收器写入器将在新检查点触发时创建一个新文件。 | +| compress_codec | string | 否 | none | 文件的压缩编解码器及其支持的细节如下所示:[txt: `lzo` `none`,json: `lzo` `none`,csv: `lzo` `none`,orc: `lzo` `snappy` `lz4` `zlib` `none`,parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`]。提示:excel类型不支持任何压缩格式。 | +| krb5_path | string | 否 | /etc/krb5.conf | kerberos 的 krb5 路径 | 
+| kerberos_principal | string | 否 | - | kerberos 的主体 | +| kerberos_keytab_path | string | 否 | - | kerberos 的 keytab 路径 | +| compress_codec | string | 否 | none | 压缩编解码器 | +| common-options | object | 否 | - | 接收器插件通用参数,请参阅 [接收器通用选项](common-options.md) 了解详情 | +| max_rows_in_memory | int | 否 | - | 仅当 file_format 为 excel 时使用。当文件格式为 Excel 时,可以缓存在内存中的最大数据项数。 | +| sheet_name | string | 否 | Sheet${Random number} | 仅当 file_format 为 excel 时使用。将工作簿的表写入指定的表名 | + +### 提示 + +> 如果您使用 spark/flink,为了使用此连接器,您必须确保您的 spark/flink 集群已经集成了 hadoop。测试过的 hadoop 版本是 +> 2.x。如果您使用 SeaTunnel Engine,则在下载和安装 SeaTunnel Engine 时会自动集成 hadoop +> jar。您可以检查 `${SEATUNNEL_HOME}/lib` 下的 jar 包来确认这一点。 + +## 任务示例 + +### 简单示例: + +> 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并将其发送到 Hdfs。 + +``` +# 定义运行时环境 +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # 这是一个示例源插件 **仅用于测试和演示功能源插件** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } + # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的源端插件列表, + # 请访问 https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的转换插件列表, + # 请访问 https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format_type = "orc" + } + # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的接收器插件列表, + # 请访问 https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### orc 文件格式的简单配置 + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format_type = "orc" +} +``` + +### text 文件格式的配置,包括 `have_partition`、`custom_filename` 和 `sink_columns` + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true +} +``` + +### parquet 文件格式的配置,包括 `have_partition`、`custom_filename` 和 `sink_columns` + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + file_format_type = "parquet" + sink_columns = ["name","age"] + is_enable_transaction = true +} +``` + +### kerberos 的简单配置 + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + hdfs_site_path = "/path/to/your/hdfs_site_path" + kerberos_principal = "your_principal@EXAMPLE.COM" + kerberos_keytab_path = "/path/to/your/keytab/file.keytab" +} +``` + +### 压缩的简单配置 + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + compress_codec = "lzo" +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Http.md 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Http.md new file mode 100644 index 000000000000..f837380efdd7 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Http.md @@ -0,0 +1,63 @@ +# Http + +> Http 数据接收器 + +## 支持引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [ ] [精确一次](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +## 描述 + +接收Source端传入的数据,利用数据触发 web hooks。 + +> 例如,来自上游的数据为[`age: 12, name: tyrantlucifer`],则body内容如下:`{"age": 12, "name": "tyrantlucifer"}` + +**Tips: Http 接收器仅支持 `post json` 类型的 web hook,source 数据将被视为 webhook 中的 body 内容。** + +## 支持的数据源信息 + +想使用 Http 连接器,需要安装以下必要的依赖。可以通过运行 install-plugin.sh 脚本或者从 Maven 中央仓库下载这些依赖 + +| 数据源 | 支持版本 | 依赖 | +|------|------|------------------------------------------------------------------------------------------------------| +| Http | 通用 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-http) | + +## 接收器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|-----------------------------|--------|------|-------|----------------------------------------------------| +| url | String | 是 | - | Http 请求链接 | +| headers | Map | 否 | - | Http 标头 | +| retry | Int | 否 | - | 如果请求http返回`IOException`的最大重试次数 | +| retry_backoff_multiplier_ms | Int | 否 | 100 | http请求失败,重试回退次数(毫秒)乘数 | +| retry_backoff_max_ms | Int | 否 | 10000 | http请求失败,最大重试回退时间(毫秒) | +| connect_timeout_ms | Int | 否 | 12000 | 连接超时设置,默认12s | +| socket_timeout_ms | Int | 否 | 60000 | 套接字超时设置,默认为60s | +| common-options | | 否 | - | Sink插件常用参数,请参考 [Sink常用选项 ](common-options.md) 了解详情 | + +## 示例 + +简单示例: + +```hocon +Http { + url = "http://localhost/test/webhook" + headers { + token = "9e32e859ef044462a257e1fc76730066" + } +} +``` + +## 变更日志 + +### 2.2.0-beta 2022-09-26 + +- 添加Http接收连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hudi.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hudi.md new file mode 100644 index 000000000000..ab1fc43603fd --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Hudi.md @@ -0,0 +1,92 @@ +# Hudi + +> Hudi 接收器连接器 + +## 描述 + +用于将数据写入 Hudi。 + +## 主要特点 + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## 选项 + +| 名称 | 类型 | 是否必需 | 默认值 | +|----------------------------|--------|------|---------------| +| table_name | string | 是 | - | +| table_dfs_path | string | 是 | - | +| conf_files_path | string | 否 | - | +| record_key_fields | string | 否 | - | +| partition_fields | string | 否 | - | +| table_type | enum | 否 | copy_on_write | +| op_type | enum | 否 | insert | +| batch_interval_ms | Int | 否 | 1000 | +| insert_shuffle_parallelism | Int | 否 | 2 | +| upsert_shuffle_parallelism | Int | 否 | 2 | +| min_commits_to_keep | Int | 否 | 20 | +| max_commits_to_keep | Int | 否 | 30 | +| common-options | config | 否 | - | + +### table_name [string] + +`table_name` Hudi 表的名称。 + +### table_dfs_path [string] + +`table_dfs_path` Hudi 表的 DFS 根路径,例如 "hdfs://nameservice/data/hudi/hudi_table/"。 + +### table_type [enum] + +`table_type` Hudi 表的类型。 + +### conf_files_path [string] + +`conf_files_path` 环境配置文件路径列表(本地路径),用于初始化 HDFS 客户端以读取 Hudi 表文件。示例:"/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml"。 + +### op_type [enum] + +`op_type` Hudi 表的操作类型。值可以是 'insert'、'upsert' 或 'bulk_insert'。 + +### batch_interval_ms [Int] + +`batch_interval_ms` 批量写入 Hudi 表的时间间隔。 + +### insert_shuffle_parallelism [Int] + +`insert_shuffle_parallelism` 插入数据到 Hudi 表的并行度。 + +### upsert_shuffle_parallelism [Int] + +`upsert_shuffle_parallelism` 更新插入数据到 Hudi 表的并行度。 + +### min_commits_to_keep [Int] + +`min_commits_to_keep` Hudi 表保留的最少提交数。 + +### max_commits_to_keep [Int] + +`max_commits_to_keep` Hudi 
表保留的最多提交数。 + +### 通用选项 + +数据源插件的通用参数,请参考 [Source Common Options](common-options.md) 了解详细信息。 + +## 示例 + +```hocon +source { + + Hudi { + table_dfs_path = "hdfs://nameserivce/data/hudi/hudi_table/" + table_type = "cow" + conf_files_path = "/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml" + use.kerberos = true + kerberos.principal = "test_user@xxx" + kerberos.principal.file = "/home/test/test_user.keytab" + } + +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Jdbc.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Jdbc.md new file mode 100644 index 000000000000..d61292cb921a --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Jdbc.md @@ -0,0 +1,357 @@ +# JDBC + +> JDBC 数据接收器 + +## 描述 + +通过jdbc写入数据。支持批处理模式和流处理模式,支持并发写入,支持精确一次语义(使用XA事务保证) + +## 使用依赖 + +### 用于Spark/Flink引擎 + +> 1. 需要确保jdbc驱动jar包已经放在目录`${SEATUNNEL_HOME}/plugins/`下。 + +### 适用于 SeaTunnel Zeta 引擎 + +> 1. 需要确保jdbc驱动jar包已经放到`${SEATUNNEL_HOME}/lib/`目录下。 + +## 主要特性 + +- [x] [精确一次](../../concept/connector-v2-features.md) + +使用 `Xa transactions` 来确保 `exactly-once`。所以仅对于支持 `Xa transactions` 的数据库支持 `exactly-once` +。你可以设置 `is_exactly_once=true` 来启用它。 + +- [x] [cdc](../../concept/connector-v2-features.md) + +## Options + +| 名称 | 类型 | 是否必须 | 默认值 | +|-------------------------------------------|---------|------|------------------------------| +| url | String | 是 | - | +| driver | String | 是 | - | +| user | String | 否 | - | +| password | String | 否 | - | +| query | String | 否 | - | +| compatible_mode | String | 否 | - | +| database | String | 否 | - | +| table | String | 否 | - | +| primary_keys | Array | 否 | - | +| support_upsert_by_query_primary_key_exist | Boolean | 否 | false | +| connection_check_timeout_sec | Int | 否 | 30 | +| max_retries | Int | 否 | 0 | +| batch_size | Int | 否 | 1000 | +| is_exactly_once | Boolean | 否 | false | +| generate_sink_sql | Boolean | 否 | false | +| xa_data_source_class_name | String | 否 | - | +| max_commit_attempts | Int | 否 | 3 | +| transaction_timeout_sec | Int | 否 | -1 | +| auto_commit | Boolean | 否 | true | +| field_ide | String | 否 | - | +| properties | Map | 否 | - | +| common-options | | 否 | - | +| schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | +| data_save_mode | Enum | 否 | APPEND_DATA | +| custom_sql | String | 否 | - | +| enable_upsert | Boolean | 否 | true | +| use_copy_statement | Boolean | 否 | false | + +### driver [string] + +用于连接远程数据源的 jdbc 类名,如果使用MySQL,则值为`com.mysql.cj.jdbc.Driver` + +### user [string] + +用户名 + +### password [string] + +密码 + +### url [string] + +JDBC 连接的 URL。参考案例:`jdbc:postgresql://localhost/test` + +### query [string] + +使用 sql 语句将上游输入数据写入到数据库。如 `INSERT ...` + +### compatible_mode [string] + +数据库的兼容模式,当数据库支持多种兼容模式时需要。例如,使用 OceanBase 数据库时,需要将其设置为 'mysql' 或 'oracle' 。 + +Postgres 9.5及以下版本,请设置为 `postgresLow` 来支持 CDC + +### database [string] + +使用此 `database` 和 `table-name` 自动生成 SQL,并接收上游输入的数据写入数据库。 + +此选项与 `query` 选项是互斥的,此选项具有更高的优先级。 + +### table [string] + +使用 `database` 和此 `table-name` 自动生成 SQL,并接收上游输入的数据写入数据库。 + +此选项与 `query` 选项是互斥的,此选项具有更高的优先级。 + +table参数可以填入一个任意的表名,这个名字最终会被用作创建表的表名,并且支持变量(`${table_name}`,`${schema_name}`)。 +替换规则如下:`${schema_name}` 将替换传递给目标端的 SCHEMA 名称,`${table_name}` 将替换传递给目标端的表名。 + +mysql 接收器示例: + +1. test_${schema_name}_${table_name}_test +2. sink_sinktable +3. ss_${table_name} + +pgsql (Oracle Sqlserver ...) 接收器示例: + +1. ${schema_name}.${table_name}_test +2. dbo.tt_${table_name}_sink +3. 
public.sink_table + +Tip: 如果目标数据库有 SCHEMA 的概念,则表参数必须写成 `xxx.xxx` + +### primary_keys [array] + +该选项用于辅助生成 insert、delete、update 等 sql 语句。设置了该选项,将会根据该选项生成对应的 sql 语句 + +### support_upsert_by_query_primary_key_exist [boolean] + +根据查询主键是否存在来选择使用 INSERT sql、UPDATE sql 来处理变更事件(INSERT、UPDATE_AFTER)。仅当数据库不支持 upsert 语法时才使用此配置 +**注意**:该方法性能较低 + +### connection_check_timeout_sec [int] + +用于验证数据库连接的有效性时等待数据库操作完成所需的时间,单位是秒 + +### max_retries[int] + +重试提交失败的最大次数(executeBatch) + +### batch_size[int] + +对于批量写入,当缓冲的记录数达到 `batch_size` 数量或者时间达到 `checkpoint.interval` 时,数据将被刷新到数据库中 + +### is_exactly_once[boolean] + +是否启用通过XA事务实现的精确一次语义。开启,你还需要设置 `xa_data_source_class_name` + +### generate_sink_sql[boolean] + +根据要写入的数据库表结构生成 sql 语句 + +### xa_data_source_class_name[string] + +指数据库驱动的 XA 数据源的类名。以 MySQL 为例,其类名为 com.mysql.cj.jdbc.MysqlXADataSource。了解其他数据库的数据源类名,可以参考文档的附录部分 + +### max_commit_attempts[int] + +事务提交失败的最大重试次数 + +### transaction_timeout_sec[int] + +在事务开启后的超时时间,默认值为-1(即永不超时)。请注意,设置超时时间可能会影响到精确一次(exactly-once)的语义 + +### auto_commit [boolean] + +默认启用自动事务提交 + +### field_ide [String] + +字段 `field_ide` 用于在从 source 同步到 sink 时,确定字段是否需要转换为大写或小写。'ORIGINAL' 表示不需要转换,'UPPERCASE' 表示转换为大写,'LOWERCASE' 表示转换为小写 + +### properties + +附加连接配置参数,当属性和URL具有相同参数时,优先级由驱动程序的具体实现确定。例如,在 MySQL 中,属性配置优先于 URL。 + +### common options + +Sink插件常用参数,请参考 [Sink常用选项](common-options.md) 了解详情 + +### schema_save_mode [Enum] + +在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案
+选项介绍:
+`RECREATE_SCHEMA`:当表不存在时会创建,当表已存在时会删除并重建
+`CREATE_SCHEMA_WHEN_NOT_EXIST`:当表不存在时会创建,当表已存在时则跳过创建
+`ERROR_WHEN_SCHEMA_NOT_EXIST`:当表不存在时将抛出错误
+ +### data_save_mode [Enum] + +在启动同步任务之前,针对目标侧已存在的数据选择不同的处理方案
+选项介绍:
+`DROP_DATA`:保留数据库结构,删除数据
+`APPEND_DATA`:保留数据库结构,保留数据
+`CUSTOM_PROCESSING`:允许用户自定义数据处理方式
+`ERROR_WHEN_DATA_EXISTS`:当有数据时抛出错误
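+下面是一个结合下文 `custom_sql` 选项的示意配置(连接信息、表名与 SQL 均为假设值),演示 `CUSTOM_PROCESSING` 的用法:`custom_sql` 中的语句会在同步任务开始前执行一次。
+
+```
+sink {
+    jdbc {
+        url = "jdbc:mysql://localhost:3306/test"
+        driver = "com.mysql.cj.jdbc.Driver"
+        user = "root"
+        password = "123456"
+
+        database = "sink_database"
+        table = "sink_table"
+        primary_keys = ["key1"]
+
+        # 同步任务开始前先执行 custom_sql(示例:清理历史数据)
+        data_save_mode = "CUSTOM_PROCESSING"
+        custom_sql = "delete from sink_table where create_time < '2024-01-01'"
+    }
+}
+```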
+ +### custom_sql [String] + +当`data_save_mode`选择`CUSTOM_PROCESSING`时,需要填写`CUSTOM_SQL`参数。该参数通常填写一条可以执行的SQL。SQL将在同步任务之前执行 + +### enable_upsert [boolean] + +启用通过主键更新插入,如果任务没有key重复数据,设置该参数为 false 可以加快数据导入速度 + +### use_copy_statement [boolean] + +使用 `COPY ${table} FROM STDIN` 语句导入数据。仅支持具有 `getCopyAPI()` 方法连接的驱动程序。例如:Postgresql +驱动程序 `org.postgresql.Driver` + +注意:不支持 `MAP`、`ARRAY`、`ROW`类型 + +## tips + +在 is_exactly_once = "true" 的情况下,使用 XA 事务。这需要数据库支持,有些数据库需要一些设置:
+1. postgres 需要设置 `max_prepared_transactions > 1`,例如 `ALTER SYSTEM set max_prepared_transactions to 10`<br/>
+2. mysql 版本需要 >= `8.0.29`,并且非 root 用户需要授予 `XA_RECOVER_ADMIN` 权限。例如:将 test_db.* 上的 XA_RECOVER_ADMIN +授予 `'user1'@'%'`<br/>
+3. mysql 可以尝试在 url 中添加 `rewriteBatchedStatements=true` 参数以获得更好的性能<br/>
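+针对上面第 3 点,下面给出一个在 url 中追加该参数的示意写法(主机、端口、库名均为示例值):
+
+```
+url = "jdbc:mysql://localhost:3306/test?rewriteBatchedStatements=true"
+```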
+ +## 附录 + +附录参数仅提供参考 + +| 数据源 | driver | url | xa_data_source_class_name | maven | +|------------|----------------------------------------------|--------------------------------------------------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------| +| MySQL | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | com.mysql.cj.jdbc.MysqlXADataSource | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| PostgreSQL | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | org.postgresql.xa.PGXADataSource | https://mvnrepository.com/artifact/org.postgresql/postgresql | +| DM | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | dm.jdbc.driver.DmdbXADataSource | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | +| Phoenix | org.apache.phoenix.queryserver.client.Driver | jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF | / | https://mvnrepository.com/artifact/com.aliyun.phoenix/ali-phoenix-shaded-thin-client | +| SQL Server | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | com.microsoft.sqlserver.jdbc.SQLServerXADataSource | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | +| Oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | oracle.jdbc.xa.OracleXADataSource | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | +| sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | / | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc | +| GBase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | / | https://www.gbase8.cn/wp-content/uploads/2020/10/gbase-connector-java-8.3.81.53-build55.5.7-bin_min_mix.jar | +| StarRocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| db2 | com.ibm.db2.jcc.DB2Driver | jdbc:db2://localhost:50000/testdb | com.ibm.db2.jcc.DB2XADataSource | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 | +| saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | / | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc | +| Doris | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| teradata | com.teradata.jdbc.TeraDriver | jdbc:teradata://localhost/DBS_PORT=1025,DATABASE=test | / | https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc | +| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb | com.amazon.redshift.xa.RedshiftXADataSource | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | +| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | +| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | +| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | / | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | +| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar | + +## 示例 + +简单示例 + +``` +jdbc { + url = 
"jdbc:mysql://localhost:3306/test" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" +} + +``` + +精确一次 (Exactly-once) + +通过设置 `is_exactly_once` 开启精确一次语义 + +``` +jdbc { + + url = "jdbc:mysql://localhost:3306/test" + driver = "com.mysql.cj.jdbc.Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" +} +``` + +变更数据捕获 (Change data capture) 事件 + +jdbc 接收 CDC 示例 + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "sink_database" + table = "sink_table" + primary_keys = ["key1", "key2", ...] + } +} +``` + +配置表生成策略 (schema_save_mode) + +通过设置 `schema_save_mode` 配置为 `CREATE_SCHEMA_WHEN_NOT_EXIST` 来支持不存在表时创建表 + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "sink_database" + table = "sink_table" + primary_keys = ["key1", "key2", ...] + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + +支持Postgres 9.5及以下版本的 CDC 示例 + +Postgres 9.5及以下版本,通过设置 `compatible_mode` 配置为 `postgresLow` 来支持 Postgres CDC 操作 + +``` +sink { + jdbc { + url = "jdbc:postgresql://localhost:5432" + driver = "org.postgresql.Driver" + user = "root" + password = "123456" + compatible_mode="postgresLow" + database = "sink_database" + table = "sink_table" + support_upsert_by_query_primary_key_exist = true + generate_sink_sql = true + primary_keys = ["key1", "key2", ...] + } +} + +``` + +## 变更日志 + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix JDBC split exception ([2904](https://github.com/apache/seatunnel/pull/2904)) +- [Feature] Support Phoenix JDBC Sink ([2499](https://github.com/apache/seatunnel/pull/2499)) +- [Feature] Support SQL Server JDBC Sink ([2646](https://github.com/apache/seatunnel/pull/2646)) +- [Feature] Support Oracle JDBC Sink ([2550](https://github.com/apache/seatunnel/pull/2550)) +- [Feature] Support StarRocks JDBC Sink ([3060](https://github.com/apache/seatunnel/pull/3060)) +- [Feature] Support DB2 JDBC Sink ([2410](https://github.com/apache/seatunnel/pull/2410)) + +### next version + +- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3378](https://github.com/apache/seatunnel/issues/3378)) +- [Feature] Support Teradata JDBC Sink ([3362](https://github.com/apache/seatunnel/pull/3362)) +- [Feature] Support Sqlite JDBC Sink ([3089](https://github.com/apache/seatunnel/pull/3089)) +- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3378](https://github.com/apache/seatunnel/issues/3378)) +- [Feature] Support Doris JDBC Sink +- [Feature] Support Redshift JDBC Sink([#3615](https://github.com/apache/seatunnel/pull/3615)) +- [Improve] Add config item enable upsert by query([#3708](https://github.com/apache/seatunnel/pull/3708)) +- [Improve] Add database field to sink config([#4199](https://github.com/apache/seatunnel/pull/4199)) +- [Improve] Add Vertica connector([#4303](https://github.com/apache/seatunnel/pull/4303)) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Kafka.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Kafka.md new file mode 100644 index 000000000000..c0ce93387084 --- /dev/null +++ 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Kafka.md @@ -0,0 +1,196 @@ +# Kafka + +> Kafka 数据接收器 + +## 支持引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [x] [精确一次](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> 默认情况下,我们将使用 2pc 来保证消息只发送一次到kafka + +## 描述 + +将 Rows 内容发送到 Kafka topic + +## 支持的数据源信息 + +为了使用 Kafka 连接器,需要以下依赖项 +可以通过 install-plugin.sh 或从 Maven 中央存储库下载 + +| 数据源 | 支持版本 | Maven | +|-------|------|-------------------------------------------------------------------------------------------------------| +| Kafka | 通用 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) | + +## 接收器选项 + +| 名称 | 类型 | 是否需要 | 默认值 | 描述 | +|----------------------|--------|------|------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | String | 是 | - | 当表用作接收器时,topic 名称是要写入数据的 topic | +| bootstrap.servers | String | 是 | - | Kafka brokers 使用逗号分隔 | +| kafka.config | Map | 否 | - | 除了上述 Kafka Producer 客户端必须指定的参数外,用户还可以为 Producer 客户端指定多个非强制参数,涵盖 [Kafka官方文档中指定的所有生产者参数](https://kafka.apache.org/documentation.html#producerconfigs) | +| semantics | String | 否 | NON | 可以选择的语义是 EXACTLY_ONCE/AT_LEAST_ONCE/NON,默认 NON。 | +| partition_key_fields | Array | 否 | - | 配置字段用作 kafka 消息的key | +| partition | Int | 否 | - | 可以指定分区,所有消息都会发送到此分区 | +| assign_partitions | Array | 否 | - | 可以根据消息的内容决定发送哪个分区,该参数的作用是分发信息 | +| transaction_prefix | String | 否 | - | 如果语义指定为EXACTLY_ONCE,生产者将把所有消息写入一个 Kafka 事务中,kafka 通过不同的 transactionId 来区分不同的事务。该参数是kafka transactionId的前缀,确保不同的作业使用不同的前缀 | +| format | String | 否 | json | 数据格式。默认格式是json。可选文本格式,canal-json、debezium-json 和 avro。如果使用 json 或文本格式。默认字段分隔符是`,`。如果自定义分隔符,请添加`field_delimiter`选项。如果使用canal格式,请参考[canal-json](../formats/canal-json.md)。如果使用debezium格式,请参阅 [debezium-json](../formats/debezium-json.md) 了解详细信息 | +| field_delimiter | String | 否 | , | 自定义数据格式的字段分隔符 | +| common-options | | 否 | - | Sink插件常用参数,请参考 [Sink常用选项 ](common-options.md) 了解详情 | + +## 参数解释 + +### Topic 格式 + +目前支持两种格式: + +1. 填写topic名称 + +2. 
使用上游数据中的字段值作为 topic ,格式是 `${your field name}`, 其中 topic 是上游数据的其中一列的值 + + 例如,上游数据如下: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +如果 `${name}` 设置为 topic。因此,第一行发送到 Jack topic,第二行发送到 Mary topic。 + +### 语义 + +在 EXACTLY_ONCE 中,生产者将在 Kafka 事务中写入所有消息,这些消息将在检查点上提交给 Kafka,该模式下能保证数据精确写入kafka一次,即使任务失败重试也不会出现数据重复和丢失 +在 AT_LEAST_ONCE 中,生产者将等待 Kafka 缓冲区中所有未完成的消息在检查点上被 Kafka 生产者确认,该模式下能保证数据至少写入kafka一次,即使任务失败 +NON 不提供任何保证:如果 Kafka 代理出现问题,消息可能会丢失,并且消息可能会重复,该模式下,任务失败重试可能会产生数据丢失或重复。 + +### 分区关键字段 + +例如,如果你想使用上游数据中的字段值作为键,可以将这些字段名指定给此属性 + +上游数据如下所示: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +如果将 name 设置为 key,那么 name 列的哈希值将决定消息发送到哪个分区。 +如果没有设置分区键字段,则将发送空消息键。 +消息 key 的格式为 json,如果设置 name 为 key,例如 `{"name":"Jack"}`。 +所选的字段必须是上游数据中已存在的字段。 + +### 分区分配 + +假设总有五个分区,配置中的 assign_partitions 字段设置为: +assign_partitions = ["shoe", "clothing"] +在这种情况下,包含 "shoe" 的消息将被发送到第零个分区,因为 "shoe" 在 assign_partitions 中被标记为零, 而包含 "clothing" 的消息将被发送到第一个分区。 +对于其他的消息,我们将使用哈希算法将它们均匀地分配到剩余的分区中。 +这个功能是通过 MessageContentPartitioner 类实现的,该类实现了 org.apache.kafka.clients.producer.Partitioner 接口。如果我们需要自定义分区,我们需要实现这个接口。 + +## 任务示例 + +### 简单: + +> 此示例展示了如何定义一个 SeaTunnel 同步任务,该任务能够通过 FakeSource 自动产生数据并将其发送到 Kafka Sink。在这个例子中,FakeSource 会生成总共 16 行数据(`row.num=16`),每一行都包含两个字段,即 `name`(字符串类型)和 `age`(整型)。最终,这些数据将被发送到名为 test_topic 的 topic 中,因此该 topic 也将包含 16 行数据。 +> 如果你还未安装和部署 SeaTunnel,你需要参照 [安装SeaTunnel](../../start-v2/locally/deployment.md) 的指南来进行安装和部署。完成安装和部署后,你可以按照 [快速开始使用 SeaTunnel 引擎](../../start-v2/locally/quick-start-seatunnel-engine.md) 的指南来运行任务。 + +```hocon +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + kafka { + topic = "test_topic" + bootstrap.servers = "localhost:9092" + format = json + kafka.request.timeout.ms = 60000 + semantics = EXACTLY_ONCE + kafka.config = { + acks = "all" + request.timeout.ms = 60000 + buffer.memory = 33554432 + } + } +} +``` + +### AWS MSK SASL/SCRAM + +将以下 `${username}` 和 `${password}` 替换为 AWS MSK 中的配置值。 + +```hocon +sink { + kafka { + topic = "seatunnel" + bootstrap.servers = "localhost:9092" + format = json + kafka.request.timeout.ms = 60000 + semantics = EXACTLY_ONCE + kafka.config = { + security.protocol=SASL_SSL + sasl.mechanism=SCRAM-SHA-512 + sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" + } + } +} +``` + +### AWS MSK IAM + +从 https://github.com/aws/aws-msk-iam-auth/releases 下载 `aws-msk-iam-auth-1.1.5.jar` +并将其放入 `$SEATUNNEL_HOME/plugin/kafka/lib` 中目录。 +请确保 IAM 策略具有 `kafka-cluster:Connect` +如下配置: + +```hocon +"Effect": "Allow", +"Action": [ + "kafka-cluster:Connect", + "kafka-cluster:AlterCluster", + "kafka-cluster:DescribeCluster" +], +``` + +接收器配置 + +```hocon +sink { + kafka { + topic = "seatunnel" + bootstrap.servers = "localhost:9092" + format = json + kafka.request.timeout.ms = 60000 + semantics = EXACTLY_ONCE + kafka.config = { + security.protocol=SASL_SSL + sasl.mechanism=AWS_MSK_IAM + sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" + sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" + } + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/LocalFile.md 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/LocalFile.md new file mode 100644 index 000000000000..53aa0cb480e6 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/LocalFile.md @@ -0,0 +1,316 @@ +# LocalFile + +> 本地文件接收器 + +## 描述 + +将数据输出到本地文件。 + +:::提示 + +如果你使用的是 spark/flink,为了使用此连接器,你必须确保你的 spark/flink 集群已集成 hadoop。已测试的 hadoop 版本是 2.x。 + +如果你使用 SeaTunnel Engine,它会在下载和安装 SeaTunnel Engine 时自动集成 hadoop jar。你可以在 ${SEATUNNEL_HOME}/lib 下检查 jar 包以确认这一点。 + +::: + +## 主要特性 + +- [x] [精确一次](../../concept/connector-v2-features.md) + +默认情况下,我们使用 2PC 提交以确保`精确一次`。 + +- [x] 文件格式类型 + - [x] 文本 + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] 二进制 + +## 选项 + +| 名称 | 类型 | 是否必需 | 默认值 | 描述 | +|---------------------------------------|---------|------|--------------------------------------------|-----------------------------------------------------------------| +| path | string | 是 | - | 目标目录路径 | +| tmp_path | string | 否 | /tmp/seatunnel | 结果文件将首先写入临时路径,然后使用 `mv` 将临时目录提交到目标目录。 | +| custom_filename | boolean | 否 | false | 是否需要自定义文件名 | +| file_name_expression | string | 否 | "${transactionId}" | 仅在 custom_filename 为 true 时使用 | +| filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在 custom_filename 为 true 时使用 | +| file_format_type | string | 否 | "csv" | 文件格式类型 | +| field_delimiter | string | 否 | '\001' | 仅在 file_format_type 为 text 时使用 | +| row_delimiter | string | 否 | "\n" | 仅在 file_format_type 为 text 时使用 | +| have_partition | boolean | 否 | false | 是否需要处理分区 | +| partition_by | array | 否 | - | 仅在 have_partition 为 true 时使用 | +| partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 仅在 have_partition 为 true 时使用 | +| is_partition_field_write_in_file | boolean | 否 | false | 仅在 have_partition 为 true 时使用 | +| sink_columns | array | 否 | | 当此参数为空时,所有字段都是 sink 列 | +| is_enable_transaction | boolean | 否 | true | 是否启用事务 | +| batch_size | int | 否 | 1000000 | 批量大小 | +| compress_codec | string | 否 | none | 压缩编码 | +| common-options | object | 否 | - | 常见选项 | +| max_rows_in_memory | int | 否 | - | 仅在 file_format_type 为 excel 时使用 | +| sheet_name | string | 否 | Sheet${随机数} | 仅在 file_format_type 为 excel 时使用 | +| xml_root_tag | string | 否 | RECORDS | 仅在 file_format 为 xml 时使用 | +| xml_row_tag | string | 否 | RECORD | 仅在 file_format 为 xml 时使用 | +| xml_use_attr_format | boolean | 否 | - | 仅在 file_format 为 xml 时使用 | +| parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅在 file_format 为 parquet 时使用 | +| parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅在 file_format 为 parquet 时使用 | +| enable_header_write | boolean | 否 | false | 仅在 file_format_type 为 text,csv 时使用。
false:不写入表头,true:写入表头。 | +| encoding | string | 否 | "UTF-8" | 仅在 file_format_type 为 json,text,csv,xml 时使用 | + +### path [string] + +目标目录路径是必需的,你可以通过使用 `${database_name}`、`${table_name}` 和 `${schema_name}` 将上游的 CatalogTable 注入到路径中。 + +### custom_filename [boolean] + +是否自定义文件名 + +### file_name_expression [string] + +仅在 `custom_filename` 为 `true` 时使用 + +`file_name_expression` 描述将创建到 `path` 中的文件表达式。我们可以在 `file_name_expression` 中添加变量 `${now}` 或 `${uuid}`,例如 `test_${uuid}_${now}`,`${now}` 表示当前时间,其格式可以通过指定 `filename_time_format` 选项来定义。 + +请注意,如果 `is_enable_transaction` 为 `true`,我们将自动在文件名的头部添加 `${transactionId}_`。 + +### filename_time_format [string] + +仅在 `custom_filename` 为 `true` 时使用 + +当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,默认值为 `yyyy.MM.dd`。常用的时间格式如下所示: + +| 符号 | 描述 | +|----|-----------| +| y | 年 | +| M | 月 | +| d | 日 | +| H | 小时 (0-23) | +| m | 分钟 | +| s | 秒 | + +### file_format_type [string] + +我们支持以下文件类型: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +请注意,最终的文件名将以 file_format_type 的后缀结尾,文本文件的后缀是 `txt`。 + +### field_delimiter [string] + +数据行中列之间的分隔符。仅在 `text` 文件格式下需要。 + +### row_delimiter [string] + +文件中行之间的分隔符。仅在 `text` 文件格式下需要。 + +### have_partition [boolean] + +是否需要处理分区。 + +### partition_by [array] + +仅在 `have_partition` 为 `true` 时使用。 + +基于选定字段进行数据分区。 + +### partition_dir_expression [string] + +仅在 `have_partition` 为 `true` 时使用。 + +如果指定了 `partition_by`,我们将基于分区信息生成相应的分区目录,最终文件将放置在分区目录中。 + +默认的 `partition_dir_expression` 是 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0` 是第一个分区字段,`v0` 是第一个分区字段的值。 + +### is_partition_field_write_in_file [boolean] + +仅在 `have_partition` 为 `true` 时使用。 + +如果 `is_partition_field_write_in_file` 为 `true`,分区字段及其值将写入数据文件。 + +例如,如果你想写入一个 Hive 数据文件,其值应该为 `false`。 + +### sink_columns [array] + +需要写入文件的列,默认值为从 `Transform` 或 `Source` 获取的所有列。字段的顺序决定了实际写入文件的顺序。 + +### is_enable_transaction [boolean] + +如果 `is_enable_transaction` 为 true,我们将确保数据在写入目标目录时不会丢失或重复。 + +请注意,如果 `is_enable_transaction` 为 true,我们将自动在文件名前添加 `${transactionId}_`。 + +目前仅支持 `true`。 + +### batch_size [int] + +文件中的最大行数。对于 SeaTunnel Engine,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,sink writer 将在文件中的行数超过 `batch_size` 时写入文件。如果 `checkpoint.interval` 很小,当触发新检查点时,sink writer 将创建一个新文件。 + +### compress_codec [string] + +文件的压缩编码,支持的压缩编码如下所示: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +提示:excel 类型不支持任何压缩格式 + +### 常见选项 + +Sink 插件的常见参数,请参阅 [Sink 常见选项](common-options.md) 获取详细信息。 + +### max_rows_in_memory [int] + +当文件格式为 Excel 时,内存中可以缓存的数据项最大数量。 + +### sheet_name [string] + +工作簿的表名。 + +### xml + +_root_tag [string] + +指定 XML 文件中根元素的标签名。 + +### xml_row_tag [string] + +指定 XML 文件中数据行的标签名。 + +### xml_use_attr_format [boolean] + +指定是否使用标签属性格式处理数据。 + +### parquet_avro_write_timestamp_as_int96 [boolean] + +支持从时间戳写入 Parquet INT96,仅对 parquet 文件有效。 + +### parquet_avro_write_fixed_as_int96 [array] + +支持从 12 字节字段写入 Parquet INT96,仅对 parquet 文件有效。 + +### enable_header_write [boolean] + +仅在 file_format_type 为 text,csv 时使用。false:不写入表头,true:写入表头。 + +### encoding [string] + +仅在 file_format_type 为 json,text,csv,xml 时使用。文件写入的编码。该参数将通过 `Charset.forName(encoding)` 解析。 + +## 示例 + +对于 orc 文件格式的简单配置 + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "orc" +} + +``` + +对于带有 `encoding` 的 json、text、csv 或 xml 文件格式 + +```hocon + +LocalFile { + path = "/tmp/hive/warehouse/test2" + 
file_format_type = "text" + encoding = "gbk" +} + +``` + +对于带有 `sink_columns` 的 parquet 文件格式 + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "parquet" + sink_columns = ["name","age"] +} + +``` + +对于带有 `have_partition`、`custom_filename` 和 `sink_columns` 的 text 文件格式 + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true +} + +``` + +对于带有 `sheet_name` 和 `max_rows_in_memory` 的 excel 文件格式 + +```bash + +LocalFile { + path="/tmp/seatunnel/excel" + sheet_name = "Sheet1" + max_rows_in_memory = 1024 + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type="excel" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + } + +``` + +对于从上游提取源元数据,可以在路径中使用 `${database_name}`、`${table_name}` 和 `${schema_name}`。 + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/${table_name}" + file_format_type = "parquet" + sink_columns = ["name","age"] +} + +``` + +## 更新日志 + +### 2.2.0-beta 2022-09-26 + +- 新增本地文件接收器 + +### 2.3.0-beta 2022-10-20 + +- [BugFix] 修复了 Windows 环境中路径错误的 bug ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [BugFix] 修复了文件系统获取错误 ([3117](https://github.com/apache/seatunnel/pull/3117)) +- [BugFix] 解决了无法解析 '\t' 作为配置文件分隔符的 bug ([3083](https://github.com/apache/seatunnel/pull/3083)) + +### 下一个版本 + +- [BugFix] 修复了以下导致数据写入文件失败的 bug ([3258](https://github.com/apache/seatunnel/pull/3258)) + - 当上游字段为 null 时会抛出 NullPointerException + - Sink 列映射失败 + - 从状态恢复 writer 时直接获取事务失败 +- [Improve] 支持为每个文件设置批量大小 ([3625](https://github.com/apache/seatunnel/pull/3625)) +- [Improve] 支持文件压缩 ([3899](https://github.com/apache/seatunnel/pull/3899)) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Paimon.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Paimon.md new file mode 100644 index 000000000000..50f88731d3e7 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Paimon.md @@ -0,0 +1,273 @@ +# Paimon + +> Paimon 数据连接器 + +## 描述 + +Apache Paimon数据连接器。支持cdc写以及自动建表。 + +## 支持的数据源信息 + +| 数据源 | 依赖 | Maven | +|--------|-----------|---------------------------------------------------------------------------| +| Paimon | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | +| Paimon | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | + +## 数据源依赖 + +> 为了兼容不同版本的Hadoop和Hive,在项目pom文件中Hive -exec的作用域为provided,所以如果您使用Flink引擎,首先可能需要将以下Jar包添加到/lib目录下,如果您使用Spark引擎并与Hadoop集成,则不需要添加以下Jar包。 + +``` +hive-exec-xxx.jar +libfb303-xxx.jar +``` + +> 有些版本的hive-exec包没有libfb303-xxx.jar,所以您还需要手动导入Jar包。 + +## 主要特性 + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +## 连接器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|-----------------------------|-----|------|------------------------------|---------------------------------------------------------------------------------------------------|---| +| warehouse | 字符串 | 是 | - | Paimon warehouse路径 | +| catalog_type | 字符串 | 否 | filesystem | Paimon的catalog类型,目前支持filesystem和hive | +| catalog_uri | 
字符串 | 否 | - | Paimon catalog的uri,仅当catalog_type为hive时需要配置 | | +| database | 字符串 | 是 | - | 数据库名称 | +| table | 字符串 | 是 | - | 表名 | +| hdfs_site_path | 字符串 | 否 | - | hdfs-site.xml文件路径 | +| schema_save_mode | 枚举 | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | Schema保存模式 | +| data_save_mode | 枚举 | 否 | APPEND_DATA | 数据保存模式 | +| paimon.table.primary-keys | 字符串 | 否 | - | 主键字段列表,联合主键使用逗号分隔(注意:分区字段需要包含在主键字段中) | +| paimon.table.partition-keys | 字符串 | 否 | - | 分区字段列表,多字段使用逗号分隔 | +| paimon.table.write-props | Map | 否 | - | Paimon表初始化指定的属性, [参考](https://paimon.apache.org/docs/0.6/maintenance/configurations/#coreoptions) | +| paimon.hadoop.conf | Map | 否 | - | Hadoop配置文件属性信息 | +| paimon.hadoop.conf-path | 字符串 | 否 | - | Hadoop配置文件目录,用于加载'core-site.xml', 'hdfs-site.xml', 'hive-site.xml'文件配置 | + +## 示例 + +### 单表 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role"] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="seatunnel" + table="role" + } +} +``` + +### 单表(指定hadoop HA配置和kerberos配置) + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role"] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="hdfs:///tmp/seatunnel/paimon/hadoop-sink/" + database="seatunnel" + table="role" + paimon.hadoop.conf = { + fs.defaultFS = "hdfs://nameservice1" + dfs.nameservices = "nameservice1" + dfs.ha.namenodes.nameservice1 = "nn1,nn2" + dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" + dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" + dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + dfs.client.use.datanode.hostname = "true" + security.kerberos.login.principal = "your-kerberos-principal" + security.kerberos.login.keytab = "your-kerberos-keytab-path" + } + } +} +``` + +### 单表(使用Hive catalog) + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + score = int + } + primaryKey { + name = "pk_id" + columnNames = [pk_id] + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100] + }, + { + kind = INSERT + fields = [2, "B", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + } + { + kind = UPDATE_BEFORE + fields = [1, "A", 100] + }, + { + kind = UPDATE_AFTER + fields = [1, "A_1", 100] + }, + { + kind = DELETE + fields = [2, "B", 100] + } + ] + } +} + +sink { + Paimon { + schema_save_mode = "RECREATE_SCHEMA" + catalog_name="seatunnel_test" + catalog_type="hive" + catalog_uri="thrift://hadoop04:9083" + warehouse="hdfs:///tmp/seatunnel" + database="seatunnel_test" + table="st_test3" + paimon.hadoop.conf = { + fs.defaultFS = "hdfs://nameservice1" + dfs.nameservices = "nameservice1" + dfs.ha.namenodes.nameservice1 = "nn1,nn2" + dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" + dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" + 
dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + dfs.client.use.datanode.hostname = "true" + } + } +} + +``` + +### 指定paimon的写属性的单表 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role"] + } +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="seatunnel" + table="role" + paimon.table.write-props = { + bucket = 2 + file.format = "parquet" + } + paimon.table.partition-keys = "dt" + paimon.table.primary-keys = "pk_id,dt" + } +} +``` + +### 多表 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="${database_name}" + table="${table_name}" + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Phoenix.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Phoenix.md new file mode 100644 index 000000000000..9a3adc14e5cf --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Phoenix.md @@ -0,0 +1,63 @@ +# Phoenix + +> Phoenix 数据接收器 + +## 描述 + +该接收器是通过 [Jdbc数据连接器](Jdbc.md)来写Phoenix数据,支持批和流两种模式。测试的Phoenix版本为4.xx和5.xx。 +在底层实现上,通过Phoenix的jdbc驱动,执行upsert语句向HBase写入数据。 +使用Java JDBC连接Phoenix有两种方式:其一是使用JDBC连接zookeeper,其二是通过JDBC瘦客户端连接查询服务器。 + +> 提示1: 该接收器默认使用的是(thin)驱动jar包。如果需要使用(thick)驱动或者其他版本的Phoenix(thin)驱动,需要重新编译jdbc数据接收器模块。 +> +> 提示2: 该接收器还不支持精准一次语义(因为Phoenix还不支持XA事务)。 + +## 主要特性 + +- [ ] [精准一次](../../concept/connector-v2-features.md) + +## 接收器选项 + +### driver [string] + +phoenix(thick)驱动:`org.apache.phoenix.jdbc.PhoenixDriver` +phoenix(thin)驱动:`org.apache.phoenix.queryserver.client.Driver` + +### url [string] + +phoenix(thick)驱动:`jdbc:phoenix:localhost:2182/hbase` +phoenix(thin)驱动:`jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` + +### common options + +Sink插件常用参数,请参考[Sink常用选项](common-options.md)获取更多细节信息。 + +## 示例 + +thick驱动: + +``` + Jdbc { + driver = org.apache.phoenix.jdbc.PhoenixDriver + url = "jdbc:phoenix:localhost:2182/hbase" + query = "upsert into test.sink(age, name) values(?, ?)" + } + +``` + +thin驱动: + +``` +Jdbc { + driver = org.apache.phoenix.queryserver.client.Driver + url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" + query = "upsert into test.sink(age, name) values(?, ?)" +} +``` + +## 变更日志 + +### 2.2.0-beta 2022-09-26 + +- 增加Phoenix数据接收器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Pulsar.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Pulsar.md new file mode 100644 index 000000000000..b85a41ae9dfe --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Pulsar.md @@ -0,0 +1,168 @@ +# Pulsar + +> Pulsar 数据连接器 + +## 引擎支持 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 核心特性 + +- [x] [精准一次](../../concept/connector-v2-features.md) + +## 描述 + +Apache Pulsar 的接收连接器。 + +## 支持的数据源信息 + +| 数据源 | 支持的版本 | +|--------|-----------| +| Pulsar | Universal | + +## 输出选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|----------------------|--------|------|---------------------|-----------------------------------------| +| topic | String | Yes | - | 输出到Pulsar主题名称. | +| client.service-url | String | Yes | - | Pulsar 服务的服务 URL 提供者. | +| admin.service-url | String | Yes | - | 管理端点的 Pulsar 服务 HTTP URL. | +| auth.plugin-class | String | No | - | 身份验证插件的名称. | +| auth.params | String | No | - | 身份验证插件的参数. | +| format | String | No | json | 数据格式。默认格式为 json。可选的文本格式. | +| field_delimiter | String | No | , | 自定义数据格式的字段分隔符. | +| semantics | Enum | No | AT_LEAST_ONCE | 写入 pulsar 的一致性语义. | +| transaction_timeout | Int | No | 600 | 默认情况下,事务超时指定为 10 分钟. | +| pulsar.config | Map | No | - | 除了上述必须由 Pulsar 生产者客户端指定的参数外. | +| message.routing.mode | Enum | No | RoundRobinPartition | 要分区的消息的默认路由模式. | +| partition_key_fields | array | No | - | 配置哪些字段用作 pulsar 消息的键. | +| common-options | config | no | - | 源插件常用参数,详见源码 [常用选项](common-options.md). | + +## 参数解释 + +### client.service-url [String] + +Pulsar 服务的 Service URL 提供程序。要使用客户端库连接到 Pulsar, +您需要指定一个 Pulsar 协议 URL。您可以将 Pulsar 协议 URL 分配给特定集群并使用 Pulsar 方案。 + +例如, `localhost`: `pulsar://localhost:6650,localhost:6651`. + +### admin.service-url [String] + +管理端点的 Pulsar 服务 HTTP URL. + +例如, `http://my-broker.example.com:8080`, or `https://my-broker.example.com:8443` for TLS. + +### auth.plugin-class [String] + +身份验证插件的名称。 + +### auth.params [String] + +身份验证插件的参数。 + +例如, `key1:val1,key2:val2` + +### format [String] + +数据格式。默认格式为 json。可选的文本格式。默认字段分隔符为","。如果自定义分隔符,请添加"field_delimiter"选项。 + +### field_delimiter [String] + +自定义数据格式的字段分隔符。默认field_delimiter为','。 + +### semantics [Enum] + +写入 pulsar 的一致性语义。可用选项包括 EXACTLY_ONCE、NON、AT_LEAST_ONCE、默认AT_LEAST_ONCE。 +如果语义被指定为 EXACTLY_ONCE,我们将使用 2pc 来保证消息被准确地发送到 pulsar 一次。 +如果语义指定为 NON,我们将直接将消息发送到 pulsar,如果作业重启/重试或网络错误,数据可能会重复/丢失。 + +### transaction_timeout [Int] + +默认情况下,事务超时指定为 10 分钟。如果事务未在指定的超时时间内提交,则事务将自动中止。因此,您需要确保超时大于检查点间隔。 + +### pulsar.config [Map] + +除了上述 Pulsar 生产者客户端必须指定的参数外,用户还可以为生产者客户端指定多个非强制性参数, +涵盖 Pulsar 官方文档中指定的所有生产者参数。 + +### message.routing.mode [Enum] + +要分区的消息的默认路由模式。可用选项包括 SinglePartition、RoundRobinPartition。 +如果选择 SinglePartition,如果未提供密钥,分区生产者将随机选择一个分区并将所有消息发布到该分区中,如果消息上提供了密钥,则分区生产者将对密钥进行哈希处理并将消息分配给特定分区。 +如果选择 RoundRobinPartition,则如果未提供密钥,则生产者将以循环方式跨所有分区发布消息,以实现最大吞吐量。请注意,轮询不是按单个消息完成的,而是设置为相同的批处理延迟边界,以确保批处理有效。 + +### partition_key_fields [String] + +配置哪些字段用作 pulsar 消息的键。 + +例如,如果要使用上游数据中的字段值作为键,则可以为此属性分配字段名称。 + +上游数据如下: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +如果将 name 设置为键,则 name 列的哈希值将确定消息发送到哪个分区。 + +如果未设置分区键字段,则将向 null 消息键发送至。 + +消息键的格式为 json,如果 name 设置为键,例如 '{“name”:“Jack”}'。 + +所选字段必须是上游的现有字段。 + +### 常见选项 + +源插件常用参数,详见源码[常用选项](common-options.md) . 
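+
+下面给出一个组合使用上述选项的 sink 配置示意(仅作参考:主题名、服务地址与字段名均为假设的示例值,请按实际环境修改):
+
+```hocon
+sink {
+  Pulsar {
+    topic = "example_topic"
+    client.service-url = "pulsar://localhost:6650"
+    admin.service-url = "http://localhost:8080"
+    # 以 name 字段的哈希值决定消息写入的分区,所选字段必须存在于上游数据中
+    partition_key_fields = ["name"]
+    # 开启精准一次语义;transaction_timeout(单位为秒)需大于检查点间隔
+    semantics = "EXACTLY_ONCE"
+    transaction_timeout = 600
+  }
+}
+```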
+ +## 任务示例 + +### 简单: + +> 该示例定义了一个 SeaTunnel 同步任务,该任务通过 FakeSource 自动生成数据并将其发送到 Pulsar Sink。FakeSource 总共生成 16 行数据 (row.num=16),每行有两个字段,name(字符串类型)和 age(int 类型)。最终目标主题是test_topic主题中还将有 16 行数据。 如果您尚未安装和部署 SeaTunnel,则需要按照[安装Seatunnel](../../start-v2/locally/deployment.md) SeaTunnel 中的说明安装和部署 SeaTunnel。然后按照 [SeaTunnel 引擎快速入门](../../start-v2/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 + +```hocon +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Pulsar { + topic = "example" + client.service-url = "localhost:pulsar://localhost:6650" + admin.service-url = "http://my-broker.example.com:8080" + result_table_name = "test" + pulsar.config = { + sendTimeoutMs = 30000 + } + } +} +``` + +## 更改日志 + +### 下一个版本 + +- 添加 Pulsar Sink 连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Rabbitmq.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Rabbitmq.md new file mode 100644 index 000000000000..6562dd2fdc5c --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Rabbitmq.md @@ -0,0 +1,122 @@ +# Rabbitmq + +> Rabbitmq 数据接收器 + +## 描述 + +该数据接收器是将数据写入Rabbitmq。 + +## 主要特性 + +- [ ] [精准一次](../../concept/connector-v2-features.md) + +## 接收器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|----------------------------|---------|------|-------| +| host | string | yes | - | +| port | int | yes | - | +| virtual_host | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| queue_name | string | yes | - | +| url | string | no | - | +| network_recovery_interval | int | no | - | +| topology_recovery_enabled | boolean | no | - | +| automatic_recovery_enabled | boolean | no | - | +| use_correlation_id | boolean | no | false | +| connection_timeout | int | no | - | +| rabbitmq.config | map | no | - | +| common-options | | no | - | + +### host [string] + +Rabbitmq服务器地址 + +### port [int] + +Rabbitmq服务器端口 + +### virtual_host [string] + +virtual host – 连接broker使用的vhost + +### username [string] + +连接broker时使用的用户名 + +### password [string] + +连接broker时使用的密码 + +### url [string] + +设置host、port、username、password和virtual host的简便方式。 + +### queue_name [string] + +数据写入的队列名。 + +### schema [Config] + +#### fields [Config] + +上游数据的模式字段。 + +### network_recovery_interval [int] + +自动恢复需等待多长时间才尝试重连,单位为毫秒。 + +### topology_recovery_enabled [boolean] + +设置为true,表示启用拓扑恢复。 + +### automatic_recovery_enabled [boolean] + +设置为true,表示启用连接恢复。 + +### use_correlation_id [boolean] + +接收到的消息是否都提供唯一ID,来删除重复的消息达到幂等(在失败的情况下) + +### connection_timeout [int] + +TCP连接建立的超时时间,单位为毫秒;0代表不限制。 + +### rabbitmq.config [map] + +In addition to the above parameters that must be specified by the RabbitMQ client, the user can also specify multiple non-mandatory parameters for the client, covering [all the parameters specified in the official RabbitMQ document](https://www.rabbitmq.com/configure.html). 
+除了上面提及必须设置的RabbitMQ客户端参数,你也还可以为客户端指定多个非强制参数,参见 [RabbitMQ官方文档参数设置](https://www.rabbitmq.com/configure.html)。 + +### common options + +Sink插件常用参数,请参考[Sink常用选项](common-options.md)获取更多细节信息。 + +## 示例 + +simple: + +```hocon +sink { + RabbitMQ { + host = "rabbitmq-e2e" + port = 5672 + virtual_host = "/" + username = "guest" + password = "guest" + queue_name = "test1" + rabbitmq.config = { + requested-heartbeat = 10 + connection-timeout = 10 + } + } +} +``` + +## 变更日志 + +### 随后版本 + +- 增加Rabbitmq数据接收器 +- [Improve] 将连接器自定义配置前缀的数据类型更改为Map [3719](https://github.com/apache/seatunnel/pull/3719) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Redis.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Redis.md new file mode 100644 index 000000000000..ac09849b7eb4 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/Redis.md @@ -0,0 +1,157 @@ +# Redis + +> Redis sink connector + +## 描述 + +用于将数据写入 Redis。 + +## 主要功能 + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## 选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|----------------|--------|---------------------|--------| +| host | string | 是 | - | +| port | int | 是 | - | +| key | string | 是 | - | +| data_type | string | 是 | - | +| user | string | 否 | - | +| auth | string | 否 | - | +| db_num | int | 否 | 0 | +| mode | string | 否 | single | +| nodes | list | 当 mode=cluster 时为:是 | - | +| format | string | 否 | json | +| expire | long | 否 | -1 | +| common-options | | 否 | - | + +### host [string] + +Redis 主机地址 + +### port [int] + +Redis 端口 + +### key [string] + +要写入 Redis 的键值。 + +例如,如果想使用上游数据中的某个字段值作为键值,可以将该字段名称指定给 key。 + +上游数据如下: + +| code | data | success | +|------|------|---------| +| 200 | 获取成功 | true | +| 500 | 内部错误 | false | + +如果将字段名称指定为 `code` 并将 data_type 设置为 `key`,将有两个数据写入 Redis: +1. `200 -> {code: 200, message: true, data: 获取成功}` +2. `500 -> {code: 500, message: false, data: 内部错误}` + +如果将字段名称指定为 `value` 并将 data_type 设置为 `key`,则由于上游数据的字段中没有 `value` 字段,将只有一个数据写入 Redis: + +1. 
`value -> {code: 500, message: false, data: 内部错误}` + +请参见 data_type 部分以了解具体的写入规则。 + +当然,这里写入的数据格式只是以 json 为例,具体格式以用户配置的 `format` 为准。 + +### data_type [string] + +Redis 数据类型,支持 `key` `hash` `list` `set` `zset` + +- key + +> 每个来自上游的数据都会更新到配置的 key,这意味着后面的数据会覆盖前面的数据,只有最后的数据会存储在该 key 中。 + +- hash + +> 每个来自上游的数据会根据字段拆分并写入 hash key,后面的数据会覆盖前面的数据。 + +- list + +> 每个来自上游的数据都会被添加到配置的 list key 中。 + +- set + +> 每个来自上游的数据都会被添加到配置的 set key 中。 + +- zset + +> 每个来自上游的数据都会以权重为 1 的方式添加到配置的 zset key 中。因此,zset 中数据的顺序基于数据的消费顺序。 + +### user [string] + +Redis 认证用户,连接加密集群时需要 + +### auth [string] + +Redis 认证密码,连接加密集群时需要 + +### db_num [int] + +Redis 数据库索引 ID,默认连接到 db 0 + +### mode [string] + +Redis 模式,`single` 或 `cluster`,默认是 `single` + +### nodes [list] + +Redis 节点信息,在集群模式下使用,必须按如下格式: + +["host1:port1", "host2:port2"] + +### format [string] + +上游数据的格式,目前只支持 `json`,以后会支持 `text`,默认 `json`。 + +当你指定格式为 `json` 时,例如: + +上游数据如下: + +| code | data | success | +|------|------|---------| +| 200 | 获取成功 | true | + +连接器会生成如下数据并写入 Redis: + +```json +{"code": 200, "data": "获取成功", "success": "true"} +``` + +### expire [long] + +设置 Redis 的过期时间,单位为秒。默认值为 -1,表示键不会自动过期。 + +### common options + +Sink 插件通用参数,请参考 [Sink Common Options](common-options.md) 获取详情 + +## 示例 + +简单示例: + +```hocon +Redis { + host = localhost + port = 6379 + key = age + data_type = list +} +``` + +## 更新日志 + +### 2.2.0-beta 2022-09-26 + +- 添加 Redis Sink Connector + +### 下一个版本 + +- [改进] 支持 Redis 集群模式连接和用户认证 [3188](https://github.com/apache/seatunnel/pull/3188) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/StarRocks.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/StarRocks.md new file mode 100644 index 000000000000..6be7ff7e8e01 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/StarRocks.md @@ -0,0 +1,288 @@ +# StarRocks + +> StarRocks 数据接收器 + +## 引擎支持 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [ ] [精准一次](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## 描述 + +该接收器用于将数据写入到StarRocks中。支持批和流两种模式。 +StarRocks数据接收器内部实现采用了缓存,通过stream load将数据批导入。 + +## 接收器选项 + +| 名称 | 类型 | 是否必须 | 默认值 | Description | +|-----------------------------|---------|------|------------------------------|---------------------------------------------------------------------------------------------------------------------| +| nodeUrls | list | yes | - | `StarRocks`集群地址, 格式为 `["fe_ip:fe_http_port", ...]` | +| base-url | string | yes | - | JDBC URL样式的连接信息。如:`jdbc:mysql://localhost:9030/` 或 `jdbc:mysql://localhost:9030` 或 `jdbc:mysql://localhost:9030/db` | +| username | string | yes | - | 目标`StarRocks` 用户名 | +| password | string | yes | - | 目标`StarRocks` 密码 | +| database | string | yes | - | 指定目标 StarRocks 表所在的数据库的名称 | +| table | string | no | - | 指定目标 StarRocks 表的名称, 如果没有设置该值,则表名与上游表名相同 | +| labelPrefix | string | no | - | StarRocks stream load作业标签前缀 | +| batch_max_rows | long | no | 1024 | 在批写情况下,当缓冲区数量达到`batch_max_rows`数量或`batch_max_bytes`字节大小或者时间达到`checkpoint.interval`时,数据会被刷新到StarRocks | +| batch_max_bytes | int | no | 5 * 1024 * 1024 | 在批写情况下,当缓冲区数量达到`batch_max_rows`数量或`batch_max_bytes`字节大小或者时间达到`checkpoint.interval`时,数据会被刷新到StarRocks | +| max_retries | int | no | - | 数据写入StarRocks失败后的重试次数 | +| retry_backoff_multiplier_ms | int | no | - | 用作生成下一个退避延迟的乘数 | +| max_retry_backoff_ms | int | no | - | 向StarRocks发送重试请求之前的等待时长 | +| enable_upsert_delete | boolean | no | false | 是否开启upsert/delete事件的同步,仅仅支持主键模型的表 | +| save_mode_create_template | string | no | 参见表下方的说明 | 参见表下方的说明 | +| starrocks.config | map | no | - | stream load `data_desc`参数 | +| http_socket_timeout_ms | int | no | 180000 | http socket超时时间,默认为3分钟 | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | 在同步任务打开之前,针对目标端已存在的表结构选择不同的处理方法 | +| data_save_mode | Enum | no | APPEND_DATA | 在同步任务打开之前,针对目标端已存在的数据选择不同的处理方法 | +| custom_sql | String | no | - | 当data_save_mode设置为CUSTOM_PROCESSING时,必须同时设置CUSTOM_SQL参数。CUSTOM_SQL的值为可执行的SQL语句,在同步任务开启前SQL将会被执行 | + +### save_mode_create_template + +StarRocks数据接收器使用模板,在需求需要的时候也可以修改模板,并结合上游数据类型和结构生成表的创建语句来自动创建StarRocks表。当前仅在多表模式下有效。 + +默认模板如下: + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` ( +${rowtype_primary_key}, +${rowtype_fields} +) ENGINE=OLAP +PRIMARY KEY (${rowtype_primary_key}) +DISTRIBUTED BY HASH (${rowtype_primary_key})PROPERTIES ( +"replication_num" = "1" +) +``` + +在模板中添加自定义字段,比如说加上`id`字段的修改模板如下: + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` +( + id, + ${rowtype_fields} +) ENGINE = OLAP DISTRIBUTED BY HASH (${rowtype_primary_key}) + PROPERTIES +( + "replication_num" = "1" +); +``` + +StarRocks数据接收器根据上游数据自动获取相应的信息来填充模板,并且会移除`rowtype_fields`中的id字段信息。使用此方法可用来为自定义字段修改类型及相关属性。 + +可以使用的占位符有: + +- database: 上游数据模式的库名称 +- table_name: 上游数据模式的表名称 +- rowtype_fields: 上游数据模式的所有字段信息,连接器会将字段信息自动映射到StarRocks对应的类型 +- rowtype_primary_key: 上游数据模式的主键信息,结果可能是列表 +- rowtype_unique_key: 上游数据模式的唯一键信息,结果可能是列表 + +### table [string] + +使用选项参数`database`和`table-name`自动生成SQL,并接收上游输入数据写入StarRocks中。 + +此选项与 `query` 是互斥的,具具有更高的优先级。 + +table选项参数可以填入一任意表名,这个名字最终会被用作目标表的表名,并且支持变量(`${table_name}`,`${schema_name}`)。 +替换规则如下:`${schema_name}` 将替换传递给目标端的 SCHEMA 名称,`${table_name}` 将替换传递给目标端的表名。 + +例如: +1. test_${schema_name}_${table_name}_test +2. sink_sinktable +3. 
ss_${table_name} + +### schema_save_mode[Enum] + +在同步任务打开之前,针对目标端已存在的表结构选择不同的处理方法。可选值有: +`RECREATE_SCHEMA` :不存在的表会直接创建,已存在的表会删除并根据参数重新创建 +`CREATE_SCHEMA_WHEN_NOT_EXIST` :忽略已存在的表,不存在的表会直接创建 +`ERROR_WHEN_SCHEMA_NOT_EXIST` :当有不存在的表时会直接报错 + +### data_save_mode[Enum] + +在同步任务打开之前,针对目标端已存在的数据选择不同的处理方法。可选值有: +`DROP_DATA`: 保存数据库结构,但是会删除表中存量数据 +`APPEND_DATA`:保存数据库结构和相关的表存量数据 +`CUSTOM_PROCESSING`:自定义处理 +`ERROR_WHEN_DATA_EXISTS`:当对应表存在数据时直接报错 + +### custom_sql[String] + +当data_save_mode设置为CUSTOM_PROCESSING时,必须同时设置CUSTOM_SQL参数。CUSTOM_SQL的值为可执行的SQL语句,在同步任务开启前SQL将会被执行。 + +## 数据类型映射 + +| StarRocks数据类型 | SeaTunnel数据类型 | +|---------------|---------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT | +| INT | INT | +| BIGINT | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| DATE | STRING | +| TIME | STRING | +| DATETIME | STRING | +| STRING | STRING | +| ARRAY | STRING | +| MAP | STRING | +| BYTES | STRING | + +#### 支持导入的数据格式 + +StarRocks数据接收器支持的格式有CSV和JSON格式。 + +## 任务示例 + +### 简单示例 + +> 接下来给出一个示例,该示例包含多种数据类型的数据写入,且用户需要为目标端下游创建相应表 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + batch_max_rows = 10 + starrocks.config = { + format = "JSON" + strip_outer_array = true + } + } +} +``` + +### 支持写入cdc变更事件(INSERT/UPDATE/DELETE)示例 + +```hocon +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + ... 
+ + // 支持upsert/delete事件的同步(需要将选项参数enable_upsert_delete设置为true),仅支持表引擎为主键模型 + enable_upsert_delete = true + } +} +``` + +### JSON格式数据导入示例 + +``` +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + batch_max_rows = 10 + starrocks.config = { + format = "JSON" + strip_outer_array = true + } + } +} + +``` + +### CSV格式数据导入示例 + +``` +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + batch_max_rows = 10 + starrocks.config = { + format = "CSV" + column_separator = "\\x01" + row_delimiter = "\\x02" + } + } +} +``` + +### 使用save_mode的示例 + +``` +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "test_${schema_name}_${table_name}" + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + batch_max_rows = 10 + starrocks.config = { + format = "CSV" + column_separator = "\\x01" + row_delimiter = "\\x02" + } + } +} +``` + +## 变更日志 + +### 随后版本 + +- 增加StarRocks数据接收器 +- [Improve] 将连接器自定义配置前缀的数据类型更改为Map [3719](https://github.com/apache/seatunnel/pull/3719) +- [Feature] 支持写入cdc变更事件(INSERT/UPDATE/DELETE) [3865](https://github.com/apache/seatunnel/pull/3865) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/common-options.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/common-options.md new file mode 100644 index 000000000000..8569b46da0e8 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/sink/common-options.md @@ -0,0 +1,58 @@ +# Sink 常用选项 + +> Sink 连接器常用参数 + +| 名称 | 类型 | 是否需要 | 默认值 | +|-------------------|--------|------|-----| +| source_table_name | string | 否 | - | +| parallelism | int | 否 | - | + +### source_table_name [string] + +当不指定 `source_table_name` 时,当前插件处理配置文件中上一个插件输出的数据集 `dataset` + +当指定了 `source_table_name` 时,当前插件正在处理该参数对应的数据集 + +### parallelism [int] + +当没有指定`parallelism`时,默认使用 env 中的 `parallelism`。 + +当指定 `parallelism` 时,它将覆盖 env 中的 `parallelism`。 + +## Examples + +```bash +source { + FakeSourceStream { + parallelism = 2 + result_table_name = "fake" + field_name = "name,age" + } +} + +transform { + Filter { + source_table_name = "fake" + fields = [name] + result_table_name = "fake_name" + } + Filter { + source_table_name = "fake" + fields = [age] + result_table_name = "fake_age" + } +} + +sink { + Console { + source_table_name = "fake_name" + } + Console { + source_table_name = "fake_age" + } +} +``` + +> 如果作业只有一个 source 和一个(或零个)transform 和一个 sink ,则不需要为连接器指定 `source_table_name` 和 `result_table_name`。 +> 如果 source 、transform 和 sink 中任意运算符的数量大于 1,则必须为作业中的每个连接器指定 `source_table_name` 和 `result_table_name` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source.md new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Hbase.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Hbase.md new file mode 100644 index 000000000000..5f15a30b99aa --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Hbase.md @@ -0,0 +1,96 @@ +# Hbase + +> Hbase 源连接器 + +## 描述 + +从 Apache Hbase 读取数据。 + +## 主要功能 + +- [x] [批处理](../../concept/connector-v2-features.md) +- [ ] 
[流处理](../../concept/connector-v2-features.md) +- [ ] [精确一次](../../concept/connector-v2-features.md) +- [x] [Schema](../../concept/connector-v2-features.md) +- [x] [并行度](../../concept/connector-v2-features.md) +- [ ] [支持用户定义的拆分](../../concept/connector-v2-features.md) + +## 选项 + +| 名称 | 类型 | 必填 | 默认值 | +|--------------------|---------|----|-------| +| zookeeper_quorum | string | 是 | - | +| table | string | 是 | - | +| schema | config | 是 | - | +| hbase_extra_config | string | 否 | - | +| caching | int | 否 | -1 | +| batch | int | 否 | -1 | +| cache_blocks | boolean | 否 | false | +| common-options | | 否 | - | + +### zookeeper_quorum [string] + +hbase的zookeeper集群主机,例如:“hadoop001:2181,hadoop002:2181,hadoop003:2181” + +### table [string] + +要写入的表名,例如:“seatunnel” + +### schema [config] + +Hbase 使用字节数组进行存储。因此,您需要为表中的每一列配置数据类型。有关更多信息,请参阅:[guide](../../concept/schema-feature.md#how-to-declare-type-supported)。 + +### hbase_extra_config [config] + +hbase 的额外配置 + +### caching + +caching 参数用于设置在扫描过程中一次从服务器端获取的行数。这可以减少客户端与服务器之间的往返次数,从而提高扫描效率。默认值:-1 + +### batch + +batch 参数用于设置在扫描过程中每次返回的最大列数。这对于处理有很多列的行特别有用,可以避免一次性返回过多数据,从而节省内存并提高性能。 + +### cache_blocks + +cache_blocks 参数用于设置在扫描过程中是否缓存数据块。默认情况下,HBase 会在扫描时将数据块缓存到块缓存中。如果设置为 false,则在扫描过程中不会缓存数据块,从而减少内存的使用。在SeaTunnel中默认值为: false + +### 常用选项 + +Source 插件常用参数,具体请参考 [Source 常用选项](common-options.md) + +## 示例 + +```bash +source { + Hbase { + zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" + table = "seatunnel_test" + caching = 1000 + batch = 100 + cache_blocks = false + schema = { + columns = [ + { + name = "rowkey" + type = string + }, + { + name = "columnFamily1:column1" + type = boolean + }, + { + name = "columnFamily1:column2" + type = double + }, + { + name = "columnFamily2:column1" + type = bigint + } + ] + } + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/HdfsFile.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/HdfsFile.md new file mode 100644 index 000000000000..efce1d140171 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/HdfsFile.md @@ -0,0 +1,127 @@ +# Hdfs文件 + +> Hdfs文件 数据源连接器 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [x] [批处理](../../concept/connector-v2-features.md) +- [ ] [流处理](../../concept/connector-v2-features.md) +- [x] [精确一次](../../concept/connector-v2-features.md) + +在一次 pollNext 调用中读取分片中的所有数据。将读取的分片保存在快照中。 + +- [x] [列投影](../../concept/connector-v2-features.md) +- [x] [并行度](../../concept/connector-v2-features.md) +- [ ] [支持用户定义的分片](../../concept/connector-v2-features.md) +- [x] 文件格式 + - [x] 文本 + - [x] CSV + - [x] Parquet + - [x] ORC + - [x] JSON + - [x] Excel + +## 描述 + +从Hdfs文件系统中读取数据。 + +## 支持的数据源信息 + +| 数据源 | 支持的版本 | +|--------|------------------| +| Hdfs文件 | hadoop 2.x 和 3.x | + +## 源选项 + +| 名称 | 类型 | 是否必须 | 默认值 | 描述 | +|---------------------------|---------|------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | 是 | - | 源文件路径。 | +| file_format_type | string | 是 | - | 我们支持以下文件类型:`text` `json` `csv` `orc` `parquet` `excel`。请注意,最终文件名将以文件格式的后缀结束,文本文件的后缀是 `txt`。 | +| fs.defaultFS | string | 是 | - | 以 `hdfs://` 开头的 Hadoop 集群地址,例如:`hdfs://hadoopcluster`。 | +| read_columns | list | 否 | - | 数据源的读取列列表,用户可以使用它实现字段投影。支持的文件类型的列投影如下所示:[text,json,csv,orc,parquet,excel]。提示:如果用户在读取 `text` `json` `csv` 文件时想要使用此功能,必须配置 schema 选项。 | +| hdfs_site_path | string | 否 | - | `hdfs-site.xml` 的路径,用于加载 namenodes 的 ha 配置。 | +| delimiter/field_delimiter | string | 否 | \001 | 字段分隔符,用于告诉连接器在读取文本文件时如何切分字段。默认 `\001`,与 Hive 的默认分隔符相同。 | +| parse_partition_from_path | boolean | 否 | true | 控制是否从文件路径中解析分区键和值。例如,如果您从路径 `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` 读取文件,则来自文件的每条记录数据将添加这两个字段:[name:tyrantlucifer,age:26]。提示:不要在 schema 选项中定义分区字段。 | +| date_format | string | 否 | yyyy-MM-dd | 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持的格式如下:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`,默认 `yyyy-MM-dd`。日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持的格式如下:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`,默认 `yyyy-MM-dd HH:mm:ss`。 | +| time_format | string | 否 | HH:mm:ss | 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持的格式如下:`HH:mm:ss` `HH:mm:ss.SSS`,默认 `HH:mm:ss`。 | +| remote_user | string | 否 | - | 用于连接 Hadoop 的登录用户。它旨在用于 RPC 中的远程用户,不会有任何凭据。 | +| krb5_path | string | 否 | /etc/krb5.conf | kerberos 的 krb5 路径。 | +| kerberos_principal | string | 否 | - | kerberos 的 principal。 | +| kerberos_keytab_path | string | 否 | - | kerberos 的 keytab 路径。 | +| skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于 txt 和 csv。例如,设置如下:`skip_header_row_number = 2`。然后 Seatunnel 将跳过源文件中的前两行。 | +| schema | config | 否 | - | 上游数据的模式字段。 | +| sheet_name | string | 否 | - | 读取工作簿的表格,仅在文件格式为 excel 时使用。 | +| compress_codec | string | 否 | none | 文件的压缩编解码器。 | +| common-options | | 否 | - | 源插件通用参数,请参阅 [源通用选项](../../../en/connector-v2/source/common-options.md) 获取详细信息。 | + +### delimiter/field_delimiter [string] + +**delimiter** 参数在版本 2.3.5 后将被弃用,请改用 **field_delimiter**。 + +### compress_codec [string] + +文件的压缩编解码器及支持的详细信息如下所示: + +- txt:`lzo` `none` +- json:`lzo` `none` +- csv:`lzo` `none` +- orc/parquet: + 自动识别压缩类型,无需额外设置。 + +### 提示 + +> 如果您使用 spark/flink,为了 + +使用此连接器,您必须确保您的 spark/flink 集群已经集成了 hadoop。测试过的 hadoop 版本是 2.x。如果您使用 SeaTunnel Engine,则在下载和安装 SeaTunnel Engine 时会自动集成 hadoop jar。您可以检查 `${SEATUNNEL_HOME}/lib` 下的 jar 包来确认这一点。 + +## 任务示例 + +### 简单示例: + +> 此示例定义了一个 SeaTunnel 同步任务,从 Hdfs 中读取数据并将其发送到 Hdfs。 + +``` +# 定义运行时环境 +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + HdfsFile { + schema { + fields { + name = string + 
age = int + } + } + path = "/apps/hive/demo/student" + type = "json" + fs.defaultFS = "hdfs://namenode001" + } + # 如果您想获取有关如何配置 seatunnel 和查看源插件完整列表的更多信息, + # 请访问 https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # 如果您想获取有关如何配置 seatunnel 和查看转换插件完整列表的更多信息, + # 请访问 https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format = "orc" + } + # 如果您想获取有关如何配置 seatunnel 和查看接收器插件完整列表的更多信息, + # 请访问 https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Sls.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Sls.md new file mode 100644 index 000000000000..d0e10257258f --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/Sls.md @@ -0,0 +1,87 @@ +# Sls + +> Sls source connector + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 主要特性 + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## 描述 + +从阿里云Sls日志服务中读取数据。 + +## 支持的数据源信息 + +为了使用Sls连接器,需要以下依赖关系。 +它们可以通过install-plugin.sh或Maven中央存储库下载。 + +| 数据源 | 支持的版本 | Maven | +|-----|-----------|-----------------------------------------------------------------------------------------------------------| +| Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-sls) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------------------|---------------------------------------------|----------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------| +| project | String | Yes | - | [阿里云 Sls 项目](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) | +| logstore | String | Yes | - | [阿里云 Sls 日志库](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) | +| endpoint | String | Yes | - | [阿里云访问服务点](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) | +| access_key_id | String | Yes | - | [阿里云访问用户ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | +| access_key_secret | String | Yes | - | [阿里云访问用户密码](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | +| start_mode | StartMode[earliest],[group_cursor],[latest] | No | group_cursor | 消费者的初始消费模式 | +| consumer_group | String | No | SeaTunnel-Consumer-Group | Sls消费者组id,用于区分不同的消费者组 | +| auto_cursor_reset | CursorMode[begin],[end] | No | end | 当消费者组中没有记录读取游标时,初始化读取游标 | +| batch_size | Int | No | 1000 | 每次从SLS中读取的数据量 | +| partition-discovery.interval-millis | Long | No | -1 | 动态发现主题和分区的间隔 | + +## 任务示例 + +### 简单示例 + +> 此示例读取sls的logstore1的数据并将其打印到客户端。如果您尚未安装和部署SeaTunnel,则需要按照安装SeaTunnel中的说明安装和部署SeaTunnel。然后按照[快速启动SeaTunnel引擎](../../Start-v2/locale/Quick-Start SeaTunnel Engine.md)中的说明运行此作业。 + +[创建RAM用户及授权](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4), 请确认RAM用户有足够的权限来读取及管理数据,参考:[RAM自定义授权示例](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b) + +```hocon +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "STREAMING" + checkpoint.interval = 30000 +} + +source { + Sls { + endpoint = "cn-hangzhou-intranet.log.aliyuncs.com" + project = "project1" + logstore = "logstore1" + access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx" + access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + schema = { + fields = { + id = "int" + name = "string" + description = "string" + weight = "string" + } + } + } +} + +sink { + Console { + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/common-options.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/common-options.md new file 
mode 100644 index 000000000000..902dca2c1953 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/connector-v2/source/common-options.md @@ -0,0 +1,81 @@ +# Source Common Options + +> Source connector 的常用参数 + +| 名称 | 类型 | 必填 | 默认值 | 描述 | +|-------------------|--------|----|-----|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| result_table_name | String | 否 | - | 当未指定 `result_table_name` 时,此插件处理的数据将不会被注册为可由其他插件直接访问的数据集 `(dataStream/dataset)`,或称为临时表 `(table)`。
当指定了 `result_table_name` 时,此插件处理的数据将被注册为可由其他插件直接访问的数据集 `(dataStream/dataset)`,或称为临时表 `(table)`。此处注册的数据集 `(dataStream/dataset)` 可通过指定 `source_table_name` 直接被其他插件访问。 | +| parallelism | Int | 否 | - | 当未指定 `parallelism` 时,默认使用环境中的 `parallelism`。
当指定了 `parallelism` 时,将覆盖环境中的 `parallelism` 设置。 | + +# 重要提示 + +在作业配置中使用 `result_table_name` 时,必须设置 `source_table_name` 参数。 + +## 任务示例 + +### 简单示例 + +> 注册一个流或批处理数据源,并在注册时返回表名 `fake_table` + +```bash +source { + FakeSourceStream { + result_table_name = "fake_table" + } +} +``` + +### 复杂示例 + +> 这是将Fake数据源转换并写入到两个不同的目标中 + +```bash +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + age = "int" + c_timestamp = "timestamp" + c_date = "date" + c_map = "map" + c_array = "array" + c_decimal = "decimal(30, 8)" + c_row = { + c_row = { + c_int = int + } + } + } + } + } +} + +transform { + Sql { + source_table_name = "fake" + result_table_name = "fake1" + # 查询表名必须与字段 'source_table_name' 相同 + query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from fake" + } + # SQL 转换支持基本函数和条件操作 + # 但不支持复杂的 SQL 操作,包括:多源表/行 JOIN 和聚合操作等 +} + +sink { + Console { + source_table_name = "fake1" + } + Console { + source_table_name = "fake" + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/coding-guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/coding-guide.md new file mode 100644 index 000000000000..8ee04d4374a5 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/coding-guide.md @@ -0,0 +1,111 @@ +# 编码指南 + +本指南整体介绍了当前 Apache SeaTunnel 的模块和提交一个高质量 pull request 的最佳实践。 + +## 模块概述 + +| 模块名 | 介绍 | +|----------------------------------------|--------------------------------------------------------------------| +| seatunnel-api | SeaTunnel connector V2 API 模块 | +| seatunnel-common | SeaTunnel 通用模块 | +| seatunnel-connectors-v2 | SeaTunnel connector V2 模块, connector V2 处于社区重点开发中 | +| seatunnel-core/seatunnel-spark-starter | SeaTunnel connector V2 的 Spark 引擎核心启动模块 | +| seatunnel-core/seatunnel-flink-starter | SeaTunnel connector V2 的 Flink 引擎核心启动模块 | +| seatunnel-core/seatunnel-starter | SeaTunnel connector V2 的 SeaTunnel 引擎核心启动模块 | +| seatunnel-e2e | SeaTunnel 端到端测试模块 | +| seatunnel-examples | SeaTunnel 本地案例模块, 开发者可以用来单元测试和集成测试 | +| seatunnel-engine | SeaTunnel 引擎模块, seatunnel-engine 是 SeaTunnel 社区新开发的计算引擎,用来实现数据同步 | +| seatunnel-formats | SeaTunnel 格式化模块,用来提供格式化数据的能力 | +| seatunnel-plugin-discovery | SeaTunnel 插件发现模块,用来加载类路径中的SPI插件 | +| seatunnel-transforms-v2 | SeaTunnel transform V2 模块, transform V2 处于社区重点开发中 | +| seatunnel-translation | SeaTunnel translation 模块, 用来适配Connector V2 和其他计算引擎, 例如Spark、Flink等 | + +## 如何提交一个高质量的Pull Request + +1. 创建实体类的时候使用 `lombok` 插件的注解(`@Data` `@Getter` `@Setter` `@NonNull` 等)来减少代码量。在编码过程中优先使用 lombok 插件是一个很好的习惯。 + +2. 如果你需要在类中使用 log4j 打印日志, 优先使用 `lombok` 中的 `@Slf4j` 注解。 + +3. SeaTunnel 使用 Github issue 来跟踪代码问题,包括 bugs 和 改进, 并且使用 Github pull request 来管理代码的审查和合并。所以创建一个清晰的 issue 或者 pull request 能让社区更好的理解开发者的意图,最佳实践如下: + + > [目的] [模块名称] [子模块名称] 描述 + + 1. Pull request 目的包含: `Hotfix`, `Feature`, `Improve`, `Docs`, `WIP`。 请注意如果选择 `WIP`, 你需要使用 github 的 draft pull request。 + 2. Issue 目的包含: `Feature`, `Bug`, `Docs`, `Discuss`。 + 3. 模块名称: 当前 pull request 或 issue 所涉及的模块名称, 例如: `Core`, `Connector-V2`, `Connector-V1`等。 + 4. 子模块名称: 当前 pull request 或 issue 所涉及的子模块名称, 例如:`File` `Redis` `Hbase`等。 + 5. 
描述: 高度概括下当前 pull request 和 issue 要做的事情,尽量见名知意。 + + 提示:**更多内容, 可以参考 [Issue Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#issue) 和 [Pull Request Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#pull-request)** + +4. 代码片段不要重复。 如果一段代码被使用多次,定义多次不是好的选择,最佳实践是把它公共独立出来让其他模块使用。 + +5. 当抛出一个异常时, 需要一起带上提示信息并且使异常的范围尽可能地小。抛出过于广泛的异常会让错误处理变得复杂并且容易包含安全问题。例如,如果你的 connector 在读数据的时候遇到 `IOException`, 合理的做法如下: + + ```java + try { + // read logic + } catch (IOException e) { + throw SeaTunnelORCFormatException("This orc file is corrupted, please check it", e); + } + ``` + +6. Apache 项目的 license 要求很严格, 每个 Apache 项目文件都应该包含一个 license 声明。 在提交 pull request 之前请检查每个新文件都包含 `Apache License Header`。 + + ```java + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + ``` + +7. Apache SeaTunnel 使用 `Spotless` 管理代码风格和格式检查。你可以使用下面的命令来自动修复代码风格问题和格式。 + + ```shell + ./mvnw spotless:apply + ``` + +8. 提交 pull request 之前,确保修改后项目编译正常,使用下面命令打包整个项目: + + ```shell + # 多线程编译 + ./mvnw -T 1C clean package + ``` + + ```shell + # 单线程编译 + ./mvnw clean package + ``` + +9. 提交 pull request 之前,在本地用完整的单元测试和集成测试来检查你的功能性是否正确,最佳实践是用 `seatunnel-examples` 模块的例子去检查多引擎是否正确运行并且结果正确。 + +10. 如果提交的 pull request 是一个新的特性, 请记得更新文档。 + +11. 提交 connector 相关的 pull request, 可以通过写 e2e 测试保证鲁棒性,e2e 测试需要包含所有的数据类型,并且初始化尽可能小的 docker 镜像,sink 和 source 的测试用例可以写在一起减少资源的损耗。 可以参考这个不错的例子: [MongodbIT.java](https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java) + +12. 类中默认的权限需要使用 `private`, 不可修改的需要设置 `final`, 特殊场景除外。 + +13. 类中的属性和方法参数倾向于使用基本数据类型(int boolean double float...), 而不是包装类型(Integer Boolean Double Float...), 特殊情况除外。 + +14. 开发一个 sink connector 的时候你需要知道 sink 需要被序列化,如果有不能被序列化的属性, 需要包装到一个类中,并且使用单例模式。 + +15. 如果代码中有多个 `if` 流程判断, 尽量简化为多个 if 而不是 if-else-if。 + +16. Pull request 具有单一职责的特点, 不允许在 pull request 包含与该功能无关的代码, 如果有这种情况, 需要在提交 pull request 之前单独处理好, 否则 Apache SeaTunnel 社区会主动关闭 pull request。 + +17. 贡献者需要对自己的 pull request 负责。 如果 pull request 包含新的特性, 或者修改了老的特性,增加测试用例或者 e2e 用例来证明合理性和保护完整性是一个很好的做法。 + +18. 
如果你认为社区当前某部分代码不合理(尤其是核心的 `core` 和 `api` 模块),有函数需要更新修改,优先使用 `discuss issue` 和 `email` 与社区讨论是否有必要修改,社区同意后再提交 pull request, 请不要不经讨论直接提交 pull request, 社区会认为无效并且关闭。 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-plugin.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-plugin.md new file mode 100644 index 000000000000..514355840d03 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-plugin.md @@ -0,0 +1,5 @@ +# 贡献 Connector-v2 插件 + +如果你想要贡献 Connector-V2, 可以参考下面的 Connector-V2 贡献指南。 可以帮你快速进入开发。 + +[Connector-v2 贡献指南](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-transform-v2-guide.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-transform-v2-guide.md new file mode 100644 index 000000000000..ad02b9e977c1 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/contribute-transform-v2-guide.md @@ -0,0 +1,321 @@ +# 贡献 Transform 指南 + +本文描述了如何理解、开发和贡献一个 transform。 + +我们也提供了 [Transform E2E Test](../../../seatunnel-e2e/seatunnel-transforms-v2-e2e) +来验证 transform 的数据输入和输出。 + +## 概念 + +在 SeaTunnel 中你可以通过 connector 读写数据, 但如果你需要在读取数据后或者写入数据前处理数据, 你需要使用 transform。 + +使用 transform 可以简单修改数据行和字段, 例如拆分字段、修改字段的值或者删除字段。 + +### 类型转换 + +Transform 从上游(source 或者 transform)获取类型输入,然后给下游(sink 或者 transform)输出新的类型,这个过程就是类型转换。 + +案例 1:删除字段 + +```shell +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + +| A | B | +|-----------|-----------| +| STRING | INT | +``` + +案例 2:字段排序 + +```shell +| B | C | A | +|-----------|-----------|-----------| +| INT | BOOLEAN | STRING | + +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | +``` + +案例 3:修改字段类型 + +```shell +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + + +| A | B | C | +|-----------|-----------|-----------| +| STRING | STRING | STRING | +``` + +案例 4:添加新的字段 + +```shell +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + + +| A | B | C | D | +|-----------|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | DOUBLE | +``` + +### 数据转换 + +转换类型后,Transform 会从上游(source 或者 transform)获取数据行, 使用[新的数据类型](#类型转换)编辑数据后输出到下游(sink 或者 transform)。这个过程叫数据转换。 + +### 翻译 + +Transform 已经从 execution engine 中解耦, 任何 transform 实现可以不需要修改和配置的适用所有引擎, 这就需要翻译层来做 transform 和 execution engine 的适配。 + +案例:翻译数据类型和数据 + +```shell +原始数据: + +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + +类型转换: + +| A | B | C | +|-------------------|-------------------|-------------------| +| ENGINE | ENGINE | ENGINE | + +数据转换: + +| A | B | C | +|-------------------|-------------------|-------------------| +| ENGINE<"test"> | ENGINE<1> | ENGINE | +``` + +## 核心 APIs + +### SeaTunnelTransform + +`SeaTunnelTransform` 提供了所有主要的 API, 你可以继承它实现任何转换。 + +1. 从上游获取数据类型。 + +```java +/** + * Set the data type info of input data. + * + * @param inputDataType The data type info of upstream input. + */ + void setTypeInfo(SeaTunnelDataType inputDataType); +``` + +2. 输出新的数据类型给下游。 + +```java +/** + * Get the data type of the records produced by this transform. + * + * @return Produced data type. + */ +SeaTunnelDataType getProducedType(); +``` + +3. 修改输入数据并且输出新的数据到下游。 + +```java +/** + * Transform input data to {@link this#getProducedType()} types data. 
+ * + * @param row the data need be transform. + * @return transformed data. + */ +T map(T row); +``` + +### SingleFieldOutputTransform + +`SingleFieldOutputTransform` 抽象了一个单字段修改操作 + +1. 定义输出字段 + +```java +/** + * Outputs new field + * + * @return + */ +protected abstract String getOutputFieldName(); +``` + +2. 定义输出字段类型 + +```java +/** + * Outputs new field datatype + * + * @return + */ +protected abstract SeaTunnelDataType getOutputFieldDataType(); +``` + +3. 定义输出字段值 + +```java +/** + * Outputs new field value + * + * @param inputRow The inputRow of upstream input. + * @return + */ +protected abstract Object getOutputFieldValue(SeaTunnelRowAccessor inputRow); +``` + +### MultipleFieldOutputTransform + +`MultipleFieldOutputTransform` 抽象了多字段修改操作 + +1. 定义多个输出的字段 + +```java +/** + * Outputs new fields + * + * @return + */ +protected abstract String[] getOutputFieldNames(); +``` + +2. 定义输出字段的类型 + +```java +/** + * Outputs new fields datatype + * + * @return + */ +protected abstract SeaTunnelDataType[] getOutputFieldDataTypes(); +``` + +3. 定义输出字段的值 + +```java +/** + * Outputs new fields value + * + * @param inputRow The inputRow of upstream input. + * @return + */ +protected abstract Object[] getOutputFieldValues(SeaTunnelRowAccessor inputRow); +``` + +### AbstractSeaTunnelTransform + +`AbstractSeaTunnelTransform` 抽象了数据类型和字段的修改操作 + +1. 转换输入的行类型到新的行类型 + +```java +/** + * Outputs transformed row type. + * + * @param inputRowType upstream input row type + * @return + */ +protected abstract SeaTunnelRowType transformRowType(SeaTunnelRowType inputRowType); +``` + +2. 转换输入的行数据到新的行数据 + +```java +/** + * Outputs transformed row data. + * + * @param inputRow upstream input row data + * @return + */ +protected abstract SeaTunnelRow transformRow(SeaTunnelRow inputRow); +``` + +## 开发一个 Transform + +Transform 必须实现下面其中一个 API: +- SeaTunnelTransform +- AbstractSeaTunnelTransform +- SingleFieldOutputTransform +- MultipleFieldOutputTransform + +将实现类放入模块 `seatunnel-transforms-v2`。 + +### 案例: 拷贝字段到一个新的字段 + +```java +@AutoService(SeaTunnelTransform.class) +public class CopyFieldTransform extends SingleFieldOutputTransform { + + private String srcField; + private int srcFieldIndex; + private SeaTunnelDataType srcFieldDataType; + private String destField; + + @Override + public String getPluginName() { + return "Copy"; + } + + @Override + protected void setConfig(Config pluginConfig) { + this.srcField = pluginConfig.getString("src_field"); + this.destField = pluginConfig.getString("dest_fields"); + } + + @Override + protected void setInputRowType(SeaTunnelRowType inputRowType) { + srcFieldIndex = inputRowType.indexOf(srcField); + srcFieldDataType = inputRowType.getFieldType(srcFieldIndex); + } + + @Override + protected String getOutputFieldName() { + return destField; + } + + @Override + protected SeaTunnelDataType getOutputFieldDataType() { + return srcFieldDataType; + } + + @Override + protected Object getOutputFieldValue(SeaTunnelRowAccessor inputRow) { + return inputRow.getField(srcFieldIndex); + } +} +``` + +1. `getPluginName` 方法用来定义 transform 的名字。 +2. @AutoService 注解用来自动生成 `META-INF/services/org.apache.seatunnel.api.transform.SeaTunnelTransform` 文件 +3. 
`setConfig` 方法用来注入用户配置。 + +## Transform 测试工具 + +当你添加了一个新的插件, 推荐添加一个 e2e 测试用例来测试。 +我们有 `seatunnel-e2e/seatunnel-transforms-v2-e2e` 来帮助你实现。 + +例如, 如果你想要添加一个 `CopyFieldTransform` 的测试用例, 你可以在 `seatunnel-e2e/seatunnel-transforms-v2-e2e` +模块中添加一个新的测试用例, 并且在用例中继承 `TestSuiteBase` 类。 + +```java +public class TestCopyFieldTransformIT extends TestSuiteBase { + + @TestTemplate + public void testCopyFieldTransform(TestContainer container) { + Container.ExecResult execResult = container.executeJob("/copy_transform.conf"); + Assertions.assertEquals(0, execResult.getExitCode()); + } +} +``` + +一旦你的测试用例实现了 `TestSuiteBase` 接口, 并且添加 `@TestTemplate` 注解,它会在所有引擎运行作业,你只需要用你自己的 SeaTunnel 配置文件执行 executeJob 方法, +它会提交 SeaTunnel 作业。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/how-to-create-your-connector.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/how-to-create-your-connector.md new file mode 100644 index 000000000000..3aef1b140c27 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/how-to-create-your-connector.md @@ -0,0 +1,4 @@ +## 开发自己的Connector + +如果你想针对SeaTunnel新的连接器API开发自己的连接器(Connector V2),请查看[这里](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.zh.md) 。 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/new-license.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/new-license.md new file mode 100644 index 000000000000..d39019f25b7c --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/new-license.md @@ -0,0 +1,53 @@ +# 如何添加新的 License + +### ASF 第三方许可政策 + +如果您打算向SeaTunnel(或其他Apache项目)添加新功能,并且该功能涉及到其他开源软件引用的时候,请注意目前 Apache 项目支持遵从以下协议的开源软件。 + +[ASF 第三方许可政策](https://apache.org/legal/resolved.html) + +如果您所使用的第三方软件并不在以上协议之中,那么很抱歉,您的代码将无法通过审核,建议您找寻其他替代方案。 + +### 如何在 SeaTunnel 中合法使用第三方开源软件 + +当我们想要引入一个新的第三方软件(包含但不限于第三方的 jar、文本、CSS、js、图片、图标、音视频等及在第三方基础上做的修改)至我们的项目中的时候,除了他们所遵从的协议是 Apache 允许的,另外一点很重要,就是合法的使用。您可以参考以下文章 + +* [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) + +举个例子,当我们使用了 ZooKeeper,那么我们项目就必须包含 ZooKeeper 的 NOTICE 文件(每个开源项目都会有 NOTICE 文件,一般位于根目录),用Apache的话来讲,就是 "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work. + +关于具体的各个开源协议使用协议,在此不做过多篇幅一一介绍,有兴趣可以自行查询了解。 + +### SeaTunnel-License 检测规则 + +通常情况下, 我们会为项目添加 License-check 脚本。 跟其他开源项目略有不同,SeaTunnel 使用 [SkyWalking](https://github.com/apache/skywalking) 提供的 SeaTunnel-License-Check。 总之,我们试图第一时间避免 License 问题。 + +当我们需要添加新的 jar 包或者使用外部资源时, 我们需要按照以下步骤进行操作: + +* 在 known-dependencies.txt 文件中添加 jar 的名称和版本 +* 在 'seatunnel-dist/release-docs/LICENSE' 目录下添加相关 maven 仓库地址 +* 在 'seatunnel-dist/release-docs/NOTICE' 目录下添加相关的 NOTICE 文件, 并确保他们跟原来的仓库中的文件没有区别 +* 在 'seatunnel-dist/release-docs/licenses' 目录下添加相关源码协议文件, 并且文件命令遵守 license-filename.txt 规则。 例:license-zk.txt +* 检查依赖的 license 是否出错 + +``` +--- /dev/fd/63 2020-12-03 03:08:57.191579482 +0000 ++++ /dev/fd/62 2020-12-03 03:08:57.191579482 +0000 +@@ -1,0 +2 @@ ++HikariCP-java6-2.3.13.jar +@@ -16,0 +18 @@ ++c3p0-0.9.5.2.jar +@@ -149,0 +152 @@ ++mchange-commons-java-0.2.11.jar + +- commons-lang-2.1.3.jar +Error: Process completed with exit code 1. 
+``` + +一般来说,添加一个 jar 的工作通常不是很容易,因为 jar 通常依赖其他各种 jar, 我们还需要为这些 jar 添加相应的许可证。 在这种情况下, 我们会收到检查 license 失败的错误信息。像上面的例子,我们缺少 `HikariCP-java6-2.3.13`, `c3p0` 等的 license 声明(`+` 表示新添加,`-` 表示需要删除), 按照步骤添加 jar。 + +### 参考 + +* [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) +* [ASF 第三方许可政策](https://apache.org/legal/resolved.html) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/setup.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/setup.md new file mode 100644 index 000000000000..c00c3132c226 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/contribution/setup.md @@ -0,0 +1,121 @@ +# 搭建开发环境 + +在这个章节, 我们会向你展示如何搭建 SeaTunnel 的开发环境, 然后用 JetBrains IntelliJ IDEA 跑一个简单的示例。 + +> 你可以用任何你喜欢的开发环境进行开发和测试,我们只是用 [JetBrains IDEA](https://www.jetbrains.com/idea/) +> 作为示例来展示如何一步步完成设置。 + +## 准备 + +在设置开发环境之前, 需要做一些准备工作, 确保你安装了以下软件: + +* 安装 [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)。 +* 安装 [Java](https://www.java.com/en/download/) (目前只支持 JDK8/JDK11) 并且设置 `JAVA_HOME` 环境变量。 +* 安装 [Scala](https://www.scala-lang.org/download/2.11.12.html) (目前只支持 scala 2.11.12)。 +* 安装 [JetBrains IDEA](https://www.jetbrains.com/idea/)。 + +## 设置 + +### 克隆源码 + +首先使用以下命令从 [GitHub](https://github.com/apache/seatunnel) 克隆 SeaTunnel 源代码。 + +```shell +git clone git@github.com:apache/seatunnel.git +``` + +### 本地安装子项目 + +在克隆好源代码以后, 运行 `./mvnw` 命令安装子项目到 maven 本地仓库目录。 否则你的代码无法在 IDEA 中正常启动。 + +```shell +./mvnw install -Dmaven.test.skip +``` + +### 源码编译 + +在安装 maven 以后, 可以使用下面命令进行编译和打包。 + +``` +mvn clean package -pl seatunnel-dist -am -Dmaven.test.skip=true +``` + +### 编译子模块 + +如果要单独编译子模块, 可以使用下面的命令进行编译和打包。 + +```ssh +# 这是一个单独构建 redis connector 的示例 + + mvn clean package -pl seatunnel-connectors-v2/connector-redis -am -DskipTests -T 1C +``` + +### 安装 JetBrains IDEA Scala 插件 + +用 JetBrains IntelliJ IDEA 打开你的源码,如果有 Scala 的代码,则需要安装 JetBrains IntelliJ IDEA's [Scala plugin](https://plugins.jetbrains.com/plugin/1347-scala)。 +可以参考 [install plugins for IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) 。 + +### 安装 JetBrains IDEA Lombok 插件 + +在运行示例之前, 安装 JetBrains IntelliJ IDEA 的 [Lombok plugin](https://plugins.jetbrains.com/plugin/6317-lombok)。 +可以参考 [install plugins for IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) 。 + +### 代码风格 + +Apache SeaTunnel 使用 `Spotless` 来统一代码风格和格式检查。可以运行下面 `Spotless` 命令自动格式化。 + +```shell +./mvnw spotless:apply +``` + +拷贝 `pre-commit hook` 文件 `/tools/spotless_check/pre-commit.sh` 到你项目的 `.git/hooks/` 目录, 这样每次你使用 `git commit` 提交代码的时候会自动调用 `Spotless` 修复格式问题。 + +## 运行一个简单的示例 + +完成上面所有的工作后,环境搭建已经完成, 可以直接运行我们的示例了。 所有的示例在 `seatunnel-examples` 模块里, 你可以随意选择进行编译和调试,参考 [running or debugging +it in IDEA](https://www.jetbrains.com/help/idea/run-debug-configuration.html)。 + +我们使用 `seatunnel-examples/seatunnel-engine-examples/src/main/java/org/apache/seatunnel/example/engine/SeaTunnelEngineExample.java` +作为示例, 运行成功后的输出如下: + +```log +2024-08-10 11:45:32,839 INFO org.apache.seatunnel.core.starter.seatunnel.command.ClientExecuteCommand - +*********************************************** + Job Statistic Information +*********************************************** +Start Time : 2024-08-10 11:45:30 +End Time : 2024-08-10 11:45:32 +Total Time(s) : 2 +Total Read Count : 5 +Total Write Count : 5 +Total Failed Count : 0 +*********************************************** +``` + +## 更多信息 + +所有的实例都用了简单的 source 和 sink, 这样可以使得运行更独立和更简单。 +你可以修改 
`resources/examples` 中的示例的配置。 例如下面的配置使用 PostgreSQL 作为源,并且输出到控制台。 +请注意引用FakeSource 和 Console 以外的连接器时,需要修改seatunnel-example对应子模块下的`pom.xml`文件中的依赖。 + +```conf +env { + parallelism = 1 + job.mode = "BATCH" +} +source { + Jdbc { + driver = org.postgresql.Driver + url = "jdbc:postgresql://host:port/database" + username = postgres + password = "123456" + query = "select * from test" + table_path = "database.test" + } +} + +sink { + Console {} +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/faq.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/faq.md new file mode 100644 index 000000000000..505cc97fd6d9 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/faq.md @@ -0,0 +1,354 @@ +# 常见问题解答 + +## 为什么要安装Spark或者Flink这样的计算引擎? + +SeaTunnel现在使用Spark、Flink等计算引擎来完成资源调度和节点通信,因此我们可以专注于数据同步的易用性和高性能组件的开发。 但这只是暂时的。 + +## 我有一个问题,我自己无法解决 + +我在使用SeaTunnel时遇到了问题,无法自行解决。 我应该怎么办? 首先,在[问题列表](https://github.com/apache/seatunnel/issues)或[邮件列表](https://lists.apache.org/list.html?dev@seatunnel.apache.org)中搜索 )看看是否有人已经问过同样的问题并得到答案。 如果您找不到问题的答案,您可以通过[这些方式](https://github.com/apache/seatunnel#contact-us)联系社区成员寻求帮助。 + +## 如何声明变量? + +您想知道如何在 SeaTunnel 的配置中声明一个变量,然后在运行时动态替换该变量的值吗? + +从“v1.2.4”开始,SeaTunnel 支持配置中的变量替换。 该功能常用于定时或非定时离线处理,以替代时间、日期等变量。 用法如下: + +在配置中配置变量名称。 下面是一个sql转换的例子(实际上,配置文件中任何地方“key = value”中的值都可以使用变量替换): + +``` +... +transform { + sql { + query = "select * from user_view where city ='"${city}"' and dt = '"${date}"'" + } +} +... +``` + +以Spark Local模式为例,启动命令如下: + +```bash +./bin/start-seatunnel-spark.sh \ +-c ./config/your_app.conf \ +-e client \ +-m local[2] \ +-i city=shanghai \ +-i date=20190319 +``` + +您可以使用参数“-i”或“--variable”后跟“key=value”来指定变量的值,其中key需要与配置中的变量名称相同。 + +## 如何在配置文件中写入多行文本的配置项? + +当配置的文本很长并且想要将其换行时,可以使用三个双引号来指示其开始和结束: + +``` +var = """ + whatever you want +""" +``` + +## 如何实现多行文本的变量替换? + +在多行文本中进行变量替换有点麻烦,因为变量不能包含在三个双引号中: + +``` +var = """ +your string 1 +"""${you_var}""" your string 2""" +``` + +请参阅:[lightbend/config#456](https://github.com/lightbend/config/issues/456)。 + +## Azkaban、Oozie、DolphinScheduler 是否支持 SeaTunnel? + +当然! 请参阅下面的屏幕截图: + +![工作流程.png](/image_zh/workflow.png) + +![azkaban.png](/image_zh/azkaban.png) + +## SeaTunnel是否有配置多个源的情况,例如同时在源中配置elasticsearch和hdfs? + +``` +env { + ... +} + +source { + hdfs { ... } + elasticsearch { ... } + jdbc {...} +} + +transform { + ... +} + +sink { + elasticsearch { ... } +} +``` + +## 有 HBase 插件吗? + +有一个 HBase 输入插件。 您可以从这里下载:https://github.com/garyelephant/waterdrop-input-hbase + +## 如何使用SeaTunnel将数据写入Hive? + +``` +env { + spark.sql.catalogImplementation = "hive" + spark.hadoop.hive.exec.dynamic.partition = "true" + spark.hadoop.hive.exec.dynamic.partition.mode = "nonstrict" +} + +source { + sql = "insert into ..." +} + +sink { + // The data has been written to hive through the sql source. This is just a placeholder, it does not actually work. + stdout { + limit = 1 + } +} +``` + +此外,SeaTunnel 在 `1.5.7` 版本之后在 `1.x` 分支中实现了 `Hive` 输出插件; 在“2.x”分支中。 Spark 引擎的 Hive 插件已从版本“2.0.5”开始支持:https://github.com/apache/seatunnel/issues/910。 + +## SeaTunnel如何编写ClickHouse的多个实例来实现负载均衡? + +1.直接写分布式表(不推荐) + +2.在ClickHouse的多个实例前面添加代理或域名(DNS): + +``` +{ + output { + clickhouse { + host = "ck-proxy.xx.xx:8123" + # Local table + table = "table_name" + } + } +} +``` + +3. 在配置文件中配置多个ClickHouse实例: + + ``` + { + output { + clickhouse { + host = "ck1:8123,ck2:8123,ck3:8123" + # Local table + table = "table_name" + } + } + } + ``` +4. 
使用集群模式: + + ``` + { + output { + clickhouse { + # Configure only one host + host = "ck1:8123" + cluster = "clickhouse_cluster_name" + # Local table + table = "table_name" + } + } + } + ``` + +## SeaTunnel 消费 Kafka 时如何解决 OOM? + +大多数情况下,OOM是由于没有对消费进行速率限制而导致的。 解决方法如下: + +对于目前Kafka的Spark消费限制: + +1. 假设您使用 KafkaStream 消费的 Kafka `Topic 1` 的分区数量 = N。 + +2. 假设“Topic 1”的消息生产者(Producer)的生产速度为K条消息/秒,则向分区写入消息的速度必须一致。 + +3、假设经过测试发现Spark Executor每核每秒的处理能力为M。 + +可以得出以下结论: + +1、如果想让Spark对`Topic 1`的消耗跟上它的生产速度,那么需要 `spark.executor.cores` * `spark.executor.instances` >= K / M + +2、当出现数据延迟时,如果希望消耗速度不要太快,导致spark执行器OOM,那么需要配置 `spark.streaming.kafka.maxRatePerPartition` <= (`spark.executor.cores` * `spark.executor.instances`) * M / N + +3、一般来说,M和N都确定了,从2可以得出结论:`spark.streaming.kafka.maxRatePerPartition`的大小与`spark.executor.cores` * `spark的大小正相关 .executor.instances`,可以在增加资源`maxRatePerPartition`的同时增加,以加快消耗。 + +![Kafka](/image_zh/kafka.png) + +## 如何解决错误 `Exception in thread "main" java.lang.NoSuchFieldError: INSTANCE`? + +原因是Spark的CDH版本自带的httpclient.jar版本较低,而ClickHouse JDBC基于的httpclient版本是4.5.2,包版本冲突。 解决办法是将CDH自带的jar包替换为httpclient-4.5.2版本。 + +## 我的Spark集群默认的JDK是JDK7。 安装JDK8后,如何指定SeaTunnel以JDK8启动? + +在 SeaTunnel 的配置文件中,指定以下配置: + +```shell +spark { + ... + spark.executorEnv.JAVA_HOME="/your/java_8_home/directory" + spark.yarn.appMasterEnv.JAVA_HOME="/your/java_8_home/directory" + ... +} +``` + +## 如何为 YARN 上的 SeaTunnel 指定不同的 JDK 版本? + +例如要设置JDK版本为JDK8,有两种情况: + +- YARN集群已部署JDK8,但默认JDK不是JDK8。 在 SeaTunnel 配置文件中添加两个配置: + + ``` + env { + ... + spark.executorEnv.JAVA_HOME="/your/java_8_home/directory" + spark.yarn.appMasterEnv.JAVA_HOME="/your/java_8_home/directory" + ... + } + ``` +- YARN集群未部署JDK8。 此时,启动附带JDK8的SeaTunnel。 详细操作参见: + https://www.cnblogs.com/jasondan/p/spark-specific-jdk-version.html + +## Spark local[*]模式运行SeaTunnel时总是出现OOM怎么办? + +如果以本地模式运行,则需要修改`start-seatunnel.sh`启动脚本。 在 `spark-submit` 之后添加参数 `--driver-memory 4g` 。 一般情况下,生产环境中不使用本地模式。 因此,On YARN时一般不需要设置该参数。 有关详细信息,请参阅:[应用程序属性](https://spark.apache.org/docs/latest/configuration.html#application-properties)。 + +## 我可以在哪里放置自己编写的插件或第三方 jdbc.jar 以供 SeaTunnel 加载? + +将Jar包放置在plugins目录指定结构下: + +```bash +cd SeaTunnel +mkdir -p plugins/my_plugins/lib +cp third-part.jar plugins/my_plugins/lib +``` + +`my_plugins` 可以是任何字符串。 + +## 如何在 SeaTunnel-V1(Spark) 中配置日志记录相关参数? + +可以通过三种方式配置日志相关参数(例如日志级别): + +- [不推荐] 更改默认的`$SPARK_HOME/conf/log4j.properties`。 + - 这将影响通过 `$SPARK_HOME/bin/spark-submit` 提交的所有程序。 +- [不推荐]直接在SeaTunnel的Spark代码中修改日志相关参数。 + - 这相当于写死了,每次改变都需要重新编译。 +- [推荐] 使用以下方法更改 SeaTunnel 配置文件中的日志记录配置(更改仅在 SeaTunnel >= 1.5.5 时生效): + + ``` + env { + spark.driver.extraJavaOptions = "-Dlog4j.configuration=file:/log4j.properties" + spark.executor.extraJavaOptions = "-Dlog4j.configuration=file:/log4j.properties" + } + source { + ... + } + transform { + ... + } + sink { + ... 
+ } + ``` + +可供参考的log4j配置文件内容如下: + +``` +$ cat log4j.properties +log4j.rootLogger=ERROR, console + +# set the log level for these components +log4j.logger.org=ERROR +log4j.logger.org.apache.spark=ERROR +log4j.logger.org.spark-project=ERROR +log4j.logger.org.apache.hadoop=ERROR +log4j.logger.io.netty=ERROR +log4j.logger.org.apache.zookeeper=ERROR + +# add a ConsoleAppender to the logger stdout to write to the console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +# use a simple message format +log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n +``` + +## 如何在 SeaTunnel-V2(Spark、Flink) 中配置日志记录相关参数? + +目前,无法直接设置它们。 您需要修改SeaTunnel启动脚本。 相关参数在任务提交命令中指定。 具体参数请参考官方文档: + +- Spark官方文档:http://spark.apache.org/docs/latest/configuration.html#configuring-logging +- Flink 官方文档:https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/logging.html + +参考: + +https://stackoverflow.com/questions/27781187/how-to-stop-info-messages-displaying-on-spark-console + +http://spark.apache.org/docs/latest/configuration.html#configuring-logging + +https://medium.com/@iacomini.riccardo/spark-logging-configuration-in-yarn-faf5ba5fdb01 + +## 如何配置SeaTunnel-E2E Test的日志记录相关参数? + +`seatunnel-e2e` 的 log4j 配置文件位于 `seatunnel-e2e/seatunnel-e2e-common/src/test/resources/log4j2.properties` 中。 您可以直接在配置文件中修改日志记录相关参数。 + +例如,如果您想输出更详细的E2E Test日志,只需将配置文件中的“rootLogger.level”降级即可。 + +## 写入 ClickHouse 时出错:ClassCastException + +在SeaTunnel中,不会主动转换数据类型。 Input读取数据后,对应的 +架构。 编写ClickHouse时,需要严格匹配字段类型,不匹配的情况需要解决。 + +数据转换可以通过以下两个插件实现: + +1.过滤器转换插件 +2.过滤Sql插件 + +详细数据类型转换参考:[ClickHouse数据类型检查列表](https://interestinglab.github.io/seatunnel-docs/#/en/configuration/output-plugins/Clickhouse?id=clickhouse-data-type-check-list) + +请参阅问题:[#488](https://github.com/apache/seatunnel/issues/488)[#382](https://github.com/apache/seatunnel/issues/382)。 + +## SeaTunnel 如何访问经过 kerberos 验证的 HDFS、YARN、Hive 等资源? + +请参考:[#590](https://github.com/apache/seatunnel/issues/590)。 + +## 如何排查 NoClassDefFoundError、ClassNotFoundException 等问题? + +有很大概率是Java类路径中加载了多个不同版本的对应Jar包类,是因为加载顺序冲突,而不是因为Jar确实丢失了。 修改这条SeaTunnel启动命令,在spark-submit提交部分添加如下参数,通过输出日志进行详细调试。 + +``` +spark-submit --verbose + ... + --conf 'spark.driver.extraJavaOptions=-verbose:class' + --conf 'spark.executor.extraJavaOptions=-verbose:class' + ... +``` + +## 如何使用SeaTunnel跨HDFS集群同步数据? + +只需正确配置 hdfs-site.xml 即可。 参考:https://www.cnblogs.com/suanec/p/7828139.html。 + +## 我想学习SeaTunnel的源代码。 我应该从哪里开始? + +SeaTunnel 拥有完全抽象、结构化的代码实现,很多人都选择 SeaTunnel 作为学习 Spark 的方式。 您可以从主程序入口了解源代码:SeaTunnel.java + +## SeaTunnel开发者开发自己的插件时,是否需要了解SeaTunnel代码? 这些插件是否应该集成到 SeaTunnel 项目中? 
+
+开发者开发的插件与 SeaTunnel 项目本身无关,SeaTunnel 项目中也不需要包含您的插件代码。
+
+插件可以完全独立于 SeaTunnel 项目开发,因此您可以使用 Java、Scala、Maven、sbt、Gradle 或任何您想要的方式编写它。这也是我们推荐开发者开发插件的方式。
+
+## 当我导入项目时,编译器出现异常“找不到类`org.apache.seatunnel.shade.com.typesafe.config.Config`”
+
+首先运行 `mvn install`。在 `seatunnel-config/seatunnel-config-base` 子项目中,包 `com.typesafe.config` 已被重定位(relocate)到 `org.apache.seatunnel.shade.com.typesafe.config`,并在 `seatunnel-config/seatunnel-config-shade` 子项目中被安装到 Maven 本地仓库。
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/other-engine/flink.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/other-engine/flink.md
new file mode 100644
index 000000000000..856aeb78101e
--- /dev/null
+++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/other-engine/flink.md
@@ -0,0 +1,83 @@
+# Flink引擎方式运行SeaTunnel
+
+Flink 是一个强大的高性能分布式流处理引擎。你可以搜索 `Apache Flink` 获取更多关于它的信息。
+
+### 在Job中设置Flink的配置信息
+
+以 `flink.` 开头:
+
+例子:为该作业开启 Flink 的非对齐检查点(unaligned checkpointing)
+
+```
+env {
+  parallelism = 1
+  flink.execution.checkpointing.unaligned.enabled=true
+}
+```
+
+枚举类型当前还不支持,你需要在Flink的配置文件中指定它们。暂时只有这些类型的设置受支持:
+Integer/Boolean/String/Duration + +### 如何设置一个简单的Flink Job + +这是一个运行在Flink中随机生成数据打印到控制台的简单job + +``` +env { + # 公共参数 + parallelism = 1 + checkpoint.interval = 5000 + + # flink特殊参数 + flink.execution.checkpointing.mode = "EXACTLY_ONCE" + flink.execution.checkpointing.timeout = 600000 +} + +source { + FakeSource { + row.num = 16 + result_table_name = "fake_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +transform { + # 如果你想知道更多关于如何配置seatunnel的信息和查看完整的transform插件, + # 请访问:https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink{ + Console{} +} +``` + +### 如何在项目中运行Job + +当你将代码拉到本地后,转到 `seatunnel-examples/seatunnel-flink-connector-v2-example` 模块,查找 `org.apache.seatunnel.example.flink.v2.SeaTunnelApiExample` 即可完成job的操作。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/about.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/about.md new file mode 100644 index 000000000000..9deeec82f987 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/about.md @@ -0,0 +1,44 @@ +--- + +sidebar_position: 1 +------------------- + +# SeaTunnel Engine 简介 + +SeaTunnel Engine 是一个由社区开发的用于数据同步场景的引擎,作为 SeaTunnel 的默认引擎,它支持高吞吐量、低延迟和强一致性的数据同步作业操作,更快、更稳定、更节省资源且易于使用。 + +SeaTunnel Engine 的整体设计遵循以下路径: + +- 更快,SeaTunnel Engine 的执行计划优化器旨在减少数据网络传输,从而减少由于数据序列化和反序列化造成的整体同步性能损失,使用户能够更快地完成数据同步操作。同时,支持速度限制,以合理速度同步数据。 +- 更稳定,SeaTunnel Engine 使用 Pipeline 作为数据同步任务的最小粒度的检查点和容错。任务的失败只会影响其上游和下游任务,避免了任务失败导致整个作业失败或回滚的情况。同时,SeaTunnel Engine 还支持数据缓存,用于源数据有存储时间限制的场景。当启用缓存时,从源读取的数据将自动缓存,然后由下游任务读取并写入目标。在这种情况下,即使由于目标失败而无法写入数据,也不会影响源的常规读取,防止源数据过期被删除。 +- 节省空间,SeaTunnel Engine 内部使用动态线程共享技术。在实时同步场景中,对于每个表数据量很大但每个表数据量很小的表,SeaTunnel Engine 将在共享线程中运行这些同步任务,以减少不必要的线程创建并节省系统空间。在读取和写入数据方面,SeaTunnel Engine 的设计目标是最小化 JDBC 连接的数量;在 CDC 场景中,SeaTunnel Engine 将重用日志读取和解析资源。 +- 简单易用,SeaTunnel Engine 减少了对第三方服务的依赖,并且可以独立于如 Zookeeper 和 HDFS 等大数据组件实现集群管理、快照存储和集群 HA 功能。这对于目前缺乏大数据平台的用户,或者不愿意依赖大数据平台进行数据同步的用户来说非常有用。 + +未来,SeaTunnel Engine 将进一步优化其功能,以支持离线批同步的全量同步和增量同步、实时同步和 CDC。 + +### 集群管理 + +- 支持独立运行; +- 支持集群运行; +- 支持自治集群(去中心化),使用户无需为 SeaTunnel Engine 集群指定主节点,因为它可以在运行过程中自行选择主节点,并且在主节点失败时自动选择新的主节点; +- 自治集群节点发现和具有相同 cluster_name 的节点将自动形成集群。 + +### 核心功能 + +- 支持在本地模式下运行作业,作业完成后集群自动销毁; +- 支持在集群模式下运行作业(单机或集群),通过 SeaTunnel 客户端将作业提交给 SeaTunnel Engine 服务,作业完成后服务继续运行并等待下一个作业提交; +- 支持离线批同步; +- 支持实时同步; +- 批流一体,所有 SeaTunnel V2 Connector 均可在 SeaTunnel Engine 中运行; +- 支持分布式快照算法,并支持与 SeaTunnel V2 Connector 的两阶段提交,确保数据只执行一次。 +- 支持在 Pipeline 级别调用作业,以确保即使在资源有限的情况下也能启动; +- 支持在 Pipeline 级别对作业进行容错。任务失败只影响其所在 Pipeline,只需要回滚 Pipeline 下的任务; +- 支持动态线程共享,以实时同步大量小数据集。 + +### 快速开始 + +https://seatunnel.apache.org/docs/start-v2/locally/quick-start-seatunnel-engine + +### 下载安装 + +[下载安装](download-seatunnel.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md new file mode 100644 index 000000000000..f0c506fdbf88 --- /dev/null +++ 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md @@ -0,0 +1,220 @@ +--- + +sidebar_position: 7 +------------------- + +# 检查点存储 + +## 简介 + +检查点是一种容错恢复机制。这种机制确保程序在运行时,即使突然遇到异常,也能自行恢复。 + +### 检查点存储 + +SeaTunnel Engine支持以下检查点存储类型: + +- HDFS (OSS,S3,HDFS,LocalFile) +- LocalFile (本地),(已弃用: 使用HDFS(LocalFile)替代). + +我们使用微内核设计模式将检查点存储模块从引擎中分离出来。这允许用户实现他们自己的检查点存储模块。 + +`checkpoint-storage-api`是检查点 存储模块API,它定义了检查点存储模块的接口。 + +如果你想实现你自己的检查点存储模块,你需要实现`CheckpointStorage`并提供相应的`CheckpointStorageFactory`实现。 + +### 检查点存储配置 + +`seatunnel-server`模块的配置在`seatunnel.yaml`文件中。 + +```yaml + +seatunnel: + engine: + checkpoint: + storage: + type: hdfs #检查点存储的插件名称,支持hdfs(S3, local, hdfs), 默认为localfile (本地文件), 但这种方式已弃用 + # 插件配置 + plugin-config: + namespace: #检查点存储父路径,默认值为/seatunnel/checkpoint/ + K1: V1 # 插件其它配置 + K2: V2 # 插件其它配置 +``` + +注意: namespace必须以"/"结尾。 + +#### OSS + +阿里云OSS是基于hdfs-file,所以你可以参考[Hadoop OSS文档](https://hadoop.apache.org/docs/stable/hadoop-aliyun/tools/hadoop-aliyun/index.html)来配置oss. + +OSS buckets交互外,oss客户端需要与buckets交互所需的凭据。 +客户端支持多种身份验证机制,并且可以配置使用哪种机制及其使用顺序。也可以使用of org.apache.hadoop.fs.aliyun.oss.AliyunCredentialsProvider的自定义实现。 +如果您使用AliyunCredentialsProvider(可以从阿里云访问密钥管理中获得),它们包括一个access key和一个secret key。 +你可以这样配置: + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: oss + oss.bucket: your-bucket + fs.oss.accessKeyId: your-access-key + fs.oss.accessKeySecret: your-secret-key + fs.oss.endpoint: endpoint address + fs.oss.credentials.provider: org.apache.hadoop.fs.aliyun.oss.AliyunCredentialsProvider +``` + +有关Hadoop Credential Provider API的更多信息,请参见: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). + +阿里云OSS凭证提供程序实现见: [验证凭证提供](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth) + +#### S3 + +S3基于hdfs-file,所以你可以参考[Hadoop s3文档](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)来配置s3。 + +除了与公共S3 buckets交互之外,S3A客户端需要与buckets交互所需的凭据。 +客户端支持多种身份验证机制,并且可以配置使用哪种机制及其使用顺序。也可以使用com.amazonaws.auth.AWSCredentialsProvider的自定义实现。 +如果您使用SimpleAWSCredentialsProvider(可以从Amazon Security Token服务中获得),它们包括一个access key和一个secret key。 +您可以这样配置: + +```yaml + +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: s3 + s3.bucket: your-bucket + fs.s3a.access.key: your-access-key + fs.s3a.secret.key: your-secret-key + fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + + +``` + +如果您使用`InstanceProfileCredentialsProvider`,它支持在EC2 VM中运行时使用实例配置文件凭据,您可以检查[iam-roles-for-amazon-ec2](https://docs.aws.amazon.com/zh_cn/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html). +您可以这样配置: + +```yaml + +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: s3 + s3.bucket: your-bucket + fs.s3a.endpoint: your-endpoint + fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.InstanceProfileCredentialsProvider +``` + +有关Hadoop Credential Provider API的更多信息,请参见: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). 
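+
+如果您使用的是兼容 S3 协议的对象存储(下例中的 MinIO 端点地址仅为假设的示意值),通常可以在上述 access key 配置的基础上额外指定自定义 endpoint,实际可用的参数请以 Hadoop S3A 文档为准:
+
+```yaml
+seatunnel:
+  engine:
+    checkpoint:
+      storage:
+        type: hdfs
+        max-retained: 3
+        plugin-config:
+          storage.type: s3
+          s3.bucket: your-bucket
+          fs.s3a.endpoint: http://minio:9000          # 假设的 S3 兼容服务地址,仅为示意
+          fs.s3a.access.key: your-access-key
+          fs.s3a.secret.key: your-secret-key
+          fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
+```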
+ +#### HDFS + +如果您使用HDFS,您可以这样配置: + +```yaml +seatunnel: + engine: + checkpoint: + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: hdfs://localhost:9000 + // 如果您使用kerberos,您可以这样配置: + kerberosPrincipal: your-kerberos-principal + kerberosKeytabFilePath: your-kerberos-keytab +``` + +如果HDFS是HA模式,您可以这样配置: + +```yaml +seatunnel: + engine: + checkpoint: + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: hdfs://usdp-bing + seatunnel.hadoop.dfs.nameservices: usdp-bing + seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2 + seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020 + seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020 + seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider + +``` + +如果HDFS在`hdfs-site.xml`或`core-site.xml`中有其他配置,只需使用`seatunnel.hadoop.`前缀设置HDFS配置即可。 + +#### 本地文件 + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: file:/// # 请确保该目录具有写权限 + +``` + +### 开启高速缓存 + +当storage:type为hdfs时,默认关闭cache。如果您想启用它,请设置为`disable.cache: false`。 + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + disable.cache: false + fs.defaultFS: hdfs:/// # Ensure that the directory has written permission +``` + +or + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + disable.cache: false + fs.defaultFS: file:/// +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/deployment.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/deployment.md new file mode 100644 index 000000000000..d9dc8d3b39c5 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/deployment.md @@ -0,0 +1,24 @@ +--- + +sidebar_position: 3 +------------------- + +# SeaTunnel Engine(Zeta) 安装部署 + +SeaTunnel Engine(Zeta) 支持三种不同的部署模式:本地模式、混合集群模式和分离集群模式。 + +每种部署模式都有不同的使用场景和优缺点。在选择部署模式时,您应该根据您的需求和环境来选择。 + +Local模式:只用于测试,每个任务都会启动一个独立的进程,任务运行完成后进程会退出。 + +混合集群模式:SeaTunnel Engine 的Master服务和Worker服务混合在同一个进程中,所有节点都可以运行作业并参与选举成为master,即master节点也在同时运行同步任务。在该模式下,Imap(保存任务的状态信息用于为任务的容错提供支持)数据会分布在所有节点中。 + +分离集群模式:SeaTunnel Engine 的Master服务和Worker服务分离,每个服务单独一个进程。Master节点只负责作业调度,rest api,任务提交等,Imap数据只存储在Master节点中。Worker节点只负责任务的执行,不参与选举成为master,也不存储Imap数据。 + +使用建议:建议使用[分离集群模式](separated-cluster-deployment.md)。在混合集群模式下,Master节点要同步运行任务,当任务规模较大时,会影响Master节点的稳定性,一但Master节点宕机或心跳超时,会导致Master节点切换,Master节点切换会导致所有正在运行的任务进行容错,会进一步增长集群的负载。因此,我们更建议使用分离模式。 + +[Local模式部署](local-mode-deployment.md) + +[混合集群模式部署](hybrid-cluster-deployment.md) + +[分离集群模式部署](separated-cluster-deployment.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/download-seatunnel.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/download-seatunnel.md new file mode 100644 index 000000000000..74281d0648f1 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/download-seatunnel.md @@ -0,0 +1,70 @@ +--- + +sidebar_position: 2 +------------------- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# 下载和制作安装包 + +## 步骤 1: 准备工作 
+ +在开始下载SeaTunnel之前,您需要确保您已经安装了SeaTunnel所需要的以下软件: + +* 安装[Java](https://www.java.com/en/download/) (Java 8 或 11, 其他高于Java 8的版本理论上也可以工作) 以及设置 `JAVA_HOME`。 + +## 步骤 2: 下载 SeaTunnel + +进入[SeaTunnel下载页面](https://seatunnel.apache.org/download)下载最新版本的发布版安装包`seatunnel--bin.tar.gz` + +或者您也可以通过终端下载 + +```shell +export version="2.3.7" +wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" +tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" +``` + +## 步骤 3: 下载连接器插件 + +从2.2.0-beta版本开始,二进制包不再默认提供连接器依赖,因此在第一次使用它时,您需要执行以下命令来安装连接器:(当然,您也可以从 [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) 手动下载连接器,然后将其移动至`connectors/seatunnel`目录下)。 + +```bash +sh bin/install-plugin.sh 2.3.7 +``` + +如果您需要指定的连接器版本,以2.3.7为例,您需要执行如下命令 + +```bash +sh bin/install-plugin.sh 2.3.7 +``` + +通常您并不需要所有的连接器插件,所以您可以通过配置`config/plugin_config`来指定您所需要的插件,例如,您只需要`connector-console`插件,那么您可以修改plugin.properties配置文件如下 + +```plugin_config +--seatunnel-connectors-- +connector-console +--end-- +``` + +如果您希望示例应用程序能正常工作,那么您需要添加以下插件 + +```plugin_config +--seatunnel-connectors-- +connector-fake +connector-console +--end-- +``` + +您可以在`${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`下找到所有支持的连接器和相应的plugin_config配置名称。 + +:::tip 提示 + +如果您想通过手动下载连接器的方式来安装连接器插件,您只需要下载您所需要的连接器插件即可,并将它们放在`${SEATUNNEL_HOME}/connectors/`目录下 + +::: + +现在你已经完成了SeaTunnel安装包的下载和连接器插件的下载。接下来,您可以根据您的需求选择不同的运行模式来运行或部署SeaTunnel。 + +如果你使用SeaTunnel自带的SeaTunnel Engine(Zeta)来运行任务,需要先部署SeaTunnel Engine服务。参考[SeaTunnel Engine(Zeta)服务部署](deployment.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md new file mode 100644 index 000000000000..81dc0cacb342 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md @@ -0,0 +1,95 @@ +--- + +sidebar_position: 9 +------------------- + +# 配置引擎 Jar 存储模式 + +:::caution 警告 + +请注意,此功能目前处于实验阶段,还有许多方面需要改进。因此,我们建议在使用此功能时谨慎行事,以避免潜在的问题和不必要的风险。 +我们致力于持续努力增强和稳定此功能,确保为您提供更好的体验。 + +::: + +我们可以启用优化的作业提交过程,这在 `seatunnel.yaml` 中进行配置。启用了 Seatunnel 作业提交过程配置项的优化后, +用户可以使用 Seatunnel Zeta 引擎作为执行引擎,而无需在每个引擎 `connector` 目录中放置任务执行所需的连接器 Jar 包或连接器所依赖的第三方 Jar 包。 +用户只需在提交作业的客户端上放置所有任务执行所需的 Jar 包,客户端将自动上传任务执行所需的 Jars 到 Zeta 引擎。在 Docker 或 k8s 模式下提交作业时,启用此配置项是必要的, +这可以从根本上解决由 Seatunnel Zeta 引擎的重量造成的大型容器镜像问题。在镜像中,只需要提供 Zeta 引擎的核心框架包, +然后可以将连接器的 jar 包和连接器所依赖的第三方 jar 包分别上传到 pod 进行分发。 + +启用了优化作业提交过程配置项后,您不需要在 Zeta 引擎中放置以下两种类型的 Jar 包: +- COMMON_PLUGIN_JARS +- CONNECTOR_PLUGIN_JARS + +COMMON_ PLUGIN_ JARS 指的是连接器所依赖的第三方 Jar 包, CONNECTOR_ PLUGIN_ JARS 指的是连接器 Jar 包。 +当 Zeta 的 `lib` 中不存在公共 jars 时,它可以将客户端的本地公共 jars 上传到所有引擎节点的 `lib` 目录。 +这样,即使用户没有在 Zeta 的 `lib` 中放置 jar,任务仍然可以正常执行。 +然而,我们不推荐依赖打开优化作业提交过程的配置项来上传连接器所依赖的第三方 Jar 包。 +如果您使用 Zeta 引擎,请将连接器所依赖的第三方 jar 包文件添加到每个节点的 `$SEATUNNEL_HOME/lib/` 目录中,例如 jdbc 驱动程序。 + +# 连接器 Jar 存储策略 + +您可以通过配置文件配置当前连接器 Jar 包和连接器所依赖的第三方 Jar 包的存储策略。 +可以配置两种存储策略,即共享 Jar 包存储策略和隔离 Jar 包存储策略。 +两种不同的存储策略为 Jar 文件提供了更灵活的存储模式。 +您可以配置存储策略,使引擎中的多个执行作业共享相同的 Jar 包文件。 + +## 相关配置 + +| 参数 | 默认值 | 描述 | +|-------------------------------------|--------|-------------------------------------------------------------------------| +| connector-jar-storage-enable | false | 是否启用上传连接器 Jar 包到引擎。默认启用状态为 false。 | +| connector-jar-storage-mode | SHARED | 引擎端 Jar 包存储模式选择。有两个可选模式,SHARED(共享)和 ISOLATED(隔离)。默认的 Jar 包存储模式是 SHARED。 | +| connector-jar-storage-path | " " | 用户自定义的 Jar 包存储路径。 | +| 
connector-jar-cleanup-task-interval | 3600s | 引擎端 Jar 包清理定时任务执行间隔。 | +| connector-jar-expiry-time | 600s | 引擎端 Jar 包存储过期时间。 | + +## 隔离连接器Jar存储策略 + +在作业提交之前,连接器 Jar 包将被上传到 Master 节点上的一个独立文件存储路径中。 +不同作业的连接器 Jar 包位于不同的存储路径中,因此不同作业的连接器 Jar 包彼此隔离。 +作业执行所需的 Jar 包文件不会影响其他作业。当当前作业执行结束时,基于 `JobId` 生成的存储路径中的 Jar 包文件将被删除。 + +示例: + +```yaml +jar-storage: + connector-jar-storage-enable: true + connector-jar-storage-mode: ISOLATED + connector-jar-storage-path: "" + connector-jar-cleanup-task-interval: 3600 + connector-jar-expiry-time: 600 +``` + +配置参数的详细解释: +- connector-jar-storage-enable: 在执行作业前启用上传连接器 Jar 包的功能。 +- connector-jar-storage-mode: 连接器 Jar 包的存储模式,有两种存储模式可供选择:共享模式(SHARED)和隔离模式(ISOLATED)。 +- connector-jar-storage-path: 在 Zeta 引擎上用户自定义连接器 Jar 包的本地存储路径。 +- connector-jar-cleanup-task-interval: Zeta 引擎连接器 Jar 包定时清理任务的间隔时间,默认为 3600 秒。 +- connector-jar-expiry-time: 连接器 Jar 包的过期时间,默认为 600 秒。 + +## 共享连接器Jar存储策略 + +在作业提交之前,连接器 Jar 包将被上传到 Master 节点。如果不同的作业使用相同的 Jar 包文件,它们可以在 Master 节点上共享连接器 Jars。 +所有 Jar 包文件都被持久化到一个共享的文件存储路径中,引用 Master 节点的 Jar 包可以在不同作业之间共享。任务执行完成后, +共享连接器Jar存储策略 不会立即删除与当前任务执行相关的所有 Jar 包,而是有一个独立的线程负责清理工作。 +以下配置文件中的配置设置了清理工作的运行时间和 Jar 包的存活时间。 + +示例: + +```yaml +jar-storage: + connector-jar-storage-enable: true + connector-jar-storage-mode: SHARED + connector-jar-storage-path: "" + connector-jar-cleanup-task-interval: 3600 + connector-jar-expiry-time: 600 +``` + +配置参数的详细解释: +- connector-jar-storage-enable: 在执行作业前启用上传连接器 Jar 包的功能。 +- connector-jar-storage-mode: 连接器 Jar 包的存储模式,有两种存储模式可供选择:共享模式(SHARED)和隔离模式(ISOLATED)。 +- connector-jar-storage-path: 在 Zeta 引擎上用户自定义连接器 Jar 包的本地存储路径。 +- connector-jar-cleanup-task-interval: Zeta 引擎连接器 Jar 包定时清理任务的间隔时间,默认为 3600 秒。 +- connector-jar-expiry-time: 连接器 Jar 包的过期时间,默认为 600 秒。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md new file mode 100644 index 000000000000..4d101b416782 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md @@ -0,0 +1,315 @@ +--- + +sidebar_position: 5 +------------------- + +# 部署 SeaTunnel Engine 混合模式集群 + +SeaTunnel Engine 的Master服务和Worker服务混合在同一个进程中,所有节点都可以运行作业并参与选举成为master,即master节点也在同时运行同步任务。在该模式下,Imap(保存任务的状态信息用于为任务的容错提供支持)数据会分布在所有节点中。 + +使用建议:建议使用分离集群模式。在混合集群模式下,Master节点要同步运行任务,当任务规模较大时,会影响Master节点的稳定性,一但Master节点宕机或心跳超时,会导致Master节点切换,Master节点切换会导致所有正在运行的任务进行容错,会进一步增长集群的负载。因此,我们更建议使用[分离集群模式](separated-cluster-deployment.md)。 + +## 1. 下载 + +[下载和制作SeaTunnel安装包](download-seatunnel.md) + +## 2 配置 SEATUNNEL_HOME + +您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: + +``` +export SEATUNNEL_HOME=${seatunnel install path} +export PATH=$PATH:$SEATUNNEL_HOME/bin +``` + +## 3. 配置 SeaTunnel Engine JVM 选项 + +SeaTunnel Engine 支持两种设置 JVM 选项的方法。 + +1. 将 JVM 选项添加到 `$SEATUNNEL_HOME/config/jvm_options`. + + 修改 `$SEATUNNEL_HOME/config/jvm_options` 文件中的jvm参数。 + +2. 在启动 SeaTunnel Engine 时添加 JVM 选项。例如 `seatunnel-cluster.sh -DJvmOption="-Xms2G -Xmx2G"` + +## 4. 配置 SeaTunnel Engine + +SeaTunnel Engine 提供许多功能,需要在 `seatunnel.yaml` 中进行配置。. 
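+
+下面给出一个将后续各小节中常用配置项汇总在一起的 `seatunnel.yaml` 参考示例(取值仅为示意,各配置项的含义请见下文对应小节):
+
+```yaml
+seatunnel:
+  engine:
+    backup-count: 1
+    print-execution-info-interval: 10
+    history-job-expire-minutes: 1440
+    classloader-cache-mode: true
+    slot-service:
+      dynamic-slot: true
+    checkpoint:
+      interval: 300000
+      timeout: 10000
+      storage:
+        type: hdfs
+        max-retained: 3
+        plugin-config:
+          storage.type: hdfs
+          fs.defaultFS: file:///
+```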
+ +### 4.1 Imap中数据的备份数设置 + +SeaTunnel Engine 基于 [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/) 实现集群管理。集群的状态数据(作业运行状态、资源状态)存储在 [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map)。 +存储在 Hazelcast IMap 中的数据将在集群的所有节点上分布和存储。Hazelcast 会分区存储在 Imap 中的数据。每个分区可以指定备份数量。 +因此,SeaTunnel Engine 可以实现集群 HA,无需使用其他服务(例如 zookeeper)。 + +`backup count` 是定义同步备份数量的参数。例如,如果设置为 1,则分区的备份将放置在一个其他成员上。如果设置为 2,则将放置在两个其他成员上。 + +我们建议 `backup-count` 的值为 `min(1, max(5, N/2))`。 `N` 是集群节点的数量。 + +```yaml +seatunnel: + engine: + backup-count: 1 + # 其他配置 +``` + +### 4.2 Slot配置 + +Slot数量决定了集群节点可以并行运行的任务组数量。一个任务需要的Slot的个数公式为 N = 2 + P(任务配置的并行度)。 默认情况下SeaTunnel Engine的slot个数为动态,即不限制个数。我们建议slot的个数设置为节点CPU核心数的2倍。 + +动态slot个数(默认)配置如下: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: true + # 其他配置 +``` + +静态slot个数配置如下: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: false + slot-num: 20 +``` + +### 4.3 检查点管理器 + +与 Flink 一样,SeaTunnel Engine 支持 Chandy–Lamport 算法。因此,可以实现无数据丢失和重复的数据同步。 + +**interval** + +两个检查点之间的间隔,单位是毫秒。如果在作业配置文件的 `env` 中配置了 `checkpoint.interval` 参数,将以作业配置文件中设置的为准。 + +**timeout** + +检查点的超时时间。如果在超时时间内无法完成检查点,则会触发检查点失败,作业失败。如果在作业的配置文件的`env`中配置了`checkpoint.timeout`参数,将以作业配置文件中设置的为准。 + +示例 + +```yaml +seatunnel: + engine: + backup-count: 1 + print-execution-info-interval: 10 + slot-service: + dynamic-slot: true + checkpoint: + interval: 300000 + timeout: 10000 +``` + +**checkpoint storage** + +检查点是一种容错恢复机制。这种机制确保程序在运行时,即使突然遇到异常,也能自行恢复。检查点定时触发,每次检查点进行时每个Task都会被要求将自身的状态信息(比如读取kafka时读取到了哪个offset)上报给检查点线程,由该线程写入一个分布式存储(或共享存储)。当任务失败然后自动容错恢复时,或者通过seatunnel.sh -r 指令恢复之前被暂停的任务时,会从检查点存储中加载对应作业的状态信息,并基于这些状态信息进行作业的恢复。 + +如果集群的节点大于1,检查点存储必须是一个分布式存储,或者共享存储,这样才能保证任意节点挂掉后依然可以在另一个节点加载到存储中的任务状态信息。 + +有关检查点存储的信息,您可以查看 [Checkpoint Storage](checkpoint-storage.md) + +### 4.4 历史作业过期配置 + +每个完成的作业的信息,如状态、计数器和错误日志,都存储在 IMap 对象中。随着运行作业数量的增加,内存会增加,最终内存将溢出。因此,您可以调整 `history-job-expire-minutes` 参数来解决这个问题。此参数的时间单位是分钟。默认值是 1440 分钟,即一天。 + +示例 + +```yaml +seatunnel: + engine: + history-job-expire-minutes: 1440 +``` + +### 4.5 类加载器缓存模式 + +此配置主要解决不断创建和尝试销毁类加载器所导致的资源泄漏问题。 +如果您遇到与metaspace空间溢出相关的异常,您可以尝试启用此配置。 +为了减少创建类加载器的频率,在启用此配置后,SeaTunnel 在作业完成时不会尝试释放相应的类加载器,以便它可以被后续作业使用,也就是说,当运行作业中使用的 Source/Sink 连接器类型不是太多时,它更有效。 +默认值是 false。 +示例 + +```yaml +seatunnel: + engine: + classloader-cache-mode: true +``` + +## 5. 配置 SeaTunnel Engine 网络服务 + +所有 SeaTunnel Engine 网络相关的配置都在 `hazelcast.yaml` 文件中. 
+ +### 5.1 集群名称 + +SeaTunnel Engine 节点使用 `cluster-name` 来确定另一个节点是否与自己在同一集群中。如果两个节点之间的集群名称不同,SeaTunnel 引擎将拒绝服务请求。 + +### 5.2 网络 + +基于 [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), 一个 SeaTunnel Engine 集群是由运行 SeaTunnel Engine 服务器的集群成员组成的网络。 集群成员自动加入一起形成集群。这种自动加入是通过集群成员使用的各种发现机制来相互发现的。 + +请注意,集群形成后,集群成员之间的通信始终通过 TCP/IP 进行,无论使用的发现机制如何。 + +SeaTunnel Engine 使用以下发现机制。 + +#### TCP + +您可以将 SeaTunnel Engine 配置为完整的 TCP/IP 集群。有关配置详细信息,请参阅 [Discovering Members By TCP Section](tcp.md)。 + +一个示例如下 `hazelcast.yaml` + +```yaml +hazelcast: + cluster-name: seatunnel + network: + join: + tcp-ip: + enabled: true + member-list: + - hostname1 + port: + auto-increment: false + port: 5801 + properties: + hazelcast.logging.type: log4j2 +``` + +TCP 是我们建议在独立 SeaTunnel Engine 集群中使用的方式。 + +另一方面,Hazelcast 提供了一些其他的服务发现方法。有关详细信息,请参阅 [Hazelcast Network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters) + +### 5.3 IMap持久化配置 + +在SeaTunnel中,我们使用IMap(一种分布式的Map,可以实现数据跨节点跨进程的写入的读取 有关详细信息,请参阅 [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) 来存储每个任务及其task的状态,以便在任务所在节点宕机后,可以在其他节点上获取到任务之前的状态信息,从而恢复任务实现任务的容错。 + +默认情况下Imap的信息只是存储在内存中,我们可以设置Imap数据的复本数,具体可参考(4.1 Imap中数据的备份数设置),如果复本数是2,代表每个数据会同时存储在2个不同的节点中。一旦节点宕机,Imap中的数据会重新在其它节点上自动补充到设置的复本数。但是当所有节点都被停止后,Imap中的数据会丢失。当集群节点再次启动后,所有之前正在运行的任务都会被标记为失败,需要用户手工通过seatunnel.sh -r 指令恢复运行。 + +为了解决这个问题,我们可以将Imap中的数据持久化到外部存储中,如HDFS、OSS等。这样即使所有节点都被停止,Imap中的数据也不会丢失,当集群节点再次启动后,所有之前正在运行的任务都会被自动恢复。 + +下面介绍如何使用 MapStore 持久化配置。有关详细信息,请参阅 [Hazelcast Map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) + +**type** + +imap 持久化的类型,目前仅支持 `hdfs`。 + +**namespace** + +它用于区分不同业务的数据存储位置,如 OSS 存储桶名称。 + +**clusterName** + +此参数主要用于集群隔离, 我们可以使用它来区分不同的集群,如 cluster1、cluster2,这也用于区分不同的业务。 + +**fs.defaultFS** + +我们使用 hdfs api 读写文件,因此使用此存储需要提供 hdfs 配置。 + +如果您使用 HDFS,可以像这样配置: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: hdfs://localhost:9000 +``` + +如果没有 HDFS,并且您的集群只有一个节点,您可以像这样配置使用本地文件: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: file:/// +``` + +如果您使用 OSS,可以像这样配置: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: oss + block.size: block size(bytes) + oss.bucket: oss://bucket name/ + fs.oss.accessKeyId: OSS access key id + fs.oss.accessKeySecret: OSS access key secret + fs.oss.endpoint: OSS endpoint +``` + +注意:使用OSS 时,确保 lib目录下有这几个jar. + +``` +aliyun-sdk-oss-3.13.2.jar +hadoop-aliyun-3.3.6.jar +jdom2-2.0.6.jar +netty-buffer-4.1.89.Final.jar +netty-common-4.1.89.Final.jar +seatunnel-hadoop3-3.1.4-uber.jar +``` + +## 6. 
配置 SeaTunnel Engine 客户端 + +所有 SeaTunnel Engine 客户端的配置都在 `hazelcast-client.yaml` 里。 + +### 6.1 cluster-name + +客户端必须与 SeaTunnel Engine 具有相同的 `cluster-name`。否则,SeaTunnel Engine 将拒绝客户端的请求。 + +### 6.2 网络 + +**cluster-members** + +需要将所有 SeaTunnel Engine 服务器节点的地址添加到这里。 + +```yaml +hazelcast-client: + cluster-name: seatunnel + properties: + hazelcast.logging.type: log4j2 + network: + cluster-members: + - hostname1:5801 +``` + +## 7. 启动 SeaTunnel Engine 服务器节点 + +可以通过守护进程使用 `-d` 参数启动。 + +```shell +mkdir -p $SEATUNNEL_HOME/logs +./bin/seatunnel-cluster.sh -d +``` + +日志将写入 `$SEATUNNEL_HOME/logs/seatunnel-engine-server.log` + +## 8. 安装 SeaTunnel Engine 客户端 + +您只需将 SeaTunnel Engine 节点上的 `$SEATUNNEL_HOME` 目录复制到客户端节点,并像 SeaTunnel Engine 服务器节点一样配置 `SEATUNNEL_HOME`。 + +## 9. 提交作业和管理作业 + +现在集群部署完成了,您可以通过以下教程完成作业的提交和管理:[提交和管理作业](user-command.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md new file mode 100644 index 000000000000..0230cfcca1a9 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md @@ -0,0 +1,35 @@ +--- + +sidebar_position: 4 +------------------- + +# 以Local模式运行作业 + +仅用于测试。 + +Local模式下每个任务都会启动一个独立的进程,任务运行完成后进程会退出。在该模式下有以下限制: + +1. 不支持任务的暂停、恢复。 +2. 不支持获取任务列表查看。 +3. 不支持通过命令取消作业,只能通过Kill进程的方式终止任务。 +4. 不支持RESTful API。 + +最推荐在生产环境中使用SeaTunnel Engine的[分离集群模式](separated-cluster-deployment.md) + +## 本地模式部署SeaTunnel Engine + +本地模式下,不需要部署SeaTunnel Engine集群,只需要使用如下命令即可提交作业即可。系统会在提交提交作业的进程中启动SeaTunnel Engine(Zeta)服务来运行提交的作业,作业完成后进程退出。 + +该模式下只需要将下载和制作好的安装包拷贝到需要运行的服务器上即可,如果需要调整作业运行的JVM参数,可以修改$SEATUNNEL_HOME/config/jvm_client_options文件。 + +## 提交作业 + +```shell +$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -e local +``` + +## 作业运维 + +Local模式下提交的作业会在提交作业的进程中运行,作业完成后进程会退出,如果要中止作业只需要退出提交作业的进程即可。作业的运行日志会输出到提交作业的进程的标准输出中。 + +不支持其它运维操作。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/resource-isolation.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/resource-isolation.md new file mode 100644 index 000000000000..a175e9c51ecc --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/resource-isolation.md @@ -0,0 +1,83 @@ +--- + +sidebar_position: 9 +------------------- + +在2.3.6版本之后, SeaTunnel支持对每个实例添加`tag`, 然后在提交任务时可以在配置文件中使用`tag_filter`来选择任务将要运行的节点. + +# 如何实现改功能 + +1. 更新`hazelcast.yaml`文件 + +```yaml +hazelcast: + cluster-name: seatunnel + network: + rest-api: + enabled: true + endpoint-groups: + CLUSTER_WRITE: + enabled: true + DATA: + enabled: true + join: + tcp-ip: + enabled: true + member-list: + - localhost + port: + auto-increment: false + port: 5801 + properties: + hazelcast.invocation.max.retry.count: 20 + hazelcast.tcp.join.port.try.count: 30 + hazelcast.logging.type: log4j2 + hazelcast.operation.generic.thread.count: 50 + member-attributes: + group: + type: string + value: platform + team: + type: string + value: team1 +``` + +在这个配置中, 我们通过`member-attributes`设置了`group=platform, team=team1`这样两个`tag` + +2. 
在任务的配置中添加`tag_filter`来选择你需要运行该任务的节点 + +```hacon +env { + parallelism = 1 + job.mode = "BATCH" + tag_filter { + group = "platform" + team = "team1" + } +} +source { + FakeSource { + result_table_name = "fake" + parallelism = 1 + schema = { + fields { + name = "string" + } + } + } +} +transform { +} +sink { + console { + source_table_name="fake" + } +} +``` + +**注意:** +- 当在任务的配置中, 没有添加`tag_filter`时, 会从所有节点中随机选择节点来运行任务. +- 当`tag_filter`中存在多个过滤条件时, 会根据key存在以及value相等的全部匹配的节点, 当没有找到匹配的节点时, 会抛出 `NoEnoughResourceException`异常. + +![img.png](/image_zh/resource-isolation.png) + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/rest-api.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/rest-api.md new file mode 100644 index 000000000000..1b0166425ba4 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/rest-api.md @@ -0,0 +1,490 @@ +--- + +sidebar_position: 11 +-------------------- + +# RESTful API + +SeaTunnel有一个用于监控的API,可用于查询运行作业的状态和统计信息,以及最近完成的作业。监控API是RESTful风格的,它接受HTTP请求并使用JSON数据格式进行响应。 + +## 概述 + +监控API是由运行的web服务提供的,它是节点运行的一部分,每个节点成员都可以提供rest API功能。 +默认情况下,该服务监听端口为5801,该端口可以在hazelcast.yaml中配置,如下所示: + +```yaml +network: + rest-api: + enabled: true + endpoint-groups: + CLUSTER_WRITE: + enabled: true + DATA: + enabled: true + join: + tcp-ip: + enabled: true + member-list: + - localhost + port: + auto-increment: true + port-count: 100 + port: 5801 +``` + +## API参考 + +### 返回Zeta集群的概览 + +
+ GET /hazelcast/rest/maps/overview?tag1=value1&tag2=value2 (Returns an overview over the Zeta engine cluster.) + +#### 参数 + +> | 参数名称 | 是否必传 | 参数类型 | 参数描述 | +> |--------|------|------|--------------------------| +> | tag键值对 | 否 | 字符串 | 一组标签值, 通过该标签值过滤满足条件的节点信息 | + +#### 响应 + +```json +{ + "projectVersion":"2.3.5-SNAPSHOT", + "gitCommitAbbrev":"DeadD0d0", + "totalSlot":"0", + "unassignedSlot":"0", + "works":"1", + "runningJobs":"0", + "finishedJobs":"0", + "failedJobs":"0", + "cancelledJobs":"0" +} +``` + +**注意:** +- 当你使用`dynamic-slot`时, 返回结果中的`totalSlot`和`unassignedSlot`将始终为0. 设置为固定的slot值后, 将正确返回集群中总共的slot数量以及未分配的slot数量. +- 当添加标签过滤后, `works`, `totalSlot`, `unassignedSlot`将返回满足条件的节点的相关指标. 注意`runningJobs`等job相关指标为集群级别结果, 无法根据标签进行过滤. + +
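+
+例如,可以使用 `curl` 调用该接口(假设集群运行在本机并使用默认端口 5801):
+
+```shell
+curl "http://localhost:5801/hazelcast/rest/maps/overview?tag1=value1"
+```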
+ +------------------------------------------------------------------------------------------ + +### 返回所有作业及其当前状态的概览。 + +
+ GET /hazelcast/rest/maps/running-jobs (返回所有作业及其当前状态的概览。) + +#### 参数 + +#### 响应 + +```json +[ + { + "jobId": "", + "jobName": "", + "jobStatus": "", + "envOptions": { + }, + "createTime": "", + "jobDag": { + "vertices": [ + ], + "edges": [ + ] + }, + "pluginJarsUrls": [ + ], + "isStartWithSavePoint": false, + "metrics": { + "sourceReceivedCount": "", + "sinkWriteCount": "" + } + } +] +``` + +
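+
+例如,可以使用 `curl` 查询当前所有作业(假设服务监听在本机默认端口 5801):
+
+```shell
+curl "http://localhost:5801/hazelcast/rest/maps/running-jobs"
+```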
+ +------------------------------------------------------------------------------------------ + +### 返回作业的详细信息。 + +
+ GET /hazelcast/rest/maps/job-info/:jobId (返回作业的详细信息。) + +#### 参数 + +> | 参数名称 | 是否必传 | 参数类型 | 参数描述 | +> |-------|------|------|--------| +> | jobId | 是 | long | job id | + +#### 响应 + +```json +{ + "jobId": "", + "jobName": "", + "jobStatus": "", + "createTime": "", + "jobDag": { + "vertices": [ + ], + "edges": [ + ] + }, + "metrics": { + "sourceReceivedCount": "", + "sinkWriteCount": "" + }, + "finishedTime": "", + "errorMsg": null, + "envOptions": { + }, + "pluginJarsUrls": [ + ], + "isStartWithSavePoint": false +} +``` + +`jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` 字段总会返回. +`envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` 字段在Job在RUNNING状态时会返回 +`finishedTime`, `errorMsg` 字段在Job结束时会返回,结束状态为不为RUNNING,可能为FINISHED,可能为CANCEL + +当我们查询不到这个Job时,返回结果为: + +```json +{ + "jobId" : "" +} +``` + +
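+
+例如,使用 `curl` 按 jobId 查询单个作业的详细信息(jobId 取自下文提交作业示例的返回值,端口为默认的 5801):
+
+```shell
+curl "http://localhost:5801/hazelcast/rest/maps/job-info/733584788375666689"
+```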
+ +------------------------------------------------------------------------------------------ + +### 返回作业的详细信息 + +此API已经弃用,请使用/hazelcast/rest/maps/job-info/:jobId替代。 + +
+ GET /hazelcast/rest/maps/running-job/:jobId (返回作业的详细信息。) + +#### 参数 + +> | 参数名称 | 是否必传 | 参数类型 | 参数描述 | +> |-------|------|------|--------| +> | jobId | 是 | long | job id | + +#### 响应 + +```json +{ + "jobId": "", + "jobName": "", + "jobStatus": "", + "createTime": "", + "jobDag": { + "vertices": [ + ], + "edges": [ + ] + }, + "metrics": { + "sourceReceivedCount": "", + "sinkWriteCount": "" + }, + "finishedTime": "", + "errorMsg": null, + "envOptions": { + }, + "pluginJarsUrls": [ + ], + "isStartWithSavePoint": false +} +``` + +`jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` 字段总会返回. +`envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` 字段在Job在RUNNING状态时会返回 +`finishedTime`, `errorMsg` 字段在Job结束时会返回,结束状态为不为RUNNING,可能为FINISHED,可能为CANCEL + +当我们查询不到这个Job时,返回结果为: + +```json +{ + "jobId" : "" +} +``` + +
+ +------------------------------------------------------------------------------------------ + +### 返回所有已完成的作业信息。 + +
+ GET /hazelcast/rest/maps/finished-jobs/:state (返回所有已完成的作业信息。) + +#### 参数 + +> | 参数名称 | 是否必传 | 参数类型 | 参数描述 | +> |-------|----------|--------|------------------------------------------------------------------| +> | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`UNKNOWABLE` | + +#### 响应 + +```json +[ + { + "jobId": "", + "jobName": "", + "jobStatus": "", + "errorMsg": null, + "createTime": "", + "finishTime": "", + "jobDag": "", + "metrics": "" + } +] +``` + +
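+
+例如,使用 `curl` 查询所有状态为 `FINISHED` 的已完成作业(端口为默认的 5801):
+
+```shell
+curl "http://localhost:5801/hazelcast/rest/maps/finished-jobs/FINISHED"
+```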
+ +------------------------------------------------------------------------------------------ + +### 返回系统监控信息。 + +
+ GET /hazelcast/rest/maps/system-monitoring-information (返回系统监控信息。) + +#### 参数 + +#### 响应 + +```json +[ + { + "processors":"8", + "physical.memory.total":"16.0G", + "physical.memory.free":"16.3M", + "swap.space.total":"0", + "swap.space.free":"0", + "heap.memory.used":"135.7M", + "heap.memory.free":"440.8M", + "heap.memory.total":"576.5M", + "heap.memory.max":"3.6G", + "heap.memory.used/total":"23.54%", + "heap.memory.used/max":"3.73%", + "minor.gc.count":"6", + "minor.gc.time":"110ms", + "major.gc.count":"2", + "major.gc.time":"73ms", + "load.process":"24.78%", + "load.system":"60.00%", + "load.systemAverage":"2.07", + "thread.count":"117", + "thread.peakCount":"118", + "cluster.timeDiff":"0", + "event.q.size":"0", + "executor.q.async.size":"0", + "executor.q.client.size":"0", + "executor.q.client.query.size":"0", + "executor.q.client.blocking.size":"0", + "executor.q.query.size":"0", + "executor.q.scheduled.size":"0", + "executor.q.io.size":"0", + "executor.q.system.size":"0", + "executor.q.operations.size":"0", + "executor.q.priorityOperation.size":"0", + "operations.completed.count":"10", + "executor.q.mapLoad.size":"0", + "executor.q.mapLoadAllKeys.size":"0", + "executor.q.cluster.size":"0", + "executor.q.response.size":"0", + "operations.running.count":"0", + "operations.pending.invocations.percentage":"0.00%", + "operations.pending.invocations.count":"0", + "proxy.count":"8", + "clientEndpoint.count":"0", + "connection.active.count":"2", + "client.connection.count":"0", + "connection.count":"0" + } +] +``` + +
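+
+例如,使用 `curl` 获取系统监控信息(端口为默认的 5801):
+
+```shell
+curl "http://localhost:5801/hazelcast/rest/maps/system-monitoring-information"
+```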
+ +------------------------------------------------------------------------------------------ + +### 提交作业。 + +
+POST /hazelcast/rest/maps/submit-job (如果作业提交成功,返回jobId和jobName。) + +#### 参数 + +> | 参数名称 | 是否必传 | 参数类型 | 参数描述 | +> |----------------------|----------|--------|-----------------------------------| +> | jobId | optional | string | job id | +> | jobName | optional | string | job name | +> | isStartWithSavePoint | optional | string | if job is started with save point | + +#### 请求体 + +```json +{ + "env": { + "job.mode": "batch" + }, + "source": [ + { + "plugin_name": "FakeSource", + "result_table_name": "fake", + "row.num": 100, + "schema": { + "fields": { + "name": "string", + "age": "int", + "card": "int" + } + } + } + ], + "transform": [ + ], + "sink": [ + { + "plugin_name": "Console", + "source_table_name": ["fake"] + } + ] +} +``` + +#### 响应 + +```json +{ + "jobId": 733584788375666689, + "jobName": "rest_api_test" +} +``` + +
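+
+例如,假设已将上文的请求体保存为 `job.json`,可以使用 `curl` 提交作业(`jobName` 为可选的查询参数,端口为默认的 5801):
+
+```shell
+curl -X POST -H "Content-Type: application/json" \
+  -d @job.json \
+  "http://localhost:5801/hazelcast/rest/maps/submit-job?jobName=rest_api_test"
+```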
+ +------------------------------------------------------------------------------------------ + +### 停止作业。 + +
+POST /hazelcast/rest/maps/stop-job (如果作业成功停止,返回jobId。) + +#### 请求体 + +```json +{ + "jobId": 733584788375666689, + "isStopWithSavePoint": false # if job is stopped with save point +} +``` + +#### 响应 + +```json +{ +"jobId": 733584788375666689 +} +``` + +
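+
+例如,使用 `curl` 停止指定作业(jobId 与上文请求体示例一致,端口为默认的 5801):
+
+```shell
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"jobId": 733584788375666689, "isStopWithSavePoint": false}' \
+  "http://localhost:5801/hazelcast/rest/maps/stop-job"
+```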
+ +------------------------------------------------------------------------------------------ + +### 加密配置。 + +
+POST /hazelcast/rest/maps/encrypt-config (如果配置加密成功,则返回加密后的配置。) +有关自定义加密的更多信息,请参阅文档[配置-加密-解密](../connector-v2/Config-Encryption-Decryption.md). + +#### 请求体 + +```json +{ + "env": { + "parallelism": 1, + "shade.identifier":"base64" + }, + "source": [ + { + "plugin_name": "MySQL-CDC", + "schema" : { + "fields": { + "name": "string", + "age": "int" + } + }, + "result_table_name": "fake", + "parallelism": 1, + "hostname": "127.0.0.1", + "username": "seatunnel", + "password": "seatunnel_password", + "table-name": "inventory_vwyw0n" + } + ], + "transform": [ + ], + "sink": [ + { + "plugin_name": "Clickhouse", + "host": "localhost:8123", + "database": "default", + "table": "fake_all", + "username": "seatunnel", + "password": "seatunnel_password" + } + ] +} +``` + +#### 响应 + +```json +{ + "env": { + "parallelism": 1, + "shade.identifier": "base64" + }, + "source": [ + { + "plugin_name": "MySQL-CDC", + "schema": { + "fields": { + "name": "string", + "age": "int" + } + }, + "result_table_name": "fake", + "parallelism": 1, + "hostname": "127.0.0.1", + "username": "c2VhdHVubmVs", + "password": "c2VhdHVubmVsX3Bhc3N3b3Jk", + "table-name": "inventory_vwyw0n" + } + ], + "transform": [], + "sink": [ + { + "plugin_name": "Clickhouse", + "host": "localhost:8123", + "database": "default", + "table": "fake_all", + "username": "c2VhdHVubmVs", + "password": "c2VhdHVubmVsX3Bhc3N3b3Jk" + } + ] +} +``` + +
+ diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/savepoint.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/savepoint.md new file mode 100644 index 000000000000..b1bab640e5e4 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/savepoint.md @@ -0,0 +1,26 @@ +--- + +sidebar_position: 8 +------------------- + +# 使用保存点和通过保存点恢复 + +保存点是使用检查点创建的,它是作业执行状态的全局镜像,可以用于作业或 SeaTunnel 的停止和恢复、升级等。 + +## 使用保存点 + +要使用保存点, 您需要确保作业使用的连接器支持检查点,否则数据可能会丢失或重复。 + +1. 确保作业正在运行。 + +2. 使用以下命令触发保存点: + + ```./bin/seatunnel.sh -s {jobId}``` + +执行成功后,检查点数据将被保存,任务将结束。 + +## 使用保存点进行恢复 + +通过 `jobId` 使用保存点来恢复作业。 + +```./bin/seatunnel.sh -c {jobConfig} -r {jobId}``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md new file mode 100644 index 000000000000..ce328d3bd572 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md @@ -0,0 +1,433 @@ +--- + +sidebar_position: 6 +------------------- + +# 部署 SeaTunnel Engine 分离模式集群 + +SeaTunnel Engine 的Master服务和Worker服务分离,每个服务单独一个进程。Master节点只负责作业调度,RESTful API,任务提交等,Imap数据只存储在Master节点中。Worker节点只负责任务的执行,不参与选举成为master,也不存储Imap数据。 + +在所有Master节点中,同一时间只有一个Master节点工作,其他Master节点处于standby状态。当当前Master节点宕机或心跳超时,会从其它Master节点中选举出一个新的Master Active节点。 + +这是最推荐的一种使用方式,在该模式下Master的负载会很小,Master有更多的资源用来进行作业的调度,任务的容错指标监控以及提供rest api服务等,会有更高的稳定性。同时Worker节点不存储Imap的数据,所有的Imap数据都存储在Master节点中,即使Worker节点负载高或者挂掉,也不会导致Imap数据重新分布。 + +## 1. 下载 + +[下载和制作SeaTunnel安装包](download-seatunnel.md) + +## 2 配置 SEATUNNEL_HOME + +您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: + +``` +export SEATUNNEL_HOME=${seatunnel install path} +export PATH=$PATH:$SEATUNNEL_HOME/bin +``` + +## 3. 配置 Master 节点 JVM 选项 + +Master节点的JVM参数在`$SEATUNNEL_HOME/config/jvm_master_options`文件中配置。 + +```shell +# JVM Heap +-Xms2g +-Xmx2g + +# JVM Dump +-XX:+HeapDumpOnOutOfMemoryError +-XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server + +# Metaspace +-XX:MaxMetaspaceSize=2g + +# G1GC +-XX:+UseG1GC + +``` + +Worker节点的JVM参数在`$SEATUNNEL_HOME/config/jvm_worker_options`文件中配置。 + +```shell +# JVM Heap +-Xms2g +-Xmx2g + +# JVM Dump +-XX:+HeapDumpOnOutOfMemoryError +-XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server + +# Metaspace +-XX:MaxMetaspaceSize=2g + +# G1GC +-XX:+UseG1GC + +``` + +## 4. 配置 SeaTunnel Engine + +SeaTunnel Engine 提供许多功能,需要在 `seatunnel.yaml` 中进行配置。. 
+ +### 4.1 Imap中数据的备份数设置(该参数在Worker节点无效) + +SeaTunnel Engine 基于 [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/) 实现集群管理。集群的状态数据(作业运行状态、资源状态)存储在 [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map)。 +存储在 Hazelcast IMap 中的数据将在集群的所有节点上分布和存储。Hazelcast 会分区存储在 Imap 中的数据。每个分区可以指定备份数量。 +因此,SeaTunnel Engine 可以实现集群 HA,无需使用其他服务(例如 zookeeper)。 + +`backup count` 是定义同步备份数量的参数。例如,如果设置为 1,则分区的备份将放置在一个其他成员上。如果设置为 2,则将放置在两个其他成员上。 + +我们建议 `backup-count` 的值为 `min(1, max(5, N/2))`。 `N` 是集群节点的数量。 + +```yaml +seatunnel: + engine: + backup-count: 1 + # 其他配置 +``` + +:::tip + +由于在分离集群模式下,Worker节点不存储Imap数据,因此Worker节点的`backup-count`配置无效。如果Master和Worker进程在同一个机器上启动,Master和Worker会共用`seatunnel.yaml`配置文件,此时Worker节点服务会忽略`backup-count`配置。 + +::: + +### 4.2 Slot配置(该参数在Master节点无效) + +Slot数量决定了集群节点可以并行运行的任务组数量。一个任务需要的Slot的个数公式为 N = 2 + P(任务配置的并行度)。 默认情况下SeaTunnel Engine的slot个数为动态,即不限制个数。我们建议slot的个数设置为节点CPU核心数的2倍。 + +动态slot个数(默认)配置如下: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: true + # 其他配置 +``` + +静态slot个数配置如下: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: false + slot-num: 20 +``` + +:::tip + +由于在分离集群模式下,Master节点不运行任务,所以Master服务不会启动Slot服务,因此Master节点的`slot-service`配置无效。如果Master和Worker进程在同一个机器上启动,Master和Worker会共用`seatunnel.yaml`配置文件,此时Master节点服务会忽略`slot-service`配置。 + +::: + +### 4.3 检查点管理器(该参数在Worker节点无效) + +与 Flink 一样,SeaTunnel Engine 支持 Chandy–Lamport 算法。因此,可以实现无数据丢失和重复的数据同步。 + +**interval** + +两个检查点之间的间隔,单位是毫秒。如果在作业配置文件的 `env` 中配置了 `checkpoint.interval` 参数,将以作业配置文件中设置的为准。 + +**timeout** + +检查点的超时时间。如果在超时时间内无法完成检查点,则会触发检查点失败,作业失败。如果在作业的配置文件的`env`中配置了`checkpoint.timeout`参数,将以作业配置文件中设置的为准。 + +示例 + +```yaml +seatunnel: + engine: + backup-count: 1 + print-execution-info-interval: 10 + slot-service: + dynamic-slot: true + checkpoint: + interval: 300000 + timeout: 10000 +``` + +**checkpoint storage** + +检查点是一种容错恢复机制。这种机制确保程序在运行时,即使突然遇到异常,也能自行恢复。检查点定时触发,每次检查点进行时每个Task都会被要求将自身的状态信息(比如读取kafka时读取到了哪个offset)上报给检查点线程,由该线程写入一个分布式存储(或共享存储)。当任务失败然后自动容错恢复时,或者通过seatunnel.sh -r 指令恢复之前被暂停的任务时,会从检查点存储中加载对应作业的状态信息,并基于这些状态信息进行作业的恢复。 + +如果集群的节点大于1,检查点存储必须是一个分布式存储,或者共享存储,这样才能保证任意节点挂掉后依然可以在另一个节点加载到存储中的任务状态信息。 + +:::tip + +检查点配置只有Master服务才会读取,Worker服务不会读取检查点配置。如果Master和Worker进程在同一个机器上启动,Master和Worker会共用`seatunnel.yaml`配置文件,此时Worker节点服务会忽略`checkpoint`配置。 + +::: + +有关检查点存储的信息,您可以查看 [Checkpoint Storage](checkpoint-storage.md) + +### 4.4 历史作业过期配置 + +每个完成的作业的信息,如状态、计数器和错误日志,都存储在 IMap 对象中。随着运行作业数量的增加,内存会增加,最终内存将溢出。因此,您可以调整 `history-job-expire-minutes` 参数来解决这个问题。此参数的时间单位是分钟。默认值是 1440 分钟,即一天。 + +示例 + +```yaml +seatunnel: + engine: + history-job-expire-minutes: 1440 +``` + +### 4.5 类加载器缓存模式 + +此配置主要解决不断创建和尝试销毁类加载器所导致的资源泄漏问题。 +如果您遇到与metaspace空间溢出相关的异常,您可以尝试启用此配置。 +为了减少创建类加载器的频率,在启用此配置后,SeaTunnel 在作业完成时不会尝试释放相应的类加载器,以便它可以被后续作业使用,也就是说,当运行作业中使用的 Source/Sink 连接器类型不是太多时,它更有效。 +默认值是 false。 +示例 + +```yaml +seatunnel: + engine: + classloader-cache-mode: true +``` + +### 4.6 IMap持久化配置(该参数在Worker节点无效) + +:::tip + +由于在分离集群模式下,只有Master节点存储Imap数据,Worker节点不存储Imap数据,所以Worker服务不会读取该参数项。 + +::: + +在SeaTunnel中,我们使用IMap(一种分布式的Map,可以实现数据跨节点跨进程的写入的读取 有关详细信息,请参阅 [Hazelcast Map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) 来存储每个任务及其task的状态,以便在任务所在节点宕机后,可以在其他节点上获取到任务之前的状态信息,从而恢复任务实现任务的容错。 + +默认情况下Imap的信息只是存储在内存中,我们可以设置Imap数据的复本数,具体可参考(4.1 Imap中数据的备份数设置),如果复本数是2,代表每个数据会同时存储在2个不同的节点中。一旦节点宕机,Imap中的数据会重新在其它节点上自动补充到设置的复本数。但是当所有节点都被停止后,Imap中的数据会丢失。当集群节点再次启动后,所有之前正在运行的任务都会被标记为失败,需要用户手工通过seatunnel.sh -r 指令恢复运行。 + +为了解决这个问题,我们可以将Imap中的数据持久化到外部存储中,如HDFS、OSS等。这样即使所有节点都被停止,Imap中的数据也不会丢失,当集群节点再次启动后,所有之前正在运行的任务都会被自动恢复。 
+ +下面介绍如何使用 MapStore 持久化配置。有关详细信息,请参阅 [Hazelcast Map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) + +**type** + +imap 持久化的类型,目前仅支持 `hdfs`。 + +**namespace** + +它用于区分不同业务的数据存储位置,如 OSS 存储桶名称。 + +**clusterName** + +此参数主要用于集群隔离, 我们可以使用它来区分不同的集群,如 cluster1、cluster2,这也用于区分不同的业务。 + +**fs.defaultFS** + +我们使用 hdfs api 读写文件,因此使用此存储需要提供 hdfs 配置。 + +如果您使用 HDFS,可以像这样配置: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: hdfs://localhost:9000 +``` + +如果没有 HDFS,并且您的集群只有一个节点,您可以像这样配置使用本地文件: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: file:/// +``` + +如果您使用 OSS,可以像这样配置: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: oss + block.size: block size(bytes) + oss.bucket: oss://bucket name/ + fs.oss.accessKeyId: OSS access key id + fs.oss.accessKeySecret: OSS access key secret + fs.oss.endpoint: OSS endpoint +``` + +注意:使用OSS 时,确保 lib目录下有这几个jar. + +``` +aliyun-sdk-oss-3.13.2.jar +hadoop-aliyun-3.3.6.jar +jdom2-2.0.6.jar +netty-buffer-4.1.89.Final.jar +netty-common-4.1.89.Final.jar +seatunnel-hadoop3-3.1.4-uber.jar +``` + +## 5. 配置 SeaTunnel Engine 网络服务 + +所有 SeaTunnel Engine 网络相关的配置都在 `hazelcast-master.yaml`和`hazelcast-worker.yaml` 文件中. 
+ +### 5.1 集群名称 + +SeaTunnel Engine 节点使用 `cluster-name` 来确定另一个节点是否与自己在同一集群中。如果两个节点之间的集群名称不同,SeaTunnel 引擎将拒绝服务请求。 + +### 5.2 网络 + +基于 [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), 一个 SeaTunnel Engine 集群是由运行 SeaTunnel Engine 服务器的集群成员组成的网络。 集群成员自动加入一起形成集群。这种自动加入是通过集群成员使用的各种发现机制来相互发现的。 + +请注意,集群形成后,集群成员之间的通信始终通过 TCP/IP 进行,无论使用的发现机制如何。 + +SeaTunnel Engine 使用以下发现机制。 + +#### TCP + +您可以将 SeaTunnel Engine 配置为完整的 TCP/IP 集群。有关配置详细信息,请参阅 [Discovering Members by TCP section](tcp.md)。 + +在分离集群模式下,Master和Worker服务使用不同的端口。 + +Master节点网络配置 `hazelcast-master.yaml` + +```yaml + +hazelcast: + cluster-name: seatunnel + network: + rest-api: + enabled: true + endpoint-groups: + CLUSTER_WRITE: + enabled: true + DATA: + enabled: true + join: + tcp-ip: + enabled: true + member-list: + - master-node-1:5801 + - master-node-2:5801 + - worker-node-1:5802 + - worker-node-2:5802 + port: + auto-increment: false + port: 5801 + properties: + hazelcast.heartbeat.failuredetector.type: phi-accrual + hazelcast.heartbeat.interval.seconds: 2 + hazelcast.max.no.heartbeat.seconds: 180 + hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 + hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 + hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 +``` + +Worker节点网络配置 `hazelcast-worker.yaml` + +```yaml + +hazelcast: + cluster-name: seatunnel + network: + join: + tcp-ip: + enabled: true + member-list: + - master-node-1:5801 + - master-node-2:5801 + - worker-node-1:5802 + - worker-node-2:5802 + port: + auto-increment: false + port: 5802 + properties: + hazelcast.heartbeat.failuredetector.type: phi-accrual + hazelcast.heartbeat.interval.seconds: 2 + hazelcast.max.no.heartbeat.seconds: 180 + hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 + hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 + hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 +``` + +TCP 是我们建议在独立 SeaTunnel Engine 集群中使用的方式。 + +另一方面,Hazelcast 提供了一些其他的服务发现方法。有关详细信息,请参阅 [Hazelcast Network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters) + +## 6. 启动 SeaTunnel Engine Master 节点 + +可以通过守护进程使用 `-d` 参数启动。 + +```shell +mkdir -p $SEATUNNEL_HOME/logs +./bin/seatunnel-cluster.sh -d -r master +``` + +日志将写入 `$SEATUNNEL_HOME/logs/seatunnel-engine-master.log` + +## 7. 启动 SeaTunnel Engine Worker 节点 + +可以通过守护进程使用 `-d` 参数启动。 + +```shell +mkdir -p $SEATUNNEL_HOME/logs +./bin/seatunnel-cluster.sh -d -r worker +``` + +日志将写入 `$SEATUNNEL_HOME/logs/seatunnel-engine-worker.log` + +## 8. 安装 SeaTunnel Engine 客户端 + +### 8.1 和服务端一样设置`SEATUNNEL_HOME` + +您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: + +``` +export SEATUNNEL_HOME=${seatunnel install path} +export PATH=$PATH:$SEATUNNEL_HOME/bin +``` + +### 8.2 配置 SeaTunnel Engine 客户端 + +所有 SeaTunnel Engine 客户端的配置都在 `hazelcast-client.yaml` 里。 + +**cluster-name** + +客户端必须与 SeaTunnel Engine 具有相同的 `cluster-name`。否则,SeaTunnel Engine 将拒绝客户端的请求。 + +**network** + +需要将所有 SeaTunnel Engine Master节点的地址添加到这里。 + +```yaml +hazelcast-client: + cluster-name: seatunnel + properties: + hazelcast.logging.type: log4j2 + network: + cluster-members: + - master-node-1:5801 + - master-node-2:5801 +``` + +## 9. 
提交作业和管理作业 + +现在集群部署完成了,您可以通过以下教程完成作业的提交和管理:[提交和管理作业](user-command.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/tcp.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/tcp.md new file mode 100644 index 000000000000..256bb01fe6b0 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/tcp.md @@ -0,0 +1,37 @@ +--- + +sidebar_position: 10 +-------------------- + +# TCP NetWork + +如果您的环境中多播不是首选的发现方式,那么您可以将 SeaTunnel 引擎配置为一个完整的 TCP/IP 集群。当您通过 TCP/IP 配置 SeaTunnel 引擎以发现成员时,您必须将所有或一部分成员的主机名和/或 IP 地址列为集群成员。您不必列出所有这些集群成员,但在新成员加入时,至少有一个列出的成员必须是活跃的。 + +要配置您的 Hazelcast 作为一个完整的 TCP/IP 集群,请设置以下配置元素。有关 TCP/IP 发现配置元素的完整描述,请参见 tcp-ip 元素部分。 + +- 将 tcp-ip 元素的 enabled 属性设置为 true。 +- 在 tcp-ip 元素内提供您的成员元素。 + +以下是一个示例声明性配置。 + +```yaml +hazelcast: + network: + join: + tcp-ip: + enabled: true + member-list: + - machine1 + - machine2 + - machine3:5799 + - 192.168.1.0-7 + - 192.168.1.21 +``` + +如上所示,您可以为成员元素提供 IP 地址或主机名。您还可以提供一个 IP 地址范围,例如 `192.168.1.0-7`. + +除了像上面展示的那样逐行提供成员外,您还可以选择使用 members 元素并写入逗号分隔的 IP 地址,如下所示。 + +`192.168.1.0-7,192.168.1.21` + +如果您没有为成员提供端口,Hazelcast 会自动尝试端口 `5701`, `5702` 等。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/user-command.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/user-command.md new file mode 100644 index 000000000000..d4d06d25a780 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/seatunnel-engine/user-command.md @@ -0,0 +1,139 @@ +--- + +sidebar_position: 12 +-------------------- + +# 命令行工具 + +SeaTunnel Engine 提供了一个命令行工具,用于管理 SeaTunnel Engine 的作业。您可以使用命令行工具提交、停止、暂停、恢复、删除作业,查看作业状态和监控指标等。 + +可以通过如下命令获取命令行工具的帮助信息: + +```shell +bin/seatunnel.sh -h +``` + +输出如下: + +```shell + +Usage: seatunnel.sh [options] + Options: + --async Run the job asynchronously, when the job + is submitted, the client will exit + (default: false) + -can, --cancel-job Cancel job by JobId + --check Whether check config (default: false) + -cj, --close-job Close client the task will also be closed + (default: true) + -cn, --cluster The name of cluster + -c, --config Config file + --decrypt Decrypt config file, When both --decrypt + and --encrypt are specified, only + --encrypt will take effect (default: + false) + -m, --master, -e, --deploy-mode SeaTunnel job submit master, support + [local, cluster] (default: cluster) + --encrypt Encrypt config file, when both --decrypt + and --encrypt are specified, only + --encrypt will take effect (default: + false) + --get_running_job_metrics Gets metrics for running jobs (default: + false) + -h, --help Show the usage message + -j, --job-id Get job status by JobId + -l, --list list job status (default: false) + --metrics Get job metrics by JobId + -n, --name SeaTunnel job name (default: SeaTunnel) + -r, --restore restore with savepoint by jobId + -s, --savepoint savepoint job by jobId + -i, --variable Variable substitution, such as -i + city=beijing, or -i date=20190318.We use + ',' as separator, when inside "", ',' are + treated as normal characters instead of + delimiters. 
(default: []) + +``` + +## 提交作业 + +```shell +bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template +``` + +**--async** 参数可以让作业在后台运行,当作业提交后,客户端会退出。 + +```shell +./bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async +``` + +**-n** 或 **--name** 参数可以指定作业的名称 + +```shell +./bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async -n myjob +``` + +## 查看作业列表 + +```shell +./bin/seatunnel.sh -l +``` + +该命令会输出所有当前集群中的作业列表(包含运行完成的历史作业和正在运行的作业) + +## 查看作业状态 + +```shell +./bin/seatunnel.sh -j <jobId> +``` + +该命令会输出指定作业的状态信息 + +## 获取正在运行的作业监控信息 + +```shell +./bin/seatunnel.sh --get_running_job_metrics +``` + +该命令会输出正在运行的作业的监控信息 + +## 获取指定作业监控信息 + +--metrics 参数可以获取指定作业的监控信息 + +```shell +./bin/seatunnel.sh --metrics <jobId> +``` + +## 暂停作业 + +```shell +./bin/seatunnel.sh -s <jobId> +``` + +该命令会暂停指定作业,注意,只有开启了checkpoint的作业才支持暂停作业(实时同步作业默认开启checkpoint,批处理作业默认不开启checkpoint需要通过在 `env` 中配置checkpoint.interval来开启checkpoint)。 + +暂停作业是以split为最小单位的,即暂停作业后,会等待当前正在运行的split运行完成后再暂停。任务恢复后,会从暂停的split继续运行。 + +## 恢复作业 + +```shell +./bin/seatunnel.sh -r <jobId> -c $SEATUNNEL_HOME/config/v2.batch.config.template +``` + +该命令会恢复指定作业,注意,只有开启了checkpoint的作业才支持恢复作业(实时同步作业默认开启checkpoint,批处理作业默认不开启checkpoint需要通过在 `env` 中配置checkpoint.interval来开启checkpoint)。 + +恢复作业需要指定jobId和作业的配置文件。 + +运行失败的作业和通过seatunnel.sh -s <jobId>暂停的作业都可以通过该命令恢复。 + +## 取消作业 + +```shell +./bin/seatunnel.sh -can <jobId> +``` + +该命令会取消指定作业,取消作业后,作业会被停止,作业的状态会变为`CANCELED`。 + +被cancel的作业的所有断点信息都将被删除,无法通过seatunnel.sh -r <jobId>恢复。 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/deployment.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/deployment.md new file mode 100644 index 000000000000..167abeaeaab9 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/deployment.md @@ -0,0 +1,68 @@ +--- + +sidebar_position: 1 +------------------- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# 本地部署 + +## 步骤 1: 准备工作 + +在开始本地运行前,您需要确保您已经安装了SeaTunnel所需要的以下软件: + +* 安装[Java](https://www.java.com/en/download/) (Java 8 或 11, 其他高于Java 8的版本理论上也可以工作) 以及设置 `JAVA_HOME`。 + +## 步骤 2: 下载 SeaTunnel + +进入[SeaTunnel下载页面](https://seatunnel.apache.org/download)下载最新版本的二进制安装包`seatunnel--bin.tar.gz` + +或者您也可以通过终端下载: + +```shell +export version="2.3.7" +wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" +tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" +``` + +## 步骤 3: 下载连接器插件 + +从2.2.0-beta版本开始,二进制包不再默认提供连接器依赖,因此在第一次使用时,您需要执行以下命令来安装连接器:(当然,您也可以从 [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) 手动下载连接器,然后将其移动至`connectors/seatunnel`目录下)。 + +```bash +sh bin/install-plugin.sh +``` + +如果您需要指定的连接器版本,以2.3.7为例,您需要执行如下命令: + +```bash +sh bin/install-plugin.sh 2.3.7 +``` + +通常您并不需要所有的连接器插件,可以通过配置`config/plugin_config`来指定您所需要的插件,例如,您只需要`connector-console`插件,那么您可以修改plugin.properties配置文件如下: + +```plugin_config +--seatunnel-connectors-- +connector-console +--end-- +``` + +如果您希望示例应用程序能正常工作,那么您需要添加以下插件: + +```plugin_config +--seatunnel-connectors-- +connector-fake +connector-console +--end-- +``` + +您可以在`${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`下找到所有支持的连接器和相应的plugin_config配置名称。 + +:::tip 提示 + +如果您想通过手动下载连接器的方式来安装连接器插件,则需要下载您所需要的连接器插件即可,并将它们放在`${SEATUNNEL_HOME}/connectors/`目录下。 + +::: + +现在,您已经完成了SeaTunnel部署。您可以按照[快速开始](quick-start-seatunnel-engine.md)来配置并运行数据同步作业了。 diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-flink.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-flink.md new file mode 100644 index 000000000000..09189c91dcef --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-flink.md @@ -0,0 +1,111 @@ +--- + +sidebar_position: 3 +------------------- + +# Flink Engine快速开始 + +## 步骤 1: 部署SeaTunnel及连接器 + +在开始前,请确保您已经按照[部署](deployment.md)中的描述下载并部署了SeaTunnel。 + +## 步骤 2: 部署并配置Flink + +请先[下载Flink](https://flink.apache.org/downloads.html)(**需要版本 >= 1.12.0**)。更多信息您可以查看[入门: Standalone模式](https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/deployment/resource-providers/standalone/overview/) + +**配置SeaTunnel**: 修改`config/seatunnel-env.sh`中的设置,将`FLINK_HOME`配置设置为Flink的部署目录。 + +## 步骤 3: 添加作业配置文件来定义作业 + +编辑`config/v2.streaming.conf.template`,它决定了SeaTunnel启动后数据输入、处理和输出的方式及逻辑。 +下面是配置文件的示例,它与上面提到的示例应用程序相同。 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} + +``` + +关于配置的更多信息请查看[配置的基本概念](../../concept/config.md) + +## 步骤 4: 运行SeaTunnel应用程序 + +您可以通过以下命令启动应用程序: + +Flink版本`1.12.x`到`1.14.x` + +```shell +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-flink-13-connector-v2.sh --config ./config/v2.streaming.conf.template +``` + +Flink版本`1.15.x`到`1.16.x` + +```shell +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-flink-15-connector-v2.sh --config ./config/v2.streaming.conf.template +``` + +**查看输出**: 当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 + +SeaTunnel控制台将会打印一些如下日志信息: + +```shell +fields : name, age +types : STRING, INT +row=1 : elWaB, 1984352560 +row=2 : uAtnp, 762961563 +row=3 : TQEIB, 2042675010 +row=4 : DcFjo, 593971283 +row=5 : SenEb, 2099913608 +row=6 : DHjkg, 1928005856 +row=7 : eScCM, 526029657 +row=8 : sgOeE, 600878991 +row=9 : gwdvw, 1951126920 +row=10 : nSiKE, 488708928 +row=11 : xubpl, 1420202810 +row=12 : rHZqb, 331185742 +row=13 : rciGD, 1112878259 +row=14 : qLhdI, 1457046294 +row=15 : ZTkRx, 1240668386 +row=16 : SGZCr, 94186144 +``` + +## 此外 + +现在,您已经快速浏览了SeaTunnel使用Flink引擎的方式,可以通过在[连接器](/docs/category/connector-v2)中找到SeaTunnel所支持的所有sources和sinks。 +如果您想要了解更多关于SeaTunnel运行在Flink上的信息,请参阅[基于Flink的SeaTunnel](../../other-engine/flink.md)。 + +SeaTunnel有内置的`Zeta`引擎,它是作为SeaTunnel的默认引擎。您可以参考[快速开始](quick-start-seatunnel-engine.md)配置和运行数据同步作业。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md new file mode 100644 index 000000000000..cd7a9e88e3ea --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md @@ -0,0 +1,100 @@ +--- + +sidebar_position: 2 +------------------- + +# SeaTunnel Engine快速开始 + +## 步骤 1: 部署SeaTunnel及连接器 + +在开始前,请确保您已经按照[部署](deployment.md)中的描述下载并部署了SeaTunnel。 + +## 步骤 2: 添加作业配置文件来定义作业 + +编辑`config/v2.batch.config.template`,它决定了当seatunnel启动后数据输入、处理和输出的方式及逻辑。 +下面是配置文件的示例,它与上面提到的示例应用程序相同。 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = 
"fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} + +``` + +关于配置的更多信息请查看[配置的基本概念](../../concept/config.md) + +## 步骤 3: 运行SeaTunnel应用程序 + +您可以通过以下命令启动应用程序: + +:::tip + +从2.3.1版本开始,seatunnel.sh中的-e参数被废弃,请改用-m参数。 + +::: + +```shell +cd "apache-seatunnel-${version}" +./bin/seatunnel.sh --config ./config/v2.batch.config.template -m local + +``` + +**查看输出**: 当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 + +SeaTunnel控制台将会打印一些如下日志信息: + +```shell +2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age +2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=11: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: GSvzm, 827085798 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=12: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: NNAYI, 94307133 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=13: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: EexFl, 1823689599 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=14: 
SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CBXUb, 869582787 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=15: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: Wbxtm, 1469371353 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=16: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: mIJDt, 995616438 +``` + +## 此外 + +现在,您已经快速浏览了SeaTunnel,可以通过[连接器](../../../en/connector-v2/source/FakeSource.md)来找到SeaTunnel所支持的所有sources和sinks。 +如果您想要了解更多关于信息,请参阅[SeaTunnel引擎](../../seatunnel-engine/about.md). 在这里你将了解如何部署SeaTunnel Engine的集群模式以及如何在集群模式下使用。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-spark.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-spark.md new file mode 100644 index 000000000000..fbd0fa15fe55 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/start-v2/locally/quick-start-spark.md @@ -0,0 +1,118 @@ +--- + +sidebar_position: 4 +------------------- + +# Spark引擎快速开始 + +## 步骤 1: 部署SeaTunnel及连接器 + +在开始前,请确保您已经按照[部署](deployment.md)中的描述下载并部署了SeaTunnel。 + +## 步骤 2: 部署并配置Spark + +请先[下载Spark](https://spark.apache.org/downloads.html)(**需要版本 >= 2.4.0**)。 更多信息您可以查看[入门: Standalone模式](https://spark.apache.org/docs/latest/spark-standalone.html#installing-spark-standalone-to-a-cluster) + +**配置SeaTunnel**: 修改`config/seatunnel-env.sh`中的设置,它是基于你的引擎在[部署](deployment.md)时的安装路径。 +将`SPARK_HOME`修改为Spark的部署目录。 + +## 步骤 3: 添加作业配置文件来定义作业 + +编辑`config/v2.streaming.conf.template`,它决定了当SeaTunnel启动后数据输入、处理和输出的方式及逻辑。 +下面是配置文件的示例,它与上面提到的示例应用程序相同。 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} + +``` + +关于配置的更多信息请查看[配置的基本概念](../../concept/config.md) + +## 步骤 4: 运行SeaTunnel应用程序 + +您可以通过以下命令启动应用程序: + +Spark 2.4.x + +```bash +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-spark-2-connector-v2.sh \ +--master local[4] \ +--deploy-mode client \ +--config ./config/v2.streaming.conf.template +``` + +Spark 3.x.x + +```shell +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-spark-3-connector-v2.sh \ +--master local[4] \ +--deploy-mode client \ +--config ./config/v2.streaming.conf.template +``` + +**查看输出**: 当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 + +SeaTunnel控制台将会打印一些如下日志信息: + +```shell +fields : name, age +types : STRING, INT +row=1 : elWaB, 1984352560 +row=2 : uAtnp, 762961563 +row=3 : TQEIB, 2042675010 +row=4 : DcFjo, 593971283 +row=5 : SenEb, 2099913608 +row=6 : DHjkg, 1928005856 +row=7 : eScCM, 526029657 +row=8 : sgOeE, 600878991 +row=9 : gwdvw, 1951126920 +row=10 : nSiKE, 488708928 +row=11 : xubpl, 1420202810 +row=12 : rHZqb, 331185742 +row=13 : rciGD, 1112878259 +row=14 : qLhdI, 1457046294 +row=15 : ZTkRx, 1240668386 +row=16 : SGZCr, 94186144 +``` + +## 此外 + +现在,您已经快速浏览了SeaTunnel使用Spark引擎的方式,可以通过在[连接器](/docs/category/connector-v2)中找到SeaTunnel所支持的所有source和sink。 +如果您想要了解更多关于SeaTunnel运行在Spark上的信息,请参阅[基于Spark的SeaTunnel](../../../en/other-engine/spark.md)。 + +SeaTunnel有内置的`Zeta`引擎,它是作为SeaTunnel的默认引擎。您可以参考[快速开始](quick-start-seatunnel-engine.md)配置和运行数据同步作业。 diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/common-options.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/common-options.md new file mode 100644 index 000000000000..9a756760f2cb --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/common-options.md @@ -0,0 +1,23 @@ +# 转换常见选项 + +> 源端连接器的常见参数 + +| 参数名称 | 参数类型 | 是否必须 | 默认值 | +|-------------------|--------|------|-----| +| result_table_name | string | no | - | +| source_table_name | string | no | - | + +### source_table_name [string] + +当未指定 `source_table_name` 时,当前插件在配置文件中处理由前一个插件输出的数据集 `(dataset)` ; + +当指定了 `source_table_name` 时,当前插件正在处理与该参数对应的数据集 + +### result_table_name [string] + +当未指定 `result_table_name` 时,此插件处理的数据不会被注册为其他插件可以直接访问的数据集,也不会被称为临时表 `(table)`; + +当指定了 `result_table_name` 时,此插件处理的数据将被注册为其他插件可以直接访问的数据集 `(dataset)`,或者被称为临时表 `(table)`。在这里注册的数据集可以通过指定 `source_table_name` 被其他插件直接访问。 + +## 示例 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/copy.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/copy.md new file mode 100644 index 000000000000..a4ca5c613a74 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/copy.md @@ -0,0 +1,65 @@ +# 复制 + +> 复制转换插件 + +## 描述 + +将字段复制到一个新字段。 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|--------|--------|------|-----| +| fields | Object | yes | | + +### fields [config] + +指定输入和输出之间的字段复制关系 + +### 常见选项 [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情。 + +## 示例 + +从源读取的数据是这样的一个表: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +想要将字段 `name`、`age` 复制到新的字段 `name1`、`name2`、`age1`,我们可以像这样添加 `Copy` 转换: + +``` +transform { + Copy { + source_table_name = "fake" + result_table_name = "fake1" + fields { + name1 = name + name2 = name + age1 = age + } + } +} +``` + +那么结果表 `fake1` 中的数据将会像这样: + +| name | age | card | name1 | name2 | age1 | +|----------|-----|------|----------|----------|------| +| Joy Ding | 20 | 123 | Joy Ding | Joy Ding | 20 | +| May Ding | 20 | 123 | May Ding | May Ding | 20 | +| Kin Dom | 20 | 123 | Kin Dom | Kin Dom | 20 | +| Joy Dom | 20 | 123 | Joy Dom | Joy Dom | 20 | + +## 更新日志 + +### 新版本 + +- 添加复制转换连接器 +- 支持将字段复制到新字段 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/dynamic-compile.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/dynamic-compile.md new file mode 100644 index 000000000000..0fef5c253e3f --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/dynamic-compile.md @@ -0,0 +1,171 @@ +# DynamicCompile + +> 动态编译插件 + +## 描述 + +:::tip + +特别申明 +您需要确保服务的安全性,并防止攻击者上传破坏性代码 + +::: + +提供一种可编程的方式来处理行,允许用户自定义任何业务行为,甚至基于现有行字段作为参数的RPC请求,或者通过从其他数据源检索相关数据来扩展字段。为了区分业务,您还可以定义多个转换进行组合, +如果转换过于复杂,可能会影响性能 + +## 属性 + +| name | type | required | default value | +|------------------|--------|----------|---------------| +| source_code | string | no | | +| compile_language | Enum | yes | | +| compile_pattern | Enum | no | SOURCE_CODE | +| absolute_path | string | no | | + +### source_code [string] + +代码必须实现两个方法:getInlineOutputColumns和getInlineOutputFieldValues。getInlineOutputColumns确定要添加或转换的列,原始列结构可以从CatalogTable中获得 +GetInlineOutputFieldValues决定您的列值。您可以满足任何要求,甚至可以完成RPC请求以基于原始列获取新值 +如果有第三方依赖包,请将它们放在${SEATUNNEL_HOME}/lib中,如果您使用spark或flink,则需要将其放在相应服务的libs下。 + +### common options [string] + +转换插件的常见参数, 请参考 [Transform 
Plugin](common-options.md) 了解详情。 + +### compile_language [Enum] + +Java中的某些语法可能不受支持,请参阅https://github.com/janino-compiler/janino +GROOVY,JAVA + +### compile_pattern [Enum] + +SOURCE_CODE,ABSOLUTE_PATH +选择 SOURCE_CODE,SOURCE_CODE 属性必填;选择ABSOLUTE_PATH,ABSOLUTE_PATH属性必填。 + +### absolute_path [string] + +服务器上Java或Groovy文件的绝对路径 + +## Example + +源端数据读取的表格如下: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +``` +transform { + DynamicCompile { + source_table_name = "fake" + result_table_name = "groovy_out" + compile_language="GROOVY" + compile_pattern="SOURCE_CODE" + source_code=""" + import org.apache.seatunnel.api.table.catalog.Column + import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor + import org.apache.seatunnel.api.table.catalog.CatalogTable + import org.apache.seatunnel.api.table.catalog.PhysicalColumn; + import org.apache.seatunnel.api.table.type.*; + import java.util.ArrayList; + class demo { + public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { + List columns = new ArrayList<>(); + PhysicalColumn destColumn = + PhysicalColumn.of( + "compile_language", + BasicType.STRING_TYPE, + 10, + true, + "", + ""); + columns.add(destColumn); + return columns.toArray(new Column[0]); + } + public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { + Object[] fieldValues = new Object[1]; + fieldValues[0]="GROOVY" + return fieldValues; + } + };""" + + } +} + +transform { + DynamicCompile { + source_table_name = "fake" + result_table_name = "java_out" + compile_language="JAVA" + compile_pattern="SOURCE_CODE" + source_code=""" + import org.apache.seatunnel.api.table.catalog.Column; + import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor; + import org.apache.seatunnel.api.table.catalog.*; + import org.apache.seatunnel.api.table.type.*; + import java.util.ArrayList; + public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { + + ArrayList columns = new ArrayList(); + PhysicalColumn destColumn = + PhysicalColumn.of( + "compile_language", + BasicType.STRING_TYPE, + 10, + true, + "", + ""); + return new Column[]{ + destColumn + }; + + } + public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { + Object[] fieldValues = new Object[1]; + fieldValues[0]="JAVA"; + return fieldValues; + } + """ + + } + } + + transform { + DynamicCompile { + source_table_name = "fake" + result_table_name = "groovy_out" + compile_language="GROOVY" + compile_pattern="ABSOLUTE_PATH" + absolute_path="""/tmp/GroovyFile""" + + } +} +``` + +那么结果表 `groovy_out` 中的数据将会更新为: + +| name | age | card | compile_language | +|----------|-----|------|------------------| +| Joy Ding | 20 | 123 | GROOVY | +| May Ding | 20 | 123 | GROOVY | +| Kin Dom | 20 | 123 | GROOVY | +| Joy Dom | 20 | 123 | GROOVY | + +那么结果表 `java_out` 中的数据将会更新为: + +| name | age | card | compile_language | +|----------|-----|------|------------------| +| Joy Ding | 20 | 123 | JAVA | +| May Ding | 20 | 123 | JAVA | +| Kin Dom | 20 | 123 | JAVA | +| Joy Dom | 20 | 123 | JAVA | + +更多复杂例子可以参考 +https://github.com/apache/seatunnel/tree/dev/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/dynamic_compile/conf + +## Changelog + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/field-mapper.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/field-mapper.md new file mode 100644 index 
000000000000..298d3fa72c92 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/field-mapper.md @@ -0,0 +1,64 @@ +# 字段映射 + +> 字段映射转换插件 + +## 描述 + +添加输入模式和输出模式映射 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|--------------|--------|------|-----| +| field_mapper | Object | yes | | + +### field_mapper [config] + +指定输入和输出之间的字段映射关系 + +### common options [config] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +## 示例 + +源端数据读取的表格如下: + +| id | name | age | card | +|----|----------|-----|------| +| 1 | Joy Ding | 20 | 123 | +| 2 | May Ding | 20 | 123 | +| 3 | Kin Dom | 20 | 123 | +| 4 | Joy Dom | 20 | 123 | + +我们想要删除 `age` 字段,并更新字段顺序为 `id`、`card`、`name`,同时将 `name` 重命名为 `new_name`。我们可以像这样添加 `FieldMapper` 转换: + +``` +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + id = id + card = card + name = new_name + } + } +} +``` + +那么结果表 `fake1` 中的数据将会像这样: + +| id | card | new_name | +|----|------|----------| +| 1 | 123 | Joy Ding | +| 2 | 123 | May Ding | +| 3 | 123 | Kin Dom | +| 4 | 123 | Joy Dom | + +## 更新日志 + +### 新版本 + +- 添加复制转换连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter-rowkind.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter-rowkind.md new file mode 100644 index 000000000000..74d2b2d5b1e1 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter-rowkind.md @@ -0,0 +1,68 @@ +# 行类型过滤 + +> 行类型转换插件 + +## 描述 + +按行类型过滤数据 + +## 操作 + +| 名称 | 类型 | 是否必须 | 默认值 | +|---------------|-------|------|-----| +| include_kinds | array | yes | | +| exclude_kinds | array | yes | | + +### include_kinds [array] + +要包含的行类型 + +### exclude_kinds [array] + +要排除的行类型。 + +您只能配置 `include_kinds` 和 `exclude_kinds` 中的一个。 + +### common options [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +## 示例 + +FakeSource 生成的数据的行类型是 `INSERT`。如果我们使用 `FilterRowKink` 转换并排除 `INSERT` 数据,我们将不会向接收器写入任何行。 + +```yaml + +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + age = "int" + } + } + } +} + +transform { + FilterRowKind { + source_table_name = "fake" + result_table_name = "fake1" + exclude_kinds = ["INSERT"] + } +} + +sink { + Console { + source_table_name = "fake1" + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter.md new file mode 100644 index 000000000000..1f02c999a375 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/filter.md @@ -0,0 +1,79 @@ +# 过滤器 + +> 过滤器转换插件 + +## 描述 + +过滤字段 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|----------------|-------|------|-----| +| include_fields | array | no | | +| exclude_fields | array | no | | + +### include_fields [array] + +需要保留的字段列表。不在列表中的字段将被删除。 + +### exclude_fields [array] + +需要删除的字段列表。不在列表中的字段将被保留。 + +注意,`include_fields` 和 `exclude_fields` 两个属性中,必须设置一个且只能设置一个 + +### common options [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +## 示例 + +源端数据读取的表格如下: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +我们想要保留字段 `name`, `card`,我们可以像这样添加 `Filter` 转换: + +``` +transform { + Filter { + source_table_name = "fake" + result_table_name = "fake1" + include_fields = [name, 
card] + } +} +``` + +我们也可以通过删除字段 `age` 来实现, 我们可以添加一个 `Filter` 转换,并设置exclude_fields: + +``` +transform { + Filter { + source_table_name = "fake" + result_table_name = "fake1" + exclude_fields = [age] + } +} +``` + +那么结果表 `fake1` 中的数据将会像这样: + +| name | card | +|----------|------| +| Joy Ding | 123 | +| May Ding | 123 | +| Kin Dom | 123 | +| Joy Dom | 123 | + +## 更新日志 + +### 新版本 + +- 添加过滤转器换连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/jsonpath.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/jsonpath.md new file mode 100644 index 000000000000..449f0f6a77f1 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/jsonpath.md @@ -0,0 +1,190 @@ +# JsonPath + +> JSONPath 转换插件 + +## 描述 + +> 支持使用 JSONPath 选择数据 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|---------|-------|------|-----| +| Columns | Array | Yes | | + +### common options [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +### fields[array] + +#### 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|------------|--------|------|--------| +| src_field | String | Yes | | +| dest_field | String | Yes | | +| path | String | Yes | | +| dest_type | String | No | String | + +#### src_field + +> 要解析的 JSON 源字段 + +支持的Seatunnel数据类型 + +* STRING +* BYTES +* ARRAY +* MAP +* ROW + +#### dest_field + +> 使用 JSONPath 后的输出字段 + +#### dest_type + +> 目标字段的类型 + +#### path + +> Jsonpath + +## 读取 JSON 示例 + +从源读取的数据是像这样的 JSON + +```json +{ + "data": { + "c_string": "this is a string", + "c_boolean": true, + "c_integer": 42, + "c_float": 3.14, + "c_double": 3.14, + "c_decimal": 10.55, + "c_date": "2023-10-29", + "c_datetime": "16:12:43.459", + "c_array":["item1", "item2", "item3"] + } +} +``` + +假设我们想要使用 JsonPath 提取属性。 + +```json +transform { + JsonPath { + source_table_name = "fake" + result_table_name = "fake1" + columns = [ + { + "src_field" = "data" + "path" = "$.data.c_string" + "dest_field" = "c1_string" + }, + { + "src_field" = "data" + "path" = "$.data.c_boolean" + "dest_field" = "c1_boolean" + "dest_type" = "boolean" + }, + { + "src_field" = "data" + "path" = "$.data.c_integer" + "dest_field" = "c1_integer" + "dest_type" = "int" + }, + { + "src_field" = "data" + "path" = "$.data.c_float" + "dest_field" = "c1_float" + "dest_type" = "float" + }, + { + "src_field" = "data" + "path" = "$.data.c_double" + "dest_field" = "c1_double" + "dest_type" = "double" + }, + { + "src_field" = "data" + "path" = "$.data.c_decimal" + "dest_field" = "c1_decimal" + "dest_type" = "decimal(4,2)" + }, + { + "src_field" = "data" + "path" = "$.data.c_date" + "dest_field" = "c1_date" + "dest_type" = "date" + }, + { + "src_field" = "data" + "path" = "$.data.c_datetime" + "dest_field" = "c1_datetime" + "dest_type" = "time" + }, + { + "src_field" = "data" + "path" = "$.data.c_array" + "dest_field" = "c1_array" + "dest_type" = "array" + } + ] + } +} +``` + +那么数据结果表 `fake1` 将会像这样 + +| data | c1_string | c1_boolean | c1_integer | c1_float | c1_double | c1_decimal | c1_date | c1_datetime | c1_array | +|------------------------------|------------------|------------|------------|----------|-----------|------------|------------|--------------|-----------------------------| +| too much content not to show | this is a string | true | 42 | 3.14 | 3.14 | 10.55 | 2023-10-29 | 16:12:43.459 | ["item1", "item2", "item3"] | + +## 读取 SeatunnelRow 示例 + +假设数据行中的一列的类型是 SeatunnelRow,列的名称为 col + + + + + +
+| SeatunnelRow(col) |     | other |
+|-------------------|-----|-------|
+| name              | age | ....  |
+| a                 | 18  | ....  |
+ +JsonPath 转换将 seatunnel 的值转换为一个数组。 + +```json +transform { + JsonPath { + source_table_name = "fake" + result_table_name = "fake1" + columns = [ + { + "src_field" = "col" + "path" = "$[0]" + "dest_field" = "name" + "dest_type" = "string" + }, + { + "src_field" = "col" + "path" = "$[1]" + "dest_field" = "age" + "dest_type" = "int" + } + ] + } +} +``` + +那么数据结果表 `fake1` 将会像这样: + +| name | age | col | other | +|------|-----|----------|-------| +| a | 18 | ["a",18] | ... | + +## 更新日志 + +* 添加 JsonPath 转换 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/llm.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/llm.md new file mode 100644 index 000000000000..acd3245b8eb4 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/llm.md @@ -0,0 +1,120 @@ +# LLM + +> LLM 转换插件 + +## 描述 + +利用大型语言模型 (LLM) 的强大功能来处理数据,方法是将数据发送到 LLM 并接收生成的结果。利用 LLM 的功能来标记、清理、丰富数据、执行数据推理等。 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|------------------|--------|------|--------------------------------------------| +| model_provider | enum | yes | | +| output_data_type | enum | no | String | +| prompt | string | yes | | +| model | string | yes | | +| api_key | string | yes | | +| openai.api_path | string | no | https://api.openai.com/v1/chat/completions | + +### model_provider + +要使用的模型提供者。可用选项为: +OPENAI + +### output_data_type + +输出数据的数据类型。可用选项为: +STRING,INT,BIGINT,DOUBLE,BOOLEAN. +默认值为 STRING。 + +### prompt + +发送到 LLM 的提示。此参数定义 LLM 将如何处理和返回数据,例如: + +从源读取的数据是这样的表格: + +| name | age | +|---------------|-----| +| Jia Fan | 20 | +| Hailin Wang | 20 | +| Eric | 20 | +| Guangdong Liu | 20 | + +我们可以使用以下提示: + +``` +Determine whether someone is Chinese or American by their name +``` + +这将返回: + +| name | age | llm_output | +|---------------|-----|------------| +| Jia Fan | 20 | Chinese | +| Hailin Wang | 20 | Chinese | +| Eric | 20 | American | +| Guangdong Liu | 20 | Chinese | + +### model + +要使用的模型。不同的模型提供者有不同的模型。例如,OpenAI 模型可以是 `gpt-4o-mini`。 +如果使用 OpenAI 模型,请参考 https://platform.openai.com/docs/models/model-endpoint-compatibility 文档的`/v1/chat/completions` 端点。 + +### api_key + +用于模型提供者的 API 密钥。 +如果使用 OpenAI 模型,请参考 https://platform.openai.com/docs/api-reference/api-keys 文档的如何获取 API 密钥。 + +### openai.api_path + +用于 OpenAI 模型提供者的 API 路径。在大多数情况下,您不需要更改此配置。如果使用 API 代理的服务,您可能需要将其配置为代理的 API 地址。 + +### common options [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +## 示例 + +通过 LLM 确定用户所在的国家。 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + row.num = 5 + schema = { + fields { + id = "int" + name = "string" + } + } + rows = [ + {fields = [1, "Jia Fan"], kind = INSERT} + {fields = [2, "Hailin Wang"], kind = INSERT} + {fields = [3, "Tomas"], kind = INSERT} + {fields = [4, "Eric"], kind = INSERT} + {fields = [5, "Guangdong Liu"], kind = INSERT} + ] + } +} + +transform { + LLM { + model_provider = OPENAI + model = gpt-4o-mini + api_key = sk-xxx + prompt = "Determine whether someone is Chinese or American by their name" + } +} + +sink { + console { + } +} +``` + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/replace.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/replace.md new file mode 100644 index 000000000000..99eef89a1ab1 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/replace.md @@ -0,0 +1,121 @@ +# 替换 + +> 替换转换插件 + +## 描述 + +检查给定字段中的字符串值,并用给定的替换项替换与给定字符串字面量或正则表达式匹配的字符串值的子字符串。 + +## 属性 + +| 名称 | 
类型 | 是否必须 | 默认值 | +|---------------|---------|------|-------| +| replace_field | string | yes | | +| pattern | string | yes | - | +| replacement | string | yes | - | +| is_regex | boolean | no | false | +| replace_first | boolean | no | false | + +### replace_field [string] + +需要替换的字段 + +### pattern [string] + +将被替换的旧字符串 + +### replacement [string] + +用于替换的新字符串 + +### is_regex [boolean] + +使用正则表达式进行字符串匹配 + +### replace_first [boolean] + +是否替换第一个匹配字符串。仅在 `is_regex = true` 时使用。 + +### common options [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +## 示例 + +源端数据读取的表格如下: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +我们想要将 `name` 字段中的字符 ``替换为 `_`。然后我们可以添加一个 `Replace` 转换,像这样: + +``` +transform { + Replace { + source_table_name = "fake" + result_table_name = "fake1" + replace_field = "name" + pattern = " " + replacement = "_" + is_regex = true + } +} +``` + +那么结果表 `fake1` 中的数据将会更新为: + +| name | age | card | +|----------|-----|------| +| Joy_Ding | 20 | 123 | +| May_Ding | 20 | 123 | +| Kin_Dom | 20 | 123 | +| Joy_Dom | 20 | 123 | + +## 作业配置示例 + +``` +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + } + } + } +} + +transform { + Replace { + source_table_name = "fake" + result_table_name = "fake1" + replace_field = "name" + pattern = ".+" + replacement = "b" + is_regex = true + } +} + +sink { + Console { + source_table_name = "fake1" + } +} +``` + +## 更新日志 + +### 新版本 + +- 添加替换转换连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/split.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/split.md new file mode 100644 index 000000000000..ef8c3f585403 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/split.md @@ -0,0 +1,72 @@ +# 拆分 + +> 拆分转换插件 + +## 描述 + +拆分一个字段为多个字段。 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|---------------|--------|------|-----| +| separator | string | yes | | +| split_field | string | yes | | +| output_fields | array | yes | | + +### separator [string] + +拆分内容的分隔符 + +### split_field[string] + +需要拆分的字段 + +### output_fields[array] + +拆分后的结果字段 + +### common options [string] + +转换插件的常见参数, 请参考 [Transform Plugin](common-options.md) 了解详情 + +## 示例 + +源端数据读取的表格如下: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +我们想要将 `name` 字段拆分为 `first_name` 和 `second_name`,我们可以像这样添加 `Split` 转换: + +``` +transform { + Split { + source_table_name = "fake" + result_table_name = "fake1" + separator = " " + split_field = "name" + output_fields = [first_name, second_name] + } +} +``` + +那么结果表 `fake1` 中的数据将会像这样: + +| name | age | card | first_name | last_name | +|----------|-----|------|------------|-----------| +| Joy Ding | 20 | 123 | Joy | Ding | +| May Ding | 20 | 123 | May | Ding | +| Kin Dom | 20 | 123 | Kin | Dom | +| Joy Dom | 20 | 123 | Joy | Dom | + +## 更新日志 + +### 新版本 + +- 添加拆分转换连接器 + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-functions.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-functions.md new file mode 100644 index 000000000000..57c440a39b3b --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-functions.md @@ -0,0 +1,966 @@ +# SQL函数 + +> SQL函数转换插件功能 + 
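+在 [SQL 转换](sql.md) 的 `query` 中可以直接调用下列函数。下面是一个简单的示意示例(其中的表名 `fake` 与字段 `name` 仅为假设):
+
+```hocon
+transform {
+  Sql {
+    source_table_name = "fake"
+    result_table_name = "fake1"
+    # UPPER 与 CHAR_LENGTH 均为本页介绍的字符串函数
+    query = "select UPPER(name) as name, CHAR_LENGTH(name) as name_len from fake"
+  }
+}
+```
+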
+## 字符串函数 + +### ASCII + +```ASCII(string)``` + +返回字符串中第一个字符的ASCII值。此方法返回一个整数。 + +示例: + +ASCII('Hi') + +### BIT_LENGTH + +```BIT_LENGTH(bytes)``` + +返回二进制字符串中的位数。该方法返回一个长整型 + +示例: + +BIT_LENGTH(NAME) + +### CHAR_LENGTH / LENGTH + +```CHAR_LENGTH | LENGTH (string)``` + +这个方法返回一个字符串中字符的数量,返回类型为 long。 + +示例: + +CHAR_LENGTH(NAME) + +### OCTET_LENGTH + +```OCTET_LENGTH(bytes)``` + +返回二进制字符串中字节的数量。此方法返回一个 long 类型的值。 + +示例: + +OCTET_LENGTH(NAME) + +### CHAR / CHR + +```CHAR | CHR (int)``` + +返回表示ASCII值的字符。该方法返回一个字符串。 + +示例: + +CHAR(65) + +### CONCAT + +```CONCAT(string, string[, string ...] )``` + +组合字符串。与运算符 `||` 不同,**NULL** 参数会被忽略,不会导致结果变为 **NULL**。如果所有参数都是 NULL,则结果是一个空字符串。该方法返回一个字符串。 + +示例: + +CONCAT(NAME, '_') + +### CONCAT_WS + +```CONCAT_WS(separatorString, string, string[, string ...] )``` + +使用分隔符组合字符串。如果分隔符为 **NULL**,则会被视为空字符串。其他 **NULL** 参数会被忽略。剩余的 **非NULL** 参数(如果有)将用指定的分隔符连接起来。如果没有剩余参数,则结果是一个空字符串。该方法返回一个字符串。 + +示例: + +CONCAT_WS(',', NAME, '_') + +### HEXTORAW + +```HEXTORAW(string)``` + +将字符串的十六进制表示转换为字符串。每个字符串字符使用4个十六进制字符。 + +示例: + +HEXTORAW(DATA) + +### RAWTOHEX + +```RAWTOHEX(string)``` + +```RAWTOHEX(bytes)``` + +将字符串或字节转换为十六进制表示。每个字符串字符使用4个十六进制字符。该方法返回一个字符串。 + +示例: + +RAWTOHEX(DATA) + +### INSERT + +```INSERT(originalString, startInt, lengthInt, addString)``` + +在原始字符串的指定起始位置插入额外的字符串。长度参数指定在原始字符串的起始位置删除的字符数。该方法返回一个字符串。 + +示例: + +INSERT(NAME, 1, 1, ' ') + +### LOWER / LCASE + +```LOWER | LCASE (string)``` + +将字符串转换为小写形式。 + +示例: + +LOWER(NAME) + +### UPPER / UCASE + +```UPPER | UCASE (string)``` + +将字符串转换为大写形式。 + +示例: + +UPPER(NAME) + +### LEFT + +```LEFT(string, int)``` + +返回最左边的一定数量的字符。 + +示例: + +LEFT(NAME, 3) + +### RIGHT + +```RIGHT(string, int)``` + +返回最右边的一定数量的字符。 + +示例: + +RIGHT(NAME, 3) + +### LOCATE / INSTR / POSITION + +```LOCATE(searchString, string[, startInit])``` + +```INSTR(string, searchString[, startInit])``` + +```POSITION(searchString, string)``` + +返回字符串中搜索字符串的位置。如果使用了起始位置参数,则忽略它之前的字符。如果位置参数是负数,则返回最右边的位置。如果未找到搜索字符串,则返回 0。请注意,即使参数不区分大小写,此函数也区分大小写。 + +示例: + +LOCATE('.', NAME) + +### LPAD + +```LPAD(string ,int[, string])``` + +将字符串左侧填充到指定的长度。如果长度比字符串短,则字符串将在末尾被截断。如果未设置填充字符串,则使用空格填充。 + +示例: + +LPAD(AMOUNT, 10, '*') + +### RPAD + +```RPAD(string, int[, string])``` + +将字符串右侧填充到指定的长度。如果长度比字符串短,则字符串将被截断。如果未设置填充字符串,则使用空格填充。 + +示例: + +RPAD(TEXT, 10, '-') + +### LTRIM + +```LTRIM(string[, characterToTrimString])``` + +移除字符串中所有前导空格或其他指定的字符。 + +此函数已被弃用,请使用 TRIM 替代。 + +示例: + +LTRIM(NAME) + +### RTRIM + +```RTRIM(string[, characterToTrimString])``` + +移除字符串中所有尾随空格或其他指定的字符。 + +此函数已被弃用,请使用 TRIM 替代。 + +示例: + +RTRIM(NAME) + +### TRIM + +```TRIM(string[, characterToTrimString])``` + +移除字符串中所有前导空格或其他指定的字符。 + +此函数已被弃用,请使用 TRIM 替代。 + +示例: + +LTRIM(NAME) + +### REGEXP_REPLACE + +```REGEXP_REPLACE(inputString, regexString, replacementString[, flagsString])``` + +替换与正则表达式匹配的每个子字符串。详情请参阅 Java String.replaceAll() 方法。如果任何参数为 null(除了可选的 flagsString 参数),则结果为 null。 + +标志值限于 'i'、'c'、'n'、'm'。其他符号会引发异常。可以在一个 flagsString 参数中使用多个符号(例如 'im')。后面的标志会覆盖前面的标志,例如 'ic' 等同于区分大小写匹配 'c'。 + +'i' 启用不区分大小写匹配(Pattern.CASE_INSENSITIVE) + +'c' 禁用不区分大小写匹配(Pattern.CASE_INSENSITIVE) + +'n' 允许句点匹配换行符(Pattern.DOTALL) + +'m' 启用多行模式(Pattern.MULTILINE) + +示例: + +REGEXP_REPLACE('Hello World', ' +', ' ') +REGEXP_REPLACE('Hello WWWWorld', 'w+', 'W', 'i') + +### REGEXP_LIKE + +```REGEXP_LIKE(inputString, regexString[, flagsString])``` + +将字符串与正则表达式匹配。详情请参阅 Java Matcher.find() 方法。如果任何参数为 null(除了可选的 flagsString 参数),则结果为 null。 + +标志值限于 'i'、'c'、'n'、'm'。其他符号会引发异常。可以在一个 flagsString 参数中使用多个符号(例如 'im')。后面的标志会覆盖前面的标志,例如 'ic' 
等同于区分大小写匹配 'c'。 + +'i' 启用不区分大小写匹配(Pattern.CASE_INSENSITIVE) + +'c' 禁用不区分大小写匹配(Pattern.CASE_INSENSITIVE) + +'n' 允许句点匹配换行符(Pattern.DOTALL) + +'m' 启用多行模式(Pattern.MULTILINE) + +示例: + +REGEXP_LIKE('Hello World', '[A-Z ]*', 'i') + +### REGEXP_SUBSTR + +```REGEXP_SUBSTR(inputString, regexString[, positionInt, occurrenceInt, flagsString, groupInt])``` + +将字符串与正则表达式匹配,并返回匹配的子字符串。详情请参阅 java.util.regex.Pattern 和相关功能。 + +参数 position 指定匹配应该从 inputString 的哪里开始。Occurrence 指示在 inputString 中搜索 pattern 的哪个出现。 + +标志值限于 'i'、'c'、'n'、'm'。其他符号会引发异常。可以在一个 flagsString 参数中使用多个符号(例如 'im')。后面的标志会覆盖前面的标志,例如 'ic' 等同于区分大小写匹配 'c'。 + +'i' 启用不区分大小写匹配(Pattern.CASE_INSENSITIVE) + +'c' 禁用不区分大小写匹配(Pattern.CASE_INSENSITIVE) + +'n' 允许句点匹配换行符(Pattern.DOTALL) + +'m' 启用多行模式(Pattern.MULTILINE) + +如果模式具有组,则可以使用 group 参数指定要返回的组。 + +示例: + +REGEXP_SUBSTR('2020-10-01', '\d{4}') +REGEXP_SUBSTR('2020-10-01', '(\d{4})-(\d{2})-(\d{2})', 1, 1, NULL, 2) + +### REPEAT + +```REPEAT(string, int)``` + +Returns a string repeated some number of times. + +示例: + +REPEAT(NAME || ' ', 10) + +### REPLACE + +```REPLACE(string, searchString[, replacementString])``` + +在文本中替换所有出现的搜索字符串为另一个字符串。如果没有指定替换字符串,则从原始字符串中移除搜索字符串。如果任何参数为 null,则结果为 null。 + +示例: + +REPLACE(NAME, ' ') + +### SOUNDEX + +```SOUNDEX(string)``` + +表示字符串发音。此方法返回一个字符串,如果参数为 null,则返回 null。有关更多信息,请参阅 https://en.wikipedia.org/wiki/Soundex 。 + +示例: + +SOUNDEX(NAME) + +### SPACE + +```SPACE(int)``` + +返回由一定数量的空格组成的字符串。 + +示例: + +SPACE(80) + +### SUBSTRING / SUBSTR + +```SUBSTRING | SUBSTR (string, startInt[, lengthInt ])``` + +返回从指定位置开始的字符串的子串。如果起始索引为负数,则相对于字符串的末尾计算起始索引。长度是可选的。 + +示例: + +CALL SUBSTRING('[Hello]', 2); +CALL SUBSTRING('hour', 3, 2); + +### TO_CHAR + +```TO_CHAR(value[, formatString])``` + +Oracle 兼容的 TO_CHAR 函数可用于格式化时间戳、数字或文本。 + +示例: + +CALL TO_CHAR(SYS_TIME, 'yyyy-MM-dd HH:mm:ss') + +### TRANSLATE + +```TRANSLATE(value, searchString, replacementString)``` + +Oracle 兼容的 TRANSLATE 函数用于将字符串中的一系列字符替换为另一组字符。 + +示例: + +CALL TRANSLATE('Hello world', 'eo', 'EO') + +## Numeric Functions + +### ABS + +```ABS(numeric)``` + +返回指定值的绝对值。返回的值与参数的数据类型相同。 + +请注意,TINYINT、SMALLINT、INT 和 BIGINT 数据类型无法表示它们的最小负值的绝对值,因为它们的负值比正值多。例如,对于 INT 数据类型,允许的值范围是从 -2147483648 到 2147483647。ABS(-2147483648) 应该是 2147483648,但是这个值对于这个数据类型是不允许的。这会导致异常。为了避免这种情况,请将此函数的参数转换为更高的数据类型。 + +示例: + +ABS(I) + +### ACOS + +```ACOS(numeric)``` + +计算反余弦值。另请参阅 Java Math.acos。该方法返回一个双精度浮点数。 + +示例: + +ACOS(D) + +### ASIN + +```ASIN(numeric)``` + +计算反正弦值。另请参阅 Java Math.asin。该方法返回一个双精度浮点数。 + +示例: + +ASIN(D) + +### ATAN + +```ATAN(numeric)``` + +计算反正切值。另请参阅 Java Math.atan。该方法返回一个双精度浮点数。 + +示例: + +ATAN(D) + +### COS + +```COS(numeric)``` + +计算三角余弦值。另请参阅 Java Math.cos。该方法返回一个双精度浮点数。 + +示例: + +COS(ANGLE) + +### COSH + +```COSH(numeric)``` + +计算双曲余弦值。另请参阅 Java Math.cosh。该方法返回一个双精度浮点数。 + +示例: + +COSH(X) + +### COT + +```COT(numeric)``` + +计算三角余切值(1/TAN(角度))。另请参阅 Java Math.* 函数。该方法返回一个双精度浮点数。 + +示例: + +COT(ANGLE) + +### SIN + +```SIN(numeric)``` + +计算三角正弦值。另请参阅 Java Math.sin。该方法返回一个双精度浮点数。 + +示例: + +SIN(ANGLE) + +### SINH + +```SINH(numeric)``` + +计算双曲正弦值。另请参阅 Java Math.sinh。该方法返回一个双精度浮点数。 + +示例: + +SINH(ANGLE) + +### TAN + +```TAN(numeric)``` + +计算三角正切值。另请参阅 Java Math.tan。该方法返回一个双精度浮点数。 + +示例: + +TAN(ANGLE) + +### TANH + +```TANH(numeric)``` + +计算双曲正切值。另请参阅 Java Math.tanh。该方法返回一个双精度浮点数。 + +示例: + +TANH(X) + +### MOD + +```MOD(dividendNumeric, divisorNumeric )``` + +取模运算表达式。 + +结果与除数的类型相同。如果任一参数为 NULL,则结果为 NULL。如果除数为 0,则会引发异常。结果与被除数的符号相同,或者等于 0。 + +通常情况下,参数应具有标度 0,但 H2 并不要求。 + +示例: + +MOD(A, B) + +### CEIL / CEILING + +```CEIL | CEILING 
(numeric)``` + +返回大于或等于参数的最小整数值。该方法返回与参数相同类型的值,但标度设置为 0,并且如果适用,则调整精度。 + +示例: + +CEIL(A) + +### EXP + +```EXP(numeric)``` + +请参阅 Java Math.exp。该方法返回一个双精度浮点数。 + +示例: + +EXP(A) + +### FLOOR + +```FLOOR(numeric)``` + +返回小于或等于参数的最大整数值。该方法返回与参数相同类型的值,但标度设置为 0,并且如果适用,则调整精度。 + +示例: + +FLOOR(A) + +### LN + +```LN(numeric)``` + +计算自然对数(以 e 为底)的双精度浮点数值。参数必须是一个正数值。 + +示例: + +LN(A) + +### LOG + +```LOG(baseNumeric, numeric)``` + +计算以指定底数的对数,返回一个双精度浮点数。参数和底数必须是正数值。底数不能等于1。 + +默认底数是 e(自然对数),在 PostgreSQL 模式下,默认底数是 10。在 MSSQLServer 模式下,可选的底数在参数之后指定。 + +LOG 函数的单参数变体已被弃用,请使用 LN 或 LOG10 替代。 + +示例: + +LOG(2, A) + +### LOG10 + +```LOG10(numeric)``` + +计算以 10 为底的对数,返回一个双精度浮点数。参数必须是一个正数值。 + +示例: + +LOG10(A) + +### RADIANS + +```RADIANS(numeric)``` + +请参阅 Java Math.toRadians。该方法返回一个双精度浮点数。 + +示例: + +RADIANS(A) + +### SQRT + +```SQRT(numeric)``` + +请参阅 Java Math.sqrt。该方法返回一个双精度浮点数。 + +示例: + +SQRT(A) + +### PI + +```PI()``` + +请参阅 Java Math.PI。该方法返回一个双精度浮点数。 + +示例: + +PI() + +### POWER + +```POWER(numeric, numeric)``` + +请参阅 Java Math.pow。该方法返回一个双精度浮点数。 + +示例: + +POWER(A, B) + +### RAND / RANDOM + +```RAND | RANDOM([ int ])``` + +如果不带参数调用该函数,则返回下一个伪随机数。如果带有参数调用,则将会给该会话的随机数生成器设定种子。该方法返回一个介于 0(包括)和 1(不包括)之间的双精度浮点数。 + +示例: + +RAND() + +### ROUND + +```ROUND(numeric[, digitsInt])``` + +四舍五入到指定的小数位数。该方法返回与参数相同类型的值,但如果适用,则调整精度和标度。 + +示例: + +ROUND(N, 2) + +### SIGN + +```SIGN(numeric)``` + +如果值小于 0,则返回 -1;如果值为零或 NaN,则返回 0;否则返回 1。 + +示例: + +SIGN(N) + +### TRUNC + +```TRUNC | TRUNCATE(numeric[, digitsInt])``` + +当指定了一个数值参数时,将其截断为指定的数字位数(接近0的下一个值),并返回与参数相同类型的值,但如果适用,则调整精度和标度。 + +示例: + +TRUNC(N, 2) + +## Time and Date Functions + +### CURRENT_DATE + +```CURRENT_DATE [()]``` + +返回当前日期。 + +这些函数在事务(默认)或命令内部返回相同的值,具体取决于数据库模式。 + +示例: + +CURRENT_DATE + +### CURRENT_TIME + +```CURRENT_TIME [()]``` + +返回带有系统时区的当前时间。实际可用的最大精度取决于操作系统和 JVM,可以是 3(毫秒)或更高。在 Java 9 之前不支持更高的精度。 + +示例: + +CURRENT_TIME + +### CURRENT_TIMESTAMP / NOW + +```CURRENT_TIMESTAMP[()] | NOW()``` + +返回带有系统时区的当前时间戳。实际可用的最大精度取决于操作系统和 JVM,可以是 3(毫秒)或更高。在 Java 9 之前不支持更高的精度。 + +示例: + +CURRENT_TIMESTAMP + +### DATEADD / TIMESTAMPADD + +```DATEADD| TIMESTAMPADD(dateAndTime, addIntLong, datetimeFieldString)``` + +将单位添加到日期时间值中。datetimeFieldString 表示单位。使用负值来减去单位。当操作毫秒、微秒或纳秒时,addIntLong 可能是一个 long 值,否则其范围被限制为 int。如果单位与指定值兼容,则此方法返回与指定值相同类型的值。如果指定的字段是 HOUR、MINUTE、SECOND、MILLISECOND 等,而值是 DATE 值,DATEADD 返回组合的 TIMESTAMP。对于 TIME 值,不允许使用 DAY、MONTH、YEAR、WEEK 等字段。 + +示例: + +DATEADD(CREATED, 1, 'MONTH') + +### DATEDIFF + +```DATEDIFF(aDateAndTime, bDateAndTime, datetimeFieldString)``` + +返回两个日期时间值之间跨越的单位边界数。此方法返回一个 long 值。datetimeField 表示单位。 + +示例: + +DATEDIFF(T1.CREATED, T2.CREATED, 'MONTH') + +### DATE_TRUNC + +```DATE_TRUNC (dateAndTime, datetimeFieldString)``` + +将指定的日期时间值截断到指定的字段。 + +示例: + +DATE_TRUNC(CREATED, 'DAY'); + +### DAYNAME + +```DAYNAME(dateAndTime)``` + +返回星期几的名称(英文)。 + +示例: + +DAYNAME(CREATED) + +### DAY_OF_MONTH + +```DAY_OF_MONTH(dateAndTime)``` + +返回月份中的日期(1-31)。 + +示例: + +DAY_OF_MONTH(CREATED) + +### DAY_OF_WEEK + +```DAY_OF_WEEK(dateAndTime)``` + +返回星期几的数值(1-7)(星期一至星期日),根据本地化设置。 + +示例: + +DAY_OF_WEEK(CREATED) + +### DAY_OF_YEAR + +```DAY_OF_YEAR(dateAndTime)``` + +返回一年中的日期(1-366)。 + +示例: + +DAY_OF_YEAR(CREATED) + +### EXTRACT + +```EXTRACT ( datetimeField FROM dateAndTime)``` + +从日期/时间值中返回特定时间单位的值。该方法对于 EPOCH 字段返回一个数值,对于其他字段返回一个整数。 + +示例: + +EXTRACT(SECOND FROM CURRENT_TIMESTAMP) + +### FORMATDATETIME + +```FORMATDATETIME (dateAndTime, formatString)``` + +将日期、时间或时间戳格式化为字符串。最重要的格式字符包括:y(年)、M(月)、d(日)、H(时)、m(分)、s(秒)。有关格式的详细信息,请参阅 java.time.format.DateTimeFormatter。 + 
+该方法返回一个字符串。 + +示例: + +CALL FORMATDATETIME(CREATED, 'yyyy-MM-dd HH:mm:ss') + +### HOUR + +```HOUR(dateAndTime)``` + +从日期/时间值中返回小时(0-23)。 + +示例: + +HOUR(CREATED) + +### MINUTE + +```MINUTE(dateAndTime)``` + +从日期/时间值中返回分钟(0-59)。 + +该函数已经被弃用,请使用 EXTRACT 替代。 + +示例: + +MINUTE(CREATED) + +### MONTH + +```MONTH(dateAndTime)``` + +从日期/时间值中返回月份(1-12)。 + +该函数已经被弃用,请使用 EXTRACT 替代。 + +示例: + +MONTH(CREATED) + +### MONTHNAME + +```MONTHNAME(dateAndTime)``` + +返回月份的名称(英文)。 + +示例: + +MONTHNAME(CREATED) + +### PARSEDATETIME / TO_DATE + +```PARSEDATETIME | TO_DATE(string, formatString)``` +解析一个字符串并返回一个 TIMESTAMP WITH TIME ZONE 值。最重要的格式字符包括:y(年)、M(月)、d(日)、H(时)、m(分)、s(秒)。有关格式的详细信息,请参阅 java.time.format.DateTimeFormatter。 + +示例: + +CALL PARSEDATETIME('2021-04-08 13:34:45','yyyy-MM-dd HH:mm:ss') + +### QUARTER + +```QUARTER(dateAndTime)``` + +从日期/时间值中返回季度(1-4)。 + +示例: + +QUARTER(CREATED) + +### SECOND + +```SECOND(dateAndTime)``` + +从日期/时间值中返回秒数(0-59)。 + +该函数已经被弃用,请使用 EXTRACT 替代。 + +示例: + +SECOND(CREATED) + +### WEEK + +```WEEK(dateAndTime)``` + +返回日期/时间值中的周数(1-53)。 + +该函数使用当前系统的区域设置。 + +示例: + +WEEK(CREATED) + +### YEAR + +```YEAR(dateAndTime)``` + +返回日期/时间值中的年份。 + +示例: + +YEAR(CREATED) + +### FROM_UNIXTIME + +```FROM_UNIXTIME (unixtime, formatString,timeZone)``` + +将从 UNIX 纪元(1970-01-01 00:00:00 UTC)开始的秒数转换为表示该时刻时间戳的字符串。 + +最重要的格式字符包括:y(年)、M(月)、d(日)、H(时)、m(分)、s(秒)。有关格式的详细信息,请参阅 `java.time.format.DateTimeFormatter`。 + +`timeZone` 是可选的,默认值为系统的时区。`timezone` 的值可以是一个 `UTC+ 时区偏移`,例如,`UTC+8` 表示亚洲/上海时区,请参阅 `java.time.ZoneId`。 + +该方法返回一个字符串。 + +示例: + +// 使用默认时区 + +CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss') + +or + +// 使用指定时区 + +CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss','UTC+6') + +## System Functions + +### CAST + +```CAST(value as dataType)``` + +将一个值转换为另一个数据类型。 + +支持的数据类型有:STRING | VARCHAR,INT | INTEGER,LONG | BIGINT,BYTE,FLOAT,DOUBLE,DECIMAL(p,s),TIMESTAMP,DATE,TIME,BYTES + +示例: + +CONVERT(NAME AS INT) + +### COALESCE + +```COALESCE(aValue, bValue [,...])``` + +返回第一个非空值。 + +示例: + +COALESCE(A, B, C) + +### IFNULL + +```IFNULL(aValue, bValue)``` + +返回第一个非空值。 + +示例: + +IFNULL(A, B) + +### NULLIF + +```NULLIF(aValue, bValue)``` + +如果 'a' 等于 'b',则返回 NULL,否则返回 'a'。 + +示例: + +NULLIF(A, B) + +### CASE WHEN + +``` +select + case + when c_string in ('c_string') then 1 + else 0 + end as c_string_1, + case + when c_string not in ('c_string') then 1 + else 0 + end as c_string_0, + case + when c_tinyint = 117 + and TO_CHAR(c_boolean) = 'true' then 1 + else 0 + end as c_tinyint_boolean_1, + case + when c_tinyint != 117 + and TO_CHAR(c_boolean) = 'true' then 1 + else 0 + end as c_tinyint_boolean_0, + case + when c_tinyint != 117 + or TO_CHAR(c_boolean) = 'true' then 1 + else 0 + end as c_tinyint_boolean_or_1, + case + when c_int > 1 + and c_bigint > 1 + and c_float > 1 + and c_double > 1 + and c_decimal > 1 then 1 + else 0 + end as c_number_1, + case + when c_tinyint <> 117 then 1 + else 0 + end as c_number_0 +from + fake +``` + +用于确定条件是否有效,并根据不同的判断返回不同的值 + +示例: + +case when c_string in ('c_string') then 1 else 0 end diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-udf.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-udf.md new file mode 100644 index 000000000000..4c1a3777408d --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql-udf.md @@ -0,0 +1,133 @@ +# SQL用户定义函数 + +> SQL 转换插件的用户定义函数 (UDF) + +## 描述 + +使用UDF SPI扩展SQL转换函数库。 + +## UDF API + +```java +package org.apache.seatunnel.transform.sql.zeta; + 
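+// 提示:ZetaUDF 的实现类通过 SPI 机制加载,需要添加 @AutoService(ZetaUDF.class) 注解,并将打包好的 jar 放入 ${SEATUNNEL_HOME}/lib(见下文示例)。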
+public interface ZetaUDF { + /** + * Function name + * + * @return function name + */ + String functionName(); + + /** + * The type of function result + * + * @param argsType input arguments type + * @return result type + */ + SeaTunnelDataType resultType(List> argsType); + + /** + * Evaluate + * + * @param args input arguments + * @return result value + */ + Object evaluate(List args); +} +``` + +## UDF 实现示例 + +将这些依赖项添加到您的 Maven 项目,并使用 provided 作用域。 + +```xml + + + + org.apache.seatunnel + seatunnel-transforms-v2 + 2.3.2 + provided + + + org.apache.seatunnel + seatunnel-api + 2.3.2 + provided + + + com.google.auto.service + auto-service + 1.0.1 + provided + + + +``` + +添加一个 Java 类来实现 ZetaUDF,类似于以下的方式: + +```java + +@AutoService(ZetaUDF.class) +public class ExampleUDF implements ZetaUDF { + @Override + public String functionName() { + return "EXAMPLE"; + } + + @Override + public SeaTunnelDataType resultType(List> argsType) { + return BasicType.STRING_TYPE; + } + + @Override + public Object evaluate(List args) { + String arg = (String) args.get(0); + if (arg == null) return null; + return "UDF: " + arg; + } +} +``` + +打包UDF项目并将jar文件复制到路径:${SEATUNNEL_HOME}/lib + +## 示例 + +源端数据读取的表格如下: + +| id | name | age | +|----|----------|-----| +| 1 | Joy Ding | 20 | +| 2 | May Ding | 21 | +| 3 | Kin Dom | 24 | +| 4 | Joy Dom | 22 | + +我们使用SQL查询中的UDF来转换源数据,类似于以下方式: + +``` +transform { + Sql { + source_table_name = "fake" + result_table_name = "fake1" + query = "select id, example(name) as name, age from fake" + } +} +``` + +那么结果表 `fake1` 中的数据将会更新为 + +| id | name | age | +|----|---------------|-----| +| 1 | UDF: Joy Ding | 20 | +| 2 | UDF: May Ding | 21 | +| 3 | UDF: Kin Dom | 24 | +| 4 | UDF: Joy Dom | 22 | + +## 更新日志 + +### 新版本 + +- 添加SQL转换连接器的UDF + diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql.md new file mode 100644 index 000000000000..1b56f1fef3f0 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.3.7/transform-v2/sql.md @@ -0,0 +1,158 @@ +# SQL + +> SQL 转换插件 + +## 描述 + +使用 SQL 来转换给定的输入行。 + +SQL 转换使用内存中的 SQL 引擎,我们可以通过 SQL 函数和 SQL 引擎的能力来实现转换任务。 + +## 属性 + +| 名称 | 类型 | 是否必须 | 默认值 | +|-------------------|--------|------|-----| +| source_table_name | string | yes | - | +| result_table_name | string | yes | - | +| query | string | yes | - | + +### source_table_name [string] + +源表名称,查询 SQL 表名称必须与此字段匹配。 + +### query [string] + +查询 SQL,它是一个简单的 SQL,支持基本的函数和条件过滤操作。但是,复杂的 SQL 尚不支持,包括:多源表/行连接和聚合操作等。 + +查询表达式可以是`select [table_name.]column_a`,这时会去查询列为`column_a`的列,`table_name`为可选项 +也可以是`select c_row.c_inner_row.column_b`,这时会去查询列`c_row`下的`c_inner_row`的`column_b`。**嵌套结构查询中,不能存在`table_name`** + +## 示例 + +源端数据读取的表格如下: + +| id | name | age | +|----|----------|-----| +| 1 | Joy Ding | 20 | +| 2 | May Ding | 21 | +| 3 | Kin Dom | 24 | +| 4 | Joy Dom | 22 | + +我们使用 SQL 查询来转换源数据,类似这样: + +``` +transform { + Sql { + source_table_name = "fake" + result_table_name = "fake1" + query = "select id, concat(name, '_') as name, age+1 as age from fake where id>0" + } +} +``` + +那么结果表 `fake1` 中的数据将会更新为: + +| id | name | age | +|----|-----------|-----| +| 1 | Joy Ding_ | 21 | +| 2 | May Ding_ | 22 | +| 3 | Kin Dom_ | 25 | +| 4 | Joy Dom_ | 23 | + +### 嵌套结构查询 + +例如你的上游数据结构是这样: + +```hacon +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + string.template = ["innerQuery"] + schema = { + fields { + name = "string" + c_date = "date" + c_row = { + c_inner_row = { + c_inner_int = 
"int" + c_inner_string = "string" + c_inner_timestamp = "timestamp" + c_map_1 = "map" + c_map_2 = "map>" + } + c_string = "string" + } + } + } + } +} +``` + +那么下列所有的查询表达式都是有效的 + +```sql +select +name, +c_date, +c_row, +c_row.c_inner_row, +c_row.c_string, +c_row.c_inner_row.c_inner_int, +c_row.c_inner_row.c_inner_string, +c_row.c_inner_row.c_inner_timestamp, +c_row.c_inner_row.c_map_1, +c_row.c_inner_row.c_map_1.some_key +``` + +但是这个查询语句是无效的 + +```sql +select +c_row.c_inner_row.c_map_2.some_key.inner_map_key +``` + +当查询map结构时,map结构应该为最后一个数据结构,不能查询嵌套map + +## 作业配置示例 + +``` +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + age = "int" + } + } + } +} + +transform { + Sql { + source_table_name = "fake" + result_table_name = "fake1" + query = "select id, concat(name, '_') as name, age+1 as age from fake where id>0" + } +} + +sink { + Console { + source_table_name = "fake1" + } +} +``` + +## 更新日志 + +### 新版本 + +- 添加SQL转换连接器 + diff --git a/src/pages/download/st_data.json b/src/pages/download/st_data.json index 4055d9e02163..2fc29801b737 100644 --- a/src/pages/download/st_data.json +++ b/src/pages/download/st_data.json @@ -1,4 +1,18 @@ [ + { + "date": "2024-08-19", + "version": "v2.3.7", + "sourceCode": { + "src": "https://www.apache.org/dyn/closer.lua/seatunnel/2.3.7/apache-seatunnel-2.3.7-src.tar.gz", + "asc": "https://downloads.apache.org/seatunnel/2.3.7/apache-seatunnel-2.3.7-src.tar.gz.asc", + "sha512": "https://downloads.apache.org/seatunnel/2.3.7/apache-seatunnel-2.3.7-src.tar.gz.sha512" + }, + "binaryDistribution": { + "bin": "https://www.apache.org/dyn/closer.lua/seatunnel/2.3.7/apache-seatunnel-2.3.7-bin.tar.gz", + "asc": "https://downloads.apache.org/seatunnel/2.3.7/apache-seatunnel-2.3.7-bin.tar.gz.asc", + "sha512": "https://downloads.apache.org/seatunnel/2.3.7/apache-seatunnel-2.3.7-bin.tar.gz.sha512" + } + }, { "date": "2024-08-02", "version": "v2.3.6", diff --git a/src/pages/versions/config.json b/src/pages/versions/config.json index 89389fbcc594..c38d7aa0a3d2 100644 --- a/src/pages/versions/config.json +++ b/src/pages/versions/config.json @@ -14,10 +14,10 @@ "nextLink": "/docs/intro/about", "latestData": [ { - "versionLabel": "2.3.6", - "docUrl": "/docs/2.3.6/about", - "downloadUrl": "https://github.com/apache/seatunnel/releases/tag/2.3.6", - "sourceTag": "2.3.6" + "versionLabel": "2.3.7", + "docUrl": "/docs/2.3.7/about", + "downloadUrl": "https://github.com/apache/seatunnel/releases/tag/2.3.7", + "sourceTag": "2.3.7" } ], "nextData": [ @@ -27,6 +27,12 @@ } ], "historyData": [ + { + "versionLabel": "2.3.7", + "docUrl": "/docs/2.3.7/about", + "downloadUrl": "https://github.com/apache/seatunnel/releases/tag/2.3.7", + "sourceTag": "2.3.7" + }, { "versionLabel": "2.3.6", "docUrl": "/docs/2.3.6/about", @@ -123,10 +129,10 @@ "nextLink": "/docs/about", "latestData": [ { - "versionLabel": "2.3.6", - "docUrl": "/docs/2.3.6/about", - "downloadUrl": "https://github.com/apache/seatunnel/releases/tag/2.3.6", - "sourceTag": "2.3.6" + "versionLabel": "2.3.7", + "docUrl": "/docs/2.3.7/about", + "downloadUrl": "https://github.com/apache/seatunnel/releases/tag/2.3.7", + "sourceTag": "2.3.7" } ], "nextData": [ @@ -136,6 +142,12 @@ } ], "historyData": [ + { + "versionLabel": "2.3.7", + "docUrl": "/docs/2.3.7/about", + "downloadUrl": "https://github.com/apache/seatunnel/releases/tag/2.3.7", + "sourceTag": "2.3.7" + }, { "versionLabel": "2.3.6", "docUrl": "/docs/2.3.6/about", diff --git 
a/versioned_docs/version-2.3.7/Connector-v2-release-state.md b/versioned_docs/version-2.3.7/Connector-v2-release-state.md new file mode 100644 index 000000000000..8705de7c7666 --- /dev/null +++ b/versioned_docs/version-2.3.7/Connector-v2-release-state.md @@ -0,0 +1,85 @@ +# Connector Release Status + +SeaTunnel uses a grading system for connectors to help you understand what to expect from a connector: + +| | Alpha | Beta | General Availability (GA) | +|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Expectations | An alpha connector signifies a connector under development and helps SeaTunnel gather early feedback and issues reported by early adopters. We strongly discourage using alpha releases for production use cases | A beta connector is considered stable and reliable with no backwards incompatible changes but has not been validated by a broader group of users. We expect to find and fix a few issues and bugs in the release before it’s ready for GA. | A generally available connector has been deemed ready for use in a production environment and is officially supported by SeaTunnel. Its documentation is considered sufficient to support widespread adoption. 
| +| | | | | +| Production Readiness | No | Yes | Yes | + +## Connector V2 Health + +| Connector Name | Type | Status | Support Version | +|-------------------------------------------------------------|--------|--------|-----------------| +| [AmazonDynamoDB](connector-v2/sink/AmazonDynamoDB.md) | Sink | Beta | 2.3.0 | +| [AmazonDynamoDB](connector-v2/source/AmazonDynamoDB.md) | Source | Beta | 2.3.0 | +| [Asset](connector-v2/sink/Assert.md) | Sink | Beta | 2.2.0-beta | +| [Cassandra](connector-v2/sink/Cassandra.md) | Sink | Beta | 2.3.0 | +| [Cassandra](connector-v2/source/Cassandra.md) | Source | Beta | 2.3.0 | +| [ClickHouse](connector-v2/source/Clickhouse.md) | Source | GA | 2.2.0-beta | +| [ClickHouse](connector-v2/sink/Clickhouse.md) | Sink | GA | 2.2.0-beta | +| [ClickHouseFile](connector-v2/sink/ClickhouseFile.md) | Sink | GA | 2.2.0-beta | +| [Console](connector-v2/sink/Console.md) | Sink | GA | 2.2.0-beta | +| [DataHub](connector-v2/sink/Datahub.md) | Sink | Alpha | 2.2.0-beta | +| [Doris](connector-v2/sink/Doris.md) | Sink | Beta | 2.3.0 | +| [DingTalk](connector-v2/sink/DingTalk.md) | Sink | Alpha | 2.2.0-beta | +| [Elasticsearch](connector-v2/sink/Elasticsearch.md) | Sink | GA | 2.2.0-beta | +| [Email](connector-v2/sink/Email.md) | Sink | Alpha | 2.2.0-beta | +| [Enterprise WeChat](connector-v2/sink/Enterprise-WeChat.md) | Sink | Alpha | 2.2.0-beta | +| [FeiShu](connector-v2/sink/Feishu.md) | Sink | Alpha | 2.2.0-beta | +| [Fake](connector-v2/source/FakeSource.md) | Source | GA | 2.2.0-beta | +| [FtpFile](connector-v2/sink/FtpFile.md) | Sink | Beta | 2.2.0-beta | +| [Greenplum](connector-v2/sink/Greenplum.md) | Sink | Beta | 2.2.0-beta | +| [Greenplum](connector-v2/source/Greenplum.md) | Source | Beta | 2.2.0-beta | +| [HdfsFile](connector-v2/sink/HdfsFile.md) | Sink | GA | 2.2.0-beta | +| [HdfsFile](connector-v2/source/HdfsFile.md) | Source | GA | 2.2.0-beta | +| [Hive](connector-v2/sink/Hive.md) | Sink | GA | 2.2.0-beta | +| [Hive](connector-v2/source/Hive.md) | Source | GA | 2.2.0-beta | +| [Http](connector-v2/sink/Http.md) | Sink | Beta | 2.2.0-beta | +| [Http](connector-v2/source/Http.md) | Source | Beta | 2.2.0-beta | +| [Iceberg](connector-v2/source/Iceberg.md) | Source | Beta | 2.2.0-beta | +| [InfluxDB](connector-v2/sink/InfluxDB.md) | Sink | Beta | 2.3.0 | +| [InfluxDB](connector-v2/source/InfluxDB.md) | Source | Beta | 2.3.0-beta | +| [IoTDB](connector-v2/source/IoTDB.md) | Source | GA | 2.2.0-beta | +| [IoTDB](connector-v2/sink/IoTDB.md) | Sink | GA | 2.2.0-beta | +| [Jdbc](connector-v2/source/Jdbc.md) | Source | GA | 2.2.0-beta | +| [Jdbc](connector-v2/sink/Jdbc.md) | Sink | GA | 2.2.0-beta | +| [Kafka](connector-v2/source/kafka.md) | Source | GA | 2.3.0 | +| [Kafka](connector-v2/sink/Kafka.md) | Sink | GA | 2.2.0-beta | +| [Kudu](connector-v2/source/Kudu.md) | Source | Beta | 2.2.0-beta | +| [Kudu](connector-v2/sink/Kudu.md) | Sink | Beta | 2.2.0-beta | +| [Lemlist](connector-v2/source/Lemlist.md) | Source | Beta | 2.3.0 | +| [LocalFile](connector-v2/sink/LocalFile.md) | Sink | GA | 2.2.0-beta | +| [LocalFile](connector-v2/source/LocalFile.md) | Source | GA | 2.2.0-beta | +| [Maxcompute](connector-v2/source/Maxcompute.md) | Source | Alpha | 2.3.0 | +| [Maxcompute](connector-v2/sink/Maxcompute.md) | Sink | Alpha | 2.3.0 | +| [MongoDB](connector-v2/source/MongoDB.md) | Source | Beta | 2.2.0-beta | +| [MongoDB](connector-v2/sink/MongoDB.md) | Sink | Beta | 2.2.0-beta | +| [MyHours](connector-v2/source/MyHours.md) | Source | Alpha | 2.2.0-beta | +| 
[MySqlCDC](connector-v2/source/MySQL-CDC.md) | Source | GA | 2.3.0 | +| [Neo4j](connector-v2/sink/Neo4j.md) | Sink | Beta | 2.2.0-beta | +| [Notion](connector-v2/source/Notion.md) | Source | Alpha | 2.3.0 | +| [OneSignal](connector-v2/source/OneSignal.md) | Source | Beta | 2.3.0 | +| [OpenMldb](connector-v2/source/OpenMldb.md) | Source | Beta | 2.3.0 | +| [OssFile](connector-v2/sink/OssFile.md) | Sink | Beta | 2.2.0-beta | +| [OssFile](connector-v2/source/OssFile.md) | Source | Beta | 2.2.0-beta | +| [Phoenix](connector-v2/sink/Phoenix.md) | Sink | Beta | 2.2.0-beta | +| [Phoenix](connector-v2/source/Phoenix.md) | Source | Beta | 2.2.0-beta | +| [Pulsar](connector-v2/source/Pulsar.md) | Source | Beta | 2.2.0-beta | +| [RabbitMQ](connector-v2/sink/Rabbitmq.md) | Sink | Beta | 2.3.0 | +| [RabbitMQ](connector-v2/source/Rabbitmq.md) | Source | Beta | 2.3.0 | +| [Redis](connector-v2/sink/Redis.md) | Sink | Beta | 2.2.0-beta | +| [Redis](connector-v2/source/Redis.md) | Source | Beta | 2.2.0-beta | +| [S3Redshift](connector-v2/sink/S3-Redshift.md) | Sink | GA | 2.3.0-beta | +| [S3File](connector-v2/source/S3File.md) | Source | GA | 2.3.0-beta | +| [S3File](connector-v2/sink/S3File.md) | Sink | GA | 2.3.0-beta | +| [Sentry](connector-v2/sink/Sentry.md) | Sink | Alpha | 2.2.0-beta | +| [SFtpFile](connector-v2/sink/SftpFile.md) | Sink | Beta | 2.3.0 | +| [SFtpFile](connector-v2/source/SftpFile.md) | Source | Beta | 2.3.0 | +| [Slack](connector-v2/sink/Slack.md) | Sink | Beta | 2.3.0 | +| [Socket](connector-v2/sink/Socket.md) | Sink | Beta | 2.2.0-beta | +| [Socket](connector-v2/source/Socket.md) | Source | Beta | 2.2.0-beta | +| [StarRocks](connector-v2/sink/StarRocks.md) | Sink | Alpha | 2.3.0 | +| [Tablestore](connector-v2/sink/Tablestore.md) | Sink | Alpha | 2.3.0 | + diff --git a/versioned_docs/version-2.3.7/about.md b/versioned_docs/version-2.3.7/about.md new file mode 100644 index 000000000000..0e961cbc0746 --- /dev/null +++ b/versioned_docs/version-2.3.7/about.md @@ -0,0 +1,72 @@ +# About SeaTunnel + +seatunnel logo + +[![Slack](https://img.shields.io/badge/slack-%23seatunnel-4f8eba?logo=slack)](https://s.apache.org/seatunnel-slack) +[![Twitter Follow](https://img.shields.io/twitter/follow/ASFSeaTunnel.svg?label=Follow&logo=twitter)](https://twitter.com/ASFSeaTunnel) + +SeaTunnel is a very easy-to-use, ultra-high-performance, distributed data integration platform that supports real-time +synchronization of massive data. It can synchronize tens of billions of data stably and efficiently every day, and has +been used in production by nearly 100 companies. + +## Why We Need SeaTunnel + +SeaTunnel focuses on data integration and data synchronization, and is mainly designed to solve common problems in the field of data integration: + +- Various data sources: There are hundreds of commonly-used data sources with incompatible versions. With the emergence of new technologies, more data sources are appearing. It is difficult for users to find a tool that can fully and quickly support these data sources. +- Complex synchronization scenarios: Data synchronization needs to support various synchronization scenarios such as offline-full synchronization, offline-incremental synchronization, CDC, real-time synchronization, and full database synchronization. +- High resource demand: Existing data integration and data synchronization tools often require vast computing resources or JDBC connection resources to complete real-time synchronization of massive small tables. 
This has increased the burden on enterprises. +- Lack of quality and monitoring: Data integration and synchronization processes often experience loss or duplication of data. The synchronization process lacks monitoring, and it is impossible to intuitively understand the real situation of the data during the task process. +- Complex technology stack: The technology components used by enterprises are different, and users need to develop corresponding synchronization programs for different components to complete data integration. +- Difficulty in management and maintenance: Limited to different underlying technology components (Flink/Spark), offline synchronization and real-time synchronization often have be developed and managed separately, which increases the difficulty of management and maintenance. + +## Features Of SeaTunnel + +- Rich and extensible Connector: SeaTunnel provides a Connector API that does not depend on a specific execution engine. Connectors (Source, Transform, Sink) developed based on this API can run on many different engines, such as SeaTunnel Engine(Zeta), Flink, and Spark. +- Connector plugin: The plugin design allows users to easily develop their own Connector and integrate it into the SeaTunnel project. Currently, SeaTunnel supports more than 100 Connectors, and the number is surging. Here is the list of [Currently Supported Connectors](Connector-v2-release-state.md) +- Batch-stream integration: Connectors developed based on the SeaTunnel Connector API are perfectly compatible with offline synchronization, real-time synchronization, full-synchronization, incremental synchronization and other scenarios. They greatly reduce the difficulty of managing data integration tasks. +- Supports a distributed snapshot algorithm to ensure data consistency. +- Multi-engine support: SeaTunnel uses the SeaTunnel Engine(Zeta) for data synchronization by default. SeaTunnel also supports the use of Flink or Spark as the execution engine of the Connector to adapt to the enterprise's existing technical components. SeaTunnel supports multiple versions of Spark and Flink. +- JDBC multiplexing, database log multi-table parsing: SeaTunnel supports multi-table or whole database synchronization, which solves the problem of over-JDBC connections; and supports multi-table or whole database log reading and parsing, which solves the need for CDC multi-table synchronization scenarios to deal with problems with repeated reading and parsing of logs. +- High throughput and low latency: SeaTunnel supports parallel reading and writing, providing stable and reliable data synchronization capabilities with high throughput and low latency. +- Perfect real-time monitoring: SeaTunnel supports detailed monitoring information of each step in the data synchronization process, allowing users to easily understand the number of data, data size, QPS and other information read and written by the synchronization task. +- Two job development methods are supported: coding and canvas design. The SeaTunnel web project https://github.com/apache/seatunnel-web provides visual management of jobs, scheduling, running and monitoring capabilities. + +## SeaTunnel Work Flowchart + +![SeaTunnel Work Flowchart](/image_en/architecture_diagram.png) + +The runtime process of SeaTunnel is shown in the figure above. + +The user configures the job information and selects the execution engine to submit the job. 
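+
+For example, with the default SeaTunnel Engine a job described in a config file can be submitted from the command line. The command below is only an illustration and uses the sample template config shipped in the binary package:
+
+```shell
+# Submit a batch job defined in a config file to the local SeaTunnel Engine (Zeta)
+./bin/seatunnel.sh --config ./config/v2.batch.config.template -m local
+```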
+ +The Source Connector is responsible for parallel reading and sending the data to the downstream Transform or directly to the Sink, and the Sink writes the data to the destination. It is worth noting that Source, Transform and Sink can be easily developed and extended by yourself. + +SeaTunnel is an EL(T) data integration platform. Therefore, in SeaTunnel, Transform can only be used to perform some simple transformations on data, such as converting the data of a column to uppercase or lowercase, changing the column name, or splitting a column into multiple columns. + +The default engine use by SeaTunnel is [SeaTunnel Engine](seatunnel-engine/about.md). If you choose to use the Flink or Spark engine, SeaTunnel will package the Connector into a Flink or Spark program and submit it to Flink or Spark to run. + +## Connector + +- **Source Connectors** SeaTunnel supports reading data from various relational, graph, NoSQL, document, and memory databases; distributed file systems such as HDFS; and a variety of cloud storage solutions, such as S3 and OSS. We also support data reading of many common SaaS services. You can access the detailed list [Here](connector-v2/source). If you want, You can develop your own source connector and easily integrate it into SeaTunnel. + +- **Transform Connector** If the schema is different between source and Sink, You can use the Transform Connector to change the schema read from source and make it the same as the Sink schema. + +- **Sink Connector** SeaTunnel supports writing data to various relational, graph, NoSQL, document, and memory databases; distributed file systems such as HDFS; and a variety of cloud storage solutions, such as S3 and OSS. We also support writing data to many common SaaS services. You can access the detailed list [Here](connector-v2/sink). If you want, you can develop your own Sink connector and easily integrate it into SeaTunnel. + +## Who Uses SeaTunnel + +SeaTunnel has lots of users. You can find more information about them in [Users](https://seatunnel.apache.org/user). + +## Landscapes + +

+
+SeaTunnel enriches the CNCF CLOUD NATIVE Landscape. +

+ +## Learn more + +You can see [Quick Start](/docs/category/start-v2/locally/deployment) for the next steps. diff --git a/versioned_docs/version-2.3.7/command/connector-check.md b/versioned_docs/version-2.3.7/command/connector-check.md new file mode 100644 index 000000000000..8ac35af6d10f --- /dev/null +++ b/versioned_docs/version-2.3.7/command/connector-check.md @@ -0,0 +1,35 @@ +# Connector Check Command Usage + +## Command Entrypoint + +```shell +bin/seatunnel-connector.sh +``` + +## Options + +```text +Usage: seatunnel-connector.sh [options] + Options: + -h, --help Show the usage message + -l, --list List all supported plugins(sources, sinks, transforms) + (default: false) + -o, --option-rule Get option rule of the plugin by the plugin + identifier(connector name or transform name) + -pt, --plugin-type SeaTunnel plugin type, support [source, sink, + transform] +``` + +## Example + +```shell +# List all supported connectors(sources and sinks) and transforms +bin/seatunnel-connector.sh -l +# List all supported sinks +bin/seatunnel-connector.sh -l -pt sink +# Get option rule of the connector or transform by the name +bin/seatunnel-connector.sh -o Paimon +# Get option rule of paimon sink +bin/seatunnel-connector.sh -o Paimon -pt sink +``` + diff --git a/versioned_docs/version-2.3.7/command/usage.mdx b/versioned_docs/version-2.3.7/command/usage.mdx new file mode 100644 index 000000000000..e3d82519cb5c --- /dev/null +++ b/versioned_docs/version-2.3.7/command/usage.mdx @@ -0,0 +1,176 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Command Usage + +## Command Entrypoint + + + + +```bash +bin/start-seatunnel-spark-2-connector-v2.sh +``` + + + + +```bash +bin/start-seatunnel-spark-3-connector-v2.sh +``` + + + + +```bash +bin/start-seatunnel-flink-13-connector-v2.sh +``` + + + + +```bash +bin/start-seatunnel-flink-15-connector-v2.sh +``` + + + + + +## Options + + + + +```bash +Usage: start-seatunnel-spark-2-connector-v2.sh [options] + Options: + --check Whether check config (default: false) + -c, --config Config file + -e, --deploy-mode Spark deploy mode, support [cluster, client] (default: + client) + -h, --help Show the usage message + -m, --master Spark master, support [spark://host:port, + mesos://host:port, yarn, k8s://https://host:port, + local], default local[*] (default: local[*]) + -n, --name SeaTunnel job name (default: SeaTunnel) + -i, --variable Variable substitution, such as -i city=beijing, or -i + date=20190318 (default: []) +``` + + + + +```bash +Usage: start-seatunnel-spark-3-connector-v2.sh [options] + Options: + --check Whether check config (default: false) + -c, --config Config file + -e, --deploy-mode Spark deploy mode, support [cluster, client] (default: + client) + -h, --help Show the usage message + -m, --master Spark master, support [spark://host:port, + mesos://host:port, yarn, k8s://https://host:port, + local], default local[*] (default: local[*]) + -n, --name SeaTunnel job name (default: SeaTunnel) + -i, --variable Variable substitution, such as -i city=beijing, or -i + date=20190318 (default: []) +``` + + + + +```bash +Usage: start-seatunnel-flink-13-connector-v2.sh [options] + Options: + --check Whether check config (default: false) + -c, --config Config file + -e, --deploy-mode Flink job deploy mode, support [run, run-application] + (default: run) + -h, --help Show the usage message + --master, --target Flink job submitted target master, support [local, + remote, yarn-session, yarn-per-job, kubernetes-session, + yarn-application, 
kubernetes-application] + -n, --name SeaTunnel job name (default: SeaTunnel) + -i, --variable Variable substitution, such as -i city=beijing, or -i + date=20190318 (default: []) +``` + + + + +```bash +Usage: start-seatunnel-flink-15-connector-v2.sh [options] + Options: + --check Whether check config (default: false) + -c, --config Config file + -e, --deploy-mode Flink job deploy mode, support [run, run-application] + (default: run) + -h, --help Show the usage message + --master, --target Flink job submitted target master, support [local, + remote, yarn-session, yarn-per-job, kubernetes-session, + yarn-application, kubernetes-application] + -n, --name SeaTunnel job name (default: SeaTunnel) + -i, --variable Variable substitution, such as -i city=beijing, or -i + date=20190318 (default: []) +``` + + + + +## Example + + + + +```bash +bin/start-seatunnel-spark-2-connector-v2.sh --config config/v2.batch.config.template -m local -e client +``` + + + + +```bash +bin/start-seatunnel-spark-3-connector-v2.sh --config config/v2.batch.config.template -m local -e client +``` + + + + +```bash +bin/start-seatunnel-flink-13-connector-v2.sh --config config/v2.batch.config.template +``` + + + + +```bash +bin/start-seatunnel-flink-15-connector-v2.sh --config config/v2.batch.config.template +``` + + + diff --git a/versioned_docs/version-2.3.7/concept/JobEnvConfig.md b/versioned_docs/version-2.3.7/concept/JobEnvConfig.md new file mode 100644 index 000000000000..77c924b68f20 --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/JobEnvConfig.md @@ -0,0 +1,65 @@ +# Job Env Config + +This document describes env configuration information. The common parameters can be used in all engines. In order to better distinguish between engine parameters, the additional parameters of other engine need to carry a prefix. +In flink engine, we use `flink.` as the prefix. In the spark engine, we do not use any prefixes to modify parameters, because the official spark parameters themselves start with `spark.` + +## Common Parameter + +The following configuration parameters are common to all engines. + +### job.name + +This parameter configures the task name. + +### jars + +Third-party packages can be loaded via `jars`, like `jars="file://local/jar1.jar;file://local/jar2.jar"`. + +### job.mode + +You can configure whether the task is in batch or stream mode through `job.mode`, like `job.mode = "BATCH"` or `job.mode = "STREAMING"` + +### checkpoint.interval + +Gets the interval in which checkpoints are periodically scheduled. + +In `STREAMING` mode, checkpoints is required, if you do not set it, it will be obtained from the application configuration file `seatunnel.yaml`. In `BATCH` mode, you can disable checkpoints by not setting this parameter. + +### parallelism + +This parameter configures the parallelism of source and sink. + +### job.retry.times + +Used to control the default retry times when a job fails. The default value is 3, and it only works in the Zeta engine. + +### job.retry.interval.seconds + +Used to control the default retry interval when a job fails. The default value is 3 seconds, and it only works in the Zeta engine. + +### savemode.execute.location + +This parameter is used to specify the location of the savemode when the job is executed in the Zeta engine. +The default value is `CLUSTER`, which means that the savemode is executed on the cluster. If you want to execute the savemode on the client, +you can set it to `CLIENT`. 
Please use `CLUSTER` mode as much as possible, because when there are no problems with `CLUSTER` mode, we will remove `CLIENT` mode. + +### shade.identifier + +Specify the method of encryption, if you didn't have the requirement for encrypting or decrypting config files, this option can be ignored. + +For more details, you can refer to the documentation [Config Encryption Decryption](../connector-v2/Config-Encryption-Decryption.md) + +## Flink Engine Parameter + +Here are some SeaTunnel parameter names corresponding to the names in Flink, not all of them. Please refer to the official [Flink Documentation](https://flink.apache.org/). + +| Flink Configuration Name | SeaTunnel Configuration Name | +|---------------------------------|---------------------------------------| +| pipeline.max-parallelism | flink.pipeline.max-parallelism | +| execution.checkpointing.mode | flink.execution.checkpointing.mode | +| execution.checkpointing.timeout | flink.execution.checkpointing.timeout | +| ... | ... | + +## Spark Engine Parameter + +Because Spark configuration items have not been modified, they are not listed here, please refer to the official [Spark Documentation](https://spark.apache.org/). diff --git a/versioned_docs/version-2.3.7/concept/config.md b/versioned_docs/version-2.3.7/concept/config.md new file mode 100644 index 000000000000..3c206587a75a --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/config.md @@ -0,0 +1,323 @@ +--- + +sidebar_position: 2 +------------------- + +# Intro to config file + +In SeaTunnel, the most important thing is the config file, through which users can customize their own data +synchronization requirements to maximize the potential of SeaTunnel. So next, I will introduce you how to +configure the config file. + +The main format of the config file is `hocon`, for more details you can refer to [HOCON-GUIDE](https://github.com/lightbend/config/blob/main/HOCON.md), +BTW, we also support the `json` format, but you should keep in mind that the name of the config file should end with `.json`. + +We also support the `SQL` format, please refer to [SQL configuration](sql-config.md) for more details. + +## Example + +Before you read on, you can find config file +examples [Here](https://github.com/apache/seatunnel/tree/dev/config) from the binary package's +config directory. + +## Config File Structure + +The config file is similar to the below one: + +### hocon + +```hocon +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + name = "string" + age = "int" + card = "int" + } + } + } +} + +transform { + Filter { + source_table_name = "fake" + result_table_name = "fake1" + fields = [name, card] + } +} + +sink { + Clickhouse { + host = "clickhouse:8123" + database = "default" + table = "seatunnel_console" + fields = ["name", "card"] + username = "default" + password = "" + source_table_name = "fake1" + } +} +``` + +#### multi-line support + +In `hocon`, multiline strings are supported, which allows you to include extended passages of text without worrying about newline characters or special formatting. This is achieved by enclosing the text within triple quotes **`"""`** . For example: + +``` +var = """ +Apache SeaTunnel is a +next-generation high-performance, +distributed, massive data integration tool. 
+""" +sql = """ select * from "table" """ +``` + +### json + +```json + +{ + "env": { + "job.mode": "batch" + }, + "source": [ + { + "plugin_name": "FakeSource", + "result_table_name": "fake", + "row.num": 100, + "schema": { + "fields": { + "name": "string", + "age": "int", + "card": "int" + } + } + } + ], + "transform": [ + { + "plugin_name": "Filter", + "source_table_name": "fake", + "result_table_name": "fake1", + "fields": ["name", "card"] + } + ], + "sink": [ + { + "plugin_name": "Clickhouse", + "host": "clickhouse:8123", + "database": "default", + "table": "seatunnel_console", + "fields": ["name", "card"], + "username": "default", + "password": "", + "source_table_name": "fake1" + } + ] +} + +``` + +As you can see, the config file contains several sections: env, source, transform, sink. Different modules +have different functions. After you understand these modules, you will see how SeaTunnel works. + +### env + +Used to add some engine optional parameters, no matter which engine (Zeta, Spark or Flink), the corresponding +optional parameters should be filled in here. + +Note that we have separated the parameters by engine, and for the common parameters, we can configure them as before. +For flink and spark engine, the specific configuration rules of their parameters can be referred to [JobEnvConfig](./JobEnvConfig.md). + + + +### source + +Source is used to define where SeaTunnel needs to fetch data, and use the fetched data for the next step. +Multiple sources can be defined at the same time. The supported source can be found +in [Source of SeaTunnel](../connector-v2/source). Each source has its own specific parameters to define how to +fetch data, and SeaTunnel also extracts the parameters that each source will use, such as +the `result_table_name` parameter, which is used to specify the name of the data generated by the current +source, which is convenient for follow-up used by other modules. + +### transform + +When we have the data source, we may need to further process the data, so we have the transform module. Of +course, this uses the word 'may', which means that we can also directly treat the transform as non-existent, +directly from source to sink. Like below. + +```hocon +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + name = "string" + age = "int" + card = "int" + } + } + } +} + +sink { + Clickhouse { + host = "clickhouse:8123" + database = "default" + table = "seatunnel_console" + fields = ["name", "age", "card"] + username = "default" + password = "" + source_table_name = "fake" + } +} +``` + +Like source, transform has specific parameters that belong to each module. The supported transform can be found +in [Transform V2 of SeaTunnel](../transform-v2) + +### sink + +Our purpose with SeaTunnel is to synchronize data from one place to another, so it is critical to define how +and where data is written. With the sink module provided by SeaTunnel, you can complete this operation quickly +and efficiently. Sink and source are very similar, but the difference is reading and writing. So please check out +[Supported Sinks](../connector-v2/sink). + +### Other + +You will find that when multiple sources and multiple sinks are defined, which data is read by each sink, and +which is the data read by each transform? We introduce two key configurations called `result_table_name` and +`source_table_name`. 
Each source module will be configured with a `result_table_name` to indicate the name of the +data source generated by the data source, and other transform and sink modules can use `source_table_name` to +refer to the corresponding data source name, indicating that I want to read the data for processing. Then +transform, as an intermediate processing module, can use both `result_table_name` and `source_table_name` +configurations at the same time. But you will find that in the above example config, not every module is +configured with these two parameters, because in SeaTunnel, there is a default convention, if these two +parameters are not configured, then the generated data from the last module of the previous node will be used. +This is much more convenient when there is only one source. + +## Config Variable Substitution + +In config file we can define some variables and replace it in run time. **This is only support `hocon` format file**. + +```hocon +env { + job.mode = "BATCH" + job.name = ${jobName} + parallelism = 2 +} + +source { + FakeSource { + result_table_name = ${resName} + row.num = ${rowNum} + string.template = ${strTemplate} + int.template = [20, 21] + schema = { + fields { + name = ${nameType} + age = "int" + } + } + } +} + +transform { + sql { + source_table_name = "fake" + result_table_name = "sql" + query = "select * from "${resName}" where name = '"${nameVal}"' " + } + +} + +sink { + Console { + source_table_name = "sql" + username = ${username} + password = ${password} + } +} + +``` + +In the above config, we define some variables, like `${rowNum}`, `${resName}`. +We can replace those parameters with this shell command: + +```shell +./bin/seatunnel.sh -c +-i jobName='this_is_a_job_name' +-i resName=fake +-i rowNum=10 +-i strTemplate=['abc','d~f','hi'] +-i nameType=string +-i nameVal=abc +-i username=seatunnel=2.3.1 +-i password='$a^b%c.d~e0*9(' +-m local +``` + +Then the final submitted config is: + +```hocon +env { + job.mode = "BATCH" + job.name = "this_is_a_job_name" + parallelism = 2 +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 10 + string.template = ["abc","d~f","h i"] + int.template = [20, 21] + schema = { + fields { + name = string + age = "int" + } + } + } +} + +transform { + sql { + source_table_name = "fake" + result_table_name = "sql" + query = "select * from fake where name = 'abc' " + } + +} + +sink { + Console { + source_table_name = "sql" + username = "seatunnel=2.3.1" + password = "$a^b%c.d~e0*9(" + } +} +``` + +Some Notes: +- Quota with `'` if the value has special character such as `(` +- If the replacement variables is in `"` or `'`, like `resName` and `nameVal`, you need add `"` +- The value can't have space `' '`, like `-i jobName='this is a job name' `, this will be replaced to `job.name = "this"` +- If you want to use dynamic parameters, you can use the following format: -i date=$(date +"%Y%m%d"). + +## What's More + +If you want to know the details of the format configuration, please +see [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md). diff --git a/versioned_docs/version-2.3.7/concept/connector-v2-features.md b/versioned_docs/version-2.3.7/concept/connector-v2-features.md new file mode 100644 index 000000000000..83b24edebf49 --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/connector-v2-features.md @@ -0,0 +1,75 @@ +# Intro To Connector V2 Features + +## Differences Between Connector V2 And V1 + +Since https://github.com/apache/seatunnel/issues/1608 We Added Connector V2 Features. 
+Connector V2 is a connector defined based on the SeaTunnel Connector API interface. Unlike Connector V1, V2 supports the following features: + +* **Multi Engine Support** SeaTunnel Connector API is an engine independent API. The connectors developed based on this API can run in multiple engines. Currently, Flink and Spark are supported, and we will support other engines in the future. +* **Multi Engine Version Support** Decoupling the connector from the engine through the translation layer solves the problem that most connectors need to modify the code in order to support a new version of the underlying engine. +* **Unified Batch And Stream** Connector V2 can perform batch processing or streaming processing. We do not need to develop connectors for batch and stream separately. +* **Multiplexing JDBC/Log connection.** Connector V2 supports JDBC resource reuse and sharing database log parsing. + +## Source Connector Features + +Source connectors have some common core features, and each source connector supports them to varying degrees. + +### exactly-once + +If each piece of data in the data source will only be sent downstream by the source once, we think this source connector supports exactly once. + +In SeaTunnel, we can save the read **Split** and its **offset** (The position of the read data in split at that time, +such as line number, byte size, offset, etc.) as **StateSnapshot** when checkpointing. If the task restarted, we will get the last **StateSnapshot** +and then locate the **Split** and **offset** read last time and continue to send data downstream. + +For example `File`, `Kafka`. + +### column projection + +If the connector supports reading only specified columns from the data source (Note that if you read all columns first and then filter unnecessary columns through the schema, this method is not a real column projection) + +For example `JDBCSource` can use sql to define reading columns. + +`KafkaSource` will read all content from topic and then use `schema` to filter unnecessary columns, This is not `column projection`. + +### batch + +Batch Job Mode, The data read is bounded and the job will stop after completing all data read. + +### stream + +Streaming Job Mode, The data read is unbounded and the job never stop. + +### parallelism + +Parallelism Source Connector support config `parallelism`, every parallelism will create a task to read the data. +In the **Parallelism Source Connector**, the source will be split into multiple splits, and then the enumerator will allocate the splits to the SourceReader for processing. + +### support user-defined split + +User can config the split rule. + +### support multiple table read + +Supports reading multiple tables in one SeaTunnel job + +## Sink Connector Features + +Sink connectors have some common core features, and each sink connector supports them to varying degrees. + +### exactly-once + +When any piece of data flows into a distributed system, if the system processes any piece of data accurately only once in the whole processing process and the processing results are correct, it is considered that the system meets the exact once consistency. + +For sink connector, the sink connector supports exactly-once if any piece of data only write into target once. There are generally two ways to achieve this: + +* The target database supports key deduplication. For example `MySQL`, `Kudu`. +* The target support **XA Transaction**(This transaction can be used across sessions. 
Even if the program that created the transaction has ended, the newly started program only needs to know the ID of the last transaction to resubmit or roll back the transaction). Then we can use **Two-phase Commit** to ensure **exactly-once**. For example `File`, `MySQL`. + +### cdc(change data capture) + +If a sink connector supports writing row kinds(INSERT/UPDATE_BEFORE/UPDATE_AFTER/DELETE) based on primary key, we think it supports cdc(change data capture). + +### support multiple table write + +Supports write multiple tables in one SeaTunnel job, users can dynamically specify the table's identifier by [configuring placeholders](./sink-options-placeholders.md). diff --git a/versioned_docs/version-2.3.7/concept/event-listener.md b/versioned_docs/version-2.3.7/concept/event-listener.md new file mode 100644 index 000000000000..7ba4550205df --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/event-listener.md @@ -0,0 +1,116 @@ +# Event Listener + +## Introduction + +The SeaTunnel provides a rich event listening feature that allows you to manage the status at which data is synchronized. +This functionality is crucial when you need to listen job running status(`org.apache.seatunnel.api.event`). +This document will guide you through the usage of these parameters and how to leverage them effectively. + +## Support Those Engines + +> SeaTunnel Zeta
+> Flink
+> Spark
+ +## API + +The event API is defined in the `org.apache.seatunnel.api.event` package. + +### Event Data API + +- `org.apache.seatunnel.api.event.Event` - The interface for event data. +- `org.apache.seatunnel.api.event.EventType` - The enum for event type. + +### Event Listener API + +You can customize event handler, such as sending events to external systems + +- `org.apache.seatunnel.api.event.EventHandler` - The interface for event handler, SPI will automatically load subclass from the classpath. + +### Event Collect API + +- `org.apache.seatunnel.api.source.SourceSplitEnumerator` - Attached event listener API to report events from `SourceSplitEnumerator`. + +```java +package org.apache.seatunnel.api.source; + +public interface SourceSplitEnumerator { + + interface Context { + + /** + * Get the {@link org.apache.seatunnel.api.event.EventListener} of this enumerator. + * + * @return + */ + EventListener getEventListener(); + } +} +``` + +- `org.apache.seatunnel.api.source.SourceReader` - Attached event listener API to report events from `SourceReader`. + +```java +package org.apache.seatunnel.api.source; + +public interface SourceReader { + + interface Context { + + /** + * Get the {@link org.apache.seatunnel.api.event.EventListener} of this reader. + * + * @return + */ + EventListener getEventListener(); + } +} +``` + +- `org.apache.seatunnel.api.sink.SinkWriter` - Attached event listener API to report events from `SinkWriter`. + +```java +package org.apache.seatunnel.api.sink; + +public interface SinkWriter { + + interface Context { + + /** + * Get the {@link org.apache.seatunnel.api.event.EventListener} of this writer. + * + * @return + */ + EventListener getEventListener(); + } +} +``` + +## Configuration Listener + +To use the event listening feature, you need to configure engine config. + +### Zeta Engine + +Example config in your config file(seatunnel.yaml): + +``` +seatunnel: + engine: + event-report-http: + url: "http://example.com:1024/event/report" + headers: + Content-Type: application/json +``` + +### Flink Engine + +You can define the implementation class of `org.apache.seatunnel.api.event.EventHandler` interface and add to the classpath to automatically load it through SPI. + +Support flink version: 1.14.0+ + +Example: `org.apache.seatunnel.api.event.LoggingEventHandler` + +### Spark Engine + +You can define the implementation class of `org.apache.seatunnel.api.event.EventHandler` interface and add to the classpath to automatically load it through SPI. diff --git a/versioned_docs/version-2.3.7/concept/schema-feature.md b/versioned_docs/version-2.3.7/concept/schema-feature.md new file mode 100644 index 000000000000..a448104fcf3d --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/schema-feature.md @@ -0,0 +1,264 @@ +# Intro to schema feature + +## Why We Need Schema + +Some NoSQL databases or message queue are not strongly limited schema, so the schema cannot be obtained through the api. +At this time, a schema needs to be defined to convert to TableSchema and obtain data. + +## SchemaOptions + +We can use SchemaOptions to define schema, the SchemaOptions contains some configs to define the schema. e.g. columns, primaryKey, constraintKeys. + +``` +schema = { + table = "database.schema.table" + schema_first = false + comment = "comment" + columns = [ + ... + ] + primaryKey { + ... + } + + constraintKeys { + ... + } +} +``` + +### table + +The table full name of the table identifier which the schema belongs to, it contains database, schema, table name. e.g. 
`database.schema.table`, `database.table`, `table`. + +### schema_first + +Default is false. + +If the schema_first is true, the schema will be used first, this means if we set `table = "a.b"`, `a` will be parsed as schema rather than database, then we can support write `table = "schema.table"`. + +### comment + +The comment of the CatalogTable which the schema belongs to. + +### Columns + +Columns is a list of configs used to define the column in schema, each column can contains name, type, nullable, defaultValue, comment field. + +``` +columns = [ + { + name = id + type = bigint + nullable = false + columnLength = 20 + defaultValue = 0 + comment = "primary key id" + } +] +``` + +| Field | Required | Default Value | Description | +|:-------------|:---------|:--------------|----------------------------------------------------------------------------------| +| name | Yes | - | The name of the column | +| type | Yes | - | The data type of the column | +| nullable | No | true | If the column can be nullable | +| columnLength | No | 0 | The length of the column which will be useful when you need to define the length | +| columnScale | No | - | The scale of the column which will be useful when you need to define the scale | +| defaultValue | No | null | The default value of the column | +| comment | No | null | The comment of the column | + +#### What type supported at now + +| Data type | Value type in Java | Description | +|:----------|:---------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| string | `java.lang.String` | string | +| boolean | `java.lang.Boolean` | boolean | +| tinyint | `java.lang.Byte` | -128 to 127 regular. 0 to 255 unsigned*. Specify the maximum number of digits in parentheses. | +| smallint | `java.lang.Short` | -32768 to 32767 General. 0 to 65535 unsigned*. Specify the maximum number of digits in parentheses. | +| int | `java.lang.Integer` | All numbers from -2,147,483,648 to 2,147,483,647 are allowed. | +| bigint | `java.lang.Long` | All numbers between -9,223,372,036,854,775,808 and 9,223,372,036,854,775,807 are allowed. | +| float | `java.lang.Float` | Float-precision numeric data from -1.79E+308 to 1.79E+308. | +| double | `java.lang.Double` | Double precision floating point. Handle most decimals. | +| decimal | `java.math.BigDecimal` | Double type stored as a string, allowing a fixed decimal point. | +| null | `java.lang.Void` | null | +| bytes | `byte[]` | bytes | +| date | `java.time.LocalDate` | Only the date is stored. From January 1, 0001 to December 31, 9999. | +| time | `java.time.LocalTime` | Only store time. Accuracy is 100 nanoseconds. | +| timestamp | `java.time.LocalDateTime` | Stores a unique number that is updated whenever a row is created or modified. timestamp is based on the internal clock and does not correspond to real time. There can only be one timestamp variable per table. | +| row | `org.apache.seatunnel.api.table.type.SeaTunnelRow` | Row type, can be nested. | +| map | `java.util.Map` | A Map is an object that maps keys to values. 
The key type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` , and the value type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` `array` `map` `row`. | +| array | `ValueType[]` | A array is a data type that represents a collection of elements. The element type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double`. | + +#### How to declare type supported + +SeaTunnel provides a simple and direct way to declare basic types. Basic type keywords include `string`, `boolean`, `tinyint`, `smallint`, `int`, `bigint`, `float`, `double`, `date`, `time`, `timestamp`, and `null`. The keyword names for basic types can be used directly as type declarations, and SeaTunnel is case-insensitive to type keywords. For example, if you need to declare a field with integer type, you can simply define the field as `int` or `"int"`. + +> The null type declaration must be enclosed in double quotes, like `"null"`. This approach helps avoid confusion with [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md)'s `null` type which represents undefined object. + +When declaring complex types (such as **decimal**, **array**, **map**, and **row**), pay attention to specific considerations. +- When declaring a decimal type, precision and scale settings are required, and the type definition follows the format `decimal(precision, scale)`. It's essential to emphasize that the declaration of the decimal type must be enclosed in `"`; you cannot use the type name directly, as with basic types. For example, when declaring a decimal field with precision 10 and scale 2, you specify the field type as `"decimal(10,2)"`. +- When declaring an array type, you need to specify the element type, and the type definition follows the format `array`, where `T` represents the element type. The element type includes `int`,`string`,`boolean`,`tinyint`,`smallint`,`bigint`,`float` and `double`. Similar to the decimal type declaration, it also be enclosed in `"`. For example, when declaring a field with an array of integers, you specify the field type as `"array"`. +- When declaring a map type, you need to specify the key and value types. The map type definition follows the format `map`, where `K` represents the key type and `V` represents the value type. `K` can be any basic type and decimal type, and `V` can be any type supported by SeaTunnel. Similar to previous type declarations, the map type declaration must be enclosed in double quotes. For example, when declaring a field with map type, where the key type is string and the value type is integer, you can declare the field as `"map"`. +- When declaring a row type, you need to define a [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) object to describe the fields and their types. The field types can be any type supported by SeaTunnel. For example, when declaring a row type containing an integer field `a` and a string field `b`, you can declare it as `{a = int, b = string}`. Enclosing the definition in `"` as a string is also acceptable, so `"{a = int, b = string}"` is equivalent to `{a = int, c = string}`. Since HOCON is compatible with JSON, `"{\"a\":\"int\", \"b\":\"string\"}"` is equivalent to `"{a = int, b = string}"`. 
+ +Here is an example of complex type declarations: + +```hocon +schema { + fields { + c_decimal = "decimal(10, 2)" + c_array = "array" + c_row = { + c_int = int + c_string = string + c_row = { + c_int = int + } + } + # Hocon style declare row type in generic type + map0 = "map" + # Json style declare row type in generic type + map1 = "map" + } +} +``` + +### PrimaryKey + +Primary key is a config used to define the primary key in schema, it contains name, columns field. + +``` +primaryKey { + name = id + columns = [id] +} +``` + +| Field | Required | Default Value | Description | +|:--------|:---------|:--------------|-----------------------------------| +| name | Yes | - | The name of the primaryKey | +| columns | Yes | - | The column list in the primaryKey | + +### ConstraintKeys + +Constraint keys is a list of config used to define the constraint keys in schema, it contains constraintName, constraintType, constraintColumns field. + +``` +constraintKeys = [ + { + constraintName = "id_index" + constraintType = KEY + constraintColumns = [ + { + columnName = "id" + sortType = ASC + } + ] + }, + ] +``` + +| Field | Required | Default Value | Description | +|:------------------|:---------|:--------------|-------------------------------------------------------------------------------------------------------------------------------------------| +| constraintName | Yes | - | The name of the constraintKey | +| constraintType | No | KEY | The type of the constraintKey | +| constraintColumns | Yes | - | The column list in the primaryKey, each column should contains constraintType and sortType, sortType support ASC and DESC, default is ASC | + +#### What constraintType supported at now + +| ConstraintType | Description | +|:---------------|:------------| +| INDEX_KEY | key | +| UNIQUE_KEY | unique key | + +## How to use schema + +### Recommended + +``` +source { + FakeSource { + parallelism = 2 + result_table_name = "fake" + row.num = 16 + schema { + table = "FakeDatabase.FakeTable" + columns = [ + { + name = id + type = bigint + nullable = false + defaultValue = 0 + comment = "primary key id" + }, + { + name = name + type = "string" + nullable = true + comment = "name" + }, + { + name = age + type = int + nullable = true + comment = "age" + } + ] + primaryKey { + name = "id" + columnNames = [id] + } + constraintKeys = [ + { + constraintName = "unique_name" + constraintType = UNIQUE_KEY + constraintColumns = [ + { + columnName = "name" + sortType = ASC + } + ] + }, + ] + } + } +} +``` + +### Deprecated + +If you only need to define the column, you can use fields to define the column, this is a simple way but will be remove in the future. + +``` +source { + FakeSource { + parallelism = 2 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + id = bigint + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} +``` + +## When we should use it or not + +If there is a `schema` configuration project in Options,the connector can then customize the schema. Like `Fake` `Pulsar` `Http` source connector etc. 
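+
+For instance, a minimal Http source that declares its own schema might look like the sketch below (the endpoint URL and field names are purely illustrative):
+
+```hocon
+source {
+  Http {
+    result_table_name = "http"
+    # Illustrative endpoint returning JSON records
+    url = "http://localhost:8080/mock/data"
+    method = "GET"
+    format = "json"
+    schema = {
+      fields {
+        id = bigint
+        name = string
+        age = int
+      }
+    }
+  }
+}
+```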
diff --git a/versioned_docs/version-2.3.7/concept/sink-options-placeholders.md b/versioned_docs/version-2.3.7/concept/sink-options-placeholders.md new file mode 100644 index 000000000000..88eada299fc8 --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/sink-options-placeholders.md @@ -0,0 +1,110 @@ +# Sink Options Placeholders + +## Introduction + +The SeaTunnel provides a sink options placeholders feature that allows you to get upstream table metadata through placeholders. + +This functionality is essential when you need to dynamically get upstream table metadata (such as multi-table writes). + +This document will guide you through the usage of these placeholders and how to leverage them effectively. + +## Support Those Engines + +> SeaTunnel Zeta
+> Flink
+> Spark
+ +## Placeholder + +The placeholders are mainly controlled by the following expressions: + +- `${database_name}` + - Used to get the database in the upstream catalog table + - Default values can also be specified via expressions:`${database_name:default_my_db}` +- `${schema_name}` + - Used to get the schema in the upstream catalog table + - Default values can also be specified via expressions:`${schema_name:default_my_schema}` +- `${table_name}` + - Used to get the table in the upstream catalog table + - Default values can also be specified via expressions:`${table_name:default_my_table}` +- `${schema_full_name}` + - Used to get the schema full path(database & schema) in the upstream catalog table +- `${table_full_name}` + - Used to get the table full path(database & schema & table) in the upstream catalog table +- `${primary_key}` + - Used to get the table primary-key fields in the upstream catalog table +- `${unique_key}` + - Used to get the table unique-key fields in the upstream catalog table +- `${field_names}` + - Used to get the table field keys in the upstream catalog table + +## Configuration + +*Requires*: +- Make sure the sink connector you are using has implemented `TableSinkFactory` API + +### Example 1 + +```hocon +env { + // ignore... +} +source { + MySQL-CDC { + // ignore... + } +} + +transform { + // ignore... +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "${database_name}_test" + table = "${table_name}_test" + primary_keys = ["${primary_key}"] + } +} +``` + +### Example 2 + +```hocon +env { + // ignore... +} +source { + Oracle-CDC { + // ignore... + } +} + +transform { + // ignore... +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "${schema_name}_test" + table = "${table_name}_test" + primary_keys = ["${primary_key}"] + } +} +``` + +We will complete the placeholder replacement before the connector is started, ensuring that the sink options is ready before use. +If the variable is not replaced, it may be that the upstream table metadata is missing this option, for example: +- `mysql` source not contain `${schema_name}` +- `oracle` source not contain `${databse_name}` +- ... diff --git a/versioned_docs/version-2.3.7/concept/speed-limit.md b/versioned_docs/version-2.3.7/concept/speed-limit.md new file mode 100644 index 000000000000..87379e5b751e --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/speed-limit.md @@ -0,0 +1,44 @@ +# Speed Control + +## Introduction + +The SeaTunnel provides a powerful speed control feature that allows you to manage the rate at which data is synchronized. +This functionality is essential when you need to ensure efficient and controlled data transfer between systems. +The speed control is primarily governed by two key parameters: `read_limit.rows_per_second` and `read_limit.bytes_per_second`. +This document will guide you through the usage of these parameters and how to leverage them effectively. + +## Support Those Engines + +> SeaTunnel Zeta
+> Flink
+> Spark
+ +## Configuration + +To use the speed control feature, you need to configure the `read_limit.rows_per_second` or `read_limit.bytes_per_second` parameters in your job config. + +Example env config in your config file: + +```hocon +env { + job.mode=STREAMING + job.name=SeaTunnel_Job + read_limit.bytes_per_second=7000000 + read_limit.rows_per_second=400 +} +source { + MySQL-CDC { + // ignore... + } +} +transform { +} +sink { + Console { + } +} +``` + +We have placed `read_limit.bytes_per_second` and `read_limit.rows_per_second` in the `env` parameters to finish the speed control configuration. +You can configure both of these parameters simultaneously or choose to configure only one of them. The value of each `value` represents the maximum rate at which each thread is restricted. +Therefore, when configuring the respective values, please take into account the parallelism of your tasks. diff --git a/versioned_docs/version-2.3.7/concept/sql-config.md b/versioned_docs/version-2.3.7/concept/sql-config.md new file mode 100644 index 000000000000..fe148a6f726f --- /dev/null +++ b/versioned_docs/version-2.3.7/concept/sql-config.md @@ -0,0 +1,189 @@ +# SQL Configuration File + +## Structure of SQL Configuration File + +The `SQL` configuration file appears as follows: + +### SQL + +```sql +/* config +env { + parallelism = 1 + job.mode = "BATCH" +} +*/ + +CREATE TABLE source_table WITH ( + 'connector'='jdbc', + 'type'='source', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'query' = 'select * from source', + 'properties'= '{ + useSSL = false, + rewriteBatchedStatements = true + }' +); + +CREATE TABLE sink_table WITH ( + 'connector'='jdbc', + 'type'='sink', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'generate_sink_sql' = 'true', + 'database' = 'seatunnel', + 'table' = 'sink' +); + +INSERT INTO sink_table SELECT id, name, age, email FROM source_table; +``` + +## Explanation of `SQL` Configuration File + +### General Configuration in SQL File + +```sql +/* config +env { + parallelism = 1 + job.mode = "BATCH" +} +*/ +``` + +In the `SQL` file, common configuration sections are defined using `/* config */` comments. Inside, common configurations like `env` can be defined using `HOCON` format. + +### SOURCE SQL Syntax + +```sql +CREATE TABLE source_table WITH ( + 'connector'='jdbc', + 'type'='source', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'query' = 'select * from source', + 'properties' = '{ + useSSL = false, + rewriteBatchedStatements = true + }' +); +``` + +* Using `CREATE TABLE ... WITH (...)` syntax creates a mapping for the source table. The `TABLE` name is the name of the source-mapped table, and the `WITH` syntax contains source-related configuration parameters. +* There are two fixed parameters in the WITH syntax: `connector` and `type`, representing connector plugin name (such as `jdbc`, `FakeSource`, etc.) and source type (fixed as `source`), respectively. +* Other parameter names can reference relevant configuration parameters of the corresponding connector plugin, but the format needs to be changed to `'key' = 'value',`. +* If `'value'` is a sub-configuration, you can directly use a string in `HOCON` format. 
Note: if using a sub-configuration in `HOCON` format, the internal property items must be separated by `,`, like this: + +```sql +'properties' = '{ + useSSL = false, + rewriteBatchedStatements = true +}' +``` + +* If using `'` within `'value'`, it needs to be escaped with `''`, like this: + +```sql +'query' = 'select * from source where name = ''Joy Ding''' +``` + +### SINK SQL Syntax + +```sql +CREATE TABLE sink_table WITH ( + 'connector'='jdbc', + 'type'='sink', + 'url' = 'jdbc:mysql://localhost:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'generate_sink_sql' = 'true', + 'database' = 'seatunnel', + 'table' = 'sink' +); +``` + +* Using `CREATE TABLE ... WITH (...)` syntax creates a mapping for the target table. The `TABLE` name is the name of the target-mapped table, and the `WITH` syntax contains sink-related configuration parameters. +* There are two fixed parameters in the `WITH` syntax: `connector` and `type`, representing connector plugin name (such as `jdbc`, `console`, etc.) and target type (fixed as `sink`), respectively. +* Other parameter names can reference relevant configuration parameters of the corresponding connector plugin, but the format needs to be changed to `'key' = 'value',`. + +### INSERT INTO SELECT Syntax + +```sql +INSERT INTO sink_table SELECT id, name, age, email FROM source_table; +``` + +* The `SELECT FROM` part is the table name of the source-mapped table. +* The `INSERT INTO` part is the table name of the target-mapped table. +* Note: This syntax does **not support** specifying fields in `INSERT`, like this: `INSERT INTO sink_table (id, name, age, email) SELECT id, name, age, email FROM source_table;` + +### INSERT INTO SELECT TABLE Syntax + +```sql +INSERT INTO sink_table SELECT source_table; +``` + +* The `SELECT` part directly uses the name of the source-mapped table, indicating that all data from the source table will be inserted into the target table. +* Using this syntax does not generate related `transform` configurations. This syntax is generally used in multi-table synchronization scenarios. For example: + +```sql +CREATE TABLE source_table WITH ( + 'connector'='jdbc', + 'type' = 'source', + 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'table_list' = '[ + { + table_path = "source.table1" + }, + { + table_path = "source.table2", + query = "select * from source.table2" + } + ]' +); + +CREATE TABLE sink_table WITH ( + 'connector'='jdbc', + 'type' = 'sink', + 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', + 'driver' = 'com.mysql.cj.jdbc.Driver', + 'user' = 'root', + 'password' = '123456', + 'generate_sink_sql' = 'true', + 'database' = 'sink' +); + +INSERT INTO sink_table SELECT source_table; +``` + +### CREATE TABLE AS Syntax + +```sql +CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; +``` + +* This syntax creates a temporary table with the result of a `SELECT` query, used for `INSERT INTO` operations. 
+* The syntax of the `SELECT` part refers to: [SQL Transform](../transform-v2/sql.md) `query` configuration item + +```sql +CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; + +INSERT INTO sink_table SELECT * FROM temp1; +``` + +## Example of SQL Configuration File Submission + +```bash +./bin/seatunnel.sh --config ./config/sample.sql +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md b/versioned_docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md new file mode 100644 index 000000000000..e49db8c1107c --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/Config-Encryption-Decryption.md @@ -0,0 +1,180 @@ +# Config File Encryption And Decryption + +## Introduction + +In most production environments, sensitive configuration items such as passwords are required to be encrypted and cannot be stored in plain text, SeaTunnel provides a convenient one-stop solution for this. + +## How to use + +SeaTunnel comes with the function of base64 encryption and decryption, but it is not recommended for production use, it is recommended that users implement custom encryption and decryption logic. You can refer to this chapter [How to implement user-defined encryption and decryption](#How to implement user-defined encryption and decryption) get more details about it. + +Base64 encryption support encrypt the following parameters: +- username +- password +- auth + +Next, I'll show how to quickly use SeaTunnel's own `base64` encryption: + +1. And a new option `shade.identifier` in env block of config file, this option indicate what the encryption method that you want to use, in this example, we should add `shade.identifier = base64` in config as the following shown: + + ```hocon + # + # Licensed to the Apache Software Foundation (ASF) under one or more + # contributor license agreements. See the NOTICE file distributed with + # this work for additional information regarding copyright ownership. + # The ASF licenses this file to You under the Apache License, Version 2.0 + # (the "License"); you may not use this file except in compliance with + # the License. You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + + env { + parallelism = 1 + shade.identifier = "base64" + } + + source { + MySQL-CDC { + result_table_name = "fake" + parallelism = 1 + server-id = 5656 + port = 56725 + hostname = "127.0.0.1" + username = "seatunnel" + password = "seatunnel_password" + database-name = "inventory_vwyw0n" + table-name = "products" + base-url = "jdbc:mysql://localhost:56725" + } + } + + transform { + } + + sink { + # choose stdout output plugin to output data to console + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "seatunnel" + password = "seatunnel_password" + + # cdc options + primary_key = "id" + support_upsert = true + } + } + ``` +2. 
Using the shell based on different calculate engine to encrypt config file, in this example we use zeta: + + ```shell + ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --encrypt + ``` + + Then you can see the encrypted configuration file in the terminal: + + ```log + 2023-02-20 17:50:58,319 INFO org.apache.seatunnel.core.starter.command.ConfEncryptCommand - Encrypt config: + { + "env" : { + "parallelism" : 1, + "shade.identifier" : "base64" + }, + "source" : [ + { + "base-url" : "jdbc:mysql://localhost:56725", + "hostname" : "127.0.0.1", + "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", + "port" : 56725, + "database-name" : "inventory_vwyw0n", + "parallelism" : 1, + "result_table_name" : "fake", + "table-name" : "products", + "plugin_name" : "MySQL-CDC", + "server-id" : 5656, + "username" : "c2VhdHVubmVs" + } + ], + "transform" : [], + "sink" : [ + { + "database" : "default", + "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", + "support_upsert" : true, + "host" : "localhost:8123", + "plugin_name" : "Clickhouse", + "primary_key" : "id", + "table" : "fake_all", + "username" : "c2VhdHVubmVs" + } + ] + } + ``` +3. Of course, not only encrypted configuration files are supported, but if the user wants to see the decrypted configuration file, you can execute this command: + + ```shell + ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --decrypt + ``` + +## How to implement user-defined encryption and decryption + +If you want to customize the encryption method and the configuration of the encryption, this section will help you to solve the problem. + +1. Create a java maven project + +2. Add `seatunnel-api` module in dependencies like the following shown: + + ```xml + + org.apache.seatunnel + seatunnel-api + ${seatunnel.version} + + ``` +3. Create a new class and implement interface `ConfigShade`, this interface has the following methods: + + ```java + /** + * The interface that provides the ability to encrypt and decrypt {@link + * org.apache.seatunnel.shade.com.typesafe.config.Config} + */ + public interface ConfigShade { + + /** + * The unique identifier of the current interface, used it to select the correct {@link + * ConfigShade} + */ + String getIdentifier(); + + /** + * Encrypt the content + * + * @param content The content to encrypt + */ + String encrypt(String content); + + /** + * Decrypt the content + * + * @param content The content to decrypt + */ + String decrypt(String content); + + /** To expand the options that user want to encrypt */ + default String[] sensitiveOptions() { + return new String[0]; + } + } + ``` +4. Add `org.apache.seatunnel.api.configuration.ConfigShade` in `resources/META-INF/services` +5. Package it to jar and add jar to `${SEATUNNEL_HOME}/lib` +6. Change the option `shade.identifier` to the value that you defined in `ConfigShade#getIdentifier`of you config file, please enjoy it \^_\^ + diff --git a/versioned_docs/version-2.3.7/connector-v2/Error-Quick-Reference-Manual.md b/versioned_docs/version-2.3.7/connector-v2/Error-Quick-Reference-Manual.md new file mode 100644 index 000000000000..960bddc0ebd6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/Error-Quick-Reference-Manual.md @@ -0,0 +1,286 @@ +# Error Quick Reference Manual + +This document records some common error codes and corresponding solutions of SeaTunnel, aiming to quickly solve the +problems encountered by users. 
+ +## SeaTunnel API Error Codes + +| code | description | solution | +|--------|------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| API-01 | Configuration item validate failed | When users encounter this error code, it is usually due to a problem with the connector parameters configured by the user, please check the connector documentation and correct the parameters | +| API-02 | Option item validate failed | - | +| API-03 | Catalog initialize failed | When users encounter this error code, it is usually because the connector initialization catalog failed, please check the connector connector options whether are correct | +| API-04 | Database not existed | When users encounter this error code, it is usually because the database that you want to access is not existed, please double check the database exists | +| API-05 | Table not existed | When users encounter this error code, it is usually because the table that you want to access is not existed, please double check the table exists | +| API-06 | Factory initialize failed | When users encounter this error code, it is usually because there is a problem with the jar package dependency, please check whether your local SeaTunnel installation package is complete | +| API-07 | Database already existed | When users encounter this error code, it means that the database you want to create has already existed, please delete database and try again | +| API-08 | Table already existed | When users encounter this error code, it means that the table you want to create has already existed, please delete table and try again | + +## SeaTunnel Common Error Codes + +| code | description | solution | +|-----------|------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| COMMON-01 | File operation failed, such as (read,list,write,move,copy,sync) etc... | When users encounter this error code, it is usually there are some problems in the file operation, please check if the file is OK | +| COMMON-02 | Json covert/parse operation failed | When users encounter this error code, it is usually there are some problems about json converting or parsing, please check if the json format is correct | +| COMMON-03 | Reflect class operation failed | When users encounter this error code, it is usually there are some problems on class reflect operation, please check the jar dependency whether exists in classpath | +| COMMON-04 | Serialize class operation failed | When users encounter this error code, it is usually there are some problems on class serialize operation, please check java environment | +| COMMON-05 | Unsupported operation | When users encounter this error code, users may trigger an unsupported operation such as enabled some unsupported features | +| COMMON-06 | Illegal argument | When users encounter this error code, it maybe user-configured parameters are not legal, please correct it according to the tips | +| COMMON-07 | Unsupported data type | When users encounter this error code, it maybe connectors don't support this data type | +| COMMON-08 | Sql operation failed, such as (execute,addBatch,close) etc... 
| When users encounter this error code, it is usually there are some problems on sql execute process, please check the sql whether correct | +| COMMON-09 | Get table schema from upstream data failed | When users encounter this error code, it maybe SeaTunnel try to get schema information from connector source data failed, please check your configuration whether correct and connector is work | +| COMMON-10 | Flush data operation that in sink connector failed | When users encounter this error code, it maybe SeaTunnel try to flush batch data to sink connector field, please check your configuration whether correct and connector is work | +| COMMON-11 | Sink writer operation failed, such as (open, close) etc... | When users encounter this error code, it maybe some operation of writer such as Parquet,Orc,IceBerg failed, you need to check if the corresponding file or resource has read and write permissions | +| COMMON-12 | Source reader operation failed, such as (open, close) etc... | When users encounter this error code, it maybe some operation of reader such as Parquet,Orc,IceBerg failed, you need to check if the corresponding file or resource has read and write permissions | +| COMMON-13 | Http operation failed, such as (open, close, response) etc... | When users encounter this error code, it maybe some http requests failed, please check your network environment | +| COMMON-14 | Kerberos authorized failed | When users encounter this error code, it maybe some The Kerberos authorized is misconfigured | +| COMMON-15 | Class load operation failed | When users encounter this error code, it maybe some The corresponding jar does not exist, or the type is not supported | + +## Assert Connector Error Codes + +| code | description | solution | +|-----------|----------------------|-------------------------------------------------------------------------------------------| +| ASSERT-01 | Rule validate failed | When users encounter this error code, it means that upstream data does not meet the rules | + +## Cassandra Connector Error Codes + +| code | description | solution | +|--------------|-------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| CASSANDRA-01 | Field is not existed in target table | When users encounter this error code, it means that the fields of upstream data don't meet with target cassandra table, please check target cassandra table structure | +| CASSANDRA-02 | Add batch SeaTunnelRow data into a batch failed | When users encounter this error code, it means that cassandra has some problems, please check it whether is work | +| CASSANDRA-03 | Close cql session of cassandra failed | When users encounter this error code, it means that cassandra has some problems, please check it whether is work | +| CASSANDRA-04 | No data in source table | When users encounter this error code, it means that source cassandra table has no data, please check it | +| CASSANDRA-05 | Parse ip address from string failed | When users encounter this error code, it means that upstream data does not match ip address format, please check it | + +## Slack Connector Error Codes + +| code | description | solution | +|----------|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------| +| SLACK-01 | Conversation can not be founded in channels | When users 
encounter this error code, it means that the channel is not existed in slack workspace, please check it | +| SLACK-02 | Write to slack channel failed | When users encounter this error code, it means that slack has some problems, please check it whether is work | + +## MyHours Connector Error Codes + +| code | description | solution | +|------------|--------------------------|--------------------------------------------------------------------------------------------------------------------------| +| MYHOURS-01 | Get myhours token failed | When users encounter this error code, it means that login to the MyHours Failed, please check your network and try again | + +## Rabbitmq Connector Error Codes + +| code | description | solution | +|-------------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------| +| RABBITMQ-01 | handle queue consumer shutdown signal failed | When users encounter this error code, it means that job has some problems, please check it whether is work well | +| RABBITMQ-02 | create rabbitmq client failed | When users encounter this error code, it means that rabbitmq has some problems, please check it whether is work | +| RABBITMQ-03 | close connection failed | When users encounter this error code, it means that rabbitmq has some problems, please check it whether is work | +| RABBITMQ-04 | send messages failed | When users encounter this error code, it means that rabbitmq has some problems, please check it whether is work | +| RABBITMQ-05 | messages could not be acknowledged during checkpoint creation | When users encounter this error code, it means that job has some problems, please check it whether is work well | +| RABBITMQ-06 | messages could not be acknowledged with basicReject | When users encounter this error code, it means that job has some problems, please check it whether is work well | +| RABBITMQ-07 | parse uri failed | When users encounter this error code, it means that rabbitmq connect uri incorrect, please check it | +| RABBITMQ-08 | initialize ssl context failed | When users encounter this error code, it means that rabbitmq has some problems, please check it whether is work | +| RABBITMQ-09 | setup ssl factory failed | When users encounter this error code, it means that rabbitmq has some problems, please check it whether is work | + +## Socket Connector Error Codes + +| code | description | solution | +|-----------|----------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| SOCKET-01 | Cannot connect to socket server | When the user encounters this error code, it means that the connection address may not match, please check | +| SOCKET-02 | Failed to send message to socket server | When the user encounters this error code, it means that there is a problem sending data and retry is not enabled, please check | +| SOCKET-03 | Unable to write; interrupted while doing another attempt | When the user encounters this error code, it means that the data writing is interrupted abnormally, please check | + +## TableStore Connector Error Codes + +| code | description | solution | +|---------------|-----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| TABLESTORE-01 | Failed to send these rows of data 
| When users encounter this error code, it means that failed to write these rows of data, please check the rows that failed to import | + +## Hive Connector Error Codes + +| code | description | solution | +|---------|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| HIVE-01 | Get name node host from table location failed | When users encounter this error code, it means that the metastore inforamtion has some problems, please check it | +| HIVE-02 | Initialize hive metastore client failed | When users encounter this error code, it means that connect to hive metastore service failed, please check it whether is work | +| HIVE-03 | Get hive table information from hive metastore service failed | When users encounter this error code, it means that hive metastore service has some problems, please check it whether is work | + +## Elasticsearch Connector Error Codes + +| code | description | solution | +|------------------|-----------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| ELASTICSEARCH-01 | Bulk es response error | When the user encounters this error code, it means that the connection was aborted, please check it whether is work | +| ELASTICSEARCH-02 | Get elasticsearch version failed | When the user encounters this error code, it means that the connection was aborted, please check it whether is work | +| ELASTICSEARCH-03 | Fail to scroll request | When the user encounters this error code, it means that the connection was aborted, please check it whether is work | +| ELASTICSEARCH-04 | Get elasticsearch document index count failed | When the user encounters this error code, it means that the es index may not wrong or the connection was aborted, please check | + +## Kafka Connector Error Codes + +| code | description | solution | +|----------|-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| KAFKA-01 | Incompatible KafkaProducer version | When users encounter this error code, it means that KafkaProducer version is incompatible, please check it | +| KAFKA-02 | Get transactionManager in KafkaProducer exception | When users encounter this error code, it means that can not get transactionManager in KafkaProducer, please check it | +| KAFKA-03 | Add the split checkpoint state to reader failed | When users encounter this error code, it means that add the split checkpoint state to reader failed, please retry it | +| KAFKA-04 | Add a split back to the split enumerator,it will only happen when a SourceReader failed | When users encounter this error code, it means that add a split back to the split enumerator failed, please check it | +| KAFKA-05 | Error occurred when the kafka consumer thread was running | When users encounter this error code, it means that an error occurred when the kafka consumer thread was running, please check it | +| KAFKA-06 | Kafka failed to consume data | When users encounter this error code, it means that Kafka failed to consume data, please check config and retry it | +| KAFKA-07 | Kafka failed to close consumer | When users encounter this error code, it means that Kafka failed to close consumer | + +## InfluxDB Connector Error Codes + 
+| code | description | solution | +|-------------|------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------| +| INFLUXDB-01 | Connect influxdb failed, due to influxdb version info is unknown | When the user encounters this error code, it indicates that the connection to influxdb failed. Please check | +| INFLUXDB-02 | Get column index of query result exception | When the user encounters this error code, it indicates that obtaining the column index failed. Please check | + +## Kudu Connector Error Codes + +| code | description | solution | +|---------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---| +| KUDU-01 | Get the Kuduscan object for each splice failed | When users encounter this error code, it is usually there are some problems with getting the KuduScan Object for each splice, please check your configuration whether correct and Kudu is work | +| KUDU-02 | Close Kudu client failed | When users encounter this error code, it is usually there are some problems with closing the Kudu client, please check the Kudu is work | | +| KUDU-03 | Value type does not match column type | When users encounter this error code, it is usually there are some problems on matching the Type between value type and colum type, please check if the data type is supported | +| KUDU-04 | Upsert data to Kudu failed | When users encounter this error code, it means that Kudu has some problems, please check it whether is work | +| KUDU-05 | Insert data to Kudu failed | When users encounter this error code, it means that Kudu has some problems, please check it whether is work | +| KUDU-06 | Initialize the Kudu client failed | When users encounter this error code, it is usually there are some problems with initializing the Kudu client, please check your configuration whether correct and connector is work | +| KUDU-07 | Generate Kudu Parameters in the preparation phase failed | When users encounter this error code, it means that there are some problems on Kudu parameters generation, please check your configuration | + +## IotDB Connector Error Codes + +| code | description | solution | +|----------|--------------------------------|------------------------------------------------------------------------------------------------------------| +| IOTDB-01 | Close IoTDB session failed | When the user encounters this error code, it indicates that closing the session failed. Please check | +| IOTDB-02 | Initialize IoTDB client failed | When the user encounters this error code, it indicates that the client initialization failed. Please check | +| IOTDB-03 | Close IoTDB client failed | When the user encounters this error code, it indicates that closing the client failed. 
Please check | + +## File Connector Error Codes + +| code | description | solution | +|---------|-----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| FILE-01 | File type is invalid | When users encounter this error code, it means that the this file is not the format that user assigned, please check it | +| FILE-02 | Data deserialization failed | When users encounter this error code, it means that data from files not satisfied the schema that user assigned, please check data from files whether is correct | +| FILE-03 | Get file list failed | When users encounter this error code, it means that connector try to traverse the path and get file list failed, please check file system whether is work | +| FILE-04 | File list is empty | When users encounter this error code, it means that the path user want to sync is empty, please check file path | + +## Doris Connector Error Codes + +| code | description | solution | +|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| Doris-01 | stream load error. | When users encounter this error code, it means that stream load to Doris failed, please check data from files whether is correct. | +| Doris-02 | commit error. | When users encounter this error code, it means that commit to Doris failed, please check network. | +| Doris-03 | rest service error. | When users encounter this error code, it means that rest service failed, please check network and config. | + +## SelectDB Cloud Connector Error Codes + +| code | description | solution | +|-------------|-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| +| SelectDB-01 | stage load file error | When users encounter this error code, it means that stage load file to SelectDB Cloud failed, please check the configuration and network. | +| SelectDB-02 | commit copy into sql failed | When users encounter this error code, it means that commit copy into sql to SelectDB Cloud failed, please check the configuration. | + +## Clickhouse Connector Error Codes + +| code | description | solution | +|---------------|---------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| CLICKHOUSE-01 | Field is not existed in target table | When users encounter this error code, it means that the fields of upstream data don't meet with target clickhouse table, please check target clickhouse table structure | +| CLICKHOUSE-02 | Can’t find password of shard node | When users encounter this error code, it means that no password is configured for each node, please check | +| CLICKHOUSE-03 | Can’t delete directory | When users encounter this error code, it means that the directory does not exist or does not have permission, please check | +| CLICKHOUSE-04 | Ssh operation failed, such as (login,connect,authentication,close) etc... 
| When users encounter this error code, it means that the ssh request failed, please check your network environment | +| CLICKHOUSE-05 | Get cluster list from clickhouse failed | When users encounter this error code, it means that the clickhouse cluster is not configured correctly, please check | +| CLICKHOUSE-06 | Shard key not found in table | When users encounter this error code, it means that the shard key of the distributed table is not configured, please check | + +## Jdbc Connector Error Codes + +| code | description | solution | +|---------|----------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| JDBC-01 | Fail to create driver of class | When users encounter this error code, it means that driver package may not be added. Check whether the driver exists | +| JDBC-02 | No suitable driver found | When users encounter this error code, it means that no password is configured for each node, please check | +| JDBC-03 | Xa operation failed, such as (commit, rollback) etc.. | When users encounter this error code, it means that if a distributed sql transaction fails, check the transaction execution of the corresponding database to determine the cause of the transaction failure | +| JDBC-04 | Connector database failed | When users encounter this error code, it means that database connection failure, check whether the url is correct or whether the corresponding service is normal | +| JDBC-05 | transaction operation failed, such as (commit, rollback) etc.. | When users encounter this error code, it means that if a sql transaction fails, check the transaction execution of the corresponding database to determine the cause of the transaction failure | +| JDBC-06 | No suitable dialect factory found | When users encounter this error code, it means that may be an unsupported dialect type | +| JDBC-07 | The jdbc type don't support sink | When users encounter this error code, it means that jdbc type don't support sink | +| JDBC-08 | Kerberos authentication failed | When users encounter this error code, it means that database connection Kerberos authentication failed | + +## Pulsar Connector Error Codes + +| code | description | solution | +|-----------|--------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------| +| PULSAR-01 | Open pulsar admin failed | When users encounter this error code, it means that open pulsar admin failed, please check it | +| PULSAR-02 | Open pulsar client failed | When users encounter this error code, it means that open pulsar client failed, please check it | +| PULSAR-03 | Pulsar authentication failed | When users encounter this error code, it means that Pulsar Authentication failed, please check it | +| PULSAR-04 | Subscribe topic from pulsar failed | When users encounter this error code, it means that Subscribe topic from pulsar failed, please check it | +| PULSAR-05 | Get last cursor of pulsar topic failed | When users encounter this error code, it means that get last cursor of pulsar topic failed, please check it | +| PULSAR-06 | Get partition information of pulsar topic failed | When users encounter this error code, it means that Get partition information of pulsar topic failed, please check it | +| PULSAR-07 | Pulsar consumer 
acknowledgeCumulative failed | When users encounter this error code, it means that Pulsar consumer acknowledgeCumulative failed | +| PULSAR-08 | Pulsar create producer failed | When users encounter this error code, it means that create producer failed, please check it | +| PULSAR-09 | Pulsar create transaction failed | When users encounter this error code, it means that Pulsar create transaction failed, please check it | +| PULSAR-10 | Pulsar send message failed | When users encounter this error code, it means that Pulsar sned message failed, please check it | + +## StarRocks Connector Error Codes + +| code | description | solution | +|--------------|-------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------| +| STARROCKS-01 | Flush batch data to sink connector failed | When users encounter this error code, it means that flush batch data to sink connector failed, please check it | +| STARROCKS-02 | Writing records to StarRocks failed | When users encounter this error code, it means that writing records to StarRocks failed, please check data from files whether is correct | +| STARROCKS-03 | Close StarRocks BE reader failed. | it means that StarRocks has some problems, please check it whether is work | +| STARROCKS-04 | Create StarRocks BE reader failed. | it means that StarRocks has some problems, please check it whether is work | +| STARROCKS-05 | Scan data from StarRocks BE failed. | When users encounter this error code, it means that scan data from StarRocks failed, please check it | +| STARROCKS-06 | Request query Plan failed. | When users encounter this error code, it means that scan data from StarRocks failed, please check it | +| STARROCKS-07 | Read Arrow data failed. | When users encounter this error code, it means that that job has some problems, please check it whether is work well | + +## DingTalk Connector Error Codes + +| code | description | solution | +|-------------|-----------------------------------------|----------------------------------------------------------------------------------------------------------------------| +| DINGTALK-01 | Send response to DinkTalk server failed | When users encounter this error code, it means that send response message to DinkTalk server failed, please check it | +| DINGTALK-02 | Get sign from DinkTalk server failed | When users encounter this error code, it means that get signature from DinkTalk server failed , please check it | + +## Iceberg Connector Error Codes + +| code | description | solution | +|------------|--------------------------------|----------------------------------------------------------------------------------------------------------| +| ICEBERG-01 | File Scan Split failed | When users encounter this error code, it means that the file scanning and splitting failed. Please check | +| ICEBERG-02 | Invalid starting record offset | When users encounter this error code, it means that the starting record offset is invalid. 
Please check | + +## Email Connector Error Codes + +| code | description | solution | +|----------|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| EMAIL-01 | Send email failed | When users encounter this error code, it means that send email to target server failed, please adjust the network environment according to the abnormal information | + +## S3Redshift Connector Error Codes + +| code | description | solution | +|---------------|---------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| S3RedShift-01 | Aggregate committer error | S3Redshift Sink Connector will write data to s3 and then move file to the target s3 path. And then use `Copy` action copy the data to Redshift. Please check the error log and find out the specific reason. | + +## Google Firestore Connector Error Codes + +| code | description | solution | +|--------------|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| FIRESTORE-01 | Close Firestore client failed | When users encounter this error code, it is usually there are some problems with closing the Firestore client, please check the Firestore is work | + +## Hbase Connector Error Codes + +| code | description | solution | +|----------|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------| +| Hbase-01 | Build hbase connection failed | When users create Hbase database connection, the connection failed. Check the Hbase configuration parameters used and try again | + +## FilterFieldTransform Error Codes + +| code | description | solution | +|---------------------------|------------------------|-------------------------| +| FILTER_FIELD_TRANSFORM-01 | filter field not found | filter field not found. | + +## RocketMq Connector Error Codes + +| code | description | solution | +|-------------|-------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| ROCKETMQ-01 | Add a split back to the split enumerator failed, it will only happen when a SourceReader failed | When users encounter this error code, it means that add a split back to the split enumerator failed, please check it. | +| ROCKETMQ-02 | Add the split checkpoint state to reader failed | When users encounter this error code, it means that add the split checkpoint state to reader failed, please check it. | +| ROCKETMQ-03 | Rocketmq failed to consume data | When users encounter this error code, it means that rocketmq failed to consume data, please check it., please check it. | +| ROCKETMQ-04 | Error occurred when the rocketmq consumer thread was running | When the user encounters this error code, it means that an error occurred while running the Rocketmq consumer thread | +| ROCKETMQ-05 | Rocketmq producer failed to send message | When users encounter this error code, it means that Rocketmq producer failed to send message, please check it. 
| +| ROCKETMQ-06 | Rocketmq producer failed to start | When users encounter this error code, it means that Rocketmq producer failed to start, please check it. | +| ROCKETMQ-07 | Rocketmq consumer failed to start | When users encounter this error code, it means that Rocketmq consumer failed to start, please check it. | +| ROCKETMQ-08 | Unsupported start mode | When users encounter this error code, it means that the configured start mode is not supported, please check it. | +| ROCKETMQ-09 | Failed to get the offsets of the current consumer group | When users encounter this error code, it means that failed to get the offsets of the current consumer group, please check it. | +| ROCKETMQ-10 | Failed to search offset through timestamp | When users encounter this error code, it means that failed to search offset through timestamp, please check it. | +| ROCKETMQ-11 | Failed to get topic min and max topic | When users encounter this error code, it means that failed to get topic min and max topic, please check it. | + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/avro.md b/versioned_docs/version-2.3.7/connector-v2/formats/avro.md new file mode 100644 index 000000000000..8fef411fb58e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/avro.md @@ -0,0 +1,111 @@ +# Avro format + +Avro is very popular in streaming data pipeline. Now seatunnel supports Avro format in kafka connector. + +# How To Use + +## Kafka uses example + +- This is an example to generate data from fake source and sink to kafka with avro format. + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + row.num = 90 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "test_avro_topic_fake_source" + format = avro + } +} +``` + +- This is an example read data from kafka with avro format and print to console. 
+ +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "test_avro_topic" + result_table_name = "kafka_table" + start_mode = "earliest" + format = avro + format_error_handle_way = skip + schema = { + fields { + id = bigint + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + Console { + source_table_name = "kafka_table" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/canal-json.md b/versioned_docs/version-2.3.7/connector-v2/formats/canal-json.md new file mode 100644 index 000000000000..6e133a9a82a4 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/canal-json.md @@ -0,0 +1,114 @@ +# Canal Format + +Changelog-Data-Capture Format Format: Serialization Schema Format: Deserialization Schema + +Canal is a CDC (Changelog Data Capture) tool that can stream changes in real-time from MySQL into other systems. Canal provides a unified format schema for changelog and supports to serialize messages using JSON and protobuf (protobuf is the default format for Canal). + +SeaTunnel supports to interpret Canal JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as + + synchronizing incremental data from databases to other systems + auditing logs + real-time materialized views on databases + temporal join changing history of a database table and so on. + +SeaTunnel also supports to encode the INSERT/UPDATE/DELETE messages in SeaTunnel as Canal JSON messages, and emit to storage like Kafka. However, currently SeaTunnel can’t combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, SeaTunnel encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT Canal messages. + +# Format Options + +| Option | Default | Required | Description | +|--------------------------------|---------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| format | (none) | yes | Specify what format to use, here should be 'canal_json'. | +| canal_json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | +| canal_json.database.include | (none) | no | An optional regular expression to only read the specific databases changelog rows by regular matching the "database" meta field in the Canal record. The pattern string is compatible with Java's Pattern. | +| canal_json.table.include | (none) | no | An optional regular expression to only read the specific tables changelog rows by regular matching the "table" meta field in the Canal record. The pattern string is compatible with Java's Pattern. 
| + +# How to use + +## Kafka Uses Example + +Canal provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL products table: + +```bash +{ + "data": [ + { + "id": "111", + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": "5.18" + } + ], + "database": "inventory", + "es": 1589373560000, + "id": 9, + "isDdl": false, + "mysqlType": { + "id": "INTEGER", + "name": "VARCHAR(255)", + "description": "VARCHAR(512)", + "weight": "FLOAT" + }, + "old": [ + { + "weight": "5.15" + } + ], + "pkNames": [ + "id" + ], + "sql": "", + "sqlType": { + "id": 4, + "name": 12, + "description": 12, + "weight": 7 + }, + "table": "products", + "ts": 1589373560798, + "type": "UPDATE" +} +``` + +Note: please refer to [Canal documentation](https://github.com/alibaba/canal/wiki) about the meaning of each fields. + +The MySQL products table has 4 columns (id, name, description and weight). +The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.15 to 5.18. +Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following SeaTunnel to consume this topic and interpret the change events. + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "products_binlog" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = canal_json + } + +} + +transform { +} + +sink { + Kafka { + bootstrap.servers = "localhost:9092" + topic = "consume-binlog" + format = canal_json + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md b/versioned_docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md new file mode 100644 index 000000000000..b35501a62a70 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/cdc-compatible-debezium-json.md @@ -0,0 +1,55 @@ +# CDC Compatible Debezium-json + +SeaTunnel supports to interpret cdc record as Debezium-JSON messages publish to mq(kafka) system. + +This is useful in many cases to leverage this feature, such as compatible with the debezium ecosystem. 
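+
+For reference, each record published to Kafka in this mode follows the standard Debezium change-event envelope. The snippet below is only an illustrative sketch of what an `INSERT` event for a table such as `database1.t1` from the example below might look like; the column names (`id`, `name`), their values, and the abridged `source` block are hypothetical and not part of this example:
+
+```json
+{
+  "before": null,
+  "after": {
+    "id": 1,
+    "name": "scooter"
+  },
+  "source": {
+    "connector": "mysql",
+    "name": "mysql_cdc_1",
+    "db": "database1",
+    "table": "t1"
+  },
+  "op": "c",
+  "ts_ms": 1589362330904
+}
+```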
+ +# How To Use + +## MySQL-CDC Sink Kafka + +```bash +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 15000 +} + +source { + MySQL-CDC { + result_table_name = "table1" + + base-url="jdbc:mysql://localhost:3306/test" + "startup.mode"=INITIAL + table-names=[ + "database1.t1", + "database1.t2", + "database2.t1" + ] + + # compatible_debezium_json options + format = compatible_debezium_json + debezium = { + # include schema into kafka message + key.converter.schemas.enable = false + value.converter.schemas.enable = false + # include ddl + include.schema.changes = true + # topic prefix + database.server.name = "mysql_cdc_1" + } + } +} + +sink { + Kafka { + source_table_name = "table1" + + bootstrap.servers = "localhost:9092" + + # compatible_debezium_json options + format = compatible_debezium_json + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/debezium-json.md b/versioned_docs/version-2.3.7/connector-v2/formats/debezium-json.md new file mode 100644 index 000000000000..5f71e14f09d6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/debezium-json.md @@ -0,0 +1,114 @@ +# Debezium Format + +Changelog-Data-Capture Format: Serialization Schema Format: Deserialization Schema + +Debezium is a set of distributed services to capture changes in your databases so that your applications can see those changes and respond to them. Debezium records all row-level changes within each database table in a *change event stream*, and applications simply read these streams to see the change events in the same order in which they occurred. + +Seatunnel supports to interpret Debezium JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as + + synchronizing incremental data from databases to other systems + auditing logs + real-time materialized views on databases + temporal join changing history of a database table and so on. + +Seatunnel also supports to encode the INSERT/UPDATE/DELETE messages in Seatunnel asDebezium JSON messages, and emit to storage like Kafka. + +# Format Options + +| Option | Default | Required | Description | +|-----------------------------------|---------|----------|------------------------------------------------------------------------------------------------------| +| format | (none) | yes | Specify what format to use, here should be 'debezium_json'. | +| debezium-json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. 
| + +# How To Use + +## Kafka Uses example + +Debezium provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL products table: + +```bash +{ + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter ", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter ", + "weight": 5.17 + }, + "source": { + "version": "1.1.1.Final", + "connector": "mysql", + "name": "dbserver1", + "ts_ms": 1589362330000, + "snapshot": "false", + "db": "inventory", + "table": "products", + "server_id": 223344, + "gtid": null, + "file": "mysql-bin.000003", + "pos": 2090, + "row": 0, + "thread": 2, + "query": null + }, + "op": "u", + "ts_ms": 1589362330904, + "transaction": null +} +``` + +Note: please refer to [Debezium documentation](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#data-change-events) about the meaning of each fields. + +The MySQL products table has 4 columns (id, name, description and weight). +The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.17. +Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following Seatunnel conf to consume this topic and interpret the change events by Debezium format. + +**In this config, you must specify the `schema` and `debezium_record_include_schema` options ** +- `schema` should same with your table format +- if your json data contains `schema` field, `debezium_record_include_schema` should be true, and if your json data doesn't contains `schema` field, `debezium_record_include_schema` should be false +- `{"schema" : {}, "payload": { "before" : {}, "after": {} ... } }` --> `true` +- `{"before" : {}, "after": {} ... 
}` --> `false` + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "products_binlog" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + } + debezium_record_include_schema = false + format = debezium_json + } + +} + +transform { +} + +sink { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "consume-binlog" + format = debezium_json + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md b/versioned_docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md new file mode 100644 index 000000000000..def638367ca5 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/kafka-compatible-kafkaconnect-json.md @@ -0,0 +1,47 @@ +# Kafka source compatible kafka-connect-json + +Seatunnel connector kafka supports parsing data extracted through kafka connect source, especially data extracted from kafka connect jdbc and kafka connect debezium + +# How To Use + +## Kafka Sink Mysql + +```bash +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "localhost:9092" + topic = "jdbc_source_record" + result_table_name = "kafka_table" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = COMPATIBLE_KAFKA_CONNECT_JSON + } +} + + +sink { + Jdbc { + driver = com.mysql.cj.jdbc.Driver + url = "jdbc:mysql://localhost:3306/seatunnel" + user = st_user + password = seatunnel + generate_sink_sql = true + database = seatunnel + table = jdbc_sink + primary_keys = ["id"] + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/maxwell-json.md b/versioned_docs/version-2.3.7/connector-v2/formats/maxwell-json.md new file mode 100644 index 000000000000..5e1c851d9e99 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/maxwell-json.md @@ -0,0 +1,91 @@ +# MaxWell Format + +[Maxwell](https://maxwells-daemon.io/) is a CDC (Changelog Data Capture) tool that can stream changes in real-time from MySQL into Kafka, Kinesis and other streaming connectors. Maxwell provides a unified format schema for changelog and supports to serialize messages using JSON. + +Seatunnel supports to interpret MaxWell JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as + + synchronizing incremental data from databases to other systems + auditing logs + real-time materialized views on databases + temporal join changing history of a database table and so on. + +Seatunnel also supports to encode the INSERT/UPDATE/DELETE messages in Seatunnel as MaxWell JSON messages, and emit to storage like Kafka. However, currently Seatunnel can’t combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, Seatunnel encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT MaxWell messages. + +# Format Options + +| Option | Default | Required | Description | +|----------------------------------|---------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| format | (none) | yes | Specify what format to use, here should be 'maxwell_json'. 
| +| maxwell_json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | +| maxwell_json.database.include | (none) | no | An optional regular expression to only read the specific databases changelog rows by regular matching the "database" meta field in the MaxWell record. The pattern string is compatible with Java's Pattern. | +| maxwell_json.table.include | (none) | no | An optional regular expression to only read the specific tables changelog rows by regular matching the "table" meta field in the MaxWell record. The pattern string is compatible with Java's Pattern. | + +# How To Use MaxWell format + +## Kafka Uses Example + +MaxWell provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL products table: + +```bash +{ + "database":"test", + "table":"product", + "type":"insert", + "ts":1596684904, + "xid":7201, + "commit":true, + "data":{ + "id":111, + "name":"scooter", + "description":"Big 2-wheel scooter ", + "weight":5.18 + }, + "primary_key_columns":[ + "id" + ] +} +``` + +Note: please refer to MaxWell documentation about the meaning of each fields. + +The MySQL products table has 4 columns (id, name, description and weight). +The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.15. +Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following Seatunnel to consume this topic and interpret the change events. + +```bash +env { + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafkaCluster:9092" + topic = "products_binlog" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = maxwell_json + } + +} + +transform { +} + +sink { + Kafka { + bootstrap.servers = "localhost:9092" + topic = "consume-binlog" + format = maxwell_json + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/formats/ogg-json.md b/versioned_docs/version-2.3.7/connector-v2/formats/ogg-json.md new file mode 100644 index 000000000000..3faeb33c4f08 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/formats/ogg-json.md @@ -0,0 +1,93 @@ +# Ogg Format + +[Oracle GoldenGate](https://www.oracle.com/integration/goldengate/) (a.k.a ogg) is a managed service providing a real-time data mesh platform, which uses replication to keep data highly available, and enabling real-time analysis. Customers can design, execute, and monitor their data replication and stream data processing solutions without the need to allocate or manage compute environments. Ogg provides a format schema for changelog and supports to serialize messages using JSON. + +Seatunnel supports to interpret Ogg JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as + + synchronizing incremental data from databases to other systems + auditing logs + real-time materialized views on databases + temporal join changing history of a database table and so on. + +Seatunnel also supports to encode the INSERT/UPDATE/DELETE messages in Seatunnel as Ogg JSON messages, and emit to storage like Kafka. However, currently Seatunnel can’t combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. 
Therefore, Seatunnel encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT Ogg messages. + +# Format Options + +| Option | Default | Required | Description | +|------------------------------|---------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| format | (none) | yes | Specify what format to use, here should be '-json'. | +| ogg_json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | +| ogg_json.database.include | (none) | no | An optional regular expression to only read the specific databases changelog rows by regular matching the "database" meta field in the Canal record. The pattern string is compatible with Java's Pattern. | +| ogg_json.table.include | (none) | no | An optional regular expression to only read the specific tables changelog rows by regular matching the "table" meta field in the Canal record. The pattern string is compatible with Java's Pattern. | + +# How to Use Ogg format + +## Kafka Uses Example + +Ogg provides a unified format for changelog, here is a simple example for an update operation captured from a Oracle products table: + +```bash +{ + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.15 + }, + "op_type": "U", + "op_ts": "2020-05-13 15:40:06.000000", + "current_ts": "2020-05-13 15:40:07.000000", + "primary_keys": [ + "id" + ], + "pos": "00000000000000000000143", + "table": "PRODUCTS" +} +``` + +Note: please refer to [Debezium documentation](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#data-change-events) about the meaning of each fields. + +The Oracle products table has 4 columns (id, name, description and weight). +The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.15. +Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following Seatunnel to consume this topic and interpret the change events. + +```bash +env { + parallelism = 1 + job.mode = "STREAMING" +} +source { + Kafka { + bootstrap.servers = "127.0.0.1:9092" + topic = "ogg" + result_table_name = "kafka_name" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "double" + } + }, + format = ogg_json + } +} +sink { + jdbc { + url = "jdbc:mysql://127.0.0.1/test" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "12345678" + table = "ogg" + primary_keys = ["id"] + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Activemq.md b/versioned_docs/version-2.3.7/connector-v2/sink/Activemq.md new file mode 100644 index 000000000000..3151585d0824 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Activemq.md @@ -0,0 +1,123 @@ +# Activemq + +> Activemq sink connector + +## Description + +Used to write data to Activemq. 
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------------------------|---------|----------|---------------| +| host | string | no | - | +| port | int | no | - | +| virtual_host | string | no | - | +| username | string | no | - | +| password | string | no | - | +| queue_name | string | yes | - | +| uri | string | yes | - | +| check_for_duplicate | boolean | no | - | +| client_id | boolean | no | - | +| copy_message_on_send | boolean | no | - | +| disable_timeStamps_by_default | boolean | no | - | +| use_compression | boolean | no | - | +| always_session_async | boolean | no | - | +| dispatch_async | boolean | no | - | +| nested_map_and_list_enabled | boolean | no | - | +| warnAboutUnstartedConnectionTimeout | boolean | no | - | +| closeTimeout | int | no | - | + +### host [string] + +the default host to use for connections + +### port [int] + +the default port to use for connections + +### username [string] + +the AMQP user name to use when connecting to the broker + +### password [string] + +the password to use when connecting to the broker + +### uri [string] + +convenience method for setting the fields in an AMQP URI: host, port, username, password and virtual host + +### queue_name [string] + +the queue to write the message to + +### check_for_duplicate [boolean] + +will check for duplucate messages + +### client_id [string] + +client id + +### copy_message_on_send [boolean] + +if true, enables new JMS Message object as part of the send method + +### disable_timeStamps_by_default [boolean] + +disables timestamp for slight performance boost + +### use_compression [boolean] + +Enables the use of compression on the message’s body. + +### always_session_async [boolean] + +When true a separate thread is used for dispatching messages for each Session in the Connection. + +### always_sync_send [boolean] + +When true a MessageProducer will always use Sync sends when sending a Message + +### close_timeout [boolean] + +Sets the timeout, in milliseconds, before a close is considered complete. 
+ +### dispatch_async [boolean] + +Should the broker dispatch messages asynchronously to the consumer + +### nested_map_and_list_enabled [boolean] + +Controls whether Structured Message Properties and MapMessages are supported + +### warn_about_unstarted_connection_timeout [int] + +The timeout, in milliseconds, from the time of connection creation to when a warning is generated + +## Example + +simple: + +```hocon +sink { + ActiveMQ { + uri="tcp://localhost:61616" + username = "admin" + password = "admin" + queue_name = "test1" + } +} +``` + +## Changelog + +### next version + +- Add Activemq Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/AmazonDynamoDB.md b/versioned_docs/version-2.3.7/connector-v2/sink/AmazonDynamoDB.md new file mode 100644 index 000000000000..63211077c740 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/AmazonDynamoDB.md @@ -0,0 +1,66 @@ +# AmazonDynamoDB + +> Amazon DynamoDB sink connector + +## Description + +Write data to Amazon DynamoDB + +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| Name | Type | Required | Default value | +|-------------------|--------|----------|---------------| +| url | string | yes | - | +| region | string | yes | - | +| access_key_id | string | yes | - | +| secret_access_key | string | yes | - | +| table | string | yes | - | +| batch_size | string | no | 25 | +| common-options | | no | - | + +### url [string] + +The URL to write to Amazon DynamoDB. + +### region [string] + +The region of Amazon DynamoDB. + +### accessKeyId [string] + +The access id of Amazon DynamoDB. + +### secretAccessKey [string] + +The access secret of Amazon DynamoDB. + +### table [string] + +The table of Amazon DynamoDB. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +## Example + +```bash +Amazondynamodb { + url = "http://127.0.0.1:8000" + region = "us-east-1" + accessKeyId = "dummy-key" + secretAccessKey = "dummy-secret" + table = "TableName" + } +``` + +## Changelog + +### next version + +- Add Amazon DynamoDB Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/AmazonSqs.md b/versioned_docs/version-2.3.7/connector-v2/sink/AmazonSqs.md new file mode 100644 index 000000000000..8efabfa395bd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/AmazonSqs.md @@ -0,0 +1,87 @@ +# AmazonSqs + +> Amazon SQS sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data to Amazon SQS + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The Queue URL to read from Amazon SQS. | +| region | String | No | - | The AWS region for the SQS service | +| format | String | No | json | Data format. The default format is json. Optional text format, canal-json and debezium-json.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | +| format_error_handle_way | String | No | fail | The processing method of data format error. The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. 
| + +## Task Example + +```bash +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + AmazonSqs { + url = "http://127.0.0.1:8000" + region = "us-east-1" + queue = "queueName" + format = text + field_delimiter = "|" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Assert.md b/versioned_docs/version-2.3.7/connector-v2/sink/Assert.md new file mode 100644 index 000000000000..e02d0fc6b966 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Assert.md @@ -0,0 +1,498 @@ +# Assert + +> Assert sink connector + +## Description + +A flink sink plugin which can assert illegal data by user defined rules + +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| Name | Type | Required | Default | +|------------------------------------------------------------------------------------------------|-------------------------------------------------|----------|---------| +| rules | ConfigMap | yes | - | +| rules.field_rules | string | yes | - | +| rules.field_rules.field_name | string\|ConfigMap | yes | - | +| rules.field_rules.field_type | string | no | - | +| rules.field_rules.field_value | ConfigList | no | - | +| rules.field_rules.field_value.rule_type | string | no | - | +| rules.field_rules.field_value.rule_value | numeric | no | - | +| rules.field_rules.field_value.equals_to | boolean\|numeric\|string\|ConfigList\|ConfigMap | no | - | +| rules.row_rules | string | yes | - | +| rules.row_rules.rule_type | string | no | - | +| rules.row_rules.rule_value | string | no | - | +| rules.catalog_table_rule | ConfigMap | no | - | +| rules.catalog_table_rule.primary_key_rule | ConfigMap | no | - | +| rules.catalog_table_rule.primary_key_rule.primary_key_name | string | no | - | +| rules.catalog_table_rule.primary_key_rule.primary_key_columns | ConfigList | no | - | +| rules.catalog_table_rule.constraint_key_rule | ConfigList | no | - | +| rules.catalog_table_rule.constraint_key_rule.constraint_key_name | string | no | - | +| rules.catalog_table_rule.constraint_key_rule.constraint_key_type | string | no | - | +| rules.catalog_table_rule.constraint_key_rule.constraint_key_columns | ConfigList | no | - | +| rules.catalog_table_rule.constraint_key_rule.constraint_key_columns.constraint_key_column_name | string | no | - | +| rules.catalog_table_rule.constraint_key_rule.constraint_key_columns.constraint_key_sort_type | string | no | - | +| rules.catalog_table_rule.column_rule | ConfigList | no | - | +| rules.catalog_table_rule.column_rule.name | string | no | - | +| rules.catalog_table_rule.column_rule.type | string | no | - | +| rules.catalog_table_rule.column_rule.column_length | int | no | - | +| rules.catalog_table_rule.column_rule.nullable | boolean | no | - | +| rules.catalog_table_rule.column_rule.default_value | string | no | - | +| rules.catalog_table_rule.column_rule.comment | comment | no | - | +| 
rules.table-names | ConfigList | no | - | +| common-options | | no | - | + +### rules [ConfigMap] + +Rule definition of user's available data. Each rule represents one field validation or row num validation. + +### field_rules [ConfigList] + +field rules for field validation + +### field_name [string] + +field name(string) + +### field_type [string | ConfigMap] + +Field type declarations should adhere to this [guide](../../concept/schema-feature.md#how-to-declare-type-supported). + +### field_value [ConfigList] + +A list value rule define the data value validation + +### rule_type [string] + +The following rules are supported for now +- NOT_NULL `value can't be null` +- NULL `value can be null` +- MIN `define the minimum value of data` +- MAX `define the maximum value of data` +- MIN_LENGTH `define the minimum string length of a string data` +- MAX_LENGTH `define the maximum string length of a string data` +- MIN_ROW `define the minimun number of rows` +- MAX_ROW `define the maximum number of rows` + +### rule_value [numeric] + +The value related to rule type. When the `rule_type` is `MIN`, `MAX`, `MIN_LENGTH`, `MAX_LENGTH`, `MIN_ROW` or `MAX_ROW`, users need to assign a value to the `rule_value`. + +### equals_to [boolean | numeric | string | ConfigList | ConfigMap] + +`equals_to` is used to compare whether the field value is equal to the configured expected value. You can assign values of all types to `equals_to`. These types are detailed [here](../../concept/schema-feature.md#what-type-supported-at-now). For instance, if one field is a row with three fields, and the declaration of row type is `{a = array, b = map, c={c_0 = int, b = string}}`, users can assign the value `[["a", "b"], { k0 = 9999.99, k1 = 111.11 }, [123, "abcd"]]` to `equals_to`. + +> The way of defining field values is consistent with [FakeSource](../source/FakeSource.md#customize-the-data-content-simple). +> +> `equals_to` cannot be applied to `null` type fields. However, users can use the rule type `NULL` for verification, such as `{rule_type = NULL}`. + +### catalog_table_rule [ConfigMap] + +Used to assert the catalog table is same with the user defined table. + +### table-names [ConfigList] + +Used to assert the table should be in the data. 
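+
+A minimal sketch of a `table-names` rule, assuming the upstream source registers a table named `test.user` (the table name here is purely illustrative):
+
+```hocon
+sink {
+  Assert {
+    rules {
+      # assert that the incoming data contains exactly these tables
+      table-names = ["test.user"]
+    }
+  }
+}
+```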
+ +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +the whole config obey with `hocon` style + +```hocon +Assert { + rules = + { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 10 + }, + { + rule_type = MIN_ROW + rule_value = 5 + } + ], + field_rules = [{ + field_name = name + field_type = string + field_value = [ + { + rule_type = NOT_NULL + }, + { + rule_type = MIN_LENGTH + rule_value = 5 + }, + { + rule_type = MAX_LENGTH + rule_value = 10 + } + ] + }, { + field_name = age + field_type = int + field_value = [ + { + rule_type = NOT_NULL + equals_to = 23 + }, + { + rule_type = MIN + rule_value = 32767 + }, + { + rule_type = MAX + rule_value = 2147483647 + } + ] + } + ] + catalog_table_rule { + primary_key_rule = { + primary_key_name = "primary key" + primary_key_columns = ["id"] + } + constraint_key_rule = [ + { + constraint_key_name = "unique_name" + constraint_key_type = UNIQUE_KEY + constraint_key_columns = [ + { + constraint_key_column_name = "id" + constraint_key_sort_type = ASC + } + ] + } + ] + column_rule = [ + { + name = "id" + type = bigint + }, + { + name = "name" + type = string + }, + { + name = "age" + type = int + } + ] + } + } + + } +``` + +Here is a more complex example about `equals_to`. The example involves FakeSource. You may want to learn it, please read this [document](../source/FakeSource.md). + +```hocon +source { + FakeSource { + row.num = 1 + schema = { + fields { + c_null = "null" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_date = date + c_timestamp = timestamp + c_time = time + c_bytes = bytes + c_array = "array" + c_map = "map" + c_map_nest = "map" + c_row = { + c_null = "null" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_date = date + c_timestamp = timestamp + c_time = time + c_bytes = bytes + c_array = "array" + c_map = "map" + } + } + } + rows = [ + { + kind = INSERT + fields = [ + null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", + "bWlJWmo=", + [0, 1, 2], + "{ 12:01:26 = v0 }", + { k1 = [123, "BBB-BB"]}, + [ + null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", + "bWlJWmo=", + [0, 1, 2], + { k0 = v0 } + ] + ] + } + ] + result_table_name = "fake" + } +} + +sink{ + Assert { + source_table_name = "fake" + rules = + { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 1 + }, + { + rule_type = MIN_ROW + rule_value = 1 + } + ], + field_rules = [ + { + field_name = c_null + field_type = "null" + field_value = [ + { + rule_type = NULL + } + ] + }, + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + equals_to = "AAA" + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + equals_to = false + } + ] + }, + { + field_name = c_tinyint + field_type = tinyint + field_value = [ + { + rule_type = NOT_NULL + equals_to = 1 + } + ] + }, + { + field_name = c_smallint + field_type = smallint + field_value = [ + { + rule_type = NOT_NULL + equals_to = 1 + } + ] + }, + { + field_name = c_int + field_type = int + field_value = [ + { + rule_type = 
NOT_NULL + equals_to = 333 + } + ] + }, + { + field_name = c_bigint + field_type = bigint + field_value = [ + { + rule_type = NOT_NULL + equals_to = 323232 + } + ] + }, + { + field_name = c_float + field_type = float + field_value = [ + { + rule_type = NOT_NULL + equals_to = 3.1 + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + equals_to = 9.33333 + } + ] + }, + { + field_name = c_decimal + field_type = "decimal(30, 8)" + field_value = [ + { + rule_type = NOT_NULL + equals_to = 99999.99999999 + } + ] + }, + { + field_name = c_date + field_type = date + field_value = [ + { + rule_type = NOT_NULL + equals_to = "2012-12-21" + } + ] + }, + { + field_name = c_timestamp + field_type = timestamp + field_value = [ + { + rule_type = NOT_NULL + equals_to = "2012-12-21T12:34:56" + } + ] + }, + { + field_name = c_time + field_type = time + field_value = [ + { + rule_type = NOT_NULL + equals_to = "12:34:56" + } + ] + }, + { + field_name = c_bytes + field_type = bytes + field_value = [ + { + rule_type = NOT_NULL + equals_to = "bWlJWmo=" + } + ] + }, + { + field_name = c_array + field_type = "array" + field_value = [ + { + rule_type = NOT_NULL + equals_to = [0, 1, 2] + } + ] + }, + { + field_name = c_map + field_type = "map" + field_value = [ + { + rule_type = NOT_NULL + equals_to = "{ 12:01:26 = v0 }" + } + ] + }, + { + field_name = c_map_nest + field_type = "map" + field_value = [ + { + rule_type = NOT_NULL + equals_to = { k1 = [123, "BBB-BB"] } + } + ] + }, + { + field_name = c_row + field_type = { + c_null = "null" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_date = date + c_timestamp = timestamp + c_time = time + c_bytes = bytes + c_array = "array" + c_map = "map" + } + field_value = [ + { + rule_type = NOT_NULL + equals_to = [ + null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", + "bWlJWmo=", + [0, 1, 2], + { k0 = v0 } + ] + } + ] + } + ] + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Assert Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [Improve] 1.Support check the number of rows ([2844](https://github.com/apache/seatunnel/pull/2844)) ([3031](https://github.com/apache/seatunnel/pull/3031)): + - check rows not empty + - check minimum number of rows + - check maximum number of rows +- [Improve] 2.Support direct define of data values(row) ([2844](https://github.com/apache/seatunnel/pull/2844)) ([3031](https://github.com/apache/seatunnel/pull/3031)) +- [Improve] 3.Support setting parallelism as 1 ([2844](https://github.com/apache/seatunnel/pull/2844)) ([3031](https://github.com/apache/seatunnel/pull/3031)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Cassandra.md b/versioned_docs/version-2.3.7/connector-v2/sink/Cassandra.md new file mode 100644 index 000000000000..73c6d3aba550 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Cassandra.md @@ -0,0 +1,95 @@ +# Cassandra + +> Cassandra sink connector + +## Description + +Write data to Apache Cassandra. 
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------|---------|----------|---------------| +| host | String | Yes | - | +| keyspace | String | Yes | - | +| table | String | Yes | - | +| username | String | No | - | +| password | String | No | - | +| datacenter | String | No | datacenter1 | +| consistency_level | String | No | LOCAL_ONE | +| fields | String | No | LOCAL_ONE | +| batch_size | int | No | 5000 | +| batch_type | String | No | UNLOGGED | +| async_write | boolean | No | true | + +### host [string] + +`Cassandra` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as +`"cassandra1:9042,cassandra2:9042"`. + +### keyspace [string] + +The `Cassandra` keyspace. + +### table [String] + +The `Cassandra` table name. + +### username [string] + +`Cassandra` user username. + +### password [string] + +`Cassandra` user password. + +### datacenter [String] + +The `Cassandra` datacenter, default is `datacenter1`. + +### consistency_level [String] + +The `Cassandra` write consistency level, default is `LOCAL_ONE`. + +### fields [array] + +The data field that needs to be output to `Cassandra` , if not configured, it will be automatically adapted +according to the sink table `schema`. + +### batch_size [number] + +The number of rows written through [Cassandra-Java-Driver](https://github.com/datastax/java-driver) each time, +default is `5000`. + +### batch_type [String] + +The `Cassandra` batch processing mode, default is `UNLOGGER`. + +### async_write [boolean] + +Whether `cassandra` writes in asynchronous mode, default is `true`. + +## Examples + +```hocon +sink { + Cassandra { + host = "localhost:9042" + username = "cassandra" + password = "cassandra" + datacenter = "datacenter1" + keyspace = "test" + } +} +``` + +## Changelog + +### next version + +- Add Cassandra Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Clickhouse.md b/versioned_docs/version-2.3.7/connector-v2/sink/Clickhouse.md new file mode 100644 index 000000000000..3798e2baae34 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Clickhouse.md @@ -0,0 +1,180 @@ +# Clickhouse + +> Clickhouse sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> The Clickhouse sink plug-in can achieve accuracy once by implementing idempotent writing, and needs to cooperate with aggregatingmergetree and other engines that support deduplication. + +## Description + +Used to write data to Clickhouse. + +## Supported DataSource Info + +In order to use the Clickhouse connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|------------------------------------------------------------------------------------------------------------------| +| Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) | + +## Data Type Mapping + +| SeaTunnel Data Type | Clickhouse Data Type | +|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| STRING | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | +| INT | Int8 / UInt8 / Int16 / UInt16 / Int32 | +| BIGINT | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | +| DOUBLE | Float64 | +| DECIMAL | Decimal | +| FLOAT | Float32 | +| DATE | Date | +| TIME | DateTime | +| ARRAY | Array | +| MAP | Map | + +## Sink Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"`. | +| database | String | Yes | - | The `ClickHouse` database. | +| table | String | Yes | - | The table name. | +| username | String | Yes | - | `ClickHouse` user username. | +| password | String | Yes | - | `ClickHouse` user password. | +| clickhouse.config | Map | No | | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. | +| bulk_size | String | No | 20000 | The number of rows written through [Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) each time, the `default is 20000`. | +| split_mode | String | No | false | This mode only support clickhouse table which engine is 'Distributed'.And `internal_replication` option-should be `true`.They will split distributed table data in seatunnel and perform write directly on each shard. The shard weight define is clickhouse will counted. | +| sharding_key | String | No | - | When use split_mode, which node to send data to is a problem, the default is random selection, but the 'sharding_key' parameter can be used to specify the field for the sharding algorithm. 
This option only worked when 'split_mode' is true. | +| primary_key | String | No | - | Mark the primary key column from clickhouse table, and based on primary key execute INSERT/UPDATE/DELETE to clickhouse table. | +| support_upsert | Boolean | No | false | Support upsert row by query primary key. | +| allow_experimental_lightweight_delete | Boolean | No | false | Allow experimental lightweight delete based on `*MergeTree` table engine. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. | + +## How to Create a Clickhouse Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that writes randomly generated data to a Clickhouse database: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 1000 +} + +source { + FakeSource { + row.num = 2 + bigint.min = 0 + bigint.max = 10000000 + split.num = 1 + split.read-interval = 300 + schema { + fields { + c_bigint = bigint + } + } + } +} + +sink { + Clickhouse { + host = "127.0.0.1:9092" + database = "default" + table = "test" + username = "xxxxx" + password = "xxxxx" + } +} +``` + +### Tips + +> 1.[SeaTunnel Deployment Document](../../start-v2/locally/deployment.md).
+> 2.The target table must be created in ClickHouse before the synchronization job runs (see the DDL sketch below).
+> 3.When the sink writes to the ClickHouse table, you don't need to set its schema, because the connector queries ClickHouse for the current table's schema before writing.
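+
+The table referenced in tip 2 must exist before the job runs. Below is a minimal DDL sketch for the `default.test` table used in the example above; the `MergeTree` engine and the ordering key are assumptions, adapt them to your own cluster:
+
+```sql
+-- hypothetical DDL matching the FakeSource schema above (one BIGINT column)
+CREATE TABLE IF NOT EXISTS default.test
+(
+    c_bigint Int64
+)
+ENGINE = MergeTree
+ORDER BY c_bigint;
+```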
+ +## Clickhouse Sink Config + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + clickhouse.config = { + max_rows_to_read = "100" + read_overflow_mode = "throw" + } + } +} +``` + +## Split Mode + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + + # split mode options + split_mode = true + sharding_key = "age" + } +} +``` + +## CDC(Change data capture) Sink + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + + # cdc options + primary_key = "id" + support_upsert = true + } +} +``` + +## CDC(Change data capture) for *MergeTree engine + +```hocon +sink { + Clickhouse { + host = "localhost:8123" + database = "default" + table = "fake_all" + username = "xxxxx" + password = "xxxxx" + + # cdc options + primary_key = "id" + support_upsert = true + allow_experimental_lightweight_delete = true + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md new file mode 100644 index 000000000000..ebafbc016282 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/ClickhouseFile.md @@ -0,0 +1,147 @@ +# ClickhouseFile + +> Clickhouse file sink connector + +## Description + +Generate the clickhouse data file with the clickhouse-local program, and then send it to the clickhouse +server, also call bulk load. This connector only support clickhouse table which engine is 'Distributed'.And `internal_replication` option +should be `true`. Supports Batch and Streaming mode. + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +:::tip + +Write data to Clickhouse can also be done using JDBC + +::: + +## Options + +| Name | Type | Required | Default | +|------------------------|---------|----------|----------------------------------------| +| host | string | yes | - | +| database | string | yes | - | +| table | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| clickhouse_local_path | string | yes | - | +| sharding_key | string | no | - | +| copy_method | string | no | scp | +| node_free_password | boolean | no | false | +| node_pass | list | no | - | +| node_pass.node_address | string | no | - | +| node_pass.username | string | no | "root" | +| node_pass.password | string | no | - | +| compatible_mode | boolean | no | false | +| file_fields_delimiter | string | no | "\t" | +| file_temp_path | string | no | "/tmp/seatunnel/clickhouse-local/file" | +| common-options | | no | - | + +### host [string] + +`ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . + +### database [string] + +The `ClickHouse` database + +### table [string] + +The table name + +### username [string] + +`ClickHouse` user username + +### password [string] + +`ClickHouse` user password + +### sharding_key [string] + +When ClickhouseFile split data, which node to send data to is a problem, the default is random selection, but the +'sharding_key' parameter can be used to specify the field for the sharding algorithm. + +### clickhouse_local_path [string] + +The address of the clickhouse-local program on the spark node. 
Since each task needs to be called, +clickhouse-local should be located in the same path of each spark node. + +### copy_method [string] + +Specifies the method used to transfer files, the default is scp, optional scp and rsync + +### node_free_password [boolean] + +Because seatunnel need to use scp or rsync for file transfer, seatunnel need clickhouse server-side access. +If each spark node and clickhouse server are configured with password-free login, +you can configure this option to true, otherwise you need to configure the corresponding node password in the node_pass configuration + +### node_pass [list] + +Used to save the addresses and corresponding passwords of all clickhouse servers + +### node_pass.node_address [string] + +The address corresponding to the clickhouse server + +### node_pass.username [string] + +The username corresponding to the clickhouse server, default root user. + +### node_pass.password [string] + +The password corresponding to the clickhouse server. + +### compatible_mode [boolean] + +In the lower version of Clickhouse, the ClickhouseLocal program does not support the `--path` parameter, +you need to use this mode to take other ways to realize the `--path` parameter function + +### file_fields_delimiter [string] + +ClickhouseFile uses csv format to temporarily save data. If the data in the row contains the delimiter value +of csv, it may cause program exceptions. +Avoid this with this configuration. Value string has to be an exactly one character long + +### file_temp_path [string] + +The directory where ClickhouseFile stores temporary files locally. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Examples + +```hocon +ClickhouseFile { + host = "192.168.0.1:8123" + database = "default" + table = "fake_all" + username = "default" + password = "" + clickhouse_local_path = "/Users/seatunnel/Tool/clickhouse local" + sharding_key = "age" + node_free_password = false + node_pass = [{ + node_address = "192.168.0.1" + password = "seatunnel" + }] +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Support write data to ClickHouse File and move to ClickHouse data dir + +### Next version + +- [BugFix] Fix generated data part name conflict and improve file commit logic [3416](https://github.com/apache/seatunnel/pull/3416) +- [Feature] Support compatible_mode compatible with lower version Clickhouse [3416](https://github.com/apache/seatunnel/pull/3416) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Console.md b/versioned_docs/version-2.3.7/connector-v2/sink/Console.md new file mode 100644 index 000000000000..5d83c8102635 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Console.md @@ -0,0 +1,124 @@ +# Console + +> Console sink connector + +## Support Connector Version + +- All versions + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Used to send data to Console. Both support streaming and batch mode. + +> For example, if the data from upstream is [`age: 12, name: jared`], the content send to console is the following: `{"name":"jared","age":17}` + +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| Name | Type | Required | Default | Description | +|--------------------|---------|----------|---------|-----------------------------------------------------------------------------------------------------| +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| log.print.data | boolean | No | - | Flag to determine whether data should be printed in the logs. The default value is `true` | +| log.print.delay.ms | int | No | - | Delay in milliseconds between printing each data item to the logs. The default value is `0`. | + +## Task Example + +### Simple: + +> This is a randomly generated data, written to the console, with a degree of parallelism of 1 + +``` +env { + parallelism = 1 + job.mode = "STREAMING" +} + +source { + FakeSource { + result_table_name = "fake" + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Console { + source_table_name = "fake" + } +} +``` + +### Multiple Sources Simple: + +> This is a multiple source and you can specify a data source to write to the specified end + +``` +env { + parallelism = 1 + job.mode = "STREAMING" +} + +source { + FakeSource { + result_table_name = "fake1" + schema = { + fields { + id = "int" + name = "string" + age = "int" + sex = "string" + } + } + } + FakeSource { + result_table_name = "fake2" + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Console { + source_table_name = "fake1" + } + Console { + source_table_name = "fake2" + } +} +``` + +## Console Sample Data + +This is a printout from our console + +``` +2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age +2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 +2022-12-19 11:01:46,490 INFO 
org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/CosFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/CosFile.md new file mode 100644 index 000000000000..612273a936db --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/CosFile.md @@ -0,0 +1,293 @@ +# CosFile + +> Cos file sink connector + +## Description + +Output data to cos file system. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in ${SEATUNNEL_HOME}/lib dir, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+. + +::: + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | | +| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a COS dir. | +| bucket | string | yes | - | | +| secret_id | string | yes | - | | +| secret_key | string | yes | - | | +| region | string | yes | - | | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format is text | +| row_delimiter | string | no | "\n" | Only used when file_format is text | +| have_partition | boolean | no | false | Whether you need processing partitions. 
| +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### path [string] + +The target dir path is required. + +### bucket [string] + +The bucket address of cos file system, for example: `cosn://seatunnel-test-1259587829` + +### secret_id [string] + +The secret id of cos file system. + +### secret_key [string] + +The secret key of cos file system. + +### region [string] + +The region of cos file system. + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. 
+ +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name [string] + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. 
+ +## Example + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```hocon + + CosFile { + path="/sink" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true + } + +``` + +For parquet file format with `have_partition` and `sink_columns` + +```hocon + + CosFile { + path="/sink" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_format_type = "parquet" + sink_columns = ["name","age"] + } + +``` + +For orc file format simple config + +```bash + + CosFile { + path="/sink" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "orc" + } + +``` + +## Changelog + +### next version + +- Add file cos sink connector ([4979](https://github.com/apache/seatunnel/pull/4979)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/DB2.md b/versioned_docs/version-2.3.7/connector-v2/sink/DB2.md new file mode 100644 index 000000000000..5c3de373063c --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/DB2.md @@ -0,0 +1,175 @@ +# DB2 + +> JDBC DB2 Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| +| DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | + +## Data Type Mapping + +| DB2 Data Type | SeaTunnel Data Type | +|------------------------------------------------------------------------------------------------------|---------------------| +| BOOLEAN | BOOLEAN | +| SMALLINT | SHORT | +| INT
INTEGER
| INTEGER | +| BIGINT | LONG | +| DECIMAL
DEC
NUMERIC
NUM | DECIMAL(38,18) | +| REAL | FLOAT | +| FLOAT
DOUBLE
DOUBLE PRECISION
DECFLOAT | DOUBLE | +| CHAR
VARCHAR
LONG VARCHAR
CLOB
GRAPHIC
VARGRAPHIC
LONG VARGRAPHIC
DBCLOB | STRING | +| BLOB | BYTES | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| ROWID
XML | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use DB2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | +| database | String | No | - | Use this `database` and `table-name` to auto-generate the SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use `database` and this table-name to auto-generate the SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when the SQL is automatically generated. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT and UPDATE SQL to process update events (INSERT, UPDATE_AFTER) based on whether the queried primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, DB2 is `com.db2.cj.jdbc.Db2XADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| properties | Map | No | - | Additional connection configuration parameters. When properties and the URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your DB2. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + user = "root" + password = "123456" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.db2.cj.jdbc.Db2XADataSource" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Datahub.md b/versioned_docs/version-2.3.7/connector-v2/sink/Datahub.md new file mode 100644 index 000000000000..c4c1856f9239 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Datahub.md @@ -0,0 +1,79 @@ +# 
DataHub + +> DataHub sink connector + +## Description + +A sink plugin which use send message to DataHub + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------|--------|----------|---------------| +| endpoint | string | yes | - | +| accessId | string | yes | - | +| accessKey | string | yes | - | +| project | string | yes | - | +| topic | string | yes | - | +| timeout | int | yes | - | +| retryTimes | int | yes | - | +| common-options | | no | - | + +### endpoint [string] + +your DataHub endpoint start with http (string) + +### accessId [string] + +your DataHub accessId which cloud be access from Alibaba Cloud (string) + +### accessKey[string] + +your DataHub accessKey which cloud be access from Alibaba Cloud (string) + +### project [string] + +your DataHub project which is created in Alibaba Cloud (string) + +### topic [string] + +your DataHub topic (string) + +### timeout [int] + +the max connection timeout (int) + +### retryTimes [int] + +the max retry times when your client put record failed (int) + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +```hocon +sink { + DataHub { + endpoint="yourendpoint" + accessId="xxx" + accessKey="xxx" + project="projectname" + topic="topicname" + timeout=3000 + retryTimes=3 + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add DataHub Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/DingTalk.md b/versioned_docs/version-2.3.7/connector-v2/sink/DingTalk.md new file mode 100644 index 000000000000..13a7e534dec4 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/DingTalk.md @@ -0,0 +1,55 @@ +# DingTalk + +> DinkTalk sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Description + +A sink plugin which use DingTalk robot send message + +## Options + +| name | type | required | default value | +|----------------|--------|----------|---------------| +| url | String | yes | - | +| secret | String | yes | - | +| common-options | | no | - | + +### url [String] + +DingTalk robot address format is https://oapi.dingtalk.com/robot/send?access_token=XXXXXX(String) + +### secret [String] + +DingTalk robot secret (String) + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +```hocon +sink { + DingTalk { + url="https://oapi.dingtalk.com/robot/send?access_token=ec646cccd028d978a7156ceeac5b625ebd94f586ea0743fa501c100007890" + secret="SEC093249eef7aa57d4388aa635f678930c63db3d28b2829d5b2903fc1e5c10000" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add DingTalk Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Doris.md b/versioned_docs/version-2.3.7/connector-v2/sink/Doris.md new file mode 100644 index 000000000000..592cd8702bec --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Doris.md @@ -0,0 +1,432 @@ +# Doris + +> Doris sink connector + +## Support Doris Version + +- exactly-once & cdc supported `Doris version is >= 1.1.x` +- Array data type supported `Doris version is >= 1.2.x` +- Map data type will be support in `Doris version is 2.x` + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Description + +Used to send data to Doris. Both support streaming and batch mode. +The internal implementation of Doris sink connector is cached and imported by stream load in batches. + +## Sink Options + +| Name | Type | Required | Default | Description | +|--------------------------------|---------|----------|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| fenodes | String | Yes | - | `Doris` cluster fenodes address, the format is `"fe_ip:fe_http_port, ..."` | +| query-port | int | No | 9030 | `Doris` Fenodes query_port | +| username | String | Yes | - | `Doris` user username | +| password | String | Yes | - | `Doris` user password | +| database | String | Yes | - | The database name of `Doris` table, use `${database_name}` to represent the upstream table name | +| table | String | Yes | - | The table name of `Doris` table, use `${table_name}` to represent the upstream table name | +| table.identifier | String | Yes | - | The name of `Doris` table, it will deprecate after version 2.3.5, please use `database` and `table` instead. | +| sink.label-prefix | String | Yes | - | The label prefix used by stream load imports. In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel. | +| sink.enable-2pc | bool | No | false | Whether to enable two-phase commit (2pc), the default is false. For two-phase commit, please refer to [here](https://doris.apache.org/docs/dev/sql-manual/sql-statements/Data-Manipulation-Statements/Load/STREAM-LOAD/). | +| sink.enable-delete | bool | No | - | Whether to enable deletion. This option requires Doris table to enable batch delete function (0.15+ version is enabled by default), and only supports Unique model. you can get more detail at this [link](https://doris.apache.org/docs/dev/data-operate/delete/batch-delete-manual/) | +| sink.check-interval | int | No | 10000 | check exception with the interval while loading | +| sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | +| sink.buffer-size | int | No | 256 * 1024 | the buffer size to cache data for stream load. | +| sink.buffer-count | int | No | 3 | the buffer count to cache data for stream load. | +| doris.batch.size | int | No | 1024 | the batch size of the write to doris each http request, when the row reaches the size or checkpoint is executed, the data of cached will write to server. | +| needs_unsupported_type_casting | boolean | No | false | Whether to enable the unsupported type casting, such as Decimal64 to Double | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | the schema save mode, please refer to `schema_save_mode` below | +| data_save_mode | Enum | no | APPEND_DATA | the data save mode, please refer to `data_save_mode` below | +| save_mode_create_template | string | no | see below | see below | +| custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. 
| +| doris.config | map | yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql,and supported formats. | + +### schema_save_mode[Enum] + +Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. +Option introduction: +`RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved +`CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved +`ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist + +### data_save_mode[Enum] + +Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. +Option introduction: +`DROP_DATA`: Preserve database structure and delete data +`APPEND_DATA`:Preserve database structure, preserve data +`CUSTOM_PROCESSING`:User defined processing +`ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported + +### save_mode_create_template + +We use templates to automatically create Doris tables, +which will create corresponding table creation statements based on the type of upstream data and schema type, +and the default template can be modified according to the situation. + +Default template: + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( +${rowtype_primary_key}, +${rowtype_fields} +) ENGINE=OLAP + UNIQUE KEY (${rowtype_primary_key}) +DISTRIBUTED BY HASH (${rowtype_primary_key}) + PROPERTIES ( +"replication_allocation" = "tag.location.default: 1", +"in_memory" = "false", +"storage_format" = "V2", +"disable_auto_compaction" = "false" +) +``` + +If a custom field is filled in the template, such as adding an `id` field + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table}` +( + id, + ${rowtype_fields} +) ENGINE = OLAP UNIQUE KEY (${rowtype_primary_key}) + DISTRIBUTED BY HASH (${rowtype_primary_key}) + PROPERTIES +( + "replication_num" = "1" +); +``` + +The connector will automatically obtain the corresponding type from the upstream to complete the filling, +and remove the id field from `rowtype_fields`. This method can be used to customize the modification of field types and attributes. + +You can use the following placeholders + +- database: Used to get the database in the upstream schema +- table_name: Used to get the table name in the upstream schema +- rowtype_fields: Used to get all the fields in the upstream schema, we will automatically map to the field + description of Doris +- rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list) +- rowtype_unique_key: Used to get the unique key in the upstream schema (maybe a list) +- rowtype_duplicate_key: Used to get the duplicate key in the upstream schema (only for doris source, maybe a list) + +## Data Type Mapping + +| Doris Data Type | SeaTunnel Data Type | +|-----------------|-----------------------------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT
TINYINT | +| INT | INT<br/>SMALLINT<br/>TINYINT | +| BIGINT | BIGINT<br/>INT<br/>SMALLINT<br/>TINYINT | +| LARGEINT | BIGINT<br/>INT<br/>SMALLINT<br/>TINYINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE<br/>FLOAT | +| DECIMAL | DECIMAL<br/>DOUBLE
FLOAT | +| DATE | DATE | +| DATETIME | TIMESTAMP | +| CHAR | STRING | +| VARCHAR | STRING | +| STRING | STRING | +| ARRAY | ARRAY | +| MAP | MAP | +| JSON | STRING | +| HLL | Not supported yet | +| BITMAP | Not supported yet | +| QUANTILE_STATE | Not supported yet | +| STRUCT | Not supported yet | + +#### Supported import data formats + +The supported formats include CSV and JSON + +## Task Example + +### Simple: + +> The following example describes writing multiple data types to Doris, and users need to create corresponding tables downstream + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} +``` + +### CDC(Change Data Capture) Event: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Doris Sink,FakeSource simulates CDC data with schema, score (int type),Doris needs to create a table sink named test.e2e_table_sink and a corresponding table for it. + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + score = int + sex = boolean + number = tinyint + height = float + sight = double + create_time = date + update_time = timestamp + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [3, "C", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_BEFORE + fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_AFTER + fields = [1, "A_1", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = DELETE + fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + } + ] + } +} + +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} + +``` + +### Use JSON format to import data + +``` +sink { + Doris { + fenodes = "e2e_dorisdb:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.enable-2pc = "true" + sink.label-prefix = "test_json" + doris.config = { + format="json" + read_json_by_line="true" + } + } +} + +``` + +### Use CSV format to import data + +``` +sink { + Doris { + fenodes = "e2e_dorisdb:8030" + username = root + password = "" + database = "test" + table = "e2e_table_sink" + sink.enable-2pc = 
"true" + sink.label-prefix = "test_csv" + doris.config = { + format = "csv" + column_separator = "," + } + } +} +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + database = "${database_name}_test" + table = "${table_name}_test" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + database = "${schema_name}_test" + table = "${table_name}_test" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} +``` + +## Changelog + +### 2.3.0-beta 2022-10-20 + +- Add Doris Sink Connector + +### Next version + +- [Improve] Change Doris Config Prefix [3856](https://github.com/apache/seatunnel/pull/3856) + +- [Improve] Refactor some Doris Sink code as well as support 2pc and cdc [4235](https://github.com/apache/seatunnel/pull/4235) + +:::tip + +PR 4235 is an incompatible modification to PR 3856. Please refer to PR 4235 to use the new Doris connector + +::: diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Druid.md b/versioned_docs/version-2.3.7/connector-v2/sink/Druid.md new file mode 100644 index 000000000000..2c1a2fe25dd4 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Druid.md @@ -0,0 +1,83 @@ +# Druid + +> Druid sink connector + +## Description + +Write data to Druid + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Data Type Mapping + +| SeaTunnel Data Type | Druid Data Type | +|---------------------|-----------------| +| TINYINT | LONG | +| SMALLINT | LONG | +| INT | LONG | +| BIGINT | LONG | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DOUBLE | +| STRING | STRING | +| BOOLEAN | STRING | +| TIMESTAMP | STRING | + +## Options + +| name | type | required | default value | +|----------------|--------|----------|---------------| +| coordinatorUrl | string | yes | - | +| datasource | string | yes | - | +| batchSize | int | no | 10000 | +| common-options | | no | - | + +### coordinatorUrl [string] + +The coordinatorUrl host and port of Druid, example: "myHost:8888" + +### datasource [string] + +The datasource name you want to write, example: "seatunnel" + +### batchSize [int] + +The number of rows flushed to Druid per batch. Default value is `1024`. 
+ +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +Simple example: + +```hocon +sink { + Druid { + coordinatorUrl = "testHost:8888" + datasource = "seatunnel" + } +} +``` + +Use placeholders get upstream table metadata example: + +```hocon +sink { + Druid { + coordinatorUrl = "testHost:8888" + datasource = "${table_name}_test" + } +} +``` + +## Changelog + +### next version + +- Add Druid sink connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Easysearch.md b/versioned_docs/version-2.3.7/connector-v2/sink/Easysearch.md new file mode 100644 index 000000000000..f474735082dd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Easysearch.md @@ -0,0 +1,202 @@ +# INFINI Easysearch + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +A sink plugin which use send data to `INFINI Easysearch`. + +## Using Dependency + +> Depenndency [easysearch-client](https://central.sonatype.com/artifact/com.infinilabs/easysearch-client) +> + ## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +:::tip + +Engine Supported + +* Supported all versions released by [INFINI Easysearch](https://www.infini.com/download/?product=easysearch). + +::: + +## Data Type Mapping + +| Easysearch Data Type | SeaTunnel Data Type | +|-----------------------------|----------------------| +| STRING
KEYWORD
TEXT | STRING | +| BOOLEAN | BOOLEAN | +| BYTE | BYTE | +| SHORT | SHORT | +| INTEGER | INT | +| LONG | LONG | +| FLOAT
HALF_FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| Date | LOCAL_DATE_TIME_TYPE | + +## Sink Options + +| name | type | required | default value | +|-------------------------|---------|----------|---------------| +| hosts | array | yes | - | +| index | string | yes | - | +| primary_keys | list | no | | +| key_delimiter | string | no | `_` | +| username | string | no | | +| password | string | no | | +| max_retry_count | int | no | 3 | +| max_batch_size | int | no | 10 | +| tls_verify_certificate | boolean | no | true | +| tls_verify_hostnames | boolean | no | true | +| tls_keystore_path | string | no | - | +| tls_keystore_password | string | no | - | +| tls_truststore_path | string | no | - | +| tls_truststore_password | string | no | - | +| common-options | | no | - | + +### hosts [array] + +`INFINI Easysearch` cluster http address, the format is `host:port` , allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. + +### index [string] + +`INFINI Easysearch` `index` name.Index support contains variables of field name,such as `seatunnel_${age}`,and the field must appear at seatunnel row. +If not, we will treat it as a normal index. + +### primary_keys [list] + +Primary key fields used to generate the document `_id`, this is cdc required options. + +### key_delimiter [string] + +Delimiter for composite keys ("_" by default), e.g., "$" would result in document `_id` "KEY1$KEY2$KEY3". + +### username [string] + +security username + +### password [string] + +security password + +### max_retry_count [int] + +one bulk request max try size + +### max_batch_size [int] + +batch bulk doc max size + +### tls_verify_certificate [boolean] + +Enable certificates validation for HTTPS endpoints + +### tls_verify_hostname [boolean] + +Enable hostname validation for HTTPS endpoints + +### tls_keystore_path [string] + +The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. + +### tls_keystore_password [string] + +The key password for the key store specified + +### tls_truststore_path [string] + +The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. + +### tls_truststore_password [string] + +The key password for the trust store specified + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Examples + +Simple + +```bash +sink { + Easysearch { + hosts = ["localhost:9200"] + index = "seatunnel-${age}" + } +} +``` + +CDC(Change data capture) event + +```bash +sink { + Easysearch { + hosts = ["localhost:9200"] + index = "seatunnel-${age}" + + # cdc required options + primary_keys = ["key1", "key2", ...] 
+ } +} +``` + +SSL (Disable certificates validation) + +```hocon +sink { + Easysearch { + hosts = ["https://localhost:9200"] + username = "admin" + password = "admin" + + tls_verify_certificate = false + } +} +``` + +SSL (Disable hostname validation) + +```hocon +sink { + Easysearch { + hosts = ["https://localhost:9200"] + username = "admin" + password = "admin" + + tls_verify_hostname = false + } +} +``` + +SSL (Enable certificates validation) + +```hocon +sink { + Easysearch { + hosts = ["https://localhost:9200"] + username = "admin" + password = "admin" + + tls_keystore_path = "${your Easysearch home}/config/certs/http.p12" + tls_keystore_password = "${your password}" + } +} +``` + +## Changelog + +### 2.3.4 2023-11-16 + +- Add Easysearch Sink Connector +- Support http/https protocol +- Support CDC write DELETE/UPDATE/INSERT events + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Elasticsearch.md b/versioned_docs/version-2.3.7/connector-v2/sink/Elasticsearch.md new file mode 100644 index 000000000000..af61df228837 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Elasticsearch.md @@ -0,0 +1,219 @@ +# Elasticsearch + +## Description + +Output data to `Elasticsearch`. + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +:::tip + +Engine Supported + +* supported `ElasticSearch version is >= 2.x and <= 8.x` + +::: + +## Options + +| name | type | required | default value | +|-------------------------|---------|----------|------------------------------| +| hosts | array | yes | - | +| index | string | yes | - | +| schema_save_mode | string | yes | CREATE_SCHEMA_WHEN_NOT_EXIST | +| data_save_mode | string | yes | APPEND_DATA | +| index_type | string | no | | +| primary_keys | list | no | | +| key_delimiter | string | no | `_` | +| username | string | no | | +| password | string | no | | +| max_retry_count | int | no | 3 | +| max_batch_size | int | no | 10 | +| tls_verify_certificate | boolean | no | true | +| tls_verify_hostnames | boolean | no | true | +| tls_keystore_path | string | no | - | +| tls_keystore_password | string | no | - | +| tls_truststore_path | string | no | - | +| tls_truststore_password | string | no | - | +| common-options | | no | - | + +### hosts [array] + +`Elasticsearch` cluster http address, the format is `host:port` , allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. + +### index [string] + +`Elasticsearch` `index` name.Index support contains variables of field name,such as `seatunnel_${age}`,and the field must appear at seatunnel row. +If not, we will treat it as a normal index. + +### index_type [string] + +`Elasticsearch` index type, it is recommended not to specify in elasticsearch 6 and above + +### primary_keys [list] + +Primary key fields used to generate the document `_id`, this is cdc required options. + +### key_delimiter [string] + +Delimiter for composite keys ("_" by default), e.g., "$" would result in document `_id` "KEY1$KEY2$KEY3". + +### username [string] + +x-pack username + +### password [string] + +x-pack password + +### max_retry_count [int] + +one bulk request max try size + +### max_batch_size [int] + +batch bulk doc max size + +### tls_verify_certificate [boolean] + +Enable certificates validation for HTTPS endpoints + +### tls_verify_hostname [boolean] + +Enable hostname validation for HTTPS endpoints + +### tls_keystore_path [string] + +The path to the PEM or JKS key store. 
This file must be readable by the operating system user running SeaTunnel. + +### tls_keystore_password [string] + +The key password for the key store specified + +### tls_truststore_path [string] + +The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. + +### tls_truststore_password [string] + +The key password for the trust store specified + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +### schema_save_mode + +Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. +Option introduction: +RECREATE_SCHEMA :Will create when the table does not exist, delete and rebuild when the table is saved +CREATE_SCHEMA_WHEN_NOT_EXIST :Will Created when the table does not exist, skipped when the table is saved +ERROR_WHEN_SCHEMA_NOT_EXIST :Error will be reported when the table does not exist + +### data_save_mode + +Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. +Option introduction: +DROP_DATA: Preserve database structure and delete data +APPEND_DATA:Preserve database structure, preserve data +ERROR_WHEN_DATA_EXISTS:When there is data, an error is reported + +## Examples + +Simple + +```bash +sink { + Elasticsearch { + hosts = ["localhost:9200"] + index = "seatunnel-${age}" + } +} +``` + +CDC(Change data capture) event + +```bash +sink { + Elasticsearch { + hosts = ["localhost:9200"] + index = "seatunnel-${age}" + + # cdc required options + primary_keys = ["key1", "key2", ...] + } +} +``` + +SSL (Disable certificates validation) + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_verify_certificate = false + } +} +``` + +SSL (Disable hostname validation) + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_verify_hostname = false + } +} +``` + +SSL (Enable certificates validation) + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" + tls_keystore_password = "${your password}" + } +} +``` + +SAVE_MODE (Add saveMode function) + +```hocon +sink { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode = "APPEND_DATA" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Elasticsearch Sink Connector + +### next version + +- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3673](https://github.com/apache/seatunnel/pull/3673)) +- [Feature] Support https protocol & compatible with opensearch ([3997](https://github.com/apache/seatunnel/pull/3997)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Email.md b/versioned_docs/version-2.3.7/connector-v2/sink/Email.md new file mode 100644 index 000000000000..f2bca2783d4a --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Email.md @@ -0,0 +1,87 @@ +# Email + +> Email sink connector + +## Description + +Send the data as a file to email. + +The tested email version is 1.5.6. 
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|--------------------------|--------|----------|---------------| +| email_from_address | string | yes | - | +| email_to_address | string | yes | - | +| email_host | string | yes | - | +| email_transport_protocol | string | yes | - | +| email_smtp_auth | string | yes | - | +| email_authorization_code | string | yes | - | +| email_message_headline | string | yes | - | +| email_message_content | string | yes | - | +| common-options | | no | - | + +### email_from_address [string] + +Sender Email Address . + +### email_to_address [string] + +Address to receive mail. + +### email_host [string] + +SMTP server to connect to. + +### email_transport_protocol [string] + +The protocol to load the session . + +### email_smtp_auth [string] + +Whether to authenticate the customer. + +### email_authorization_code [string] + +authorization code,You can obtain the authorization code from the mailbox Settings. + +### email_message_headline [string] + +The subject line of the entire message. + +### email_message_content [string] + +The body of the entire message. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +## Example + +```bash + + EmailSink { + email_from_address = "xxxxxx@qq.com" + email_to_address = "xxxxxx@163.com" + email_host="smtp.qq.com" + email_transport_protocol="smtp" + email_smtp_auth="true" + email_authorization_code="" + email_message_headline="" + email_message_content="" + } + +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Email Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Enterprise-WeChat.md b/versioned_docs/version-2.3.7/connector-v2/sink/Enterprise-WeChat.md new file mode 100644 index 000000000000..253c91497f83 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Enterprise-WeChat.md @@ -0,0 +1,75 @@ +# Enterprise WeChat + +> Enterprise WeChat sink connector + +## Description + +A sink plugin which use Enterprise WeChat robot send message + +> For example, if the data from upstream is [`"alarmStatus": "firing", "alarmTime": "2022-08-03 01:38:49","alarmContent": "The disk usage exceeds the threshold"`], the output content to WeChat Robot is the following: +> +> ``` +> alarmStatus: firing +> alarmTime: 2022-08-03 01:38:49 +> alarmContent: The disk usage exceeds the threshold +> ``` +> +> **Tips: WeChat sink only support `string` webhook and the data from source will be treated as body content in web hook.** + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------|--------|----------|---------------| +| url | String | Yes | - | +| mentioned_list | array | No | - | +| mentioned_mobile_list | array | No | - | +| common-options | | no | - | + +### url [string] + +Enterprise WeChat webhook url format is https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXXXXX(string) + +### mentioned_list [array] + +A list of userids to remind the specified members in the group (@ a member), @ all means to remind everyone. 
If the developer can't get the userid, he can use called_ mobile_ list + +### mentioned_mobile_list [array] + +Mobile phone number list, remind the group member corresponding to the mobile phone number (@ a member), @ all means remind everyone + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +simple: + +```hocon +WeChat { + url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=693axxx6-7aoc-4bc4-97a0-0ec2sifa5aaa" + } +``` + +```hocon +WeChat { + url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=693axxx6-7aoc-4bc4-97a0-0ec2sifa5aaa" + mentioned_list=["wangqing","@all"] + mentioned_mobile_list=["13800001111","@all"] + } +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Enterprise-WeChat Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix Enterprise-WeChat Sink data serialization ([2856](https://github.com/apache/seatunnel/pull/2856)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Feishu.md b/versioned_docs/version-2.3.7/connector-v2/sink/Feishu.md new file mode 100644 index 000000000000..b965d8413f0f --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Feishu.md @@ -0,0 +1,66 @@ +# Feishu + +> Feishu sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +## Description + +Used to launch Feishu web hooks using data. + +> For example, if the data from upstream is [`age: 12, name: tyrantlucifer`], the body content is the following: `{"age": 12, "name": "tyrantlucifer"}` + +**Tips: Feishu sink only support `post json` webhook and the data from source will be treated as body content in web hook.** + +## Data Type Mapping + +| Seatunnel Data Type | Feishu Data Type | +|-----------------------------|------------------| +| ROW
MAP | Json | +| NULL | null | +| BOOLEAN | boolean | +| TINYINT | byte | +| SMALLINT | short | +| INT | int | +| BIGINT | long | +| FLOAT | float | +| DOUBLE | double | +| DECIMAL | BigDecimal | +| BYTES | byte[] | +| STRING | String | +| TIME
TIMESTAMP
TIME | String | +| ARRAY | JsonArray | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------| +| url | String | Yes | - | Feishu webhook url | +| headers | Map | No | - | Http request headers | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +## Task Example + +### Simple: + +```hocon +Feishu { + url = "https://www.feishu.cn/flow/api/trigger-webhook/108bb8f208d9b2378c8c7aedad715c19" + } +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Feishu Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/FtpFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/FtpFile.md new file mode 100644 index 000000000000..01c46190afad --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/FtpFile.md @@ -0,0 +1,296 @@ +# FtpFile + +> Ftp file sink connector + +## Description + +Output data to Ftp . + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +::: + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| host | string | yes | - | | +| port | int | yes | - | | +| user | string | yes | - | | +| password | string | yes | - | | +| path | string | yes | - | | +| tmp_path | string | yes | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a FTP dir. | +| connection_mode | string | no | active_local | The target ftp connection mode | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | +| have_partition | boolean | no | false | Whether you need processing partitions. 
| +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### host [string] + +The target ftp host is required + +### port [int] + +The target ftp port is required + +### user [string] + +The target ftp username is required + +### password [string] + +The target ftp password is required + +### path [string] + +The target dir path is required. + +### connection_mode [string] + +The target ftp connection mode , default is active mode, supported as the following modes: + +`active_local` `passive_local` + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. 
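
As an illustration (host, path and field names are placeholders), the settings below group rows by the value of `age`, and each group is written into its own sub-directory of `path`:

```hocon
FtpFile {
  host = "xxx.xxx.xxx.xxx"
  port = 21
  user = "username"
  password = "password"
  path = "/data/ftp/seatunnel/job1"
  file_format_type = "text"
  have_partition = true
  partition_by = ["age"]
  partition_dir_expression = "${k0}=${v0}"
  # Files end up under /data/ftp/seatunnel/job1/age=20/, /data/ftp/seatunnel/job1/age=21/, ...
}
```
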
+ +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be wrote to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name [string] + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. 
+ +## Example + +For text file format simple config + +```bash + +FtpFile { + host = "xxx.xxx.xxx.xxx" + port = 21 + user = "username" + password = "password" + path = "/data/ftp" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + sink_columns = ["name","age"] +} + +``` + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```bash + +FtpFile { + host = "xxx.xxx.xxx.xxx" + port = 21 + user = "username" + password = "password" + path = "/data/ftp/seatunnel/job1" + tmp_path = "/data/ftp/seatunnel/tmp" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + sink_columns = ["name","age"] + filename_time_format = "yyyy.MM.dd" +} + +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Ftp File Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [BugFix] Fix filesystem get error ([3117](https://github.com/apache/seatunnel/pull/3117)) +- [BugFix] Solved the bug of can not parse '\t' as delimiter from config file ([3083](https://github.com/apache/seatunnel/pull/3083)) + +### Next version + +- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) + - When field from upstream is null it will throw NullPointerException + - Sink columns mapping failed + - When restore writer from states getting transaction directly failed +- [Improve] Support setting batch size for every file ([3625](https://github.com/apache/seatunnel/pull/3625)) +- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/GoogleFirestore.md b/versioned_docs/version-2.3.7/connector-v2/sink/GoogleFirestore.md new file mode 100644 index 000000000000..cdf6ce2f0c75 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/GoogleFirestore.md @@ -0,0 +1,52 @@ +# GoogleFirestore + +> Google Firestore sink connector + +## Description + +Write data to Google Firestore + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------|--------|----------|---------------| +| project_id | string | yes | - | +| collection | string | yes | - | +| credentials | string | no | - | + +### project_id [string] + +The unique identifier for a Google Firestore database project. + +### collection [string] + +The collection of Google Firestore. + +### credentials [string] + +The credentials of Google Cloud service account, use base64 codec. If not set, need to check the `GOOGLE APPLICATION CREDENTIALS` environment exists. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. 
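
If `credentials` is left unset, the connector relies on the standard Google credentials environment variable instead; a minimal sketch under that assumption (project and collection names are placeholders):

```hocon
# Assumes GOOGLE_APPLICATION_CREDENTIALS points to a service account key file
# readable by the SeaTunnel process on every node.
sink {
  GoogleFirestore {
    project_id = "dummy-project-id"
    collection = "dummy-collection"
  }
}
```
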
+ +## Example + +```bash +GoogleFirestore { + project_id = "dummy-project-id", + collection = "dummy-collection", + credentials = "dummy-credentials" +} +``` + +## Changelog + +### next version + +- Add Google Firestore Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Greenplum.md b/versioned_docs/version-2.3.7/connector-v2/sink/Greenplum.md new file mode 100644 index 000000000000..6d4622b437d2 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Greenplum.md @@ -0,0 +1,42 @@ +# Greenplum + +> Greenplum sink connector + +## Description + +Write data to Greenplum using [Jdbc connector](Jdbc.md). + +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +:::tip + +Not support exactly-once semantics (XA transaction is not yet supported in Greenplum database). + +::: + +## Options + +### driver [string] + +Optional jdbc drivers: +- `org.postgresql.Driver` +- `com.pivotal.jdbc.GreenplumDriver` + +Warn: for license compliance, if you use `GreenplumDriver` the have to provide Greenplum JDBC driver yourself, e.g. copy greenplum-xxx.jar to $SEATNUNNEL_HOME/lib for Standalone. + +### url [string] + +The URL of the JDBC connection. if you use postgresql driver the value is `jdbc:postgresql://${yous_host}:${yous_port}/${yous_database}`, or you use greenplum driver the value is `jdbc:pivotal:greenplum://${yous_host}:${yous_port};DatabaseName=${yous_database}` + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Greenplum Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Hbase.md b/versioned_docs/version-2.3.7/connector-v2/sink/Hbase.md new file mode 100644 index 000000000000..dd75d21f0bec --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Hbase.md @@ -0,0 +1,140 @@ +# Hbase + +> Hbase sink connector + +## Description + +Output data to Hbase + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|--------------------|---------|----------|-----------------| +| zookeeper_quorum | string | yes | - | +| table | string | yes | - | +| rowkey_column | list | yes | - | +| family_name | config | yes | - | +| rowkey_delimiter | string | no | "" | +| version_column | string | no | - | +| null_mode | string | no | skip | +| wal_write | boolean | yes | false | +| write_buffer_size | string | no | 8 * 1024 * 1024 | +| encoding | string | no | utf8 | +| hbase_extra_config | string | no | - | +| common-options | | no | - | +| ttl | long | no | - | + +### zookeeper_quorum [string] + +The zookeeper cluster host of hbase, example: "hadoop001:2181,hadoop002:2181,hadoop003:2181" + +### table [string] + +The table name you want to write, example: "seatunnel" + +### rowkey_column [list] + +The column name list of row keys, example: ["id", "uuid"] + +### family_name [config] + +The family name mapping of fields. 
For example the row from upstream like the following shown: + +| id | name | age | +|----|---------------|-----| +| 1 | tyrantlucifer | 27 | + +id as the row key and other fields written to the different families, you can assign + +family_name { +name = "info1" +age = "info2" +} + +this means that `name` will be written to the family `info1` and the `age` will be written to the family `info2` + +if you want other fields written to the same family, you can assign + +family_name { +all_columns = "info" +} + +this means that all fields will be written to the family `info` + +### rowkey_delimiter [string] + +The delimiter of joining multi row keys, default `""` + +### version_column [string] + +The version column name, you can use it to assign timestamp for hbase record + +### null_mode [double] + +The mode of writing null value, support [`skip`, `empty`], default `skip` + +- skip: When the field is null, connector will not write this field to hbase +- empty: When the field is null, connector will write generate empty value for this field + +### wal_write [boolean] + +The wal log write flag, default `false` + +### write_buffer_size [int] + +The write buffer size of hbase client, default `8 * 1024 * 1024` + +### encoding [string] + +The encoding of string field, support [`utf8`, `gbk`], default `utf8` + +### hbase_extra_config [config] + +The extra configuration of hbase + +### ttl [long] + +Hbase writes data TTL time, the default is based on the TTL set in the table, unit: milliseconds + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +```hocon + +Hbase { + zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" + table = "seatunnel_test" + rowkey_column = ["name"] + family_name { + all_columns = seatunnel + } +} +``` + +## Writes To The Specified Column Family + +```hocon +Hbase { + zookeeper_quorum = "hbase_e2e:2181" + table = "assign_cf_table" + rowkey_column = ["id"] + family_name { + c_double = "cf1" + c_bigint = "cf2" + } +} +``` + +## Changelog + +### next version + +- Add hbase sink connector ([4049](https://github.com/apache/seatunnel/pull/4049)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/HdfsFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/HdfsFile.md new file mode 100644 index 000000000000..d42aa98967f3 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/HdfsFile.md @@ -0,0 +1,208 @@ +# HdfsFile + +> HDFS File Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary +- [x] compress codec + - [x] lzo + +## Description + +Output data to hdfs file + +## Supported DataSource Info + +| Datasource | Supported Versions | +|------------|--------------------| +| HdfsFile | hadoop 2.x and 3.x | + +## Sink Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|--------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` | +| path | string | yes | - | The target dir path is required. | +| tmp_path | string | yes | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a hdfs path. | +| hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when `custom_filename` is `true`.`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`,`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`.Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when `custom_filename` is `true`.When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows:[y:Year,M:Month,d:Day of month,H:Hour in day (0-23),m:Minute in hour,s:Second in minute] | +| file_format_type | string | no | "csv" | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | +| field_delimiter | string | no | '\001' | Only used when file_format is text,The separator between columns in a row of data. Only needed by `text` file format. | +| row_delimiter | string | no | "\n" | Only used when file_format is text,The separator between rows in a file. Only needed by `text` file format. | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true,Partition data based on selected fields. 
| +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true,If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. | +| is_partition_field_write_in_file | boolean | no | false | Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file.For example, if you want to write a Hive Data File, Its value should be `false`. | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns.Which columns need be write to file, default value is all of the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. | +| is_enable_transaction | boolean | no | true | If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory.Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file.Only support `true` now. | +| batch_size | int | no | 1000000 | The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. | +| compress_codec | string | no | none | The compress codec of files and the details that supported as the following shown:[txt: `lzo` `none`,json: `lzo` `none`,csv: `lzo` `none`,orc: `lzo` `snappy` `lz4` `zlib` `none`,parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`].Tips: excel type does not support any compression format. | +| krb5_path | string | no | /etc/krb5.conf | The krb5 path of kerberos | +| kerberos_principal | string | no | - | The principal of kerberos | +| kerberos_keytab_path | string | no | - | The keytab path of kerberos | +| compress_codec | string | no | none | compress codec | +| common-options | object | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| max_rows_in_memory | int | no | - | Only used when file_format is excel.When File Format is Excel,The maximum number of data items that can be cached in the memory. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel.Writer the sheet of the workbook | +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml, specifies the tag name of the root element within the XML file. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml, specifies the tag name of the data rows within the XML file | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml, specifies Whether to process data using the tag attribute format. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. 
| +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### Tips + +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Hdfs. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format_type = "orc" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### For orc file format simple config + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format_type = "orc" +} +``` + +### For text file format with `have_partition` and `custom_filename` and `sink_columns` + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true +} +``` + +### For parquet file format with `have_partition` and `custom_filename` and `sink_columns` + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + file_format_type = "parquet" + sink_columns = ["name","age"] + is_enable_transaction = true +} +``` + +### For kerberos simple config + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + hdfs_site_path = "/path/to/your/hdfs_site_path" + kerberos_principal = "your_principal@EXAMPLE.COM" + kerberos_keytab_path = "/path/to/your/keytab/file.keytab" +} +``` + +### 
For compress simple config + +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + compress_codec = "lzo" +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Hive.md b/versioned_docs/version-2.3.7/connector-v2/sink/Hive.md new file mode 100644 index 000000000000..e3c62294ee68 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Hive.md @@ -0,0 +1,420 @@ +# Hive + +> Hive sink connector + +## Description + +Write data to Hive. + +:::tip + +In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9. + +If you use SeaTunnel Engine, You need put seatunnel-hadoop3-3.1.4-uber.jar and hive-exec-3.1.3.jar and libfb303-0.9.3.jar in $SEATUNNEL_HOME/lib/ dir. +::: + +## Key features + +- [x] [support multiple table write](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json +- [x] compress codec + - [x] lzo + +## Options + +| name | type | required | default value | +|-------------------------------|---------|----------|----------------| +| table_name | string | yes | - | +| metastore_uri | string | yes | - | +| compress_codec | string | no | none | +| hdfs_site_path | string | no | - | +| hive_site_path | string | no | - | +| hive.hadoop.conf | Map | no | - | +| hive.hadoop.conf-path | string | no | - | +| krb5_path | string | no | /etc/krb5.conf | +| kerberos_principal | string | no | - | +| kerberos_keytab_path | string | no | - | +| abort_drop_partition_metadata | boolean | no | true | +| common-options | | no | - | + +### table_name [string] + +Target Hive table name eg: db1.table1, and if the source is multiple mode, you can use `${database_name}.${table_name}` to generate the table name, it will replace the `${database_name}` and `${table_name}` with the value of the CatalogTable generate from the source. + +### metastore_uri [string] + +Hive metastore uri + +### hdfs_site_path [string] + +The path of `hdfs-site.xml`, used to load ha configuration of namenodes + +### hive_site_path [string] + +The path of `hive-site.xml` + +### hive.hadoop.conf [map] + +Properties in hadoop conf('core-site.xml', 'hdfs-site.xml', 'hive-site.xml') + +### hive.hadoop.conf-path [string] + +The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files + +### krb5_path [string] + +The path of `krb5.conf`, used to authentication kerberos + +The path of `hive-site.xml`, used to authentication hive metastore + +### kerberos_principal [string] + +The principal of kerberos + +### kerberos_keytab_path [string] + +The keytab path of kerberos + +### abort_drop_partition_metadata [list] + +Flag to decide whether to drop partition metadata from Hive Metastore during an abort operation. Note: this only affects the metadata in the metastore, the data in the partition will always be deleted(data generated during the synchronization process). 
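
A sketch of the multi-table form mentioned above for `table_name`, assuming an upstream source that produces several catalog tables (the metastore address is illustrative):

```hocon
sink {
  Hive {
    # Resolved per upstream catalog table, e.g. db1.table1, db1.table2, ...
    table_name = "${database_name}.${table_name}"
    metastore_uri = "thrift://namenode001:9083"
  }
}
```
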
+ +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +```bash + + Hive { + table_name = "default.seatunnel_orc" + metastore_uri = "thrift://namenode001:9083" + } + +``` + +### example 1 + +We have a source table like this: + +```bash +create table test_hive_source( + test_tinyint TINYINT, + test_smallint SMALLINT, + test_int INT, + test_bigint BIGINT, + test_boolean BOOLEAN, + test_float FLOAT, + test_double DOUBLE, + test_string STRING, + test_binary BINARY, + test_timestamp TIMESTAMP, + test_decimal DECIMAL(8,2), + test_char CHAR(64), + test_varchar VARCHAR(64), + test_date DATE, + test_array ARRAY, + test_map MAP, + test_struct STRUCT + ) +PARTITIONED BY (test_par1 STRING, test_par2 STRING); + +``` + +We need read data from the source table and write to another table: + +```bash +create table test_hive_sink_text_simple( + test_tinyint TINYINT, + test_smallint SMALLINT, + test_int INT, + test_bigint BIGINT, + test_boolean BOOLEAN, + test_float FLOAT, + test_double DOUBLE, + test_string STRING, + test_binary BINARY, + test_timestamp TIMESTAMP, + test_decimal DECIMAL(8,2), + test_char CHAR(64), + test_varchar VARCHAR(64), + test_date DATE + ) +PARTITIONED BY (test_par1 STRING, test_par2 STRING); + +``` + +The job config file can like this: + +``` +env { + parallelism = 3 + job.name="test_hive_source_to_hive" +} + +source { + Hive { + table_name = "test_hive.test_hive_source" + metastore_uri = "thrift://ctyun7:9083" + } +} + +sink { + # choose stdout output plugin to output data to console + + Hive { + table_name = "test_hive.test_hive_sink_text_simple" + metastore_uri = "thrift://ctyun7:9083" + hive.hadoop.conf = { + bucket = "s3a://mybucket" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + } +} +``` + +## Hive on s3 + +### Step 1 + +Create the lib dir for hive of emr. + +```shell +mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib +``` + +### Step 2 + +Get the jars from maven center to the lib. + +```shell +cd ${SEATUNNEL_HOME}/plugins/Hive/lib +wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.5/hadoop-aws-2.6.5.jar +wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar +``` + +### Step 3 + +Copy the jars from your environment on emr to the lib dir. + +```shell +cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.60.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +cp /usr/share/aws/emr/hadoop-state-pusher/lib/hadoop-common-3.3.6-amzn-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +cp /usr/share/aws/emr/hadoop-state-pusher/lib/javax.inject-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +cp /usr/share/aws/emr/hadoop-state-pusher/lib/aopalliance-1.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +``` + +### Step 4 + +Run the case. 
+ +```shell +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + score = int + } + primaryKey { + name = "pk_id" + columnNames = [pk_id] + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100] + }, + { + kind = INSERT + fields = [2, "B", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + } + ] + } +} + +sink { + Hive { + table_name = "test_hive.test_hive_sink_on_s3" + metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" + hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" + hive.hadoop.conf = { + bucket="s3://ws-package" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + } + } +} +``` + +## Hive on oss + +### Step 1 + +Create the lib dir for hive of emr. + +```shell +mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib +``` + +### Step 2 + +Get the jars from maven center to the lib. + +```shell +cd ${SEATUNNEL_HOME}/plugins/Hive/lib +wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar +``` + +### Step 3 + +Copy the jars from your environment on emr to the lib dir and delete the conflicting jar. + +```shell +cp -r /opt/apps/JINDOSDK/jindosdk-current/lib/jindo-*.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +rm -f ${SEATUNNEL_HOME}/lib/hadoop-aliyun-*.jar +``` + +### Step 4 + +Run the case. + +```shell +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + score = int + } + primaryKey { + name = "pk_id" + columnNames = [pk_id] + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100] + }, + { + kind = INSERT + fields = [2, "B", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + } + ] + } +} + +sink { + Hive { + table_name = "test_hive.test_hive_sink_on_oss" + metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" + hive.hadoop.conf-path = "/tmp/hadoop" + hive.hadoop.conf = { + bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" + } + } +} +``` + +### example 2 + +We have multiple source table like this: + +```bash +create table test_1( +) +PARTITIONED BY (xx); + +create table test_2( +) +PARTITIONED BY (xx); +... 
+``` + +We need read data from these source tables and write to another tables: + +The job config file can like this: + +``` +env { + # You can set flink configuration here + parallelism = 3 + job.name="test_hive_source_to_hive" +} + +source { + Hive { + tables_configs = [ + { + table_name = "test_hive.test_1" + metastore_uri = "thrift://ctyun6:9083" + }, + { + table_name = "test_hive.test_2" + metastore_uri = "thrift://ctyun7:9083" + } + ] + } +} + +sink { + # choose stdout output plugin to output data to console + Hive { + table_name = "${database_name}.${table_name}" + metastore_uri = "thrift://ctyun7:9083" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Hive Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [Improve] Hive Sink supports automatic partition repair ([3133](https://github.com/apache/seatunnel/pull/3133)) + +### 2.3.0 2022-12-30 + +- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) + - When field from upstream is null it will throw NullPointerException + - Sink columns mapping failed + - When restore writer from states getting transaction directly failed + +### Next version + +- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840)) +- [Improve] Added partition_dir_expression validation logic ([3886](https://github.com/apache/seatunnel/pull/3886)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Http.md b/versioned_docs/version-2.3.7/connector-v2/sink/Http.md new file mode 100644 index 000000000000..59f80514cbde --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Http.md @@ -0,0 +1,134 @@ +# Http + +> Http sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Description + +Used to launch web hooks using data. + +> For example, if the data from upstream is [`age: 12, name: tyrantlucifer`], the body content is the following: `{"age": 12, "name": "tyrantlucifer"}` + +**Tips: Http sink only support `post json` webhook and the data from source will be treated as body content in web hook.** + +## Supported DataSource Info + +In order to use the Http connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|------------------------------------------------------------------------------------------------------------| +| Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-http) | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-----------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------| +| url | String | Yes | - | Http request url | +| headers | Map | No | - | Http headers | +| retry | Int | No | - | The max retry times if request http return to `IOException` | +| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed | +| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | +| connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. | +| socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +## Example + +simple: + +```hocon +Http { + url = "http://localhost/test/webhook" + headers { + token = "9e32e859ef044462a257e1fc76730066" + } +} +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + Http { + ... + url = "http://localhost/test/${database_name}_test/${table_name}_test" + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + Http { + ... + url = "http://localhost/test/${schema_name}_test/${table_name}_test" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Http Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Hudi.md b/versioned_docs/version-2.3.7/connector-v2/sink/Hudi.md new file mode 100644 index 000000000000..406212ca853d --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Hudi.md @@ -0,0 +1,131 @@ +# Hudi + +> Hudi sink connector + +## Description + +Used to write data to Hudi. 
+ +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------------------|--------|----------|---------------| +| table_name | string | yes | - | +| table_dfs_path | string | yes | - | +| conf_files_path | string | no | - | +| record_key_fields | string | no | - | +| partition_fields | string | no | - | +| table_type | enum | no | copy_on_write | +| op_type | enum | no | insert | +| batch_interval_ms | Int | no | 1000 | +| insert_shuffle_parallelism | Int | no | 2 | +| upsert_shuffle_parallelism | Int | no | 2 | +| min_commits_to_keep | Int | no | 20 | +| max_commits_to_keep | Int | no | 30 | +| common-options | config | no | - | + +### table_name [string] + +`table_name` The name of hudi table. + +### table_dfs_path [string] + +`table_dfs_path` The dfs root path of hudi table,such as 'hdfs://nameserivce/data/hudi/hudi_table/'. + +### table_type [enum] + +`table_type` The type of hudi table. The value is 'copy_on_write' or 'merge_on_read'. + +### conf_files_path [string] + +`conf_files_path` The environment conf file path list(local path), which used to init hdfs client to read hudi table file. The example is '/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml'. + +### op_type [enum] + +`op_type` The operation type of hudi table. The value is 'insert' or 'upsert' or 'bulk_insert'. + +### batch_interval_ms [Int] + +`batch_interval_ms` The interval time of batch write to hudi table. + +### insert_shuffle_parallelism [Int] + +`insert_shuffle_parallelism` The parallelism of insert data to hudi table. + +### upsert_shuffle_parallelism [Int] + +`upsert_shuffle_parallelism` The parallelism of upsert data to hudi table. + +### min_commits_to_keep [Int] + +`min_commits_to_keep` The min commits to keep of hudi table. + +### max_commits_to_keep [Int] + +`max_commits_to_keep` The max commits to keep of hudi table. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +## Examples + +```hocon +sink { + Hudi { + table_dfs_path = "hdfs://nameserivce/data/hudi/hudi_table/" + table_name = "test_table" + table_type = "copy_on_write" + conf_files_path = "/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml" + use.kerberos = true + kerberos.principal = "test_user@xxx" + kerberos.principal.file = "/home/test/test_user.keytab" + } +} +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + Hudi { + ... 
+ table_dfs_path = "hdfs://nameserivce/data/hudi/hudi_table/" + table_name = "${table_name}_test" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Hudi Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Iceberg.md b/versioned_docs/version-2.3.7/connector-v2/sink/Iceberg.md new file mode 100644 index 000000000000..721c5ea7c08b --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Iceberg.md @@ -0,0 +1,258 @@ +# Apache Iceberg + +> Apache Iceberg sink connector + +## Support Iceberg Version + +- 1.4.2 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Sink connector for Apache Iceberg. It can support cdc mode 、auto create table and table schema evolution. + +## Key features + +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Supported DataSource Info + +| Datasource | Dependent | Maven | +|------------|-----------|---------------------------------------------------------------------------| +| Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | +| Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | + +## Database Dependency + +> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages. + +``` +hive-exec-xxx.jar +libfb303-xxx.jar +``` + +> Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package. + +## Data Type Mapping + +| SeaTunnel Data type | Iceberg Data type | +|---------------------|-------------------| +| BOOLEAN | BOOLEAN | +| INT | INTEGER | +| BIGINT | LONG | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| STRING | STRING | +| BYTES | FIXED
BINARY | +| DECIMAL | DECIMAL | +| ROW | STRUCT | +| ARRAY | LIST | +| MAP | MAP | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------------------------|---------|----------|------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| catalog_name | string | yes | default | User-specified catalog name. default is `default` | +| namespace | string | yes | default | The iceberg database name in the backend catalog. default is `default` | +| table | string | yes | - | The iceberg table name in the backend catalog. | +| iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file:"https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java" | +| hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | +| iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | +| case_sensitive | boolean | no | false | If data columns where selected via schema [config], controls whether the match to the schema will be done with case sensitivity. | +| iceberg.table.write-props | map | no | - | Properties passed through to Iceberg writer initialization, these take precedence, such as 'write.format.default', 'write.target-file-size-bytes', and other settings, can be found with specific parameters at 'https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/TableProperties.java'. | +| iceberg.table.auto-create-props | map | no | - | Configuration specified by Iceberg during automatic table creation. 
| +| iceberg.table.schema-evolution-enabled | boolean | no | false | Setting to true enables Iceberg tables to support schema evolution during the synchronization process | +| iceberg.table.primary-keys | string | no | - | Default comma-separated list of columns that identify a row in tables (primary key) | +| iceberg.table.partition-keys | string | no | - | Default comma-separated list of partition fields to use when creating tables | +| iceberg.table.upsert-mode-enabled | boolean | no | false | Set to `true` to enable upsert mode, default is `false` | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | the schema save mode, please refer to `schema_save_mode` below | +| data_save_mode | Enum | no | APPEND_DATA | the data save mode, please refer to `data_save_mode` below | +| iceberg.table.commit-branch | string | no | - | Default branch for commits | + +## Task Example + +### Simple: + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MySQL-CDC { + result_table_name = "customers_mysql_cdc_iceberg" + server-id = 5652 + username = "st_user" + password = "seatunnel" + table-names = ["mysql_cdc.mysql_cdc_e2e_source_table"] + base-url = "jdbc:mysql://mysql_cdc_e2e:3306/mysql_cdc" + } +} + +transform { +} + +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="file:///tmp/seatunnel/iceberg/hadoop-sink/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=536870912 + } + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} +``` + +### Hive Catalog: + +```hocon +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + type = "hive" + uri = "thrift://localhost:9083" + warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=536870912 + } + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} +``` + +### Hadoop catalog: + +```hocon +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + type = "hadoop" + warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=536870912 + } + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} + +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + Iceberg { + ... 
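+    # ${database_name} and ${table_name} are replaced with the database and table name of each upstream table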
+ namespace = "${database_name}_test" + table = "${table_name}_test" + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + Iceberg { + ... + namespace = "${schema_name}_test" + table = "${table_name}_test" + } +} +``` + +## Changelog + +### 2.3.4-SNAPSHOT 2024-01-18 + +- Add Iceberg Sink Connector + +### next version + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/InfluxDB.md b/versioned_docs/version-2.3.7/connector-v2/sink/InfluxDB.md new file mode 100644 index 000000000000..e899840b0fa2 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/InfluxDB.md @@ -0,0 +1,142 @@ +# InfluxDB + +> InfluxDB sink connector + +## Description + +Write data to InfluxDB. + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|--------|----------|------------------------------| +| url | string | yes | - | +| database | string | yes | | +| measurement | string | yes | | +| username | string | no | - | +| password | string | no | - | +| key_time | string | no | processing time | +| key_tags | array | no | exclude `field` & `key_time` | +| batch_size | int | no | 1024 | +| max_retries | int | no | - | +| retry_backoff_multiplier_ms | int | no | - | +| connect_timeout_ms | long | no | 15000 | +| common-options | config | no | - | + +### url + +the url to connect to influxDB e.g. + +``` +http://influxdb-host:8086 +``` + +### database [string] + +The name of `influxDB` database + +### measurement [string] + +The name of `influxDB` measurement + +### username [string] + +`influxDB` user username + +### password [string] + +`influxDB` user password + +### key_time [string] + +Specify field-name of the `influxDB` measurement timestamp in SeaTunnelRow. If not specified, use processing-time as timestamp + +### key_tags [array] + +Specify field-name of the `influxDB` measurement tags in SeaTunnelRow. 
+If not specified, include all fields with `influxDB` measurement field + +### batch_size [int] + +For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `checkpoint.interval`, the data will be flushed into the influxDB + +### max_retries [int] + +The number of retries to flush failed + +### retry_backoff_multiplier_ms [int] + +Using as a multiplier for generating the next delay for backoff + +### max_retry_backoff_ms [int] + +The amount of time to wait before attempting to retry a request to `influxDB` + +### connect_timeout_ms [long] + +the timeout for connecting to InfluxDB, in milliseconds + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Examples + +```hocon +sink { + InfluxDB { + url = "http://influxdb-host:8086" + database = "test" + measurement = "sink" + key_time = "time" + key_tags = ["label"] + batch_size = 1 + } +} + +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + InfluxDB { + url = "http://influxdb-host:8086" + database = "test" + measurement = "${table_name}_test" + } +} +``` + +## Changelog + +### next version + +- Add InfluxDB Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/IoTDB.md b/versioned_docs/version-2.3.7/connector-v2/sink/IoTDB.md new file mode 100644 index 000000000000..9cbcd68b8a40 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/IoTDB.md @@ -0,0 +1,221 @@ +# IoTDB + +> IoTDB sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Used to write data to IoTDB. + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.apache.iotdb/iotdb-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.apache.iotdb/iotdb-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +IoTDB supports the `exactly-once` feature through idempotent writing. If two pieces of data have +the same `key` and `timestamp`, the new data will overwrite the old one. + +:::tip + +There is a conflict of thrift version between IoTDB and Spark.Therefore, you need to execute `rm -f $SPARK_HOME/jars/libthrift*` and `cp $IOTDB_HOME/lib/libthrift* $SPARK_HOME/jars/` to resolve it. + +::: + +## Supported DataSource Info + +| Datasource | Supported Versions | Url | +|------------|--------------------|----------------| +| IoTDB | `>= 0.13.0` | localhost:6667 | + +## Data Type Mapping + +| IotDB Data Type | SeaTunnel Data Type | +|-----------------|---------------------| +| BOOLEAN | BOOLEAN | +| INT32 | TINYINT | +| INT32 | SMALLINT | +| INT32 | INT | +| INT64 | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| TEXT | STRING | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-----------------------------|---------|----------|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| node_urls | String | Yes | - | `IoTDB` cluster address, the format is `"host1:port"` or `"host1:port,host2:port"` | +| username | String | Yes | - | `IoTDB` user username | +| password | String | Yes | - | `IoTDB` user password | +| key_device | String | Yes | - | Specify field name of the `IoTDB` deviceId in SeaTunnelRow | +| key_timestamp | String | No | processing time | Specify field-name of the `IoTDB` timestamp in SeaTunnelRow. If not specified, use processing-time as timestamp | +| key_measurement_fields | Array | No | exclude `device` & `timestamp` | Specify field-name of the `IoTDB` measurement list in SeaTunnelRow. If not specified, include all fields but exclude `device` & `timestamp` | +| storage_group | Array | No | - | Specify device storage group(path prefix)
example: deviceId = ${storage_group} + "." + ${key_device} | +| batch_size | Integer | No | 1024 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the IoTDB | +| max_retries | Integer | No | - | The number of retries to flush failed | +| retry_backoff_multiplier_ms | Integer | No | - | Using as a multiplier for generating the next delay for backoff | +| max_retry_backoff_ms | Integer | No | - | The amount of time to wait before attempting to retry a request to `IoTDB` | +| default_thrift_buffer_size | Integer | No | - | Thrift init buffer size in `IoTDB` client | +| max_thrift_frame_size | Integer | No | - | Thrift max frame size in `IoTDB` client | +| zone_id | string | No | - | java.time.ZoneId in `IoTDB` client | +| enable_rpc_compression | Boolean | No | - | Enable rpc compression in `IoTDB` client | +| connection_timeout_in_ms | Integer | No | - | The maximum time (in ms) to wait when connecting to `IoTDB` | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +## Examples + +```hocon +env { + parallelism = 2 + job.mode = "BATCH" +} + +source { + FakeSource { + row.num = 16 + bigint.template = [1664035200001] + schema = { + fields { + device_name = "string" + temperature = "float" + moisture = "int" + event_ts = "bigint" + c_string = "string" + c_boolean = "boolean" + c_tinyint = "tinyint" + c_smallint = "smallint" + c_int = "int" + c_bigint = "bigint" + c_float = "float" + c_double = "double" + } + } + } +} +``` + +Upstream SeaTunnelRow data format is the following: + +| device_name | temperature | moisture | event_ts | c_string | c_boolean | c_tinyint | c_smallint | c_int | c_bigint | c_float | c_double | +|--------------------------|-------------|----------|---------------|----------|-----------|-----------|------------|-------|------------|---------|----------| +| root.test_group.device_a | 36.1 | 100 | 1664035200001 | abc1 | true | 1 | 1 | 1 | 2147483648 | 1.0 | 1.0 | +| root.test_group.device_b | 36.2 | 101 | 1664035200001 | abc2 | false | 2 | 2 | 2 | 2147483649 | 2.0 | 2.0 | +| root.test_group.device_c | 36.3 | 102 | 1664035200001 | abc3 | false | 3 | 3 | 3 | 2147483649 | 3.0 | 3.0 | + +### Case1 + +only fill required config. +use current processing time as timestamp. 
and include all fields but exclude `device` & `timestamp` as measurement fields + +```hocon +sink { + IoTDB { + node_urls = "localhost:6667" + username = "root" + password = "root" + key_device = "device_name" # specify the `deviceId` use device_name field + } +} +``` + +Output to `IoTDB` data format is the following: + +```shell +IoTDB> SELECT * FROM root.test_group.* align by device; ++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ +| Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| ++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ +|2023-09-01T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| +|2023-09-01T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| +|2023-09-01T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| ++------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ +``` + +### Case2 + +use source event's time + +```hocon +sink { + IoTDB { + node_urls = "localhost:6667" + username = "root" + password = "root" + key_device = "device_name" # specify the `deviceId` use device_name field + key_timestamp = "event_ts" # specify the `timestamp` use event_ts field + } +} +``` + +Output to `IoTDB` data format is the following: + +```shell +IoTDB> SELECT * FROM root.test_group.* align by device; ++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ +| Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| ++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ +|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| +|2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| +|2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| ++------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ +``` + +### Case3 + +use source event's time and limit measurement fields + +```hocon +sink { + IoTDB { + node_urls = "localhost:6667" + username = "root" + password = "root" + key_device = "device_name" + key_timestamp = "event_ts" + key_measurement_fields = ["temperature", "moisture"] + } +} +``` + +Output to `IoTDB` data format is the following: + +```shell +IoTDB> SELECT * FROM root.test_group.* align by device; ++------------------------+------------------------+--------------+-----------+ +| Time| Device| temperature| moisture| ++------------------------+------------------------+--------------+-----------+ +|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 
100| +|2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| +|2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| ++------------------------+------------------------+--------------+-----------+ +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add IoTDB Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [Improve] Improve IoTDB Sink Connector ([2917](https://github.com/apache/seatunnel/pull/2917)) + - Support align by sql syntax + - Support sql split ignore case + - Support restore split offset to at-least-once + - Support read timestamp from RowRecord +- [BugFix] Fix IoTDB connector sink NPE ([3080](https://github.com/apache/seatunnel/pull/3080)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Jdbc.md b/versioned_docs/version-2.3.7/connector-v2/sink/Jdbc.md new file mode 100644 index 000000000000..c46933b486ba --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Jdbc.md @@ -0,0 +1,456 @@ +# JDBC + +> JDBC sink connector + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the jdbc driver jar package has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the jdbc driver jar package has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +- [x] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Options + +| Name | Type | Required | Default | +|-------------------------------------------|---------|----------|------------------------------| +| url | String | Yes | - | +| driver | String | Yes | - | +| user | String | No | - | +| password | String | No | - | +| query | String | No | - | +| compatible_mode | String | No | - | +| database | String | No | - | +| table | String | No | - | +| primary_keys | Array | No | - | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | +| connection_check_timeout_sec | Int | No | 30 | +| max_retries | Int | No | 0 | +| batch_size | Int | No | 1000 | +| is_exactly_once | Boolean | No | false | +| generate_sink_sql | Boolean | No | false | +| xa_data_source_class_name | String | No | - | +| max_commit_attempts | Int | No | 3 | +| transaction_timeout_sec | Int | No | -1 | +| auto_commit | Boolean | No | true | +| field_ide | String | No | - | +| properties | Map | No | - | +| common-options | | No | - | +| schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | +| data_save_mode | Enum | No | APPEND_DATA | +| custom_sql | String | No | - | +| enable_upsert | Boolean | No | true | +| use_copy_statement | Boolean | No | false | +| create_index | Boolean | No | true | + +### driver [string] + +The jdbc class name used to connect to the remote data source, if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. + +### user [string] + +userName + +### password [string] + +password + +### url [string] + +The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost/test + +### query [string] + +Use this sql write upstream input datas to database. 
e.g `INSERT ...` + +### compatible_mode [string] + +The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'. + +Postgres 9.5 version or below,please set it to `postgresLow` to support cdc + +### database [string] + +Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database. + +This option is mutually exclusive with `query` and has a higher priority. + +### table [string] + +Use `database` and this `table-name` auto-generate sql and receive upstream input datas write to database. + +This option is mutually exclusive with `query` and has a higher priority. + +The table parameter can fill in the name of an unwilling table, which will eventually be used as the table name of the creation table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. + +mysql sink for example: +1. test_${schema_name}_${table_name}_test +2. sink_sinktable +3. ss_${table_name} + +pgsql (Oracle Sqlserver ...) Sink for example: +1. ${schema_name}.${table_name} _test +2. dbo.tt_${table_name} _sink +3. public.sink_table + +Tip: If the target database has the concept of SCHEMA, the table parameter must be written as `xxx.xxx` + +### primary_keys [array] + +This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. + +### support_upsert_by_query_primary_key_exist [boolean] + +Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupported upsert syntax. +**Note**: that this method has low performance + +### connection_check_timeout_sec [int] + +The time in seconds to wait for the database operation used to validate the connection to complete. + +### max_retries[int] + +The number of retries to submit failed (executeBatch) + +### batch_size[int] + +For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` +, the data will be flushed into the database + +### is_exactly_once[boolean] + +Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to +set `xa_data_source_class_name`. + +### generate_sink_sql[boolean] + +Generate sql statements based on the database table you want to write to + +### xa_data_source_class_name[string] + +The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and +please refer to appendix for other data sources + +### max_commit_attempts[int] + +The number of retries for transaction commit failures + +### transaction_timeout_sec[int] + +The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect +exactly-once semantics + +### auto_commit [boolean] + +Automatic transaction commit is enabled by default + +### field_ide [String] + +The field "field_ide" is used to identify whether the field needs to be converted to uppercase or lowercase when +synchronizing from the source to the sink. "ORIGINAL" indicates no conversion is needed, "UPPERCASE" indicates +conversion to uppercase, and "LOWERCASE" indicates conversion to lowercase. 
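+
+A minimal sketch of writing to a target table whose column names are lowercase, reusing the placeholder connection settings from the examples below:
+
+```
+sink {
+    jdbc {
+        url = "jdbc:mysql://localhost:3306"
+        driver = "com.mysql.cj.jdbc.Driver"
+        user = "root"
+        password = "123456"
+        generate_sink_sql = true
+        database = "sink_database"
+        table = "sink_table"
+        # convert the incoming field names to lowercase before generating the sink sql
+        field_ide = "LOWERCASE"
+    }
+}
+```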
+ +### properties + +Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +### schema_save_mode [Enum] + +Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. +Option introduction: +`RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved +`CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved +`ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist + +### data_save_mode [Enum] + +Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. +Option introduction: +`DROP_DATA`: Preserve database structure and delete data +`APPEND_DATA`:Preserve database structure, preserve data +`CUSTOM_PROCESSING`:User defined processing +`ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported + +### custom_sql [String] + +When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. + +### enable_upsert [boolean] + +Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import + +### use_copy_statement [boolean] + +Use `COPY ${table} FROM STDIN` statement to import data. Only drivers with `getCopyAPI()` method connections are supported. e.g.: Postgresql driver `org.postgresql.Driver`. + +NOTICE: `MAP`, `ARRAY`, `ROW` types are not supported. + +### create_index [boolean] + +Create the index(contains primary key and any other indexes) or not when auto-create table. You can use this option to improve the performance of jdbc writes when migrating large tables. + +Notice: Note that this will sacrifice read performance, so you'll need to manually create indexes after the table migration to improve read performance + +## tips + +In the case of is_exactly_once = "true", Xa transactions are used. This requires database support, and some databases require some setup : +1 postgres needs to set `max_prepared_transactions > 1` such as `ALTER SYSTEM set max_prepared_transactions to 10`. +2 mysql version need >= `8.0.29` and Non-root users need to grant `XA_RECOVER_ADMIN` permissions. such as `grant XA_RECOVER_ADMIN on test_db.* to 'user1'@'%'`. +3 mysql can try to add `rewriteBatchedStatements=true` parameter in url for better performance. + +## appendix + +there are some reference value for params above. 
+ +| datasource | driver | url | xa_data_source_class_name | maven | +|-------------------|----------------------------------------------|--------------------------------------------------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| MySQL | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | com.mysql.cj.jdbc.MysqlXADataSource | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| PostgreSQL | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | org.postgresql.xa.PGXADataSource | https://mvnrepository.com/artifact/org.postgresql/postgresql | +| DM | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | dm.jdbc.driver.DmdbXADataSource | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | +| Phoenix | org.apache.phoenix.queryserver.client.Driver | jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF | / | https://mvnrepository.com/artifact/com.aliyun.phoenix/ali-phoenix-shaded-thin-client | +| SQL Server | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | com.microsoft.sqlserver.jdbc.SQLServerXADataSource | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | +| Oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | oracle.jdbc.xa.OracleXADataSource | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | +| sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | / | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc | +| GBase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | / | https://www.gbase8.cn/wp-content/uploads/2020/10/gbase-connector-java-8.3.81.53-build55.5.7-bin_min_mix.jar | +| StarRocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| db2 | com.ibm.db2.jcc.DB2Driver | jdbc:db2://localhost:50000/testdb | com.ibm.db2.jcc.DB2XADataSource | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 | +| saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | / | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc | +| Doris | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| teradata | com.teradata.jdbc.TeraDriver | jdbc:teradata://localhost/DBS_PORT=1025,DATABASE=test | / | https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc | +| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb | com.amazon.redshift.xa.RedshiftXADataSource | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | +| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | +| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | +| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | / | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | +| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar | +| xugu | com.xugu.cloudjdbc.Driver | 
jdbc:xugu://localhost:5138 | / | https://repo1.maven.org/maven2/com/xugudb/xugu-jdbc/12.2.0/xugu-jdbc-12.2.0.jar | +| InterSystems IRIS | com.intersystems.jdbc.IRISDriver | jdbc:IRIS://localhost:1972/%SYS | / | https://raw.githubusercontent.com/intersystems-community/iris-driver-distribution/main/JDBC/JDK18/intersystems-jdbc-3.8.4.jar | + +## Example + +Simple + +``` +jdbc { + url = "jdbc:mysql://localhost:3306/test" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" +} + +``` + +Exactly-once + +Turn on exact one-time semantics by setting `is_exactly_once` + +``` +jdbc { + + url = "jdbc:mysql://localhost:3306/test" + driver = "com.mysql.cj.jdbc.Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" +} +``` + +CDC(Change data capture) event + +jdbc receive CDC example + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + database = "sink_database" + table = "sink_table" + primary_keys = ["key1", "key2", ...] + } +} +``` + +Add saveMode function + +To facilitate the creation of tables when they do not already exist, set the `schema_save_mode` to `CREATE_SCHEMA_WHEN_NOT_EXIST`. + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + generate_sink_sql = "true" + database = "sink_database" + table = "sink_table" + primary_keys = ["key1", "key2", ...] + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + +Postgresql 9.5 version below support CDC(Change data capture) event + +For PostgreSQL versions 9.5 and below, setting `compatible_mode` to `postgresLow` to enable support for PostgreSQL Change Data Capture (CDC) operations. + +``` +sink { + jdbc { + url = "jdbc:postgresql://localhost:5432" + driver = "org.postgresql.Driver" + user = "root" + password = "123456" + compatible_mode="postgresLow" + database = "sink_database" + table = "sink_table" + support_upsert_by_query_primary_key_exist = true + generate_sink_sql = true + primary_keys = ["key1", "key2", ...] 
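+    # change events are matched to existing rows by these primary keys (used together with support_upsert_by_query_primary_key_exist above)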
+ } +} + +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + generate_sink_sql = true + + database = "${database_name}_test" + table = "${table_name}_test" + primary_keys = ["${primary_key}"] + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + generate_sink_sql = true + + database = "${schema_name}_test" + table = "${table_name}_test" + primary_keys = ["${primary_key}"] + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Console Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix JDBC split exception ([2904](https://github.com/apache/seatunnel/pull/2904)) +- [Feature] Support Phoenix JDBC Sink ([2499](https://github.com/apache/seatunnel/pull/2499)) +- [Feature] Support SQL Server JDBC Sink ([2646](https://github.com/apache/seatunnel/pull/2646)) +- [Feature] Support Oracle JDBC Sink ([2550](https://github.com/apache/seatunnel/pull/2550)) +- [Feature] Support StarRocks JDBC Sink ([3060](https://github.com/apache/seatunnel/pull/3060)) +- [Feature] Support DB2 JDBC Sink ([2410](https://github.com/apache/seatunnel/pull/2410)) + +### next version + +- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3378](https://github.com/apache/seatunnel/issues/3378)) +- [Feature] Support Teradata JDBC Sink ([3362](https://github.com/apache/seatunnel/pull/3362)) +- [Feature] Support Sqlite JDBC Sink ([3089](https://github.com/apache/seatunnel/pull/3089)) +- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3378](https://github.com/apache/seatunnel/issues/3378)) +- [Feature] Support Doris JDBC Sink +- [Feature] Support Redshift JDBC Sink([#3615](https://github.com/apache/seatunnel/pull/3615)) +- [Improve] Add config item enable upsert by query([#3708](https://github.com/apache/seatunnel/pull/3708)) +- [Improve] Add database field to sink config([#4199](https://github.com/apache/seatunnel/pull/4199)) +- [Improve] Add Vertica connector([#4303](https://github.com/apache/seatunnel/pull/4303)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Kafka.md b/versioned_docs/version-2.3.7/connector-v2/sink/Kafka.md new file mode 100644 index 000000000000..4a98c7d6a7c3 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Kafka.md @@ -0,0 +1,215 @@ +# Kafka + +> Kafka sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta<br/>
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> By default, we will use 2pc to guarantee the message is sent to kafka exactly once. + +## Description + +Write Rows to a Kafka topic. + +## Supported DataSource Info + +In order to use the Kafka connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Maven | +|------------|--------------------|-------------------------------------------------------------------------------------------------------------| +| Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | String | Yes | - | When the table is used as sink, the topic name is the topic to write data to. | +| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. | +| kafka.config | Map | No | - | In addition to the above parameters that must be specified by the `Kafka producer` client, the user can also specify multiple non-mandatory parameters for the `producer` client, covering [all the producer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#producerconfigs). | +| semantics | String | No | NON | Semantics that can be chosen EXACTLY_ONCE/AT_LEAST_ONCE/NON, default NON. | +| partition_key_fields | Array | No | - | Configure which fields are used as the key of the kafka message. | +| partition | Int | No | - | We can specify the partition, all messages will be sent to this partition. | +| assign_partitions | Array | No | - | We can decide which partition to send based on the content of the message. The function of this parameter is to distribute information. | +| transaction_prefix | String | No | - | If semantic is specified as EXACTLY_ONCE, the producer will write all messages in a Kafka transaction,kafka distinguishes different transactions by different transactionId. This parameter is prefix of kafka transactionId, make sure different job use different prefix. | +| format | String | No | json | Data format. The default format is json. Optional text format, canal_json, debezium_json, ogg_json and avro.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Parameter Interpretation + +### Topic Formats + +Currently two formats are supported: + +1. 
Fill in the name of the topic. + +2. Use value of a field from upstream data as topic,the format is `${your field name}`, where topic is the value of one of the columns of the upstream data. + + For example, Upstream data is the following: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +If `${name}` is set as the topic. So the first row is sent to Jack topic, and the second row is sent to Mary topic. + +### Semantics + +In EXACTLY_ONCE, producer will write all messages in a Kafka transaction that will be committed to Kafka on a checkpoint. +In AT_LEAST_ONCE, producer will wait for all outstanding messages in the Kafka buffers to be acknowledged by the Kafka producer on a checkpoint. +NON does not provide any guarantees: messages may be lost in case of issues on the Kafka broker and messages may be duplicated. + +### Partition Key Fields + +For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. + +Upstream data is the following: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. +If not set partition key fields, the null message key will be sent to. +The format of the message key is json, If name is set as the key, for example '{"name":"Jack"}'. +The selected field must be an existing field in the upstream. + +### Assign Partitions + +For example, there are five partitions in total, and the assign_partitions field in config is as follows: +assign_partitions = ["shoe", "clothing"] +Then the message containing "shoe" will be sent to partition zero ,because "shoe" is subscribed as zero in assign_partitions, and the message containing "clothing" will be sent to partition one.For other messages, the hash algorithm will be used to divide them into the remaining partitions. +This function by `MessageContentPartitioner` class implements `org.apache.kafka.clients.producer.Partitioner` interface.If we need custom partitions, we need to implement this interface as well. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Kafka Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target topic is test_topic will also be 16 rows of data in the topic. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. 
+ +```hocon +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + kafka { + topic = "test_topic" + bootstrap.servers = "localhost:9092" + format = json + kafka.request.timeout.ms = 60000 + semantics = EXACTLY_ONCE + kafka.config = { + acks = "all" + request.timeout.ms = 60000 + buffer.memory = 33554432 + } + } +} +``` + +### AWS MSK SASL/SCRAM + +Replace the following `${username}` and `${password}` with the configuration values in AWS MSK. + +```hocon +sink { + kafka { + topic = "seatunnel" + bootstrap.servers = "localhost:9092" + format = json + kafka.request.timeout.ms = 60000 + semantics = EXACTLY_ONCE + kafka.config = { + security.protocol=SASL_SSL + sasl.mechanism=SCRAM-SHA-512 + sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" + } + } +} +``` + +### AWS MSK IAM + +Download `aws-msk-iam-auth-1.1.5.jar` from https://github.com/aws/aws-msk-iam-auth/releases and put it in `$SEATUNNEL_HOME/plugin/kafka/lib` dir. + +Please ensure the IAM policy have `"kafka-cluster:Connect",`. Like this: + +```hocon +"Effect": "Allow", +"Action": [ + "kafka-cluster:Connect", + "kafka-cluster:AlterCluster", + "kafka-cluster:DescribeCluster" +], +``` + +Sink Config + +```hocon +sink { + kafka { + topic = "seatunnel" + bootstrap.servers = "localhost:9092" + format = json + kafka.request.timeout.ms = 60000 + semantics = EXACTLY_ONCE + kafka.config = { + security.protocol=SASL_SSL + sasl.mechanism=AWS_MSK_IAM + sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" + sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" + } + } +} +``` + +### Kerberos Authentication Example + +Sink Config + +``` +sink { + Kafka { + topic = "seatunnel" + bootstrap.servers = "127.0.0.1:9092" + format = json + semantics = EXACTLY_ONCE + kafka.config = { + security.protocol=SASL_PLAINTEXT + sasl.kerberos.service.name=kafka + sasl.mechanism=GSSAPI + java.security.krb5.conf="/etc/krb5.conf" + sasl.jaas.config="com.sun.security.auth.module.Krb5LoginModule required \n useKeyTab=true \n storeKey=true \n keyTab=\"/path/to/xxx.keytab\" \n principal=\"user@xxx.com\";" + } + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Kingbase.md b/versioned_docs/version-2.3.7/connector-v2/sink/Kingbase.md new file mode 100644 index 000000000000..361ca9a728dd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Kingbase.md @@ -0,0 +1,168 @@ +# Kingbase + +> JDBC Kingbase Sink Connector + +## Support Connector Version + +- 8.6 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+
+## Description
+
+> Use `Xa transactions` to ensure `exactly-once`, so `exactly-once` is only supported for databases that
+> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. Kingbase currently does not support this.
+
+## Supported DataSource Info
+
+| Datasource | Supported versions | Driver               | Url                                      | Maven                                                                                          |
+|------------|--------------------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------|
+| Kingbase   | 8.6                | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) |
+
+## Database Dependency
+
+> Please download the driver listed under 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/'
+> working directory.
+> For example: cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| Kingbase Data Type                        | SeaTunnel Data Type                                                                                                                 |
+|-------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
+| BOOL                                      | BOOLEAN                                                                                                                                 |
+| INT2                                      | SHORT                                                                                                                                   |
+| SMALLSERIAL<br/>SERIAL<br/>INT4           | INT                                                                                                                                     |
+| INT8<br/>BIGSERIAL                        | BIGINT                                                                                                                                  |
+| FLOAT4                                    | FLOAT                                                                                                                                   |
+| FLOAT8                                    | DOUBLE                                                                                                                                  |
+| NUMERIC                                   | DECIMAL(the designated column's specified column size, the designated column's number of digits to the right of the decimal point)     |
+| BPCHAR<br/>CHARACTER<br/>VARCHAR<br/>TEXT | STRING                                                                                                                                  |
+| TIMESTAMP                                 | LOCALDATETIME                                                                                                                           |
+| TIME                                      | LOCALTIME                                                                                                                               |
+| DATE                                      | LOCALDATE                                                                                                                               |
+| Other data type                           | Not supported yet                                                                                                                       |
+
+## Sink Options
+
+| Name     | Type   | Required | Default | Description                                                                                                        |
+|----------|--------|----------|---------|------------------------------------------------------------------------------------------------------------------------|
+| url      | String | Yes      | -       | The URL of the JDBC connection. Refer to a case: jdbc:kingbase8://127.0.0.1:54321/dbname                           |
+| driver   | String | Yes      | -       | The jdbc class name used to connect to the remote data source; for Kingbase the value is `com.kingbase8.Driver`.   |
+| user     | String | No       | -       | Connection instance user name                                                                                       |
+| password | String | No       | -       | Connection instance password                                                                                        |
+| query    | String | No       | -       | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority       |
+| database | String | No       | -       | Use this `database` and `table-name` to auto-generate SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. Kingbase currently does not support | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver,Kingbase currently does not support | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed +> in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends +> it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having 12 fields. The final target table is test_table will also be 16 rows of data in the table. +> Before +> run this job, you need create database test and table test_table in your Kingbase. And if you have not yet installed and +> deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) +> to +> install and deploy SeaTunnel. And then follow the instructions +> in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_date = date + c_time = time + c_timestamp = timestamp + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:kingbase8://127.0.0.1:54321/dbname" + driver = "com.kingbase8.Driver" + user = "root" + password = "123456" + query = "insert into test_table(c_string,c_boolean,c_tinyint,c_smallint,c_int,c_bigint,c_float,c_double,c_decimal,c_date,c_time,c_timestamp) values(?,?,?,?,?,?,?,?,?,?,?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically +> generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:kingbase8://127.0.0.1:54321/dbname" + driver = "com.kingbase8.Driver" + user = "root" + password = "123456" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Kudu.md b/versioned_docs/version-2.3.7/connector-v2/sink/Kudu.md new file mode 100644 index 000000000000..aea1a917fb19 --- 
/dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Kudu.md @@ -0,0 +1,209 @@ +# Kudu + +> Kudu sink connector + +## Support Kudu Version + +- 1.11.1/1.12.0/1.13.0/1.14.0/1.15.0 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Data Type Mapping + +| SeaTunnel Data Type | Kudu Data Type | +|---------------------|--------------------------| +| BOOLEAN | BOOL | +| INT | INT8
INT16
INT32 |
+| BIGINT              | INT64           |
+| DECIMAL             | DECIMAL         |
+| FLOAT               | FLOAT           |
+| DOUBLE              | DOUBLE          |
+| STRING              | STRING          |
+| TIMESTAMP           | UNIXTIME_MICROS |
+| BYTES               | BINARY          |
+
+## Sink Options
+
+| Name                                       | Type   | Required | Default                                        | Description                                                                                                                                                 |
+|--------------------------------------------|--------|----------|------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|
+| kudu_masters                               | String | Yes      | -                                              | Kudu master addresses, separated by ',', such as '192.168.88.110:7051'.                                                                                    |
+| table_name                                 | String | Yes      | -                                              | The name of the kudu table.                                                                                                                                 |
+| client_worker_count                        | Int    | No       | 2 * Runtime.getRuntime().availableProcessors() | Kudu worker count. The default value is twice the current number of cpu cores.                                                                             |
+| client_default_operation_timeout_ms        | Long   | No       | 30000                                          | Kudu normal operation timeout.                                                                                                                              |
+| client_default_admin_operation_timeout_ms  | Long   | No       | 30000                                          | Kudu admin operation timeout.                                                                                                                               |
+| enable_kerberos                            | Bool   | No       | false                                          | Whether to enable Kerberos authentication.                                                                                                                  |
+| kerberos_principal                         | String | No       | -                                              | Kerberos principal. Note that all zeta nodes require this configuration.                                                                                   |
+| kerberos_keytab                            | String | No       | -                                              | Kerberos keytab. Note that all zeta nodes require this file.                                                                                                |
+| kerberos_krb5conf                          | String | No       | -                                              | Kerberos krb5 conf. Note that all zeta nodes require this file.                                                                                             |
+| save_mode                                  | String | No       | -                                              | Storage mode, supports `overwrite` and `append`.                                                                                                            |
+| session_flush_mode                         | String | No       | AUTO_FLUSH_SYNC                                | Kudu flush mode. Default AUTO_FLUSH_SYNC.                                                                                                                   |
+| batch_size                                 | Int    | No       | 1024                                           | The flush max size (includes all append, upsert and delete records); once this many records are buffered, data will be flushed. The default value is 1024. |
+| buffer_flush_interval                      | Int    | No       | 10000                                          | The flush interval in milliseconds; after this time, asynchronous threads will flush data.                                                                 |
+| ignore_not_found                           | Bool   | No       | false                                          | If true, ignore all not-found rows.                                                                                                                         |
+| ignore_not_duplicate                       | Bool   | No       | false                                          | If true, ignore all duplicate rows.                                                                                                                         |
+| common-options                             |        | No       | -                                              | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details.
| + +## Task Example + +### Simple: + +> The following example refers to a FakeSource named "kudu" cdc write kudu table "kudu_sink_table" + +```hocon + +env { + parallelism = 1 + job.mode = "BATCH" +} + source { + FakeSource { + result_table_name = "kudu" + schema = { + fields { + id = int + val_bool = boolean + val_int8 = tinyint + val_int16 = smallint + val_int32 = int + val_int64 = bigint + val_float = float + val_double = double + val_decimal = "decimal(16, 1)" + val_string = string + val_unixtime_micros = timestamp + } + } + rows = [ + { + kind = INSERT + fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [3, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_BEFORE + fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_AFTER + fields = [1, true, 2, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] + }, + { + kind = DELETE + fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] + } + ] + } + } + +sink { + kudu{ + source_table_name = "kudu" + kudu_masters = "kudu-master-cdc:7051" + table_name = "kudu_sink_table" + enable_kerberos = true + kerberos_principal = "xx@xx.COM" + kerberos_keytab = "xx.keytab" + } +} +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + kudu{ + kudu_masters = "kudu-master-cdc:7051" + table_name = "${database_name}_${table_name}_test" + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + kudu{ + kudu_masters = "kudu-master-cdc:7051" + table_name = "${schema_name}_${table_name}_test" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Kudu Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [Improve] Kudu Sink Connector Support to upsert row ([2881](https://github.com/apache/seatunnel/pull/2881)) + +### Next Version + +- Change plugin name from `KuduSink` to `Kudu` [3432](https://github.com/apache/seatunnel/pull/3432) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/LocalFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/LocalFile.md new file mode 100644 index 000000000000..a0bb53ff1d66 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/LocalFile.md @@ -0,0 +1,336 @@ +# LocalFile + +> Local file sink connector + +## Description + +Output data to local file. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. 
+ +::: + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------| +| path | string | yes | - | | +| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | +| enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | +| schema_save_mode | string | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Existing dir processing method | +| data_save_mode | string | no | APPEND_DATA | Existing data processing method | + +### path [string] + +The target dir path is required, you can inject the upstream CatalogTable into the path by using: `${database_name}`, `${table_name}` and `${schema_name}`. + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be write to file, default value is all of the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. 
+ +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name [string] + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### enable_header_write [boolean] + +Only used when file_format_type is text,csv.false:don't write header,true:write header. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. + +### schema_save_mode [string] + +Existing dir processing method. +- RECREATE_SCHEMA: will create when the dir does not exist, delete and recreate when the dir is exist +- CREATE_SCHEMA_WHEN_NOT_EXIST: will create when the dir does not exist, skipped when the dir is exist +- ERROR_WHEN_SCHEMA_NOT_EXIST: error will be reported when the dir does not exist + +### data_save_mode [string] + +Existing data processing method. 
+- DROP_DATA: preserve dir and delete data files +- APPEND_DATA: preserve dir, preserve data files +- ERROR_WHEN_DATA_EXISTS: when there is data files, an error is reported + +## Example + +For orc file format simple config + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "orc" +} + +``` + +For json, text, csv or xml file format with `encoding` + +```hocon + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "text" + encoding = "gbk" +} + +``` + +For parquet file format with `sink_columns` + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "parquet" + sink_columns = ["name","age"] +} + +``` + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true +} + +``` + +For excel file format with `sheet_name` and `max_rows_in_memory` + +```bash + +LocalFile { + path="/tmp/seatunnel/excel" + sheet_name = "Sheet1" + max_rows_in_memory = 1024 + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type="excel" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + schema_save_mode=RECREATE_SCHEMA + data_save_mode=DROP_DATA + } + +``` + +For extract source metadata from upstream, you can use `${database_name}`, `${table_name}` and `${schema_name}` in the path. + +```bash + +LocalFile { + path = "/tmp/hive/warehouse/${table_name}" + file_format_type = "parquet" + sink_columns = ["name","age"] +} + +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Local File Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [BugFix] Fix filesystem get error ([3117](https://github.com/apache/seatunnel/pull/3117)) +- [BugFix] Solved the bug of can not parse '\t' as delimiter from config file ([3083](https://github.com/apache/seatunnel/pull/3083)) + +### Next version + +- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) + - When field from upstream is null it will throw NullPointerException + - Sink columns mapping failed + - When restore writer from states getting transaction directly failed +- [Improve] Support setting batch size for every file ([3625](https://github.com/apache/seatunnel/pull/3625)) +- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Maxcompute.md b/versioned_docs/version-2.3.7/connector-v2/sink/Maxcompute.md new file mode 100644 index 000000000000..362b53be6bcf --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Maxcompute.md @@ -0,0 +1,79 @@ +# Maxcompute + +> Maxcompute sink connector + +## Description + +Used to read data from Maxcompute. 
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------|---------|----------|---------------| +| accessId | string | yes | - | +| accesskey | string | yes | - | +| endpoint | string | yes | - | +| project | string | yes | - | +| table_name | string | yes | - | +| partition_spec | string | no | - | +| overwrite | boolean | no | false | +| common-options | string | no | | + +### accessId [string] + +`accessId` Your Maxcompute accessId which cloud be access from Alibaba Cloud. + +### accesskey [string] + +`accesskey` Your Maxcompute accessKey which cloud be access from Alibaba Cloud. + +### endpoint [string] + +`endpoint` Your Maxcompute endpoint start with http. + +### project [string] + +`project` Your Maxcompute project which is created in Alibaba Cloud. + +### table_name [string] + +`table_name` Target Maxcompute table name eg: fake. + +### partition_spec [string] + +`partition_spec` This spec of Maxcompute partition table eg:ds='20220101'. + +### overwrite [boolean] + +`overwrite` Whether to overwrite the table or partition, default: false. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +## Examples + +```hocon +sink { + Maxcompute { + accessId="" + accesskey="" + endpoint="" + project="" + table_name="" + #partition_spec="" + #overwrite = false + } +} +``` + +## Changelog + +### next version + +- [Feature] Add Maxcompute Sink Connector([3640](https://github.com/apache/seatunnel/pull/3640)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Mivlus.md b/versioned_docs/version-2.3.7/connector-v2/sink/Mivlus.md new file mode 100644 index 000000000000..081f427a5dfd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Mivlus.md @@ -0,0 +1,59 @@ +# Milvus + +> Milvus sink connector + +## Description + +Write data to Milvus or Zilliz Cloud + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) + +## Data Type Mapping + +| Milvus Data Type | SeaTunnel Data Type | +|---------------------|---------------------| +| INT8 | TINYINT | +| INT16 | SMALLINT | +| INT32 | INT | +| INT64 | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BOOL | BOOLEAN | +| JSON | STRING | +| ARRAY | ARRAY | +| VARCHAR | STRING | +| FLOAT_VECTOR | FLOAT_VECTOR | +| BINARY_VECTOR | BINARY_VECTOR | +| FLOAT16_VECTOR | FLOAT16_VECTOR | +| BFLOAT16_VECTOR | BFLOAT16_VECTOR | +| SPARSE_FLOAT_VECTOR | SPARSE_FLOAT_VECTOR | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------|---------|----------|------------------------------|-----------------------------------------------------------| +| url | String | Yes | - | The URL to connect to Milvus or Zilliz Cloud. | +| token | String | Yes | - | User:password | +| database | String | No | - | Write data to which database, default is source database. | +| schema_save_mode | enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Auto create table when table not exist. | +| enable_auto_id | boolean | No | false | Primary key column enable autoId. | +| enable_upsert | boolean | No | false | Upsert data not insert. | +| enable_dynamic_field | boolean | No | true | Enable create table with dynamic field. | +| batch_size | int | No | 1000 | Write batch size. 
| + +## Task Example + +```bash +sink { + Milvus { + url = "http://127.0.0.1:19530" + token = "username:password" + batch_size = 1000 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/MongoDB.md b/versioned_docs/version-2.3.7/connector-v2/sink/MongoDB.md new file mode 100644 index 000000000000..e1cfd34ebad0 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/MongoDB.md @@ -0,0 +1,235 @@ +# MongoDB + +> MongoDB Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +**Tips** + +> 1.If you want to use CDC-written features, recommend enable the upsert-enable configuration. + +## Description + +The MongoDB Connector provides the ability to read and write data from and to MongoDB. +This document describes how to set up the MongoDB connector to run data writers against MongoDB. + +## Supported DataSource Info + +In order to use the Mongodb connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|---------------------------------------------------------------------------------------------------------------| +| MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-mongodb) | + +## Data Type Mapping + +The following table lists the field data type mapping from MongoDB BSON type to Seatunnel data type. + +| Seatunnel Data Type | MongoDB BSON Type | +|---------------------|-------------------| +| STRING | ObjectId | +| STRING | String | +| BOOLEAN | Boolean | +| BINARY | Binary | +| INTEGER | Int32 | +| TINYINT | Int32 | +| SMALLINT | Int32 | +| BIGINT | Int64 | +| DOUBLE | Double | +| FLOAT | Double | +| DECIMAL | Decimal128 | +| Date | Date | +| Timestamp | Timestamp[Date] | +| ROW | Object | +| ARRAY | Array | + +**Tips** + +> 1.When using SeaTunnel to write Date and Timestamp types to MongoDB, both will produce a Date data type in MongoDB, but the precision will be different. The data generated by the SeaTunnel Date type has second-level precision, while the data generated by the SeaTunnel Timestamp type has millisecond-level precision.
> 2.When using the DECIMAL type in SeaTunnel, be aware that the maximum precision cannot exceed 34 digits, which means you should use at most decimal(34, 18).
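+
+For illustration, here is a minimal sink sketch consistent with these tips; the database, collection and field names are hypothetical, and the sink options used are described in the Sink Options section below:
+
+```hocon
+sink {
+  MongoDB {
+    uri = "mongodb://user:password@127.0.0.1:27017"
+    database = "test_db"
+    collection = "type_demo"
+    schema = {
+      fields {
+        _id = string
+        # Decimal128 in MongoDB: stay within 34 digits of precision
+        c_decimal = "decimal(34, 18)"
+        # Written as a MongoDB Date with second-level precision
+        c_date = date
+        # Written as a MongoDB Date with millisecond-level precision
+        c_timestamp = timestamp
+      }
+    }
+  }
+}
+```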
+ +## Sink Options + +| Name | Type | Required | Default | Description | +|-----------------------|----------|----------|---------|------------------------------------------------------------------------------------------------------------------------------| +| uri | String | Yes | - | The MongoDB standard connection uri. eg. mongodb://user:password@hosts:27017/database?readPreference=secondary&slaveOk=true. | +| database | String | Yes | - | The name of MongoDB database to read or write. | +| collection | String | Yes | - | The name of MongoDB collection to read or write. | +| schema | String | Yes | - | MongoDB's BSON and seatunnel data structure mapping. | +| buffer-flush.max-rows | String | No | 1000 | Specifies the maximum number of buffered rows per batch request. | +| buffer-flush.interval | String | No | 30000 | Specifies the maximum interval of buffered rows per batch request, the unit is millisecond. | +| retry.max | String | No | 3 | Specifies the max number of retry if writing records to database failed. | +| retry.interval | Duration | No | 1000 | Specifies the retry time interval if writing records to database failed, the unit is millisecond. | +| upsert-enable | Boolean | No | false | Whether to write documents via upsert mode. | +| primary-key | List | No | - | The primary keys for upsert/update. Keys are in `["id","name",...]` format for properties. | +| transaction | Boolean | No | false | Whether to use transactions in MongoSink (requires MongoDB 4.2+). | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> 1.The data flushing logic of the MongoDB Sink Connector is jointly controlled by three parameters: `buffer-flush.max-rows`, `buffer-flush.interval`, and `checkpoint.interval`.
+> Data flushing will be triggered if any of these conditions are met.
+> 2.Compatible with the historical parameter `upsert-key`. If `upsert-key` is set, please do not set `primary-key`.
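+
+As a sketch of tip 1, the env-level `checkpoint.interval` and the sink-level buffer settings can be tuned together. The values below are illustrative only; a flush happens when whichever of the three thresholds is reached first:
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "STREAMING"
+  # Every checkpoint also flushes buffered rows
+  checkpoint.interval = 5000
+}
+
+sink {
+  MongoDB {
+    uri = "mongodb://user:password@127.0.0.1:27017"
+    database = "test_db"
+    collection = "users"
+    # ...as does reaching 1000 buffered rows or a 10 second buffer interval
+    buffer-flush.max-rows = 1000
+    buffer-flush.interval = 10000
+    schema = {
+      fields {
+        _id = string
+        name = string
+      }
+    }
+  }
+}
+```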
+ +## How to Create a MongoDB Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that writes randomly generated data to a MongoDB database: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 1000 +} + +source { + FakeSource { + row.num = 2 + bigint.min = 0 + bigint.max = 10000000 + split.num = 1 + split.read-interval = 300 + schema { + fields { + c_bigint = bigint + } + } + } +} + +sink { + MongoDB{ + uri = mongodb://user:password@127.0.0.1:27017 + database = "test" + collection = "test" + schema = { + fields { + _id = string + c_bigint = bigint + } + } + } +} +``` + +## Parameter Interpretation + +### MongoDB Database Connection URI Examples + +Unauthenticated single node connection: + +```bash +mongodb://127.0.0.0:27017/mydb +``` + +Replica set connection: + +```bash +mongodb://127.0.0.0:27017/mydb?replicaSet=xxx +``` + +Authenticated replica set connection: + +```bash +mongodb://admin:password@127.0.0.0:27017/mydb?replicaSet=xxx&authSource=admin +``` + +Multi-node replica set connection: + +```bash +mongodb://127.0.0..1:27017,127.0.0..2:27017,127.0.0.3:27017/mydb?replicaSet=xxx +``` + +Sharded cluster connection: + +```bash +mongodb://127.0.0.0:27017/mydb +``` + +Multiple mongos connections: + +```bash +mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb +``` + +Note: The username and password in the URI must be URL-encoded before being concatenated into the connection string. + +### Buffer Flush + +```bash +sink { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "users" + buffer-flush.max-rows = 2000 + buffer-flush.interval = 1000 + schema = { + fields { + _id = string + id = bigint + status = string + } + } + } +} +``` + +### Why is Not Recommended to Use Transactions for Operation? + +Although MongoDB has fully supported multi-document transactions since version 4.2, it doesn't mean that everyone should use them recklessly. +Transactions are equivalent to locks, node coordination, additional overhead, and performance impact. +Instead, the principle for using transactions should be: avoid using them if possible. +The necessity for using transactions can be greatly avoided by designing systems rationally. + +### Idempotent Writes + +By specifying a clear primary key and using the upsert method, exactly-once write semantics can be achieved. + +If `primary-key` and `upsert-enable` is defined in the configuration, the MongoDB sink will use upsert semantics instead of regular INSERT statements. We combine the primary keys declared in upsert-key as the MongoDB reserved primary key and use upsert mode for writing to ensure idempotent writes. +In the event of a failure, Seatunnel jobs will recover from the last successful checkpoint and reprocess, which may result in duplicate message processing during recovery. It is highly recommended to use upsert mode, as it helps to avoid violating database primary key constraints and generating duplicate data if records need to be reprocessed. 
+ +```bash +sink { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "users" + upsert-enable = true + primary-key = ["name","status"] + schema = { + fields { + _id = string + name = string + status = string + } + } + } +} +``` + +## Changelog + +### 2.2.0-beta + +- Add MongoDB Source Connector + +### 2.3.1-release + +- [Feature]Refactor mongodb source connector([4620](https://github.com/apache/incubator-seatunnel/pull/4620)) + +### Next Version + +- [Feature]Mongodb support cdc sink([4833](https://github.com/apache/seatunnel/pull/4833)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Mysql.md b/versioned_docs/version-2.3.7/connector-v2/sink/Mysql.md new file mode 100644 index 000000000000..5ac881f69ff6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Mysql.md @@ -0,0 +1,210 @@ +# MySQL + +> JDBC Mysql Sink Connector + +## Support Mysql Version + +- 5.5/5.6/5.7/8.0/8.4 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|---------------------------------------|---------------------------------------------------------------------------| +| Mysql | Different dependency version has different driver class. | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306:3306/test | [Download](https://mvnrepository.com/artifact/mysql/mysql-connector-java) | + +## Data Type Mapping + +| Mysql Data Type | SeaTunnel Data Type | +|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
INT UNSIGNED | BOOLEAN | +| TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.))) | +| FLOAT
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINAR
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed;`UPPERCASE` indicates conversion to uppercase;`LOWERCASE` indicates conversion to lowercase. | +| properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | +| data_save_mode | Enum | No | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | +| custom_sql | String | No | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task only has `insert`, setting this parameter to `false` can speed up data import | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your mysql. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. 
+ +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. + +``` +sink { + jdbc { + url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + user = "root" + password = "123456" + + generate_sink_sql = true + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + field_ide = UPPERCASE + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Neo4j.md b/versioned_docs/version-2.3.7/connector-v2/sink/Neo4j.md new file mode 100644 index 000000000000..15e88646d3cd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Neo4j.md @@ -0,0 +1,147 @@ +# Neo4j + +> Neo4j sink connector + +## Description + +Write data to Neo4j. 
+ +`neo4j-java-driver` version 4.4.9 + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------------------|---------|----------|---------------| +| uri | String | Yes | - | +| username | String | No | - | +| password | String | No | - | +| max_batch_size | Integer | No | - | +| write_mode | String | No | OneByOne | +| bearer_token | String | No | - | +| kerberos_ticket | String | No | - | +| database | String | Yes | - | +| query | String | Yes | - | +| queryParamPosition | Object | Yes | - | +| max_transaction_retry_time | Long | No | 30 | +| max_connection_timeout | Long | No | 30 | +| common-options | config | no | - | + +### uri [string] + +The URI of the Neo4j database. Refer to a case: `neo4j://localhost:7687` + +### username [string] + +username of the Neo4j + +### password [string] + +password of the Neo4j. required if `username` is provided + +### max_batch_size[Integer] + +max_batch_size refers to the maximum number of data entries that can be written in a single transaction when writing to a database. + +### write_mode + +The default value is oneByOne, or set it to "Batch" if you want to have the ability to write in batches + +```cypher +unwind $ttt as row create (n:Label) set n.name = row.name,n.age = rw.age +``` + +"ttt" represents a batch of data.,"ttt" can be any arbitrary string as long as it matches the configured "batch_data_variable". + +### bearer_token [string] + +base64 encoded bearer token of the Neo4j. for Auth. + +### kerberos_ticket [string] + +base64 encoded kerberos ticket of the Neo4j. for Auth. + +### database [string] + +database name. + +### query [string] + +Query statement. contain parameter placeholders that are substituted with the corresponding values at runtime + +### queryParamPosition [object] + +position mapping information for query parameters. + +key name is parameter placeholder name. + +associated value is position of field in input data row. + +### max_transaction_retry_time [long] + +maximum transaction retry time(seconds). transaction fail if exceeded + +### max_connection_timeout [long] + +The maximum amount of time to wait for a TCP connection to be established (seconds) + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## WriteOneByOneExample + +``` +sink { + Neo4j { + uri = "neo4j://localhost:7687" + username = "neo4j" + password = "1234" + database = "neo4j" + + max_transaction_retry_time = 10 + max_connection_timeout = 10 + + query = "CREATE (a:Person {name: $name, age: $age})" + queryParamPosition = { + name = 0 + age = 1 + } + } +} +``` + +## WriteBatchExample +> The unwind keyword provided by cypher supports batch writing, and the default variable for a batch of data is batch. 
If you write a batch write statement, then you should declare cypher:unwind $batch as row to do someting +``` +sink { + Neo4j { + uri = "bolt://localhost:7687" + username = "neo4j" + password = "neo4j" + database = "neo4j" + max_batch_size = 1000 + write_mode = "BATCH" + + max_transaction_retry_time = 3 + max_connection_timeout = 10 + + query = "unwind $batch as row create(n:MyLabel) set n.name = row.name,n.age = row.age" + + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Neo4j Sink Connector + +### issue ##4835 + +- Sink supports batch write + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/ObsFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/ObsFile.md new file mode 100644 index 000000000000..cfb1ec8c55e9 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/ObsFile.md @@ -0,0 +1,287 @@ +# ObsFile + +> Obs file sink connector + +## Support those engines + +> Spark +> +> Flink +> +> Seatunnel Zeta + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + +## Description + +Output data to huawei cloud obs file system. + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OBS and this connector need some hadoop dependencies. +It only supports hadoop version **2.9.X+**. + +## Required Jar List + +| jar | supported versions | maven | +|--------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------| +| hadoop-huaweicloud | support version >= 3.1.1.29 | [Download](https://repo.huaweicloud.com/repository/maven/huaweicloudsdk/org/apache/hadoop/hadoop-huaweicloud/) | +| esdk-obs-java | support version >= 3.19.7.3 | [Download](https://repo.huaweicloud.com/repository/maven/huaweicloudsdk/com/huawei/storage/esdk-obs-java/) | +| okhttp | support version >= 3.11.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okhttp3/okhttp/) | +| okio | support version >= 1.14.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okio/okio/) | + +> Please download the support list corresponding to 'Maven' and copy them to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory. +> +> And copy all jars to $SEATNUNNEL_HOME/lib/ + +## Options + +| name | type | required | default | description | +|----------------------------------|---------|----------|--------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The target dir path. | +| bucket | string | yes | - | The bucket address of obs file system, for example: `obs://obs-bucket-name`. | +| access_key | string | yes | - | The access key of obs file system. | +| access_secret | string | yes | - | The access secret of obs file system. | +| endpoint | string | yes | - | The endpoint of obs file system. 
| +| custom_filename | boolean | no | false | Whether you need custom the filename. | +| file_name_expression | string | no | "${transactionId}" | Describes the file expression which will be created into the `path`. Only used when custom_filename is true. [Tips](#file_name_expression) | +| filename_time_format | string | no | "yyyy.MM.dd" | Specify the time format of the `path`. Only used when custom_filename is true. [Tips](#filename_time_format) | +| file_format_type | string | no | "csv" | Supported file types. [Tips](#file_format_type) | +| field_delimiter | string | no | '\001' | The separator between columns in a row of data.Only used when file_format is text. | +| row_delimiter | string | no | "\n" | The separator between rows in a file. Only needed by `text` file format. | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Partition data based on selected fields. Only used then have_partition is true. | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true.[Tips](#partition_dir_expression) | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true.[Tips](#is_partition_field_write_in_file) | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns.[Tips](#sink_columns) | +| is_enable_transaction | boolean | no | true | [Tips](#is_enable_transaction) | +| batch_size | int | no | 1000000 | [Tips](#batch_size) | +| compress_codec | string | no | none | [Tips](#compress_codec) | +| common-options | object | no | - | [Tips](#common_options) | +| max_rows_in_memory | int | no | - | When File Format is Excel,The maximum number of data items that can be cached in the memory.Only used when file_format is excel. | +| sheet_name | string | no | Sheet${Random number} | Writer the sheet of the workbook. Only used when file_format is excel. | + +### Tips + +#### file_name_expression + +> Only used when `custom_filename` is `true` +> +> `file_name_expression` describes the file expression which will be created into the `path`. +> +> We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +> +> `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +#### filename_time_format + +> Only used when `custom_filename` is `true` +> +> When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +#### file_format_type + +> We supported as the following file types: +> +> `text` `json` `csv` `orc` `parquet` `excel` + +Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. + +#### partition_dir_expression + +> Only used when `have_partition` is `true`. +> +> If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. 
+ +> +> Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +#### is_partition_field_write_in_file + +> Only used when `have_partition` is `true`. +> +> If `is_partition_field_write_in_file` is `true`, the partition field and its value will be written into the data file. +> +> For example, if you want to write a Hive data file, its value should be `false`. + +#### sink_columns + +> Which columns need to be written to the file. The default value is all the columns obtained from `Transform` or `Source`. +> The order of the fields determines the order in which the file is actually written. + +#### is_enable_transaction + +> If `is_enable_transaction` is `true`, we will ensure that data will not be lost or duplicated when it is written to the target directory. +> +> Please note that, if `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only `true` is supported now. + +#### batch_size + +> The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will keep writing rows into a file until the number of rows in the file is larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered. + +#### compress_codec + +> The compress codec of files. The supported codecs are shown below: +> +> - txt: `lzo` `none` +> - json: `lzo` `none` +> - csv: `lzo` `none` +> - orc: `lzo` `snappy` `lz4` `zlib` `none` +> - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Please note that the excel type does not support any compression format. + +#### common options + +> Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details.
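+
+As a quick illustration of the `batch_size` and `compress_codec` tips above, here is a sketch of an orc sink block (bucket, credentials and endpoint are placeholders):
+
+```hocon
+
+  ObsFile {
+    path = "/seatunnel/orc_compressed"
+    bucket = "obs://obs-bucket-name"
+    access_key = "xxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxx"
+    endpoint = "obs.xxxxxx.myhuaweicloud.com"
+    file_format_type = "orc"
+    # roll to a new file after roughly this many rows (also bounded by checkpoint.interval)
+    batch_size = 500000
+    # snappy is one of the codecs listed above for orc files
+    compress_codec = "snappy"
+  }
+
+```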
+ +## Task Example + +### text file + +> For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```hocon + + ObsFile { + path="/seatunnel/text" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true + } + +``` + +### parquet file + +> For parquet file format with `have_partition` and `sink_columns` + +```hocon + + ObsFile { + path = "/seatunnel/parquet" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_format_type = "parquet" + sink_columns = ["name","age"] + } + +``` + +### orc file + +> For orc file format simple config + +```hocon + + ObsFile { + path="/seatunnel/orc" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "obs.xxxxx.myhuaweicloud.com" + file_format_type = "orc" + } + +``` + +### json file + +> For json file format simple config + +```hcocn + + ObsFile { + path = "/seatunnel/json" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "obs.xxxxx.myhuaweicloud.com" + file_format_type = "json" + } + +``` + +### excel file + +> For excel file format simple config + +```hcocn + + ObsFile { + path = "/seatunnel/excel" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "obs.xxxxx.myhuaweicloud.com" + file_format_type = "excel" + } + +``` + +### csv file + +> For csv file format simple config + +```hcocn + + ObsFile { + path = "/seatunnel/csv" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "obs.xxxxx.myhuaweicloud.com" + file_format_type = "csv" + } + +``` + +## Changelog + +### next version + +- Add Obs Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/OceanBase.md b/versioned_docs/version-2.3.7/connector-v2/sink/OceanBase.md new file mode 100644 index 000000000000..ce60b0937dea --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/OceanBase.md @@ -0,0 +1,186 @@ +# OceanBase + +> JDBC OceanBase Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|--------------------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| +| OceanBase | All OceanBase server versions. | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example: cp oceanbase-client-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +### Mysql Mode + +| Mysql Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
INT UNSIGNED | BOOLEAN | +| TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
+ (Gets the designated column's number of digits to the right of the decimal point.)) | +| FLOAT
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
+ VARBINARY
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +### Oracle Mode + +| Oracle Data type | SeaTunnel Data type | +|-----------------------------------------------------------|---------------------| +| Number(p), p <= 9 | INT | +| Number(p), p <= 18 | BIGINT | +| Number(p), p > 18 | DECIMAL(38,18) | +| REAL
BINARY_FLOAT | FLOAT | +| BINARY_DOUBLE | DOUBLE | +| CHAR
NCHAR
NVARCHAR2
NCLOB
CLOB
ROWID | STRING | +| DATE | DATE | +| TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
RAW
LONG RAW
BFILE | BYTES | +| UNKNOWN | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your mysql. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:oceanbase://localhost:2883/test" + driver = "com.oceanbase.jdbc.Driver" + user = "root" + password = "123456" + compatible_mode = "mysql" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:oceanbase://localhost:2883/test" + driver = "com.oceanbase.jdbc.Driver" + user = "root" + password = "123456" + compatible_mode = "mysql" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. 
+ +``` +sink { + jdbc { + url = "jdbc:oceanbase://localhost:3306/test" + driver = "com.oceanbase.jdbc.Driver" + user = "root" + password = "123456" + compatible_mode = "mysql" + generate_sink_sql = true + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Oracle.md b/versioned_docs/version-2.3.7/connector-v2/sink/Oracle.md new file mode 100644 index 000000000000..f250f552bd21 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Oracle.md @@ -0,0 +1,207 @@ +# Oracle + +> JDBC Oracle Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| +| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATNUNNEL_HOME/lib/
+> To support the i18n character set, copy the orai18n.jar to the $SEATNUNNEL_HOME/lib/ directory. + +## Data Type Mapping + +| Oracle Data Type | SeaTunnel Data Type | +|--------------------------------------------------------------------------------------|---------------------| +| INTEGER | INT | +| FLOAT | DECIMAL(38, 18) | +| NUMBER(precision <= 9, scale == 0) | INT | +| NUMBER(9 < precision <= 18, scale == 0) | BIGINT | +| NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | +| NUMBER(scale != 0) | DECIMAL(38, 18) | +| BINARY_DOUBLE | DOUBLE | +| BINARY_FLOAT
REAL | FLOAT | +| CHAR
NCHAR
NVARCHAR2
VARCHAR2
LONG
ROWID
NCLOB
CLOB
| STRING | +| DATE | DATE | +| TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
RAW
LONG RAW
BFILE | BYTES | + +## Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Oracle the value is `oracle.jdbc.OracleDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`
, the data will be flushed into the database | +| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, Oracle is `oracle.jdbc.xa.client.OracleXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | +| data_save_mode | Enum | No | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | +| custom_sql | String | No | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your Oracle. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. 
+ +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = root + password = 123456 + query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = root + password = 123456 + + generate_sink_sql = true + database = XE + table = "TEST.TEST_TABLE" + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + + max_retries = 0 + user = root + password = 123456 + query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "oracle.jdbc.xa.client.OracleXADataSource" + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. + +``` +sink { + jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = root + password = 123456 + + generate_sink_sql = true + # You need to configure both database and table + database = XE + table = "TEST.TEST_TABLE" + primary_keys = ["ID"] + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/OssFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/OssFile.md new file mode 100644 index 000000000000..f83fdcf49973 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/OssFile.md @@ -0,0 +1,539 @@ +# OssFile + +> Oss file sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Usage Dependency + +### For Spark/Flink Engine + +1. You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. +2. You must ensure `hadoop-aliyun-xx.jar`, `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` in `${SEATUNNEL_HOME}/plugins/` dir and the version of `hadoop-aliyun` jar need equals your hadoop version which used in spark/flink and `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` version needs to be the version corresponding to the `hadoop-aliyun` version. Eg: `hadoop-aliyun-3.1.4.jar` dependency `aliyun-sdk-oss-3.4.1.jar` and `jdom-1.1.jar`. + +### For SeaTunnel Zeta Engine + +1. You must ensure `seatunnel-hadoop3-3.1.4-uber.jar`, `aliyun-sdk-oss-3.4.1.jar`, `hadoop-aliyun-3.1.4.jar` and `jdom-1.1.jar` in `${SEATUNNEL_HOME}/lib/` dir. + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Data Type Mapping + +If write to `csv`, `text` file type, All column will be string. + +### Orc File Type + +| SeaTunnel Data Type | Orc Data Type | +|----------------------|-----------------------| +| STRING | STRING | +| BOOLEAN | BOOLEAN | +| TINYINT | BYTE | +| SMALLINT | SHORT | +| INT | INT | +| BIGINT | LONG | +| FLOAT | FLOAT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| BYTES | BINARY | +| DATE | DATE | +| TIME
TIMESTAMP | TIMESTAMP | +| ROW | STRUCT | +| NULL | UNSUPPORTED DATA TYPE | +| ARRAY | LIST | +| Map | Map | + +### Parquet File Type + +| SeaTunnel Data Type | Parquet Data Type | +|----------------------|-----------------------| +| STRING | STRING | +| BOOLEAN | BOOLEAN | +| TINYINT | INT_8 | +| SMALLINT | INT_16 | +| INT | INT32 | +| BIGINT | INT64 | +| FLOAT | FLOAT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| BYTES | BINARY | +| DATE | DATE | +| TIME
TIMESTAMP | TIMESTAMP_MILLIS | +| ROW | GroupType | +| NULL | UNSUPPORTED DATA TYPE | +| ARRAY | LIST | +| Map | Map | + +## Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| path | string | yes | The oss path to write file in. | | +| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a OSS dir. | +| bucket | string | yes | - | | +| access_key | string | yes | - | | +| access_secret | string | yes | - | | +| endpoint | string | yes | - | | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### path [string] + +The target dir path is required. + +### bucket [string] + +The bucket address of oss file system, for example: `oss://tyrantlucifer-image-bed` + +### access_key [string] + +The access key of oss file system. + +### access_secret [string] + +The access secret of oss file system. + +### endpoint [string] + +The endpoint of oss file system. + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. 
+ +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [String] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${Now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. 
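+
+As a quick illustration of the delimiter and compression options described above, here is a sketch of a text-format sink block (bucket, credentials and endpoint are placeholders):
+
+```hocon
+
+  OssFile {
+    path = "/seatunnel/text_lzo"
+    bucket = "oss://tyrantlucifer-image-bed"
+    access_key = "xxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    file_format_type = "text"
+    # column and row separators, only meaningful for the text format
+    field_delimiter = "\t"
+    row_delimiter = "\n"
+    # lzo is one of the codecs listed above for text files
+    compress_codec = "lzo"
+  }
+
+```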
+ +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name [string] + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. + +## How to Create an Oss Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that reads data from Fake Source and writes it to the Oss: + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to product data +source { + FakeSource { + schema = { + fields { + name = string + age = int + } + } + } +} + +# write data to Oss +sink { + OssFile { + path="/seatunnel/sink" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true + } +} +``` + +For parquet file format with `have_partition` and `sink_columns` + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to product data +source { + FakeSource { + schema = { + fields { + name = string + age = int + } + } + } +} + +# Write data to Oss +sink { + OssFile { + path = "/seatunnel/sink" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_format_type = "parquet" + sink_columns = ["name","age"] + } +} +``` + +For orc file format simple config + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to product data +source { + FakeSource { + schema = { + fields { + name = string + age = int + } + } + } +} + +# Write data to Oss +sink { + OssFile { + path="/seatunnel/sink" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "orc" + } +} +``` + +### Multiple Table + +For extract source metadata from upstream, you can use `${database_name}`, `${table_name}` and `${schema_name}` in the path. 
+ +```bash + +env { + parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + FakeSource { + tables_configs = [ + { + schema = { + table = "fake1" + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + }, + { + schema = { + table = "fake2" + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } + ] + } +} + +sink { + OssFile { + bucket = "oss://whale-ops" + access_key = "xxxxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxx" + endpoint = "https://oss-accelerate.aliyuncs.com" + path = "/tmp/fake_empty/text/${table_name}" + row_delimiter = "\n" + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_name_expression = "${transactionId}_${now}" + file_format_type = "text" + filename_time_format = "yyyy.MM.dd" + is_enable_transaction = true + compress_codec = "lzo" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add OSS Sink Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [BugFix] Fix filesystem get error ([3117](https://github.com/apache/seatunnel/pull/3117)) +- [BugFix] Solved the bug of can not parse '\t' as delimiter from config file ([3083](https://github.com/apache/seatunnel/pull/3083)) + +### Next version + +- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) + - When field from upstream is null it will throw NullPointerException + - Sink columns mapping failed + - When restore writer from states getting transaction directly failed +- [Improve] Support setting batch size for every file ([3625](https://github.com/apache/seatunnel/pull/3625)) +- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) + +### Tips + +> 1.[SeaTunnel Deployment Document](../../start-v2/locally/deployment.md). 
+ diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/OssJindoFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/OssJindoFile.md new file mode 100644 index 000000000000..80e6cf775c1e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/OssJindoFile.md @@ -0,0 +1,297 @@ +# OssJindoFile + +> OssJindo file sink connector + +## Description + +Output data to oss file system using jindo api. + +:::tip + +You need to download [jindosdk-4.6.1.tar.gz](https://jindodata-binary.oss-cn-shanghai.aliyuncs.com/release/4.6.1/jindosdk-4.6.1.tar.gz) +and then unzip it, copy jindo-sdk-4.6.1.jar and jindo-core-4.6.1.jar from lib to ${SEATUNNEL_HOME}/lib. + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OSS and this connector need some hadoop dependencies. +It only supports hadoop version **2.9.X+**. + +::: + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | | +| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a OSS dir. | +| bucket | string | yes | - | | +| access_key | string | yes | - | | +| access_secret | string | yes | - | | +| endpoint | string | yes | - | | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. 
| +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### path [string] + +The target dir path is required. + +### bucket [string] + +The bucket address of oss file system, for example: `oss://tyrantlucifer-image-bed` + +### access_key [string] + +The access key of oss file system. + +### access_secret [string] + +The access secret of oss file system. + +### endpoint [string] + +The endpoint of oss file system. + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. 
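+
+To make the interplay of the partition options above concrete, here is a sketch (bucket, credentials and the resulting paths are illustrative only):
+
+```hocon
+
+  OssJindoFile {
+    path = "/seatunnel/sink"
+    bucket = "oss://tyrantlucifer-image-bed"
+    access_key = "xxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    file_format_type = "text"
+    have_partition = true
+    # a row with age = 20 would land under a directory such as /seatunnel/sink/age=20/
+    partition_by = ["age"]
+    partition_dir_expression = "${k0}=${v0}"
+    # keep the partition value out of the data file itself, as Hive-style layouts expect
+    is_partition_field_write_in_file = false
+  }
+
+```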
+ +### sink_columns [array] + +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name [string] + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. 
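+
+For example, the XML-related options and `encoding` described above could be combined as follows (bucket, credentials and tag names are placeholders); see also the examples in the next section:
+
+```hocon
+
+  OssJindoFile {
+    path = "/seatunnel/xml"
+    bucket = "oss://tyrantlucifer-image-bed"
+    access_key = "xxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    file_format_type = "xml"
+    # wrap all rows in <RECORDS> and each row in <RECORD>
+    xml_root_tag = "RECORDS"
+    xml_row_tag = "RECORD"
+    encoding = "UTF-8"
+  }
+
+```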
+ +## Example + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```hocon + + OssJindoFile { + path="/seatunnel/sink" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true + } + +``` + +For parquet file format with `sink_columns` + +```hocon + + OssJindoFile { + path = "/seatunnel/sink" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "parquet" + sink_columns = ["name","age"] + } + +``` + +For orc file format simple config + +```bash + + OssJindoFile { + path="/seatunnel/sink" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxx" + access_secret = "xxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "orc" + } + +``` + +## Changelog + +### 2.3.0 2022-12-30 + +- Add OSS Jindo File Sink Connector + +### Next version + +- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Paimon.md b/versioned_docs/version-2.3.7/connector-v2/sink/Paimon.md new file mode 100644 index 000000000000..58978cc20c22 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Paimon.md @@ -0,0 +1,316 @@ +# Paimon + +> Paimon sink connector + +## Description + +Sink connector for Apache Paimon. It can support cdc mode 、auto create table. + +## Supported DataSource Info + +| Datasource | Dependent | Maven | +|------------|-----------|---------------------------------------------------------------------------| +| Paimon | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | +| Paimon | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | + +## Database Dependency + +> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages. + +``` +hive-exec-xxx.jar +libfb303-xxx.jar +``` + +> Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package. 
+ +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | Description | +|-----------------------------|--------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| warehouse | String | Yes | - | Paimon warehouse path | +| catalog_type | String | No | filesystem | Catalog type of Paimon, support filesystem and hive | +| catalog_uri | String | No | - | Catalog uri of Paimon, only needed when catalog_type is hive | +| database | String | Yes | - | The database you want to access | +| table | String | Yes | - | The table you want to access | +| hdfs_site_path | String | No | - | The path of hdfs-site.xml | +| schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | The schema save mode | +| data_save_mode | Enum | No | APPEND_DATA | The data save mode | +| paimon.table.primary-keys | String | No | - | Default comma-separated list of columns (primary key) that identify a row in tables.(Notice: The partition field needs to be included in the primary key fields) | +| paimon.table.partition-keys | String | No | - | Default comma-separated list of partition fields to use when creating tables. | +| paimon.table.write-props | Map | No | - | Properties passed through to paimon table initialization, [reference](https://paimon.apache.org/docs/0.6/maintenance/configurations/#coreoptions). | +| paimon.hadoop.conf | Map | No | - | Properties in hadoop conf | +| paimon.hadoop.conf-path | String | No | - | The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files | + +## Examples + +### Single table + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role"] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="seatunnel" + table="role" + } +} +``` + +### Single table(Specify hadoop HA config and kerberos config) + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role"] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="hdfs:///tmp/seatunnel/paimon/hadoop-sink/" + database="seatunnel" + table="role" + paimon.hadoop.conf = { + fs.defaultFS = "hdfs://nameservice1" + dfs.nameservices = "nameservice1" + dfs.ha.namenodes.nameservice1 = "nn1,nn2" + dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" + dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" + dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + dfs.client.use.datanode.hostname = "true" + security.kerberos.login.principal = "your-kerberos-principal" + security.kerberos.login.keytab = "your-kerberos-keytab-path" + } + } +} +``` + +### Single table(Hive catalog) + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + 
score = int + } + primaryKey { + name = "pk_id" + columnNames = [pk_id] + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100] + }, + { + kind = INSERT + fields = [2, "B", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + }, + { + kind = INSERT + fields = [3, "C", 100] + } + { + kind = UPDATE_BEFORE + fields = [1, "A", 100] + }, + { + kind = UPDATE_AFTER + fields = [1, "A_1", 100] + }, + { + kind = DELETE + fields = [2, "B", 100] + } + ] + } +} + +sink { + Paimon { + schema_save_mode = "RECREATE_SCHEMA" + catalog_name="seatunnel_test" + catalog_type="hive" + catalog_uri="thrift://hadoop04:9083" + warehouse="hdfs:///tmp/seatunnel" + database="seatunnel_test" + table="st_test3" + paimon.hadoop.conf = { + fs.defaultFS = "hdfs://nameservice1" + dfs.nameservices = "nameservice1" + dfs.ha.namenodes.nameservice1 = "nn1,nn2" + dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" + dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" + dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + dfs.client.use.datanode.hostname = "true" + } + } +} + +``` + +### Single table with write props of paimon + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + table-names = ["seatunnel.role"] + } +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="seatunnel" + table="role" + paimon.table.write-props = { + bucket = 2 + file.format = "parquet" + } + paimon.table.partition-keys = "dt" + paimon.table.primary-keys = "pk_id,dt" + } +} +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="${database_name}_test" + table="${table_name}_test" + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + Paimon { + catalog_name="seatunnel_test" + warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" + database="${schema_name}_test" + table="${table_name}_test" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Phoenix.md b/versioned_docs/version-2.3.7/connector-v2/sink/Phoenix.md new file mode 100644 index 000000000000..549deedde330 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Phoenix.md @@ -0,0 +1,62 @@ +# Phoenix + +> Phoenix sink connector + +## Description + +Write Phoenix data through [Jdbc connector](Jdbc.md). +Support Batch mode and Streaming mode. 
The tested Phoenix version is 4.xx and 5.xx +On the underlying implementation, through the jdbc driver of Phoenix, execute the upsert statement to write data to HBase. +Two ways of connecting Phoenix with Java JDBC. One is to connect to zookeeper through JDBC, and the other is to connect to queryserver through JDBC thin client. + +> Tips: By default, the (thin) driver jar is used. If you want to use the (thick) driver or other versions of Phoenix (thin) driver, you need to recompile the jdbc connector module +> +> Tips: Not support exactly-once semantics (XA transaction is not yet supported in Phoenix). + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +### driver [string] + +if you use phoenix (thick) driver the value is `org.apache.phoenix.jdbc.PhoenixDriver` or you use (thin) driver the value is `org.apache.phoenix.queryserver.client.Driver` + +### url [string] + +if you use phoenix (thick) driver the value is `jdbc:phoenix:localhost:2182/hbase` or you use (thin) driver the value is `jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +use thick client drive + +``` + Jdbc { + driver = org.apache.phoenix.jdbc.PhoenixDriver + url = "jdbc:phoenix:localhost:2182/hbase" + query = "upsert into test.sink(age, name) values(?, ?)" + } + +``` + +use thin client drive + +``` +Jdbc { + driver = org.apache.phoenix.queryserver.client.Driver + url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" + query = "upsert into test.sink(age, name) values(?, ?)" +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Phoenix Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/PostgreSql.md b/versioned_docs/version-2.3.7/connector-v2/sink/PostgreSql.md new file mode 100644 index 000000000000..a750755e31bd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/PostgreSql.md @@ -0,0 +1,273 @@ +# PostgreSql + +> JDBC PostgreSql Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | If you want to manipulate the GEOMETRY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example PostgreSQL datasource: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/<br/>
+> If you want to manipulate the GEOMETRY type in PostgreSQL, add postgresql-xxx.jar and postgis-jdbc-xxx.jar to $SEATUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| PostgreSQL Data Type | SeaTunnel Data Type | +|----------------------|---------------------| +| BOOL<br/>
| BOOLEAN | +| _BOOL
| ARRAY<BOOLEAN> | +| BYTEA
| BYTES | +| _BYTEA
| ARRAY<TINYINT> | +| INT2
SMALLSERIAL
INT4
SERIAL
| INT | +| _INT2
_INT4
| ARRAY<INT> | +| INT8
BIGSERIAL
| BIGINT | +| _INT8
| ARRAY<BIGINT> | +| FLOAT4
| FLOAT | +| _FLOAT4
| ARRAY<FLOAT> | +| FLOAT8
| DOUBLE | +| _FLOAT8
| ARRAY<DOUBLE> | +| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | +| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | +| BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB
UUID | STRING | +| _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | +| TIMESTAMP
| TIMESTAMP | +| TIME
| TIME | +| DATE
| DATE | +| OTHER DATA TYPES | NOT SUPPORTED YET | + +## Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test
if you use the json or jsonb type for inserts, add the jdbc url option `stringtype=unspecified` | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,<br/>
if you use PostgreSQL the value is `org.postgresql.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority.The table parameter can fill in the name of an unwilling table, which will eventually be used as the table name of the creation table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, PostgreSQL is `org.postgresql.xa.PGXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed;`UPPERCASE` indicates conversion to uppercase;`LOWERCASE` indicates conversion to lowercase. | +| properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | +| data_save_mode | Enum | no | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | +| custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +### table [string] + +Use `database` and this `table-name` auto-generate sql and receive upstream input datas write to database. + +This option is mutually exclusive with `query` and has a higher priority. + +The table parameter can fill in the name of an unwilling table, which will eventually be used as the table name of the creation table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. + +for example: +1. ${schema_name}.${table_name} _test +2. dbo.tt_${table_name} _sink +3. public.sink_table + +### schema_save_mode[Enum] + +Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. +Option introduction: +`RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved +`CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved +`ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist + +### data_save_mode[Enum] + +Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. +Option introduction: +`DROP_DATA`: Preserve database structure and delete data +`APPEND_DATA`:Preserve database structure, preserve data +`CUSTOM_PROCESSING`:User defined processing +`ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported + +### custom_sql[String] + +When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your PostgreSQL. 
And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = root + password = 123456 + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + Jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = org.postgresql.Driver + user = root + password = 123456 + + generate_sink_sql = true + database = test + table = "public.test_table" + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + + max_retries = 0 + user = root + password = 123456 + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. 
+ +``` +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = root + password = 123456 + + generate_sink_sql = true + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + field_ide = UPPERCASE + } +} +``` + +### Save mode function + +``` +sink { + Jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = org.postgresql.Driver + user = root + password = 123456 + + generate_sink_sql = true + database = test + table = "public.test_table" + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Pulsar.md b/versioned_docs/version-2.3.7/connector-v2/sink/Pulsar.md new file mode 100644 index 000000000000..d3a648e23691 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Pulsar.md @@ -0,0 +1,177 @@ +# Pulsar + +> Pulsar sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta<br/>
+ +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +## Description + +Sink connector for Apache Pulsar. + +## Supported DataSource Info + +| Datasource | Supported Versions | +|------------|--------------------| +| Pulsar | Universal | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------|--------|----------|---------------------|----------------------------------------------------------------------------------------------------------| +| topic | String | Yes | - | sink pulsar topic | +| client.service-url | String | Yes | - | Service URL provider for Pulsar service. | +| admin.service-url | String | Yes | - | The Pulsar service HTTP URL for the admin endpoint. | +| auth.plugin-class | String | No | - | Name of the authentication plugin. | +| auth.params | String | No | - | Parameters for the authentication plugin. | +| format | String | No | json | Data format. The default format is json. Optional text format. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. | +| semantics | Enum | No | AT_LEAST_ONCE | Consistency semantics for writing to pulsar. | +| transaction_timeout | Int | No | 600 | The transaction timeout is specified as 10 minutes by default. | +| pulsar.config | Map | No | - | In addition to the above parameters that must be specified by the Pulsar producer client. | +| message.routing.mode | Enum | No | RoundRobinPartition | Default routing mode for messages to partition. | +| partition_key_fields | array | No | - | Configure which fields are used as the key of the pulsar message. | +| common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +## Parameter Interpretation + +### client.service-url [String] + +Service URL provider for Pulsar service. +To connect to Pulsar using client libraries, you need to specify a Pulsar protocol URL. +You can assign Pulsar protocol URLs to specific clusters and use the Pulsar scheme. + +For example, `localhost`: `pulsar://localhost:6650,localhost:6651`. + +### admin.service-url [String] + +The Pulsar service HTTP URL for the admin endpoint. + +For example, `http://my-broker.example.com:8080`, or `https://my-broker.example.com:8443` for TLS. + +### auth.plugin-class [String] + +Name of the authentication plugin. + +### auth.params [String] + +Parameters for the authentication plugin. + +For example, `key1:val1,key2:val2` + +### format [String] + +Data format. The default format is json. Optional text format. The default field separator is ",". +If you customize the delimiter, add the "field_delimiter" option. + +### field_delimiter [String] + +Customize the field delimiter for data format.The default field_delimiter is ','. + +### semantics [Enum] + +Consistency semantics for writing to pulsar. +Available options are EXACTLY_ONCE,NON,AT_LEAST_ONCE, default AT_LEAST_ONCE. +If semantic is specified as EXACTLY_ONCE, we will use 2pc to guarantee the message is sent to pulsar exactly once. +If semantic is specified as NON, we will directly send the message to pulsar, the data may duplicat/lost if +job restart/retry or network error. + +### transaction_timeout [Int] + +The transaction timeout is specified as 10 minutes by default. +If the transaction does not commit within the specified timeout, the transaction will be automatically aborted. +So you need to ensure that the timeout is greater than the checkpoint interval. 
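+
+For example, a sketch of a sink that opts into `EXACTLY_ONCE` semantics could look like the following (the topic, service URLs and interval values are illustrative placeholders, not settings from a tested job):
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "STREAMING"
+  # Keep the checkpoint interval (milliseconds) well below transaction_timeout (seconds).
+  checkpoint.interval = 30000
+}
+
+sink {
+  Pulsar {
+    topic = "example"
+    client.service-url = "pulsar://localhost:6650"
+    admin.service-url = "http://localhost:8080"
+    semantics = "EXACTLY_ONCE"
+    # Default is 600 seconds (10 minutes); it must be greater than the checkpoint interval.
+    transaction_timeout = 600
+  }
+}
+```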
+ +### pulsar.config [Map] + +In addition to the above parameters that must be specified by the Pulsar producer client, +the user can also specify multiple non-mandatory parameters for the producer client, +covering all the producer parameters specified in the official Pulsar document. + +### message.routing.mode [Enum] + +Default routing mode for messages to partition. +Available options are SinglePartition,RoundRobinPartition. +If you choose SinglePartition, If no key is provided, The partitioned producer will randomly pick one single partition and publish all the messages into that partition, If a key is provided on the message, the partitioned producer will hash the key and assign message to a particular partition. +If you choose RoundRobinPartition, If no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. +Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. + +### partition_key_fields [String] + +Configure which fields are used as the key of the pulsar message. + +For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. + +Upstream data is the following: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. + +If not set partition key fields, the null message key will be sent to. + +The format of the message key is json, If name is set as the key, for example '{"name":"Jack"}'. + +The selected field must be an existing field in the upstream. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Pulsar Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target topic is test_topic will also be 16 rows of data in the topic. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. 
+ +```hocon +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Pulsar { + topic = "example" + client.service-url = "localhost:pulsar://localhost:6650" + admin.service-url = "http://my-broker.example.com:8080" + result_table_name = "test" + pulsar.config = { + sendTimeoutMs = 30000 + } + } +} +``` + +## Changelog + +### next version + +- Add Pulsar Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Rabbitmq.md b/versioned_docs/version-2.3.7/connector-v2/sink/Rabbitmq.md new file mode 100644 index 000000000000..4f12f5911584 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Rabbitmq.md @@ -0,0 +1,121 @@ +# Rabbitmq + +> Rabbitmq sink connector + +## Description + +Used to write data to Rabbitmq. + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------------------|---------|----------|---------------| +| host | string | yes | - | +| port | int | yes | - | +| virtual_host | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| queue_name | string | yes | - | +| url | string | no | - | +| network_recovery_interval | int | no | - | +| topology_recovery_enabled | boolean | no | - | +| automatic_recovery_enabled | boolean | no | - | +| use_correlation_id | boolean | no | false | +| connection_timeout | int | no | - | +| rabbitmq.config | map | no | - | +| common-options | | no | - | + +### host [string] + +the default host to use for connections + +### port [int] + +the default port to use for connections + +### virtual_host [string] + +virtual host – the virtual host to use when connecting to the broker + +### username [string] + +the AMQP user name to use when connecting to the broker + +### password [string] + +the password to use when connecting to the broker + +### url [string] + +convenience method for setting the fields in an AMQP URI: host, port, username, password and virtual host + +### queue_name [string] + +the queue to write the message to + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data. + +### network_recovery_interval [int] + +how long will automatic recovery wait before attempting to reconnect, in ms + +### topology_recovery_enabled [boolean] + +if true, enables topology recovery + +### automatic_recovery_enabled [boolean] + +if true, enables connection recovery + +### use_correlation_id [boolean] + +whether the messages received are supplied with a unique id to deduplicate messages (in case of failed acknowledgments). + +### connection_timeout [int] + +connection TCP establishment timeout in milliseconds; zero for infinite + +### rabbitmq.config [map] + +In addition to the above parameters that must be specified by the RabbitMQ client, the user can also specify multiple non-mandatory parameters for the client, covering [all the parameters specified in the official RabbitMQ document](https://www.rabbitmq.com/configure.html). 
+ +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +simple: + +```hocon +sink { + RabbitMQ { + host = "rabbitmq-e2e" + port = 5672 + virtual_host = "/" + username = "guest" + password = "guest" + queue_name = "test1" + rabbitmq.config = { + requested-heartbeat = 10 + connection-timeout = 10 + } + } +} +``` + +## Changelog + +### next version + +- Add Rabbitmq Sink Connector +- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/seatunnel/pull/3719) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Redis.md b/versioned_docs/version-2.3.7/connector-v2/sink/Redis.md new file mode 100644 index 000000000000..ac4cd55cc4f2 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Redis.md @@ -0,0 +1,164 @@ +# Redis + +> Redis sink connector + +## Description + +Used to write data to Redis. + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------|--------|-----------------------|---------------| +| host | string | yes | - | +| port | int | yes | - | +| key | string | yes | - | +| data_type | string | yes | - | +| batch_size | int | no | 10 | +| user | string | no | - | +| auth | string | no | - | +| db_num | int | no | 0 | +| mode | string | no | single | +| nodes | list | yes when mode=cluster | - | +| format | string | no | json | +| expire | long | no | -1 | +| common-options | | no | - | + +### host [string] + +Redis host + +### port [int] + +Redis port + +### key [string] + +The value of key you want to write to redis. + +For example, if you want to use value of a field from upstream data as key, you can assign it to the field name. + +Upstream data is the following: + +| code | data | success | +|------|----------------|---------| +| 200 | get success | true | +| 500 | internal error | false | + +If you assign field name to `code` and data_type to `key`, two data will be written to redis: +1. `200 -> {code: 200, message: true, data: get success}` +2. `500 -> {code: 500, message: false, data: internal error}` + +If you assign field name to `value` and data_type to `key`, only one data will be written to redis because `value` is not existed in upstream data's fields: + +1. `value -> {code: 500, message: false, data: internal error}` + +Please see the data_type section for specific writing rules. + +Of course, the format of the data written here I just take json as an example, the specific or user-configured `format` prevails. + +### data_type [string] + +Redis data types, support `key` `hash` `list` `set` `zset` + +- key + +> Each data from upstream will be updated to the configured key, which means the later data will overwrite the earlier data, and only the last data will be stored in the key. + +- hash + +> Each data from upstream will be split according to the field and written to the hash key, also the data after will overwrite the data before. + +- list + +> Each data from upstream will be added to the configured list key. + +- set + +> Each data from upstream will be added to the configured set key. + +- zset + +> Each data from upstream will be added to the configured zset key with a weight of 1. So the order of data in zset is based on the order of data consumption. +> + ### batch_size [int] + +ensure the batch write size in single-machine mode; no guarantees in cluster mode. 
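+
+As an illustration of the `key` / `data_type` rules described above, a sketch of a sink that stores each upstream row as a plain string keyed by the value of its `code` field might look like this (host, port and the expiration value are placeholders):
+
+```hocon
+sink {
+  Redis {
+    host = "localhost"
+    port = 6379
+    # Use the value of the upstream `code` field as the Redis key.
+    key = code
+    data_type = key
+    # Optional: keys expire after one hour; the default (-1) never expires.
+    expire = 3600
+    batch_size = 10
+  }
+}
+```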
+ +### user [string] + +redis authentication user, you need it when you connect to an encrypted cluster + +### auth [string] + +Redis authentication password, you need it when you connect to an encrypted cluster + +### db_num [int] + +Redis database index ID. It is connected to db 0 by default + +### mode [string] + +redis mode, `single` or `cluster`, default is `single` + +### nodes [list] + +redis nodes information, used in cluster mode, must like as the following format: + +["host1:port1", "host2:port2"] + +### format [string] + +The format of upstream data, now only support `json`, `text` will be supported later, default `json`. + +When you assign format is `json`, for example: + +Upstream data is the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +Connector will generate data as the following and write it to redis: + +```json + +{"code": 200, "data": "get success", "success": "true"} + +``` + +### expire [long] + +Set redis expiration time, the unit is second. The default value is -1, keys do not automatically expire by default. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +simple: + +```hocon +Redis { + host = localhost + port = 6379 + key = age + data_type = list +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Redis Sink Connector + +### next version + +- [Improve] Support redis cluster mode connection and user authentication [3188](https://github.com/apache/seatunnel/pull/3188) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Redshift.md b/versioned_docs/version-2.3.7/connector-v2/sink/Redshift.md new file mode 100644 index 000000000000..90f312fab945 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Redshift.md @@ -0,0 +1,99 @@ +# Redshift + +> JDBC Redshift sink Connector + +## Support those engines + +> Spark
+> Flink
+> SeaTunnel Zeta<br/>
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Supported DataSource list + +| datasource | supported versions | driver | url | maven | +|------------|----------------------------------------------------------|---------------------------------|-----------------------------------------|------------------------------------------------------------------------------------| +| redshift | Different dependency version has different driver class. | com.amazon.redshift.jdbc.Driver | jdbc:redshift://localhost:5439/database | [Download](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) | + +## Database dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Data Type Mapping + +| SeaTunnel Data type | Redshift Data type | +|-------------------------|--------------------| +| BOOLEAN | BOOLEAN | +| TINYINT
SMALLINT | SMALLINT | +| INT | INTEGER | +| BIGINT | BIGINT | +| FLOAT | REAL | +| DOUBLE | DOUBLE PRECISION | +| DECIMAL | NUMERIC | +| STRING(<=65535) | CHARACTER VARYING | +| STRING(>65535) | SUPER | +| BYTES | BINARY VARYING | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| MAP
ARRAY
ROW | SUPER | + +## Task Example + +### Simple: + +``` +sink { + jdbc { + url = "jdbc:redshift://localhost:5439/mydatabase" + driver = "com.amazon.redshift.jdbc.Driver" + user = "myUser" + password = "myPassword" + + generate_sink_sql = true + schema = "public" + table = "sink_table" + } +} +``` + +### CDC(Change data capture) event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. + +``` +sink { + jdbc { + url = "jdbc:redshift://localhost:5439/mydatabase" + driver = "com.amazon.redshift.jdbc.Driver" + user = "myUser" + password = "mypassword" + + generate_sink_sql = true + schema = "public" + table = "sink_table" + + # config update/delete primary keys + primary_keys = ["id","name"] + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/RocketMQ.md b/versioned_docs/version-2.3.7/connector-v2/sink/RocketMQ.md new file mode 100644 index 000000000000..a31534ec26bb --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/RocketMQ.md @@ -0,0 +1,203 @@ +# RocketMQ + +> RocketMQ sink connector + +## Support Apache RocketMQ Version + +- 4.9.0 (Or a newer version, for reference) + +## Support These Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we will use 2pc to guarantee the message is sent to RocketMQ exactly once. + +## Description + +Write Rows to a Apache RocketMQ topic. + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------|---------|----------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | string | yes | - | `RocketMQ topic` name. | +| name.srv.addr | string | yes | - | `RocketMQ` name server cluster address. | +| acl.enabled | Boolean | no | false | false | +| access.key | String | no | | When ACL_ENABLED is true, access key cannot be empty | +| secret.key | String | no | | When ACL_ENABLED is true, secret key cannot be empty | +| producer.group | String | no | SeaTunnel-producer-Group | SeaTunnel-producer-Group | +| partition.key.fields | array | no | - | - | +| format | String | no | json | Data format. The default format is json. Optional text format. The default field separator is ",".If you customize the delimiter, add the "field_delimiter" option. | +| field.delimiter | String | no | , | Customize the field delimiter for data format. | +| producer.send.sync | Boolean | no | false | If true, the message will be sync sent. | +| common-options | config | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. | + +### partition.key.fields [array] + +Configure which fields are used as the key of the RocketMQ message. + +For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. + +Upstream data is the following: + +| name | age | data | +|------|-----|---------------| +| Jack | 16 | data-example1 | +| Mary | 23 | data-example2 | + +If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. 
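+
+If ACL is enabled on the RocketMQ cluster, the credentials from the options table above must be supplied as well. A minimal sketch (the address, topic and credential values are placeholders):
+
+```hocon
+sink {
+  Rocketmq {
+    name.srv.addr = "localhost:9876"
+    topic = "test_topic"
+    # access.key and secret.key are required once acl.enabled is true.
+    acl.enabled = true
+    access.key = "your-access-key"
+    secret.key = "your-secret-key"
+    partition.key.fields = ["name"]
+  }
+}
+```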
+ +## Task Example + +### Fake to Rocketmq Simple + +> The data is randomly generated and asynchronously sent to the test topic + +```hocon +env { + parallelism = 1 +} + +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform +} + +sink { + Rocketmq { + name.srv.addr = "localhost:9876" + topic = "test_topic" + } +} + +``` + +### Rocketmq To Rocketmq Simple + +> Consuming Rocketmq writes to c_int field Hash number of partitions written to different partitions This is the default asynchronous way to write + +```hocon +env { + parallelism = 1 +} + +source { + Rocketmq { + name.srv.addr = "localhost:9876" + topics = "test_topic" + result_table_name = "rocketmq_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + Rocketmq { + name.srv.addr = "localhost:9876" + topic = "test_topic_sink" + partition.key.fields = ["c_int"] + } +} +``` + +### Timestamp consumption write Simple + +> This is a stream consumption specified time stamp consumption, when there are new partitions added the program will refresh the perception and consumption at intervals, and write to another topic type + +```hocon + +env { + parallelism = 1 + job.mode = "STREAMING" +} + +source { + Rocketmq { + name.srv.addr = "localhost:9876" + topics = "test_topic" + result_table_name = "rocketmq_table" + start.mode = "CONSUME_FROM_FIRST_OFFSET" + batch.size = "400" + consumer.group = "test_topic_group" + format = "json" + format = json + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform +} +sink { + Rocketmq { + name.srv.addr = "localhost:9876" + topic = "test_topic" + partition.key.fields = ["c_int"] + producer.send.sync = true + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/S3-Redshift.md b/versioned_docs/version-2.3.7/connector-v2/sink/S3-Redshift.md new file mode 100644 index 000000000000..2e02e2f446a7 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/S3-Redshift.md @@ -0,0 +1,278 @@ +# S3Redshift + +> The way of S3Redshift is to write data into S3, and then use Redshift's COPY command to import data from S3 to Redshift. + +## Description + +Output data to AWS Redshift. + +> Tips: +> We based on the [S3File](S3File.md) to implement this connector. So you can use the same configuration as S3File. 
+> We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to S3 and this connector need some hadoop dependencies. +> It's only support hadoop version **2.6.5+**. + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + +## Options + +| name | type | required | default value | +|----------------------------------|---------|----------|-----------------------------------------------------------| +| jdbc_url | string | yes | - | +| jdbc_user | string | yes | - | +| jdbc_password | string | yes | - | +| execute_sql | string | yes | - | +| path | string | yes | - | +| bucket | string | yes | - | +| access_key | string | no | - | +| access_secret | string | no | - | +| hadoop_s3_properties | map | no | - | +| file_name_expression | string | no | "${transactionId}" | +| file_format_type | string | no | "text" | +| filename_time_format | string | no | "yyyy.MM.dd" | +| field_delimiter | string | no | '\001' | +| row_delimiter | string | no | "\n" | +| partition_by | array | no | - | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | +| is_partition_field_write_in_file | boolean | no | false | +| sink_columns | array | no | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | +| batch_size | int | no | 1000000 | +| common-options | | no | - | + +### jdbc_url + +The JDBC URL to connect to the Redshift database. + +### jdbc_user + +The JDBC user to connect to the Redshift database. + +### jdbc_password + +The JDBC password to connect to the Redshift database. + +### execute_sql + +The SQL to execute after the data is written to S3. + +eg: + +```sql + +COPY target_table FROM 's3://yourbucket${path}' IAM_ROLE 'arn:XXX' REGION 'your region' format as json 'auto'; +``` + +`target_table` is the table name in Redshift. + +`${path}` is the path of the file written to S3. please confirm your sql include this variable. and don't need replace it. we will replace it when execute sql. + +IAM_ROLE is the role that has permission to access S3. + +format is the format of the file written to S3. please confirm this format is same as the file format you set in the configuration. + +please refer to [Redshift COPY](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) for more details. + +please confirm that the role has permission to access S3. + +### path [string] + +The target dir path is required. + +### bucket [string] + +The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. + +### access_key [string] + +The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) + +### access_secret [string] + +The access secret of s3 file system. 
If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) + +### hadoop_s3_properties [map] + +If you need to add a other option, you could add it here and refer to this [Hadoop-AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) + +``` +hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } +``` + +### file_name_expression [string] + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### filename_time_format [string] + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +See [Java SimpleDateFormat](https://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html) for detailed time format syntax. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` and `csv` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` and `csv` file format. + +### partition_by [array] + +Partition data based on selected fields + +### partition_dir_expression [string] + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be written into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. 
For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +## Example + +For text file format + +```hocon + + S3Redshift { + jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" + jdbc_user = "xxx" + jdbc_password = "xxxx" + execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' removequotes emptyasnull blanksasnull maxerror 100 delimiter '|' ;" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/text" + row_delimiter="\n" + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type = "text" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } + } + +``` + +For parquet file format + +```hocon + + S3Redshift { + jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" + jdbc_user = "xxx" + jdbc_password = "xxxx" + execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as PARQUET;" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/parquet" + row_delimiter="\n" + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type = "parquet" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } + } + +``` + +For orc file format + +```hocon + + S3Redshift { + jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" + jdbc_user = "xxx" + jdbc_password = "xxxx" + execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as ORC;" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/orc" + row_delimiter="\n" + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type = "orc" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } + } + +``` + +## Changelog + +### 2.3.0-beta 2022-10-20 + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/S3File.md b/versioned_docs/version-2.3.7/connector-v2/sink/S3File.md new file mode 100644 index 000000000000..cb711f6b3b77 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/S3File.md @@ -0,0 +1,513 @@ +# S3File + +> S3 File Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Output data to aws s3 file system. + +## Supported DataSource Info + +| Datasource | Supported Versions | +|------------|--------------------| +| S3 | current | + +## Database Dependency + +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. +> +> If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under `${SEATUNNEL_HOME}/lib` to confirm this. +> To use this connector you need put `hadoop-aws-3.1.4.jar` and `aws-java-sdk-bundle-1.12.692.jar` in `${SEATUNNEL_HOME}/lib` dir. + +## Data Type Mapping + +If write to `csv`, `text` file type, All column will be string. + +### Orc File Type + +| SeaTunnel Data type | Orc Data type | +|----------------------|-----------------------| +| STRING | STRING | +| BOOLEAN | BOOLEAN | +| TINYINT | BYTE | +| SMALLINT | SHORT | +| INT | INT | +| BIGINT | LONG | +| FLOAT | FLOAT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| BYTES | BINARY | +| DATE | DATE | +| TIME
TIMESTAMP | TIMESTAMP | +| ROW | STRUCT | +| NULL | UNSUPPORTED DATA TYPE | +| ARRAY | LIST | +| Map | Map | + +### Parquet File Type + +| SeaTunnel Data type | Parquet Data type | +|----------------------|-----------------------| +| STRING | STRING | +| BOOLEAN | BOOLEAN | +| TINYINT | INT_8 | +| SMALLINT | INT_16 | +| INT | INT32 | +| BIGINT | INT64 | +| FLOAT | FLOAT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| BYTES | BINARY | +| DATE | DATE | +| TIME
TIMESTAMP | TIMESTAMP_MILLIS | +| ROW | GroupType | +| NULL | UNSUPPORTED DATA TYPE | +| ARRAY | LIST | +| Map | Map | + +## Sink Options + +| name | type | required | default value | Description | +|---------------------------------------|---------|----------|-------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | | +| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a S3 dir. | +| bucket | string | yes | - | | +| fs.s3a.endpoint | string | yes | - | | +| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. | +| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | +| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format is text | +| row_delimiter | string | no | "\n" | Only used when file_format is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used when have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml, specifies the tag name of the root element within the XML file. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml, specifies the tag name of the data rows within the XML file | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml, specifies Whether to process data using the tag attribute format. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. 
| +| hadoop_s3_properties | map | no | | If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before turning on the synchronous task, do different treatment of the target path | +| data_save_mode | Enum | no | APPEND_DATA | Before opening the synchronous task, the data file in the target path is differently processed | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### path [string] + +Store the path of the data file to support variable replacement. For example: path=/test/${database_name}/${schema_name}/${table_name} + +### hadoop_s3_properties [map] + +If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) + +``` +hadoop_s3_properties { + "fs.s3a.buffer.dir" = "/data/st_test/s3a" + "fs.s3a.fast.upload.buffer" = "disk" + } +``` + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. 
+ +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name [string] + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### schema_save_mode[Enum] + +Before turning on the synchronous task, do different treatment of the target path. +Option introduction: +`RECREATE_SCHEMA` :Will be created when the path does not exist. If the path already exists, delete the path and recreate it. +`CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the path does not exist, use the path when the path is existed. +`ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the path does not exist + +### data_save_mode[Enum] + +Before opening the synchronous task, the data file in the target path is differently processed. +Option introduction: +`DROP_DATA`: use the path but delete data files in the path. +`APPEND_DATA`:use the path, and add new files in the path for write data. +`ERROR_WHEN_DATA_EXISTS`:When there are some data files in the path, an error will is reported. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. + +## Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to S3File Sink. 
FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target s3 dir will also create a file and all of the data in write in it. +> Before run this job, you need create s3 path: /seatunnel/text. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + c_map = "map>" + c_array = "array" + name = string + c_boolean = boolean + age = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + S3File { + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/text" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.buffer.dir" = "/data/st_test/s3a" + "fs.s3a.fast.upload.buffer" = "disk" + } + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +For text file format with `have_partition` and `custom_filename` and `sink_columns` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` + +```hocon + + S3File { + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/text" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.buffer.dir" = "/data/st_test/s3a" + "fs.s3a.fast.upload.buffer" = "disk" + } + } + +``` + +For parquet file format simple config with 
`org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` + +```hocon + + S3File { + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/parquet" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + file_format_type = "parquet" + hadoop_s3_properties { + "fs.s3a.buffer.dir" = "/data/st_test/s3a" + "fs.s3a.fast.upload.buffer" = "disk" + } + } + +``` + +For orc file format simple config with `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` + +```hocon + + S3File { + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/orc" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + file_format_type = "orc" + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } + +``` + +Multi-table writing and saveMode + +```hocon +env { + "job.name"="SeaTunnel_job" + "job.mode"=STREAMING +} +source { + MySQL-CDC { + database-names=[ + "wls_t1" + ] + table-names=[ + "wls_t1.mysqlcdc_to_s3_t3", + "wls_t1.mysqlcdc_to_s3_t4", + "wls_t1.mysqlcdc_to_s3_t5", + "wls_t1.mysqlcdc_to_s3_t1", + "wls_t1.mysqlcdc_to_s3_t2" + ] + password="xxxxxx" + username="xxxxxxxxxxxxx" + base-url="jdbc:mysql://localhost:3306/qa_source" + } +} + +transform { +} + +sink { + S3File { + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel/${table_name}" + path="/test/${table_name}" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + file_format_type = "orc" + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + +## Changelog + +### 2.3.0-beta 2022-10-20 + +- Add S3File Sink Connector + +### 2.3.0 2022-12-30 + +- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) + - When field from upstream is null it will throw NullPointerException + - Sink columns mapping failed + - When restore writer from states getting transaction directly failed +- [Feature] Support S3A protocol ([3632](https://github.com/apache/seatunnel/pull/3632)) + - Allow user to add additional hadoop-s3 parameters + - Allow the use of the s3a protocol + - Decouple hadoop-aws dependencies +- [Improve] Support setting batch size for every file ([3625](https://github.com/apache/seatunnel/pull/3625)) +- [Feature]Set S3 AK to optional ([3688](https://github.com/apache/seatunnel/pull/)) + +### Next version + +- ​ [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/SelectDB-Cloud.md b/versioned_docs/version-2.3.7/connector-v2/sink/SelectDB-Cloud.md new file mode 100644 index 000000000000..41ca0ddaf254 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/SelectDB-Cloud.md @@ -0,0 +1,173 @@ +# SelectDB Cloud + +> SelectDB Cloud sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## Description + +Used to send data to SelectDB Cloud. Both support streaming and batch mode. +The internal implementation of SelectDB Cloud sink connector upload after batch caching and commit the CopyInto sql to load data into the table. + +## Supported DataSource Info + +:::tip + +Version Supported + +* supported `SelectDB Cloud version is >= 2.2.x` + +::: + +## Sink Options + +| Name | Type | Required | Default | Description | +|--------------------|--------|----------|------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| load-url | String | Yes | - | `SelectDB Cloud` warehouse http address, the format is `warehouse_ip:http_port` | +| jdbc-url | String | Yes | - | `SelectDB Cloud` warehouse jdbc address, the format is `warehouse_ip:mysql_port` | +| cluster-name | String | Yes | - | `SelectDB Cloud` cluster name | +| username | String | Yes | - | `SelectDB Cloud` user username | +| password | String | Yes | - | `SelectDB Cloud` user password | +| sink.enable-2pc | bool | No | true | Whether to enable two-phase commit (2pc), the default is true, to ensure Exactly-Once semantics. SelectDB uses cache files to load data. When the amount of data is large, cached data may become invalid (the default expiration time is 1 hour). If you encounter a large amount of data write loss, please configure sink.enable-2pc to false. | +| table.identifier | String | Yes | - | The name of `SelectDB Cloud` table, the format is `database.table` | +| sink.enable-delete | bool | No | false | Whether to enable deletion. This option requires SelectDB Cloud table to enable batch delete function, and only supports Unique model. | +| sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | +| sink.buffer-size | int | No | 10 * 1024 * 1024 (1MB) | the buffer size to cache data for stream load. | +| sink.buffer-count | int | No | 10000 | the buffer count to cache data for stream load. | +| selectdb.config | map | yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql,and supported formats. | + +## Data Type Mapping + +| SelectDB Cloud Data type | SeaTunnel Data type | +|--------------------------|-----------------------------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT
TINYINT | +| INT | INT
SMALLINT
TINYINT | +| BIGINT | BIGINT
INT
SMALLINT
TINYINT | +| LARGEINT | BIGINT
INT
SMALLINT
TINYINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE
FLOAT | +| DECIMAL | DECIMAL
DOUBLE
FLOAT | +| DATE | DATE | +| DATETIME | TIMESTAMP | +| CHAR | STRING | +| VARCHAR | STRING | +| STRING | STRING | +| ARRAY | ARRAY | +| MAP | MAP | +| JSON | STRING | +| HLL | Not supported yet | +| BITMAP | Not supported yet | +| QUANTILE_STATE | Not supported yet | +| STRUCT | Not supported yet | + +#### Supported import data formats + +The supported formats include CSV and JSON + +## Task Example + +### Simple: + +> The following example describes writing multiple data types to SelectDBCloud, and users need to create corresponding tables downstream + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + SelectDBCloud { + load-url = "warehouse_ip:http_port" + jdbc-url = "warehouse_ip:mysql_port" + cluster-name = "Cluster" + table.identifier = "test.test" + username = "admin" + password = "******" + selectdb.config { + file.type = "json" + } + } +} +``` + +### Use JSON format to import data + +``` +sink { + SelectDBCloud { + load-url = "warehouse_ip:http_port" + jdbc-url = "warehouse_ip:mysql_port" + cluster-name = "Cluster" + table.identifier = "test.test" + username = "admin" + password = "******" + selectdb.config { + file.type = "json" + } + } +} + +``` + +### Use CSV format to import data + +``` +sink { + SelectDBCloud { + load-url = "warehouse_ip:http_port" + jdbc-url = "warehouse_ip:mysql_port" + cluster-name = "Cluster" + table.identifier = "test.test" + username = "admin" + password = "******" + selectdb.config { + file.type = "csv" + file.column_separator = "," + file.line_delimiter = "\n" + } + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Sentry.md b/versioned_docs/version-2.3.7/connector-v2/sink/Sentry.md new file mode 100644 index 000000000000..1a31d1c87bed --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Sentry.md @@ -0,0 +1,78 @@ +# Sentry + +## Description + +Write message to Sentry. + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| dsn | string | yes | - | +| env | string | no | - | +| release | string | no | - | +| cacheDirPath | string | no | - | +| enableExternalConfiguration | boolean | no | - | +| maxCacheItems | number | no | - | +| flushTimeoutMills | number | no | - | +| maxQueueSize | number | no | - | +| common-options | | no | - | + +### dsn [string] + +The DSN tells the SDK where to send the events to. + +### env [string] + +specify the environment + +### release [string] + +specify the release + +### cacheDirPath [string] + +the cache dir path for caching offline events + +### enableExternalConfiguration [boolean] + +if loading properties from external sources is enabled. + +### maxCacheItems [number] + +The max cache items for capping the number of events Default is 30 + +### flushTimeoutMillis [number] + +Controls how many seconds to wait before flushing down. 
Sentry SDKs cache events from a background queue and this queue is given a certain amount to drain pending events Default is 15000 = 15s + +### maxQueueSize [number] + +Max queue size before flushing events/envelopes to the disk + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details + +## Example + +``` + Sentry { + dsn = "https://xxx@sentry.xxx.com:9999/6" + enableExternalConfiguration = true + maxCacheItems = 1000 + env = prod + } + +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Sentry Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/SftpFile.md b/versioned_docs/version-2.3.7/connector-v2/sink/SftpFile.md new file mode 100644 index 000000000000..7fdb542a2a68 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/SftpFile.md @@ -0,0 +1,266 @@ +# SftpFile + +> Sftp file sink connector + +## Description + +Output data to Sftp . + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +::: + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Options + +| name | type | required | default value | remarks | +|---------------------------------------|---------|----------|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| host | string | yes | - | | +| port | int | yes | - | | +| user | string | yes | - | | +| password | string | yes | - | | +| path | string | yes | - | | +| tmp_path | string | yes | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a FTP dir. | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. 
| +| xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | +| xml_row_tag | string | no | RECORD | Only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | +| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | +| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | +| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | + +### host [string] + +The target sftp host is required + +### port [int] + +The target sftp port is required + +### user [string] + +The target sftp user is required + +### password [string] + +The target sftp password is required + +### path [string] + +The target dir path is required. + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be wrote to file, default value is all the columns get from `Transform` or `Source`. 
+The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory + +When File Format is Excel,The maximum number of data items that can be cached in the memory. + +### sheet_name + +Writer the sheet of the workbook + +### xml_root_tag [string] + +Specifies the tag name of the root element within the XML file. + +### xml_row_tag [string] + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Specifies Whether to process data using the tag attribute format. + +### parquet_avro_write_timestamp_as_int96 [boolean] + +Support writing Parquet INT96 from a timestamp, only valid for parquet files. + +### parquet_avro_write_fixed_as_int96 [array] + +Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. 
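+
+A minimal sketch of a sink block that overrides the default encoding (host, credentials, and paths below are placeholders):
+
+```hocon
+SftpFile {
+    host = "xxx.xxx.xxx.xxx"
+    port = 22
+    user = "username"
+    password = "password"
+    path = "/data/sftp/seatunnel/job1"
+    tmp_path = "/data/sftp/seatunnel/tmp"
+    file_format_type = "json"
+    # Written files are encoded as GBK instead of the default UTF-8
+    encoding = "gbk"
+}
+```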
+ +## Example + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```bash + +SftpFile { + host = "xxx.xxx.xxx.xxx" + port = 22 + user = "username" + password = "password" + path = "/data/sftp/seatunnel/job1" + tmp_path = "/data/sftp/seatunnel/tmp" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true +} + +``` + +## Changelog + +### 2.3.0 2022-12-30 + +- Add SftpFile Sink Connector +- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) + - When field from upstream is null it will throw NullPointerException + - Sink columns mapping failed + - When restore writer from states getting transaction directly failed +- [Improve] Support setting batch size for every file ([3625](https://github.com/apache/seatunnel/pull/3625)) + +### Next version + +- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Slack.md b/versioned_docs/version-2.3.7/connector-v2/sink/Slack.md new file mode 100644 index 000000000000..7ed87d2022c3 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Slack.md @@ -0,0 +1,54 @@ +# Slack + +> Slack sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +## Description + +Used to send data to Slack Channel. Both support streaming and batch mode. + +> For example, if the data from upstream is [`age: 12, name: huan`], the content send to socket server is the following: `{"name":"huan","age":17}` + +## Data Type Mapping + +All data types are mapped to string. + +## Options + +| Name | Type | Required | Default | Description | +|----------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------| +| webhooks_url | String | Yes | - | Slack webhook url | +| oauth_token | String | Yes | - | Slack oauth token used for the actual authentication | +| slack_channel | String | Yes | - | slack channel for data write | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +## Task Example + +### Simple: + +```hocon +sink { + SlackSink { + webhooks_url = "https://hooks.slack.com/services/xxxxxxxxxxxx/xxxxxxxxxxxx/xxxxxxxxxxxxxxxx" + oauth_token = "xoxp-xxxxxxxxxx-xxxxxxxx-xxxxxxxxx-xxxxxxxxxxx" + slack_channel = "channel name" + } +} +``` + +## Changelog + +### new version + +- Add Slack Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Snowflake.md b/versioned_docs/version-2.3.7/connector-v2/sink/Snowflake.md new file mode 100644 index 000000000000..91a64d0009d9 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Snowflake.md @@ -0,0 +1,142 @@ +# Snowflake + +> JDBC Snowflake Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing. + +## Supported DataSource list + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|-------------------------------------------|------------------------------------------------------------|-----------------------------------------------------------------------------| +| snowflake | Different dependency version has different driver class. | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | [Download](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc) | + +## Database dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Snowflake datasource: cp snowflake-connector-java-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Snowflake Data Type | SeaTunnel Data Type | +|-----------------------------------------------------------------------------|---------------------| +| BOOLEAN | BOOLEAN | +| TINYINT
SMALLINT
BYTEINT
| SHORT_TYPE | +| INT
INTEGER
| INT | +| BIGINT | LONG | +| DECIMAL
NUMERIC
NUMBER
| DECIMAL(x,y) | +| DECIMAL(x,y) (when the designated column's specified column size > 38) | DECIMAL(38,18) | +| REAL
FLOAT4 | FLOAT | +| DOUBLE
DOUBLE PRECISION
FLOAT8
FLOAT
| DOUBLE | +| CHAR
CHARACTER
VARCHAR
STRING
TEXT
VARIANT
OBJECT | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP
TIMESTAMP_LTZ
TIMESTAMP_NTZ
TIMESTAMP_TZ | TIMESTAMP | +| BINARY
VARBINARY
GEOGRAPHY
GEOMETRY | BYTES | + +## Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | +| database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name to auto-generate SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when SQL is automatically generated. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT and UPDATE SQL to process update events (INSERT, UPDATE_AFTER) based on whether the queried primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries when a batch submit (executeBatch) fails | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| properties | Map | No | - | Additional connection configuration parameters. When properties and the URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +## tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. +> + ## Task Example + +### simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your snowflake database. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} +sink { + jdbc { + url = "jdbc:snowflake://.snowflakecomputing.com" + driver = "net.snowflake.client.jdbc.SnowflakeDriver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### CDC(Change data capture) event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. + +``` +sink { + jdbc { + url = "jdbc:snowflake://.snowflakecomputing.com" + driver = "net.snowflake.client.jdbc.SnowflakeDriver" + user = "root" + password = "123456" + generate_sink_sql = true + + + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Socket.md b/versioned_docs/version-2.3.7/connector-v2/sink/Socket.md new file mode 100644 index 000000000000..a23490073747 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Socket.md @@ -0,0 +1,79 @@ +# Socket + +> Socket sink connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Description + +Used to send data to Socket Server. Both support streaming and batch mode. + +> For example, if the data from upstream is [`age: 12, name: jared`], the content send to socket server is the following: `{"name":"jared","age":17}` + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------|---------|----------|---------|---------------------------------------------------------------------------------------------------------| +| host | String | Yes | | socket server host | +| port | Integer | Yes | | socket server port | +| max_retries | Integer | No | 3 | The number of retries to send record failed | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example + +> This is randomly generated data written to the Socket side + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" +} + +source { + FakeSource { + result_table_name = "fake" + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Socket { + host = "localhost" + port = 9999 + } +} +``` + +* Start a port listening + +```shell +nc -l -v 9999 +``` + +* Start a SeaTunnel task + +* Socket Server Console print data + +```text +{"name":"jared","age":17} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Socket Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/SqlServer.md b/versioned_docs/version-2.3.7/connector-v2/sink/SqlServer.md new file mode 100644 index 000000000000..1a50a01d6a66 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/SqlServer.md @@ -0,0 +1,182 @@ +# SQL Server + +> JDBC SQL Server Sink Connector + +## Support SQL Server Version + +- server:2008 (Or later version for information only) + +## Support Those engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|-------------------------|----------------------------------------------|---------------------------------|-----------------------------------------------------------------------------------| +| SQL Server | support version >= 2008 | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | [Download](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) | + +## Database dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example SQL Server datasource: cp mssql-jdbc-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| SQLserver Data Type | SeaTunnel Data Type | +|-----------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT | BOOLEAN | +| TINYINT
SMALLINT | SHORT | +| INTEGER | INT | +| BIGINT | LONG | +| DECIMAL
NUMERIC
MONEY
SMALLMONEY | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the
decimal point.))) | +| REAL | FLOAT | +| FLOAT | DOUBLE | +| CHAR
NCHAR
VARCHAR
NTEXT
NVARCHAR
TEXT | STRING | +| DATE | LOCAL_DATE | +| TIME | LOCAL_TIME | +| DATETIME
DATETIME2
SMALLDATETIME
DATETIMEOFFSET | LOCAL_DATE_TIME | +| TIMESTAMP
BINARY
VARBINARY
IMAGE
UNKNOWN | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:sqlserver://localhost:1433;databaseName=mydatabase | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use SQL Server the value is `com.microsoft.sqlserver.jdbc.SQLServerDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | +| database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name to auto-generate SQL and write upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when SQL is automatically generated. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT and UPDATE SQL to process update events (INSERT, UPDATE_AFTER) based on whether the queried primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries when a batch submit (executeBatch) fails | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, SqlServer is `com.microsoft.sqlserver.jdbc.SQLServerXADataSource`, and
please refer to the appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +## tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### simple: + +> This is one that reads Sqlserver data and inserts it directly into another table + +``` +env { + # You can set engine configuration here + parallelism = 10 +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + Jdbc { + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + query = "select * from column_type_test.dbo.full_types_jdbc" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/source/Jdbc +} + +transform { + + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Jdbc { + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + query = "insert into full_types_jdbc_sink( id, val_char, val_varchar, val_text, val_nchar, val_nvarchar, val_ntext, val_decimal, val_numeric, val_float, val_real, val_smallmoney, val_money, val_bit, val_tinyint, val_smallint, val_int, val_bigint, val_date, val_time, val_datetime2, val_datetime, val_smalldatetime ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" + + } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc +} +``` + +### CDC(Change data capture) event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. 
+ +``` +Jdbc { + source_table_name = "customers" + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + generate_sink_sql = true + database = "column_type_test" + table = "dbo.full_types_sink" + batch_size = 100 + primary_keys = ["id"] +} +``` + +### Exactly Once Sink + +> Transactional writes may be slower but more accurate to the data + +``` + Jdbc { + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + query = "insert into full_types_jdbc_sink( id, val_char, val_varchar, val_text, val_nchar, val_nvarchar, val_ntext, val_decimal, val_numeric, val_float, val_real, val_smallmoney, val_money, val_bit, val_tinyint, val_smallint, val_int, val_bigint, val_date, val_time, val_datetime2, val_datetime, val_smalldatetime ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" + is_exactly_once = "true" + + xa_data_source_class_name = "com.microsoft.sqlserver.jdbc.SQLServerXADataSource" + + } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc + +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/StarRocks.md b/versioned_docs/version-2.3.7/connector-v2/sink/StarRocks.md new file mode 100644 index 000000000000..5fe57cd3f4eb --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/StarRocks.md @@ -0,0 +1,377 @@ +# StarRocks + +> StarRocks sink connector + +## Support These Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) +- [x] [support multiple table write](../../concept/connector-v2-features.md) + +## Description + +Used to send data to StarRocks. Both support streaming and batch mode. +The internal implementation of StarRocks sink connector is cached and imported by stream load in batches. + +## Sink Options + +| Name | Type | Required | Default | Description | +|-----------------------------|---------|----------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| nodeUrls | list | yes | - | `StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` | +| base-url | string | yes | - | The JDBC URL like `jdbc:mysql://localhost:9030/` or `jdbc:mysql://localhost:9030` or `jdbc:mysql://localhost:9030/db` | +| username | string | yes | - | `StarRocks` user username | +| password | string | yes | - | `StarRocks` user password | +| database | string | yes | - | The name of StarRocks database | +| table | string | no | - | The name of StarRocks table, If not set, the table name will be the name of the upstream table | +| labelPrefix | string | no | - | The prefix of StarRocks stream load label | +| batch_max_rows | long | no | 1024 | For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `checkpoint.interval`, the data will be flushed into the StarRocks | +| batch_max_bytes | int | no | 5 * 1024 * 1024 | For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `checkpoint.interval`, the data will be flushed into the StarRocks | +| max_retries | int | no | - | The number of retries to flush failed | +| retry_backoff_multiplier_ms | int | no | - | Using as a multiplier for generating the next delay for backoff | +| max_retry_backoff_ms | int | no | - | The amount of time to wait before attempting to retry a request to `StarRocks` | +| enable_upsert_delete | boolean | no | false | Whether to enable upsert/delete, only supports PrimaryKey model. | +| save_mode_create_template | string | no | see below | see below | +| starrocks.config | map | no | - | The parameter of the stream load `data_desc` | +| http_socket_timeout_ms | int | no | 180000 | Set http socket timeout, default is 3 minutes. | +| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | +| data_save_mode | Enum | no | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | +| custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | + +### save_mode_create_template + +We use templates to automatically create starrocks tables, +which will create corresponding table creation statements based on the type of upstream data and schema type, +and the default template can be modified according to the situation. Only work on multi-table mode at now. 
+ +Default template: + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( +${rowtype_primary_key}, +${rowtype_fields} +) ENGINE=OLAP +PRIMARY KEY (${rowtype_primary_key}) +DISTRIBUTED BY HASH (${rowtype_primary_key})PROPERTIES ( +"replication_num" = "1" +) +``` + +If a custom field is filled in the template, such as adding an `id` field + +```sql +CREATE TABLE IF NOT EXISTS `${database}`.`${table}` +( + id, + ${rowtype_fields} +) ENGINE = OLAP DISTRIBUTED BY HASH (${rowtype_primary_key}) + PROPERTIES +( + "replication_num" = "1" +); +``` + +The connector will automatically obtain the corresponding type from the upstream to complete the filling, +and remove the id field from `rowtype_fields`. This method can be used to customize the modification of field types and attributes. + +You can use the following placeholders + +- database: Used to get the database in the upstream schema +- table_name: Used to get the table name in the upstream schema +- rowtype_fields: Used to get all the fields in the upstream schema, we will automatically map to the field + description of StarRocks +- rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list) +- rowtype_unique_key: Used to get the unique key in the upstream schema (maybe a list) + +### table [string] + +Use `database` and this `table-name` auto-generate sql and receive upstream input datas write to database. + +This option is mutually exclusive with `query` and has a higher priority. + +The table parameter can fill in the name of an unwilling table, which will eventually be used as the table name of the creation table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. + +for example: +1. test_${schema_name}_${table_name}_test +2. sink_sinktable +3. ss_${table_name} + +### schema_save_mode[Enum] + +Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. +Option introduction: +`RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved +`CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved +`ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist + +### data_save_mode[Enum] + +Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. +Option introduction: +`DROP_DATA`: Preserve database structure and delete data +`APPEND_DATA`:Preserve database structure, preserve data +`CUSTOM_PROCESSING`:User defined processing +`ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported + +### custom_sql[String] + +When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. 
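+
+For example, a minimal sketch that clears the target table before the synchronization task starts (the cleanup SQL below is a placeholder; adjust it to your own database and table):
+
+```hocon
+sink {
+  StarRocks {
+    nodeUrls = ["e2e_starRocksdb:8030"]
+    username = root
+    password = ""
+    database = "test"
+    table = "e2e_table_sink"
+    schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST"
+    # CUSTOM_PROCESSING runs custom_sql once before the synchronization task starts
+    data_save_mode = "CUSTOM_PROCESSING"
+    custom_sql = "TRUNCATE TABLE test.e2e_table_sink"
+  }
+}
+```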
+ +## Data Type Mapping + +| StarRocks Data type | SeaTunnel Data type | +|---------------------|---------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT | +| INT | INT | +| BIGINT | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| DATE | STRING | +| TIME | STRING | +| DATETIME | STRING | +| STRING | STRING | +| ARRAY | STRING | +| MAP | STRING | +| BYTES | STRING | + +#### Supported import data formats + +The supported formats include CSV and JSON + +## Task Example + +### Simple: + +> The following example describes writing multiple data types to StarRocks, and users need to create corresponding tables downstream + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + batch_max_rows = 10 + starrocks.config = { + format = "JSON" + strip_outer_array = true + } + } +} +``` + +### Support write cdc changelog event(INSERT/UPDATE/DELETE) + +```hocon +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + ... + + // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. + enable_upsert_delete = true + } +} +``` + +### Use JSON format to import data + +``` +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + batch_max_rows = 10 + starrocks.config = { + format = "JSON" + strip_outer_array = true + } + } +} + +``` + +### Use CSV format to import data + +``` +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + batch_max_rows = 10 + starrocks.config = { + format = "CSV" + column_separator = "\\x01" + row_delimiter = "\\x02" + } + } +} +``` + +### Use save_mode function + +``` +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "test_${schema_name}_${table_name}" + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + batch_max_rows = 10 + starrocks.config = { + format = "CSV" + column_separator = "\\x01" + row_delimiter = "\\x02" + } + } +} +``` + +### Multiple table + +#### example1 + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + Mysql-CDC { + base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" + username = "root" + password = "******" + + table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] + } +} + +transform { +} + +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "${database_name}_test" + table = "${table_name}_test" + ... + + // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. 
+ enable_upsert_delete = true + } +} +``` + +#### example2 + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = oracle.jdbc.driver.OracleDriver + url = "jdbc:oracle:thin:@localhost:1521/XE" + user = testUser + password = testPassword + + table_list = [ + { + table_path = "TESTSCHEMA.TABLE_1" + }, + { + table_path = "TESTSCHEMA.TABLE_2" + } + ] + } +} + +transform { +} + +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "${schema_name}_test" + table = "${table_name}_test" + ... + + // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. + enable_upsert_delete = true + } +} +``` + +## Changelog + +### next version + +- Add StarRocks Sink Connector +- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/seatunnel/pull/3719) +- [Feature] Support write cdc changelog event(INSERT/UPDATE/DELETE) [3865](https://github.com/apache/seatunnel/pull/3865) + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/TDengine.md b/versioned_docs/version-2.3.7/connector-v2/sink/TDengine.md new file mode 100644 index 000000000000..455e0effa20f --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/TDengine.md @@ -0,0 +1,71 @@ +# TDengine + +> TDengine sink connector + +## Description + +Used to write data to TDengine. You need to create stable before running seatunnel task + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------|--------|----------|---------------| +| url | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| database | string | yes | | +| stable | string | yes | - | +| timezone | string | no | UTC | + +### url [string] + +the url of the TDengine when you select the TDengine + +e.g. 
+ +``` +jdbc:TAOS-RS://localhost:6041/ +``` + +### username [string] + +the username of the TDengine when you select + +### password [string] + +the password of the TDengine when you select + +### database [string] + +the database of the TDengine when you select + +### stable [string] + +the stable of the TDengine when you select + +### timezone [string] + +the timeznoe of the TDengine sever, it's important to the ts field + +## Example + +### sink + +```hocon +sink { + TDengine { + url : "jdbc:TAOS-RS://localhost:6041/" + username : "root" + password : "taosdata" + database : "power2" + stable : "meters2" + timezone: UTC + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Tablestore.md b/versioned_docs/version-2.3.7/connector-v2/sink/Tablestore.md new file mode 100644 index 000000000000..8f161ad25f6e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Tablestore.md @@ -0,0 +1,72 @@ +# Tablestore + +> Tablestore sink connector + +## Description + +Write data to `Tablestore` + +## Key features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------|--------|----------|---------------| +| end_point | string | yes | - | +| instance_name | string | yes | - | +| access_key_id | string | yes | - | +| access_key_secret | string | yes | - | +| table | string | yes | - | +| primary_keys | array | yes | - | +| batch_size | string | no | 25 | +| common-options | config | no | - | + +### end_point [string] + +endPoint to write to Tablestore. + +### instanceName [string] + +The instanceName of Tablestore. + +### access_key_id [string] + +The access id of Tablestore. + +### access_key_secret [string] + +The access secret of Tablestore. + +### table [string] + +The table of Tablestore. + +### primaryKeys [array] + +The primaryKeys of Tablestore. + +### common options [ config ] + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +## Example + +```bash +Tablestore { + end_point = "xxxx" + instance_name = "xxxx" + access_key_id = "xxxx" + access_key_secret = "xxxx" + table = "sink" + primary_keys = ["pk_1","pk_2","pk_3","pk_4"] + } +``` + +## Changelog + +### next version + +- Add Tablestore Sink Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/Vertica.md b/versioned_docs/version-2.3.7/connector-v2/sink/Vertica.md new file mode 100644 index 000000000000..dc302c5d7d5e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/Vertica.md @@ -0,0 +1,183 @@ +# Vertica + +> JDBC Vertica Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Write data through JDBC. Supports batch mode and streaming mode, concurrent writing, and exactly-once semantics (using XA transaction guarantee). + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> `Xa transactions` are used to ensure `exactly-once`, so `exactly-once` is only supported for databases that support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|------------------------------------------------------------|-------------------------|---------------------------------------|-----------------------------------------------------------------------| +| Vertica | Different dependency versions have different driver classes. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Vertica datasource: cp vertica-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Vertica Data Type | SeaTunnel Data Type | +|---------------------------------------------------------------------|-----------------------------------------------------------------------| +| BIT(1)
INT UNSIGNED | BOOLEAN | +| TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the decimal point.)) | +| FLOAT
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Vertica the value is `com.vertica.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | +| database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and receive upstream input data to write to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use `database` and this `table-name` to auto-generate SQL and receive upstream input data to write to the database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating SQL. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT and UPDATE SQL to process update events (INSERT, UPDATE_AFTER) based on whether the query primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, vertical is `com.vertical.cj.jdbc.VerticalXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your vertical. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + user = "root" + password = "123456" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.vertical.cj.jdbc.VerticalXADataSource" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/sink/common-options.md 
b/versioned_docs/version-2.3.7/connector-v2/sink/common-options.md new file mode 100644 index 000000000000..bfcdc26a2bb8 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/sink/common-options.md @@ -0,0 +1,53 @@ +# Sink Common Options + +> Common parameters of sink connectors + +| Name | Type | Required | Default | Description | +|-------------------|--------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| source_table_name | String | No | - | When `source_table_name` is not specified, the current plug-in processes the data set `dataset` output by the previous plugin in the configuration file
When `source_table_name` is specified, the current plug-in is processing the data set corresponding to this parameter. | + +# Important note + +When the job configuration `source_table_name` you must set the `result_table_name` parameter + +## Task Example + +### Simple: + +> This is the process of passing a data source through two transforms and returning two different pipiles to different sinks + +```bash +source { + FakeSourceStream { + parallelism = 2 + result_table_name = "fake" + field_name = "name,age" + } +} + +transform { + Filter { + source_table_name = "fake" + fields = [name] + result_table_name = "fake_name" + } + Filter { + source_table_name = "fake" + fields = [age] + result_table_name = "fake_age" + } +} + +sink { + Console { + source_table_name = "fake_name" + } + Console { + source_table_name = "fake_age" + } +} +``` + +> If the job only have one source and one(or zero) transform and one sink, You do not need to specify `source_table_name` and `result_table_name` for connector. +> If the number of any operator in source, transform and sink is greater than 1, you must specify the `source_table_name` and `result_table_name` for each connector in the job. + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/AmazonDynamoDB.md b/versioned_docs/version-2.3.7/connector-v2/source/AmazonDynamoDB.md new file mode 100644 index 000000000000..3261046b7398 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/AmazonDynamoDB.md @@ -0,0 +1,120 @@ +# AmazonDynamoDB + +> AmazonDynamoDB source connector + +## Description + +Read data from Amazon DynamoDB. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------|--------|----------|---------------| +| url | string | yes | - | +| region | string | yes | - | +| access_key_id | string | yes | - | +| secret_access_key | string | yes | - | +| table | string | yes | - | +| schema | config | yes | - | +| common-options | | yes | - | +| scan_item_limit | | false | - | +| parallel_scan_threads | | false | - | + +### url [string] + +The URL to read to Amazon Dynamodb. + +### region [string] + +The region of Amazon Dynamodb. + +### accessKeyId [string] + +The access id of Amazon DynamoDB. + +### secretAccessKey [string] + +The access secret of Amazon DynamoDB. + +### table [string] + +The table of Amazon DynamoDB. + +### schema [Config] + +#### fields [config] + +Amazon Dynamodb is a NOSQL database service of support keys-value storage and document data structure,there is no way to get the data type.Therefore, we must configure schema. 
+ +such as: + +``` +schema { + fields { + id = int + key_aa = string + key_bb = string + } +} +``` + +### common options + +Source Plugin common parameters, refer to [Source Plugin](common-options.md) for details + +### scan_item_limit + +number of item each scan request should return + +### parallel_scan_threads + +number of logical segments for parallel scan + +## Example + +```bash +Amazondynamodb { + url = "http://127.0.0.1:8000" + region = "us-east-1" + accessKeyId = "dummy-key" + secretAccessKey = "dummy-secret" + table = "TableName" + schema = { + fields { + artist = string + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } +} +``` + +## Changelog + +### next version + +- Add Amazon DynamoDB Source Connector +- Add source split to Amazondynamodb Connectors + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/AmazonSqs.md b/versioned_docs/version-2.3.7/connector-v2/source/AmazonSqs.md new file mode 100644 index 000000000000..accd6ec6c67f --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/AmazonSqs.md @@ -0,0 +1,81 @@ +# AmazonSqs + +> AmazonSqs source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Read data from Amazon SQS. + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The Queue URL to read from Amazon SQS. | +| region | String | No | - | The AWS region for the SQS service | +| schema | Config | No | - | The structure of the data, including field names and field types. | +| format | String | No | json | Data format. The default format is json. Optional text format, canal-json and debezium-json.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | +| format_error_handle_way | String | No | fail | The processing method of data format error. The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example + +```bash +source { + AmazonSqs { + url = "http://127.0.0.1:4566" + region = "us-east-1" + format = text + field_delimiter = "#" + schema = { + fields { + artist = string + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +## Changelog + +### next version + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Cassandra.md b/versioned_docs/version-2.3.7/connector-v2/source/Cassandra.md new file mode 100644 index 000000000000..d4d4e97088ad --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Cassandra.md @@ -0,0 +1,80 @@ +# Cassandra + +> Cassandra source connector + +## Description + +Read data from Apache Cassandra. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------|--------|----------|---------------| +| host | String | Yes | - | +| keyspace | String | Yes | - | +| cql | String | Yes | - | +| username | String | No | - | +| password | String | No | - | +| datacenter | String | No | datacenter1 | +| consistency_level | String | No | LOCAL_ONE | + +### host [string] + +`Cassandra` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as +`"cassandra1:9042,cassandra2:9042"`. + +### keyspace [string] + +The `Cassandra` keyspace. + +### cql [String] + +The query cql used to search data though Cassandra session. + +### username [string] + +`Cassandra` user username. + +### password [string] + +`Cassandra` user password. + +### datacenter [String] + +The `Cassandra` datacenter, default is `datacenter1`. + +### consistency_level [String] + +The `Cassandra` write consistency level, default is `LOCAL_ONE`. + +## Examples + +```hocon +source { + Cassandra { + host = "localhost:9042" + username = "cassandra" + password = "cassandra" + datacenter = "datacenter1" + keyspace = "test" + cql = "select * from source_table" + result_table_name = "source_table" + } +} +``` + +## Changelog + +### next version + +- Add Cassandra Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Clickhouse.md b/versioned_docs/version-2.3.7/connector-v2/source/Clickhouse.md new file mode 100644 index 000000000000..6fe0a5bb56bc --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Clickhouse.md @@ -0,0 +1,101 @@ +# Clickhouse + +> Clickhouse source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Used to read data from Clickhouse. + +## Supported DataSource Info + +In order to use the Clickhouse connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|------------------------------------------------------------------------------------------------------------------| +| Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) | + +## Data Type Mapping + +| Clickhouse Data Type | SeaTunnel Data Type | +|-----------------------------------------------------------------------------------------------------------------------------------------------|---------------------| +| String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | STRING | +| Int8 / UInt8 / Int16 / UInt16 / Int32 | INT | +| UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | BIGINT | +| Float64 | DOUBLE | +| Decimal | DECIMAL | +| Float32 | FLOAT | +| Date | DATE | +| DateTime | TIME | +| Array | ARRAY | +| Map | MAP | + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------|--------|----------|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . | +| database | String | Yes | - | The `ClickHouse` database. | +| sql | String | Yes | - | The query sql used to search data though Clickhouse server. | +| username | String | Yes | - | `ClickHouse` user username. | +| password | String | Yes | - | `ClickHouse` user password. | +| clickhouse.config | Map | No | - | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. | +| server_time_zone | String | No | ZoneId.systemDefault() | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. 
| + +## How to Create a Clickhouse Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that reads data from Clickhouse and prints it on the local client: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 10 + job.mode = "BATCH" +} + +# Create a source to connect to Clickhouse +source { + Clickhouse { + host = "localhost:8123" + database = "default" + sql = "select * from test where age = 20 limit 100" + username = "xxxxx" + password = "xxxxx" + server_time_zone = "UTC" + result_table_name = "test" + clickhouse.config = { + "socket_timeout": "300000" + } + } +} + +# Console printing of the read Clickhouse data +sink { + Console { + parallelism = 1 + } +} +``` + +### Tips + +> 1.[SeaTunnel Deployment Document](../../start-v2/locally/deployment.md). + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/CosFile.md b/versioned_docs/version-2.3.7/connector-v2/source/CosFile.md new file mode 100644 index 000000000000..05efe7b9fd64 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/CosFile.md @@ -0,0 +1,368 @@ +# CosFile + +> Cos file source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from aliyun Cos file system. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in ${SEATUNNEL_HOME}/lib dir, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+. + +::: + +## Options + +| name | type | required | default value | +|---------------------------|---------|----------|---------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| bucket | string | yes | - | +| secret_id | string | yes | - | +| secret_key | string | yes | - | +| region | string | yes | - | +| read_columns | list | yes | - | +| delimiter/field_delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| skip_header_row_number | long | no | 0 | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | - | +| compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | + +### path [string] + +The source file path. + +### file_format_type [string] + +File type, supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. + +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. 
+ +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +If you assign file type to `binary`, SeaTunnel can synchronize files in any format, +such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. +Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization +at the same time. You can find the specific usage in the example below. + +### bucket [string] + +The bucket address of Cos file system, for example: `Cos://tyrantlucifer-image-bed` + +### secret_id [string] + +The secret id of Cos file system. + +### secret_key [string] + +The secret key of Cos file system. + +### region [string] + +The region of cos file system. + +### read_columns [list] + +The read column list of the data source, user can use it to implement field projection. + +### delimiter/field_delimiter [string] + +**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. + +Only need to be configured when file_format is text. + +Field delimiter, used to tell connector how to slice and dice fields + +default `\001`, the same as hive's default delimiter + +### parse_partition_from_path [boolean] + +Control whether parse the partition keys and values from file path + +For example if you read a file from path `cosn://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` + +Every record data from file will be added these two fields: + +| name | age | +|---------------|-----| +| tyrantlucifer | 26 | + +Tips: **Do not define partition fields in schema option** + +### skip_header_row_number [long] + +Skip the first few lines, but only for the txt and csv. + +For example, set like following: + +`skip_header_row_number = 2` + +then SeaTunnel will skip the first 2 lines from source files + +### date_format [string] + +Date type format, used to tell connector how to convert string to date, supported as the following formats: + +`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` + +default `yyyy-MM-dd` + +### datetime_format [string] + +Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: + +`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` + +default `yyyy-MM-dd HH:mm:ss` + +### time_format [string] + +Time type format, used to tell connector how to convert string to time, supported as the following formats: + +`HH:mm:ss` `HH:mm:ss.SSS` + +default `HH:mm:ss` + +### schema [config] + +Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). + +#### fields [Config] + +The schema of upstream data. + +### sheet_name [string] + +Only need to be configured when file_format is excel. + +Reader the sheet of the workbook. + +### xml_row_tag [string] + +Only need to be configured when file_format is xml. 
+ +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Only need to be configured when file_format is xml. + +Specifies Whether to process data using the tag attribute format. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +## Example + +```hocon + + CosFile { + path = "/seatunnel/orc" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "orc" + } + +``` + +```hocon + + CosFile { + path = "/seatunnel/json" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "json" + schema { + fields { + id = int + name = string + } + } + } + +``` + +### Transfer Binary File + +```hocon + +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + CosFile { + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + path = "/seatunnel/read/binary/" + file_format_type = "binary" + } +} +sink { + // you can transfer local file to s3/hdfs/oss etc. + CosFile { + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + path = "/seatunnel/read/binary2/" + file_format_type = "binary" + } +} + +``` + +## Changelog + +### next version + +- Add file cos source connector ([4979](https://github.com/apache/seatunnel/pull/4979)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/DB2.md b/versioned_docs/version-2.3.7/connector-v2/source/DB2.md new file mode 100644 index 000000000000..0d2df826a055 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/DB2.md @@ -0,0 +1,165 @@ +# DB2 + +> JDBC DB2 Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Read external data source data through JDBC. + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| +| DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example DB2 datasource: cp db2-connector-java-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| DB2 Data Type | SeaTunnel Data Type | +|------------------------------------------------------------------------------------------------------|---------------------| +| BOOLEAN | BOOLEAN | +| SMALLINT | SHORT | +| INT
INTEGER
| INTEGER | +| BIGINT | LONG | +| DECIMAL
DEC
NUMERIC
NUM | DECIMAL(38,18) | +| REAL | FLOAT | +| FLOAT
DOUBLE
DOUBLE PRECISION
DECFLOAT | DOUBLE | +| CHAR
VARCHAR
LONG VARCHAR
CLOB
GRAPHIC
VARGRAPHIC
LONG VARGRAPHIC
DBCLOB | STRING | +| BLOB | BYTES | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| ROWID
XML | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use DB2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition. Only numeric types are supported, only a numeric primary key is supported, and only one column can be configured. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan. If not set, SeaTunnel will query the database to get the min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan. If not set, SeaTunnel will query the database to get the max value. | +| partition_num | Int | No | job parallelism | The number of partitions, only positive integers are supported. The default value is the job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use the JDBC default value. | +| properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from table_xxx" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +source { + Jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Doris.md b/versioned_docs/version-2.3.7/connector-v2/source/Doris.md new file mode 100644 index 000000000000..c67444b58c8e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Doris.md @@ -0,0 +1,162 @@ +# Doris + +> Doris source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [schema projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Used to read data from Doris. +Doris Source will send a SQL to FE, FE will parse it into an execution plan, send it to BE, and BE will +directly return the data + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|--------------------------------------|--------|-----|-------| +| Doris | Only Doris2.0 or later is supported. | - | - | - | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' +> working directory
+ +## Data Type Mapping + +| Doris Data type | SeaTunnel Data type | +|--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| INT | INT | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT | +| BIGINT | BIGINT | +| LARGEINT | STRING | +| BOOLEAN | BOOLEAN | +| DECIMAL | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the decimal point.)) | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| CHAR
VARCHAR
STRING
TEXT | STRING | +| DATE | DATE | +| DATETIME
DATETIME(p) | TIMESTAMP | +| ARRAY | ARRAY | + +## Source Options + +| Name | Type | Required | Default | Description | +|----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------| +| fenodes | string | yes | - | FE address, the format is `"fe_host:fe_http_port"` | +| username | string | yes | - | User username | +| password | string | yes | - | User password | +| database | string | yes | - | The name of Doris database | +| table | string | yes | - | The name of Doris table | +| doris.read.field | string | no | - | Use the 'doris.read.field' parameter to select the doris table columns to read | +| query-port | string | no | 9030 | Doris QueryPort | +| doris.filter.query | string | no | - | Data filtering in doris. the format is "field = value",example : doris.filter.query = "F_ID > 2" | +| doris.batch.size | int | no | 1024 | The maximum value that can be obtained by reading Doris BE once. | +| doris.request.query.timeout.s | int | no | 3600 | Timeout period of Doris scan data, expressed in seconds. | +| doris.exec.mem.limit | long | no | 2147483648 | Maximum memory that can be used by a single be scan request. The default memory is 2G (2147483648). | +| doris.request.retries | int | no | 3 | Number of retries to send requests to Doris FE. | +| doris.request.read.timeout.ms | int | no | 30000 | | +| doris.request.connect.timeout.ms | int | no | 30000 | | + +### Tips + +> It is not recommended to modify advanced parameters at will + +## Task Example + +> This is an example of reading a Doris table and writing to Console. + +``` +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Doris { + fenodes = "doris_e2e:8030" + username = root + password = "" + database = "e2e_source" + table = "doris_e2e_table" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform/sql +} + +sink { + Console {} +} +``` + +Use the 'doris.read.field' parameter to select the doris table columns to read + +``` +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Doris { + fenodes = "doris_e2e:8030" + username = root + password = "" + database = "e2e_source" + table = "doris_e2e_table" + doris.read.field = "F_ID,F_INT,F_BIGINT,F_TINYINT,F_SMALLINT" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform/sql +} + +sink { + Console {} +} +``` + +Use 'doris.filter.query' to filter the data, and the parameter values are passed directly to doris + +``` +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Doris { + fenodes = "doris_e2e:8030" + username = root + password = "" + database = "e2e_source" + table = "doris_e2e_table" + doris.filter.query = "F_ID > 2" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform/sql +} + +sink { + Console {} +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Easysearch.md b/versioned_docs/version-2.3.7/connector-v2/source/Easysearch.md new file mode 100644 index 000000000000..d94609c77239 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Easysearch.md @@ -0,0 +1,209 @@ +# Easysearch + +> 
Easysearch source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Used to read data from INFINI Easysearch. + +## Using Dependency + +> Dependency [easysearch-client](https://central.sonatype.com/artifact/com.infinilabs/easysearch-client) + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +:::tip + +Engine Supported + +* Supports all versions released by [INFINI Easysearch](https://www.infini.com/download/?product=easysearch). + +::: + +## Data Type Mapping + +| Easysearch Data Type | SeaTunnel Data Type | +|-----------------------------|----------------------| +| STRING
KEYWORD
TEXT | STRING | +| BOOLEAN | BOOLEAN | +| BYTE | BYTE | +| SHORT | SHORT | +| INTEGER | INT | +| LONG | LONG | +| FLOAT
HALF_FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| Date | LOCAL_DATE_TIME_TYPE | + +### hosts [array] + +Easysearch cluster http address, the format is `host:port`, allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. + +### username [string] + +security username. + +### password [string] + +security password. + +### index [string] + +Easysearch index name, support * fuzzy matching. + +### source [array] + +The fields of index. +You can get the document id by specifying the field `_id`.If sink _id to other index,you need specify an alias for _id due to the Easysearch limit. +If you don't config source, you must config `schema`. + +### query [json] + +Easysearch DSL. +You can control the range of data read. + +### scroll_time [String] + +Amount of time Easysearch will keep the search context alive for scroll requests. + +### scroll_size [int] + +Maximum number of hits to be returned with each Easysearch scroll request. + +### schema + +The structure of the data, including field names and field types. +If you don't config schema, you must config `source`. + +### tls_verify_certificate [boolean] + +Enable certificates validation for HTTPS endpoints + +### tls_verify_hostname [boolean] + +Enable hostname validation for HTTPS endpoints + +### tls_keystore_path [string] + +The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. + +### tls_keystore_password [string] + +The key password for the key store specified + +### tls_truststore_path [string] + +The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. + +### tls_truststore_password [string] + +The key password for the trust store specified + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Examples + +simple + +```hocon +Easysearch { + hosts = ["localhost:9200"] + index = "seatunnel-*" + source = ["_id","name","age"] + query = {"range":{"firstPacket":{"gte":1700407367588,"lte":1700407367588}}} +} +``` + +complex + +```hocon +Easysearch { + hosts = ["Easysearch:9200"] + index = "st_index" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + query = {"range":{"firstPacket":{"gte":1700407367588,"lte":1700407367588}}} +} +``` + +SSL (Disable certificates validation) + +```hocon +source { + Easysearch { + hosts = ["https://localhost:9200"] + username = "admin" + password = "admin" + + tls_verify_certificate = false + } +} +``` + +SSL (Disable hostname validation) + +```hocon +source { + Easysearch { + hosts = ["https://localhost:9200"] + username = "admin" + password = "admin" + + tls_verify_hostname = false + } +} +``` + +SSL (Enable certificates validation) + +```hocon +source { + Easysearch { + hosts = ["https://localhost:9200"] + username = "admin" + password = "admin" + + tls_keystore_path = "${your Easysearch home}/config/certs/http.p12" + tls_keystore_password = "${your password}" + } +} +``` + +## Changelog + +### next version + +- Add Easysearch Source Connector +- Support https protocol +- Support DSL + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Elasticsearch.md b/versioned_docs/version-2.3.7/connector-v2/source/Elasticsearch.md new file mode 
100644 index 000000000000..62ddfc5487cf --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Elasticsearch.md @@ -0,0 +1,200 @@ +# Elasticsearch + +> Elasticsearch source connector + +## Description + +Used to read data from Elasticsearch. + +support version >= 2.x and <= 8.x. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------------|---------|----------|-------------------| +| hosts | array | yes | - | +| username | string | no | - | +| password | string | no | - | +| index | string | yes | - | +| source | array | no | - | +| query | json | no | {"match_all": {}} | +| scroll_time | string | no | 1m | +| scroll_size | int | no | 100 | +| tls_verify_certificate | boolean | no | true | +| tls_verify_hostnames | boolean | no | true | +| array_column | map | no | | +| tls_keystore_path | string | no | - | +| tls_keystore_password | string | no | - | +| tls_truststore_path | string | no | - | +| tls_truststore_password | string | no | - | +| common-options | | no | - | + +### hosts [array] + +Elasticsearch cluster http address, the format is `host:port`, allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. + +### username [string] + +x-pack username. + +### password [string] + +x-pack password. + +### index [string] + +Elasticsearch index name, support * fuzzy matching. + +### source [array] + +The fields of index. +You can get the document id by specifying the field `_id`.If sink _id to other index,you need specify an alias for _id due to the Elasticsearch limit. +If you don't config source, it is automatically retrieved from the mapping of the index. + +### array_column [array] + +The fields of array type. +Since there is no array index in es,so need assign array type,just like `{c_array = "array"}`. + +### query [json] + +Elasticsearch DSL. +You can control the range of data read. + +### scroll_time [String] + +Amount of time Elasticsearch will keep the search context alive for scroll requests. + +### scroll_size [int] + +Maximum number of hits to be returned with each Elasticsearch scroll request. + +### tls_verify_certificate [boolean] + +Enable certificates validation for HTTPS endpoints + +### tls_verify_hostname [boolean] + +Enable hostname validation for HTTPS endpoints + +### tls_keystore_path [string] + +The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. + +### tls_keystore_password [string] + +The key password for the key store specified + +### tls_truststore_path [string] + +The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. 
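+
+If your HTTPS endpoints are signed by a private CA, you can point the connector at a trust store instead of a key store. A minimal sketch combining this option with the `tls_truststore_password` option described next (the path and password below are placeholders, not files shipped with Elasticsearch):
+
+```hocon
+source {
+  Elasticsearch {
+    hosts = ["https://localhost:9200"]
+    username = "elastic"
+    password = "elasticsearch"
+
+    # Placeholder path and password, replace them with your own trust store
+    tls_truststore_path = "${your elasticsearch home}/config/certs/truststore.jks"
+    tls_truststore_password = "${your password}"
+  }
+}
+```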
+ +### tls_truststore_password [string] + +The key password for the trust store specified + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Examples + +simple + +```hocon +Elasticsearch { + hosts = ["localhost:9200"] + index = "seatunnel-*" + source = ["_id","name","age"] + query = {"range":{"firstPacket":{"gte":1669225429990,"lte":1669225429990}}} +} +``` + +complex + +```hocon +Elasticsearch { + hosts = ["elasticsearch:9200"] + index = "st_index" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + query = {"range":{"firstPacket":{"gte":1669225429990,"lte":1669225429990}}} +} +``` + +SSL (Disable certificates validation) + +```hocon +source { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_verify_certificate = false + } +} +``` + +SSL (Disable hostname validation) + +```hocon +source { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_verify_hostname = false + } +} +``` + +SSL (Enable certificates validation) + +```hocon +source { + Elasticsearch { + hosts = ["https://localhost:9200"] + username = "elastic" + password = "elasticsearch" + + tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" + tls_keystore_password = "${your password}" + } +} +``` + +## Changelog + +### next version + +- Add Elasticsearch Source Connector +- [Feature] Support https protocol & compatible with opensearch ([3997](https://github.com/apache/seatunnel/pull/3997)) +- [Feature] Support DSL + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/FakeSource.md b/versioned_docs/version-2.3.7/connector-v2/source/FakeSource.md new file mode 100644 index 000000000000..c85df3726116 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/FakeSource.md @@ -0,0 +1,421 @@ +# FakeSource + +> FakeSource connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +The FakeSource is a virtual data source, which randomly generates the number of rows according to the data structure of the user-defined schema, +just for some test cases such as type conversion or connector new feature testing + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Source Options + +| Name | Type | Required | Default | Description | +|---------------------|----------|----------|-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| tables_configs | list | no | - | Define Multiple FakeSource, each item can contains the whole fake source config description below | +| schema | config | yes | - | Define Schema information | +| rows | config | no | - | The row list of fake data output per degree of parallelism see title `Options rows Case`. | +| row.num | int | no | 5 | The total number of data generated per degree of parallelism | +| split.num | int | no | 1 | the number of splits generated by the enumerator for each degree of parallelism | +| split.read-interval | long | no | 1 | The interval(mills) between two split reads in a reader | +| map.size | int | no | 5 | The size of `map` type that connector generated | +| array.size | int | no | 5 | The size of `array` type that connector generated | +| bytes.length | int | no | 5 | The length of `bytes` type that connector generated | +| string.length | int | no | 5 | The length of `string` type that connector generated | +| string.fake.mode | string | no | range | The fake mode of generating string data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `string.template` option | +| string.template | list | no | - | The template list of string type that connector generated, if user configured it, connector will randomly select an item from the template list | +| tinyint.fake.mode | string | no | range | The fake mode of generating tinyint data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `tinyint.template` option | +| tinyint.min | tinyint | no | 0 | The min value of tinyint data that connector generated | +| tinyint.max | tinyint | no | 127 | The max value of tinyint data that connector generated | +| tinyint.template | list | no | - | The template list of tinyint type that connector generated, if user configured it, connector will randomly select an item from the template list | +| smallint.fake.mode | string | no | range | The fake mode of generating smallint data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `smallint.template` option | +| smallint.min | smallint | no | 0 | The min value of smallint data that connector generated | +| smallint.max | smallint | no | 32767 | The max value of smallint data that connector generated | +| smallint.template | list | no | - | The template list of smallint type that connector generated, if user configured it, connector will randomly select an item from the template list | 
+| int.fake.mode | string | no | range | The fake mode of generating int data, support `range` and `template`, default `range`, if you configure it to `template`, you should also configure the `int.template` option |
+| int.min | int | no | 0 | The min value of int data that connector generated |
+| int.max | int | no | 0x7fffffff | The max value of int data that connector generated |
+| int.template | list | no | - | The template list of int type that connector generated, if user configured it, connector will randomly select an item from the template list |
+| bigint.fake.mode | string | no | range | The fake mode of generating bigint data, support `range` and `template`, default `range`, if you configure it to `template`, you should also configure the `bigint.template` option |
+| bigint.min | bigint | no | 0 | The min value of bigint data that connector generated |
+| bigint.max | bigint | no | 0x7fffffffffffffff | The max value of bigint data that connector generated |
+| bigint.template | list | no | - | The template list of bigint type that connector generated, if user configured it, connector will randomly select an item from the template list |
+| float.fake.mode | string | no | range | The fake mode of generating float data, support `range` and `template`, default `range`, if you configure it to `template`, you should also configure the `float.template` option |
+| float.min | float | no | 0 | The min value of float data that connector generated |
+| float.max | float | no | 0x1.fffffeP+127 | The max value of float data that connector generated |
+| float.template | list | no | - | The template list of float type that connector generated, if user configured it, connector will randomly select an item from the template list |
+| double.fake.mode | string | no | range | The fake mode of generating double data, support `range` and `template`, default `range`, if you configure it to `template`, you should also configure the `double.template` option |
+| double.min | double | no | 0 | The min value of double data that connector generated |
+| double.max | double | no | 0x1.fffffffffffffP+1023 | The max value of double data that connector generated |
+| double.template | list | no | - | The template list of double type that connector generated, if user configured it, connector will randomly select an item from the template list |
+| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+## Task Example
+
+### Simple:
+
+> This example randomly generates data of a specified type. If you want to learn how to declare field types, click [here](../../concept/schema-feature.md#how-to-declare-type-supported).
+ +```hocon +schema = { + fields { + c_map = "map>" + c_map_nest = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + c_row = { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } +} +``` + +### Random Generation + +> 16 data matching the type are randomly generated + +```hocon +source { + # This is a example input plugin **only for test and demonstrate the feature input plugin** + FakeSource { + row.num = 16 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + result_table_name = "fake" + } +} +``` + +### Customize the data content Simple: + +> This is a self-defining data source information, defining whether each piece of data is an add or delete modification operation, and defining what each field stores + +```hocon +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + rows = [ + { + kind = INSERT + fields = [{"a": "b"}, [101], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] + } + { + kind = UPDATE_BEFORE + fields = [{"a": "c"}, [102], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] + } + { + kind = UPDATE_AFTER + fields = [{"a": "e"}, [103], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] + } + { + kind = DELETE + fields = [{"a": "f"}, [104], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] + } + ] + } +} +``` + +> Due to the constraints of the [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) specification, users cannot directly create byte sequence objects. FakeSource uses strings to assign `bytes` type values. In the example above, the `bytes` type field is assigned `"bWlJWmo="`, which is encoded from "miIZj" with **base64**. Hence, when assigning values to `bytes` type fields, please use strings encoded with **base64**. 
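+
+For instance, a minimal sketch of a single `bytes` column fed from `rows` (the value `aGVsbG8=` below is simply the base64 encoding of the text `hello`, and the column name is only an example):
+
+```hocon
+FakeSource {
+  schema = {
+    fields {
+      c_bytes = bytes
+    }
+  }
+  rows = [
+    {
+      kind = INSERT
+      # "aGVsbG8=" decodes to the raw bytes of the text "hello"
+      fields = ["aGVsbG8="]
+    }
+  ]
+}
+```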
+ +### Specified Data number Simple: + +> This case specifies the number of data generated and the length of the generated value + +```hocon +FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + c_row = { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} +``` + +### Template data Simple: + +> Randomly generated according to the specified template + +Using template + +```hocon +FakeSource { + row.num = 5 + string.fake.mode = "template" + string.template = ["tyrantlucifer", "hailin", "kris", "fanjia", "zongwen", "gaojun"] + tinyint.fake.mode = "template" + tinyint.template = [1, 2, 3, 4, 5, 6, 7, 8, 9] + smalling.fake.mode = "template" + smallint.template = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + int.fake.mode = "template" + int.template = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29] + bigint.fake.mode = "template" + bigint.template = [30, 31, 32, 33, 34, 35, 36, 37, 38, 39] + float.fake.mode = "template" + float.template = [40.0, 41.0, 42.0, 43.0] + double.fake.mode = "template" + double.template = [44.0, 45.0, 46.0, 47.0] + schema { + fields { + c_string = string + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + } + } +} +``` + +### Range data Simple: + +> The specified data generation range is randomly generated + +```hocon +FakeSource { + row.num = 5 + string.template = ["tyrantlucifer", "hailin", "kris", "fanjia", "zongwen", "gaojun"] + tinyint.min = 1 + tinyint.max = 9 + smallint.min = 10 + smallint.max = 19 + int.min = 20 + int.max = 29 + bigint.min = 30 + bigint.max = 39 + float.min = 40.0 + float.max = 43.0 + double.min = 44.0 + double.max = 47.0 + schema { + fields { + c_string = string + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + } + } +} +``` + +### Generate Multiple tables + +> This is a case of generating a multi-data source test.table1 and test.table2 + +```hocon +FakeSource { + tables_configs = [ + { + row.num = 16 + schema { + table = "test.table1" + fields { + c_string = string + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + } + } + }, + { + row.num = 17 + schema { + table = "test.table2" + fields { + c_string = string + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + } + } + } + ] +} +``` + +### Options `rows` Case + +```hocon +rows = [ + { + kind = INSERT + fields = [1, "A", 100] + }, + { + kind = UPDATE_BEFORE + fields = [1, "A", 100] + }, + { + kind = UPDATE_AFTER + fields = [1, "A_1", 100] + }, + { + kind = DELETE + fields = [1, "A_1", 100] + } +] +``` + +### Options `table-names` Case + +```hocon + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + table-names = 
["test.table1", "test.table2", "test.table3"] + parallelism = 1 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add FakeSource Source Connector + +### 2.3.0-beta 2022-10-20 + +- [Improve] Supports direct definition of data values(row) ([2839](https://github.com/apache/seatunnel/pull/2839)) +- [Improve] Improve fake source connector: ([2944](https://github.com/apache/seatunnel/pull/2944)) + - Support user-defined map size + - Support user-defined array size + - Support user-defined string length + - Support user-defined bytes length +- [Improve] Support multiple splits for fake source connector ([2974](https://github.com/apache/seatunnel/pull/2974)) +- [Improve] Supports setting the number of splits per parallelism and the reading interval between two splits ([3098](https://github.com/apache/seatunnel/pull/3098)) + +### next version + +- [Feature] Support config fake data rows [3865](https://github.com/apache/seatunnel/pull/3865) +- [Feature] Support config template or range for fake data [3932](https://github.com/apache/seatunnel/pull/3932) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/FtpFile.md b/versioned_docs/version-2.3.7/connector-v2/source/FtpFile.md new file mode 100644 index 000000000000..e4cff24fc837 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/FtpFile.md @@ -0,0 +1,341 @@ +# FtpFile + +> Ftp file source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from ftp file server. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +::: + +## Options + +| name | type | required | default value | +|---------------------------|---------|----------|---------------------| +| host | string | yes | - | +| port | int | yes | - | +| user | string | yes | - | +| password | string | yes | - | +| path | string | yes | - | +| file_format_type | string | yes | - | +| connection_mode | string | no | active_local | +| delimiter/field_delimiter | string | no | \001 | +| read_columns | list | no | - | +| parse_partition_from_path | boolean | no | true | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| skip_header_row_number | long | no | 0 | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | - | +| compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | + +### host [string] + +The target ftp host is required + +### port [int] + +The target ftp port is required + +### user [string] + +The target ftp user name is required + +### password [string] + +The target ftp password is required + +### path [string] + +The source file path. + +### file_format_type [string] + +File type, supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +If you assign file type to `json` , you should also assign schema option to tell connector how to parse data to the row you want. + +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. 
+ +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +If you assign file type to `binary`, SeaTunnel can synchronize files in any format, +such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. +Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization +at the same time. You can find the specific usage in the example below. + +### connection_mode [string] + +The target ftp connection mode , default is active mode, supported as the following modes: + +`active_local` `passive_local` + +### delimiter/field_delimiter [string] + +**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. + +Only need to be configured when file_format is text. + +Field delimiter, used to tell connector how to slice and dice fields. + +default `\001`, the same as hive's default delimiter + +### parse_partition_from_path [boolean] + +Control whether parse the partition keys and values from file path + +For example if you read a file from path `ftp://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` + +Every record data from file will be added these two fields: + +| name | age | +|---------------|-----| +| tyrantlucifer | 26 | + +Tips: **Do not define partition fields in schema option** + +### date_format [string] + +Date type format, used to tell connector how to convert string to date, supported as the following formats: + +`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` + +default `yyyy-MM-dd` + +### datetime_format [string] + +Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: + +`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` + +default `yyyy-MM-dd HH:mm:ss` + +### time_format [string] + +Time type format, used to tell connector how to convert string to time, supported as the following formats: + +`HH:mm:ss` `HH:mm:ss.SSS` + +default `HH:mm:ss` + +### skip_header_row_number [long] + +Skip the first few lines, but only for the txt and csv. + +For example, set like following: + +`skip_header_row_number = 2` + +then SeaTunnel will skip the first 2 lines from source files + +### schema [config] + +Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). + +The schema information of upstream data. + +### read_columns [list] + +The read column list of the data source, user can use it to implement field projection. + +### sheet_name [string] + +Reader the sheet of the workbook,Only used when file_format_type is excel. + +### xml_row_tag [string] + +Only need to be configured when file_format is xml. + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Only need to be configured when file_format is xml. 
+ +Specifies Whether to process data using the tag attribute format. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +## Example + +```hocon + + FtpFile { + path = "/tmp/seatunnel/sink/text" + host = "192.168.31.48" + port = 21 + user = tyrantlucifer + password = tianchao + file_format_type = "text" + schema = { + name = string + age = int + } + field_delimiter = "#" + } + +``` + +### Transfer Binary File + +```hocon + +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FtpFile { + host = "192.168.31.48" + port = 21 + user = tyrantlucifer + password = tianchao + path = "/seatunnel/read/binary/" + file_format_type = "binary" + } +} +sink { + // you can transfer local file to s3/hdfs/oss etc. + FtpFile { + host = "192.168.31.48" + port = 21 + user = tyrantlucifer + password = tianchao + path = "/seatunnel/read/binary2/" + file_format_type = "binary" + } +} + +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Ftp Source Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085)) +- [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Github.md b/versioned_docs/version-2.3.7/connector-v2/source/Github.md new file mode 100644 index 000000000000..900a207e6971 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Github.md @@ -0,0 +1,296 @@ +# Github + +> Github source connector + +## Description + +Used to read data from Github. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| access_token | String | No | - | +| method | String | No | get | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### access_token [String] + +Github personal access token, see: [Creating a personal access token - GitHub Docs](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
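+
+The same approach can be used with the `Github` source itself. A minimal sketch, assuming the org-repos endpoint returns a top-level JSON array of repository objects (the JsonPath expressions below follow the public GitHub REST response and are illustrative, not required options):
+
+```hocon
+source {
+  Github {
+    url = "https://api.github.com/orgs/apache/repos"
+    access_token = "xxxx"
+    method = "GET"
+    format = "json"
+    # JsonPath expressions evaluated against the top-level array of repositories
+    json_field = {
+      name = "$[*].name"
+      owner = "$[*].owner.login"
+      stargazers_count = "$[*].stargazers_count"
+    }
+    schema = {
+      fields {
+        name = string
+        owner = string
+        stargazers_count = int
+      }
+    }
+  }
+}
+```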
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Github { + url = "https://api.github.com/orgs/apache/repos" + access_token = "xxxx" + method = "GET" + format = "json" + schema = { + fields { + id = int + name = string + description = string + html_url = string + stargazers_count = int + forks = int + } + } +} +``` + +## Changelog + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Gitlab.md b/versioned_docs/version-2.3.7/connector-v2/source/Gitlab.md new file mode 100644 index 000000000000..ff3b6bc6423b --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Gitlab.md @@ -0,0 +1,299 @@ +# Gitlab + +> Gitlab source connector + +## Description + +Used to read data from Gitlab. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| access_token | String | Yes | - | +| method | String | No | get | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### access_token [String] + +personal access token + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. 
+ +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. 
+ +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Gitlab{ + url = "https://gitlab.com/api/v4/projects" + access_token = "xxxxx" + schema { + fields { + id = int + description = string + name = string + name_with_namespace = string + path = string + http_url_to_repo = string + } + } +} +``` + +## Changelog + +### next version + +- Add Gitlab Source Connector +- [Feature][Connector-V2][HTTP] Use json-path parsing ([3510](https://github.com/apache/seatunnel/pull/3510)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/GoogleSheets.md b/versioned_docs/version-2.3.7/connector-v2/source/GoogleSheets.md new file mode 100644 index 000000000000..754a502f2b71 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/GoogleSheets.md @@ -0,0 +1,79 @@ +# GoogleSheets + +> GoogleSheets source connector + +## Description + +Used to read data from GoogleSheets. 
+
+## Key features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [column projection](../../concept/connector-v2-features.md)
+- [ ] [parallelism](../../concept/connector-v2-features.md)
+- [ ] [support user-defined split](../../concept/connector-v2-features.md)
+- [ ] file format
+  - [ ] text
+  - [ ] csv
+  - [ ] json
+
+## Options
+
+| name                | type   | required | default value |
+|---------------------|--------|----------|---------------|
+| service_account_key | string | yes      | -             |
+| sheet_id            | string | yes      | -             |
+| sheet_name          | string | yes      | -             |
+| range               | string | yes      | -             |
+| schema              | config | no       | -             |
+
+### service_account_key [string]
+
+google cloud service account key, base64 encoded
+
+### sheet_id [string]
+
+sheet id in a Google Sheets URL
+
+### sheet_name [string]
+
+the name of the sheet you want to import
+
+### range [string]
+
+the range of the sheet you want to import
+
+### schema [config]
+
+#### fields [config]
+
+the schema fields of upstream data
+
+## Example
+
+simple:
+
+```hocon
+GoogleSheets {
+  service_account_key = "seatunnel-test"
+  sheet_id = "1VI0DvyZK-NIdssSdsDSsSSSC-_-rYMi7ppJiI_jhE"
+  sheet_name = "sheets01"
+  range = "A1:C3"
+  schema = {
+    fields {
+      a = int
+      b = string
+      c = string
+    }
+  }
+}
+```
+
+## Changelog
+
+### next version
+
+- Add GoogleSheets Source Connector
+
diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Greenplum.md b/versioned_docs/version-2.3.7/connector-v2/source/Greenplum.md
new file mode 100644
index 000000000000..74669898df95
--- /dev/null
+++ b/versioned_docs/version-2.3.7/connector-v2/source/Greenplum.md
@@ -0,0 +1,42 @@
+# Greenplum
+
+> Greenplum source connector
+
+## Description
+
+Read Greenplum data through the [Jdbc connector](Jdbc.md).
+
+## Key features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [column projection](../../concept/connector-v2-features.md)
+
+Supports query SQL and can achieve the projection effect.
+
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [ ] [support user-defined split](../../concept/connector-v2-features.md)
+
+:::tip
+
+Optional jdbc drivers:
+- `org.postgresql.Driver`
+- `com.pivotal.jdbc.GreenplumDriver`
+
+Warning: for license compliance, if you use `GreenplumDriver` you have to provide the Greenplum JDBC driver yourself, e.g. copy greenplum-xxx.jar to $SEATUNNEL_HOME/lib for Standalone.
+
+:::
+
+## Options
+
+### common options
+
+Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
+
+## Changelog
+
+### 2.2.0-beta 2022-09-26
+
+- Add Greenplum Source Connector
+
diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Hbase.md b/versioned_docs/version-2.3.7/connector-v2/source/Hbase.md
new file mode 100644
index 000000000000..753d68eb6e87
--- /dev/null
+++ b/versioned_docs/version-2.3.7/connector-v2/source/Hbase.md
@@ -0,0 +1,96 @@
+# Hbase
+
+> Hbase Source Connector
+
+## Description
+
+Reads data from Apache Hbase.
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [schema projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| Name | Type | Required | Default | +|--------------------|---------|----------|---------| +| zookeeper_quorum | string | Yes | - | +| table | string | Yes | - | +| schema | config | Yes | - | +| hbase_extra_config | string | No | - | +| caching | int | No | -1 | +| batch | int | No | -1 | +| cache_blocks | boolean | No | false | +| common-options | | No | - | + +### zookeeper_quorum [string] + +The zookeeper quorum for Hbase cluster hosts, e.g., "hadoop001:2181,hadoop002:2181,hadoop003:2181". + +### table [string] + +The name of the table to write to, e.g., "seatunnel". + +### schema [config] + +Hbase stores data in byte arrays. Therefore, you need to configure the data types for each column in the table. For more information, see: [guide](../../concept/schema-feature.md#how-to-declare-type-supported). + +### hbase_extra_config [config] + +Additional configurations for Hbase. + +### caching + +The caching parameter sets the number of rows fetched per server trip during scans. This reduces round-trips between client and server, improving scan efficiency. Default: -1. + +### batch + +The batch parameter sets the maximum number of columns returned per scan. This is useful for rows with many columns to avoid fetching excessive data at once, thus saving memory and improving performance. + +### cache_blocks + +The cache_blocks parameter determines whether to cache data blocks during scans. By default, HBase caches data blocks during scans. Setting this to false reduces memory usage during scans. Default in SeaTunnel: false. + +### common-options + +Common parameters for Source plugins, refer to [Common Source Options](common-options.md). + +## Example + +```bash +source { + Hbase { + zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" + table = "seatunnel_test" + caching = 1000 + batch = 100 + cache_blocks = false + schema = { + columns = [ + { + name = "rowkey" + type = string + }, + { + name = "columnFamily1:column1" + type = boolean + }, + { + name = "columnFamily1:column2" + type = double + }, + { + name = "columnFamily2:column1" + type = bigint + } + ] + } + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/HdfsFile.md b/versioned_docs/version-2.3.7/connector-v2/source/HdfsFile.md new file mode 100644 index 000000000000..20a2559ddb8d --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/HdfsFile.md @@ -0,0 +1,136 @@ +# HdfsFile + +> Hdfs File Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format file + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from hdfs file system. + +## Supported DataSource Info + +| Datasource | Supported Versions | +|------------|--------------------| +| HdfsFile | hadoop 2.x and 3.x | + +## Source Options + +| Name | Type | Required | Default | Description | +|---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The source file path. | +| file_format_type | string | yes | - | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | +| fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` | +| read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection.The file type supported column projection as the following shown:[text,json,csv,orc,parquet,excel,xml].Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured. | +| hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes | +| delimiter/field_delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. default `\001`, the same as hive's default delimiter | +| parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. For example if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. Every record data from file will be added these two fields:[name:tyrantlucifer,age:26].Tips:Do not define partition fields in schema option. 
| +| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd`.Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` .default `yyyy-MM-dd HH:mm:ss` | +| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS`.default `HH:mm:ss` | +| remote_user | string | no | - | The login user used to connect to hadoop login name. It is intended to be used for remote users in RPC, it won't have any credentials. | +| krb5_path | string | no | /etc/krb5.conf | The krb5 path of kerberos | +| kerberos_principal | string | no | - | The principal of kerberos | +| kerberos_keytab_path | string | no | - | The keytab path of kerberos | +| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv.For example, set like following:`skip_header_row_number = 2`.then Seatunnel will skip the first 2 lines from source files | +| schema | config | no | - | the schema fields of upstream data | +| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | +| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | +| compress_codec | string | no | none | The compress codec of files | +| encoding | string | no | UTF-8 | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### delimiter/field_delimiter [string] + +**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### Tips + +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that read data from Hdfs and sends it to Hdfs. 
+ +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + HdfsFile { + schema { + fields { + name = string + age = int + } + } + path = "/apps/hive/demo/student" + file_format_type = "json" + fs.defaultFS = "hdfs://namenode001" + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format = "orc" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Hive.md b/versioned_docs/version-2.3.7/connector-v2/source/Hive.md new file mode 100644 index 000000000000..da70cf7aa34a --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Hive.md @@ -0,0 +1,279 @@ +# Hive + +> Hive source connector + +## Description + +Read data from Hive. + +:::tip + +In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9. + +If you use SeaTunnel Engine, You need put seatunnel-hadoop3-3.1.4-uber.jar and hive-exec-3.1.3.jar and libfb303-0.9.3.jar in $SEATUNNEL_HOME/lib/ dir. +::: + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [x] [schema projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + +## Options + +| name | type | required | default value | +|-----------------------|--------|----------|----------------| +| table_name | string | yes | - | +| metastore_uri | string | yes | - | +| krb5_path | string | no | /etc/krb5.conf | +| kerberos_principal | string | no | - | +| kerberos_keytab_path | string | no | - | +| hdfs_site_path | string | no | - | +| hive_site_path | string | no | - | +| hive.hadoop.conf | Map | no | - | +| hive.hadoop.conf-path | string | no | - | +| read_partitions | list | no | - | +| read_columns | list | no | - | +| compress_codec | string | no | none | +| common-options | | no | - | + +### table_name [string] + +Target Hive table name eg: db1.table1 + +### metastore_uri [string] + +Hive metastore uri + +### hdfs_site_path [string] + +The path of `hdfs-site.xml`, used to load ha configuration of namenodes + +### hive.hadoop.conf [map] + +Properties in hadoop conf('core-site.xml', 'hdfs-site.xml', 'hive-site.xml') + +### hive.hadoop.conf-path [string] + +The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files + +### read_partitions [list] + +The target partitions that user want to read from hive table, if user does not set this parameter, it will read all the data from hive table. 
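+
+For example, a minimal sketch that limits the read to two partitions of the same directory depth (the partition names below are hypothetical):
+
+```hocon
+Hive {
+  table_name = "default.seatunnel_orc"
+  metastore_uri = "thrift://namenode001:9083"
+  # Only these partitions are read; omit the option to read the whole table
+  read_partitions = ["dt=20240101", "dt=20240102"]
+}
+```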
+ +**Tips: Every partition in partitions list should have the same directory depth. For example, a hive table has two partitions: par1 and par2, if user sets it like as the following:** +**read_partitions = [par1=xxx, par1=yyy/par2=zzz], it is illegal** + +### krb5_path [string] + +The path of `krb5.conf`, used to authentication kerberos + +### kerberos_principal [string] + +The principal of kerberos authentication + +### kerberos_keytab_path [string] + +The keytab file path of kerberos authentication + +### read_columns [list] + +The read column list of the data source, user can use it to implement field projection. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +### Example 1: Single table + +```bash + + Hive { + table_name = "default.seatunnel_orc" + metastore_uri = "thrift://namenode001:9083" + } + +``` + +### Example 2: Multiple tables + +```bash + + Hive { + tables_configs = [ + { + table_name = "default.seatunnel_orc_1" + metastore_uri = "thrift://namenode001:9083" + }, + { + table_name = "default.seatunnel_orc_2" + metastore_uri = "thrift://namenode001:9083" + } + ] + } + +``` + +## Hive on s3 + +### Step 1 + +Create the lib dir for hive of emr. + +```shell +mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib +``` + +### Step 2 + +Get the jars from maven center to the lib. + +```shell +cd ${SEATUNNEL_HOME}/plugins/Hive/lib +wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.5/hadoop-aws-2.6.5.jar +wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar +``` + +### Step 3 + +Copy the jars from your environment on emr to the lib dir. + +```shell +cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.60.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +cp /usr/share/aws/emr/hadoop-state-pusher/lib/hadoop-common-3.3.6-amzn-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +cp /usr/share/aws/emr/hadoop-state-pusher/lib/javax.inject-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +cp /usr/share/aws/emr/hadoop-state-pusher/lib/aopalliance-1.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +``` + +### Step 4 + +Run the case. + +```shell +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Hive { + table_name = "test_hive.test_hive_sink_on_s3" + metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" + hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" + hive.hadoop.conf = { + bucket="s3://ws-package" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + } + read_columns = ["pk_id", "name", "score"] + } +} + +sink { + Hive { + table_name = "test_hive.test_hive_sink_on_s3_sink" + metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" + hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" + hive.hadoop.conf = { + bucket="s3://ws-package" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + } + } +} +``` + +## Hive on oss + +### Step 1 + +Create the lib dir for hive of emr. + +```shell +mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib +``` + +### Step 2 + +Get the jars from maven center to the lib. 
+ +```shell +cd ${SEATUNNEL_HOME}/plugins/Hive/lib +wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar +``` + +### Step 3 + +Copy the jars from your environment on emr to the lib dir and delete the conflicting jar. + +```shell +cp -r /opt/apps/JINDOSDK/jindosdk-current/lib/jindo-*.jar ${SEATUNNEL_HOME}/plugins/Hive/lib +rm -f ${SEATUNNEL_HOME}/lib/hadoop-aliyun-*.jar +``` + +### Step 4 + +Run the case. + +```shell +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Hive { + table_name = "test_hive.test_hive_sink_on_oss" + metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" + hive.hadoop.conf-path = "/tmp/hadoop" + hive.hadoop.conf = { + bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" + } + } +} + +sink { + Hive { + table_name = "test_hive.test_hive_sink_on_oss_sink" + metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" + hive.hadoop.conf-path = "/tmp/hadoop" + hive.hadoop.conf = { + bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Hive Source Connector + +### Next version + +- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840)) +- Support user-defined partitions ([3842](https://github.com/apache/seatunnel/pull/3842)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/HiveJdbc.md b/versioned_docs/version-2.3.7/connector-v2/source/HiveJdbc.md new file mode 100644 index 000000000000..20366c529521 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/HiveJdbc.md @@ -0,0 +1,163 @@ +# HiveJdbc + +> JDBC Hive Source Connector + +## Support Hive Version + +- Definitely supports 3.1.3 and 3.1.2, other versions need to be tested. + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|---------------------------------|--------------------------------------|--------------------------------------------------------------------------| +| Hive | Different dependency version has different driver class. | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000/default | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' +> working directory
+> For example Hive datasource: cp hive-jdbc-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Hive Data Type | SeaTunnel Data Type | +|-------------------------------------------------------------------------------------------|---------------------| +| BOOLEAN | BOOLEAN | +| TINYINT
SMALLINT | SHORT | +| INT
INTEGER | INT | +| BIGINT | LONG | +| FLOAT | FLOAT | +| DOUBLE
DOUBLE PRECISION | DOUBLE | +| DECIMAL(x,y)
NUMERIC(x,y)
(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)
NUMERIC(x,y)
(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| CHAR
VARCHAR
STRING | STRING | +| DATE | DATE | +| DATETIME
TIMESTAMP | TIMESTAMP | +| BINARY
ARRAY
INTERVAL
MAP
STRUCT
UNIONTYPE | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:hive2://localhost:10000/default | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Hive the value is `org.apache.hive.jdbc.HiveDriver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name used for parallel partitioning. Only a numeric-type primary key column is supported, and only one column can be configured. |
+| partition_lower_bound | BigDecimal | No | - | The minimum value of partition_column for the scan; if not set, SeaTunnel will query the database to get the minimum value. |
+| partition_upper_bound | BigDecimal | No | - | The maximum value of partition_column for the scan; if not set, SeaTunnel will query the database to get the maximum value. |
+| partition_num | Int | No | job parallelism | The number of partitions, only positive integers are supported. The default value is the job parallelism. |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | +| useKerberos | Boolean | No | no | Whether to enable Kerberos, default is false | +| kerberos_principal | String | No | - | When use kerberos, we should set kerberos principal such as 'test_user@xxx'. | +| kerberos_keytab_path | String | No | - | When use kerberos, we should set kerberos principal file path such as '/home/test/test_user.keytab' . | +| krb5_path | String | No | /etc/krb5.conf | When use kerberos, we should set krb5 path file path such as '/seatunnel/krb5.conf' or use the default path '/etc/krb5.conf '. | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed +> in parallel according to the concurrency of tasks , When your shard read field is a large number type such as bigint( +> and above and the data is not evenly distributed, it is recommended to set the parallelism level to 1 to ensure that +> the +> data skew problem is resolved + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its +> fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:hive2://localhost:10000/default" + driver = "org.apache.hive.jdbc.HiveDriver" + connection_check_timeout_sec = 100 + query = "select * from type_bin limit 16" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want +> to read the whole table + +``` +source { + Jdbc { + url = "jdbc:hive2://localhost:10000/default" + driver = "org.apache.hive.jdbc.HiveDriver" + connection_check_timeout_sec = 100 + # Define query logic as required + query = "select * from type_bin" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read +> your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:hive2://localhost:10000/default" + driver = "org.apache.hive.jdbc.HiveDriver" + connection_check_timeout_sec = 100 + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Http.md b/versioned_docs/version-2.3.7/connector-v2/source/Http.md new file mode 100644 index 000000000000..e4d021c05c5b --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Http.md @@ -0,0 +1,357 @@ +# Http + +> Http source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Used to read data from Http. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +Supported DataSource Info +------------------------- + +In order to use the Http connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|------------------------------------------------------------------------------------------------------------| +| Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-http) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-----------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | Http request url. | +| schema | Config | No | - | Http and seatunnel data structure mapping | +| schema.fields | Config | No | - | The schema fields of upstream data | +| json_field | Config | No | - | This parameter helps you configure the schema,so this parameter must be used with schema. | +| pageing | Config | No | - | This parameter is used for paging queries | +| pageing.page_field | String | No | - | This parameter is used to specify the page field name in the request parameter | +| pageing.total_page_size | Int | No | - | This parameter is used to control the total number of pages | +| pageing.batch_size | Int | No | - | The batch size returned per request is used to determine whether to continue when the total number of pages is unknown | +| content_json | String | No | - | This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. | +| format | String | No | text | The format of upstream data, now only support `json` `text`, default `text`. | +| method | String | No | get | Http request method, only supports GET, POST method. | +| headers | Map | No | - | Http headers. | +| params | Map | No | - | Http params,the program will automatically add http header application/x-www-form-urlencoded. | +| body | String | No | - | Http body,the program will automatically add http header application/json,body is jsonbody. | +| poll_interval_millis | Int | No | - | Request http api interval(millis) in stream mode. | +| retry | Int | No | - | The max retry times if request http return to `IOException`. | +| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. 
| +| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | +| enable_multi_lines | Boolean | No | false | | +| connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. | +| socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## How to Create a Http Data Synchronization Jobs + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Http { + result_table_name = "http" + url = "http://mockserver:1080/example/http" + method = "GET" + format = "json" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + C_MAP = "map" + C_ARRAY = "array" + C_STRING = string + C_BOOLEAN = boolean + C_TINYINT = tinyint + C_SMALLINT = smallint + C_INT = int + C_BIGINT = bigint + C_FLOAT = float + C_DOUBLE = double + C_BYTES = bytes + C_DATE = date + C_DECIMAL = "decimal(38, 18)" + C_TIMESTAMP = timestamp + } + } + } + } +} + +# Console printing of the read Http data +sink { + Console { + parallelism = 1 + } +} +``` + +## Parameter Interpretation + +### format + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### content_json + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). + +### pageing + +```hocon +source { + Http { + url = "http://localhost:8080/mock/queryData" + method = "GET" + format = "json" + params={ + page: "${page}" + } + content_field = "$.data.*" + pageing={ + total_page_size=20 + page_field=page + #when don't know the total_page_size use batch_size if read size Apache Iceberg source connector + +## Support Iceberg Version + +- 1.4.2 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] data format + - [x] parquet + - [x] orc + - [x] avro +- [x] iceberg catalog + - [x] hadoop(2.7.1 , 2.7.5 , 3.1.3) + - [x] hive(2.3.9 , 3.1.2) + +## Description + +Source connector for Apache Iceberg. It can support batch and stream mode. + +## Supported DataSource Info + +| Datasource | Dependent | Maven | +|------------|-----------|---------------------------------------------------------------------------| +| Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | +| Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | + +## Database Dependency + +> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages. If you are using the hadoop s3 catalog, you need to add the hadoop-aws,aws-java-sdk jars for your Flink and Spark engine versions. (Additional locations: /lib, /jars) + +``` +hive-exec-xxx.jar +libfb303-xxx.jar +``` + +> Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package. + +## Data Type Mapping + +| Iceberg Data type | SeaTunnel Data type | +|-------------------|---------------------| +| BOOLEAN | BOOLEAN | +| INTEGER | INT | +| LONG | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| STRING | STRING | +| FIXED
BINARY | BYTES | +| DECIMAL | DECIMAL | +| STRUCT | ROW | +| LIST | ARRAY | +| MAP | MAP | + +## Source Options + +| Name | Type | Required | Default | Description | +|--------------------------|---------|----------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| catalog_name | string | yes | - | User-specified catalog name. | +| namespace | string | yes | - | The iceberg database name in the backend catalog. | +| table | string | yes | - | The iceberg table name in the backend catalog. | +| iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file:"https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java" | +| hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | +| iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | +| schema | config | no | - | Use projection to select data columns and columns order. | +| case_sensitive | boolean | no | false | If data columns where selected via schema [config], controls whether the match to the schema will be done with case sensitivity. | +| start_snapshot_timestamp | long | no | - | Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp.
timestamp – the timestamp in millis since the Unix epoch |
+| start_snapshot_id | long | no | - | Instructs this scan to look for changes starting from a particular snapshot (exclusive). |
+| end_snapshot_id | long | no | - | Instructs this scan to look for changes up to a particular snapshot (inclusive). |
+| use_snapshot_id | long | no | - | Instructs this scan to use the given snapshot ID. |
+| use_snapshot_timestamp | long | no | - | Instructs this scan to use the most recent snapshot as of the given time in milliseconds. timestamp – the timestamp in millis since the Unix epoch |
+| stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT | Starting strategy for stream mode execution. Defaults to `FROM_LATEST_SNAPSHOT` if no value is specified. The optional values are:
TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan then switch to the incremental mode.
FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot inclusive.
FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot inclusive.
FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific id inclusive.
FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp inclusive. | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +## Task Example + +### Simple: + +```hocon +env { + parallelism = 2 + job.mode = "BATCH" +} + +source { + Iceberg { + schema { + fields { + f2 = "boolean" + f1 = "bigint" + f3 = "int" + f4 = "bigint" + f5 = "float" + f6 = "double" + f7 = "date" + f9 = "timestamp" + f10 = "timestamp" + f11 = "string" + f12 = "bytes" + f13 = "bytes" + f14 = "decimal(19,9)" + f15 = "array" + f16 = "map" + } + } + catalog_name = "seatunnel" + iceberg.catalog.config={ + type = "hadoop" + warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" + } + namespace = "database1" + table = "source" + result_table_name = "iceberg" + } +} + +transform { +} + +sink { + Console { + source_table_name = "iceberg" + } +} +``` + +### Hadoop S3 Catalog: + +```hocon +source { + iceberg { + catalog_name = "seatunnel" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="s3a://your_bucket/spark/warehouse/" + } + hadoop.config={ + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + "fs.s3a.endpoint" = "s3.cn-north-1.amazonaws.com.cn" + "fs.s3a.access.key" = "xxxxxxxxxxxxxxxxx" + "fs.s3a.secret.key" = "xxxxxxxxxxxxxxxxx" + "fs.defaultFS" = "s3a://your_bucket" + } + namespace = "your_iceberg_database" + table = "your_iceberg_table" + result_table_name = "iceberg_test" + } +} +``` + +### Hive Catalog: + +```hocon +source { + Iceberg { + catalog_name = "seatunnel" + iceberg.catalog.config={ + type = "hive" + uri = "thrift://localhost:9083" + warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" + } + catalog_type = "hive" + + namespace = "your_iceberg_database" + table = "your_iceberg_table" + } +} +``` + +### Column Projection: + +```hocon +source { + Iceberg { + catalog_name = "seatunnel" + iceberg.catalog.config={ + type = "hadoop" + warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" + } + namespace = "your_iceberg_database" + table = "your_iceberg_table" + + schema { + fields { + f2 = "boolean" + f1 = "bigint" + f3 = "int" + f4 = "bigint" + } + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Iceberg Source Connector + +### next version + +- [Feature] Support Hadoop3.x ([3046](https://github.com/apache/seatunnel/pull/3046)) +- [improve][api] Refactoring schema parse ([4157](https://github.com/apache/seatunnel/pull/4157)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/InfluxDB.md b/versioned_docs/version-2.3.7/connector-v2/source/InfluxDB.md new file mode 100644 index 000000000000..d9eee30829d7 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/InfluxDB.md @@ -0,0 +1,195 @@ +# InfluxDB + +> InfluxDB source connector + +## Description + +Read external data source data through InfluxDB. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) + +supports query SQL and can achieve projection effect. 
+ +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|--------------------|--------|----------|---------------| +| url | string | yes | - | +| sql | string | yes | - | +| schema | config | yes | - | +| database | string | yes | | +| username | string | no | - | +| password | string | no | - | +| lower_bound | long | no | - | +| upper_bound | long | no | - | +| partition_num | int | no | - | +| split_column | string | no | - | +| epoch | string | no | n | +| connect_timeout_ms | long | no | 15000 | +| query_timeout_sec | int | no | 3 | +| common-options | config | no | - | + +### url + +the url to connect to influxDB e.g. + +``` +http://influxdb-host:8086 +``` + +### sql [string] + +The query sql used to search data + +``` +select name,age from test +``` + +### schema [config] + +#### fields [Config] + +The schema information of upstream data. +e.g. + +``` +schema { + fields { + name = string + age = int + } + } +``` + +### database [string] + +The `influxDB` database + +### username [string] + +the username of the influxDB when you select + +### password [string] + +the password of the influxDB when you select + +### split_column [string] + +the `split_column` of the influxDB when you select + +> Tips: +> - influxDB tags is not supported as a segmented primary key because the type of tags can only be a string +> - influxDB time is not supported as a segmented primary key because the time field cannot participate in mathematical calculation +> - Currently, `split_column` only supports integer data segmentation, and does not support `float`, `string`, `date` and other types. + +### upper_bound [long] + +upper bound of the `split_column`column + +### lower_bound [long] + +lower bound of the `split_column` column + +``` + split the $split_column range into $partition_num parts + if partition_num is 1, use the whole `split_column` range + if partition_num < (upper_bound - lower_bound), use (upper_bound - lower_bound) partitions + + eg: lower_bound = 1, upper_bound = 10, partition_num = 2 + sql = "select * from test where age > 0 and age < 10" + + split result + + split 1: select * from test where ($split_column >= 1 and $split_column < 6) and ( age > 0 and age < 10 ) + + split 2: select * from test where ($split_column >= 6 and $split_column < 11) and ( age > 0 and age < 10 ) + +``` + +### partition_num [int] + +the `partition_num` of the InfluxDB when you select + +> Tips: Ensure that `upper_bound` minus `lower_bound` is divided `bypartition_num`, otherwise the query results will overlap + +### epoch [string] + +returned time precision +- Optional values: H, m, s, MS, u, n +- default value: n + +### query_timeout_sec [int] + +the `query_timeout` of the InfluxDB when you select, in seconds + +### connect_timeout_ms [long] + +the timeout for connecting to InfluxDB, in milliseconds + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Examples + +Example of multi parallelism and multi partition scanning + +```hocon +source { + + InfluxDB { + url = "http://influxdb-host:8086" + sql = "select label, value, rt, time from test" + database = "test" + upper_bound = 100 + lower_bound = 1 + partition_num = 4 + split_column = "value" + schema { + fields { + label = STRING + value = INT + rt = STRING + time = BIGINT + } + } + +} + +``` + +Example of not using partition scan + 
+```hocon +source { + + InfluxDB { + url = "http://influxdb-host:8086" + sql = "select label, value, rt, time from test" + database = "test" + schema { + fields { + label = STRING + value = INT + rt = STRING + time = BIGINT + } + } + +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add InfluxDB Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/IoTDB.md b/versioned_docs/version-2.3.7/connector-v2/source/IoTDB.md new file mode 100644 index 000000000000..ee9f04cb7a5f --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/IoTDB.md @@ -0,0 +1,187 @@ +# IoTDB + +> IoTDB source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Read external data source data through IoTDB. + +:::tip + +There is a conflict of thrift version between IoTDB and Spark.Therefore, you need to execute `rm -f $SPARK_HOME/jars/libthrift*` and `cp $IOTDB_HOME/lib/libthrift* $SPARK_HOME/jars/` to resolve it. + +::: + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.apache.iotdb/iotdb-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.apache.iotdb/iotdb-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) + +supports query SQL and can achieve projection effect. + +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Supported DataSource Info + +| Datasource | Supported Versions | Url | +|------------|--------------------|----------------| +| IoTDB | `>= 0.13.0` | localhost:6667 | + +## Data Type Mapping + +| IotDB Data Type | SeaTunnel Data Type | +|-----------------|---------------------| +| BOOLEAN | BOOLEAN | +| INT32 | TINYINT | +| INT32 | SMALLINT | +| INT32 | INT | +| INT64 | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| TEXT | STRING | + +## Source Options + +| Name | Type | Required | Default Value | Description | +|----------------------------|---------|----------|---------------|------------------------------------------------------------------------------------| +| node_urls | string | yes | - | `IoTDB` cluster address, the format is `"host1:port"` or `"host1:port,host2:port"` | +| username | string | yes | - | `IoTDB` user username | +| password | string | yes | - | `IoTDB` user password | +| sql | string | yes | - | execute sql statement | +| schema | config | yes | - | the data schema | +| fetch_size | int | no | - | the fetch_size of the IoTDB when you select | +| lower_bound | long | no | - | the lower_bound of the IoTDB when you select | +| upper_bound | long | no | - | the upper_bound of the IoTDB when you select | +| num_partitions | int | no | - | the num_partitions of the IoTDB when you select | +| thrift_default_buffer_size | int | no | - | the thrift_default_buffer_size of the IoTDB when you select | +| thrift_max_frame_size | int | no | - | the thrift max frame size | +| enable_cache_leader | boolean | no | - | enable_cache_leader of the IoTDB when you select | +| version | string | no | - | SQL semantic version used by the client, The possible values are: V_0_12, V_0_13 | +| common-options | | no | - | | + +### split partitions + +we can split the partitions of the IoTDB and we used time column split + +#### num_partitions [int] + +split num + +### upper_bound [long] + +upper bound of the time column + +### lower_bound [long] + +lower bound of the time column + +``` + split the time range into numPartitions parts + if numPartitions is 1, use the whole time range + if numPartitions < (upper_bound - lower_bound), use (upper_bound - lower_bound) partitions + + eg: lower_bound = 1, upper_bound = 10, numPartitions = 2 + sql = "select * from test where age > 0 and age < 10" + + split result + + split 
1: select * from test where (time >= 1 and time < 6) and ( age > 0 and age < 10 ) + + split 2: select * from test where (time >= 6 and time < 11) and ( age > 0 and age < 10 ) + +``` + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Examples + +```hocon +env { + parallelism = 2 + job.mode = "BATCH" +} + +source { + IoTDB { + node_urls = "localhost:6667" + username = "root" + password = "root" + sql = "SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device" + schema { + fields { + ts = timestamp + device_name = string + temperature = float + moisture = bigint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_string = string + c_boolean = boolean + } + } + } +} + +sink { + Console { + } +} +``` + +Upstream `IoTDB` data format is the following: + +```shell +IoTDB> SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device; ++------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ +| Time| Device| temperature| moisture| c_int| c_bigint| c_float| c_double| c_string| c_boolean| ++------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ +|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1| 21474836470| 1.0f| 1.0d| abc| true| +|2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 2| 21474836470| 2.0f| 2.0d| abc| true| +|2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 3| 21474836470| 3.0f| 3.0d| abc| true| ++------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ +``` + +Loaded to SeaTunnelRow data format is the following: + +| ts | device_name | temperature | moisture | c_int | c_bigint | c_float | c_double | c_string | c_boolean | +|---------------|--------------------------|-------------|----------|-------|-------------|---------|----------|----------|-----------| +| 1664035200001 | root.test_group.device_a | 36.1 | 100 | 1 | 21474836470 | 1.0f | 1.0d | abc | true | +| 1664035200001 | root.test_group.device_b | 36.2 | 101 | 2 | 21474836470 | 2.0f | 2.0d | abc | true | +| 1664035200001 | root.test_group.device_c | 36.3 | 102 | 3 | 21474836470 | 3.0f | 3.0d | abc | true | + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add IoTDB Source Connector + +### 2.3.0-beta 2022-10-20 + +- [Improve] Improve IoTDB Source Connector ([2917](https://github.com/apache/seatunnel/pull/2917)) + - Support extract timestamp、device、measurement from SeaTunnelRow + - Support TINYINT、SMALLINT + - Support flush cache to database before prepareCommit + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Jdbc.md b/versioned_docs/version-2.3.7/connector-v2/source/Jdbc.md new file mode 100644 index 000000000000..7fab8d50b25d --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Jdbc.md @@ -0,0 +1,304 @@ +# JDBC + +> JDBC source connector + +## Description + +Read external data source data through JDBC. + +:::tip + +Warn: for license compliance, you have to provide database driver yourself, copy to `$SEATNUNNEL_HOME/lib/` directory in order to make them work. + +e.g. 
If you use MySQL, should download and copy `mysql-connector-java-xxx.jar` to `$SEATNUNNEL_HOME/lib/`. For Spark/Flink, you should also copy it to `$SPARK_HOME/jars/` or `$FLINK_HOME/lib/`. + +::: + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) + +supports query SQL and can achieve projection effect. + +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) +- [x] [support multiple table read](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | description | +|--------------------------------------------|---------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | +| user | String | No | - | userName | +| password | String | No | - | password | +| query | String | No | - | Query statement | +| compatible_mode | String | No | - | The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'. | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| partition_column | String | No | - | The column name for split data. | +| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_num | Int | No | job parallelism | Not recommended for use, The correct approach is to control the number of split through `split.size`
How many splits the data should be divided into; only positive integers are supported. The default value is the job parallelism. |
+| use_select_count | Boolean | No | false | Use select count for the table count rather than other methods in the dynamic chunk split stage. This is currently only available for jdbc-oracle. In this scenario, select count is used directly when it is faster than updating statistics with an analyze-table SQL. |
+| skip_analyze | Boolean | No | false | Skip the analysis of the table count in the dynamic chunk split stage. This is currently only available for jdbc-oracle. Use this when you schedule analyze-table SQL to update the related table statistics periodically, or when your table data does not change frequently. |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use the jdbc default value. |
+| properties | Map | No | - | Additional connection configuration parameters; when properties and the URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. |
+| table_path | String | No | - | The full path of the table; you can use this configuration instead of `query`.
examples:
`- mysql: "testdb.table1" `
`- oracle: "test_schema.table1" `
`- sqlserver: "testdb.test_schema.table1"`
`- postgresql: "testdb.test_schema.table1"`
`- iris: "test_schema.table1"` | +| table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` | +| where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | +| split.size | Int | No | 8096 | How many rows in one split, captured tables are split into multiple splits when read of table. | +| split.even-distribution.factor.lower-bound | Double | No | 0.05 | Not recommended for use.
The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| split.even-distribution.factor.upper-bound | Double | No | 100 | Not recommended for use.
The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| split.sample-sharding.threshold | Int | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +## Parallel Reader + +The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. + +**Split Key Rules:** + +1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. +2. If `partition_column` is null, seatunnel will read the schema from table and get the Primary Key and Unique Index. If there are more than one column in Primary Key and Unique Index, The first column which in the **supported split data type** will be used to split data. For example, the table have Primary Key(nn guid, name varchar), because `guid` id not in **supported split data type**, so the column `name` will be used to split data. + +**Supported split data type:** +* String +* Number(int, bigint, decimal, ...) +* Date + +## tips + +> If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. +> +> Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. + +## appendix + +there are some reference value for params above. 
+ +| datasource | driver | url | maven | +|-------------------|-----------------------------------------------------|------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| mysql | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| postgresql | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | https://mvnrepository.com/artifact/org.postgresql/postgresql | +| dm | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | +| phoenix | org.apache.phoenix.queryserver.client.Driver | jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF | https://mvnrepository.com/artifact/com.aliyun.phoenix/ali-phoenix-shaded-thin-client | +| sqlserver | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | +| oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | +| sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc | +| gbase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | https://www.gbase8.cn/wp-content/uploads/2020/10/gbase-connector-java-8.3.81.53-build55.5.7-bin_min_mix.jar | +| starrocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| db2 | com.ibm.db2.jcc.DB2Driver | jdbc:db2://localhost:50000/testdb | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 | +| tablestore | com.alicloud.openservices.tablestore.jdbc.OTSDriver | "jdbc:ots:http s://myinstance.cn-hangzhou.ots.aliyuncs.com/myinstance" | https://mvnrepository.com/artifact/com.aliyun.openservices/tablestore-jdbc | +| saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc | +| doris | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | +| teradata | com.teradata.jdbc.TeraDriver | jdbc:teradata://localhost/DBS_PORT=1025,DATABASE=test | https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc | +| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | +| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb?defaultRowFetchSize=1000 | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | +| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | +| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | +| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar | +| Hive | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000 | https://repo1.maven.org/maven2/org/apache/hive/hive-jdbc/3.1.3/hive-jdbc-3.1.3-standalone.jar | +| xugu | 
com.xugu.cloudjdbc.Driver | jdbc:xugu://localhost:5138 | https://repo1.maven.org/maven2/com/xugudb/xugu-jdbc/12.2.0/xugu-jdbc-12.2.0.jar | +| InterSystems IRIS | com.intersystems.jdbc.IRISDriver | jdbc:IRIS://localhost:1972/%SYS | https://raw.githubusercontent.com/intersystems-community/iris-driver-distribution/main/JDBC/JDK18/intersystems-jdbc-3.8.4.jar | + +## Example + +### simple + +#### Case 1 + +``` +Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from type_bin" +} +``` + +#### Case 2 Use the select count(*) instead of analysis table for count table rows in dynamic chunk split stage + +``` +Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + use_select_count = true + query = "select * from type_bin" +} +``` + +#### Case 3 Use the select NUM_ROWS from all_tables for the table rows but skip the analyze table. + +``` +Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + skip_analyze = true + query = "select * from type_bin" +} +``` + +### parallel by partition_column + +``` +env { + parallelism = 10 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from type_bin" + partition_column = "id" + split.size = 10000 + # Read start boundary + #partition_lower_bound = ... + # Read end boundary + #partition_upper_bound = ... + } +} + +sink { + Console {} +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query. It is more efficient to read your data source according to the upper and lower boundaries you configured. + +``` +source { + Jdbc { + url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + properties { + useSSL=false + } + } +} +``` + +### parallel by Primary Key or Unique Index + +> Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy + +``` +env { + parallelism = 10 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + table_path = "testdb.table1" + query = "select * from testdb.table1" + split.size = 10000 + } +} + +sink { + Console {} +} +``` + +### multiple table read: + +***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** + +```hocon +Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + + table_list = [ + { + # e.g. 
table_path = "testdb.table1"、table_path = "test_schema.table1"、table_path = "testdb.test_schema.table1" + table_path = "testdb.table1" + }, + { + table_path = "testdb.table2" + # Use query filetr rows & columns + query = "select id, name from testdb.table2 where id > 100" + } + ] + #where_condition= "where id > 100" + #split.size = 10000 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add ClickHouse Source Connector + +### 2.3.0-beta 2022-10-20 + +- [Feature] Support Phoenix JDBC Source ([2499](https://github.com/apache/seatunnel/pull/2499)) +- [Feature] Support SQL Server JDBC Source ([2646](https://github.com/apache/seatunnel/pull/2646)) +- [Feature] Support Oracle JDBC Source ([2550](https://github.com/apache/seatunnel/pull/2550)) +- [Feature] Support StarRocks JDBC Source ([3060](https://github.com/apache/seatunnel/pull/3060)) +- [Feature] Support GBase8a JDBC Source ([3026](https://github.com/apache/seatunnel/pull/3026)) +- [Feature] Support DB2 JDBC Source ([2410](https://github.com/apache/seatunnel/pull/2410)) + +### next version + +- [BugFix] Fix jdbc split bug ([3220](https://github.com/apache/seatunnel/pull/3220)) +- [Feature] Support Sqlite JDBC Source ([3089](https://github.com/apache/seatunnel/pull/3089)) +- [Feature] Support Tablestore Source ([3309](https://github.com/apache/seatunnel/pull/3309)) +- [Feature] Support Teradata JDBC Source ([3362](https://github.com/apache/seatunnel/pull/3362)) +- [Feature] Support JDBC Fetch Size Config ([3478](https://github.com/apache/seatunnel/pull/3478)) +- [Feature] Support Doris JDBC Source ([3586](https://github.com/apache/seatunnel/pull/3586)) +- [Feature] Support Redshift JDBC Sink([#3615](https://github.com/apache/seatunnel/pull/3615)) +- [BugFix] Fix jdbc connection reset bug ([3670](https://github.com/apache/seatunnel/pull/3670)) +- [Improve] Add Vertica connector([#4303](https://github.com/apache/seatunnel/pull/4303)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Jira.md b/versioned_docs/version-2.3.7/connector-v2/source/Jira.md new file mode 100644 index 000000000000..dcfe6cc11d37 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Jira.md @@ -0,0 +1,305 @@ +# Jira + +> Jira source connector + +## Description + +Used to read data from Jira. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| email | String | Yes | - | +| api_token | String | Yes | - | +| method | String | No | get | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### email [String] + +Jira Email + +### api_token [String] + +Jira API Token + +https://id.atlassian.com/manage-profile/security/api-tokens + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
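For this connector specifically, the same JSONPath approach can be applied to the Jira search endpoint. The sketch below is only a hedged illustration: the URL is a placeholder and the paths assume the response wraps results in an `issues` array whose elements expose `id` and `key`, so verify them against your instance's actual payload.

```hocon
source {
  Jira {
    url = "https://your-domain.atlassian.net/rest/api/3/search"
    email = "test@test.com"
    api_token = "xxx"
    method = "GET"
    format = "json"
    # JSONPath expressions are assumptions about the search response shape
    json_field = {
      id = "$.issues[*].id"
      key = "$.issues[*].key"
    }
    schema = {
      fields {
        id = string
        key = string
      }
    }
  }
}
```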
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Jira { + url = "https://liugddx.atlassian.net/rest/api/3/search" + email = "test@test.com" + api_token = "xxx" + schema { + fields { + expand = string + startAt = bigint + maxResults = int + total = int + } + } +} +``` + +## Changelog + +### next version + +- Add Jira Source Connector +- [Feature][Connector-V2][HTTP] Use json-path parsing ([3510](https://github.com/apache/seatunnel/pull/3510)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Kingbase.md b/versioned_docs/version-2.3.7/connector-v2/source/Kingbase.md new file mode 100644 index 000000000000..5e9630e4e526 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Kingbase.md @@ -0,0 +1,148 @@ +# Kingbase + +> JDBC Kingbase Source Connector + +## Support Connector Version + +- 8.6 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|--------------------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------| +| Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example: cp kingbase8-8.6.0.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Kingbase Data type | SeaTunnel Data type | +|-------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL | BOOLEAN | +| INT2 | SHORT | +| SMALLSERIAL
SERIAL
INT4 | INT | +| INT8
BIGSERIAL | BIGINT | +| FLOAT4 | FLOAT | +| FLOAT8 | DOUBLE | +| NUMERIC | DECIMAL(precision, scale), where precision is the column's specified size and scale is the number of digits to the right of the decimal point | +| BPCHAR<br/>
CHARACTER
VARCHAR
TEXT | STRING | +| TIMESTAMP | LOCALDATETIME | +| TIME | LOCALTIME | +| DATE | LOCALDATE | +| Other data type | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:kingbase8://localhost:54321/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.kingbase8.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. | +| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +``` +env { + parallelism = 2 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = "com.kingbase8.Driver" + url = "jdbc:kingbase8://localhost:54321/db_test" + user = "root" + password = "" + query = "select * from source" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table + +``` +source { + Jdbc { + driver = "com.kingbase8.Driver" + url = "jdbc:kingbase8://localhost:54321/db_test" + user = "root" + password = "" + query = "select * from source" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + driver = "com.kingbase8.Driver" + url = "jdbc:kingbase8://localhost:54321/db_test" + user = "root" + password = "" + query = "select * from source" + partition_column = "id" + partition_num = 10 + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Klaviyo.md b/versioned_docs/version-2.3.7/connector-v2/source/Klaviyo.md new file mode 100644 index 000000000000..e80a2434fdf1 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Klaviyo.md @@ -0,0 +1,312 @@ +# Klaviyo + +> Klaviyo source connector + +## Description + +Used to read data from Klaviyo. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| private_key | String | Yes | - | +| revision | String | Yes | - | +| method | String | No | get | +| schema | Config | No | - | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### private_key [String] + +API private key for login, you can get more detail at this link: + +https://developers.klaviyo.com/en/docs/retrieve_api_credentials + +### revision [String] + +API endpoint revision (format: YYYY-MM-DD) + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon +schema { + fields { + code = int + data = string + success = boolean + } +} +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
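A Klaviyo-flavoured sketch of the same idea is shown below. It is only an illustration: the JSONPath expressions assume the list endpoint returns a top-level `data` array whose elements carry `id` and an `attributes.name` value, so adjust them to the actual API response you receive.

```hocon
source {
  Klaviyo {
    url = "https://a.klaviyo.com/api/lists/"
    private_key = "SeaTunnel-test"
    revision = "2020-10-17"
    method = "GET"
    format = "json"
    # JSONPath expressions are assumptions about the response shape
    json_field = {
      id = "$.data[*].id"
      name = "$.data[*].attributes.name"
    }
    schema = {
      fields {
        id = string
        name = string
      }
    }
  }
}
```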
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Klaviyo { + url = "https://a.klaviyo.com/api/lists/" + private_key = "SeaTunnel-test" + revision = "2020-10-17" + method = "GET" + format = "json" + schema = { + fields { + type = string + id = string + attributes = { + name = string + created = string + updated = string + } + links = { + self = string + } + } + } +} +``` + +## Changelog + +### next version + +- Add Klaviyo Source Connector +- [Feature][Connector-V2][HTTP] Use json-path parsing ([3510](https://github.com/apache/seatunnel/pull/3510)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Kudu.md b/versioned_docs/version-2.3.7/connector-v2/source/Kudu.md new file mode 100644 index 000000000000..4d834e5e2d67 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Kudu.md @@ -0,0 +1,153 @@ +# Kudu + +> Kudu source connector + +## Support Kudu Version + +- 1.11.1/1.12.0/1.13.0/1.14.0/1.15.0 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Used to read data from Kudu. + +The tested kudu version is 1.11.1. + +## Data Type Mapping + +| kudu Data Type | SeaTunnel Data Type | +|--------------------------|---------------------| +| BOOL | BOOLEAN | +| INT8
INT16
INT32 | INT | +| INT64 | BIGINT | +| DECIMAL | DECIMAL | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| STRING | STRING | +| UNIXTIME_MICROS | TIMESTAMP | +| BINARY | BYTES | + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|--------|----------|------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| kudu_masters | String | Yes | - | Kudu master address. Separated by ',',such as '192.168.88.110:7051'. | +| table_name | String | Yes | - | The name of kudu table. | +| client_worker_count | Int | No | 2 * Runtime.getRuntime().availableProcessors() | Kudu worker count. Default value is twice the current number of cpu cores. | +| client_default_operation_timeout_ms | Long | No | 30000 | Kudu normal operation time out. | +| client_default_admin_operation_timeout_ms | Long | No | 30000 | Kudu admin operation time out. | +| enable_kerberos | Bool | No | false | Kerberos principal enable. | +| kerberos_principal | String | No | - | Kerberos principal. Note that all zeta nodes require have this file. | +| kerberos_keytab | String | No | - | Kerberos keytab. Note that all zeta nodes require have this file. | +| kerberos_krb5conf | String | No | - | Kerberos krb5 conf. Note that all zeta nodes require have this file. | +| scan_token_query_timeout | Long | No | 30000 | The timeout for connecting scan token. If not set, it will be the same as operationTimeout. | +| scan_token_batch_size_bytes | Int | No | 1024 * 1024 | Kudu scan bytes. The maximum number of bytes read at a time, the default is 1MB. | +| filter | Int | No | 1024 * 1024 | Kudu scan filter expressions,Not supported yet. | +| schema | Map | No | 1024 * 1024 | SeaTunnel Schema. | +| table_list | Array | No | - | The list of tables to be read. you can use this configuration instead of `table_path` example: ```table_list = [{ table_name = "kudu_source_table_1"},{ table_name = "kudu_source_table_2"}] ``` | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. 
| + +## Task Example + +### Simple: + +> The following example is for a Kudu table named "kudu_source_table", The goal is to print the data from this table on the console and write kudu table "kudu_sink_table" + +```hocon +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + kudu { + kudu_masters = "kudu-master:7051" + table_name = "kudu_source_table" + result_table_name = "kudu" + enable_kerberos = true + kerberos_principal = "xx@xx.COM" + kerberos_keytab = "xx.keytab" + } +} + +transform { +} + +sink { + console { + source_table_name = "kudu" + } + + kudu { + source_table_name = "kudu" + kudu_masters = "kudu-master:7051" + table_name = "kudu_sink_table" + enable_kerberos = true + kerberos_principal = "xx@xx.COM" + kerberos_keytab = "xx.keytab" + } +} +``` + +### Multiple Table + +```hocon +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + kudu{ + kudu_masters = "kudu-master:7051" + table_list = [ + { + table_name = "kudu_source_table_1" + },{ + table_name = "kudu_source_table_2" + } + ] + result_table_name = "kudu" +} +} + +transform { +} + +sink { + Assert { + rules { + table-names = ["kudu_source_table_1", "kudu_source_table_2"] + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Kudu Source Connector + +### Next Version + +- Change plugin name from `KuduSource` to `Kudu` [3432](https://github.com/apache/seatunnel/pull/3432) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Lemlist.md b/versioned_docs/version-2.3.7/connector-v2/source/Lemlist.md new file mode 100644 index 000000000000..76cac3b9bf81 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Lemlist.md @@ -0,0 +1,297 @@ +# Lemlist + +> Lemlist source connector + +## Description + +Used to read data from Lemlist. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| password | String | Yes | - | +| method | String | No | get | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### password [String] + +API key for login, you can get more detail at this link: + +https://app.lemlist.com/settings/integrations + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
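The retry options documented above do not appear in the final example, so the following is a minimal sketch (the values are illustrative) of layering them onto the basic Lemlist settings.

```hocon
source {
  Lemlist {
    url = "https://api.lemlist.com/api/campaigns"
    password = "SeaTunnel-test"
    # Retry up to 3 times on IOException, backing off between attempts
    retry = 3
    retry_backoff_multiplier_ms = 100
    retry_backoff_max_ms = 10000
    schema {
      fields {
        _id = string
        name = string
      }
    }
  }
}
```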
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Lemlist { + url = "https://api.lemlist.com/api/campaigns" + password = "SeaTunnel-test" + schema { + fields { + _id = string + name = string + } + } +} +``` + +## Changelog + +### next version + +- Add Lemlist Source Connector +- [Feature][Connector-V2][HTTP] Use json-path parsing ([3510](https://github.com/apache/seatunnel/pull/3510)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/LocalFile.md b/versioned_docs/version-2.3.7/connector-v2/source/LocalFile.md new file mode 100644 index 000000000000..05d87362f6d6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/LocalFile.md @@ -0,0 +1,408 @@ +# LocalFile + +> Local file source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from local file system. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +::: + +## Options + +| name | type | required | default value | +|---------------------------|---------|----------|--------------------------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| read_columns | list | no | - | +| delimiter/field_delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| skip_header_row_number | long | no | 0 | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | - | +| compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | +| tables_configs | list | no | used to define a multiple table task | + +### path [string] + +The source file path. + +### file_format_type [string] + +File type, supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. + +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. 
+ +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +If you assign file type to `binary`, SeaTunnel can synchronize files in any format, +such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. +Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization +at the same time. You can find the specific usage in the example below. + +### read_columns [list] + +The read column list of the data source, user can use it to implement field projection. + +### delimiter/field_delimiter [string] + +**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. + +Only need to be configured when file_format is text. + +Field delimiter, used to tell connector how to slice and dice fields. + +default `\001`, the same as hive's default delimiter + +### parse_partition_from_path [boolean] + +Control whether parse the partition keys and values from file path + +For example if you read a file from path `file://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` + +Every record data from file will be added these two fields: + +| name | age | +|---------------|-----| +| tyrantlucifer | 26 | + +Tips: **Do not define partition fields in schema option** + +### date_format [string] + +Date type format, used to tell connector how to convert string to date, supported as the following formats: + +`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` + +default `yyyy-MM-dd` + +### datetime_format [string] + +Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: + +`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` + +default `yyyy-MM-dd HH:mm:ss` + +### time_format [string] + +Time type format, used to tell connector how to convert string to time, supported as the following formats: + +`HH:mm:ss` `HH:mm:ss.SSS` + +default `HH:mm:ss` + +### skip_header_row_number [long] + +Skip the first few lines, but only for the txt and csv. + +For example, set like following: + +`skip_header_row_number = 2` + +then SeaTunnel will skip the first 2 lines from source files + +### schema [config] + +Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). + +#### fields [Config] + +The schema information of upstream data. + +### sheet_name [string] + +Only need to be configured when file_format is excel. + +Reader the sheet of the workbook. + +### xml_row_tag [string] + +Only need to be configured when file_format is xml. + +Specifies the tag name of the data rows within the XML file. + +### xml_use_attr_format [boolean] + +Only need to be configured when file_format is xml. + +Specifies Whether to process data using the tag attribute format. 
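As a quick illustration of how the two XML options work together with `schema`, the sketch below reads a hypothetical `/data/orders.xml` whose rows are wrapped in `<RECORD>` tags and carry their values as tag attributes; the path, tag name and field names are placeholders rather than an official example.

```hocon
LocalFile {
  path = "/data/orders.xml"
  file_format_type = "xml"
  # Each data row in the file is a <RECORD> element
  xml_row_tag = "RECORD"
  # Read values from tag attributes instead of nested elements
  xml_use_attr_format = true
  schema {
    fields {
      id = int
      name = string
      price = double
    }
  }
}
```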
+ +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +### tables_configs + +Used to define a multiple table task, when you have multiple tables to read, you can use this option to define multiple tables. + +## Example + +### One Table + +```hocon + +LocalFile { + path = "/apps/hive/demo/student" + file_format_type = "parquet" +} + +``` + +```hocon + +LocalFile { + schema { + fields { + name = string + age = int + } + } + path = "/apps/hive/demo/student" + file_format_type = "json" +} + +``` + +For json, text or csv file format with `encoding` + +```hocon + +LocalFile { + path = "/tmp/hive/warehouse/test2" + file_format_type = "text" + encoding = "gbk" +} + +``` + +### Multiple Table + +```hocon + +LocalFile { + tables_configs = [ + { + schema { + table = "student" + } + path = "/apps/hive/demo/student" + file_format_type = "parquet" + }, + { + schema { + table = "teacher" + } + path = "/apps/hive/demo/teacher" + file_format_type = "parquet" + } + ] +} + +``` + +```hocon + +LocalFile { + tables_configs = [ + { + schema { + fields { + name = string + age = int + } + } + path = "/apps/hive/demo/student" + file_format_type = "json" + }, + { + schema { + fields { + name = string + age = int + } + } + path = "/apps/hive/demo/teacher" + file_format_type = "json" + } +} + +``` + +### Transfer Binary File + +```hocon + +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + LocalFile { + path = "/seatunnel/read/binary/" + file_format_type = "binary" + } +} +sink { + // you can transfer local file to s3/hdfs/oss etc. + LocalFile { + path = "/seatunnel/read/binary2/" + file_format_type = "binary" + } +} + +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Local File Source Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085)) +- [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Maxcompute.md b/versioned_docs/version-2.3.7/connector-v2/source/Maxcompute.md new file mode 100644 index 000000000000..cb9bc32dd382 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Maxcompute.md @@ -0,0 +1,98 @@ +# Maxcompute + +> Maxcompute source connector + +## Description + +Used to read data from Maxcompute. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------|--------|----------|---------------| +| accessId | string | yes | - | +| accesskey | string | yes | - | +| endpoint | string | yes | - | +| project | string | yes | - | +| table_name | string | yes | - | +| partition_spec | string | no | - | +| split_row | int | no | 10000 | +| common-options | string | no | | +| schema | config | no | | + +### accessId [string] + +`accessId` Your Maxcompute accessId which cloud be access from Alibaba Cloud. + +### accesskey [string] + +`accesskey` Your Maxcompute accessKey which cloud be access from Alibaba Cloud. + +### endpoint [string] + +`endpoint` Your Maxcompute endpoint start with http. + +### project [string] + +`project` Your Maxcompute project which is created in Alibaba Cloud. + +### table_name [string] + +`table_name` Target Maxcompute table name eg: fake. + +### partition_spec [string] + +`partition_spec` This spec of Maxcompute partition table eg:ds='20220101'. + +### split_row [int] + +`split_row` Number of rows per split, default: 10000. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +### schema [config] + +#### fields [Config] + +The schema information of upstream data. + +## Examples + +```hocon +source { + Maxcompute { + accessId="" + accesskey="" + endpoint="" + project="" + table_name="" + #partition_spec="" + #split_row = 10000 + schema { + fields { + name = string + age = int + gender = string + } + } + } +} +``` + +## Changelog + +### next version + +- [Feature] Add Maxcompute Source Connector([3640](https://github.com/apache/seatunnel/pull/3640)) +- [Feature] Support Schema in MaxCompute Source([3640](https://github.com/apache/seatunnel/pull/5283)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Mivlus.md b/versioned_docs/version-2.3.7/connector-v2/source/Mivlus.md new file mode 100644 index 000000000000..a56df4c5fe77 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Mivlus.md @@ -0,0 +1,55 @@ +# Milvus + +> Milvus source connector + +## Description + +Read data from Milvus or Zilliz Cloud + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) + +## Data Type Mapping + +| Milvus Data Type | SeaTunnel Data Type | +|---------------------|---------------------| +| INT8 | TINYINT | +| INT16 | SMALLINT | +| INT32 | INT | +| INT64 | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BOOL | BOOLEAN | +| JSON | STRING | +| ARRAY | ARRAY | +| VARCHAR | STRING | +| FLOAT_VECTOR | FLOAT_VECTOR | +| BINARY_VECTOR | BINARY_VECTOR | +| FLOAT16_VECTOR | FLOAT16_VECTOR | +| BFLOAT16_VECTOR | BFLOAT16_VECTOR | +| SPARSE_FLOAT_VECTOR | SPARSE_FLOAT_VECTOR | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------|--------|----------|---------|--------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL to connect to Milvus or Zilliz Cloud. 
| +| token | String | Yes | - | User:password | +| database | String | Yes | default | Read data from which database. | +| collection | String | No | - | If set, will only read one collection, otherwise will read all collections under database. | + +## Task Example + +```bash +source { + Milvus { + url = "http://127.0.0.1:19530" + token = "username:password" + database = "default" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/MongoDB-CDC.md b/versioned_docs/version-2.3.7/connector-v2/source/MongoDB-CDC.md new file mode 100644 index 000000000000..a7bd980b6d32 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/MongoDB-CDC.md @@ -0,0 +1,312 @@ +# MongoDB CDC + +> MongoDB CDC source connector + +## Support Those Engines + +> SeaTunnel Zeta
+> Flink
+ +## Key Features + +- [ ] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +The MongoDB CDC connector allows for reading snapshot data and incremental data from MongoDB database. + +## Supported DataSource Info + +In order to use the Mongodb CDC connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|-------------------------------------------------------------------------------------------------------------------| +| MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-cdc-mongodb) | + +## Availability Settings + +1.MongoDB version: MongoDB version >= 4.0. + +2.Cluster deployment: replica sets or sharded clusters. + +3.Storage Engine: WiredTiger Storage Engine. + +4.Permissions:changeStream and read + +```shell +use admin; +db.createRole( + { + role: "strole", + privileges: [{ + resource: { db: "", collection: "" }, + actions: [ + "splitVector", + "listDatabases", + "listCollections", + "collStats", + "find", + "changeStream" ] + }], + roles: [ + { role: 'read', db: 'config' } + ] + } +); + +db.createUser( + { + user: 'stuser', + pwd: 'stpw', + roles: [ + { role: 'strole', db: 'admin' } + ] + } +); +``` + +## Data Type Mapping + +The following table lists the field data type mapping from MongoDB BSON type to Seatunnel data type. + +| MongoDB BSON Type | SeaTunnel Data Type | +|-------------------|---------------------| +| ObjectId | STRING | +| String | STRING | +| Boolean | BOOLEAN | +| Binary | BINARY | +| Int32 | INTEGER | +| Int64 | BIGINT | +| Double | DOUBLE | +| Decimal128 | DECIMAL | +| Date | DATE | +| Timestamp | TIMESTAMP | +| Object | ROW | +| Array | ARRAY | + +For specific types in MongoDB, we use Extended JSON format to map them to Seatunnel STRING type. + +| MongoDB BSON type | SeaTunnel STRING | +|-------------------|----------------------------------------------------------------------------------------------| +| Symbol | {"_value": {"$symbol": "12"}} | +| RegularExpression | {"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} | +| JavaScript | {"_value": {"$code": "function() { return 10; }"}} | +| DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} | + +**Tips** + +> 1.When using the DECIMAL type in SeaTunnel, be aware that the maximum range cannot exceed 34 digits, which means you should use decimal(34, 18).
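As a small, hedged illustration of that limit (the `amount` field name is made up), a `Decimal128` column can be declared in the `schema` block used by the job examples later on this page like this:

```hocon
schema = {
  fields {
    "_id" : string,
    # Decimal128 column mapped within the 34-digit limit
    "amount" : "decimal(34, 18)"
  }
}
```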
+ +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| hosts | String | Yes | - | The comma-separated list of hostname and port pairs of the MongoDB servers. eg. `localhost:27017,localhost:27018` | +| username | String | No | - | Name of the database user to be used when connecting to MongoDB. | +| password | String | No | - | Password to be used when connecting to MongoDB. | +| database | List | Yes | - | Name of the database to watch for changes. If not set then all databases will be captured. The database also supports regular expressions to monitor multiple databases matching the regular expression. eg. `db1,db2`. | +| collection | List | Yes | - | Name of the collection in the database to watch for changes. If not set then all collections will be captured. The collection also supports regular expressions to monitor multiple collections matching fully-qualified collection identifiers. eg. `db1.coll1,db2.coll2`. | +| connection.options | String | No | - | The ampersand-separated connection options of MongoDB. eg. `replicaSet=test&connectTimeoutMS=300000`. | +| batch.size | Long | No | 1024 | The cursor batch size. | +| poll.max.batch.size | Enum | No | 1024 | Maximum number of change stream documents to include in a single batch when polling for new data. | +| poll.await.time.ms | Long | No | 1000 | The amount of time to wait before checking for new results on the change stream. | +| heartbeat.interval.ms | String | No | 0 | The length of time in milliseconds between sending heartbeat messages. Use 0 to disable. | +| incremental.snapshot.chunk.size.mb | Long | No | 64 | The chunk size mb of incremental snapshot. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### Tips: + +> 1.If the collection changes at a slow pace, it is strongly recommended to set an appropriate value greater than 0 for the heartbeat.interval.ms parameter. When we recover a Seatunnel job from a checkpoint or savepoint, the heartbeat events can push the resumeToken forward to avoid its expiration.
+> 2.MongoDB has a limit of 16MB for a single document. Change documents include additional information, so even if the original document is not larger than 15MB, the change document may exceed the 16MB limit, resulting in the termination of the Change Stream operation.
+> 3.It is recommended to use immutable shard keys. In MongoDB, shard keys allow modifications after transactions are enabled, but changing the shard key can cause frequent shard migrations, resulting in additional performance overhead. Additionally, modifying the shard key can also cause the Update Lookup feature to become ineffective, leading to inconsistent results in CDC (Change Data Capture) scenarios.
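To make the first tip concrete, here is a minimal sketch (host, credentials and the 30-second value are illustrative) of enabling heartbeats for a slowly changing collection:

```hocon
source {
  MongoDB-CDC {
    hosts = "mongo0:27017"
    database = ["inventory"]
    collection = ["inventory.products"]
    username = stuser
    password = stpw
    # Emit a heartbeat every 30s so the resumeToken keeps advancing
    # even when the collection itself receives few changes
    heartbeat.interval.ms = 30000
  }
}
```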
+ +## How to Create a MongoDB CDC Data Synchronization Jobs + +### CDC Data Print to Client + +The following example demonstrates how to create a data synchronization job that reads cdc data from MongoDB and prints it on the local client: + +```hocon +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MongoDB-CDC { + hosts = "mongo0:27017" + database = ["inventory"] + collection = ["inventory.products"] + username = stuser + password = stpw + schema = { + fields { + "_id" : string, + "name" : string, + "description" : string, + "weight" : string + } + } + } +} + +# Console printing of the read Mongodb data +sink { + Console { + parallelism = 1 + } +} +``` + +## CDC Data Write to MysqlDB + +The following example demonstrates how to create a data synchronization job that reads cdc data from MongoDB and write to mysql database: + +```hocon +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MongoDB-CDC { + hosts = "mongo0:27017" + database = ["inventory"] + collection = ["inventory.products"] + username = stuser + password = stpw + } +} + +sink { + jdbc { + url = "jdbc:mysql://mysql_cdc_e2e:3306" + driver = "com.mysql.cj.jdbc.Driver" + user = "st_user" + password = "seatunnel" + + generate_sink_sql = true + # You need to configure both database and table + database = mongodb_cdc + table = products + primary_keys = ["_id"] + } +} +``` + +## Multi-table Synchronization + +The following example demonstrates how to create a data synchronization job that read the cdc data of multiple library tables mongodb and prints it on the local client: + +```hocon +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MongoDB-CDC { + hosts = "mongo0:27017" + database = ["inventory","crm"] + collection = ["inventory.products","crm.test"] + username = stuser + password = stpw + } +} + +# Console printing of the read Mongodb data +sink { + Console { + parallelism = 1 + } +} +``` + +### Tips: + +> 1.The cdc synchronization of multiple library tables cannot specify the schema, and can only output json data downstream. +> This is because MongoDB does not provide metadata information for querying, so if you want to support multiple tables, all tables can only be read as one structure. + +## Regular Expression Matching for Multiple Tables + +The following example demonstrates how to create a data synchronization job that through regular expression read the data of multiple library tables mongodb and prints it on the local client: + +| Matching example | Expressions | | Describe | +|------------------|-------------|---|----------------------------------------------------------------------------------------| +| Prefix matching | ^(test).* | | Match the database name or table name with the prefix test, such as test1, test2, etc. | +| Suffix matching | .*[p$] | | Match the database name or table name with the suffix p, such as cdcp, edcp, etc. | + +```hocon +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MongoDB-CDC { + hosts = "mongo0:27017" + # So this example is used (^(test).*|^(tpc).*|txc|.*[p$]|t{2}).(t[5-8]|tt),matching txc.tt、test2.test5. 
+ database = ["(^(test).*|^(tpc).*|txc|.*[p$]|t{2})"] + collection = ["(t[5-8]|tt)"] + username = stuser + password = stpw + } +} + +# Console printing of the read Mongodb data +sink { + Console { + parallelism = 1 + } +} +``` + +## Format of real-time streaming data + +```shell +{ + _id : { }, // Identifier of the open change stream, can be assigned to the 'resumeAfter' parameter for subsequent resumption of this change stream + "operationType" : "", // The type of change operation that occurred, such as: insert, delete, update, etc. + "fullDocument" : { }, // The full document data involved in the change operation. This field does not exist in delete operations + "ns" : { + "db" : "", // The database where the change operation occurred + "coll" : "" // The collection where the change operation occurred + }, + "to" : { // These fields are displayed only when the operation type is 'rename' + "db" : "", // The new database name after the change + "coll" : "" // The new collection name after the change + }, + "source":{ + "ts_ms":"", // The timestamp when the change operation occurred + "table":"" // The collection where the change operation occurred + "db":"", // The database where the change operation occurred + "snapshot":"false" // Identify the current stage of data synchronization + }, + "documentKey" : { "_id" : }, // The _id field value of the document involved in the change operation + "updateDescription" : { // Description of the update operation + "updatedFields" : { }, // The fields and values that the update operation modified + "removedFields" : [ "", ... ] // The fields and values that the update operation removed + } + "clusterTime" : , // The timestamp of the Oplog log entry corresponding to the change operation + "txnNumber" : , // If the change operation is executed in a multi-document transaction, this field and value are displayed, representing the transaction number + "lsid" : { // Represents information related to the Session in which the transaction is located + "id" : , + "uid" : + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/MongoDB.md b/versioned_docs/version-2.3.7/connector-v2/source/MongoDB.md new file mode 100644 index 000000000000..4662e1712a96 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/MongoDB.md @@ -0,0 +1,458 @@ +# MongoDB + +> MongoDB Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +The MongoDB Connector provides the ability to read and write data from and to MongoDB. +This document describes how to set up the MongoDB connector to run data reads against MongoDB. + +## Supported DataSource Info + +In order to use the Mongodb connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|---------------------------------------------------------------------------------------------------------------| +| MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-mongodb) | + +## Data Type Mapping + +The following table lists the field data type mapping from MongoDB BSON type to SeaTunnel data type. + +| MongoDB BSON type | SeaTunnel Data type | +|-------------------|---------------------| +| ObjectId | STRING | +| String | STRING | +| Boolean | BOOLEAN | +| Binary | BINARY | +| Int32 | INTEGER | +| Int64 | BIGINT | +| Double | DOUBLE | +| Decimal128 | DECIMAL | +| Date | Date | +| Timestamp | Timestamp | +| Object | ROW | +| Array | ARRAY | + +For specific types in MongoDB, we use Extended JSON format to map them to SeaTunnel STRING type. + +| MongoDB BSON type | SeaTunnel STRING | +|-------------------|----------------------------------------------------------------------------------------------| +| Symbol | {"_value": {"$symbol": "12"}} | +| RegularExpression | {"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} | +| JavaScript | {"_value": {"$code": "function() { return 10; }"}} | +| DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} | + +**Tips** + +> 1.When using the DECIMAL type in SeaTunnel, be aware that the maximum range cannot exceed 34 digits, which means you should use decimal(34, 18).
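+
+For illustration, a Decimal128 field could be declared in the SeaTunnel schema like this (a minimal sketch; the field names are hypothetical):
+
+```hocon
+# Hypothetical schema fragment mapping a MongoDB Decimal128 field to SeaTunnel DECIMAL
+schema = {
+  fields {
+    _id    = string
+    amount = "decimal(34, 18)"  # stays within the 34-digit limit noted above
+  }
+}
+```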
+ +## Source Options + +| Name | Type | Required | Default | Description | +|----------------------|---------|----------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| uri | String | Yes | - | The MongoDB standard connection uri. eg. mongodb://user:password@hosts:27017/database?readPreference=secondary&slaveOk=true. | +| database | String | Yes | - | The name of MongoDB database to read or write. | +| collection | String | Yes | - | The name of MongoDB collection to read or write. | +| schema | String | Yes | - | MongoDB's BSON and seatunnel data structure mapping. | +| match.query | String | No | - | In MongoDB, filters are used to filter documents for query operations. | +| match.projection | String | No | - | In MongoDB, Projection is used to control the fields contained in the query results. | +| partition.split-key | String | No | _id | The key of Mongodb fragmentation. | +| partition.split-size | Long | No | 64 * 1024 * 1024 | The size of Mongodb fragment. | +| cursor.no-timeout | Boolean | No | true | MongoDB server normally times out idle cursors after an inactivity period (10 minutes) to prevent excess memory use. Set this option to true to prevent that. However, if the application takes longer than 30 minutes to process the current batch of documents, the session is marked as expired and closed. | +| fetch.size | Int | No | 2048 | Set the number of documents obtained from the server for each batch. Setting the appropriate batch size can improve query performance and avoid the memory pressure caused by obtaining a large amount of data at one time. | +| max.time-min | Long | No | 600 | This parameter is a MongoDB query option that limits the maximum execution time for query operations. The value of maxTimeMin is in Minute. If the execution time of the query exceeds the specified time limit, MongoDB will terminate the operation and return an error. | +| flat.sync-string | Boolean | No | true | By utilizing flatSyncString, only one field attribute value can be set, and the field type must be a String. This operation will perform a string mapping on a single MongoDB data entry. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> 1.The parameter `match.query` is compatible with the historical old version parameter `matchQuery`, and they are equivalent replacements.
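+
+As a rough sketch of how the read-tuning options above fit together in a source block (the connection details and option values below are purely illustrative, not recommendations):
+
+```hocon
+source {
+  MongoDB {
+    uri = "mongodb://user:password@127.0.0.1:27017"
+    database = "test_db"
+    collection = "orders"
+    # Read-tuning options from the table above (illustrative values)
+    fetch.size = 4096                 # documents fetched from the server per batch
+    max.time-min = 30                 # abort queries running longer than 30 minutes
+    cursor.no-timeout = true          # keep server-side cursors alive during long reads
+    partition.split-key = "_id"       # key used to split the collection into partitions
+    partition.split-size = 134217728  # 128 MB per partition
+    schema = {
+      fields {
+        _id = string
+        status = string
+      }
+    }
+  }
+}
+```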
+ +## How to Create a MongoDB Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that reads data from MongoDB and prints it on the local client: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to connect to Mongodb +source { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "source_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +# Console printing of the read Mongodb data +sink { + Console { + parallelism = 1 + } +} +``` + +## Parameter Interpretation + +### MongoDB Database Connection URI Examples + +Unauthenticated single node connection: + +```bash +mongodb://192.168.0.100:27017/mydb +``` + +Replica set connection: + +```bash +mongodb://192.168.0.100:27017/mydb?replicaSet=xxx +``` + +Authenticated replica set connection: + +```bash +mongodb://admin:password@192.168.0.100:27017/mydb?replicaSet=xxx&authSource=admin +``` + +Multi-node replica set connection: + +```bash +mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb?replicaSet=xxx +``` + +Sharded cluster connection: + +```bash +mongodb://192.168.0.100:27017/mydb +``` + +Multiple mongos connections: + +```bash +mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb +``` + +Note: The username and password in the URI must be URL-encoded before being concatenated into the connection string. + +### MatchQuery Scan + +In data synchronization scenarios, the matchQuery approach needs to be used early to reduce the number of documents that need to be processed by subsequent operators, thus improving performance. +Here is a simple example of a seatunnel using `match.query` + +```bash +source { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "orders" + match.query = "{status: \"A\"}" + schema = { + fields { + id = bigint + status = string + } + } + } +} +``` + +The following are examples of MatchQuery query statements of various data types: + +```bash +# Query Boolean type +"{c_boolean:true}" +# Query string type +"{c_string:\"OCzCj\"}" +# Query the integer +"{c_int:2}" +# Type of query time +"{c_date:ISODate(\"2023-06-26T16:00:00.000Z\")}" +# Query floating point type +{c_double:{$gte:1.71763202185342e+308}} +``` + +Please refer to how to write the syntax of `match.query`:https://www.mongodb.com/docs/manual/tutorial/query-documents + +### Projection Scan + +In MongoDB, Projection is used to control which fields are included in the query results. This can be accomplished by specifying which fields need to be returned and which fields do not. +In the find() method, a projection object can be passed as a second argument. The key of the projection object indicates the fields to include or exclude, and a value of 1 indicates inclusion and 0 indicates exclusion. 
+Here is a simple example, assuming we have a collection named users: + +```bash +# Returns only the name and email fields +db.users.find({}, { name: 1, email: 0 }); +``` + +In data synchronization scenarios, projection needs to be used early to reduce the number of documents that need to be processed by subsequent operators, thus improving performance. +Here is a simple example of a seatunnel using projection: + +```bash +source { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "users" + match.projection = "{ name: 1, email: 0 }" + schema = { + fields { + name = string + } + } + } +} + +``` + +### Partitioned Scan + +To speed up reading data in parallel source task instances, seatunnel provides a partitioned scan feature for MongoDB collections. The following partitioning strategies are provided. +Users can control data sharding by setting the partition.split-key for sharding keys and partition.split-size for sharding size. + +```bash +source { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "users" + partition.split-key = "id" + partition.split-size = 1024 + schema = { + fields { + id = bigint + status = string + } + } + } +} + +``` + +### Flat Sync String + +By utilizing `flat.sync-string`, only one field attribute value can be set, and the field type must be a String. +This operation will perform a string mapping on a single MongoDB data entry. + +```bash +env { + parallelism = 10 + job.mode = "BATCH" +} +source { + MongoDB { + uri = "mongodb://user:password@127.0.0.1:27017" + database = "test_db" + collection = "users" + flat.sync-string = true + schema = { + fields { + data = string + } + } + } +} +sink { + Console {} +} +``` + +Use the data samples synchronized with modified parameters, such as the following: + +```json +{ + "_id":{ + "$oid":"643d41f5fdc6a52e90e59cbf" + }, + "c_map":{ + "OQBqH":"jllt", + "rkvlO":"pbfdf", + "pCMEX":"hczrdtve", + "DAgdj":"t", + "dsJag":"voo" + }, + "c_array":[ + { + "$numberInt":"-865590937" + }, + { + "$numberInt":"833905600" + }, + { + "$numberInt":"-1104586446" + }, + { + "$numberInt":"2076336780" + }, + { + "$numberInt":"-1028688944" + } + ], + "c_string":"bddkzxr", + "c_boolean":false, + "c_tinyint":{ + "$numberInt":"39" + }, + "c_smallint":{ + "$numberInt":"23672" + }, + "c_int":{ + "$numberInt":"-495763561" + }, + "c_bigint":{ + "$numberLong":"3768307617923954543" + }, + "c_float":{ + "$numberDouble":"5.284220288280258E37" + }, + "c_double":{ + "$numberDouble":"1.1706091642478246E308" + }, + "c_bytes":{ + "$binary":{ + "base64":"ZWJ4", + "subType":"00" + } + }, + "c_date":{ + "$date":{ + "$numberLong":"1686614400000" + } + }, + "c_decimal":{ + "$numberDecimal":"683265300" + }, + "c_timestamp":{ + "$date":{ + "$numberLong":"1684283772000" + } + }, + "c_row":{ + "c_map":{ + "OQBqH":"cbrzhsktmm", + "rkvlO":"qtaov", + "pCMEX":"tuq", + "DAgdj":"jzop", + "dsJag":"vwqyxtt" + }, + "c_array":[ + { + "$numberInt":"1733526799" + }, + { + "$numberInt":"-971483501" + }, + { + "$numberInt":"-1716160960" + }, + { + "$numberInt":"-919976360" + }, + { + "$numberInt":"727499700" + } + ], + "c_string":"oboislr", + "c_boolean":true, + "c_tinyint":{ + "$numberInt":"-66" + }, + "c_smallint":{ + "$numberInt":"1308" + }, + "c_int":{ + "$numberInt":"-1573886733" + }, + "c_bigint":{ + "$numberLong":"4877994302999518682" + }, + "c_float":{ + "$numberDouble":"1.5353209063652051E38" + }, + "c_double":{ + "$numberDouble":"1.1952441956458565E308" + }, + "c_bytes":{ + 
"$binary":{ + "base64":"cWx5Ymp0Yw==", + "subType":"00" + } + }, + "c_date":{ + "$date":{ + "$numberLong":"1686614400000" + } + }, + "c_decimal":{ + "$numberDecimal":"656406177" + }, + "c_timestamp":{ + "$date":{ + "$numberLong":"1684283772000" + } + } + }, + "id":{ + "$numberInt":"2" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add MongoDB Source Connector + +### Next Version + +- [Feature]Refactor mongodb source connector([4620](https://github.com/apache/seatunnel/pull/4620)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/MyHours.md b/versioned_docs/version-2.3.7/connector-v2/source/MyHours.md new file mode 100644 index 000000000000..012b32fb1e22 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/MyHours.md @@ -0,0 +1,310 @@ +# My Hours + +> My Hours source connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Used to read data from My Hours. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Supported DataSource Info + +In order to use the My Hours connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|---------------------------------------------------------------------------------------------| +| My Hours | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-----------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | Http request url. | +| email | String | Yes | - | My hours login email address. | +| password | String | Yes | - | My hours login password. | +| schema | Config | No | - | Http and seatunnel data structure mapping | +| schema.fields | Config | No | - | The schema fields of upstream data | +| json_field | Config | No | - | This parameter helps you configure the schema,so this parameter must be used with schema. | +| content_json | String | No | - | This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. | +| format | String | No | json | The format of upstream data, now only support `json` `text`, default `json`. | +| method | String | No | get | Http request method, only supports GET, POST method. | +| headers | Map | No | - | Http headers. | +| params | Map | No | - | Http params. | +| body | String | No | - | Http body. | +| poll_interval_millis | Int | No | - | Request http api interval(millis) in stream mode. | +| retry | Int | No | - | The max retry times if request http return to `IOException`. | +| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. 
| +| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | +| enable_multi_lines | Boolean | No | false | | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## How to Create a My Hours Data Synchronization Jobs + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +MyHours{ + url = "https://api2.myhours.com/api/Projects/getAll" + email = "seatunnel@test.com" + password = "seatunnel" + schema { + fields { + name = string + archived = boolean + dateArchived = string + dateCreated = string + clientName = string + budgetAlertPercent = string + budgetType = int + totalTimeLogged = double + budgetValue = double + totalAmount = double + totalExpense = double + laborCost = double + totalCost = double + billableTimeLogged = double + totalBillableAmount = double + billable = boolean + roundType = int + roundInterval = int + budgetSpentPercentage = double + budgetTarget = int + budgetPeriodType = string + budgetSpent = string + id = string + } + } +} + +# Console printing of the read data +sink { + Console { + parallelism = 1 + } +} +``` + +## Parameter Interpretation + +### format + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### content_json + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). + +## Changelog + +### next version + +- Add My Hours Source Connector +- [Feature][Connector-V2][HTTP] Use json-path parsing ([3510](https://github.com/apache/seatunnel/pull/3510)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/MySQL-CDC.md b/versioned_docs/version-2.3.7/connector-v2/source/MySQL-CDC.md new file mode 100644 index 000000000000..2cf6c506c65e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/MySQL-CDC.md @@ -0,0 +1,272 @@ +# MySQL CDC + +> MySQL CDC source connector + +## Support Those Engines + +> SeaTunnel Zeta
+> Flink
+ +## Description + +The MySQL CDC connector allows for reading snapshot data and incremental data from MySQL database. This document +describes how to set up the MySQL CDC connector to run SQL queries against MySQL databases. + +## Key features + +- [ ] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------|----------------------------------------------------------------------| +| MySQL |
  • [MySQL](https://dev.mysql.com/doc): 5.5, 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java/8.0.28 | + +## Using Dependency + +### Install Jdbc Driver + +#### For Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +#### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +### Creating MySQL user + +You have to define a MySQL user with appropriate permissions on all databases that the Debezium MySQL connector monitors. + +1. Create the MySQL user: + +```sql +mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password'; +``` + +2. Grant the required permissions to the user: + +```sql +mysql> GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password'; +``` + +3. Finalize the user’s permissions: + +```sql +mysql> FLUSH PRIVILEGES; +``` + +### Enabling the MySQL Binlog + +You must enable binary logging for MySQL replication. The binary logs record transaction updates for replication tools to propagate changes. + +1. Check whether the `log-bin` option is already on: + +```sql +mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); ++--------------------------+----------------+ +| Variable_name | Value | ++--------------------------+----------------+ +| binlog_format | ROW | +| binlog_row_image | FULL | +| enforce_gtid_consistency | ON | +| gtid_mode | ON | +| log_bin | ON | ++--------------------------+----------------+ +5 rows in set (0.00 sec) +``` + +2. If inconsistent with the above results, configure your MySQL server configuration file(`$MYSQL_HOME/mysql.cnf`) with the following properties, which are described in the table below: + +``` +# Enable binary replication log and set the prefix, expiration, and log format. +# The prefix is arbitrary, expiration can be short for integration tests but would +# be longer on a production system. Row-level info is required for ingest to work. +# Server ID is required, but this will vary on production systems +server-id = 223344 +log_bin = mysql-bin +expire_logs_days = 10 +binlog_format = row +# mysql 5.6+ requires binlog_row_image to be set to FULL +binlog_row_image = FULL + +# enable gtid mode +# mysql 5.6+ requires gtid_mode to be set to ON +gtid_mode = on +enforce_gtid_consistency = on +``` + +3. Restart MySQL Server + +```shell +/etc/inint.d/mysqld restart +``` + +4. 
Confirm your changes by checking the binlog status once more: + +MySQL 5.5: + +```sql +mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); ++--------------------------+----------------+ +| Variable_name | Value | ++--------------------------+----------------+ +| binlog_format | ROW | +| log_bin | ON | ++--------------------------+----------------+ +5 rows in set (0.00 sec) +``` + +MySQL 5.6+: + +```sql +mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); ++--------------------------+----------------+ +| Variable_name | Value | ++--------------------------+----------------+ +| binlog_format | ROW | +| binlog_row_image | FULL | +| enforce_gtid_consistency | ON | +| gtid_mode | ON | +| log_bin | ON | ++--------------------------+----------------+ +5 rows in set (0.00 sec) +``` + +### Notes + +#### Setting up MySQL session timeouts + +When an initial consistent snapshot is made for large databases, your established connection could timeout while the tables are being read. You can prevent this behavior by configuring interactive_timeout and wait_timeout in your MySQL configuration file. +- `interactive_timeout`: The number of seconds the server waits for activity on an interactive connection before closing it. See [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout) for more details. +- `wait_timeout`: The number of seconds the server waits for activity on a non-interactive connection before closing it. See [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout) for more details. + +*For more database settings see [Debezium MySQL Connector](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#setting-up-mysql)* + +## Data Type Mapping + +| Mysql Data Type | SeaTunnel Data Type | +|------------------------------------------------------------------------------------------------|---------------------| +| BIT(1)
    TINYINT(1) | BOOLEAN | +| TINYINT | TINYINT | +| TINYINT UNSIGNED
    SMALLINT | SMALLINT | +| SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | +| INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(p, s)
    DECIMAL(p, s) UNSIGNED
    NUMERIC(p, s)
    NUMERIC(p, s) UNSIGNED | DECIMAL(p,s) | +| FLOAT
    FLOAT UNSIGNED | FLOAT | +| DOUBLE
    DOUBLE UNSIGNED
    REAL
    REAL UNSIGNED | DOUBLE | +| CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    ENUM
    JSON
    ENUM | STRING | +| DATE | DATE | +| TIME(s) | TIME(s) | +| DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | +| BINARY
    VARBINAR
    BIT(p)
    TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    GEOMETRY | BYTES | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:mysql://localhost:3306:3306/test`. | +| username | String | Yes | - | Name of the database to use when connecting to the database server. | +| password | String | Yes | - | Password to use when connecting to the database server. | +| database-names | List | No | - | Database name of the database to monitor. | +| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | +| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] | +| startup.mode | Enum | No | INITIAL | Optional startup mode for MySQL CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset.
    `specific`: Startup from user-supplied specific offsets. | +| startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** | +| startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** | +| stop.mode | Enum | No | NEVER | Optional stop mode for MySQL CDC consumer, valid enumerations are `never`, `latest` or `specific`.
    `never`: Real-time job don't stop the source.
    `latest`: Stop from the latest offset.
    `specific`: Stop from user-supplied specific offset. | +| stop.specific-offset.file | String | No | - | Stop from the specified binlog file name. **Note, This option is required when the `stop.mode` option used `specific`.** | +| stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position. **Note, This option is required when the `stop.mode` option used `specific`.** | +| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | +| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | +| server-id | String | No | - | A numeric ID or a numeric ID range of this database client, The numeric ID syntax is like `5400`, the numeric ID range syntax is like '5400-5408'.
    Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the
    MySQL cluster as another server (with this unique ID) so it can read the binlog.
    By default, a random number is generated between 6500 and 2,148,492,146, though we recommend setting an explicit value. | +| server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | +| connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | +| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | +| connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | +| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| exactly_once | Boolean | No | false | Enable exactly once semantic. | +| format | Enum | No | DEFAULT | Optional output format for MySQL CDC, valid enumerations are `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`. 
| +| debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#connector-properties) to Debezium Embedded Engine which is used to capture data changes from MySQL server. | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example + +### Simple + +> Support multi-table reading + +``` +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 10000 +} + +source { + MySQL-CDC { + base-url = "jdbc:mysql://localhost:3306/testdb" + username = "root" + password = "root@123" + table-names = ["testdb.table1", "testdb.table2"] + + startup.mode = "initial" + } +} + +sink { + Console { + } +} +``` + +### Support debezium-compatible format send to kafka + +> Must be used with kafka connector sink, see [compatible debezium format](../formats/cdc-compatible-debezium-json.md) for details + +### Support custom primary key for table + +``` +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 10000 +} + +source { + MySQL-CDC { + base-url = "jdbc:mysql://localhost:3306/testdb" + username = "root" + password = "root@123" + + table-names = ["testdb.table1", "testdb.table2"] + table-names-config = [ + { + table = "testdb.table2" + primaryKeys = ["id"] + } + ] + } +} + +sink { + Console { + } +} +``` + +## Changelog + +- Add MySQL CDC Source Connector + +### next version + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Mysql.md b/versioned_docs/version-2.3.7/connector-v2/source/Mysql.md new file mode 100644 index 000000000000..b97ec324eabc --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Mysql.md @@ -0,0 +1,319 @@ +# MySQL + +> JDBC Mysql Source Connector + +## Description + +Read external data source data through JDBC. + +## Support Mysql Version + +- 5.5/5.6/5.7/8.0/8.4 + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) +- [x] [support multiple table reading](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|---------------------------------------|---------------------------------------------------------------------------| +| Mysql | Different dependency version has different driver class. | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306:3306/test | [Download](https://mvnrepository.com/artifact/mysql/mysql-connector-java) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
    +> For example Mysql datasource: cp mysql-connector-java-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Mysql Data Type | SeaTunnel Data Type | +|-----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
    TINYINT(1) | BOOLEAN | +| TINYINT | BYTE | +| TINYINT UNSIGNED
    SMALLINT | SMALLINT | +| SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | +| INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
    (Gets the designated column's number of digits to right of the decimal point.))) | +| FLOAT
    FLOAT UNSIGNED | FLOAT | +| DOUBLE
    DOUBLE UNSIGNED | DOUBLE | +| CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON
    ENUM | STRING | +| DATE | DATE | +| TIME(s) | TIME(s) | +| DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | +| TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINAR
    BIT(n)
    GEOMETRY | BYTES | + +## Source Options + +| Name | Type | Required | Default | Description | +|--------------------------------------------|------------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query toimprove performance by
    reducing the number database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| table_path | Int | No | 0 | The path to the full path of table, you can use this configuration instead of `query`.
    examples:
    mysql: "testdb.table1"
    oracle: "test_schema.table1"
    sqlserver: "testdb.test_schema.table1"
    postgresql: "testdb.test_schema.table1" | +| table_list | Array | No | 0 | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | +| where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | +| split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | +| split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Parallel Reader + +The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. + +**Split Key Rules:** + +1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. 
+2. If `partition_column` is null, SeaTunnel will read the schema from the table and get the Primary Key and Unique Index. If there is more than one column in the Primary Key and Unique Index, the first column that is in the **supported split data type** will be used to split the data. For example, if a table has Primary Key (guid, name varchar), then because `guid` is not in the **supported split data type**, the column `name` will be used to split the data.
+
+**Supported split data type:**
+* String
+* Number(int, bigint, decimal, ...)
+* Date
+
+### Options Related To Split
+
+#### split.size
+
+How many rows are in one split; captured tables are split into multiple splits when the table is read.
+
+#### split.even-distribution.factor.lower-bound
+
+> Not recommended for use
+
+The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05.
+
+#### split.even-distribution.factor.upper-bound
+
+> Not recommended for use
+
+The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0.
+
+#### split.sample-sharding.threshold
+
+This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards.
+
+#### split.inverse-sampling.rate
+
+The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000.
+
+#### partition_column [string]
+
+The column name used for splitting the data.
+
+#### partition_upper_bound [BigDecimal]
+
+The maximum value of partition_column for the scan; if not set, SeaTunnel will query the database to get the max value.
+
+#### partition_lower_bound [BigDecimal]
+
+The minimum value of partition_column for the scan; if not set, SeaTunnel will query the database to get the min value.
+ +#### partition_num [int] + +> Not recommended for use, The correct approach is to control the number of split through `split.size` + +How many splits do we need to split into, only support positive integer. default value is job parallelism. + +## tips + +> If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. +> +> Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + parallelism = 4 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from type_bin limit 16" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### parallel by partition_column + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from type_bin" + partition_column = "id" + split.size = 10000 + # Read start boundary + #partition_lower_bound = ... + # Read end boundary + #partition_upper_bound = ... 
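+    # If the bounds above are left unset, SeaTunnel queries the MIN/MAX of partition_column at runtime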
+ } +} + +sink { + Console {} +} +``` + +### parallel by Primary Key or Unique Index + +> Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + table_path = "testdb.table1" + query = "select * from testdb.table1" + split.size = 10000 + } +} + +sink { + Console {} +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + properties { + useSSL=false + } + } +} +``` + +### Multiple table read: + +***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** + +```hocon +env { + job.mode = "BATCH" + parallelism = 4 +} +source { + Jdbc { + url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" + driver = "com.mysql.cj.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + + table_list = [ + { + table_path = "testdb.table1" + }, + { + table_path = "testdb.table2" + # Use query filetr rows & columns + query = "select id, name from testdb.table2 where id > 100" + } + ] + #where_condition= "where id > 100" + #split.size = 8096 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 + } +} + +sink { + Console {} +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Neo4j.md b/versioned_docs/version-2.3.7/connector-v2/source/Neo4j.md new file mode 100644 index 000000000000..9797a8a575d7 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Neo4j.md @@ -0,0 +1,107 @@ +# Neo4j + +> Neo4j source connector + +## Description + +Read data from Neo4j. 
+ +`neo4j-java-driver` version 4.4.9 + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|----------------------------|--------|----------|---------------| +| uri | String | Yes | - | +| username | String | No | - | +| password | String | No | - | +| bearer_token | String | No | - | +| kerberos_ticket | String | No | - | +| database | String | Yes | - | +| query | String | Yes | - | +| schema | Object | Yes | - | +| max_transaction_retry_time | Long | No | 30 | +| max_connection_timeout | Long | No | 30 | + +### uri [string] + +The URI of the Neo4j database. Refer to a case: `neo4j://localhost:7687` + +### username [string] + +username of the Neo4j + +### password [string] + +password of the Neo4j. required if `username` is provided + +### bearer_token [string] + +base64 encoded bearer token of the Neo4j. for Auth. + +### kerberos_ticket [string] + +base64 encoded kerberos ticket of the Neo4j. for Auth. + +### database [string] + +database name. + +### query [string] + +Query statement. + +### schema.fields [string] + +returned fields of `query` + +see [column projection](../../concept/connector-v2-features.md) + +### max_transaction_retry_time [long] + +maximum transaction retry time(seconds). transaction fail if exceeded + +### max_connection_timeout [long] + +The maximum amount of time to wait for a TCP connection to be established (seconds) + +## Example + +``` +source { + Neo4j { + uri = "neo4j://localhost:7687" + username = "neo4j" + password = "1234" + database = "neo4j" + + max_transaction_retry_time = 1 + max_connection_timeout = 1 + + query = "MATCH (a:Person) RETURN a.name, a.age" + + schema { + fields { + a.age=INT + a.name=STRING + } + } + } +} +``` + +## Changelog + +### next version + +- Add Neo4j Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Notion.md b/versioned_docs/version-2.3.7/connector-v2/source/Notion.md new file mode 100644 index 000000000000..d138c21c1d69 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Notion.md @@ -0,0 +1,308 @@ +# Notion + +> Notion source connector + +## Description + +Used to read data from Notion. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| password | String | Yes | - | +| version | String | Yes | - | +| method | String | No | get | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### password [String] + +API key for login, you can get more detail at this link: + +https://developers.notion.com/docs/authorization + +### version [String] + +The Notion API is versioned. API versions are named for the date the version is released + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
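+
+As a minimal sketch of applying `json_field` to the Notion connector itself (the `$.results[*]` paths are an assumption based on the `content_field = "$.results.*"` usage in the Example section below, not an officially documented response layout):
+
+```hocon
+source {
+  Notion {
+    url = "https://api.notion.com/v1/users"
+    password = "SeaTunnel-test"
+    version = "2022-06-28"
+    format = "json"
+    # Each JsonPath expression picks one column out of the "results" array of the response
+    json_field = {
+      id = "$.results[*].id"
+      type = "$.results[*].type"
+      avatar_url = "$.results[*].avatar_url"
+    }
+    schema = {
+      fields {
+        id = string
+        type = string
+        avatar_url = string
+      }
+    }
+  }
+}
+```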
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Notion { + url = "https://api.notion.com/v1/users" + password = "SeaTunnel-test" + version = "2022-06-28" + content_field = "$.results.*" + schema = { + fields { + object = string + id = string + type = string + person = { + email = string + } + avatar_url = string + } + } +} +``` + +## Changelog + +### next version + +- Add Notion Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/ObsFile.md b/versioned_docs/version-2.3.7/connector-v2/source/ObsFile.md new file mode 100644 index 000000000000..b5363d77173e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/ObsFile.md @@ -0,0 +1,350 @@ +# ObsFile + +> Obs file source connector + +## Support those engines + +> Spark +> +> Flink +> +> Seatunnel Zeta + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + +## Description + +Read data from huawei cloud obs file system. + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OBS and this connector need some hadoop dependencies. +It only supports hadoop version **2.9.X+**. + +## Required Jar List + +| jar | supported versions | maven | +|--------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------| +| hadoop-huaweicloud | support version >= 3.1.1.29 | [Download](https://repo.huaweicloud.com/repository/maven/huaweicloudsdk/org/apache/hadoop/hadoop-huaweicloud/) | +| esdk-obs-java | support version >= 3.19.7.3 | [Download](https://repo.huaweicloud.com/repository/maven/huaweicloudsdk/com/huawei/storage/esdk-obs-java/) | +| okhttp | support version >= 3.11.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okhttp3/okhttp/) | +| okio | support version >= 1.14.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okio/okio/) | + +> Please download the support list corresponding to 'Maven' and copy them to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory. 
+> +> And copy all jars to $SEATNUNNEL_HOME/lib/ + +## Options + +| name | type | required | default | description | +|---------------------------|---------|----------|---------------------|--------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The target dir path | +| file_format_type | string | yes | - | File type.[Tips](#file_format_type) | +| bucket | string | yes | - | The bucket address of obs file system, for example: `obs://obs-bucket-name` | +| access_key | string | yes | - | The access key of obs file system | +| access_secret | string | yes | - | The access secret of obs file system | +| endpoint | string | yes | - | The endpoint of obs file system | +| read_columns | list | yes | - | The read column list of the data source, user can use it to implement field projection.[Tips](#read_columns) | +| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files | +| parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. [Tips](#parse_partition_from_path) | +| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. | +| date_format | string | no | yyyy-MM-dd | Date type format, used to tell the connector how to convert string to date.[Tips](#date_format) | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell the connector how to convert string to datetime.[Tips](#datetime_format) | +| time_format | string | no | HH:mm:ss | Time type format, used to tell the connector how to convert string to time.[Tips](#time_format) | +| schema | config | no | - | [Tips](#schema) | +| common-options | | no | - | [Tips](#common_options) | +| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | + +### Tips + +#### parse_partition_from_path + +> Control whether parse the partition keys and values from file path +> +> For example if you read a file from path `obs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` +> +> Every record data from the file will be added these two fields: + +| name | age | +|---------------|-----| +| tyrantlucifer | 26 | + +> Do not define partition fields in schema option + +#### date_format + +> Date type format, used to tell the connector how to convert string to date, supported as the following formats: +> +> `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` +> +> default `yyyy-MM-dd` + +### datetime_format + +> Datetime type format, used to tell the connector how to convert string to datetime, supported as the following formats: +> +> `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` +> +> default `yyyy-MM-dd HH:mm:ss` + +### time_format + +> Time type format, used to tell the connector how to convert string to time, supported as the following formats: +> +> `HH:mm:ss` `HH:mm:ss.SSS` +> +> default `HH:mm:ss` + +### skip_header_row_number + +> Skip the first few lines, but only for the txt and csv. +> +> For example, set like following: +> +> `skip_header_row_number = 2` +> +> Then Seatunnel will skip the first 2 lines from source files + +### file_format_type + +> File type, supported as the following file types: +> +> `text` `csv` `parquet` `orc` `json` `excel` +> +> If you assign file type to `json`, you should also assign schema option to tell the connector how to parse data to the row you want. 
+> +> For example,upstream data is the following: +> +> ```json +> +> ``` + +{"code": 200, "data": "get success", "success": true} + +``` + +> You can also save multiple pieces of data in one file and split them by one newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +> you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +> connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +> If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. +> +> If you assign file type to `text` `csv`, you can choose to specify the schema information or not. +> +> For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +> If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +> If you assign data schema, you should also assign the option `delimiter` too except CSV file type +> +> you should assign schema and delimiter as the following: + +```hocon + +delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +> connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +#### schema + +##### fields + +> The schema of upstream data. + +#### read_columns + +> The read column list of the data source, user can use it to implement field projection. +> +> The file type supported column projection as the following shown: + +- text +- json +- csv +- orc +- parquet +- excel + +> If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured + +#### common options + +> Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. 
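+
+Tying the `read_columns` and `schema` notes above together, a minimal sketch of a text-file source with field projection could look like the following (the bucket, keys and endpoint are placeholder values):
+
+```hocon
+ObsFile {
+  path = "/seatunnel/text"
+  bucket = "obs://obs-bucket-name"
+  access_key = "xxxxxxxxxxxxxxxxx"
+  access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+  endpoint = "obs.xxxxxx.myhuaweicloud.com"
+  file_format_type = "text"
+  # For text files the schema only takes effect together with the delimiter
+  delimiter = "#"
+  schema {
+    fields {
+      name = string
+      age = int
+      gender = string
+    }
+  }
+  # Only these two columns are emitted downstream
+  read_columns = ["name", "age"]
+}
+```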
+ +## Task Example + +### text file + +> For text file format simple config + +```hocon + + ObsFile { + path = "/seatunnel/text" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "text" + } + +``` + +### parquet file + +> For parquet file format simple config + +```hocon + + ObsFile { + path = "/seatunnel/parquet" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "parquet" + } + +``` + +### orc file + +> For orc file format simple config + +```hocon + + ObsFile { + path = "/seatunnel/orc" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "orc" + } + +``` + +### json file + +> For json file format simple config + +```hocon + + ObsFile { + path = "/seatunnel/json" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "json" + } + +``` + +### excel file + +> For excel file format simple config + +```hocon + + ObsFile { + path = "/seatunnel/excel" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "excel" + } + +``` + +### csv file + +> For csv file format simple config + +```hocon + + ObsFile { + path = "/seatunnel/csv" + bucket = "obs://obs-bucket-name" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "obs.xxxxxx.myhuaweicloud.com" + file_format_type = "csv" + delimiter = "," + } + +``` + +## Changelog + +### next version + +- Add Obs File Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/OceanBase.md b/versioned_docs/version-2.3.7/connector-v2/source/OceanBase.md new file mode 100644 index 000000000000..66a79f222a75 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/OceanBase.md @@ -0,0 +1,180 @@ +# OceanBase + +> JDBC OceanBase Source Connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|--------------------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| +| OceanBase | All OceanBase server versions. | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
    +> For example: cp oceanbase-client-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +### Mysql Mode + +| Mysql Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
    TINYINT(1) | BOOLEAN | +| TINYINT | BYTE | +| TINYINT
    TINYINT UNSIGNED | SMALLINT | +| SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | +| INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the decimal point)) | +| FLOAT
    FLOAT UNSIGNED | FLOAT | +| DOUBLE
    DOUBLE UNSIGNED | DOUBLE | +| CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON
    ENUM | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
    TIMESTAMP | TIMESTAMP | +| TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
VARBINARY
    BIT(n)
    GEOMETRY | BYTES | + +### Oracle Mode + +| Oracle Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------------|---------------------| +| Integer | DECIMAL(38,0) | +| Number(p), p <= 9 | INT | +| Number(p), p <= 18 | BIGINT | +| Number(p), p > 18 | DECIMAL(38,18) | +| Number(p,s) | DECIMAL(p,s) | +| Float | DECIMAL(38,18) | +| REAL
    BINARY_FLOAT | FLOAT | +| BINARY_DOUBLE | DOUBLE | +| CHAR
    NCHAR
    VARCHAR
    VARCHAR2
    NVARCHAR2
    NCLOB
    CLOB
    LONG
    XML
    ROWID | STRING | +| DATE | TIMESTAMP | +| TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
    RAW
    LONG RAW
    BFILE | BYTES | +| UNKNOWN | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. | +| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
    the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +``` +env { + parallelism = 2 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = "com.oceanbase.jdbc.Driver" + url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + user = "root" + password = "" + compatible_mode = "mysql" + query = "select * from source" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table + +``` +env { + parallelism = 10 + job.mode = "BATCH" +} +source { + Jdbc { + driver = "com.oceanbase.jdbc.Driver" + url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + user = "root" + password = "" + compatible_mode = "mysql" + query = "select * from source" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +sink { + Console {} +} +``` + +### Parallel Boundary: + +> It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + driver = "com.oceanbase.jdbc.Driver" + url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + user = "root" + password = "" + compatible_mode = "mysql" + query = "select * from source" + partition_column = "id" + partition_num = 10 + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/OneSignal.md b/versioned_docs/version-2.3.7/connector-v2/source/OneSignal.md new file mode 100644 index 000000000000..9fb6d65379be --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/OneSignal.md @@ -0,0 +1,327 @@ +# OneSignal + +> OneSignal source connector + +## Description + +Used to read data from OneSignal. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| password | String | Yes | - | +| method | String | No | get | +| schema | Config | No | - | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### password [String] + +Auth key for login, you can get more detail at this link: + +https://documentation.onesignal.com/docs/accounts-and-keys#user-auth-key + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://mockserver:1080/contentjson/mock" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://mockserver:1080/jsonpath/mock" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon + +OneSignal { + url = "https://onesignal.com/api/v1/apps" + password = "SeaTunnel-test" + schema = { + fields { + id = string + name = string + gcm_key = string + chrome_key = string + chrome_web_key = string + chrome_web_origin = string + chrome_web_gcm_sender_id = string + chrome_web_default_notification_icon = string + chrome_web_sub_domain = string + apns_env = string + apns_certificates = string + apns_p8 = string + apns_team_id = string + apns_key_id = string + apns_bundle_id = string + safari_apns_certificate = string + safari_site_origin = string + safari_push_id = string + safari_icon_16_16 = string + safari_icon_32_32 = string + safari_icon_64_64 = string + safari_icon_128_128 = string + safari_icon_256_256 = string + site_name = string + created_at = string + updated_at = string + players = int + messageable_players = int + basic_auth_key = string + additional_data_is_root_payload = string + } + } +} +``` + +## Changelog + +### next version + +- Add OneSignal Source Connector +- [Feature][Connector-V2][HTTP] Use json-path parsing ([3510](https://github.com/apache/seatunnel/pull/3510)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/OpenMldb.md b/versioned_docs/version-2.3.7/connector-v2/source/OpenMldb.md new file mode 100644 index 000000000000..a3eb291762dd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/OpenMldb.md @@ -0,0 +1,86 @@ +# OpenMldb + +> OpenMldb source connector + +## Description + +Used to read data from OpenMldb. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------|---------|----------|---------------| +| cluster_mode | boolean | yes | - | +| sql | string | yes | - | +| database | string | yes | - | +| host | string | no | - | +| port | int | no | - | +| zk_path | string | no | - | +| zk_host | string | no | - | +| session_timeout | int | no | 10000 | +| request_timeout | int | no | 60000 | +| common-options | | no | - | + +### cluster_mode [string] + +OpenMldb is or not cluster mode + +### sql [string] + +Sql statement + +### database [string] + +Database name + +### host [string] + +OpenMldb host, only supported on OpenMldb single mode + +### port [int] + +OpenMldb port, only supported on OpenMldb single mode + +### zk_host [string] + +Zookeeper host, only supported on OpenMldb cluster mode + +### zk_path [string] + +Zookeeper path, only supported on OpenMldb cluster mode + +### session_timeout [int] + +OpenMldb session timeout(ms), default 60000 + +### request_timeout [int] + +OpenMldb request timeout(ms), default 10000 + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon + + OpenMldb { + host = "172.17.0.2" + port = 6527 + sql = "select * from demo_table1" + database = "demo_db" + cluster_mode = false + } + +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Oracle-CDC.md b/versioned_docs/version-2.3.7/connector-v2/source/Oracle-CDC.md 
new file mode 100644 index 000000000000..5d22aa1c4ae7 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Oracle-CDC.md @@ -0,0 +1,349 @@ +# Oracle CDC + +> Oracle CDC source connector + +## Support Those Engines + +> SeaTunnel Zeta
    +> Flink
    + +## Key features + +- [ ] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +The Oracle CDC connector allows for reading snapshot data and incremental data from Oracle database. This document +describes how to set up the Oracle CDC connector to run SQL queries against Oracle databases. + +## Notice + +The Debezium Oracle connector does not rely on the continuous mining option. The connector is responsible for detecting log switches and adjusting the logs that are mined automatically, which the continuous mining option did for you automatically. +So, you can not set this property named `log.mining.continuous.mine` in the debezium. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| +| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | + +## Database Dependency + +### Install Jdbc Driver + +#### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. +> 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATNUNNEL_HOME/plugins/` directory. + +#### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/lib/`. +> 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATNUNNEL_HOME/lib/` directory. + +### Enable Oracle Logminer + +> To enable Oracle CDC (Change Data Capture) using Logminer in Seatunnel, which is a built-in tool provided by Oracle, follow the steps below: + +#### Enabling Logminer without CDB (Container Database) mode. + +1. The operating system creates an empty file directory to store Oracle archived logs and user tablespaces. + +```shell +mkdir -p /opt/oracle/oradata/recovery_area +mkdir -p /opt/oracle/oradata/ORCLCDB +chown -R oracle /opt/oracle/*** +``` + +2. Login as admin and enable Oracle archived logs. + +```sql +sqlplus /nolog; +connect sys as sysdba; +alter system set db_recovery_file_dest_size = 10G; +alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; +shutdown immediate; +startup mount; +alter database archivelog; +alter database open; +ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; +archive log list; +``` + +3. Login as admin and create an account called logminer_user with the password "oracle", and grant it privileges to read tables and logs. 
+ +```sql +CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; +CREATE USER logminer_user IDENTIFIED BY oracle DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs; + +GRANT CREATE SESSION TO logminer_user; +GRANT SELECT ON V_$DATABASE to logminer_user; +GRANT SELECT ON V_$LOG TO logminer_user; +GRANT SELECT ON V_$LOGFILE TO logminer_user; +GRANT SELECT ON V_$LOGMNR_LOGS TO logminer_user; +GRANT SELECT ON V_$LOGMNR_CONTENTS TO logminer_user; +GRANT SELECT ON V_$ARCHIVED_LOG TO logminer_user; +GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO logminer_user; +GRANT EXECUTE ON DBMS_LOGMNR TO logminer_user; +GRANT EXECUTE ON DBMS_LOGMNR_D TO logminer_user; +GRANT SELECT ANY TRANSACTION TO logminer_user; +GRANT SELECT ON V_$TRANSACTION TO logminer_user; +``` + +##### Oracle 11g is not supported + +```sql +GRANT LOGMINING TO logminer_user; +``` + +##### Grant privileges only to the tables that need to be collected + +```sql +GRANT SELECT ANY TABLE TO logminer_user; +GRANT ANALYZE ANY TO logminer_user; +``` + +#### To enable Logminer in Oracle with CDB (Container Database) + PDB (Pluggable Database) mode, follow the steps below: + +1. The operating system creates an empty file directory to store Oracle archived logs and user tablespaces. + +```shell +mkdir -p /opt/oracle/oradata/recovery_area +mkdir -p /opt/oracle/oradata/ORCLCDB +mkdir -p /opt/oracle/oradata/ORCLCDB/ORCLPDB1 +chown -R oracle /opt/oracle/*** +``` + +2. Login as admin and enable logging + +```sql +sqlplus /nolog +connect sys as sysdba; # Password: oracle +alter system set db_recovery_file_dest_size = 10G; +alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; +shutdown immediate +startup mount +alter database archivelog; +alter database open; +archive log list; +``` + +3. Executing in CDB + +```sql +ALTER TABLE TEST.* ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; +ALTER TABLE TEST.T2 ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; +``` + +4. Creating debeziume account + +> Operating in CDB + +```sql +sqlplus sys/top_secret@//localhost:1521/ORCLCDB as sysdba +CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' + SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; +exit; +``` + +> Operating in PDB + +```sql +sqlplus sys/top_secret@//localhost:1521/ORCLPDB1 as sysdba + CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/ORCLPDB1/logminer_tbs.dbf' + SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; + exit; +``` + +5. 
Operating in CDB + +```sql +sqlplus sys/top_secret@//localhost:1521/ORCLCDB as sysdba + +CREATE USER c##dbzuser IDENTIFIED BY dbz +DEFAULT TABLESPACE logminer_tbs +QUOTA UNLIMITED ON logminer_tbs +CONTAINER=ALL; + +GRANT CREATE SESSION TO c##dbzuser CONTAINER=ALL; +GRANT SET CONTAINER TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$DATABASE to c##dbzuser CONTAINER=ALL; +GRANT FLASHBACK ANY TABLE TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ANY TABLE TO c##dbzuser CONTAINER=ALL; +GRANT SELECT_CATALOG_ROLE TO c##dbzuser CONTAINER=ALL; +GRANT EXECUTE_CATALOG_ROLE TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ANY TRANSACTION TO c##dbzuser CONTAINER=ALL; +GRANT LOGMINING TO c##dbzuser CONTAINER=ALL; + +GRANT CREATE TABLE TO c##dbzuser CONTAINER=ALL; +GRANT LOCK ANY TABLE TO c##dbzuser CONTAINER=ALL; +GRANT CREATE SEQUENCE TO c##dbzuser CONTAINER=ALL; + +GRANT EXECUTE ON DBMS_LOGMNR TO c##dbzuser CONTAINER=ALL; +GRANT EXECUTE ON DBMS_LOGMNR_D TO c##dbzuser CONTAINER=ALL; + +GRANT SELECT ON V_$LOG TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$LOG_HISTORY TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$LOGMNR_LOGS TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$LOGMNR_CONTENTS TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$LOGMNR_PARAMETERS TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$LOGFILE TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$ARCHIVED_LOG TO c##dbzuser CONTAINER=ALL; +GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO c##dbzuser CONTAINER=ALL; +GRANT analyze any TO debeziume_1 CONTAINER=ALL; + +exit; +``` + +## Data Type Mapping + +| Oracle Data type | SeaTunnel Data type | +|--------------------------------------------------------------------------------------|---------------------| +| INTEGER | INT | +| FLOAT | DECIMAL(38, 18) | +| NUMBER(precision <= 9, scale == 0) | INT | +| NUMBER(9 < precision <= 18, scale == 0) | BIGINT | +| NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | +| NUMBER(precision == 0, scale == 0) | DECIMAL(38, 18) | +| NUMBER(scale != 0) | DECIMAL(38, 18) | +| BINARY_DOUBLE | DOUBLE | +| BINARY_FLOAT
    REAL | FLOAT | +| CHAR
    NCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    | STRING | +| DATE | DATE | +| TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
    RAW
    LONG RAW
    BFILE | BYTES | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `idbc:oracle:thin:datasource01:1523:xe`. | +| username | String | Yes | - | Name of the database to use when connecting to the database server. | +| password | String | Yes | - | Password to use when connecting to the database server. | +| database-names | List | No | - | Database name of the database to monitor. | +| schema-names | List | No | - | Schema name of the database to monitor. | +| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | +| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] | +| startup.mode | Enum | No | INITIAL | Optional startup mode for Oracle CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset.
    `specific`: Startup from user-supplied specific offsets. | +| startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** | +| startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** | +| stop.mode | Enum | No | NEVER | Optional stop mode for Oracle CDC consumer, valid enumerations are `never`, `latest` or `specific`.
    `never`: Real-time job don't stop the source.
    `latest`: Stop from the latest offset.
    `specific`: Stop from user-supplied specific offset. | +| stop.specific-offset.file | String | No | - | Stop from the specified binlog file name. **Note, This option is required when the `stop.mode` option used `specific`.** | +| stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position. **Note, This option is required when the `stop.mode` option used `specific`.** | +| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | +| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | +| server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | +| connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | +| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | +| connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | +| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. 
It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| exactly_once | Boolean | No | false | Enable exactly once semantic. | +| use_select_count | Boolean | No | false | Use select count for table count rather then other methods in full stage.In this scenario, select count directly is used when it is faster to update statistics using sql from analysis table | +| skip_analyze | Boolean | No | false | Skip the analysis of table count in full stage.In this scenario, you schedule analysis table sql to update related table statistics periodically or your table data does not change frequently | +| format | Enum | No | DEFAULT | Optional output format for Oracle CDC, valid enumerations are `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`. | +| debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#connector-properties) to Debezium Embedded Engine which is used to capture data changes from Oracle server. | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example + +### Simple + +> Support multi-table reading + +```conf +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + Oracle-CDC { + result_table_name = "customers" + username = "system" + password = "oracle" + database-names = ["XE"] + schema-names = ["DEBEZIUM"] + table-names = ["XE.DEBEZIUM.FULL_TYPES"] + base-url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" + source.reader.close.timeout = 120000 + } +} +``` + +> Use the select count(*) instead of analysis table for count table rows in full stage +> +> ```conf +> source { +> # This is a example source plugin **only for test and demonstrate the feature source plugin** +> Oracle-CDC { +> result_table_name = "customers" +> use_select_count = true +> username = "system" +> password = "oracle" +> database-names = ["XE"] +> schema-names = ["DEBEZIUM"] +> table-names = ["XE.DEBEZIUM.FULL_TYPES"] +> base-url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" +> source.reader.close.timeout = 120000 +> } +> } +> ``` +> +> Use the select NUM_ROWS from all_tables for the table rows but skip the analyze table. 
+> +> ```conf +> source { +> # This is a example source plugin **only for test and demonstrate the feature source plugin** +> Oracle-CDC { +> result_table_name = "customers" +> skip_analyze = true +> username = "system" +> password = "oracle" +> database-names = ["XE"] +> schema-names = ["DEBEZIUM"] +> table-names = ["XE.DEBEZIUM.FULL_TYPES"] +> base-url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" +> source.reader.close.timeout = 120000 +> } +> } +> ``` + +### Support custom primary key for table + +``` + +source { + Oracle-CDC { + result_table_name = "customers" + base-url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" + source.reader.close.timeout = 120000 + username = "system" + password = "oracle" + database-names = ["XE"] + schema-names = ["DEBEZIUM"] + table-names = ["XE.DEBEZIUM.FULL_TYPES"] + table-names-config = [ + { + table = "XE.DEBEZIUM.FULL_TYPES" + primaryKeys = ["ID"] + } + ] + } +} + +``` + +### Support debezium-compatible format send to kafka + +> Must be used with kafka connector sink, see [compatible debezium format](../formats/cdc-compatible-debezium-json.md) for details + +## Changelog + +- Add Oracle CDC Source Connector + +### next version + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Oracle.md b/versioned_docs/version-2.3.7/connector-v2/source/Oracle.md new file mode 100644 index 000000000000..4309f2af08b8 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Oracle.md @@ -0,0 +1,324 @@ +# Oracle + +> JDBC Oracle Source Connector + +## Description + +Read external data source data through JDBC. + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. +> 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATNUNNEL_HOME/plugins/` directory. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/lib/`. +> 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATNUNNEL_HOME/lib/` directory. + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| +| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example, for the Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATUNNEL_HOME/lib/
+> To support the i18n character set, copy the orai18n.jar to the $SEATUNNEL_HOME/lib/ directory. + +## Data Type Mapping + +| Oracle Data Type | SeaTunnel Data Type | +|----------------------------------------------------------------------------------------------------------|---------------------| +| INTEGER | DECIMAL(38,0) | +| FLOAT | DECIMAL(38, 18) | +| NUMBER(precision <= 9, scale == 0) | INT | +| NUMBER(9 < precision <= 18, scale == 0) | BIGINT | +| NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | +| NUMBER(scale != 0) | DECIMAL(38, 18) | +| BINARY_DOUBLE | DOUBLE | +| BINARY_FLOAT
    REAL | FLOAT | +| CHAR
    NCHAR
    VARCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    XML
    | STRING | +| DATE | TIMESTAMP | +| TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
    RAW
    LONG RAW
    BFILE | BYTES | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
e.g. for Oracle the value is `oracle.jdbc.OracleDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallel partitioning. Only numeric-type columns are supported, and only one column can be configured. | +| partition_lower_bound | BigDecimal | No | - | The minimum value of partition_column for the scan. If not set, SeaTunnel will query the database to get the minimum value. | +| partition_upper_bound | BigDecimal | No | - | The maximum value of partition_column for the scan. If not set, SeaTunnel will query the database to get the maximum value. | +| partition_num | Int | No | job parallelism | The number of partitions; only positive integers are supported. The default value is the job parallelism. | +| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use the jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters. When properties and the URL contain the same parameter, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | + +| Name | Type | Required | Default | Description | +|--------------------------------------------|------------|----------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. For example: jdbc:oracle:thin:@datasource01:1523:xe | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
e.g. for Oracle the value is `oracle.jdbc.OracleDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallel partitioning. Only numeric-type columns are supported, and only one column can be configured. | +| partition_lower_bound | BigDecimal | No | - | The minimum value of partition_column for the scan. If not set, SeaTunnel will query the database to get the minimum value. | +| partition_upper_bound | BigDecimal | No | - | The maximum value of partition_column for the scan. If not set, SeaTunnel will query the database to get the maximum value. | +| partition_num | Int | No | job parallelism | The number of partitions; only positive integers are supported. The default value is the job parallelism. | +| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use the jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters. When properties and the URL contain the same parameter, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| table_path | String | No | - | The full path of the table; you can use this configuration instead of `query`.
    examples:
    mysql: "testdb.table1"
    oracle: "test_schema.table1"
    sqlserver: "testdb.test_schema.table1"
    postgresql: "testdb.test_schema.table1" | +| table_list | Array | No | 0 | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | +| where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | +| split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | +| split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Parallel Reader + +The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. + +**Split Key Rules:** + +1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. 
+2. If `partition_column` is null, seatunnel will read the schema from table and get the Primary Key and Unique Index. If there are more than one column in Primary Key and Unique Index, The first column which in the **supported split data type** will be used to split data. For example, the table have Primary Key(nn guid, name varchar), because `guid` id not in **supported split data type**, so the column `name` will be used to split data. + +**Supported split data type:** +* String +* Number(int, bigint, decimal, ...) +* Date + +### Options Related To Split + +#### split.size + +How many rows in one split, captured tables are split into multiple splits when read of table. + +#### split.even-distribution.factor.lower-bound + +> Not recommended for use + +The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. + +#### split.even-distribution.factor.upper-bound + +> Not recommended for use + +The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. + +#### split.sample-sharding.threshold + +This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. + +#### split.inverse-sampling.rate + +The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. + +#### partition_column [string] + +The column name for split data. + +#### partition_upper_bound [BigDecimal] + +The partition_column max value for scan, if not set SeaTunnel will query database get max value. + +#### partition_lower_bound [BigDecimal] + +The partition_column min value for scan, if not set SeaTunnel will query database get min value. 
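+
+As a quick, minimal sketch of how `partition_column`, `partition_lower_bound` and `partition_upper_bound` fit together (the connection values and bounds below are only placeholders; complete runnable jobs are shown under Task Example below):
+
+```hocon
+source {
+  Jdbc {
+    url = "jdbc:oracle:thin:@datasource01:1523:xe"
+    driver = "oracle.jdbc.OracleDriver"
+    user = "root"
+    password = "123456"
+    query = "SELECT * FROM TEST_TABLE"
+    # Split on the numeric ID column and scan only rows with 1 <= ID <= 500
+    partition_column = "ID"
+    partition_lower_bound = 1
+    partition_upper_bound = 500
+  }
+}
+```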
+ +#### partition_num [int] + +> Not recommended for use, The correct approach is to control the number of split through `split.size` + +How many splits do we need to split into, only support positive integer. default value is job parallelism. + +## tips + +> If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. +> +> Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + parallelism = 4 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = "root" + password = "123456" + query = "SELECT * FROM TEST_TABLE" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### parallel by partition_column + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "SELECT * FROM TEST_TABLE" + # Parallel sharding reads fields + partition_column = "ID" + # Number of fragments + partition_num = 10 + properties { + database.oracle.jdbc.timezoneAsRegion = "false" + } + } +} +sink { + Console {} +} +``` + +### parallel by Primary Key or Unique Index + +> Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + table_path = "DA.SCHEMA1.TABLE1" + query = "select * from SCHEMA1.TABLE1" + split.size = 10000 + } +} + +sink { + Console {} +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "SELECT * FROM TEST_TABLE" + partition_column = "ID" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + +### Multiple table read: + +***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** + +```hocon +env { + job.mode = "BATCH" + parallelism = 4 +} +source { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + connection_check_timeout_sec = 100 + user = "root" + password = 
"123456" + "table_list"=[ + { + "table_path"="XE.TEST.USER_INFO" + }, + { + "table_path"="XE.TEST.YOURTABLENAME" + } + ] + #where_condition= "where id > 100" + split.size = 10000 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 + } +} + +sink { + Console {} +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/OssFile.md b/versioned_docs/version-2.3.7/connector-v2/source/OssFile.md new file mode 100644 index 000000000000..3f781eb11a91 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/OssFile.md @@ -0,0 +1,492 @@ +# OssFile + +> Oss file source connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Usage Dependency + +### For Spark/Flink Engine + +1. You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. +2. You must ensure `hadoop-aliyun-xx.jar`, `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` in `${SEATUNNEL_HOME}/plugins/` dir and the version of `hadoop-aliyun` jar need equals your hadoop version which used in spark/flink and `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` version needs to be the version corresponding to the `hadoop-aliyun` version. Eg: `hadoop-aliyun-3.1.4.jar` dependency `aliyun-sdk-oss-3.4.1.jar` and `jdom-1.1.jar`. + +### For SeaTunnel Zeta Engine + +1. You must ensure `seatunnel-hadoop3-3.1.4-uber.jar`, `aliyun-sdk-oss-3.4.1.jar`, `hadoop-aliyun-3.1.4.jar` and `jdom-1.1.jar` in `${SEATUNNEL_HOME}/lib/` dir. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Data Type Mapping + +Data type mapping is related to the type of file being read, We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` + +### JSON File Type + +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. + +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +### Text Or CSV File Type + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. + +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +### Orc File Type + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. 
+ +| Orc Data type | SeaTunnel Data type | +|----------------------------------|----------------------------------------------------------------| +| BOOLEAN | BOOLEAN | +| INT | INT | +| BYTE | BYTE | +| SHORT | SHORT | +| LONG | LONG | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BINARY | BINARY | +| STRING
    VARCHAR
    CHAR
    | STRING | +| DATE | LOCAL_DATE_TYPE | +| TIMESTAMP | LOCAL_DATE_TIME_TYPE | +| DECIMAL | DECIMAL | +| LIST(STRING) | STRING_ARRAY_TYPE | +| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | +| LIST(TINYINT) | BYTE_ARRAY_TYPE | +| LIST(SMALLINT) | SHORT_ARRAY_TYPE | +| LIST(INT) | INT_ARRAY_TYPE | +| LIST(BIGINT) | LONG_ARRAY_TYPE | +| LIST(FLOAT) | FLOAT_ARRAY_TYPE | +| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | +| Map | MapType, This type of K and V will transform to SeaTunnel type | +| STRUCT | SeaTunnelRowType | + +### Parquet File Type + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +| Orc Data type | SeaTunnel Data type | +|----------------------|----------------------------------------------------------------| +| INT_8 | BYTE | +| INT_16 | SHORT | +| DATE | DATE | +| TIMESTAMP_MILLIS | TIMESTAMP | +| INT64 | LONG | +| INT96 | TIMESTAMP | +| BINARY | BYTES | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BOOLEAN | BOOLEAN | +| FIXED_LEN_BYTE_ARRAY | TIMESTAMP
    DECIMAL | +| DECIMAL | DECIMAL | +| LIST(STRING) | STRING_ARRAY_TYPE | +| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | +| LIST(TINYINT) | BYTE_ARRAY_TYPE | +| LIST(SMALLINT) | SHORT_ARRAY_TYPE | +| LIST(INT) | INT_ARRAY_TYPE | +| LIST(BIGINT) | LONG_ARRAY_TYPE | +| LIST(FLOAT) | FLOAT_ARRAY_TYPE | +| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | +| Map | MapType, This type of K and V will transform to SeaTunnel type | +| STRUCT | SeaTunnelRowType | + +## Options + +| name | type | required | default value | Description | +|---------------------------|---------|----------|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The Oss path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option | +| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` | +| bucket | string | yes | - | The bucket address of oss file system, for example: `oss://seatunnel-test`. | +| endpoint | string | yes | - | fs oss endpoint | +| read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. The file type supported column projection as the following shown: `text` `csv` `parquet` `orc` `json` `excel` `xml` . If the user wants to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured. | +| access_key | string | no | - | | +| access_secret | string | no | - | | +| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. | +| parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. For example if you read a file from path `oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. Every record data from file will be added these two fields: name="tyrantlucifer", age=16 | +| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | +| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` | +| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files | +| schema | config | no | - | The schema of upstream data. | +| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. 
| +| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | +| compress_codec | string | no | none | Which compress codec the files used. | +| encoding | string | no | UTF-8 | +| file_filter_pattern | string | no | | `*.txt` means you only need read the files end with `.txt` | +| common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. + +### schema [config] + +Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). + +#### fields [Config] + +The schema of upstream data. + +## How to Create a Oss Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that reads data from Oss and prints it on the local client: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to connect to Oss +source { + OssFile { + path = "/seatunnel/orc" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "orc" + } +} + +# Console printing of the read Oss data +sink { + Console { + } +} +``` + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to connect to Oss +source { + OssFile { + path = "/seatunnel/json" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "json" + schema { + fields { + id = int + name = string + } + } + } +} + +# Console printing of the read Oss data +sink { + Console { + } +} +``` + +### Multiple Table + +No need to config schema file type, eg: `orc`. 
+ +``` +env { + parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + OssFile { + tables_configs = [ + { + schema = { + table = "fake01" + } + bucket = "oss://whale-ops" + access_key = "xxxxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxx" + endpoint = "https://oss-accelerate.aliyuncs.com" + path = "/test/seatunnel/read/orc" + file_format_type = "orc" + }, + { + schema = { + table = "fake02" + } + bucket = "oss://whale-ops" + access_key = "xxxxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxx" + endpoint = "https://oss-accelerate.aliyuncs.com" + path = "/test/seatunnel/read/orc" + file_format_type = "orc" + } + ] + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + table-names = ["fake01", "fake02"] + } + } +} +``` + +Need config schema file type, eg: `json` + +``` + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + OssFile { + tables_configs = [ + { + bucket = "oss://whale-ops" + access_key = "xxxxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxx" + endpoint = "https://oss-accelerate.aliyuncs.com" + path = "/test/seatunnel/read/json" + file_format_type = "json" + schema = { + table = "fake01" + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + C_MAP = "map" + C_ARRAY = "array" + C_STRING = string + C_BOOLEAN = boolean + C_TINYINT = tinyint + C_SMALLINT = smallint + C_INT = int + C_BIGINT = bigint + C_FLOAT = float + C_DOUBLE = double + C_BYTES = bytes + C_DATE = date + C_DECIMAL = "decimal(38, 18)" + C_TIMESTAMP = timestamp + } + } + } + }, + { + bucket = "oss://whale-ops" + access_key = "xxxxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxx" + endpoint = "https://oss-accelerate.aliyuncs.com" + path = "/test/seatunnel/read/json" + file_format_type = "json" + schema = { + table = "fake02" + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + C_MAP = "map" + C_ARRAY = "array" + C_STRING = string + C_BOOLEAN = boolean + C_TINYINT = tinyint + C_SMALLINT = smallint + C_INT = int + C_BIGINT = bigint + C_FLOAT = float + C_DOUBLE = double + C_BYTES = bytes + C_DATE = date + C_DECIMAL = "decimal(38, 18)" + C_TIMESTAMP = timestamp + } + } + } + } + ] + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + table-names = ["fake01", "fake02"] + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add OSS File Source Connector + +### 2.3.0-beta 2022-10-20 + +- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) +- [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085)) +- [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985)) + +### Tips + +> 1.[SeaTunnel 
Deployment Document](../../start-v2/locally/deployment.md). + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/OssJindoFile.md b/versioned_docs/version-2.3.7/connector-v2/source/OssJindoFile.md new file mode 100644 index 000000000000..f24ea83f0ccd --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/OssJindoFile.md @@ -0,0 +1,360 @@ +# OssJindoFile + +> OssJindo file source connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from aliyun oss file system using jindo api. + +:::tip + +You need to download [jindosdk-4.6.1.tar.gz](https://jindodata-binary.oss-cn-shanghai.aliyuncs.com/release/4.6.1/jindosdk-4.6.1.tar.gz) +and then unzip it, copy jindo-sdk-4.6.1.jar and jindo-core-4.6.1.jar from lib to ${SEATUNNEL_HOME}/lib. + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OSS and this connector need some hadoop dependencies. +It only supports hadoop version **2.9.X+**. + +::: + +## Options + +| name | type | required | default value | +|---------------------------|---------|----------|---------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| bucket | string | yes | - | +| access_key | string | yes | - | +| access_secret | string | yes | - | +| endpoint | string | yes | - | +| read_columns | list | no | - | +| delimiter/field_delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| skip_header_row_number | long | no | 0 | +| schema | config | no | - | +| sheet_name | string | no | - | +| xml_row_tag | string | no | - | +| xml_use_attr_format | boolean | no | - | +| file_filter_pattern | string | no | - | +| compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | + +### path [string] + +The source file path. + +### file_format_type [string] + +File type, supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` + +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. 
+ +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. + +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +If you assign file type to `binary`, SeaTunnel can synchronize files in any format, +such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. +Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization +at the same time. You can find the specific usage in the example below. + +### bucket [string] + +The bucket address of oss file system, for example: `oss://tyrantlucifer-image-bed` + +### access_key [string] + +The access key of oss file system. + +### access_secret [string] + +The access secret of oss file system. + +### endpoint [string] + +The endpoint of oss file system. + +### read_columns [list] + +The read column list of the data source, user can use it to implement field projection. + +### delimiter/field_delimiter [string] + +**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. + +Only need to be configured when file_format is text. + +Field delimiter, used to tell connector how to slice and dice fields. 
+ +default `\001`, the same as hive's default delimiter + +### parse_partition_from_path [boolean] + +Control whether parse the partition keys and values from file path + +For example if you read a file from path `oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` + +Every record data from file will be added these two fields: + +| name | age | +|---------------|-----| +| tyrantlucifer | 26 | + +Tips: **Do not define partition fields in schema option** + +### date_format [string] + +Date type format, used to tell connector how to convert string to date, supported as the following formats: + +`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` + +default `yyyy-MM-dd` + +### datetime_format [string] + +Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: + +`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` + +default `yyyy-MM-dd HH:mm:ss` + +### time_format [string] + +Time type format, used to tell connector how to convert string to time, supported as the following formats: + +`HH:mm:ss` `HH:mm:ss.SSS` + +default `HH:mm:ss` + +### skip_header_row_number [long] + +Skip the first few lines, but only for the txt and csv. + +For example, set like following: + +`skip_header_row_number = 2` + +then SeaTunnel will skip the first 2 lines from source files + +### schema [config] + +Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). + +#### fields [Config] + +The schema of upstream data. + +### sheet_name [string] + +Only need to be configured when file_format is excel. + +Reader the sheet of the workbook. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +## Example + +```hocon + +OssJindoFile { + path = "/seatunnel/orc" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "orc" + } + +``` + +```hocon + +OssJindoFile { + path = "/seatunnel/json" + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + file_format_type = "json" + schema { + fields { + id = int + name = string + } + } + } + +``` + +### Transfer Binary File + +```hocon + +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + OssJindoFile { + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + path = "/seatunnel/read/binary/" + file_format_type = "binary" + } +} +sink { + // you can transfer local file to s3/hdfs/oss etc. 
+ OssJindoFile { + bucket = "oss://tyrantlucifer-image-bed" + access_key = "xxxxxxxxxxxxxxxxx" + access_secret = "xxxxxxxxxxxxxxxxxxxxxx" + endpoint = "oss-cn-beijing.aliyuncs.com" + path = "/seatunnel/read/binary2/" + file_format_type = "binary" + } +} + +``` + +## Changelog + +### next version + +- Add OSS Jindo File Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Paimon.md b/versioned_docs/version-2.3.7/connector-v2/source/Paimon.md new file mode 100644 index 000000000000..32155abde026 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Paimon.md @@ -0,0 +1,165 @@ +# Paimon + +> Paimon source connector + +## Description + +Read data from Apache Paimon. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------------|--------|----------|---------------| +| warehouse | String | Yes | - | +| catalog_type | String | No | filesystem | +| catalog_uri | String | No | - | +| database | String | Yes | - | +| table | String | Yes | - | +| hdfs_site_path | String | No | - | +| query | String | No | - | +| paimon.hadoop.conf | Map | No | - | +| paimon.hadoop.conf-path | String | No | - | + +### warehouse [string] + +Paimon warehouse path + +### catalog_type [string] + +Catalog type of Paimon, support filesystem and hive + +### catalog_uri [string] + +Catalog uri of Paimon, only needed when catalog_type is hive + +### database [string] + +The database you want to access + +### table [string] + +The table you want to access + +### hdfs_site_path [string] + +The file path of `hdfs-site.xml` + +### query [string] + +The filter condition of the table read. For example: `select * from st_test where id > 100`. If not specified, all rows are read. +Currently, where conditions only support <, <=, >, >=, =, !=, or, and,is null, is not null, and others are not supported. +The Having, Group By, Order By clauses are currently unsupported, because these clauses are not supported by Paimon. +The projection and limit will be supported in the future. + +Note: When the field after the where condition is a string or boolean value, its value must be enclosed in single quotes, otherwise an error will be reported. 
`For example: name='abc' or tag='true'` +The field data types currently supported by where conditions are as follows: + +* string +* boolean +* tinyint +* smallint +* int +* bigint +* float +* double +* date +* timestamp + +### paimon.hadoop.conf [string] + +Properties in hadoop conf + +### paimon.hadoop.conf-path [string] + +The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files + +## Examples + +### Simple example + +```hocon +source { + Paimon { + warehouse = "/tmp/paimon" + database = "default" + table = "st_test" + } +} +``` + +### Filter example + +```hocon +source { + Paimon { + warehouse = "/tmp/paimon" + database = "full_type" + table = "st_test" + query = "select c_boolean, c_tinyint from st_test where c_boolean= 'true' and c_tinyint > 116 and c_smallint = 15987 or c_decimal='2924137191386439303744.39292213'" + } +} +``` + +### Hadoop conf example + +```hocon +source { + Paimon { + catalog_name="seatunnel_test" + warehouse="hdfs:///tmp/paimon" + database="seatunnel_namespace1" + table="st_test" + query = "select * from st_test where pk_id is not null and pk_id < 3" + paimon.hadoop.conf = { + fs.defaultFS = "hdfs://nameservice1" + dfs.nameservices = "nameservice1" + dfs.ha.namenodes.nameservice1 = "nn1,nn2" + dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" + dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" + dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + dfs.client.use.datanode.hostname = "true" + } + } +} +``` + +### Hive catalog example + +```hocon +source { + Paimon { + catalog_name="seatunnel_test" + catalog_type="hive" + catalog_uri="thrift://hadoop04:9083" + warehouse="hdfs:///tmp/seatunnel" + database="seatunnel_test" + table="st_test3" + paimon.hadoop.conf = { + fs.defaultFS = "hdfs://nameservice1" + dfs.nameservices = "nameservice1" + dfs.ha.namenodes.nameservice1 = "nn1,nn2" + dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" + dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" + dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + dfs.client.use.datanode.hostname = "true" + } + } +} +``` + +## Changelog + +### next version + +- Add Paimon Source Connector +- Support projection for Paimon Source + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Persistiq.md b/versioned_docs/version-2.3.7/connector-v2/source/Persistiq.md new file mode 100644 index 000000000000..c308efbb389c --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Persistiq.md @@ -0,0 +1,300 @@ +# Persistiq + +> Persistiq source connector + +## Description + +Used to read data from Persistiq. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [schema projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-----------------------------|---------|----------|---------------| +| url | String | Yes | - | +| password | String | Yes | - | +| method | String | No | get | +| schema | Config | No | - | +| schema.fields | Config | No | - | +| format | String | No | json | +| params | Map | No | - | +| body | String | No | - | +| json_field | Config | No | - | +| content_json | String | No | - | +| poll_interval_millis | int | No | - | +| retry | int | No | - | +| retry_backoff_multiplier_ms | int | No | 100 | +| retry_backoff_max_ms | int | No | 10000 | +| enable_multi_lines | boolean | No | false | +| common-options | config | No | - | + +### url [String] + +http request url + +### password [String] + +API key for login, you can get it at Persistiq website + +### method [String] + +http request method, only supports GET, POST method + +### params [Map] + +http params + +### body [String] + +http body + +### poll_interval_millis [int] + +request http api interval(millis) in stream mode + +### retry [int] + +The max retry times if request http return to `IOException` + +### retry_backoff_multiplier_ms [int] + +The retry-backoff times(millis) multiplier if request http failed + +### retry_backoff_max_ms [int] + +The maximum retry-backoff times(millis) if request http failed + +### format [String] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json +{ + "code": 200, + "data": "get success", + "success": true +} +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data + +### content_json [String] + +This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. + +If your return data looks something like this. 
+ +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can configure `content_field = "$.store.book.*"` and the result returned looks like this: + +```json +[ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } +] +``` + +Then you can get the desired result with a simpler schema,like + +```hocon +Http { + url = "http://example.com/xyz" + method = "GET" + format = "json" + content_field = "$.store.book.*" + schema = { + fields { + category = string + author = string + title = string + price = string + } + } +} +``` + +Here is an example: + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). + +### json_field [Config] + +This parameter helps you configure the schema,so this parameter must be used with schema. + +If your data looks something like this: + +```json +{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 +} +``` + +You can get the contents of 'book' by configuring the task as follows: + +```hocon +source { + Http { + url = "http://example.com/xyz" + method = "GET" + format = "json" + json_field = { + category = "$.store.book[*].category" + author = "$.store.book[*].author" + title = "$.store.book[*].title" + price = "$.store.book[*].price" + } + schema = { + fields { + category = string + author = string + title = string + price = string + } + } + } +} +``` + +- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) +- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
+ +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +```hocon +Persistiq{ + url = "https://api.persistiq.com/v1/users" + password = "Your password" + content_field = "$.users.*" + schema = { + fields { + id = string + name = string + email = string + activated = boolean + default_mailbox_id = string + salesforce_id = string + } + } +} +``` + +## Changelog + +### next version + +- Add Persistiq Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Phoenix.md b/versioned_docs/version-2.3.7/connector-v2/source/Phoenix.md new file mode 100644 index 000000000000..78dafe925177 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Phoenix.md @@ -0,0 +1,68 @@ +# Phoenix + +> Phoenix source connector + +## Description + +Read Phoenix data through [Jdbc connector](Jdbc.md). +Support Batch mode and Streaming mode. The tested Phoenix version is 4.xx and 5.xx +On the underlying implementation, through the jdbc driver of Phoenix, execute the upsert statement to write data to HBase. +Two ways of connecting Phoenix with Java JDBC. One is to connect to zookeeper through JDBC, and the other is to connect to queryserver through JDBC thin client. + +> Tips: By default, the (thin) driver jar is used. If you want to use the (thick) driver or other versions of Phoenix (thin) driver, you need to recompile the jdbc connector module + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) + +supports query SQL and can achieve projection effect. + +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +### driver [string] + +if you use phoenix (thick) driver the value is `org.apache.phoenix.jdbc.PhoenixDriver` or you use (thin) driver the value is `org.apache.phoenix.queryserver.client.Driver` + +### url [string] + +if you use phoenix (thick) driver the value is `jdbc:phoenix:localhost:2182/hbase` or you use (thin) driver the value is `jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +use thick client drive + +``` + Jdbc { + driver = org.apache.phoenix.jdbc.PhoenixDriver + url = "jdbc:phoenix:localhost:2182/hbase" + query = "select age, name from test.source" + } + +``` + +use thin client drive + +``` +Jdbc { + driver = org.apache.phoenix.queryserver.client.Driver + url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" + query = "select age, name from test.source" +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Phoenix Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL-CDC.md b/versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL-CDC.md new file mode 100644 index 000000000000..e0a5936df0bf --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL-CDC.md @@ -0,0 +1,196 @@ +# PostgreSQL CDC + +> PostgreSQL CDC source connector + +## Support Those Engines + +> SeaTunnel Zeta
    +> Flink
    + +## Key features + +- [ ] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +The Postgre CDC connector allows for reading snapshot data and incremental data from Postgre database. This document +describes how to set up the Postgre CDC connector to run SQL queries against Postgre databases. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | If you want to manipulate the GEOMETRY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## Using Dependency + +### Install Jdbc Driver + +#### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +#### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +Please download and put PostgreSQL driver in `${SEATUNNEL_HOME}/lib/` dir. For example: cp postgresql-xxx.jar `$SEATNUNNEL_HOME/lib/` + +> Here are the steps to enable CDC (Change Data Capture) in PostgreSQL: + +1. Ensure the wal_level is set to logical: Modify the postgresql.conf configuration file by adding "wal_level = logical", + restart the PostgreSQL server for the changes to take effect. + Alternatively, you can use SQL commands to modify the configuration directly: + +```sql +ALTER SYSTEM SET wal_level TO 'logical'; +SELECT pg_reload_conf(); +``` + +2. Change the REPLICA policy of the specified table to FULL + +```sql +ALTER TABLE your_table_name REPLICA IDENTITY FULL; +``` + +## Data Type Mapping + +| PostgreSQL Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL
    | BOOLEAN | +| _BOOL
    | ARRAY<BOOLEAN> | +| BYTEA
    | BYTES | +| _BYTEA
    | ARRAY<TINYINT> | +| INT2
    SMALLSERIAL
    INT4
    SERIAL
    | INT | +| _INT2
    _INT4
    | ARRAY<INT> | +| INT8
    BIGSERIAL
    | BIGINT | +| _INT8
    | ARRAY<BIGINT> | +| FLOAT4
    | FLOAT | +| _FLOAT4
    | ARRAY<FLOAT> | +| FLOAT8
    | DOUBLE | +| _FLOAT8
    | ARRAY<DOUBLE> | +| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | +| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | +| BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB | STRING | +| _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | +| TIMESTAMP
    | TIMESTAMP | +| TIME
    | TIME | +| DATE
    | DATE | +| OTHER DATA TYPES | NOT SUPPORTED YET | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------------------------|----------|----------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. | +| username | String | Yes | - | Name of the database to use when connecting to the database server. | +| password | String | Yes | - | Password to use when connecting to the database server. | +| database-names | List | No | - | Database name of the database to monitor. | +| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | +| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] | +| startup.mode | Enum | No | INITIAL | Optional startup mode for PostgreSQL CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset.
    `specific`: Startup from user-supplied specific offsets. | +| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | +| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | +| slot.name | String | No | - | The name of the PostgreSQL logical decoding slot that was created for streaming changes from a particular plug-in for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. Default is seatunnel. | +| decoding.plugin.name | String | No | pgoutput | The name of the Postgres logical decoding plug-in installed on the server,Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming,wal2json_rds_streaming and pgoutput. | +| server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | +| connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | +| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | +| connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | +| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. 
For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| exactly_once | Boolean | No | false | Enable exactly once semantic. | +| format | Enum | No | DEFAULT | Optional output format for PostgreSQL CDC, valid enumerations are `DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. | +| debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) to Debezium Embedded Engine which is used to capture data changes from PostgreSQL server. | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example + +### Simple + +> Support multi-table reading + +``` + + +env { + # You can set engine configuration here + execution.parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 + read_limit.bytes_per_second=7000000 + read_limit.rows_per_second=400 +} + +source { + Postgres-CDC { + result_table_name = "customers_Postgre_cdc" + username = "postgres" + password = "postgres" + database-names = ["postgres_cdc"] + schema-names = ["inventory"] + table-names = ["postgres_cdc.inventory.postgres_cdc_table_1,postgres_cdc.inventory.postgres_cdc_table_2"] + base-url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" + } +} + +transform { + +} + +sink { + jdbc { + source_table_name = "customers_Postgre_cdc" + url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" + driver = "org.postgresql.Driver" + user = "postgres" + password = "postgres" + + generate_sink_sql = true + # You need to configure both database and table + database = postgres_cdc + chema = "inventory" + tablePrefix = "sink_" + primary_keys = ["id"] + } +} +``` + +### Support custom primary key for table + +``` +source { + Postgres-CDC { + result_table_name = "customers_mysql_cdc" + username = "postgres" + password = "postgres" + database-names = ["postgres_cdc"] + schema-names = ["inventory"] + table-names = ["postgres_cdc.inventory.full_types_no_primary_key"] + base-url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" + decoding.plugin.name = "decoderbufs" + exactly_once = false + table-names-config = [ + { + table = "postgres_cdc.inventory.full_types_no_primary_key" + primaryKeys = ["id"] + } + ] + } +} +``` + +## Changelog + +- Add PostgreSQL CDC Source Connector + +### next version + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL.md b/versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL.md new file mode 100644 index 000000000000..b687d2a18216 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/PostgreSQL.md @@ -0,0 +1,323 @@ +# PostgreSQL + +> JDBC PostgreSQL Source Connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | If you want to manipulate the GEOMETRY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
    +> For example PostgreSQL datasource: cp postgresql-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/
    +> If you want to manipulate the GEOMETRY type in PostgreSQL, add postgresql-xxx.jar and postgis-jdbc-xxx.jar to $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| PostgreSQL Data type | SeaTunnel Data type | +|--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL
    | BOOLEAN | +| _BOOL
    | ARRAY<BOOLEAN> | +| BYTEA
    | BYTES | +| _BYTEA
    | ARRAY<TINYINT> | +| INT2
    SMALLSERIAL | SMALLINT | +| _INT2 | ARRAY<SMALLINT> | +| INT4
    SERIAL
    | INT | +| _INT4
    | ARRAY<INT> | +| INT8
    BIGSERIAL
    | BIGINT | +| _INT8
    | ARRAY<BIGINT> | +| FLOAT4
    | FLOAT | +| _FLOAT4
    | ARRAY<FLOAT> | +| FLOAT8
    | DOUBLE | +| _FLOAT8
    | ARRAY<DOUBLE> | +| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | +| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | +| BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB
    UUID | STRING | +| _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | +| TIMESTAMP(s)
    TIMESTAMPTZ(s) | TIMESTAMP(s) | +| TIME(s)
    TIMETZ(s) | TIME(s) | +| DATE
    | DATE | + +## Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use PostgreSQL the value is `org.postgresql.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. |
+| properties | Map | No | - | Additional connection configuration parameters, when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. |
+
+| Name | Type | Required | Default | Description |
+|------|------|----------|---------|-------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use PostgreSQL the value is `org.postgresql.Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. |
+| properties | Map | No | - | Additional connection configuration parameters, when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. |
+| table_path | Int | No | 0 | The full path of the table; you can use this configuration instead of `query`.
    examples:
    mysql: "testdb.table1"
    oracle: "test_schema.table1"
    sqlserver: "testdb.test_schema.table1"
    postgresql: "testdb.test_schema.table1" | +| table_list | Array | No | 0 | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | +| where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | +| split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | +| split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Parallel Reader + +The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. + +**Split Key Rules:** + +1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. 
+2. If `partition_column` is null, SeaTunnel will read the schema from the table and get the Primary Key and Unique Index. If there is more than one column in the Primary Key and Unique Index, the first column that is in the **supported split data type** will be used to split data. For example, if the table has Primary Key(nn guid, name varchar), the column `name` will be used to split data, because `guid` is not in the **supported split data type**.
+
+**Supported split data type:**
+* String
+* Number(int, bigint, decimal, ...)
+* Date
+
+### Options Related To Split
+
+#### split.size
+
+How many rows are in one split. Captured tables are split into multiple splits when the table is read.
+
+#### split.even-distribution.factor.lower-bound
+
+> Not recommended for use
+
+The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05.
+
+#### split.even-distribution.factor.upper-bound
+
+> Not recommended for use
+
+The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0.
+
+#### split.sample-sharding.threshold
+
+This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards.
+
+#### split.inverse-sampling.rate
+
+The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000.
+
+#### partition_column [string]
+
+The column name for split data.
+
+#### partition_upper_bound [BigDecimal]
+
+The partition_column max value for scan, if not set SeaTunnel will query database get max value.
+
+#### partition_lower_bound [BigDecimal]
+
+The partition_column min value for scan, if not set SeaTunnel will query database get min value.
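+
+To make the split options above more concrete, here is a rough, illustrative walk-through. The table name, row count and id range are invented for the example, and the split values are written out explicitly so the arithmetic is easy to follow.
+
+```hocon
+# Illustrative only: suppose test.public.big_table has about 100,000,000 rows and
+# its numeric key `id` ranges from 1 to 100,000,000,000.
+#
+#   distribution factor = (MAX(id) - MIN(id) + 1) / row count
+#                       = 100,000,000,000 / 100,000,000 = 1000
+#
+# 1000 is greater than the default split.even-distribution.factor.upper-bound (100),
+# so the table is treated as unevenly distributed. The estimated shard count is
+# 100,000,000 / split.size = 100,000,000 / 8096 ≈ 12352, which exceeds
+# split.sample-sharding.threshold (10000), so the sampling-based strategy is used:
+# roughly 100,000,000 / split.inverse-sampling.rate = 100,000 rows are sampled to
+# decide the chunk boundaries.
+source {
+  Jdbc {
+    url = "jdbc:postgresql://localhost:5432/test"
+    driver = "org.postgresql.Driver"
+    user = "root"
+    password = "test"
+    table_path = "test.public.big_table"
+    split.size = 8096
+    split.sample-sharding.threshold = 10000
+    split.inverse-sampling.rate = 1000
+  }
+}
+```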
+ +#### partition_num [int] + +> Not recommended for use, The correct approach is to control the number of split through `split.size` + +How many splits do we need to split into, only support positive integer. default value is job parallelism. + +## tips + +> If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. +> +> Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + parallelism = 4 + job.mode = "BATCH" +} + +source{ + Jdbc { + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source limit 16" + } +} + +transform { + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### parallel by partition_column + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source{ + jdbc{ + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source" + partition_column= "id" + partition_num = 5 + } +} +sink { + Console {} +} +``` + +### parallel by Primary Key or Unique Index + +> Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + table_path = "test.public.AllDataType_1" + query = "select * from public.AllDataType_1" + split.size = 10000 + } +} + +sink { + Console {} +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source{ + jdbc{ + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source" + partition_column= "id" + + # The name of the table returned + result_table_name = "jdbc" + partition_lower_bound = 1 + partition_upper_bound = 50 + partition_num = 5 + } +} +``` + +### Multiple table read: + +***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** + +```hocon +env { + job.mode = "BATCH" + parallelism = 4 +} +source { + Jdbc { + url="jdbc:postgresql://datasource01:5432/demo" + user="iDm82k6Q0Tq+wUprWnPsLQ==" + driver="org.postgresql.Driver" + password="iDm82k6Q0Tq+wUprWnPsLQ==" + "table_list"=[ + { + "table_path"="demo.public.AllDataType_1" + }, + { + "table_path"="demo.public.alldatatype" + } + ] + #where_condition= "where id > 100" + split.size = 10000 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 + } +} + +sink { + Console {} 
+} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Pulsar.md b/versioned_docs/version-2.3.7/connector-v2/source/Pulsar.md new file mode 100644 index 000000000000..e4ddea679ad9 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Pulsar.md @@ -0,0 +1,164 @@ +# Apache Pulsar + +> Apache Pulsar source connector + +## Description + +Source connector for Apache Pulsar. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|--------------------------|---------|----------|---------------| +| topic | String | No | - | +| topic-pattern | String | No | - | +| topic-discovery.interval | Long | No | -1 | +| subscription.name | String | Yes | - | +| client.service-url | String | Yes | - | +| admin.service-url | String | Yes | - | +| auth.plugin-class | String | No | - | +| auth.params | String | No | - | +| poll.timeout | Integer | No | 100 | +| poll.interval | Long | No | 50 | +| poll.batch.size | Integer | No | 500 | +| cursor.startup.mode | Enum | No | LATEST | +| cursor.startup.timestamp | Long | No | - | +| cursor.reset.mode | Enum | No | LATEST | +| cursor.stop.mode | Enum | No | NEVER | +| cursor.stop.timestamp | Long | No | - | +| schema | config | No | - | +| common-options | | no | - | +| format | String | no | json | + +### topic [String] + +Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by semicolon like 'topic-1;topic-2'. + +**Note, only one of "topic-pattern" and "topic" can be specified for sources.** + +### topic-pattern [String] + +The regular expression for a pattern of topic names to read from. All topics with names that match the specified regular expression will be subscribed by the consumer when the job starts running. + +**Note, only one of "topic-pattern" and "topic" can be specified for sources.** + +### topic-discovery.interval [Long] + +The interval (in ms) for the Pulsar source to discover the new topic partitions. A non-positive value disables the topic partition discovery. + +**Note, This option only works if the 'topic-pattern' option is used.** + +### subscription.name [String] + +Specify the subscription name for this consumer. This argument is required when constructing the consumer. + +### client.service-url [String] + +Service URL provider for Pulsar service. +To connect to Pulsar using client libraries, you need to specify a Pulsar protocol URL. +You can assign Pulsar protocol URLs to specific clusters and use the Pulsar scheme. + +For example, `localhost`: `pulsar://localhost:6650,localhost:6651`. + +### admin.service-url [String] + +The Pulsar service HTTP URL for the admin endpoint. + +For example, `http://my-broker.example.com:8080`, or `https://my-broker.example.com:8443` for TLS. + +### auth.plugin-class [String] + +Name of the authentication plugin. + +### auth.params [String] + +Parameters for the authentication plugin. + +For example, `key1:val1,key2:val2` + +### poll.timeout [Integer] + +The maximum time (in ms) to wait when fetching records. A longer time increases throughput but also latency. 
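+
+Putting a few of these options together, the sketch below combines the auth options described earlier with a slightly larger poll timeout. The token value is a placeholder, and `org.apache.pulsar.client.impl.auth.AuthenticationToken` is assumed to be the token-auth plugin shipped with your Pulsar client; substitute whatever plugin and credentials your cluster actually uses.
+
+```hocon
+source {
+  Pulsar {
+    topic = "example"
+    subscription.name = "seatunnel"
+    client.service-url = "pulsar://localhost:6650"
+    admin.service-url = "http://my-broker.example.com:8080"
+    # Token-based authentication (plugin class assumed; replace the token).
+    auth.plugin-class = "org.apache.pulsar.client.impl.auth.AuthenticationToken"
+    auth.params = "token:eyJhbGciOiJIUzI1NiJ9..."
+    # Wait up to 200 ms per fetch instead of the default 100 ms.
+    poll.timeout = 200
+    result_table_name = "pulsar_source"
+  }
+}
+```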
+ +### poll.interval [Long] + +The interval time(in ms) when fetcing records. A shorter time increases throughput, but also increases CPU load. + +### poll.batch.size [Integer] + +The maximum number of records to fetch to wait when polling. A longer time increases throughput but also latency. + +### cursor.startup.mode [Enum] + +Startup mode for Pulsar consumer, valid values are `'EARLIEST'`, `'LATEST'`, `'SUBSCRIPTION'`, `'TIMESTAMP'`. + +### cursor.startup.timestamp [Long] + +Start from the specified epoch timestamp (in milliseconds). + +**Note, This option is required when the "cursor.startup.mode" option used `'TIMESTAMP'`.** + +### cursor.reset.mode [Enum] + +Cursor reset strategy for Pulsar consumer valid values are `'EARLIEST'`, `'LATEST'`. + +**Note, This option only works if the "cursor.startup.mode" option used `'SUBSCRIPTION'`.** + +### cursor.stop.mode [String] + +Stop mode for Pulsar consumer, valid values are `'NEVER'`, `'LATEST'`and `'TIMESTAMP'`. + +**Note, When `'NEVER' `is specified, it is a real-time job, and other mode are off-line jobs.** + +### cursor.stop.timestamp [Long] + +Stop from the specified epoch timestamp (in milliseconds). + +**Note, This option is required when the "cursor.stop.mode" option used `'TIMESTAMP'`.** + +### schema [Config] + +The structure of the data, including field names and field types. +reference to [Schema-Feature](../../concept/schema-feature.md) + +## format [String] + +Data format. The default format is json, reference [formats](../formats). + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +## Example + +```Jdbc { +source { + Pulsar { + topic = "example" + subscription.name = "seatunnel" + client.service-url = "pulsar://localhost:6650" + admin.service-url = "http://my-broker.example.com:8080" + result_table_name = "test" + } +} +``` + +## Changelog + +### 2.3.0-beta 2022-10-20 + +- Add Pulsar Source Connector + +### next version + +- [Feature] Add Pulsar canal-format and e2e ([4111](https://github.com/apache/seatunnel/pull/4111)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Rabbitmq.md b/versioned_docs/version-2.3.7/connector-v2/source/Rabbitmq.md new file mode 100644 index 000000000000..9381603ef6c6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Rabbitmq.md @@ -0,0 +1,162 @@ +# Rabbitmq + +> Rabbitmq source connector + +## Description + +Used to read data from Rabbitmq. + +## Key features + +- [ ] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +:::tip + +The source must be non-parallel (parallelism set to 1) in order to achieve exactly-once. This limitation is mainly due to RabbitMQ’s approach to dispatching messages from a single queue to multiple consumers. 
+ +::: + +## Options + +| name | type | required | default value | +|----------------------------|---------|----------|---------------| +| host | string | yes | - | +| port | int | yes | - | +| virtual_host | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| queue_name | string | yes | - | +| schema | config | yes | - | +| url | string | no | - | +| routing_key | string | no | - | +| exchange | string | no | - | +| network_recovery_interval | int | no | - | +| topology_recovery_enabled | boolean | no | - | +| automatic_recovery_enabled | boolean | no | - | +| connection_timeout | int | no | - | +| requested_channel_max | int | no | - | +| requested_frame_max | int | no | - | +| requested_heartbeat | int | no | - | +| prefetch_count | int | no | - | +| delivery_timeout | long | no | - | +| common-options | | no | - | + +### host [string] + +the default host to use for connections + +### port [int] + +the default port to use for connections + +### virtual_host [string] + +virtual host – the virtual host to use when connecting to the broker + +### username [string] + +the AMQP user name to use when connecting to the broker + +### password [string] + +the password to use when connecting to the broker + +### url [string] + +convenience method for setting the fields in an AMQP URI: host, port, username, password and virtual host + +### queue_name [string] + +the queue to publish the message to + +### routing_key [string] + +the routing key to publish the message to + +### exchange [string] + +the exchange to publish the message to + +### schema [Config] + +#### fields [Config] + +the schema fields of upstream data. + +### network_recovery_interval [int] + +how long will automatic recovery wait before attempting to reconnect, in ms + +### topology_recovery [string] + +if true, enables topology recovery + +### automatic_recovery [string] + +if true, enables connection recovery + +### connection_timeout [int] + +connection tcp establishment timeout in milliseconds; zero for infinite + +### requested_channel_max [int] + +initially requested maximum channel number; zero for unlimited +**Note: Note the value must be between 0 and 65535 (unsigned short in AMQP 0-9-1). + +### requested_frame_max [int] + +the requested maximum frame size + +### requested_heartbeat [int] + +Set the requested heartbeat timeout +**Note: Note the value must be between 0 and 65535 (unsigned short in AMQP 0-9-1). + +### prefetch_count [int] + +prefetchCount the max number of messages to receive without acknowledgement + +### delivery_timeout [long] + +deliveryTimeout maximum wait time, in milliseconds, for the next message delivery + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +simple: + +```hocon +source { + RabbitMQ { + host = "rabbitmq-e2e" + port = 5672 + virtual_host = "/" + username = "guest" + password = "guest" + queue_name = "test" + schema = { + fields { + id = bigint + c_map = "map" + c_array = "array" + } + } + } +} +``` + +## Changelog + +### next version + +- Add Rabbitmq source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Redis.md b/versioned_docs/version-2.3.7/connector-v2/source/Redis.md new file mode 100644 index 000000000000..9af103f88412 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Redis.md @@ -0,0 +1,273 @@ +# Redis + +> Redis source connector + +## Description + +Used to read data from Redis. 
+ +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|---------------------|--------|-----------------------|---------------| +| host | string | yes | - | +| port | int | yes | - | +| keys | string | yes | - | +| batch_size | int | yes | 10 | +| data_type | string | yes | - | +| user | string | no | - | +| auth | string | no | - | +| db_num | int | no | 0 | +| mode | string | no | single | +| hash_key_parse_mode | string | no | all | +| nodes | list | yes when mode=cluster | - | +| schema | config | yes when format=json | - | +| format | string | no | json | +| common-options | | no | - | + +### host [string] + +redis host + +### port [int] + +redis port + +### hash_key_parse_mode [string] + +hash key parse mode, support `all` `kv`, used to tell connector how to parse hash key. + +when setting it to `all`, connector will treat the value of hash key as a row and use the schema config to parse it, when setting it to `kv`, connector will treat each kv in hash key as a row and use the schema config to parse it: + +for example, if the value of hash key is the following shown: + +```text +{ + "001": { + "name": "tyrantlucifer", + "age": 26 + }, + "002": { + "name": "Zongwen", + "age": 26 + } +} + +``` + +if hash_key_parse_mode is `all` and schema config as the following shown, it will generate the following data: + +```hocon + +schema { + fields { + 001 { + name = string + age = int + } + 002 { + name = string + age = int + } + } +} + +``` + +| 001 | 002 | +|---------------------------------|---------------------------| +| Row(name=tyrantlucifer, age=26) | Row(name=Zongwen, age=26) | + +if hash_key_parse_mode is `kv` and schema config as the following shown, it will generate the following data: + +```hocon + +schema { + fields { + hash_key = string + name = string + age = int + } +} + +``` + +| hash_key | name | age | +|----------|---------------|-----| +| 001 | tyrantlucifer | 26 | +| 002 | Zongwen | 26 | + +each kv that in hash key it will be treated as a row and send it to upstream. + +**Tips: connector will use the first field information of schema config as the field name of each k that in each kv** + +### keys [string] + +keys pattern + +### batch_size [int] + +indicates the number of keys to attempt to return per iteration,default 10 + +**Tips:Redis source connector support fuzzy key matching, user needs to ensure that the matched keys are the same type** + +### data_type [string] + +redis data types, support `key` `hash` `list` `set` `zset` + +- key + +> The value of each key will be sent downstream as a single row of data. +> For example, the value of key is `SeaTunnel test message`, the data received downstream is `SeaTunnel test message` and only one message will be received. + +- hash + +> The hash key-value pairs will be formatted as json to be sent downstream as a single row of data. +> For example, the value of hash is `name:tyrantlucifer age:26`, the data received downstream is `{"name":"tyrantlucifer", "age":"26"}` and only one message will be received. + +- list + +> Each element in the list will be sent downstream as a single row of data. 
+> For example, the value of list is `[tyrantlucier, CalvinKirs]`, the data received downstream are `tyrantlucifer` and `CalvinKirs` and only two message will be received. + +- set + +> Each element in the set will be sent downstream as a single row of data +> For example, the value of set is `[tyrantlucier, CalvinKirs]`, the data received downstream are `tyrantlucifer` and `CalvinKirs` and only two message will be received. + +- zset + +> Each element in the sorted set will be sent downstream as a single row of data +> For example, the value of sorted set is `[tyrantlucier, CalvinKirs]`, the data received downstream are `tyrantlucifer` and `CalvinKirs` and only two message will be received. + +### user [string] + +redis authentication user, you need it when you connect to an encrypted cluster + +### auth [string] + +redis authentication password, you need it when you connect to an encrypted cluster + +### db_num [int] + +Redis database index ID. It is connected to db 0 by default + +### mode [string] + +redis mode, `single` or `cluster`, default is `single` + +### nodes [list] + +redis nodes information, used in cluster mode, must like as the following format: + +["host1:port1", "host2:port2"] + +### format [string] + +the format of upstream data, now only support `json` `text`, default `json`. + +when you assign format is `json`, you should also assign schema option, for example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +when you assign format is `text`, connector will do nothing for upstream data, for example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +connector will generate data as the following: + +| content | +|----------------------------------------------------------| +| {"code": 200, "data": "get success", "success": true} | + +### schema [config] + +#### fields [config] + +the schema fields of redis data + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details + +## Example + +simple: + +```hocon +Redis { + host = localhost + port = 6379 + keys = "key_test*" + data_type = key + format = text +} +``` + +```hocon +Redis { + host = localhost + port = 6379 + keys = "key_test*" + data_type = key + format = json + schema { + fields { + name = string + age = int + } + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- Add Redis Source Connector + +### next version + +- [Improve] Support redis cluster mode connection and user authentication [3188](https://github.com/apache/seatunnel/pull/3188) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Redshift.md b/versioned_docs/version-2.3.7/connector-v2/source/Redshift.md new file mode 100644 index 000000000000..8da5ea9391de --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Redshift.md @@ -0,0 +1,133 @@ +# Redshift + +> JDBC Redshift Source Connector + +## Description + +Read external data source data through JDBC. + +## Support those engines + +> Spark
    +> Flink
+> SeaTunnel Zeta
    + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Supported DataSource list + +| datasource | supported versions | driver | url | maven | +|------------|----------------------------------------------------------|---------------------------------|-----------------------------------------|------------------------------------------------------------------------------------| +| redshift | Different dependency version has different driver class. | com.amazon.redshift.jdbc.Driver | jdbc:redshift://localhost:5439/database | [Download](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) | + +## Database dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
    +> For example Redshift datasource: cp RedshiftJDBC42-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Redshift Data type | Seatunnel Data type | +|-------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| SMALLINT
    INT2 | SHORT | +| INTEGER
    INT
    INT4 | INT | +| BIGINT
    INT8
    OID | LONG | +| DECIMAL
    NUMERIC | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the decimal point.)) |
+| REAL
    FLOAT4 | FLOAT | +| DOUBLE_PRECISION
    FLOAT8
    FLOAT | DOUBLE | +| BOOLEAN
    BOOL | BOOLEAN | +| CHAR
    CHARACTER
    NCHAR
    BPCHAR
    VARCHAR
    CHARACTER_VARYING
    NVARCHAR
    TEXT
    SUPER | STRING | +| VARBYTE
    BINARY_VARYING | BYTES | +| TIME
    TIME_WITH_TIME_ZONE
    TIMETZ | LOCALTIME | +| TIMESTAMP
    TIMESTAMP_WITH_OUT_TIME_ZONE
    TIMESTAMPTZ | LOCALDATETIME | + +## Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:redshift://localhost:5439/dev" + driver = "com.amazon.redshift.jdbc.Driver" + user = "root" + password = "123456" + + table_path = "public.table2" + # Use query filetr rows & columns + query = "select id, name from public.table2 where id > 100" + + #split.size = 8096 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 + } +} + +sink { + Console {} +} +``` + +### Multiple table read: + +***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** + +```hocon +env { + job.mode = "BATCH" + parallelism = 2 +} +source { + Jdbc { + url = "jdbc:redshift://localhost:5439/dev" + driver = "com.amazon.redshift.jdbc.Driver" + user = "root" + password = "123456" + + table_list = [ + { + table_path = "public.table1" + }, + { + table_path = "public.table2" + # Use query filetr rows & columns + query = "select id, name from public.table2 where id > 100" + } + ] + #split.size = 8096 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 + } +} + +sink { + Console {} +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/RocketMQ.md b/versioned_docs/version-2.3.7/connector-v2/source/RocketMQ.md new file mode 100644 index 000000000000..d496a259bdb6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/RocketMQ.md @@ -0,0 +1,219 @@ +# RocketMQ + +> RocketMQ source connector + +## Support Apache RocketMQ Version + +- 4.9.0 (Or a newer version, for reference) + +## Support These Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Source connector for Apache RocketMQ. + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------------------|---------|----------|----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topics | String | yes | - | `RocketMQ topic` name. If there are multiple `topics`, use `,` to split, for example: `"tpc1,tpc2"`. | +| name.srv.addr | String | yes | - | `RocketMQ` name server cluster address. | +| acl.enabled | Boolean | no | false | If true, access control is enabled, and access key and secret key need to be configured. | +| access.key | String | no | | | +| secret.key | String | no | | When ACL_ENABLED is true, secret key cannot be empty. | +| batch.size | int | no | 100 | `RocketMQ` consumer pull batch size | +| consumer.group | String | no | SeaTunnel-Consumer-Group | `RocketMQ consumer group id`, used to distinguish different consumer groups. | +| commit.on.checkpoint | Boolean | no | true | If true the consumer's offset will be periodically committed in the background. | +| schema | | no | - | The structure of the data, including field names and field types. | +| format | String | no | json | Data format. The default format is json. Optional text format. The default field separator is ",".If you customize the delimiter, add the "field.delimiter" option. | +| field.delimiter | String | no | , | Customize the field delimiter for data format | +| start.mode | String | no | CONSUME_FROM_GROUP_OFFSETS | The initial consumption pattern of consumers,there are several types: [CONSUME_FROM_LAST_OFFSET],[CONSUME_FROM_FIRST_OFFSET],[CONSUME_FROM_GROUP_OFFSETS],[CONSUME_FROM_TIMESTAMP],[CONSUME_FROM_SPECIFIC_OFFSETS] | +| start.mode.offsets | | no | | | +| start.mode.timestamp | Long | no | | The time required for consumption mode to be "CONSUME_FROM_TIMESTAMP". | +| partition.discovery.interval.millis | long | no | -1 | The interval for dynamically discovering topics and partitions. | +| common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### start.mode.offsets + +The offset required for consumption mode to be "CONSUME_FROM_SPECIFIC_OFFSETS". 
+ +for example: + +```hocon +start.mode.offsets = { + topic1-0 = 70 + topic1-1 = 10 + topic1-2 = 10 +} +``` + +## Task Example + +### Simple: + +> Consumer reads Rocketmq data and prints it to the console type + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Rocketmq { + name.srv.addr = "rocketmq-e2e:9876" + topics = "test_topic_json" + result_table_name = "rocketmq_table" + schema = { + fields { + id = bigint + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(2, 1)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform +} + +sink { + Console { + } +} +``` + +### Specified format consumption Simple: + +> When I consume the topic data in json format parsing and pulling the number of bars each time is 400, the consumption starts from the original location + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Rocketmq { + name.srv.addr = "localhost:9876" + topics = "test_topic" + result_table_name = "rocketmq_table" + start.mode = "CONSUME_FROM_FIRST_OFFSET" + batch.size = "400" + consumer.group = "test_topic_group" + format = "json" + format = json + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform +} +sink { + Console { + } +} +``` + +### Specified timestamp Simple: + +> This is to specify a time to consume, and I dynamically sense the existence of a new partition every 1000 milliseconds to pull the consumption + +```hocon +env { + parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + Rocketmq { + name.srv.addr = "localhost:9876" + topics = "test_topic" + partition.discovery.interval.millis = "1000" + start.mode.timestamp="1694508382000" + consumer.group="test_topic_group" + format="json" + format = json + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform +} + +sink { + Console { + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/S3File.md b/versioned_docs/version-2.3.7/connector-v2/source/S3File.md new file mode 100644 index 000000000000..bf621be590c5 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/S3File.md @@ -0,0 +1,353 @@ +# 
S3File + +> S3 File Source Connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. + +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from aws s3 file system. + +## Supported DataSource Info + +| Datasource | Supported versions | +|------------|--------------------| +| S3 | current | + +## Dependency + +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
    +> +> If you use SeaTunnel Zeta, it automatically integrates the hadoop jar when you download and install SeaTunnel Zeta. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
    +> To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.12.692.jar in ${SEATUNNEL_HOME}/lib dir. + +## Data Type Mapping + +Data type mapping is related to the type of file being read, We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` `xml` + +### JSON File Type + +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. + +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +### Text Or CSV File Type + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. + +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +### Orc File Type + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +| Orc Data type | SeaTunnel Data type | +|----------------------------------|----------------------------------------------------------------| +| BOOLEAN | BOOLEAN | +| INT | INT | +| BYTE | BYTE | +| SHORT | SHORT | +| LONG | LONG | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BINARY | BINARY | +| STRING
    VARCHAR
    CHAR
    | STRING | +| DATE | LOCAL_DATE_TYPE | +| TIMESTAMP | LOCAL_DATE_TIME_TYPE | +| DECIMAL | DECIMAL | +| LIST(STRING) | STRING_ARRAY_TYPE | +| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | +| LIST(TINYINT) | BYTE_ARRAY_TYPE | +| LIST(SMALLINT) | SHORT_ARRAY_TYPE | +| LIST(INT) | INT_ARRAY_TYPE | +| LIST(BIGINT) | LONG_ARRAY_TYPE | +| LIST(FLOAT) | FLOAT_ARRAY_TYPE | +| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | +| Map | MapType, This type of K and V will transform to SeaTunnel type | +| STRUCT | SeaTunnelRowType | + +### Parquet File Type + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +| Orc Data type | SeaTunnel Data type | +|----------------------|----------------------------------------------------------------| +| INT_8 | BYTE | +| INT_16 | SHORT | +| DATE | DATE | +| TIMESTAMP_MILLIS | TIMESTAMP | +| INT64 | LONG | +| INT96 | TIMESTAMP | +| BINARY | BYTES | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BOOLEAN | BOOLEAN | +| FIXED_LEN_BYTE_ARRAY | TIMESTAMP
    DECIMAL | +| DECIMAL | DECIMAL | +| LIST(STRING) | STRING_ARRAY_TYPE | +| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | +| LIST(TINYINT) | BYTE_ARRAY_TYPE | +| LIST(SMALLINT) | SHORT_ARRAY_TYPE | +| LIST(INT) | INT_ARRAY_TYPE | +| LIST(BIGINT) | LONG_ARRAY_TYPE | +| LIST(FLOAT) | FLOAT_ARRAY_TYPE | +| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | +| Map | MapType, This type of K and V will transform to SeaTunnel type | +| STRUCT | SeaTunnelRowType | + +## Options + +| name | type | required | default value | Description | +|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option | +| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` | +| bucket | string | yes | - | The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. | +| fs.s3a.endpoint | string | yes | - | fs s3a endpoint | +| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) | +| read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. The file type supported column projection as the following shown: `text` `csv` `parquet` `orc` `json` `excel` `xml` . If the user wants to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured. | +| access_key | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` | +| access_secret | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` | +| hadoop_s3_properties | map | no | - | If you need to add other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | +| delimiter/field_delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. | +| parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. 
Every record data from file will be added these two fields: name="tyrantlucifer", age=16 | +| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | +| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` | +| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files | +| schema | config | no | - | The schema of upstream data. | +| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | +| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only valid for XML files. | +| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only valid for XML files. | +| compress_codec | string | no | none | +| encoding | string | no | UTF-8 | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### delimiter/field_delimiter [string] + +**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +## Example + +1. In this example, We read data from s3 path `s3a://seatunnel-test/seatunnel/text` and the file type is orc in this path. + We use `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` to authentication so `access_key` and `secret_key` is required. + All columns in the file will be read and send to sink. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + S3File { + path = "/seatunnel/text" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + bucket = "s3a://seatunnel-test" + file_format_type = "orc" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + Console {} +} +``` + +2. Use `InstanceProfileCredentialsProvider` to authentication + The file type in S3 is json, so need config schema option. 
+ +```hocon + + S3File { + path = "/seatunnel/json" + bucket = "s3a://seatunnel-test" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + file_format_type = "json" + schema { + fields { + id = int + name = string + } + } + } + +``` + +3. Use `InstanceProfileCredentialsProvider` to authentication + The file type in S3 is json and has five fields (`id`, `name`, `age`, `sex`, `type`), so need config schema option. + In this job, we only need send `id` and `name` column to mysql. + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + S3File { + path = "/seatunnel/json" + bucket = "s3a://seatunnel-test" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + file_format_type = "json" + read_columns = ["id", "name"] + schema { + fields { + id = int + name = string + age = int + sex = int + type = string + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + Console {} +} +``` + +## Changelog + +### 2.3.0-beta 2022-10-20 + +- Add S3File Source Connector + +### Next version + +- [Feature] Support S3A protocol ([3632](https://github.com/apache/seatunnel/pull/3632)) + - Allow user to add additional hadoop-s3 parameters + - Allow the use of the s3a protocol + - Decouple hadoop-aws dependencies +- [Feature]Set S3 AK to optional ([3688](https://github.com/apache/seatunnel/pull/)) + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/SftpFile.md b/versioned_docs/version-2.3.7/connector-v2/source/SftpFile.md new file mode 100644 index 000000000000..dafdb6aba9e6 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/SftpFile.md @@ -0,0 +1,255 @@ +# SftpFile + +> Sftp file source connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] json + - [x] excel + - [x] xml + - [x] binary + +## Description + +Read data from sftp file server. + +## Supported DataSource Info + +In order to use the SftpFile connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|-----------------------------------------------------------------------------------------| +| SftpFile | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-file-sftp) | + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to Sftp and this connector need some hadoop dependencies. +It only supports hadoop version **2.9.X+**. + +::: + +## Data Type Mapping + +The File does not have a specific type list, and we can indicate which SeaTunnel data type the corresponding data needs to be converted to by specifying the Schema in the config. + +| SeaTunnel Data type | +|---------------------| +| STRING | +| SHORT | +| INT | +| BIGINT | +| BOOLEAN | +| DOUBLE | +| DECIMAL | +| FLOAT | +| DATE | +| TIME | +| TIMESTAMP | +| BYTES | +| ARRAY | +| MAP | + +## Source Options + +| Name | Type | Required | default value | Description | +|---------------------------|---------|----------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | Yes | - | The target sftp host is required | +| port | Int | Yes | - | The target sftp port is required | +| user | String | Yes | - | The target sftp username is required | +| password | String | Yes | - | The target sftp password is required | +| path | String | Yes | - | The source file path. | +| file_format_type | String | Yes | - | Please check #file_format_type below | +| file_filter_pattern | String | No | - | Filter pattern, which used for filtering files. | +| delimiter/field_delimiter | String | No | \001 | **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead.
    Field delimiter, used to tell connector how to slice and dice fields when reading text files.
    Default `\001`, the same as hive's default delimiter | +| parse_partition_from_path | Boolean | No | true | Control whether parse the partition keys and values from file path
    For example if you read a file from path `oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`
    Every record data from file will be added these two fields:
    name age
    tyrantlucifer 26
    Tips: **Do not define partition fields in schema option** | +| date_format | String | No | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:
    `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
    default `yyyy-MM-dd` | +| datetime_format | String | No | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:
    `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
    default `yyyy-MM-dd HH:mm:ss` | +| time_format | String | No | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:
    `HH:mm:ss` `HH:mm:ss.SSS`
    default `HH:mm:ss` | +| skip_header_row_number | Long | No | 0 | Skip the first few lines, but only for the txt and csv.
    For example, set like following:
    `skip_header_row_number = 2`
    then SeaTunnel will skip the first 2 lines from source files | +| read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. | +| sheet_name | String | No | - | Reader the sheet of the workbook,Only used when file_format is excel. | +| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | +| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | +| schema | Config | No | - | Please check #schema below | +| compress_codec | String | No | None | The compress codec of files and the details that supported as the following shown:
    - txt: `lzo` `None`
    - json: `lzo` `None`
    - csv: `lzo` `None`
    - orc: `lzo` `snappy` `lz4` `zlib` `None`
    - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `None`
    Tips: excel type does Not support any compression format | +| encoding | string | no | UTF-8 | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### file_format_type [string] + +File type, supported as the following file types: +`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. +For example: +upstream data is the following: + +```json +{"code": 200, "data": "get success", "success": true} +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} +``` + +you should assign schema as the following: + +```hocon +schema { + fields { + code = int + data = string + success = boolean + } +} +``` + +connector will generate data as the following: +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. +For example, upstream data is the following: + +```text +tyrantlucifer#26#male +``` + +If you do not assign data schema connector will treat the upstream data as the following: +| content | +|-----------------------| +| tyrantlucifer#26#male | +If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type +you should assign schema and delimiter as the following: + +```hocon +field_delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} +``` + +connector will generate data as the following: +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +If you assign file type to `binary`, SeaTunnel can synchronize files in any format, +such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. +Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization +at the same time. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc/parquet: + automatically recognizes the compression type, no additional settings required. + +### encoding [string] + +Only used when file_format_type is json,text,csv,xml. +The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. + +### schema [config] + +#### fields [Config] + +The schema of upstream data. 
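As a quick reference, the following is a minimal sketch that combines `field_delimiter` with a `schema` block for a delimited text file, complementing the JSON job example below; the host, credentials, path, and field names are illustrative only:

```hocon
source {
  SftpFile {
    host = "sftp.example.com"
    port = 22
    user = "seatunnel"
    password = "pass"
    path = "/tmp/seatunnel/read/text"
    file_format_type = "text"
    # Tell the connector how to split each line of the text file into fields
    field_delimiter = "#"
    schema = {
      fields {
        name = string
        age = int
        gender = string
      }
    }
  }
}
```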
+ +## How to Create a Sftp Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that reads data from sftp and prints it on the local client: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to connect to sftp +source { + SftpFile { + host = "sftp" + port = 22 + user = seatunnel + password = pass + path = "tmp/seatunnel/read/json" + file_format_type = "json" + result_table_name = "sftp" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + C_MAP = "map" + C_ARRAY = "array" + C_STRING = string + C_BOOLEAN = boolean + C_TINYINT = tinyint + C_SMALLINT = smallint + C_INT = int + C_BIGINT = bigint + C_FLOAT = float + C_DOUBLE = double + C_BYTES = bytes + C_DATE = date + C_DECIMAL = "decimal(38, 18)" + C_TIMESTAMP = timestamp + } + } + } + } +} + +# Console printing of the read sftp data +sink { + Console { + parallelism = 1 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Sls.md b/versioned_docs/version-2.3.7/connector-v2/source/Sls.md new file mode 100644 index 000000000000..6468f397ab7c --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Sls.md @@ -0,0 +1,87 @@ +# Sls + +> Sls source connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Source connector for Aliyun Sls. + +## Supported DataSource Info + +In order to use the Sls connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Maven | +|------------|--------------------|-----------------------------------------------------------------------------------------------------------| +| Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-sls) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------------------|---------------------------------------------|----------|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| +| project | String | Yes | - | [Aliyun Sls Project](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) | +| logstore | String | Yes | - | [Aliyun Sls Logstore](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) | +| endpoint | String | Yes | - | [Aliyun Access Endpoint](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) | +| access_key_id | String | Yes | - | [Aliyun AccessKey ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | +| access_key_secret | String | Yes | - | [Aliyun AccessKey Secret](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | +| start_mode | StartMode[earliest],[group_cursor],[latest] | No | group_cursor | The initial consumption pattern of consumers. | +| consumer_group | String | No | SeaTunnel-Consumer-Group | Sls consumer group id, used to distinguish different consumer groups. | +| auto_cursor_reset | CursorMode[begin],[end] | No | end | When there is no cursor in the consumer group, cursor initialization occurs | +| batch_size | Int | No | 1000 | The amount of data pulled from SLS each time | +| partition-discovery.interval-millis | Long | No | -1 | The interval for dynamically discovering topics and partitions. | + +## Task Example + +### Simple + +> This example reads the data of sls's logstore1 and prints it to the client.And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in Install SeaTunnel to install and deploy SeaTunnel. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. 
+ +[Create RAM user and authorization](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4),Please ensure thr ram user have sufficient rights to perform, reference [RAM Custom Authorization Example](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b) + +```hocon +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "STREAMING" + checkpoint.interval = 30000 +} + +source { + Sls { + endpoint = "cn-hangzhou-intranet.log.aliyuncs.com" + project = "project1" + logstore = "logstore1" + access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx" + access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + schema = { + fields = { + id = "int" + name = "string" + description = "string" + weight = "string" + } + } + } +} + +sink { + Console { + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Snowflake.md b/versioned_docs/version-2.3.7/connector-v2/source/Snowflake.md new file mode 100644 index 000000000000..c36c1cac1633 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Snowflake.md @@ -0,0 +1,153 @@ +# Snowflake + +> JDBC Snowflake Source Connector +> +> ## Support those engines +> +> Spark
    +> Flink
    +> SeaTunnel Zeta
    +> + ## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. +> + ## Description + +Read external data source data through JDBC. + +## Supported DataSource list + +| datasource | supported versions | driver | url | maven | +|------------|----------------------------------------------------------|-------------------------------------------|------------------------------------------------------------|-----------------------------------------------------------------------------| +| snowflake | Different dependency version has different driver class. | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | [Download](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc) | + +## Database dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
    +> For example Snowflake datasource: cp snowflake-connector-java-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ +> + ## Data Type Mapping + +| Snowflake Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------|---------------------| +| BOOLEAN | BOOLEAN | +| TINYINT
    SMALLINT
    BYTEINT
    | SHORT_TYPE | +| INT
    INTEGER
    | INT | +| BIGINT | LONG | +| DECIMAL
    NUMERIC
    NUMBER
    | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| REAL
    FLOAT4 | FLOAT | +| DOUBLE
    DOUBLE PRECISION
    FLOAT8
    FLOAT
    | DOUBLE | +| CHAR
    CHARACTER
    VARCHAR
    STRING
    TEXT
    VARIANT
    OBJECT | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
    TIMESTAMP
    TIMESTAMP_LTZ
    TIMESTAMP_NTZ
    TIMESTAMP_TZ | TIMESTAMP | +| BINARY
    VARBINARY | BYTES | +| GEOGRAPHY (WKB or EWKB)
    GEOMETRY (WKB or EWKB) | BYTES | +| GEOGRAPHY (GeoJSON, WKT or EWKT)
    GEOMETRY (GeoJSON, WKB or EWKB) | STRING | + +## Options + +| name | type | required | default | description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters, when properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. +> +> JDBC Driver Connection Parameters are supported in JDBC connection string. E.g, you can add `?GEOGRAPHY_OUTPUT_FORMAT='EWKT'` to specify the Geospatial Data Types. For more information about configurable parameters, and geospatial data types please visit Snowflake official [document](https://docs.snowflake.com/en/sql-reference/data-types-geospatial) + +## Task Example + +### simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. +> +> ``` +> # Defining the runtime environment +> env { +> parallelism = 2 +> job.mode = "BATCH" +> } +> source{ +> Jdbc { +> url = "jdbc:snowflake://.snowflakecomputing.com" +> driver = "net.snowflake.client.jdbc.SnowflakeDriver" +> connection_check_timeout_sec = 100 +> user = "root" +> password = "123456" +> query = "select * from type_bin limit 16" +> } +> } +> transform { +> # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, +> # please go to https://seatunnel.apache.org/docs/transform-v2/sql +> } +> sink { +> Console {} +> } +> ``` + +### parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table +> +> ``` +> Jdbc { +> url = "jdbc:snowflake://.snowflakecomputing.com" +> driver = "net.snowflake.client.jdbc.SnowflakeDriver" +> connection_check_timeout_sec = 100 +> user = "root" +> password = "123456" +> # Define query logic as required +> query = "select * from type_bin" +> # Parallel sharding reads fields +> partition_column = "id" +> # Number of fragments +> partition_num = 10 +> } +> ``` + +### parallel boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured +> +> ``` +> Jdbc { +> url = "jdbc:snowflake://.snowflakecomputing.com" +> driver = "net.snowflake.client.jdbc.SnowflakeDriver" +> connection_check_timeout_sec = 100 +> user = "root" +> password = "123456" +> # Define query logic as required +> query = "select * from type_bin" +> partition_column = "id" +> # Read start boundary +> partition_lower_bound = 1 +> # Read end boundary +> partition_upper_bound = 500 +> partition_num = 10 +> } +> ``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Socket.md b/versioned_docs/version-2.3.7/connector-v2/source/Socket.md new file mode 100644 index 000000000000..7d8eb3bb7f3a --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Socket.md @@ -0,0 +1,108 @@ +# Socket + +> Socket source connector + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Used to read data from Socket. + +## Data Type Mapping + +The File does not have a specific type list, and we can indicate which SeaTunnel data type the corresponding data needs to be converted to by specifying the Schema in the config. + +| SeaTunnel Data type | +|---------------------| +| STRING | +| SHORT | +| INT | +| BIGINT | +| BOOLEAN | +| DOUBLE | +| DECIMAL | +| FLOAT | +| DATE | +| TIME | +| TIMESTAMP | +| BYTES | +| ARRAY | +| MAP | + +## Options + +| Name | Type | Required | Default | Description | +|----------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------| +| host | String | Yes | _ | socket server host | +| port | Integer | Yes | _ | socket server port | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +## How to Create a Socket Data Synchronization Jobs + +* Configuring the SeaTunnel config file + +The following example demonstrates how to create a data synchronization job that reads data from Socket and prints it on the local client: + +```bash +# Set the basic configuration of the task to be performed +env { + parallelism = 1 + job.mode = "BATCH" +} + +# Create a source to connect to socket +source { + Socket { + host = "localhost" + port = 9999 + } +} + +# Console printing of the read socket data +sink { + Console { + parallelism = 1 + } +} +``` + +* Start a port listening + +```shell +nc -l 9999 +``` + +* Start a SeaTunnel task + +* Socket Source send test data + +```text +~ nc -l 9999 +test +hello +flink +spark +``` + +* Console Sink print data + +```text +[test] +[hello] +[flink] +[spark] +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/SqlServer-CDC.md b/versioned_docs/version-2.3.7/connector-v2/source/SqlServer-CDC.md new file mode 100644 index 000000000000..25d490ed0af8 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/SqlServer-CDC.md @@ -0,0 +1,229 @@ +# SQL Server CDC + +> Sql Server CDC source connector + +## Support SQL Server Version + +- server:2019 (Or later version for information only) + +## Support Those Engines + +> SeaTunnel Zeta
    +> Flink
    + +## Key Features + +- [ ] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +The Sql Server CDC connector allows for reading snapshot data and incremental data from SqlServer database. This document +describes how to setup the Sql Server CDC connector to run SQL queries against SqlServer databases. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|---------------------------------------------------------------|----------------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------| +| SqlServer |
  • server:2019 (Or later version for information only)
  • | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433;databaseName=column_type_test | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | + +## Using Dependency + +### Install Jdbc Driver + +#### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +#### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Data Type Mapping + +| SQLserver Data Type | SeaTunnel Data Type | +|----------------------------------------------------------------------|---------------------| +| CHAR
    VARCHAR
    NCHAR
    NVARCHAR
    TEXT
    NTEXT
    XML | STRING | +| BINARY
    VARBINARY
    IMAGE | BYTES | +| INTEGER
    INT | INT | +| SMALLINT
    TINYINT | SMALLINT | +| BIGINT | BIGINT | +| FLOAT(1~24)
    REAL | FLOAT | +| DOUBLE
    FLOAT(>24) | DOUBLE | +| NUMERIC(p,s)
    DECIMAL(p,s)
    MONEY
    SMALLMONEY | DECIMAL(p, s) | +| TIMESTAMP | BYTES | +| DATE | DATE | +| TIME(s) | TIME(s) | +| DATETIME(s)
    DATETIME2(s)
    DATETIMEOFFSET(s)
    SMALLDATETIME | TIMESTAMP(s) | +| BOOLEAN
    BIT
    | BOOLEAN | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| username | String | Yes | - | Name of the database to use when connecting to the database server. | +| password | String | Yes | - | Password to use when connecting to the database server. | +| database-names | List | Yes | - | Database name of the database to monitor. | +| table-names | List | Yes | - | Table name is a combination of schema name and table name (databaseName.schemaName.tableName). | +| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] | +| base-url | String | Yes | - | URL has to be with database, like "jdbc:sqlserver://localhost:1433;databaseName=test". | +| startup.mode | Enum | No | INITIAL | Optional startup mode for SqlServer CDC consumer, valid enumerations are "initial", "earliest", "latest" and "specific". | +| startup.timestamp | Long | No | - | Start from the specified epoch timestamp (in milliseconds).
    **Note, This option is required when the "startup.mode" option used `'timestamp'`.** | +| startup.specific-offset.file | String | No | - | Start from the specified binlog file name.
    **Note, This option is required when the "startup.mode" option used `'specific'`.** | +| startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position.
    **Note, This option is required when the "startup.mode" option used `'specific'`.** | +| stop.mode | Enum | No | NEVER | Optional stop mode for SqlServer CDC consumer, valid enumerations are "never". | +| stop.timestamp | Long | No | - | Stop from the specified epoch timestamp (in milliseconds).
    **Note, This option is required when the "stop.mode" option used `'timestamp'`.** | +| stop.specific-offset.file | String | No | - | Stop from the specified binlog file name.
    **Note, This option is required when the "stop.mode" option used `'specific'`.** | +| stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position.
    **Note, This option is required when the "stop.mode" option used `'specific'`.** | +| incremental.parallelism | Integer | No | 1 | The number of parallel readers in the incremental phase. | +| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshotof table. | +| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | +| server-time-zone | String | No | UTC | The session time zone in database server. | +| connect.timeout | Duration | No | 30s | The maximum time that the connector should wait after trying to connect to the database server before timing out. | +| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | +| connection.pool.size | Integer | No | 20 | The connection pool size. | +| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| sample-sharding.threshold | int | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| inverse-sampling.rate | int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| exactly_once | Boolean | No | false | Enable exactly once semantic. 
| +| debezium.* | config | No | - | Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from SqlServer server.
    See more about
    the [Debezium's SqlServer Connector properties](https://github.com/debezium/debezium/blob/1.6/documentation/modules/ROOT/pages/connectors/sqlserver.adoc#connector-properties) | +| format | Enum | No | DEFAULT | Optional output format for SqlServer CDC, valid enumerations are "DEFAULT"、"COMPATIBLE_DEBEZIUM_JSON". | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +### Enable Sql Server CDC + +1. Check whether the CDC Agent is enabled + +> EXEC xp_servicecontrol N'querystate', N'SQLServerAGENT';
    +> If the result is running, it is already enabled. Otherwise, you need to enable it manually + +2. Enable the CDC Agent + +> /opt/mssql/bin/mssql-conf setup + +3. The result is as follows + +> 1) Evaluation (free, no production use rights, 180-day limit) +> 2) Developer (free, no production use rights) +> 3) Express (free) +> 4) Web (PAID) +> 5) Standard (PAID) +> 6) Enterprise (PAID) +> 7) Enterprise Core (PAID) +> 8) I bought a license through a retail sales channel and have a product key to enter. + +4. Enable CDC at the database level + Run the statements below to enable CDC at the database level. Once CDC is enabled for a database, it can then be enabled for the tables in that database + +> USE TestDB; -- Replace with the actual database name
    +> EXEC sys.sp_cdc_enable_db;
    +> SELECT name, is_tracked_by_cdc FROM sys.tables WHERE name = 'table'; -- table Replace with the name of the table you want to check + +## Task Example + +### initiali read Simple + +> This is a stream mode cdc initializes read table data will be read incrementally after successful read The following sql DDL is for reference only + +``` +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + SqlServer-CDC { + result_table_name = "customers" + username = "sa" + password = "Y.sa123456" + startup.mode="initial" + database-names = ["column_type_test"] + table-names = ["column_type_test.dbo.full_types"] + base-url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + } +} + +transform { +} + +sink { + console { + source_table_name = "customers" + } +``` + +### increment read Simple + +> This is an incremental read that reads the changed data for printing + +``` +env { + # You can set engine configuration here + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + SqlServer-CDC { + # Set up accurate one read + exactly_once=true + result_table_name = "customers" + username = "sa" + password = "Y.sa123456" + startup.mode="latest" + database-names = ["column_type_test"] + table-names = ["column_type_test.dbo.full_types"] + base-url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + } +} + +transform { +} + +sink { + console { + source_table_name = "customers" + } +``` + +### Support custom primary key for table + +``` +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + SqlServer-CDC { + base-url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + username = "sa" + password = "Y.sa123456" + database-names = ["column_type_test"] + + table-names = ["column_type_test.dbo.simple_types", "column_type_test.dbo.full_types"] + table-names-config = [ + { + table = "column_type_test.dbo.full_types" + primaryKeys = ["id"] + } + ] + } +} + +sink { + console { + } +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/SqlServer.md b/versioned_docs/version-2.3.7/connector-v2/source/SqlServer.md new file mode 100644 index 000000000000..2443291f91ce --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/SqlServer.md @@ -0,0 +1,266 @@ +# SQL Server + +> JDBC SQL Server Source Connector + +## Support SQL Server Version + +- server:2008 (Or later version for information only) + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| datasource | supported versions | driver | url | maven | +|------------|-------------------------|----------------------------------------------|---------------------------------|-----------------------------------------------------------------------------------| +| SQL Server | support version >= 2008 | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | [Download](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) | + +## Database dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
    +> For example SQL Server datasource: cp mssql-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| SQLserver Data type | SeaTunnel Data type | +|----------------------------------------------------------------------|---------------------| +| BIT | BOOLEAN | +| TINYINT
    SMALLINT | SMALLINT | +| INTEGER
    INT | INT | +| BIGINT | BIGINT | +| NUMERIC(p,s)
    DECIMAL(p,s)
    MONEY
    SMALLMONEY | DECIMAL(p,s) | +| FLOAT(1~24)
    REAL | FLOAT | +| DOUBLE
    FLOAT(>24) | DOUBLE | +| CHAR
    NCHAR
    VARCHAR
    NTEXT
    NVARCHAR
    TEXT
    XML | STRING | +| DATE | DATE | +| TIME(s) | TIME(s) | +| DATETIME(s)
    DATETIME2(s)
    DATETIMEOFFSET(s)
    SMALLDATETIME | TIMESTAMP(s) | +| BINARY
    VARBINARY
    IMAGE | BYTES | + +## Source Options + +| name | type | required | default | Description | +|--------------------------------------------|--------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:sqlserver://127.0.0.1:1434;database=TestDB | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use SQLserver the value is `com.microsoft.sqlserver.jdbc.SQLServerDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type. | +| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters, when properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| table_path | Int | No | 0 | The full path of the table; you can use this configuration instead of `query`.
    examples:
    mysql: "testdb.table1"
    oracle: "test_schema.table1"
    sqlserver: "testdb.test_schema.table1"
    postgresql: "testdb.test_schema.table1" | +| table_list | Array | No | 0 | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | +| where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | +| split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | +| split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | +| split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | +| split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | +| split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Parallel Reader + +The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. + +**Split Key Rules:** + +1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. 
+2. If `partition_column` is null, SeaTunnel will read the schema from the table and get the Primary Key and Unique Index. If there is more than one column in the Primary Key and Unique Index, the first column that is in the **supported split data type** will be used to split data. For example, if the table has Primary Key(nn guid, name varchar), because `guid` is not in the **supported split data type**, the column `name` will be used to split data. + +**Supported split data type:** +* String +* Number(int, bigint, decimal, ...) +* Date + +### Options Related To Split + +#### split.size + +How many rows are in one split. Captured tables are split into multiple splits when the table is read. + +#### split.even-distribution.factor.lower-bound + +> Not recommended for use + +The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. + +#### split.even-distribution.factor.upper-bound + +> Not recommended for use + +The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. + +#### split.sample-sharding.threshold + +This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. + +#### split.inverse-sampling.rate + +The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. + +#### partition_column [string] + +The column name used to split data. + +#### partition_upper_bound [BigDecimal] + +The maximum value of partition_column for the scan. If not set, SeaTunnel will query the database to get the maximum value. + +#### partition_lower_bound [BigDecimal] + +The minimum value of partition_column for the scan. If not set, SeaTunnel will query the database to get the minimum value.
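+
+For example, a minimal sketch that combines `partition_column` with explicit bounds (the connection values mirror the task examples below; the bound values are illustrative only):
+
+```
+source {
+    Jdbc {
+        driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
+        url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test"
+        user = SA
+        password = "Y.sa123456"
+        query = "select * from full_types_jdbc"
+        # Column used to split the data into parallel reads
+        partition_column = "id"
+        # Read boundaries for partition_column; with both set, SeaTunnel does not query MIN(id)/MAX(id)
+        partition_lower_bound = 1
+        partition_upper_bound = 1000
+    }
+}
+```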
+ +#### partition_num [int] + +> Not recommended for use, The correct approach is to control the number of split through `split.size` + +How many splits do we need to split into, only support positive integer. default value is job parallelism. + +## tips + +> If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. +> +> Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. + +## Task Example + +### Simple: + +> Simple single task to read the data table + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} +source{ + Jdbc { + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + query = "select * from full_types_jdbc" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +env { + parallelism = 10 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + # Define query logic as required + query = "select * from full_types_jdbc" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} + +``` + +### Fragmented Parallel Read Simple: + +> It is a shard that reads data in parallel fast + +``` +env { + # You can set engine configuration here + parallelism = 10 +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + Jdbc { + driver = com.microsoft.sqlserver.jdbc.SQLServerDriver + url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" + user = SA + password = "Y.sa123456" + query = "select * from column_type_test.dbo.full_types_jdbc" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/source/Jdbc +} + + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/StarRocks.md b/versioned_docs/version-2.3.7/connector-v2/source/StarRocks.md new file mode 100644 index 000000000000..d46105cc9af1 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/StarRocks.md @@ -0,0 
+1,185 @@ +# StarRocks + +> StarRocks source connector + +## Description + +Read external data source data through StarRocks. +The internal implementation of StarRocks source connector is obtains the query plan from the frontend (FE), +delivers the query plan as a parameter to BE nodes, and then obtains data results from BE nodes. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [schema projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------------------|--------|----------|-------------------| +| node_urls | list | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| database | string | yes | - | +| table | string | yes | - | +| scan_filter | string | no | - | +| schema | config | yes | - | +| request_tablet_size | int | no | Integer.MAX_VALUE | +| scan_connect_timeout_ms | int | no | 30000 | +| scan_query_timeout_sec | int | no | 3600 | +| scan_keep_alive_min | int | no | 10 | +| scan_batch_rows | int | no | 1024 | +| scan_mem_limit | long | no | 2147483648 | +| max_retries | int | no | 3 | +| scan.params.* | string | no | - | + +### node_urls [list] + +`StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` + +### username [string] + +`StarRocks` user username + +### password [string] + +`StarRocks` user password + +### database [string] + +The name of StarRocks database + +### table [string] + +The name of StarRocks table + +### scan_filter [string] + +Filter expression of the query, which is transparently transmitted to StarRocks. StarRocks uses this expression to complete source-side data filtering. + +e.g. + +``` +"tinyint_1 = 100" +``` + +### schema [config] + +#### fields [Config] + +The schema of the starRocks that you want to generate + +e.g. + +``` +schema { + fields { + name = string + age = int + } + } +``` + +### request_tablet_size [int] + +The number of StarRocks Tablets corresponding to an Partition. The smaller this value is set, the more partitions will be generated. This will increase the parallelism on the engine side, but at the same time will cause greater pressure on StarRocks. + +The following is an example to explain how to use request_tablet_size to controls the generation of partitions + +``` +the tablet distribution of StarRocks table in cluster as follower + +be_node_1 tablet[1, 2, 3, 4, 5] +be_node_2 tablet[6, 7, 8, 9, 10] +be_node_3 tablet[11, 12, 13, 14, 15] + +1.If not set request_tablet_size, there will no limit on the number of tablets in a single partition. The partitions will be generated as follows + +partition[0] read data of tablet[1, 2, 3, 4, 5] from be_node_1 +partition[1] read data of tablet[6, 7, 8, 9, 10] from be_node_2 +partition[2] read data of tablet[11, 12, 13, 14, 15] from be_node_3 + +2.if set request_tablet_size=3, the limit on the number of tablets in a single partition is 3. 
The partitions will be generated as follows + +partition[0] read data of tablet[1, 2, 3] from be_node_1 +partition[1] read data of tablet[4, 5] from be_node_1 +partition[2] read data of tablet[6, 7, 8] from be_node_2 +partition[3] read data of tablet[9, 10] from be_node_2 +partition[4] read data of tablet[11, 12, 13] from be_node_3 +partition[5] read data of tablet[14, 15] from be_node_3 +``` + +### scan_connect_timeout_ms [int] + +requests connection timeout sent to StarRocks + +### scan_query_timeout_sec [int] + +Query the timeout time of StarRocks, the default value is 1 hour, -1 means no timeout limit + +### scan_keep_alive_min [int] + +The keep-alive duration of the query task, in minutes. The default value is 10. we recommend that you set this parameter to a value greater than or equal to 5. + +### scan_batch_rows [int] + +The maximum number of data rows to read from BE at a time. Increasing this value reduces the number of connections established between engine and StarRocks and therefore mitigates overhead caused by network latency. + +### scan_mem_limit [long] + +The maximum memory space allowed for a single query in the BE node, in bytes. The default value is 2147483648 (2 GB). + +### max_retries [int] + +number of retry requests sent to StarRocks + +### scan.params. [string] + +The parameter of the scan data from be + +## Example + +``` +source { + StarRocks { + nodeUrls = ["starrocks_e2e:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_source" + scan_batch_rows = 10 + max_retries = 3 + schema { + fields { + BIGINT_COL = BIGINT + LARGEINT_COL = STRING + SMALLINT_COL = SMALLINT + TINYINT_COL = TINYINT + BOOLEAN_COL = BOOLEAN + DECIMAL_COL = "DECIMAL(20, 1)" + DOUBLE_COL = DOUBLE + FLOAT_COL = FLOAT + INT_COL = INT + CHAR_COL = STRING + VARCHAR_11_COL = STRING + STRING_COL = STRING + DATETIME_COL = TIMESTAMP + DATE_COL = DATE + } + } + scan.params.scanner_thread_pool_thread_num = "3" + + } +} +``` + +## Changelog + +### next version + +- Add StarRocks Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/TDengine.md b/versioned_docs/version-2.3.7/connector-v2/source/TDengine.md new file mode 100644 index 000000000000..a24744d5c176 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/TDengine.md @@ -0,0 +1,85 @@ +# TDengine + +> TDengine source connector + +## Description + +Read external data source data through TDengine. + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) + +supports query SQL and can achieve projection effect. + +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Options + +| name | type | required | default value | +|-------------|--------|----------|---------------| +| url | string | yes | - | +| username | string | yes | - | +| password | string | yes | - | +| database | string | yes | | +| stable | string | yes | - | +| lower_bound | long | yes | - | +| upper_bound | long | yes | - | + +### url [string] + +the url of the TDengine when you select the TDengine + +e.g. 
+ +``` +jdbc:TAOS-RS://localhost:6041/ +``` + +### username [string] + +The username used to connect to TDengine + +### password [string] + +The password used to connect to TDengine + +### database [string] + +The name of the TDengine database to read from + +### stable [string] + +The name of the TDengine super table (stable) to read from + +### lower_bound [long] + +The lower bound of the migration period, i.e. the start of the time range to read + +### upper_bound [long] + +The upper bound of the migration period, i.e. the end of the time range to read + +## Example + +### source + +```hocon +source { + TDengine { + url : "jdbc:TAOS-RS://localhost:6041/" + username : "root" + password : "taosdata" + database : "power" + stable : "meters" + lower_bound : "2018-10-03 14:38:05.000" + upper_bound : "2018-10-03 14:38:16.800" + result_table_name = "tdengine_result" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Vertica.md b/versioned_docs/version-2.3.7/connector-v2/source/Vertica.md new file mode 100644 index 000000000000..1d8a83faa109 --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Vertica.md @@ -0,0 +1,162 @@ +# Vertica + +> JDBC Vertica Source Connector + +## Description + +Read external data source data through JDBC. + +## Support Those Engines + +> Spark
    +> Flink
    +> SeaTunnel Zeta
    + +## Using Dependency + +### For Spark/Flink Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. + +### For SeaTunnel Zeta Engine + +> 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/lib/`. + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------| +| Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) | + +## Data Type Mapping + +| Vertical Data Type | SeaTunnel Data Type | +|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT | BOOLEAN | +| TINYINT
    TINYINT UNSIGNED
    SMALLINT
    SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | +| INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | LONG | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the decimal point.)) | +| FLOAT
    FLOAT UNSIGNED | FLOAT | +| DOUBLE
    DOUBLE UNSIGNED | DOUBLE | +| CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
    TIMESTAMP | TIMESTAMP | +| TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
VARBINARY
    BIT(n) | BYTES | +| GEOMETRY
    UNKNOWN | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use Vertica the value is `com.vertica.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | +| properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from type_bin limit 16" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +source { + Jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/Web3j.md b/versioned_docs/version-2.3.7/connector-v2/source/Web3j.md new file mode 100644 index 000000000000..6e50789b419e --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/Web3j.md @@ -0,0 +1,61 @@ +# Web3j + +> Web3j source connector + +## Support Those Engines + +> Spark
    +> Flink
+> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Source connector for web3j. It is used to read data from the blockchain, such as block information, transactions, smart contract events, etc. Currently, it supports reading block height data. + +## Source Options + +| Name | Type | Required | Default | Description | +|------|--------|----------|---------|---------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | When using Infura as the service provider, the URL is used for communication with the Ethereum network. | + +## How to Create a Http Data Synchronization Jobs + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + Web3j { + url = "https://mainnet.infura.io/v3/xxxxx" + } +} + +# Console printing of the read Http data +sink { + Console { + parallelism = 1 + } +} +``` + +Then you will get the following data: + +```json +{"blockNumber":19525949,"timestamp":"2024-03-27T13:28:45.605Z"} +``` + +## Changelog + +- Add Web3j Source Connector + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/common-options.md b/versioned_docs/version-2.3.7/connector-v2/source/common-options.md new file mode 100644 index 000000000000..079f40663a3f --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/common-options.md @@ -0,0 +1,81 @@ +# Source Common Options + +> Common parameters of source connectors + +| Name | Type | Required | Default | Description | +|-------------------|--------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| result_table_name | String | No | - | When `result_table_name` is not specified, the data processed by this plugin will not be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)`
    When `result_table_name` is specified, the data processed by this plugin will be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` . The data set `(dataStream/dataset)` registered here can be directly accessed by other plugins by specifying `source_table_name` . | +| parallelism | Int | No | - | When `parallelism` is not specified, the `parallelism` in env is used by default.
    When parallelism is specified, it will override the parallelism in env. | + +# Important note + +When the job configuration `result_table_name` you must set the `source_table_name` parameter + +## Task Example + +### Simple: + +> This registers a stream or batch data source and returns the table name `fake_table` at registration + +```bash +source { + FakeSourceStream { + result_table_name = "fake_table" + } +} +``` + +### Multiple Pipeline Simple + +> This is to convert the data source fake and write it to two different sinks + +```bash +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + age = "int" + c_timestamp = "timestamp" + c_date = "date" + c_map = "map" + c_array = "array" + c_decimal = "decimal(30, 8)" + c_row = { + c_row = { + c_int = int + } + } + } + } + } +} + +transform { + Sql { + source_table_name = "fake" + result_table_name = "fake1" + # the query table name must same as field 'source_table_name' + query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from fake" + } + # The SQL transform support base function and criteria operation + # But the complex SQL unsupported yet, include: multi source table/rows JOIN and AGGREGATE operation and the like +} + +sink { + Console { + source_table_name = "fake1" + } + Console { + source_table_name = "fake" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/connector-v2/source/kafka.md b/versioned_docs/version-2.3.7/connector-v2/source/kafka.md new file mode 100644 index 000000000000..c42c84a8a5af --- /dev/null +++ b/versioned_docs/version-2.3.7/connector-v2/source/kafka.md @@ -0,0 +1,244 @@ +# Kafka + +> Kafka source connector + +## Support Those Engines + +> Spark
    +> Flink
+> SeaTunnel Zeta
    + +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [x] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Source connector for Apache Kafka. + +## Supported DataSource Info + +In order to use the Kafka connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Maven | +|------------|--------------------|-------------------------------------------------------------------------------------------------------------| +| Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------------------|-----------------------------------------------------------------------------|----------|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | String | Yes | - | Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by comma like 'topic-1,topic-2'. | +| table_list | Map | No | - | Topic list config You can configure only one `table_list` and one `topic` at the same time | +| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. | +| pattern | Boolean | No | false | If `pattern` is set to `true`,the regular expression for a pattern of topic names to read from. All topics in clients with names that match the specified regular expression will be subscribed by the consumer. | +| consumer.group | String | No | SeaTunnel-Consumer-Group | `Kafka consumer group id`, used to distinguish different consumer groups. | +| commit_on_checkpoint | Boolean | No | true | If true the consumer's offset will be periodically committed in the background. | +| kafka.config | Map | No | - | In addition to the above necessary parameters that must be specified by the `Kafka consumer` client, users can also specify multiple `consumer` client non-mandatory parameters, covering [all consumer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#consumerconfigs). | +| schema | Config | No | - | The structure of the data, including field names and field types. | +| format | String | No | json | Data format. The default format is json. Optional text format, canal_json, debezium_json, ogg_json and avro.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. 
| +| format_error_handle_way | String | No | fail | The processing method of data format error. The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. | +| start_mode | StartMode[earliest],[group_offsets],[latest],[specific_offsets],[timestamp] | No | group_offsets | The initial consumption pattern of consumers. | +| start_mode.offsets | Config | No | - | The offset required for consumption mode to be specific_offsets. | +| start_mode.timestamp | Long | No | - | The time required for consumption mode to be "timestamp". | +| partition-discovery.interval-millis | Long | No | -1 | The interval for dynamically discovering topics and partitions. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example + +### Simple + +> This example reads the data of kafka's topic_1, topic_2, topic_3 and prints it to the client.And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in Install SeaTunnel to install and deploy SeaTunnel. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +```hocon +# Defining the runtime environment +env { + parallelism = 2 + job.mode = "BATCH" +} +source { + Kafka { + schema = { + fields { + name = "string" + age = "int" + } + } + format = text + field_delimiter = "#" + topic = "topic_1,topic_2,topic_3" + bootstrap.servers = "localhost:9092" + kafka.config = { + client.id = client_1 + max.poll.records = 500 + auto.offset.reset = "earliest" + enable.auto.commit = "false" + } + } +} +sink { + Console {} +} +``` + +### Regex Topic + +```hocon +source { + Kafka { + topic = ".*seatunnel*." + pattern = "true" + bootstrap.servers = "localhost:9092" + consumer.group = "seatunnel_group" + } +} +``` + +### AWS MSK SASL/SCRAM + +Replace the following `${username}` and `${password}` with the configuration values in AWS MSK. + +```hocon +source { + Kafka { + topic = "seatunnel" + bootstrap.servers = "xx.amazonaws.com.cn:9096,xxx.amazonaws.com.cn:9096,xxxx.amazonaws.com.cn:9096" + consumer.group = "seatunnel_group" + kafka.config = { + security.protocol=SASL_SSL + sasl.mechanism=SCRAM-SHA-512 + sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" + #security.protocol=SASL_SSL + #sasl.mechanism=AWS_MSK_IAM + #sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" + #sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" + } + } +} +``` + +### AWS MSK IAM + +Download `aws-msk-iam-auth-1.1.5.jar` from https://github.com/aws/aws-msk-iam-auth/releases and put it in `$SEATUNNEL_HOME/plugin/kafka/lib` dir. + +Please ensure the IAM policy have `"kafka-cluster:Connect",`. 
Like this: + +```hocon +"Effect": "Allow", +"Action": [ + "kafka-cluster:Connect", + "kafka-cluster:AlterCluster", + "kafka-cluster:DescribeCluster" +], +``` + +Source Config + +```hocon +source { + Kafka { + topic = "seatunnel" + bootstrap.servers = "xx.amazonaws.com.cn:9098,xxx.amazonaws.com.cn:9098,xxxx.amazonaws.com.cn:9098" + consumer.group = "seatunnel_group" + kafka.config = { + #security.protocol=SASL_SSL + #sasl.mechanism=SCRAM-SHA-512 + #sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" + security.protocol=SASL_SSL + sasl.mechanism=AWS_MSK_IAM + sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" + sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" + } + } +} +``` + +### Kerberos Authentication Example + +Source Config + +``` +source { + Kafka { + topic = "seatunnel" + bootstrap.servers = "127.0.0.1:9092" + consumer.group = "seatunnel_group" + kafka.config = { + security.protocol=SASL_PLAINTEXT + sasl.kerberos.service.name=kafka + sasl.mechanism=GSSAPI + java.security.krb5.conf="/etc/krb5.conf" + sasl.jaas.config="com.sun.security.auth.module.Krb5LoginModule required \n useKeyTab=true \n storeKey=true \n keyTab=\"/path/to/xxx.keytab\" \n principal=\"user@xxx.com\";" + } + } +} +``` + +### Multiple Kafka Source + +> This is written to the same pg table according to different formats and topics of parsing kafka Perform upsert operations based on the id + +```hocon + +env { + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "kafka_e2e:9092" + table_list = [ + { + topic = "^test-ogg-sou.*" + pattern = "true" + consumer.group = "ogg_multi_group" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = ogg_json + }, + { + topic = "test-cdc_mds" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = canal_json + } + ] + } +} + +sink { + Jdbc { + driver = org.postgresql.Driver + url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" + user = test + password = test + generate_sink_sql = true + database = test + table = public.sink + primary_keys = ["id"] + } +} +``` + diff --git a/versioned_docs/version-2.3.7/contribution/coding-guide.md b/versioned_docs/version-2.3.7/contribution/coding-guide.md new file mode 100644 index 000000000000..9995c16854e4 --- /dev/null +++ b/versioned_docs/version-2.3.7/contribution/coding-guide.md @@ -0,0 +1,111 @@ +# Coding Guide + +This guide documents an overview of the current Apache SeaTunnel modules and best practices on how to submit a high quality pull request to Apache SeaTunnel. 
+ +## Modules Overview + +| Module Name | Introduction | +|----------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| +| seatunnel-api | SeaTunnel connector V2 API module | +| seatunnel-common | SeaTunnel common module | +| seatunnel-connectors-v2 | SeaTunnel connector V2 module, currently connector V2 is under development and the community will focus on it | +| seatunnel-core/seatunnel-spark-starter | SeaTunnel core starter module of connector V2 on Spark engine | +| seatunnel-core/seatunnel-flink-starter | SeaTunnel core starter module of connector V2 on Flink engine | +| seatunnel-core/seatunnel-starter | SeaTunnel core starter module of connector V2 on SeaTunnel engine | +| seatunnel-e2e | SeaTunnel end-to-end test module | +| seatunnel-examples | SeaTunnel local examples module, developer can use it to do unit test and integration test | +| seatunnel-engine | SeaTunnel engine module, seatunnel-engine is a new computational engine developed by the SeaTunnel Community that focuses on data synchronization. | +| seatunnel-formats | SeaTunnel formats module, used to offer the ability of formatting data | +| seatunnel-plugin-discovery | SeaTunnel plugin discovery module, used to offer the ability of loading SPI plugins from classpath | +| seatunnel-transforms-v2 | SeaTunnel transform V2 module, currently transform V2 is under development and the community will focus on it | +| seatunnel-translation | SeaTunnel translation module, used to adapt Connector V2 and other computing engines such as Spark, Flink etc... | + +## How To Submit A High Quality Pull Request + +1. Create entity classes using annotations in the `lombok` plugin (`@Data` `@Getter` `@Setter` `@NonNull` etc...) to reduce the amount of code. It's a good practice to prioritize the use of lombok plugins in your coding process. + +2. If you need to use log4j to print logs in a class, preferably use the annotation `@Slf4j` in the `lombok` plugin. + +3. SeaTunnel uses issue to track logical issues, including bugs and improvements, and uses Github's pull requests to manage the review and merge of specific code changes. So making a clear issue or pull request helps the community better understand the developer's intent. The best practice of creating issue or pull request is as the following shown: + + > [purpose] [module name] [sub-module name] Description + + 1. Pull request purpose includes: `Hotfix`, `Feature`, `Improve`, `Docs`, `WIP`. Note that if your pull request's purpose is `WIP`, then you need to use github's draft pull request + 2. Issue purpose includes: `Feature`, `Bug`, `Docs`, `Discuss` + 3. Module name: the current pull request or issue involves the name of the module, for example: `Core`, `Connector-V2`, `Connector-V1`, etc. + 4. Sub-module name: the current pull request or issue involves the name of the sub-module, for example:`File` `Redis` `Hbase` etc. + 5. Description: provide a brief, clear summary of the current pull request and issue's main goals and aim for a title that conveys the core purpose at a glance. + + Tips:**For more details, you can refer to [Issue Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#issue) and [Pull Request Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#pull-request)** + +4. Code segments are never repeated. 
If a code segment is used multiple times, define it multiple times is not a good option, make it a public segment for other modules to use is a best practice. + +5. When throwing an exception, throw it along with a hint message and the exception should be smaller in scope. Throwing overly broad exceptions promotes complex error handling code that is more likely to contain security vulnerabilities. For example, if your connector encounters an `IOException` while reading data, a reasonable approach would be to the following: + + ```java + try { + // read logic + } catch (IOException e) { + throw SeaTunnelORCFormatException("This orc file is corrupted, please check it", e); + } + ``` + +6. The Apache project has very strict licensing requirements, so every file in an Apache project should contain a license statement. Check that each new file you add contains the `Apache License Header` before submitting pull request: + + ```java + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + ``` + +7. Apache SeaTunnel uses `Spotless` for code style and formatting checks. You could run the following command and `Spotless` will automatically fix the code style and formatting errors for you: + + ```shell + ./mvnw spotless:apply + ``` + +8. Before you submit your pull request, make sure the project will compile properly after adding your code, you can use the following commands to package the whole project: + + ```shell + # multi threads compile + ./mvnw -T 1C clean package + ``` + + ```shell + # single thread compile + ./mvnw clean package + ``` + +9. Before submitting pull request, do a full unit test and integration test locally can better verify the functionality of your code, best practice is to use the `seatunnel-examples` module's ability to self-test to ensure that the multi-engine is running properly and the results are correct. + +10. If you submit a pull request with a feature that requires updated documentation, always remember to update the documentation. + +11. Submit the pull request of connector type can write e2e test to ensure the robustness and robustness of the code, e2e test should include the full data type, and e2e test as little as possible to initialize the docker image, write the test cases of sink and source together to reduce the loss of resources, while using asynchronous features to ensure the stability of the test. A good example can be found at: [MongodbIT.java](https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java) + +12. The priority of property permission in the class is set to `private`, and mutability is set to `final`, which can be changed reasonably if special circumstances are encountered. 
+ +13. The properties in the class and method parameters prefer to use the base type(int boolean double float...), not recommended to use the wrapper type(Integer Boolean Double Float...), if encounter special circumstances reasonable change. + +14. When developing a sink connector you need to be aware that the sink will be serialized, and if some properties cannot be serialized, encapsulate the properties into classes and use the singleton pattern. + +15. If there are multiple `if` process judgments in the code flow, try to simplify the flow to multiple ifs instead of if-else-if. + +16. Pull request has the characteristic of single responsibility, not allowed to include irrelevant code of the feature in pull request, once this situation deal with their own branch before submitting pull request, otherwise the Apache SeaTunnel community will actively close pull request. + +17. Contributors should be responsible for their own pull request. If your pull request contains new features or modifies old features, add test cases or e2e tests to prove the reasonableness and functional integrity of your pull request is a good practice. + +18. If you think which part of the community's current code is unreasonable (especially the core `core` module and the `api` module), the function needs to be updated or modified, the first thing to do is to propose a `discuss issue` or `email` with the community to discuss the need to modify this part of the function, if the community agrees to submit pull request again, do not submit the issue and pull request directly without discussion, so the community will directly consider this pull request is useless, and will be closed down. + diff --git a/versioned_docs/version-2.3.7/contribution/contribute-plugin.md b/versioned_docs/version-2.3.7/contribution/contribute-plugin.md new file mode 100644 index 000000000000..17275e35f0ba --- /dev/null +++ b/versioned_docs/version-2.3.7/contribution/contribute-plugin.md @@ -0,0 +1,5 @@ +# Contribute Connector-v2 Plugins + +If you want to contribute Connector-V2, please click the Connector-V2 Contribution Guide below for reference. It can help you enter development more quickly. + +[Connector-v2 Contribution Guide](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md) diff --git a/versioned_docs/version-2.3.7/contribution/contribute-transform-v2-guide.md b/versioned_docs/version-2.3.7/contribution/contribute-transform-v2-guide.md new file mode 100644 index 000000000000..37837f9eeb6e --- /dev/null +++ b/versioned_docs/version-2.3.7/contribution/contribute-transform-v2-guide.md @@ -0,0 +1,329 @@ +# Contribute Transform Guide + +This document describes how to understand, develop and contribute a transform. + +We also provide the [Transform E2E Test](../../../seatunnel-e2e/seatunnel-transforms-v2-e2e) +to verify the data input and output by the transform. + +## Concepts + +Using SeaTunnel you can read or write data through the connector, but if you need to +process your data after reading or before writing, then need to use transform. + +Use transform to make simple edits to your data rows or fields, such as split field, +change field values, add or remove field. + +### DataType Transform + +Transform receives datatype input from upstream(source or transform) and outputs new datatype to +downstream(sink or transform), this process is datatype transform. 
+ +Example 1:Remove fields + +```shell +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + +| A | B | +|-----------|-----------| +| STRING | INT | +``` + +Example 2:Sort fields + +```shell +| B | C | A | +|-----------|-----------|-----------| +| INT | BOOLEAN | STRING | + +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | +``` + +Example 3:Update fields datatype + +```shell +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + + +| A | B | C | +|-----------|-----------|-----------| +| STRING | STRING | STRING | +``` + +Example 4:Add new fields + +```shell +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + + +| A | B | C | D | +|-----------|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | DOUBLE | +``` + +### Data Transform + +After datatype transformed, Transform will receive data-row input from upstream(source or transform), +edit into data-row with [New Datatype](#DataType transform) and output to downstream (sink or transform). +This process is called data transform. + +### Translation + +Transform is decoupled from the execution engine, any transform implement can run into all engines +without changing the code & config, which requires the translation layer to adapt transform and execution engine. + +Example:Translation datatype & data + +```shell +Original: + +| A | B | C | +|-----------|-----------|-----------| +| STRING | INT | BOOLEAN | + +Datatype translation: + +| A | B | C | +|-------------------|-------------------|-------------------| +| ENGINE | ENGINE | ENGINE | + +Data translation: + +| A | B | C | +|-------------------|-------------------|-------------------| +| ENGINE<"test"> | ENGINE<1> | ENGINE | +``` + +## Core APIs + +### SeaTunnelTransform + +`SeaTunnelTransform` provides all major and primary APIs, you can subclass it to do whatever transform. + +1. Receive datatype input from upstream. + +```java +/** + * Set the data type info of input data. + * + * @param inputDataType The data type info of upstream input. + */ + void setTypeInfo(SeaTunnelDataType inputDataType); +``` + +2. Outputs new datatype to downstream. + +```java +/** + * Get the data type of the records produced by this transform. + * + * @return Produced data type. + */ +SeaTunnelDataType getProducedType(); +``` + +3. Edit input data and outputs new data to downstream. + +```java +/** + * Transform input data to {@link this#getProducedType()} types data. + * + * @param row the data need be transform. + * @return transformed data. + */ +T map(T row); +``` + +### SingleFieldOutputTransform + +`SingleFieldOutputTransform` abstract single field change operator + +1. Define output field + +```java +/** + * Outputs new field + * + * @return + */ +protected abstract String getOutputFieldName(); +``` + +2. Define output field datatype + +```java +/** + * Outputs new field datatype + * + * @return + */ +protected abstract SeaTunnelDataType getOutputFieldDataType(); +``` + +3. Define output field value + +```java +/** + * Outputs new field value + * + * @param inputRow The inputRow of upstream input. + * @return + */ +protected abstract Object getOutputFieldValue(SeaTunnelRowAccessor inputRow); +``` + +### MultipleFieldOutputTransform + +`MultipleFieldOutputTransform` abstract multiple fields change operator + +1. Define output fields + +```java +/** + * Outputs new fields + * + * @return + */ +protected abstract String[] getOutputFieldNames(); +``` + +2. 
Define output fields datatype + +```java +/** + * Outputs new fields datatype + * + * @return + */ +protected abstract SeaTunnelDataType[] getOutputFieldDataTypes(); +``` + +3. Define output field values + +```java +/** + * Outputs new fields value + * + * @param inputRow The inputRow of upstream input. + * @return + */ +protected abstract Object[] getOutputFieldValues(SeaTunnelRowAccessor inputRow); +``` + +### AbstractSeaTunnelTransform + +`AbstractSeaTunnelTransform` abstract datatype & fields change operator + +1. Transform input row type and outputs new row type + +```java +/** + * Outputs transformed row type. + * + * @param inputRowType upstream input row type + * @return + */ +protected abstract SeaTunnelRowType transformRowType(SeaTunnelRowType inputRowType); +``` + +2. Transform input row data and outputs new row data + +```java +/** + * Outputs transformed row data. + * + * @param inputRow upstream input row data + * @return + */ +protected abstract SeaTunnelRow transformRow(SeaTunnelRow inputRow); +``` + +## Develop A Transform + +It must implement one of the following APIs: +- SeaTunnelTransform +- AbstractSeaTunnelTransform +- SingleFieldOutputTransform +- MultipleFieldOutputTransform + +Add implement subclass into module `seatunnel-transforms-v2`. + +### Example: copy field to new field + +```java +@AutoService(SeaTunnelTransform.class) +public class CopyFieldTransform extends SingleFieldOutputTransform { + + private String srcField; + private int srcFieldIndex; + private SeaTunnelDataType srcFieldDataType; + private String destField; + + @Override + public String getPluginName() { + return "Copy"; + } + + @Override + protected void setConfig(Config pluginConfig) { + this.srcField = pluginConfig.getString("src_field"); + this.destField = pluginConfig.getString("dest_fields"); + } + + @Override + protected void setInputRowType(SeaTunnelRowType inputRowType) { + srcFieldIndex = inputRowType.indexOf(srcField); + srcFieldDataType = inputRowType.getFieldType(srcFieldIndex); + } + + @Override + protected String getOutputFieldName() { + return destField; + } + + @Override + protected SeaTunnelDataType getOutputFieldDataType() { + return srcFieldDataType; + } + + @Override + protected Object getOutputFieldValue(SeaTunnelRowAccessor inputRow) { + return inputRow.getField(srcFieldIndex); + } +} +``` + +1. The `getPluginName` method is used to identify the transform name. +2. The @AutoService is used to generate the `META-INF/services/org.apache.seatunnel.api.transform.SeaTunnelTransform` + file automatically. +3. The `setConfig` method is used to inject user configs. + +## Transform Test Tool + +Once you add a new plugin, it is recommended to add e2e tests for it. +We have a `seatunnel-e2e/seatunnel-transforms-v2-e2e` module to help you to do this. + +For example, if you want to add an e2e test for `CopyFieldTransform`, you can create a new test in +`seatunnel-e2e/seatunnel-transforms-v2-e2e` module and extend the `TestSuiteBase` class in the test. 
+ +```java +public class TestCopyFieldTransformIT extends TestSuiteBase { + + @TestTemplate + public void testCopyFieldTransform(TestContainer container) { + Container.ExecResult execResult = container.executeJob("/copy_transform.conf"); + Assertions.assertEquals(0, execResult.getExitCode()); + } +} +``` + +Once your testcase implements the `TestSuiteBase` interface and use `@TestTemplate` annotation startup, +it will run job to all engines, and you just need to execute the executeJob method with your SeaTunnel configuration file, +it will submit the SeaTunnel job. diff --git a/versioned_docs/version-2.3.7/contribution/new-license.md b/versioned_docs/version-2.3.7/contribution/new-license.md new file mode 100644 index 000000000000..631b00404b42 --- /dev/null +++ b/versioned_docs/version-2.3.7/contribution/new-license.md @@ -0,0 +1,53 @@ +# How To Add New License + +### ASF 3RD PARTY LICENSE POLICY + +You have to pay attention to the following open-source software protocols which Apache projects support when you intend to add a new feature to the SeaTunnel (or other Apache projects), which functions refers to other open-source software references. + +[ASF 3RD PARTY LICENSE POLICY](https://apache.org/legal/resolved.html) + +If the 3rd party software is not present at the above policy, we wouldn't accept your code. + +### How to Legally Use 3rd Party Open-source Software In The SeaTunnel + +Moreover, when we intend to refer a new software ( not limited to 3rd party jar, text, CSS, js, pics, icons, audios etc and modifications based on 3rd party files) to our project, we need to use them legally in addition to the permission of ASF. Refer to the following article: + +* [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) + +For example, we should contain the NOTICE file (most of open-source project has NOTICE file, generally under root directory) of ZooKeeper in our project when we are using ZooKeeper. As the Apache explains, "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work. + +We are not going to dive into every 3rd party open-source license policy in here, you may look up them if interested. + +### SeaTunnel-License Check Rules + +In general, we would have our License-check scripts to our project. SeaTunnel-License-Check is provided by [SkyWalking](https://github.com/apache/skywalking) which differ a bit from other open-source projects. All in all, we are trying to make sure avoiding the license issues at the first time. + +We need to follow the following steps when we need to add new jars or external resources: + +* Add the name and the version of the jar file in the known-dependencies.txt +* Add relevant maven repository address under 'seatunnel-dist/release-docs/LICENSE' directory +* Append relevant NOTICE files under 'seatunnel-dist/release-docs/NOTICE' directory and make sure they are no different to the original repository +* Add relevant source code protocols under 'seatunnel-dist/release-docs/licenses' directory and the file name should be named as license+filename.txt. 
  e.g. `license-zk.txt`.
* If the check still fails with `check dependency license fail`, the CI output will look like this:

```
--- /dev/fd/63 2020-12-03 03:08:57.191579482 +0000
+++ /dev/fd/62 2020-12-03 03:08:57.191579482 +0000
@@ -1,0 +2 @@
+HikariCP-java6-2.3.13.jar
@@ -16,0 +18 @@
+c3p0-0.9.5.2.jar
@@ -149,0 +152 @@
+mchange-commons-java-0.2.11.jar

- commons-lang-2.1.3.jar
Error: Process completed with exit code 1.
```

Generally speaking, adding a jar is often not the end of the work, because it usually depends on various other jars, and we also need to add the corresponding licenses for those jars. In that case, the check reports a `check dependency license fail` error. In the output above, the license declarations for `HikariCP-java6-2.3.13`, `c3p0`, and others are missing (`+` means an entry to add, `-` means an entry to delete); follow the steps above to add the licenses for these jars as well.

### References

* [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html)
* [ASF 3RD PARTY LICENSE POLICY](https://apache.org/legal/resolved.html)

diff --git a/versioned_docs/version-2.3.7/contribution/setup.md b/versioned_docs/version-2.3.7/contribution/setup.md
new file mode 100644
index 000000000000..b2579e1ee1e4
--- /dev/null
+++ b/versioned_docs/version-2.3.7/contribution/setup.md
@@ -0,0 +1,127 @@
# Set Up Develop Environment

In this section, we are going to show you how to set up your development environment for SeaTunnel and then run a simple
example in your JetBrains IntelliJ IDEA.

> You can develop or test SeaTunnel code in any development environment that you like, but here we use
> [JetBrains IDEA](https://www.jetbrains.com/idea/) as an example to walk you through it step by step.

## Prepare

Before we start talking about how to set up the environment, we need to do some preparation work. Make sure you already
have installed the following software:

* [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed.
* [Java](https://www.java.com/en/download/) (JDK8/JDK11 are supported for now) installed and `JAVA_HOME` set.
* [Scala](https://www.scala-lang.org/download/2.11.12.html) (only Scala 2.11.12 is supported for now) installed.
* [JetBrains IDEA](https://www.jetbrains.com/idea/) installed.

## Set Up

### Clone the Source Code

First of all, you need to clone the SeaTunnel source code from [GitHub](https://github.com/apache/seatunnel).

```shell
git clone git@github.com:apache/seatunnel.git
```

### Install Subproject Locally

After cloning the source code, you should run the `./mvnw` command to install the subprojects to the local Maven repository.
Otherwise, your code will not start correctly in JetBrains IntelliJ IDEA.

```shell
./mvnw install -Dmaven.test.skip
```

### Building SeaTunnel From Source

After you have installed Maven, you can use the following command to compile and package.

```shell
mvn clean package -pl seatunnel-dist -am -Dmaven.test.skip=true
```

### Building Sub Module

If you want to build a submodule separately, you can use the following command to compile and package.

```shell
# This is an example of building the Redis connector separately

mvn clean package -pl seatunnel-connectors-v2/connector-redis -am -DskipTests -T 1C
```

### Install JetBrains IDEA Scala Plugin

Now, you can open your JetBrains IntelliJ IDEA and explore the source code. But before building Scala code in IDEA,
you should also install JetBrains IntelliJ IDEA's [Scala Plugin](https://plugins.jetbrains.com/plugin/1347-scala).
+See [Install Plugins For IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) if you want to. + +### Install JetBrains IDEA Lombok Plugin + +Before running the following example, you should also install JetBrains IntelliJ IDEA's [Lombok plugin](https://plugins.jetbrains.com/plugin/6317-lombok). +See [install plugins for IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) if you want to. + +### Code Style + +Apache SeaTunnel uses `Spotless` for code style and format checks. You can run the following command and `Spotless` will automatically fix the code style and formatting errors for you: + +```shell +./mvnw spotless:apply +``` + +You could copy the `pre-commit hook` file `/tools/spotless_check/pre-commit.sh` to your `.git/hooks/` directory so that every time you commit your code with `git commit`, `Spotless` will automatically fix things for you. + +## Run Simple Example + +After all the above things are done, you just finish the environment setup and can run an example we provide to you out +of box. All examples are in module `seatunnel-examples`, you could pick one you are interested in, [Running Or Debugging +It In IDEA](https://www.jetbrains.com/help/idea/run-debug-configuration.html) as you wish. + +Here we use `seatunnel-examples/seatunnel-engine-examples/src/main/java/org/apache/seatunnel/example/engine/SeaTunnelEngineExample.java` +as an example, when you run it successfully you can see the output as below: + +```log +2024-08-10 11:45:32,839 INFO org.apache.seatunnel.core.starter.seatunnel.command.ClientExecuteCommand - +*********************************************** + Job Statistic Information +*********************************************** +Start Time : 2024-08-10 11:45:30 +End Time : 2024-08-10 11:45:32 +Total Time(s) : 2 +Total Read Count : 5 +Total Write Count : 5 +Total Failed Count : 0 +*********************************************** +``` + +## What's More + +All our examples use simple source and sink to make it less dependent and easy to run. You can change the example configuration +in `resources/examples`. You can change your configuration as below, if you want to use PostgreSQL as the source and +sink to console. +Please note that when using connectors other than FakeSource and Console, you need to modify the dependencies in the `pom.xml` file of the corresponding submodule of seatunnel-example. + +```conf +env { + parallelism = 1 + job.mode = "BATCH" +} +source { + Jdbc { + driver = org.postgresql.Driver + url = "jdbc:postgresql://host:port/database" + username = postgres + password = "123456" + query = "select * from test" + table_path = "database.test" + } +} + +sink { + Console {} +} +``` + diff --git a/versioned_docs/version-2.3.7/faq.md b/versioned_docs/version-2.3.7/faq.md new file mode 100644 index 000000000000..1eaf50255892 --- /dev/null +++ b/versioned_docs/version-2.3.7/faq.md @@ -0,0 +1,353 @@ +# FAQs + +## Why should I install a computing engine like Spark or Flink? + +SeaTunnel now uses computing engines such as Spark and Flink to complete resource scheduling and node communication, so we can focus on the ease of use of data synchronization and the development of high-performance components. But this is only temporary. + +## I have a question, and I cannot solve it by myself + +I have encountered a problem when using SeaTunnel and I cannot solve it by myself. What should I do? 
First, search in [Issue List](https://github.com/apache/seatunnel/issues) or [Mailing List](https://lists.apache.org/list.html?dev@seatunnel.apache.org) to see if someone has already asked the same question and got an answer. If you cannot find an answer to your question, you can contact community members for help in [These Ways](https://github.com/apache/seatunnel#contact-us). + +## How do I declare a variable? + +Do you want to know how to declare a variable in SeaTunnel's configuration, and then dynamically replace the value of the variable at runtime? + +Since `v1.2.4`, SeaTunnel supports variable substitution in the configuration. This feature is often used for timing or non-timing offline processing to replace variables such as time and date. The usage is as follows: + +Configure the variable name in the configuration. Here is an example of sql transform (actually, anywhere in the configuration file the value in `'key = value'` can use the variable substitution): + +``` +... +transform { + sql { + query = "select * from user_view where city ='"${city}"' and dt = '"${date}"'" + } +} +... +``` + +Taking Spark Local mode as an example, the startup command is as follows: + +```bash +./bin/start-seatunnel-spark.sh \ +-c ./config/your_app.conf \ +-e client \ +-m local[2] \ +-i city=shanghai \ +-i date=20190319 +``` + +You can use the parameter `-i` or `--variable` followed by `key=value` to specify the value of the variable, where the key needs to be same as the variable name in the configuration. + +## How do I write a configuration item in multi-line text in the configuration file? + +When a configured text is very long and you want to wrap it, you can use three double quotes to indicate its start and end: + +``` +var = """ + whatever you want +""" +``` + +## How do I implement variable substitution for multi-line text? + +It is a little troublesome to do variable substitution in multi-line text, because the variable cannot be included in three double quotation marks: + +``` +var = """ +your string 1 +"""${you_var}""" your string 2""" +``` + +Refer to: [lightbend/config#456](https://github.com/lightbend/config/issues/456). + +## Is SeaTunnel supported in Azkaban, Oozie, DolphinScheduler? + +Of course! See the screenshot below: + +![workflow.png](/image_en/workflow.png) + +![azkaban.png](/image_en/azkaban.png) + +## Does SeaTunnel have a case for configuring multiple sources, such as configuring elasticsearch and hdfs in source at the same time? + +``` +env { + ... +} + +source { + hdfs { ... } + elasticsearch { ... } + jdbc {...} +} + +transform { + ... +} + +sink { + elasticsearch { ... } +} +``` + +## Are there any HBase plugins? + +There is a HBase input plugin. You can download it from here: https://github.com/garyelephant/waterdrop-input-hbase . + +## How can I use SeaTunnel to write data to Hive? + +``` +env { + spark.sql.catalogImplementation = "hive" + spark.hadoop.hive.exec.dynamic.partition = "true" + spark.hadoop.hive.exec.dynamic.partition.mode = "nonstrict" +} + +source { + sql = "insert into ..." +} + +sink { + // The data has been written to hive through the sql source. This is just a placeholder, it does not actually work. + stdout { + limit = 1 + } +} +``` + +In addition, SeaTunnel has implemented a `Hive` output plugin after version `1.5.7` in `1.x` branch; in `2.x` branch. The Hive plugin for the Spark engine has been supported from version `2.0.5`: https://github.com/apache/seatunnel/issues/910. 
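For SeaTunnel 2.x with the connector-v2 `Hive` sink, writing to Hive is configured as a regular sink rather than through a SQL source. The snippet below is only a minimal sketch, assuming the `table_name` and `metastore_uri` options of the `Hive` sink connector; the metastore address and table name are placeholders, and the full option list is in the Hive sink connector documentation of this release.

```hocon
env {
  parallelism = 1
  job.mode = "BATCH"
}

source {
  # Any batch source works here; see the connector-v2 source documentation.
  FakeSource {
    result_table_name = "fake_table"
  }
}

sink {
  Hive {
    # Placeholders: point these at your own Hive metastore and target table.
    table_name = "test_db.test_table"
    metastore_uri = "thrift://your-metastore-host:9083"
  }
}
```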
+ +## How does SeaTunnel write multiple instances of ClickHouse to achieve load balancing? + +1. Write distributed tables directly (not recommended) + +2. Add a proxy or domain name (DNS) in front of multiple instances of ClickHouse: + + ``` + { + output { + clickhouse { + host = "ck-proxy.xx.xx:8123" + # Local table + table = "table_name" + } + } + } + ``` +3. Configure multiple instances in the configuration: + + ``` + { + output { + clickhouse { + host = "ck1:8123,ck2:8123,ck3:8123" + # Local table + table = "table_name" + } + } + } + ``` +4. Use cluster mode: + + ``` + { + output { + clickhouse { + # Configure only one host + host = "ck1:8123" + cluster = "clickhouse_cluster_name" + # Local table + table = "table_name" + } + } + } + ``` + +## How can I solve OOM when SeaTunnel consumes Kafka? + +In most cases, OOM is caused by not having a rate limit for consumption. The solution is as follows: + +For the current limit of Spark consumption of Kafka: + +1. Suppose the number of partitions of Kafka `Topic 1` you consume with KafkaStream = N. + +2. Assuming that the production speed of the message producer (Producer) of `Topic 1` is K messages/second, the speed of write messages to the partition must be uniform. + +3. Suppose that, after testing, it is found that the processing capacity of Spark Executor per core per second is M. + +The following conclusions can be drawn: + +1. If you want to make Spark's consumption of `Topic 1` keep up with its production speed, then you need `spark.executor.cores` * `spark.executor.instances` >= K / M + +2. When a data delay occurs, if you want the consumption speed not to be too fast, resulting in spark executor OOM, then you need to configure `spark.streaming.kafka.maxRatePerPartition` <= (`spark.executor.cores` * `spark.executor.instances`) * M / N + +3. In general, both M and N are determined, and the conclusion can be drawn from 2: The size of `spark.streaming.kafka.maxRatePerPartition` is positively correlated with the size of `spark.executor.cores` * `spark.executor.instances`, and it can be increased while increasing the resource `maxRatePerPartition` to speed up consumption. + +![Kafka](/image_en/kafka.png) + +## How can I solve the Error `Exception in thread "main" java.lang.NoSuchFieldError: INSTANCE`? + +The reason is that the version of httpclient.jar that comes with the CDH version of Spark is lower, and The httpclient version that ClickHouse JDBC is based on is 4.5.2, and the package versions conflict. The solution is to replace the jar package that comes with CDH with the httpclient-4.5.2 version. + +## The default JDK of my Spark cluster is JDK7. After I install JDK8, how can I specify that SeaTunnel starts with JDK8? + +In SeaTunnel's config file, specify the following configuration: + +```shell +spark { + ... + spark.executorEnv.JAVA_HOME="/your/java_8_home/directory" + spark.yarn.appMasterEnv.JAVA_HOME="/your/java_8_home/directory" + ... +} +``` + +## How do I specify a different JDK version for SeaTunnel on YARN? + +For example, if you want to set the JDK version to JDK8, there are two cases: + +- The YARN cluster has deployed JDK8, but the default JDK is not JDK8. Add two configurations to the SeaTunnel config file: + + ``` + env { + ... + spark.executorEnv.JAVA_HOME="/your/java_8_home/directory" + spark.yarn.appMasterEnv.JAVA_HOME="/your/java_8_home/directory" + ... + } + ``` +- YARN cluster does not deploy JDK8. At this time, start SeaTunnel attached with JDK8. 
For detailed operations, see: + https://www.cnblogs.com/jasondan/p/spark-specific-jdk-version.html + +## What should I do if OOM always appears when running SeaTunnel in Spark local[*] mode? + +If you run in local mode, you need to modify the `start-seatunnel.sh` startup script. After `spark-submit`, add a parameter `--driver-memory 4g` . Under normal circumstances, local mode is not used in the production environment. Therefore, this parameter generally does not need to be set during On YARN. See: [Application Properties](https://spark.apache.org/docs/latest/configuration.html#application-properties) for details. + +## Where can I place self-written plugins or third-party jdbc.jars to be loaded by SeaTunnel? + +Place the Jar package under the specified structure of the plugins directory: + +```bash +cd SeaTunnel +mkdir -p plugins/my_plugins/lib +cp third-part.jar plugins/my_plugins/lib +``` + +`my_plugins` can be any string. + +## How do I configure logging-related parameters in SeaTunnel-V1(Spark)? + +There are three ways to configure logging-related parameters (such as Log Level): + +- [Not recommended] Change the default `$SPARK_HOME/conf/log4j.properties`. + - This will affect all programs submitted via `$SPARK_HOME/bin/spark-submit`. +- [Not recommended] Modify logging related parameters directly in the Spark code of SeaTunnel. + - This is equivalent to hardcoding, and each change needs to be recompiled. +- [Recommended] Use the following methods to change the logging configuration in the SeaTunnel configuration file (The change only takes effect if SeaTunnel >= 1.5.5 ): + + ``` + env { + spark.driver.extraJavaOptions = "-Dlog4j.configuration=file:/log4j.properties" + spark.executor.extraJavaOptions = "-Dlog4j.configuration=file:/log4j.properties" + } + source { + ... + } + transform { + ... + } + sink { + ... + } + ``` + +The contents of the log4j configuration file for reference are as follows: + +``` +$ cat log4j.properties +log4j.rootLogger=ERROR, console + +# set the log level for these components +log4j.logger.org=ERROR +log4j.logger.org.apache.spark=ERROR +log4j.logger.org.spark-project=ERROR +log4j.logger.org.apache.hadoop=ERROR +log4j.logger.io.netty=ERROR +log4j.logger.org.apache.zookeeper=ERROR + +# add a ConsoleAppender to the logger stdout to write to the console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +# use a simple message format +log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n +``` + +## How do I configure logging related parameters in SeaTunnel-V2(Spark, Flink)? + +Currently, they cannot be set directly. you need to modify the SeaTunnel startup script. The relevant parameters are specified in the task submission command. For specific parameters, please refer to the official documents: + +- Spark official documentation: http://spark.apache.org/docs/latest/configuration.html#configuring-logging +- Flink official documentation: https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/logging.html + +Reference: + +https://stackoverflow.com/questions/27781187/how-to-stop-info-messages-displaying-on-spark-console + +http://spark.apache.org/docs/latest/configuration.html#configuring-logging + +https://medium.com/@iacomini.riccardo/spark-logging-configuration-in-yarn-faf5ba5fdb01 + +## How do I configure logging related parameters of SeaTunnel-E2E Test? 

The log4j configuration file of `seatunnel-e2e` is located at `seatunnel-e2e/seatunnel-e2e-common/src/test/resources/log4j2.properties`. You can modify logging-related parameters directly in that configuration file.

For example, if you want more detailed logs from the E2E tests, just lower `rootLogger.level` in the configuration file.

## Error when writing to ClickHouse: ClassCastException

SeaTunnel does not convert data types automatically; each field keeps the schema produced when the Input reads the data. When writing to ClickHouse, the field types must strictly match the target table, and any mismatch needs to be resolved first.

Data conversion can be achieved through the following two plugins:

1. Filter Convert plugin
2. Filter Sql plugin

Detailed data type conversion reference: [ClickHouse Data Type Check List](https://interestinglab.github.io/seatunnel-docs/#/en/configuration/output-plugins/Clickhouse?id=clickhouse-data-type-check-list)

Refer to issues [#488](https://github.com/apache/seatunnel/issues/488) and [#382](https://github.com/apache/seatunnel/issues/382).

## How does SeaTunnel access Kerberos-authenticated HDFS, YARN, Hive and other resources?

Please refer to: [#590](https://github.com/apache/seatunnel/issues/590).

## How do I troubleshoot NoClassDefFoundError, ClassNotFoundException and other issues?

Most likely, multiple versions of the same class are present on the Java classpath and the wrong one is loaded first; the jar is usually not actually missing. Modify the SeaTunnel startup command by adding the following parameters to the spark-submit section, then debug in detail through the output log.

```shell
spark-submit --verbose
    ...
   --conf 'spark.driver.extraJavaOptions=-verbose:class'
   --conf 'spark.executor.extraJavaOptions=-verbose:class'
    ...
```

## How do I use SeaTunnel to synchronize data across HDFS clusters?

Just configure hdfs-site.xml properly. Refer to: https://www.cnblogs.com/suanec/p/7828139.html.

## I want to learn the source code of SeaTunnel. Where should I start?

SeaTunnel has a completely abstract and structured code implementation, and many people have chosen SeaTunnel as a way to learn Spark. You can start learning the source code from the main program entry: `SeaTunnel.java`.

## When SeaTunnel developers develop their own plugins, do they need to understand the SeaTunnel code? Should these plugins be integrated into the SeaTunnel project?

The plugins you develop are independent of the SeaTunnel project, and the SeaTunnel project does not need to include your plugin code.

A plugin can be completely independent of the SeaTunnel project, so you can write it using Java, Scala, Maven, sbt, Gradle, or whatever you want. This is also the way we recommend developers develop their plugins.

## When I import the project, the compiler reports the exception "class not found `org.apache.seatunnel.shade.com.typesafe.config.Config`"

Run `mvn install` first. In the `seatunnel-config/seatunnel-config-base` subproject, the package `com.typesafe.config` has been relocated to `org.apache.seatunnel.shade.com.typesafe.config` and is installed to the local Maven repository by the `seatunnel-config/seatunnel-config-shade` subproject.
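As a concrete example, running the same install command used in the development environment setup from the project root will put the relocated classes into your local Maven repository:

```shell
# Installs all subprojects (including the shaded seatunnel-config classes)
# into the local Maven repository; tests are skipped to speed this up.
./mvnw install -Dmaven.test.skip
```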
diff --git a/versioned_docs/version-2.3.7/other-engine/flink.md b/versioned_docs/version-2.3.7/other-engine/flink.md new file mode 100644 index 000000000000..8a77fbfc2415 --- /dev/null +++ b/versioned_docs/version-2.3.7/other-engine/flink.md @@ -0,0 +1,84 @@ +# Seatunnel Runs On Flink + +Flink is a powerful high-performance distributed stream processing engine. More information about it you can search for `Apache Flink` + +### Set Flink Configuration Information In The Job + +Begin with `flink.` + +Example: +I set a precise Checkpoint for this job + +``` +env { + parallelism = 1 + flink.execution.checkpointing.unaligned.enabled=true +} +``` + +Enumeration types are not currently supported, you need to specify them in the Flink conf file ,Only these types of Settings are supported for the time being:
    +Integer/Boolean/String/Duration + +### How To Set Up A Simple Flink Job + +This is a simple job that runs on Flink. Randomly generated data is printed to the console + +``` +env { + # common parameter + parallelism = 1 + checkpoint.interval = 5000 + + # flink special parameter + flink.execution.checkpointing.mode = "EXACTLY_ONCE" + flink.execution.checkpointing.timeout = 600000 +} + +source { + FakeSource { + row.num = 16 + result_table_name = "fake_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink{ + Console{} +} +``` + +### How To Run A Job In A Project + +After you pull the code to the local, go to the `seatunnel-examples/seatunnel-flink-connector-v2-example` module and find `org.apache.seatunnel.example.flink.v2.SeaTunnelApiExample` to complete the operation of the job. diff --git a/versioned_docs/version-2.3.7/other-engine/spark.md b/versioned_docs/version-2.3.7/other-engine/spark.md new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/about.md b/versioned_docs/version-2.3.7/seatunnel-engine/about.md new file mode 100644 index 000000000000..da78035c8b43 --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/about.md @@ -0,0 +1,44 @@ +--- + +sidebar_position: 1 +------------------- + +# SeaTunnel Engine + +SeaTunnel Engine is a community-developed data synchronization engine designed for data synchronization scenarios debuts. As the default engine of SeaTunnel, it supports high-throughput, low-latency, and strong-consistent synchronous job operation, which is faster, more stable, more resource-saving, and easy to use. + +The overall design of the SeaTunnel Engine follows the path below: + +- Faster, SeaTunnel Engine’s execution plan optimizer aims to reduce data network transmission, thereby reducing the loss of overall synchronization performance caused by data serialization and de-serialization, allowing users to complete data synchronization operations faster. At the same time, a speed limit is supported to synchronize data at a reasonable speed. +- More stable, SeaTunnel Engine uses Pipeline as the minimum granularity of checkpoint and fault tolerance for data synchronization tasks. The failure of a task will only affect its upstream and downstream tasks, which avoids task failures that cause the entire job to fail or rollback. At the same time, SeaTunnel Engine also supports data cache for scenarios where the source data has a storage time limit. When the cache is enabled, the data read from the source will be automatically cached, then read by the downstream task and written to the target. Under this condition, even if the data cannot be written due to the failure of the target, it will not affect the regular reading of the source, preventing the data from the source is deleted when expired. 
+- Space-saving, SeaTunnel Engine uses Dynamic Thread Sharing technology internally. In the real-time synchronization scenario, for the tables with a large amount but small data sizes per table, SeaTunnel Engine will run these synchronization tasks in shared threads to reduce unnecessary thread creation and save system space. On the reading and data writing side, the design goal of SeaTunnel Engine is to minimize the amount of JDBC connections; in CDC scenarios, SeaTunnel Engine will reuse log reading and parsing resources. +- Simple and easy to use, SeaTunnel Engine reduces the dependence on third-party services and can implement cluster management, snapshot storage, and cluster HA functions independently of big data components such as Zookeeper and HDFS. This is very useful for users who currently lack a big data platform, or are unwilling to rely on a big data platform for data synchronization. + +In the future, SeaTunnel Engine will further optimize its functions to support full synchronization and incremental synchronization of offline batch synchronization, real-time synchronization, and CDC. + +### Cluster Management + +- Support standalone operation; +- Support cluster operation; +- Support autonomous cluster (decentralized), which saves the users from specifying a master node for the SeaTunnel Engine cluster, because it can select a master node by itself during operation, and a new master node will be chosen automatically when the master node fails. +- Autonomous Cluster nodes-discovery and nodes with the same cluster_name will automatically form a cluster. + +### Core functions + +- Support running jobs in local mode, and the cluster is automatically destroyed after the job once completed; +- Support running jobs in cluster mode (single machine or cluster), submitting jobs to the SeaTunnel Engine service through the SeaTunnel client, and the service continues to run after the job is completed and waits for the next job submission; +- Support offline batch synchronization; +- Support real-time synchronization; +- Batch-stream integration, all SeaTunnel V2 connectors can run in SeaTunnel Engine; +- Support distributed snapshot algorithm, and supports two-stage submission with SeaTunnel V2 connector, ensuring that data is executed only once. +- Support job invocation at the pipeline level to ensure that it can be started even when resources are limited; +- Support fault tolerance for jobs at the Pipeline level. Task failure only affects the pipeline where it is located, and only the task under the Pipeline needs to be rolled back; +- Support dynamic thread sharing to synchronize a large number of small data sets in real-time. + +### Quick Start + +https://seatunnel.apache.org/docs/start-v2/locally/quick-start-seatunnel-engine + +### Download & Install + +[Download & Install](download-seatunnel.md) diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md b/versioned_docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md new file mode 100644 index 000000000000..52af8c4af27a --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/checkpoint-storage.md @@ -0,0 +1,247 @@ +--- + +sidebar_position: 7 +------------------- + +# Checkpoint Storage + +## Introduction + +Checkpoint is a fault-tolerant recovery mechanism. This mechanism ensures that when the program is running, it can recover itself even if it suddenly encounters an exception. + +### Checkpoint Storage + +Checkpoint Storage is a storage mechanism for storing checkpoint data. 
+ +SeaTunnel Engine supports the following checkpoint storage types: + +- HDFS (OSS,S3,HDFS,LocalFile) +- LocalFile (native), (it's deprecated: use Hdfs(LocalFile) instead. + +We use the microkernel design pattern to separate the checkpoint storage module from the engine. This allows users to implement their own checkpoint storage modules. + +`checkpoint-storage-api` is the checkpoint storage module API, which defines the interface of the checkpoint storage module. + +If you want to implement your own checkpoint storage module, you need to implement the `CheckpointStorage` and provide the corresponding `CheckpointStorageFactory` implementation. + +### Checkpoint Storage Configuration + +The configuration of the `seatunnel-server` module is in the `seatunnel.yaml` file. + +```yaml + +seatunnel: + engine: + checkpoint: + storage: + type: hdfs #plugin name of checkpoint storage, we support hdfs(S3, local, hdfs), localfile (native local file) is the default, but this plugin is de + # plugin configuration + plugin-config: + namespace: #checkpoint storage parent path, the default value is /seatunnel/checkpoint/ + K1: V1 # plugin other configuration + K2: V2 # plugin other configuration +``` + +Notice: namespace must end with "/". + +#### OSS + +Aliyun OSS based hdfs-file you can refer [Hadoop OSS Docs](https://hadoop.apache.org/docs/stable/hadoop-aliyun/tools/hadoop-aliyun/index.html) to config oss. + +Except when interacting with oss buckets, the oss client needs the credentials needed to interact with buckets. +The client supports multiple authentication mechanisms and can be configured as to which mechanisms to use, and their order of use. Custom implementations of org.apache.hadoop.fs.aliyun.oss.AliyunCredentialsProvider may also be used. +If you used AliyunCredentialsProvider (can be obtained from the Aliyun Access Key Management), these consist of an access key, a secret key. +You can config like this: + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: oss + oss.bucket: your-bucket + fs.oss.accessKeyId: your-access-key + fs.oss.accessKeySecret: your-secret-key + fs.oss.endpoint: endpoint address + fs.oss.credentials.provider: org.apache.hadoop.fs.aliyun.oss.AliyunCredentialsProvider +``` + +For additional reading on the Hadoop Credential Provider API, you can see: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). + +For Aliyun OSS Credential Provider implements, you can see: [Auth Credential Providers](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth) + +#### S3 + +S3 based hdfs-file you can refer [hadoop s3 docs](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) to config s3. + +Except when interacting with public S3 buckets, the S3A client needs the credentials needed to interact with buckets. +The client supports multiple authentication mechanisms and can be configured as to which mechanisms to use, and their order of use. Custom implementations of com.amazonaws.auth.AWSCredentialsProvider may also be used. +If you used SimpleAWSCredentialsProvider (can be obtained from the Amazon Security Token Service), these consist of an access key, a secret key. 
+You can config like this: + +```yaml + +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: s3 + s3.bucket: your-bucket + fs.s3a.access.key: your-access-key + fs.s3a.secret.key: your-secret-key + fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + + +``` + +If you used `InstanceProfileCredentialsProvider`, which supports use of instance profile credentials if running in an EC2 VM, you can check [iam-roles-for-amazon-ec2](https://docs.aws.amazon.com/zh_cn/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html). +You can config like this: + +```yaml + +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: s3 + s3.bucket: your-bucket + fs.s3a.endpoint: your-endpoint + fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.InstanceProfileCredentialsProvider +``` + +If you want to use Minio that supports the S3 protocol as checkpoint storage, you should configure it this way: + +```yaml + +seatunnel: + engine: + checkpoint: + interval: 10000 + timeout: 60000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: s3 + fs.s3a.access.key: xxxxxxxxx # Access Key of MinIO + fs.s3a.secret.key: xxxxxxxxxxxxxxxxxxxxx # Secret Key of MinIO + fs.s3a.endpoint: http://127.0.0.1:9000 # Minio HTTP service access address + s3.bucket: s3a://test # test is the bucket name which storage the checkpoint file + fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + # important: The user of this key needs to have write permission for the bucket, otherwise an exception of 403 will be returned +``` + +For additional reading on the Hadoop Credential Provider API, you can see: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). + +#### HDFS + +if you use HDFS, you can config like this: + +```yaml +seatunnel: + engine: + checkpoint: + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: hdfs://localhost:9000 + // if you used kerberos, you can config like this: + kerberosPrincipal: your-kerberos-principal + kerberosKeytabFilePath: your-kerberos-keytab + // if you need hdfs-site config, you can config like this: + hdfs_site_path: /path/to/your/hdfs_site_path +``` + +if HDFS is in HA mode , you can config like this: + +```yaml +seatunnel: + engine: + checkpoint: + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: hdfs://usdp-bing + seatunnel.hadoop.dfs.nameservices: usdp-bing + seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2 + seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020 + seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020 + seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider + +``` + +if HDFS has some other configs in `hdfs-site.xml` or `core-site.xml` , just set HDFS config by using `seatunnel.hadoop.` prefix. + +#### LocalFile + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: file:/// # Ensure that the directory has written permission + +``` + +### Enable cache + +When storage:type is hdfs, cache is disabled by default. 
If you want to enable it, set `disable.cache: false` + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + disable.cache: false + fs.defaultFS: hdfs:/// + +``` + +or + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + disable.cache: false + fs.defaultFS: file:/// +``` + diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/deployment.md b/versioned_docs/version-2.3.7/seatunnel-engine/deployment.md new file mode 100644 index 000000000000..a708091e32e4 --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/deployment.md @@ -0,0 +1,24 @@ +--- + +sidebar_position: 3 +------------------- + +# SeaTunnel Engine(Zeta) Deployment + +SeaTunnel Engine(Zeta) supports three different deployment modes: local mode, hybrid cluster mode, and separated cluster mode. + +Each deployment mode has different usage scenarios, advantages, and disadvantages. You should choose a deployment mode according to your needs and environment. + +**Local mode:** Only used for testing, each task will start an independent process, and the process will exit after the task is completed. + +**Hybrid cluster mode:** The Master service and Worker service of SeaTunnel Engine are mixed in the same process. All nodes can run jobs and participate in the election to become the master, that is, the master node is also running synchronous tasks simultaneously. In this mode, Imap (saving the state information of the task to provide support for the fault tolerance of the task) data will be distributed among all nodes. + +**Separated cluster mode(experimental feature):** The Master service and Worker service of SeaTunnel Engine are separated, and each service is a single process. The Master node is only responsible for job scheduling, rest api, task submission, etc., and Imap data is only stored in the Master node. The Worker node is only responsible for the execution of the task, does not participate in the election to become the master, and does not store Imap data. + +**Usage suggestion:** Although [Separated Cluster Mode](separated-cluster-deployment.md) is an experimental feature, the first recommended usage will be made in the future. In the hybrid cluster mode, the Master node needs to run tasks synchronously. When the task scale is large, it will affect the stability of the Master node. Once the Master node crashes or the heartbeat times out, it will lead to the switch of the Master node, and the switch of the Master node will cause fault tolerance of all running tasks, which will further increase the load of the cluster. Therefore, we recommend using the separated mode more. 
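The deployment pages linked below give the authoritative steps for each mode. As a rough orientation only, the modes differ mainly in whether a long-running cluster service is started before jobs are submitted; the commands below are taken from the local-mode and cluster deployment pages, and the cluster-mode submission shown here assumes the default cluster deploy mode.

```shell
# Local mode: no cluster needed, the engine runs inside the submitting process
$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local

# Hybrid / separated cluster mode: start the cluster service on each node first (daemonized),
# then submit jobs to it from a client node
$SEATUNNEL_HOME/bin/seatunnel-cluster.sh -d
$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template
```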
+ +[Local Mode Deployment](local-mode-deployment.md) + +[Hybrid Cluster Mode Deployment](hybrid-cluster-deployment.md) + +[Separated Cluster Mode Deployment](separated-cluster-deployment.md) diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/download-seatunnel.md b/versioned_docs/version-2.3.7/seatunnel-engine/download-seatunnel.md new file mode 100644 index 000000000000..e1ddd88b681e --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/download-seatunnel.md @@ -0,0 +1,70 @@ +--- + +sidebar_position: 2 +------------------- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Download And Make Installation Packages + +## Step 1: Preparation + +Before starting to download SeaTunnel, you need to ensure that you have installed the following software required by SeaTunnel: + +* Install [Java](https://www.java.com/en/download/) (Java 8 or 11, and other versions higher than Java 8 can theoretically work) and set `JAVA_HOME`. + +## Step 2: Download SeaTunnel + +Go to the [Seatunnel Download Page](https://seatunnel.apache.org/download) to download the latest version of the release version installation package `seatunnel--bin.tar.gz`. + +Or you can also download it through the terminal. + +```shell +export version="2.3.7" +wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" +tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" +``` + +## Step 3: Download The Connector Plugin + +Starting from the 2.2.0-beta version, the binary package no longer provides the connector dependency by default. Therefore, when using it for the first time, you need to execute the following command to install the connector: (Of course, you can also manually download the connector from the [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/), and then move it to the `connectors/seatunnel` directory). + +```bash +sh bin/install-plugin.sh +``` + +If you need a specific connector version, taking 2.3.7 as an example, you need to execute the following command. + +```bash +sh bin/install-plugin.sh 2.3.7 +``` + +Usually you don't need all the connector plugins, so you can specify the plugins you need through configuring `config/plugin_config`, for example, if you only need the `connector-console` plugin, then you can modify the plugin.properties configuration file as follows. + +```plugin_config +--seatunnel-connectors-- +connector-console +--end-- +``` + +If you want the example application to work properly, you need to add the following plugins. + +```plugin_config +--seatunnel-connectors-- +connector-fake +connector-console +--end-- +``` + +You can find all supported connectors and the corresponding plugin_config configuration names under `${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`. + +:::tip Tip + +If you want to install connector plugins by manually downloading connectors, you only need to download the connector plugins you need and place them in the `${SEATUNNEL_HOME}/connectors/` directory + +::: + +Now you have completed the download of the SeaTunnel installation package and the download of the connector plugin. Next, you can choose different running modes according to your needs to run or deploy SeaTunnel. + +If you use the SeaTunnel Engine (Zeta) that comes with SeaTunnel to run tasks, you need to deploy the SeaTunnel Engine service first. Refer to [Deployment Of SeaTunnel Engine (Zeta) Service](deployment.md). 
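Putting Steps 2 and 3 together, a typical first-time installation looks roughly like this; the name of the extracted directory is an assumption based on the `apache-seatunnel-<version>` naming of the release tarball.

```shell
export version="2.3.7"

# Step 2: download and unpack the release
wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz"
tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
cd "apache-seatunnel-${version}"

# Step 3: trim config/plugin_config to the connectors you need, then install them
sh bin/install-plugin.sh "${version}"
```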
diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md b/versioned_docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md new file mode 100644 index 000000000000..75a583c0048d --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/engine-jar-storage-mode.md @@ -0,0 +1,95 @@ +--- + +sidebar_position: 9 +------------------- + +# Config Engine Jar Storage Mode + +:::caution warn + +Please note that this feature is currently in an experimental stage, and there are many areas that still need improvement. Therefore, we recommend exercising caution when using this feature to avoid potential issues and unnecessary risks. +We are committed to ongoing efforts to enhance and stabilize this functionality, ensuring a better experience for you. + +::: + +We can enable the optimization job submission process, which is configured in the `seatunel.yaml`. After enabling the optimization of the Seatunnel job submission process configuration item, +users can use the Seatunnel engine(Zeta) as the execution engine without placing the connector jar packages required for task execution or the third-party jar packages that the connector relies on in each engine `connector` directory. +Users only need to place all the jar packages for task execution on the client that submits the job, and the client will automatically upload the jars required for task execution to the Zeta engine. It is necessary to enable this configuration item when submitting jobs in Docker or k8s mode, +which can fundamentally solve the problem of large container /image_en caused by the heavy weight of the Seatunnel Zeta engine. In the image, only the core framework package of the Zeta engine needs to be provided, +and then the jar package of the connector and the third-party jar package that the connector relies on can be separately uploaded to the pod for distribution. + +After enabling the optimization job submission process configuration item, you do not need to place the following two types of jar packages in the Zeta engine: +- COMMON_PLUGIN_JARS +- CONNECTOR_PLUGIN_JARS + +COMMON_ PLUGIN_ JARS refers to the third-party jar package that the connector relies on, CONNECTOR_ PLUGIN_ JARS refers to the connector jar package. +When common jars do not exist in Zeta's `lib`, it can upload the local common jars of the client to the `lib` directory of all engine nodes. +This way, even if the user does not place a jar on all nodes in Zeta's `lib`, the task can still be executed normally. +However, we do not recommend relying on the configuration item of opening the optimization job submission process to upload the third-party jar package that the connector relies on. +If you use Zeta Engine, please add the third-party jar package files that the connector relies on to `$SEATUNNEL_HOME/lib/` directory on each node, such as jdbc drivers. + +# ConnectorJar Storage Strategy + +You can configure the storage strategy of the current connector jar package and the third-party jar package that the connector depends on through the configuration file. +There are two storage strategies that can be configured, namely shared jar package storage strategy and isolated jar package storage strategy. +Two different storage strategies provide a more flexible storage mode for jar files. You can configure the storage strategy to share the same jar package file with multiple execution jobs in the engine. 
+ +## Related Configuration + +| Parameter | Default Value | Describe | +|-------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------| +| connector-jar-storage-enable | false | Whether to enable uploading the connector jar package to the engine. The default enabled state is false. | +| connector-jar-storage-mode | SHARED | Engine-side jar package storage mode selection. There are two optional modes, SHARED and ISOLATED. The default Jar package storage mode is SHARED. | +| connector-jar-storage-path | " " | User-defined jar package storage path. | +| connector-jar-cleanup-task-interval | 3600s | Engine-side jar package cleaning scheduled task execution interval. | +| connector-jar-expiry-time | 600s | Engine-side jar package storage expiration time. | + +## IsolatedConnectorJarStorageStrategy + +Before the job is submitted, the connector Jjr package will be uploaded to an independent file storage path on the Master node. +The connector jar packages of different jobs are in different storage paths, so the connector jar packages of different jobs are isolated from each other. +The jar package files required for the execution of a job have no influence on other jobs. When the current job execution ends, the jar package file in the storage path generated based on the JobId will be deleted. + +Example: + +```yaml +jar-storage: + connector-jar-storage-enable: true + connector-jar-storage-mode: ISOLATED + connector-jar-storage-path: "" + connector-jar-cleanup-task-interval: 3600 + connector-jar-expiry-time: 600 +``` + +Detailed explanation of configuration parameters: +- connector-jar-storage-enable: Enable uploading the connector jar package before executing the job. +- connector-jar-storage-mode: Connector jar package storage mode, two storage modes are available: shared mode (SHARED) and isolation mode (ISOLATED). +- connector-jar-storage-path: The local storage path of the user-defined connector jar package on the Zeta engine. +- connector-jar-cleanup-task-interval: Zeta engine connector jar package scheduled cleanup task interval, the default is 3600 seconds. +- connector-jar-expiry-time: The expiration time of the connector jar package. The default is 600 seconds. + +## SharedConnectorJarStorageStrategy + +Before the job is submitted, the connector jar package will be uploaded to the Master node. Different jobs can share connector jars on the Master node if they use the same Jar package file. +All jar package files are persisted to a shared file storage path, and jar packages that reference the Master node can be shared between different jobs. After the task execution is completed, +the SharedConnectorJarStorageStrategy will not immediately delete all jar packages related to the current task execution,but instead has an independent thread responsible for cleaning up the work. +The configuration in the following configuration file sets the running time of the cleaning work and the survival time of the jar package. + +Example: + +```yaml +jar-storage: + connector-jar-storage-enable: true + connector-jar-storage-mode: SHARED + connector-jar-storage-path: "" + connector-jar-cleanup-task-interval: 3600 + connector-jar-expiry-time: 600 +``` + +Detailed explanation of configuration parameters: +- connector-jar-storage-enable: Enable uploading the connector jar package before executing the job. 
+- connector-jar-storage-mode: Connector jar package storage mode, two storage modes are available: shared mode (SHARED) and isolation mode (ISOLATED). +- connector-jar-storage-path: The local storage path of the user-defined connector jar package on the Zeta engine. +- connector-jar-cleanup-task-interval: Zeta engine connector Jjr package scheduled cleanup task interval, the default is 3600 seconds. +- connector-jar-expiry-time: The expiration time of the connector jar package. The default is 600 seconds. + diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md b/versioned_docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md new file mode 100644 index 000000000000..534d5e69c5e2 --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/hybrid-cluster-deployment.md @@ -0,0 +1,315 @@ +--- + +sidebar_position: 5 +------------------- + +# Deploy SeaTunnel Engine Hybrid Mode Cluster + +The Master service and Worker service of SeaTunnel Engine are mixed in the same process, and all nodes can run jobs and participate in the election to become master. The master node is also running synchronous tasks simultaneously. In this mode, the Imap (which saves the status information of the task to provide support for the task's fault tolerance) data will be distributed across all nodes. + +Usage Recommendation: It is recommended to use the [Separated Cluster Mode](separated-cluster-deployment.md). In the hybrid cluster mode, the Master node needs to run tasks synchronously. When the task scale is large, it will affect the stability of the Master node. Once the Master node crashes or the heartbeat times out, it will cause the Master node to switch, and the Master node switch will cause all running tasks to perform fault tolerance, further increasing the load on the cluster. Therefore, we recommend using the [Separated Cluster Mode](separated-cluster-deployment.md). + +## 1. Download + +[Download And Create The SeaTunnel Installation Package](download-seatunnel.md) + +## 2. Configure SEATUNNEL_HOME + +You can configure `SEATUNNEL_HOME` by adding the `/etc/profile.d/seatunnel.sh` file. The content of `/etc/profile.d/seatunnel.sh` is as follows: + +``` +export SEATUNNEL_HOME=${seatunnel install path} +export PATH=$PATH:$SEATUNNEL_HOME/bin +``` + +## 3. Configure The JVM Options For The SeaTunnel Engine + +The SeaTunnel Engine supports two methods for setting JVM options: + +1. Add the JVM options to `$SEATUNNEL_HOME/config/jvm_options`. + + Modify the JVM parameters in the `$SEATUNNEL_HOME/config/jvm_options` file. + +2. Add JVM options when starting the SeaTunnel Engine. For example, `seatunnel-cluster.sh -DJvmOption="-Xms2G -Xmx2G"` + +## 4. Configure The SeaTunnel Engine + +The SeaTunnel Engine provides many functions that need to be configured in the `seatunnel.yaml` file. + +### 4.1 Backup Count Setting For Data In Imap + +The SeaTunnel Engine implements cluster management based on [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/). The cluster's status data (job running status, resource status) is stored in the [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map). +The data stored in the Hazelcast IMap is distributed and stored on all nodes in the cluster. Hazelcast partitions the data stored in the Imap. Each partition can specify the number of backups. +Therefore, the SeaTunnel Engine can implement cluster HA without using other services (such as Zookeeper). 
+ +`backup count` is a parameter that defines the number of synchronous backups. For example, if it is set to 1, the backup of the partition will be placed on one other member. If it is set to 2, it will be placed on two other members. + +We recommend that the value of `backup count` be `min(1, max(5, N/2))`. `N` is the number of cluster nodes. + +```yaml +seatunnel: + engine: + backup-count: 1 + # Other configurations +``` + +### 4.2 Slot Configuration + +The number of slots determines the number of task groups that the cluster node can run in parallel. The formula for the number of slots required for a task is N = 2 + P (the parallelism configured by the task). By default, the number of slots in the SeaTunnel Engine is dynamic, that is, there is no limit on the number. We recommend that the number of slots be set to twice the number of CPU cores on the node. + +Configuration of dynamic slot number (default): + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: true + # Other configurations +``` + +Configuration of static slot number: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: false + slot-num: 20 +``` + +### 4.3 Checkpoint Manager + +Like Flink, the SeaTunnel Engine supports the Chandy–Lamport algorithm. Therefore, it is possible to achieve data synchronization without data loss and duplication. + +**interval** + +The interval between two checkpoints, in milliseconds. If the `checkpoint.interval` parameter is configured in the job configuration file's `env`, the one set in the job configuration file will be used. + +**timeout** + +The timeout for checkpoints. If the checkpoint cannot be completed within the timeout, a checkpoint failure will be triggered and the job will fail. If the `checkpoint.timeout` parameter is configured in the job configuration file's `env`, the one set in the job configuration file will be used. + +Example + +```yaml +seatunnel: + engine: + backup-count: 1 + print-execution-info-interval: 10 + slot-service: + dynamic-slot: true + checkpoint: + interval: 300000 + timeout: 10000 +``` + +**checkpoint storage** + +Checkpoints are a fault-tolerant recovery mechanism. This mechanism ensures that the program can recover on its own even if an exception occurs suddenly during operation. Checkpoints are triggered at regular intervals. Each time a checkpoint is performed, each task is required to report its own status information (such as which offset was read when reading from Kafka) to the checkpoint thread, which writes it to a distributed storage (or shared storage). When a task fails and is automatically fault-tolerant and restored, or when a previously suspended task is restored using the seatunnel.sh -r command, the status information of the corresponding job will be loaded from the checkpoint storage and the job will be restored based on this status information. + +If the cluster has more than one node, the checkpoint storage must be a distributed storage or shared storage so that the task status information in the storage can be loaded on another node in case of a node failure. + +For information about checkpoint storage, you can refer to [Checkpoint Storage](checkpoint-storage.md) + +### 4.4 Expiration Configuration For Historical Jobs + +The information of each completed job, such as status, counters, and error logs, is stored in the IMap object. As the number of running jobs increases, the memory usage will increase, and eventually, the memory will overflow. 
Therefore, you can adjust the `history-job-expire-minutes` parameter to address this issue. The time unit for this parameter is minutes. The default value is 1440 minutes, which is one day. + +Example + +```yaml +seatunnel: + engine: + history-job-expire-minutes: 1440 +``` + +### 4.5 Class Loader Cache Mode + +This configuration primarily addresses the issue of resource leakage caused by constantly creating and attempting to destroy the class loader. +If you encounter exceptions related to metaspace overflow, you can try enabling this configuration. +To reduce the frequency of class loader creation, after enabling this configuration, SeaTunnel will not attempt to release the corresponding class loader when a job is completed, allowing it to be used by subsequent jobs. This is more effective when the number of Source/Sink connectors used in the running job is not excessive. +The default value is false. +Example + +```yaml +seatunnel: + engine: + classloader-cache-mode: true +``` + +## 5. Configure The SeaTunnel Engine Network Service + +All SeaTunnel Engine network-related configurations are in the `hazelcast.yaml` file. + +### 5.1 Cluster Name + +The SeaTunnel Engine node uses the `cluster-name` to determine if another node is in the same cluster as itself. If the cluster names of the two nodes are different, the SeaTunnel Engine will reject the service request. + +### 5.2 Network + +Based on [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), a SeaTunnel Engine cluster is a network composed of cluster members running the SeaTunnel Engine server. Cluster members automatically join together to form a cluster. This automatic joining occurs through various discovery mechanisms used by cluster members to detect each other. + +Please note that once the cluster is formed, communication between cluster members always occurs via TCP/IP, regardless of the discovery mechanism used. + +The SeaTunnel Engine utilizes the following discovery mechanisms: + +#### TCP + +You can configure the SeaTunnel Engine as a full TCP/IP cluster. For detailed configuration information, please refer to the [Discovering Members by TCP section](tcp.md). + +An example `hazelcast.yaml` file is as follows: + +```yaml +hazelcast: + cluster-name: seatunnel + network: + join: + tcp-ip: + enabled: true + member-list: + - hostname1 + port: + auto-increment: false + port: 5801 + properties: + hazelcast.logging.type: log4j2 +``` + +TCP is the recommended method for use in a standalone SeaTunnel Engine cluster. + +Alternatively, Hazelcast provides several other service discovery methods. For more details, please refer to [Hazelcast Network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters) + +### 5.3 IMap Persistence Configuration + +In SeaTunnel, we use IMap (a distributed Map that enables the writing and reading of data across nodes and processes. For more information, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) to store the status of each task and task, allowing us to recover tasks and achieve task fault tolerance in the event of a node failure. + +By default, the information in Imap is only stored in memory. We can set the replica count for Imap data. For more details, please refer to (4.1 Backup count setting for data in Imap). If the replica count is set to 2, it means that each data will be stored in two different nodes simultaneously. 
In the event of a node failure, the data in Imap will be automatically replenished to the set replica count on other nodes. However, when all nodes are stopped, the data in Imap will be lost. When the cluster nodes are restarted, all previously running tasks will be marked as failed, and users will need to manually resume them using the seatunnel.sh -r command. + +To address this issue, we can persist the data in Imap to an external storage such as HDFS or OSS. This way, even if all nodes are stopped, the data in Imap will not be lost. When the cluster nodes are restarted, all previously running tasks will be automatically restored. + +The following describes how to use the MapStore persistence configuration. For more details, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) + +**type** + +The type of IMap persistence, currently only supporting `hdfs`. + +**namespace** + +It is used to distinguish the storage location of different business data, such as the name of an OSS bucket. + +**clusterName** + +This parameter is mainly used for cluster isolation, allowing you to distinguish between different clusters, such as cluster1 and cluster2, and can also be used to distinguish different business data. + +**fs.defaultFS** + +We use the hdfs api to read and write files, so providing the hdfs configuration is required for using this storage. + +If using HDFS, you can configure it as follows: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: hdfs://localhost:9000 +``` + +If there is no HDFS and the cluster has only one node, you can configure it to use local files as follows: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: file:/// +``` + +If using OSS, you can configure it as follows: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: oss + block.size: block size(bytes) + oss.bucket: oss://bucket name/ + fs.oss.accessKeyId: OSS access key id + fs.oss.accessKeySecret: OSS access key secret + fs.oss.endpoint: OSS endpoint +``` + +Notice: When using OSS, make sure that the following jars are in the lib directory. + +``` +aliyun-sdk-oss-3.13.2.jar +hadoop-aliyun-3.3.6.jar +jdom2-2.0.6.jar +netty-buffer-4.1.89.Final.jar +netty-common-4.1.89.Final.jar +seatunnel-hadoop3-3.1.4-uber.jar +``` + +## 6. Configure The SeaTunnel Engine Client + +All SeaTunnel Engine client configurations are in the `hazelcast-client.yaml`. + +### 6.1 cluster-name + +The client must have the same `cluster-name` as the SeaTunnel Engine. Otherwise, the SeaTunnel Engine will reject the client's request. + +### 6.2 network + +**cluster-members** + +You need to add the addresses of all SeaTunnel Engine server nodes here. 
+
+```yaml
+hazelcast-client:
+  cluster-name: seatunnel
+  properties:
+    hazelcast.logging.type: log4j2
+  network:
+    cluster-members:
+      - hostname1:5801
+```
+
+## 7. Start The SeaTunnel Engine Server Node
+
+It can be started as a daemon with the `-d` parameter.
+
+```shell
+mkdir -p $SEATUNNEL_HOME/logs
+./bin/seatunnel-cluster.sh -d
+```
+
+The logs will be written to `$SEATUNNEL_HOME/logs/seatunnel-engine-server.log`.
+
+## 8. Install The SeaTunnel Engine Client
+
+You only need to copy the `$SEATUNNEL_HOME` directory on the SeaTunnel Engine node to the client node and configure `SEATUNNEL_HOME` in the same way as on the SeaTunnel Engine server node.
+
+## 9. Submit And Manage Jobs
+
+Now that the cluster is deployed, you can complete the submission and management of jobs through the following tutorial: [Submit And Manage Jobs](user-command.md)
diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md b/versioned_docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md
new file mode 100644
index 000000000000..f4cd0bcb2c55
--- /dev/null
+++ b/versioned_docs/version-2.3.7/seatunnel-engine/local-mode-deployment.md
@@ -0,0 +1,35 @@
+---
+
+sidebar_position: 4
+-------------------
+
+# Run Jobs In Local Mode
+
+Only for testing.
+
+In local mode, each task will start a separate process, and the process will exit when the task is completed. This mode has the following limitations:
+
+1. Pausing and resuming tasks are not supported.
+2. Viewing the task list is not supported.
+3. Jobs cannot be cancelled via commands, only by killing the process.
+4. The REST API is not supported.
+
+The [Separated Cluster Mode](separated-cluster-deployment.md) of SeaTunnel Engine is recommended for use in production environments.
+
+## Deploying SeaTunnel Engine In Local Mode
+
+In local mode, there is no need to deploy a SeaTunnel Engine cluster. You only need to use the following command to submit jobs. The system will start the SeaTunnel Engine (Zeta) service in the process that submitted the job, run the submitted job, and exit after the job is completed.
+
+In this mode, you only need to copy the downloaded and prepared installation package to the server where you need to run it. If you need to adjust the JVM parameters for job execution, you can modify the `$SEATUNNEL_HOME/config/jvm_client_options` file.
+
+## Submitting Jobs
+
+```shell
+$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local
+```
+
+## Job Operations
+
+Jobs submitted in local mode will run in the process that submitted the job, and the process will exit when the job is completed. If you want to abort the job, you only need to exit the process that submitted the job. The job's runtime logs will be output to the standard output of the process that submitted the job.
+
+Other operation and maintenance operations are not supported.
diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/resource-isolation.md b/versioned_docs/version-2.3.7/seatunnel-engine/resource-isolation.md
new file mode 100644
index 000000000000..e16129b53be1
--- /dev/null
+++ b/versioned_docs/version-2.3.7/seatunnel-engine/resource-isolation.md
@@ -0,0 +1,83 @@
+---
+
+sidebar_position: 9
+-------------------
+
+Since version 2.3.6, SeaTunnel can attach a `tag` to each worker node. When you submit a job, you can use `tag_filter` to select the nodes on which the job should run.
+
+# How To Achieve This
+
+1. Update the config in `hazelcast.yaml`:
+
+```yaml
+hazelcast:
+  cluster-name: seatunnel
+  network:
+    rest-api:
+      enabled: true
+      endpoint-groups:
+        CLUSTER_WRITE:
+          enabled: true
+        DATA:
+          enabled: true
+    join:
+      tcp-ip:
+        enabled: true
+        member-list:
+          - localhost
+    port:
+      auto-increment: false
+      port: 5801
+  properties:
+    hazelcast.invocation.max.retry.count: 20
+    hazelcast.tcp.join.port.try.count: 30
+    hazelcast.logging.type: log4j2
+    hazelcast.operation.generic.thread.count: 50
+  member-attributes:
+    group:
+      type: string
+      value: platform
+    team:
+      type: string
+      value: team1
+```
+
+In this config, the tags are specified via `member-attributes`; this node carries the tags `group=platform` and `team=team1`.
+
+2. Add `tag_filter` to your job config:
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+  tag_filter {
+    group = "platform"
+    team = "team1"
+  }
+}
+source {
+  FakeSource {
+    result_table_name = "fake"
+    parallelism = 1
+    schema = {
+      fields {
+        name = "string"
+      }
+    }
+  }
+}
+transform {
+}
+sink {
+  console {
+    source_table_name="fake"
+  }
+}
+```
+
+**Notice:**
+- If `tag_filter` is not set in the job config, the job is randomly scheduled across all active nodes.
+- When you add multiple tags in `tag_filter`, a node matches only if every key exists and every value matches. If no node matches, a `NoEnoughResourceException` is thrown.
+
+![img.png](/image_en/resource-isolation.png)
+
diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/rest-api.md b/versioned_docs/version-2.3.7/seatunnel-engine/rest-api.md
new file mode 100644
index 000000000000..99bba92dae0c
--- /dev/null
+++ b/versioned_docs/version-2.3.7/seatunnel-engine/rest-api.md
@@ -0,0 +1,491 @@
+---
+
+sidebar_position: 11
+--------------------
+
+# RESTful API
+
+SeaTunnel has a monitoring API that can be used to query the status and statistics of running jobs, as well as recently completed jobs. The monitoring API is a RESTful API that accepts HTTP requests and responds with JSON data.
+
+## Overview
+
+The monitoring API is backed by a web server that runs as part of each node, so every cluster member can serve the RESTful API.
+By default, this server listens on port 5801, which can be configured in `hazelcast.yaml` as follows:
+
+```yaml
+network:
+  rest-api:
+    enabled: true
+    endpoint-groups:
+      CLUSTER_WRITE:
+        enabled: true
+      DATA:
+        enabled: true
+  join:
+    tcp-ip:
+      enabled: true
+      member-list:
+        - localhost
+  port:
+    auto-increment: true
+    port-count: 100
+    port: 5801
+```
+
+## API reference
+
+### Returns An Overview Of The Zeta Engine Cluster
+
+ GET /hazelcast/rest/maps/overview?tag1=value1&tag2=value2 (Returns an overview of the Zeta engine cluster.)
+
+#### Parameters
+
+> | name     | type     | data type | description                                                                                                 |
+> |----------|----------|-----------|-------------------------------------------------------------------------------------------------------------|
+> | tag_name | optional | string    | the tag filter; add tag key/value pairs to get the count of matched workers and the slots on those workers   |
+
+#### Responses
+
+```json
+{
+    "projectVersion":"2.3.5-SNAPSHOT",
+    "gitCommitAbbrev":"DeadD0d0",
+    "totalSlot":"0",
+    "unassignedSlot":"0",
+    "works":"1",
+    "runningJobs":"0",
+    "finishedJobs":"0",
+    "failedJobs":"0",
+    "cancelledJobs":"0"
+}
+```
+
+**Notes:**
+- If you use `dynamic-slot`, `totalSlot` and `unassignedSlot` are always `0`. When a fixed slot number is configured, the correct total and unassigned slot counts are returned.
+- If the URL contains a tag filter, `works`, `totalSlot` and `unassignedSlot` only reflect the matched workers, but job-related metrics are always reported at the cluster level.
+
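+For a quick check from the command line, the endpoint can be called with `curl`. The host and port below (`localhost:5801`) are an assumption based on the default configuration shown above, and the tag names reuse the `group`/`team` example from the resource isolation page; adjust them to your deployment.
+
+```shell
+# Cluster-wide overview
+curl -s "http://localhost:5801/hazelcast/rest/maps/overview"
+
+# Overview restricted to workers that carry the given tags
+curl -s "http://localhost:5801/hazelcast/rest/maps/overview?group=platform&team=team1"
+```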
    + +------------------------------------------------------------------------------------------ + +### Returns An Overview And State Of All Jobs + +
    + GET /hazelcast/rest/maps/running-jobs (Returns an overview over all jobs and their current state.) + +#### Parameters + +#### Responses + +```json +[ + { + "jobId": "", + "jobName": "", + "jobStatus": "", + "envOptions": { + }, + "createTime": "", + "jobDag": { + "vertices": [ + ], + "edges": [ + ] + }, + "pluginJarsUrls": [ + ], + "isStartWithSavePoint": false, + "metrics": { + "sourceReceivedCount": "", + "sinkWriteCount": "" + } + } +] +``` + +
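+A `curl` sketch for this endpoint, assuming the node listens on `localhost:5801`; the optional `jq` filter only extracts the job IDs and can be dropped if `jq` is not installed.
+
+```shell
+# List all running jobs
+curl -s "http://localhost:5801/hazelcast/rest/maps/running-jobs"
+
+# Extract only the job IDs (requires jq)
+curl -s "http://localhost:5801/hazelcast/rest/maps/running-jobs" | jq -r '.[].jobId'
+```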
    + +------------------------------------------------------------------------------------------ + +### Return Details Of A Job + +
+ GET /hazelcast/rest/maps/job-info/:jobId (Returns details of a job.)
+
+#### Parameters
+
+> | name  | type     | data type | description |
+> |-------|----------|-----------|-------------|
+> | jobId | required | long      | job id      |
+
+#### Responses
+
+```json
+{
+  "jobId": "",
+  "jobName": "",
+  "jobStatus": "",
+  "createTime": "",
+  "jobDag": {
+    "vertices": [
+    ],
+    "edges": [
+    ]
+  },
+  "metrics": {
+    "sourceReceivedCount": "",
+    "sinkWriteCount": ""
+  },
+  "finishedTime": "",
+  "errorMsg": null,
+  "envOptions": {
+  },
+  "pluginJarsUrls": [
+  ],
+  "isStartWithSavePoint": false
+}
+```
+
+`jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, and `metrics` are always returned.
+`envOptions`, `pluginJarsUrls`, and `isStartWithSavePoint` are only returned while the job is running.
+`finishedTime` and `errorMsg` are only returned after the job has finished.
+
+If the job information cannot be found, the response is:
+
+```json
+{
+  "jobId" : ""
+}
+```
+
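+A `curl` sketch for this endpoint; `localhost:5801` is an assumed address and the job ID is the sample value used elsewhere on this page.
+
+```shell
+# Replace the job ID with one returned by submit-job or listed by running-jobs
+curl -s "http://localhost:5801/hazelcast/rest/maps/job-info/733584788375666689"
+```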
    + +------------------------------------------------------------------------------------------ + +### Return Details Of A Job + +This API has been deprecated, please use /hazelcast/rest/maps/job-info/:jobId instead + +
+ GET /hazelcast/rest/maps/running-job/:jobId (Returns details of a job.)
+
+#### Parameters
+
+> | name  | type     | data type | description |
+> |-------|----------|-----------|-------------|
+> | jobId | required | long      | job id      |
+
+#### Responses
+
+```json
+{
+  "jobId": "",
+  "jobName": "",
+  "jobStatus": "",
+  "createTime": "",
+  "jobDag": {
+    "vertices": [
+    ],
+    "edges": [
+    ]
+  },
+  "metrics": {
+    "sourceReceivedCount": "",
+    "sinkWriteCount": ""
+  },
+  "finishedTime": "",
+  "errorMsg": null,
+  "envOptions": {
+  },
+  "pluginJarsUrls": [
+  ],
+  "isStartWithSavePoint": false
+}
+```
+
+`jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, and `metrics` are always returned.
+`envOptions`, `pluginJarsUrls`, and `isStartWithSavePoint` are only returned while the job is running.
+`finishedTime` and `errorMsg` are only returned after the job has finished.
+
+If the job information cannot be found, the response is:
+
+```json
+{
+  "jobId" : ""
+}
+```
+
    + +------------------------------------------------------------------------------------------ + +### Return All Finished Jobs Info + +
    + GET /hazelcast/rest/maps/finished-jobs/:state (Return all finished Jobs Info.) + +#### Parameters + +> | name | type | data type | description | +> |-------|----------|-----------|------------------------------------------------------------------| +> | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`UNKNOWABLE` | + +#### Responses + +```json +[ + { + "jobId": "", + "jobName": "", + "jobStatus": "", + "errorMsg": null, + "createTime": "", + "finishTime": "", + "jobDag": "", + "metrics": "" + } +] +``` + +
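+A `curl` sketch, assuming the node listens on `localhost:5801`; the state segment is optional, as described in the table above.
+
+```shell
+# All finished jobs, regardless of final state
+curl -s "http://localhost:5801/hazelcast/rest/maps/finished-jobs"
+
+# Only jobs that finished successfully
+curl -s "http://localhost:5801/hazelcast/rest/maps/finished-jobs/FINISHED"
+```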
    + +------------------------------------------------------------------------------------------ + +### Returns System Monitoring Information + +
    + GET /hazelcast/rest/maps/system-monitoring-information (Returns system monitoring information.) + +#### Parameters + +#### Responses + +```json +[ + { + "processors":"8", + "physical.memory.total":"16.0G", + "physical.memory.free":"16.3M", + "swap.space.total":"0", + "swap.space.free":"0", + "heap.memory.used":"135.7M", + "heap.memory.free":"440.8M", + "heap.memory.total":"576.5M", + "heap.memory.max":"3.6G", + "heap.memory.used/total":"23.54%", + "heap.memory.used/max":"3.73%", + "minor.gc.count":"6", + "minor.gc.time":"110ms", + "major.gc.count":"2", + "major.gc.time":"73ms", + "load.process":"24.78%", + "load.system":"60.00%", + "load.systemAverage":"2.07", + "thread.count":"117", + "thread.peakCount":"118", + "cluster.timeDiff":"0", + "event.q.size":"0", + "executor.q.async.size":"0", + "executor.q.client.size":"0", + "executor.q.client.query.size":"0", + "executor.q.client.blocking.size":"0", + "executor.q.query.size":"0", + "executor.q.scheduled.size":"0", + "executor.q.io.size":"0", + "executor.q.system.size":"0", + "executor.q.operations.size":"0", + "executor.q.priorityOperation.size":"0", + "operations.completed.count":"10", + "executor.q.mapLoad.size":"0", + "executor.q.mapLoadAllKeys.size":"0", + "executor.q.cluster.size":"0", + "executor.q.response.size":"0", + "operations.running.count":"0", + "operations.pending.invocations.percentage":"0.00%", + "operations.pending.invocations.count":"0", + "proxy.count":"8", + "clientEndpoint.count":"0", + "connection.active.count":"2", + "client.connection.count":"0", + "connection.count":"0" + } +] +``` + +
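+A `curl` sketch, assuming the node listens on `localhost:5801`:
+
+```shell
+curl -s "http://localhost:5801/hazelcast/rest/maps/system-monitoring-information"
+```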
    + +------------------------------------------------------------------------------------------ + +### Submit A Job + +
    +POST /hazelcast/rest/maps/submit-job (Returns jobId and jobName if job submitted successfully.) + +#### Parameters + +> | name | type | data type | description | +> |----------------------|----------|-----------|-----------------------------------| +> | jobId | optional | string | job id | +> | jobName | optional | string | job name | +> | isStartWithSavePoint | optional | string | if job is started with save point | + +#### Body + +```json +{ + "env": { + "job.mode": "batch" + }, + "source": [ + { + "plugin_name": "FakeSource", + "result_table_name": "fake", + "row.num": 100, + "schema": { + "fields": { + "name": "string", + "age": "int", + "card": "int" + } + } + } + ], + "transform": [ + ], + "sink": [ + { + "plugin_name": "Console", + "source_table_name": ["fake"] + } + ] +} +``` + +#### Responses + +```json +{ + "jobId": 733584788375666689, + "jobName": "rest_api_test" +} +``` + +
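+As a sketch, the request body above can be saved to a file (assumed here to be `job.json`) and submitted with `curl`. The address `localhost:5801` and passing `jobName` as a query parameter are assumptions; adjust them to your deployment.
+
+```shell
+# Submit the job config stored in job.json; the response contains jobId and jobName
+curl -s -X POST \
+  -H "Content-Type: application/json" \
+  -d @job.json \
+  "http://localhost:5801/hazelcast/rest/maps/submit-job?jobName=rest_api_test"
+```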
    + +------------------------------------------------------------------------------------------ + +### Stop A Job + +
+POST /hazelcast/rest/maps/stop-job (Returns the jobId if the job is stopped successfully.)
+
+#### Body
+
+```json
+{
+  "jobId": 733584788375666689,
+  "isStopWithSavePoint": false
+}
+```
+
+`isStopWithSavePoint` controls whether the job is stopped with a savepoint.
+
+#### Responses
+
+```json
+{
+  "jobId": 733584788375666689
+}
+```
+
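+A minimal `curl` sketch for this endpoint; the host and port (`localhost:5801`) and the job ID are placeholder assumptions and should be replaced with your own values.
+
+```shell
+# Ask the cluster to stop the job without taking a savepoint
+curl -s -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"jobId": 733584788375666689, "isStopWithSavePoint": false}' \
+  "http://localhost:5801/hazelcast/rest/maps/stop-job"
+```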
    + +------------------------------------------------------------------------------------------ + +### Encrypt Config + +
    +POST /hazelcast/rest/maps/encrypt-config (Returns the encrypted config if config is encrypted successfully.) +For more information about customize encryption, please refer to the documentation [config-encryption-decryption](../connector-v2/Config-Encryption-Decryption.md). + +#### Body + +```json +{ + "env": { + "parallelism": 1, + "shade.identifier":"base64" + }, + "source": [ + { + "plugin_name": "MySQL-CDC", + "schema" : { + "fields": { + "name": "string", + "age": "int" + } + }, + "result_table_name": "fake", + "parallelism": 1, + "hostname": "127.0.0.1", + "username": "seatunnel", + "password": "seatunnel_password", + "table-name": "inventory_vwyw0n" + } + ], + "transform": [ + ], + "sink": [ + { + "plugin_name": "Clickhouse", + "host": "localhost:8123", + "database": "default", + "table": "fake_all", + "username": "seatunnel", + "password": "seatunnel_password" + } + ] +} +``` + +#### Responses + +```json +{ + "env": { + "parallelism": 1, + "shade.identifier": "base64" + }, + "source": [ + { + "plugin_name": "MySQL-CDC", + "schema": { + "fields": { + "name": "string", + "age": "int" + } + }, + "result_table_name": "fake", + "parallelism": 1, + "hostname": "127.0.0.1", + "username": "c2VhdHVubmVs", + "password": "c2VhdHVubmVsX3Bhc3N3b3Jk", + "table-name": "inventory_vwyw0n" + } + ], + "transform": [], + "sink": [ + { + "plugin_name": "Clickhouse", + "host": "localhost:8123", + "database": "default", + "table": "fake_all", + "username": "c2VhdHVubmVs", + "password": "c2VhdHVubmVsX3Bhc3N3b3Jk" + } + ] +} +``` + +
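+A `curl` sketch for this endpoint; the plain-text job config shown above is assumed to be saved in `config.json`, and `localhost:5801` is an assumed address.
+
+```shell
+# Returns the same config with the username and password fields encrypted
+curl -s -X POST \
+  -H "Content-Type: application/json" \
+  -d @config.json \
+  "http://localhost:5801/hazelcast/rest/maps/encrypt-config"
+```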
    + diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/savepoint.md b/versioned_docs/version-2.3.7/seatunnel-engine/savepoint.md new file mode 100644 index 000000000000..06d4e6b6b34c --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/savepoint.md @@ -0,0 +1,24 @@ +--- + +sidebar_position: 8 +------------------- + +# Savepoint And Restore With Savepoint + +Savepoint is created for using the checkpoint. A global mirror of job execution status can be used for job or seatunnel stop and recovery, upgrade, etc. + +## Use Savepoint + +To use savepoint, you need to ensure that the connector used by the job supports checkpoint, otherwise data may be lost or duplicated. + +1. Make sure the job is running + +2. Use the following command to trigger savepoint: + ```./bin/seatunnel.sh -s {jobId}``` + +After successful execution, the checkpoint data will be saved and the task will end. + +## Use Restore With Savepoint + +Resume from savepoint using jobId +```./bin/seatunnel.sh -c {jobConfig} -r {jobId}``` diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md b/versioned_docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md new file mode 100644 index 000000000000..168cac8d0f07 --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/separated-cluster-deployment.md @@ -0,0 +1,427 @@ +--- + +sidebar_position: 6 +------------------- + +# Deploy SeaTunnel Engine In Separated Cluster Mode + +The Master service and Worker service of SeaTunnel Engine are separated, and each service is a separate process. The Master node is only responsible for job scheduling, RESTful API, task submission, etc., and the Imap data is only stored on the Master node. The Worker node is only responsible for the execution of tasks and does not participate in the election to become the master nor stores Imap data. + +Among all the Master nodes, only one Master node works at the same time, and the other Master nodes are in the standby state. When the current Master node fails or the heartbeat times out, a new Master Active node will be elected from the other Master nodes. + +This is the most recommended usage method. In this mode, the load on the Master will be very low, and the Master has more resources for job scheduling, task fault tolerance index monitoring, and providing RESTful API services, etc., and will have higher stability. At the same time, the Worker node does not store Imap data. All Imap data is stored on the Master node. Even if the Worker node has a high load or crashes, it will not cause the Imap data to be redistributed. + +## 1. Download + +[Download And Make SeaTunnel Installation Package](download-seatunnel.md) + +## 2. Configure SEATUNNEL_HOME + +You can configure `SEATUNNEL_HOME` by adding the `/etc/profile.d/seatunnel.sh` file. The content of `/etc/profile.d/seatunnel.sh` is as follows: + +``` +export SEATUNNEL_HOME=${seatunnel install path} +export PATH=$PATH:$SEATUNNEL_HOME/bin +``` + +## 3. Configure JVM Options For Master Nodes + +The JVM parameters of the Master node are configured in the `$SEATUNNEL_HOME/config/jvm_master_options` file. + +```shell +# JVM Heap +-Xms2g +-Xmx2g + +# JVM Dump +-XX:+HeapDumpOnOutOfMemoryError +-XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server + +# Metaspace +-XX:MaxMetaspaceSize=2g + +# G1GC +-XX:+UseG1GC +``` + +The JVM parameters of the Worker node are configured in the `$SEATUNNEL_HOME/config/jvm_worker_options` file. 
+ +```shell +# JVM Heap +-Xms2g +-Xmx2g + +# JVM Dump +-XX:+HeapDumpOnOutOfMemoryError +-XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server + +# Metaspace +-XX:MaxMetaspaceSize=2g + +# G1GC +-XX:+UseG1GC +``` + +## 4. Configure SeaTunnel Engine + +SeaTunnel Engine provides many functions and needs to be configured in `seatunnel.yaml`. + +### 4.1 Setting the backup number of data in Imap (this parameter is not effective on the Worker node) + +SeaTunnel Engine implements cluster management based on [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/). The status data of the cluster (job running status, resource status) is stored in [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map). The data stored in Hazelcast IMap will be distributed and stored on all nodes of the cluster. Hazelcast partitions the data stored in Imap. Each partition can specify the number of backups. Therefore, SeaTunnel Engine can achieve cluster HA without using other services (such as zookeeper). + +The `backup count` is a parameter that defines the number of synchronous backups. For example, if it is set to 1, the backup of the partition will be placed on one other member. If it is set to 2, it will be placed on two other members. + +We recommend that the value of `backup-count` be `min(1, max(5, N/2))`. `N` is the number of cluster nodes. + +```yaml +seatunnel: + engine: + backup-count: 1 + # other configurations +``` + +:::tip + +Since in the separated cluster mode, the Worker node does not store Imap data, the `backup-count` configuration of the Worker node is not effective. If the Master and Worker processes are started on the same machine, the Master and Worker will share the `seatunnel.yaml` configuration file. At this time, the Worker node service will ignore the `backup-count` configuration. + +::: + +### 4.2 Slot configuration (this parameter is not effective on the Master node) + +The number of Slots determines the number of task groups that can be run in parallel on the cluster node. The number of Slots required by a task is formulated as N = 2 + P (parallelism configured by the task). By default, the number of Slots of SeaTunnel Engine is dynamic, that is, there is no limit on the number. We recommend that the number of Slots be set to twice the number of CPU cores of the node. + +The configuration of dynamic slot number (default) is as follows: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: true + # other configurations +``` + +The configuration of static slot number is as follows: + +```yaml +seatunnel: + engine: + slot-service: + dynamic-slot: false + slot-num: 20 +``` + +:::tip + +Since in the separated cluster mode, the Master node does not run tasks, so the Master service will not start the Slot service, and the `slot-service` configuration of the Master node is not effective. If the Master and Worker processes are started on the same machine, the Master and Worker will share the `seatunnel.yaml` configuration file. At this time, the Master node service will ignore the `slot-service` configuration. + +::: + +### 4.3 Checkpoint Manager (This parameter is invalid on the Worker node) + +Just like Flink, the SeaTunnel Engine supports the Chandy–Lamport algorithm. Therefore, data synchronization without data loss and duplication can be achieved. + +**interval** + +The interval between two checkpoints, in milliseconds. 
If the `checkpoint.interval` parameter is configured in the `env` of the job configuration file, it will be subject to the setting in the job configuration file. + +**timeout** + +The timeout time of the checkpoint. If the checkpoint cannot be completed within the timeout time, it will trigger a checkpoint failure and the job fails. If the `checkpoint.timeout` parameter is configured in the `env` of the job configuration file, it will be subject to the setting in the job configuration file. + +Example + +```yaml +seatunnel: + engine: + backup-count: 1 + print-execution-info-interval: 10 + slot-service: + dynamic-slot: true + checkpoint: + interval: 300000 + timeout: 10000 +``` + +**checkpoint storage** + +The checkpoint is a fault-tolerant recovery mechanism. This mechanism ensures that when the program is running, even if it suddenly encounters an exception, it can recover by itself. The checkpoints are triggered regularly, and when each checkpoint is performed, each Task will be required to report its own state information (such as which offset has been read when reading Kafka) to the checkpoint thread, which writes it into a distributed storage (or shared storage). When the task fails and then automatically recovers from fault tolerance, or when recovering a previously paused task through the seatunnel.sh -r instruction, the state information of the corresponding job will be loaded from the checkpoint storage, and the job will be recovered based on these state information. + +If the number of nodes in the cluster is greater than 1, the checkpoint storage must be a distributed storage or a shared storage, so as to ensure that the task state information stored in it can still be loaded on another node after any node fails. + +:::tip + +The checkpoint configuration is only read by the Master service, and the Worker service will not read the checkpoint configuration. If the Master and Worker processes are started on the same machine, the Master and Worker will share the `seatunnel.yaml` configuration file, and at this time the Worker node service will ignore the `checkpoint` configuration. + +::: + +For information about checkpoint storage, you can view [checkpoint storage](checkpoint-storage.md). + +### 4.4 History Job Expiry Configuration + +The information of each completed job, such as status, counters, and error logs, is stored in an IMap object. As the number of running jobs increases, the memory will increase, and eventually the memory will overflow. Therefore, you can adjust the `history-job-expire-minutes` parameter to solve this problem. The time unit of this parameter is minutes. The default value is 1440 minutes, that is, one day. + +Example + +```yaml +seatunnel: + engine: + history-job-expire-minutes: 1440 +``` + +### 4.5 Class Loader Cache Mode + +This configuration mainly solves the problem of resource leakage caused by continuously creating and attempting to destroy class loaders. +If you encounter an exception related to metaspace space overflow, you can try to enable this configuration. +In order to reduce the frequency of creating class loaders, after enabling this configuration, SeaTunnel will not try to release the corresponding class loader when the job is completed, so that it can be used by subsequent jobs, that is to say, when not too many types of Source/Sink connector are used in the running job, it is more effective. +The default value is false. 
+Example + +```yaml +seatunnel: + engine: + classloader-cache-mode: true +``` + +### 4.6 Persistence Configuration of IMap (This parameter is invalid on the Worker node) + +:::tip + +Since in the separated cluster mode, only the Master node stores IMap data and the Worker node does not store IMap data, the Worker service will not read this parameter item. + +::: + +In SeaTunnel, we use IMap (a distributed Map that can implement the writing and reading of data across nodes and processes. For detailed information, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) to store the state of each task and its task, so that after the node where the task is located fails, the state information of the task before can be obtained on other nodes, thereby recovering the task and realizing the fault tolerance of the task. + +By default, the information of IMap is only stored in the memory, and we can set the number of replicas of IMap data. For specific reference (4.1 Setting the number of backups of data in IMap), if the number of replicas is 2, it means that each data will be simultaneously stored in 2 different nodes. Once the node fails, the data in IMap will be automatically replenished to the set number of replicas on other nodes. But when all nodes are stopped, the data in IMap will be lost. When the cluster nodes are started again, all previously running tasks will be marked as failed and need to be recovered manually by the user through the seatunnel.sh -r instruction. + +To solve this problem, we can persist the data in IMap to an external storage such as HDFS, OSS, etc. In this way, even if all nodes are stopped, the data in IMap will not be lost, and when the cluster nodes are started again, all previously running tasks will be automatically recovered. + +The following describes how to use the MapStore persistence configuration. For detailed information, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) + +**type** + +The type of IMap persistence, currently only supports `hdfs`. + +**namespace** + +It is used to distinguish the data storage locations of different businesses, such as the OSS bucket name. + +**clusterName** + +This parameter is mainly used for cluster isolation. We can use it to distinguish different clusters, such as cluster1, cluster2, which is also used to distinguish different businesses. + +**fs.defaultFS** + +We use the hdfs api to read and write files, so providing the hdfs configuration is required for using this storage. 
+ +If you use HDFS, you can configure it like this: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: hdfs://localhost:9000 +``` + +If there is no HDFS and your cluster has only one node, you can configure it like this to use local files: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: hdfs + fs.defaultFS: file:/// +``` + +If you use OSS, you can configure it like this: + +```yaml +map: + engine*: + map-store: + enabled: true + initial-mode: EAGER + factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory + properties: + type: hdfs + namespace: /tmp/seatunnel/imap + clusterName: seatunnel-cluster + storage.type: oss + block.size: block size(bytes) + oss.bucket: oss://bucket name/ + fs.oss.accessKeyId: OSS access key id + fs.oss.accessKeySecret: OSS access key secret + fs.oss.endpoint: OSS endpoint +``` + +Notice: When using OSS, make sure that the following jars are in the lib directory. + +``` +aliyun-sdk-oss-3.13.2.jar +hadoop-aliyun-3.3.6.jar +jdom2-2.0.6.jar +netty-buffer-4.1.89.Final.jar +netty-common-4.1.89.Final.jar +seatunnel-hadoop3-3.1.4-uber.jar +``` + +## 5. Configuring SeaTunnel Engine Network Services + +All network-related configurations of the SeaTunnel Engine are in the `hazelcast-master.yaml` and `hazelcast-worker.yaml` files. + +### 5.1 cluster-name + +SeaTunnel Engine nodes use the `cluster-name` to determine whether another node is in the same cluster as themselves. If the cluster names between two nodes are different, the SeaTunnel Engine will reject service requests. + +### 5.2 network + +Based on [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), a SeaTunnel Engine cluster is a network composed of cluster members running the SeaTunnel Engine server. Cluster members automatically join together to form a cluster. This automatic joining is through the various discovery mechanisms used by cluster members to discover each other. + +Please note that after the cluster is formed, the communication between cluster members is always through TCP/IP regardless of the discovery mechanism used. + +The SeaTunnel Engine uses the following discovery mechanisms. + +#### tcp-ip + +You can configure the SeaTunnel Engine as a complete TCP/IP cluster. For configuration details, please refer to the [Discovering Members by TCP section](tcp.md). + +In the separated cluster mode, the Master and Worker services use different ports. 
+ +Master node network configuration `hazelcast-master.yaml` + +```yaml +hazelcast: + cluster-name: seatunnel + network: + rest-api: + enabled: true + endpoint-groups: + CLUSTER_WRITE: + enabled: true + DATA: + enabled: true + join: + tcp-ip: + enabled: true + member-list: + - master-node-1:5801 + - master-node-2:5801 + - worker-node-1:5802 + - worker-node-2:5802 + port: + auto-increment: false + port: 5801 + properties: + hazelcast.heartbeat.failuredetector.type: phi-accrual + hazelcast.heartbeat.interval.seconds: 2 + hazelcast.max.no.heartbeat.seconds: 180 + hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 + hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 + hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 +``` + +Worker node network configuration `hazelcast-worker.yaml` + +```yaml +hazelcast: + cluster-name: seatunnel + network: + join: + tcp-ip: + enabled: true + member-list: + - master-node-1:5801 + - master-node-2:5801 + - worker-node-1:5802 + - worker-node-2:5802 + port: + auto-increment: false + port: 5802 + properties: + hazelcast.heartbeat.failuredetector.type: phi-accrual + hazelcast.heartbeat.interval.seconds: 2 + hazelcast.max.no.heartbeat.seconds: 180 + hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 + hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 + hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 +``` + +TCP is the way we recommend to use in a standalone SeaTunnel Engine cluster. + +On the other hand, Hazelcast provides some other service discovery methods. For details, please refer to [hazelcast network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters). + +## 6. Starting the SeaTunnel Engine Master Node + +It can be started using the `-d` parameter through the daemon. + +```shell +mkdir -p $SEATUNNEL_HOME/logs +./bin/seatunnel-cluster.sh -d -r master +``` + +The logs will be written to `$SEATUNNEL_HOME/logs/seatunnel-engine-master.log`. + +## 7. Starting The SeaTunnel Engine Worker Node + +It can be started using the `-d` parameter through the daemon. + +```shell +mkdir -p $SEATUNNEL_HOME/logs +./bin/seatunnel-cluster.sh -d -r worker +``` + +The logs will be written to `$SEATUNNEL_HOME/logs/seatunnel-engine-worker.log`. + +## 8. Installing The SeaTunnel Engine Client + +### 8.1 Setting the `SEATUNNEL_HOME` the same as the server + +You can configure the `SEATUNNEL_HOME` by adding the `/etc/profile.d/seatunnel.sh` file. The content of `/etc/profile.d/seatunnel.sh` is as follows: + +``` +export SEATUNNEL_HOME=${seatunnel install path} +export PATH=$PATH:$SEATUNNEL_HOME/bin +``` + +### 8.2 Configuring The SeaTunnel Engine Client + +All configurations of the SeaTunnel Engine client are in the `hazelcast-client.yaml`. + +**cluster-name** + +The client must have the same `cluster-name` as the SeaTunnel Engine. Otherwise, the SeaTunnel Engine will reject the client's request. + +**network** + +All addresses of the SeaTunnel Engine Master nodes need to be added here. + +```yaml +hazelcast-client: + cluster-name: seatunnel + properties: + hazelcast.logging.type: log4j2 + network: + cluster-members: + - master-node-1:5801 + - master-node-2:5801 +``` + +# 9 Submitting And Managing Jobs + +Now that the cluster has been deployed, you can complete the job submission and management through the following tutorial: [Submitting And Managing Jobs](user-command.md). 
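+
+As a quick sanity check after the client is configured, you can submit the bundled batch template from the client node. The config path below is the default one shipped with the installation package, and cluster mode is the default submit mode, so no extra flags are needed.
+
+```shell
+# Submit the sample batch job from the client node to the separated cluster
+$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template
+```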
diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/tcp.md b/versioned_docs/version-2.3.7/seatunnel-engine/tcp.md new file mode 100644 index 000000000000..b28907ac8f1a --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/tcp.md @@ -0,0 +1,37 @@ +--- + +sidebar_position: 10 +-------------------- + +# TCP Network + +If multicast is not the preferred way of discovery for your environment, then you can configure SeaTunnel Engine to be a full TCP/IP cluster. When you configure SeaTunnel Engine to discover members by TCP/IP, you must list all or a subset of the members' host names and/or IP addresses as cluster members. You do not have to list all of these cluster members, but at least one of the listed members has to be active in the cluster when a new member joins. + +To configure your Hazelcast to be a full TCP/IP cluster, set the following configuration elements. See the tcp-ip element section for the full descriptions of the TCP/IP discovery configuration elements. + +- Set the enabled attribute of the tcp-ip element to true. +- Provide your member elements within the tcp-ip element. + +The following is an example declarative configuration. + +```yaml +hazelcast: + network: + join: + tcp-ip: + enabled: true + member-list: + - machine1 + - machine2 + - machine3:5799 + - 192.168.1.0-7 + - 192.168.1.21 +``` + +As shown above, you can provide IP addresses or host names for member elements. You can also give a range of IP addresses, such as `192.168.1.0-7`. + +Instead of providing members line-by-line as shown above, you also have the option to use the members element and write comma-separated IP addresses, as shown below. + +`192.168.1.0-7,192.168.1.21` + +If you do not provide ports for the members, Hazelcast automatically tries the ports `5701`, `5702` and so on. diff --git a/versioned_docs/version-2.3.7/seatunnel-engine/user-command.md b/versioned_docs/version-2.3.7/seatunnel-engine/user-command.md new file mode 100644 index 000000000000..a18ec931e09a --- /dev/null +++ b/versioned_docs/version-2.3.7/seatunnel-engine/user-command.md @@ -0,0 +1,123 @@ +--- + +sidebar_position: 12 +-------------------- + +# Command Line Tool + +The SeaTunnel Engine provides a command line tool for managing the jobs of the SeaTunnel Engine. You can use the command line tool to submit, stop, pause, resume, delete jobs, view job status and monitoring metrics, etc. + +You can obtain the help information of the command line tool through the following command: + +```shell +sh bin/seatunnel.sh -h +``` + +The output is as follows: + +``` + +Usage: seatunnel.sh [options] + Options: + --async Run the job asynchronously. When the job is submitted, the client will exit (default: false). + -can, --cancel-job Cancel the job by JobId. + --check Whether to check the config (default: false). + -cj, --close-job Close the client and the task will also be closed (default: true). + -cn, --cluster The name of the cluster. + -c, --config Config file. + --decrypt Decrypt the config file. When both --decrypt and --encrypt are specified, only --encrypt will take effect (default: false). + -m, --master, -e, --deploy-mode SeaTunnel job submit master, support [local, cluster] (default: cluster). + --encrypt Encrypt the config file. When both --decrypt and --encrypt are specified, only --encrypt will take effect (default: false). + --get_running_job_metrics Get metrics for running jobs (default: false). + -h, --help Show the usage message. + -j, --job-id Get the job status by JobId. 
+ -l, --list List the job status (default: false). + --metrics Get the job metrics by JobId. + -n, --name The SeaTunnel job name (default: SeaTunnel). + -r, --restore Restore with savepoint by jobId. + -s, --savepoint Savepoint the job by jobId. + -i, --variable Variable substitution, such as -i city=beijing, or -i date=20190318. We use ',' as a separator. When inside "", ',' are treated as normal characters instead of delimiters. (default: []). + +``` + +## Submitting Jobs + +```shell +sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template +``` + +The **--async** parameter allows the job to run in the background. When the job is submitted, the client will exit. + +```shell +sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async +``` + +The **-n** or **--name** parameter can specify the name of the job. + +```shell +sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async -n myjob +``` + +## Viewing The Job List + +```shell +sh bin/seatunnel.sh -l +``` + +This command will output the list of all jobs in the current cluster (including completed historical jobs and running jobs). + +## Viewing The Job Status + +```shell +sh bin/seatunnel.sh -j <jobId> +``` + +This command will output the status information of the specified job. + +## Getting The Monitoring Information Of Running Jobs + +```shell +sh bin/seatunnel.sh --get_running_job_metrics +``` + +This command will output the monitoring information of running jobs. + +## Getting the Monitoring Information of a Specified Job + +The --metrics parameter can get the monitoring information of a specified job. + +```shell +sh bin/seatunnel.sh --metrics <jobId> +``` + +## Pausing Jobs + +```shell +sh bin/seatunnel.sh -s <jobId> +``` + +This command will pause the specified job. Note that only jobs with checkpoints enabled support pausing jobs (real-time synchronization jobs have checkpoints enabled by default, and batch jobs do not have checkpoints enabled by default and need to configure checkpoint.interval in `env` to enable checkpoints). + +Pausing a job is in the smallest unit of split. That is, after pausing a job, it will wait for the currently running split to finish running and then pause. After the task is resumed, it will continue to run from the paused split. + +## Resuming Jobs + +```shell +sh bin/seatunnel.sh -r <jobId> -c $SEATUNNEL_HOME/config/v2.batch.config.template +``` + +This command will resume the specified job. Note that only jobs with checkpoints enabled support resuming jobs (real-time synchronization jobs have checkpoints enabled by default, and batch jobs do not have checkpoints enabled by default and need to configure checkpoint.interval in `env` to enable checkpoints). + +Resuming a job requires the jobId and the configuration file of the job. + +Both failed jobs and jobs paused by seatunnel.sh -s <jobId> can be resumed by this command. + +## Canceling Jobs + +```shell +sh bin/seatunnel.sh -can <jobId> +``` + +This command will cancel the specified job. After canceling the job, the job will be stopped and its status will become `CANCELED`. + +All breakpoint information of the canceled job will be deleted and cannot be resumed by seatunnel.sh -r <jobId>. 
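+
+Putting the commands above together, a typical lifecycle for a single job might look like the sketch below. `<jobId>` stands for the ID printed when the job is submitted, and the config path is the bundled batch template; both are placeholders.
+
+```shell
+# Submit a named job in the background
+sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async -n myjob
+
+# Check its status and metrics
+sh bin/seatunnel.sh -l
+sh bin/seatunnel.sh -j <jobId>
+sh bin/seatunnel.sh --metrics <jobId>
+
+# Pause it, resume it from the saved state, or cancel it entirely
+sh bin/seatunnel.sh -s <jobId>
+sh bin/seatunnel.sh -r <jobId> -c $SEATUNNEL_HOME/config/v2.batch.config.template
+sh bin/seatunnel.sh -can <jobId>
+```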
diff --git a/versioned_docs/version-2.3.7/start-v2/docker/docker.md b/versioned_docs/version-2.3.7/start-v2/docker/docker.md new file mode 100644 index 000000000000..111df5b20c97 --- /dev/null +++ b/versioned_docs/version-2.3.7/start-v2/docker/docker.md @@ -0,0 +1,9 @@ +--- + +sidebar_position: 3 +------------------- + +# Set Up With Docker + + +--> diff --git a/versioned_docs/version-2.3.7/start-v2/kubernetes/kubernetes.mdx b/versioned_docs/version-2.3.7/start-v2/kubernetes/kubernetes.mdx new file mode 100644 index 000000000000..7c5a4ac27950 --- /dev/null +++ b/versioned_docs/version-2.3.7/start-v2/kubernetes/kubernetes.mdx @@ -0,0 +1,772 @@ +--- +sidebar_position: 4 +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Set Up with Kubernetes + +This section provides a quick guide to use SeaTunnel with Kubernetes. + +## Prerequisites + +We assume that you have one local installation as follow: + +- [docker](https://docs.docker.com/) +- [kubernetes](https://kubernetes.io/) +- [helm](https://helm.sh/docs/intro/quickstart/) + +So that the `kubectl` and `helm` commands are available on your local system. + +Take kubernetes [minikube](https://minikube.sigs.k8s.io/docs/start/) as an example, you can start a cluster with the following command: + +```bash +minikube start --kubernetes-version=v1.23.3 +``` + +## Installation + +### SeaTunnel Docker Image + +To run the image with SeaTunnel, first create a `Dockerfile`: + + + + +```Dockerfile +FROM flink:1.13 + +ENV SEATUNNEL_VERSION="2.3.7" +ENV SEATUNNEL_HOME="/opt/seatunnel" + +RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} + +RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} +``` + +Then run the following commands to build the image: +```bash +docker build -t seatunnel:2.3.7-flink-1.13 -f Dockerfile . +``` +Image `seatunnel:2.3.7-flink-1.13` needs to be present in the host (minikube) so that the deployment can take place. + +Load image to minikube via: +```bash +minikube image load seatunnel:2.3.7-flink-1.13 +``` + + + + + +```Dockerfile +FROM openjdk:8 + +ENV SEATUNNEL_VERSION="2.3.7" +ENV SEATUNNEL_HOME="/opt/seatunnel" + +RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} + +RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} +``` + +Then run the following commands to build the image: +```bash +docker build -t seatunnel:2.3.7 -f Dockerfile . +``` +Image `seatunnel:2.3.7` need to be present in the host (minikube) so that the deployment can take place. 
+ +Load image to minikube via: +```bash +minikube image load seatunnel:2.3.7 +``` + + + + + +```Dockerfile +FROM openjdk:8 + +ENV SEATUNNEL_VERSION="2.3.7" +ENV SEATUNNEL_HOME="/opt/seatunnel" + +RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} +RUN mkdir -p $SEATUNNEL_HOME/logs +RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} +``` + +Then run the following commands to build the image: +```bash +docker build -t seatunnel:2.3.7 -f Dockerfile . +``` +Image `seatunnel:2.3.7` needs to be present in the host (minikube) so that the deployment can take place. + +Load image to minikube via: +```bash +minikube image load seatunnel:2.3.7 +``` + + + + + +### Deploying The Operator + + + + +The steps below provide a quick walk-through on setting up the Flink Kubernetes Operator. +You can refer to [Flink Kubernetes Operator - Quick Start](https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-main/docs/try-flink-kubernetes-operator/quick-start/) for more details. + +> Notice: All the Kubernetes resources bellow are created in default namespace. + +Install the certificate manager on your Kubernetes cluster to enable adding the webhook component (only needed once per Kubernetes cluster): + +```bash +kubectl create -f https://github.com/jetstack/cert-manager/releases/download/v1.8.2/cert-manager.yaml +``` +Now you can deploy the latest stable Flink Kubernetes Operator version using the included Helm chart: + +```bash +helm repo add flink-operator-repo https://downloads.apache.org/flink/flink-kubernetes-operator-1.3.1/ + +helm install flink-kubernetes-operator flink-operator-repo/flink-kubernetes-operator \ +--set image.repository=apache/flink-kubernetes-operator +``` + +You may verify your installation via `kubectl`: + +```bash +kubectl get pods +NAME READY STATUS RESTARTS AGE +flink-kubernetes-operator-5f466b8549-mgchb 1/1 Running 3 (23h ago) 16d + +``` + + + + + +none + + + +none + + + +## Run SeaTunnel Application + +**Run Application:**: SeaTunnel already providers out-of-the-box [configurations](https://github.com/apache/seatunnel/tree/dev/config). + + + + +In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.7-release/config/v2.streaming.conf.template): + +```conf +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 2000 +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 160000 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} +``` + +Generate a configmap named seatunnel-config in Kubernetes for the seatunnel.streaming.conf so that we can mount the config content in pod. 
+```bash +kubectl create cm seatunnel-config \ +--from-file=seatunnel.streaming.conf=seatunnel.streaming.conf +``` + +Once the Flink Kubernetes Operator is running as seen in the previous steps you are ready to submit a Flink (SeaTunnel) job: +- Create `seatunnel-flink.yaml` FlinkDeployment manifest: +```yaml +apiVersion: flink.apache.org/v1beta1 +kind: FlinkDeployment +metadata: + name: seatunnel-flink-streaming-example +spec: + image: seatunnel:2.3.7-flink-1.13 + flinkVersion: v1_13 + flinkConfiguration: + taskmanager.numberOfTaskSlots: "2" + serviceAccount: flink + jobManager: + replicas: 1 + resource: + memory: "1024m" + cpu: 1 + taskManager: + resource: + memory: "1024m" + cpu: 1 + podTemplate: + spec: + containers: + - name: flink-main-container + volumeMounts: + - name: seatunnel-config + mountPath: /data/seatunnel.streaming.conf + subPath: seatunnel.streaming.conf + volumes: + - name: seatunnel-config + configMap: + name: seatunnel-config + items: + - key: seatunnel.streaming.conf + path: seatunnel.streaming.conf + job: + jarURI: local:///opt/seatunnel/starter/seatunnel-flink-13-starter.jar + entryClass: org.apache.seatunnel.core.starter.flink.SeaTunnelFlink + args: ["--config", "/data/seatunnel.streaming.conf"] + parallelism: 2 + upgradeMode: stateless +``` + +- Run the example application: +```bash +kubectl apply -f seatunnel-flink.yaml +``` + + + + + +In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.7-release/config/v2.streaming.conf.template): + +```conf +env { + parallelism = 2 + job.mode = "STREAMING" + checkpoint.interval = 2000 +} + +source { + FakeSource { + parallelism = 2 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Console { + } +} +``` + +Generate a configmap named seatunnel-config in Kubernetes for the seatunnel.streaming.conf so that we can mount the config content in pod. +```bash +kubectl create cm seatunnel-config \ +--from-file=seatunnel.streaming.conf=seatunnel.streaming.conf +``` +- Create `seatunnel.yaml`: +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: seatunnel +spec: + containers: + - name: seatunnel + image: seatunnel:2.3.7 + command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel.sh --config /data/seatunnel.streaming.conf -e local"] + resources: + limits: + cpu: "1" + memory: 4G + requests: + cpu: "1" + memory: 2G + volumeMounts: + - name: seatunnel-config + mountPath: /data/seatunnel.streaming.conf + subPath: seatunnel.streaming.conf + volumes: + - name: seatunnel-config + configMap: + name: seatunnel-config + items: + - key: seatunnel.streaming.conf + path: seatunnel.streaming.conf +``` + +- Run the example application: +```bash +kubectl apply -f seatunnel.yaml +``` + + + + + + +In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.7-release/config/v2.streaming.conf.template): + +```conf +env { + parallelism = 2 + job.mode = "STREAMING" + checkpoint.interval = 2000 +} + +source { + FakeSource { + parallelism = 2 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +sink { + Console { + } +} +``` + +Generate a configmap named seatunnel-config in Kubernetes for the seatunnel.streaming.conf so that we can mount the config content in pod. 
+```bash +kubectl create cm seatunnel-config \ +--from-file=seatunnel.streaming.conf=seatunnel.streaming.conf +``` + +Then, we use the following command to load some configuration files used by the seatunnel cluster into the configmap + +Create the yaml file locally as follows + +- Create `hazelcast-client.yaml`: + +```yaml + +hazelcast-client: + cluster-name: seatunnel + properties: + hazelcast.logging.type: log4j2 + network: + cluster-members: + - localhost:5801 + +``` +- Create `hazelcast.yaml`: + +```yaml + +hazelcast: + cluster-name: seatunnel + network: + rest-api: + enabled: true + endpoint-groups: + CLUSTER_WRITE: + enabled: true + DATA: + enabled: true + join: + tcp-ip: + enabled: true + member-list: + - localhost + port: + auto-increment: false + port: 5801 + properties: + hazelcast.invocation.max.retry.count: 20 + hazelcast.tcp.join.port.try.count: 30 + hazelcast.logging.type: log4j2 + hazelcast.operation.generic.thread.count: 50 + +``` +- Create `seatunnel.yaml`: + +```yaml +seatunnel: + engine: + history-job-expire-minutes: 1440 + backup-count: 1 + queue-type: blockingqueue + print-execution-info-interval: 60 + print-job-metrics-info-interval: 60 + slot-service: + dynamic-slot: true + checkpoint: + interval: 10000 + timeout: 60000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + namespace: /tmp/seatunnel/checkpoint_snapshot + storage.type: hdfs + fs.defaultFS: file:///tmp/ # Ensure that the directory has written permission +``` + +Create congfigmaps for the configuration file using the following command + +```bash +kubectl create configmap hazelcast-client --from-file=hazelcast-client.yaml +kubectl create configmap hazelcast --from-file=hazelcast.yaml +kubectl create configmap seatunnelmap --from-file=seatunnel.yaml + +``` + +Deploy Reloader to achieve hot deployment +We use the Reloader here to automatically restart the pod when the configuration file or other modifications are made. 
You can also directly give the value of the configuration file and do not use the Reloader + +- [Reloader](https://github.com/stakater/Reloader/) + +```bash +wget https://raw.githubusercontent.com/stakater/Reloader/master/deployments/kubernetes/reloader.yaml +kubectl apply -f reloader.yaml + +``` + +- Create `seatunnel-cluster.yml`: +```yaml +apiVersion: v1 +kind: Service +metadata: + name: seatunnel +spec: + selector: + app: seatunnel + ports: + - port: 5801 + name: seatunnel + clusterIP: None +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: seatunnel + annotations: + configmap.reloader.stakater.com/reload: "hazelcast,hazelcast-client,seatunnelmap" +spec: + serviceName: "seatunnel" + replicas: 3 # modify replicas according to your case + selector: + matchLabels: + app: seatunnel + template: + metadata: + labels: + app: seatunnel + spec: + containers: + - name: seatunnel + image: seatunnel:2.3.7 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 5801 + name: client + command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel-cluster.sh -DJvmOption=-Xms2G -Xmx2G"] + resources: + limits: + cpu: "1" + memory: 4G + requests: + cpu: "1" + memory: 2G + volumeMounts: + - mountPath: "/opt/seatunnel/config/hazelcast.yaml" + name: hazelcast + subPath: hazelcast.yaml + - mountPath: "/opt/seatunnel/config/hazelcast-client.yaml" + name: hazelcast-client + subPath: hazelcast-client.yaml + - mountPath: "/opt/seatunnel/config/seatunnel.yaml" + name: seatunnelmap + subPath: seatunnel.yaml + - mountPath: /data/seatunnel.streaming.conf + name: seatunnel-config + subPath: seatunnel.streaming.conf + volumes: + - name: hazelcast + configMap: + name: hazelcast + - name: hazelcast-client + configMap: + name: hazelcast-client + - name: seatunnelmap + configMap: + name: seatunnelmap + - name: seatunnel-config + configMap: + name: seatunnel-config + items: + - key: seatunnel.streaming.conf + path: seatunnel.streaming.conf +``` + +- Starting a cluster: +```bash +kubectl apply -f seatunnel-cluster.yml +``` +Then modify the seatunnel configuration in pod using the following command: + +```bash +kubectl edit cm hazelcast +``` +Change the member-list option to your cluster address + +This uses the headless service access mode + +The format for accessing between general pods is [pod-name].[service-name].[namespace].svc.cluster.local + +for example: +```bash +- seatunnel-0.seatunnel.default.svc.cluster.local +- seatunnel-1.seatunnel.default.svc.cluster.local +- seatunnel-2.seatunnel.default.svc.cluster.local +``` +```bash +kubectl edit cm hazelcast-client +``` +Change the cluster-members option to your cluster address + +for example: +```bash +- seatunnel-0.seatunnel.default.svc.cluster.local:5801 +- seatunnel-1.seatunnel.default.svc.cluster.local:5801 +- seatunnel-2.seatunnel.default.svc.cluster.local:5801 +``` +Later, you will see that the pod automatically restarts and updates the seatunnel configuration + +```bash +kubectl edit cm hazelcast-client +``` +After we wait for all pod updates to be completed, we can use the following command to check if the configuration inside the pod has been updated + +```bash +kubectl exec -it seatunnel-0 -- cat /opt/seatunnel/config/hazelcast-client.yaml +``` +Afterwards, we can submit tasks to any pod + +```bash +kubectl exec -it seatunnel-0 -- /opt/seatunnel/bin/seatunnel.sh --config /data/seatunnel.streaming.conf +``` + + + + +**See The Output** + + + + +You may follow the logs of your job, after a successful startup (which can take on the order of a minute in a 
fresh environment, seconds afterwards) you can: + +```bash +kubectl logs -f deploy/seatunnel-flink-streaming-example +``` +looks like the below: + +```shell +... +2023-01-31 12:13:54,349 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from SCHEDULED to DEPLOYING. +2023-01-31 12:13:56,684 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Deploying Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (attempt #0) with attempt id 1665d2d011b2f6cf6525c0e5e75ec251 to seatunnel-flink-streaming-example-taskmanager-1-1 @ 100.103.244.106 (dataPort=39137) with allocation id fbe162650c4126649afcdaff00e46875 +2023-01-31 12:13:57,794 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from DEPLOYING to INITIALIZING. +2023-01-31 12:13:58,203 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from INITIALIZING to RUNNING. +``` + +If OOM error accur in the log, you can decrease the `row.num` value in seatunnel.streaming.conf + +To expose the Flink Dashboard you may add a port-forward rule: +```bash +kubectl port-forward svc/seatunnel-flink-streaming-example-rest 8081 +``` +Now the Flink Dashboard is accessible at [localhost:8081](http://localhost:8081). + +Or launch `minikube dashboard` for a web-based Kubernetes user interface. + +The content printed in the TaskManager Stdout log: +```bash +kubectl logs \ +-l 'app in (seatunnel-flink-streaming-example), component in (taskmanager)' \ +--tail=-1 \ +-f +``` +looks like the below (your content may be different since we use `FakeSource` to automatically generate random stream data): + +```shell +... +subtaskIndex=0: row=159991 : VVgpp, 978840000 +subtaskIndex=0: row=159992 : JxrOC, 1493825495 +subtaskIndex=0: row=159993 : YmCZR, 654146216 +subtaskIndex=0: row=159994 : LdmUn, 643140261 +subtaskIndex=0: row=159995 : tURkE, 837012821 +subtaskIndex=0: row=159996 : uPDfd, 2021489045 +subtaskIndex=0: row=159997 : mjrdG, 2074957853 +subtaskIndex=0: row=159998 : xbeUi, 864518418 +subtaskIndex=0: row=159999 : sSWLb, 1924451911 +subtaskIndex=0: row=160000 : AuPlM, 1255017876 +``` + +To stop your job and delete your FlinkDeployment you can simply: + +```bash +kubectl delete -f seatunnel-flink.yaml +``` + + + + +You may follow the logs of your job, after a successful startup (which can take on the order of a minute in a fresh environment, seconds afterwards) you can: + +```bash +kubectl logs -f seatunnel +``` + +looks like the below (your content may be different since we use `FakeSource` to automatically generate random stream data): + +```shell +... 
+2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25673: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : hRJdE, 1295862507 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25674: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : kXlew, 935460726 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25675: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : FrNOT, 1714358118 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25676: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : kSajX, 126709414 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25677: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : YhpQv, 2020198351 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25678: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : nApin, 691339553 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25679: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : KZNNa, 1720773736 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25680: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : uCUBI, 490868386 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25681: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : oTLmO, 98770781 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25682: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : UECud, 835494636 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25683: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : XNegY, 1602828896 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25684: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : LcFBx, 1400869177 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25685: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : EqSfF, 1933614060 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25686: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : BODIs, 1839533801 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25687: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : doxcI, 970104616 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25688: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : IEVYn, 371893767 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25689: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : YXYfq, 1719257882 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - 
subtaskIndex=0 rowIndex=25690: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : LFWEm, 725033360 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25691: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : ypUrY, 1591744616 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25692: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : rlnzJ, 412162913 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25693: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : zWKnt, 976816261 +2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25694: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : PXrsk, 43554541 + +``` + +To stop your job and delete your FlinkDeployment you can simply: + +```bash +kubectl delete -f seatunnel.yaml +``` + + + + +You may follow the logs of your job, after a successful startup (which can take on the order of a minute in a fresh environment, seconds afterwards) you can: + +```bash +kubectl exec -it seatunnel-1 -- tail -f /opt/seatunnel/logs/seatunnel-engine-server.log | grep ConsoleSinkWriter +``` + +looks like the below (your content may be different since we use `FakeSource` to automatically generate random stream data): + +```shell +... +2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=7: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : IibHk, 820962465 +2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=8: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : lmKdb, 1072498088 +2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=9: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : iqGva, 918730371 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=10: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : JMHmq, 1130771733 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=11: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : rxoHF, 189596686 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=12: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : OSblw, 559472064 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=13: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : yTZjG, 1842482272 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=14: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : RRiMg, 1713777214 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=15: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : lRcsd, 1626041649 +2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=16: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : QrNNW, 41355294 + +``` + +To stop your job and delete your FlinkDeployment you can simply: + +```bash 
+kubectl delete -f seatunnel-cluster.yaml
+```
+
+
+
+
+Happy SeaTunneling!
+
+## What's More
+
+For now, you have taken a quick look at SeaTunnel, and you can see [connector](/category/connector) to find all the sources and sinks SeaTunnel supports.
+Or see [deployment](../deployment.mdx) if you want to submit your application to another kind of engine cluster.
diff --git a/versioned_docs/version-2.3.7/start-v2/locally/deployment.md b/versioned_docs/version-2.3.7/start-v2/locally/deployment.md
new file mode 100644
index 000000000000..0d5f0e26d110
--- /dev/null
+++ b/versioned_docs/version-2.3.7/start-v2/locally/deployment.md
@@ -0,0 +1,74 @@
+---
+
+sidebar_position: 2
+-------------------
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Download and Make Installation Packages
+
+## Step 1: Preparation
+
+Before starting to download SeaTunnel, you need to ensure that you have installed the following software required by SeaTunnel:
+
+* Install [Java](https://www.java.com/en/download/) (Java 8 or 11; other versions newer than Java 8 should theoretically work as well) and set `JAVA_HOME`.
+
+## Step 2: Download SeaTunnel
+
+Visit the [SeaTunnel Download Page](https://seatunnel.apache.org/download) to download the latest binary package `seatunnel-<version>-bin.tar.gz`.
+
+Or you can also download it through the terminal:
+
+```shell
+export version="2.3.7"
+wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz"
+tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
+```
+
+## Step 3: Download The Connector Plugins
+
+Starting from the 2.2.0-beta version, the binary package no longer provides the connector dependencies by default. Therefore, when using it for the first time, you need to execute the following command to install the connectors (of course, you can also manually download a connector from the [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) and then move it to the `connectors/seatunnel` directory):
+
+```bash
+sh bin/install-plugin.sh
+```
+
+If you need a specific connector version, taking 2.3.7 as an example, you need to execute the following command:
+
+```bash
+sh bin/install-plugin.sh 2.3.7
+```
+
+Usually you don't need all the connector plugins, so you can specify the plugins you need by configuring `config/plugin_config`. For example, if you only need the `connector-console` plugin, you can modify the `plugin_config` configuration file as follows:
+
+```plugin_config
+--seatunnel-connectors--
+connector-console
+--end--
+```
+
+If you want the example application to work properly, you need to add the following plugins:
+
+```plugin_config
+--seatunnel-connectors--
+connector-fake
+connector-console
+--end--
+```
+
+You can find all supported connectors and the corresponding plugin_config configuration names under `${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`.
+
+:::tip Tip
+
+If you want to install connector plugins by manually downloading connectors, you only need to download the related connector plugins and place them in the `${SEATUNNEL_HOME}/connectors/` directory.
+
+:::
+
+Now you have downloaded the SeaTunnel binary package and the connector plugins. Next, you can choose the engine you want to use to run synchronization tasks.
+
+If you use Flink to run the synchronization task, there is no need to deploy the SeaTunnel Engine service cluster. You can refer to [Quick Start of SeaTunnel Flink Engine](quick-start-flink.md) to run your synchronization task.
+ +If you use Spark to run the synchronization task, there is no need to deploy the SeaTunnel Engine service cluster. You can refer to [Quick Start of SeaTunnel Spark Engine](quick-start-spark.md) to run your synchronization task. + +If you use the builtin SeaTunnel Engine (Zeta) to run tasks, you need to deploy the SeaTunnel Engine service first. Refer to [Deployment of SeaTunnel Engine (Zeta) Service](quick-start-seatunnel-engine.md). diff --git a/versioned_docs/version-2.3.7/start-v2/locally/quick-start-flink.md b/versioned_docs/version-2.3.7/start-v2/locally/quick-start-flink.md new file mode 100644 index 000000000000..fcb5ab409308 --- /dev/null +++ b/versioned_docs/version-2.3.7/start-v2/locally/quick-start-flink.md @@ -0,0 +1,112 @@ +--- + +sidebar_position: 3 +------------------- + +# Quick Start With Flink + +## Step 1: Deploy SeaTunnel And Connectors + +Before starting, make sure you have downloaded and deployed SeaTunnel as described in [Deployment](deployment.md) + +## Step 2: Deployment And Config Flink + +Please [Download Flink](https://flink.apache.org/downloads.html) first(**required version >= 1.12.0**). For more information you can see [Getting Started: Standalone](https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/deployment/resource-providers/standalone/overview/) + +**Configure SeaTunnel**: Change the setting in `${SEATUNNEL_HOME}/config/seatunnel-env.sh` and set `FLINK_HOME` to the Flink deployment dir. + +## Step 3: Add Job Config File To Define A Job + +Edit `config/v2.streaming.conf.template`, which determines the way and logic of data input, processing, and output after seatunnel is started. +The following is an example of the configuration file, which is the same as the example application mentioned above. + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} + +``` + +More information about config please check [Config Concept](../../concept/config.md) + +## Step 4: Run SeaTunnel Application + +You can start the application by the following commands: + +Flink version between `1.12.x` and `1.14.x` + +```shell +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-flink-13-connector-v2.sh --config ./config/v2.streaming.conf.template +``` + +Flink version between `1.15.x` and `1.16.x` + +```shell +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-flink-15-connector-v2.sh --config ./config/v2.streaming.conf.template +``` + +**See The Output**: When you run the command, you can see its output in your console. This +is a sign to determine whether the command ran successfully or not. 
+ +The SeaTunnel console will print some logs as below: + +```shell +fields : name, age +types : STRING, INT +row=1 : elWaB, 1984352560 +row=2 : uAtnp, 762961563 +row=3 : TQEIB, 2042675010 +row=4 : DcFjo, 593971283 +row=5 : SenEb, 2099913608 +row=6 : DHjkg, 1928005856 +row=7 : eScCM, 526029657 +row=8 : sgOeE, 600878991 +row=9 : gwdvw, 1951126920 +row=10 : nSiKE, 488708928 +row=11 : xubpl, 1420202810 +row=12 : rHZqb, 331185742 +row=13 : rciGD, 1112878259 +row=14 : qLhdI, 1457046294 +row=15 : ZTkRx, 1240668386 +row=16 : SGZCr, 94186144 +``` + +## What's More + +For now, you have taken a quick look about SeaTunnel with Flink, and you can see [Connector](/docs/category/connector-v2) to find all +sources and sinks SeaTunnel supported. Or see [SeaTunnel With Flink](../../other-engine/flink.md) if you want to know more about SeaTunnel With Flink. + +SeaTunnel have a builtin engine named `Zeta`, and it's the default engine of SeaTunnel. You can follow [Quick Start](quick-start-seatunnel-engine.md) to configure and run a data synchronization job. diff --git a/versioned_docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md b/versioned_docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md new file mode 100644 index 000000000000..10814f0050fc --- /dev/null +++ b/versioned_docs/version-2.3.7/start-v2/locally/quick-start-seatunnel-engine.md @@ -0,0 +1,101 @@ +--- + +sidebar_position: 2 +------------------- + +# Quick Start With SeaTunnel Engine + +## Step 1: Deploy SeaTunnel And Connectors + +Before starting, make sure you have downloaded and deployed SeaTunnel as described in [Deployment](deployment.md) + +## Step 2: Add Job Config File To Define A Job + +Edit `config/v2.batch.config.template`, which determines the way and logic of data input, processing, and output after seatunnel is started. +The following is an example of the configuration file, which is the same as the example application mentioned above. + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} + +``` + +More information can be found in [Config Concept](../../concept/config.md) + +## Step 3: Run SeaTunnel Application + +You could start the application by the following commands: + +:::tip + +Starting from version 2.3.1, the parameter -e in seatunnel.sh is deprecated, use -m instead. + +::: + +```shell +cd "apache-seatunnel-${version}" +./bin/seatunnel.sh --config ./config/v2.batch.config.template -m local + +``` + +**See The Output**: When you run the command, you can see its output in your console. This +is a sign to determine whether the command ran successfully or not. 
+ +The SeaTunnel console will print some logs as below: + +```shell +2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age +2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 +2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=11: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: GSvzm, 827085798 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=12: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: NNAYI, 94307133 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=13: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: EexFl, 1823689599 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=14: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CBXUb, 869582787 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=15: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: Wbxtm, 1469371353 +2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=16: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: mIJDt, 995616438 +``` + +## What's More + +For now, you have taken a quick look about SeaTunnel, and you can see [connector](../../connector-v2/source/FakeSource.md) to find all 
+sources and sinks SeaTunnel supported. Or see [SeaTunnel Engine(Zeta)](../../seatunnel-engine/about.md) if you want to know more about SeaTunnel Engine. Here you will learn how to deploy SeaTunnel Engine and how to use it in cluster mode. diff --git a/versioned_docs/version-2.3.7/start-v2/locally/quick-start-spark.md b/versioned_docs/version-2.3.7/start-v2/locally/quick-start-spark.md new file mode 100644 index 000000000000..160da9498cbf --- /dev/null +++ b/versioned_docs/version-2.3.7/start-v2/locally/quick-start-spark.md @@ -0,0 +1,119 @@ +--- + +sidebar_position: 4 +------------------- + +# Quick Start With Spark + +## Step 1: Deployment SeaTunnel And Connectors + +Before starting, make sure you have downloaded and deployed SeaTunnel as described in [Deployment](deployment.md) + +## Step 2: Deploy And Config Spark + +Please [Download Spark](https://spark.apache.org/downloads.html) first(**required version >= 2.4.0**). For more information you can +see [Getting Started: Standalone](https://spark.apache.org/docs/latest/spark-standalone.html#installing-spark-standalone-to-a-cluster) + +**Configure SeaTunnel**: Change the setting in `${SEATUNNEL_HOME}/config/seatunnel-env.sh` and set `SPARK_HOME` to the Spark deployment dir. + +## Step 3: Add Job Config File To Define A Job + +Edit `config/seatunnel.streaming.conf.template`, which determines the way and logic of data input, processing, and output after seatunnel is started. +The following is an example of the configuration file, which is the same as the example application mentioned above. + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} + +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + age = age + name = new_name + } + } +} + +sink { + Console { + source_table_name = "fake1" + } +} + +``` + +More information about config please check [Config Concept](../../concept/config.md) + +## Step 4: Run SeaTunnel Application + +You could start the application by the following commands: + +Spark 2.4.x + +```bash +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-spark-2-connector-v2.sh \ +--master local[4] \ +--deploy-mode client \ +--config ./config/v2.streaming.conf.template +``` + +Spark3.x.x + +```shell +cd "apache-seatunnel-${version}" +./bin/start-seatunnel-spark-3-connector-v2.sh \ +--master local[4] \ +--deploy-mode client \ +--config ./config/v2.streaming.conf.template +``` + +**See The Output**: When you run the command, you can see its output in your console. This +is a sign to determine whether the command ran successfully or not. + +The SeaTunnel console will print some logs as below: + +```shell +fields : name, age +types : STRING, INT +row=1 : elWaB, 1984352560 +row=2 : uAtnp, 762961563 +row=3 : TQEIB, 2042675010 +row=4 : DcFjo, 593971283 +row=5 : SenEb, 2099913608 +row=6 : DHjkg, 1928005856 +row=7 : eScCM, 526029657 +row=8 : sgOeE, 600878991 +row=9 : gwdvw, 1951126920 +row=10 : nSiKE, 488708928 +row=11 : xubpl, 1420202810 +row=12 : rHZqb, 331185742 +row=13 : rciGD, 1112878259 +row=14 : qLhdI, 1457046294 +row=15 : ZTkRx, 1240668386 +row=16 : SGZCr, 94186144 +``` + +## What's More + +For now, you have taken a quick look about SeaTunnel with Spark, and you can see [Connector](/docs/category/connector-v2) to find all +sources and sinks SeaTunnel supported. 
Or see [SeaTunnel With Spark](../../other-engine/spark.md) if you want to know more about SeaTunnel With Spark.
+
+SeaTunnel has a built-in engine named `Zeta`, which is the default engine of SeaTunnel. You can follow [Quick Start](quick-start-seatunnel-engine.md) to configure and run a data synchronization job.
diff --git a/versioned_docs/version-2.3.7/transform-v2/common-options.md b/versioned_docs/version-2.3.7/transform-v2/common-options.md
new file mode 100644
index 000000000000..7c13bac4f001
--- /dev/null
+++ b/versioned_docs/version-2.3.7/transform-v2/common-options.md
@@ -0,0 +1,65 @@
+# Transform Common Options
+
+> This is the intermediate conversion step between the source and the sink. You can use SQL statements to smoothly complete the conversion process.
+
+| Name | Type | Required | Default | Description |
+|-------------------|--------|----------|---------|-------------|
+| source_table_name | String | No | - | When `source_table_name` is not specified, the current plugin processes the data set `(dataset)` output by the previous plugin in the configuration file; When `source_table_name` is specified, the current plugin is processing the data set corresponding to this parameter. |
+| result_table_name | String | No | - | When `result_table_name` is not specified, the data processed by this plugin will not be registered as a data set that can be directly accessed by other plugins, or called a temporary table `(table)`;
    When `result_table_name` is specified, the data processed by this plugin will be registered as a data set `(dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` . The dataset registered here can be directly accessed by other plugins by specifying `source_table_name` . | + +## Task Example + +### Simple: + +> This is the process of converting the data source to fake and write it to two different sinks, Detailed reference `transform` + +```bash +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + age = "int" + c_timestamp = "timestamp" + c_date = "date" + c_map = "map" + c_array = "array" + c_decimal = "decimal(30, 8)" + c_row = { + c_row = { + c_int = int + } + } + } + } + } +} + +transform { + Sql { + source_table_name = "fake" + result_table_name = "fake1" + # the query table name must same as field 'source_table_name' + query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from fake" + } + # The SQL transform support base function and criteria operation + # But the complex SQL unsupported yet, include: multi source table/rows JOIN and AGGREGATE operation and the like +} + +sink { + Console { + source_table_name = "fake1" + } + Console { + source_table_name = "fake" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/transform-v2/copy.md b/versioned_docs/version-2.3.7/transform-v2/copy.md new file mode 100644 index 000000000000..7a0e73f44beb --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/copy.md @@ -0,0 +1,65 @@ +# Copy + +> Copy transform plugin + +## Description + +Copy a field to a new field. 
+ +## Options + +| name | type | required | default value | +|--------|--------|----------|---------------| +| fields | Object | yes | | + +### fields [config] + +Specify the field copy relationship between input and output + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Example + +The data read from source is a table like this: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +We want copy fields `name`、`age` to a new fields `name1`、`name2`、`age1`, we can add `Copy` Transform like this + +``` +transform { + Copy { + source_table_name = "fake" + result_table_name = "fake1" + fields { + name1 = name + name2 = name + age1 = age + } + } +} +``` + +Then the data in result table `fake1` will like this + +| name | age | card | name1 | name2 | age1 | +|----------|-----|------|----------|----------|------| +| Joy Ding | 20 | 123 | Joy Ding | Joy Ding | 20 | +| May Ding | 20 | 123 | May Ding | May Ding | 20 | +| Kin Dom | 20 | 123 | Kin Dom | Kin Dom | 20 | +| Joy Dom | 20 | 123 | Joy Dom | Joy Dom | 20 | + +## Changelog + +### new version + +- Add Copy Transform Connector +- Support copy fields to a new fields + diff --git a/versioned_docs/version-2.3.7/transform-v2/dynamic-compile.md b/versioned_docs/version-2.3.7/transform-v2/dynamic-compile.md new file mode 100644 index 000000000000..17e3b0047ee8 --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/dynamic-compile.md @@ -0,0 +1,171 @@ +# DynamicCompile + +> DynamicCompile transform plugin + +## Description + +:::tip + +important clause +You need to ensure the security of your service and prevent attackers from uploading destructive code + +::: + +Provide a programmable way to process rows, allowing users to customize any business behavior, even RPC requests based on existing row fields as parameters, or to expand fields by retrieving associated data from other data sources. To distinguish businesses, you can also define multiple transforms to combine, +If the conversion is too complex, it may affect performance + +## Options + +| name | type | required | default value | +|------------------|--------|----------|---------------| +| source_code | string | no | | +| compile_language | Enum | yes | | +| compile_pattern | Enum | no | SOURCE_CODE | +| absolute_path | string | no | | + +### source_code [string] + +The code must implement two methods: getInlineOutputColumns and getInlineOutputFieldValues. getInlineOutputColumns determines the columns you want to add or convert, and the original column structure can be obtained from CatalogTable +GetInlineOutputFieldValues determines your column values. You can fulfill any of your requirements, and even complete RPC requests to obtain new values based on the original columns +If there are third-party dependency packages, please place them in ${SEATUNNEL_HOME}/lib, if you use spark or flink, you need to put it under the libs of the corresponding service. 
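+
+As a quick illustration of this contract, a minimal Groovy class passed through `source_code` might look like the sketch below (the column name `new_field` and the value it returns are just placeholders); the complete Groovy and Java job examples are shown in the Example section further down.
+
+```groovy
+import org.apache.seatunnel.api.table.catalog.CatalogTable
+import org.apache.seatunnel.api.table.catalog.Column
+import org.apache.seatunnel.api.table.catalog.PhysicalColumn
+import org.apache.seatunnel.api.table.type.BasicType
+import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor
+
+class demo {
+    // Declare the columns to add or convert; the original columns can be read from inputCatalogTable.
+    Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) {
+        PhysicalColumn destColumn =
+                PhysicalColumn.of("new_field", BasicType.STRING_TYPE, 10, true, "", "")
+        return [destColumn] as Column[]
+    }
+
+    // Return one value per declared column, computed for the current row.
+    Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) {
+        return ["placeholder value"] as Object[]
+    }
+}
+```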
+ +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +### compile_language [Enum] + +Some syntax in Java may not be supported, please refer https://github.com/janino-compiler/janino +GROOVY,JAVA + +### compile_pattern [Enum] + +SOURCE_CODE,ABSOLUTE_PATH +If it is a SOURCE-CODE enumeration; the SOURCE-CODE attribute is required, and the ABSOLUTE_PATH enumeration;ABSOLUTE_PATH attribute is required + +### absolute_path [string] + +The absolute path of Java or Groovy files on the server + +## Example + +The data read from source is a table like this: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +``` +transform { + DynamicCompile { + source_table_name = "fake" + result_table_name = "groovy_out" + compile_language="GROOVY" + compile_pattern="SOURCE_CODE" + source_code=""" + import org.apache.seatunnel.api.table.catalog.Column + import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor + import org.apache.seatunnel.api.table.catalog.CatalogTable + import org.apache.seatunnel.api.table.catalog.PhysicalColumn; + import org.apache.seatunnel.api.table.type.*; + import java.util.ArrayList; + class demo { + public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { + List columns = new ArrayList<>(); + PhysicalColumn destColumn = + PhysicalColumn.of( + "compile_language", + BasicType.STRING_TYPE, + 10, + true, + "", + ""); + columns.add(destColumn); + return columns.toArray(new Column[0]); + } + public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { + Object[] fieldValues = new Object[1]; + fieldValues[0]="GROOVY" + return fieldValues; + } + };""" + + } +} + +transform { + DynamicCompile { + source_table_name = "fake" + result_table_name = "java_out" + compile_language="JAVA" + compile_pattern="SOURCE_CODE" + source_code=""" + import org.apache.seatunnel.api.table.catalog.Column; + import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor; + import org.apache.seatunnel.api.table.catalog.*; + import org.apache.seatunnel.api.table.type.*; + import java.util.ArrayList; + public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { + + ArrayList columns = new ArrayList(); + PhysicalColumn destColumn = + PhysicalColumn.of( + "compile_language", + BasicType.STRING_TYPE, + 10, + true, + "", + ""); + return new Column[]{ + destColumn + }; + + } + public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { + Object[] fieldValues = new Object[1]; + fieldValues[0]="JAVA"; + return fieldValues; + } + """ + + } + } + + transform { + DynamicCompile { + source_table_name = "fake" + result_table_name = "groovy_out" + compile_language="GROOVY" + compile_pattern="ABSOLUTE_PATH" + absolute_path="""/tmp/GroovyFile""" + + } +} +``` + +Then the data in result table `groovy_out` will like this + +| name | age | card | compile_language | +|----------|-----|------|------------------| +| Joy Ding | 20 | 123 | GROOVY | +| May Ding | 20 | 123 | GROOVY | +| Kin Dom | 20 | 123 | GROOVY | +| Joy Dom | 20 | 123 | GROOVY | + +Then the data in result table `java_out` will like this + +| name | age | card | compile_language | +|----------|-----|------|------------------| +| Joy Ding | 20 | 123 | JAVA | +| May Ding | 20 | 123 | JAVA | +| Kin Dom | 20 | 123 | JAVA | +| Joy Dom | 20 | 123 | JAVA | + +More complex examples can be referred to 
+https://github.com/apache/seatunnel/tree/dev/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/dynamic_compile/conf + +## Changelog + diff --git a/versioned_docs/version-2.3.7/transform-v2/field-mapper.md b/versioned_docs/version-2.3.7/transform-v2/field-mapper.md new file mode 100644 index 000000000000..e0bd32e14929 --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/field-mapper.md @@ -0,0 +1,64 @@ +# FieldMapper + +> FieldMapper transform plugin + +## Description + +Add input schema and output schema mapping. + +## Options + +| name | type | required | default value | +|--------------|--------|----------|---------------| +| field_mapper | Object | yes | | + +### field_mapper [config] + +Specify the field mapping relationship between input and output + +### common options [config] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details. + +## Example + +The data read from source is a table like this: + +| id | name | age | card | +|----|----------|-----|------| +| 1 | Joy Ding | 20 | 123 | +| 2 | May Ding | 20 | 123 | +| 3 | Kin Dom | 20 | 123 | +| 4 | Joy Dom | 20 | 123 | + +We want to delete `age` field and update the filed order to `id`, `card`, `name` and rename `name` to `new_name`. We can add `FieldMapper` transform like this + +``` +transform { + FieldMapper { + source_table_name = "fake" + result_table_name = "fake1" + field_mapper = { + id = id + card = card + name = new_name + } + } +} +``` + +Then the data in result table `fake1` will like this + +| id | card | new_name | +|----|------|----------| +| 1 | 123 | Joy Ding | +| 2 | 123 | May Ding | +| 3 | 123 | Kin Dom | +| 4 | 123 | Joy Dom | + +## Changelog + +### new version + +- Add Copy Transform Connector + diff --git a/versioned_docs/version-2.3.7/transform-v2/filter-rowkind.md b/versioned_docs/version-2.3.7/transform-v2/filter-rowkind.md new file mode 100644 index 000000000000..e6ef5ba98cdc --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/filter-rowkind.md @@ -0,0 +1,68 @@ +# FilterRowKind + +> FilterRowKind transform plugin + +## Description + +Filter the data by RowKind + +## Options + +| name | type | required | default value | +|---------------|-------|----------|---------------| +| include_kinds | array | yes | | +| exclude_kinds | array | yes | | + +### include_kinds [array] + +The row kinds to include + +### exclude_kinds [array] + +The row kinds to exclude. + +You can only config one of `include_kinds` and `exclude_kinds`. + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Examples + +The RowKink of the data generate by FakeSource is `INSERT`, If we use `FilterRowKink` transform and exclude the `INSERT` data, we will write zero rows into sink. 
+ +```yaml + +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + age = "int" + } + } + } +} + +transform { + FilterRowKind { + source_table_name = "fake" + result_table_name = "fake1" + exclude_kinds = ["INSERT"] + } +} + +sink { + Console { + source_table_name = "fake1" + } +} +``` + diff --git a/versioned_docs/version-2.3.7/transform-v2/filter.md b/versioned_docs/version-2.3.7/transform-v2/filter.md new file mode 100644 index 000000000000..f9f28b8398a2 --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/filter.md @@ -0,0 +1,81 @@ +# Filter + +> Filter transform plugin + +## Description + +Filter the field. + +## Options + +| name | type | required | default value | +|----------------|-------|----------|---------------| +| include_fields | array | no | | +| exclude_fields | array | no | | + +Notice, you must set one and only one of `include_fields` and `exclude_fields` properties + +### include_fields [array] + +The list of fields that need to be kept. Fields not in the list will be deleted. + +### exclude_fields [array] + +The list of fields that need to be deleted. Fields not in the list will be kept. + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Example + +The data read from source is a table like this: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +we want to keep the field named `name`, `card`, we can add a `Filter` Transform like below: + +``` +transform { + Filter { + source_table_name = "fake" + result_table_name = "fake1" + include_fields = [name, card] + } +} +``` + +Or we can delete the field named `age` by adding a `Filter` Transform with `exclude_fields` field set like below: + +``` +transform { + Filter { + source_table_name = "fake" + result_table_name = "fake1" + exclude_fields = [age] + } +} +``` + +It is useful when you want to delete a small number of fields from a large table with tons of fields. 
+ +Then the data in result table `fake1` will like this + +| name | card | +|----------|------| +| Joy Ding | 123 | +| May Ding | 123 | +| Kin Dom | 123 | +| Joy Dom | 123 | + +## Changelog + +### new version + +- Add Filter Transform Connector + diff --git a/versioned_docs/version-2.3.7/transform-v2/jsonpath.md b/versioned_docs/version-2.3.7/transform-v2/jsonpath.md new file mode 100644 index 000000000000..3baf5853b700 --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/jsonpath.md @@ -0,0 +1,190 @@ +# JsonPath + +> JsonPath transform plugin + +## Description + +> Support use jsonpath select data + +## Options + +| name | type | required | default value | +|---------|-------|----------|---------------| +| Columns | Array | Yes | | + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +### fields[array] + +#### option + +| name | type | required | default value | +|------------|--------|----------|---------------| +| src_field | String | Yes | | +| dest_field | String | Yes | | +| path | String | Yes | | +| dest_type | String | No | String | + +#### src_field + +> the json source field you want to parse + +Support SeatunnelDateType + +* STRING +* BYTES +* ARRAY +* MAP +* ROW + +#### dest_field + +> after use jsonpath output field + +#### dest_type + +> the type of dest field + +#### path + +> Jsonpath + +## Read Json Example + +The data read from source is a table like this json: + +```json +{ + "data": { + "c_string": "this is a string", + "c_boolean": true, + "c_integer": 42, + "c_float": 3.14, + "c_double": 3.14, + "c_decimal": 10.55, + "c_date": "2023-10-29", + "c_datetime": "16:12:43.459", + "c_array":["item1", "item2", "item3"] + } +} +``` + +Assuming we want to use JsonPath to extract properties. 
+ +```json +transform { + JsonPath { + source_table_name = "fake" + result_table_name = "fake1" + columns = [ + { + "src_field" = "data" + "path" = "$.data.c_string" + "dest_field" = "c1_string" + }, + { + "src_field" = "data" + "path" = "$.data.c_boolean" + "dest_field" = "c1_boolean" + "dest_type" = "boolean" + }, + { + "src_field" = "data" + "path" = "$.data.c_integer" + "dest_field" = "c1_integer" + "dest_type" = "int" + }, + { + "src_field" = "data" + "path" = "$.data.c_float" + "dest_field" = "c1_float" + "dest_type" = "float" + }, + { + "src_field" = "data" + "path" = "$.data.c_double" + "dest_field" = "c1_double" + "dest_type" = "double" + }, + { + "src_field" = "data" + "path" = "$.data.c_decimal" + "dest_field" = "c1_decimal" + "dest_type" = "decimal(4,2)" + }, + { + "src_field" = "data" + "path" = "$.data.c_date" + "dest_field" = "c1_date" + "dest_type" = "date" + }, + { + "src_field" = "data" + "path" = "$.data.c_datetime" + "dest_field" = "c1_datetime" + "dest_type" = "time" + }, + { + "src_field" = "data" + "path" = "$.data.c_array" + "dest_field" = "c1_array" + "dest_type" = "array" + } + ] + } +} +``` + +Then the data result table `fake1` will like this + +| data | c1_string | c1_boolean | c1_integer | c1_float | c1_double | c1_decimal | c1_date | c1_datetime | c1_array | +|------------------------------|------------------|------------|------------|----------|-----------|------------|------------|--------------|-----------------------------| +| too much content not to show | this is a string | true | 42 | 3.14 | 3.14 | 10.55 | 2023-10-29 | 16:12:43.459 | ["item1", "item2", "item3"] | + +## Read SeatunnelRow Example + +Suppose a column in a row of data is of type SeatunnelRow and that the name of the column is col + + + + + +
<table>
+    <tr>
+        <th colspan="2">SeatunnelRow(col)</th>
+        <th>other</th>
+    </tr>
+    <tr>
+        <th>name</th>
+        <th>age</th>
+        <td>...</td>
+    </tr>
+    <tr>
+        <td>a</td>
+        <td>18</td>
+        <td>...</td>
+    </tr>
+</table>
    + +The JsonPath transform converts the values of seatunnel into an array, + +```json +transform { + JsonPath { + source_table_name = "fake" + result_table_name = "fake1" + columns = [ + { + "src_field" = "col" + "path" = "$[0]" + "dest_field" = "name" + "dest_type" = "string" + }, + { + "src_field" = "col" + "path" = "$[1]" + "dest_field" = "age" + "dest_type" = "int" + } + ] + } +} +``` + +Then the data result table `fake1` will like this + +| name | age | col | other | +|------|-----|----------|-------| +| a | 18 | ["a",18] | ... | + +## Changelog + +* Add JsonPath Transform + diff --git a/versioned_docs/version-2.3.7/transform-v2/llm.md b/versioned_docs/version-2.3.7/transform-v2/llm.md new file mode 100644 index 000000000000..d03b8226f06f --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/llm.md @@ -0,0 +1,122 @@ +# LLM + +> LLM transform plugin + +## Description + +Leverage the power of a large language model (LLM) to process data by sending it to the LLM and receiving the +generated results. Utilize the LLM's capabilities to label, clean, enrich data, perform data inference, and +more. + +## Options + +| name | type | required | default value | +|------------------|--------|----------|--------------------------------------------| +| model_provider | enum | yes | | +| output_data_type | enum | no | String | +| prompt | string | yes | | +| model | string | yes | | +| api_key | string | yes | | +| openai.api_path | string | no | https://api.openai.com/v1/chat/completions | + +### model_provider + +The model provider to use. The available options are: +OPENAI + +### output_data_type + +The data type of the output data. The available options are: +STRING,INT,BIGINT,DOUBLE,BOOLEAN. +Default value is STRING. + +### prompt + +The prompt to send to the LLM. This parameter defines how LLM will process and return data, eg: + +The data read from source is a table like this: + +| name | age | +|---------------|-----| +| Jia Fan | 20 | +| Hailin Wang | 20 | +| Eric | 20 | +| Guangdong Liu | 20 | + +The prompt can be: + +``` +Determine whether someone is Chinese or American by their name +``` + +The result will be: + +| name | age | llm_output | +|---------------|-----|------------| +| Jia Fan | 20 | Chinese | +| Hailin Wang | 20 | Chinese | +| Eric | 20 | American | +| Guangdong Liu | 20 | Chinese | + +### model + +The model to use. Different model providers have different models. For example, the OpenAI model can be `gpt-4o-mini`. +If you use OpenAI model, please refer https://platform.openai.com/docs/models/model-endpoint-compatibility of `/v1/chat/completions` endpoint. + +### api_key + +The API key to use for the model provider. +If you use OpenAI model, please refer https://platform.openai.com/docs/api-reference/api-keys of how to get the API key. + +### openai.api_path + +The API path to use for the OpenAI model provider. In most cases, you do not need to change this configuration. If you are using an API agent's service, you may need to configure it to the agent's API address. + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Example + +Determine the user's country through a LLM. 
+ +```hocon +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + row.num = 5 + schema = { + fields { + id = "int" + name = "string" + } + } + rows = [ + {fields = [1, "Jia Fan"], kind = INSERT} + {fields = [2, "Hailin Wang"], kind = INSERT} + {fields = [3, "Tomas"], kind = INSERT} + {fields = [4, "Eric"], kind = INSERT} + {fields = [5, "Guangdong Liu"], kind = INSERT} + ] + } +} + +transform { + LLM { + model_provider = OPENAI + model = gpt-4o-mini + api_key = sk-xxx + prompt = "Determine whether someone is Chinese or American by their name" + } +} + +sink { + console { + } +} +``` + diff --git a/versioned_docs/version-2.3.7/transform-v2/replace.md b/versioned_docs/version-2.3.7/transform-v2/replace.md new file mode 100644 index 000000000000..1cc99c0ace7b --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/replace.md @@ -0,0 +1,121 @@ +# Replace + +> Replace transform plugin + +## Description + +Examines string value in a given field and replaces substring of the string value that matches the given string literal or regexes with the given replacement. + +## Options + +| name | type | required | default value | +|---------------|---------|----------|---------------| +| replace_field | string | yes | | +| pattern | string | yes | - | +| replacement | string | yes | - | +| is_regex | boolean | no | false | +| replace_first | boolean | no | false | + +### replace_field [string] + +The field you want to replace + +### pattern [string] + +The old string that will be replaced + +### replacement [string] + +The new string for replace + +### is_regex [boolean] + +Use regex for string match + +### replace_first [boolean] + +Whether replace the first match string. Only used when `is_regex = true`. + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Example + +The data read from source is a table like this: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +We want to replace the char ` ` to `_` at the `name` field. Then we can add a `Replace` Transform like this: + +``` +transform { + Replace { + source_table_name = "fake" + result_table_name = "fake1" + replace_field = "name" + pattern = " " + replacement = "_" + is_regex = true + } +} +``` + +Then the data in result table `fake1` will update to + +| name | age | card | +|----------|-----|------| +| Joy_Ding | 20 | 123 | +| May_Ding | 20 | 123 | +| Kin_Dom | 20 | 123 | +| Joy_Dom | 20 | 123 | + +## Job Config Example + +``` +env { + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + row.num = 100 + schema = { + fields { + id = "int" + name = "string" + } + } + } +} + +transform { + Replace { + source_table_name = "fake" + result_table_name = "fake1" + replace_field = "name" + pattern = ".+" + replacement = "b" + is_regex = true + } +} + +sink { + Console { + source_table_name = "fake1" + } +} +``` + +## Changelog + +### new version + +- Add Replace Transform Connector + diff --git a/versioned_docs/version-2.3.7/transform-v2/split.md b/versioned_docs/version-2.3.7/transform-v2/split.md new file mode 100644 index 000000000000..ecfe94c854bf --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/split.md @@ -0,0 +1,72 @@ +# Split + +> Split transform plugin + +## Description + +Split a field to more than one field. 
+ +## Options + +| name | type | required | default value | +|---------------|--------|----------|---------------| +| separator | string | yes | | +| split_field | string | yes | | +| output_fields | array | yes | | + +### separator [string] + +The list of fields that need to be kept. Fields not in the list will be deleted + +### split_field[string] + +The field to be split + +### output_fields[array] + +The result fields after split + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Example + +The data read from source is a table like this: + +| name | age | card | +|----------|-----|------| +| Joy Ding | 20 | 123 | +| May Ding | 20 | 123 | +| Kin Dom | 20 | 123 | +| Joy Dom | 20 | 123 | + +We want split `name` field to `first_name` and `second name`, we can add `Split` transform like this + +``` +transform { + Split { + source_table_name = "fake" + result_table_name = "fake1" + separator = " " + split_field = "name" + output_fields = [first_name, second_name] + } +} +``` + +Then the data in result table `fake1` will like this + +| name | age | card | first_name | last_name | +|----------|-----|------|------------|-----------| +| Joy Ding | 20 | 123 | Joy | Ding | +| May Ding | 20 | 123 | May | Ding | +| Kin Dom | 20 | 123 | Kin | Dom | +| Joy Dom | 20 | 123 | Joy | Dom | + +## Changelog + +### new version + +- Add Split Transform Connector + diff --git a/versioned_docs/version-2.3.7/transform-v2/sql-functions.md b/versioned_docs/version-2.3.7/transform-v2/sql-functions.md new file mode 100644 index 000000000000..3438a24de9c6 --- /dev/null +++ b/versioned_docs/version-2.3.7/transform-v2/sql-functions.md @@ -0,0 +1,975 @@ +# SQL Functions + +> The Functions of SQL transform plugin + +## String Functions + +### ASCII + +```ASCII(string)``` + +Returns the ```ASCII``` value of the first character in the string. This method returns an int. + +Example: + +ASCII('Hi') + +### BIT_LENGTH + +```BIT_LENGTH(bytes)``` + +Returns the number of bits in a binary string. This method returns a long. + +Example: + +BIT_LENGTH(NAME) + +### CHAR_LENGTH / LENGTH + +```CHAR_LENGTH | LENGTH (string)``` + +Returns the number of characters in a character string. This method returns a long. + +Example: + +CHAR_LENGTH(NAME) + +### OCTET_LENGTH + +```OCTET_LENGTH(bytes)``` + +Returns the number of bytes in a binary string. This method returns a long. + +Example: + +OCTET_LENGTH(NAME) + +### CHAR / CHR + +```CHAR | CHR (int)``` + +Returns the character that represents the ASCII value. This method returns a string. + +Example: + +CHAR(65) + +### CONCAT + +```CONCAT(string, string[, string ...] )``` + +Combines strings. Unlike with the operator ```||```, **NULL** parameters are ignored, and do not cause the result to become **NULL**. If all parameters are NULL the result is an empty string. This method returns a string. + +Example: + +CONCAT(NAME, '_') + +### CONCAT_WS + +```CONCAT_WS(separatorString, string, string[, string ...] )``` + +Combines strings with separator. If separator is **NULL** it is treated like an empty string. Other **NULL** parameters are ignored. Remaining **non-NULL** parameters, if any, are concatenated with the specified separator. If there are no remaining parameters the result is an empty string. This method returns a string. + +Example: + +CONCAT_WS(',', NAME, '_') + +### HEXTORAW + +```HEXTORAW(string)``` + +Converts a hex representation of a string to a string. 
4 hex characters per string character are used. + +Example: + +HEXTORAW(DATA) + +### RAWTOHEX + +```RAWTOHEX(string)``` + +```RAWTOHEX(bytes)``` + +Converts a string or bytes to the hex representation. 4 hex characters per string character are used. This method returns a string. + +Example: + +RAWTOHEX(DATA) + +### INSERT + +```INSERT(originalString, startInt, lengthInt, addString)``` + +Inserts a additional string into the original string at a specified start position. The length specifies the number of characters that are removed at the start position in the original string. This method returns a string. + +Example: + +INSERT(NAME, 1, 1, ' ') + +### LOWER / LCASE + +```LOWER | LCASE (string)``` + +Converts a string to lowercase. + +Example: + +LOWER(NAME) + +### UPPER / UCASE + +```UPPER | UCASE (string)``` + +Converts a string to uppercase. + +Example: + +UPPER(NAME) + +### LEFT + +```LEFT(string, int)``` + +Returns the leftmost number of characters. + +Example: + +LEFT(NAME, 3) + +### RIGHT + +```RIGHT(string, int)``` + +Returns the rightmost number of characters. + +Example: + +RIGHT(NAME, 3) + +### LOCATE / INSTR / POSITION + +```LOCATE(searchString, string[, startInit])``` + +```INSTR(string, searchString[, startInit])``` + +```POSITION(searchString, string)``` + +Returns the location of a search string in a string. If a start position is used, the characters before it are ignored. If position is negative, the rightmost location is returned. 0 is returned if the search string is not found. Please note this function is case sensitive, even if the parameters are not. + +Example: + +LOCATE('.', NAME) + +### LPAD + +```LPAD(string ,int[, string])``` + +Left pad the string to the specified length. If the length is shorter than the string, it will be truncated at the end. If the padding string is not set, spaces will be used. + +Example: + +LPAD(AMOUNT, 10, '*') + +### RPAD + +```RPAD(string, int[, string])``` + +Right pad the string to the specified length. If the length is shorter than the string, it will be truncated. If the padding string is not set, spaces will be used. + +Example: + +RPAD(TEXT, 10, '-') + +### LTRIM + +```LTRIM(string[, characterToTrimString])``` + +Removes all leading spaces or other specified characters from a string. + +This function is deprecated, use TRIM instead of it. + +Example: + +LTRIM(NAME) + +### RTRIM + +```RTRIM(string[, characterToTrimString])``` + +Removes all trailing spaces or other specified characters from a string. + +This function is deprecated, use TRIM instead of it. + +Example: + +RTRIM(NAME) + +### TRIM + +```TRIM(string[, characterToTrimString])``` + +Removes all leading spaces or other specified characters from a string. + +This function is deprecated, use TRIM instead of it. + +Example: + +LTRIM(NAME) + +### REGEXP_REPLACE + +```REGEXP_REPLACE(inputString, regexString, replacementString[, flagsString])``` + +Replaces each substring that matches a regular expression. For details, see the Java String.replaceAll() method. If any parameter is null (except optional flagsString parameter), the result is null. + +Flags values are limited to 'i', 'c', 'n', 'm'. Other symbols cause exception. Multiple symbols could be used in one flagsString parameter (like 'im'). Later flags override first ones, for example 'ic' is equivalent to case sensitive matching 'c'. 
+ +'i' enables case insensitive matching (Pattern.CASE_INSENSITIVE) + +'c' disables case insensitive matching (Pattern.CASE_INSENSITIVE) + +'n' allows the period to match the newline character (Pattern.DOTALL) + +'m' enables multiline mode (Pattern.MULTILINE) + +Example: + +REGEXP_REPLACE('Hello World', ' +', ' ') +REGEXP_REPLACE('Hello WWWWorld', 'w+', 'W', 'i') + +### REGEXP_LIKE + +```REGEXP_LIKE(inputString, regexString[, flagsString])``` + +Matches string to a regular expression. For details, see the Java Matcher.find() method. If any parameter is null (except optional flagsString parameter), the result is null. + +Flags values are limited to 'i', 'c', 'n', 'm'. Other symbols cause exception. Multiple symbols could be used in one flagsString parameter (like 'im'). Later flags override first ones, for example 'ic' is equivalent to case sensitive matching 'c'. + +'i' enables case insensitive matching (Pattern.CASE_INSENSITIVE) + +'c' disables case insensitive matching (Pattern.CASE_INSENSITIVE) + +'n' allows the period to match the newline character (Pattern.DOTALL) + +'m' enables multiline mode (Pattern.MULTILINE) + +Example: + +REGEXP_LIKE('Hello World', '[A-Z ]*', 'i') + +### REGEXP_SUBSTR + +```REGEXP_SUBSTR(inputString, regexString[, positionInt, occurrenceInt, flagsString, groupInt])``` + +Matches string to a regular expression and returns the matched substring. For details, see the java.util.regex.Pattern and related functionality. + +The parameter position specifies where in inputString the match should start. Occurrence indicates which occurrence of pattern in inputString to search for. + +Flags values are limited to 'i', 'c', 'n', 'm'. Other symbols cause exception. Multiple symbols could be used in one flagsString parameter (like 'im'). Later flags override first ones, for example 'ic' is equivalent to case sensitive matching 'c'. + +'i' enables case insensitive matching (Pattern.CASE_INSENSITIVE) + +'c' disables case insensitive matching (Pattern.CASE_INSENSITIVE) + +'n' allows the period to match the newline character (Pattern.DOTALL) + +'m' enables multiline mode (Pattern.MULTILINE) + +If the pattern has groups, the group parameter can be used to specify which group to return. + +Example: + +REGEXP_SUBSTR('2020-10-01', '\d{4}') +REGEXP_SUBSTR('2020-10-01', '(\d{4})-(\d{2})-(\d{2})', 1, 1, NULL, 2) + +### REPEAT + +```REPEAT(string, int)``` + +Returns a string repeated some number of times. + +Example: + +REPEAT(NAME || ' ', 10) + +### REPLACE + +```REPLACE(string, searchString[, replacementString])``` + +Replaces all occurrences of a search string in a text with another string. If no replacement is specified, the search string is removed from the original string. If any parameter is null, the result is null. + +Example: + +REPLACE(NAME, ' ') + +### SOUNDEX + +```SOUNDEX(string)``` + +Returns a four character code representing the sound of a string. This method returns a string, or null if parameter is null. See https://en.wikipedia.org/wiki/Soundex for more information. + +Example: + +SOUNDEX(NAME) + +### SPACE + +```SPACE(int)``` + +Returns a string consisting of a number of spaces. + +Example: + +SPACE(80) + +### SUBSTRING / SUBSTR + +```SUBSTRING | SUBSTR (string, startInt[, lengthInt ])``` + +Returns a substring of a string starting at a position. If the start index is negative, then the start index is relative to the end of the string. The length is optional. 
+ +Example: + +CALL SUBSTRING('[Hello]', 2); +CALL SUBSTRING('hour', 3, 2); + +### TO_CHAR + +```TO_CHAR(value[, formatString])``` + +Oracle-compatible TO_CHAR function that can format a timestamp, a number, or text. + +Example: + +CALL TO_CHAR(SYS_TIME, 'yyyy-MM-dd HH:mm:ss') + +### TRANSLATE + +```TRANSLATE(value, searchString, replacementString)``` + +Oracle-compatible TRANSLATE function that replaces a sequence of characters in a string with another set of characters. + +Example: + +CALL TRANSLATE('Hello world', 'eo', 'EO') + +## Numeric Functions + +### ABS + +```ABS(numeric)``` + +Returns the absolute value of a specified value. The returned value is of the same data type as the parameter. + +Note that TINYINT, SMALLINT, INT, and BIGINT data types cannot represent absolute values of their minimum negative values, because they have more negative values than positive. For example, for INT data type allowed values are from -2147483648 to 2147483647. ABS(-2147483648) should be 2147483648, but this value is not allowed for this data type. It leads to an exception. To avoid it cast argument of this function to a higher data type. + +Example: + +ABS(I) + +### ACOS + +```ACOS(numeric)``` + +Calculate the arc cosine. See also Java Math.acos. This method returns a double. + +Example: + +ACOS(D) + +### ASIN + +```ASIN(numeric)``` + +Calculate the arc sine. See also Java Math.asin. This method returns a double. + +Example: + +ASIN(D) + +### ATAN + +```ATAN(numeric)``` + +Calculate the arc tangent. See also Java Math.atan. This method returns a double. + +Example: + +ATAN(D) + +### COS + +```COS(numeric)``` + +Calculate the trigonometric cosine. See also Java Math.cos. This method returns a double. + +Example: + +COS(ANGLE) + +### COSH + +```COSH(numeric)``` + +Calculate the hyperbolic cosine. See also Java Math.cosh. This method returns a double. + +Example: + +COSH(X) + +### COT + +```COT(numeric)``` + +Calculate the trigonometric cotangent (1/TAN(ANGLE)). See also Java Math.* functions. This method returns a double. + +Example: + +COT(ANGLE) + +### SIN + +```SIN(numeric)``` + +Calculate the trigonometric sine. See also Java Math.sin. This method returns a double. + +Example: + +SIN(ANGLE) + +### SINH + +```SINH(numeric)``` + +Calculate the hyperbolic sine. See also Java Math.sinh. This method returns a double. + +Example: + +SINH(ANGLE) + +### TAN + +```TAN(numeric)``` + +Calculate the trigonometric tangent. See also Java Math.tan. This method returns a double. + +Example: + +TAN(ANGLE) + +### TANH + +```TANH(numeric)``` + +Calculate the hyperbolic tangent. See also Java Math.tanh. This method returns a double. + +Example: + +TANH(X) + +### MOD + +```MOD(dividendNumeric, divisorNumeric )``` + +The modulus expression. + +Result has the same type as divisor. Result is NULL if either of arguments is NULL. If divisor is 0, an exception is raised. Result has the same sign as dividend or is equal to 0. + +Usually arguments should have scale 0, but it isn't required by H2. + +Example: + +MOD(A, B) + +### CEIL / CEILING + +```CEIL | CEILING (numeric)``` + +Returns the smallest integer value that is greater than or equal to the argument. This method returns value of the same type as argument, but with scale set to 0 and adjusted precision, if applicable. + +Example: + +CEIL(A) + +### EXP + +```EXP(numeric)``` + +See also Java Math.exp. This method returns a double. + +Example: + +EXP(A) + +### FLOOR + +```FLOOR(numeric)``` + +Returns the largest integer value that is less than or equal to the argument. 
This method returns value of the same type as argument, but with scale set to 0 and adjusted precision, if applicable. + +Example: + +FLOOR(A) + +### LN + +```LN(numeric)``` + +Calculates the natural (base e) logarithm as a double value. Argument must be a positive numeric value. + +Example: + +LN(A) + +### LOG + +```LOG(baseNumeric, numeric)``` + +Calculates the logarithm with specified base as a double value. Argument and base must be positive numeric values. Base cannot be equal to 1. + +The default base is e (natural logarithm), in the PostgreSQL mode the default base is base 10. In MSSQLServer mode the optional base is specified after the argument. + +Single-argument variant of LOG function is deprecated, use LN or LOG10 instead. + +Example: + +LOG(2, A) + +### LOG10 + +```LOG10(numeric)``` + +Calculates the base 10 logarithm as a double value. Argument must be a positive numeric value. + +Example: + +LOG10(A) + +### RADIANS + +```RADIANS(numeric)``` + +See also Java Math.toRadians. This method returns a double. + +Example: + +RADIANS(A) + +### SQRT + +```SQRT(numeric)``` + +See also Java Math.sqrt. This method returns a double. + +Example: + +SQRT(A) + +### PI + +```PI()``` + +See also Java Math.PI. This method returns a double. + +Example: + +PI() + +### POWER + +```POWER(numeric, numeric)``` + +See also Java Math.pow. This method returns a double. + +Example: + +POWER(A, B) + +### RAND / RANDOM + +```RAND | RANDOM([ int ])``` + +Calling the function without parameter returns the next a pseudo random number. Calling it with an parameter seeds the session's random number generator. This method returns a double between 0 (including) and 1 (excluding). + +Example: + +RAND() + +### ROUND + +```ROUND(numeric[, digitsInt])``` + +Rounds to a number of fractional digits. This method returns value of the same type as argument, but with adjusted precision and scale, if applicable. + +Example: + +ROUND(N, 2) + +### SIGN + +```SIGN(numeric)``` + +Returns -1 if the value is smaller than 0, 0 if zero or NaN, and otherwise 1. + +Example: + +SIGN(N) + +### TRUNC + +```TRUNC | TRUNCATE(numeric[, digitsInt])``` + +When a numeric argument is specified, truncates it to a number of digits (to the next value closer to 0) and returns value of the same type as argument, but with adjusted precision and scale, if applicable. + +Example: + +TRUNC(N, 2) + +## Time and Date Functions + +### CURRENT_DATE + +```CURRENT_DATE [()]``` + +Returns the current date. + +These functions return the same value within a transaction (default) or within a command depending on database mode. + +Example: + +CURRENT_DATE + +### CURRENT_TIME + +```CURRENT_TIME [()]``` + +Returns the current time with system time zone. The actual maximum available precision depends on operating system and JVM and can be 3 (milliseconds) or higher. Higher precision is not available before Java 9. + +Example: + +CURRENT_TIME + +### CURRENT_TIMESTAMP / NOW + +```CURRENT_TIMESTAMP[()] | NOW()``` + +Returns the current timestamp with system time zone. The actual maximum available precision depends on operating system and JVM and can be 3 (milliseconds) or higher. Higher precision is not available before Java 9. + +Example: + +CURRENT_TIMESTAMP + +### DATEADD / TIMESTAMPADD + +```DATEADD| TIMESTAMPADD(dateAndTime, addIntLong, datetimeFieldString)``` + +Adds units to a date-time value. The datetimeFieldString indicates the unit. Use negative values to subtract units. 
addIntLong may be a long value when manipulating milliseconds, microseconds, or nanoseconds otherwise its range is restricted to int. This method returns a value with the same type as specified value if unit is compatible with this value. If specified field is a HOUR, MINUTE, SECOND, MILLISECOND, etc and value is a DATE value DATEADD returns combined TIMESTAMP. Fields DAY, MONTH, YEAR, WEEK, etc are not allowed for TIME values. + +Example: + +DATEADD(CREATED, 1, 'MONTH') + +### DATEDIFF + +```DATEDIFF(aDateAndTime, bDateAndTime, datetimeFieldString)``` + +Returns the number of crossed unit boundaries between two date-time values. This method returns a long. The datetimeField indicates the unit. + +Example: + +DATEDIFF(T1.CREATED, T2.CREATED, 'MONTH') + +### DATE_TRUNC + +```DATE_TRUNC (dateAndTime, datetimeFieldString)``` + +Truncates the specified date-time value to the specified field. + +Example: + +DATE_TRUNC(CREATED, 'DAY'); + +### DAYNAME + +```DAYNAME(dateAndTime)``` + +Returns the name of the day (in English). + +Example: + +DAYNAME(CREATED) + +### DAY_OF_MONTH + +```DAY_OF_MONTH(dateAndTime)``` + +Returns the day of the month (1-31). + +Example: + +DAY_OF_MONTH(CREATED) + +### DAY_OF_WEEK + +```DAY_OF_WEEK(dateAndTime)``` + +Returns the day of the week (1-7) (Monday-Sunday), locale-specific. + +Example: + +DAY_OF_WEEK(CREATED) + +### DAY_OF_YEAR + +```DAY_OF_YEAR(dateAndTime)``` + +Returns the day of the year (1-366). + +Example: + +DAY_OF_YEAR(CREATED) + +### EXTRACT + +```EXTRACT ( datetimeField FROM dateAndTime)``` + +Returns a value of the specific time unit from a date/time value. This method returns a numeric value with EPOCH field and an int for all other fields. + +Example: + +EXTRACT(SECOND FROM CURRENT_TIMESTAMP) + +### FORMATDATETIME + +```FORMATDATETIME (dateAndTime, formatString)``` + +Formats a date, time or timestamp as a string. The most important format characters are: y year, M month, d day, H hour, m minute, s second. For details of the format, see java.time.format.DateTimeFormatter. + +This method returns a string. + +Example: + +CALL FORMATDATETIME(CREATED, 'yyyy-MM-dd HH:mm:ss') + +### HOUR + +```HOUR(dateAndTime)``` + +Returns the hour (0-23) from a date/time value. + +Example: + +HOUR(CREATED) + +### MINUTE + +```MINUTE(dateAndTime)``` + +Returns the minute (0-59) from a date/time value. + +This function is deprecated, use EXTRACT instead of it. + +Example: + +MINUTE(CREATED) + +### MONTH + +```MONTH(dateAndTime)``` + +Returns the month (1-12) from a date/time value. + +This function is deprecated, use EXTRACT instead of it. + +Example: + +MONTH(CREATED) + +### MONTHNAME + +```MONTHNAME(dateAndTime)``` + +Returns the name of the month (in English). + +Example: + +MONTHNAME(CREATED) + +### IS_DATE + +```IS_DATE(string, formatString)``` +Parses a string and returns a boolean value. The most important format characters are: y year, M month, d day, H hour, m minute, s second. For details of the format, see java.time.format.DateTimeFormatter. + +Example: + +CALL IS_DATE('2021-04-08 13:34:45','yyyy-MM-dd HH:mm:ss') + +### PARSEDATETIME / TO_DATE + +```PARSEDATETIME | TO_DATE(string, formatString)``` +Parses a string and returns a TIMESTAMP WITH TIME ZONE value. The most important format characters are: y year, M month, d day, H hour, m minute, s second. For details of the format, see java.time.format.DateTimeFormatter. 
+
+Example:
+
+CALL PARSEDATETIME('2021-04-08 13:34:45','yyyy-MM-dd HH:mm:ss')
+
+### QUARTER
+
+```QUARTER(dateAndTime)```
+
+Returns the quarter (1-4) from a date/time value.
+
+Example:
+
+QUARTER(CREATED)
+
+### SECOND
+
+```SECOND(dateAndTime)```
+
+Returns the second (0-59) from a date/time value.
+
+This function is deprecated, use EXTRACT instead of it.
+
+Example:
+
+SECOND(CREATED)
+
+### WEEK
+
+```WEEK(dateAndTime)```
+
+Returns the week (1-53) from a date/time value.
+
+This function uses the current system locale.
+
+Example:
+
+WEEK(CREATED)
+
+### YEAR
+
+```YEAR(dateAndTime)```
+
+Returns the year from a date/time value.
+
+Example:
+
+YEAR(CREATED)
+
+### FROM_UNIXTIME
+
+```FROM_UNIXTIME(unixtime, formatString, timeZone)```
+
+Converts the number of seconds from the UNIX epoch (1970-01-01 00:00:00 UTC) to a string representing the timestamp of that moment.
+
+The most important format characters are: y year, M month, d day, H hour, m minute, s second. For details of the format, see `java.time.format.DateTimeFormatter`.
+
+`timeZone` is optional; the default value is the system's time zone. The `timeZone` value can be a `UTC+` timezone offset, for example, `UTC+8` represents the Asia/Shanghai time zone, see `java.time.ZoneId`.
+
+This method returns a string.
+
+Example:
+
+// use default zone
+
+CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss')
+
+or
+
+// use given zone
+
+CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss','UTC+6')
+
+## System Functions
+
+### CAST
+
+```CAST(value as dataType)```
+
+Converts a value to another data type.
+
+Supported data types: STRING | VARCHAR, INT | INTEGER, LONG | BIGINT, BYTE, FLOAT, DOUBLE, DECIMAL(p,s), TIMESTAMP, DATE, TIME, BYTES
+
+Example:
+
+CAST(NAME AS INT)
+
+### COALESCE
+
+```COALESCE(aValue, bValue [,...])```
+
+Returns the first value that is not null.
+
+Example:
+
+COALESCE(A, B, C)
+
+### IFNULL
+
+```IFNULL(aValue, bValue)```
+
+Returns the first value that is not null.
+
+Example:
+
+IFNULL(A, B)
+
+### NULLIF
+
+```NULLIF(aValue, bValue)```
+
+Returns NULL if 'a' is equal to 'b', otherwise 'a'.
+
+Example:
+
+NULLIF(A, B)
+
+### CASE WHEN
+
+```
+select
+  case
+    when c_string in ('c_string') then 1
+    else 0
+  end as c_string_1,
+  case
+    when c_string not in ('c_string') then 1
+    else 0
+  end as c_string_0,
+  case
+    when c_tinyint = 117
+    and TO_CHAR(c_boolean) = 'true' then 1
+    else 0
+  end as c_tinyint_boolean_1,
+  case
+    when c_tinyint != 117
+    and TO_CHAR(c_boolean) = 'true' then 1
+    else 0
+  end as c_tinyint_boolean_0,
+  case
+    when c_tinyint != 117
+    or TO_CHAR(c_boolean) = 'true' then 1
+    else 0
+  end as c_tinyint_boolean_or_1,
+  case
+    when c_int > 1
+    and c_bigint > 1
+    and c_float > 1
+    and c_double > 1
+    and c_decimal > 1 then 1
+    else 0
+  end as c_number_1,
+  case
+    when c_tinyint <> 117 then 1
+    else 0
+  end as c_number_0
+from
+  fake
+```
+
+It is used to evaluate conditions and return different values depending on which condition matches.
+
+Example:
+
+case when c_string in ('c_string') then 1 else 0 end
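+
+## Using Functions in a Transform
+
+All of the functions above are invoked from the `query` of a `Sql` transform rather than called on their own. A minimal sketch that combines functions from several of the sections above, assuming a `fake` source table with `id`, `name` and `age` columns (the same sample table used by the other transform examples):
+
+```
+transform {
+  Sql {
+    source_table_name = "fake"
+    result_table_name = "fake1"
+    # string, system and date/time functions combined in a single query
+    query = "select id, UPPER(name) as name, COALESCE(age, 0) as age, FORMATDATETIME(CURRENT_TIMESTAMP, 'yyyy-MM-dd HH:mm:ss') as load_time from fake"
+  }
+}
+```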
diff --git a/versioned_docs/version-2.3.7/transform-v2/sql-udf.md b/versioned_docs/version-2.3.7/transform-v2/sql-udf.md
new file mode 100644
index 000000000000..df5d3b93fe52
--- /dev/null
+++ b/versioned_docs/version-2.3.7/transform-v2/sql-udf.md
@@ -0,0 +1,134 @@
+# SQL UDF
+
+> UDF of SQL transform plugin
+
+## Description
+
+Use the UDF SPI to extend the SQL transform function library.
+
+## UDF API
+
+```java
+package org.apache.seatunnel.transform.sql.zeta;
+
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+
+import java.util.List;
+
+public interface ZetaUDF {
+    /**
+     * Function name
+     *
+     * @return function name
+     */
+    String functionName();
+
+    /**
+     * The type of the function result
+     *
+     * @param argsType input arguments type
+     * @return result type
+     */
+    SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType);
+
+    /**
+     * Evaluate
+     *
+     * @param args input arguments
+     * @return result value
+     */
+    Object evaluate(List<Object> args);
+}
+```
+
+## UDF Implements Example
+
+Add these dependencies with `provided` scope to your Maven project:
+
+```xml
+<dependencies>
+    <dependency>
+        <groupId>org.apache.seatunnel</groupId>
+        <artifactId>seatunnel-transforms-v2</artifactId>
+        <version>2.3.2</version>
+        <scope>provided</scope>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.seatunnel</groupId>
+        <artifactId>seatunnel-api</artifactId>
+        <version>2.3.2</version>
+        <scope>provided</scope>
+    </dependency>
+    <dependency>
+        <groupId>com.google.auto.service</groupId>
+        <artifactId>auto-service</artifactId>
+        <version>1.0.1</version>
+        <scope>provided</scope>
+    </dependency>
+</dependencies>
+```
+
+Add a Java class that implements `ZetaUDF`, like this:
+
+```java
+import org.apache.seatunnel.api.table.type.BasicType;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+import org.apache.seatunnel.transform.sql.zeta.ZetaUDF;
+
+import com.google.auto.service.AutoService;
+
+import java.util.List;
+
+@AutoService(ZetaUDF.class)
+public class ExampleUDF implements ZetaUDF {
+    @Override
+    public String functionName() {
+        return "EXAMPLE";
+    }
+
+    @Override
+    public SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType) {
+        return BasicType.STRING_TYPE;
+    }
+
+    @Override
+    public Object evaluate(List<Object> args) {
+        String arg = (String) args.get(0);
+        if (arg == null) return null;
+        return "UDF: " + arg;
+    }
+}
+```
+
+Package the UDF project and copy the jar into ${SEATUNNEL_HOME}/lib. If your UDF uses a third-party library, you also need to put that library into ${SEATUNNEL_HOME}/lib.
+If you use cluster mode, you need to put the jars into every node's ${SEATUNNEL_HOME}/lib folder and restart the cluster.
+
+## Example
+
+The data read from source is a table like this:
+
+| id | name     | age |
+|----|----------|-----|
+| 1  | Joy Ding | 20  |
+| 2  | May Ding | 21  |
+| 3  | Kin Dom  | 24  |
+| 4  | Joy Dom  | 22  |
+
+We use the UDF in a SQL query to transform the source data like this:
+
+```
+transform {
+  Sql {
+    source_table_name = "fake"
+    result_table_name = "fake1"
+    query = "select id, example(name) as name, age from fake"
+  }
+}
+```
+
+Then the data in result table `fake1` will be updated to:
+
+| id | name          | age |
+|----|---------------|-----|
+| 1  | UDF: Joy Ding | 20  |
+| 2  | UDF: May Ding | 21  |
+| 3  | UDF: Kin Dom  | 24  |
+| 4  | UDF: Joy Dom  | 22  |
+
+## Changelog
+
+### new version
+
+- Add UDF of SQL Transform Connector
+
diff --git a/versioned_docs/version-2.3.7/transform-v2/sql.md b/versioned_docs/version-2.3.7/transform-v2/sql.md
new file mode 100644
index 000000000000..a3bdb9bbfc1b
--- /dev/null
+++ b/versioned_docs/version-2.3.7/transform-v2/sql.md
@@ -0,0 +1,160 @@
+# SQL
+
+> SQL transform plugin
+
+## Description
+
+Use SQL to transform the given input rows.
+
+The SQL transform uses an in-memory SQL engine, so we can use SQL functions and the capabilities of the SQL engine to implement the transform task.
+
+## Options
+
+| name              | type   | required | default value |
+|-------------------|--------|----------|---------------|
+| source_table_name | string | yes      | -             |
+| result_table_name | string | yes      | -             |
+| query             | string | yes      | -             |
+
+### source_table_name [string]
+
+The source table name; the table name used in the query SQL must match this field.
+
+### query [string]
+
+The query SQL. It is a simple SQL that supports basic functions and criteria filter operations. Complex SQL is not supported yet, including multi-source table/row JOINs, AGGREGATE operations and the like.
+
+The query expression can be `select [table_name.]column_a` to query the column named `column_a`; the table name is optional.
+
+Or it can be `select c_row.c_inner_row.column_b` to query an inline struct column, e.g. the column named `column_b` nested inside the `c_row` and `c_inner_row` columns. **In this kind of query expression, a table name can't be used.**
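+
+For example, both column forms described above can appear in the same query. A minimal sketch, reusing the `fake` table from the example below; the `fake.` prefix is the only addition:
+
+```
+transform {
+  Sql {
+    source_table_name = "fake"
+    result_table_name = "fake1"
+    # "fake.id" and the unqualified "name"/"age" resolve against the same source table
+    query = "select fake.id, name, age from fake"
+  }
+}
+```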
+
+## Example
+
+The data read from source is a table like this:
+
+| id | name     | age |
+|----|----------|-----|
+| 1  | Joy Ding | 20  |
+| 2  | May Ding | 21  |
+| 3  | Kin Dom  | 24  |
+| 4  | Joy Dom  | 22  |
+
+We use a SQL query to transform the source data like this:
+
+```
+transform {
+  Sql {
+    source_table_name = "fake"
+    result_table_name = "fake1"
+    query = "select id, concat(name, '_') as name, age+1 as age from fake where id>0"
+  }
+}
+```
+
+Then the data in result table `fake1` will be updated to:
+
+| id | name      | age |
+|----|-----------|-----|
+| 1  | Joy Ding_ | 21  |
+| 2  | May Ding_ | 22  |
+| 3  | Kin Dom_  | 25  |
+| 4  | Joy Dom_  | 23  |
+
+### Struct query
+
+If your upstream data schema is like this:
+
+```hocon
+source {
+  FakeSource {
+    result_table_name = "fake"
+    row.num = 100
+    string.template = ["innerQuery"]
+    schema = {
+      fields {
+        name = "string"
+        c_date = "date"
+        c_row = {
+          c_inner_row = {
+            c_inner_int = "int"
+            c_inner_string = "string"
+            c_inner_timestamp = "timestamp"
+            c_map_1 = "map<string, string>"
+            c_map_2 = "map<string, map<string, string>>"
+          }
+          c_string = "string"
+        }
+      }
+    }
+  }
+}
+```
+
+All of these queries are valid:
+
+```sql
+select
+name,
+c_date,
+c_row,
+c_row.c_inner_row,
+c_row.c_string,
+c_row.c_inner_row.c_inner_int,
+c_row.c_inner_row.c_inner_string,
+c_row.c_inner_row.c_inner_timestamp,
+c_row.c_inner_row.c_map_1,
+c_row.c_inner_row.c_map_1.some_key
+```
+
+But this query is not valid:
+
+```sql
+select
+c_row.c_inner_row.c_map_2.some_key.inner_map_key
+```
+
+A map must be the last level of the query path; querying a map nested inside another map is not supported.
+
+## Job Config Example
+
+```
+env {
+  job.mode = "BATCH"
+}
+
+source {
+  FakeSource {
+    result_table_name = "fake"
+    row.num = 100
+    schema = {
+      fields {
+        id = "int"
+        name = "string"
+        age = "int"
+      }
+    }
+  }
+}
+
+transform {
+  Sql {
+    source_table_name = "fake"
+    result_table_name = "fake1"
+    query = "select id, concat(name, '_') as name, age+1 as age from fake where id>0"
+  }
+}
+
+sink {
+  Console {
+    source_table_name = "fake1"
+  }
+}
+```
+
+## Changelog
+
+- Support struct query
+
+### new version
+
+- Add SQL Transform Connector
+
diff --git a/versioned_sidebars/version-2.3.7-sidebars.json b/versioned_sidebars/version-2.3.7-sidebars.json
new file mode 100644
index 000000000000..3b68102a6e0b
--- /dev/null
+++ b/versioned_sidebars/version-2.3.7-sidebars.json
@@ -0,0 +1,170 @@
+{ + "docs": [ + "about", + { + "type": "category", + "label": "Quick Start - V2", + "items": [ + { + "type": "category", + "label": "Start With Locally", + "items": [ + { + "type": "autogenerated", + "dirName": "start-v2/locally" + } + ] + }, + { + "type": "category", + "label": "Start With Docker", + "items": [ + { + "type": "autogenerated", + "dirName": "start-v2/docker" + } + ] + }, + { + "type": "category", + "label": "Start With K8s", + "items": [ + { + "type": "autogenerated", + "dirName": "start-v2/kubernetes" + } + ] + } + ] + }, + { + "type": "category", + "label": "Concept", + "items": [ + "concept/config", + "concept/connector-v2-features", + "concept/schema-feature", + "concept/JobEnvConfig", + "concept/sink-options-placeholders", + "concept/sql-config", + "concept/speed-limit", + "concept/event-listener" + ] + }, + "Connector-v2-release-state", + { + "type": "category", + "label": "Connector-V2", + "items": [ + { + "type": "category", + "label": "Source", + "link": { + "type":
"generated-index", + "title": "Source(V2) of SeaTunnel", + "description": "List all source(v2) supported Apache SeaTunnel for now.", + "slug": "/category/source-v2", + "keywords": [ + "source" + ], + "image": "/img/favicon.ico" + }, + "items": [ + { + "type": "autogenerated", + "dirName": "connector-v2/source" + } + ] + }, + { + "type": "category", + "label": "Sink", + "link": { + "type": "generated-index", + "title": "Sink(V2) of SeaTunnel", + "description": "List all sink(v2) supported Apache SeaTunnel for now.", + "slug": "/category/sink-v2", + "keywords": [ + "sink" + ], + "image": "/img/favicon.ico" + }, + "items": [ + { + "type": "autogenerated", + "dirName": "connector-v2/sink" + } + ] + }, + "connector-v2/Error-Quick-Reference-Manual", + "connector-v2/Config-Encryption-Decryption" + ] + }, + { + "type": "category", + "label": "Transform-V2", + "link": { + "type": "generated-index", + "title": "Transform V2 of SeaTunnel", + "description": "List all transform v2 supported Apache SeaTunnel for now.", + "slug": "/category/transform-v2", + "keywords": [ + "transform-v2" + ], + "image": "/img/favicon.ico" + }, + "items": [ + { + "type": "autogenerated", + "dirName": "transform-v2" + } + ] + }, + { + "type": "category", + "label": "Command", + "items": [ + "command/usage", + "command/connector-check" + ] + }, + { + "type": "category", + "label": "SeaTunnel Engine", + "items": [ + "seatunnel-engine/about", + "seatunnel-engine/download-seatunnel", + "seatunnel-engine/deployment", + "seatunnel-engine/local-mode-deployment", + "seatunnel-engine/hybrid-cluster-deployment", + "seatunnel-engine/separated-cluster-deployment", + "seatunnel-engine/savepoint", + "seatunnel-engine/checkpoint-storage", + "seatunnel-engine/engine-jar-storage-mode", + "seatunnel-engine/tcp", + "seatunnel-engine/resource-isolation", + "seatunnel-engine/rest-api", + "seatunnel-engine/user-command" + ] + }, + { + "type": "category", + "label": "Other Engine", + "items": [ + "other-engine/flink", + "other-engine/spark" + ] + }, + { + "type": "category", + "label": "Contribution", + "items": [ + "contribution/setup", + "contribution/new-license", + "contribution/coding-guide", + "contribution/contribute-transform-v2-guide" + ] + }, + "faq" + ] +} diff --git a/versions.json b/versions.json index e2f2458bc84b..9842227db87d 100644 --- a/versions.json +++ b/versions.json @@ -1,4 +1,5 @@ [ + "2.3.7", "2.3.6", "2.3.5", "2.3.4",