diff --git a/.github/REVIEWERS.yml b/.github/REVIEWERS.yml index 44e80bd594af..de7cb5476d92 100644 --- a/.github/REVIEWERS.yml +++ b/.github/REVIEWERS.yml @@ -39,7 +39,6 @@ labels: - Abacn - kennknowles - robertwb - - bvolpato - m-trieu - damondouglas exclusionList: [] @@ -49,7 +48,6 @@ labels: - johnjcasey - Abacn - ahmedabu98 - - bvolpato - damondouglas - shunping exclusionList: [] diff --git a/.github/trigger_files/beam_PostCommit_Python_Examples_Direct.json b/.github/trigger_files/beam_PostCommit_Python.json similarity index 100% rename from .github/trigger_files/beam_PostCommit_Python_Examples_Direct.json rename to .github/trigger_files/beam_PostCommit_Python.json diff --git a/.github/trigger_files/beam_PostCommit_Python_ValidatesContainer_Dataflow.json b/.github/trigger_files/beam_PostCommit_Python_ValidatesContainer_Dataflow.json new file mode 100644 index 000000000000..d6c608f6daba --- /dev/null +++ b/.github/trigger_files/beam_PostCommit_Python_ValidatesContainer_Dataflow.json @@ -0,0 +1,3 @@ +{ + "comment": "Modify this file in a trivial way to cause this test suite to run" +} \ No newline at end of file diff --git a/.github/workflows/IO_Iceberg.yml b/.github/workflows/IO_Iceberg.yml index abc75836322c..ecf785f42a5f 100644 --- a/.github/workflows/IO_Iceberg.yml +++ b/.github/workflows/IO_Iceberg.yml @@ -66,13 +66,14 @@ jobs: strategy: matrix: job_name: ["IO_Iceberg"] + job_phrase: ["Run IcebergIO Unit Tests"] timeout-minutes: 60 if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || - github.event.comment.body == 'Run Java_Amqp_IO_Direct PreCommit' + github.event.comment.body == 'Run IcebergIO Unit Tests' runs-on: [self-hosted, ubuntu-20.04, main] steps: - uses: actions/checkout@v4 @@ -84,7 +85,7 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: run Amqp IO build script + - name: run IcebergIO build script uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :sdks:java:io:iceberg:build diff --git a/.github/workflows/beam_LoadTests_Java_PubsubIO.yml b/.github/workflows/beam_LoadTests_Java_PubsubIO.yml new file mode 100644 index 000000000000..07025218e820 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_PubsubIO.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
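#
# Descriptive note on the workflow that follows (summary of its own configuration):
# it runs the Java PubsubIO load test nightly ('30 21 * * *'), on manual
# workflow_dispatch, or when a PR comment matches the phrase
# 'Run Load Tests Java PubsubIO'. The job executes the Gradle task
# :it:google-cloud-platform:PubsubLoadTestLarge on a self-hosted highmem runner
# and exports the resulting metrics to InfluxDB via the -Dinflux* properties.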
+ +name: LoadTests Java PubsubIO + +on: + schedule: + - cron: '30 21 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_PubsubIO: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java PubsubIO' + runs-on: [self-hosted, ubuntu-20.04, highmem] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_PubsubIO"] + job_phrase: ["Run Load Tests Java PubsubIO"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: run PubSub Performance test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :it:google-cloud-platform:PubsubLoadTestLarge --info -DinfluxHost="http://10.128.0.96:8086" -DinfluxDatabase="beam_test_metrics" -DinfluxMeasurement="java_load_test_pubsub" \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Yaml_Xlang_Direct.yml b/.github/workflows/beam_PreCommit_Yaml_Xlang_Direct.yml new file mode 100644 index 000000000000..2f2171f3d4df --- /dev/null +++ b/.github/workflows/beam_PreCommit_Yaml_Xlang_Direct.yml @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
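#
# Descriptive note on the workflow that follows: it defines the
# beam_PreCommit_Yaml_Xlang_Direct suite, which runs the Gradle task
# :sdks:python:yamlIntegrationTests on pushes and pull_request_target events
# touching the listed model/ and sdks/ paths, on a 6-hourly schedule, on
# workflow_dispatch, or when a PR comment matches 'Run Yaml_Xlang_Direct PreCommit',
# and then publishes the pytest XML results. The concurrency group keys on the
# workflow name plus the PR/issue number (or SHA/ref) and the triggering event,
# with cancel-in-progress so a newer run for the same change supersedes an
# in-flight older one.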
+ +name: PostCommit Python Xlang Gcp Direct + +on: + pull_request_target: + paths: ['release/trigger_all_tests.json', 'model/**', 'sdks/python/**'] + issue_comment: + types: [created] + push: + tags: ['v*'] + branches: ['master', 'release-*'] + paths: + - "model/**" + - "release/**" + - "sdks/python/**" + - "sdks/java/extensions/schemaio-expansion-service/**" + - "sdks/java/extensions/sql/**" + - "sdks/java/io/expansion-service/**" + - "sdks/java/io/google-cloud-platform/**" + - ".github/workflows/beam_PreCommit_Yaml_Xlang_Direct.yml" + schedule: + - cron: '30 5/6 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PreCommit_Yaml_Xlang_Direct: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Yaml_Xlang_Direct PreCommit' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PreCommit_Yaml_Xlang_Direct"] + job_phrase: ["Run Yaml_Xlang_Direct"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: | + 3.8 + - name: run PostCommit Yaml Xlang Gcp Direct script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:yamlIntegrationTests + - name: Archive Python Test Results + uses: actions/upload-artifact@v4 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_StressTests_Java_BigQueryIO.yml b/.github/workflows/beam_StressTests_Java_BigQueryIO.yml new file mode 100644 index 000000000000..38bf1b54e082 --- /dev/null +++ b/.github/workflows/beam_StressTests_Java_BigQueryIO.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: StressTests Java BigQueryIO + +on: + schedule: + - cron: '0 10 * * 6' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_StressTests_Java_BigQueryIO: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Stress Tests Java BigQueryIO' + runs-on: [self-hosted, ubuntu-20.04, highmem] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_StressTests_Java_BigQueryIO"] + job_phrase: ["Run Stress Tests Java BigQueryIO"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: run BigQuery StressTest Large + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :it:google-cloud-platform:BigQueryStressTestLarge --info -DinfluxHost="http://10.128.0.96:8086" -DinfluxDatabase="beam_test_metrics" -DinfluxMeasurement="java_stress_test_bigquery" \ No newline at end of file diff --git a/.github/workflows/beam_StressTests_Java_BigTableIO.yml b/.github/workflows/beam_StressTests_Java_BigTableIO.yml new file mode 100644 index 000000000000..31d4de760a11 --- /dev/null +++ b/.github/workflows/beam_StressTests_Java_BigTableIO.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: StressTests Java BigTableIO + +on: + schedule: + - cron: '0 16 * * 6' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_StressTests_Java_BigTableIO: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Stress Tests Java BigTableIO' + runs-on: [self-hosted, ubuntu-20.04, highmem] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_StressTests_Java_BigTableIO"] + job_phrase: ["Run Stress Tests Java BigTableIO"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: run BigTable StressTest Large + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :it:google-cloud-platform:BigTableStressTestLarge --info -DinfluxHost="http://10.128.0.96:8086" -DinfluxDatabase="beam_test_metrics" -DinfluxMeasurement="java_stress_test_bigtable" \ No newline at end of file diff --git a/.github/workflows/beam_StressTests_Java_KafkaIO.yml b/.github/workflows/beam_StressTests_Java_KafkaIO.yml new file mode 100644 index 000000000000..aba7e2175193 --- /dev/null +++ b/.github/workflows/beam_StressTests_Java_KafkaIO.yml @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: StressTests Java KafkaIO + +on: + schedule: + - cron: '0 10 * * 0' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_StressTests_Java_KafkaIO: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Stress Tests Java KafkaIO' + runs-on: [self-hosted, ubuntu-20.04, highmem] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_StressTests_Java_KafkaIO"] + job_phrase: ["Run Stress Tests Java KafkaIO"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Authenticate on GCP + id: auth + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} + - name: Set k8s access + uses: ./.github/actions/setup-k8s-access + with: + cluster_name: beam-utility + k8s_namespace: ${{ matrix.job_name }}-${{ github.run_id }} + cluster_zone: us-central1 + - name: Install Kafka + id: install_kafka + run: | + kubectl apply -k ${{ github.workspace }}/.test-infra/kafka/strimzi/02-kafka-persistent/overlays/gke-internal-load-balanced + kubectl wait kafka beam-testing-cluster --for=condition=Ready --timeout=1800s + - name: Set up Kafka brokers + id: set_brokers + run: | + declare -a kafka_service_brokers + declare -a kafka_service_brokers_ports + for INDEX in {0..2}; do + kubectl wait svc/beam-testing-cluster-kafka-${INDEX} --for=jsonpath='{.status.loadBalancer.ingress[0].ip}' --timeout=1200s + kafka_service_brokers[$INDEX]=$(kubectl get svc beam-testing-cluster-kafka-${INDEX} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + kafka_service_brokers_ports[$INDEX]=$(kubectl get svc beam-testing-cluster-kafka-${INDEX} -o jsonpath='{.spec.ports[0].port}') + 
echo "KAFKA_SERVICE_BROKER_${INDEX}=${kafka_service_brokers[$INDEX]}" >> $GITHUB_OUTPUT + echo "KAFKA_SERVICE_BROKER_PORTS_${INDEX}=${kafka_service_brokers_ports[$INDEX]}" >> $GITHUB_OUTPUT + done + - name: run Kafka StressTest Large + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :it:kafka:KafkaStressTestLarge --info -DbootstrapServers="${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_0 }}:${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_PORTS_0 }},${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_1 }}:${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_PORTS_1 }},${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_2 }}:${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_PORTS_2 }}" -DinfluxHost="http://10.128.0.96:8086" -DinfluxDatabase="beam_test_metrics" -DinfluxMeasurement="java_stress_test_kafka" \ No newline at end of file diff --git a/.github/workflows/beam_StressTests_Java_PubSubIO.yml b/.github/workflows/beam_StressTests_Java_PubSubIO.yml new file mode 100644 index 000000000000..9fb4b19aa3b5 --- /dev/null +++ b/.github/workflows/beam_StressTests_Java_PubSubIO.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: StressTests Java PubSubIO + +on: + schedule: + - cron: '0 22 * * 6' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_StressTests_Java_PubSubIO: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Stress Tests Java PubSubIO' + runs-on: [self-hosted, ubuntu-20.04, highmem] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_StressTests_Java_PubSubIO"] + job_phrase: ["Run Stress Tests Java PubSubIO"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: run PubSub StressTest Large + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :it:google-cloud-platform:PubSubStressTestLarge --info -DinfluxHost="http://10.128.0.96:8086" -DinfluxDatabase="beam_test_metrics" -DinfluxMeasurement="java_stress_test_pubsub" \ No newline at end of file diff --git a/.github/workflows/beam_StressTests_Java_SpannerIO.yml b/.github/workflows/beam_StressTests_Java_SpannerIO.yml new file mode 100644 index 000000000000..f327223cb685 --- /dev/null +++ b/.github/workflows/beam_StressTests_Java_SpannerIO.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: StressTests Java SpannerIO + +on: + schedule: + - cron: '0 22 * * 6' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_StressTests_Java_SpannerIO: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Stress Tests Java SpannerIO' + runs-on: [self-hosted, ubuntu-20.04, highmem] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_StressTests_Java_SpannerIO"] + job_phrase: ["Run Stress Tests Java SpannerIO"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: run Spanner StressTest Large + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :it:google-cloud-platform:SpannerStressTestLarge --info -DinfluxHost="http://10.128.0.96:8086" -DinfluxDatabase="beam_test_metrics" -DinfluxMeasurement="java_stress_test_spanner" \ No newline at end of file diff --git a/CHANGES.md b/CHANGES.md index 5824c71a98dc..941ba23a7573 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -73,6 +73,7 @@ ## Breaking Changes * X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). +* Default consumer polling timeout for KafkaIO.Read was increased from 1 second to 2 seconds. Use KafkaIO.read().withConsumerPollingTimeout(Duration duration) to configure this timeout value when necessary ([#30870](https://github.com/apache/beam/issues/30870)). ## Deprecations diff --git a/examples/notebooks/get-started/try-apache-beam-yaml.ipynb b/examples/notebooks/get-started/try-apache-beam-yaml.ipynb index b18b318e85ea..48771a010d52 100644 --- a/examples/notebooks/get-started/try-apache-beam-yaml.ipynb +++ b/examples/notebooks/get-started/try-apache-beam-yaml.ipynb @@ -533,7 +533,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Beam YAML has EXPERIMENTAL ability to do aggregations to group and combine values across records. The is accomplished via the `Combine` transform type. 
Currently `Combine` needs to be in the `yaml_experimental_features` option (see the bottom of the pipeline) to use this transform.\n", + "Beam YAML has EXPERIMENTAL ability to do aggregations to group and combine values across records. The is accomplished via the `Combine` transform type.\n", "\n", "In this example we'll aggregate our records based on the `is_adult` classification. We'll calculate an average age for each of the groups." ] @@ -566,9 +566,7 @@ " total:\n", " value: age\n", " fn: mean\n", - " - type: LogForTesting\n", - "options:\n", - " yaml_experimental_features: Combine" + " - type: LogForTesting" ] }, { diff --git a/it/google-cloud-platform/build.gradle b/it/google-cloud-platform/build.gradle index 4258baa4f42a..1a7e21e4a79a 100644 --- a/it/google-cloud-platform/build.gradle +++ b/it/google-cloud-platform/build.gradle @@ -82,16 +82,77 @@ dependencies { testRuntimeOnly library.java.slf4j_simple } -tasks.register("GCSPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'FileBasedIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigTablePerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigTableStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOST', ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigTableStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOST', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigQueryPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryIOLT', ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigQueryStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryIOST', ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigQueryStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryIOST', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigQueryStorageApiStreamingPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryStreamingLT', ['configuration':'large', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("PubSubPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'PubSubIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("SpannerStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'SpannerIOST', ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + System.properties) -tasks.register("SpannerStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'SpannerIOST', ['configuration':'large','project':'apache-beam-testing', 
'artifactBucket':'io-performance-temp'] + System.properties) -tasks.register("WordCountIntegrationTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'WordCountIT', ['project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) +tasks.register( + "GCSPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'FileBasedIOLT', + ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigTablePerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOLT', + ['configuration':'large', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigQueryPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryIOLT', + ['configuration':'medium', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigQueryStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryIOST', + ['configuration':'medium', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigQueryStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryIOST', + ['configuration':'large', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigQueryStorageApiStreamingPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryStreamingLT', + ['configuration':'large', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "PubsubLoadTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'PubsubIOLT', + ['configuration':'medium', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "PubsubLoadTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'PubsubIOLT', + ['configuration':'large', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "WordCountIntegrationTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'WordCountIT', + ['project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigTableStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOST', + ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "BigTableStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOST', + ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "SpannerStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'SpannerIOST', + ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "SpannerStressTestLarge", 
IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'SpannerIOST', + ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "PubSubStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'PubSubIOST', + ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties) +tasks.register( + "PubSubStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'PubSubIOST', + ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties) diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java index e5f20c07c01f..bbf9dd0519ec 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java @@ -19,7 +19,6 @@ import com.google.cloud.Timestamp; import java.io.IOException; -import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; import java.util.Map; @@ -116,10 +115,15 @@ protected void exportMetrics( PipelineLauncher.LaunchInfo launchInfo, MetricsConfiguration metricsConfig, boolean exportToInfluxDB, - InfluxDBSettings influxDBSettings) - throws IOException, ParseException, InterruptedException { - - Map metrics = getMetrics(launchInfo, metricsConfig); + InfluxDBSettings influxDBSettings) { + + Map metrics; + try { + metrics = getMetrics(launchInfo, metricsConfig); + } catch (Exception e) { + LOG.warn("Unable to get metrics due to error: {}", e.getMessage()); + return; + } String testId = UUID.randomUUID().toString(); String testTimestamp = Timestamp.now().toString(); diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java index 44a439b0ce91..5cc5d0562b85 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java @@ -172,8 +172,12 @@ protected void exportMetricsToBigQuery(LaunchInfo launchInfo, Mapwrite() - .withTriggeringFrequency(org.joda.time.Duration.standardSeconds(30)) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) .withAvroFormatFunction( new AvroFormatFn( configuration.numColumns, - !("STORAGE_WRITE_API".equalsIgnoreCase(configuration.writeMethod)))); + !(STORAGE_WRITE_API_METHOD.equalsIgnoreCase(configuration.writeMethod)))); break; case JSON: writeIO = BigQueryIO.write() - .withTriggeringFrequency(org.joda.time.Duration.standardSeconds(30)) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) .withSuccessfulInsertsPropagation(false) .withFormatFunction(new JsonFormatFn(configuration.numColumns)); break; } + if (configuration.writeMethod.equals(STORAGE_WRITE_API_METHOD)) { + writeIO = writeIO.withTriggeringFrequency(org.joda.time.Duration.standardSeconds(60)); + } generateDataAndWrite(writeIO); } @@ -265,43 +247,32 @@ private void generateDataAndWrite(BigQueryIO.Write writeIO) throws IOExc BigQueryIO.Write.Method method = BigQueryIO.Write.Method.valueOf(configuration.writeMethod); 
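// The data-generation path below reads KVs from a SyntheticUnboundedSource instead of
// the previous PeriodicImpulse-based generator. When the configured rowsPerSecond
// exceeds the default, each element is fanned out by MultiplierDoFn according to the
// load periods and reshuffled; the values are then counted with CountingFn and written
// to BigQuery with the selected write method. A triggering frequency is applied only
// for the STORAGE_WRITE_API write method (set above).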
writePipeline.getOptions().as(StreamingOptions.class).setStreaming(true); - // The PeriodicImpulse source will generate an element every this many millis: - int fireInterval = 1; // Each element from PeriodicImpulse will fan out to this many elements: int startMultiplier = Math.max(configuration.rowsPerSecond, DEFAULT_ROWS_PER_SECOND) / DEFAULT_ROWS_PER_SECOND; - long stopAfterMillis = - org.joda.time.Duration.standardMinutes(configuration.minutes).getMillis(); - long totalRows = startMultiplier * stopAfterMillis / fireInterval; List loadPeriods = getLoadPeriods(configuration.minutes, DEFAULT_LOAD_INCREASE_ARRAY); - PCollection source = - writePipeline - .apply( - PeriodicImpulse.create() - .stopAfter(org.joda.time.Duration.millis(stopAfterMillis - 1)) - .withInterval(org.joda.time.Duration.millis(fireInterval))) - .apply( - "Extract row IDs", - MapElements.into(TypeDescriptor.of(byte[].class)) - .via(instant -> Longs.toByteArray(instant.getMillis() % totalRows))); + PCollection> source = + writePipeline.apply(Read.from(new SyntheticUnboundedSource(configuration))); if (startMultiplier > 1) { source = source .apply( "One input to multiple outputs", ParDo.of(new MultiplierDoFn<>(startMultiplier, loadPeriods))) - .apply("Reshuffle fanout", Reshuffle.viaRandomKey()) - .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); + .apply("Reshuffle fanout", Reshuffle.of()); } - source.apply( - "Write to BQ", - writeIO - .to(tableQualifier) - .withMethod(method) - .withSchema(schema) - .withCustomGcsTempLocation(ValueProvider.StaticValueProvider.of(tempLocation))); + source + .apply("Extract values", Values.create()) + .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))) + .apply( + "Write to BQ", + writeIO + .to(tableQualifier) + .withMethod(method) + .withSchema(schema) + .withCustomGcsTempLocation(ValueProvider.StaticValueProvider.of(tempLocation))); PipelineLauncher.LaunchConfig options = PipelineLauncher.LaunchConfig.builder("write-bigquery") @@ -332,33 +303,20 @@ private void generateDataAndWrite(BigQueryIO.Write writeIO) throws IOExc region, launchInfo.jobId(), getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); - Long rowCount = resourceManager.getRowCount(tableQualifier); - assertEquals(rowCount, numRecords, 0.5); + Long rowCount = resourceManager.getRowCount(tableName); + + // Assert that numRecords equals or greater than rowCount since there might be + // duplicates when testing big amount of data + assertTrue(numRecords >= rowCount); // export metrics MetricsConfiguration metricsConfig = MetricsConfiguration.builder() - .setInputPCollection("Reshuffle fanout/Values/Values/Map.out0") - .setInputPCollectionV2("Reshuffle fanout/Values/Values/Map/ParMultiDo(Anonymous).out0") + .setInputPCollection("Reshuffle fanout/ExpandIterable.out0") .setOutputPCollection("Counting element.out0") - .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") .build(); - try { - Map metrics = getMetrics(launchInfo, metricsConfig); - if (configuration.exportMetricsToInfluxDB) { - Collection namedTestResults = new ArrayList<>(); - for (Map.Entry entry : metrics.entrySet()) { - NamedTestResult metricResult = - NamedTestResult.create(TEST_ID, TEST_TIMESTAMP, entry.getKey(), entry.getValue()); - namedTestResults.add(metricResult); - } - IOITMetrics.publishToInflux(TEST_ID, TEST_TIMESTAMP, namedTestResults, influxDBSettings); - } else { - exportMetricsToBigQuery(launchInfo, metrics); - } - } catch (ParseException | 
InterruptedException e) { - throw new RuntimeException(e); - } + exportMetrics( + launchInfo, metricsConfig, configuration.exportMetricsToInfluxDB, influxDBSettings); } abstract static class FormatFn implements SerializableFunction { @@ -475,7 +433,7 @@ static class Configuration extends SyntheticSourceOptions { @JsonProperty public String writeMethod = "DEFAULT"; /** BigQuery write format: AVRO/JSON. */ - @JsonProperty public String writeFormat = "AVRO"; + @JsonProperty public String writeFormat = WriteFormat.AVRO.name(); /** * Rate of generated elements sent to the source table. Will run with a minimum of 1k rows per @@ -491,7 +449,7 @@ static class Configuration extends SyntheticSourceOptions { * InfluxDB and displayed using Grafana. If set to false, metrics will be exported to BigQuery * and displayed with Looker Studio. */ - @JsonProperty public boolean exportMetricsToInfluxDB = false; + @JsonProperty public boolean exportMetricsToInfluxDB = true; /** InfluxDB measurement to publish results to. * */ @JsonProperty public String influxMeasurement = BigQueryIOST.class.getName(); diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOST.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOST.java index 4821992381b8..4abcee8e6d59 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOST.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOST.java @@ -18,15 +18,14 @@ package org.apache.beam.it.gcp.bigtable; import static org.apache.beam.it.gcp.bigtable.BigtableResourceManagerUtils.generateTableId; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.bigtable.v2.Mutation; import com.google.protobuf.ByteString; import java.io.IOException; import java.io.Serializable; -import java.text.ParseException; import java.time.Duration; import java.util.List; import java.util.Map; @@ -121,6 +120,15 @@ public void setup() throws IOException { } // Use streaming pipeline to write records writePipeline.getOptions().as(StreamingOptions.class).setStreaming(true); + + if (configuration.exportMetricsToInfluxDB) { + configuration.influxHost = + TestProperties.getProperty("influxHost", "", TestProperties.Type.PROPERTY); + configuration.influxDatabase = + TestProperties.getProperty("influxDatabase", "", TestProperties.Type.PROPERTY); + configuration.influxMeasurement = + TestProperties.getProperty("influxMeasurement", "", TestProperties.Type.PROPERTY); + } } @After @@ -149,7 +157,7 @@ public void teardown() { /** Run stress test with configurations specified by TestProperties. 
*/ @Test - public void runTest() throws IOException, ParseException, InterruptedException { + public void runTest() throws IOException { if (configuration.exportMetricsToInfluxDB) { influxDBSettings = InfluxDBSettings.builder() @@ -186,7 +194,9 @@ public void runTest() throws IOException, ParseException, InterruptedException { readInfo.jobId(), getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); - assertEquals(writeNumRecords, readNumRecords, 0); + // Assert that writeNumRecords equals or greater than readNumRecords since there might be + // duplicates when testing big amount of data + assertTrue(writeNumRecords >= readNumRecords); } finally { // clean up write streaming pipeline if (pipelineLauncher.getJobStatus(project, region, writeInfo.jobId()) @@ -329,7 +339,7 @@ static class Configuration extends SyntheticSourceOptions { * InfluxDB and displayed using Grafana. If set to false, metrics will be exported to BigQuery * and displayed with Looker Studio. */ - @JsonProperty public boolean exportMetricsToInfluxDB = false; + @JsonProperty public boolean exportMetricsToInfluxDB = true; /** InfluxDB measurement to publish results to. * */ @JsonProperty public String influxMeasurement = BigTableIOST.class.getName(); diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubSubIOST.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubSubIOST.java new file mode 100644 index 000000000000..cb96db40b749 --- /dev/null +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubSubIOST.java @@ -0,0 +1,507 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.it.gcp.pubsub; + +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.protobuf.ByteString; +import com.google.pubsub.v1.SubscriptionName; +import com.google.pubsub.v1.TopicName; +import java.io.IOException; +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.beam.it.common.PipelineLauncher; +import org.apache.beam.it.common.PipelineOperator; +import org.apache.beam.it.common.TestProperties; +import org.apache.beam.it.common.utils.ResourceManagerUtils; +import org.apache.beam.it.gcp.IOStressTestBase; +import org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions; +import org.apache.beam.sdk.extensions.protobuf.Proto3SchemaMessages.Primitive; +import org.apache.beam.sdk.io.Read; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubOptions; +import org.apache.beam.sdk.io.synthetic.SyntheticSourceOptions; +import org.apache.beam.sdk.io.synthetic.SyntheticUnboundedSource; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestPipelineOptions; +import org.apache.beam.sdk.testutils.publishing.InfluxDBSettings; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Reshuffle; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; + +/** + * PubSubIO stress test. The test is designed to assess the performance of PubSubIO under various + * conditions. + * + *

<p>Usage: <br>
+ * - To run medium-scale stress tests: {@code gradle + * :it:google-cloud-platform:PubSubStressTestMedium} - To run large-scale stress tests: {@code + * gradle :it:google-cloud-platform:PubSubStressTestLarge} + */ +public class PubSubIOST extends IOStressTestBase { + private static final int NUMBER_OF_BUNDLES_FOR_MEDIUM = 20; + private static final int NUMBER_OF_BUNDLES_FOR_LARGE = 200; + private static final String READ_ELEMENT_METRIC_NAME = "read_count"; + private static final String WRITE_ELEMENT_METRIC_NAME = "write_count"; + private static final String MAP_RECORDS_STEP_NAME = "Map records"; + private static final String WRITE_TO_PUBSUB_STEP_NAME = "Write to PubSub"; + private static final Map TEST_CONFIGS_PRESET; + private static TopicName topicName; + private static String testConfigName; + private static Configuration configuration; + private static SubscriptionName subscription; + private static InfluxDBSettings influxDBSettings; + private static PubsubResourceManager resourceManager; + + @Rule public transient TestPipeline writePipeline = TestPipeline.create(); + @Rule public transient TestPipeline readPipeline = TestPipeline.create(); + + static { + try { + TEST_CONFIGS_PRESET = + ImmutableMap.of( + "medium", + Configuration.fromJsonString( + "{\"numRecords\":2000000,\"rowsPerSecond\":25000,\"minutes\":10,\"valueSizeBytes\":1000,\"pipelineTimeout\":20,\"runner\":\"DataflowRunner\"}", + Configuration.class), + "large", + Configuration.fromJsonString( + "{\"numRecords\":20000000,\"rowsPerSecond\":25000,\"minutes\":40,\"valueSizeBytes\":1000,\"pipelineTimeout\":70,\"runner\":\"DataflowRunner\"}", + Configuration.class)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Before + public void setup() throws IOException { + resourceManager = + PubsubResourceManager.builder("io-pubsub-st", project, CREDENTIALS_PROVIDER).build(); + topicName = resourceManager.createTopic("topic"); + subscription = resourceManager.createSubscription(topicName, "subscription"); + PipelineOptionsFactory.register(TestPipelineOptions.class); + + // parse configuration + testConfigName = + TestProperties.getProperty("configuration", "local", TestProperties.Type.PROPERTY); + configuration = TEST_CONFIGS_PRESET.get(testConfigName); + if (configuration == null) { + try { + configuration = Configuration.fromJsonString(testConfigName, Configuration.class); + } catch (IOException e) { + throw new IllegalArgumentException( + String.format( + "Unknown test configuration: [%s]. Pass to a valid configuration json, or use" + + " config presets: %s", + testConfigName, TEST_CONFIGS_PRESET.keySet())); + } + } + + // Explicitly set up number of bundles in SyntheticUnboundedSource since it has a bug in + // implementation where + // number of lost data in streaming pipeline equals to number of initial bundles. + configuration.forceNumInitialBundles = + testConfigName.equals("medium") + ? 
NUMBER_OF_BUNDLES_FOR_MEDIUM + : NUMBER_OF_BUNDLES_FOR_LARGE; + + // tempLocation needs to be set for DataflowRunner + if (!Strings.isNullOrEmpty(tempBucketName)) { + String tempLocation = String.format("gs://%s/temp/", tempBucketName); + writePipeline.getOptions().as(TestPipelineOptions.class).setTempRoot(tempLocation); + writePipeline.getOptions().setTempLocation(tempLocation); + readPipeline.getOptions().as(TestPipelineOptions.class).setTempRoot(tempLocation); + readPipeline.getOptions().setTempLocation(tempLocation); + } + writePipeline.getOptions().as(PubsubOptions.class).setProject(project); + readPipeline.getOptions().as(PubsubOptions.class).setProject(project); + + if (configuration.exportMetricsToInfluxDB) { + configuration.influxHost = + TestProperties.getProperty("influxHost", "", TestProperties.Type.PROPERTY); + configuration.influxDatabase = + TestProperties.getProperty("influxDatabase", "", TestProperties.Type.PROPERTY); + configuration.influxMeasurement = + TestProperties.getProperty("influxMeasurement", "", TestProperties.Type.PROPERTY); + } + } + + @After + public void tearDownClass() { + ResourceManagerUtils.cleanResources(resourceManager); + } + + @Test + public void testStringWriteAndRead() throws IOException { + configuration.writeAndReadFormat = WriteAndReadFormat.STRING.toString(); + testWriteAndRead(); + } + + @Test + public void testAvroGenericClassWriteAndRead() throws IOException { + configuration.writeAndReadFormat = WriteAndReadFormat.AVRO.toString(); + testWriteAndRead(); + } + + @Test + public void testProtoPrimitiveWriteAndRead() throws IOException { + configuration.writeAndReadFormat = WriteAndReadFormat.PROTO.toString(); + testWriteAndRead(); + } + + @Test + public void testPubsubMessageWriteAndRead() throws IOException { + configuration.writeAndReadFormat = WriteAndReadFormat.PUBSUB_MESSAGE.toString(); + testWriteAndRead(); + } + + public void testWriteAndRead() throws IOException { + if (configuration.exportMetricsToInfluxDB) { + influxDBSettings = + InfluxDBSettings.builder() + .withHost(configuration.influxHost) + .withDatabase(configuration.influxDatabase) + .withMeasurement( + configuration.influxMeasurement + + "_" + + testConfigName + + "_" + + configuration.writeAndReadFormat) + .get(); + } + + WriteAndReadFormat format = WriteAndReadFormat.valueOf(configuration.writeAndReadFormat); + PipelineLauncher.LaunchInfo writeLaunchInfo = generateDataAndWrite(format); + PipelineLauncher.LaunchInfo readLaunchInfo = testRead(format); + try { + PipelineOperator.Result readResult = + pipelineOperator.waitUntilDone( + createConfig(readLaunchInfo, Duration.ofMinutes(configuration.pipelineTimeout))); + + // Check the initial launch didn't fail + assertNotEquals(PipelineOperator.Result.LAUNCH_FAILED, readResult); + + // check metrics + double writeNumRecords = + pipelineLauncher.getMetric( + project, + region, + writeLaunchInfo.jobId(), + getBeamMetricsName(PipelineMetricsType.COUNTER, WRITE_ELEMENT_METRIC_NAME)); + double readNumRecords = + pipelineLauncher.getMetric( + project, + region, + readLaunchInfo.jobId(), + getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); + + // Assert that writeNumRecords equals or greater than readNumRecords since there might be + // duplicates when testing big amount of data + assertTrue(writeNumRecords >= readNumRecords); + + // export metrics + MetricsConfiguration writeMetricsConfig = + MetricsConfiguration.builder() + .setInputPCollection("Map records.out0") + .setInputPCollectionV2("Map 
records/ParMultiDo(MapKVToPubSubType).out0") + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") + .build(); + + MetricsConfiguration readMetricsConfig = + MetricsConfiguration.builder() + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") + .build(); + + exportMetrics( + writeLaunchInfo, + writeMetricsConfig, + configuration.exportMetricsToInfluxDB, + influxDBSettings); + exportMetrics( + readLaunchInfo, + readMetricsConfig, + configuration.exportMetricsToInfluxDB, + influxDBSettings); + } finally { + cancelJobIfRunning(writeLaunchInfo); + cancelJobIfRunning(readLaunchInfo); + } + } + + /** + * The method creates a pipeline to simulate data generation and write operations to PubSub, based + * on the specified configuration parameters. The stress test involves varying the load + * dynamically over time, with options to use configurable parameters. + */ + private PipelineLauncher.LaunchInfo generateDataAndWrite(WriteAndReadFormat format) + throws IOException { + int startMultiplier = + Math.max(configuration.rowsPerSecond, DEFAULT_ROWS_PER_SECOND) / DEFAULT_ROWS_PER_SECOND; + List loadPeriods = + getLoadPeriods(configuration.minutes, DEFAULT_LOAD_INCREASE_ARRAY); + + PCollection> dataFromSource = + writePipeline.apply( + "Read from source", Read.from(new SyntheticUnboundedSource(configuration))); + + if (startMultiplier > 1) { + dataFromSource = + dataFromSource + .apply( + "One input to multiple outputs", + ParDo.of(new MultiplierDoFn<>(startMultiplier, loadPeriods))) + .apply("Reshuffle fanout", Reshuffle.of()) + .apply("Counting element", ParDo.of(new CountingFn<>(WRITE_ELEMENT_METRIC_NAME))); + ; + } + + switch (format) { + case STRING: + dataFromSource + .apply(MAP_RECORDS_STEP_NAME, ParDo.of(new MapKVtoString())) + .apply(WRITE_TO_PUBSUB_STEP_NAME, PubsubIO.writeStrings().to(topicName.toString())); + break; + case AVRO: + dataFromSource + .apply(MAP_RECORDS_STEP_NAME, ParDo.of(new MapKVtoGenericClass())) + .apply( + WRITE_TO_PUBSUB_STEP_NAME, + PubsubIO.writeAvros(GenericClass.class).to(topicName.toString())); + break; + case PROTO: + dataFromSource + .apply(MAP_RECORDS_STEP_NAME, ParDo.of(new MapKVtoPrimitiveProto())) + .apply( + WRITE_TO_PUBSUB_STEP_NAME, + PubsubIO.writeProtos(Primitive.class).to(topicName.toString())); + break; + case PUBSUB_MESSAGE: + dataFromSource + .apply(MAP_RECORDS_STEP_NAME, ParDo.of(new MapKVtoPubSubMessage())) + .apply(WRITE_TO_PUBSUB_STEP_NAME, PubsubIO.writeMessages().to(topicName.toString())); + break; + } + + PipelineLauncher.LaunchConfig options = + PipelineLauncher.LaunchConfig.builder("write-pubsub") + .setSdk(PipelineLauncher.Sdk.JAVA) + .setPipeline(writePipeline) + .addParameter("runner", configuration.runner) + .addParameter( + "autoscalingAlgorithm", + DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType.THROUGHPUT_BASED + .toString()) + .addParameter("numWorkers", String.valueOf(configuration.numWorkers)) + .addParameter("maxNumWorkers", String.valueOf(configuration.maxNumWorkers)) + .addParameter("streaming", "true") + .addParameter("experiments", "use_runner_v2") + .build(); + + return pipelineLauncher.launch(project, region, options); + } + + private PipelineLauncher.LaunchInfo testRead(WriteAndReadFormat format) throws IOException { + PubsubIO.Read read = null; + + switch (format) { + case STRING: + read = PubsubIO.readStrings().fromSubscription(subscription.toString()); + break; + case AVRO: + 
read = PubsubIO.readAvros(GenericClass.class).fromSubscription(subscription.toString()); + break; + case PROTO: + read = PubsubIO.readProtos(Primitive.class).fromSubscription(subscription.toString()); + break; + case PUBSUB_MESSAGE: + read = PubsubIO.readMessages().fromSubscription(subscription.toString()); + break; + } + + readPipeline + .apply("Read from PubSub", read) + .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); + + PipelineLauncher.LaunchConfig readOptions = + PipelineLauncher.LaunchConfig.builder("read-pubsub") + .setSdk(PipelineLauncher.Sdk.JAVA) + .setPipeline(readPipeline) + .addParameter("runner", configuration.runner) + .addParameter("streaming", "true") + .addParameter("experiments", "use_runner_v2") + .addParameter("numWorkers", String.valueOf(configuration.numWorkers)) + .addParameter("maxNumWorkers", String.valueOf(configuration.maxNumWorkers)) + .build(); + + return pipelineLauncher.launch(project, region, readOptions); + } + + private void cancelJobIfRunning(PipelineLauncher.LaunchInfo pipelineLaunchInfo) + throws IOException { + if (pipelineLauncher.getJobStatus(project, region, pipelineLaunchInfo.jobId()) + == PipelineLauncher.JobState.RUNNING) { + pipelineLauncher.cancelJob(project, region, pipelineLaunchInfo.jobId()); + } + } + + /** Mapper class to convert data from KV to String. */ + private static class MapKVtoString extends DoFn, String> { + @ProcessElement + public void process(ProcessContext context) { + byte[] byteValue = Objects.requireNonNull(context.element()).getValue(); + context.output(ByteString.copyFrom(byteValue).toString(StandardCharsets.UTF_8)); + } + } + + /** Mapper class to convert data from KV to GenericClass. */ + private static class MapKVtoGenericClass extends DoFn, GenericClass> { + @ProcessElement + public void process(ProcessContext context) { + byte[] byteValue = Objects.requireNonNull(context.element()).getValue(); + GenericClass pojo = new GenericClass(byteValue); + context.output(pojo); + } + } + + /** Mapper class to convert data from KV to Proto Primitive. */ + private static class MapKVtoPrimitiveProto extends DoFn, Primitive> { + @ProcessElement + public void process(ProcessContext context) { + byte[] byteValue = Objects.requireNonNull(context.element()).getValue(); + Primitive proto = + Primitive.newBuilder() + .setPrimitiveBytes(ByteString.copyFrom(byteValue)) + .setPrimitiveInt32(ByteBuffer.wrap(byteValue).getInt()) + .build(); + context.output(proto); + } + } + + /** Mapper class to convert data from KV to PubSubMessage. */ + private static class MapKVtoPubSubMessage extends DoFn, PubsubMessage> { + @ProcessElement + public void process(ProcessContext context) { + byte[] byteValue = Objects.requireNonNull(context.element()).getValue(); + PubsubMessage pubsubMessage = new PubsubMessage(byteValue, Collections.emptyMap()); + context.output(pubsubMessage); + } + } + + /** Example of Generic class to test PubSubIO.writeAvros()/readAvros methods. 
*/ + static class GenericClass implements Serializable { + byte[] byteField; + + public GenericClass() {} + + public GenericClass(byte[] byteField) { + this.byteField = byteField; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(getClass()).add("byteField", byteField).toString(); + } + + @Override + public int hashCode() { + return Objects.hash(Arrays.hashCode(byteField)); + } + + @Override + public boolean equals(@Nullable Object other) { + if (other == null || !(other instanceof GenericClass)) { + return false; + } + GenericClass o = (GenericClass) other; + return Arrays.equals(byteField, o.byteField); + } + } + + private enum WriteAndReadFormat { + STRING, + AVRO, + PROTO, + PUBSUB_MESSAGE + } + + /** Options for PubSub IO load test. */ + static class Configuration extends SyntheticSourceOptions { + /** Pipeline timeout in minutes. Must be a positive value. */ + @JsonProperty public int pipelineTimeout = 20; + + /** Runner specified to run the pipeline. */ + @JsonProperty public String runner = "DirectRunner"; + + /** PubSub write and read format: STRING/AVRO/PROTO/PUBSUB_MESSAGE. */ + @JsonProperty public String writeAndReadFormat = "STRING"; + + /** Number of workers for the pipeline. */ + @JsonProperty public int numWorkers = 20; + + /** Maximum number of workers for the pipeline. */ + @JsonProperty public int maxNumWorkers = 100; + + /** + * Rate of generated elements sent to the source table. Will run with a minimum of 1k rows per + * second. + */ + @JsonProperty public int rowsPerSecond = DEFAULT_ROWS_PER_SECOND; + + /** Rows will be generated for this many minutes. */ + @JsonProperty public int minutes = 15; + + /** + * Determines the destination for exporting metrics. If set to true, metrics will be exported to + * InfluxDB and displayed using Grafana. If set to false, metrics will be exported to BigQuery + * and displayed with Looker Studio. + */ + @JsonProperty public boolean exportMetricsToInfluxDB = true; + + /** InfluxDB measurement to publish results to. * */ + @JsonProperty public String influxMeasurement; + + /** InfluxDB host to publish metrics. * */ + @JsonProperty public String influxHost; + + /** InfluxDB database to publish metrics. 
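For context, the stress test resolves its Configuration either from the named presets ("medium"/"large") or from an inline JSON string passed via the "configuration" property. A minimal sketch of building such a custom configuration, assuming it lives inside PubSubIOST (the helper name and the field values are arbitrary illustrations, not recommended settings):

// Hypothetical helper inside PubSubIOST; every key below maps to a field declared on
// Configuration or inherited from SyntheticSourceOptions in this patch.
private static Configuration customConfiguration() throws IOException {
  String json =
      "{\"numRecords\":1000000,\"rowsPerSecond\":20000,\"minutes\":5,"
          + "\"valueSizeBytes\":500,\"pipelineTimeout\":30,"
          + "\"runner\":\"DataflowRunner\",\"writeAndReadFormat\":\"AVRO\"}";
  return Configuration.fromJsonString(json, Configuration.class);
}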
* */ + @JsonProperty public String influxDatabase; + } +} diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubSubIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubsubIOLT.java similarity index 82% rename from it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubSubIOLT.java rename to it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubsubIOLT.java index f5bd4f59149b..77f32a94103d 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubSubIOLT.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/pubsub/PubsubIOLT.java @@ -22,7 +22,6 @@ import static org.junit.Assert.assertTrue; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.cloud.Timestamp; import com.google.protobuf.ByteString; import com.google.pubsub.v1.SubscriptionName; import com.google.pubsub.v1.TopicName; @@ -30,15 +29,11 @@ import java.io.Serializable; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import java.text.ParseException; import java.time.Duration; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.Map; import java.util.Objects; -import java.util.UUID; import org.apache.beam.it.common.PipelineLauncher; import org.apache.beam.it.common.PipelineOperator; import org.apache.beam.it.common.TestProperties; @@ -55,8 +50,6 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.TestPipelineOptions; -import org.apache.beam.sdk.testutils.NamedTestResult; -import org.apache.beam.sdk.testutils.metrics.IOITMetrics; import org.apache.beam.sdk.testutils.publishing.InfluxDBSettings; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; @@ -71,8 +64,15 @@ import org.junit.Rule; import org.junit.Test; -/** PubSubIO performance tests. */ -public class PubSubIOLT extends IOLoadTestBase { +/** + * PubsubIO load test. + * + *

<p>Usage:
+ * - To run medium-scale load tests: {@code gradle :it:google-cloud-platform:PubsubLoadTestMedium} + *
+ * - To run large-scale load tests: {@code gradle :it:google-cloud-platform:PubsubLoadTestLarge} + */ +public class PubsubIOLT extends IOLoadTestBase { private static final int NUMBER_OF_BUNDLES_FOR_LOCAL = 10; private static final int NUMBER_OF_BUNDLES_FOR_MEDIUM_AND_LARGE = 20; @@ -95,17 +95,17 @@ public class PubSubIOLT extends IOLoadTestBase { TEST_CONFIGS_PRESET = ImmutableMap.of( "local", - PubSubIOLT.Configuration.fromJsonString( + Configuration.fromJsonString( "{\"numRecords\":200,\"valueSizeBytes\":1000,\"pipelineTimeout\":7,\"runner\":\"DirectRunner\",\"numWorkers\":1}", - PubSubIOLT.Configuration.class), // 0.2 MB + Configuration.class), // 0.2 MB "medium", - PubSubIOLT.Configuration.fromJsonString( + Configuration.fromJsonString( "{\"numRecords\":10000000,\"valueSizeBytes\":1000,\"pipelineTimeout\":20,\"runner\":\"DataflowRunner\",\"numWorkers\":10}", - PubSubIOLT.Configuration.class), // 10 GB + Configuration.class), // 10 GB "large", - PubSubIOLT.Configuration.fromJsonString( + Configuration.fromJsonString( "{\"numRecords\":100000000,\"valueSizeBytes\":1000,\"pipelineTimeout\":50,\"runner\":\"DataflowRunner\",\"numWorkers\":20}", - PubSubIOLT.Configuration.class) // 100 GB + Configuration.class) // 100 GB ); } catch (IOException e) { throw new RuntimeException(e); @@ -126,8 +126,7 @@ public void setup() throws IOException { configuration = TEST_CONFIGS_PRESET.get(testConfigName); if (configuration == null) { try { - configuration = - PubSubIOLT.Configuration.fromJsonString(testConfigName, PubSubIOLT.Configuration.class); + configuration = Configuration.fromJsonString(testConfigName, Configuration.class); } catch (IOException e) { throw new IllegalArgumentException( String.format( @@ -157,6 +156,15 @@ public void setup() throws IOException { readPipeline.getOptions().as(PubsubOptions.class).setProject(project); writePipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false); readPipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false); + + if (configuration.exportMetricsToInfluxDB) { + configuration.influxHost = + TestProperties.getProperty("influxHost", "", TestProperties.Type.PROPERTY); + configuration.influxDatabase = + TestProperties.getProperty("influxDatabase", "", TestProperties.Type.PROPERTY); + configuration.influxMeasurement = + TestProperties.getProperty("influxMeasurement", "", TestProperties.Type.PROPERTY); + } } @After @@ -206,11 +214,11 @@ public void testWriteAndRead() throws IOException { WriteAndReadFormat format = WriteAndReadFormat.valueOf(configuration.writeAndReadFormat); PipelineLauncher.LaunchInfo writeLaunchInfo = testWrite(format); PipelineLauncher.LaunchInfo readLaunchInfo = testRead(format); - try { - PipelineOperator.Result readResult = - pipelineOperator.waitUntilDone( - createConfig(readLaunchInfo, Duration.ofMinutes(configuration.pipelineTimeout))); + PipelineOperator.Result readResult = + pipelineOperator.waitUntilDone( + createConfig(readLaunchInfo, Duration.ofMinutes(configuration.pipelineTimeout))); + try { // Check the initial launch didn't fail assertNotEquals(PipelineOperator.Result.LAUNCH_FAILED, readResult); // streaming read pipeline does not end itself @@ -218,41 +226,44 @@ public void testWriteAndRead() throws IOException { assertEquals( PipelineLauncher.JobState.RUNNING, pipelineLauncher.getJobStatus(project, region, readLaunchInfo.jobId())); - - // check metrics - double numRecords = - pipelineLauncher.getMetric( - project, - region, - readLaunchInfo.jobId(), - getBeamMetricsName(PipelineMetricsType.COUNTER, 
READ_ELEMENT_METRIC_NAME)); - - // Assert that actual data equals or greater than expected data number since there might be - // duplicates when testing big amount of data - long expectedDataNum = configuration.numRecords - configuration.forceNumInitialBundles; - assertTrue(numRecords >= expectedDataNum); - - // export metrics - MetricsConfiguration writeMetricsConfig = - MetricsConfiguration.builder() - .setInputPCollection("Map records.out0") - .setInputPCollectionV2("Map records/ParMultiDo(MapKVToV).out0") - .build(); - - MetricsConfiguration readMetricsConfig = - MetricsConfiguration.builder() - .setOutputPCollection("Counting element.out0") - .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") - .build(); - - exportMetrics(writeLaunchInfo, writeMetricsConfig); - exportMetrics(readLaunchInfo, readMetricsConfig); - } catch (ParseException | InterruptedException e) { - throw new RuntimeException(e); } finally { cancelJobIfRunning(writeLaunchInfo); cancelJobIfRunning(readLaunchInfo); } + + // check metrics + double numRecords = + pipelineLauncher.getMetric( + project, + region, + readLaunchInfo.jobId(), + getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); + + // Assert that actual data equals or greater than expected data number since there might be + // duplicates when testing big amount of data + long expectedDataNum = configuration.numRecords - configuration.forceNumInitialBundles; + assertTrue(numRecords >= expectedDataNum); + + // export metrics + MetricsConfiguration writeMetricsConfig = + MetricsConfiguration.builder() + .setInputPCollection("Map records.out0") + .setInputPCollectionV2("Map records/ParMultiDo(MapKVToV).out0") + .build(); + + MetricsConfiguration readMetricsConfig = + MetricsConfiguration.builder() + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") + .build(); + + exportMetrics( + writeLaunchInfo, + writeMetricsConfig, + configuration.exportMetricsToInfluxDB, + influxDBSettings); + exportMetrics( + readLaunchInfo, readMetricsConfig, configuration.exportMetricsToInfluxDB, influxDBSettings); } private PipelineLauncher.LaunchInfo testWrite(WriteAndReadFormat format) throws IOException { @@ -341,27 +352,6 @@ private void cancelJobIfRunning(PipelineLauncher.LaunchInfo pipelineLaunchInfo) } } - private void exportMetrics( - PipelineLauncher.LaunchInfo launchInfo, MetricsConfiguration metricsConfig) - throws IOException, ParseException, InterruptedException { - - Map metrics = getMetrics(launchInfo, metricsConfig); - String testId = UUID.randomUUID().toString(); - String testTimestamp = Timestamp.now().toString(); - - if (configuration.exportMetricsToInfluxDB) { - Collection namedTestResults = new ArrayList<>(); - for (Map.Entry entry : metrics.entrySet()) { - NamedTestResult metricResult = - NamedTestResult.create(testId, testTimestamp, entry.getKey(), entry.getValue()); - namedTestResults.add(metricResult); - } - IOITMetrics.publishToInflux(testId, testTimestamp, namedTestResults, influxDBSettings); - } else { - exportMetricsToBigQuery(launchInfo, metrics); - } - } - /** Mapper class to convert data from KV to String. */ private static class MapKVtoString extends DoFn, String> { @ProcessElement @@ -395,7 +385,7 @@ public void process(ProcessContext context) { } } - /** Mapper class to convert data from KV to PubSubMessage. */ + /** Mapper class to convert data from KV to PubsubMessage. 
*/ private static class MapKVtoPubSubMessage extends DoFn, PubsubMessage> { @ProcessElement public void process(ProcessContext context) { @@ -405,7 +395,7 @@ public void process(ProcessContext context) { } } - /** Example of Generic class to test PubSubIO.writeAvros()/readAvros methods. */ + /** Example of Generic class to test PubsubIO.writeAvros() / readAvros() methods. */ static class GenericClass implements Serializable { byte[] byteField; @@ -442,7 +432,7 @@ private enum WriteAndReadFormat { PUBSUB_MESSAGE } - /** Options for PubSub IO load test. */ + /** Options for Pubsub IO load test. */ static class Configuration extends SyntheticSourceOptions { /** Pipeline timeout in minutes. Must be a positive value. */ @JsonProperty public int pipelineTimeout = 20; @@ -461,7 +451,7 @@ static class Configuration extends SyntheticSourceOptions { * InfluxDB and displayed using Grafana. If set to false, metrics will be exported to BigQuery * and displayed with Looker Studio. */ - @JsonProperty public boolean exportMetricsToInfluxDB = false; + @JsonProperty public boolean exportMetricsToInfluxDB = true; /** InfluxDB measurement to publish results to. * */ @JsonProperty public String influxMeasurement; diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOST.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOST.java index 00cf4994e90f..ba8b5f57e47e 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOST.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOST.java @@ -18,8 +18,8 @@ package org.apache.beam.it.gcp.spanner; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.cloud.ByteArray; @@ -191,7 +191,9 @@ public void runTest() throws IOException, ParseException, InterruptedException { readInfo.jobId(), getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); - assertEquals(writeNumRecords, readNumRecords, 0); + // Assert that writeNumRecords equals or greater than readNumRecords since there might be + // duplicates when testing big amount of data + assertTrue(writeNumRecords >= readNumRecords); } finally { // clean up write streaming pipeline if (pipelineLauncher.getJobStatus(project, region, writeInfo.jobId()) diff --git a/it/kafka/build.gradle b/it/kafka/build.gradle index 96f915a1d846..158de54d89e4 100644 --- a/it/kafka/build.gradle +++ b/it/kafka/build.gradle @@ -46,5 +46,13 @@ dependencies { testImplementation project(path: ":sdks:java:extensions:google-cloud-platform-core", configuration: "testRuntimeMigration") } -tasks.register("KafkaStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'kafka', 'KafkaIOST', ['configuration':'medium','bootstrapServers':System.getProperty("bootstrapServers"),'useDataflowRunnerV2':System.getProperty("useDataflowRunnerV2"),'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("KafkaStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'kafka', 'KafkaIOST', ['configuration':'large','bootstrapServers':System.getProperty("bootstrapServers"),'useDataflowRunnerV2':System.getProperty("useDataflowRunnerV2"),'project':'apache-beam-testing', 
'artifactBucket':'io-performance-temp']) +tasks.register( + "KafkaStressTestMedium", IoPerformanceTestUtilities.IoPerformanceTest, project, 'kafka', 'KafkaIOST', + ['configuration':'medium','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) +tasks.register( + "KafkaStressTestLarge", IoPerformanceTestUtilities.IoPerformanceTest, project, 'kafka', 'KafkaIOST', + ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp'] + + System.properties +) diff --git a/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOST.java b/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOST.java index 4ca34328637f..505b51cec04a 100644 --- a/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOST.java +++ b/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOST.java @@ -17,12 +17,11 @@ */ package org.apache.beam.it.kafka; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; import com.fasterxml.jackson.annotation.JsonProperty; import java.io.IOException; -import java.text.ParseException; import java.time.Duration; import java.time.ZoneId; import java.time.format.DateTimeFormatter; @@ -132,6 +131,15 @@ public void setup() { // Use streaming pipeline to write and read records writePipeline.getOptions().as(StreamingOptions.class).setStreaming(true); readPipeline.getOptions().as(StreamingOptions.class).setStreaming(true); + + if (configuration.exportMetricsToInfluxDB) { + configuration.influxHost = + TestProperties.getProperty("influxHost", "", TestProperties.Type.PROPERTY); + configuration.influxDatabase = + TestProperties.getProperty("influxDatabase", "", TestProperties.Type.PROPERTY); + configuration.influxMeasurement = + TestProperties.getProperty("influxMeasurement", "", TestProperties.Type.PROPERTY); + } } private static final Map TEST_CONFIGS_PRESET; @@ -155,7 +163,7 @@ public void setup() { /** Run stress test with configurations specified by TestProperties. */ @Test - public void testWriteAndRead() throws IOException, ParseException, InterruptedException { + public void testWriteAndRead() throws IOException { if (configuration.exportMetricsToInfluxDB) { influxDBSettings = InfluxDBSettings.builder() @@ -173,10 +181,6 @@ public void testWriteAndRead() throws IOException, ParseException, InterruptedEx pipelineOperator.waitUntilDone( createConfig(readInfo, Duration.ofMinutes(configuration.pipelineTimeout))); assertNotEquals(PipelineOperator.Result.LAUNCH_FAILED, readResult); - // streaming read pipeline does not end itself - assertEquals( - PipelineLauncher.JobState.RUNNING, - pipelineLauncher.getJobStatus(project, region, readInfo.jobId())); // Delete topic after test run adminClient.deleteTopics(Collections.singleton(kafkaTopic)); @@ -193,7 +197,10 @@ public void testWriteAndRead() throws IOException, ParseException, InterruptedEx region, readInfo.jobId(), getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); - assertEquals(writeNumRecords, readNumRecords, 0); + + // Assert that writeNumRecords equals or greater than readNumRecords since there might be + // duplicates when testing big amount of data + assertTrue(writeNumRecords >= readNumRecords); } finally { // clean up pipelines if (pipelineLauncher.getJobStatus(project, region, writeInfo.jobId()) @@ -354,7 +361,7 @@ static class Configuration extends SyntheticSourceOptions { * InfluxDB and displayed using Grafana. 
If set to false, metrics will be exported to BigQuery * and displayed with Looker Studio. */ - @JsonProperty public boolean exportMetricsToInfluxDB = false; + @JsonProperty public boolean exportMetricsToInfluxDB = true; /** InfluxDB measurement to publish results to. * */ @JsonProperty public String influxMeasurement = KafkaIOST.class.getName(); diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java index f0514c69891b..909789bbb129 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java @@ -19,6 +19,7 @@ import org.apache.beam.sdk.options.ApplicationNameOptions; import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.DefaultValueFactory; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.FileStagingOptions; import org.apache.beam.sdk.options.PipelineOptions; @@ -228,18 +229,51 @@ public interface FlinkPipelineOptions void setRetainExternalizedCheckpointsOnCancellation(Boolean retainOnCancellation); - @Description("The maximum number of elements in a bundle.") - @Default.Long(1000) + @Description( + "The maximum number of elements in a bundle. Default values are 1000 for a streaming job and 1,000,000 for batch") + @Default.InstanceFactory(MaxBundleSizeFactory.class) Long getMaxBundleSize(); void setMaxBundleSize(Long size); - @Description("The maximum time to wait before finalising a bundle (in milliseconds).") - @Default.Long(1000) + /** + * Maximum bundle size factory. For a streaming job it's desireable to keep bundle size small to + * optimize latency. In batch, we optimize for throughput and hence bundle size is kept large. + */ + class MaxBundleSizeFactory implements DefaultValueFactory { + @Override + public Long create(PipelineOptions options) { + if (options.as(StreamingOptions.class).isStreaming()) { + return 1000L; + } else { + return 1000000L; + } + } + } + + @Description( + "The maximum time to wait before finalising a bundle (in milliseconds). Default values are 1000 for streaming and 10,000 for batch.") + @Default.InstanceFactory(MaxBundleTimeFactory.class) Long getMaxBundleTimeMills(); void setMaxBundleTimeMills(Long time); + /** + * Maximum bundle time factory. For a streaming job it's desireable to keep the value small to + * optimize latency. In batch, we optimize for throughput and hence bundle time size is kept + * larger. + */ + class MaxBundleTimeFactory implements DefaultValueFactory { + @Override + public Long create(PipelineOptions options) { + if (options.as(StreamingOptions.class).isStreaming()) { + return 1000L; + } else { + return 10000L; + } + } + } + @Description( "Interval in milliseconds for sending latency tracking marks from the sources to the sinks. 
" + "Interval value <= 0 disables the feature.") diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java index 46458eccb83c..15ccc39c12c4 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java @@ -36,18 +36,19 @@ class FlinkTransformOverrides { static List getDefaultOverrides(FlinkPipelineOptions options) { ImmutableList.Builder builder = ImmutableList.builder(); + if (options.isStreaming()) { + builder.add( + PTransformOverride.of( + FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory + .writeFilesNeedsOverrides(), + new FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory( + checkNotNull(options)))); + } if (options.isStreaming() || options.getUseDataStreamForBatch()) { - builder - .add( - PTransformOverride.of( - FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory - .writeFilesNeedsOverrides(), - new FlinkStreamingPipelineTranslator.StreamingShardedWriteFactory( - checkNotNull(options)))) - .add( - PTransformOverride.of( - PTransformMatchers.urnEqualTo(PTransformTranslation.CREATE_VIEW_TRANSFORM_URN), - CreateStreamingFlinkView.Factory.INSTANCE)); + builder.add( + PTransformOverride.of( + PTransformMatchers.urnEqualTo(PTransformTranslation.CREATE_VIEW_TRANSFORM_URN), + CreateStreamingFlinkView.Factory.INSTANCE)); } builder .add( diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java index da8c560690a6..c20bd077c3f2 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java @@ -91,14 +91,22 @@ public void testDefaults() { assertThat(options.getStateBackendFactory(), is(nullValue())); assertThat(options.getStateBackend(), is(nullValue())); assertThat(options.getStateBackendStoragePath(), is(nullValue())); - assertThat(options.getMaxBundleSize(), is(1000L)); - assertThat(options.getMaxBundleTimeMills(), is(1000L)); assertThat(options.getExecutionModeForBatch(), is(ExecutionMode.PIPELINED.name())); assertThat(options.getUseDataStreamForBatch(), is(false)); assertThat(options.getSavepointPath(), is(nullValue())); assertThat(options.getAllowNonRestoredState(), is(false)); assertThat(options.getDisableMetrics(), is(false)); assertThat(options.getFasterCopy(), is(false)); + + assertThat(options.isStreaming(), is(false)); + assertThat(options.getMaxBundleSize(), is(1000000L)); + assertThat(options.getMaxBundleTimeMills(), is(10000L)); + + // In streaming mode bundle size and bundle time are shorter + FlinkPipelineOptions optionsStreaming = FlinkPipelineOptions.defaults(); + optionsStreaming.setStreaming(true); + assertThat(optionsStreaming.getMaxBundleSize(), is(1000L)); + assertThat(optionsStreaming.getMaxBundleTimeMills(), is(1000L)); } @Test(expected = Exception.class) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/LockFreeHistogram.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/LockFreeHistogram.java new file mode 100644 index 000000000000..bc42e1283240 --- /dev/null +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/LockFreeHistogram.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker; + +import com.google.auto.value.AutoValue; +import com.google.auto.value.extension.memoized.Memoized; +import java.io.Serializable; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLongArray; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.metrics.Histogram; +import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.util.HistogramData; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.primitives.ImmutableLongArray; + +/** + * A lock free implementation of {@link org.apache.beam.sdk.metrics.Histogram}. This class supports + * extracting delta updates with the {@link #getSnapshotAndReset} method. + */ +@ThreadSafe +@Internal +public final class LockFreeHistogram implements Histogram { + private final HistogramData.BucketType bucketType; + private final AtomicLongArray buckets; + private final MetricName name; + private final AtomicReference underflowStatistic; + private final AtomicReference overflowStatistic; + + /** + * Whether this histogram has updates that have not been extracted by {@code getSnapshotAndReset}. + * This values should be flipped to true AFTER recording a value, and flipped to false BEFORE + * extracting a snapshot. This ensures that recorded values will always be seen by a future {@code + * getSnapshotAndReset} call. + */ + private final AtomicBoolean dirty; + + /** Create a histogram. */ + public LockFreeHistogram(KV kv) { + this.name = kv.getKey(); + this.bucketType = kv.getValue(); + this.buckets = new AtomicLongArray(bucketType.getNumBuckets()); + this.underflowStatistic = + new AtomicReference(OutlierStatistic.EMPTY); + this.overflowStatistic = + new AtomicReference(OutlierStatistic.EMPTY); + this.dirty = new AtomicBoolean(false); + } + + /** + * Represents the sum and mean of a collection of numbers. Used to represent the + * underflow/overflow statistics of a histogram. 
+ */ + @AutoValue + public abstract static class OutlierStatistic implements Serializable { + abstract double sum(); + + public abstract long count(); + + public static final OutlierStatistic EMPTY = create(0, 0); + + public static OutlierStatistic create(double sum, long count) { + return new AutoValue_LockFreeHistogram_OutlierStatistic(sum, count); + } + + public OutlierStatistic combine(double value) { + return create(sum() + value, count() + 1); + } + + public double mean() { + if (count() == 0) { + return 0; + } + return sum() / count(); + } + } + + /** + * The snapshot of a histogram. The snapshot contains the overflow/underflow statistic, number of + * values recorded in each bucket, and the BucketType of the underlying histogram. + */ + @AutoValue + public abstract static class Snapshot { + public abstract OutlierStatistic underflowStatistic(); + + public abstract OutlierStatistic overflowStatistic(); + + public abstract ImmutableLongArray buckets(); + + public abstract HistogramData.BucketType bucketType(); + + public static Snapshot create( + OutlierStatistic underflowStatistic, + OutlierStatistic overflowStatistic, + ImmutableLongArray buckets, + HistogramData.BucketType bucketType) { + return new AutoValue_LockFreeHistogram_Snapshot( + underflowStatistic, overflowStatistic, buckets, bucketType); + } + + @Memoized + public long totalCount() { + long count = 0; + count += underflowStatistic().count(); + count += overflowStatistic().count(); + count += buckets().stream().sum(); + + return count; + } + } + + /** + * Extract a delta update of this histogram. Update represents values that have been recorded in + * this histogram since the last time this method was called. + * + *

<p>If this histogram is being updated concurrently with this method, then the returned snapshot is + * not guaranteed to contain those updates. However, those updates are not dropped and will be + * represented in a future call to this method. + * + *
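A minimal usage sketch of the delta-snapshot contract described above, using only the API introduced in this file (the wrapper class, metric name, bucket layout, and recorded values are arbitrary examples):

package org.apache.beam.runners.dataflow.worker;

import java.util.Optional;
import org.apache.beam.sdk.metrics.MetricName;
import org.apache.beam.sdk.util.HistogramData;
import org.apache.beam.sdk.values.KV;

public class LockFreeHistogramSketch {
  public static void main(String[] args) {
    LockFreeHistogram histogram =
        new LockFreeHistogram(
            KV.of(
                MetricName.named("ExampleSink", "latencyMs"),
                HistogramData.LinearBuckets.of(0, 10, 10))); // buckets cover [0, 100)

    histogram.update(5.0, 15.0, 250.0); // two in-bounds values, one overflow value

    // The first call returns everything recorded since construction as a delta snapshot.
    Optional<LockFreeHistogram.Snapshot> first = histogram.getSnapshotAndReset();
    // first.get().totalCount() == 3
    // first.get().overflowStatistic().count() == 1 and mean() == 250.0

    // With no further updates, a second call reports nothing new.
    Optional<LockFreeHistogram.Snapshot> second = histogram.getSnapshotAndReset();
    // second.isPresent() == false
  }
}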

If this histogram has not been updated since the last call to this method, an empty optional + * is returned. + */ + public Optional getSnapshotAndReset() { + if (!dirty.getAndSet(false)) { + return Optional.empty(); + } + + ImmutableLongArray.Builder bucketsSnapshotBuilder = + ImmutableLongArray.builder(buckets.length()); + for (int i = 0; i < buckets.length(); i++) { + bucketsSnapshotBuilder.add(buckets.getAndSet(i, 0)); + } + OutlierStatistic overflowSnapshot = overflowStatistic.getAndSet(OutlierStatistic.EMPTY); + OutlierStatistic underflowSnapshot = underflowStatistic.getAndSet(OutlierStatistic.EMPTY); + + return Optional.of( + Snapshot.create( + underflowSnapshot, overflowSnapshot, bucketsSnapshotBuilder.build(), bucketType)); + } + + @Override + public MetricName getName() { + return name; + } + + private void updateInternal(double value) { + double rangeTo = bucketType.getRangeTo(); + double rangeFrom = bucketType.getRangeFrom(); + if (value >= rangeTo) { + recordTopRecordsValue(value); + } else if (value < rangeFrom) { + recordBottomRecordsValue(value); + } else { + recordInBoundsValue(value); + } + } + + @Override + public void update(double value) { + updateInternal(value); + dirty.set(true); + } + + @Override + public void update(double... values) { + for (double value : values) { + updateInternal(value); + } + dirty.set(true); + } + + /** Record a inbounds value to the appropriate bucket. */ + private void recordInBoundsValue(double value) { + int index = bucketType.getBucketIndex(value); + if (index < 0 || index >= bucketType.getNumBuckets()) { + return; + } + + buckets.getAndIncrement(index); + } + + /** + * Record a new value in {@code overflowStatistic}. This method should only be called when a + * Histogram is recording a value greater than the upper bound of it's largest bucket. + * + * @param value + */ + private void recordTopRecordsValue(double value) { + OutlierStatistic original; + do { + original = overflowStatistic.get(); + } while (!overflowStatistic.compareAndSet(original, original.combine(value))); + } + + /** + * Record a new value in {@code underflowStatistic}. This method should only be called when a + * Histogram is recording a value smaller than the lowerbound bound of it's smallest bucket. 
+ */ + private void recordBottomRecordsValue(double value) { + OutlierStatistic original; + do { + original = underflowStatistic.get(); + } while (!underflowStatistic.compareAndSet(original, original.combine(value))); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverter.java index 8f9cbd350a25..1f54ee95dad7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverter.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverter.java @@ -70,9 +70,9 @@ private static Optional convertCounterToMetricValue( * @param outputHistogram */ private static void addOutlierStatsToHistogram( - HistogramData inputHistogram, DataflowHistogramValue outputHistogram) { - long overflowCount = inputHistogram.getTopBucketCount(); - long underflowCount = inputHistogram.getBottomBucketCount(); + LockFreeHistogram.Snapshot inputHistogram, DataflowHistogramValue outputHistogram) { + long overflowCount = inputHistogram.overflowStatistic().count(); + long underflowCount = inputHistogram.underflowStatistic().count(); if (underflowCount == 0 && overflowCount == 0) { return; } @@ -81,12 +81,12 @@ private static void addOutlierStatsToHistogram( if (underflowCount > 0) { outlierStats .setUnderflowCount(underflowCount) - .setUnderflowMean(inputHistogram.getBottomBucketMean()); + .setUnderflowMean(inputHistogram.underflowStatistic().mean()); } if (overflowCount > 0) { outlierStats .setOverflowCount(overflowCount) - .setOverflowMean(inputHistogram.getTopBucketMean()); + .setOverflowMean(inputHistogram.overflowStatistic().mean()); } outputHistogram.setOutlierStats(outlierStats); } @@ -99,8 +99,8 @@ private static void addOutlierStatsToHistogram( * Otherwise returns an empty optional. 
*/ private static Optional convertHistogramToMetricValue( - MetricName metricName, HistogramData inputHistogram) { - if (inputHistogram.getTotalCount() == 0L) { + MetricName metricName, LockFreeHistogram.Snapshot inputHistogram) { + if (inputHistogram.totalCount() == 0L) { return Optional.empty(); } @@ -111,20 +111,20 @@ private static Optional convertHistogramToMetricValue( } DataflowHistogramValue outputHistogram = new DataflowHistogramValue(); - int numberOfBuckets = inputHistogram.getBucketType().getNumBuckets(); + int numberOfBuckets = inputHistogram.bucketType().getNumBuckets(); - if (inputHistogram.getBucketType() instanceof HistogramData.LinearBuckets) { + if (inputHistogram.bucketType() instanceof HistogramData.LinearBuckets) { HistogramData.LinearBuckets buckets = - (HistogramData.LinearBuckets) inputHistogram.getBucketType(); + (HistogramData.LinearBuckets) inputHistogram.bucketType(); Linear linearOptions = new Linear() .setNumberOfBuckets(numberOfBuckets) .setWidth(buckets.getWidth()) .setStart(buckets.getStart()); outputHistogram.setBucketOptions(new BucketOptions().setLinear(linearOptions)); - } else if (inputHistogram.getBucketType() instanceof HistogramData.ExponentialBuckets) { + } else if (inputHistogram.bucketType() instanceof HistogramData.ExponentialBuckets) { HistogramData.ExponentialBuckets buckets = - (HistogramData.ExponentialBuckets) inputHistogram.getBucketType(); + (HistogramData.ExponentialBuckets) inputHistogram.bucketType(); Base2Exponent expoenntialOptions = new Base2Exponent().setNumberOfBuckets(numberOfBuckets).setScale(buckets.getScale()); outputHistogram.setBucketOptions(new BucketOptions().setExponential(expoenntialOptions)); @@ -132,12 +132,10 @@ private static Optional convertHistogramToMetricValue( return Optional.empty(); } - outputHistogram.setCount(inputHistogram.getTotalCount()); - List bucketCounts = new ArrayList<>(inputHistogram.getBucketType().getNumBuckets()); + outputHistogram.setCount(inputHistogram.totalCount()); + List bucketCounts = new ArrayList<>(inputHistogram.buckets().length()); - for (int i = 0; i < inputHistogram.getBucketType().getNumBuckets(); i++) { - bucketCounts.add(inputHistogram.getCount(i)); - } + inputHistogram.buckets().forEach(val -> bucketCounts.add(val)); // Remove trailing 0 buckets. for (int i = bucketCounts.size() - 1; i >= 0; i--) { @@ -167,7 +165,9 @@ private static Optional convertHistogramToMetricValue( * stage, metrics namespace} pair. 
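A small sketch of the updated call shape, mirroring the test changes later in this patch: histograms are now handed to the converter as LockFreeHistogram.Snapshot deltas rather than live HistogramData. The wrapper class, metric names, and values below are arbitrary examples:

package org.apache.beam.runners.dataflow.worker;

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.beam.sdk.metrics.MetricName;
import org.apache.beam.sdk.util.HistogramData;
import org.apache.beam.sdk.values.KV;

public class ConverterUsageSketch {
  public static void main(String[] args) {
    Map<MetricName, Long> counters = new HashMap<>();
    counters.put(MetricName.named("BigQuerySink", "exampleCounter"), 3L);

    MetricName histogramName = MetricName.named("BigQuerySink", "exampleLatency");
    LockFreeHistogram histogram =
        new LockFreeHistogram(KV.of(histogramName, HistogramData.LinearBuckets.of(0, 10, 10)));
    histogram.update(5.0, 15.0);

    Map<MetricName, LockFreeHistogram.Snapshot> histograms = new HashMap<>();
    histogram.getSnapshotAndReset().ifPresent(snapshot -> histograms.put(histogramName, snapshot));

    // One metrics entry is produced per {step, metrics namespace} pair.
    Collection<?> converted =
        MetricsToPerStepNamespaceMetricsConverter.convert("testStep", counters, histograms);
  }
}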
*/ public static Collection convert( - String stepName, Map counters, Map histograms) { + String stepName, + Map counters, + Map histograms) { Map metricsByNamespace = new HashMap<>(); for (Entry entry : counters.entrySet()) { @@ -192,7 +192,7 @@ public static Collection convert( stepNamespaceMetrics.getMetricValues().add(metricValue.get()); } - for (Entry entry : histograms.entrySet()) { + for (Entry entry : histograms.entrySet()) { MetricName metricName = entry.getKey(); Optional metricValue = convertHistogramToMetricValue(metricName, entry.getValue()); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java index 54a3ef49776b..71e6380ab108 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java @@ -32,7 +32,6 @@ import javax.annotation.Nonnull; import org.apache.beam.runners.core.metrics.DistributionData; import org.apache.beam.runners.core.metrics.GaugeCell; -import org.apache.beam.runners.core.metrics.HistogramCell; import org.apache.beam.runners.core.metrics.MetricsMap; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Distribution; @@ -71,8 +70,8 @@ public class StreamingStepMetricsContainer implements MetricsContainer { private MetricsMap distributions = new MetricsMap<>(DeltaDistributionCell::new); - private MetricsMap, HistogramCell> perWorkerHistograms = - new MetricsMap<>(HistogramCell::new); + private MetricsMap, LockFreeHistogram> + perWorkerHistograms = new MetricsMap<>(LockFreeHistogram::new); private final Map perWorkerCountersByFirstStaleTime; @@ -267,8 +266,8 @@ private void deleteStaleCounters( @VisibleForTesting Iterable extractPerWorkerMetricUpdates() { ConcurrentHashMap counters = new ConcurrentHashMap(); - ConcurrentHashMap histograms = - new ConcurrentHashMap(); + ConcurrentHashMap histograms = + new ConcurrentHashMap(); HashSet currentZeroValuedCounters = new HashSet(); // Extract metrics updates. @@ -283,11 +282,7 @@ Iterable extractPerWorkerMetricUpdates() { }); perWorkerHistograms.forEach( (k, v) -> { - HistogramData val = v.getCumulative().getAndReset(); - if (val.getTotalCount() == 0) { - return; - } - histograms.put(k.getKey(), val); + v.getSnapshotAndReset().ifPresent(snapshot -> histograms.put(k.getKey(), snapshot)); }); deleteStaleCounters(currentZeroValuedCounters, Instant.now(clock)); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java index 85c74fe8591d..c6c49134bcb5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java @@ -64,7 +64,8 @@ public class WindmillStateCache implements StatusDataProvider { // Initial size of hash tables per entry. 
private static final int INITIAL_HASH_MAP_CAPACITY = 4; // Overhead of each hash map entry. - private static final int HASH_MAP_ENTRY_OVERHEAD = 16; + // https://appsintheopen.com/posts/52-the-memory-overhead-of-java-ojects + private static final int HASH_MAP_ENTRY_OVERHEAD = 32; // Overhead of each StateCacheEntry. One long, plus a hash table. private static final int PER_CACHE_ENTRY_OVERHEAD = 8 + HASH_MAP_ENTRY_OVERHEAD * INITIAL_HASH_MAP_CAPACITY; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/LockFreeHistogramTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/LockFreeHistogramTest.java new file mode 100644 index 000000000000..dfb63a36f836 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/LockFreeHistogramTest.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.util.HistogramData; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.primitives.ImmutableLongArray; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link LockFreeHistogram}. 
*/ +@RunWith(JUnit4.class) +public class LockFreeHistogramTest { + + @Test + public void testUpdate_OverflowValues() { + HistogramData.BucketType bucketType = HistogramData.LinearBuckets.of(0, 10, 3); + LockFreeHistogram histogram = + new LockFreeHistogram(KV.of(MetricName.named("name", "namespace"), bucketType)); + histogram.update(35, 40, 45); + Optional snapshot = histogram.getSnapshotAndReset(); + + LockFreeHistogram.OutlierStatistic expectedOverflow = + LockFreeHistogram.OutlierStatistic.create(120.0, 3L); + LockFreeHistogram.OutlierStatistic expectedUnderflow = LockFreeHistogram.OutlierStatistic.EMPTY; + ImmutableLongArray expectedBuckets = ImmutableLongArray.of(0L, 0L, 0L); + LockFreeHistogram.Snapshot expectedSnapshot = + LockFreeHistogram.Snapshot.create( + expectedUnderflow, expectedOverflow, expectedBuckets, bucketType); + + assertThat(snapshot.isPresent(), equalTo(true)); + assertThat(snapshot.get(), equalTo(expectedSnapshot)); + assertThat(snapshot.get().underflowStatistic().mean(), equalTo(0.0)); + assertThat(snapshot.get().overflowStatistic(), equalTo(expectedOverflow)); + } + + @Test + public void testUpdate_UnderflowValues() { + HistogramData.BucketType bucketType = HistogramData.LinearBuckets.of(100, 10, 3); + LockFreeHistogram histogram = + new LockFreeHistogram(KV.of(MetricName.named("name", "namespace"), bucketType)); + histogram.update(35, 40, 45); + Optional snapshot = histogram.getSnapshotAndReset(); + + LockFreeHistogram.OutlierStatistic expectedUnderflow = + LockFreeHistogram.OutlierStatistic.create(120.0, 3L); + LockFreeHistogram.OutlierStatistic expectedOverflow = LockFreeHistogram.OutlierStatistic.EMPTY; + ImmutableLongArray expectedBuckets = ImmutableLongArray.of(0L, 0L, 0L); + LockFreeHistogram.Snapshot expectedSnapshot = + LockFreeHistogram.Snapshot.create( + expectedUnderflow, expectedOverflow, expectedBuckets, bucketType); + + assertThat(snapshot.isPresent(), equalTo(true)); + assertThat(snapshot.get(), equalTo(expectedSnapshot)); + assertThat(snapshot.get().underflowStatistic(), equalTo(expectedUnderflow)); + } + + @Test + public void testUpdate_InBoundsValues() { + HistogramData.BucketType bucketType = HistogramData.LinearBuckets.of(0, 10, 3); + LockFreeHistogram histogram = + new LockFreeHistogram(KV.of(MetricName.named("name", "namespace"), bucketType)); + histogram.update(5, 15, 25); + Optional snapshot = histogram.getSnapshotAndReset(); + + LockFreeHistogram.OutlierStatistic expectedOverflow = LockFreeHistogram.OutlierStatistic.EMPTY; + LockFreeHistogram.OutlierStatistic expectedUnderflow = LockFreeHistogram.OutlierStatistic.EMPTY; + ImmutableLongArray expectedBuckets = ImmutableLongArray.of(1L, 1L, 1L); + LockFreeHistogram.Snapshot expectedSnapshot = + LockFreeHistogram.Snapshot.create( + expectedUnderflow, expectedOverflow, expectedBuckets, bucketType); + + assertThat(snapshot.isPresent(), equalTo(true)); + assertThat(snapshot.get(), equalTo(expectedSnapshot)); + } + + @Test + public void testUpdate_EmptySnapshot() { + HistogramData.BucketType bucketType = HistogramData.LinearBuckets.of(0, 10, 3); + LockFreeHistogram histogram = + new LockFreeHistogram(KV.of(MetricName.named("name", "namespace"), bucketType)); + histogram.update(5, 15, 25); + Optional snapshot_1 = histogram.getSnapshotAndReset(); + + assertThat(snapshot_1.isPresent(), equalTo(true)); + + Optional snapshot_2 = histogram.getSnapshotAndReset(); + assertThat(snapshot_2.isPresent(), equalTo(false)); + } + + /** A runnable records 200 values and then calls getSnapshotAndReset. 
*/ + private static class UpdateHistogramCallable implements Callable { + private final LockFreeHistogram histogram; + private final int val; + private Optional snapshot; + + private static final long valuesRecorded = 200L; + + public UpdateHistogramCallable(LockFreeHistogram histogram, int val) { + this.histogram = histogram; + this.val = val; + this.snapshot = Optional.empty(); + } + + @Override + public Long call() { + for (long j = 0; j < valuesRecorded; j++) { + histogram.update(val); + } + snapshot = histogram.getSnapshotAndReset(); + + if (snapshot.isPresent()) { + return snapshot.get().totalCount(); + } else { + return 0L; + } + } + + public static long numValuesRecorded() { + return valuesRecorded; + } + } + + @Test + public void testUpdateAndSnapshots_MultipleThreads() { + int numRunnables = 200; + ExecutorService executor = Executors.newFixedThreadPool(numRunnables); + + HistogramData.BucketType bucketType = HistogramData.ExponentialBuckets.of(1, 10); + LockFreeHistogram histogram = + new LockFreeHistogram(KV.of(MetricName.named("name", "namespace"), bucketType)); + + List callables = new ArrayList<>(); + + for (int i = 0; i < numRunnables; i++) { + callables.add(new UpdateHistogramCallable(histogram, i)); + } + + long totalValuesRecorded = 0; + + try { + List> futures = executor.invokeAll(callables); + for (Future future : futures) { + totalValuesRecorded += future.get(); + } + } catch (Exception e) { + return; + } + + Optional finalSnapshot = histogram.getSnapshotAndReset(); + if (finalSnapshot.isPresent()) { + totalValuesRecorded += finalSnapshot.get().totalCount(); + } + + assertThat( + totalValuesRecorded, equalTo(numRunnables * UpdateHistogramCallable.numValuesRecorded())); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverterTest.java index 4e5108399f62..b0a3d57487b0 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/MetricsToPerStepNamespaceMetricsConverterTest.java @@ -34,13 +34,19 @@ import java.util.Map; import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.util.HistogramData; +import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.primitives.ImmutableLongArray; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @RunWith(JUnit4.class) public class MetricsToPerStepNamespaceMetricsConverterTest { + private static final HistogramData.BucketType lienarBuckets = + HistogramData.LinearBuckets.of(0, 10, 10); + private static final HistogramData.BucketType exponentialBuckets = + HistogramData.ExponentialBuckets.of(0, 5); public static class TestBucketType implements HistogramData.BucketType { @Override @@ -77,7 +83,7 @@ public double getAccumulatedBucketSize(int endIndex) { @Test public void testConvert_successfulyConvertCounters() { String step = "testStepName"; - Map emptyHistograms = new HashMap<>(); + Map emptyHistograms = new HashMap<>(); Map counters = new HashMap(); MetricName bigQueryMetric1 = 
MetricName.named("BigQuerySink", "metric1"); MetricName bigQueryMetric2 = @@ -115,11 +121,12 @@ public void testConvert_skipInvalidMetricNames() { MetricName invalidName1 = MetricName.named("BigQuerySink", "**"); counters.put(invalidName1, 5L); - Map histograms = new HashMap<>(); + Map histograms = new HashMap<>(); MetricName invalidName2 = MetricName.named("BigQuerySink", "****"); - HistogramData nonEmptyLinearHistogram = HistogramData.linear(0, 10, 10); - nonEmptyLinearHistogram.record(-5.0); - histograms.put(invalidName2, nonEmptyLinearHistogram); + LockFreeHistogram nonEmptyLinearHistogram = + new LockFreeHistogram(KV.of(invalidName2, lienarBuckets)); + nonEmptyLinearHistogram.update(-5.0); + histograms.put(invalidName2, nonEmptyLinearHistogram.getSnapshotAndReset().get()); Collection conversionResult = MetricsToPerStepNamespaceMetricsConverter.convert("testStep", counters, histograms); @@ -128,22 +135,29 @@ public void testConvert_skipInvalidMetricNames() { @Test public void testConvert_successfulConvertHistograms() { - Map histograms = new HashMap(); + Map histograms = new HashMap<>(); MetricName bigQueryMetric1 = MetricName.named("BigQuerySink", "baseLabel"); MetricName bigQueryMetric2 = MetricName.named("BigQuerySink", "baseLabel*label1:val1;label2:val2;"); MetricName bigQueryMetric3 = MetricName.named("BigQuerySink", "zeroValue"); - HistogramData nonEmptyLinearHistogram = HistogramData.linear(0, 10, 10); - nonEmptyLinearHistogram.record(-5.0, 15.0, 25.0, 35.0, 105.0); - histograms.put(bigQueryMetric1, nonEmptyLinearHistogram); + LockFreeHistogram nonEmptyLinearHistogram = + new LockFreeHistogram(KV.of(bigQueryMetric1, lienarBuckets)); + nonEmptyLinearHistogram.update(-5.0, 15.0, 25.0, 35.0, 105.0); + histograms.put(bigQueryMetric1, nonEmptyLinearHistogram.getSnapshotAndReset().get()); - HistogramData noEmptyExponentialHistogram = HistogramData.exponential(0, 5); - noEmptyExponentialHistogram.record(-5.0, 15.0, 25.0, 35.0, 105.0); - histograms.put(bigQueryMetric2, noEmptyExponentialHistogram); + LockFreeHistogram noEmptyExponentialHistogram = + new LockFreeHistogram(KV.of(bigQueryMetric2, exponentialBuckets)); + noEmptyExponentialHistogram.update(-5.0, 15.0, 25.0, 35.0, 105.0); + histograms.put(bigQueryMetric2, noEmptyExponentialHistogram.getSnapshotAndReset().get()); - HistogramData emptyHistogram = HistogramData.linear(0, 10, 10); - histograms.put(bigQueryMetric3, emptyHistogram); + LockFreeHistogram.Snapshot emptySnapshot = + LockFreeHistogram.Snapshot.create( + LockFreeHistogram.OutlierStatistic.EMPTY, + LockFreeHistogram.OutlierStatistic.EMPTY, + ImmutableLongArray.of(), + lienarBuckets); + histograms.put(bigQueryMetric3, emptySnapshot); String step = "testStep"; Map emptyCounters = new HashMap<>(); @@ -217,12 +231,13 @@ public void testConvert_successfulConvertHistograms() { public void testConvert_skipUnknownHistogramBucketType() { String step = "testStep"; Map emptyCounters = new HashMap<>(); - Map histograms = new HashMap(); + Map histograms = new HashMap<>(); - HistogramData histogram = new HistogramData(new TestBucketType()); - histogram.record(1.0, 2.0); MetricName bigQueryMetric1 = MetricName.named("BigQuerySink", "baseLabel"); - histograms.put(bigQueryMetric1, histogram); + LockFreeHistogram histogram = + new LockFreeHistogram(KV.of(bigQueryMetric1, new TestBucketType())); + histogram.update(1.0, 2.0); + histograms.put(bigQueryMetric1, histogram.getSnapshotAndReset().get()); Collection conversionResult = MetricsToPerStepNamespaceMetricsConverter.convert(step, 
emptyCounters, histograms); @@ -233,15 +248,16 @@ public void testConvert_skipUnknownHistogramBucketType() { public void testConvert_convertCountersAndHistograms() { String step = "testStep"; Map counters = new HashMap<>(); - Map histograms = new HashMap(); + Map histograms = new HashMap<>(); MetricName counterMetricName = MetricName.named("BigQuerySink", "counter*label1:val1;"); counters.put(counterMetricName, 3L); MetricName histogramMetricName = MetricName.named("BigQuerySink", "histogram*label2:val2;"); - HistogramData linearHistogram = HistogramData.linear(0, 10, 10); - linearHistogram.record(5.0); - histograms.put(histogramMetricName, linearHistogram); + LockFreeHistogram linearHistogram = + new LockFreeHistogram(KV.of(histogramMetricName, lienarBuckets)); + linearHistogram.update(5.0); + histograms.put(histogramMetricName, linearHistogram.getSnapshotAndReset().get()); Collection conversionResult = MetricsToPerStepNamespaceMetricsConverter.convert(step, counters, histograms); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java index 6aecafbb10de..267a49bb771d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java @@ -56,6 +56,7 @@ import org.apache.beam.sdk.metrics.NoOpHistogram; import org.apache.beam.sdk.util.HistogramData; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.hamcrest.collection.IsEmptyIterable; import org.junit.Rule; import org.junit.Test; import org.junit.rules.Timeout; @@ -223,7 +224,7 @@ public void testPerWorkerMetrics() { } @Test - public void testExtractPerWorkerMetricUpdates() { + public void testExtractPerWorkerMetricUpdates_populatedMetrics() { StreamingStepMetricsContainer.setEnablePerWorkerMetrics(true); MetricName counterMetricName = MetricName.named("BigQuerySink", "counter"); c1.getPerWorkerCounter(counterMetricName).inc(3); @@ -272,6 +273,22 @@ public void testExtractPerWorkerMetricUpdates() { assertThat(updates, containsInAnyOrder(histograms, counters)); } + @Test + public void testExtractPerWorkerMetricUpdates_emptyMetrics() { + StreamingStepMetricsContainer.setEnablePerWorkerMetrics(true); + StreamingStepMetricsContainer.setEnablePerWorkerMetrics(true); + MetricName counterMetricName = MetricName.named("BigQuerySink", "counter"); + c1.getPerWorkerCounter(counterMetricName); + + MetricName histogramMetricName = MetricName.named("BigQuerySink", "histogram"); + HistogramData.LinearBuckets linearBuckets = HistogramData.LinearBuckets.of(0, 10, 10); + c2.getPerWorkerHistogram(histogramMetricName, linearBuckets); + + Iterable updates = + StreamingStepMetricsContainer.extractPerWorkerMetricUpdates(registry); + assertThat(updates, IsEmptyIterable.emptyIterable()); + } + public class TestClock extends Clock { private Instant currentTime; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java index 1f4355b156be..446a34f73dec 
100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java @@ -168,15 +168,15 @@ public void testBasic() throws Exception { assertEquals(0, cache.getWeight()); keyCache.persist(); - assertEquals(254, cache.getWeight()); + assertEquals(414, cache.getWeight()); keyCache.put(triggerNamespace(0, 0), new TestStateTag("tag3"), new TestState("t3"), 2); keyCache.put(triggerNamespace(0, 0), new TestStateTag("tag2"), new TestState("t2"), 2); // Observes updated weight in entries, though cache will not know about it. - assertEquals(290, cache.getWeight()); + assertEquals(482, cache.getWeight()); keyCache.persist(); - assertEquals(290, cache.getWeight()); + assertEquals(482, cache.getWeight()); keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 2L).forFamily(STATE_FAMILY); @@ -212,7 +212,7 @@ public void testInvalidation() throws Exception { keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 2L).forFamily(STATE_FAMILY); - assertEquals(127, cache.getWeight()); + assertEquals(207, cache.getWeight()); assertEquals( Optional.of(new TestState("g1")), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); @@ -221,7 +221,7 @@ public void testInvalidation() throws Exception { cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 1L, 3L).forFamily(STATE_FAMILY); assertEquals( Optional.empty(), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); - assertEquals(127, cache.getWeight()); + assertEquals(207, cache.getWeight()); } /** Verifies that the cache is invalidated when the cache token changes. */ @@ -254,7 +254,7 @@ public void testStaleWorkItem() throws Exception { assertEquals(Optional.of(new TestState("w2")), keyCache.get(windowNamespace(0), tag)); assertEquals(0, cache.getWeight()); keyCache.persist(); - assertEquals(127, cache.getWeight()); + assertEquals(207, cache.getWeight()); assertEquals(Optional.of(new TestState("w2")), keyCache.get(windowNamespace(0), tag)); // Previous work token. 
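The metrics-related test changes above (LockFreeHistogramTest, MetricsToPerStepNamespaceMetricsConverterTest, StreamingStepMetricsContainerTest) all exercise the same update-then-snapshot flow on LockFreeHistogram. A minimal sketch of that flow is shown below; the metric name and bucket layout are assumed for illustration and are not taken from this diff.

    // Hedged sketch of the update/snapshot pattern used by the tests above.
    LockFreeHistogram histogram =
        new LockFreeHistogram(
            KV.of(
                MetricName.named("BigQuerySink", "baseLabel"),
                HistogramData.LinearBuckets.of(0, 10, 10)));
    histogram.update(15.0, 25.0, 35.0);

    // getSnapshotAndReset() appears to return Optional.empty() when nothing was
    // recorded since the previous snapshot, which is what the empty-metrics test
    // above relies on to produce no per-worker metric updates.
    Optional<LockFreeHistogram.Snapshot> snapshot = histogram.getSnapshotAndReset();
    long recordedCount = snapshot.map(LockFreeHistogram.Snapshot::totalCount).orElse(0L);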
diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java index d53b1d8c3e89..a53240d64530 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java @@ -3043,7 +3043,7 @@ public void testCachedValue() throws Exception { value.write("Hi"); underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(141, cache.getWeight()); + assertEquals(221, cache.getWeight()); resetUnderTest(); value = underTest.state(NAMESPACE, addr); @@ -3051,7 +3051,7 @@ public void testCachedValue() throws Exception { value.clear(); underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(139, cache.getWeight()); + assertEquals(219, cache.getWeight()); resetUnderTest(); value = underTest.state(NAMESPACE, addr); @@ -3083,7 +3083,7 @@ public void testCachedBag() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(147, cache.getWeight()); + assertEquals(227, cache.getWeight()); resetUnderTest(); bag = underTest.state(NAMESPACE, addr); @@ -3103,7 +3103,7 @@ public void testCachedBag() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(140, cache.getWeight()); + assertEquals(220, cache.getWeight()); resetUnderTest(); bag = underTest.state(NAMESPACE, addr); @@ -3114,7 +3114,7 @@ public void testCachedBag() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(141, cache.getWeight()); + assertEquals(221, cache.getWeight()); resetUnderTest(); bag = underTest.state(NAMESPACE, addr); @@ -3145,7 +3145,7 @@ public void testCachedWatermarkHold() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(151, cache.getWeight()); + assertEquals(231, cache.getWeight()); resetUnderTest(); hold = underTest.state(NAMESPACE, addr); @@ -3154,7 +3154,7 @@ public void testCachedWatermarkHold() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(151, cache.getWeight()); + assertEquals(231, cache.getWeight()); resetUnderTest(); hold = underTest.state(NAMESPACE, addr); @@ -3185,7 +3185,7 @@ public void testCachedCombining() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(144, cache.getWeight()); + assertEquals(224, cache.getWeight()); resetUnderTest(); value = underTest.state(NAMESPACE, COMBINING_ADDR); @@ -3196,7 +3196,7 @@ public void testCachedCombining() throws Exception { underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); - assertEquals(143, cache.getWeight()); + assertEquals(223, cache.getWeight()); resetUnderTest(); value = underTest.state(NAMESPACE, COMBINING_ADDR); diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java index 9851b2fcbf21..8d5b4d4fa08d 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java 
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/planner/BeamRuleSets.java @@ -21,7 +21,6 @@ import java.util.List; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.extensions.sql.impl.rel.BeamRelNode; -import org.apache.beam.sdk.extensions.sql.impl.rule.BeamAggregateProjectMergeRule; import org.apache.beam.sdk.extensions.sql.impl.rule.BeamAggregationRule; import org.apache.beam.sdk.extensions.sql.impl.rule.BeamBasicAggregationRule; import org.apache.beam.sdk.extensions.sql.impl.rule.BeamCalcMergeRule; @@ -83,7 +82,7 @@ public class BeamRuleSets { CoreRules.PROJECT_SET_OP_TRANSPOSE, // aggregation and projection rules - BeamAggregateProjectMergeRule.INSTANCE, + // BeamAggregateProjectMergeRule.INSTANCE, // push a projection past a filter or vice versa CoreRules.PROJECT_FILTER_TRANSPOSE, CoreRules.FILTER_PROJECT_TRANSPOSE, diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlAliasTest b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlAliasTest new file mode 100644 index 000000000000..790312b7e756 --- /dev/null +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlAliasTest @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.sql; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.junit.Rule; +import org.junit.Test; +import org.testcontainers.shaded.com.fasterxml.jackson.databind.MapperFeature; +import org.testcontainers.shaded.com.fasterxml.jackson.databind.ObjectMapper; + +public class BeamSqlAliasTest implements Serializable { + + @Rule public final transient TestPipeline pipeline = TestPipeline.create(); + + @Test + public void testSqlWithAliasIsNotIgnoredWithOptimizers() { + String ID = "id"; + String EVENT = "event"; + + Schema inputType = Schema.builder().addStringField(ID).addStringField(EVENT).build(); + + String sql = + "select event as event_name, count(*) as c\n" + "from PCOLLECTION\n" + "group by event"; + + List inputRows = + TestUtils.RowsBuilder.of(inputType).addRows("123", "some_event").getRows(); + + PCollection rowPCollection = + pipeline + .apply("boundedInput", Create.of(inputRows).withRowSchema(inputType)) + .apply(SqlTransform.query(sql)) + .apply( + ParDo.of( + new DoFn() { + @DoFn.ProcessElement + public void processElement(DoFn.ProcessContext c) + throws Exception { + ObjectMapper objectMapper = new ObjectMapper(); + Map map = new HashMap<>(); + + for (int i = + Objects.requireNonNull(c.element()).getSchema().getFields().size() + - 1; + i >= 0; + i--) { + Object value = Objects.requireNonNull(c.element()).getValue(i); + Schema.Field field = + Objects.requireNonNull(c.element()).getSchema().getField(i); + map.put(field.getName(), value); + } + + String json = + objectMapper + .configure(MapperFeature.SORT_PROPERTIES_ALPHABETICALLY, true) + .writeValueAsString(map); + c.output(json); + } + })) + .setCoder(StringUtf8Coder.of()); + + // assert alias is kept + PAssert.that(rowPCollection).containsInAnyOrder("{\"c\":1,\"event_name\":\"some_event\"}"); + + pipeline.run().waitUntilFinish(); + } +} \ No newline at end of file diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamAggregateProjectMergeRuleTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamAggregateProjectMergeRuleTest.java index 593febb9f190..4eff5c753c4e 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamAggregateProjectMergeRuleTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rule/BeamAggregateProjectMergeRuleTest.java @@ -36,6 +36,7 @@ import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.values.Row; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.runner.RunWith; @@ -107,6 +108,7 @@ public void testBeamAggregateProjectMergeRule_withProjectTable_withPredicate() { } @Test + @Ignore("BeamAggregateProjectMergeRule disabled due to CALCITE-6357") public void testBeamAggregateProjectMergeRule_withFilterTable() { // When an IO does not supports project push-down, Projects should be merged with an aggregate. 
String sqlQuery = "select SUM(id) as id_sum from TEST_FILTER group by name"; @@ -126,6 +128,7 @@ public void testBeamAggregateProjectMergeRule_withFilterTable() { } @Test + @Ignore("BeamAggregateProjectMergeRule disabled due to CALCITE-6357") public void testBeamAggregateProjectMergeRule_withNoneTable() { // When an IO does not supports project push-down, Projects should be merged with an aggregate. String sqlQuery = "select SUM(id) as id_sum from TEST_NONE group by name"; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerTransformRegistrar.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerTransformRegistrar.java index 919d84002293..76d9815960f1 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerTransformRegistrar.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerTransformRegistrar.java @@ -289,6 +289,7 @@ public static class Configuration extends CrossLanguageConfiguration { private @Nullable Duration commitDeadline; private @Nullable Duration maxCumulativeBackoff; private @Nullable String failureMode; + private Boolean highPriority = false; public void setTable(String table) { this.table = table; @@ -327,6 +328,10 @@ public void setMaxCumulativeBackoff(@Nullable Long maxCumulativeBackoff) { public void setFailureMode(@Nullable String failureMode) { this.failureMode = failureMode; } + + public void setHighPriority(Boolean highPriority) { + this.highPriority = highPriority; + } } @Override @@ -341,6 +346,9 @@ public PTransform, PDone> buildExternal( .withDatabaseId(configuration.databaseId) .withInstanceId(configuration.instanceId); + if (configuration.highPriority) { + writeTransform = writeTransform.withHighPriority(); + } if (configuration.maxBatchSizeBytes != null) { writeTransform = writeTransform.withBatchSizeBytes(configuration.maxBatchSizeBytes); } diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java index 231a1b9e49e1..c56071e85adb 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java @@ -587,6 +587,7 @@ public static Read read() { .setCommitOffsetsInFinalizeEnabled(false) .setDynamicRead(false) .setTimestampPolicyFactory(TimestampPolicyFactory.withProcessingTime()) + .setConsumerPollingTimeout(Duration.standardSeconds(2L)) .build(); } @@ -706,6 +707,9 @@ public abstract static class Read @Pure public abstract @Nullable ErrorHandler getBadRecordErrorHandler(); + @Pure + public abstract @Nullable Duration getConsumerPollingTimeout(); + abstract Builder toBuilder(); @AutoValue.Builder @@ -762,6 +766,8 @@ Builder setCheckStopReadingFn( return setCheckStopReadingFn(CheckStopReadingFnWrapper.of(checkStopReadingFn)); } + abstract Builder setConsumerPollingTimeout(Duration consumerPollingTimeout); + abstract Read build(); static void setupExternalBuilder( @@ -1334,6 +1340,17 @@ public Read withBadRecordErrorHandler(ErrorHandler badRecord return toBuilder().setBadRecordErrorHandler(badRecordErrorHandler).build(); } + /** + * Sets the timeout time for Kafka consumer polling request in the {@link ReadFromKafkaDoFn}. + * The default is 2 second. 
+ */ + public Read withConsumerPollingTimeout(Duration duration) { + checkState( + duration == null || duration.compareTo(Duration.ZERO) > 0, + "Consumer polling timeout must be greater than 0."); + return toBuilder().setConsumerPollingTimeout(duration).build(); + } + /** Returns a {@link PTransform} for PCollection of {@link KV}, dropping Kafka metatdata. */ public PTransform>> withoutMetadata() { return new TypedWithoutMetadata<>(this); @@ -1596,7 +1613,8 @@ public PCollection> expand(PBegin input) { .withValueDeserializerProvider(kafkaRead.getValueDeserializerProvider()) .withManualWatermarkEstimator() .withTimestampPolicyFactory(kafkaRead.getTimestampPolicyFactory()) - .withCheckStopReadingFn(kafkaRead.getCheckStopReadingFn()); + .withCheckStopReadingFn(kafkaRead.getCheckStopReadingFn()) + .withConsumerPollingTimeout(kafkaRead.getConsumerPollingTimeout()); if (kafkaRead.isCommitOffsetsInFinalizeEnabled()) { readTransform = readTransform.commitOffsets(); } @@ -2036,6 +2054,9 @@ public abstract static class ReadSourceDescriptors @Pure abstract ErrorHandler getBadRecordErrorHandler(); + @Pure + abstract @Nullable Duration getConsumerPollingTimeout(); + abstract boolean isBounded(); abstract ReadSourceDescriptors.Builder toBuilder(); @@ -2086,6 +2107,9 @@ abstract ReadSourceDescriptors.Builder setBadRecordRouter( abstract ReadSourceDescriptors.Builder setBadRecordErrorHandler( ErrorHandler badRecordErrorHandler); + abstract ReadSourceDescriptors.Builder setConsumerPollingTimeout( + @Nullable Duration duration); + abstract ReadSourceDescriptors.Builder setBounded(boolean bounded); abstract ReadSourceDescriptors build(); @@ -2099,6 +2123,7 @@ public static ReadSourceDescriptors read() { .setBounded(false) .setBadRecordRouter(BadRecordRouter.THROWING_ROUTER) .setBadRecordErrorHandler(new ErrorHandler.DefaultErrorHandler<>()) + .setConsumerPollingTimeout(Duration.standardSeconds(2L)) .build() .withProcessingTime() .withMonotonicallyIncreasingWatermarkEstimator(); @@ -2360,6 +2385,14 @@ public ReadSourceDescriptors withBadRecordErrorHandler( .build(); } + /** + * Sets the timeout time for Kafka consumer polling request in the {@link ReadFromKafkaDoFn}. + * The default is 2 second. 
+ */ + public ReadSourceDescriptors withConsumerPollingTimeout(@Nullable Duration duration) { + return toBuilder().setConsumerPollingTimeout(duration).build(); + } + ReadAllFromRow forExternalBuild() { return new ReadAllFromRow<>(this); } diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibility.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibility.java index a2cc9aaeb4d9..7e54407300d4 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibility.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIOReadImplementationCompatibility.java @@ -112,6 +112,7 @@ Object getDefaultValue() { VALUE_DESERIALIZER_PROVIDER, CHECK_STOP_READING_FN(SDF), BAD_RECORD_ERROR_HANDLER(SDF), + CONSUMER_POLLING_TIMEOUT, ; @Nonnull private final ImmutableSet supportedImplementations; diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java index 924833290f13..3a821ef9519e 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java @@ -191,6 +191,12 @@ private ReadFromKafkaDoFn( this.checkStopReadingFn = transform.getCheckStopReadingFn(); this.badRecordRouter = transform.getBadRecordRouter(); this.recordTag = recordTag; + if (transform.getConsumerPollingTimeout() != null) { + this.consumerPollingTimeout = + java.time.Duration.ofMillis(transform.getConsumerPollingTimeout().getMillis()); + } else { + this.consumerPollingTimeout = KAFKA_POLL_TIMEOUT; + } } private static final Logger LOG = LoggerFactory.getLogger(ReadFromKafkaDoFn.class); @@ -217,8 +223,9 @@ private ReadFromKafkaDoFn( private transient @Nullable LoadingCache avgRecordSize; - private static final java.time.Duration KAFKA_POLL_TIMEOUT = java.time.Duration.ofSeconds(1); + private static final java.time.Duration KAFKA_POLL_TIMEOUT = java.time.Duration.ofSeconds(2); + @VisibleForTesting final java.time.Duration consumerPollingTimeout; @VisibleForTesting final DeserializerProvider keyDeserializerProvider; @VisibleForTesting final DeserializerProvider valueDeserializerProvider; @VisibleForTesting final Map consumerConfig; @@ -508,7 +515,7 @@ private ConsumerRecords poll( java.time.Duration elapsed = java.time.Duration.ZERO; while (true) { final ConsumerRecords rawRecords = - consumer.poll(KAFKA_POLL_TIMEOUT.minus(elapsed)); + consumer.poll(consumerPollingTimeout.minus(elapsed)); if (!rawRecords.isEmpty()) { // return as we have found some entries return rawRecords; @@ -518,8 +525,11 @@ private ConsumerRecords poll( return rawRecords; } elapsed = sw.elapsed(); - if (elapsed.toMillis() >= KAFKA_POLL_TIMEOUT.toMillis()) { + if (elapsed.toMillis() >= consumerPollingTimeout.toMillis()) { // timeout is over + LOG.warn( + "No messages retrieved with polling timeout {} seconds. 
Consider increasing the consumer polling timeout using withConsumerPollingTimeout method.", + consumerPollingTimeout.getSeconds()); return rawRecords; } } diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java index 9b15b86051f5..44c028f08a27 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java @@ -2121,6 +2121,18 @@ public void testSinkMetrics() throws Exception { } } + @Test(expected = IllegalStateException.class) + public void testWithInvalidConsumerPollingTimeout() { + KafkaIO.read().withConsumerPollingTimeout(Duration.standardSeconds(-5)); + } + + @Test + public void testWithValidConsumerPollingTimeout() { + KafkaIO.Read reader = + KafkaIO.read().withConsumerPollingTimeout(Duration.standardSeconds(15)); + assertEquals(15, reader.getConsumerPollingTimeout().getStandardSeconds()); + } + private static void verifyProducerRecords( MockProducer mockProducer, String topic, diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java index 48b5b060a295..8902f22164bc 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java @@ -641,6 +641,20 @@ public void testUnbounded() { Assert.assertNotEquals(0, visitor.unboundedPCollections.size()); } + + @Test + public void testConstructorWithPollTimeout() { + ReadSourceDescriptors descriptors = makeReadSourceDescriptor(consumer); + // default poll timeout = 2 seconds + ReadFromKafkaDoFn dofnInstance = ReadFromKafkaDoFn.create(descriptors, RECORDS); + Assert.assertEquals(Duration.ofSeconds(2L), dofnInstance.consumerPollingTimeout); + // updated timeout = 5 seconds + descriptors = + descriptors.withConsumerPollingTimeout(org.joda.time.Duration.standardSeconds(5L)); + ReadFromKafkaDoFn dofnInstanceNew = + ReadFromKafkaDoFn.create(descriptors, RECORDS); + Assert.assertEquals(Duration.ofSeconds(5L), dofnInstanceNew.consumerPollingTimeout); + } + private BoundednessVisitor testBoundedness( Function, ReadSourceDescriptors> readSourceDescriptorsDecorator) { diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py index d1bc0a06c4b2..fd2c07cc4e1b 100644 --- a/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py @@ -29,7 +29,7 @@ from setuptools import find_packages REQUIREMENTS = [ - "apache-beam[gcp]==2.41.0", "transformers==4.36.0", "torch==1.13.1" + "apache-beam[gcp]==2.41.0", "transformers==4.38.0", "torch==1.13.1" ] setuptools.setup( diff --git a/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py b/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py index 54a473d1b52b..38a405c2d331 100644 --- a/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py +++ b/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py @@ -129,12 +129,16 @@ def test_xlang_jdbc_write_read(self, database): else: binary_type = ('BINARY(10)', 'VARBINARY(10)') - self.engine.execute( - "CREATE TABLE IF NOT EXISTS {}".format(table_name) + "(f_id INTEGER, " + -
"f_float DOUBLE PRECISION, " + "f_char CHAR(10), " + - "f_varchar VARCHAR(10), " + f"f_bytes {binary_type[0]}, " + - f"f_varbytes {binary_type[1]}, " + "f_timestamp TIMESTAMP(3), " + - "f_decimal DECIMAL(10, 2), " + "f_date DATE, " + "f_time TIME(3))") + with self.engine.begin() as connection: + connection.execute( + sqlalchemy.text( + "CREATE TABLE IF NOT EXISTS {}".format(table_name) + + "(f_id INTEGER, " + "f_float DOUBLE PRECISION, " + + "f_char CHAR(10), " + "f_varchar VARCHAR(10), " + + f"f_bytes {binary_type[0]}, " + f"f_varbytes {binary_type[1]}, " + + "f_timestamp TIMESTAMP(3), " + "f_decimal DECIMAL(10, 2), " + + "f_date DATE, " + "f_time TIME(3))")) + inserted_rows = [ JdbcTestRow( i, diff --git a/sdks/python/apache_beam/io/gcp/spanner.py b/sdks/python/apache_beam/io/gcp/spanner.py index 51c7fc65c171..9089d746fe1c 100644 --- a/sdks/python/apache_beam/io/gcp/spanner.py +++ b/sdks/python/apache_beam/io/gcp/spanner.py @@ -288,6 +288,7 @@ class WriteToSpannerSchema(NamedTuple): commit_deadline: Optional[int] max_cumulative_backoff: Optional[int] failure_mode: Optional[str] + high_priority: bool _CLASS_DOC = \ @@ -405,6 +406,7 @@ def __init__( max_cumulative_backoff=None, failure_mode=None, expansion_service=None, + high_priority=False, ): max_cumulative_backoff = int( max_cumulative_backoff) if max_cumulative_backoff else None @@ -426,6 +428,7 @@ def __init__( commit_deadline=commit_deadline, max_cumulative_backoff=max_cumulative_backoff, failure_mode=_get_enum_name(failure_mode), + high_priority=high_priority, ), ), expansion_service=expansion_service or default_io_expansion_service(), @@ -459,6 +462,7 @@ def __init__( max_cumulative_backoff=None, expansion_service=None, failure_mode=None, + high_priority=False, ): max_cumulative_backoff = int( max_cumulative_backoff) if max_cumulative_backoff else None @@ -480,6 +484,7 @@ def __init__( commit_deadline=commit_deadline, max_cumulative_backoff=max_cumulative_backoff, failure_mode=_get_enum_name(failure_mode), + high_priority=high_priority, ), ), expansion_service=expansion_service or default_io_expansion_service(), @@ -513,6 +518,7 @@ def __init__( max_cumulative_backoff=None, expansion_service=None, failure_mode=None, + high_priority=False, ): max_cumulative_backoff = int( max_cumulative_backoff) if max_cumulative_backoff else None @@ -534,6 +540,7 @@ def __init__( commit_deadline=commit_deadline, max_cumulative_backoff=max_cumulative_backoff, failure_mode=_get_enum_name(failure_mode), + high_priority=high_priority, ), ), expansion_service=expansion_service or default_io_expansion_service(), @@ -567,6 +574,7 @@ def __init__( max_cumulative_backoff=None, failure_mode=None, expansion_service=None, + high_priority=False, ): max_cumulative_backoff = int( max_cumulative_backoff) if max_cumulative_backoff else None @@ -588,6 +596,7 @@ def __init__( commit_deadline=commit_deadline, max_cumulative_backoff=max_cumulative_backoff, failure_mode=_get_enum_name(failure_mode), + high_priority=high_priority, ), ), expansion_service=expansion_service or default_io_expansion_service(), @@ -621,6 +630,7 @@ def __init__( max_cumulative_backoff=None, failure_mode=None, expansion_service=None, + high_priority=False, ): max_cumulative_backoff = int( max_cumulative_backoff) if max_cumulative_backoff else None @@ -642,6 +652,7 @@ def __init__( commit_deadline=commit_deadline, max_cumulative_backoff=max_cumulative_backoff, failure_mode=_get_enum_name(failure_mode), + high_priority=high_priority, ), ), expansion_service=expansion_service or 
default_io_expansion_service(), diff --git a/sdks/python/apache_beam/yaml/examples/README.md b/sdks/python/apache_beam/yaml/examples/README.md index 0d76f41bef67..06e64d0c08e1 100644 --- a/sdks/python/apache_beam/yaml/examples/README.md +++ b/sdks/python/apache_beam/yaml/examples/README.md @@ -54,14 +54,5 @@ These examples leverage the built-in mapping transforms including `MapToFields`, These examples leverage the built-in `Combine` transform for performing simple aggregations including sum, mean, count, etc. -These examples are experimental and require that -`yaml_experimental_features: Combine` be specified under the `options` tag, or -by passing `--yaml_experimental_features=Combine` to the command to run the -pipeline. i.e. -``` -python -m apache_beam.yaml.main \ - --pipeline_spec_file=/path/to/example.yaml \ - --yaml_experimental_features=Combine -``` More information can be found about aggregation transforms [here](https://beam.apache.org/documentation/sdks/yaml-combine/). \ No newline at end of file diff --git a/sdks/python/apache_beam/yaml/examples/simple_filter.yaml b/sdks/python/apache_beam/yaml/examples/simple_filter.yaml new file mode 100644 index 000000000000..d2cfbb91259b --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/simple_filter.yaml @@ -0,0 +1,49 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This example reads from a public file stored on Google Cloud. This +# requires authenticating with Google Cloud, or setting the file in +#`ReadFromText` to a local file. +# +# To set up Application Default Credentials, +# see https://cloud.google.com/docs/authentication/external/set-up-adc for more +# information +# +# The following example reads mock transaction data from resources/products.csv +# then performs a simple filter for "Electronics".
+pipeline: + transforms: + - type: ReadFromCsv + name: ReadInputFile + config: + path: gs://apache-beam-samples/beam-yaml-blog/products.csv + - type: Filter + name: FilterWithCategory + input: ReadInputFile + config: + language: python + keep: category == "Electronics" + - type: WriteToCsv + name: WriteOutputFile + input: FilterWithCategory + config: + path: output + +# Expected: +# Row(transaction_id='T0012', product_name='Headphones', category='Electronics', price=59.99) +# Row(transaction_id='T0104', product_name='Headphones', category='Electronics', price=59.99) +# Row(transaction_id='T0302', product_name='Monitor', category='Electronics', price=249.99) diff --git a/sdks/python/apache_beam/yaml/examples/simple_filter_and_combine.yaml b/sdks/python/apache_beam/yaml/examples/simple_filter_and_combine.yaml new file mode 100644 index 000000000000..f39d03e6211d --- /dev/null +++ b/sdks/python/apache_beam/yaml/examples/simple_filter_and_combine.yaml @@ -0,0 +1,64 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This example reads from a public file stored on Google Cloud. This +# requires authenticating with Google Cloud, or setting the file in +#`ReadFromText` to a local file. +# +# To set up Application Default Credentials, +# see https://cloud.google.com/docs/authentication/external/set-up-adc for more +# information +# +# The following example reads mock transaction data from resources/products.csv, +# performs a simple filter for "Electronics", then calculates the revenue and +# number of products sold for each product type.
+pipeline: + transforms: + - type: ReadFromCsv + name: ReadInputFile + config: + path: gs://apache-beam-samples/beam-yaml-blog/products.csv + - type: Filter + name: FilterWithCategory + input: ReadInputFile + config: + language: python + keep: category == "Electronics" + - type: Combine + name: CountNumberSold + input: FilterWithCategory + config: + group_by: product_name + combine: + num_sold: + value: product_name + fn: count + total_revenue: + value: price + fn: sum + - type: WriteToCsv + name: WriteOutputFile + input: CountNumberSold + config: + path: output + +options: + yaml_experimental_features: Combine + +# Expected: +# Row(product_name='Headphones', num_sold=2, total_revenue=119.98) +# Row(product_name='Monitor', num_sold=1, total_revenue=249.99) diff --git a/sdks/python/apache_beam/yaml/examples/__init__.py b/sdks/python/apache_beam/yaml/examples/testing/__init__.py similarity index 100% rename from sdks/python/apache_beam/yaml/examples/__init__.py rename to sdks/python/apache_beam/yaml/examples/testing/__init__.py diff --git a/sdks/python/apache_beam/yaml/examples/examples_test.py b/sdks/python/apache_beam/yaml/examples/testing/examples_test.py similarity index 63% rename from sdks/python/apache_beam/yaml/examples/examples_test.py rename to sdks/python/apache_beam/yaml/examples/testing/examples_test.py index e084f710a7cb..6c8efac980aa 100644 --- a/sdks/python/apache_beam/yaml/examples/examples_test.py +++ b/sdks/python/apache_beam/yaml/examples/testing/examples_test.py @@ -24,7 +24,6 @@ from typing import Callable from typing import Dict from typing import List -from typing import Optional from typing import Union from unittest import mock @@ -49,9 +48,20 @@ def _check_inner(actual: PCollection[str]): return _check_inner +def products_csv(): + return '\n'.join([ + 'transaction_id,product_name,category,price', + 'T0012,Headphones,Electronics,59.99', + 'T5034,Leather Jacket,Apparel,109.99', + 'T0024,Aluminum Mug,Kitchen,29.99', + 'T0104,Headphones,Electronics,59.99', + 'T0302,Monitor,Electronics,249.99' + ]) + + def create_test_method( pipeline_spec_file: str, - custom_preprocessor: Optional[Callable[..., Union[Dict, List]]] = None): + custom_preprocessors: List[Callable[..., Union[Dict, List]]]): @mock.patch('apache_beam.Pipeline', TestPipeline) def test_yaml_example(self): with open(pipeline_spec_file, encoding="utf-8") as f: @@ -68,20 +78,22 @@ def test_yaml_example(self): ''.join(lines), Loader=yaml_transform.SafeLineLoader) with TestEnvironment() as env: - if custom_preprocessor: - pipeline_spec = custom_preprocessor(pipeline_spec, expected, env) + for fn in custom_preprocessors: + pipeline_spec = fn(pipeline_spec, expected, env) with beam.Pipeline(options=PipelineOptions( pickle_library='cloudpickle', **yaml_transform.SafeLineLoader.strip_metadata(pipeline_spec.get( 'options', {})))) as p: actual = yaml_transform.expand_pipeline(p, pipeline_spec) + if not actual: + actual = p.transforms_stack[0].parts[-1].outputs[None] check_output(expected)(actual) return test_yaml_example class YamlExamplesTestSuite: - _test_preprocessor: Dict[str, Callable[..., Union[Dict, List]]] = {} + _test_preprocessor: Dict[str, List[Callable[..., Union[Dict, List]]]] = {} def __init__(self, name: str, path: str): self._test_suite = self.create_test_suite(name, path) @@ -96,17 +108,23 @@ def parse_test_methods(cls, path: str): files = [path] for file in files: test_name = f'test_{file.split(os.sep)[-1].replace(".", "_")}' - custom_preprocessor = cls._test_preprocessor.get(test_name, None) - yield 
test_name, create_test_method(file, custom_preprocessor) + custom_preprocessors = cls._test_preprocessor.get(test_name, []) + yield test_name, create_test_method(file, custom_preprocessors) @classmethod def create_test_suite(cls, name: str, path: str): return type(name, (unittest.TestCase, ), dict(cls.parse_test_methods(path))) @classmethod - def register_test_preprocessor(cls, test_name: str): + def register_test_preprocessor(cls, test_names: Union[str, List]): + if isinstance(test_names, str): + test_names = [test_names] + def apply(preprocessor): - cls._test_preprocessor[test_name] = preprocessor + for test_name in test_names: + if test_name not in cls._test_preprocessor: + cls._test_preprocessor[test_name] = [] + cls._test_preprocessor[test_name].append(preprocessor) return preprocessor return apply @@ -114,7 +132,7 @@ def apply(preprocessor): @YamlExamplesTestSuite.register_test_preprocessor('test_wordcount_minimal_yaml') def _wordcount_test_preprocessor( - test_spec: str, expected: List[str], env: TestEnvironment): + test_spec: dict, expected: List[str], env: TestEnvironment): all_words = [] for element in expected: word = element.split('=')[1].split(',')[0].replace("'", '') @@ -137,17 +155,55 @@ def _wordcount_test_preprocessor( env.input_file('kinglear.txt', '\n'.join(lines))) +@YamlExamplesTestSuite.register_test_preprocessor( + ['test_simple_filter_yaml', 'test_simple_filter_and_combine_yaml']) +def _file_io_write_test_preprocessor( + test_spec: dict, expected: List[str], env: TestEnvironment): + + if pipeline := test_spec.get('pipeline', None): + for transform in pipeline.get('transforms', []): + if transform.get('type', '').startswith('WriteTo'): + transform['type'] = 'LogForTesting' + transform['config'] = { + k: v + for k, + v in transform.get('config', {}).items() if k.startswith('__') + } + + return test_spec + + +@YamlExamplesTestSuite.register_test_preprocessor( + ['test_simple_filter_yaml', 'test_simple_filter_and_combine_yaml']) +def _file_io_read_test_preprocessor( + test_spec: dict, expected: List[str], env: TestEnvironment): + + if pipeline := test_spec.get('pipeline', None): + for transform in pipeline.get('transforms', []): + if transform.get('type', '').startswith('ReadFrom'): + file_name = transform['config']['path'].split('/')[-1] + return replace_recursive( + test_spec, + transform['type'], + 'path', + env.input_file(file_name, INPUT_FILES[file_name])) + + return test_spec + + +INPUT_FILES = {'products.csv': products_csv()} + YAML_DOCS_DIR = os.path.join(os.path.dirname(__file__)) ExamplesTest = YamlExamplesTestSuite( - 'ExamplesTest', os.path.join(YAML_DOCS_DIR, '*.yaml')).run() + 'ExamplesTest', os.path.join(YAML_DOCS_DIR, '../*.yaml')).run() ElementWiseTest = YamlExamplesTestSuite( 'ElementwiseExamplesTest', - os.path.join(YAML_DOCS_DIR, 'transforms/elementwise/*.yaml')).run() + os.path.join(YAML_DOCS_DIR, '../transforms/elementwise/*.yaml')).run() AggregationTest = YamlExamplesTestSuite( 'AggregationExamplesTest', - os.path.join(YAML_DOCS_DIR, 'transforms/aggregation/*.yaml')).run() + os.path.join(YAML_DOCS_DIR, '../transforms/aggregation/*.yaml')).run() if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_count_minimal.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_count_minimal.yaml index b746e55e3448..17efcc3e6eee 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_count_minimal.yaml +++ 
b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_count_minimal.yaml @@ -52,9 +52,6 @@ pipeline: produce: count - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(season='spring', produce=4) # Row(season='summer', produce=3) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_max_minimal.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_max_minimal.yaml index 2588897a7179..2faadf9ac64a 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_max_minimal.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_max_minimal.yaml @@ -44,9 +44,6 @@ pipeline: amount: max - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(produce='🥕', amount=3) # Row(produce='🍆', amount=1) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_mean_minimal.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_mean_minimal.yaml index b00b7e817abe..a51e48b25974 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_mean_minimal.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_mean_minimal.yaml @@ -44,9 +44,6 @@ pipeline: amount: mean - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(produce='🥕', amount=2.5) # Row(produce='🍆', amount=1.0) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_min_minimal.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_min_minimal.yaml index 1e05fd3f755f..5fcadd1b2c3b 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_min_minimal.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_min_minimal.yaml @@ -44,9 +44,6 @@ pipeline: amount: min - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(produce='🥕', amount=2) # Row(produce='🍆', amount=1) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_multiple_aggregations.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_multiple_aggregations.yaml index 1263ac21fac4..0597d403de64 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_multiple_aggregations.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_multiple_aggregations.yaml @@ -76,8 +76,5 @@ pipeline: max_price: max_price - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(min_price=1.0, mean_price=2.5, max_price=4.0) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum.yaml index 77cd1529b580..2308cee96061 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum.yaml @@ -57,9 +57,6 @@ pipeline: fn: mean - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(fruit='raspberry', total_quantity=1, mean_price=3.5) # Row(fruit='blackberry', total_quantity=1, mean_price=4.0) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum_minimal.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum_minimal.yaml index 905a30d13f08..d2f1887b5819 100644 
--- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum_minimal.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/combine_sum_minimal.yaml @@ -44,9 +44,6 @@ pipeline: amount: sum - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(produce='🥕', amount=5) # Row(produce='🍆', amount=1) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/group_into_batches.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/group_into_batches.yaml index a956ea604c65..e107a6cabadf 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/group_into_batches.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/group_into_batches.yaml @@ -57,9 +57,6 @@ pipeline: n: 3 - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(season='spring', produce=['🥕', '🍓', '🍆']) # Row(season='summer', produce=['🥕', '🍅', '🌽']) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_largest_per_key.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_largest_per_key.yaml index c35a5da1bd4f..d283b1f40860 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_largest_per_key.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_largest_per_key.yaml @@ -48,9 +48,6 @@ pipeline: n: 2 - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(produce='🥕', biggest=[3, 2]) # Row(produce='🍆', biggest=[1]) diff --git a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_smallest_per_key.yaml b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_smallest_per_key.yaml index ebd7c0c34fe1..bbf927492df3 100644 --- a/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_smallest_per_key.yaml +++ b/sdks/python/apache_beam/yaml/examples/transforms/aggregation/top_smallest_per_key.yaml @@ -50,9 +50,6 @@ pipeline: reverse: true - type: LogForTesting -options: - yaml_experimental_features: Combine - # Expected: # Row(produce='🥕', smallest=[2, 3]) # Row(produce='🍆', smallest=[1]) diff --git a/sdks/python/apache_beam/yaml/examples/wordcount_minimal.yaml b/sdks/python/apache_beam/yaml/examples/wordcount_minimal.yaml index 044db2790acc..dd8f9049b489 100644 --- a/sdks/python/apache_beam/yaml/examples/wordcount_minimal.yaml +++ b/sdks/python/apache_beam/yaml/examples/wordcount_minimal.yaml @@ -16,7 +16,7 @@ # limitations under the License. # -# This examples reads from a public file stores on Google Cloud. This +# This examples reads from a public file stored on Google Cloud. This # requires authenticating with Google Cloud, or setting the file in #`ReadFromText` to a local file. 
# @@ -70,9 +70,6 @@ pipeline: # Log out results - type: LogForTesting - -options: - yaml_experimental_features: Combine # Expected: # Row(word='king', count=311) diff --git a/sdks/python/apache_beam/yaml/programming_guide_test.py b/sdks/python/apache_beam/yaml/programming_guide_test.py index fe5e242f7f5b..2d62213e2869 100644 --- a/sdks/python/apache_beam/yaml/programming_guide_test.py +++ b/sdks/python/apache_beam/yaml/programming_guide_test.py @@ -65,8 +65,7 @@ class ProgrammingGuideTest(unittest.TestCase): def test_group_by(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -87,8 +86,7 @@ def test_group_by(self): def test_co_group_by(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: result = p | YamlTransform( ''' type: composite @@ -168,8 +166,7 @@ def test_co_group_by(self): def test_combine_ref(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -196,8 +193,7 @@ def test_combine_ref(self): def test_combine_globally(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -217,8 +213,7 @@ def test_combine_globally(self): def test_combine_per_key(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' diff --git a/sdks/python/apache_beam/yaml/readme_test.py b/sdks/python/apache_beam/yaml/readme_test.py index ea7a015dab5d..592ef03ce33d 100644 --- a/sdks/python/apache_beam/yaml/readme_test.py +++ b/sdks/python/apache_beam/yaml/readme_test.py @@ -226,10 +226,7 @@ def test(self): if write in test_yaml: spec = replace_recursive(spec, write, 'path', env.output_file()) modified_yaml = yaml.dump(spec) - options = { - 'pickle_library': 'cloudpickle', - 'yaml_experimental_features': ['Combine'] - } + options = {'pickle_library': 'cloudpickle'} if RENDER_DIR is not None: options['runner'] = 'apache_beam.runners.render.RenderRunner' options['render_output'] = [ diff --git a/sdks/python/apache_beam/yaml/yaml_combine.py b/sdks/python/apache_beam/yaml/yaml_combine.py index bb9d0964f912..bd43bac1a65a 100644 --- a/sdks/python/apache_beam/yaml/yaml_combine.py +++ b/sdks/python/apache_beam/yaml/yaml_combine.py @@ -29,7 +29,6 @@ from apache_beam.typehints.decorators import get_type_hints from apache_beam.typehints.schemas import named_fields_from_element_type from apache_beam.utils import python_callable -from apache_beam.yaml import options from apache_beam.yaml import yaml_mapping from apache_beam.yaml import yaml_provider @@ -106,7 +105,6 @@ def __init__( self._language = language def expand(self, pcoll): - options.YamlOptions.check_enabled(pcoll.pipeline, 'Combine') input_types = 
dict(named_fields_from_element_type(pcoll.element_type)) all_fields = list(input_types.keys()) unknown_keys = set(self._group_by) - set(all_fields) @@ -178,7 +176,6 @@ def extract_return_type(expr): @beam.ptransform.ptransform_fn def _SqlCombineTransform( pcoll, sql_transform_constructor, group_by, combine, language=None): - options.YamlOptions.check_enabled(pcoll.pipeline, 'Combine') all_fields = [ x for x, _ in named_fields_from_element_type(pcoll.element_type) ] diff --git a/sdks/python/apache_beam/yaml/yaml_combine_test.py b/sdks/python/apache_beam/yaml/yaml_combine_test.py index 615b697e77b0..caf3de10078b 100644 --- a/sdks/python/apache_beam/yaml/yaml_combine_test.py +++ b/sdks/python/apache_beam/yaml/yaml_combine_test.py @@ -34,8 +34,7 @@ class YamlCombineTest(unittest.TestCase): def test_multiple_aggregations(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -55,8 +54,7 @@ def test_multiple_aggregations(self): def test_multiple_keys(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -76,8 +74,7 @@ def test_multiple_keys(self): def test_no_keys(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -95,8 +92,7 @@ def test_no_keys(self): def test_multiple_combines(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -120,8 +116,7 @@ def test_multiple_combines(self): def test_group(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -146,8 +141,7 @@ def test_group(self): def test_expression(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' @@ -169,8 +163,7 @@ def test_expression(self): def test_config(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' diff --git a/sdks/python/apache_beam/yaml/yaml_mapping.py b/sdks/python/apache_beam/yaml/yaml_mapping.py index 4839728dd886..32095fe39f2a 100644 --- a/sdks/python/apache_beam/yaml/yaml_mapping.py +++ b/sdks/python/apache_beam/yaml/yaml_mapping.py @@ -31,11 +31,6 @@ from typing import TypeVar from typing import Union -import js2py -from js2py import base -from js2py.constructors import jsdate -from 
js2py.internals import simplex - import apache_beam as beam from apache_beam.io.filesystems import FileSystems from apache_beam.portability.api import schema_pb2 @@ -52,6 +47,14 @@ from apache_beam.yaml import yaml_provider from apache_beam.yaml.yaml_provider import dicts_to_rows +# Import js2py package if it exists +try: + import js2py + from js2py.base import JsObjectWrapper +except ImportError: + js2py = None + JsObjectWrapper = object + def normalize_mapping(spec): """ @@ -87,7 +90,7 @@ def _check_mapping_arguments( # js2py's JsObjectWrapper object has a self-referencing __dict__ property # that cannot be pickled without implementing the __getstate__ and # __setstate__ methods. -class _CustomJsObjectWrapper(js2py.base.JsObjectWrapper): +class _CustomJsObjectWrapper(JsObjectWrapper): def __init__(self, js_obj): super().__init__(js_obj.__dict__['_obj']) @@ -116,6 +119,17 @@ def py_value_to_js_dict(py_value): def _expand_javascript_mapping_func( original_fields, expression=None, callable=None, path=None, name=None): + # Check for installed js2py package + if js2py is None: + raise ValueError( + "Javascript mapping functions are not supported on" + " Python 3.12 or later.") + + # import remaining js2py objects + from js2py import base + from js2py.constructors import jsdate + from js2py.internals import simplex + js_array_type = ( base.PyJsArray, base.PyJsArrayBuffer, diff --git a/sdks/python/apache_beam/yaml/yaml_transform.py b/sdks/python/apache_beam/yaml/yaml_transform.py index debf6a63af26..fd265c42cf73 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform.py +++ b/sdks/python/apache_beam/yaml/yaml_transform.py @@ -920,7 +920,7 @@ def ensure_transforms_have_providers(spec): f'for type {spec["type"]} for {identify_object(spec)}') return spec - def preprocess_langauges(spec): + def preprocess_languages(spec): if spec['type'] in ('AssignTimestamps', 'Combine', 'Filter', @@ -942,7 +942,7 @@ def preprocess_langauges(spec): ensure_transforms_have_types, normalize_mapping, normalize_combine, - preprocess_langauges, + preprocess_languages, ensure_transforms_have_providers, preprocess_source_sink, preprocess_chain, diff --git a/sdks/python/apache_beam/yaml/yaml_transform_test.py b/sdks/python/apache_beam/yaml/yaml_transform_test.py index 9641df0896f5..fbdae6679e96 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform_test.py +++ b/sdks/python/apache_beam/yaml/yaml_transform_test.py @@ -592,8 +592,7 @@ def test_windowing_on_outer(self): def test_assign_timestamps(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle', yaml_experimental_features=['Combine' - ])) as p: + pickle_library='cloudpickle')) as p: result = p | YamlTransform( ''' type: chain diff --git a/sdks/python/apache_beam/yaml/yaml_udf_test.py b/sdks/python/apache_beam/yaml/yaml_udf_test.py index 5f5ee1147ded..c26d8ec92dd5 100644 --- a/sdks/python/apache_beam/yaml/yaml_udf_test.py +++ b/sdks/python/apache_beam/yaml/yaml_udf_test.py @@ -29,6 +29,12 @@ from apache_beam.yaml.yaml_provider import dicts_to_rows from apache_beam.yaml.yaml_transform import YamlTransform +try: + import js2py +except ImportError: + js2py = None + logging.warning('js2py is not installed; some tests will be skipped.') + def AsRows(): return beam.Map( @@ -55,6 +61,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.tmpdir) + @unittest.skipIf(js2py is None, 'js2py not installed.') def test_map_to_fields_filter_inline_js(self): with 
beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( pickle_library='cloudpickle', yaml_experimental_features=['javascript' @@ -125,6 +132,7 @@ def test_map_to_fields_filter_inline_py(self): beam.Row(label='389ax', conductor=390, sum=24), ])) + @unittest.skipIf(js2py is None, 'js2py not installed.') def test_filter_inline_js(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( pickle_library='cloudpickle', yaml_experimental_features=['javascript' @@ -179,6 +187,7 @@ def test_filter_inline_py(self): row=beam.Row(rank=2, values=[7, 8, 9])), ])) + @unittest.skipIf(js2py is None, 'js2py not installed.') def test_filter_expression_js(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( pickle_library='cloudpickle', yaml_experimental_features=['javascript' @@ -222,6 +231,7 @@ def test_filter_expression_py(self): row=beam.Row(rank=0, values=[1, 2, 3])), ])) + @unittest.skipIf(js2py is None, 'js2py not installed.') def test_filter_inline_js_file(self): data = ''' function f(x) { diff --git a/sdks/python/build.gradle b/sdks/python/build.gradle index 37b80519b7c4..713acea0f41d 100644 --- a/sdks/python/build.gradle +++ b/sdks/python/build.gradle @@ -107,6 +107,25 @@ tasks.register("generateYamlDocs") { outputs.file "${buildDir}/yaml-ref.html" } +tasks.register("yamlIntegrationTests") { + description "Runs integration tests for yaml pipelines." + + dependsOn installGcpTest + // Need to build all expansion services referenced in apache_beam/yaml/*.* + // grep -oh 'sdk.*Jar' sdks/python/apache_beam/yaml/*.yaml | sort | uniq + dependsOn ":sdks:java:extensions:schemaio-expansion-service:shadowJar" + dependsOn ":sdks:java:extensions:sql:expansion-service:shadowJar" + dependsOn ":sdks:java:io:expansion-service:build" + dependsOn ":sdks:java:io:google-cloud-platform:expansion-service:build" + + doLast { + exec { + executable 'sh' + args '-c', "${envdir}/bin/pytest -v apache_beam/yaml/integration_tests.py" + } + } +} + // Create Python wheels for given platform and Python version // build identifiers for cibuildwheel def platform_identifiers_map = [ diff --git a/sdks/python/container/base_image_requirements_manual.txt b/sdks/python/container/base_image_requirements_manual.txt index dd18da1c63f5..5bc60c474b4f 100644 --- a/sdks/python/container/base_image_requirements_manual.txt +++ b/sdks/python/container/base_image_requirements_manual.txt @@ -37,9 +37,7 @@ guppy3 mmh3 # Optimizes execution of some Beam codepaths. TODO: Make it Beam's dependency. nltk # Commonly used for natural language processing. nose==1.3.7 # For Dataflow internal testing. TODO: remove this. -# TODO: Uncomment python version once python-snappy supports it. -# https://github.com/apache/beam/issues/25985 -python-snappy;python_version<"3.11" # Optimizes execution of some Beam codepaths. +python-snappy # Optimizes execution of some Beam codepaths. scipy scikit-learn build>=1.0,<2 # tool to build sdist from setup.py in stager. \ No newline at end of file diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index 06fd375a8315..57cd7fcb68fc 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -21,11 +21,12 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
+annotated-types==0.6.0 async-timeout==4.0.3 attrs==23.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 -build==1.1.1 +build==1.2.1 cachetools==5.3.3 certifi==2024.2.2 cffi==1.16.0 @@ -44,18 +45,18 @@ docker==7.0.0 docopt==0.6.2 docstring_parser==0.16 exceptiongroup==1.2.0 -execnet==2.0.2 +execnet==2.1.1 fastavro==1.9.4 fasteners==0.19 freezegun==1.4.0 future==1.0.0 google-api-core==2.18.0 -google-api-python-client==2.123.0 +google-api-python-client==2.125.0 google-apitools==0.5.31 google-auth==2.29.0 -google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.44.0 -google-cloud-bigquery==3.19.0 +google-auth-httplib2==0.2.0 +google-cloud-aiplatform==1.47.0 +google-cloud-bigquery==3.20.1 google-cloud-bigquery-storage==2.24.0 google-cloud-bigtable==2.23.0 google-cloud-core==2.4.1 @@ -63,7 +64,7 @@ google-cloud-datastore==2.19.0 google-cloud-dlp==3.16.0 google-cloud-language==2.13.3 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.20.3 +google-cloud-pubsub==2.21.1 google-cloud-pubsublite==1.9.0 google-cloud-recommendations-ai==0.10.10 google-cloud-resource-manager==1.12.3 @@ -82,10 +83,10 @@ grpcio-status==1.62.1 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.99.13 +hypothesis==6.100.1 idna==3.6 iniconfig==2.0.0 -joblib==1.3.2 +joblib==1.4.0 Js2Py==0.74 jsonpickle==3.0.3 jsonschema==4.21.1 @@ -97,7 +98,7 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.9.15 +orjson==3.10.0 overrides==7.7.0 packaging==24.0 pandas==2.0.3 @@ -108,13 +109,15 @@ protobuf==4.25.3 psycopg2-binary==2.9.9 pyarrow==14.0.2 pyarrow-hotfix==0.6 -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pycparser==2.21 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 +pydantic==2.6.4 +pydantic_core==2.16.3 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.2 +pymongo==4.6.3 PyMySQL==1.1.0 pyparsing==3.1.2 pyproject_hooks==1.0.0 @@ -129,23 +132,23 @@ redis==5.0.3 referencing==0.34.0 regex==2023.12.25 requests==2.31.0 -requests-mock==1.11.0 +requests-mock==1.12.1 rpds-py==0.18.0 rsa==4.9 -scikit-learn==1.4.1.post1 -scipy==1.12.0 +scikit-learn==1.4.2 +scipy==1.13.0 shapely==2.0.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==1.4.52 +SQLAlchemy==2.0.29 sqlparse==0.4.4 tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.4.0 tomli==2.0.1 tqdm==4.66.2 -typing_extensions==4.10.0 +typing_extensions==4.11.0 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 71b3c48a35ba..faf93fe59e07 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -21,16 +21,18 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
+annotated-types==0.6.0 attrs==23.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 -build==1.1.1 +build==1.2.1 cachetools==5.3.3 certifi==2024.2.2 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 +cramjam==2.8.3 crcmod==1.7 cryptography==42.0.5 Cython==0.29.37 @@ -41,18 +43,18 @@ dnspython==2.6.1 docker==7.0.0 docopt==0.6.2 docstring_parser==0.16 -execnet==2.0.2 +execnet==2.1.1 fastavro==1.9.4 fasteners==0.19 freezegun==1.4.0 future==1.0.0 google-api-core==2.18.0 -google-api-python-client==2.123.0 +google-api-python-client==2.125.0 google-apitools==0.5.31 google-auth==2.29.0 -google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.44.0 -google-cloud-bigquery==3.19.0 +google-auth-httplib2==0.2.0 +google-cloud-aiplatform==1.47.0 +google-cloud-bigquery==3.20.1 google-cloud-bigquery-storage==2.24.0 google-cloud-bigtable==2.23.0 google-cloud-core==2.4.1 @@ -60,7 +62,7 @@ google-cloud-datastore==2.19.0 google-cloud-dlp==3.16.0 google-cloud-language==2.13.3 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.20.3 +google-cloud-pubsub==2.21.1 google-cloud-pubsublite==1.9.0 google-cloud-recommendations-ai==0.10.10 google-cloud-resource-manager==1.12.3 @@ -79,10 +81,10 @@ grpcio-status==1.62.1 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.99.13 +hypothesis==6.100.1 idna==3.6 iniconfig==2.0.0 -joblib==1.3.2 +joblib==1.4.0 Js2Py==0.74 jsonpickle==3.0.3 jsonschema==4.21.1 @@ -94,7 +96,7 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.9.15 +orjson==3.10.0 overrides==7.7.0 packaging==24.0 pandas==2.0.3 @@ -105,13 +107,15 @@ protobuf==4.25.3 psycopg2-binary==2.9.9 pyarrow==14.0.2 pyarrow-hotfix==0.6 -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pycparser==2.21 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 +pydantic==2.6.4 +pydantic_core==2.16.3 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.2 +pymongo==4.6.3 PyMySQL==1.1.0 pyparsing==3.1.2 pyproject_hooks==1.0.0 @@ -119,28 +123,29 @@ pytest==7.4.4 pytest-timeout==2.3.1 pytest-xdist==3.5.0 python-dateutil==2.9.0.post0 +python-snappy==0.7.1 pytz==2024.1 PyYAML==6.0.1 redis==5.0.3 referencing==0.34.0 regex==2023.12.25 requests==2.31.0 -requests-mock==1.11.0 +requests-mock==1.12.1 rpds-py==0.18.0 rsa==4.9 -scikit-learn==1.4.1.post1 -scipy==1.12.0 +scikit-learn==1.4.2 +scipy==1.13.0 shapely==2.0.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==1.4.52 +SQLAlchemy==2.0.29 sqlparse==0.4.4 tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.4.0 tqdm==4.66.2 -typing_extensions==4.10.0 +typing_extensions==4.11.0 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index 887aa8a2531b..a2484f356c73 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -21,12 +21,13 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
+annotated-types==0.6.0 async-timeout==4.0.3 attrs==23.2.0 backports.zoneinfo==0.2.1 beautifulsoup4==4.12.3 bs4==0.0.2 -build==1.1.1 +build==1.2.1 cachetools==5.3.3 certifi==2024.2.2 cffi==1.16.0 @@ -45,18 +46,18 @@ docker==7.0.0 docopt==0.6.2 docstring_parser==0.16 exceptiongroup==1.2.0 -execnet==2.0.2 +execnet==2.1.1 fastavro==1.9.4 fasteners==0.19 freezegun==1.4.0 future==1.0.0 google-api-core==2.18.0 -google-api-python-client==2.123.0 +google-api-python-client==2.125.0 google-apitools==0.5.31 google-auth==2.29.0 -google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.44.0 -google-cloud-bigquery==3.19.0 +google-auth-httplib2==0.2.0 +google-cloud-aiplatform==1.47.0 +google-cloud-bigquery==3.20.1 google-cloud-bigquery-storage==2.24.0 google-cloud-bigtable==2.23.0 google-cloud-core==2.4.1 @@ -64,7 +65,7 @@ google-cloud-datastore==2.19.0 google-cloud-dlp==3.16.0 google-cloud-language==2.13.3 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.20.3 +google-cloud-pubsub==2.21.1 google-cloud-pubsublite==1.9.0 google-cloud-recommendations-ai==0.10.10 google-cloud-resource-manager==1.12.3 @@ -83,12 +84,12 @@ grpcio-status==1.62.1 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.99.13 +hypothesis==6.100.1 idna==3.6 importlib_metadata==7.1.0 importlib_resources==6.4.0 iniconfig==2.0.0 -joblib==1.3.2 +joblib==1.4.0 Js2Py==0.74 jsonpickle==3.0.3 jsonschema==4.21.1 @@ -100,7 +101,7 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.9.15 +orjson==3.10.0 overrides==7.7.0 packaging==24.0 pandas==2.0.3 @@ -112,13 +113,15 @@ protobuf==4.25.3 psycopg2-binary==2.9.9 pyarrow==14.0.2 pyarrow-hotfix==0.6 -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pycparser==2.21 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 +pydantic==2.6.4 +pydantic_core==2.16.3 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.2 +pymongo==4.6.3 PyMySQL==1.1.0 pyparsing==3.1.2 pyproject_hooks==1.0.0 @@ -133,7 +136,7 @@ redis==5.0.3 referencing==0.34.0 regex==2023.12.25 requests==2.31.0 -requests-mock==1.11.0 +requests-mock==1.12.1 rpds-py==0.18.0 rsa==4.9 scikit-learn==1.3.2 @@ -142,14 +145,14 @@ shapely==2.0.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==1.4.52 +SQLAlchemy==2.0.29 sqlparse==0.4.4 tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.4.0 tomli==2.0.1 tqdm==4.66.2 -typing_extensions==4.10.0 +typing_extensions==4.11.0 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 9d8009c0048f..876c6e6bdf69 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -21,11 +21,12 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
+annotated-types==0.6.0 async-timeout==4.0.3 attrs==23.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 -build==1.1.1 +build==1.2.1 cachetools==5.3.3 certifi==2024.2.2 cffi==1.16.0 @@ -44,18 +45,18 @@ docker==7.0.0 docopt==0.6.2 docstring_parser==0.16 exceptiongroup==1.2.0 -execnet==2.0.2 +execnet==2.1.1 fastavro==1.9.4 fasteners==0.19 freezegun==1.4.0 future==1.0.0 google-api-core==2.18.0 -google-api-python-client==2.123.0 +google-api-python-client==2.125.0 google-apitools==0.5.31 google-auth==2.29.0 -google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.44.0 -google-cloud-bigquery==3.19.0 +google-auth-httplib2==0.2.0 +google-cloud-aiplatform==1.47.0 +google-cloud-bigquery==3.20.1 google-cloud-bigquery-storage==2.24.0 google-cloud-bigtable==2.23.0 google-cloud-core==2.4.1 @@ -63,7 +64,7 @@ google-cloud-datastore==2.19.0 google-cloud-dlp==3.16.0 google-cloud-language==2.13.3 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.20.3 +google-cloud-pubsub==2.21.1 google-cloud-pubsublite==1.9.0 google-cloud-recommendations-ai==0.10.10 google-cloud-resource-manager==1.12.3 @@ -82,11 +83,11 @@ grpcio-status==1.62.1 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.99.13 +hypothesis==6.100.1 idna==3.6 importlib_metadata==7.1.0 iniconfig==2.0.0 -joblib==1.3.2 +joblib==1.4.0 Js2Py==0.74 jsonpickle==3.0.3 jsonschema==4.21.1 @@ -98,7 +99,7 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.9.15 +orjson==3.10.0 overrides==7.7.0 packaging==24.0 pandas==2.0.3 @@ -109,13 +110,15 @@ protobuf==4.25.3 psycopg2-binary==2.9.9 pyarrow==14.0.2 pyarrow-hotfix==0.6 -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pycparser==2.21 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser==2.22 +pydantic==2.6.4 +pydantic_core==2.16.3 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.2 +pymongo==4.6.3 PyMySQL==1.1.0 pyparsing==3.1.2 pyproject_hooks==1.0.0 @@ -130,23 +133,23 @@ redis==5.0.3 referencing==0.34.0 regex==2023.12.25 requests==2.31.0 -requests-mock==1.11.0 +requests-mock==1.12.1 rpds-py==0.18.0 rsa==4.9 -scikit-learn==1.4.1.post1 -scipy==1.12.0 +scikit-learn==1.4.2 +scipy==1.13.0 shapely==2.0.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==1.4.52 +SQLAlchemy==2.0.29 sqlparse==0.4.4 tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.4.0 tomli==2.0.1 tqdm==4.66.2 -typing_extensions==4.10.0 +typing_extensions==4.11.0 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 2975f16d40d8..36ecedae543c 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -368,7 +368,8 @@ def get_portability_package_data(): 'grpcio>=1.33.1,!=1.48.0,<2', 'hdfs>=2.1.0,<3.0.0', 'httplib2>=0.8,<0.23.0', - 'js2py>=0.74,<1', + # https://github.com/PiotrDabkowski/Js2Py/issues/317 + 'js2py>=0.74,<1; python_version<"3.12"', 'jsonschema>=4.0.0,<5.0.0', 'jsonpickle>=3.0.0,<4.0.0', # numpy can have breaking changes in minor versions. 
@@ -427,7 +428,7 @@ def get_portability_package_data(): 'pytest-xdist>=2.5.0,<4', 'pytest-timeout>=2.1.0,<3', 'scikit-learn>=0.20.0', - 'sqlalchemy>=1.3,<2.0', + 'sqlalchemy>=1.3,<3.0', 'psycopg2-binary>=2.8.5,<3.0.0', 'testcontainers[mysql]>=3.0.3,<4.0.0', 'cryptography>=41.0.2', diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 153b6acde8e7..91d6b6d7300f 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -447,6 +447,8 @@ commands = [testenv:py{38,39,310,311}-TFHubEmbeddings-{014,015}] deps = 014: tensorflow-hub>=0.14.0,<0.15.0 + # Help pip resolve conflict with typing-extensions due to an old version of tensorboard https://github.com/apache/beam/issues/30852 + 014: pydantic<2.7 015: tensorflow-hub>=0.15.0,<0.16.0 # Help pip resolve conflict with typing-extensions due to an old version of tensorboard https://github.com/apache/beam/issues/30852 015: pydantic<2.7 diff --git a/website/www/site/content/en/blog/beam-yaml-release.md b/website/www/site/content/en/blog/beam-yaml-release.md new file mode 100644 index 000000000000..f1082e0eac7e --- /dev/null +++ b/website/www/site/content/en/blog/beam-yaml-release.md @@ -0,0 +1,205 @@ +--- +title: "Introducing Beam YAML: Apache Beam's First No-code SDK" +date: 2024-04-11 10:00:00 -0400 +categories: + - blog +authors: + - jkinard + +--- + + +Writing a Beam pipeline can be a daunting task. Learning the Beam model, downloading dependencies for the SDK language +of choice, debugging the pipeline, and maintaining the pipeline code is a lot of overhead for users who want to write a +simple to intermediate data processing pipeline. There have been strides in making the SDK's entry points easier, but +for many, it is still a long way from being a painless process. + +To address some of these issues and simplify the entry point to Beam, we have introduced a new way to specify Beam +pipelines by using configuration files rather than code. This new SDK, known as +[Beam YAML](https://beam.apache.org/documentation/sdks/yaml/), employs a declarative approach to creating +data processing pipelines using [YAML](https://yaml.org/), a widely used data serialization language. + + + +# Benefits of using Beam YAML + +The primary goal of Beam YAML is to make the entry point to Beam as welcoming as possible. However, this should not +come at the expense of sacrificing the rich features that Beam offers. + +Here are some of the benefits of using Beam YAML: + +* **No-code development:** Allows users to develop pipelines without writing any code. This makes it easier to get + started with Beam and to develop pipelines quickly and easily. +* **Maintainability**: Configuration-based pipelines are easier to maintain than code-based pipelines. YAML format + enables clear separation of concerns, simplifying changes and updates without affecting other code sections. +* **Declarative language:** Provides a declarative language, which means that it is based on the description of the + desired outcome rather than expressing the intent through code. This makes it easy to understand the structure and + flow of a pipeline. The YAML syntax is also widely used with a rich community of resources for learning and + leveraging the YAML syntax. +* **Powerful features:** Supports a wide range of features, including a variety of data sources and sinks, turn-key + transforms, and execution parameters. This makes it possible to develop complex data processing pipelines with Beam + YAML. 
+* **Reusability**: Beam YAML promotes code reuse by providing a way to define and share common pipeline patterns. You + can create reusable YAML snippets or blocks that can be easily shared and reused in different pipelines. This reduces + the need to write repetitive tasks and helps maintain consistency across pipelines. +* **Extensibility**: Beam YAML offers a structure for integrating custom transformations into a pipeline, enabling + organizations to contribute or leverage a pre-existing catalog of transformations that can be seamlessly accessed + using the Beam YAML syntax across multiple pipelines. It is also possible to build third-party extensions, including + custom parsers and other tools, that do not need to depend on Beam directly. +* **Backwards Compatibility**: Beam YAML is still being actively worked on, bringing exciting new features and + capabilities, but as these features are added, backwards compatibility will be preserved. This way, once a pipeline + is written, it will continue to work despite future released versions of the SDK. + +Overall, using Beam YAML provides a number of advantages. It makes pipeline development and management more efficient +and effective, enabling users to focus on the business logic and data processing tasks, rather than spending time on +low-level coding details. + + +# Case Study: A simple business analytics use-case + +Let's take the following sample transaction data for a department store: + +| transaction_id | product_name | category | price | +|:----------------|:----------------|:-------------|:--------| +| T0012 | Headphones | Electronics | 59.99 | +| T5034 | Leather Jacket | Apparel | 109.99 | +| T0024 | Aluminum Mug | Kitchen | 29.99 | +| T0104 | Headphones | Electronics | 59.99 | +| T0302 | Monitor | Electronics | 249.99 | + +Now, let's say that the business wants to get a record of transactions for all purchases made in the Electronics +department for audit purposes. Assuming the records are stored as a CSV file, a Beam YAML pipeline may look something +like this: + +Source code for this example can be found +[here](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/simple_filter.yaml). +```yaml +pipeline: + transforms: + - type: ReadFromCsv + name: ReadInputFile + config: + path: /path/to/input.csv + - type: Filter + name: FilterWithCategory + input: ReadInputFile + config: + language: python + keep: category == "Electronics" + - type: WriteToCsv + name: WriteOutputFile + input: FilterWithCategory + config: + path: /path/to/output +``` + +This would leave us with the following data: + +| transaction_id | product_name | category | price | +|:----------------|:--------------|:-------------|:--------| +| T0012 | Headphones | Electronics | 59.99 | +| T0104 | Headphones | Electronics | 59.99 | +| T0302 | Monitor | Electronics | 249.99 | + +Now, let's say the business wants to determine how much of each Electronics item is being sold to ensure that the +correct number is being ordered from the supplier. Let's also assume that they want to determine the total revenue for +each item. This simple aggregation can follow the Filter from the previous example as such: + +Source code for this example can be found +[here](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/simple_filter_and_combine.yaml). 
+```yaml +pipeline: + transforms: + - type: ReadFromCsv + name: ReadInputFile + config: + path: /path/to/input.csv + - type: Filter + name: FilterWithCategory + input: ReadInputFile + config: + language: python + keep: category == "Electronics" + - type: Combine + name: CountNumberSold + input: FilterWithCategory + config: + group_by: product_name + combine: + num_sold: + value: product_name + fn: count + total_revenue: + value: price + fn: sum + - type: WriteToCsv + name: WriteOutputFile + input: CountNumberSold + config: + path: /path/to/output +``` + +This would leave us with the following data: + +| product_name | num_sold | total_revenue | +|:--------------|:----------|:---------------| +| Headphones | 2 | 119.98 | +| Monitor | 1 | 249.99 | + +While this was a relatively simple use-case, it shows the power of Beam YAML and how easy it is to go from business +use-case to a prototype data pipeline in just a few lines of YAML. + + +# Getting started with Beam YAML + +There are several resources that have been compiled to help users get familiar with Beam YAML. + + +## Day Zero Notebook + + +Open In Colab + + +To help get started with Apache Beam, there is a Day Zero Notebook available on +[Google Colab](https://colab.sandbox.google.com/), an online Python notebook environment with a free attachable +runtime, containing some basic YAML pipeline examples. + + +## Documentation + +The Apache Beam website provides a set of [docs](https://beam.apache.org/documentation/sdks/yaml/) that demonstrate the +current capabilities of the Beam YAML SDK. There is also a catalog of currently-supported turnkey transforms found +[here](https://beam.apache.org/releases/yamldoc/current/). + + +## Examples + +A catalog of examples can be found +[here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples). These examples showcase +all the turnkey transforms that can be utilized in Beam YAML. There are also a number of Dataflow Cookbook examples +that can be found [here](https://github.com/GoogleCloudPlatform/dataflow-cookbook/tree/main/Python/yaml). + + +## Contributing + +Developers who wish to help build out and add functionalities are welcome to start contributing to the effort in the +Beam YAML module found [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml). + +There is also a list of open [bugs](https://github.com/apache/beam/issues?q=is%3Aopen+is%3Aissue+label%3Ayaml) found +on the GitHub repo - now marked with the 'yaml' tag. + +While Beam YAML has been marked stable as of Beam 2.52, it is still under heavy development, with new features being +added with each release. Those who wish to be part of the design decisions and give insights to how the framework is +being used are highly encouraged to join the dev mailing list as those discussions will be directed there. A link to +the dev list can be found [here](https://beam.apache.org/community/contact-us/). diff --git a/website/www/site/content/en/documentation/sdks/yaml-combine.md b/website/www/site/content/en/documentation/sdks/yaml-combine.md index b7780c689fab..62a036cc81a0 100644 --- a/website/www/site/content/en/documentation/sdks/yaml-combine.md +++ b/website/www/site/content/en/documentation/sdks/yaml-combine.md @@ -25,8 +25,6 @@ title: "Apache Beam YAML Aggregations" Beam YAML has EXPERIMENTAL ability to do aggregations to group and combine values across records. The is accomplished via the `Combine` transform type. 
-Currently `Combine` needs to be in the `yaml_experimental_features` -option to use this transform. For example, one can write diff --git a/website/www/site/content/en/documentation/transforms/python/aggregation/distinct.md b/website/www/site/content/en/documentation/transforms/python/aggregation/distinct.md index 24abe6bdd247..e6701d63baea 100644 --- a/website/www/site/content/en/documentation/transforms/python/aggregation/distinct.md +++ b/website/www/site/content/en/documentation/transforms/python/aggregation/distinct.md @@ -30,7 +30,7 @@ In the following example, we create a pipeline with two `PCollection`s of produc We use `Distinct` to get rid of duplicate elements, which outputs a `PCollection` of all the unique elements. {{< playground height="700px" >}} -{{< playground_snippet language="py" path="SDK_PYTHON_Distinct" show="distinc" >}} +{{< playground_snippet language="py" path="SDK_PYTHON_Distinct" show="distinct" >}} {{< /playground >}} ## Related transforms diff --git a/website/www/site/data/authors.yml b/website/www/site/data/authors.yml index 13c31c4f4782..a8b741864fb7 100644 --- a/website/www/site/data/authors.yml +++ b/website/www/site/data/authors.yml @@ -278,3 +278,6 @@ namitasharma: talat: name: Talat Uyarer email: talat@apache.org +jkinard: + name: Jeff Kinard + email: jkinard@google.com
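A note for readers of this change set: the `yaml-combine.md` edit above drops the requirement to enable `Combine` through the `yaml_experimental_features` pipeline option, which is also why `yaml_combine_test.py` and `yaml_transform_test.py` no longer pass `yaml_experimental_features=['Combine']`. The documentation page's own example is not reproduced in this diff, so the snippet below is only a sketch of the kind of aggregation it covers, reusing the transaction fields (`transaction_id`, `category`, `price`) from the blog post's case study; the file paths and transform names are illustrative, not taken from the docs.

```yaml
# Sketch only: schema fields come from the blog post's case study;
# paths and transform names are illustrative.
pipeline:
  transforms:
    - type: ReadFromCsv
      name: ReadTransactions
      config:
        path: /path/to/transactions.csv
    - type: Combine
      name: RevenuePerCategory
      input: ReadTransactions
      config:
        group_by: category
        combine:
          num_transactions:
            value: transaction_id
            fn: count
          total_revenue:
            value: price
            fn: sum
    - type: WriteToCsv
      name: WriteTotals
      input: RevenuePerCategory
      config:
        path: /path/to/output
```

With this change such a pipeline needs no experimental flags; per the Beam YAML docs linked in the blog post, a file like this is typically executed with `python -m apache_beam.yaml.main --yaml_pipeline_file=pipeline.yaml` (treat the exact flag name as an assumption and check the docs).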
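Similarly, the `yaml_mapping.py`, `yaml_udf_test.py`, and `setup.py` changes above make `js2py` an optional dependency (installed only for `python_version<"3.12"`, per the linked Js2Py issue), so JavaScript UDFs now fail with a clear `ValueError` on Python 3.12+ instead of breaking at import time, and the affected tests are skipped when `js2py` is absent. As a rough illustration of what that guard protects, here is a hedged sketch of a YAML transform with a JavaScript callable; the field names and function body are invented for illustration, and, as in `yaml_udf_test.py`, the pipeline would still need the `javascript` entry in `yaml_experimental_features`.

```yaml
# Sketch only: requires js2py (so Python < 3.12) and, as in yaml_udf_test.py,
# the 'javascript' experimental feature; fields and logic are illustrative.
pipeline:
  transforms:
    - type: ReadFromCsv
      name: ReadTransactions
      config:
        path: /path/to/transactions.csv
    - type: MapToFields
      name: AddDiscountedPrice
      input: ReadTransactions
      config:
        language: javascript
        fields:
          discounted_price:
            callable: |
              function f(row) {
                return row.price * 0.9
              }
    - type: WriteToCsv
      name: WriteDiscounts
      input: AddDiscountedPrice
      config:
        path: /path/to/output
```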
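Finally, the new `yamlIntegrationTests` task added to `sdks/python/build.gradle` builds the expansion-service jars referenced by the YAML providers and then runs `apache_beam/yaml/integration_tests.py` with pytest; it should presumably be invokable as `./gradlew :sdks:python:yamlIntegrationTests`, though that exact task path is an inference from the file's location rather than something stated in the diff.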