From 8f721dac5c1d5b1d28b74ea9bccb85b6ac97282f Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 26 Mar 2024 12:15:01 -0400 Subject: [PATCH 01/10] Fix playground build for Beam 2.55.0 change (#30736) * Fix playground build for Beam 2.55.0 change --- playground/backend/containers/java/Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/playground/backend/containers/java/Dockerfile b/playground/backend/containers/java/Dockerfile index 5d5ce8019c8b..18a37f6f016c 100644 --- a/playground/backend/containers/java/Dockerfile +++ b/playground/backend/containers/java/Dockerfile @@ -78,8 +78,6 @@ COPY --from=dep /pipeline-dependencies/target/dependency/ /opt/apache/beam/jars/ RUN wget https://repo1.maven.org/maven2/org/apache/beam/beam-examples-java/$BEAM_VERSION/beam-examples-java-$BEAM_VERSION.jar &&\ mv beam-examples-java-$BEAM_VERSION.jar /opt/apache/beam/jars/beam-examples-java.jar # Install jars for Playground graphs -RUN wget https://repo1.maven.org/maven2/org/apache/beam/beam-runners-core-construction-java/$BEAM_VERSION/beam-runners-core-construction-java-$BEAM_VERSION.jar &&\ - mv beam-runners-core-construction-java-$BEAM_VERSION.jar /opt/apache/beam/jars/beam-runners-core-construction-java-$BEAM_VERSION.jar RUN wget https://repo1.maven.org/maven2/org/apache/beam/beam-sdks-java-core/$BEAM_VERSION/beam-sdks-java-core-$BEAM_VERSION-tests.jar &&\ mv beam-sdks-java-core-$BEAM_VERSION-tests.jar /opt/apache/beam/jars/beam-sdks-java-core-tests.jar # Install Spring Expression From bcd9783f3bd930d3aefc15c3545d8e162b1e1340 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 26 Mar 2024 13:11:20 -0400 Subject: [PATCH 02/10] Fix typo in download webpage 2.55.0 (#30752) --- website/www/site/content/en/get-started/downloads.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md index a598a3e905cd..8b11768d4ded 100644 --- a/website/www/site/content/en/get-started/downloads.md +++ b/website/www/site/content/en/get-started/downloads.md @@ -96,7 +96,7 @@ versions denoted `0.x.y`. ## Releases -### 2.54.0 (2024-03-25) +### 2.55.0 (2024-03-25) Official [source code download](https://downloads.apache.org/beam/2.55.0/apache-beam-2.55.0-source-release.zip). [SHA-512](https://downloads.apache.org/beam/2.55.0/apache-beam-2.55.0-source-release.zip.sha512). [signature](https://downloads.apache.org/beam/2.55.0/apache-beam-2.55.0-source-release.zip.asc). 
From 488d2a1c30d91e73e16ac341707b1e7c1b651ff4 Mon Sep 17 00:00:00 2001 From: Andrey Devyatkin Date: Tue, 26 Mar 2024 21:21:40 +0400 Subject: [PATCH 03/10] Fix the gap between flaky_test_detection and workflow_prefetcher scripts (#30739) * fix the gap between flaky_test_detection and workflow_prefetcher scripts * fix the gap between flaky_test_detection and workflow_prefetcher scripts --- .../provisioning/alerting/flaky_test.yaml | 2 +- .../github_runs_prefetcher/code/main.py | 26 ++++++++++++------- .test-infra/tools/flaky_test_detection.py | 8 +++++- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/.test-infra/metrics/grafana/provisioning/alerting/flaky_test.yaml b/.test-infra/metrics/grafana/provisioning/alerting/flaky_test.yaml index 4a04ddb57490..005239cca2ce 100644 --- a/.test-infra/metrics/grafana/provisioning/alerting/flaky_test.yaml +++ b/.test-infra/metrics/grafana/provisioning/alerting/flaky_test.yaml @@ -36,7 +36,7 @@ groups: maxDataPoints: 43200 rawQuery: true rawSql: |- - SELECT COUNT(workflow_id), CAST(workflow_id AS TEXT), name AS workflow_name, filename AS workflow_filename, url AS workflow_url, CAST(threshold AS TEXT) AS workflow_threshold + SELECT COUNT(workflow_id), CAST(workflow_id AS TEXT), name AS workflow_name, filename AS workflow_filename, url AS workflow_url, CAST(threshold AS TEXT) AS workflow_threshold, CAST(retrieved_at AS TEXT) AS workflow_retrieved_at FROM github_workflows WHERE is_flaky = true GROUP BY workflow_id diff --git a/.test-infra/metrics/sync/github/github_runs_prefetcher/code/main.py b/.test-infra/metrics/sync/github/github_runs_prefetcher/code/main.py index ddb2120ab95f..c2fe093b657e 100644 --- a/.test-infra/metrics/sync/github/github_runs_prefetcher/code/main.py +++ b/.test-infra/metrics/sync/github/github_runs_prefetcher/code/main.py @@ -269,6 +269,7 @@ async def fetch(url, semaphore, params=None, headers=None, request_id=None): async def fetch_workflow_runs(): def append_workflow_runs(workflow, runs): + workflow_runs = {} for run in runs: # Getting rid of all runs with a "skipped" status to display # only actual runs @@ -278,15 +279,17 @@ def append_workflow_runs(workflow, runs): status = run["conclusion"] elif run["status"] != "cancelled": status = run["status"] - workflow.runs.append( - WorkflowRun( - run["id"], - status, - run["html_url"], - workflow.id, - datetime.strptime(run["run_started_at"], "%Y-%m-%dT%H:%M:%SZ"), - ) + workflow_run = WorkflowRun( + run["id"], + status, + run["html_url"], + workflow.id, + datetime.strptime(run["run_started_at"], "%Y-%m-%dT%H:%M:%SZ"), ) + if workflow_runs.get(workflow_run.id): + print(f"Duplicate run for {workflow.id} workflow: {workflow_run.id}") + workflow_runs[workflow_run.id] = workflow_run + workflow.runs.extend(workflow_runs.values()) url = f"https://api.github.com/repos/{GIT_ORG}/beam/actions/workflows" headers = {"Authorization": get_token()} @@ -428,7 +431,8 @@ def save_workflows(workflows): url text NOT NULL, dashboard_category text NOT NULL, threshold real NOT NULL, - is_flaky boolean NOT NULL)\n""" + is_flaky boolean NOT NULL, + retrieved_at timestamp with time zone NOT NULL)\n""" create_workflow_runs_table_query = f""" CREATE TABLE IF NOT EXISTS {workflow_runs_table_name} ( run_id text NOT NULL PRIMARY KEY, @@ -441,13 +445,14 @@ def save_workflows(workflows): cursor.execute(create_workflows_table_query) cursor.execute(create_workflow_runs_table_query) insert_workflows_query = f""" - INSERT INTO {workflows_table_name} (workflow_id, name, filename, url, dashboard_category, 
threshold, is_flaky) + INSERT INTO {workflows_table_name} (workflow_id, name, filename, url, dashboard_category, threshold, is_flaky, retrieved_at) VALUES %s""" insert_workflow_runs_query = f""" INSERT INTO {workflow_runs_table_name} (run_id, run_number, status, url, workflow_id, started_at) VALUES %s""" insert_workflows = [] insert_workflow_runs = [] + current_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") for workflow in workflows: insert_workflows.append( ( @@ -458,6 +463,7 @@ def save_workflows(workflows): workflow.category, workflow.threshold, workflow.is_flaky, + current_date, ) ) for idx, run in enumerate(workflow.runs): diff --git a/.test-infra/tools/flaky_test_detection.py b/.test-infra/tools/flaky_test_detection.py index 87a2fb83bb2e..768900db94cb 100644 --- a/.test-infra/tools/flaky_test_detection.py +++ b/.test-infra/tools/flaky_test_detection.py @@ -16,6 +16,7 @@ import os import re import requests +from datetime import datetime from github import Github from github import Auth @@ -34,12 +35,14 @@ def __init__( workflow_name, workflow_filename, workflow_threshold, + workflow_retrieved_at, ): self.workflow_id = workflow_id self.workflow_url = workflow_url self.workflow_name = workflow_name self.workflow_filename = workflow_filename self.workflow_threshold = round(float(workflow_threshold), 2) + self.workflow_retrieved_at = workflow_retrieved_at def get_workflow_issues(issues): @@ -89,6 +92,7 @@ def get_grafana_alerts(): alert["labels"]["workflow_name"], alert["labels"]["workflow_filename"], alert["labels"]["workflow_threshold"], + datetime.fromisoformat(alert["labels"]["workflow_retrieved_at"]), ) ) return alerts @@ -114,6 +118,8 @@ def main(): issue = workflow_closed_issues[alert.workflow_id] if READ_ONLY == "true": print("READ_ONLY is true, not reopening issue") + elif issue.closed_at > alert.workflow_retrieved_at: + print(f"The issue for the workflow {alert.workflow_id} has been closed, skipping") else: issue.edit(state="open") issue.create_comment(body="Reopening since the workflow is still flaky") @@ -121,7 +127,7 @@ def main(): elif alert.workflow_id not in workflow_open_issues.keys(): create_github_issue(repo, alert) else: - print("Issue is already open, skipping") + print(f"The issue for the workflow {alert.workflow_id} is already open, skipping") g.close() From 4af19ffa8cc9039e2c2110686ff8ad043a07f5cc Mon Sep 17 00:00:00 2001 From: Ritesh Ghorse Date: Tue, 26 Mar 2024 16:04:48 -0400 Subject: [PATCH 04/10] [Docs] Beam website doc for vertex ai enrichment handler (#30692) * add vetex ai page * update beam side docs for vertex ai enrichment * add links to release doc * update links, correct table formatting * Update website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-bigtable.md Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> * Update website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-vertexai.md Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> --------- Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> --- .../transforms/elementwise/enrichment.py | 67 ++++++++++++++ .../transforms/elementwise/enrichment_test.py | 37 +++++++- .../python/elementwise/enrichment-bigtable.md | 62 +++++++++++++ .../python/elementwise/enrichment-vertexai.md | 89 +++++++++++++++++++ .../python/elementwise/enrichment.md | 37 +++----- 
.../python/elementwise/runinference.md | 2 +- .../section-menu/en/documentation.html | 9 +- 7 files changed, 275 insertions(+), 28 deletions(-) create mode 100644 website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-bigtable.md create mode 100644 website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-vertexai.md diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py index 59af38584412..acee633b6f67 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py @@ -49,3 +49,70 @@ def enrichment_with_bigtable(): | "Enrich W/ BigTable" >> Enrichment(bigtable_handler) | "Print" >> beam.Map(print)) # [END enrichment_with_bigtable] + + +def enrichment_with_vertex_ai(): + # [START enrichment_with_vertex_ai] + import apache_beam as beam + from apache_beam.transforms.enrichment import Enrichment + from apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store \ + import VertexAIFeatureStoreEnrichmentHandler + + project_id = 'apache-beam-testing' + location = 'us-central1' + api_endpoint = f"{location}-aiplatform.googleapis.com" + data = [ + beam.Row(user_id='2963', product_id=14235, sale_price=15.0), + beam.Row(user_id='21422', product_id=11203, sale_price=12.0), + beam.Row(user_id='20592', product_id=8579, sale_price=9.0), + ] + + vertex_ai_handler = VertexAIFeatureStoreEnrichmentHandler( + project=project_id, + location=location, + api_endpoint=api_endpoint, + feature_store_name="vertexai_enrichment_example", + feature_view_name="users", + row_key="user_id", + ) + with beam.Pipeline() as p: + _ = ( + p + | "Create" >> beam.Create(data) + | "Enrich W/ Vertex AI" >> Enrichment(vertex_ai_handler) + | "Print" >> beam.Map(print)) + # [END enrichment_with_vertex_ai] + + +def enrichment_with_vertex_ai_legacy(): + # [START enrichment_with_vertex_ai_legacy] + import apache_beam as beam + from apache_beam.transforms.enrichment import Enrichment + from apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store \ + import VertexAIFeatureStoreLegacyEnrichmentHandler + + project_id = 'apache-beam-testing' + location = 'us-central1' + api_endpoint = f"{location}-aiplatform.googleapis.com" + data = [ + beam.Row(entity_id="movie_01", title='The Shawshank Redemption'), + beam.Row(entity_id="movie_02", title="The Shining"), + beam.Row(entity_id="movie_04", title='The Dark Knight'), + ] + + vertex_ai_handler = VertexAIFeatureStoreLegacyEnrichmentHandler( + project=project_id, + location=location, + api_endpoint=api_endpoint, + entity_type_id='movies', + feature_store_id="movie_prediction_unique", + feature_ids=["title", "genres"], + row_key="entity_id", + ) + with beam.Pipeline() as p: + _ = ( + p + | "Create" >> beam.Create(data) + | "Enrich W/ Vertex AI" >> Enrichment(vertex_ai_handler) + | "Print" >> beam.Map(print)) + # [END enrichment_with_vertex_ai_legacy] diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 257ce53f8e2a..3fd759bbc320 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -25,7 +25,9 @@ # pylint: disable=unused-import try: - 
from apache_beam.examples.snippets.transforms.elementwise.enrichment import enrichment_with_bigtable + from apache_beam.examples.snippets.transforms.elementwise.enrichment import enrichment_with_bigtable, \ + enrichment_with_vertex_ai_legacy + from apache_beam.examples.snippets.transforms.elementwise.enrichment import enrichment_with_vertex_ai from apache_beam.io.requestresponse import RequestResponseIO except ImportError: raise unittest.SkipTest('RequestResponseIO dependencies are not installed') @@ -40,6 +42,24 @@ def validate_enrichment_with_bigtable(): return expected +def validate_enrichment_with_vertex_ai(): + expected = '''[START enrichment_with_vertex_ai] +Row(user_id='2963', product_id=14235, sale_price=15.0, age=29.0, gender='1', state='97', country='2') +Row(user_id='21422', product_id=11203, sale_price=12.0, age=36.0, state='184', gender='1', country='5') +Row(user_id='20592', product_id=8579, sale_price=9.0, age=30.0, state='86', gender='1', country='4') + [END enrichment_with_vertex_ai]'''.splitlines()[1:-1] + return expected + + +def validate_enrichment_with_vertex_ai_legacy(): + expected = '''[START enrichment_with_vertex_ai_legacy] +Row(entity_id='movie_01', title='The Shawshank Redemption', genres='Drama') +Row(entity_id='movie_02', title='The Shining', genres='Horror') +Row(entity_id='movie_04', title='The Dark Knight', genres='Action') + [END enrichment_with_vertex_ai_legacy]'''.splitlines()[1:-1] + return expected + + @mock.patch('sys.stdout', new_callable=StringIO) class EnrichmentTest(unittest.TestCase): def test_enrichment_with_bigtable(self, mock_stdout): @@ -48,6 +68,21 @@ def test_enrichment_with_bigtable(self, mock_stdout): expected = validate_enrichment_with_bigtable() self.assertEqual(output, expected) + def test_enrichment_with_vertex_ai(self, mock_stdout): + enrichment_with_vertex_ai() + output = mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_vertex_ai() + + for i in range(len(expected)): + self.assertEqual(set(output[i].split(',')), set(expected[i].split(','))) + + def test_enrichment_with_vertex_ai_legacy(self, mock_stdout): + enrichment_with_vertex_ai_legacy() + output = mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_vertex_ai_legacy() + self.maxDiff = None + self.assertEqual(output, expected) + if __name__ == '__main__': unittest.main() diff --git a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-bigtable.md b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-bigtable.md new file mode 100644 index 000000000000..ee259355e303 --- /dev/null +++ b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-bigtable.md @@ -0,0 +1,62 @@ +--- +title: "Enrichment with Bigtable" +--- + + +# Use Bigtable to enrich data + +{{< localstorage language language-py >}} + + + + + +
+
+
+{{< button-pydoc path="apache_beam.transforms.enrichment_handlers.bigtable" class="BigTableEnrichmentHandler" >}}
+
+
+ +In Apache Beam 2.54.0 and later versions, the enrichment transform includes a built-in enrichment handler for [Bigtable](https://cloud.google.com/bigtable/docs/overview). +The following example demonstrates how to create a pipeline that use the enrichment transform with the [`BigTableEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.bigtable.html#apache_beam.transforms.enrichment_handlers.bigtable.BigTableEnrichmentHandler) handler. + +The data stored in the Bigtable cluster uses the following format: + +{{< table >}} +| Row key | product:product_id | product:product_name | product:product_stock | +|:-----------:|:--------------------:|:----------------------:|:-----------------------:| +| 1 | 1 | pixel 5 | 2 | +| 2 | 2 | pixel 6 | 4 | +| 3 | 3 | pixel 7 | 20 | +| 4 | 4 | pixel 8 | 10 | +{{< /table >}} + + +{{< highlight language="py" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py" enrichment_with_bigtable >}} +{{}} + +{{< paragraph class="notebook-skip" >}} +Output: +{{< /paragraph >}} +{{< highlight class="notebook-skip" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py" enrichment_with_bigtable >}} +{{< /highlight >}} + +## Related transforms + +Not applicable. + +{{< button-pydoc path="apache_beam.transforms.enrichment_handlers.bigtable" class="BigTableEnrichmentHandler" >}} \ No newline at end of file diff --git a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-vertexai.md b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-vertexai.md new file mode 100644 index 000000000000..0c869aa16732 --- /dev/null +++ b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-vertexai.md @@ -0,0 +1,89 @@ +--- +title: "Enrichment with Vertex AI Feature Store" +--- + + +# Enrichment with Google Cloud Vertex AI Feature Store + +{{< localstorage language language-py >}} + + + + + +
+
+
+{{< button-pydoc path="apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store" class="VertexAIFeatureStoreEnrichmentHandler" >}}
+
+
+ + +In Apache Beam 2.55.0 and later versions, the enrichment transform includes a built-in enrichment handler for [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore). +The following example demonstrates how to create a pipeline that use the enrichment transform with the [`VertexAIFeatureStoreEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.html#apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.VertexAIFeatureStoreEnrichmentHandler) handler and the [`VertexAIFeatureStoreLegacyEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.html#apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.VertexAIFeatureStoreLegacyEnrichmentHandler) handler. + +## Example 1: Enrichment with Vertex AI Feature Store + +The precomputed feature values stored in Vertex AI Feature Store uses the following format: + +{{< table >}} +| user_id | age | gender | state | country | +|:--------:|:----:|:-------:|:-----:|:-------:| +| 21422 | 12 | 0 | 0 | 0 | +| 2963 | 12 | 1 | 1 | 1 | +| 20592 | 12 | 1 | 2 | 2 | +| 76538 | 12 | 1 | 3 | 0 | +{{< /table >}} + + +{{< highlight language="py" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py" enrichment_with_vertex_ai >}} +{{}} + +{{< paragraph class="notebook-skip" >}} +Output: +{{< /paragraph >}} +{{< highlight class="notebook-skip" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py" enrichment_with_vertex_ai >}} +{{< /highlight >}} + +## Example 2: Enrichment with Vertex AI Feature Store (legacy) + +The precomputed feature values stored in Vertex AI Feature Store (Legacy) use the following format: + +{{< table >}} +| entity_id | title | genres | +|:----------|:-------------------------|:--------| +| movie_01 | The Shawshank Redemption | Drama | +| movie_02 | The Shining | Horror | +| movie_04 | The Dark Knight | Action | +{{< /table >}} + +{{< highlight language="py" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py" enrichment_with_vertex_ai_legacy >}} +{{}} + +{{< paragraph class="notebook-skip" >}} +Output: +{{< /paragraph >}} +{{< highlight class="notebook-skip" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py" enrichment_with_vertex_ai_legacy >}} +{{< /highlight >}} + + +## Related transforms + +Not applicable. + +{{< button-pydoc path="apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store" class="VertexAIFeatureStoreEnrichmentHandler" >}} \ No newline at end of file diff --git a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md index 5dfa5df04fae..1c6aeff38ec2 100644 --- a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md +++ b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md @@ -23,7 +23,7 @@ limitations under the License. - {{< button-pydoc path="apache_beam.transforms" class="Enrichment" >}} + {{< button-pydoc path="apache_beam.transforms.enrichment" class="Enrichment" >}} @@ -32,35 +32,22 @@ limitations under the License. 
The enrichment transform lets you dynamically enrich data in a pipeline by doing a key-value lookup to a remote service. The transform uses [`RequestResponeIO`](https://beam.apache.org/releases/pydoc/current/apache_beam.io.requestresponseio.html#apache_beam.io.requestresponseio.RequestResponseIO) internally. This feature uses client-side throttling to ensure that the remote service isn't overloaded with requests. If service-side errors occur, like `TooManyRequests` and `Timeout` exceptions, it retries the requests by using exponential backoff. -In Apache Beam 2.54.0 and later versions, the transform includes a built-in enrichment handler for [Bigtable](https://cloud.google.com/bigtable/docs/overview). +This transform is available in Apache Beam 2.54.0 and later versions. -## Use Bigtable to enrich data +## Examples -The following example demonstrates how to create a pipeline that use the enrichment transform with [`BigTableEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.bigtable.html#apache_beam.transforms.enrichment_handlers.bigtable.BigTableEnrichmentHandler). +The following examples demonstrate how to create a pipeline that use the enrichment transform to enrich data from external services. -The data stored in the Bigtable cluster uses the following format: - -| Row key | product:product_id | product:product_name | product:product_stock | -|:---------:|:--------------------:|:----------------------:|:-----------------------:| -| 1 | 1 | pixel 5 | 2 | -| 2 | 2 | pixel 6 | 4 | -| 3 | 3 | pixel 7 | 20 | -| 4 | 4 | pixel 8 | 10 | - - -{{< highlight language="py" >}} -{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py" enrichment_with_bigtable >}} -{{}} - -{{< paragraph class="notebook-skip" >}} -Output: -{{< /paragraph >}} -{{< highlight class="notebook-skip" >}} -{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py" enrichment_with_bigtable >}} -{{< /highlight >}} +{{< table >}} +| Service | Example | +|:-----------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Cloud Bigtable | [Enrichment with Bigtable](/documentation/transforms/python/elementwise/enrichment-bigtable/#example) | +| Vertex AI Feature Store | [Enrichment with Vertex AI Feature Store](/documentation/transforms/python/elementwise/enrichment-vertexai/#example-1-enrichment-with-vertex-ai-feature-store) | +| Vertex AI Feature Store (Legacy) | [Enrichment with Legacy Vertex AI Feature Store](/documentation/transforms/python/elementwise/enrichment-vertexai/#example-2-enrichment-with-vertex-ai-feature-store-legacy) | +{{< /table >}} ## Related transforms Not applicable. 
-{{< button-pydoc path="apache_beam.transforms" class="Enrichment" >}} \ No newline at end of file +{{< button-pydoc path="apache_beam.transforms.enrichment" class="Enrichment" >}} \ No newline at end of file diff --git a/website/www/site/content/en/documentation/transforms/python/elementwise/runinference.md b/website/www/site/content/en/documentation/transforms/python/elementwise/runinference.md index 47944b9a232f..0f3cacf1d748 100644 --- a/website/www/site/content/en/documentation/transforms/python/elementwise/runinference.md +++ b/website/www/site/content/en/documentation/transforms/python/elementwise/runinference.md @@ -23,7 +23,7 @@ limitations under the License. - {{< button-pydoc path="apache_beam.ml.inference" class="RunInference" >}} + {{< button-pydoc path="apache_beam.ml.inference.base" class="RunInference" >}} diff --git a/website/www/site/layouts/partials/section-menu/en/documentation.html b/website/www/site/layouts/partials/section-menu/en/documentation.html index 135f82bf9105..fda1d3960aca 100755 --- a/website/www/site/layouts/partials/section-menu/en/documentation.html +++ b/website/www/site/layouts/partials/section-menu/en/documentation.html @@ -289,7 +289,14 @@ Element-wise
    -
  • Enrichment
  • +
  • + Enrichment + +
  • Filter
  • FlatMap
  • Keys
  • From 7f04c4f07f2698f823953902bfb79fc7cb6e1584 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 26 Mar 2024 16:54:00 -0700 Subject: [PATCH 05/10] Deduplicate common environments. (#30681) We deduplicate both on proto construction (as before, but fixed) and again after more environments have been resolved. --- sdks/python/apache_beam/pipeline.py | 31 +-------- sdks/python/apache_beam/runners/common.py | 67 +++++++++++++++++++ .../python/apache_beam/runners/common_test.py | 59 ++++++++++++++++ .../runners/dataflow/dataflow_runner.py | 2 + .../portability/fn_api_runner/fn_runner.py | 4 +- 5 files changed, 133 insertions(+), 30 deletions(-) diff --git a/sdks/python/apache_beam/pipeline.py b/sdks/python/apache_beam/pipeline.py index 53044982a066..11bc74d27eca 100644 --- a/sdks/python/apache_beam/pipeline.py +++ b/sdks/python/apache_beam/pipeline.py @@ -86,6 +86,7 @@ from apache_beam.portability import common_urns from apache_beam.portability.api import beam_runner_api_pb2 from apache_beam.runners import PipelineRunner +from apache_beam.runners import common from apache_beam.runners import create_runner from apache_beam.transforms import ParDo from apache_beam.transforms import ptransform @@ -967,35 +968,7 @@ def merge_compatible_environments(proto): Mutates proto as contexts may have references to proto.components. """ - env_map = {} - canonical_env = {} - files_by_hash = {} - for env_id, env in proto.components.environments.items(): - # First deduplicate any file dependencies by their hash. - for dep in env.dependencies: - if dep.type_urn == common_urns.artifact_types.FILE.urn: - file_payload = beam_runner_api_pb2.ArtifactFilePayload.FromString( - dep.type_payload) - if file_payload.sha256: - if file_payload.sha256 in files_by_hash: - file_payload.path = files_by_hash[file_payload.sha256] - dep.type_payload = file_payload.SerializeToString() - else: - files_by_hash[file_payload.sha256] = file_payload.path - # Next check if we've ever seen this environment before. 
- normalized = env.SerializeToString(deterministic=True) - if normalized in canonical_env: - env_map[env_id] = canonical_env[normalized] - else: - canonical_env[normalized] = env_id - for old_env, new_env in env_map.items(): - for transform in proto.components.transforms.values(): - if transform.environment_id == old_env: - transform.environment_id = new_env - for windowing_strategy in proto.components.windowing_strategies.values(): - if windowing_strategy.environment_id == old_env: - windowing_strategy.environment_id = new_env - del proto.components.environments[old_env] + common.merge_common_environments(proto, inplace=True) @staticmethod def from_runner_api( diff --git a/sdks/python/apache_beam/runners/common.py b/sdks/python/apache_beam/runners/common.py index 1cd0a3044663..630ed7910c8d 100644 --- a/sdks/python/apache_beam/runners/common.py +++ b/sdks/python/apache_beam/runners/common.py @@ -24,6 +24,8 @@ # pytype: skip-file +import collections +import copy import logging import sys import threading @@ -43,6 +45,7 @@ from apache_beam.internal import util from apache_beam.options.value_provider import RuntimeValueProvider from apache_beam.portability import common_urns +from apache_beam.portability.api import beam_runner_api_pb2 from apache_beam.pvalue import TaggedOutput from apache_beam.runners.sdf_utils import NoOpWatermarkEstimatorProvider from apache_beam.runners.sdf_utils import RestrictionTrackerView @@ -52,6 +55,7 @@ from apache_beam.runners.sdf_utils import ThreadsafeWatermarkEstimator from apache_beam.transforms import DoFn from apache_beam.transforms import core +from apache_beam.transforms import environments from apache_beam.transforms import userstate from apache_beam.transforms.core import RestrictionProvider from apache_beam.transforms.core import WatermarkEstimatorProvider @@ -1941,3 +1945,66 @@ def validate_transform(transform_id): for t in pipeline_proto.root_transform_ids: validate_transform(t) + + +def merge_common_environments(pipeline_proto, inplace=False): + def dep_key(dep): + if dep.type_urn == common_urns.artifact_types.FILE.urn: + payload = beam_runner_api_pb2.ArtifactFilePayload.FromString( + dep.type_payload) + if payload.sha256: + type_info = 'sha256', payload.sha256 + else: + type_info = 'path', payload.path + elif dep.type_urn == common_urns.artifact_types.URL.urn: + payload = beam_runner_api_pb2.ArtifactUrlPayload.FromString( + dep.type_payload) + if payload.sha256: + type_info = 'sha256', payload.sha256 + else: + type_info = 'url', payload.url + else: + type_info = dep.type_urn, dep.type_payload + return type_info, dep.role_urn, dep.role_payload + + def base_env_key(env): + return ( + env.urn, + env.payload, + tuple(sorted(env.capabilities)), + tuple(sorted(env.resource_hints.items())), + tuple(sorted(dep_key(dep) for dep in env.dependencies))) + + def env_key(env): + return tuple( + sorted( + base_env_key(e) + for e in environments.expand_anyof_environments(env))) + + cannonical_enviornments = collections.defaultdict(list) + for env_id, env in pipeline_proto.components.environments.items(): + cannonical_enviornments[env_key(env)].append(env_id) + + if len(cannonical_enviornments) == len( + pipeline_proto.components.environments): + # All environments are already sufficiently distinct. 
+ return pipeline_proto + + environment_remappings = { + e: es[0] + for es in cannonical_enviornments.values() for e in es + } + + if not inplace: + pipeline_proto = copy.copy(pipeline_proto) + + for t in pipeline_proto.components.transforms.values(): + if t.environment_id: + t.environment_id = environment_remappings[t.environment_id] + for w in pipeline_proto.components.windowing_strategies.values(): + if w.environment_id: + w.environment_id = environment_remappings[w.environment_id] + for e in set(pipeline_proto.components.environments.keys()) - set( + environment_remappings.values()): + del pipeline_proto.components.environments[e] + return pipeline_proto diff --git a/sdks/python/apache_beam/runners/common_test.py b/sdks/python/apache_beam/runners/common_test.py index 00645948c3ed..ca2cd2539a8c 100644 --- a/sdks/python/apache_beam/runners/common_test.py +++ b/sdks/python/apache_beam/runners/common_test.py @@ -26,8 +26,11 @@ from apache_beam.io.restriction_trackers import OffsetRestrictionTracker from apache_beam.io.watermark_estimators import ManualWatermarkEstimator from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.portability.api import beam_runner_api_pb2 from apache_beam.runners.common import DoFnSignature from apache_beam.runners.common import PerWindowInvoker +from apache_beam.runners.common import merge_common_environments +from apache_beam.runners.portability.expansion_service_test import FibTransform from apache_beam.runners.sdf_utils import SplitResultPrimary from apache_beam.runners.sdf_utils import SplitResultResidual from apache_beam.testing.test_pipeline import TestPipeline @@ -584,5 +587,61 @@ def test_window_observing_split_on_window_boundary_round_down_on_last_window( self.assertEqual(stop_index, 2) +class UtilitiesTest(unittest.TestCase): + def test_equal_environments_merged(self): + pipeline_proto = merge_common_environments( + beam_runner_api_pb2.Pipeline( + components=beam_runner_api_pb2.Components( + environments={ + 'a1': beam_runner_api_pb2.Environment(urn='A'), + 'a2': beam_runner_api_pb2.Environment(urn='A'), + 'b1': beam_runner_api_pb2.Environment( + urn='B', payload=b'x'), + 'b2': beam_runner_api_pb2.Environment( + urn='B', payload=b'x'), + 'b3': beam_runner_api_pb2.Environment( + urn='B', payload=b'y'), + }, + transforms={ + 't1': beam_runner_api_pb2.PTransform( + unique_name='t1', environment_id='a1'), + 't2': beam_runner_api_pb2.PTransform( + unique_name='t2', environment_id='a2'), + }, + windowing_strategies={ + 'w1': beam_runner_api_pb2.WindowingStrategy( + environment_id='b1'), + 'w2': beam_runner_api_pb2.WindowingStrategy( + environment_id='b2'), + }))) + self.assertEqual(len(pipeline_proto.components.environments), 3) + self.assertTrue(('a1' in pipeline_proto.components.environments) + ^ ('a2' in pipeline_proto.components.environments)) + self.assertTrue(('b1' in pipeline_proto.components.environments) + ^ ('b2' in pipeline_proto.components.environments)) + self.assertEqual( + len( + set( + t.environment_id + for t in pipeline_proto.components.transforms.values())), + 1) + self.assertEqual( + len( + set( + w.environment_id for w in + pipeline_proto.components.windowing_strategies.values())), + 1) + + def test_external_merged(self): + p = beam.Pipeline() + # This transform recursively creates several external environments. + _ = p | FibTransform(4) + pipeline_proto = p.to_runner_api() + # All our external environments are equal and consolidated. 
+ # We also have a placeholder "default" environment that has not been + # resolved do anything concrete yet. + self.assertEqual(len(pipeline_proto.components.environments), 2) + + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py index db6a5235ac92..e428551ef028 100644 --- a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py +++ b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py @@ -43,6 +43,7 @@ from apache_beam.options.pipeline_options import WorkerOptions from apache_beam.portability import common_urns from apache_beam.runners.common import group_by_key_input_visitor +from apache_beam.runners.common import merge_common_environments from apache_beam.runners.dataflow.internal.clients import dataflow as dataflow_api from apache_beam.runners.runner import PipelineResult from apache_beam.runners.runner import PipelineRunner @@ -419,6 +420,7 @@ def run_pipeline(self, pipeline, options, pipeline_proto=None): self.proto_pipeline.components.environments[env_id].CopyFrom( environments.resolve_anyof_environment( env, common_urns.environments.DOCKER.urn)) + self.proto_pipeline = merge_common_environments(self.proto_pipeline) # Optimize the pipeline if it not streaming and the pre_optimize # experiment is set. diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py index b3dd124216be..07569fe328d8 100644 --- a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py +++ b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py @@ -62,6 +62,7 @@ from apache_beam.portability.api import beam_runner_api_pb2 from apache_beam.runners import runner from apache_beam.runners.common import group_by_key_input_visitor +from apache_beam.runners.common import merge_common_environments from apache_beam.runners.common import validate_pipeline_graph from apache_beam.runners.portability import portable_metrics from apache_beam.runners.portability.fn_api_runner import execution @@ -221,7 +222,8 @@ def run_via_runner_api(self, pipeline_proto, options): ] if direct_options.direct_embed_docker_python: pipeline_proto = self.embed_default_docker_image(pipeline_proto) - pipeline_proto = self.resolve_any_environments(pipeline_proto) + pipeline_proto = merge_common_environments( + self.resolve_any_environments(pipeline_proto)) stage_context, stages = self.create_stages(pipeline_proto) return self.run_stages(stage_context, stages) From 45444209c776460ba67ac14a1c33e0775485266a Mon Sep 17 00:00:00 2001 From: Daria Bezkorovaina <99457894+dariabezkorovaina@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:00:50 +0000 Subject: [PATCH 06/10] Duet AI Prompt: Test Your Pipeline (#30738) * Rename 33_coders_data_encoding.md to 34_coders_data_encoding.md Fix numeration * Rename 34_coders_data_encoding.md to 35_coders_data_encoding.md numeration * Rename 34_ml_data_preprocessing.md to 36_ml_data_preprocessing.md * Rename 35_ml_data_exploration.md to 37_ml_data_exploration.md * Rename 36_ml_data_preprocessing.md to 38_ml_data_preprocessing.md * Rename 37_ml_data_exploration.md to 48_ml_data_exploration.md * Rename 38_ml_data_preprocessing.md to 49_ml_data_preprocessing.md * Rename 47_batching_for_external_calls.md to 44_batching_for_external_calls.md * Rename 47_mltransform.md to 45_mltransform.md * Rename 48_ml_data_exploration.md to 46_ml_data_exploration.md * 
Rename 48_resource_hints.md to 47_resource_hints.md * Rename 49_ml_data_preprocessing.md to 51_ml_data_preprocessing.md * Rename 49_transform_service.md to 51_transform_service.md * Rename 51_transform_service.md to 52_transform_service.md * Rename 44_dataflow_runner.md to 53_dataflow_runner.md * Rename 45_spark_runner.md to 54_spark_runner.md * Rename 46_direct_runner.md to 55_direct_runner.md * Rename 49_transform_service.md to 52_transform_service.md * Create 56_unit_test_your_pipeline.md --- ...encoding.md => 35_coders_data_encoding.md} | 0 ...s.md => 44_batching_for_external_calls.md} | 0 .../{47_mltransform.md => 45_mltransform.md} | 0 ...ploration.md => 46_ml_data_exploration.md} | 0 ...resource_hints.md => 47_resource_hints.md} | 0 ...cessing.md => 51_ml_data_preprocessing.md} | 0 ...orm_service.md => 52_transform_service.md} | 0 .../56_unit_test_your_pipeline.md | 148 ++++++++++++++++++ ...orm_service.md => 52_transform_service.md} | 0 ...taflow_runner.md => 53_dataflow_runner.md} | 0 ...{45_spark_runner.md => 54_spark_runner.md} | 0 ...6_direct_runner.md => 55_direct_runner.md} | 0 12 files changed, 148 insertions(+) rename learning/prompts/documentation-lookup-nolinks/{33_coders_data_encoding.md => 35_coders_data_encoding.md} (100%) rename learning/prompts/documentation-lookup-nolinks/{47_batching_for_external_calls.md => 44_batching_for_external_calls.md} (100%) rename learning/prompts/documentation-lookup-nolinks/{47_mltransform.md => 45_mltransform.md} (100%) rename learning/prompts/documentation-lookup-nolinks/{35_ml_data_exploration.md => 46_ml_data_exploration.md} (100%) rename learning/prompts/documentation-lookup-nolinks/{48_resource_hints.md => 47_resource_hints.md} (100%) rename learning/prompts/documentation-lookup-nolinks/{34_ml_data_preprocessing.md => 51_ml_data_preprocessing.md} (100%) rename learning/prompts/documentation-lookup-nolinks/{49_transform_service.md => 52_transform_service.md} (100%) create mode 100644 learning/prompts/documentation-lookup-nolinks/56_unit_test_your_pipeline.md rename learning/prompts/documentation-lookup/{49_transform_service.md => 52_transform_service.md} (100%) rename learning/prompts/documentation-lookup/{44_dataflow_runner.md => 53_dataflow_runner.md} (100%) rename learning/prompts/documentation-lookup/{45_spark_runner.md => 54_spark_runner.md} (100%) rename learning/prompts/documentation-lookup/{46_direct_runner.md => 55_direct_runner.md} (100%) diff --git a/learning/prompts/documentation-lookup-nolinks/33_coders_data_encoding.md b/learning/prompts/documentation-lookup-nolinks/35_coders_data_encoding.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/33_coders_data_encoding.md rename to learning/prompts/documentation-lookup-nolinks/35_coders_data_encoding.md diff --git a/learning/prompts/documentation-lookup-nolinks/47_batching_for_external_calls.md b/learning/prompts/documentation-lookup-nolinks/44_batching_for_external_calls.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/47_batching_for_external_calls.md rename to learning/prompts/documentation-lookup-nolinks/44_batching_for_external_calls.md diff --git a/learning/prompts/documentation-lookup-nolinks/47_mltransform.md b/learning/prompts/documentation-lookup-nolinks/45_mltransform.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/47_mltransform.md rename to learning/prompts/documentation-lookup-nolinks/45_mltransform.md diff --git 
a/learning/prompts/documentation-lookup-nolinks/35_ml_data_exploration.md b/learning/prompts/documentation-lookup-nolinks/46_ml_data_exploration.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/35_ml_data_exploration.md rename to learning/prompts/documentation-lookup-nolinks/46_ml_data_exploration.md diff --git a/learning/prompts/documentation-lookup-nolinks/48_resource_hints.md b/learning/prompts/documentation-lookup-nolinks/47_resource_hints.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/48_resource_hints.md rename to learning/prompts/documentation-lookup-nolinks/47_resource_hints.md diff --git a/learning/prompts/documentation-lookup-nolinks/34_ml_data_preprocessing.md b/learning/prompts/documentation-lookup-nolinks/51_ml_data_preprocessing.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/34_ml_data_preprocessing.md rename to learning/prompts/documentation-lookup-nolinks/51_ml_data_preprocessing.md diff --git a/learning/prompts/documentation-lookup-nolinks/49_transform_service.md b/learning/prompts/documentation-lookup-nolinks/52_transform_service.md similarity index 100% rename from learning/prompts/documentation-lookup-nolinks/49_transform_service.md rename to learning/prompts/documentation-lookup-nolinks/52_transform_service.md diff --git a/learning/prompts/documentation-lookup-nolinks/56_unit_test_your_pipeline.md b/learning/prompts/documentation-lookup-nolinks/56_unit_test_your_pipeline.md new file mode 100644 index 000000000000..94dcfd09c805 --- /dev/null +++ b/learning/prompts/documentation-lookup-nolinks/56_unit_test_your_pipeline.md @@ -0,0 +1,148 @@ +Prompt: +How can I unit-test my Apache Beam pipeline? + +Response: +Before deploying your Apache Beam pipeline on your chosen runner, it's crucial to conduct thorough unit tests locally. This approach is highly effective for identifying and rectifying bugs within your pipeline code, all while allowing you to leverage your preferred debugging tools. + +The Apache Beam Direct Runner is particularly valuable for local testing and development. Once you've tested your pipeline with the Direct Runner, you can seamlessly transition to testing it on a small scale using your preferred runner, such as the Flink runner with either a local or remote Flink cluster. + +The Apache Beam SDKs offer three levels of unit testing for pipeline code: +* ***Testing individual functions***: you can test each function used in your pipeline independently. +* ***Testing an entire transform***: this involves testing a transform as a cohesive unit. +* ***Performing an end-to-end test***: this comprehensive test evaluates the entire pipeline. + +Both the Java and Python SDKs provide numerous test classes within their testing packages. The following examples demonstrate the most commonly used test classes in Java and Python. + +***Testing a Transform*** + +1. Create a `TestPipeline`. The `TestPipeline` class, included in the Beam SDKs, is specifically designed for testing transforms. Replace `Pipeline` with `TestPipeline` when creating the pipeline object for tests. Unlike `Pipeline.create`, `TestPipeline.create` handles the setting of `PipelineOptions` internally. + +Java: + +```java +Pipeline p = TestPipeline.create(); +``` + +Python: + +```python +with TestPipeline as p: + … +``` + +2. Create static test input data. +3. Use the `Create` transform. 
You can use this transform to create a `PCollection` of your input data from a standard in-memory collection class, such as Java or Python `List`. +4. Apply your transform. Apply your transform to the input `PCollection` and store the resulting output `PCollection`. +5. Verify output using `PAssert` (Java) or `assert_that` (Python). These assertion classes ensure that the output `PCollection` contains the expected elements. + +Java: + +```java +PCollection output = ...; + +// Check whether a PCollection contains some elements in any order. +PAssert.that(output) +.containsInAnyOrder( + "elem1", + "elem3", + "elem2"); +``` + +Python: + +```python +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import equal_to + +output = ... + +# Check whether a PCollection contains some elements in any order. +assert_that( + output, + equal_to(["elem1", "elem3", "elem2"])) +``` + +***Testing a Pipeline End-to-End*** + +To test an entire pipeline end-to-end: +* Create static test input data for each source of input data. +* Prepare static test output data matching the expected final output `PCollection`. +* Use `TestPipeline` instead of `Pipeline.create`. +* Replace the pipeline’s `Read` transforms with the `Create` transform to generate `PCollection` objects from static input data. +* Apply the pipeline’s transforms. +* Replace the pipeline’s `Write` transforms with `PAssert` (Java) or `assert_that` (Python) to verify that the final `PCollection` objects match the expected values in static output data. + +The following examples demonstrate how to test the WordCount example pipeline in Java and Python using these steps. + +Java: + +```java +public class WordCountTest { + // Static input data for the initial PCollection. + static final String[] WORDS_ARRAY = new String[] { + "hi there", "hi", "hi sue bob", + "hi sue", "", "bob hi"}; + + static final List WORDS = Arrays.asList(WORDS_ARRAY); + + // Static output data, expected to match the final PCollection. + static final String[] COUNTS_ARRAY = new String[] { + "hi: 5", "there: 1", "sue: 2", "bob: 2"}; + + // Example test for pipeline's transforms. + public void testCountWords() throws Exception { + Pipeline p = TestPipeline.create(); + + // Create a PCollection from the static input data. + PCollection input = p.apply(Create.of(WORDS)); + + // Run ALL the pipeline's transforms. + PCollection output = input.apply(new CountWords()); + + // Assert that the output matches the known static output data. + PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY); + + // Execute the pipeline. + p.run(); + } +} +``` + +Python: + +```python +import unittest +import apache_beam as beam +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import equal_to + +class CountWords(beam.PTransform): + # CountWords transform omitted for conciseness. + +class WordCountTest(unittest.TestCase): + # Input data for the initial PCollection. + WORDS = [ + "hi", "there", "hi", "hi", "sue", "bob", + "hi", "sue", "", "", "ZOW", "bob", "" + ] + + # Expected output data to match the final PCollection. + EXPECTED_COUNTS = ["hi: 5", "there: 1", "sue: 2", "bob: 2"] + + # Example test for pipeline's transforms. + def test_count_words(self): + with TestPipeline() as p: + # Create a PCollection from the static input data. + input = p | beam.Create(WORDS) + + # Run ALL the pipeline's transforms. + output = input | CountWords() + + # Assert that the output matches the expected data. 
+ assert_that(output, equal_to(EXPECTED_COUNTS), label='CheckOutput') + + # The pipeline runs and verifies the results. +``` + +Typically, WordCount reads lines from a text file for input data. However, the provided sample tests create static input data containing text lines and use the `Create` transform to create an initial `PCollection`. Instead of writing that `PCollection` to an output text file, the test pipelines use `PAssert` (Java) or `assert_that` (Python) to verify that the `PCollection` elements match a static string containing expected output data. diff --git a/learning/prompts/documentation-lookup/49_transform_service.md b/learning/prompts/documentation-lookup/52_transform_service.md similarity index 100% rename from learning/prompts/documentation-lookup/49_transform_service.md rename to learning/prompts/documentation-lookup/52_transform_service.md diff --git a/learning/prompts/documentation-lookup/44_dataflow_runner.md b/learning/prompts/documentation-lookup/53_dataflow_runner.md similarity index 100% rename from learning/prompts/documentation-lookup/44_dataflow_runner.md rename to learning/prompts/documentation-lookup/53_dataflow_runner.md diff --git a/learning/prompts/documentation-lookup/45_spark_runner.md b/learning/prompts/documentation-lookup/54_spark_runner.md similarity index 100% rename from learning/prompts/documentation-lookup/45_spark_runner.md rename to learning/prompts/documentation-lookup/54_spark_runner.md diff --git a/learning/prompts/documentation-lookup/46_direct_runner.md b/learning/prompts/documentation-lookup/55_direct_runner.md similarity index 100% rename from learning/prompts/documentation-lookup/46_direct_runner.md rename to learning/prompts/documentation-lookup/55_direct_runner.md From ec312f00788d8d7dc76deb191360e70e3a219b20 Mon Sep 17 00:00:00 2001 From: Rebecca Szper <98840847+rszper@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:02:34 -0700 Subject: [PATCH 07/10] Copy editing ML notebooks for DevSite import (#30759) * Copy editing ML notebooks for DevSite import * Update configuration parameter text --- .../beam-ml/run_inference_gemma.ipynb | 10 +++---- .../vertex_ai_feature_store_enrichment.ipynb | 30 ++++++++----------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/examples/notebooks/beam-ml/run_inference_gemma.ipynb b/examples/notebooks/beam-ml/run_inference_gemma.ipynb index 6af1bd07c148..489f01c4c9aa 100644 --- a/examples/notebooks/beam-ml/run_inference_gemma.ipynb +++ b/examples/notebooks/beam-ml/run_inference_gemma.ipynb @@ -153,7 +153,7 @@ "id": "1FQdEMq8GEpl" }, "source": [ - "The pipeline defined below automatically pulls the model weights from Kaggle. Please go to https://www.kaggle.com/models/keras/gemma and accept the terms of usage for Gemma models, then generate an API token using the instructions at https://www.kaggle.com/docs/api and provide your username and token here." + "The pipeline defined here automatically pulls the model weights from Kaggle. First, accept the terms of use for Gemma models on the Keras [Gemma](https://www.kaggle.com/models/keras/gemma) page. Next, generate an API token by following the instructions in [How to use Kaggle](https://www.kaggle.com/docs/api). Provide your username and token." ] }, { @@ -231,7 +231,7 @@ "## Import dependencies and provide a model preset\n", "Use the following code to import dependencies.\n", "\n", - "Replace the `model_preset` variable with the name of the Gemma preset to use. 
For example, if you want to use the default English weights, set the preset to \"gemma_2b_en\". For this demo, we wil use the instruction-tuned preset \"gemma_instruct_2b_en\". We also optionally use keras to run the model at half-precision to reduce GPU memory usage." + "Replace the value for the `model_preset` variable with the name of the Gemma preset to use. For example, to use the default English weights, use the value `gemma_2b_en`. This example uses the instruction-tuned preset `gemma_instruct_2b_en`. Optionally, to run the model at half-precision and reduce GPU memory usage, use Keras." ] }, { @@ -269,8 +269,8 @@ "To run the pipeline, use a custom model handler.\n", "\n", "### Provide a custom model handler\n", - "To simplify model loading, this notebook defines a custom model handler that will load the model by pulling the model weights directly from Kaggle presets. Implementing `load_model()`, `validate_inference_args()`, and `share_model_across_processes()` allows us to customize the behavior of the handler. The Keras implementation of the Gemma models has a `generate()` method\n", - "that generates text based on a prompt. Using this function in `run_inference()` routes the prompts properly." + "To simplify model loading, this notebook defines a custom model handler that loads the model by pulling the model weights directly from Kaggle presets. To customize the behavior of the handler, implement `load_model`, `validate_inference_args`, and `share_model_across_processes`. The Keras implementation of the Gemma models has a `generate` method\n", + "that generates text based on a prompt. To route the prompts properly, use this function in the `run_inference` method." ] }, { @@ -281,7 +281,7 @@ }, "outputs": [], "source": [ - "# Define `GemmaModelHandler` to load the model and perform the inference.\n", + "# To load the model and perform the inference, define `GemmaModelHandler`.\n", "\n", "from apache_beam.ml.inference.base import ModelHandler\n", "from apache_beam.ml.inference.base import PredictionResult\n", diff --git a/examples/notebooks/beam-ml/vertex_ai_feature_store_enrichment.ipynb b/examples/notebooks/beam-ml/vertex_ai_feature_store_enrichment.ipynb index c8ae558a1ba0..ebfcca34b94c 100644 --- a/examples/notebooks/beam-ml/vertex_ai_feature_store_enrichment.ipynb +++ b/examples/notebooks/beam-ml/vertex_ai_feature_store_enrichment.ipynb @@ -53,7 +53,7 @@ "id": "HrCtxslBGK8Z" }, "source": [ - "This notebook shows how to enrich data by using the Apache Beam [enrichment transform](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment/) with [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs). The enrichment transform is a turnkey transform in Apache Beam that lets you enrich data using a key-value lookup. This transform has the following features:\n", + "This notebook shows how to enrich data by using the Apache Beam [enrichment transform](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment/) with [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore/latest/overview). The enrichment transform is an Apache Beam turnkey transform that lets you enrich data by using a key-value lookup. 
This transform has the following features:\n", "\n", "- The transform has a built-in Apache Beam handler that interacts with Vertex AI to get precomputed feature values.\n", "- The transform uses client-side throttling to manage rate limiting the requests.\n", @@ -72,8 +72,8 @@ "\n", "* Use a stream of online transactions from [Pub/Sub](https://cloud.google.com/pubsub/docs/guides) that contains the following fields: `product_id`, `user_id`, and `sale_price`.\n", "* Deploy a pretrained model on Vertex AI based on the features `product_id`, `user_id`, `sale_price`, `age`, `gender`, `state`, and `country`.\n", - "* Precompute the feature values for the pretrained model, and store the values in the Vertex AI Feature Store.\n", - "* Enrich the stream of transactions from Pub/Sub with feature values from Vertex AI Feature Store by using the `Enrichment` transform.\n", + "* Precompute the feature values for the pretrained model, and store the values in Vertex AI Feature Store.\n", + "* Enrich the stream of transactions from Pub/Sub with feature values from Vertex AI Feature Store by using the enrichment transform.\n", "* Send the enriched data to the Vertex AI model for online prediction by using the `RunInference` transform, which predicts the product recommendation for the user." ] }, @@ -1139,7 +1139,7 @@ "source": [ "Deploy the model to the Vertex AI endpoint.\n", "\n", - "**Note:** This step is a Long Running Operation (LRO). Depending on the size of the model, it might take more than five minutes to complete." + "**Note:** This step is a long running operation (LRO). Depending on the size of the model, it might take more than five minutes to complete." ] }, { @@ -1186,7 +1186,7 @@ "id": "ouMQZ4sC4zuO" }, "source": [ - "### Set up the Vertex AI Feature Store for online serving\n" + "### Set up Vertex AI Feature Store for online serving\n" ] }, { @@ -1544,7 +1544,7 @@ "id": "Mm-HCUaa3ROZ" }, "source": [ - "Create a BigQuery dataset to use as the source for the Vertex AI Feature Store." + "Create a BigQuery dataset to use as the source for Vertex AI Feature Store." ] }, { @@ -2148,7 +2148,7 @@ "source": [ "## Use the Vertex AI Feature Store enrichment handler\n", "\n", - "The [`VertexAIFeatureStoreEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.html#apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.VertexAIFeatureStoreEnrichmentHandler) is a built-in handler included in the Apache Beam SDK versions 2.55.0 and later." + "The [`VertexAIFeatureStoreEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.html#apache_beam.transforms.enrichment_handlers.vertex_ai_feature_store.VertexAIFeatureStoreEnrichmentHandler) is a built-in handler in the Apache Beam SDK versions 2.55.0 and later." 
] }, { @@ -2157,16 +2157,16 @@ "id": "K41xhvmA5yQk" }, "source": [ - "Configure the `VertexAIFeatureStoreEnrichmentHandler` with the following required parameters:\n", + "Configure the `VertexAIFeatureStoreEnrichmentHandler` handler with the following required parameters:\n", "\n", "* `project`: the Google Cloud project ID for the feature store\n", "* `location`: the region of the feature store, for example `us-central1`\n", "* `api_endpoint`: the public endpoint of the feature store\n", - "* `feature_store_name`: the name of the Vertex AI Feature Store\n", - "* `feature_view_name`: the name of the feature view within the Vertex AI Feature Store\n", + "* `feature_store_name`: the name of the Vertex AI feature store\n", + "* `feature_view_name`: the name of the feature view within the Vertex AI feature store\n", "* `row_key`: The field name in the input row containing the entity ID for the feature store. This value is used to extract the entity ID from each element. The entity ID is used to fetch feature values for that specific element in the enrichment transform.\n", "\n", - "Optionally, to provide more configuration values to connect with the Vertex AI client, the `VertexAIFeatureStoreEnrichmentHandler` accepts a keyword argument (kwargs). For more information, see [`FeatureOnlineStoreServiceClient`](https://cloud.google.com/php/docs/reference/cloud-ai-platform/latest/V1.FeatureOnlineStoreServiceClient).\n", + "Optionally, to provide more configuration values to connect with the Vertex AI client, the `VertexAIFeatureStoreEnrichmentHandler` handler accepts a keyword argument (kwargs). For more information, see [`FeatureOnlineStoreServiceClient`](https://cloud.google.com/php/docs/reference/cloud-ai-platform/latest/V1.FeatureOnlineStoreServiceClient).\n", "\n", "**Note:** When exceptions occur, by default, the logging severity is set to warning ([`ExceptionLevel.WARN`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.utils.html#apache_beam.transforms.enrichment_handlers.utils.ExceptionLevel.WARN)). To configure the severity to raise exceptions, set `exception_level` to [`ExceptionLevel.RAISE`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.utils.html#apache_beam.transforms.enrichment_handlers.utils.ExceptionLevel.RAISE). To ignore exceptions, set `exception_level` to [`ExceptionLevel.QUIET`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment_handlers.utils.html#apache_beam.transforms.enrichment_handlers.utils.ExceptionLevel.QUIET).\n", "\n", @@ -2208,13 +2208,7 @@ "source": [ "## Use the enrichment transform\n", "\n", - "To use the [enrichment transform](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment.html#apache_beam.transforms.enrichment.Enrichment), the [`EnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment.html#apache_beam.transforms.enrichment.EnrichmentSourceHandler) parameter is required. You can also use a configuration parameter to specify a `lambda` for a join function, a timeout, a throttler, and a repeater (retry strategy).\n", - "\n", - "\n", - "* `join_fn`: A lambda function that takes dictionaries as input and returns an enriched row (`Callable[[Dict[str, Any], Dict[str, Any]], beam.Row]`). The enriched row specifies how to join the data fetched from the API. 
Defaults to a [cross-join](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment.html#apache_beam.transforms.enrichment.cross_join).\n", - "* `timeout`: The number of seconds to wait for the request to be completed by the API before timing out. Defaults to 30 seconds.\n", - "* `throttler`: Specifies the throttling mechanism. The only supported option is default client-side adaptive throttling.\n", - "* `repeater`: Specifies the retry strategy when errors like `TooManyRequests` and `TimeoutException` occur. Defaults to [`ExponentialBackOffRepeater`](https://beam.apache.org/releases/pydoc/current/apache_beam.io.requestresponse.html#apache_beam.io.requestresponse.ExponentialBackOffRepeater).\n", + "To use the [enrichment transform](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment.html#apache_beam.transforms.enrichment.Enrichment), the [`EnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.enrichment.html#apache_beam.transforms.enrichment.EnrichmentSourceHandler) parameter is required. You can also use configuration parameters to specify a `lambda` for a join function, a timeout, a throttler, and a repeater (retry strategy). For more information, see [Parameters](https://cloud.google.com/dataflow/docs/guides/enrichment#parameters).\n", "\n", "\n", "To use the Redis cache, apply the `with_redis_cache` hook to the enrichment transform. The coders for encoding and decoding the input and output for the cache are optional and are internally inferred." From 7097443e27c90857258e9478ddb83c9d7e32afce Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Wed, 27 Mar 2024 11:03:37 -0700 Subject: [PATCH 08/10] Add link to official reference on beam site. (#30768) --- website/www/site/content/en/documentation/sdks/yaml.md | 5 ++--- website/www/site/layouts/partials/section-menu/en/sdks.html | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/website/www/site/content/en/documentation/sdks/yaml.md b/website/www/site/content/en/documentation/sdks/yaml.md index 8d4c6fb94b54..5e5a8c34e62b 100644 --- a/website/www/site/content/en/documentation/sdks/yaml.md +++ b/website/www/site/content/en/documentation/sdks/yaml.md @@ -631,6 +631,5 @@ options: ## Other Resources -* [Example pipelines](https://gist.github.com/robertwb/2cb26973f1b1203e8f5f8f88c5764da0) -* [More examples](https://github.com/Polber/beam/tree/jkinard/bug-bash/sdks/python/apache_beam/yaml/examples) -* [Transform glossary](https://gist.github.com/robertwb/64e2f51ff88320eeb6ffd96634202df7) +* [Example pipeline](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/examples) +* [More examples](https://gist.github.com/robertwb/2cb26973f1b1203e8f5f8f88c5764da0) diff --git a/website/www/site/layouts/partials/section-menu/en/sdks.html b/website/www/site/layouts/partials/section-menu/en/sdks.html index 4405c9e4d3ee..fd7de314992b 100644 --- a/website/www/site/layouts/partials/section-menu/en/sdks.html +++ b/website/www/site/layouts/partials/section-menu/en/sdks.html @@ -92,6 +92,9 @@
  • Yaml Aggregation
  • Error handling
  • Inlining Python
  • + YAML API reference (external link)
From 62200da9df8fee2835ab2b7731c33bd8e243c502 Mon Sep 17 00:00:00 2001 From: martin trieu Date: Wed, 27 Mar 2024 12:52:39 -0700 Subject: [PATCH 09/10] factor out failure handling logic from StreamingDataflowWorker (#30695) * factor out failure handling logic from StreamingDataflowWorker * address cl comments * address cl comments --- .../worker/StreamingDataflowWorker.java | 634 ++++-------------- .../dataflow/worker/WorkUnitClient.java | 2 +- .../streaming/harness/StreamingCounters.java | 101 +++ .../StreamingWorkerStatusReporter.java | 396 +++++++++++ .../worker/util/BoundedQueueExecutor.java | 6 + .../processing/failures/FailureTracker.java | 104 +++ .../work/processing/failures/HeapDumper.java | 28 + .../StreamingApplianceFailureTracker.java | 60 ++ .../StreamingApplianceStatsReporter.java | 25 + .../StreamingEngineFailureTracker.java | 44 ++ .../failures/WorkFailureProcessor.java | 185 +++++ .../worker/StreamingDataflowWorkerTest.java | 41 +- .../StreamingApplianceFailureTrackerTest.java | 97 +++ .../StreamingEngineFailureTrackerTest.java | 91 +++ .../failures/WorkFailureProcessorTest.java | 204 ++++++ 15 files changed, 1519 insertions(+), 499 deletions(-) create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingCounters.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusReporter.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/FailureTracker.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/HeapDumper.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTracker.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceStatsReporter.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTracker.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessor.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTrackerTest.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTrackerTest.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index c3e820767cd2..22d3b105c2b3 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -24,15 +24,9 @@ import com.google.api.services.dataflow.model.CounterUpdate; import com.google.api.services.dataflow.model.MapTask; -import com.google.api.services.dataflow.model.PerStepNamespaceMetrics; -import com.google.api.services.dataflow.model.PerWorkerMetrics; -import com.google.api.services.dataflow.model.Status; import com.google.api.services.dataflow.model.StreamingComputationConfig; import com.google.api.services.dataflow.model.StreamingConfigTask; -import com.google.api.services.dataflow.model.StreamingScalingReport; import com.google.api.services.dataflow.model.WorkItem; -import com.google.api.services.dataflow.model.WorkItemStatus; -import com.google.api.services.dataflow.model.WorkerMessage; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.io.File; import java.io.IOException; @@ -42,7 +36,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Optional; @@ -55,7 +48,6 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; @@ -67,10 +59,7 @@ import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; import org.apache.beam.runners.dataflow.util.CloudObject; import org.apache.beam.runners.dataflow.util.CloudObjects; -import org.apache.beam.runners.dataflow.worker.DataflowSystemMetrics.StreamingSystemCounterNames; import org.apache.beam.runners.dataflow.worker.apiary.FixMultiOutputInfosOnParDoInstructions; -import org.apache.beam.runners.dataflow.worker.counters.Counter; -import org.apache.beam.runners.dataflow.worker.counters.CounterSet; import org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor; import org.apache.beam.runners.dataflow.worker.counters.NameContext; import org.apache.beam.runners.dataflow.worker.graph.Edges.Edge; @@ -95,6 +84,8 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.Work.State; import org.apache.beam.runners.dataflow.worker.streaming.WorkHeartbeatResponseProcessor; +import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; +import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerStatusReporter; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; @@ -119,6 +110,10 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.FailureTracker; +import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.StreamingApplianceFailureTracker; +import 
org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.StreamingEngineFailureTracker; +import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.WorkFailureProcessor; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ActiveWorkRefresher; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ActiveWorkRefreshers; import org.apache.beam.sdk.coders.Coder; @@ -135,7 +130,6 @@ import org.apache.beam.sdk.util.BackOffUtils; import org.apache.beam.sdk.util.FluentBackoff; import org.apache.beam.sdk.util.Sleeper; -import org.apache.beam.sdk.util.UserCodeException; import org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -170,11 +164,10 @@ public class StreamingDataflowWorker { // Maximum number of threads for processing. Currently each thread processes one key at a time. static final int MAX_PROCESSING_THREADS = 300; static final long THREAD_EXPIRATION_TIME_SEC = 60; - static final long TARGET_COMMIT_BUNDLE_BYTES = 32 << 20; - static final int MAX_COMMIT_QUEUE_BYTES = 500 << 20; // 500MB static final int NUM_COMMIT_STREAMS = 1; static final int GET_WORK_STREAM_TIMEOUT_MINUTES = 3; static final Duration COMMIT_STREAM_TIMEOUT = Duration.standardMinutes(1); + /** * Sinks are marked 'full' in {@link StreamingModeExecutionContext} once the amount of data sinked * (across all the sinks, if there are more than one) reaches this limit. This serves as hint for @@ -206,13 +199,9 @@ public class StreamingDataflowWorker { private static final int DEFAULT_STATUS_PORT = 8081; // Maximum size of the result of a GetWork request. private static final long MAX_GET_WORK_FETCH_BYTES = 64L << 20; // 64m - // Reserved ID for counter updates. - // Matches kWindmillCounterUpdate in workflow_worker_service_multi_hubs.cc. - private static final String WINDMILL_COUNTER_UPDATE_WORK_ID = "3"; /** Maximum number of failure stacktraces to report in each update sent to backend. */ private static final int MAX_FAILURES_TO_REPORT_IN_UPDATE = 1000; - private static final Duration MAX_LOCAL_PROCESSING_RETRY_DURATION = Duration.standardMinutes(5); private static final Random clientIdGenerator = new Random(); private static final String CHANNELZ_PATH = "/channelz"; final WindmillStateCache stateCache; @@ -232,44 +221,23 @@ public class StreamingDataflowWorker { private final BoundedQueueExecutor workUnitExecutor; private final WindmillServerStub windmillServer; private final Thread dispatchThread; - private final AtomicLong previousTimeAtMaxThreads = new AtomicLong(); private final AtomicBoolean running = new AtomicBoolean(); private final SideInputStateFetcher sideInputStateFetcher; private final DataflowWorkerHarnessOptions options; private final boolean windmillServiceEnabled; private final long clientId; private final MetricTrackingWindmillServerStub metricTrackingWindmillServer; - private final CounterSet pendingDeltaCounters = new CounterSet(); - private final CounterSet pendingCumulativeCounters = new CounterSet(); + private final java.util.concurrent.ConcurrentLinkedQueue pendingMonitoringInfos = new ConcurrentLinkedQueue<>(); // Map from stage name to StageInfo containing metrics container registry and per stage counters. - private final ConcurrentMap stageInfoMap = new ConcurrentHashMap<>(); - // Built-in delta counters. 
- private final Counter windmillShuffleBytesRead; - private final Counter windmillStateBytesRead; - private final Counter windmillStateBytesWritten; - private final Counter windmillQuotaThrottling; - private final Counter timeAtMaxActiveThreads; - // Built-in cumulative counters. - private final Counter javaHarnessUsedMemory; - private final Counter javaHarnessMaxMemory; - private final Counter activeThreads; - private final Counter totalAllocatedThreads; - private final Counter outstandingBytes; - private final Counter maxOutstandingBytes; - private final Counter outstandingBundles; - private final Counter maxOutstandingBundles; - private final Counter windmillMaxObservedWorkItemCommitBytes; - private final Counter memoryThrashing; - private final boolean publishCounters; + private final ConcurrentMap stageInfoMap; + private final MemoryMonitor memoryMonitor; private final Thread memoryMonitorThread; private final WorkerStatusPages statusPages; // Limit on bytes sinked (committed) in a work item. private final long maxSinkBytes; // = MAX_SINK_BYTES unless disabled in options. - private final EvictingQueue pendingFailuresToReport = - EvictingQueue.create(MAX_FAILURES_TO_REPORT_IN_UPDATE); private final ReaderCache readerCache; private final WorkUnitClient workUnitClient; private final CompletableFuture isDoneFuture; @@ -284,7 +252,6 @@ public class StreamingDataflowWorker { private final DebugCapture.@Nullable Manager debugCaptureManager; // Collection of ScheduledExecutorServices that are running periodic functions. private final ArrayList scheduledExecutors = new ArrayList<>(); - private int retryLocallyDelayMs = 10000; // Periodically fires a global config request to dataflow service. Only used when windmill service // is enabled. // Possibly overridden by streaming engine config. 
@@ -293,21 +260,31 @@ public class StreamingDataflowWorker { private final DataflowExecutionStateSampler sampler = DataflowExecutionStateSampler.instance(); private final ActiveWorkRefresher activeWorkRefresher; private final WorkCommitter workCommitter; + private final StreamingWorkerStatusReporter workerStatusReporter; + private final FailureTracker failureTracker; + private final WorkFailureProcessor workFailureProcessor; + private final StreamingCounters streamingCounters; private StreamingDataflowWorker( WindmillServerStub windmillServer, long clientId, ConcurrentMap computationMap, + ConcurrentMap stageInfoMap, WindmillStateCache windmillStateCache, BoundedQueueExecutor workUnitExecutor, DataflowMapTaskExecutorFactory mapTaskExecutorFactory, WorkUnitClient workUnitClient, DataflowWorkerHarnessOptions options, - boolean publishCounters, HotKeyLogger hotKeyLogger, Supplier clock, + StreamingWorkerStatusReporter workerStatusReporter, + FailureTracker failureTracker, + WorkFailureProcessor workFailureProcessor, + StreamingCounters streamingCounters, + MemoryMonitor memoryMonitor, Function executorSupplier) { this.computationMap = computationMap; + this.stageInfoMap = stageInfoMap; this.stateCache = windmillStateCache; this.readerCache = new ReaderCache( @@ -320,7 +297,6 @@ private StreamingDataflowWorker( this.clock = clock; this.executorSupplier = executorSupplier; this.windmillServiceEnabled = options.isEnableStreamingEngine(); - this.memoryMonitor = MemoryMonitor.fromOptions(options); this.statusPages = WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor, () -> true); if (windmillServiceEnabled) { this.debugCaptureManager = @@ -328,52 +304,8 @@ private StreamingDataflowWorker( } else { this.debugCaptureManager = null; } - this.windmillShuffleBytesRead = - pendingDeltaCounters.longSum( - StreamingSystemCounterNames.WINDMILL_SHUFFLE_BYTES_READ.counterName()); - this.windmillStateBytesRead = - pendingDeltaCounters.longSum( - StreamingSystemCounterNames.WINDMILL_STATE_BYTES_READ.counterName()); - this.windmillStateBytesWritten = - pendingDeltaCounters.longSum( - StreamingSystemCounterNames.WINDMILL_STATE_BYTES_WRITTEN.counterName()); - this.windmillQuotaThrottling = - pendingDeltaCounters.longSum( - StreamingSystemCounterNames.WINDMILL_QUOTA_THROTTLING.counterName()); - this.timeAtMaxActiveThreads = - pendingDeltaCounters.longSum( - StreamingSystemCounterNames.TIME_AT_MAX_ACTIVE_THREADS.counterName()); - this.javaHarnessUsedMemory = - pendingCumulativeCounters.longSum( - StreamingSystemCounterNames.JAVA_HARNESS_USED_MEMORY.counterName()); - this.javaHarnessMaxMemory = - pendingCumulativeCounters.longSum( - StreamingSystemCounterNames.JAVA_HARNESS_MAX_MEMORY.counterName()); - this.activeThreads = - pendingCumulativeCounters.intSum(StreamingSystemCounterNames.ACTIVE_THREADS.counterName()); - this.outstandingBytes = - pendingCumulativeCounters.longSum( - StreamingSystemCounterNames.OUTSTANDING_BYTES.counterName()); - this.maxOutstandingBytes = - pendingCumulativeCounters.longSum( - StreamingSystemCounterNames.MAX_OUTSTANDING_BYTES.counterName()); - this.outstandingBundles = - pendingCumulativeCounters.longSum( - StreamingSystemCounterNames.OUTSTANDING_BUNDLES.counterName()); - this.maxOutstandingBundles = - pendingCumulativeCounters.longSum( - StreamingSystemCounterNames.MAX_OUTSTANDING_BUNDLES.counterName()); - this.totalAllocatedThreads = - pendingCumulativeCounters.intSum( - StreamingSystemCounterNames.TOTAL_ALLOCATED_THREADS.counterName()); - 
this.windmillMaxObservedWorkItemCommitBytes = - pendingCumulativeCounters.intMax( - StreamingSystemCounterNames.WINDMILL_MAX_WORK_ITEM_COMMIT_BYTES.counterName()); - this.memoryThrashing = - pendingCumulativeCounters.intSum( - StreamingSystemCounterNames.MEMORY_THRASHING.counterName()); - this.isDoneFuture = new CompletableFuture<>(); + this.isDoneFuture = new CompletableFuture<>(); this.workUnitExecutor = workUnitExecutor; maxSinkBytes = @@ -399,8 +331,6 @@ private StreamingDataflowWorker( dispatchThread.setDaemon(true); dispatchThread.setPriority(Thread.MIN_PRIORITY); dispatchThread.setName("DispatchThread"); - - this.publishCounters = publishCounters; this.clientId = clientId; this.windmillServer = windmillServer; this.metricTrackingWindmillServer = @@ -446,6 +376,11 @@ private StreamingDataflowWorker( sampler, metricTrackingWindmillServer::refreshActiveWork, executorSupplier.apply("RefreshWork")); + this.workerStatusReporter = workerStatusReporter; + this.failureTracker = failureTracker; + this.workFailureProcessor = workFailureProcessor; + this.streamingCounters = streamingCounters; + this.memoryMonitor = memoryMonitor; LOG.debug("windmillServiceEnabled: {}", windmillServiceEnabled); LOG.debug("WindmillServiceEndpoint: {}", options.getWindmillServiceEndpoint()); @@ -455,24 +390,60 @@ private StreamingDataflowWorker( } public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions options) { - ConcurrentMap computationMap = new ConcurrentHashMap<>(); long clientId = clientIdGenerator.nextLong(); - return new StreamingDataflowWorker( + MemoryMonitor memoryMonitor = MemoryMonitor.fromOptions(options); + ConcurrentMap computationMap = new ConcurrentHashMap<>(); + ConcurrentMap stageInfo = new ConcurrentHashMap<>(); + StreamingCounters streamingCounters = StreamingCounters.create(); + WindmillServerStub windmillServer = createWindmillServerStub( options, clientId, new WorkHeartbeatResponseProcessor( - computationId -> Optional.ofNullable(computationMap.get(computationId)))), + computationId -> Optional.ofNullable(computationMap.get(computationId)))); + FailureTracker failureTracker = + options.isEnableStreamingEngine() + ? 
StreamingEngineFailureTracker.create( + MAX_FAILURES_TO_REPORT_IN_UPDATE, options.getMaxStackTraceDepthToReport()) + : StreamingApplianceFailureTracker.create( + MAX_FAILURES_TO_REPORT_IN_UPDATE, + options.getMaxStackTraceDepthToReport(), + windmillServer::reportStats); + WorkUnitClient dataflowServiceClient = new DataflowWorkUnitClient(options, LOG); + BoundedQueueExecutor workExecutor = createWorkUnitExecutor(options); + Supplier clock = Instant::now; + WorkFailureProcessor workFailureProcessor = + WorkFailureProcessor.create( + workExecutor, + failureTracker, + () -> Optional.ofNullable(memoryMonitor.tryToDumpHeap()), + clock); + StreamingWorkerStatusReporter workerStatusReporter = + StreamingWorkerStatusReporter.create( + dataflowServiceClient, + windmillServer::getAndResetThrottleTime, + stageInfo::values, + failureTracker, + streamingCounters, + memoryMonitor, + workExecutor); + return new StreamingDataflowWorker( + windmillServer, clientId, computationMap, + stageInfo, WindmillStateCache.ofSizeMbs(options.getWorkerCacheMb()), - createWorkUnitExecutor(options), + workExecutor, IntrinsicMapTaskExecutorFactory.defaultFactory(), - new DataflowWorkUnitClient(options, LOG), + dataflowServiceClient, options, - /* publishCounters= */ true, new HotKeyLogger(), - Instant::now, + clock, + workerStatusReporter, + failureTracker, + workFailureProcessor, + streamingCounters, + memoryMonitor, threadName -> Executors.newSingleThreadScheduledExecutor( new ThreadFactoryBuilder().setNameFormat(threadName).build())); @@ -489,26 +460,66 @@ static StreamingDataflowWorker forTesting( boolean publishCounters, HotKeyLogger hotKeyLogger, Supplier clock, - Function executorSupplier) { - BoundedQueueExecutor boundedQueueExecutor = createWorkUnitExecutor(options); + Function executorSupplier, + int localRetryTimeoutMs) { + ConcurrentMap stageInfo = new ConcurrentHashMap<>(); + BoundedQueueExecutor workExecutor = createWorkUnitExecutor(options); WindmillStateCache stateCache = WindmillStateCache.ofSizeMbs(options.getWorkerCacheMb()); computationMap.putAll( - createComputationMapForTesting(mapTasks, boundedQueueExecutor, stateCache::forComputation)); + createComputationMapForTesting(mapTasks, workExecutor, stateCache::forComputation)); + MemoryMonitor memoryMonitor = MemoryMonitor.fromOptions(options); + StreamingCounters streamingCounters = StreamingCounters.create(); + FailureTracker failureTracker = + options.isEnableStreamingEngine() + ? 
StreamingEngineFailureTracker.create( + MAX_FAILURES_TO_REPORT_IN_UPDATE, options.getMaxStackTraceDepthToReport()) + : StreamingApplianceFailureTracker.create( + MAX_FAILURES_TO_REPORT_IN_UPDATE, + options.getMaxStackTraceDepthToReport(), + windmillServer::reportStats); + WorkFailureProcessor workFailureProcessor = + WorkFailureProcessor.forTesting( + workExecutor, + failureTracker, + () -> Optional.ofNullable(memoryMonitor.tryToDumpHeap()), + clock, + localRetryTimeoutMs); + StreamingWorkerStatusReporter workerStatusReporter = + StreamingWorkerStatusReporter.forTesting( + publishCounters, + workUnitClient, + windmillServer::getAndResetThrottleTime, + stageInfo::values, + failureTracker, + streamingCounters, + memoryMonitor, + workExecutor, + executorSupplier); return new StreamingDataflowWorker( windmillServer, 1L, computationMap, + stageInfo, stateCache, - boundedQueueExecutor, + workExecutor, mapTaskExecutorFactory, workUnitClient, options, - publishCounters, hotKeyLogger, clock, + workerStatusReporter, + failureTracker, + workFailureProcessor, + streamingCounters, + memoryMonitor, executorSupplier); } + @VisibleForTesting + final void reportPeriodicWorkerUpdatesForTest() { + workerStatusReporter.reportPeriodicWorkerUpdates(); + } + private static ConcurrentMap createComputationMapForTesting( List mapTasks, BoundedQueueExecutor workUnitExecutor, @@ -539,17 +550,6 @@ private static BoundedQueueExecutor createWorkUnitExecutor(DataflowWorkerHarness new ThreadFactoryBuilder().setNameFormat("DataflowWorkUnits-%d").setDaemon(true).build()); } - /** Returns whether an exception was caused by a {@link OutOfMemoryError}. */ - private static boolean isOutOfMemoryError(Throwable t) { - while (t != null) { - if (t instanceof OutOfMemoryError) { - return true; - } - t = t.getCause(); - } - return false; - } - private static MapTask parseMapTask(String input) throws IOException { return Transport.getJsonFactory().fromString(input, MapTask.class); } @@ -683,11 +683,6 @@ void addStateNameMappings(Map nameMap) { stateNameMap.putAll(nameMap); } - @VisibleForTesting - public void setRetryLocallyDelayMs(int retryLocallyDelayMs) { - this.retryLocallyDelayMs = retryLocallyDelayMs; - } - @VisibleForTesting public void setMaxWorkItemCommitBytes(int maxWorkItemCommitBytes) { if (maxWorkItemCommitBytes != this.maxWorkItemCommitBytes) { @@ -719,27 +714,6 @@ public void start() { dispatchThread.start(); sampler.start(); - // Periodically report workers counters and other updates. 
- ScheduledExecutorService workerUpdateTimer = executorSupplier.apply("GlobalWorkerUpdates"); - workerUpdateTimer.scheduleWithFixedDelay( - this::reportPeriodicWorkerUpdates, - 0, - options.getWindmillHarnessUpdateReportingPeriod().getMillis(), - TimeUnit.MILLISECONDS); - scheduledExecutors.add(workerUpdateTimer); - - ScheduledExecutorService workerMessageTimer = executorSupplier.apply("ReportWorkerMessage"); - if (options.getWindmillHarnessUpdateReportingPeriod().getMillis() > 0) { - workerMessageTimer.scheduleWithFixedDelay( - this::reportPeriodicWorkerMessage, - 0, - options.getWindmillHarnessUpdateReportingPeriod().getMillis(), - TimeUnit.MILLISECONDS); - scheduledExecutors.add(workerMessageTimer); - } - - activeWorkRefresher.start(); - if (options.getPeriodicStatusPageOutputDirectory() != null) { ScheduledExecutorService statusPageTimer = executorSupplier.apply("DumpStatusPages"); statusPageTimer.scheduleWithFixedDelay( @@ -748,7 +722,7 @@ public void start() { if (pages.isEmpty()) { LOG.warn("No captured status pages."); } - Long timestamp = clock.get().getMillis(); + long timestamp = clock.get().getMillis(); for (Capturable page : pages) { PrintWriter writer = null; try { @@ -779,7 +753,8 @@ public void start() { scheduledExecutors.add(statusPageTimer); } workCommitter.start(); - reportHarnessStartup(); + workerStatusReporter.start(options.getWindmillHarnessUpdateReportingPeriod().getMillis()); + activeWorkRefresher.start(); } public void startStatusPages() { @@ -806,13 +781,6 @@ public void startStatusPages() { statusPages.start(); } - public void addWorkerStatusPage(BaseStatusServlet page) { - statusPages.addServlet(page); - if (page instanceof Capturable) { - statusPages.addCapturePage((Capturable) page); - } - } - public void stop() { try { for (ScheduledExecutorService timer : scheduledExecutors) { @@ -843,9 +811,7 @@ public void stop() { state.close(); } - // one last send - reportPeriodicWorkerUpdates(); - reportPeriodicWorkerMessage(); + workerStatusReporter.stop(); } catch (Exception e) { LOG.warn("Exception while shutting down: ", e); } @@ -1132,7 +1098,7 @@ private void process( work.getLatencyTrackingId()); StreamingModeExecutionContext context = new StreamingModeExecutionContext( - pendingDeltaCounters, + streamingCounters.pendingDeltaCounters(), computationId, readerCache, !computationState.getTransformUserNameToStateFamily().isEmpty() @@ -1151,7 +1117,7 @@ private void process( readerRegistry, sinkRegistry, context, - pendingDeltaCounters, + streamingCounters.pendingDeltaCounters(), idGenerator); ReadOperation readOperation = mapTaskExecutor.getReadOperation(); // Disable progress updates since its results are unused for streaming @@ -1298,11 +1264,11 @@ private void process( int estimatedCommitSize = commitSize < 0 ? Integer.MAX_VALUE : commitSize; // Detect overflow of integer serialized size or if the byte limit was exceeded. - windmillMaxObservedWorkItemCommitBytes.addValue(estimatedCommitSize); + streamingCounters.windmillMaxObservedWorkItemCommitBytes().addValue(estimatedCommitSize); if (commitSize < 0 || commitSize > byteLimit) { KeyCommitTooLargeException e = KeyCommitTooLargeException.causedBy(computationId, byteLimit, commitRequest); - reportFailure(computationId, workItem, e); + failureTracker.trackFailure(computationId, workItem, e); LOG.error(e.toString()); // Drop the current request in favor of a new, minimal one requesting truncation. 
@@ -1327,9 +1293,9 @@ private void process( } } long stateBytesRead = stateReader.getBytesRead() + localSideInputStateFetcher.getBytesRead(); - windmillShuffleBytesRead.addValue(shuffleBytesRead); - windmillStateBytesRead.addValue(stateBytesRead); - windmillStateBytesWritten.addValue(stateBytesWritten); + streamingCounters.windmillShuffleBytesRead().addValue(shuffleBytesRead); + streamingCounters.windmillStateBytesRead().addValue(stateBytesRead); + streamingCounters.windmillStateBytesWritten().addValue(stateBytesWritten); LOG.debug("Processing done for work token: {}", workItem.getWorkToken()); } catch (Throwable t) { @@ -1345,71 +1311,13 @@ private void process( } } - t = t instanceof UserCodeException ? t.getCause() : t; - - boolean retryLocally = false; - if (KeyTokenInvalidException.isKeyTokenInvalidException(t)) { - LOG.debug( - "Execution of work for computation '{}' on key '{}' failed due to token expiration. " - + "Work will not be retried locally.", - computationId, - key.toStringUtf8()); - } else if (WorkItemCancelledException.isWorkItemCancelledException(t)) { - LOG.debug( - "Execution of work for computation '{}' on key '{}' failed. " - + "Work will not be retried locally.", - computationId, - workItem.getShardingKey()); - } else { - LastExceptionDataProvider.reportException(t); - LOG.debug("Failed work: {}", work); - Duration elapsedTimeSinceStart = new Duration(work.getStartTime(), clock.get()); - if (!reportFailure(computationId, workItem, t)) { - LOG.error( - "Execution of work for computation '{}' on key '{}' failed with uncaught exception, " - + "and Windmill indicated not to retry locally.", - computationId, - key.toStringUtf8(), - t); - } else if (isOutOfMemoryError(t)) { - File heapDump = memoryMonitor.tryToDumpHeap(); - LOG.error( - "Execution of work for computation '{}' for key '{}' failed with out-of-memory. " - + "Work will not be retried locally. Heap dump {}.", - computationId, - key.toStringUtf8(), - heapDump == null ? "not written" : ("written to '" + heapDump + "'"), - t); - } else if (elapsedTimeSinceStart.isLongerThan(MAX_LOCAL_PROCESSING_RETRY_DURATION)) { - LOG.error( - "Execution of work for computation '{}' for key '{}' failed with uncaught exception, " - + "and it will not be retried locally because the elapsed time since start {} " - + "exceeds {}.", - computationId, - key.toStringUtf8(), - elapsedTimeSinceStart, - MAX_LOCAL_PROCESSING_RETRY_DURATION, - t); - } else { - LOG.error( - "Execution of work for computation '{}' on key '{}' failed with uncaught exception. " - + "Work will be retried locally.", - computationId, - key.toStringUtf8(), - t); - retryLocally = true; - } - } - if (retryLocally) { - // Try again after some delay and at the end of the queue to avoid a tight loop. - sleep(retryLocallyDelayMs); - workUnitExecutor.forceExecute(work, work.getWorkItem().getSerializedSize()); - } else { - // Consider the item invalid. It will eventually be retried by Windmill if it still needs to - // be processed. - computationState.completeWorkAndScheduleNextWorkForKey( - ShardedKey.create(key, workItem.getShardingKey()), work.id()); - } + workFailureProcessor.logAndProcessFailure( + computationId, + work, + t, + invalidWork -> + computationState.completeWorkAndScheduleNextWorkForKey( + createShardedKey(invalidWork), invalidWork.id())); } finally { // Update total processing time counters. Updating in finally clause ensures that // work items causing exceptions are also accounted in time spent. 
@@ -1432,6 +1340,10 @@ private void process( } } + private static ShardedKey createShardedKey(Work work) { + return ShardedKey.create(work.getWorkItem().getKey(), work.getWorkItem().getShardingKey()); + } + private WorkItemCommitRequest buildWorkItemTruncationRequest( final ByteString key, final Windmill.WorkItem workItem, final int estimatedCommitSize) { Windmill.WorkItemCommitRequest.Builder outputBuilder = initializeOutputBuilder(key, workItem); @@ -1646,275 +1558,15 @@ private void getConfig(String computation) { @VisibleForTesting public Iterable buildCounters() { return Iterables.concat( - pendingDeltaCounters.extractModifiedDeltaUpdates(DataflowCounterUpdateExtractor.INSTANCE), - pendingCumulativeCounters.extractUpdates(false, DataflowCounterUpdateExtractor.INSTANCE)); - } - - private String buildExceptionStackTrace(Throwable t, final int maxDepth) { - StringBuilder builder = new StringBuilder(1024); - Throwable cur = t; - for (int depth = 0; cur != null && depth < maxDepth; cur = cur.getCause()) { - if (depth > 0) { - builder.append("\nCaused by: "); - } - builder.append(cur); - depth++; - for (StackTraceElement frame : cur.getStackTrace()) { - if (depth < maxDepth) { - builder.append("\n "); - builder.append(frame); - depth++; - } - } - } - if (cur != null) { - builder.append("\nStack trace truncated. Please see Cloud Logging for the entire trace."); - } - return builder.toString(); - } - - // Returns true if reporting the exception is successful and the work should be retried. - private boolean reportFailure(String computation, Windmill.WorkItem work, Throwable t) { - // Enqueue the errors to be sent to DFE in periodic updates - addFailure(buildExceptionStackTrace(t, options.getMaxStackTraceDepthToReport())); - if (windmillServiceEnabled) { - return true; - } else { - Windmill.ReportStatsResponse response = - windmillServer.reportStats( - Windmill.ReportStatsRequest.newBuilder() - .setComputationId(computation) - .setKey(work.getKey()) - .setShardingKey(work.getShardingKey()) - .setWorkToken(work.getWorkToken()) - .build()); - return !response.getFailed(); - } - } - - /** - * Adds the given failure message to the queue of messages to be reported to DFE in periodic - * updates. - */ - public void addFailure(String failureMessage) { - synchronized (pendingFailuresToReport) { - pendingFailuresToReport.add(failureMessage); - } - } - - private void reportHarnessStartup() { - DataflowWorkerLoggingMDC.setStageName("startup"); - CounterSet restartCounter = new CounterSet(); - restartCounter - .longSum(StreamingSystemCounterNames.JAVA_HARNESS_RESTARTS.counterName()) - .addValue(1L); - try { - // Sending a one time update. Use empty counter set for cumulativeCounters (2nd arg). - sendWorkerUpdatesToDataflowService(restartCounter, new CounterSet()); - } catch (IOException e) { - LOG.warn("Failed to send harness startup counter", e); - } - } - - /** Updates VM metrics like memory and CPU utilization. 
*/ - private void updateVMMetrics() { - Runtime rt = Runtime.getRuntime(); - long usedMemory = rt.totalMemory() - rt.freeMemory(); - long maxMemory = rt.maxMemory(); - - javaHarnessUsedMemory.getAndReset(); - javaHarnessUsedMemory.addValue(usedMemory); - javaHarnessMaxMemory.getAndReset(); - javaHarnessMaxMemory.addValue(maxMemory); - } - - private void updateThreadMetrics() { - timeAtMaxActiveThreads.getAndReset(); - long allThreadsActiveTime = workUnitExecutor.allThreadsActiveTime(); - timeAtMaxActiveThreads.addValue(allThreadsActiveTime - previousTimeAtMaxThreads.get()); - previousTimeAtMaxThreads.set(allThreadsActiveTime); - activeThreads.getAndReset(); - activeThreads.addValue(workUnitExecutor.activeCount()); - totalAllocatedThreads.getAndReset(); - totalAllocatedThreads.addValue(chooseMaximumNumberOfThreads()); - outstandingBytes.getAndReset(); - outstandingBytes.addValue(workUnitExecutor.bytesOutstanding()); - maxOutstandingBytes.getAndReset(); - maxOutstandingBytes.addValue(workUnitExecutor.maximumBytesOutstanding()); - outstandingBundles.getAndReset(); - outstandingBundles.addValue((long) workUnitExecutor.elementsOutstanding()); - maxOutstandingBundles.getAndReset(); - maxOutstandingBundles.addValue((long) workUnitExecutor.maximumElementsOutstanding()); - } - - private WorkerMessage createWorkerMessageForStreamingScalingReport() { - StreamingScalingReport activeThreadsReport = - new StreamingScalingReport() - .setActiveThreadCount(workUnitExecutor.activeCount()) - .setActiveBundleCount(workUnitExecutor.elementsOutstanding()) - .setOutstandingBytes(workUnitExecutor.bytesOutstanding()) - .setMaximumThreadCount(chooseMaximumNumberOfThreads()) - .setMaximumBundleCount(workUnitExecutor.maximumElementsOutstanding()) - .setMaximumBytes(workUnitExecutor.maximumBytesOutstanding()); - return workUnitClient.createWorkerMessageFromStreamingScalingReport(activeThreadsReport); - } - - private Optional createWorkerMessageForPerWorkerMetrics() { - List metrics = new ArrayList<>(); - stageInfoMap.values().forEach(s -> metrics.addAll(s.extractPerWorkerMetricValues())); - - if (metrics.isEmpty()) { - return Optional.empty(); - } - - PerWorkerMetrics perWorkerMetrics = new PerWorkerMetrics().setPerStepNamespaceMetrics(metrics); - return Optional.of(workUnitClient.createWorkerMessageFromPerWorkerMetrics(perWorkerMetrics)); - } - - private void sendWorkerMessage() throws IOException { - List workerMessages = new ArrayList(2); - workerMessages.add(createWorkerMessageForStreamingScalingReport()); - - if (StreamingStepMetricsContainer.getEnablePerWorkerMetrics()) { - Optional metricsMsg = createWorkerMessageForPerWorkerMetrics(); - if (metricsMsg.isPresent()) { - workerMessages.add(metricsMsg.get()); - } - } - - workUnitClient.reportWorkerMessage(workerMessages); - } - - @VisibleForTesting - public void reportPeriodicWorkerUpdates() { - updateVMMetrics(); - updateThreadMetrics(); - try { - sendWorkerUpdatesToDataflowService(pendingDeltaCounters, pendingCumulativeCounters); - } catch (IOException e) { - LOG.warn("Failed to send periodic counter updates", e); - } catch (Exception e) { - LOG.error("Unexpected exception while trying to send counter updates", e); - } - } - - @VisibleForTesting - public void reportPeriodicWorkerMessage() { - try { - sendWorkerMessage(); - } catch (IOException e) { - LOG.warn("Failed to send worker messages", e); - } catch (Exception e) { - LOG.error("Unexpected exception while trying to send worker messages", e); - } - } - - /** - * Returns key for a counter update. 
It is a String in case of legacy counter and - * CounterStructuredName in the case of a structured counter. - */ - private Object getCounterUpdateKey(CounterUpdate counterUpdate) { - Object key = null; - if (counterUpdate.getNameAndKind() != null) { - key = counterUpdate.getNameAndKind().getName(); - } else if (counterUpdate.getStructuredNameAndMetadata() != null) { - key = counterUpdate.getStructuredNameAndMetadata().getName(); - } - checkArgument(key != null, "Could not find name for CounterUpdate: %s", counterUpdate); - return key; - } - - /** Sends counter updates to Dataflow backend. */ - private void sendWorkerUpdatesToDataflowService( - CounterSet deltaCounters, CounterSet cumulativeCounters) throws IOException { - // Throttle time is tracked by the windmillServer but is reported to DFE here. - windmillQuotaThrottling.addValue(windmillServer.getAndResetThrottleTime()); - if (memoryMonitor.isThrashing()) { - memoryThrashing.addValue(1); - } - - List counterUpdates = new ArrayList<>(128); - - if (publishCounters) { - stageInfoMap.values().forEach(s -> counterUpdates.addAll(s.extractCounterUpdates())); - counterUpdates.addAll( - cumulativeCounters.extractUpdates(false, DataflowCounterUpdateExtractor.INSTANCE)); - counterUpdates.addAll( - deltaCounters.extractModifiedDeltaUpdates(DataflowCounterUpdateExtractor.INSTANCE)); - } - - // Handle duplicate counters from different stages. Store all the counters in a multi-map and - // send the counters that appear multiple times in separate RPCs. Same logical counter could - // appear in multiple stages if a step runs in multiple stages (as with flatten-unzipped stages) - // especially if the counter definition does not set execution_step_name. - ListMultimap counterMultimap = - MultimapBuilder.hashKeys(counterUpdates.size()).linkedListValues().build(); - boolean hasDuplicates = false; - - for (CounterUpdate c : counterUpdates) { - Object key = getCounterUpdateKey(c); - if (counterMultimap.containsKey(key)) { - hasDuplicates = true; - } - counterMultimap.put(key, c); - } - - // Clears counterUpdates and enqueues unique counters from counterMultimap. If a counter - // appears more than once, one of them is extracted leaving the remaining in the map. - Runnable extractUniqueCounters = - () -> { - counterUpdates.clear(); - for (Iterator iter = counterMultimap.keySet().iterator(); iter.hasNext(); ) { - List counters = counterMultimap.get(iter.next()); - counterUpdates.add(counters.get(0)); - if (counters.size() == 1) { - // There is single value. Remove the entry through the iterator. - iter.remove(); - } else { - // Otherwise remove the first value. - counters.remove(0); - } - } - }; - - if (hasDuplicates) { - extractUniqueCounters.run(); - } else { // Common case: no duplicates. We can just send counterUpdates, empty the multimap. - counterMultimap.clear(); - } - - List errors; - synchronized (pendingFailuresToReport) { - errors = new ArrayList<>(pendingFailuresToReport.size()); - for (String stackTrace : pendingFailuresToReport) { - errors.add( - new Status() - .setCode(2) // rpc.Code.UNKNOWN - .setMessage(stackTrace)); - } - pendingFailuresToReport.clear(); // Best effort only, no need to wait till successfully sent. 
- } - - WorkItemStatus workItemStatus = - new WorkItemStatus() - .setWorkItemId(WINDMILL_COUNTER_UPDATE_WORK_ID) - .setErrors(errors) - .setCounterUpdates(counterUpdates); - - workUnitClient.reportWorkItemStatus(workItemStatus); - - // Send any counters appearing more than once in subsequent RPCs: - while (!counterMultimap.isEmpty()) { - extractUniqueCounters.run(); - workUnitClient.reportWorkItemStatus( - new WorkItemStatus() - .setWorkItemId(WINDMILL_COUNTER_UPDATE_WORK_ID) - .setCounterUpdates(counterUpdates)); - } + streamingCounters + .pendingDeltaCounters() + .extractModifiedDeltaUpdates(DataflowCounterUpdateExtractor.INSTANCE), + streamingCounters + .pendingCumulativeCounters() + .extractUpdates(false, DataflowCounterUpdateExtractor.INSTANCE)); } private class HarnessDataProvider implements StatusDataProvider { - @Override public void appendSummaryHtml(PrintWriter writer) { writer.println("Running: " + running.get() + "
"); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkUnitClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkUnitClient.java index f9637a375ed5..d75d91d00885 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkUnitClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkUnitClient.java @@ -28,7 +28,7 @@ import java.util.Optional; /** Abstract base class describing a client for WorkItem work units. */ -interface WorkUnitClient { +public interface WorkUnitClient { /** * Returns a new WorkItem unit for this Worker to work on or null if no work item is available. */ diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingCounters.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingCounters.java new file mode 100644 index 000000000000..6e4f09926eda --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingCounters.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.harness; + +import com.google.auto.value.AutoValue; +import org.apache.beam.runners.dataflow.worker.DataflowSystemMetrics.StreamingSystemCounterNames; +import org.apache.beam.runners.dataflow.worker.counters.Counter; +import org.apache.beam.runners.dataflow.worker.counters.CounterSet; +import org.apache.beam.sdk.annotations.Internal; + +/** Streaming pipeline counters to report pipeline processing metrics to Dataflow backend. 
*/ +@Internal +@AutoValue +public abstract class StreamingCounters { + + public static StreamingCounters create() { + CounterSet pendingDeltaCounters = new CounterSet(); + CounterSet pendingCumulativeCounters = new CounterSet(); + return new AutoValue_StreamingCounters( + pendingDeltaCounters, + pendingCumulativeCounters, + pendingDeltaCounters.longSum( + StreamingSystemCounterNames.WINDMILL_SHUFFLE_BYTES_READ.counterName()), + pendingDeltaCounters.longSum( + StreamingSystemCounterNames.WINDMILL_STATE_BYTES_READ.counterName()), + pendingDeltaCounters.longSum( + StreamingSystemCounterNames.WINDMILL_STATE_BYTES_WRITTEN.counterName()), + pendingDeltaCounters.longSum( + StreamingSystemCounterNames.WINDMILL_QUOTA_THROTTLING.counterName()), + pendingDeltaCounters.longSum( + StreamingSystemCounterNames.TIME_AT_MAX_ACTIVE_THREADS.counterName()), + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.JAVA_HARNESS_USED_MEMORY.counterName()), + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.JAVA_HARNESS_MAX_MEMORY.counterName()), + pendingCumulativeCounters.intSum(StreamingSystemCounterNames.ACTIVE_THREADS.counterName()), + pendingCumulativeCounters.intSum( + StreamingSystemCounterNames.TOTAL_ALLOCATED_THREADS.counterName()), + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.OUTSTANDING_BYTES.counterName()), + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.MAX_OUTSTANDING_BYTES.counterName()), + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.OUTSTANDING_BUNDLES.counterName()), + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.MAX_OUTSTANDING_BUNDLES.counterName()), + pendingCumulativeCounters.intMax( + StreamingSystemCounterNames.WINDMILL_MAX_WORK_ITEM_COMMIT_BYTES.counterName()), + pendingCumulativeCounters.intSum( + StreamingSystemCounterNames.MEMORY_THRASHING.counterName())); + } + + public abstract CounterSet pendingDeltaCounters(); + + public abstract CounterSet pendingCumulativeCounters(); + // Built-in delta counters. + public abstract Counter windmillShuffleBytesRead(); + + public abstract Counter windmillStateBytesRead(); + + public abstract Counter windmillStateBytesWritten(); + + public abstract Counter windmillQuotaThrottling(); + + public abstract Counter timeAtMaxActiveThreads(); + // Built-in cumulative counters. + public abstract Counter javaHarnessUsedMemory(); + + public abstract Counter javaHarnessMaxMemory(); + + public abstract Counter activeThreads(); + + public abstract Counter totalAllocatedThreads(); + + public abstract Counter outstandingBytes(); + + public abstract Counter maxOutstandingBytes(); + + public abstract Counter outstandingBundles(); + + public abstract Counter maxOutstandingBundles(); + + public abstract Counter windmillMaxObservedWorkItemCommitBytes(); + + public abstract Counter memoryThrashing(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusReporter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusReporter.java new file mode 100644 index 000000000000..409f0337eebd --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerStatusReporter.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.harness; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; + +import com.google.api.services.dataflow.model.CounterUpdate; +import com.google.api.services.dataflow.model.PerStepNamespaceMetrics; +import com.google.api.services.dataflow.model.PerWorkerMetrics; +import com.google.api.services.dataflow.model.StreamingScalingReport; +import com.google.api.services.dataflow.model.WorkItemStatus; +import com.google.api.services.dataflow.model.WorkerMessage; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; +import java.util.function.Supplier; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.DataflowSystemMetrics; +import org.apache.beam.runners.dataflow.worker.StreamingStepMetricsContainer; +import org.apache.beam.runners.dataflow.worker.WorkUnitClient; +import org.apache.beam.runners.dataflow.worker.counters.CounterSet; +import org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor; +import org.apache.beam.runners.dataflow.worker.logging.DataflowWorkerLoggingMDC; +import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; +import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; +import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; +import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.FailureTracker; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ListMultimap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.MultimapBuilder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Reports the status of the worker to Dataflow Service. */ +@Internal +@ThreadSafe +public final class StreamingWorkerStatusReporter { + private static final Logger LOG = LoggerFactory.getLogger(StreamingWorkerStatusReporter.class); + + // Reserved ID for counter updates. + // Matches kWindmillCounterUpdate in workflow_worker_service_multi_hubs.cc. 
+ private static final String WINDMILL_COUNTER_UPDATE_WORK_ID = "3"; + private static final int COUNTER_UPDATES_SIZE = 128; + private static final String WORKER_MESSAGE_REPORTER_THREAD = "ReportWorkerMessage"; + private static final String GLOBAL_WORKER_UPDATE_REPORTER_THREAD = "GlobalWorkerUpdates"; + + private final boolean publishCounters; + private final WorkUnitClient dataflowServiceClient; + private final Supplier windmillQuotaThrottleTime; + private final Supplier> allStageInfo; + private final FailureTracker failureTracker; + private final StreamingCounters streamingCounters; + private final MemoryMonitor memoryMonitor; + private final BoundedQueueExecutor workExecutor; + private final AtomicLong previousTimeAtMaxThreads; + private final ScheduledExecutorService globalWorkerUpdateReporter; + private final ScheduledExecutorService workerMessageReporter; + + private StreamingWorkerStatusReporter( + boolean publishCounters, + WorkUnitClient dataflowServiceClient, + Supplier windmillQuotaThrottleTime, + Supplier> allStageInfo, + FailureTracker failureTracker, + StreamingCounters streamingCounters, + MemoryMonitor memoryMonitor, + BoundedQueueExecutor workExecutor, + Function executorFactory) { + this.publishCounters = publishCounters; + this.dataflowServiceClient = dataflowServiceClient; + this.windmillQuotaThrottleTime = windmillQuotaThrottleTime; + this.allStageInfo = allStageInfo; + this.failureTracker = failureTracker; + this.streamingCounters = streamingCounters; + this.memoryMonitor = memoryMonitor; + this.workExecutor = workExecutor; + this.previousTimeAtMaxThreads = new AtomicLong(); + this.globalWorkerUpdateReporter = executorFactory.apply(GLOBAL_WORKER_UPDATE_REPORTER_THREAD); + this.workerMessageReporter = executorFactory.apply(WORKER_MESSAGE_REPORTER_THREAD); + } + + public static StreamingWorkerStatusReporter create( + WorkUnitClient workUnitClient, + Supplier windmillQuotaThrottleTime, + Supplier> allStageInfo, + FailureTracker failureTracker, + StreamingCounters streamingCounters, + MemoryMonitor memoryMonitor, + BoundedQueueExecutor workExecutor) { + return new StreamingWorkerStatusReporter( + /* publishCounters= */ true, + workUnitClient, + windmillQuotaThrottleTime, + allStageInfo, + failureTracker, + streamingCounters, + memoryMonitor, + workExecutor, + threadName -> + Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder().setNameFormat(threadName).build())); + } + + @VisibleForTesting + public static StreamingWorkerStatusReporter forTesting( + boolean publishCounters, + WorkUnitClient workUnitClient, + Supplier windmillQuotaThrottleTime, + Supplier> allStageInfo, + FailureTracker failureTracker, + StreamingCounters streamingCounters, + MemoryMonitor memoryMonitor, + BoundedQueueExecutor workExecutor, + Function executorFactory) { + return new StreamingWorkerStatusReporter( + publishCounters, + workUnitClient, + windmillQuotaThrottleTime, + allStageInfo, + failureTracker, + streamingCounters, + memoryMonitor, + workExecutor, + executorFactory); + } + + /** + * Returns key for a counter update. It is a String in case of legacy counter and + * CounterStructuredName in the case of a structured counter. 
+   */
+  private static Object getCounterUpdateKey(CounterUpdate counterUpdate) {
+    Object key = null;
+    if (counterUpdate.getNameAndKind() != null) {
+      key = counterUpdate.getNameAndKind().getName();
+    } else if (counterUpdate.getStructuredNameAndMetadata() != null) {
+      key = counterUpdate.getStructuredNameAndMetadata().getName();
+    }
+    return checkNotNull(key, "Could not find name for CounterUpdate: %s", counterUpdate);
+  }
+
+  /**
+   * Clears counterUpdates and adds one instance of each counter from counterMultimap. If a counter
+   * appears more than once, one instance is extracted, leaving the remainder in the map.
+   */
+  private static void extractUniqueCounters(
+      List<CounterUpdate> counterUpdates, ListMultimap<Object, CounterUpdate> counterMultimap) {
+    counterUpdates.clear();
+    for (Iterator<Object> iter = counterMultimap.keySet().iterator(); iter.hasNext(); ) {
+      List<CounterUpdate> counters = counterMultimap.get(iter.next());
+      counterUpdates.add(counters.get(0));
+      if (counters.size() == 1) {
+        // There is a single value. Remove the entry through the iterator.
+        iter.remove();
+      } else {
+        // Otherwise remove the first value.
+        counters.remove(0);
+      }
+    }
+  }
+
+  private static void shutdownExecutor(ScheduledExecutorService executor) {
+    executor.shutdown();
+    try {
+      executor.awaitTermination(10, TimeUnit.SECONDS);
+    } catch (InterruptedException e) {
+      LOG.warn("Error occurred trying to gracefully shutdown executor={}", executor, e);
+      executor.shutdownNow();
+    }
+  }
+
+  @SuppressWarnings("FutureReturnValueIgnored")
+  public void start(long windmillHarnessUpdateReportingPeriodMillis) {
+    reportHarnessStartup();
+    if (windmillHarnessUpdateReportingPeriodMillis > 0) {
+      LOG.info(
+          "Starting periodic worker status reporters. Reporting period is every {} millis.",
+          windmillHarnessUpdateReportingPeriodMillis);
+      // Periodically report workers counters and other updates.
+      globalWorkerUpdateReporter.scheduleWithFixedDelay(
+          this::reportPeriodicWorkerUpdates,
+          0,
+          windmillHarnessUpdateReportingPeriodMillis,
+          TimeUnit.MILLISECONDS);
+
+      workerMessageReporter.scheduleWithFixedDelay(
+          this::reportPeriodicWorkerMessage,
+          0,
+          windmillHarnessUpdateReportingPeriodMillis,
+          TimeUnit.MILLISECONDS);
+    } else {
+      LOG.info("Periodic worker status reporting is disabled.");
+    }
+  }
+
+  public void stop() {
+    shutdownExecutor(globalWorkerUpdateReporter);
+    shutdownExecutor(workerMessageReporter);
+    // one last send
+    reportPeriodicWorkerUpdates();
+    reportPeriodicWorkerMessage();
+  }
+
+  private void reportHarnessStartup() {
+    DataflowWorkerLoggingMDC.setStageName("startup");
+    CounterSet restartCounter = new CounterSet();
+    restartCounter
+        .longSum(
+            DataflowSystemMetrics.StreamingSystemCounterNames.JAVA_HARNESS_RESTARTS.counterName())
+        .addValue(1L);
+    try {
+      // Sending a one time update. Use empty counter set for cumulativeCounters (2nd arg).
+      sendWorkerUpdatesToDataflowService(restartCounter, new CounterSet());
+    } catch (IOException e) {
+      LOG.warn("Failed to send harness startup counter", e);
+    }
+  }
+
+  /** Sends counter updates to Dataflow backend. */
+  private void sendWorkerUpdatesToDataflowService(
+      CounterSet deltaCounters, CounterSet cumulativeCounters) throws IOException {
+    // Throttle time is tracked by the windmillServer but is reported to DFE here.
+    streamingCounters.windmillQuotaThrottling().addValue(windmillQuotaThrottleTime.get());
+    if (memoryMonitor.isThrashing()) {
+      streamingCounters.memoryThrashing().addValue(1);
+    }
+
+    List<CounterUpdate> counterUpdates = new ArrayList<>(COUNTER_UPDATES_SIZE);
+
+    if (publishCounters) {
+      allStageInfo.get().forEach(s -> counterUpdates.addAll(s.extractCounterUpdates()));
+      counterUpdates.addAll(
+          cumulativeCounters.extractUpdates(false, DataflowCounterUpdateExtractor.INSTANCE));
+      counterUpdates.addAll(
+          deltaCounters.extractModifiedDeltaUpdates(DataflowCounterUpdateExtractor.INSTANCE));
+    }
+
+    // Handle duplicate counters from different stages. Store all the counters in a multimap and
+    // send the counters that appear multiple times in separate RPCs. Same logical counter could
+    // appear in multiple stages if a step runs in multiple stages (as with flatten-unzipped stages)
+    // especially if the counter definition does not set execution_step_name.
+    ListMultimap<Object, CounterUpdate> counterMultimap =
+        MultimapBuilder.hashKeys(counterUpdates.size()).linkedListValues().build();
+    boolean hasDuplicates = false;
+
+    for (CounterUpdate c : counterUpdates) {
+      Object key = getCounterUpdateKey(c);
+      if (counterMultimap.containsKey(key)) {
+        hasDuplicates = true;
+      }
+      counterMultimap.put(key, c);
+    }
+
+    if (hasDuplicates) {
+      extractUniqueCounters(counterUpdates, counterMultimap);
+    } else { // Common case: no duplicates. We can just send counterUpdates, empty the multimap.
+      counterMultimap.clear();
+    }
+
+    WorkItemStatus workItemStatus =
+        new WorkItemStatus()
+            .setWorkItemId(WINDMILL_COUNTER_UPDATE_WORK_ID)
+            .setErrors(failureTracker.drainPendingFailuresToReport())
+            .setCounterUpdates(counterUpdates);
+
+    dataflowServiceClient.reportWorkItemStatus(workItemStatus);
+
+    // Send any counters appearing more than once in subsequent RPCs:
+    while (!counterMultimap.isEmpty()) {
+      extractUniqueCounters(counterUpdates, counterMultimap);
+      dataflowServiceClient.reportWorkItemStatus(
+          new WorkItemStatus()
+              .setWorkItemId(WINDMILL_COUNTER_UPDATE_WORK_ID)
+              .setCounterUpdates(counterUpdates));
+    }
+  }
+
+  private void reportPeriodicWorkerMessage() {
+    try {
+      dataflowServiceClient.reportWorkerMessage(createWorkerMessage());
+    } catch (IOException e) {
+      LOG.warn("Failed to send worker messages", e);
+    } catch (Exception e) {
+      LOG.error("Unexpected exception while trying to send worker messages", e);
+    }
+  }
+
+  private List<WorkerMessage> createWorkerMessage() {
+    List<WorkerMessage> workerMessages = new ArrayList<>(2);
+    workerMessages.add(createWorkerMessageForStreamingScalingReport());
+
+    if (StreamingStepMetricsContainer.getEnablePerWorkerMetrics()) {
+      Optional<WorkerMessage> metricsMsg = createWorkerMessageForPerWorkerMetrics();
+      metricsMsg.ifPresent(workerMessages::add);
+    }
+
+    return workerMessages;
+  }
+
+  private WorkerMessage createWorkerMessageForStreamingScalingReport() {
+    StreamingScalingReport activeThreadsReport =
+        new StreamingScalingReport()
+            .setActiveThreadCount(workExecutor.activeCount())
+            .setActiveBundleCount(workExecutor.elementsOutstanding())
+            .setOutstandingBytes(workExecutor.bytesOutstanding())
+            .setMaximumThreadCount(workExecutor.getMaximumPoolSize())
+            .setMaximumBundleCount(workExecutor.maximumElementsOutstanding())
+            .setMaximumBytes(workExecutor.maximumBytesOutstanding());
+    return dataflowServiceClient.createWorkerMessageFromStreamingScalingReport(activeThreadsReport);
+  }
+
+  private Optional<WorkerMessage> createWorkerMessageForPerWorkerMetrics() {
+    List<PerStepNamespaceMetrics> metrics = new ArrayList<>();
+    allStageInfo.get().forEach(s ->
metrics.addAll(s.extractPerWorkerMetricValues())); + + if (metrics.isEmpty()) { + return Optional.empty(); + } + + PerWorkerMetrics perWorkerMetrics = new PerWorkerMetrics().setPerStepNamespaceMetrics(metrics); + return Optional.of( + dataflowServiceClient.createWorkerMessageFromPerWorkerMetrics(perWorkerMetrics)); + } + + @VisibleForTesting + public void reportPeriodicWorkerUpdates() { + updateVMMetrics(); + updateThreadMetrics(); + try { + sendWorkerUpdatesToDataflowService( + streamingCounters.pendingDeltaCounters(), streamingCounters.pendingCumulativeCounters()); + } catch (IOException e) { + LOG.warn("Failed to send periodic counter updates", e); + } catch (Exception e) { + LOG.error("Unexpected exception while trying to send counter updates", e); + } + } + + private void updateVMMetrics() { + Runtime rt = Runtime.getRuntime(); + long usedMemory = rt.totalMemory() - rt.freeMemory(); + long maxMemory = rt.maxMemory(); + + streamingCounters.javaHarnessUsedMemory().getAndReset(); + streamingCounters.javaHarnessUsedMemory().addValue(usedMemory); + streamingCounters.javaHarnessMaxMemory().getAndReset(); + streamingCounters.javaHarnessMaxMemory().addValue(maxMemory); + } + + private void updateThreadMetrics() { + streamingCounters.timeAtMaxActiveThreads().getAndReset(); + long allThreadsActiveTime = workExecutor.allThreadsActiveTime(); + streamingCounters + .timeAtMaxActiveThreads() + .addValue(allThreadsActiveTime - previousTimeAtMaxThreads.get()); + previousTimeAtMaxThreads.set(allThreadsActiveTime); + streamingCounters.activeThreads().getAndReset(); + streamingCounters.activeThreads().addValue(workExecutor.activeCount()); + streamingCounters.totalAllocatedThreads().getAndReset(); + streamingCounters.totalAllocatedThreads().addValue(workExecutor.getMaximumPoolSize()); + streamingCounters.outstandingBytes().getAndReset(); + streamingCounters.outstandingBytes().addValue(workExecutor.bytesOutstanding()); + streamingCounters.maxOutstandingBytes().getAndReset(); + streamingCounters.maxOutstandingBytes().addValue(workExecutor.maximumBytesOutstanding()); + streamingCounters.outstandingBundles().getAndReset(); + streamingCounters.outstandingBundles().addValue((long) workExecutor.elementsOutstanding()); + streamingCounters.maxOutstandingBundles().getAndReset(); + streamingCounters + .maxOutstandingBundles() + .addValue((long) workExecutor.maximumElementsOutstanding()); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java index cd4c727e310e..b1a0e087ef65 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java @@ -33,6 +33,7 @@ public class BoundedQueueExecutor { private final ThreadPoolExecutor executor; private final int maximumElementsOutstanding; private final long maximumBytesOutstanding; + private final int maximumPoolSize; private final Monitor monitor = new Monitor(); private int elementsOutstanding = 0; @@ -48,6 +49,7 @@ public BoundedQueueExecutor( int maximumElementsOutstanding, long maximumBytesOutstanding, ThreadFactory threadFactory) { + this.maximumPoolSize = maximumPoolSize; executor = new ThreadPoolExecutor( maximumPoolSize, @@ -139,6 +141,10 @@ 
public int maximumElementsOutstanding() {
     return maximumElementsOutstanding;
   }
 
+  public final int getMaximumPoolSize() {
+    return maximumPoolSize;
+  }
+
   public String summaryHtml() {
     monitor.enter();
     try {
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/FailureTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/FailureTracker.java
new file mode 100644
index 000000000000..98ed71963a17
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/FailureTracker.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures;
+
+import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList;
+
+import com.google.api.services.dataflow.model.Status;
+import com.google.rpc.Code;
+import javax.annotation.concurrent.GuardedBy;
+import javax.annotation.concurrent.ThreadSafe;
+import org.apache.beam.runners.dataflow.worker.windmill.Windmill;
+import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem;
+import org.apache.beam.sdk.annotations.Internal;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.EvictingQueue;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+
+/** Tracks failures that occur during user processing. */
+@ThreadSafe
+@Internal
+public abstract class FailureTracker {
+
+  private final int maxStackTraceDepthToReport;
+
+  @GuardedBy("pendingFailuresToReport")
+  private final EvictingQueue<String> pendingFailuresToReport;
+
+  protected FailureTracker(int maxFailuresToReportInUpdate, int maxStackTraceDepthToReport) {
+    this.pendingFailuresToReport = EvictingQueue.create(maxFailuresToReportInUpdate);
+    this.maxStackTraceDepthToReport = maxStackTraceDepthToReport;
+  }
+
+  /**
+   * Reports the failure to the streaming backend. Returns whether the processing can be retried
+   * locally.
+   */
+  public final boolean trackFailure(String computationId, WorkItem work, Throwable failure) {
+    // Adds the given failure message to the queue of messages to be reported to DFE in periodic
+    // updates.
+    synchronized (pendingFailuresToReport) {
+      pendingFailuresToReport.add(buildExceptionStackTrace(failure));
+    }
+    return reportFailureInternal(computationId, work);
+  }
+
+  /**
+   * Returns all pending failures that have not been reported to the Dataflow backend, then clears
+   * the pending failure queue.
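+   * Reporting is best effort; drained failures are not re-added to the queue if the send fails.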
+   */
+  public final ImmutableList<Status> drainPendingFailuresToReport() {
+    synchronized (pendingFailuresToReport) {
+      ImmutableList<Status> pendingFailures =
+          pendingFailuresToReport.stream()
+              .map(
+                  stackTrace ->
+                      new Status().setCode(Code.UNKNOWN.getNumber()).setMessage(stackTrace))
+              .collect(toImmutableList());
+
+      // Best effort only, no need to wait till successfully sent.
+      pendingFailuresToReport.clear();
+
+      return pendingFailures;
+    }
+  }
+
+  private String buildExceptionStackTrace(Throwable t) {
+    StringBuilder builder = new StringBuilder(1024);
+    Throwable cur = t;
+    for (int depth = 0; cur != null && depth < maxStackTraceDepthToReport; cur = cur.getCause()) {
+      if (depth > 0) {
+        builder.append("\nCaused by: ");
+      }
+      builder.append(cur);
+      depth++;
+      for (StackTraceElement frame : cur.getStackTrace()) {
+        if (depth < maxStackTraceDepthToReport) {
+          builder.append("\n ");
+          builder.append(frame);
+          depth++;
+        }
+      }
+    }
+    if (cur != null) {
+      builder.append("\nStack trace truncated. Please see Cloud Logging for the entire trace.");
+    }
+    return builder.toString();
+  }
+
+  protected abstract boolean reportFailureInternal(String computationId, Windmill.WorkItem work);
+}
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/HeapDumper.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/HeapDumper.java
new file mode 100644
index 000000000000..3191a4ebbbf7
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/HeapDumper.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures;
+
+import java.io.File;
+import java.util.Optional;
+
+@FunctionalInterface
+public interface HeapDumper {
+
+  /** Creates the heap dump and returns it if present. */
+  Optional<File> dumpAndGetHeap();
+}
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTracker.java
new file mode 100644
index 000000000000..dc5e3f540955
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTracker.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ReportStatsRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ReportStatsResponse; +import org.apache.beam.sdk.annotations.Internal; + +/** Implementation of {@link FailureTracker} that reports failures to Streaming Appliance. */ +@ThreadSafe +@Internal +public final class StreamingApplianceFailureTracker extends FailureTracker { + private final StreamingApplianceStatsReporter statsReporter; + + private StreamingApplianceFailureTracker( + int maxFailuresToReportInUpdate, + int maxStackTraceDepthToReport, + StreamingApplianceStatsReporter statsReporter) { + super(maxFailuresToReportInUpdate, maxStackTraceDepthToReport); + this.statsReporter = statsReporter; + } + + public static StreamingApplianceFailureTracker create( + int maxFailuresToReportInUpdate, + int maxStackTraceDepthToReport, + StreamingApplianceStatsReporter statsReporter) { + return new StreamingApplianceFailureTracker( + maxFailuresToReportInUpdate, maxStackTraceDepthToReport, statsReporter); + } + + @Override + public boolean reportFailureInternal(String computationId, Windmill.WorkItem work) { + ReportStatsResponse response = + statsReporter.reportStats( + ReportStatsRequest.newBuilder() + .setComputationId(computationId) + .setKey(work.getKey()) + .setShardingKey(work.getShardingKey()) + .setWorkToken(work.getWorkToken()) + .build()); + return !response.getFailed(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceStatsReporter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceStatsReporter.java new file mode 100644 index 000000000000..9ab0b6eaf358 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceStatsReporter.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; + +@FunctionalInterface +public interface StreamingApplianceStatsReporter { + Windmill.ReportStatsResponse reportStats(Windmill.ReportStatsRequest reportStatsRequest); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTracker.java new file mode 100644 index 000000000000..2d68886cf8be --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTracker.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem; +import org.apache.beam.sdk.annotations.Internal; + +/** Implementation of {@link FailureTracker} that reports failures to Streaming Engine. 
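+ * Failures are only recorded locally for periodic reporting; {@code reportFailureInternal} always allows local retries.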
*/ +@ThreadSafe +@Internal +public final class StreamingEngineFailureTracker extends FailureTracker { + + private StreamingEngineFailureTracker( + int maxFailuresToReportInUpdate, int maxStackTraceDepthToReport) { + super(maxFailuresToReportInUpdate, maxStackTraceDepthToReport); + } + + public static StreamingEngineFailureTracker create( + int maxFailuresToReportInUpdate, int maxStackTraceDepthToReport) { + return new StreamingEngineFailureTracker( + maxFailuresToReportInUpdate, maxStackTraceDepthToReport); + } + + @Override + protected boolean reportFailureInternal(String computationId, WorkItem work) { + return true; + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessor.java new file mode 100644 index 000000000000..594c29e0ad25 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessor.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.function.Supplier; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.KeyTokenInvalidException; +import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; +import org.apache.beam.runners.dataflow.worker.status.LastExceptionDataProvider; +import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.util.UserCodeException; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Uninterruptibles; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Processes a failure that occurs during user processing of {@link Work}. 
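+ * Failed {@link Work} is either re-queued on the work executor after a delay or handed to an invalid-work callback.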
 */
+@ThreadSafe
+@Internal
+public final class WorkFailureProcessor {
+  private static final Logger LOG = LoggerFactory.getLogger(WorkFailureProcessor.class);
+  private static final Duration MAX_LOCAL_PROCESSING_RETRY_DURATION = Duration.standardMinutes(5);
+  private static final int DEFAULT_RETRY_LOCALLY_MS = 10000;
+
+  private final BoundedQueueExecutor workUnitExecutor;
+  private final FailureTracker failureTracker;
+  private final HeapDumper heapDumper;
+  private final Supplier<Instant> clock;
+  private final int retryLocallyDelayMs;
+
+  private WorkFailureProcessor(
+      BoundedQueueExecutor workUnitExecutor,
+      FailureTracker failureTracker,
+      HeapDumper heapDumper,
+      Supplier<Instant> clock,
+      int retryLocallyDelayMs) {
+    this.workUnitExecutor = workUnitExecutor;
+    this.failureTracker = failureTracker;
+    this.heapDumper = heapDumper;
+    this.clock = clock;
+    this.retryLocallyDelayMs = retryLocallyDelayMs;
+  }
+
+  public static WorkFailureProcessor create(
+      BoundedQueueExecutor workUnitExecutor,
+      FailureTracker failureTracker,
+      HeapDumper heapDumper,
+      Supplier<Instant> clock) {
+    return new WorkFailureProcessor(
+        workUnitExecutor, failureTracker, heapDumper, clock, DEFAULT_RETRY_LOCALLY_MS);
+  }
+
+  @VisibleForTesting
+  public static WorkFailureProcessor forTesting(
+      BoundedQueueExecutor workUnitExecutor,
+      FailureTracker failureTracker,
+      HeapDumper heapDumper,
+      Supplier<Instant> clock,
+      int retryLocallyDelayMs) {
+    return new WorkFailureProcessor(
+        workUnitExecutor,
+        failureTracker,
+        heapDumper,
+        clock,
+        retryLocallyDelayMs >= 0 ? retryLocallyDelayMs : DEFAULT_RETRY_LOCALLY_MS);
+  }
+
+  /** Returns whether an exception was caused by a {@link OutOfMemoryError}. */
+  private static boolean isOutOfMemoryError(Throwable t) {
+    while (t != null) {
+      if (t instanceof OutOfMemoryError) {
+        return true;
+      }
+      t = t.getCause();
+    }
+    return false;
+  }
+
+  /**
+   * Processes failures caused by thrown exceptions that occur during execution of {@link Work}. May
+   * attempt to retry execution of the {@link Work} or drop it if it is invalid.
+   */
+  public void logAndProcessFailure(
+      String computationId, Work work, Throwable t, Consumer<Work> onInvalidWork) {
+    if (shouldRetryLocally(computationId, work, t)) {
+      // Try again after some delay and at the end of the queue to avoid a tight loop.
+      executeWithDelay(retryLocallyDelayMs, work);
+    } else {
+      // Consider the item invalid. It will eventually be retried by Windmill if it still needs to
+      // be processed.
+      onInvalidWork.accept(work);
+    }
+  }
+
+  private String tryToDumpHeap() {
+    return heapDumper
+        .dumpAndGetHeap()
+        .map(heapDump -> "written to '" + heapDump + "'")
+        .orElseGet(() -> "not written");
+  }
+
+  private void executeWithDelay(long delayMs, Work work) {
+    Uninterruptibles.sleepUninterruptibly(delayMs, TimeUnit.MILLISECONDS);
+    workUnitExecutor.forceExecute(work, work.getWorkItem().getSerializedSize());
+  }
+
+  private boolean shouldRetryLocally(String computationId, Work work, Throwable t) {
+    Throwable parsedException = t instanceof UserCodeException ? t.getCause() : t;
+    if (KeyTokenInvalidException.isKeyTokenInvalidException(parsedException)) {
+      LOG.debug(
+          "Execution of work for computation '{}' on key '{}' failed due to token expiration. "
+              + "Work will not be retried locally.",
+          computationId,
+          work.getWorkItem().getKey().toStringUtf8());
+    } else if (WorkItemCancelledException.isWorkItemCancelledException(parsedException)) {
+      LOG.debug(
+          "Execution of work for computation '{}' on key '{}' failed. 
" + + "Work will not be retried locally.", + computationId, + work.getWorkItem().getShardingKey()); + } else { + LastExceptionDataProvider.reportException(parsedException); + LOG.debug("Failed work: {}", work); + Duration elapsedTimeSinceStart = new Duration(work.getStartTime(), clock.get()); + if (!failureTracker.trackFailure(computationId, work.getWorkItem(), parsedException)) { + LOG.error( + "Execution of work for computation '{}' on key '{}' failed with uncaught exception, " + + "and Windmill indicated not to retry locally.", + computationId, + work.getWorkItem().getKey().toStringUtf8(), + parsedException); + } else if (isOutOfMemoryError(parsedException)) { + String heapDump = tryToDumpHeap(); + LOG.error( + "Execution of work for computation '{}' for key '{}' failed with out-of-memory. " + + "Work will not be retried locally. Heap dump {}.", + computationId, + work.getWorkItem().getKey().toStringUtf8(), + heapDump, + parsedException); + } else if (elapsedTimeSinceStart.isLongerThan(MAX_LOCAL_PROCESSING_RETRY_DURATION)) { + LOG.error( + "Execution of work for computation '{}' for key '{}' failed with uncaught exception, " + + "and it will not be retried locally because the elapsed time since start {} " + + "exceeds {}.", + computationId, + work.getWorkItem().getKey().toStringUtf8(), + elapsedTimeSinceStart, + MAX_LOCAL_PROCESSING_RETRY_DURATION, + parsedException); + } else { + LOG.error( + "Execution of work for computation '{}' on key '{}' failed with uncaught exception. " + + "Work will be retried locally.", + computationId, + work.getWorkItem().getKey().toStringUtf8(), + parsedException); + return true; + } + } + + return false; + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index d8ead447e8e5..4fe5fa00c21f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -795,7 +795,8 @@ private StreamingDataflowWorker makeWorker( DataflowWorkerHarnessOptions options, boolean publishCounters, Supplier clock, - Function executorSupplier) { + Function executorSupplier, + int localRetryTimeoutMs) { StreamingDataflowWorker worker = StreamingDataflowWorker.forTesting( computationMap, @@ -807,12 +808,22 @@ private StreamingDataflowWorker makeWorker( publishCounters, hotKeyLogger, clock, - executorSupplier); + executorSupplier, + localRetryTimeoutMs); worker.addStateNameMappings( ImmutableMap.of(DEFAULT_PARDO_USER_NAME, DEFAULT_PARDO_STATE_FAMILY)); return worker; } + private StreamingDataflowWorker makeWorker( + List instructions, + DataflowWorkerHarnessOptions options, + boolean publishCounters, + Supplier clock, + Function executorSupplier) { + return makeWorker(instructions, options, publishCounters, clock, executorSupplier, -1); + } + private StreamingDataflowWorker makeWorker( List instructions, DataflowWorkerHarnessOptions options, @@ -822,7 +833,22 @@ private StreamingDataflowWorker makeWorker( options, publishCounters, Instant::now, - (threadName) -> Executors.newSingleThreadScheduledExecutor()); + (threadName) -> Executors.newSingleThreadScheduledExecutor(), + -1); + } + + private StreamingDataflowWorker makeWorker( + List instructions, 
+ DataflowWorkerHarnessOptions options, + boolean publishCounters, + int localRetryTimeoutMs) { + return makeWorker( + instructions, + options, + publishCounters, + Instant::now, + (threadName) -> Executors.newSingleThreadScheduledExecutor(), + localRetryTimeoutMs); } @Test @@ -1218,7 +1244,7 @@ public void testKeyCommitTooLargeException() throws Exception { // Spam worker updates a few times. int maxTries = 10; while (--maxTries > 0) { - worker.reportPeriodicWorkerUpdates(); + worker.reportPeriodicWorkerUpdatesForTest(); Uninterruptibles.sleepUninterruptibly(1000, TimeUnit.MILLISECONDS); } @@ -1299,6 +1325,7 @@ public void testExceptions() throws Exception { // TODO: This test needs to be adapted to work with streamingEngine=true. return; } + List instructions = Arrays.asList( makeSourceInstruction(StringUtf8Coder.of()), @@ -1353,7 +1380,7 @@ public void testExceptions() throws Exception { // Spam worker updates a few times. maxTries = 10; while (maxTries-- > 0) { - worker.reportPeriodicWorkerUpdates(); + worker.reportPeriodicWorkerUpdatesForTest(); Uninterruptibles.sleepUninterruptibly(1000, TimeUnit.MILLISECONDS); } @@ -3133,8 +3160,8 @@ public void testExceptionInvalidatesCache() throws Exception { makeWorker( instructions, options.as(DataflowWorkerHarnessOptions.class), - true /* publishCounters */); - worker.setRetryLocallyDelayMs(100); + true /* publishCounters */, + 100); worker.start(); // Three GetData requests diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTrackerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTrackerTest.java new file mode 100644 index 000000000000..5aa028ee5adc --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingApplianceFailureTrackerTest.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import com.google.api.services.dataflow.model.Status; +import com.google.common.truth.Correspondence; +import com.google.rpc.Code; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class StreamingApplianceFailureTrackerTest { + + private static final String DEFAULT_COMPUTATION_ID = "computationId"; + + private static FailureTracker streamingApplianceFailureReporter(boolean isWorkFailed) { + return StreamingApplianceFailureTracker.create( + 10, + 10, + ignored -> Windmill.ReportStatsResponse.newBuilder().setFailed(isWorkFailed).build()); + } + + private static Windmill.WorkItem workItem() { + return Windmill.WorkItem.newBuilder() + .setKey(ByteString.EMPTY) + .setWorkToken(1L) + .setCacheToken(1L) + .setShardingKey(1L) + .build(); + } + + @Test + public void testReportFailure_returnsFalseWhenResponseHasFailed() { + FailureTracker failureTracker = streamingApplianceFailureReporter(true); + assertFalse( + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException())); + } + + @Test + public void testReportFailure_returnsTrueWhenResponseNotFailed() { + FailureTracker failureTracker = streamingApplianceFailureReporter(false); + assertTrue( + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException())); + } + + @Test + public void testReportFailure_addsPendingErrors() { + FailureTracker failureTracker = streamingApplianceFailureReporter(true); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + assertThat(failureTracker.drainPendingFailuresToReport()).hasSize(1); + } + + @Test + public void testGet_correctlyCreatesErrorStatus() { + FailureTracker failureTracker = streamingApplianceFailureReporter(true); + RuntimeException error = new RuntimeException(); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), error); + assertThat(failureTracker.drainPendingFailuresToReport()) + .comparingElementsUsing( + Correspondence.from( + (Status a, Status b) -> + a.getCode().equals(b.getCode()) && a.getMessage().contains(b.getMessage()), + "Assert that both status codes are the same, and b contains a message.")) + .containsExactly( + new Status().setCode(Code.UNKNOWN.getNumber()).setMessage(error.toString())); + } + + @Test + public void testGet_clearsPendingErrors() { + FailureTracker failureTracker = streamingApplianceFailureReporter(true); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + failureTracker.drainPendingFailuresToReport(); + assertThat(failureTracker.drainPendingFailuresToReport()).isEmpty(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTrackerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTrackerTest.java new file mode 100644 index 000000000000..b49f45ee684f --- /dev/null +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/StreamingEngineFailureTrackerTest.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertTrue; + +import com.google.api.services.dataflow.model.Status; +import com.google.common.truth.Correspondence; +import com.google.rpc.Code; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class StreamingEngineFailureTrackerTest { + + private static final String DEFAULT_COMPUTATION_ID = "computationId"; + + private static FailureTracker streamingEngineFailureReporter() { + return StreamingEngineFailureTracker.create(10, 10); + } + + private static Windmill.WorkItem workItem() { + return Windmill.WorkItem.newBuilder() + .setKey(ByteString.EMPTY) + .setWorkToken(1L) + .setCacheToken(1L) + .setShardingKey(1L) + .build(); + } + + @Test + public void testReportFailure_returnsTrue() { + FailureTracker failureTracker = streamingEngineFailureReporter(); + assertTrue( + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException())); + } + + @Test + public void testReportFailure_addsPendingErrors() { + FailureTracker failureTracker = streamingEngineFailureReporter(); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + + assertThat(failureTracker.drainPendingFailuresToReport()).hasSize(3); + } + + @Test + public void testGet_correctlyCreatesErrorStatus() { + FailureTracker failureTracker = streamingEngineFailureReporter(); + RuntimeException error = new RuntimeException(); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), error); + assertThat(failureTracker.drainPendingFailuresToReport()) + .comparingElementsUsing( + Correspondence.from( + (Status a, Status b) -> + a.getCode().equals(b.getCode()) && a.getMessage().contains(b.getMessage()), + "Assert that both status codes are the same, and b contains a message.")) + .containsExactly( + new Status().setCode(Code.UNKNOWN.getNumber()).setMessage(error.toString())); + } + + @Test + public void testGet_clearsPendingErrors() { + FailureTracker failureTracker = streamingEngineFailureReporter(); + 
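+    // Track a few failures and drain them once; a second drain should then return nothing.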
failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + failureTracker.trackFailure(DEFAULT_COMPUTATION_ID, workItem(), new RuntimeException()); + + failureTracker.drainPendingFailuresToReport(); + assertThat(failureTracker.drainPendingFailuresToReport()).isEmpty(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java new file mode 100644 index 000000000000..05b92e73f0ca --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; + +import static com.google.common.truth.Truth.assertThat; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.function.Supplier; +import org.apache.beam.runners.dataflow.worker.KeyTokenInvalidException; +import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; +import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class WorkFailureProcessorTest { + + private static final String DEFAULT_COMPUTATION_ID = "computationId"; + + private static WorkFailureProcessor createWorkFailureProcessor( + FailureTracker failureTracker, Supplier clock) { + BoundedQueueExecutor workExecutor = + new BoundedQueueExecutor( + 1, + 60, + TimeUnit.SECONDS, + 1, + 10000000, + new ThreadFactoryBuilder() + .setNameFormat("DataflowWorkUnits-%d") + .setDaemon(true) + .build()); + + return WorkFailureProcessor.forTesting(workExecutor, failureTracker, Optional::empty, clock, 0); + } + + private static 
WorkFailureProcessor createWorkFailureProcessor(FailureTracker failureTracker) { + return createWorkFailureProcessor(failureTracker, Instant::now); + } + + private static FailureTracker streamingEngineFailureReporter() { + return StreamingEngineFailureTracker.create(10, 10); + } + + private static FailureTracker streamingApplianceFailureReporter(boolean isWorkFailed) { + return StreamingApplianceFailureTracker.create( + 10, + 10, + ignored -> Windmill.ReportStatsResponse.newBuilder().setFailed(isWorkFailed).build()); + } + + private static Work createWork(Supplier clock, Consumer processWorkFn) { + return Work.create( + Windmill.WorkItem.newBuilder() + .setKey(ByteString.EMPTY) + .setWorkToken(1L) + .setCacheToken(1L) + .setShardingKey(1L) + .build(), + clock, + new ArrayList<>(), + processWorkFn); + } + + private static Work createWork() { + return createWork(Instant::now, ignored -> {}); + } + + private static Work createWork(Consumer processWorkFn) { + return createWork(Instant::now, processWorkFn); + } + + @Test + public void logAndProcessFailure_doesNotRetryKeyTokenInvalidException() { + Work work = spy(createWork()); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingEngineFailureReporter()); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, work, new KeyTokenInvalidException("key"), invalidWork::add); + + verify(work, times(0)).run(); + assertThat(invalidWork).containsExactly(work); + } + + @Test + public void logAndProcessFailure_doesNotRetryWhenWorkItemCancelled() { + Work work = spy(createWork()); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingEngineFailureReporter()); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, + work, + new WorkItemCancelledException(work.getWorkItem().getShardingKey()), + invalidWork::add); + + verify(work, times(0)).run(); + assertThat(invalidWork).containsExactly(work); + } + + @Test + public void logAndProcessFailure_doesNotRetryOOM() { + Work work = spy(createWork()); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingEngineFailureReporter()); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, work, new OutOfMemoryError(), invalidWork::add); + + verify(work, times(0)).run(); + assertThat(invalidWork).containsExactly(work); + } + + @Test + public void logAndProcessFailure_doesNotRetryWhenFailureReporterMarksAsNonRetryable() { + Work work = spy(createWork()); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingApplianceFailureReporter(true)); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, work, new RuntimeException(), invalidWork::add); + + verify(work, times(0)).run(); + assertThat(invalidWork).containsExactly(work); + } + + @Test + public void logAndProcessFailure_doesNotRetryAfterLocalRetryTimeout() { + Work veryOldWork = + spy(createWork(() -> Instant.now().minus(Duration.standardDays(30)), ignored -> {})); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingEngineFailureReporter()); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, veryOldWork, new RuntimeException(), invalidWork::add); + + verify(veryOldWork, times(0)).run(); + assertThat(invalidWork).contains(veryOldWork); + } + + @Test + public 
void logAndProcessFailure_retriesOnUncaughtUnhandledException_streamingEngine() + throws InterruptedException { + CountDownLatch runWork = new CountDownLatch(1); + Work work = spy(createWork(ignored -> runWork.countDown())); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingEngineFailureReporter()); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, work, new RuntimeException(), invalidWork::add); + + runWork.await(); + verify(work, times(1)).run(); + assertThat(invalidWork).isEmpty(); + } + + @Test + public void logAndProcessFailure_retriesOnUncaughtUnhandledException_streamingAppliance() + throws InterruptedException { + CountDownLatch runWork = new CountDownLatch(1); + Work work = spy(createWork(ignored -> runWork.countDown())); + WorkFailureProcessor workFailureProcessor = + createWorkFailureProcessor(streamingApplianceFailureReporter(false)); + Set invalidWork = new HashSet<>(); + workFailureProcessor.logAndProcessFailure( + DEFAULT_COMPUTATION_ID, work, new RuntimeException(), invalidWork::add); + + runWork.await(); + verify(work, times(1)).run(); + assertThat(invalidWork).isEmpty(); + } +} From 2f8854a3e34f31c1cc034f95ad36f317abc906ff Mon Sep 17 00:00:00 2001 From: Kenn Knowles Date: Thu, 28 Mar 2024 10:09:01 -0400 Subject: [PATCH 10/10] Fix core SDK misuse of "provided" dep for everit_json_schema (#30780) --- sdks/java/core/build.gradle | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sdks/java/core/build.gradle b/sdks/java/core/build.gradle index ff6568787bdc..438a3fb1806c 100644 --- a/sdks/java/core/build.gradle +++ b/sdks/java/core/build.gradle @@ -97,11 +97,7 @@ dependencies { implementation enforcedPlatform(library.java.google_cloud_platform_libraries_bom) permitUnusedDeclared enforcedPlatform(library.java.google_cloud_platform_libraries_bom) provided library.java.json_org - // com.github.everit JSON schema validation library is used for json-schema.org validation. - // to avoid forcing the library onto users, we ask users to provide it rather than include - // it by default. - // It is only used for optional functionality in JsonUtils schema parsing and conversion. - provided library.java.everit_json_schema + implementation library.java.everit_json_schema shadowTest library.java.everit_json_schema provided library.java.junit testImplementation "com.github.stefanbirkner:system-rules:1.19.0"