Skip to content

Commit

Permalink
Revert "[BEAM-30531] Automatically execute unbounded pipelines in streaming mode. (apache#30533)" (apache#30706)
Browse files Browse the repository at this point in the history

This reverts commit 1c55117.
  • Loading branch information
damondouglas authored and hjtran committed Apr 4, 2024
1 parent bdb9faa commit 8d7cfd8
Show file tree
Hide file tree
Showing 3 changed files with 0 additions and 82 deletions.
1 change: 0 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@
* Merged sdks/java/fn-execution and runners/core-construction-java into the main SDK. These artifacts were never meant for users, but noting
that they no longer exist. These are steps to bring portability into the core SDK alongside all other core functionality.
* Added Vertex AI Feature Store handler for Enrichment transform (Python) ([#30388](https://github.com/apache/beam/pull/30388))
* Python Dataflow users no longer need to manually specify --streaming for pipelines using unbounded sources such as ReadFromPubSub.

## Breaking Changes

Expand Down
20 changes: 0 additions & 20 deletions sdks/python/apache_beam/runners/dataflow/dataflow_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.common import group_by_key_input_visitor
from apache_beam.runners.dataflow.internal.clients import dataflow as dataflow_api
from apache_beam.runners.runner import PipelineResult
Expand Down Expand Up @@ -415,12 +414,6 @@ def run_pipeline(self, pipeline, options, pipeline_proto=None):
self.proto_pipeline, self.proto_context = pipeline.to_runner_api(
return_context=True, default_environment=self._default_environment)

if any(pcoll.is_bounded == beam_runner_api_pb2.IsBounded.UNBOUNDED
for pcoll in self.proto_pipeline.components.pcollections.values()):
options.view_as(StandardOptions).streaming = True
if options.view_as(StandardOptions).streaming:
_check_and_add_missing_streaming_options(options)

# Dataflow can only handle Docker environments.
for env_id, env in self.proto_pipeline.components.environments.items():
self.proto_pipeline.components.environments[env_id].CopyFrom(
Expand Down Expand Up @@ -478,7 +471,6 @@ def run_pipeline(self, pipeline, options, pipeline_proto=None):
if test_options.dry_run:
result = PipelineResult(PipelineState.DONE)
result.wait_until_finish = lambda duration=None: None
result.job = self.job
return result

# Get a Dataflow API client and set its options
Expand Down Expand Up @@ -604,21 +596,9 @@ def _check_and_add_missing_options(options):
"an SDK preinstalled in the default Dataflow dev runtime environment "
"or in a custom container image, use --sdk_location=container.")


def _check_and_add_missing_streaming_options(options):
# Type: (PipelineOptions) -> None

"""Validates and adds missing pipeline options depending on options set.
Must be called after it has been determined whether we're running in
streaming mode.
:param options: PipelineOptions for this pipeline.
"""
# Streaming only supports using runner v2 (aka unified worker).
# Runner v2 only supports using streaming engine (aka windmill service)
if options.view_as(StandardOptions).streaming:
debug_options = options.view_as(DebugOptions)
google_cloud_options = options.view_as(GoogleCloudOptions)
if (not google_cloud_options.enable_streaming_engine and
(debug_options.lookup_experiment("enable_windmill_service") or
Expand Down
61 changes: 0 additions & 61 deletions sdks/python/apache_beam/runners/dataflow/dataflow_runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
from apache_beam.runners.dataflow.dataflow_runner import DataflowPipelineResult
from apache_beam.runners.dataflow.dataflow_runner import DataflowRuntimeException
from apache_beam.runners.dataflow.dataflow_runner import _check_and_add_missing_options
from apache_beam.runners.dataflow.dataflow_runner import _check_and_add_missing_streaming_options
from apache_beam.runners.dataflow.internal.clients import dataflow as dataflow_api
from apache_beam.runners.runner import PipelineState
from apache_beam.testing.extra_assertions import ExtraAssertionsMixin
Expand Down Expand Up @@ -524,7 +523,6 @@ def test_batch_is_runner_v2(self):
def test_streaming_is_runner_v2(self):
options = PipelineOptions(['--sdk_location=container', '--streaming'])
_check_and_add_missing_options(options)
_check_and_add_missing_streaming_options(options)
for expected in ['beam_fn_api',
'use_unified_worker',
'use_runner_v2',
Expand Down Expand Up @@ -556,7 +554,6 @@ def test_dataflow_service_options_enable_prime_sets_runner_v2(self):
'--dataflow_service_options=enable_prime'
])
_check_and_add_missing_options(options)
_check_and_add_missing_streaming_options(options)
for expected in ['beam_fn_api',
'use_unified_worker',
'use_runner_v2',
Expand All @@ -567,64 +564,6 @@ def test_dataflow_service_options_enable_prime_sets_runner_v2(self):
options.view_as(DebugOptions).lookup_experiment(expected, False),
expected)

# Checks that a dry-run Dataflow pipeline reading from an unbounded source
# (ReadFromPubSub) produces a job of type JOB_TYPE_STREAMING even though
# --streaming is not among the supplied options.
@unittest.skipIf(apiclient is None, 'GCP dependencies are not installed')
# GoogleCloudOptions.validate is replaced by a stub that returns an empty list
# (presumably so the placeholder project/region/bucket values pass validation).
@mock.patch(
'apache_beam.options.pipeline_options.GoogleCloudOptions.validate',
lambda *args: [])
def test_auto_streaming_with_unbounded(self):
# Note: no --streaming flag here; --dry_run=True is set on the options.
options = PipelineOptions([
'--sdk_location=container',
'--runner=DataflowRunner',
'--dry_run=True',
'--temp_location=gs://bucket',
'--project=project',
'--region=region'
])
with beam.Pipeline(options=options) as p:
_ = p | beam.io.ReadFromPubSub('projects/some-project/topics/some-topic')
# The result's job proto must be marked as a streaming job.
self.assertEqual(
p.result.job.proto.type,
apiclient.dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING)

# Checks that a dry-run Dataflow pipeline built only from beam.Create, with no
# --streaming flag, produces a job of type JOB_TYPE_BATCH.
@unittest.skipIf(apiclient is None, 'GCP dependencies are not installed')
# GoogleCloudOptions.validate is replaced by a stub that returns an empty list
# (presumably so the placeholder project/region/bucket values pass validation).
@mock.patch(
'apache_beam.options.pipeline_options.GoogleCloudOptions.validate',
lambda *args: [])
def test_auto_streaming_no_unbounded(self):
# Note: no --streaming flag here; --dry_run=True is set on the options.
options = PipelineOptions([
'--sdk_location=container',
'--runner=DataflowRunner',
'--dry_run=True',
'--temp_location=gs://bucket',
'--project=project',
'--region=region'
])
with beam.Pipeline(options=options) as p:
_ = p | beam.Create([1, 2, 3])
# The result's job proto must be marked as a batch job.
self.assertEqual(
p.result.job.proto.type,
apiclient.dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH)

# Checks that passing --streaming explicitly produces a job of type
# JOB_TYPE_STREAMING even when the pipeline is built only from beam.Create.
@unittest.skipIf(apiclient is None, 'GCP dependencies are not installed')
# GoogleCloudOptions.validate is replaced by a stub that returns an empty list
# (presumably so the placeholder project/region/bucket values pass validation).
@mock.patch(
'apache_beam.options.pipeline_options.GoogleCloudOptions.validate',
lambda *args: [])
def test_explicit_streaming_no_unbounded(self):
# Unlike the two auto-streaming tests, --streaming is passed explicitly.
options = PipelineOptions([
'--streaming',
'--sdk_location=container',
'--runner=DataflowRunner',
'--dry_run=True',
'--temp_location=gs://bucket',
'--project=project',
'--region=region'
])
with beam.Pipeline(options=options) as p:
_ = p | beam.Create([1, 2, 3])
# The result's job proto must be marked as a streaming job.
self.assertEqual(
p.result.job.proto.type,
apiclient.dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING)


# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
unittest.main()

0 comments on commit 8d7cfd8

Please sign in to comment.