
[Python] Enable state cache to 100 MB #28781

Merged: 13 commits, Oct 31, 2023
34 changes: 34 additions & 0 deletions CHANGES.md
@@ -53,6 +53,40 @@
* ([#X](https://github.com/apache/beam/issues/X)).
-->

# [2.52.0] - Unreleased

## Highlights

* New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)).
* New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)).

## I/Os

* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).

## New Features / Improvements

* The state cache is now enabled with a default size of 100 MB (Python) ([#28770](https://github.com/apache/beam/issues/28770)).

## Breaking Changes

* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).

## Deprecations

* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)).

## Bugfixes

* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).

## Security Fixes

* Fixed [CVE-YYYY-NNNN](https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN) (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)).

## Known Issues

* ([#X](https://github.com/apache/beam/issues/X)).

# [2.51.0] - Unreleased

## Highlights
11 changes: 11 additions & 0 deletions sdks/python/apache_beam/options/pipeline_options.py
@@ -1127,6 +1127,17 @@ def _add_argparse_args(cls, parser):
dest='min_cpu_platform',
type=str,
help='GCE minimum CPU platform. Default is determined by GCP.')
parser.add_argument(
'--max_cache_memory_usage_mb',
dest='max_cache_memory_usage_mb',
type=int,
default=None,
Contributor:

Any concerns with defining the 100 MB default here?

Contributor Author:

Current flow: if it is None here, it gives us an opportunity to look in --experiments for state_cache_size.

If the value were defined here as 100 MB and the user passed --experiments=state_cache_size, we would have to override the 100 MB default with the --experiments=state_cache_size value.

I don't see any concerns with setting the default here; it might need some code changes though.

help=(
'Size of the SdkHarness/Sdk Process cache in MB. Default is 100 MB. '
'This cache is used to store the user state and side input '
'elements. If the cache is full, the least recently used '
'elements will be evicted. This cache will be per SdkHarness/Sdk '
'Process. SDKHarness is a Python process that runs the user code.'))
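The precedence the author describes (explicit flag first, then the legacy --experiments=state_cache_size, then the default) can be sketched with a stdlib-only replica; the helper name and default here are illustrative, not Beam's actual API:

```python
import re

_DEFAULT_CACHE_MB = 100  # assumed 100 MB default under discussion

def resolve_cache_size_mb(max_cache_memory_usage_mb, experiments):
    """Resolve the cache size in MB: the explicit flag wins; otherwise
    fall back to the legacy state_cache_size experiment, then the default."""
    if max_cache_memory_usage_mb:
        return max_cache_memory_usage_mb
    for experiment in experiments:
        m = re.match(r'state_cache_size=(?P<mb>\d+)', experiment)
        if m:
            return int(m.group('mb'))
    return _DEFAULT_CACHE_MB

# The flag takes precedence; the experiment is only a fallback.
assert resolve_cache_size_mb(200, ['state_cache_size=64']) == 200
assert resolve_cache_size_mb(None, ['state_cache_size=64']) == 64
assert resolve_cache_size_mb(None, []) == 100
```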

def validate(self, validator):
errors = []
@@ -415,7 +415,8 @@ def start_and_replace_loopback_environments(pipeline, options):
portable_options.environment_config, server = (
worker_pool_main.BeamFnExternalWorkerPoolServicer.start(
state_cache_size=
sdk_worker_main._get_state_cache_size(experiments),
sdk_worker_main._get_state_cache_size_bytes(
options=options),
data_buffer_time_limit_ms=
sdk_worker_main._get_data_buffer_time_limit_ms(experiments),
use_process=use_loopback_process_worker))
33 changes: 19 additions & 14 deletions sdks/python/apache_beam/runners/worker/sdk_worker_main.py
@@ -36,6 +36,7 @@
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import ProfilingOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.options.value_provider import RuntimeValueProvider
from apache_beam.portability.api import endpoints_pb2
from apache_beam.runners.internal import names
@@ -46,6 +47,7 @@

_LOGGER = logging.getLogger(__name__)
_ENABLE_GOOGLE_CLOUD_PROFILER = 'enable_google_cloud_profiler'
_STATE_CACHE_SIZE_BYTES = 100 << 20
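The constant uses a left shift to convert megabytes to bytes; a quick sanity check of the arithmetic:

```python
# 1 MiB is 2**20 bytes, so `x << 20` converts MB to bytes.
MB = 1 << 20
print(100 << 20)  # 104857600 bytes, i.e. 100 * MB
```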


def _import_beam_plugins(plugins):
@@ -159,7 +161,8 @@ def create_harness(environment, dry_run=False):
control_address=control_service_descriptor.url,
status_address=status_service_descriptor.url,
worker_id=_worker_id,
state_cache_size=_get_state_cache_size(experiments),
state_cache_size=_get_state_cache_size_bytes(
options=sdk_pipeline_options),
data_buffer_time_limit_ms=_get_data_buffer_time_limit_ms(experiments),
profiler_factory=profiler.Profile.factory_from_options(
sdk_pipeline_options.view_as(ProfilingOptions)),
@@ -239,24 +242,26 @@ def _parse_pipeline_options(options_json):
return PipelineOptions.from_dictionary(_load_pipeline_options(options_json))


def _get_state_cache_size(experiments):
"""Defines the upper number of state items to cache.

Note: state_cache_size is an experimental flag and might not be available in
future releases.
def _get_state_cache_size_bytes(options):
"""Return the maximum size of the state cache in bytes.

Returns:
an int indicating the maximum number of bytes to cache.
Default is 100 MB.
"""

for experiment in experiments:
# There should only be 1 match so returning from the loop
if re.match(r'state_cache_size=', experiment):
return int(
re.match(r'state_cache_size=(?P<state_cache_size>.*)',
experiment).group('state_cache_size')) << 20
return 0
max_cache_memory_usage_mb = options.view_as(
WorkerOptions).max_cache_memory_usage_mb
if not max_cache_memory_usage_mb:
# to maintain backward compatibility
experiments = options.view_as(DebugOptions).experiments or []
for experiment in experiments:
Contributor:

Pretty sure there is already a helper that does this parsing.

# There should only be one match, so return from inside the loop
if re.match(r'state_cache_size=', experiment):
return int(
re.match(r'state_cache_size=(?P<state_cache_size>.*)',
experiment).group('state_cache_size')) << 20
return _STATE_CACHE_SIZE_BYTES
return max_cache_memory_usage_mb << 20


def _get_data_buffer_time_limit_ms(experiments):
13 changes: 13 additions & 0 deletions sdks/python/apache_beam/runners/worker/sdk_worker_main_test.py
@@ -234,6 +234,19 @@ def test_gcp_profiler_uses_job_name_when_enabled_as_experiment(self):
sdk_worker_main._start_profiler(gcp_profiler_name, "version")
sdk_worker_main._start_profiler.assert_called_with("sample_job", "version")

@unittest.mock.patch.dict(os.environ, {"JOB_NAME": "sample_job"}, clear=True)
def test_pipeline_option_max_cache_memory_usage_mb(self):
options = PipelineOptions(flags=['--max_cache_memory_usage_mb=100'])

cache_size = sdk_worker_main._get_state_cache_size_bytes(options)
self.assertEqual(cache_size, sdk_worker_main._STATE_CACHE_SIZE_BYTES)

@unittest.mock.patch.dict(os.environ, {"JOB_NAME": "sample_job"}, clear=True)
def test_pipeline_option_max_cache_memory_usage_mb_with_experiments(self):
options = PipelineOptions(flags=['--experiments=state_cache_size=100'])
cache_size = sdk_worker_main._get_state_cache_size_bytes(options)
self.assertEqual(cache_size, sdk_worker_main._STATE_CACHE_SIZE_BYTES)


if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)