Add TF MNIST classification cost benchmark (apache#33391)

* Add TF MNIST classification cost benchmark * linting * Generalize to single workflow file for cost benchmarks * fix incorrect UTC time in comment * move wordcount to same workflow * update workflow job name
Polber · Dec 17, 2024 · ac0f3c3 · ac0f3c3
1 parent 8e1e124
commit ac0f3c3
Show file tree

Hide file tree

Showing 19 changed files with 711 additions and 135 deletions.
diff --git a/...dcount_Python_Cost_Benchmark_Dataflow.yml → ...s/beam_Python_CostBenchmarks_Dataflow.yml b/...dcount_Python_Cost_Benchmark_Dataflow.yml → ...s/beam_Python_CostBenchmarks_Dataflow.yml
@@ -13,9 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: Wordcount Python Cost Benchmarks Dataflow
+name: Python Cost Benchmarks Dataflow
 
 on:
+  schedule:
+    - cron: '30 18 * * 6' # Run at 6:30 pm UTC on Saturdays
   workflow_dispatch:
 
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -47,16 +49,17 @@ env:
   INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }}
 
 jobs:
-  beam_Inference_Python_Benchmarks_Dataflow:
+  beam_Python_Cost_Benchmarks_Dataflow:
     if: |
-      github.event_name == 'workflow_dispatch'
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 900
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     strategy:
       matrix:
-        job_name: ["beam_Wordcount_Python_Cost_Benchmarks_Dataflow"]
-        job_phrase: ["Run Wordcount Cost Benchmark"]
+        job_name: ["beam_Python_CostBenchmark_Dataflow"]
+        job_phrase: ["Run Python Dataflow Cost Benchmarks"]
     steps:
       - uses: actions/checkout@v4
       - name: Setup repository
@@ -76,10 +79,11 @@ jobs:
           test-language: python
           argument-file-paths: |
             ${{ github.workspace }}/.github/workflows/cost-benchmarks-pipeline-options/python_wordcount.txt
+            ${{ github.workspace }}/.github/workflows/cost-benchmarks-pipeline-options/python_tf_mnist_classification.txt
       # The env variables are created and populated in the test-arguments-action as "<github.job>_test_arguments_<argument_file_paths_index>"
       - name: get current time
         run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV
-      - name: run wordcount on Dataflow Python
+      - name: Run wordcount on Dataflow
         uses: ./.github/actions/gradle-command-self-hosted-action
         timeout-minutes: 30
         with:
@@ -88,4 +92,14 @@ jobs:
             -PloadTest.mainClass=apache_beam.testing.benchmarks.wordcount.wordcount \
             -Prunner=DataflowRunner \
             -PpythonVersion=3.10 \
-            '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_1 }} --job_name=benchmark-tests-wordcount-python-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/wordcount/result_wordcount-${{env.NOW_UTC}}.txt' \
+            '-PloadTest.args=${{ env.beam_Python_Cost_Benchmarks_Dataflow_test_arguments_1 }} --job_name=benchmark-tests-wordcount-python-${{env.NOW_UTC}} --output_file=gs://temp-storage-for-end-to-end-tests/wordcount/result_wordcount-${{env.NOW_UTC}}.txt' \
+      - name: Run Tensorflow MNIST Image Classification on Dataflow
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        timeout-minutes: 30
+        with:  # loadTest.args reads env "<github.job>_test_arguments_2", i.e. the second argument file prepared above
+          gradle-command: :sdks:python:apache_beam:testing:load_tests:run
+          arguments: |
+            -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.tensorflow_mnist_classification_cost_benchmark \
+            -Prunner=DataflowRunner \
+            -PpythonVersion=3.10 \
+            '-PloadTest.args=${{ env.beam_Python_Cost_Benchmarks_Dataflow_test_arguments_2 }} --job_name=benchmark-tests-tf-mnist-classification-python-${{env.NOW_UTC}} --input_file=gs://apache-beam-ml/testing/inputs/it_mnist_data.csv --output_file=gs://temp-storage-for-end-to-end-tests/wordcount/result_tf_mnist-${{env.NOW_UTC}}.txt --model=gs://apache-beam-ml/models/tensorflow/mnist/' \
diff --git a/.github/workflows/cost-benchmarks-pipeline-options/python_tf_mnist_classification.txt b/.github/workflows/cost-benchmarks-pipeline-options/python_tf_mnist_classification.txt
@@ -0,0 +1,29 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+--region=us-central1
+--machine_type=n1-standard-2
+--num_workers=1
+--disk_size_gb=50
+--autoscaling_algorithm=NONE
+--input_options={}
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--requirements_file=apache_beam/ml/inference/tensorflow_tests_requirements.txt
+--publish_to_big_query=true
+--metrics_dataset=beam_run_inference
+--metrics_table=tf_mnist_classification
+--runner=DataflowRunner
diff --git a/sdks/python/apache_beam/examples/inference/output.txt b/sdks/python/apache_beam/examples/inference/output.txt
@@ -0,0 +1,3 @@
+What does Apache Beam do?;enables batch and streaming data processing
+What is the capital of France?;Paris
+Where was beam summit?;NYC
diff --git a/...pache_beam/testing/benchmarks/inference/tensorflow_mnist_classification_cost_benchmark.py b/...pache_beam/testing/benchmarks/inference/tensorflow_mnist_classification_cost_benchmark.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# pytype: skip-file
+
+import logging
+
+from apache_beam.examples.inference import tensorflow_mnist_classification
+from apache_beam.testing.load_tests.dataflow_cost_benchmark import DataflowCostBenchmark
+
+
+class TensorflowMNISTClassificationCostBenchmark(DataflowCostBenchmark):
+  """Cost benchmark that runs the TensorFlow MNIST classification example.
+
+  The `input_file`, `output_file` and `model` pipeline options are forwarded
+  to the example as its `input`, `output` and `model_path` options.
+  """
+  def __init__(self):
+    super().__init__()
+
+  def test(self):
+    """Runs the example pipeline with the benchmark's pipeline options."""
+    options = self.pipeline
+    # Keyword order mirrors the option names the example is given:
+    # input, output, model_path.
+    example_args = options.get_full_options_as_args(
+        input=options.get_option('input_file'),
+        output=options.get_option('output_file'),
+        model_path=options.get_option('model'))
+    tensorflow_mnist_classification.run(
+        example_args, save_main_session=False)
+
+
+if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO)
+  TensorflowMNISTClassificationCostBenchmark().run()
diff --git a/sdks/python/apache_beam/yaml/generate_yaml_docs.py b/sdks/python/apache_beam/yaml/generate_yaml_docs.py
@@ -241,6 +241,7 @@ def main():
     json_config_schemas = []
     markdown_out = io.StringIO()
     providers = yaml_provider.standard_providers()
+    providers = {'RunInference': providers['RunInference']}
     for transform_base, transforms in itertools.groupby(
         sorted(providers.keys(), key=io_grouping_key),
         key=lambda s: s.split('-')[0]):

diff --git a/sdks/python/apache_beam/yaml/main.py b/sdks/python/apache_beam/yaml/main.py
@@ -26,6 +26,7 @@
 from apache_beam.typehints.schemas import LogicalType
 from apache_beam.typehints.schemas import MillisInstant
 from apache_beam.yaml import yaml_transform
+from apache_beam.yaml.yaml_utils import SafeLineLoader
 
 
 def _preparse_jinja_flags(argv):
@@ -126,15 +127,14 @@ def run(argv=None):
   pipeline_template = _pipeline_spec_from_args(known_args)
   pipeline_yaml = yaml_transform.expand_jinja(
       pipeline_template, known_args.jinja_variables or {})
-  pipeline_spec = yaml.load(pipeline_yaml, Loader=yaml_transform.SafeLineLoader)
+  pipeline_spec = yaml.load(pipeline_yaml, Loader=SafeLineLoader)
 
   with _fix_xlang_instant_coding():
     with beam.Pipeline(  # linebreak for better yapf formatting
         options=beam.options.pipeline_options.PipelineOptions(
             pipeline_args,
             pickle_library='cloudpickle',
-            **yaml_transform.SafeLineLoader.strip_metadata(pipeline_spec.get(
-                'options', {}))),
+            **SafeLineLoader.strip_metadata(pipeline_spec.get('options', {}))),
         display_data={'yaml': pipeline_yaml,
                       'yaml_jinja_template': pipeline_template,
                       'yaml_jinja_variables': json.dumps(

diff --git a/sdks/python/apache_beam/yaml/ml.yaml b/sdks/python/apache_beam/yaml/ml.yaml
@@ -0,0 +1,46 @@
+pipeline:
+  - type: RunInference
+    config:
+      model_handler:
+        type: Huggingface
+        config:
+          task: translation_en_to_fr
+          model: google-t5/t5-small
+          preprocess_fn: |
+            def preprocess_fn(_element):
+              ...
+          inference_fn: |
+            def inference_fn(batch, pipeline, inference_args):
+              ...
+          postprocess_fn: |
+            def postprocess_fn(result):
+              ...
+          load_model_args:
+            framework: pt
+            revision: main
+            device: gpu
+            min_batch_size: 1
+            max_batch_size: 2
+            max_batch_duration: 60s
+            large_model: false
+            model_copies: 1
+            env_vars:
+              SOME_ENV_VAR: val
+      using_key: key
+      element: element
+      inference_args:
+        arg1: val1
+        arg2: val2
+transform_providers:
+  - ...
+
+# NOTE(review): custom model handler providers would probably require callable syntax.
+model_handler_providers:
+  - type: python
+    config:
+      packages:
+        - 'some_pypi_package>=version'
+      model_handlers:
+        SomeName: 'pkg.module.MyModelHandler'
+        SomeOtherName: 'pkg.module.MyOtherModelHandler'
+
diff --git a/sdks/python/apache_beam/yaml/requirements.txt b/sdks/python/apache_beam/yaml/requirements.txt
@@ -0,0 +1,3 @@
+tensorflow
+torch
+transformers
diff --git a/sdks/python/apache_beam/yaml/standard_providers.yaml b/sdks/python/apache_beam/yaml/standard_providers.yaml
@@ -56,6 +56,7 @@
   config: {}
   transforms:
     MLTransform: 'apache_beam.yaml.yaml_ml.ml_transform'
+    RunInference: 'apache_beam.yaml.yaml_ml.run_inference'
 
 - type: renaming
   transforms:

diff --git a/sdks/python/apache_beam/yaml/tests/bigquery.yaml b/sdks/python/apache_beam/yaml/tests/bigquery.yaml
@@ -38,7 +38,7 @@ pipelines:
               - {label: "389a", rank: 2}
         - type: WriteToBigQuery
           config:
-            table: "{BQ_TABLE}"
+            table: "{BQ_TABLE[0]}:{BQ_TABLE[1]}.{BQ_TABLE[2]}"
     options:
       project: "apache-beam-testing"
       temp_location: "{TEMP_DIR}"
@@ -48,7 +48,7 @@ pipelines:
       transforms:
         - type: ReadFromBigQuery
           config:
-            table: "{BQ_TABLE}"
+            table: "{BQ_TABLE[0]}:{BQ_TABLE[1]}.{BQ_TABLE[2]}"
         - type: AssertEqual
           config:
             elements:
@@ -64,7 +64,7 @@ pipelines:
       transforms:
         - type: ReadFromBigQuery
           config:
-            table: "{BQ_TABLE}"
+            table: "{BQ_TABLE[0]}:{BQ_TABLE[1]}.{BQ_TABLE[2]}"
             fields: ["label"]
             row_restriction: "rank > 0"
         - type: AssertEqual

diff --git a/sdks/python/apache_beam/yaml/yaml_combine.py b/sdks/python/apache_beam/yaml/yaml_combine.py
@@ -31,6 +31,7 @@
 from apache_beam.utils import python_callable
 from apache_beam.yaml import yaml_mapping
 from apache_beam.yaml import yaml_provider
+from apache_beam.yaml.yaml_utils import SafeLineLoader
 
 BUILTIN_COMBINE_FNS = {
     'sum': sum,
@@ -61,7 +62,6 @@ def normalize_combine(spec):
         fn:
           type: fn_type
   """
-  from apache_beam.yaml.yaml_transform import SafeLineLoader
   if spec['type'] == 'Combine':
     config = spec.get('config')
     if isinstance(config.get('group_by'), str):

diff --git a/sdks/python/apache_beam/yaml/yaml_mapping.py b/sdks/python/apache_beam/yaml/yaml_mapping.py
@@ -297,10 +297,13 @@ def _expand_python_mapping_func(
     # TODO(robertwb): Consider constructing a single callable that takes
     # the row and returns the new row, rather than invoking (and unpacking)
     # for each field individually.
-    source = '\n'.join(['def fn(__row__):'] + [
-        f'  {name} = __row__.{name}'
+    source = '\n'.join(['def fn(__row__):'] + ['  try:'] + [
+        f'    {name} = __row__.{name}'
         for name in original_fields if name in expression
-    ] + ['  return (' + expression + ')'])
+    ] + [f'    return ({expression})'] + ['  except NameError as e:'] + [
+        f'    raise ValueError(f"{{e}}. Valid values include '
+        f'{original_fields}")'
+    ])
 
   else:
     source = callable