Manual/semi-automatic performance regression checking #356
Merged: MarionBWeinzierl merged 26 commits into develop from issue256-manual-performance-regression-checking on Dec 3, 2024
Changes from 11 commits (of 26 total)

Commits
95ec3bc  correct PModel constructor signature use (MarionBWeinzierl)
939cd21  start writing script using git worktree (MarionBWeinzierl)
bd7d4f9  Merge remote-tracking branch 'origin/develop' into issue256-manual-pe… (MarionBWeinzierl)
20359e0  added proof-of-concept scripts for simple performance regression chec… (MarionBWeinzierl)
11e4918  changed script to accept commit hashs (MarionBWeinzierl)
b5b8f3a  put 'exclude' part back in (MarionBWeinzierl)
92e503a  added some error checks and regression tolerance (MarionBWeinzierl)
50d5c28  make the script fail if performance regression (MarionBWeinzierl)
af03991  change default comparison to origin/develop (MarionBWeinzierl)
5240072  Update CONTRIBUTING.md to reflect change in how profiling is run (MarionBWeinzierl)
871f7a5  Remove trailing whitespace and shorten line (MarionBWeinzierl)
932a7cf  Update CONTRIBUTING.md with further information on how to run the ben… (MarionBWeinzierl)
440e6c9  Remove trailing whitespaces (MarionBWeinzierl)
f21dd0a  Update CONTRIBUTING.md with information on running profiling test suit (MarionBWeinzierl)
2073628  fix formatting (MarionBWeinzierl)
0619c93  Update CONTRIBUTING.md (MarionBWeinzierl)
1d1051b  added conditional changes in flag for mac and linux (MarionBWeinzierl)
9da026f  accept @j-emberton 's suggestion from review (MarionBWeinzierl)
58512ba  added profiling database exclusion to .gitignore (j-emberton)
fc08721  Suggested simplification of profiling validation CI (davidorme)
b8ecb73  profiling validation in CI doesn't need to wait for tests to complete (davidorme)
b1f70c9  Accept @davidorme 's suggestion from the code review (MarionBWeinzierl)
d6328b0  Merge branch 'develop' into issue256-manual-performance-regression-ch… (MarionBWeinzierl)
74bfce8  Delete .github/workflows/pyrealm_profiling_on_approve.yaml (MarionBWeinzierl)
3c49d98  Delete .github/workflows/pyrealm_profiling_after_push.yaml (MarionBWeinzierl)
955528d  Delete .github/workflows/pyrealm_profiling_without_benchmarking.yaml (MarionBWeinzierl)
#!/bin/bash

if [[ $# -eq 0 ]] ; then
    echo "No input arguments, comparing HEAD to origin/develop"
    new_commit=HEAD
    old_commit=origin/develop
else
    while getopts n:o: flag
    do
        case "${flag}" in
            n) new_commit=${OPTARG};;
            o) old_commit=${OPTARG};;
            *) echo "Invalid input argument"; exit 1;;
        esac
    done
fi

cd ..
git checkout "$new_commit"

# Remember where we start from
current_repo=$(pwd)

# This is where we want to check the comparison worktree out to
cmp_repo=$current_repo/../pyrealm_performance_check

# Add the worktree
echo "Add worktree $cmp_repo"
git worktree add "$cmp_repo" "$old_commit"

# Go there and install the poetry environment
cd "$cmp_repo"
poetry install

# Run the profiling on the old commit
echo "Run profiling tests on old commit"
poetry run /usr/bin/time -v pytest -m "profiling" --profile-svg
if [ "$?" != "0" ]; then
    echo "Profiling the old code went wrong."
    exit 1
fi

# Go back into the current repo and run there
cd "$current_repo"
poetry install
echo "Run profiling tests on new commit"
poetry run /usr/bin/time -v pytest -m "profiling" --profile-svg
if [ "$?" != "0" ]; then
    echo "Profiling the new code went wrong."
    exit 1
fi

# Compare the profiling outputs
cd profiling
python -c "
from pathlib import Path
import sys

import simple_benchmarking

prof_path_old = Path('$cmp_repo') / 'prof/combined.prof'
print(prof_path_old)
df_old = simple_benchmarking.run_simple_benchmarking(prof_path=prof_path_old)
cumtime_old = df_old.sum(numeric_only=True)['cumtime']
print('Old time:', cumtime_old)

prof_path_new = Path('$current_repo') / 'prof/combined.prof'
print(prof_path_new)
df_new = simple_benchmarking.run_simple_benchmarking(prof_path=prof_path_new)
cumtime_new = df_new.sum(numeric_only=True)['cumtime']
print('New time:', cumtime_new)

if cumtime_old < 0.95 * cumtime_new:
    print('We got slower. :(')
    sys.exit(1)
elif cumtime_new < 0.95 * cumtime_old:
    print('We got quicker! :)')
else:
    print('Times have not changed significantly')
"

benchmarking_out="$?"

cd ..
# Remove the working tree for the comparison commit
echo "Clean up"
git worktree remove --force "$cmp_repo"
git worktree prune

if [ "$benchmarking_out" != "0" ]; then
    echo "The new code is more than 5% slower than the old one."
    exit 1
fi

echo "No significant performance regression detected."
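The 5% tolerance test embedded in the script's `python -c` block can be sketched as a standalone function. `compare_cumtime` is a hypothetical helper name, not part of the PR; the thresholds mirror the script's `0.95 *` comparisons.

```python
def compare_cumtime(cumtime_old: float, cumtime_new: float, tol: float = 0.05) -> int:
    """Mirror the script's check: -1 = regression, 1 = improvement, 0 = within tolerance."""
    if cumtime_old < (1 - tol) * cumtime_new:
        return -1  # new code is more than tol slower
    if cumtime_new < (1 - tol) * cumtime_old:
        return 1  # new code is more than tol faster
    return 0  # change is within tolerance

print(compare_cumtime(100.0, 110.0))  # -1: regression
print(compare_cumtime(100.0, 90.0))   # 1: improvement
print(compare_cumtime(100.0, 102.0))  # 0: unchanged
```

Note the asymmetry in the tolerance band: a new time of 102 against an old time of 100 is treated as unchanged because neither value falls 5% below the scaled other.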
@davidorme would you prefer me to use this to replace the old
"""Run profile benchmarking and generate benchmarking graphics."""

import datetime
import pstats
import sys
import textwrap
from argparse import ArgumentParser
from io import StringIO
from pathlib import Path

import pandas as pd


def run_simple_benchmarking(
    prof_path: Path,
    exclude: list[str] = ["{.*", "<.*", "/lib/", "/tests/", "virtualenv", "venv"],
) -> pd.DataFrame:
    """Run a simplified benchmarking version.

    The function reads the contents of a ``.prof`` file (typically
    ``prof/combined.prof``) generated by running the profiling test suite and returns
    the profiling data as a standardised `pandas.DataFrame`.

    The profiling results include a field 'filename:lineno(function)', which identifies
    each profiled code object. Many of these will be from functions outside of the
    `pyrealm` codebase and these are excluded using a list of regex patterns:

    * '/lib/' excludes standard and site packages,
    * '{.*' excludes profiling of '{built-in ...}' and similar,
    * '<.*' excludes profiling of '<frozen ...>' and similar,
    * '/tests/' excludes the test functions and classes calling the pyrealm code,
    * 'virtualenv' and 'venv' exclude standard packages in virtual environments.

    Args:
        prof_path: Path to the profiling output.
        exclude: A list of patterns used to exclude rows from the profiling stats.
    """

    # Import the profile data, write the stats report to a StringIO and seek the start
    # to allow the data to be read. The print_stats() explicitly does not filter for
    # 'pyrealm' because the string can be found in virtual environment paths and leads
    # to inconsistent behaviour across platforms.
    sio = StringIO()
    p = pstats.Stats(str(prof_path), stream=sio)
    p.sort_stats(pstats.SortKey.CUMULATIVE).print_stats()
    sio.seek(0)

    # Consume lines from the report to find the header row
    header_found = False
    while not header_found:
        header = sio.readline()
        if "ncalls" in header:
            header_found = True

    # Set replacement non-duplicated headers
    column_names = [
        "ncalls",
        "tottime",
        "tottime_percall",
        "cumtime",
        "cumtime_percall",
        "filename:lineno(function)",
    ]

    # Convert to a DataFrame using fixed width format
    df = pd.read_fwf(sio, engine="python", names=column_names, infer_nrows=10)

    # Reduce to rows not matching any of the regex exclude patterns
    exclude_rows = pd.DataFrame(
        [df["filename:lineno(function)"].str.contains(excl) for excl in exclude]
    ).any()
    df = df[~exclude_rows]

    # Add a timestamp from the file modification date
    m_time = datetime.datetime.fromtimestamp(prof_path.stat().st_mtime)
    df["timestamp"] = m_time.isoformat(timespec="seconds")

    df.to_csv("profiling-database.csv")

    return df


def run_simple_benchmarking_cli() -> None:
    """Run the simple benchmarking."""

    if run_simple_benchmarking_cli.__doc__ is not None:
        doc = " " + run_simple_benchmarking_cli.__doc__
    else:
        doc = "Python in -OO mode"

    parser = ArgumentParser(
        description=textwrap.dedent(doc),
    )
    parser.add_argument(
        "prof_path",
        type=Path,
        help="Path to pytest-profiling output",
    )

    args = parser.parse_args()

    # Check that the profiling results exist before running
    if not args.prof_path.exists():
        raise FileNotFoundError(f"Cannot find the profiling file at {args.prof_path}.")

    df = run_simple_benchmarking(prof_path=args.prof_path)

    # A DataFrame cannot be used directly as a truth value, so check whether any
    # profiling rows survived the exclusion filters.
    if df.empty:
        print("Benchmarking failed.")
        sys.exit(1)

    print("Benchmarking passed.")
    sys.exit(0)


if __name__ == "__main__":
    run_simple_benchmarking_cli()
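The header-scanning step in `run_simple_benchmarking` relies on the fixed layout of `pstats` reports. A minimal stdlib-only sketch of that step, using a toy workload instead of the `combined.prof` file, is:

```python
import cProfile
import io
import pstats

# Profile a trivial workload so the report has the same shape as combined.prof
profiler = cProfile.Profile()
profiler.enable()
sum(i * i for i in range(1000))
profiler.disable()

# Print the cumulative-time report into a StringIO, as run_simple_benchmarking does
sio = io.StringIO()
stats = pstats.Stats(profiler, stream=sio)
stats.sort_stats(pstats.SortKey.CUMULATIVE).print_stats()
sio.seek(0)

# Consume preamble lines until the column header row is found
header = sio.readline()
while "ncalls" not in header:
    header = sio.readline()

print("cumtime" in header)  # the header row names the cumulative-time column
```

Everything after this header row is fixed-width tabular data, which is why the function above can hand the remaining stream straight to `pd.read_fwf`.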
That link does not work. Looking at https://github.com/ImperialCollegeLondon/pyrealm/blob/develop/docs/source/development/profiling_and_benchmarking.md , a lot of that information is also no longer valid. We will need to adapt it when we have decided how to proceed (i.e., whether to keep the old benchmarking code, and if and when to run the new code automatically, etc.)