From 7cb1e2f3cf2cfef6ebdfc8779c6d7e623a27dbf0 Mon Sep 17 00:00:00 2001
From: Conor Schaefer <conor@penumbralabs.xyz>
Date: Fri, 26 Apr 2024 09:55:09 -0700
Subject: [PATCH] feat(tests): use process-compose for smoke tests

Containers are out, process orchestration is in.

A while back we ditched using containers for the smoke tests,
mostly because the caching on the container-build story was
atrocious, so test re-runs took a really long time. And frankly,
container ergonomics on dev workstations, particularly macOS,
are not awesome. Instead, let's assume the dev env can run
processes for cargo, pd, and cometbft. If so, that's all
we need to wire up our integration testing. Enter process-compose [0].

The new smoke test setup ditches the bash script and delegates
to process-compose for orchestrating processes.

Benchmarking via hyperfine shows the runtime dropping by more than half.

There's one substantive change to the integration test logic,
in the pcli suite, that reduces the sleep time between tests,
refining it to be more precisely the duration necessary for
claiming an undelegation.

[0] https://github.com/F1bonacc1/process-compose
---
 .github/workflows/smoke.yml                   |  24 +---
 crates/bin/pcli/tests/network_integration.rs  |  10 +-
 .../compose/process-compose-smoke-test.yml    | 128 ++++++++++++++++++
 deployments/scripts/smoke-test.sh             |  80 ++---------
 justfile                                      |   4 +
 5 files changed, 157 insertions(+), 89 deletions(-)
 create mode 100644 deployments/compose/process-compose-smoke-test.yml
 create mode 100644 justfile
diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml
index cceb802326..f14a098f5a 100644
--- a/.github/workflows/smoke.yml
+++ b/.github/workflows/smoke.yml
@@ -23,26 +23,16 @@ jobs:
       - name: Install cometbft binary
         run: ./deployments/scripts/install-cometbft
 
+      - name: Install process-compose
+        run: >-
+          sh -c "$(curl --location https://raw.githubusercontent.com/F1bonacc1/process-compose/main/scripts/get-pc.sh)" --
+          -d -b ~/bin
+
       - name: Run the smoke test suite
         run: |
           export PATH="$HOME/bin:$PATH"
           ./deployments/scripts/smoke-test.sh
 
-      - name: Display comet logs
-        if: always()
-        run: cat deployments/logs/comet.log
-      - name: Display pd runtime logs
-        if: always()
-        run: cat deployments/logs/pd.log
-      - name: Display pd test logs
+      - name: Display smoke-test logs
         if: always()
-        run: cat deployments/logs/pd-tests.log
-      - name: Display pclientd logs
-        if: always()
-        run: cat deployments/logs/pclientd.log
-      - name: Display pcli logs
-        if: always()
-        run: cat deployments/logs/pcli.log
-    env:
-      TESTNET_RUNTIME: 2m
-
+        run: cat deployments/logs/smoke-*.log
diff --git a/crates/bin/pcli/tests/network_integration.rs b/crates/bin/pcli/tests/network_integration.rs
index 4e37f59a8c..b84a660413 100644
--- a/crates/bin/pcli/tests/network_integration.rs
+++ b/crates/bin/pcli/tests/network_integration.rs
@@ -36,14 +36,16 @@ const TEST_ASSET: &str = "1020test_usd";
 const TIMEOUT_COMMAND_SECONDS: u64 = 20;
 
 // The time to wait before attempting to perform an undelegation claim.
-// By default the epoch duration is 100 blocks, the block time is ~500 ms,
-// and the number of unbonding epochs is 2.
+// The "unbonding_delay" value is specified in blocks, and in the smoke tests,
+// block time is set to ~500ms, so we'll take the total number of blocks
+// that must elapse and sleep half that many seconds.
 static UNBONDING_DURATION: Lazy<Duration> = Lazy::new(|| {
-    let blocks: f64 = std::env::var("EPOCH_DURATION")
+    let blocks: f64 = std::env::var("UNBONDING_DELAY")
         .unwrap_or("100".to_string())
         .parse()
         .unwrap();
-    Duration::from_secs((1.5 * blocks) as u64)
+    // 0.5 -> 0.6 for comfort, since 500ms is only an estimate.
+    Duration::from_secs((0.6 * blocks) as u64)
 });
 
 /// Import the wallet from seed phrase into a temporary directory.
diff --git a/deployments/compose/process-compose-smoke-test.yml b/deployments/compose/process-compose-smoke-test.yml
new file mode 100644
index 0000000000..975619c46d
--- /dev/null
+++ b/deployments/compose/process-compose-smoke-test.yml
@@ -0,0 +1,128 @@
+---
+# A process-compose configuration for running penumbra smoke-tests.
+#
+# https://github.com/F1bonacc1/process-compose/
+#
+version: "0.5"
+
+# Env vars set here will be accessible to all processes.
+environment:
+  - "PENUMBRA_NODE_PD_URL=http://127.0.0.1:8080"
+  - "PCLI_UNLEASH_DANGER=yes"
+  - "EPOCH_DURATION=50"
+  - "UNBONDING_DELAY=50"
+  - "RUST_LOG=info,network_integration=debug,pclientd=debug,pcli=info,pd=info,penumbra=info"
+
+log_level: info
+is_strict: true
+# Interleave logs from all services in single file, so it's greppable.
+log_location: deployments/logs/smoke-combined.log
+
+processes:
+  # Build latest version of local code. We do this once, up front,
+  # so that each test suite runs immediately when ready, without iterative building.
+  build-code:
+    command: |-
+      echo "Building source code before running tests..."
+      cargo --quiet build --release --all-targets
+      cargo --quiet test --release --no-run
+      cargo --quiet test --release --no-run -- --ignored
+      cargo --quiet test --release --features sct-divergence-check --package pclientd --no-run -- \
+        --ignored --test-threads 1 --nocapture
+      cargo --quiet test --release --features sct-divergence-check,download-proving-keys --package pcli --no-run -- \
+        --ignored --test-threads 1 --nocapture
+      cargo --quiet test --release --package pd --no-run -- \
+        --ignored --test-threads 1 --nocapture
+
+  # Create network configuration, for running a pd validator locally.
+  network-generate:
+    command: >
+      cargo run --quiet --release --bin pd -- 
+      testnet generate --unbonding-delay 50
+      --epoch-duration 50 --timeout-commit 500ms --gas-price-simple 1000
+    depends_on:
+      build-code:
+        condition: process_completed_successfully
+
+  # Run pd validator based on generated network.
+  pd:
+    command: "cargo run --release --bin pd -- start"
+    readiness_probe:
+      http_get:
+        host: 127.0.0.1
+        scheme: http
+        path: "/"
+        port: 8080
+      period_seconds: 5
+    depends_on:
+      network-generate:
+        condition: process_completed_successfully
+
+  # Run CometBFT for pd p2p.
+  cometbft:
+    command: "cometbft --home ~/.penumbra/testnet_data/node0/cometbft start"
+    depends_on:
+      pd:
+        condition: process_healthy
+    environment:
+      - "LOCAL_ENV_VAR=1"
+
+  # Run `pd` integration tests.
+  test-pd:
+    command: >-
+      cargo test --release --package pd -- --ignored --test-threads 1 --nocapture
+    depends_on:
+      pd:
+        condition: process_healthy
+      cometbft:
+        condition: process_started
+    availability:
+      restart: exit_on_failure
+
+  # Run `pclientd` integration tests.
+  test-pclientd:
+    command: >-
+      cargo test --release --features sct-divergence-check --package pclientd --
+      --ignored --test-threads 1 --nocapture
+    log_location: deployments/logs/smoke-test-pclientd.log
+    depends_on:
+      pd:
+        condition: process_healthy
+      cometbft:
+        condition: process_started
+      test-pd:
+        condition: process_completed
+    availability:
+      restart: exit_on_failure
+
+  # Run `pcli` integration tests.
+  test-pcli:
+    command: >-
+      cargo test --release --features sct-divergence-check,download-proving-keys --package pcli --
+      --ignored --test-threads 1 --nocapture
+    log_location: deployments/logs/smoke-test-pcli.log
+    depends_on:
+      pd:
+        condition: process_healthy
+      cometbft:
+        condition: process_started
+      test-pclientd:
+        condition: process_completed
+    availability:
+      restart: exit_on_failure
+
+  # Finalizer task, which will wait until all test suites have finished.
+  # This allows us to ensure that process-compose itself exits once the tests are done.
+  summary:
+    # The `command` only runs if all tests were successful,
+    # otherwise the process exits due to dep failure.
+    command: echo tests finished
+    depends_on:
+      test-pd:
+        condition: process_completed_successfully
+      test-pclientd:
+        condition: process_completed_successfully
+      test-pcli:
+        condition: process_completed_successfully
+    availability:
+      exit_on_end: true
diff --git a/deployments/scripts/smoke-test.sh b/deployments/scripts/smoke-test.sh
index a619c347e3..7e65cce2d3 100755
--- a/deployments/scripts/smoke-test.sh
+++ b/deployments/scripts/smoke-test.sh
@@ -1,15 +1,5 @@
 #!/usr/bin/env bash
-# Wrapper script to bottle up logic for running "smoke tests" in CI,
-# supporting backgrounding tasks and checking on their status later.
-# The execution plan is:
-#
-#   1. Start the network
-#   2. Wait ~10s
-#   3. Run integration tests (fail here if non-zero)
-#   4. Continue running network ~5m
-#
-# The goal is to fail fast if an integration test exits, but permit
-# a slightly longer runtime for the suite to find more errors.
+# Run smoke test suite, via process-compose config.
 set -euo pipefail
 
 
@@ -27,66 +17,20 @@ if ! hash cometbft > /dev/null 2>&1 ; then
     exit 1
 fi
 
-# If the action is running in debugging mode, then show me *everything*
-if [ -n "${RUNNER_DEBUG:-}" ]; then
-    export RUST_LOG=debug
+# Check for interactive terminal session, enable TUI if yes.
+if [[ -t 1 ]] ; then
+    use_tui="true"
 else
-    export RUST_LOG="info,network_integration=debug,pclientd=debug,pcli=info,pd=info,penumbra=info"
+    use_tui="false"
 fi
 
-# Duration that the network will be left running before script exits.
-TESTNET_RUNTIME="${TESTNET_RUNTIME:-120}"
-# Duration that the network will run before integration tests are run.
-TESTNET_BOOTTIME="${TESTNET_BOOTTIME:-20}"
-
-# Directory to store log output, useful for debugging; is git-ignored.
-SMOKE_LOG_DIR="deployments/logs"
-
-echo "Building latest version of pd from source..."
-cargo build --quiet --release --bin pd
-
-echo "Generating testnet config..."
-EPOCH_DURATION="${EPOCH_DURATION:-50}"
-UNBONDING_DELAY="${UNBONDING_DELAY:-50}"
-cargo run --quiet --release --bin pd -- testnet generate --unbonding-delay "$UNBONDING_DELAY" --epoch-duration "$EPOCH_DURATION" --timeout-commit 500ms --gas-price-simple=1000
-
-echo "Starting CometBFT..."
-cometbft start --log_level=error --home "${HOME}/.penumbra/testnet_data/node0/cometbft" > "${SMOKE_LOG_DIR}/comet.log" &
-cometbft_pid="$!"
-
-echo "Starting pd..."
-cargo run --release --bin pd -- start --home "${HOME}/.penumbra/testnet_data/node0/pd" > "${SMOKE_LOG_DIR}/pd.log" &
-pd_pid="$!"
-
-# Ensure processes are cleaned up after script exits, regardless of status.
-trap 'kill -9 "$cometbft_pid" "$pd_pid"' EXIT
-
-echo "Waiting $TESTNET_BOOTTIME seconds for network to boot..."
-sleep "$TESTNET_BOOTTIME"
-
-echo "Running pd integration tests against running pd binary"
-    cargo test --release --package pd -- --ignored --test-threads 1 --nocapture | tee "${SMOKE_LOG_DIR}/pd-tests.log"
-
-echo "Running pclientd integration tests against network"
-PENUMBRA_NODE_PD_URL="http://127.0.0.1:8080" \
-    PCLI_UNLEASH_DANGER="yes" \
-    cargo test --release --features sct-divergence-check --package pclientd -- --ignored --test-threads 1 --nocapture | tee "${SMOKE_LOG_DIR}/pclientd.log"
-
-echo "Running pcli integration tests against network"
-PENUMBRA_NODE_PD_URL="http://127.0.0.1:8080" \
-    PCLI_UNLEASH_DANGER="yes" \
-    cargo test --release --features sct-divergence-check,download-proving-keys --package pcli -- --ignored --test-threads 1 --nocapture | tee "${SMOKE_LOG_DIR}/pcli.log"
-
-echo "Waiting another $TESTNET_RUNTIME seconds while network runs..."
-sleep "$TESTNET_RUNTIME"
-# `kill -0` checks existence of pid, i.e. whether the process is still running.
-# It doesn't inspect errors, but the only reason the process would be stopped
-# is if it failed, so it's good enough for our needs.
-if ! kill -0 "$cometbft_pid" || ! kill -0 "$pd_pid" ; then
-    >&2 echo "ERROR: smoke test process exited early"
-    >&2 echo "Review logs in: ${SMOKE_LOG_DIR}/"
+repo_root="$(git rev-parse --show-toplevel)"
+# Override the pc API port 8080 -> 9191, to avoid conflict with pd.
+if ! process-compose --config deployments/compose/process-compose-smoke-test.yml --port 9191 -t="$use_tui" ; then
+    >&2 echo "ERROR: smoke tests failed"
+    >&2 echo "Review logs in: deployments/logs/smoke-*.log"
+    find "${repo_root}/deployments/logs/smoke-"*".log" | sort >&2
     exit 1
 else
-    echo "SUCCESS! Smoke test complete. Ran for $TESTNET_RUNTIME, found no errors."
+    echo "SUCCESS! Smoke test complete."
 fi
-exit 0
diff --git a/justfile b/justfile
new file mode 100644
index 0000000000..e63d30dea8
--- /dev/null
+++ b/justfile
@@ -0,0 +1,4 @@
+smoke:
+    # resetting network state
+    cargo run --release --bin pd -- testnet unsafe-reset-all || true
+    ./deployments/scripts/smoke-test.sh