feat(tests): use process-compose for smoke tests

Containers are out, process orchestration is in. A while back we ditched using containers for the smoke tests, mostly because the caching on the container-build story was atrocious, so test re-runs took a really long time. And frankly, container ergonomics on dev workstations, particularly macOS, are not awesome. Instead, let's assume the dev env can run processes for cargo, pd, and cometbft. If so, that's all we need to wire up our integration testing. Enter process-compose [0]. The new smoke test setup ditches the bash script and delegates to process-compose for orchestrating processes. Benchmarking via hyperfine shows a decrease of over 2x in runtime. There's one substantive change to the integration test logic, in the pcli suite, that reduces the sleep time between tests, refining it to be more precisely the duration necessary for claiming an undelegation. [0] https://github.com/F1bonacc1/process-compose
penumbra-zone · May 6, 2024 · 7cb1e2f · 7cb1e2f
1 parent ac169d7
commit 7cb1e2f
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 89 deletions.
diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml
@@ -23,26 +23,16 @@ jobs:
       - name: Install cometbft binary
         run: ./deployments/scripts/install-cometbft
 
+      - name: Install process-compose
+        run: >-
+          sh -c "$(curl --location https://raw.githubusercontent.com/F1bonacc1/process-compose/main/scripts/get-pc.sh)" --
+          -d -b ~/bin
+
       - name: Run the smoke test suite
         run: |
           export PATH="$HOME/bin:$PATH"
           ./deployments/scripts/smoke-test.sh
 
-      - name: Display comet logs
-        if: always()
-        run: cat deployments/logs/comet.log
-      - name: Display pd runtime logs
-        if: always()
-        run: cat deployments/logs/pd.log
-      - name: Display pd test logs
+      - name: Display smoke-test logs
         if: always()
-        run: cat deployments/logs/pd-tests.log
-      - name: Display pclientd logs
-        if: always()
-        run: cat deployments/logs/pclientd.log
-      - name: Display pcli logs
-        if: always()
-        run: cat deployments/logs/pcli.log
-    env:
-      TESTNET_RUNTIME: 2m
-
+        run: cat deployments/logs/smoke-*.log
diff --git a/crates/bin/pcli/tests/network_integration.rs b/crates/bin/pcli/tests/network_integration.rs
@@ -36,14 +36,16 @@ const TEST_ASSET: &str = "1020test_usd";
 const TIMEOUT_COMMAND_SECONDS: u64 = 20;
 
 // The time to wait before attempting to perform an undelegation claim.
-// By default the epoch duration is 100 blocks, the block time is ~500 ms,
-// and the number of unbonding epochs is 2.
+// The "unbonding_delay" value is specified in blocks, and in the smoke tests,
+// block time is set to ~500ms, so we'll take the total number of blocks
+// that must elapse and sleep half that many seconds.
 static UNBONDING_DURATION: Lazy<Duration> = Lazy::new(|| {
-    let blocks: f64 = std::env::var("EPOCH_DURATION")
+    let blocks: f64 = std::env::var("UNBONDING_DELAY")
         .unwrap_or("100".to_string())
         .parse()
         .unwrap();
-    Duration::from_secs((1.5 * blocks) as u64)
+    // 0.5 -> 0.6 for comfort, since 500ms is only an estimate.
+    Duration::from_secs((0.6 * blocks) as u64)
 });
 
 /// Import the wallet from seed phrase into a temporary directory.

diff --git a/deployments/compose/process-compose-smoke-test.yml b/deployments/compose/process-compose-smoke-test.yml
@@ -0,0 +1,128 @@
+---
+# A process-compose configuration for running penumbra smoke-tests.
+#
+# https://github.com/F1bonacc1/process-compose/
+#
+version: "0.5"
+
+# Env vars set here will be accessible to all processes.
+environment:
+  - "PENUMBRA_NODE_PD_URL=http://127.0.0.1:8080"
+  - "PCLI_UNLEASH_DANGER=yes"
+  - "EPOCH_DURATION=50"
+  - "UNBONDING_DELAY=50"
+  - "RUST_LOG=info,network_integration=debug,pclientd=debug,pcli=info,pd=info,penumbra=info"
+
+log_level: info
+is_strict: true
+# Interleave logs from all services in single file, so it's greppable.
+log_location: deployments/logs/smoke-combined.log
+
+processes:
+  # Build latest version of local code. We do this once, up front,
+  # so that each test suite runs immediately when ready, without iterative building.
+  build-code:
+    command: |-
+      echo "Building source code before running tests..."
+      cargo --quiet build --release --all-targets
+      cargo --quiet test --release --no-run
+      cargo --quiet test --release --no-run -- --ignored
+      cargo --quiet test --release --features sct-divergence-check --package pclientd --no-run -- \
+        --ignored --test-threads 1 --nocapture
+      cargo --quiet test --release --features sct-divergence-check,download-proving-keys --package pcli --no-run -- \
+        --ignored --test-threads 1 --nocapture
+      cargo --quiet test --release --package pd --no-run -- \
+        --ignored --test-threads 1 --nocapture
+
+  # Create network configuration, for running a pd validator locally.
+  network-generate:
+    command: >-
+      cargo run --quiet --release --bin pd --
+      testnet generate --unbonding-delay 50
+      --epoch-duration 50 --timeout-commit 500ms --gas-price-simple 1000
+    depends_on:
+      build-code:
+        condition: process_completed_successfully
+
+  # Run pd validator based on generated network.
+  pd:
+    command: "cargo run --release --bin pd -- start"
+    readiness_probe:
+      http_get:
+        host: 127.0.0.1
+        scheme: http
+        path: "/"
+        port: 8080
+      period_seconds: 5
+    depends_on:
+      network-generate:
+        condition: process_completed_successfully
+
+  # Run CometBFT for pd p2p.
+  cometbft:
+    command: "cometbft --home ~/.penumbra/testnet_data/node0/cometbft start"
+    depends_on:
+      pd:
+        condition: process_healthy
+    environment:
+      - "LOCAL_ENV_VAR=1"
+
+  # Run `pd` integration tests.
+  test-pd:
+    command: >-
+      cargo test --release --package pd -- --ignored --test-threads 1 --nocapture
+    depends_on:
+      pd:
+        condition: process_healthy
+      cometbft:
+        condition: process_started
+    availability:
+      restart: exit_on_failure
+
+  # Run `pclientd` integration tests.
+  test-pclientd:
+    command: >-
+      cargo test --release --features sct-divergence-check --package pclientd --
+      --ignored --test-threads 1 --nocapture
+    log_location: deployments/logs/smoke-test-pclientd.log
+    depends_on:
+      pd:
+        condition: process_healthy
+      cometbft:
+        condition: process_started
+      test-pd:
+        condition: process_completed
+    availability:
+      restart: exit_on_failure
+
+  # Run `pcli` integration tests.
+  test-pcli:
+    command: >-
+      cargo test --release --features sct-divergence-check,download-proving-keys --package pcli --
+      --ignored --test-threads 1 --nocapture
+    log_location: deployments/logs/smoke-test-pcli.log
+    depends_on:
+      pd:
+        condition: process_healthy
+      cometbft:
+        condition: process_started
+      test-pclientd:
+        condition: process_completed
+    availability:
+      restart: exit_on_failure
+
+  # Finalizer task, which will wait until all test suites have finished.
+  # This allows us to ensure that every suite passed before reporting success.
+  summary:
+    # The `command` only runs if all tests were successful,
+    # otherwise the process exits due to dep failure.
+    command: echo tests finished
+    depends_on:
+      test-pd:
+        condition: process_completed_successfully
+      test-pclientd:
+        condition: process_completed_successfully
+      test-pcli:
+        condition: process_completed_successfully
+    availability:
+      exit_on_end: true
diff --git a/deployments/scripts/smoke-test.sh b/deployments/scripts/smoke-test.sh
@@ -1,15 +1,5 @@
 #!/usr/bin/env bash
-# Wrapper script to bottle up logic for running "smoke tests" in CI,
-# supporting backgrounding tasks and checking on their status later.
-# The execution plan is:
-#
-#   1. Start the network
-#   2. Wait ~10s
-#   3. Run integration tests (fail here if non-zero)
-#   4. Continue running network ~5m
-#
-# The goal is to fail fast if an integration test exits, but permit
-# a slightly longer runtime for the suite to find more errors.
+# Run smoke test suite, via process-compose config.
 set -euo pipefail
 
 
@@ -27,66 +17,20 @@ if ! hash cometbft > /dev/null 2>&1 ; then
     exit 1
 fi
 
-# If the action is running in debugging mode, then show me *everything*
-if [ -n "${RUNNER_DEBUG:-}" ]; then
-    export RUST_LOG=debug
+# Check for interactive terminal session, enable TUI if yes.
+if [[ -t 1 ]] ; then
+    use_tui="true"
 else
-    export RUST_LOG="info,network_integration=debug,pclientd=debug,pcli=info,pd=info,penumbra=info"
+    use_tui="false"
 fi
 
-# Duration that the network will be left running before script exits.
-TESTNET_RUNTIME="${TESTNET_RUNTIME:-120}"
-# Duration that the network will run before integration tests are run.
-TESTNET_BOOTTIME="${TESTNET_BOOTTIME:-20}"
-
-# Directory to store log output, useful for debugging; is git-ignored.
-SMOKE_LOG_DIR="deployments/logs"
-
-echo "Building latest version of pd from source..."
-cargo build --quiet --release --bin pd
-
-echo "Generating testnet config..."
-EPOCH_DURATION="${EPOCH_DURATION:-50}"
-UNBONDING_DELAY="${UNBONDING_DELAY:-50}"
-cargo run --quiet --release --bin pd -- testnet generate --unbonding-delay "$UNBONDING_DELAY" --epoch-duration "$EPOCH_DURATION" --timeout-commit 500ms --gas-price-simple=1000
-
-echo "Starting CometBFT..."
-cometbft start --log_level=error --home "${HOME}/.penumbra/testnet_data/node0/cometbft" > "${SMOKE_LOG_DIR}/comet.log" &
-cometbft_pid="$!"
-
-echo "Starting pd..."
-cargo run --release --bin pd -- start --home "${HOME}/.penumbra/testnet_data/node0/pd" > "${SMOKE_LOG_DIR}/pd.log" &
-pd_pid="$!"
-
-# Ensure processes are cleaned up after script exits, regardless of status.
-trap 'kill -9 "$cometbft_pid" "$pd_pid"' EXIT
-
-echo "Waiting $TESTNET_BOOTTIME seconds for network to boot..."
-sleep "$TESTNET_BOOTTIME"
-
-echo "Running pd integration tests against running pd binary"
-    cargo test --release --package pd -- --ignored --test-threads 1 --nocapture | tee "${SMOKE_LOG_DIR}/pd-tests.log"
-
-echo "Running pclientd integration tests against network"
-PENUMBRA_NODE_PD_URL="http://127.0.0.1:8080" \
-    PCLI_UNLEASH_DANGER="yes" \
-    cargo test --release --features sct-divergence-check --package pclientd -- --ignored --test-threads 1 --nocapture | tee "${SMOKE_LOG_DIR}/pclientd.log"
-
-echo "Running pcli integration tests against network"
-PENUMBRA_NODE_PD_URL="http://127.0.0.1:8080" \
-    PCLI_UNLEASH_DANGER="yes" \
-    cargo test --release --features sct-divergence-check,download-proving-keys --package pcli -- --ignored --test-threads 1 --nocapture | tee "${SMOKE_LOG_DIR}/pcli.log"
-
-echo "Waiting another $TESTNET_RUNTIME seconds while network runs..."
-sleep "$TESTNET_RUNTIME"
-# `kill -0` checks existence of pid, i.e. whether the process is still running.
-# It doesn't inspect errors, but the only reason the process would be stopped
-# is if it failed, so it's good enough for our needs.
-if ! kill -0 "$cometbft_pid" || ! kill -0 "$pd_pid" ; then
-    >&2 echo "ERROR: smoke test process exited early"
-    >&2 echo "Review logs in: ${SMOKE_LOG_DIR}/"
+repo_root="$(git rev-parse --show-toplevel)"
+# Override the pc API port 8080 -> 9191, to avoid conflict with pd.
+if ! process-compose --config deployments/compose/process-compose-smoke-test.yml --port 9191 -t="$use_tui" ; then
+    >&2 echo "ERROR: smoke tests failed"
+    >&2 echo "Review logs in: deployments/logs/smoke-*.log"
+    find "${repo_root}/deployments/logs/smoke-"*".log" | sort >&2
     exit 1
 else
-    echo "SUCCESS! Smoke test complete. Ran for $TESTNET_RUNTIME, found no errors."
+    echo "SUCCESS! Smoke test complete."
 fi
-exit 0
diff --git a/justfile b/justfile
@@ -0,0 +1,4 @@
+smoke:
+    # resetting network state
+    cargo run --release --bin pd -- testnet unsafe-reset-all || true
+    ./deployments/scripts/smoke-test.sh