Skip to content

Commit

Permalink
pd: improve migration UX
Browse files Browse the repository at this point in the history
Just have one command that does the right thing.
  • Loading branch information
hdevalence authored and conorsch committed May 7, 2024
1 parent f505101 commit 15f347c
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 90 deletions.
27 changes: 13 additions & 14 deletions crates/bin/pd/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,21 +121,20 @@ pub enum RootCommand {
#[clap(long, display_order = 300)]
prune: bool,
},
/// Run a migration on the exported storage state of the full node,
/// and create a genesis file.
/// Run a migration before resuming post-upgrade.
Migrate {
/// The directory containing exported state, created via `pd export`, to be modified
/// in-place. This should be a pd home directory, with a subdirectory called "rocksdb".
#[clap(long, display_order = 200, alias = "target-dir")]
target_directory: PathBuf,
#[clap(long, display_order = 300)]
/// Timestamp of the genesis file in RFC3339 format. If unset, defaults to the current time,
/// unless the migration logic overrides it.
genesis_start: Option<tendermint::time::Time>,
/// An optional filepath for a compressed archive containing the migrated node state,
/// e.g. ~/pd-state-post-upgrade.tar.gz.
#[clap(long, display_order = 400)]
migrate_archive: Option<PathBuf>,
/// The home directory of the full node.
///
/// Migration is performed in-place on the home directory.
#[clap(long, env = "PENUMBRA_PD_HOME", display_order = 100)]
home: Option<PathBuf>,
/// If set, also migrate the CometBFT state located in this home directory.
/// If both `--home` and `--comet-home` are unset, will attempt to migrate
/// CometBFT state alongside the auto-located `pd` state.
// Note: this does _NOT_ use an env var because we are trying to
// get explicit consent to muck around in another daemon's state.
#[clap(long, display_order = 200)]
comet_home: Option<PathBuf>,
},
}

Expand Down
41 changes: 23 additions & 18 deletions crates/bin/pd/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use rand_core::OsRng;
use tendermint_config::net::Address as TendermintAddress;
use tokio::runtime;
use tower_http::cors::CorsLayer;
use tracing::Instrument as _;
use tracing_subscriber::{prelude::*, EnvFilter};
use url::Url;

Expand Down Expand Up @@ -426,27 +427,31 @@ async fn main() -> anyhow::Result<()> {
tracing::info!("export complete: {}", export_directory.display());
}
}
RootCommand::Migrate {
target_directory,
genesis_start,
migrate_archive,
} => {
tracing::info!("migrating state in {}", target_directory.display());
RootCommand::Migrate { home, comet_home } => {
let (pd_home, comet_home) = match home {
Some(h) => (h, comet_home),
None => {
// If no pd_home was configured, we're assuming we set up the
// data in the default location, in which case we also know where comet lives.
let base = get_testnet_dir(None).join("node0");
(base.join("pd"), Some(base.join("cometbft")))
}
};
let genesis_start = pd::migrate::last_block_timestamp(pd_home.clone()).await?;
tracing::info!(?genesis_start, "last block timestamp");
let pd_migrate_span = tracing::error_span!("pd_migrate");
pd_migrate_span
.in_scope(|| tracing::info!("migrating pd state in {}", pd_home.display()));
Testnet74
.migrate(target_directory.clone(), genesis_start)
.migrate(pd_home.clone(), Some(genesis_start))
.instrument(pd_migrate_span)
.await
.context("failed to upgrade state")?;
// Compress to tarball if requested.
if let Some(archive_filepath) = migrate_archive {
pd::migrate::archive_directory(
target_directory.clone(),
archive_filepath.clone(),
None,
)?;
tracing::info!("migration complete: {}", archive_filepath.display());
} else {
// Provide friendly "OK" message that's still accurate without archiving.
tracing::info!("migration complete: {}", target_directory.display());

if let Some(comet_home) = comet_home {
// TODO avoid this when refactoring to clean up migrations
let genesis_path = pd_home.join("genesis.json");
pd::migrate::migrate_comet_data(comet_home, genesis_path).await?;
}
}
}
Expand Down
106 changes: 105 additions & 1 deletion crates/bin/pd/src/migrate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ mod testnet74;

use anyhow::Context;
use futures::StreamExt as _;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use tracing::instrument;

use cnidarium::{StateDelta, StateRead, StateWrite, Storage};
use jmt::RootHash;
Expand Down Expand Up @@ -241,3 +242,106 @@ pub fn archive_directory(
.context("failed to package archive contents")?;
Ok(())
}

/// Reads the timestamp of the most recent block from the `pd` state.
///
/// Opens the RocksDB storage under `home/rocksdb`, takes the latest
/// snapshot, and returns the block timestamp recorded in it.
pub async fn last_block_timestamp(home: PathBuf) -> anyhow::Result<tendermint::Time> {
    let db_path = home.join("rocksdb");
    let storage = Storage::load(db_path, SUBSTORE_PREFIXES.to_vec())
        .await
        .context("error loading store for timestamp")?;
    let snapshot = storage.latest_snapshot();
    let timestamp = snapshot
        .get_block_timestamp()
        .await
        .context("error reading latest block timestamp")?;
    Ok(timestamp)
}

/// Migrates the CometBFT state under `comet_home` so the node can resume
/// after a `pd` chain upgrade.
///
/// Concretely, this:
/// 1. Copies `new_genesis_file` into `comet_home/config/genesis.json`.
/// 2. Raises the signing high-water mark in `priv_validator_state.json` to the
///    genesis `initial_height` (errors if it is already at or above it).
/// 3. Deletes the CometBFT databases invalidated by the upgrade, preserving
///    `tx_index.db` so historical transactions remain queryable.
///
/// # Errors
/// Fails if the genesis file cannot be read or parsed, if `initial_height`
/// is missing or non-numeric, or if any of the filesystem steps fail.
#[instrument(skip_all)]
pub async fn migrate_comet_data(
    comet_home: PathBuf,
    new_genesis_file: PathBuf,
) -> anyhow::Result<()> {
    tracing::info!(?comet_home, ?new_genesis_file, "migrating comet data");

    // Read the contents of new_genesis_file into a serde_json::Value and pull out .initial_height
    let genesis_contents =
        std::fs::read_to_string(new_genesis_file).context("error reading new genesis file")?;
    let genesis_json: serde_json::Value =
        serde_json::from_str(&genesis_contents).context("error parsing new genesis file")?;
    tracing::info!(?genesis_json, "parsed genesis file");
    // NOTE(review): `initial_height` is encoded as a JSON string (protojson
    // style), hence the `as_str()` before parsing to u64.
    let initial_height = genesis_json["initial_height"]
        .as_str()
        .context("error reading initial_height from genesis file")?
        .parse::<u64>()?;

    // Write the genesis data to HOME/config/genesis.json
    let genesis_file = comet_home.join("config").join("genesis.json");
    tracing::info!(?genesis_file, "writing genesis file to comet config");
    std::fs::write(genesis_file, genesis_contents)
        .context("error writing genesis file to comet config")?;

    // Adjust the high-water mark in priv_validator_state.json but don't decrease it
    adjust_priv_validator_state(&comet_home, initial_height)?;

    // Delete other cometbft data.
    clear_comet_data(&comet_home)?;

    Ok(())
}

/// Raises the signing high-water mark in `priv_validator_state.json` to
/// `initial_height`, the first block of the post-upgrade chain.
///
/// The recorded height is only ever increased: if it is already at or above
/// `initial_height`, this bails out rather than lowering the mark.
///
/// # Errors
/// Fails if the state file is missing or malformed, if `height` cannot be
/// parsed, or if the recorded height is already >= `initial_height`.
#[instrument(skip_all)]
fn adjust_priv_validator_state(comet_home: &Path, initial_height: u64) -> anyhow::Result<()> {
    let priv_validator_state = comet_home.join("data").join("priv_validator_state.json");
    let current_state: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(&priv_validator_state)?)?;

    // Heights are serialized protojson-style, as decimal strings.
    let current_height = current_state["height"]
        .as_str()
        .context("error reading height from priv_validator_state.json")?
        .parse::<u64>()?;
    if current_height < initial_height {
        tracing::info!(
            "increasing height in priv_validator_state from {} to {}",
            current_height,
            initial_height
        );
        // `height` must be written back as a string (protojson encoding),
        // hence `to_string()` rather than a bare JSON number.
        let new_state = serde_json::json!({
            "height": initial_height.to_string(),
            "round": 0,
            "step": 0,
        });
        std::fs::write(
            &priv_validator_state,
            &serde_json::to_string_pretty(&new_state)?,
        )?;
        // Log success only after the write has actually landed on disk;
        // previously this was logged before the write, which could report
        // an update that then failed.
        tracing::info!(?new_state, "updated priv_validator_state.json");
    } else {
        anyhow::bail!(
            "priv_validator_state height {} is already greater than or equal to initial_height {}",
            current_height,
            initial_height
        );
    }

    Ok(())
}

/// Deletes the CometBFT data stores that are invalidated by a chain upgrade.
///
/// Removes the `evidence.db`, `state.db`, `blockstore.db`, and `cs.wal`
/// directories from `comet_home/data`, while deliberately leaving
/// `tx_index.db` intact so CometBFT can still reference historical
/// transactions from behind the upgrade boundary.
///
/// # Errors
/// Fails if any of the removals fail (e.g. due to permissions).
#[instrument(skip_all)]
fn clear_comet_data(comet_home: &Path) -> anyhow::Result<()> {
    let data_dir = comet_home.join("data");

    /*
    N.B. We want to preserve the `tx_index.db` directory.
    Doing so will allow CometBFT to reference historical transactions behind the upgrade boundary.
    */
    for subdir in &["evidence.db", "state.db", "blockstore.db", "cs.wal"] {
        let path = data_dir.join(subdir);
        if path.exists() {
            // These entries are directories (hence remove_dir_all below),
            // so log them as such; the old message said "removing file".
            tracing::info!(?path, "removing directory");
            std::fs::remove_dir_all(path)?;
        }
    }

    Ok(())
}
62 changes: 7 additions & 55 deletions docs/guide/src/node/pd/chain-upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,69 +13,21 @@ At a high level, the upgrade process consists of the following steps:
2. Governance proposal passes.
3. Chain reaches specified height `n-1`, nodes stop generating blocks.
4. Manual upgrade is performed on each validator and fullnode:
1. Prepare migration directory via `pd export`.
2. Install the new version of pd.
3. Apply changes to node state via `pd migrate`.
4. Copy a few files and directories around, clean up CometBFT state.
5. Restart node.
1. Install the new version of pd.
2. Apply changes to `pd` and `cometbft` state via `pd migrate`.
3. Restart node.

After the node is restarted on the new version, it should be able to talk to the network again.
Once enough validators with sufficient stake weight have upgraded, the network
will resume generating blocks.

## Genesis time

In order for the chain to start again after the upgrade, all nodes must be using the same genesis information,
including the timestamp for the genesis event. While the `pd migrate` command will create a new `genesis.json` file,
it cannot know the correct genesis start time to use without the operator supplying the `--genesis-start` flag.
The community may choose to specify a start time within the upgrade proposal. If so, all operators must use that value
when performing the migration, as described below. Otherwise, validators must coordinate out of band to agree
on a genesis start time.

Leveraging the governance proposal is the recommended way to solve this problem. If the genesis start time is a value
in the future, then after the upgrade is performed, the node will start, but not process blocks. It will wait
until the `--genesis-start` time is reached, at which point it will resume processing blocks. In this way,
the community of validators can coordinate resumption of chain activity, even when operators migrate their nodes
at slightly different times.

### Testnet 71 -> 72

For the most recent upgrade on the Penumbra testnet, use this value for genesis time: `{{ #include ../../upgrade_genesis_time_71_72.md }}`.
See an example below for how to supply this value when performing the migration.

## Performing a chain upgrade

The following steps assume that your node uses the default directory of `~/.penumbra/testnet_data/node0/`
to store state for both `pd` and `cometbft`. If your instance is using a different directory, update the paths accordingly.

1. Stop both `pd` and `cometbft`. Depending on how you run Penumbra, this could mean `sudo systemctl stop penumbra cometbft`.
2. Back up the existing node state, as a precaution: `tar -cf ~/.penumbra/testnet_data/node0-state-backup-71.tar ~/.penumbra/testnet_data/node0`
3. Download the latest version of `pd` and install it. Run `pd --version` and confirm you see `{{ #include ../../penumbra_version.md }}` before proceeding.
4. Prepare an export directory:
`pd export --home ~/.penumbra/testnet_data/node0/pd --export-directory ~/.penumbra/testnet_data/node0/pd-exported-state-71`
<!--
An example log message emitted by `pd migrate` without providing `--genesis-start`:
pd::upgrade: no genesis time provided, detecting a testing setup now=2023-12-09T00:08:24.225277473Z`
The value after `now=` is what should be copied. In practice, for testnets, Penumbra Labs will advise on a genesis time
and provide that value in the documentation. Or should we just pick a genesis start ahead of time, and use that for all?
-->
5. Apply the migration: `pd migrate --genesis-start "{{ #include ../../upgrade_genesis_time_71_72.md }}" --target-directory ~/.penumbra/testnet_data/node0/pd-exported-state-71/`
You must use that precise genesis time, otherwise your node will not be able to reach consensus with the rest of the network.
6. Move the migrated state into place: `rm -r ~/.penumbra/testnet_data/node0/pd/rocksdb && mv ~/.penumbra/testnet_data/node0/pd-exported-state-71/rocksdb ~/.penumbra/testnet_data/node0/pd/`
7. Copy the new genesis into place: `cp ~/.penumbra/testnet_data/node0/pd-exported-state-71/genesis.json ~/.penumbra/testnet_data/node0/cometbft/config/genesis.json`
8. Copy the new signing state into place: `cp ~/.penumbra/testnet_data/node0/pd-exported-state-71/priv_validator_state.json ~/.penumbra/testnet_data/node0/cometbft/data/priv_validator_state.json`
9. Clean up the old CometBFT state: `find ~/.penumbra/testnet_data/node0/cometbft/data/ -mindepth 1 -maxdepth 1 -type d -and -not -name tx_index.db -exec rm -r {} +`

<!--
N.B. We use an ugly ad-hoc find command rather than `cometbft reset-state` because we want to preserve the `tx_index.db` directory.
Doing so will allow CometBFT to reference historical transactions behind the upgrade boundary.
-->
2. Download the latest version of `pd` and install it. Run `pd --version` and confirm you see `{{ #include ../../penumbra_version.md }}` before proceeding.
3. Optionally, use `pd export` to create a snapshot of the `pd` state.
4. Apply the migration with `pd migrate --home PD_HOME --comet-home COMETBFT_HOME`. If using the default home locations (from `pd testnet join`), you can omit the paths and just run `pd migrate`.
5. Optionally, use `pd export` to create a snapshot of the post-migration state.

Finally, restart the node, e.g. `sudo systemctl restart penumbra cometbft`. Check the logs, and you should see the chain progressing
past the halt height `n`.

If you want to host a snapshot for this migration, copy the file
`~/.penumbra/testnet_data/node0/pd-migrated-state-{{ #include ../../penumbra_version.md }}.tar.gz` to the appropriate hosting environment,
and inform the users of your validator.
1 change: 0 additions & 1 deletion docs/guide/src/upgrade_genesis_time_70_71.md

This file was deleted.

1 change: 0 additions & 1 deletion docs/guide/src/upgrade_genesis_time_71_72.md

This file was deleted.

0 comments on commit 15f347c

Please sign in to comment.