Skip to content

Commit

Permalink
Basic fail points impl, test case that uses them to cause crashes whe…
Browse files Browse the repository at this point in the history
…n attempting to write to DB (MystenLabs#7694)

Note that I'm not using https://docs.rs/failpoints/latest/failpoints/
(used elsewhere in the code) because it doesn't allow user-defined
failpoint behavior, but only a limited set of hard-coded actions (such
as panicking).
  • Loading branch information
mystenmark authored Jan 28, 2023
1 parent 72d133d commit a35b872
Show file tree
Hide file tree
Showing 14 changed files with 187 additions and 16 deletions.
8 changes: 6 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion crates/sui-benchmark/src/drivers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ impl FromStr for Interval {

// wrapper which implements serde
#[allow(dead_code)]
#[derive(Debug)]
pub struct HistogramWrapper {
histogram: Histogram<u64>,
}
Expand Down Expand Up @@ -94,7 +95,7 @@ impl StressStats {
}

/// Stores the final statistics of the test run.
#[derive(serde::Serialize, serde::Deserialize)]
#[derive(serde::Serialize, serde::Deserialize, Debug)]
pub struct BenchmarkStats {
pub duration: Duration,
pub num_error: u64,
Expand Down
60 changes: 57 additions & 3 deletions crates/sui-benchmark/tests/simtest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
#[cfg(msim)]
mod test {

use rand::{thread_rng, Rng};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use sui_benchmark::system_state_observer::SystemStateObserver;
use sui_benchmark::util::generate_all_gas_for_test;
use sui_benchmark::workloads::delegation::DelegationWorkload;
Expand All @@ -20,7 +21,7 @@ mod test {
LocalValidatorAggregatorProxy, ValidatorProxy,
};
use sui_config::SUI_KEYSTORE_FILENAME;
use sui_macros::sim_test;
use sui_macros::{register_fail_points, sim_test};
use sui_simulator::{configs::*, SimConfig};
use sui_types::object::Owner;
use test_utils::messages::get_sui_gas_object_with_wallet_context;
Expand Down Expand Up @@ -87,6 +88,57 @@ mod test {
test_simulated_load(test_cluster, 120).await;
}

#[sim_test(config = "test_config()")]
async fn test_simulated_load_reconfig_crashes() {
let test_cluster = build_test_cluster(4, 10).await;

struct DeadValidator {
node_id: sui_simulator::task::NodeId,
dead_until: std::time::Instant,
}
let dead_validator: Arc<Mutex<Option<DeadValidator>>> = Default::default();

let client_node = sui_simulator::runtime::NodeHandle::current().id();
register_fail_points(
&["batch-write", "transaction-commit", "put-cf"],
move || {
let mut dead_validator = dead_validator.lock().unwrap();
let cur_node = sui_simulator::runtime::NodeHandle::current().id();

// never kill the client node (which is running the test)
if cur_node == client_node {
return;
}

// do not fail multiple nodes at a time.
if let Some(dead) = &*dead_validator {
if dead.node_id != cur_node && dead.dead_until > Instant::now() {
return;
}
}

// otherwise, possibly fail the current node
let mut rng = thread_rng();
if rng.gen_range(0.0..1.0) < 0.01 {
let restart_after = Duration::from_millis(rng.gen_range(10000..20000));

*dead_validator = Some(DeadValidator {
node_id: cur_node,
dead_until: Instant::now() + restart_after,
});

// must manually release lock before calling kill_current_node, which panics
// and would poison the lock.
drop(dead_validator);

sui_simulator::task::kill_current_node(Some(restart_after));
}
},
);

test_simulated_load(test_cluster, 120).await;
}

async fn build_test_cluster(
default_num_validators: usize,
default_checkpoints_per_epoch: u64,
Expand Down Expand Up @@ -221,5 +273,7 @@ mod test {
.unwrap();

assert_eq!(benchmark_stats.num_error, 0);

tracing::info!("end of test {:?}", benchmark_stats);
}
}
1 change: 1 addition & 0 deletions crates/sui-macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ edition = "2021"
[dependencies]
sui-proc-macros = { path = "../sui-proc-macros" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
once_cell = "1.16"
74 changes: 74 additions & 0 deletions crates/sui-macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use std::collections::HashMap;
use std::sync::Arc;

pub use sui_proc_macros::*;

/// Evaluates an expression in a new thread which will not be subject to interception of
Expand All @@ -21,3 +24,74 @@ macro_rules! nondeterministic {
$expr
};
}

type FpMap = HashMap<&'static str, Arc<dyn Fn() + Sync + Send + 'static>>;

#[cfg(msim)]
fn with_fp_map(func: impl FnOnce(&mut FpMap)) {
thread_local! {
static MAP: std::cell::RefCell<FpMap> = Default::default();
}

MAP.with(|val| {
func(&mut val.borrow_mut());
})
}

#[cfg(not(msim))]
fn with_fp_map(func: impl FnOnce(&mut FpMap)) {
use once_cell::sync::Lazy;
use std::sync::Mutex;

static MAP: Lazy<Mutex<FpMap>> = Lazy::new(Default::default);
let mut map = MAP.lock().unwrap();
func(&mut map);
}

pub fn handle_fail_point(identifier: &'static str) {
with_fp_map(|map| {
if let Some(callback) = map.get(identifier) {
callback();
}
})
}

fn register_fail_point_impl(
identifier: &'static str,
callback: Arc<dyn Fn() + Sync + Send + 'static>,
) {
with_fp_map(move |map| {
assert!(
map.insert(identifier, callback).is_none(),
"duplicate fail point registration"
);
})
}

pub fn register_fail_point(identifier: &'static str, callback: impl Fn() + Sync + Send + 'static) {
register_fail_point_impl(identifier, Arc::new(callback));
}

pub fn register_fail_points(
identifiers: &[&'static str],
callback: impl Fn() + Sync + Send + 'static,
) {
let cb = Arc::new(callback);
for id in identifiers {
register_fail_point_impl(id, cb.clone());
}
}

#[cfg(any(msim, fail_points))]
#[macro_export]
macro_rules! fail_point {
($tag: expr) => {
$crate::handle_fail_point($tag)
};
}

#[cfg(not(any(msim, fail_points)))]
#[macro_export]
macro_rules! fail_point {
($tag: expr) => {};
}
2 changes: 1 addition & 1 deletion crates/sui-proc-macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ syn = "1.0.104"
workspace-hack = { version = "0.1", path = "../workspace-hack" }

[target.'cfg(msim)'.dependencies]
msim-macros = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "2bd1f8d83eeb9be60e9570391b01b29bb90421f5", package = "msim-macros" }
msim-macros = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "95269e60b8570411e0b09ffbd8ac95a506259bdb", package = "msim-macros" }
19 changes: 19 additions & 0 deletions crates/sui-proc-macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,25 @@ pub fn init_static_initializers(_args: TokenStream, item: TokenStream) -> TokenS
::sui_simulator::sui_framework::get_sui_framework();
::sui_simulator::sui_types::gas::SuiGasStatus::new_unmetered();

{
// Initialize the static initializers here:
// https://github.com/move-language/move/blob/f976503ec92e6942eac1c05dd8231918d07e0af6/language/tools/move-package/src/package_lock.rs#L12
use std::path::PathBuf;
use sui_simulator::sui_framework_build::compiled_package::BuildConfig;
use sui_simulator::sui_framework::build_move_package;
use sui_simulator::tempfile::TempDir;

let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
path.push("../../sui_programmability/examples/basics");
let mut build_config = BuildConfig::default();

build_config.config.install_dir = Some(TempDir::new().unwrap().into_path());
let _all_module_bytes = build_move_package(&path, build_config)
.unwrap()
.get_package_bytes(/* with_unpublished_deps */ false);
}


use ::sui_simulator::anemo_tower::callback::CallbackLayer;
use ::sui_simulator::anemo_tower::trace::DefaultMakeSpan;
use ::sui_simulator::anemo_tower::trace::DefaultOnFailure;
Expand Down
4 changes: 3 additions & 1 deletion crates/sui-simulator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ edition = "2021"
[dependencies]
workspace-hack = { version = "0.1", path = "../workspace-hack" }
sui-framework = { path = "../sui-framework" }
sui-framework-build = { path = "../sui-framework-build" }
sui-types = { path = "../sui-types" }
tempfile = "3.3.0"
tracing = "0.1"
anemo.workspace = true
anemo-tower.workspace = true
Expand All @@ -19,4 +21,4 @@ telemetry-subscribers.workspace = true
tower = "0.4.13"

[target.'cfg(msim)'.dependencies]
msim = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "2bd1f8d83eeb9be60e9570391b01b29bb90421f5", package = "msim" }
msim = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "95269e60b8570411e0b09ffbd8ac95a506259bdb", package = "msim" }
2 changes: 2 additions & 0 deletions crates/sui-simulator/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ pub use anemo_tower;
pub use fastcrypto;
pub use narwhal_network;
pub use sui_framework;
pub use sui_framework_build;
pub use sui_types;
pub use telemetry_subscribers;
pub use tempfile;
pub use tower;

#[cfg(msim)]
Expand Down
13 changes: 9 additions & 4 deletions crates/sui-swarm/src/memory/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ impl Node {

/// Start this Node
pub async fn spawn(&self) -> Result<()> {
info!(name =% self.name(), "starting in-memory node");
info!(name =% self.name().concise(), "starting in-memory node");
*self.container.lock().unwrap() =
Some(Container::spawn(self.config.clone(), self.runtime_type).await);
Ok(())
Expand All @@ -63,7 +63,7 @@ impl Node {

/// Stop this Node
pub fn stop(&self) {
info!(name =% self.name(), "stopping in-memory node");
info!(name =% self.name().concise(), "stopping in-memory node");
*self.container.lock().unwrap() = None;
}

Expand All @@ -89,14 +89,19 @@ impl Node {
.await
.map_err(|err| anyhow!(err.to_string()))
.map_err(HealthCheckError::Failure)
.tap_err(|e| error!("error connecting to {}: {e}", self.name()))?;
.tap_err(|e| error!("error connecting to {}: {e}", self.name().concise()))?;

let mut client = tonic_health::proto::health_client::HealthClient::new(channel);
client
.check(tonic_health::proto::HealthCheckRequest::default())
.await
.map_err(|e| HealthCheckError::Failure(e.into()))
.tap_err(|e| error!("error performing health check on {}: {e}", self.name()))?;
.tap_err(|e| {
error!(
"error performing health check on {}: {e}",
self.name().concise()
)
})?;
}

Ok(())
Expand Down
3 changes: 2 additions & 1 deletion crates/test-utils/src/messages.rs
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,8 @@ pub fn create_publish_move_package_transaction(
keypair: &AccountKeyPair,
gas_price: Option<u64>,
) -> VerifiedTransaction {
let build_config = BuildConfig::default();
let mut build_config = BuildConfig::default();
build_config.config.install_dir = Some(tempfile::TempDir::new().unwrap().into_path());
let all_module_bytes = sui_framework::build_move_package(&path, build_config)
.unwrap()
.get_package_bytes(/* with_unpublished_deps */ false);
Expand Down
5 changes: 5 additions & 0 deletions crates/typed-store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,8 @@ rstest = "0.16.0"
rand = "0.8.5"
syn = { version = "1.0.104", features = ["derive"] }
typed-store-derive = {path = "../typed-store-derive"}

# Most packages should depend on sui-simulator instead of directly on msim, but for typed-store
# that creates a circular dependency.
[target.'cfg(msim)'.dependencies]
msim = { git = "https://github.com/MystenLabs/mysten-sim.git", rev = "95269e60b8570411e0b09ffbd8ac95a506259bdb", package = "msim" }
5 changes: 4 additions & 1 deletion crates/typed-store/src/rocks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use tracing::{debug, error, info, instrument};

use self::{iter::Iter, keys::Keys, values::Values};
pub use errors::TypedStoreError;
use sui_macros::nondeterministic;
use sui_macros::{fail_point, nondeterministic};

// Write buffer size per RocksDB instance can be set via the env var below.
// If the env var is not set, use the default value in MiB.
Expand Down Expand Up @@ -264,6 +264,7 @@ impl RocksDB {
K: AsRef<[u8]>,
V: AsRef<[u8]>,
{
fail_point!("put-cf");
delegate_call!(self.put_cf_opt(cf, key, value, writeopts))
}

Expand All @@ -281,6 +282,7 @@ impl RocksDB {
}

pub fn write(&self, batch: RocksDBBatch) -> Result<(), TypedStoreError> {
fail_point!("batch-write");
match (self, batch) {
(RocksDB::DBWithThreadMode(db), RocksDBBatch::Regular(batch)) => {
db.underlying.write(batch)?;
Expand Down Expand Up @@ -1278,6 +1280,7 @@ impl<'a> DBTransaction<'a> {
}

pub fn commit(self) -> Result<(), TypedStoreError> {
fail_point!("transaction-commit");
self.transaction.commit().map_err(|e| match e.kind() {
// empirically, this is what you get when there is a write conflict. it is not
// documented whether this is the only time you can get this error.
Expand Down
Loading

0 comments on commit a35b872

Please sign in to comment.