enha: rework rocksdb configuration (#1871)
carneiro-cw authored Dec 17, 2024
1 parent de60e3e commit 4fc58a0
Showing 7 changed files with 50 additions and 153 deletions.
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
@@ -132,10 +132,10 @@ indicatif = "=0.17.8"
# ------------------------------------------------------------------------------

[target.'cfg(not(all(target_arch = "aarch64", target_os = "linux")))'.dependencies]
revm = { version = "=9.0.0", features = ["asm-keccak"] }
revm = { version = "=9.0.0", features = ["asm-keccak", "serde"] }

[target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies]
revm = { version = "=9.0.0" }
revm = { version = "=9.0.0", features = ["serde"]}

[target.'cfg(not(target_env = "msvc"))'.dependencies]
tikv-jemallocator = { version = "=0.6", optional = true }
4 changes: 2 additions & 2 deletions src/eth/storage/permanent/rocks/mod.rs
@@ -7,7 +7,7 @@ pub use rocks_state::RocksStorageState;
mod rocks_permanent;

/// State handler for DB and column families.
mod rocks_state;
pub mod rocks_state;

/// CFs versioned by value variant.
mod cf_versions;
@@ -19,7 +19,7 @@ mod rocks_cf;
mod rocks_config;

/// Functionalities related to the whole database.
mod rocks_db;
pub mod rocks_db;

/// All types to be serialized and deserialized in the db.
pub mod types;
153 changes: 32 additions & 121 deletions src/eth/storage/permanent/rocks/rocks_config.rs
@@ -2,14 +2,6 @@ use rocksdb::BlockBasedOptions;
use rocksdb::Cache;
use rocksdb::Options;

const GIGABYTE: usize = 1024 * 1024 * 1024;
const MEGABYTE: usize = 1024 * 1024;
const KILOBYTE: usize = 1024;

const GIGABYTE_U64: u64 = 1024 * 1024 * 1024;
const MEGABYTE_U64: u64 = 1024 * 1024;

#[derive(Debug, Clone, Copy)]
pub enum CacheSetting {
/// Enabled cache with the given size in bytes
Enabled(usize),
@@ -18,8 +10,7 @@ pub enum CacheSetting {

#[derive(Debug, Clone, Copy)]
pub enum DbConfig {
LargeSSTFiles,
FastWriteSST,
OptimizedPointLookUp,
Default,
}

@@ -30,139 +21,59 @@ impl Default for DbConfig {
}

impl DbConfig {
pub fn to_options(self, cache_setting: CacheSetting) -> Options {
pub fn to_options(self, cache_setting: CacheSetting, prefix_len: Option<usize>) -> Options {
let mut opts = Options::default();
let mut block_based_options = BlockBasedOptions::default();

opts.create_if_missing(true);
opts.create_missing_column_families(true);
opts.increase_parallelism(16);

// NOTE: As per the rocks db wiki: "The overhead of statistics is usually small but non-negligible. We usually observe an overhead of 5%-10%."
block_based_options.set_pin_l0_filter_and_index_blocks_in_cache(true);
block_based_options.set_cache_index_and_filter_blocks(true);
block_based_options.set_bloom_filter(15.5, false);

// due to the nature of our application, enabling rocks metrics decreases point lookup performance by 5x.
#[cfg(feature = "metrics")]
{
opts.enable_statistics();
opts.set_statistics_level(rocksdb::statistics::StatsLevel::ExceptTimeForMutex);
}

match self {
DbConfig::LargeSSTFiles => {
// Set the compaction style to Level Compaction
opts.set_compaction_style(rocksdb::DBCompactionStyle::Level);

// Configure the size of SST files at each level
opts.set_target_file_size_base(512 * MEGABYTE_U64);

// Increase the file size multiplier to expand file size at upper levels
opts.set_target_file_size_multiplier(2); // Each level grows in file size quicker

// Reduce the number of L0 files that trigger compaction, increasing frequency
opts.set_level_zero_file_num_compaction_trigger(2);

// Reduce thresholds for slowing and stopping writes, which forces more frequent compaction
opts.set_level_zero_slowdown_writes_trigger(10);
opts.set_level_zero_stop_writes_trigger(20);

// Increase the max bytes for L1 to allow more data before triggering compaction
opts.set_max_bytes_for_level_base(2 * GIGABYTE_U64);

// Increase the level multiplier to aggressively increase space at each level
opts.set_max_bytes_for_level_multiplier(8.0); // Exponential growth of levels is more pronounced

// Configure block size to optimize for larger blocks, improving sequential read performance
block_based_options.set_block_size(128 * KILOBYTE);

// Increase the number of write buffers to delay flushing, optimizing CPU usage for compaction
opts.set_max_write_buffer_number(5);
opts.set_write_buffer_size(128 * MEGABYTE); // 128MB per write buffer

// Keep a higher number of open files to accommodate more files being produced by aggressive compaction
opts.set_max_open_files(20_000);

// Apply more aggressive compression settings, if I/O and CPU permit
opts.set_compression_per_level(&[
rocksdb::DBCompressionType::Lz4,
rocksdb::DBCompressionType::Zstd, // Use Zstd for higher compression from L1 onwards
]);
}
DbConfig::FastWriteSST => {
// Continue using Level Compaction due to its effective use of I/O and CPU for writes
opts.set_compaction_style(rocksdb::DBCompactionStyle::Level);

// Increase initial SST file sizes to reduce the frequency of writes to disk
opts.set_target_file_size_base(512 * MEGABYTE_U64); // Starting at 512MB for L1

// Minimize the file size multiplier to control the growth of file sizes at upper levels
opts.set_target_file_size_multiplier(1); // Minimal increase in file size at upper levels

// Increase triggers for write slowdown and stop to maximize buffer before I/O actions
opts.set_level_zero_file_num_compaction_trigger(100); // Slow down writes at 100 L0 files
opts.set_level_zero_stop_writes_trigger(200); // Stop writes at 200 L0 files

// Expand the maximum bytes for base level to further delay the need for compaction-related I/O
opts.set_max_bytes_for_level_base(2048 * MEGABYTE_U64);

// Use a higher level multiplier to increase space exponentially at higher levels
opts.set_max_bytes_for_level_multiplier(10.0);

// Opt for larger block sizes to decrease the number of read and write operations to disk
block_based_options.set_block_size(512 * KILOBYTE); // 512KB blocks
if let Some(prefix_len) = prefix_len {
let transform = rocksdb::SliceTransform::create_fixed_prefix(prefix_len);
block_based_options.set_index_type(rocksdb::BlockBasedIndexType::HashSearch);
opts.set_memtable_prefix_bloom_ratio(0.15);
opts.set_prefix_extractor(transform);
}

// Maximize the use of write buffers to extend the time data stays in memory before flushing
opts.set_max_write_buffer_number(16);
opts.set_write_buffer_size(GIGABYTE); // 1GB per write buffer
if let CacheSetting::Enabled(cache_size) = cache_setting {
let block_cache = Cache::new_lru_cache(cache_size / 2);
let row_cache = Cache::new_lru_cache(cache_size / 2);

// Allow a very high number of open files to minimize the overhead of opening and closing files
opts.set_max_open_files(20_000);
opts.set_row_cache(&row_cache);
block_based_options.set_block_cache(&block_cache);
}

// Choose compression that balances CPU use and effective storage reduction
opts.set_compression_per_level(&[rocksdb::DBCompressionType::Lz4, rocksdb::DBCompressionType::Zstd]);
match self {
DbConfig::OptimizedPointLookUp => {
block_based_options.set_data_block_hash_ratio(0.3);
block_based_options.set_data_block_index_type(rocksdb::DataBlockIndexType::BinaryAndHash);

// Enable settings that make full use of CPU to handle more data in memory and process compaction
opts.set_allow_concurrent_memtable_write(true);
opts.set_enable_write_thread_adaptive_yield(true);
opts.set_use_direct_reads(true);
opts.set_memtable_whole_key_filtering(true);
opts.set_compression_type(rocksdb::DBCompressionType::None);
}
DbConfig::Default => {
block_based_options.set_ribbon_filter(15.5); // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter

opts.set_allow_concurrent_memtable_write(true);
opts.set_enable_write_thread_adaptive_yield(true);

let transform = rocksdb::SliceTransform::create_fixed_prefix(10);
opts.set_prefix_extractor(transform);
opts.set_memtable_prefix_bloom_ratio(0.2);

// Enable a size-tiered compaction style, which is good for workloads with a high rate of updates and overwrites
opts.set_compaction_style(rocksdb::DBCompactionStyle::Universal);

let mut universal_compact_options = rocksdb::UniversalCompactOptions::default();
universal_compact_options.set_size_ratio(10);
universal_compact_options.set_min_merge_width(2);
universal_compact_options.set_max_merge_width(6);
universal_compact_options.set_max_size_amplification_percent(50);
universal_compact_options.set_compression_size_percent(-1);
universal_compact_options.set_stop_style(rocksdb::UniversalCompactionStopStyle::Total);
opts.set_universal_compaction_options(&universal_compact_options);

let pt_opts = rocksdb::PlainTableFactoryOptions {
user_key_length: 0,
bloom_bits_per_key: 10,
hash_table_ratio: 0.75,
index_sparseness: 8,
encoding_type: rocksdb::KeyEncodingType::Plain, // Default encoding
full_scan_mode: false, // Optimized for point lookups rather than full scans
huge_page_tlb_size: 0, // Not using huge pages
store_index_in_file: false, // Store index in memory for faster access
};
opts.set_plain_table_factory(&pt_opts);
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
opts.set_bottommost_compression_type(rocksdb::DBCompressionType::Zstd);
opts.set_bottommost_compression_options(-14, 32767, 0, 16 * 1024, true); // mostly defaults except max_dict_bytes
opts.set_bottommost_zstd_max_train_bytes(1600 * 1024, true);
}
}
if let CacheSetting::Enabled(cache_size) = cache_setting {
let cache = Cache::new_lru_cache(cache_size);
block_based_options.set_block_cache(&cache);
block_based_options.set_cache_index_and_filter_blocks(true);
}

opts.set_block_based_table_factory(&block_based_options);

opts
}
}
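For reference, a minimal sketch (not part of this commit) of how the reworked to_options signature is called from a sibling module such as rocks_state; the 4 GiB cache size below is an illustrative figure, not a value taken from the commit:

use super::rocks_config::CacheSetting;
use super::rocks_config::DbConfig;

// Point-lookup-heavy column family: dedicated LRU cache, no prefix extractor.
fn lookup_cf_options() -> rocksdb::Options {
    DbConfig::OptimizedPointLookUp.to_options(CacheSetting::Enabled(4 * 1024 * 1024 * 1024), None)
}

// History column family: no cache, and a 20-byte fixed prefix so the HashSearch
// block index and the memtable prefix bloom filter are configured.
fn history_cf_options() -> rocksdb::Options {
    DbConfig::Default.to_options(CacheSetting::Disabled, Some(20))
}

In the commit itself these calls are made from generate_cf_options_map in rocks_state.rs, shown further down.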
2 changes: 1 addition & 1 deletion src/eth/storage/permanent/rocks/rocks_db.rs
@@ -25,7 +25,7 @@ pub fn create_or_open_db(path: impl AsRef<Path>, cf_configs: &HashMap<&'static s
let cf_config_iter = cf_configs.iter().map(|(name, opts)| (*name, opts.clone()));

tracing::debug!("generating options for column families");
let db_opts = DbConfig::Default.to_options(CacheSetting::Disabled);
let db_opts = DbConfig::Default.to_options(CacheSetting::Disabled, None);

if !path.exists() {
tracing::warn!(?path, "RocksDB at path doesn't exist, creating a new one there instead");
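A minimal sketch of how the now-public rocks_db and rocks_state items fit together, assuming the caller lives inside this crate; the path string is illustrative, and because create_or_open_db's return type is cut off in the hunk above, the anyhow::Result error handling is an assumption:

use crate::eth::storage::permanent::rocks::rocks_db::create_or_open_db;
use crate::eth::storage::permanent::rocks::rocks_state::generate_cf_options_map;

fn open_node_db() -> anyhow::Result<()> {
    // Build the same per-column-family Options map the node uses (no cache multiplier).
    let cf_configs = generate_cf_options_map(None);
    // Create or open the database at an illustrative path with those options.
    let _db = create_or_open_db("data/rocksdb", &cf_configs)?;
    Ok(())
}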
6 changes: 0 additions & 6 deletions src/eth/storage/permanent/rocks/rocks_permanent.rs
@@ -125,12 +125,6 @@ impl PermanentStorage for RocksPermanentStorage {
}

fn save_block(&self, block: Block) -> anyhow::Result<()> {
#[cfg(feature = "metrics")]
{
self.state.export_metrics().inspect_err(|e| {
tracing::error!(reason = ?e, "failed to export metrics in RocksPermanent");
})?;
}
self.state.save_block(block).inspect_err(|e| {
tracing::error!(reason = ?e, "failed to save block in RocksPermanent");
})
29 changes: 8 additions & 21 deletions src/eth/storage/permanent/rocks/rocks_state.rs
@@ -25,7 +25,6 @@ use super::cf_versions::CfAccountsHistoryValue;
use super::cf_versions::CfAccountsValue;
use super::cf_versions::CfBlocksByHashValue;
use super::cf_versions::CfBlocksByNumberValue;
use super::cf_versions::CfLogsValue;
use super::cf_versions::CfTransactionsValue;
use super::rocks_cf::RocksCfRef;
use super::rocks_config::CacheSetting;
@@ -35,7 +34,6 @@ use super::types::AccountRocksdb;
use super::types::AddressRocksdb;
use super::types::BlockNumberRocksdb;
use super::types::HashRocksdb;
use super::types::IndexRocksdb;
use super::types::SlotIndexRocksdb;
use super::types::SlotValueRocksdb;
use crate::eth::primitives::Account;
@@ -66,7 +64,7 @@
}
}

fn generate_cf_options_map(cache_multiplier: Option<f32>) -> HashMap<&'static str, Options> {
pub fn generate_cf_options_map(cache_multiplier: Option<f32>) -> HashMap<&'static str, Options> {
let cache_multiplier = cache_multiplier.unwrap_or(1.0);

// multiplies the given size in GBs by the cache multiplier
@@ -77,14 +75,13 @@ fn generate_cf_options_map(cache_multiplier: Option<f32>) -> HashMap<&'static st
};

hmap! {
"accounts" => DbConfig::Default.to_options(cached_in_gigs_and_multiplied(15)),
"accounts_history" => DbConfig::FastWriteSST.to_options(CacheSetting::Disabled),
"account_slots" => DbConfig::Default.to_options(cached_in_gigs_and_multiplied(45)),
"account_slots_history" => DbConfig::FastWriteSST.to_options(CacheSetting::Disabled),
"transactions" => DbConfig::LargeSSTFiles.to_options(CacheSetting::Disabled),
"blocks_by_number" => DbConfig::LargeSSTFiles.to_options(CacheSetting::Disabled),
"blocks_by_hash" => DbConfig::LargeSSTFiles.to_options(CacheSetting::Disabled),
"logs" => DbConfig::LargeSSTFiles.to_options(CacheSetting::Disabled),
"accounts" => DbConfig::OptimizedPointLookUp.to_options(cached_in_gigs_and_multiplied(15), None),
"accounts_history" => DbConfig::Default.to_options(CacheSetting::Disabled, Some(20)),
"account_slots" => DbConfig::OptimizedPointLookUp.to_options(cached_in_gigs_and_multiplied(45), Some(20)),
"account_slots_history" => DbConfig::Default.to_options(CacheSetting::Disabled, Some(52)),
"transactions" => DbConfig::Default.to_options(CacheSetting::Disabled, None),
"blocks_by_number" => DbConfig::Default.to_options(CacheSetting::Disabled, None),
"blocks_by_hash" => DbConfig::Default.to_options(CacheSetting::Disabled, None)
}
}

@@ -117,7 +114,6 @@ pub struct RocksStorageState {
pub transactions: RocksCfRef<HashRocksdb, CfTransactionsValue>,
pub blocks_by_number: RocksCfRef<BlockNumberRocksdb, CfBlocksByNumberValue>,
blocks_by_hash: RocksCfRef<HashRocksdb, CfBlocksByHashValue>,
logs: RocksCfRef<(HashRocksdb, IndexRocksdb), CfLogsValue>,
/// Last collected stats for a histogram
#[cfg(feature = "metrics")]
prev_stats: Mutex<HashMap<HistogramInt, (Sum, Count)>>,
@@ -156,7 +152,6 @@ impl RocksStorageState {
transactions: new_cf_ref(&db, "transactions", &cf_options_map)?,
blocks_by_number: new_cf_ref(&db, "blocks_by_number", &cf_options_map)?,
blocks_by_hash: new_cf_ref(&db, "blocks_by_hash", &cf_options_map)?,
logs: new_cf_ref(&db, "logs", &cf_options_map)?,
#[cfg(feature = "metrics")]
prev_stats: Mutex::default(),
#[cfg(feature = "metrics")]
@@ -200,7 +195,6 @@ impl RocksStorageState {
self.transactions.clear()?;
self.blocks_by_number.clear()?;
self.blocks_by_hash.clear()?;
self.logs.clear()?;
Ok(())
}

@@ -432,16 +426,11 @@
let account_changes = block.compact_account_changes();

let mut txs_batch = vec![];
let mut logs_batch = vec![];
for transaction in block.transactions.iter().cloned() {
txs_batch.push((transaction.input.hash.into(), transaction.block_number.into()));
for log in transaction.logs {
logs_batch.push(((transaction.input.hash.into(), log.log_index.into()), transaction.block_number.into()));
}
}

self.transactions.prepare_batch_insertion(txs_batch, batch)?;
self.logs.prepare_batch_insertion(logs_batch, batch)?;

let number = block.number();
let block_hash = block.hash();
@@ -521,7 +510,6 @@ impl RocksStorageState {
self.transactions.clear().context("when clearing transactions")?;
self.blocks_by_hash.clear().context("when clearing blocks_by_hash")?;
self.blocks_by_number.clear().context("when clearing blocks_by_number")?;
self.logs.clear().context("when clearing logs")?;
Ok(())
}
}
@@ -587,7 +575,6 @@ impl RocksStorageState {
self.accounts_history.export_metrics();
self.blocks_by_hash.export_metrics();
self.blocks_by_number.export_metrics();
self.logs.export_metrics();
self.transactions.export_metrics();
Ok(())
}
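To make the cache arithmetic in generate_cf_options_map explicit, here is a hedged, standalone reconstruction of the cached_in_gigs_and_multiplied helper; its body is not visible in this diff (in the commit it appears to be a closure capturing cache_multiplier), so the code below is inferred from the comment and call sites rather than copied from the commit:

use super::rocks_config::CacheSetting;

// Hypothetical reconstruction: scale a size given in GiB by the optional cache
// multiplier and wrap the resulting byte count in CacheSetting::Enabled.
fn cached_in_gigs_and_multiplied(gigs: usize, cache_multiplier: Option<f32>) -> CacheSetting {
    const GIB: usize = 1024 * 1024 * 1024;
    let multiplier = cache_multiplier.unwrap_or(1.0);
    CacheSetting::Enabled((gigs as f32 * multiplier) as usize * GIB)
}

// Example: the "account_slots" entry asks for 45 GiB; with a 1.5 multiplier this
// becomes roughly a 67 GiB budget, which OptimizedPointLookUp splits into half
// block cache and half row cache.
// let setting = cached_in_gigs_and_multiplied(45, Some(1.5));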
