From fabf2ded0dde577c30d4b1c9602977e749ff69ee Mon Sep 17 00:00:00 2001 From: zhangkai Date: Sun, 25 Jun 2023 16:53:56 +0800 Subject: [PATCH] support repair snapshot (#172) * support repair snapshot * add flag to control record snapshot journal --------- Co-authored-by: KamiD <44460798+KamiD@users.noreply.github.com> --- app/repair_state.go | 3 +- cmd/okbchaind/repair_data.go | 4 --- libs/cosmos-sdk/server/start.go | 1 + libs/cosmos-sdk/server/start_okchain.go | 1 + libs/cosmos-sdk/store/mpt/params.go | 2 ++ libs/cosmos-sdk/store/mpt/snapshot.go | 31 +++++++++++++++++-- .../store/rootmulti/rootmulti_store.go | 15 ++------- libs/tendermint/global/status.go | 11 +++++++ 8 files changed, 48 insertions(+), 20 deletions(-) create mode 100644 libs/tendermint/global/status.go diff --git a/app/repair_state.go b/app/repair_state.go index 8811e27e4..f3931c2ae 100644 --- a/app/repair_state.go +++ b/app/repair_state.go @@ -89,7 +89,8 @@ func repairStateOnStart(ctx *server.Context) { func RepairState(ctx *server.Context, onStart bool) { sm.SetIgnoreSmbCheck(true) iavl.SetIgnoreVersionCheck(true) - rootmulti.SetRepair() + global.SetRepairState(true) + defer global.SetRepairState(false) // load latest block height dataDir := filepath.Join(ctx.Config.RootDir, "data") diff --git a/cmd/okbchaind/repair_data.go b/cmd/okbchaind/repair_data.go index 15c11c43f..366da6b4d 100644 --- a/cmd/okbchaind/repair_data.go +++ b/cmd/okbchaind/repair_data.go @@ -2,7 +2,6 @@ package main import ( "fmt" - "github.com/okx/okbchain/libs/cosmos-sdk/store/mpt" "log" "net/http" _ "net/http/pprof" @@ -64,7 +63,4 @@ func setExternalPackageValue() { tmiavl.SetEnableFastStorage(true) tmiavl.SetIgnoreAutoUpgrade(true) } - if !viper.GetBool(tmiavl.FlagIavlDiscardFastStorage) { - mpt.SetSnapshotRebuild(true) - } } diff --git a/libs/cosmos-sdk/server/start.go b/libs/cosmos-sdk/server/start.go index ef80dbca2..cf52937ac 100644 --- a/libs/cosmos-sdk/server/start.go +++ b/libs/cosmos-sdk/server/start.go @@ -362,4 +362,5 @@ func SetExternalPackageValue(cmd *cobra.Command) { mptstore.TrieAsyncDBInitCap = viper.GetInt(mptstore.FlagTrieAsyncDBInitCap) mptstore.TrieAsyncDBAutoPruningOff = viper.GetBool(mptstore.FlagTrieAsyncDBAutoPruningOff) mptstore.TrieAsyncDBSyncPruning = viper.GetBool(mptstore.FlagTrieAsyncDBSyncPruning) + mptstore.SetSnapshotJournal(viper.GetBool(mptstore.FlagTrieEnableSnapshotJournal)) } diff --git a/libs/cosmos-sdk/server/start_okchain.go b/libs/cosmos-sdk/server/start_okchain.go index 6f568df76..9af7a5832 100644 --- a/libs/cosmos-sdk/server/start_okchain.go +++ b/libs/cosmos-sdk/server/start_okchain.go @@ -260,6 +260,7 @@ func RegisterServerFlags(cmd *cobra.Command) *cobra.Command { cmd.Flags().Int(mpt.FlagTrieAsyncDBInitCap, 200_0000, "Init cap of trie async db") cmd.Flags().Bool(mpt.FlagTrieAsyncDBAutoPruningOff, false, "Disable auto prune of trie async db") cmd.Flags().Bool(mpt.FlagTrieAsyncDBSyncPruning, false, "if auto pruning is off and this is on, trie async db will be pruned every block in sync mode") + cmd.Flags().Bool(mpt.FlagTrieEnableSnapshotJournal, false, "Enable record snapshot's journal. So that snapshot can be repaired within certain version") cmd.Flags().Int64(FlagCommitGapHeight, 10, "Block interval to commit cached data into db, affects iavl & mpt") cmd.Flags().Int64(FlagFastSyncGap, 20, "Block height interval to switch fast-sync mode") diff --git a/libs/cosmos-sdk/store/mpt/params.go b/libs/cosmos-sdk/store/mpt/params.go index 5ca978eaf..0deb7a9ed 100644 --- a/libs/cosmos-sdk/store/mpt/params.go +++ b/libs/cosmos-sdk/store/mpt/params.go @@ -24,6 +24,8 @@ const ( FlagTrieAsyncDBInitCap = "trie.asyncdb.init-cap" FlagTrieAsyncDBAutoPruningOff = "trie.asyncdb.auto-pruning-off" FlagTrieAsyncDBSyncPruning = "trie.asyncdb.sync-pruning" + + FlagTrieEnableSnapshotJournal = "trie.enable-snapshot-journal" ) var ( diff --git a/libs/cosmos-sdk/store/mpt/snapshot.go b/libs/cosmos-sdk/store/mpt/snapshot.go index 45e52d141..c89e530fe 100644 --- a/libs/cosmos-sdk/store/mpt/snapshot.go +++ b/libs/cosmos-sdk/store/mpt/snapshot.go @@ -6,17 +6,24 @@ import ( "github.com/ethereum/go-ethereum/core/rawdb" "github.com/ethereum/go-ethereum/core/state/snapshot" mpttypes "github.com/okx/okbchain/libs/cosmos-sdk/store/mpt/types" + "github.com/okx/okbchain/libs/tendermint/global" ) var ( gDisableSnapshot = false gSnapshotRebuild = false + + // gEnableSnapshotJournal enable snapshot journal. + // so snapshot can be repaired within snapshotMemoryLayerCount. + gEnableSnapshotJournal = false ) const ( // snapshotMemoryLayerCount snapshot memory layer count - // as we dont rollback transactions so we only keep 1 memory layer - snapshotMemoryLayerCount = 1 + // snapshotMemoryLayerCount controls the snapshot Journal height, + // if repair start-height is lower than snapshot Journal height, + // snapshot will not be repaired anymore + snapshotMemoryLayerCount = 10 ) func DisableSnapshot() { @@ -27,6 +34,14 @@ func SetSnapshotRebuild(rebuild bool) { gSnapshotRebuild = rebuild } +func SetSnapshotJournal(enable bool) { + gEnableSnapshotJournal = enable +} + +func checkSnapshotJournal() bool { + return gEnableSnapshotJournal +} + func (ms *MptStore) openSnapshot() error { if ms == nil || ms.db == nil || ms.trie == nil || ms.db.TrieDB().DiskDB() == nil || gDisableSnapshot { return fmt.Errorf("mpt store is nil or mpt trie is nil") @@ -42,6 +57,9 @@ func (ms *MptStore) openSnapshot() error { ms.logger.Error("Enabling snapshot recovery", "chainhead", version, "diskbase", *layer) recovery = true } + if global.GetRepairState() { + recovery = true + } var err error ms.snaps, err = snapshot.NewCustom(ms.db.TrieDB().DiskDB(), ms.db.TrieDB(), 256, ms.originalRoot, false, gSnapshotRebuild, recovery, ms.retriever) if err != nil { @@ -74,6 +92,8 @@ func (ms *MptStore) prepareSnap(root common.Hash) { ms.snapDestructs = make(map[common.Hash]struct{}) ms.snapAccounts = make(map[common.Hash][]byte) ms.snapStorage = make(map[common.Hash]map[common.Hash][]byte) + } else { + ms.logger.Error("prepare snapshot error", "root", root) } } @@ -94,6 +114,13 @@ func (ms *MptStore) commitSnap(root common.Hash) { if err := ms.snaps.Cap(root, snapshotMemoryLayerCount); err != nil { ms.logger.Error("Failed to cap snapshot tree", "root", root, "layers", snapshotMemoryLayerCount, "err", err) } + + // record snapshot journal + if checkSnapshotJournal() { + if _, err := ms.snaps.Journal(root); err != nil { + ms.logger.Error("Failed to journal snapshot tree", "root", root, "err", err) + } + } } ms.snap, ms.snapDestructs, ms.snapAccounts, ms.snapStorage = nil, nil, nil, nil diff --git a/libs/cosmos-sdk/store/rootmulti/rootmulti_store.go b/libs/cosmos-sdk/store/rootmulti/rootmulti_store.go index 0ef7d61b8..c757c945f 100644 --- a/libs/cosmos-sdk/store/rootmulti/rootmulti_store.go +++ b/libs/cosmos-sdk/store/rootmulti/rootmulti_store.go @@ -3,6 +3,7 @@ package rootmulti import ( "encoding/binary" "fmt" + "github.com/okx/okbchain/libs/tendermint/global" "io" "log" "path/filepath" @@ -47,18 +48,6 @@ const ( maxPruneHeightsLength = 100 ) -var ( - repairing bool -) - -func SetRepair() { - repairing = true -} - -func getRepair() bool { - return repairing -} - // Store is composed of many CommitStores. Name contrasts with // cacheMultiStore which is for cache-wrapping other MultiStores. It implements // the CommitMultiStore interface. @@ -408,7 +397,7 @@ func (rs *Store) loadVersion(ver int64, upgrades *types.StoreUpgrades) error { // we can not get the upgrade version before the expect height, // and we should not use the original 0 too, because 0 means the latest height, // so when we repair data before the milestone. we open a empty tree by cur version. - if getRepair() && version == 0 { + if global.GetRepairState() && version == 0 { param.upgradeVersion = uint64(ver) } rs.storesParams[key] = param diff --git a/libs/tendermint/global/status.go b/libs/tendermint/global/status.go new file mode 100644 index 000000000..8a1416681 --- /dev/null +++ b/libs/tendermint/global/status.go @@ -0,0 +1,11 @@ +package global + +var repairState bool + +func SetRepairState(state bool) { + repairState = state +} + +func GetRepairState() bool { + return repairState +}