diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index ea5364f29b1..92918d88ae7 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -169,7 +169,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
       log_dir_synced_(false),
       log_empty_(true),
       persist_stats_cf_handle_(nullptr),
-      log_sync_cv_(&mutex_),
+      log_sync_cv_(&log_write_mutex_),
       total_log_size_(0),
       is_snapshot_supported_(true),
       write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
@@ -258,6 +258,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
   // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
   // is called by client and this seqnum is advanced.
   preserve_deletes_seqnum_.store(0);
+  max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
+                            std::memory_order_relaxed);
 }
 
 Status DBImpl::Resume() {
@@ -526,25 +528,28 @@ Status DBImpl::CloseHelper() {
     mutex_.Lock();
   }
 
-  for (auto l : logs_to_free_) {
-    delete l;
-  }
-  for (auto& log : logs_) {
-    uint64_t log_number = log.writer->get_log_number();
-    Status s = log.ClearWriter();
-    if (!s.ok()) {
-      ROCKS_LOG_WARN(
-          immutable_db_options_.info_log,
-          "Unable to Sync WAL file %s with error -- %s",
-          LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
-          s.ToString().c_str());
-      // Retain the first error
-      if (ret.ok()) {
-        ret = s;
+  {
+    InstrumentedMutexLock lock(&log_write_mutex_);
+    for (auto l : logs_to_free_) {
+      delete l;
+    }
+    for (auto& log : logs_) {
+      uint64_t log_number = log.writer->get_log_number();
+      Status s = log.ClearWriter();
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Unable to Sync WAL file %s with error -- %s",
+            LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
+            s.ToString().c_str());
+        // Retain the first error
+        if (ret.ok()) {
+          ret = s;
+        }
       }
     }
+    logs_.clear();
   }
-  logs_.clear();
 
   // Table cache may have table handles holding blocks from the block cache.
   // We need to release them before the block cache is destroyed. The block
@@ -1014,11 +1019,12 @@ Status DBImpl::SetDBOptions(
       mutable_db_options_.max_background_jobs,
       mutable_db_options_.base_background_compactions,
       /* parallelize_compactions */ true);
-  const BGJobLimits new_bg_job_limits = GetBGJobLimits(
-      new_options.max_background_flushes,
-      new_options.max_background_compactions,
-      new_options.max_background_jobs,
-      new_options.base_background_compactions, /* parallelize_compactions */ true);
+  const BGJobLimits new_bg_job_limits =
+      GetBGJobLimits(new_options.max_background_flushes,
+                     new_options.max_background_compactions,
+                     new_options.max_background_jobs,
+                     new_options.base_background_compactions,
+                     /* parallelize_compactions */ true);
 
   const bool max_flushes_increased =
       new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
@@ -1072,6 +1078,11 @@ Status DBImpl::SetDBOptions(
         thread_persist_stats_.reset();
       }
     }
+    if (new_options.max_total_wal_size !=
+        mutable_db_options_.max_total_wal_size) {
+      max_total_wal_size_.store(new_options.max_total_wal_size,
+                                std::memory_order_release);
+    }
     write_controller_.set_max_delayed_write_rate(
         new_options.delayed_write_rate);
     table_cache_.get()->SetCapacity(new_options.max_open_files == -1
@@ -1187,7 +1198,7 @@ Status DBImpl::SyncWAL() {
   uint64_t current_log_number;
 
   {
-    InstrumentedMutexLock l(&mutex_);
+    InstrumentedMutexLock l(&log_write_mutex_);
     assert(!logs_.empty());
 
     // This SyncWAL() call only cares about logs up to this number.
@@ -1235,7 +1246,7 @@ Status DBImpl::SyncWAL() {
 
   TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
   {
-    InstrumentedMutexLock l(&mutex_);
+    InstrumentedMutexLock l(&log_write_mutex_);
     MarkLogsSynced(current_log_number, need_log_dir_sync, status);
   }
   TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
@@ -1264,7 +1275,7 @@ Status DBImpl::UnlockWAL() {
 
 void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
                             const Status& status) {
-  mutex_.AssertHeld();
+  log_write_mutex_.AssertHeld();
   if (synced_dir && logfile_number_ == up_to && status.ok()) {
     log_dir_synced_ = true;
   }
@@ -1273,8 +1284,6 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
     assert(log.getting_synced);
     if (status.ok() && logs_.size() > 1) {
       logs_to_free_.push_back(log.ReleaseWriter());
-      // To modify logs_ both mutex_ and log_write_mutex_ must be held
-      InstrumentedMutexLock l(&log_write_mutex_);
       it = logs_.erase(it);
     } else {
       log.getting_synced = false;
@@ -2117,13 +2126,13 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
     write_thread_.ExitUnbatched(&w);
   }
   if (s.ok()) {
+    single_column_family_mode_.store(false, std::memory_order_release);
     auto* cfd =
         versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
     assert(cfd != nullptr);
     s = cfd->AddDirectories();
   }
   if (s.ok()) {
-    single_column_family_mode_ = false;
     auto* cfd =
         versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
     assert(cfd != nullptr);
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index ec2bfe3d39d..8a6f06e044b 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -554,6 +554,8 @@ class DBImpl : public DB {
   void FindObsoleteFiles(JobContext* job_context, bool force,
                          bool no_full_scan = false);
 
+  void FindObsoleteLogFiles(JobContext* job_context);
+
   // Diffs the files listed in filenames and those that do not
   // belong to live files are possibly removed. Also, removes all the
   // files in sst_delete_files and log_delete_files.
@@ -937,10 +939,12 @@ class DBImpl : public DB {
   // only used for dynamically adjusting max_total_wal_size. it is a sum of
   // [write_buffer_size * max_write_buffer_number] over all column families
-  uint64_t max_total_in_memory_state_;
+  std::atomic<uint64_t> max_total_in_memory_state_;
+
+  std::atomic<uint64_t> max_total_wal_size_;
 
   // If true, we have only one (default) column family. We use this to optimize
   // some code-paths
-  bool single_column_family_mode_;
+  std::atomic<bool> single_column_family_mode_;
 
   // The options to access storage files
   const EnvOptions env_options_;
@@ -1134,6 +1138,14 @@ class DBImpl : public DB {
     }
   };
 
+  struct LogContext {
+    explicit LogContext(bool need_sync = false)
+        : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
+    bool need_log_sync;
+    bool need_log_dir_sync;
+    log::Writer* writer;
+  };
+
   struct LogFileNumberSize {
     explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
     void AddSize(uint64_t new_size) { size += new_size; }
@@ -1404,8 +1416,8 @@ class DBImpl : public DB {
   Status HandleWriteBufferFull(WriteContext* write_context);
 
   // REQUIRES: mutex locked
-  Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
-                         WriteContext* write_context);
+  Status PreprocessWrite(const WriteOptions& write_options,
+                         LogContext* log_context, WriteContext* write_context);
 
   WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group,
                          WriteBatch* tmp_batch, size_t* write_with_wal,
@@ -1610,12 +1622,11 @@ class DBImpl : public DB {
   // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
   FileLock* db_lock_;
 
-  // In addition to mutex_, log_write_mutex_ protected writes to stats_history_
+  // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_
   InstrumentedMutex stats_history_mutex_;
-  // In addition to mutex_, log_write_mutex_ protected writes to logs_ and
-  // logfile_number_. With two_write_queues it also protects alive_log_files_,
-  // and log_empty_. Refer to the definition of each variable below for more
-  // details.
+  // In addition to mutex_, log_write_mutex_ protects access to logs_,
+  // logfile_number_, alive_log_files_ and log_empty_.
+  // Refer to the definition of each variable below for more details.
   // Note: to avoid dealock, if needed to acquire both log_write_mutex_ and
   // mutex_, the order should be first mutex_ and then log_write_mutex_.
   InstrumentedMutex log_write_mutex_;
@@ -1664,11 +1675,10 @@ class DBImpl : public DB {
   std::deque<LogFileNumberSize> alive_log_files_;
   // Log files that aren't fully synced, and the current log file.
   // Synchronization:
-  //  - push_back() is done from write_thread_ with locked mutex_ and
-  //    log_write_mutex_
+  //  - push_back() is done from write_thread_ with locked log_write_mutex_
   //  - pop_front() is done from any thread with locked mutex_ and
   //    log_write_mutex_
-  //  - reads are done with either locked mutex_ or log_write_mutex_
+  //  - reads are done with locked log_write_mutex_
   //  - back() and items with getting_synced=true are not popped,
   //  - The same thread that sets getting_synced=true will reset it.
   //  - it follows that the object referred by back() can be safely read from
@@ -1690,7 +1700,7 @@ class DBImpl : public DB {
   std::atomic<uint64_t> total_log_size_;
 
   // If this is non-empty, we need to delete these log files in background
-  // threads. Protected by db mutex.
+  // threads. Protected by log_write_mutex_.
   autovector<log::Writer*> logs_to_free_;
 
   bool is_snapshot_supported_;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 86985871438..81253997326 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -80,7 +80,7 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
 
 Status DBImpl::SyncClosedLogs(JobContext* job_context) {
   TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
-  mutex_.AssertHeld();
+  InstrumentedMutexLock l(&log_write_mutex_);
   autovector<log::Writer*, 1> logs_to_sync;
   uint64_t current_log_number = logfile_number_;
   while (logs_.front().number < current_log_number &&
@@ -97,7 +97,7 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) {
 
   Status s;
   if (!logs_to_sync.empty()) {
-    mutex_.Unlock();
+    log_write_mutex_.Unlock();
 
     for (log::Writer* log : logs_to_sync) {
       ROCKS_LOG_INFO(immutable_db_options_.info_log,
@@ -119,13 +119,12 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) {
       s = directories_.GetWalDir()->Fsync();
     }
 
-    mutex_.Lock();
+    log_write_mutex_.Lock();
 
     // "number <= current_log_number - 1" is equivalent to
     // "number < current_log_number".
     MarkLogsSynced(current_log_number - 1, true, s);
     if (!s.ok()) {
-      error_handler_.SetBGError(s, BackgroundErrorReason::kFlush);
       TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
       return s;
     }
@@ -174,8 +173,13 @@ Status DBImpl::FlushMemTableToOutputFile(
     // the host crashes after flushing and before WAL is persistent, the
    // flushed SST may contain data from write batches whose updates to
    // other column families are missing.
-    // SyncClosedLogs() may unlock and re-lock the db_mutex.
+    // We must release mutex_ before calling SyncClosedLogs() because it
+    // may block while waiting for another thread to finish synchronizing
+    // a log file.
+    // SyncClosedLogs() may unlock and re-lock log_write_mutex_.
+    mutex_.Unlock();
     s = SyncClosedLogs(job_context);
+    mutex_.Lock();
   } else {
     TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
   }
@@ -357,7 +361,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
   if (logfile_number_ > 0) {
     // TODO (yanqin) investigate whether we should sync the closed logs for
     // single column family case.
+    mutex_.Unlock();
     s = SyncClosedLogs(job_context);
+    mutex_.Lock();
   }
 
   // exec_status stores the execution status of flush_jobs as
@@ -1988,7 +1994,8 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
   }
   if (!parallelize_compactions) {
     // throttle background compactions until we deem necessary
-    res.max_compactions = std::max(1, std::min(base_background_compactions, res.max_compactions));
+    res.max_compactions =
+        std::max(1, std::min(base_background_compactions, res.max_compactions));
   }
   return res;
 }
diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 566c175735a..51371b95f30 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -187,12 +187,12 @@ void DBImpl::TEST_EndWrite(void* w) {
 }
 
 size_t DBImpl::TEST_LogsToFreeSize() {
-  InstrumentedMutexLock l(&mutex_);
+  InstrumentedMutexLock l(&log_write_mutex_);
   return logs_to_free_.size();
 }
 
 uint64_t DBImpl::TEST_LogfileNumber() {
-  InstrumentedMutexLock l(&mutex_);
+  InstrumentedMutexLock l(&log_write_mutex_);
   return logfile_number_;
 }
 
diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc
index a3cc0a92d96..60c59ed28c3 100644
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@@ -177,9 +177,21 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
       }
     }
   }
+  mutex_.Unlock();
+  FindObsoleteLogFiles(job_context);
+  mutex_.Lock();
+  if (job_context->HaveSomethingToDelete()) {
+    ++pending_purge_obsolete_files_;
+    if (doing_the_full_scan) {
+      versions_->AddLiveFiles(&job_context->sst_live);
+    }
+  }
+}
 
-  // logs_ is empty when called during recovery, in which case there can't yet
-  // be any tracked obsolete logs
+void DBImpl::FindObsoleteLogFiles(JobContext* job_context) {
+  // logs_ is empty when called during recovery, in which case there can't
+  // yet be any tracked obsolete logs
+  InstrumentedMutexLock l(&log_write_mutex_);
   if (!alive_log_files_.empty() && !logs_.empty()) {
     uint64_t min_log_number = job_context->log_number;
     size_t num_alive_log_files = alive_log_files_.size();
@@ -201,13 +213,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
      }
      job_context->size_log_to_delete += earliest.size;
      total_log_size_ -= earliest.size;
-      if (two_write_queues_) {
-        log_write_mutex_.Lock();
-      }
      alive_log_files_.pop_front();
-      if (two_write_queues_) {
-        log_write_mutex_.Unlock();
-      }
+
      // Current log should always stay alive since it can't have
      // number < MinLogNumber().
      assert(alive_log_files_.size());
@@ -220,10 +227,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
        continue;
      }
      logs_to_free_.push_back(log.ReleaseWriter());
-      {
-        InstrumentedMutexLock wl(&log_write_mutex_);
-        logs_.pop_front();
-      }
+      logs_.pop_front();
    }
    // Current log cannot be obsolete.
    assert(!logs_.empty());
@@ -234,12 +238,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
   job_context->logs_to_free = logs_to_free_;
   job_context->log_recycle_files.assign(log_recycle_files_.begin(),
                                         log_recycle_files_.end());
-  if (job_context->HaveSomethingToDelete()) {
-    ++pending_purge_obsolete_files_;
-    if (doing_the_full_scan) {
-      versions_->AddLiveFiles(&job_context->sst_live);
-    }
-  }
 
   logs_to_free_.clear();
 }
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 4e3ca2f5503..88bbe9b3441 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -57,7 +57,8 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
   }
   auto bg_job_limits = DBImpl::GetBGJobLimits(
       result.max_background_flushes, result.max_background_compactions,
-      result.max_background_jobs, result.base_background_compactions, true /* parallelize_compactions */);
+      result.max_background_jobs, result.base_background_compactions,
+      true /* parallelize_compactions */);
   result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
                                            Env::Priority::LOW);
   result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
@@ -1066,7 +1067,14 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers) {
       break;
     }
     total_log_size_ += log.size;
-    alive_log_files_.push_back(log);
+    {
+      if (two_write_queues_) {
+        alive_log_files_.push_back(log);
+      } else {
+        InstrumentedMutexLock l(&log_write_mutex_);
+        alive_log_files_.push_back(log);
+      }
+    }
     // We preallocate space for logs, but then after a crash and restart, those
     // preallocated space are not needed anymore. It is likely only the last
     // log has such preallocated space, so we only truncate for the last log.
@@ -1380,14 +1388,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
             cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
       }
       sv_context.Clean();
-      if (impl->two_write_queues_) {
-        impl->log_write_mutex_.Lock();
-      }
+      impl->log_write_mutex_.Lock();
      impl->alive_log_files_.push_back(
          DBImpl::LogFileNumberSize(impl->logfile_number_));
-      if (impl->two_write_queues_) {
-        impl->log_write_mutex_.Unlock();
-      }
+      impl->log_write_mutex_.Unlock();
      impl->DeleteObsoleteFiles();
      s = impl->directories_.GetDbDir()->Fsync();
    }
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 27a98a39767..95ee823eb24 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -134,18 +134,14 @@ Status DBImpl::MultiBatchWriteImpl(const WriteOptions& write_options,
   WriteContext write_context;
   if (writer.state == WriteThread::STATE_GROUP_LEADER) {
     WriteThread::WriteGroup wal_write_group;
-    mutex_.Lock();
+    LogContext log_context;
     if (writer.callback && !writer.callback->AllowWriteBatching()) {
       WaitForPendingWrites();
     }
-    bool need_log_sync = !write_options.disableWAL && write_options.sync;
-    bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
     PERF_TIMER_STOP(write_pre_and_post_process_time);
     writer.status =
-        PreprocessWrite(write_options, &need_log_sync, &write_context);
+        PreprocessWrite(write_options, &log_context, &write_context);
     PERF_TIMER_START(write_pre_and_post_process_time);
-    log::Writer* log_writer = logs_.back().writer;
-    mutex_.Unlock();
 
     // This can set non-OK status if callback fail.
    last_batch_group_size_ =
@@ -204,18 +200,19 @@ Status DBImpl::MultiBatchWriteImpl(const WriteOptions& write_options,
        RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
      }
      writer.status =
-          WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync,
-                     need_log_dir_sync, current_sequence);
+          WriteToWAL(wal_write_group, log_context.writer, log_used,
+                     log_context.need_log_sync, log_context.need_log_dir_sync,
+                     current_sequence);
    }
  }
  if (!writer.CallbackFailed()) {
    WriteStatusCheck(writer.status);
  }
 
-  if (need_log_sync) {
-    mutex_.Lock();
-    MarkLogsSynced(logfile_number_, need_log_dir_sync, writer.status);
-    mutex_.Unlock();
+  if (log_context.need_log_sync) {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                   writer.status);
  }
  if (writer.status.ok()) {
    pending_memtable_writes_ += memtable_write_cnt;
@@ -420,14 +417,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   // when it finds suitable, and finish them in the same write batch.
   // This is how a write job could be done by the other writer.
   WriteContext write_context;
+  LogContext log_context(write_options.sync);
   WriteThread::WriteGroup write_group;
   bool in_parallel_group = false;
   uint64_t last_sequence = kMaxSequenceNumber;
 
-  mutex_.Lock();
-
-  bool need_log_sync = write_options.sync;
-  bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+  // The writer will only be used when two_write_queues_ is false.
   if (!two_write_queues_ || !disable_memtable) {
     // With concurrent writes we do preprocess only in the write thread that
     // also does write to memtable to avoid sync issue on shared data structure
@@ -436,7 +431,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // PreprocessWrite does its own perf timing.
     PERF_TIMER_STOP(write_pre_and_post_process_time);
 
-    status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+    status = PreprocessWrite(write_options, &log_context, &write_context);
     if (!two_write_queues_) {
       // Assign it after ::PreprocessWrite since the sequence might advance
       // inside it by WriteRecoverableState
@@ -445,9 +440,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     PERF_TIMER_START(write_pre_and_post_process_time);
   }
 
-  log::Writer* log_writer = logs_.back().writer;
-
-  mutex_.Unlock();
 
   // Add to log and apply to memtable. We can release the lock
   // during this phase since &w is currently responsible for logging
@@ -531,8 +523,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     if (!two_write_queues_) {
       if (status.ok() && !write_options.disableWAL) {
         PERF_TIMER_GUARD(write_wal_time);
-        status = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
-                            need_log_dir_sync, last_sequence + 1);
+        status = WriteToWAL(write_group, log_context.writer, log_used,
+                            log_context.need_log_sync,
+                            log_context.need_log_dir_sync, last_sequence + 1);
       }
     } else {
       if (status.ok() && !write_options.disableWAL) {
@@ -620,10 +613,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     WriteStatusCheck(status);
   }
 
-  if (need_log_sync) {
-    mutex_.Lock();
-    MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
-    mutex_.Unlock();
+  if (log_context.need_log_sync) {
+    log_write_mutex_.Lock();
+    MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync, status);
+    log_write_mutex_.Unlock();
     // Requesting sync with two_write_queues_ is expected to be very rare. We
     // hence provide a simple implementation that is not necessarily efficient.
    if (two_write_queues_) {
@@ -674,15 +667,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
    if (w.callback && !w.callback->AllowWriteBatching()) {
      write_thread_.WaitForMemTableWriters();
    }
-    mutex_.Lock();
-    bool need_log_sync = !write_options.disableWAL && write_options.sync;
-    bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+    LogContext log_context(!write_options.disableWAL && write_options.sync);
    // PreprocessWrite does its own perf timing.
    PERF_TIMER_STOP(write_pre_and_post_process_time);
-    w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+    w.status = PreprocessWrite(write_options, &log_context, &write_context);
    PERF_TIMER_START(write_pre_and_post_process_time);
-    log::Writer* log_writer = logs_.back().writer;
-    mutex_.Unlock();
 
    // This can set non-OK status if callback fail.
    last_batch_group_size_ =
@@ -731,18 +720,18 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
                   wal_write_group.size - 1);
        RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
      }
-      w.status = WriteToWAL(wal_write_group, log_writer, log_used,
-                            need_log_sync, need_log_dir_sync, current_sequence);
+      w.status = WriteToWAL(wal_write_group, log_context.writer, log_used,
+                            log_context.need_log_sync,
+                            log_context.need_log_dir_sync, current_sequence);
    }
 
    if (!w.CallbackFailed()) {
      WriteStatusCheck(w.status);
    }
 
-    if (need_log_sync) {
-      mutex_.Lock();
-      MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status);
-      mutex_.Unlock();
+    if (log_context.need_log_sync) {
+      InstrumentedMutexLock l(&log_write_mutex_);
+      MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync, w.status);
    }
    write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
  }
@@ -876,9 +865,8 @@ Status DBImpl::WriteImplWALOnly(
   // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
   // without paying the cost of obtaining the mutex.
   if (status.ok()) {
-    InstrumentedMutexLock l(&mutex_);
-    bool need_log_sync = false;
-    status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+    LogContext log_context;
+    status = PreprocessWrite(write_options, &log_context, &write_context);
     WriteStatusCheck(status);
   }
   if (!status.ok()) {
@@ -1042,22 +1030,22 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) {
 }
 
 Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
-                               bool* need_log_sync,
+                               LogContext* log_context,
                                WriteContext* write_context) {
-  mutex_.AssertHeld();
-  assert(write_context != nullptr && need_log_sync != nullptr);
+  assert(write_context != nullptr && log_context != nullptr);
   Status status;
 
   if (error_handler_.IsDBStopped()) {
+    InstrumentedMutexLock l(&mutex_);
    status = error_handler_.GetBGError();
  }
 
  PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
-  assert(!single_column_family_mode_ ||
-         versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
-  if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
+  if (UNLIKELY(status.ok() &&
+               !single_column_family_mode_.load(std::memory_order_acquire) &&
               total_log_size_ > GetMaxTotalWalSize())) {
+    InstrumentedMutexLock l(&mutex_);
    WaitForPendingWrites();
    status = SwitchWAL(write_context);
  }
 
@@ -1068,11 +1056,13 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
   // thread is writing to another DB with the same write buffer, they may also
   // be flushed. We may end up with flushing much more DBs than needed. It's
   // suboptimal but still correct.
+    InstrumentedMutexLock l(&mutex_);
    WaitForPendingWrites();
    status = HandleWriteBufferFull(write_context);
  }
 
  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
    WaitForPendingWrites();
    status = ScheduleFlushes(write_context);
  }
@@ -1088,11 +1078,13 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
    // for previous one. It might create a fairness issue that expiration
    // might happen for smaller writes but larger writes can go through.
    // Can optimize it if it is an issue.
+    InstrumentedMutexLock l(&mutex_);
    status = DelayWrite(last_batch_group_size_, write_options);
    PERF_TIMER_START(write_pre_and_post_process_time);
  }
 
-  if (status.ok() && *need_log_sync) {
+  InstrumentedMutexLock l(&log_write_mutex_);
+  if (status.ok() && log_context->need_log_sync) {
    // Wait until the parallel syncs are finished. Any sync process has to sync
    // the front log too so it is enough to check the status of front()
    // We do a while loop since log_sync_cv_ is signalled when any sync is
@@ -1113,8 +1105,11 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
      log.getting_synced = true;
    }
  } else {
-    *need_log_sync = false;
+    log_context->need_log_sync = false;
  }
+  log_context->writer = logs_.back().writer;
+  log_context->need_log_dir_sync =
+      log_context->need_log_dir_sync && !log_dir_synced_;
 
  return status;
}
@@ -1230,7 +1225,7 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
 
  if (status.ok() && need_log_sync) {
    StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
-    // It's safe to access logs_ with unlocked mutex_ here because:
+    // It's safe to access logs_ with unlocked log_write_mutex_ here because:
    // - we've set getting_synced=true for all logs,
    //   so other threads won't pop from logs_ while we're here,
    // - only writer thread can push to logs_, and we're in
@@ -1567,10 +1562,11 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
 }
 
 uint64_t DBImpl::GetMaxTotalWalSize() const {
-  mutex_.AssertHeld();
-  return mutable_db_options_.max_total_wal_size == 0
-             ? 4 * max_total_in_memory_state_
-             : mutable_db_options_.max_total_wal_size;
+  auto max_total_wal_size = max_total_wal_size_.load(std::memory_order_acquire);
+  if (max_total_wal_size > 0) {
+    return max_total_wal_size;
+  }
+  return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
 }
 
 // REQUIRES: mutex_ is held
@@ -1851,7 +1847,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
                  cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
   mutex_.Lock();
   if (s.ok() && creating_new_log) {
-    log_write_mutex_.Lock();
+    InstrumentedMutexLock l(&log_write_mutex_);
     assert(new_log != nullptr);
     if (!logs_.empty()) {
       // Alway flush the buffer of the last log before switching to a new one
@@ -1872,7 +1868,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
      logs_.emplace_back(logfile_number_, new_log);
      alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
    }
-    log_write_mutex_.Unlock();
  }
 
  if (!s.ok()) {
diff --git a/db/error_handler.cc b/db/error_handler.cc
index 317f0072372..208b50c34a8 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -220,6 +220,9 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reas
                                        db_mutex_, &auto_recovery);
  if (!s.ok() && (s.severity() > bg_error_.severity())) {
    bg_error_ = s;
+    if (bg_error_.severity() >= Status::Severity::kHardError) {
+      db_stopped_.store(true, std::memory_order_release);
+    }
  } else {
    // This error is less severe than previously encountered error. Don't
    // take any further action
@@ -295,6 +298,7 @@ Status ErrorHandler::ClearBGError() {
   if (recovery_error_.ok()) {
     Status old_bg_error = bg_error_;
     bg_error_ = Status::OK();
+    db_stopped_.store(false, std::memory_order_release);
     recovery_in_prog_ = false;
     EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
                                                  old_bg_error, db_mutex_);
diff --git a/db/error_handler.h b/db/error_handler.h
index c2af809fc69..5fa6ffab822 100644
--- a/db/error_handler.h
+++ b/db/error_handler.h
@@ -23,15 +23,12 @@ class ErrorHandler {
        recovery_error_(Status::OK()),
        db_mutex_(db_mutex),
        auto_recovery_(false),
-        recovery_in_prog_(false) {}
+        recovery_in_prog_(false),
+        db_stopped_(false) {}
 
  ~ErrorHandler() {}
 
  void EnableAutoRecovery() { auto_recovery_ = true; }
-  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
-                                    Status::Code code,
-                                    Status::SubCode subcode);
-
  Status SetBGError(const Status& bg_err, BackgroundErrorReason reason);
 
  Status GetBGError() { return bg_error_; }
@@ -40,15 +37,14 @@ class ErrorHandler {
 
  Status ClearBGError();
 
-  bool IsDBStopped() {
-    return !bg_error_.ok() &&
-           bg_error_.severity() >= Status::Severity::kHardError;
-  }
+  // Does not require the DB mutex to be held.
+  bool IsDBStopped() { return db_stopped_.load(std::memory_order_acquire); }
 
-  bool IsBGWorkStopped() {
-    return !bg_error_.ok() &&
-           (bg_error_.severity() >= Status::Severity::kHardError ||
-            !auto_recovery_);
+  // Requires the DB mutex to be held.
+  bool IsBGWorkStopped() {
+    return !bg_error_.ok() &&
+           (bg_error_.severity() >= Status::Severity::kHardError ||
+            !auto_recovery_);
  }
 
  bool IsRecoveryInProgress() { return recovery_in_prog_; }
@@ -67,6 +63,7 @@
  // A flag indicating whether automatic recovery from errors is enabled
  bool auto_recovery_;
  bool recovery_in_prog_;
+  std::atomic<bool> db_stopped_;
 
  Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery);
  void RecoverFromNoSpace();
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc
index 3cd82078267..adc9e791714 100644
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@@ -383,7 +383,7 @@ void ProfileQueries(bool enabled_time = false) {
   EXPECT_GT(hist_write_scheduling_time.Average(), 0);
 
 #ifndef NDEBUG
-  ASSERT_GT(total_db_mutex_nanos, 2000U);
+  ASSERT_LT(total_db_mutex_nanos, 100U);
 #endif
 }
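
Not part of the patch itself: a minimal, self-contained sketch of the locking convention the changes above rely on. It uses plain std::mutex and invented names (DbState, CurrentWalNumber, RollWal) purely for illustration, not RocksDB's actual types. WAL-only paths touch only the WAL write mutex, and any path that needs both mutexes takes the DB mutex first and the WAL write mutex second, matching the "first mutex_, then log_write_mutex_" ordering noted in db_impl.h.

#include <cstdint>
#include <deque>
#include <iostream>
#include <mutex>

// Hypothetical stand-ins for DBImpl state; names are illustrative only.
struct DbState {
  std::mutex db_mutex;         // plays the role of DBImpl::mutex_
  std::mutex wal_write_mutex;  // plays the role of DBImpl::log_write_mutex_
  std::deque<uint64_t> alive_wal_numbers;  // guarded by wal_write_mutex
  uint64_t current_wal_number = 0;         // guarded by wal_write_mutex
};

// WAL-only path: takes only the WAL write mutex, so writers and sync calls
// no longer contend on the DB mutex just to read WAL bookkeeping.
uint64_t CurrentWalNumber(DbState& db) {
  std::lock_guard<std::mutex> wal_lock(db.wal_write_mutex);
  return db.current_wal_number;
}

// Path that needs both mutexes: always the DB mutex first, then the WAL
// write mutex, so it cannot deadlock against another thread following the
// same acquisition order.
void RollWal(DbState& db) {
  std::lock_guard<std::mutex> db_lock(db.db_mutex);
  std::lock_guard<std::mutex> wal_lock(db.wal_write_mutex);
  db.alive_wal_numbers.push_back(++db.current_wal_number);
}

int main() {
  DbState db;
  RollWal(db);
  std::cout << "current WAL number: " << CurrentWalNumber(db) << "\n";
  return 0;
}

The same split explains the perf_context_test.cc change: once write preprocessing and WAL bookkeeping stop taking the DB mutex, the time attributed to db_mutex in that test drops from thousands of nanoseconds to nearly zero, so the assertion flips from ASSERT_GT to ASSERT_LT.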