From bf163e9b339aa12e76fcd8fb2447b68da61fdf60 Mon Sep 17 00:00:00 2001 From: "Kaleb S. KEITHLEY" Date: Mon, 21 Mar 2022 10:15:50 -0400 Subject: [PATCH] 17.1.0 snapshot 56 Signed-off-by: Kaleb S. KEITHLEY --- 0020-src-os-bluestore-BlueFS.cc.patch | 438 -------------------------- ceph.spec | 10 +- sources | 2 +- 3 files changed, 7 insertions(+), 443 deletions(-) delete mode 100644 0020-src-os-bluestore-BlueFS.cc.patch diff --git a/0020-src-os-bluestore-BlueFS.cc.patch b/0020-src-os-bluestore-BlueFS.cc.patch deleted file mode 100644 index 49ed09a..0000000 --- a/0020-src-os-bluestore-BlueFS.cc.patch +++ /dev/null @@ -1,438 +0,0 @@ -diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in -index 7a4e581fbec..3a277418f73 100644 ---- a/src/common/options/global.yaml.in -+++ b/src/common/options/global.yaml.in -@@ -3260,6 +3260,13 @@ options: - slow shutdown is primarilyy useful for doing memory leak checking with valgrind. - default: true - with_legacy: true -+- name: osd_fast_shutdown_timeout -+ type: int -+ level: advanced -+ desc: timeout in seconds for osd fast-shutdown (0 is unlimited) -+ default: 15 -+ with_legacy: true -+ min: 0 - - name: osd_fast_shutdown_notify_mon - type: bool - level: advanced -@@ -4931,6 +4938,12 @@ options: - This setting is used only when OSD is doing ``--mkfs``. - Next runs of OSD retrieve sharding from disk. - default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P -+- name: bluestore_qfsck_on_mount -+ type: bool -+ level: dev -+ desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state -+ default: true -+ with_legacy: true - - name: bluestore_fsck_on_mount - type: bool - level: dev -diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h -index d934d092919..44d67c26e88 100644 ---- a/src/os/ObjectStore.h -+++ b/src/os/ObjectStore.h -@@ -288,7 +288,8 @@ public: - virtual bool needs_journal() = 0; //< requires a journal - virtual bool wants_journal() = 0; //< prefers a journal - virtual bool allows_journal() = 0; //< allows a journal -- -+ virtual void prepare_for_fast_shutdown() {} -+ virtual bool has_null_manager() { return false; } - // return store min allocation size, if applicable - virtual uint64_t get_min_alloc_size() const { - return 0; -diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc -index 0b9bb0bba8e..baae7c5ab2b 100644 ---- a/src/os/bluestore/BlueFS.cc -+++ b/src/os/bluestore/BlueFS.cc -@@ -2507,6 +2507,9 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback, - } - #endif - _flush_bdev(); -+ ++log.seq_live; -+ dirty.seq_live = log.seq_live; -+ log.t.seq = log.seq_live; - - super.memorized_layout = layout; - super.log_fnode = log_file->fnode; -diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc -index d1a0fe4897c..86062f290f0 100644 ---- a/src/os/bluestore/BlueStore.cc -+++ b/src/os/bluestore/BlueStore.cc -@@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num) - } - } - -+//--------------------------------------------- -+bool BlueStore::has_null_manager() -+{ -+ return (fm && fm->is_null_manager()); -+} -+ - int BlueStore::_mount() - { - dout(5) << __func__ << "NCB:: path " << path << dendl; -+ - _kv_only = false; - if (cct->_conf->bluestore_fsck_on_mount) { - dout(5) << __func__ << "::NCB::calling fsck()" << dendl; -@@ -7681,12 +7688,15 @@ int BlueStore::umount() - #endif - dout(20) << __func__ << " stopping kv thread" << dendl; - _kv_stop(); -- _shutdown_cache(); -+ // skip cache cleanup step on fast shutdown -+ if (likely(!m_fast_shutdown)) { -+ _shutdown_cache(); -+ } - dout(20) << __func__ << " closing" << dendl; - } -- - _close_db_and_around(); -- if (cct->_conf->bluestore_fsck_on_umount) { -+ // disable fsck on fast-shutdown -+ if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) { - int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep); - if (rc < 0) - return rc; -@@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node( - return 0; - } - -+void BlueStore::prepare_for_fast_shutdown() -+{ -+ m_fast_shutdown = true; -+} -+ - int BlueStore::get_devices(set *ls) - { - if (bdev) { -@@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, - string key_prefix; - _key_encode_u64(pool_id, &key_prefix); - *out_per_pool_omap = per_pool_omap != OMAP_BULK; -- if (*out_per_pool_omap) { -+ // stop calls after db was closed -+ if (*out_per_pool_omap && db) { - auto prefix = per_pool_omap == OMAP_PER_POOL ? - PREFIX_PERPOOL_OMAP : - PREFIX_PERPG_OMAP; -@@ -18344,11 +18360,10 @@ int BlueStore::store_allocator(Allocator* src_allocator) - return -1; - } - } -- -+ bluefs->compact_log(); - // reuse previous file-allocation if exists - ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr); - bool overwrite_file = (ret == 0); -- //derr << __func__ << "bluefs->open_for_write(" << overwrite_file << ")" << dendl; - BlueFS::FileWriter *p_handle = nullptr; - ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file); - if (ret != 0) { -@@ -18358,8 +18373,9 @@ int BlueStore::store_allocator(Allocator* src_allocator) - - uint64_t file_size = p_handle->file->fnode.size; - uint64_t allocated = p_handle->file->fnode.get_allocated(); -- dout(5) << "file_size=" << file_size << ", allocated=" << allocated << dendl; -+ dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl; - -+ bluefs->sync_metadata(false); - unique_ptr allocator(clone_allocator_without_bluefs(src_allocator)); - if (!allocator) { - bluefs->close_writer(p_handle); -@@ -18431,12 +18447,11 @@ int BlueStore::store_allocator(Allocator* src_allocator) - bluefs->fsync(p_handle); - - utime_t duration = ceph_clock_now() - start_time; -- dout(5) <<"WRITE-extent_count=" << extent_count << ", file_size=" << p_handle->file->fnode.size << dendl; -+ dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl; - dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl; - - bluefs->close_writer(p_handle); - need_to_destage_allocation_file = false; -- dout(10) << "need_to_destage_allocation_file was clear" << dendl; - return 0; - } - -@@ -18628,7 +18643,7 @@ int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t - utime_t duration = ceph_clock_now() - start_time; - dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= " - << read_alloc_size << ", file_size=" << file_size << dendl; -- dout(5) << "READ duration=" << duration << " seconds, s_serial=" << s_serial << dendl; -+ dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl; - *num = extent_count; - *bytes = read_alloc_size; - return 0; -@@ -18923,7 +18938,7 @@ int BlueStore::read_allocation_from_drive_on_startup() - - utime_t start = ceph_clock_now(); - read_alloc_stats_t stats = {}; -- SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size)); -+ SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size)); - ret = reconstruct_allocations(&sbmap, stats); - if (ret != 0) { - return ret; -@@ -19025,15 +19040,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t - return 0; - } else { - derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl; -- std::cout << "===================================================================" << std::endl; -- for (uint64_t i = 0; i < idx1; i++) { -- std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl; -- } -- -- std::cout << "===================================================================" << std::endl; -- for (uint64_t i = 0; i < idx2; i++) { -- std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl; -- } - return -1; - } - } -@@ -19081,9 +19087,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() - utime_t start = ceph_clock_now(); - - auto shutdown_cache = make_scope_guard([&] { -- std::cout << "Allocation Recovery was completed in " << duration -- << " seconds; insert_count=" << stats.insert_count -- << "; extent_count=" << stats.extent_count << std::endl; -+ dout(1) << "Allocation Recovery was completed in " << duration -+ << " seconds; insert_count=" << stats.insert_count -+ << "; extent_count=" << stats.extent_count << dendl; - _shutdown_cache(); - _close_db_and_around(); - }); -@@ -19092,7 +19098,7 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() - auto allocator = unique_ptr(create_bitmap_allocator(bdev->get_size())); - //reconstruct allocations into a temp simple-bitmap and copy into allocator - { -- SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size)); -+ SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size)); - ret = reconstruct_allocations(&sbmap, stats); - if (ret != 0) { - return ret; -@@ -19113,14 +19119,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() - }; - allocator->dump(count_entries); - ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target); -- if (ret != 0) { -+ if (ret == 0) { - dout(5) << "Allocator drive - file integrity check OK" << dendl; - } else { - derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl; - } - } - -- std::cout << stats << std::endl; -+ dout(1) << stats << dendl; - return ret; - } - -diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h -index 72cfc2d076b..0f804595ebb 100644 ---- a/src/os/bluestore/BlueStore.h -+++ b/src/os/bluestore/BlueStore.h -@@ -2764,7 +2764,7 @@ public: - - private: - int32_t ondisk_format = 0; ///< value detected on mount -- -+ bool m_fast_shutdown = false; - int _upgrade_super(); ///< upgrade (called during open_super) - uint64_t _get_ondisk_reserved() const; - void _prepare_ondisk_format_super(KeyValueDB::Transaction& t); -@@ -2783,6 +2783,9 @@ public: - bool wants_journal() override { return false; }; - bool allows_journal() override { return false; }; - -+ void prepare_for_fast_shutdown() override; -+ virtual bool has_null_manager(); -+ - uint64_t get_min_alloc_size() const override { - return min_alloc_size; - } -diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc -index 7658fb59911..6def6621c1e 100644 ---- a/src/osd/OSD.cc -+++ b/src/osd/OSD.cc -@@ -4245,27 +4245,44 @@ PerfCounters* OSD::create_recoverystate_perf() - - int OSD::shutdown() - { -+ // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here! -+ //cct->_conf->osd_fast_shutdown = true; -+ -+ dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = " -+ << cct->_conf->osd_fast_shutdown -+ << ", null-fm = " << store->has_null_manager() << dendl; -+ -+ utime_t start_time_func = ceph_clock_now(); -+ - if (cct->_conf->osd_fast_shutdown) { - derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl; - if (cct->_conf->osd_fast_shutdown_notify_mon) - service.prepare_to_stop(); -- cct->_log->flush(); -- _exit(0); -- } - -- if (!service.prepare_to_stop()) -+ // There is no state we need to keep wehn running in NULL-FM moode -+ if (!store->has_null_manager()) { -+ cct->_log->flush(); -+ _exit(0); -+ } -+ } else if (!service.prepare_to_stop()) { - return 0; // already shutting down -+ } -+ - osd_lock.lock(); - if (is_stopping()) { - osd_lock.unlock(); - return 0; - } -- dout(0) << "shutdown" << dendl; - -+ if (!cct->_conf->osd_fast_shutdown) { -+ dout(0) << "shutdown" << dendl; -+ } -+ -+ // don't accept new task for this OSD - set_state(STATE_STOPPING); - -- // Debugging -- if (cct->_conf.get_val("osd_debug_shutdown")) { -+ // Disabled debugging during fast-shutdown -+ if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val("osd_debug_shutdown")) { - cct->_conf.set_val("debug_osd", "100"); - cct->_conf.set_val("debug_journal", "100"); - cct->_conf.set_val("debug_filestore", "100"); -@@ -4274,6 +4291,45 @@ int OSD::shutdown() - cct->_conf.apply_changes(nullptr); - } - -+ if (cct->_conf->osd_fast_shutdown) { -+ // first, stop new task from being taken from op_shardedwq -+ // and clear all pending tasks -+ op_shardedwq.stop_for_fast_shutdown(); -+ -+ utime_t start_time_timer = ceph_clock_now(); -+ tick_timer.shutdown(); -+ { -+ std::lock_guard l(tick_timer_lock); -+ tick_timer_without_osd_lock.shutdown(); -+ } -+ -+ osd_lock.unlock(); -+ utime_t start_time_osd_drain = ceph_clock_now(); -+ -+ // then, wait on osd_op_tp to drain (TBD: should probably add a timeout) -+ osd_op_tp.drain(); -+ osd_op_tp.stop(); -+ -+ utime_t start_time_umount = ceph_clock_now(); -+ store->prepare_for_fast_shutdown(); -+ std::lock_guard lock(osd_lock); -+ // TBD: assert in allocator that nothing is being add -+ store->umount(); -+ -+ utime_t end_time = ceph_clock_now(); -+ if (cct->_conf->osd_fast_shutdown_timeout) { -+ ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout); -+ } -+ dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl; -+ dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl; -+ dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl; -+ dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl; -+ cct->_log->flush(); -+ -+ // now it is safe to exit -+ _exit(0); -+ } -+ - // stop MgrClient earlier as it's more like an internal consumer of OSD - mgrc.shutdown(); - -@@ -4435,6 +4491,9 @@ int OSD::shutdown() - hb_front_server_messenger->shutdown(); - hb_back_server_messenger->shutdown(); - -+ utime_t duration = ceph_clock_now() - start_time_func; -+ dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl; -+ - tracing::osd::tracer.shutdown(); - - return r; -@@ -11058,6 +11117,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb) - } - - void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { -+ if (unlikely(m_fast_shutdown) ) { -+ // stop enqueing when we are in the middle of a fast shutdown -+ return; -+ } -+ - uint32_t shard_index = - item.get_ordering_token().hash_to_shard(osd->shards.size()); - -@@ -11088,6 +11152,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { - - void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) - { -+ if (unlikely(m_fast_shutdown) ) { -+ // stop enqueing when we are in the middle of a fast shutdown -+ return; -+ } -+ - auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); - auto& sdata = osd->shards[shard_index]; - ceph_assert(sdata); -@@ -11114,6 +11183,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) - sdata->sdata_cond.notify_one(); - } - -+void OSD::ShardedOpWQ::stop_for_fast_shutdown() -+{ -+ uint32_t shard_index = 0; -+ m_fast_shutdown = true; -+ -+ for (; shard_index < osd->num_shards; shard_index++) { -+ auto& sdata = osd->shards[shard_index]; -+ ceph_assert(sdata); -+ sdata->shard_lock.lock(); -+ int work_count = 0; -+ while(! sdata->scheduler->empty() ) { -+ auto work_item = sdata->scheduler->dequeue(); -+ work_count++; -+ } -+ sdata->shard_lock.unlock(); -+ } -+} -+ - namespace ceph::osd_cmds { - - int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, -diff --git a/src/osd/OSD.h b/src/osd/OSD.h -index 30d0b0b4aef..2da5de10aa6 100644 ---- a/src/osd/OSD.h -+++ b/src/osd/OSD.h -@@ -1592,7 +1592,7 @@ protected: - : public ShardedThreadPool::ShardedWQ - { - OSD *osd; -- -+ bool m_fast_shutdown = false; - public: - ShardedOpWQ(OSD *o, - ceph::timespan ti, -@@ -1610,6 +1610,8 @@ protected: - /// try to do some work - void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override; - -+ void stop_for_fast_shutdown(); -+ - /// enqueue a new item - void _enqueue(OpSchedulerItem&& item) override; - diff --git a/ceph.spec b/ceph.spec index d8ef8c6..b4b7b05 100644 --- a/ceph.spec +++ b/ceph.spec @@ -151,7 +151,7 @@ ################################################################################# Name: ceph Version: 17.1.0 -Release: 0.4.31.g1ccf6db7%{?dist} +Release: 0.5.56.g60fdd357%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 %endif @@ -169,7 +169,7 @@ Group: System/Filesystems URL: http://ceph.com/ #Source0: https://download.ceph.com/tarballs/ceph-%{version}.tar.gz #Source0: https://1.chacra.ceph.com/r/ceph/quincy/... -Source0: ceph-17.1.0-31-g1ccf6db7.tar.bz2 +Source0: ceph-17.1.0-56-g60fdd357.tar.gz Patch0001: 0001-src-common-crc32c_intel_fast.patch Patch0003: 0003-src-common-bitstr.h.patch Patch0008: 0008-cmake-modules-Finduring.cmake.patch @@ -180,7 +180,6 @@ Patch0016: 0016-src-tracing-patch Patch0017: 0017-gcc-12-omnibus.patch Patch0018: 0018-src-rgw-store-dbstore-CMakeLists.txt.patch Patch0019: 0019-cmake-modules-CheckCxxAtomic.cmake.patch -Patch0020: 0020-src-os-bluestore-BlueFS.cc.patch # ceph 14.0.1 does not support 32-bit architectures, bugs #1727788, #1727787 ExcludeArch: i686 armv7hl %if 0%{?suse_version} @@ -1256,7 +1255,7 @@ This package provides Ceph default alerts for Prometheus. # common ################################################################################# %prep -%autosetup -p1 -n ceph-17.1.0-31-g1ccf6db7 +%autosetup -p1 -n ceph-17.1.0-56-g60fdd357 %build # Disable lto on systems that do not support symver attribute @@ -2548,6 +2547,9 @@ exit 0 %config %{_sysconfdir}/prometheus/ceph/ceph_default_alerts.yml %changelog +* Mon Mar 21 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.5.56-g60fdd357 +- 17.1.0 snapshot 56 + * Thu Mar 17 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.4.31-g1ccf6db7 - 17.1.0 snapshot 31 plus rhbz#2064219 (ceph #53266, #54561) diff --git a/sources b/sources index c6c9df3..90406c3 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (ceph-17.1.0-31-g1ccf6db7.tar.bz2) = 008f7c58639c2a2f074a5971ba5b84ca3b3397d6799691c8587c96277cf352218f405a147a5b90c037af54c0a9b3eaec60053a051fe2d345f9a1de0c46538959 +SHA512 (ceph-17.1.0-56-g60fdd357.tar.gz) = 42f1548089ad3c9e8f5eef8e01906ed96c3dac29187a9a8a38c75e41ad8b459a6be21b85c2b6ef9989d44cf2d0ef0b4433b9a5aa99655643bd78d75700f6dcb5