From e1d2ff12d87daaccb8b0b70ec6b34ca4c97a4ec5 Mon Sep 17 00:00:00 2001 From: Michal Mielewczyk Date: Fri, 21 Mar 2025 09:41:28 +0100 Subject: [PATCH 1/6] FIX metadata collision API If `is_valid` was NULL, OCF would crash Signed-off-by: Michal Mielewczyk --- src/metadata/metadata_status.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/metadata/metadata_status.h b/src/metadata/metadata_status.h index 1d470e6..99e6296 100644 --- a/src/metadata/metadata_status.h +++ b/src/metadata/metadata_status.h @@ -1,5 +1,6 @@ /* * Copyright(c) 2012-2021 Intel Corporation + * Copyright(c) 2025 Huawei Technologies * SPDX-License-Identifier: BSD-3-Clause */ @@ -326,15 +327,18 @@ static inline bool metadata_clear_valid_sec_changed( struct ocf_cache *cache, ocf_cache_line_t line, uint8_t start, uint8_t stop, bool *is_valid) { - bool was_any_valid; + bool was_any_valid, _line_remains_valid; was_any_valid = ocf_metadata_test_valid(cache, line, 0, ocf_line_end_sector(cache), false); - *is_valid = ocf_metadata_clear_valid(cache, line, + _line_remains_valid = ocf_metadata_clear_valid(cache, line, start, stop); - return was_any_valid && !*is_valid; + if (likely(is_valid)) + *is_valid = _line_remains_valid; + + return was_any_valid && !_line_remains_valid; } #endif /* METADATA_STATUS_H_ */ From 67eb940589024f84b22395d6032452872bfe6954 Mon Sep 17 00:00:00 2001 From: Michal Mielewczyk Date: Fri, 21 Mar 2025 10:03:54 +0100 Subject: [PATCH 2/6] Refactor metadata collision API Signed-off-by: Michal Mielewczyk --- src/metadata/metadata_status.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/metadata/metadata_status.h b/src/metadata/metadata_status.h index 99e6296..1356b7c 100644 --- a/src/metadata/metadata_status.h +++ b/src/metadata/metadata_status.h @@ -317,28 +317,28 @@ static inline void metadata_set_valid_sec_one(struct ocf_cache *cache, ocf_metadata_set_valid(cache, line, pos, pos); } /* - * Marks given cache line's bits as invalid + * Marks given cache line's sectors as invalid * - * @return true if any of the cache line's bits was valid and the cache line - * became invalid (all bits invalid) after the operation - * @return false in other cases + * @return true if line was valid and became invalid (all sectors invalid) + * @return false if line was invalid and remains invalid or + * if line was valid and still has valid sectors */ static inline bool metadata_clear_valid_sec_changed( struct ocf_cache *cache, ocf_cache_line_t line, - uint8_t start, uint8_t stop, bool *is_valid) + uint8_t start, uint8_t stop, bool *line_remains_valid) { - bool was_any_valid, _line_remains_valid; + bool line_was_valid, _line_remains_valid; - was_any_valid = ocf_metadata_test_valid(cache, line, 0, + line_was_valid = ocf_metadata_test_valid(cache, line, 0, ocf_line_end_sector(cache), false); _line_remains_valid = ocf_metadata_clear_valid(cache, line, start, stop); - if (likely(is_valid)) - *is_valid = _line_remains_valid; + if (likely(line_remains_valid != NULL)) + *line_remains_valid = _line_remains_valid; - return was_any_valid && !_line_remains_valid; + return line_was_valid && !_line_remains_valid; } #endif /* METADATA_STATUS_H_ */ From 4a01ff957df192dffda85bd4d384802c5ca4841b Mon Sep 17 00:00:00 2001 From: Michal Mielewczyk Date: Fri, 21 Mar 2025 10:18:38 +0100 Subject: [PATCH 3/6] Refactor __set_cache_line_invalid() pt.1 Better naming Signed-off-by: Michal Mielewczyk --- src/utils/utils_cache_line.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils/utils_cache_line.c b/src/utils/utils_cache_line.c index 8546953..54a16ac 100644 --- a/src/utils/utils_cache_line.c +++ b/src/utils/utils_cache_line.c @@ -1,6 +1,6 @@ /* * Copyright(c) 2012-2021 Intel Corporation - * Copyright(c) 2024 Huawei Technologies + * Copyright(c) 2024-2025 Huawei Technologies * SPDX-License-Identifier: BSD-3-Clause */ @@ -12,21 +12,21 @@ static void __set_cache_line_invalid(struct ocf_cache *cache, uint8_t start_bit, ocf_core_id_t core_id, ocf_part_id_t part_id) { ocf_core_t core; - bool is_valid, changed; + bool line_remains_valid, line_became_invalid; ENV_BUG_ON(core_id >= OCF_CORE_MAX); core = ocf_cache_get_core(cache, core_id); - changed = metadata_clear_valid_sec_changed(cache, line, - start_bit, end_bit, &is_valid); + line_became_invalid = metadata_clear_valid_sec_changed(cache, line, + start_bit, end_bit, &line_remains_valid); /* If we have waiters, do not remove cache line * for this cache line which will use one, clear * only valid bits */ - if (!is_valid && !ocf_cache_line_are_waiters( + if (!line_remains_valid && !ocf_cache_line_are_waiters( ocf_cache_line_concurrency(cache), line)) { - if (changed) { + if (line_became_invalid) { env_atomic_dec(&core->runtime_meta->cached_clines); env_atomic_dec(&core->runtime_meta-> part_counters[part_id].cached_clines); From 96e527049a52ccbfbaa16f561797120a910c93ee Mon Sep 17 00:00:00 2001 From: Michal Mielewczyk Date: Fri, 21 Mar 2025 11:51:01 +0100 Subject: [PATCH 4/6] Refactor __set_cache_line_invalid() pt.2 Get rid of nested ifs Signed-off-by: Michal Mielewczyk --- src/utils/utils_cache_line.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/utils/utils_cache_line.c b/src/utils/utils_cache_line.c index 54a16ac..8668966 100644 --- a/src/utils/utils_cache_line.c +++ b/src/utils/utils_cache_line.c @@ -20,20 +20,26 @@ static void __set_cache_line_invalid(struct ocf_cache *cache, uint8_t start_bit, line_became_invalid = metadata_clear_valid_sec_changed(cache, line, start_bit, end_bit, &line_remains_valid); - /* If we have waiters, do not remove cache line - * for this cache line which will use one, clear - * only valid bits - */ - if (!line_remains_valid && !ocf_cache_line_are_waiters( - ocf_cache_line_concurrency(cache), line)) { - if (line_became_invalid) { - env_atomic_dec(&core->runtime_meta->cached_clines); - env_atomic_dec(&core->runtime_meta-> - part_counters[part_id].cached_clines); - } - ocf_lru_rm_cline(cache, line); - ocf_metadata_remove_cache_line(cache, line); + if (line_remains_valid) { + ENV_BUG_ON(line_became_invalid); + return; } + + /* If there are any waiters, don't transfer the line to the freelist. + * The new owner will take care about the repart anyways + */ + if (ocf_cache_line_are_waiters(ocf_cache_line_concurrency(cache), line)) + return; + + ocf_lru_rm_cline(cache, line); + ocf_metadata_remove_cache_line(cache, line); + + if (!line_became_invalid) + return; + + env_atomic_dec(&core->runtime_meta->cached_clines); + env_atomic_dec(&core->runtime_meta-> + part_counters[part_id].cached_clines); } void set_cache_line_invalid(struct ocf_cache *cache, uint8_t start_bit, From 7f55116b5e51d226db606636a5c98106395d7f6a Mon Sep 17 00:00:00 2001 From: Michal Mielewczyk Date: Mon, 24 Mar 2025 10:50:36 +0100 Subject: [PATCH 5/6] pyocf: settle cache before testing occupancy The next commit will move occupancy accounting to backfill which makes testing statistics value even more time dependent. Settling cache before cache.get_stats() prevents this error-inducing race conditions Signed-off-by: Michal Mielewczyk --- tests/functional/tests/eviction/test_eviction.py | 9 ++++++++- tests/functional/tests/management/test_start_stop.py | 10 +++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/functional/tests/eviction/test_eviction.py b/tests/functional/tests/eviction/test_eviction.py index 690dcd8..1c1e688 100644 --- a/tests/functional/tests/eviction/test_eviction.py +++ b/tests/functional/tests/eviction/test_eviction.py @@ -1,10 +1,11 @@ # # Copyright(c) 2019-2022 Intel Corporation -# Copyright(c) 2024 Huawei Technologies +# Copyright(c) 2024-2025 Huawei Technologies # SPDX-License-Identifier: BSD-3-Clause # import logging +import time from math import ceil, isclose from ctypes import c_int @@ -45,6 +46,8 @@ def test_eviction_two_cores(pyocf_ctx, mode: CacheMode, cls: CacheLineSize): send_io(vol1, test_data) send_io(vol2, test_data) + cache.settle() + stats1 = core1.get_stats() stats2 = core2.get_stats() # IO to the second core should evict all the data from the first core @@ -285,6 +288,7 @@ def test_eviction_freelist(pyocf_ctx, cls: CacheLineSize, cache_mode: CacheMode, for j in range(cache_lines_written): addr = (cache_lines_written * i + j) * data.size send_io(vol, data, addr, ioclass, io_dir) + cache.settle() assert ( get_ioclass_occupancy(cache, ioclass) == expected_occpancy_4k ), f"Doesn't match for ioclass {ioclass}" @@ -298,6 +302,9 @@ def test_eviction_freelist(pyocf_ctx, cls: CacheLineSize, cache_mode: CacheMode, addr += data.size send_io(vol, data, addr, high_prio_ioclass, io_dir) + cache.settle() + time.sleep(1) + assert cache.get_stats()["usage"]["occupancy"]["value"] == cache_size_4k for ioclass in low_prio_ioclasses: diff --git a/tests/functional/tests/management/test_start_stop.py b/tests/functional/tests/management/test_start_stop.py index b72cbe3..b7be8d0 100644 --- a/tests/functional/tests/management/test_start_stop.py +++ b/tests/functional/tests/management/test_start_stop.py @@ -1,6 +1,6 @@ # # Copyright(c) 2019-2022 Intel Corporation -# Copyright(c) 2024 Huawei Technologies +# Copyright(c) 2024-2025 Huawei Technologies # SPDX-License-Identifier: BSD-3-Clause # @@ -212,7 +212,9 @@ def test_stop(pyocf_ctx, mode: CacheMode, cls: CacheLineSize, with_flush: bool): cls_no = 10 - run_io_and_cache_data_if_possible(front_vol, mode, cls, cls_no) + run_io_and_cache_data_if_possible(cache, front_vol, mode, cls, cls_no) + + cache.settle() stats = cache.get_stats() assert int(stats["conf"]["dirty"]) == ( @@ -494,7 +496,7 @@ def test_start_stop_noqueue(pyocf_ctx): assert not c.results["error"], "Failed to stop cache: {}".format(c.results["error"]) -def run_io_and_cache_data_if_possible(vol, mode, cls, cls_no): +def run_io_and_cache_data_if_possible(cache, vol, mode, cls, cls_no): queue = vol.parent.get_default_queue() test_data = Data(cls_no * cls) @@ -508,6 +510,8 @@ def run_io_and_cache_data_if_possible(vol, mode, cls, cls_no): logger.info("[STAGE] Write to exported object") io_to_core(vol, queue, test_data, 0) + cache.settle() + stats = vol.parent.cache.get_stats() assert stats["usage"]["occupancy"]["value"] == ( (cls_no * cls / CacheLineSize.LINE_4KiB) if mode != CacheMode.PT else 0 From 295b3949bc406c178d9fbfafa72a11ac89038055 Mon Sep 17 00:00:00 2001 From: Michal Mielewczyk Date: Mon, 24 Mar 2025 09:47:19 +0100 Subject: [PATCH 6/6] backfill: Update occupancy only if BF succeeded Increasing the occupancy before the backfill request has completed leads to incorrect statistics test_lazy_engine_errors() detected the bug in the following scenario: 1. Rio submitted a read request 2. The read turned out to be a full miss. After a successful read from the core device, engine_read set the metadata bits to valid and submitted a backfill request 3. In meantime, Rio submitted a write request to the same cache lines 4. Since the test uses an error device as cache (all sectors are erroneous) the backfill request failed and was redirected to engine_invalidate 5. engine_invalidate reset the metadata bits to invalid, but since cache lines had waiters registered, the occupancy wasn't decremented as there was an assumption that the new owner (in this case, the write request) would do the bookkeeping 6. Upon receiving cache line locks, the write request detected that the mapping changed so it was completed with an error without decrementing the occupancy 7. The incorrect value of cached cache lines was persisted for the whole lifetime of the cache Signed-off-by: Michal Mielewczyk --- src/engine/engine_bf.c | 25 ++++++++++++++++++++----- src/engine/engine_rd.c | 9 +-------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/engine/engine_bf.c b/src/engine/engine_bf.c index 2d80d78..219efb2 100644 --- a/src/engine/engine_bf.c +++ b/src/engine/engine_bf.c @@ -1,6 +1,6 @@ /* * Copyright(c) 2012-2022 Intel Corporation - * Copyright(c) 2024 Huawei Technologies + * Copyright(c) 2024-2025 Huawei Technologies * SPDX-License-Identifier: BSD-3-Clause */ @@ -39,6 +39,23 @@ static inline void backfill_queue_inc_block(struct ocf_cache *cache) env_atomic_set(&cache->pending_read_misses_list_blocked, 1); } +static int _ocf_backfill_update_metadata(struct ocf_request *req) +{ + ocf_cache_t cache = req->cache; + + ocf_hb_req_prot_lock_wr(req); + + ocf_set_valid_map_info(req); + + ocf_hb_req_prot_unlock_wr(req); + + ocf_req_unlock(ocf_cache_line_concurrency(cache), req); + + ocf_req_put(req); + + return 0; +} + static void _ocf_backfill_complete(struct ocf_request *req, int error) { struct ocf_cache *cache = req->cache; @@ -61,10 +78,8 @@ static void _ocf_backfill_complete(struct ocf_request *req, int error) if (error) { ocf_engine_invalidate(req); } else { - ocf_req_unlock(ocf_cache_line_concurrency(cache), req); - - /* put the request at the last point of the completion path */ - ocf_req_put(req); + ocf_queue_push_req_cb(req, _ocf_backfill_update_metadata, + OCF_QUEUE_ALLOW_SYNC | OCF_QUEUE_PRIO_HIGH); } } diff --git a/src/engine/engine_rd.c b/src/engine/engine_rd.c index bbc90cb..ac994fa 100644 --- a/src/engine/engine_rd.c +++ b/src/engine/engine_rd.c @@ -1,6 +1,6 @@ /* * Copyright(c) 2012-2022 Intel Corporation - * Copyright(c) 2024 Huawei Technologies + * Copyright(c) 2024-2025 Huawei Technologies * SPDX-License-Identifier: BSD-3-Clause */ @@ -133,13 +133,6 @@ static int _ocf_read_generic_do(struct ocf_request *req) return 0; } - - ocf_hb_req_prot_lock_wr(req); - - /* Set valid status bits map */ - ocf_set_valid_map_info(req); - - ocf_hb_req_prot_unlock_wr(req); } if (ocf_engine_needs_repart(req)) {