Merge pull request #844 from mmichal10/engine-errors-test

Engine errors test
This commit is contained in:
Robert Baldyga 2024-10-02 16:26:00 +02:00 committed by GitHub
commit 630748a1e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 165 additions and 0 deletions

View File

@ -87,6 +87,8 @@ class Rio:
self.ios = Size(0) self.ios = Size(0)
self.io_target = 0 self.io_target = 0
self.finish_time = None self.finish_time = None
self.submitted_reads = 0
self.submitted_writes = 0
self.qd_condition = Condition() self.qd_condition = Condition()
self.qd = 0 self.qd = 0
@ -159,6 +161,12 @@ class Rio:
io.callback = self.get_io_cb() io.callback = self.get_io_cb()
self.ios += self.jobspec.bs self.ios += self.jobspec.bs
io.submit() io.submit()
if iodir is IoDir.WRITE:
self.submitted_writes += 1
if iodir is IoDir.READ:
self.submitted_reads += 1
with self.qd_condition: with self.qd_condition:
self.qd += 1 self.qd += 1
@ -172,6 +180,8 @@ class Rio:
self._threads = [] self._threads = []
self.errors = {} self.errors = {}
self.error_count = 0 self.error_count = 0
self.submitted_reads = 0
self.submitted_writes = 0
def copy(self): def copy(self):
r = copy.copy(self) r = copy.copy(self)
@ -254,6 +264,8 @@ class Rio:
thread.join() thread.join()
self.errors.update({thread.name: thread.errors}) self.errors.update({thread.name: thread.errors})
self.error_count += len(thread.errors) self.error_count += len(thread.errors)
self.submitted_reads += thread.submitted_reads
self.submitted_writes += thread.submitted_writes
self.global_jobspec.target.close() self.global_jobspec.target.close()

View File

@ -1,5 +1,6 @@
# #
# Copyright(c) 2019-2022 Intel Corporation # Copyright(c) 2019-2022 Intel Corporation
# Copyright(c) 2024 Huawei Technologies
# SPDX-License-Identifier: BSD-3-Clause # SPDX-License-Identifier: BSD-3-Clause
# #
@ -113,6 +114,10 @@ class OcfCtx:
def stop_caches(self): def stop_caches(self):
for cache in self.caches[:]: for cache in self.caches[:]:
try:
cache.get_volume().disarm()
except AttributeError:
pass
cache.stop() cache.stop()
def exit(self): def exit(self):

View File

@ -0,0 +1,148 @@
#
# Copyright(c) 2024 Huawei Technologies
# SPDX-License-Identifier: BSD-3-Clause
#
import pytest
from pyocf.types.cache import Cache, CacheMode
from pyocf.types.core import Core
from pyocf.types.volume import RamVolume, ErrorDevice
from pyocf.types.volume_core import CoreVolume
from pyocf.types.shared import CacheLineSize
from pyocf.utils import Size
from pyocf.rio import Rio, ReadWrite
# Request sizes passed to Rio's `bs()` by the parametrized tests in this file.
BLOCK_SIZES = [
    Size(512),
    Size.from_KiB(1),
    Size.from_KiB(4),
    Size.from_KiB(64),
    Size.from_KiB(256),
]
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cache_mode", [c for c in CacheMode if not c.lazy_write()])
@pytest.mark.parametrize("rio_bs", BLOCK_SIZES)
def test_strict_engine_errors(pyocf_ctx, cache_mode: CacheMode, cls: CacheLineSize, rio_bs: Size):
    """Run random I/O through a non-lazy-write cache backed by an armed ErrorDevice.

    The cache is started on an ``ErrorDevice`` wrapping a 50 MiB RAM volume,
    with every 512-byte-aligned offset of that volume listed in the error set.
    The device is armed only after the cache is started and the core is added.

    A random-read job followed by a random-write job is then run against the
    exported core volume. The test expects that:

    * cache occupancy is 0 after the workload,
    * neither Rio job reports an I/O error,
    * the ``cache_volume_wr`` error counter is at least the number of writes
      Rio submitted (no lower bound is enforced in PT mode).

    :param pyocf_ctx: pyocf context fixture
    :param cache_mode: cache mode under test (only modes with ``lazy_write()`` false)
    :param cls: cache line size the cache is started with
    :param rio_bs: block size of the Rio requests
    """
    cache_vol_size = Size.from_MiB(50)
    ram_cache_volume = RamVolume(cache_vol_size)
    error_sectors = set(range(0, cache_vol_size, 512))
    # Start unarmed so that starting the cache and adding the core run
    # before the error device is armed.
    error_device = ErrorDevice(ram_cache_volume, error_sectors, armed=False)
    core_device = RamVolume(Size.from_MiB(50))

    # Pass the parametrized cache line size; without it every `cls` variant
    # would exercise the very same (default) configuration.
    cache = Cache.start_on_device(error_device, cache_mode=cache_mode, cache_line_size=cls)
    core = Core.using_device(core_device)
    queue = cache.get_default_queue()

    cache.add_core(core)
    core_volume = CoreVolume(core)
    core_volume.open()

    error_device.reset_stats()
    error_device.arm()

    rio_size = Size.from_MiB(3) if rio_bs > Size(4096) else Size.from_MiB(1)

    read_rio_stats = (
        Rio()
        .target(core_volume)
        .njobs(1)
        .readwrite(ReadWrite.RANDREAD)
        .size(rio_size)
        .bs(rio_bs)
        .qd(16)
        .continue_on_error()
        .run([queue])
    )

    # FIXME: Get rid of the second Rio instance once real RANDRW support is
    # implemented in Rio
    write_rio_stats = (
        Rio()
        .target(core_volume)
        .njobs(1)
        .readwrite(ReadWrite.RANDWRITE)
        .size(rio_size)
        .bs(rio_bs)
        .qd(16)
        .continue_on_error()
        .run([queue])
    )

    cache.settle()
    stats = cache.get_stats()

    assert stats["usage"]["occupancy"]["value"] == 0
    assert read_rio_stats.error_count == 0
    assert write_rio_stats.error_count == 0

    if cache_mode is CacheMode.PT:
        expected_cache_write_errors = 0
    else:
        expected_cache_write_errors = write_rio_stats.submitted_writes

    actual_cache_write_errors = stats["errors"]["cache_volume_wr"]["value"]
    assert actual_cache_write_errors >= expected_cache_write_errors

    error_device.disarm()
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cache_mode", [c for c in CacheMode if c.lazy_write()])
@pytest.mark.parametrize("rio_bs", BLOCK_SIZES)
def test_lazy_engine_errors(pyocf_ctx, cache_mode: CacheMode, cls: CacheLineSize, rio_bs: Size):
    """Run random I/O through a lazy-write cache backed by an armed ErrorDevice.

    The cache is started on an ``ErrorDevice`` wrapping a 50 MiB RAM volume,
    with every 512-byte-aligned offset of that volume listed in the error set.
    The device is armed only after the cache is started and the core is added.

    A random-read job followed by a random-write job is then run against the
    exported core volume. The test expects that:

    * cache occupancy is 0 after the workload,
    * the read job reports no I/O errors,
    * the write job reports exactly one error per submitted write,
    * the ``cache_volume_wr`` error counter is at least the number of writes
      Rio submitted.

    :param pyocf_ctx: pyocf context fixture
    :param cache_mode: cache mode under test (only modes with ``lazy_write()`` true)
    :param cls: cache line size the cache is started with
    :param rio_bs: block size of the Rio requests
    """
    cache_vol_size = Size.from_MiB(50)
    ram_cache_volume = RamVolume(cache_vol_size)
    error_sectors = set(range(0, cache_vol_size, 512))
    # Start unarmed so that starting the cache and adding the core run
    # before the error device is armed.
    error_device = ErrorDevice(ram_cache_volume, error_sectors, armed=False)
    core_device = RamVolume(Size.from_MiB(50))

    # Pass the parametrized cache line size; without it every `cls` variant
    # would exercise the very same (default) configuration.
    cache = Cache.start_on_device(error_device, cache_mode=cache_mode, cache_line_size=cls)
    core = Core.using_device(core_device)
    queue = cache.get_default_queue()

    cache.add_core(core)
    core_volume = CoreVolume(core)
    core_volume.open()

    error_device.reset_stats()
    error_device.arm()

    rio_size = Size.from_MiB(3) if rio_bs > Size(4096) else Size.from_MiB(1)

    read_rio_stats = (
        Rio()
        .target(core_volume)
        .njobs(1)
        .readwrite(ReadWrite.RANDREAD)
        .size(rio_size)
        .bs(rio_bs)
        .qd(16)
        .continue_on_error()
        .run([queue])
    )

    # FIXME: Get rid of the second Rio instance once real RANDRW support is
    # implemented in Rio
    write_rio_stats = (
        Rio()
        .target(core_volume)
        .njobs(1)
        .readwrite(ReadWrite.RANDWRITE)
        .size(rio_size)
        .bs(rio_bs)
        .qd(16)
        .continue_on_error()
        .run([queue])
    )

    cache.settle()
    stats = cache.get_stats()

    assert stats["usage"]["occupancy"]["value"] == 0
    assert read_rio_stats.error_count == 0
    # In lazy-write modes each submitted write is expected to be reported to
    # the submitter as an error.
    assert write_rio_stats.error_count == write_rio_stats.submitted_writes

    expected_cache_write_errors = write_rio_stats.submitted_writes
    actual_cache_write_errors = stats["errors"]["cache_volume_wr"]["value"]
    assert actual_cache_write_errors >= expected_cache_write_errors

    error_device.disarm()