Merge pull request #425 from katlapinka/recovery-tests

Add new recovery tests for cache modes with lazy writes
This commit is contained in:
Robert Baldyga 2020-06-09 14:55:32 +02:00 committed by GitHub
commit 281c57fae5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 528 additions and 1 deletion

@ -1 +1 @@
Subproject commit a3757d106247bbc2c7b9fa6742f9dbbbeb3c24ea Subproject commit 308b014e2b6f7b9eaac6217270d6c9f6b7d4e208

View File

@ -0,0 +1,51 @@
from datetime import timedelta
from core.test_run import TestRun
from test_tools import fs_utils
from test_tools.dd import Dd
from test_utils import os_utils
from test_utils.filesystem.file import File
from test_utils.size import Size, Unit
def create_test_files(test_file_size):
    """
    Create the source/target file pair used by the recovery tests.

    The source file is produced by fs_utils.create_random_test_file() at
    "/tmp/source_test_file" with the requested size; the target file is created with
    File.create_file() at "/tmp/target_test_file".

    :param test_file_size: size forwarded to fs_utils.create_random_test_file()
    :return: tuple (source_file, target_file)
    """
    # Tuple elements are evaluated left to right, so the source file is created first.
    return (
        fs_utils.create_random_test_file("/tmp/source_test_file", test_file_size),
        File.create_file("/tmp/target_test_file"),
    )
def copy_file(source, target, size, direct=None):
    """
    Copy data from `source` to `target` with dd, one Unit.Blocks4096 block at a time.

    :param source: dd input path
    :param target: dd output path
    :param size: amount of data to copy; its value in Unit.Blocks4096 is truncated to int
        and used as the dd block count
    :param direct: "oflag" or "iflag" to request the "direct" flag on the output or input
        side respectively; any other value leaves both flags untouched
    """
    copy_cmd = (
        Dd()
        .input(source)
        .output(target)
        .block_size(Size(1, Unit.Blocks4096))
        .count(int(size.get_value(Unit.Blocks4096)))
    )

    # The accepted values of `direct` are exactly the names of the Dd flag setters.
    if direct in ("oflag", "iflag"):
        getattr(copy_cmd, direct)("direct")

    copy_cmd.run()
def compare_files(file1, file2, should_differ=False):
    """
    Compare md5 checksums of two files and fail the test on an unexpected outcome.

    :param file1: object exposing md5sum()
    :param file2: object exposing md5sum()
    :param should_differ: when False (default) the checksums are expected to match,
        when True they are expected to differ
    """
    checksums_differ = file1.md5sum() != file2.md5sum()

    # Expectation met: nothing to report.
    if not (should_differ ^ checksums_differ):
        return

    if should_differ:
        TestRun.fail("Source and target file checksums are identical.")
    else:
        TestRun.fail("Source and target file checksums are different.")
def power_cycle_dut(wait_for_flush_begin=False, core_device=None):
    """
    Power-cycle the DUT through the 'power_control' plugin.

    :param wait_for_flush_begin: when True, first wait until the `sectors_written` counter
        of `core_device` grows above the value sampled on entry
    :param core_device: device whose I/O statistics are polled; required (truthy) when
        `wait_for_flush_begin` is True
    :raises Exception: if waiting was requested but no core device was given
    """
    if wait_for_flush_begin and not core_device:
        raise Exception("Core device is None.")

    if wait_for_flush_begin:
        TestRun.LOGGER.info("Waiting for flushing to begin...")
        baseline_sectors = core_device.get_io_stats().sectors_written

        def core_received_writes():
            return baseline_sectors < core_device.get_io_stats().sectors_written

        # Presumably (predicate, timeout, polling interval) - confirm against os_utils.wait.
        # NOTE(review): the value returned by os_utils.wait is not checked here, so the power
        # cycle goes ahead either way - confirm whether wait raises on timeout.
        os_utils.wait(core_received_writes,
                      timedelta(minutes=3),
                      timedelta(milliseconds=100))

    TestRun.plugin_manager.get_plugin('power_control').power_cycle()

View File

@ -0,0 +1,125 @@
#
# Copyright(c) 2019-2020 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause-Clear
#
import os
import pytest
from api.cas import casadm
from api.cas.cache_config import CacheMode, CacheModeTrait, CacheLineSize, CleaningPolicy, \
FlushParametersAcp
from core.test_run import TestRun
from storage_devices.disk import DiskTypeSet, DiskType, DiskTypeLowerThan
from test_tools.disk_utils import Filesystem
from test_tools.fio.fio import Fio
from test_tools.fio.fio_param import IoEngine, ReadWrite
from test_utils import os_utils
from test_utils.filesystem.file import File
from test_utils.os_utils import DropCachesMode
from test_utils.size import Size, Unit
from test_utils.time import Time
from tests.lazy_writes.recovery.recovery_tests_methods import power_cycle_dut
# Size of the file that file_operation() asks fio to process.
test_file_size = Size(300, Unit.MebiByte)
# Directory used as the mount point for the core device and for the CAS device.
mount_point = "/mnt"
# Name of the fio test file placed under mount_point.
filename = "fio_test_file"
# fio pattern used for the reference file and for the write done through the CAS device.
pattern = "0xabcd"
# fio pattern written to the file on the core device before the cache is started.
other_pattern = "0x0000"
@pytest.mark.require_disk("cache", DiskTypeSet([DiskType.optane, DiskType.nand]))
@pytest.mark.require_disk("core", DiskTypeLowerThan("cache"))
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cleaning_policy", CleaningPolicy)
@pytest.mark.parametrize("cache_line_size", CacheLineSize)
@pytest.mark.parametrize("filesystem", Filesystem)
@pytest.mark.require_plugin("power_control")
def test_recovery_all_options(cache_mode, cache_line_size, cleaning_policy, filesystem):
    """
        title: Test for recovery after reset with various cache options.
        description: Verify that unflushed data can be safely recovered after reset.
        pass_criteria:
          - CAS recovers successfully after reboot
          - No data corruption
    """
    with TestRun.step("Prepare cache and core devices."):
        cache_disk = TestRun.disks['cache']
        core_disk = TestRun.disks['core']
        cache_disk.create_partitions([Size(200, Unit.MebiByte)])
        core_disk.create_partitions([Size(2000, Unit.MebiByte)] * 2)
        cache_device = cache_disk.partitions[0]
        core_device = core_disk.partitions[0]
        # Remember the /dev/disk/by-id links so device paths can be re-resolved after the
        # power cycle (presumably because device node names may change across a reboot).
        core_device_link = core_device.get_device_link("/dev/disk/by-id")
        cache_device_link = cache_device.get_device_link("/dev/disk/by-id")

        # Reference checksum: a file filled with `pattern`, written at the mount point path
        # before the core device is mounted there.
        test_file = File(os.path.join(mount_point, filename))
        file_operation(test_file.full_path, pattern, ReadWrite.write)
        file_md5 = test_file.md5sum()

    with TestRun.step(f"Make {filesystem} on core device."):
        core_device.create_filesystem(filesystem)

    with TestRun.step("Mount core device."):
        core_device.mount(mount_point)
        # Put different content (`other_pattern`) on the core itself, so the final checksum
        # can only match if the data written through the cache reached the core.
        file_operation(test_file.full_path, other_pattern, ReadWrite.write)
        os_utils.drop_caches(DropCachesMode.ALL)

    with TestRun.step("Unmount core device."):
        core_device.unmount()

    with TestRun.step(f"Start cache in {cache_mode.name} with given configuration."):
        cache = casadm.start_cache(cache_device, cache_mode, cache_line_size, force=True)
        cache.set_cleaning_policy(cleaning_policy)
        if cleaning_policy == CleaningPolicy.acp:
            cache.set_params_acp(FlushParametersAcp(wake_up_time=Time(seconds=1)))

    with TestRun.step("Add core."):
        core = cache.add_core(core_device)

    with TestRun.step("Mount CAS device."):
        core.mount(mount_point)
        file_operation(test_file.full_path, pattern, ReadWrite.write)

    with TestRun.step("Change cache mode to Write-Through without flush option."):
        cache.set_cache_mode(CacheMode.WT, flush=False)

    with TestRun.step("Reset platform."):
        os_utils.sync()
        core.unmount()
        TestRun.LOGGER.info(f"Number of dirty blocks in cache: {cache.get_dirty_blocks()}")
        power_cycle_dut()
        # Re-resolve device paths from the persistent links recorded before the reset.
        cache_device.full_path = cache_device_link.get_target()
        core_device.full_path = core_device_link.get_target()

    with TestRun.step("Try to start cache without load and force option."):
        # Only the start attempt belongs in the try body. Reporting the failure from the
        # `else` clause guarantees the broad `except Exception` below can never swallow it.
        try:
            casadm.start_cache(cache_device, cache_mode, cache_line_size)
        except Exception:
            TestRun.LOGGER.info("Cache did not start without load and force option.")
        else:
            TestRun.fail("Cache started without load or force option.")

    with TestRun.step("Load cache and stop it with flush."):
        cache = casadm.load_cache(cache_device)
        cache.stop()

    with TestRun.step("Check md5sum of tested file on core device."):
        core_device.mount(mount_point)
        cas_md5 = test_file.md5sum()
        core_device.unmount()
        if cas_md5 == file_md5:
            TestRun.LOGGER.info("Source and target file checksums are identical.")
        else:
            TestRun.fail("Source and target file checksums are different.")
def file_operation(target_path, data_pattern, io_pattern):
    """
    Run a single fio job against `target_path`.

    The job uses the libaio engine, direct I/O, a block size of one Unit.Blocks4096 and the
    module-level `test_file_size`; `do_verify` is set to 0.

    :param target_path: path of the file fio operates on
    :param data_pattern: pattern passed to fio's verification_with_pattern()
    :param io_pattern: ReadWrite value selecting the I/O direction
    """
    # Each builder call is re-bound so the calls are applied in the original order on
    # whatever object the previous call returned.
    job = Fio().create_command()
    job = job.target(target_path).io_engine(IoEngine.libaio).size(test_file_size)
    job = job.read_write(io_pattern).block_size(Size(1, Unit.Blocks4096))
    job = job.verification_with_pattern(data_pattern).direct().set_param("do_verify", 0)
    job.run()

View File

@ -0,0 +1,186 @@
#
# Copyright(c) 2019-2020 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause-Clear
#
import os
import pytest
from api.cas import casadm, cli
from api.cas.cache_config import CacheMode, CacheModeTrait, CleaningPolicy, SeqCutOffPolicy
from core.test_run import TestRun
from storage_devices.disk import DiskTypeSet, DiskType, DiskTypeLowerThan
from test_tools.dd import Dd
from test_tools.disk_utils import Filesystem
from test_utils import os_utils
from test_utils.os_utils import Udev
from test_utils.output import CmdException
from test_utils.size import Size, Unit
from tests.lazy_writes.recovery.recovery_tests_methods import create_test_files, copy_file, \
compare_files, power_cycle_dut
# Directory where the CAS device / core device is mounted in the filesystem test.
mount_point = "/mnt"
# Size of the random source file that is copied through the cache.
test_file_size = Size(1.5, Unit.GibiByte)
@pytest.mark.require_disk("cache", DiskTypeSet([DiskType.optane, DiskType.nand]))
@pytest.mark.require_disk("core", DiskTypeLowerThan("cache"))
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.require_plugin("power_control")
def test_recovery_flush_reset_raw(cache_mode):
    """
        title: Recovery after reset during cache flushing - test on raw device.
        description: |
          Verify that unflushed data can be safely recovered, when reset was pressed during
          data flushing on raw device.
        pass_criteria:
          - CAS recovers successfully after reboot
          - No data corruption
    """
    with TestRun.step("Prepare cache and core devices."):
        cache_drive = TestRun.disks['cache']
        core_drive = TestRun.disks['core']
        cache_drive.create_partitions([Size(2, Unit.GibiByte)])
        core_drive.create_partitions([Size(16, Unit.GibiByte)] * 2)
        cache_part = cache_drive.partitions[0]
        core_part = core_drive.partitions[0]
        # Persistent links, used to re-resolve device paths after the hard reset.
        core_link = core_part.get_device_link("/dev/disk/by-id")
        cache_link = cache_part.get_device_link("/dev/disk/by-id")

    with TestRun.step("Create test files."):
        src_file, dst_file = create_test_files(test_file_size)

    with TestRun.step("Setup cache and add core."):
        cache = casadm.start_cache(cache_part, cache_mode)
        core = cache.add_core(core_part)
        cache.set_cleaning_policy(CleaningPolicy.nop)
        cache.set_seq_cutoff_policy(SeqCutOffPolicy.never)

    with TestRun.step("Copy file to CAS."):
        copy_file(src_file.full_path, core.system_path, test_file_size, "oflag")

    with TestRun.step("Sync and flush buffers."):
        os_utils.sync()
        hdparm_cmd = f"hdparm -f {core.system_path}"
        hdparm_result = TestRun.executor.run(hdparm_cmd)
        if hdparm_result.exit_code != 0:
            raise CmdException("Error during hdparm", hdparm_result)

    with TestRun.step("Trigger flush."):
        # Run in background so the reset below can happen while the flush is in progress.
        flush_cmd = cli.flush_cache_cmd(f"{cache.cache_id}")
        TestRun.executor.run_in_background(flush_cmd)

    with TestRun.step("Hard reset DUT during data flushing."):
        power_cycle_dut(True, core_part)
        cache_part.full_path = cache_link.get_target()
        core_part.full_path = core_link.get_target()

    with TestRun.step(
        "Copy file from core and check if current md5sum is different than before restart."
    ):
        copy_file(core_link.get_target(), dst_file.full_path, test_file_size, "iflag")
        compare_files(src_file, dst_file, should_differ=True)

    with TestRun.step("Load cache."):
        cache = casadm.load_cache(cache_part)
        dirty = cache.get_dirty_blocks()
        if dirty == Size.zero():
            TestRun.fail("There are no dirty blocks on cache device.")

    with TestRun.step("Stop cache with dirty data flush."):
        sectors_before = core_part.get_io_stats().sectors_written
        cache.stop()
        # Stopping the cache must have written something to the core device.
        if sectors_before >= core_part.get_io_stats().sectors_written:
            TestRun.fail("No data was flushed after stopping cache started with load option.")

    with TestRun.step(
        "Copy test file from core device to temporary location. "
        "Compare it with the first version they should be the same."
    ):
        copy_file(core_link.get_target(), dst_file.full_path, test_file_size, "iflag")
        compare_files(src_file, dst_file)

    with TestRun.step("Cleanup core device and remove test files."):
        dst_file.remove()
        src_file.remove()
@pytest.mark.require_disk("cache", DiskTypeSet([DiskType.optane, DiskType.nand]))
@pytest.mark.require_disk("core", DiskTypeLowerThan("cache"))
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("fs", [Filesystem.xfs, Filesystem.ext4])
@pytest.mark.require_plugin("power_control")
def test_recovery_flush_reset_fs(cache_mode, fs):
    """
        title: Recovery after reset during cache flushing - test on filesystem.
        description: |
          Verify that unflushed data can be safely recovered, when reset was pressed during
          data flushing on filesystem.
        pass_criteria:
          - CAS recovers successfully after reboot
          - No data corruption
    """
    with TestRun.step("Prepare cache and core devices."):
        cache_drive = TestRun.disks['cache']
        core_drive = TestRun.disks['core']
        cache_drive.create_partitions([Size(2, Unit.GibiByte)])
        core_drive.create_partitions([Size(16, Unit.GibiByte)] * 2)
        cache_part = cache_drive.partitions[0]
        core_part = core_drive.partitions[0]
        # Persistent links, used to re-resolve device paths after the hard reset.
        core_link = core_part.get_device_link("/dev/disk/by-id")
        cache_link = cache_part.get_device_link("/dev/disk/by-id")

    with TestRun.step(f"Create {fs} filesystem on core."):
        core_part.create_filesystem(fs)

    with TestRun.step("Create test files."):
        src_file, dst_file = create_test_files(test_file_size)

    with TestRun.step("Setup cache and add core."):
        cache = casadm.start_cache(cache_part, cache_mode)
        # Udev is disabled before the core is added and re-enabled in the last step.
        Udev.disable()
        core = cache.add_core(core_part)
        cache.set_cleaning_policy(CleaningPolicy.nop)
        cache.set_seq_cutoff_policy(SeqCutOffPolicy.never)

    with TestRun.step("Mount CAS device."):
        core.mount(mount_point)

    with TestRun.step("Copy file to CAS."):
        copy_file(src_file.full_path,
                  os.path.join(mount_point, "source_test_file"),
                  test_file_size,
                  "oflag")

    with TestRun.step("Unmount CAS device."):
        core.unmount()

    with TestRun.step("Trigger flush."):
        # Run in background so the reset below can happen while the flush is in progress.
        flush_cmd = cli.flush_cache_cmd(f"{cache.cache_id}")
        TestRun.executor.run_in_background(flush_cmd)

    with TestRun.step("Hard reset DUT during data flushing."):
        power_cycle_dut(wait_for_flush_begin=True, core_device=core_part)
        cache_part.full_path = cache_link.get_target()
        core_part.full_path = core_link.get_target()

    with TestRun.step("Load cache."):
        cache = casadm.load_cache(cache_part)
        dirty = cache.get_dirty_blocks()
        if dirty == Size.zero():
            TestRun.fail("There are no dirty blocks on cache device.")

    with TestRun.step("Stop cache with dirty data flush."):
        sectors_before = core_part.get_io_stats().sectors_written
        cache.stop()
        # Stopping the cache must have written something to the core device.
        if sectors_before >= core_part.get_io_stats().sectors_written:
            TestRun.fail("No data was flushed after stopping cache started with load option.")

    with TestRun.step("Mount core device."):
        core_part.mount(mount_point)

    with TestRun.step(
        "Copy test file from core device to temporary location. "
        "Compare it with the first version they should be the same."
    ):
        copy_file(os.path.join(mount_point, "source_test_file"),
                  dst_file.full_path,
                  test_file_size,
                  "iflag")
        compare_files(src_file, dst_file)

    with TestRun.step("Unmount core device and remove test files."):
        core_part.unmount()
        dst_file.remove()
        src_file.remove()
        Udev.enable()

View File

@ -0,0 +1,165 @@
#
# Copyright(c) 2019-2020 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause-Clear
#
import os
import pytest
from api.cas import casadm
from api.cas.cache_config import CacheMode, CacheModeTrait, CacheLineSize
from core.test_run import TestRun
from storage_devices.disk import DiskTypeSet, DiskType, DiskTypeLowerThan
from test_tools.dd import Dd
from test_tools.disk_utils import Filesystem
from test_utils.size import Size, Unit
from tests.lazy_writes.recovery.recovery_tests_methods import create_test_files, copy_file, \
compare_files
# Size of the random source file that is copied through the cache.
test_file_size = Size(0.5, Unit.GibiByte)
# Directory where the CAS device / core device is mounted in the filesystem test.
mount_point = "/mnt"
# Location of the copied file on the mounted filesystem.
test_file_path = os.path.join(mount_point, "test_file")
@pytest.mark.require_disk("cache", DiskTypeSet([DiskType.optane, DiskType.nand]))
@pytest.mark.require_disk("core", DiskTypeLowerThan("cache"))
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("filesystem", Filesystem)
@pytest.mark.parametrize("direct", [True, False])
@pytest.mark.require_plugin("power_control")
def test_recovery_unplug_cache_fs(cache_mode, cls, filesystem, direct):
    """
        title: Test for recovery after cache drive removal - test with filesystem.
        description: |
          Verify that unflushed data can be safely recovered after, when SSD drive is removed
          after write completion - test with filesystem.
        pass_criteria:
          - CAS recovers successfully after cache drive unplug
          - No data corruption
    """
    with TestRun.step("Prepare devices"):
        cache_drive = TestRun.disks['cache']
        core_drive = TestRun.disks['core']
        cache_drive.create_partitions([Size(2, Unit.GibiByte)])
        core_drive.create_partitions([Size(16, Unit.GibiByte)])
        cache_part = cache_drive.partitions[0]
        core_part = core_drive.partitions[0]

    with TestRun.step("Create test files."):
        src_file, dst_file = create_test_files(test_file_size)

    with TestRun.step("Create filesystem on core device."):
        core_part.create_filesystem(filesystem)

    with TestRun.step("Start cache and add core."):
        cache = casadm.start_cache(cache_part, cache_mode, cls)
        core = cache.add_core(core_part)

    with TestRun.step("Mount CAS device."):
        core.mount(mount_point)

    with TestRun.step("Copy file to CAS."):
        # The `direct` parameter decides whether dd writes with the "direct" output flag.
        write_flag = "oflag" if direct else None
        copy_file(src_file.full_path, test_file_path, test_file_size, write_flag)
        TestRun.LOGGER.info(str(core.get_statistics()))

    with TestRun.step("Unmount CAS device."):
        core.unmount()

    with TestRun.step("Unplug cache device."):
        cache_drive.unplug()
        cache_list = casadm.list_caches().stdout
        TestRun.LOGGER.info(f"List caches:\n{cache_list}")
        dirty_blocks = cache.get_dirty_blocks().get_value(Unit.Blocks4096)
        TestRun.LOGGER.info(f"Dirty blocks on cache: {dirty_blocks}")

    with TestRun.step("Stop cache."):
        cache.stop()

    with TestRun.step("Plug missing cache device."):
        cache_drive.plug()

    with TestRun.step("Load cache."):
        cache = casadm.load_cache(cache_part)
        dirty_blocks = cache.get_dirty_blocks().get_value(Unit.Blocks4096)
        TestRun.LOGGER.info(f"Dirty blocks on cache: {dirty_blocks}")

    with TestRun.step("Stop cache with data flush."):
        cache.stop()

    with TestRun.step("Mount core device."):
        core_part.mount(mount_point)

    with TestRun.step("Copy file from core device and check md5sum."):
        # Mirror the write side: read with the "direct" input flag only when requested.
        read_flag = "iflag" if direct else None
        copy_file(test_file_path, dst_file.full_path, test_file_size, read_flag)
        compare_files(src_file, dst_file)

    with TestRun.step("Unmount core device and remove files."):
        core_part.unmount()
        dst_file.remove()
        src_file.remove()
@pytest.mark.require_disk("cache", DiskTypeSet([DiskType.optane, DiskType.nand]))
@pytest.mark.require_disk("core", DiskTypeLowerThan("cache"))
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.require_plugin("power_control")
def test_recovery_unplug_cache_raw(cache_mode, cls):
    """
        title: Test for recovery after cache drive removal - test on raw device.
        description: |
          Verify that unflushed data can be safely recovered after, when SSD drive is removed
          after write completion - test on raw device.
        pass_criteria:
          - CAS recovers successfully after cache drive unplug
          - No data corruption
    """
    with TestRun.step("Prepare devices"):
        cache_drive = TestRun.disks['cache']
        core_drive = TestRun.disks['core']
        cache_drive.create_partitions([Size(2, Unit.GibiByte)])
        core_drive.create_partitions([Size(16, Unit.GibiByte)])
        cache_part = cache_drive.partitions[0]
        core_part = core_drive.partitions[0]

    with TestRun.step("Create test files."):
        src_file, dst_file = create_test_files(test_file_size)

    with TestRun.step("Start cache and add core."):
        cache = casadm.start_cache(cache_part, cache_mode, cls)
        core = cache.add_core(core_part)

    with TestRun.step("Copy file to CAS."):
        copy_file(src_file.full_path, core.system_path, test_file_size, "oflag")
        TestRun.LOGGER.info(str(core.get_statistics()))

    with TestRun.step("Unplug cache device."):
        cache_drive.unplug()
        cache_list = casadm.list_caches().stdout
        TestRun.LOGGER.info(f"List caches:\n{cache_list}")
        dirty_blocks = cache.get_dirty_blocks().get_value(Unit.Blocks4096)
        TestRun.LOGGER.info(f"Dirty blocks on cache: {dirty_blocks}")

    with TestRun.step("Stop cache."):
        cache.stop()

    with TestRun.step("Plug missing cache device."):
        cache_drive.plug()

    with TestRun.step("Load cache."):
        cache = casadm.load_cache(cache_part)
        dirty_blocks = cache.get_dirty_blocks().get_value(Unit.Blocks4096)
        TestRun.LOGGER.info(f"Dirty blocks on cache: {dirty_blocks}")

    with TestRun.step("Stop cache with data flush."):
        cache.stop()

    with TestRun.step("Copy file from core device and check md5sum."):
        # Read straight from the core partition, bypassing CAS, to verify what was flushed.
        copy_file(core_part.system_path, dst_file.full_path, test_file_size, "iflag")
        compare_files(src_file, dst_file)

    with TestRun.step("Cleanup core device and remove test files."):
        dst_file.remove()
        src_file.remove()