diff --git a/test/functional/tests/failover_standby/test_failover_advanced.py b/test/functional/tests/failover_standby/test_failover_advanced.py
new file mode 100644
index 0000000..710178f
--- /dev/null
+++ b/test/functional/tests/failover_standby/test_failover_advanced.py
@@ -0,0 +1,215 @@
+#
+# Copyright(c) 2022 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+import pytest
+
+from api.cas.cache_config import CacheMode, CleaningPolicy, CacheModeTrait, CacheLineSize
+
+
+@pytest.mark.skip(reason="not implemented")
+@pytest.mark.multidut(2)
+@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
+@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
+@pytest.mark.parametrize("cleaning_policy", CleaningPolicy)
+def test_failover_during_background_cleaning(pyocf_ctx, cache_mode, cls, cleaning_policy):
+    """
+    title: Failover sequence with background cleaning
+    description:
+        Verify proper failover behaviour and data integrity after power failure while
+        background cleaning is running.
+    pass_criteria:
+        - Failover procedure success
+        - Data integrity is maintained
+    parametrizations:
+        - cache mode: all cache modes with lazy writes - to make sure dirty data is produced
+          so that metadata synchronization between hosts occurs
+        - cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
+        - cleaning policy: all policies, as each has a separate metadata handling implementation
+    steps:
+        - On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each comprising
+          2 Optane drives
+        - On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
+        - On main DUT prefill primary storage device with random data
+        - Start a standby cache instance on the backup DUT with parametrized cacheline size
+        - Configure DRBD to replicate cache and core storage from main to backup node
+        - On main DUT:
+            - Start cache on top of cache DRBD device with parametrized cacheline size and
+              cache mode
+            - Set cleaning policy to NOP
+            - Wait for DRBD synchronization
+            - Fill cache with random 50% read/write mix workload, block size 4K
+            - Verify cache is > 25% dirty
+            - Switch to WO cache mode without flush
+            - Calculate checksum of CAS exported object
+            - Switch back to the parametrized cache mode without flush
+            - Switch to the parametrized cleaning policy
+            - Wait for the background cleaner to start working (no wait needed for ACP;
+              for ALRU, wait according to the policy parameters)
+            - Verify cleaner is progressing by inspecting dirty statistics
+        - Power off the main DUT
+        - On backup DUT (see the failover sketch below):
+            - Stop cache DRBD
+            - Set backup DUT as primary for core DRBD
+            - Detach cache drive from standby cache instance
+            - Activate standby cache instance directly on the cache RAID drive
+            - Calculate checksum of CAS exported object
+        - Verify that the two checksums are equal
+        - Power on the main DUT
+    """
+    pass
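+
+
+# A minimal sketch (not part of the planned test) of the backup-DUT failover sequence the
+# docstring above describes, expressed as shell commands. The run() callable, the DRBD
+# resource names (cache-drbd, core-drbd), the cache id and the device paths are assumptions
+# made for illustration only; the casadm standby options should be verified against the
+# installed Open CAS version.
+def _failover_to_backup_sketch(run):
+    run("drbdadm down cache-drbd")                 # stop cache DRBD replication
+    run("drbdadm primary core-drbd")               # promote backup as primary for core data
+    run("casadm --standby --detach --cache-id 1")  # detach cache drive from standby instance
+    run("casadm --standby --activate --cache-id 1 --cache-device /dev/md127")
+    return run("md5sum /dev/cas1-1")               # checksum of the CAS exported object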
+
+
+@pytest.mark.skip(reason="not implemented")
+@pytest.mark.multidut(2)
+@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
+@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
+def test_failover_during_dirty_flush(pyocf_ctx, cache_mode, cls):
+    """
+    title: Failover sequence after power failure during dirty data flush
+    description:
+        Verify proper failover behaviour and data integrity after power failure during
+        user-issued cleaning.
+    pass_criteria:
+        - Failover procedure success
+        - Data integrity is maintained
+    parametrizations:
+        - cache mode: all cache modes with lazy writes - to make sure dirty data is produced
+          so that metadata synchronization between hosts occurs
+        - cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
+    steps:
+        - On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each comprising
+          2 Optane drives
+        - On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
+        - On main DUT prefill primary storage device with random data
+        - Start a standby cache instance on the backup DUT with parametrized cacheline size
+        - Configure DRBD to replicate cache and core storage from main to backup node
+        - On main DUT:
+            - Start cache on top of cache DRBD device with parametrized cacheline size and
+              cache mode
+            - Wait for DRBD synchronization
+            - Set cleaning policy to NOP
+            - Fill cache with random 50% read/write mix workload, block size 4K
+            - Verify cache is > 25% dirty
+            - Switch to WO cache mode without flush
+            - Calculate checksum of CAS exported object
+            - Switch back to the parametrized cache mode without flush
+            - Issue cache flush command
+            - Verify flush is progressing by inspecting dirty statistics (see the sketch below)
+        - Power off the main DUT
+        - On backup DUT:
+            - Stop cache DRBD
+            - Set backup DUT as primary for core DRBD
+            - Detach cache drive from standby cache instance
+            - Activate standby cache instance directly on the cache RAID drive
+            - Calculate checksum of CAS exported object
+        - Verify that the two checksums are equal
+        - Power on the main DUT
+    """
+    pass
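+
+
+# A minimal sketch (not part of the planned test) of the "verify flush is progressing" step
+# above: sample the dirty-blocks counter a few times and require it to shrink.
+# get_dirty_blocks is a hypothetical callable standing in for the framework's cache
+# statistics API; the sample count and interval are arbitrary choices.
+def _verify_flush_progress_sketch(get_dirty_blocks, samples=5, interval_s=2):
+    import time  # local import to keep the sketch self-contained
+
+    readings = [get_dirty_blocks()]
+    for _ in range(samples - 1):
+        time.sleep(interval_s)
+        readings.append(get_dirty_blocks())
+    if not readings[-1] < readings[0]:
+        raise AssertionError(f"flush does not appear to be progressing: {readings}")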
+
+
+@pytest.mark.skip(reason="not implemented")
+@pytest.mark.multidut(2)
+@pytest.mark.parametrize(
+    "cache_mode", [m for m in CacheMode if m != CacheMode.WO and m != CacheMode.PT]
+)
+@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
+@pytest.mark.parametrize("cleaning_policy", [c for c in CleaningPolicy if c != CleaningPolicy.alru])
+def test_failover_during_io_with_eviction(pyocf_ctx, cache_mode, cls, cleaning_policy):
+    """
+    title: Failover sequence after power failure during I/O with eviction
+    description:
+        Verify proper failover behaviour and data integrity after power failure during
+        I/O handling with eviction.
+    pass_criteria:
+        - Failover procedure success
+        - Data integrity is maintained
+    parametrizations:
+        - cache mode: all cache modes except WO and PT - to trigger eviction via reads
+        - cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
+        - cleaning policy: all except ALRU, as it doesn't do any cleaning at runtime
+    steps:
+        - On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each comprising
+          2 Optane drives
+        - On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
+        - On main DUT prefill primary storage device with random data
+        - Start a standby cache instance on the backup DUT with parametrized cacheline size
+        - Configure DRBD to replicate cache and core storage from main to backup node
+        - On main DUT:
+            - Start WB cache on top of cache DRBD device with parametrized cacheline size
+            - Set cleaning policy to NOP
+            - Wait for DRBD synchronization
+            - Fill cache with random 50% read/write mix workload, block size = parametrized
+              cache line size
+            - Verify cache is > 25% dirty
+            - Verify cache occupancy is 100%
+            - Switch to WO cache mode without flush
+            - Calculate checksum of CAS exported object
+            - Switch back to the parametrized cache mode without flush
+            - Switch to the parametrized cleaning policy
+            - Run multi-threaded I/O, 100% random read, block size range [4K, parametrized
+              cache line size] with 4K increment, different random seed than the previous
+              prefill I/O, entire primary storage LBA address range, runtime 1h
+            - Verify cache miss statistic is being incremented (see the sketch below)
+            - Verify pass-through I/O statistic is not being incremented
+        - Power off the main DUT
+        - On backup DUT:
+            - Stop cache DRBD
+            - Set backup DUT as primary for core DRBD
+            - Detach cache drive from standby cache instance
+            - Activate standby cache instance directly on the cache RAID drive
+            - Calculate checksum of CAS exported object
+        - Verify that the two checksums are equal
+        - Power on the main DUT
+    """
+    pass
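+
+
+# A minimal sketch (not part of the planned test) of the statistics checks in the eviction
+# test above: under the random-read workload the read-miss counter should keep growing while
+# no requests are serviced in pass-through. get_stats is a hypothetical callable returning a
+# dict with "read_misses" and "pass_through_reads" counters; real field names depend on the
+# framework's statistics API.
+def _verify_eviction_io_stats_sketch(get_stats, interval_s=10):
+    import time  # local import to keep the sketch self-contained
+
+    before = get_stats()
+    time.sleep(interval_s)
+    after = get_stats()
+    assert after["read_misses"] > before["read_misses"], "cache misses should be incrementing"
+    assert after["pass_through_reads"] == before["pass_through_reads"], \
+        "no I/O should fall back to pass-through"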
+
+
+@pytest.mark.skip(reason="not implemented")
+@pytest.mark.multidut(2)
+@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
+@pytest.mark.parametrize("cleaning_policy", [c for c in CleaningPolicy if c != CleaningPolicy.alru])
+def test_failover_io_long(pyocf_ctx, cls, cleaning_policy):
+    """
+    title: Failover WB I/O long
+    description:
+        4h I/O with data verification in failover setup.
+    pass_criteria:
+        - Data integrity is maintained
+        - Failover procedure success
+    parametrizations:
+        - cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
+        - cleaning policy: all except ALRU, as it doesn't do any cleaning at runtime
+    steps:
+        - On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each comprising
+          2 Optane drives
+        - On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
+        - Start a standby cache instance on the backup DUT with parametrized cacheline size
+        - Configure DRBD to replicate cache and core storage from main to backup node
+        - On main DUT:
+            - Start WB cache on top of cache DRBD device with parametrized cacheline size
+            - Set the parametrized cleaning policy
+            - Create XFS file system on CAS exported object
+            - Wait for DRBD synchronization
+            - Mount file system
+            - Run 4h FIO with data verification (see the fio sketch below): random R/W,
+              16 jobs, filesystem, entire primary storage LBA address range,
+              --bssplit=4k/10:8k/25:16k/25:32k/20:64k/10:128k/5:256k/5
+            - Verify no data errors
+            - Switch to WO cache mode without flush
+            - Calculate checksum of fio test file(s)
+            - Switch back to WB cache mode without flush
+            - Flush page cache
+        - Power off the main DUT
+        - On backup DUT:
+            - Stop cache DRBD
+            - Set backup DUT as primary for core DRBD
+            - Detach cache drive from standby cache instance
+            - Activate standby cache instance directly on the cache RAID drive
+            - Mount file system located on CAS exported object
+            - Calculate checksum of fio test file(s)
+        - Verify checksums from the previous steps are equal
+        - Power on the main DUT
+    """
+    pass
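+
+
+# A minimal sketch (not part of the planned test) of the 4h verification workload described
+# above, expressed as a raw fio command line. The mount point, per-job size, ioengine and
+# verify settings are assumptions for illustration; only the job count, runtime and bssplit
+# come from the step list.
+FIO_LONG_VERIFY_CMD_SKETCH = (
+    "fio --name=failover-long-io --directory=/mnt/cas --rw=randrw --numjobs=16 "
+    "--bssplit=4k/10:8k/25:16k/25:32k/20:64k/10:128k/5:256k/5 "
+    "--size=80M --time_based --runtime=4h --ioengine=libaio --direct=1 "
+    "--verify=crc32c --serialize_overlap=1 --group_reporting"
+)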