failover advanced test scenarios

Signed-off-by: Adam Rutkowski <adam.j.rutkowski@intel.com>
Adam Rutkowski 2022-07-06 19:30:07 +02:00
parent e82b07746e
commit a8be9d4bf3


@ -0,0 +1,215 @@
#
# Copyright(c) 2022 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause
#
import pytest
from api.cas.cache_config import CacheMode, CleaningPolicy, CacheModeTrait, CacheLineSize


@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cleaning_policy", CleaningPolicy)
def test_failover_during_background_cleaning(pyocf_ctx, cache_mode, cls, cleaning_policy):
"""
title: Failover sequence with background cleaning
description:
Verify proper failover behaviour and data integrity after power failure while background
cleaning is running.
pass_criteria:
- Failover procedure success
- Data integrity is maintained
parametrizations:
- cache mode: all cache modes with lazy writes - to make sure dirty data is produced so that
metadata synchronization between hosts occurs
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
- cleaning policy - as different policies have separate metadata handling implementations
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- On main DUT prefill primary storage device with random data
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start cache on top of cache DRBD device with parametrized cacheline size and cache mode
- Set cleaning policy to NOP
- Wait for DRBD synchronization
- Fill cache with random 50% read/write mix workload, block size 4K
- Verify cache is > 25% dirty
- Switch to WO cache mode without flush
- Calculate checksum of CAS exported object
- Switch back to the parametrized cache mode without flush
- Switch to parametrized cleaning policy
- Wait for the background cleaner to start working (no wait needed for ACP; for ALRU,
wait according to the policy parameters)
- Verify cleaner is progressing by inspecting dirty statistics
- Power off the main DUT
- On backup DUT (takeover sequence sketched after this test):
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- calculate checksum of CAS exported object
- Verify that the two checksums are equal
- Power on the main DUT
"""
pass
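

# A minimal sketch (not executed by the skipped test above) of the backup-DUT takeover
# sequence described in the docstring. The drbdadm/casadm invocations and the resource
# names ("cache", "core") are assumptions about the deployment; the real test would use
# the test framework's DUT and CLI wrappers instead of raw subprocess calls.
def _sketch_backup_dut_takeover(cache_id: int, cache_raid_dev: str, exported_obj: str) -> str:
    import subprocess

    # Stop the cache DRBD resource so the backup node has exclusive access to its cache RAID
    subprocess.run(["drbdadm", "down", "cache"], check=True)
    # Promote the core DRBD resource on the backup node
    subprocess.run(["drbdadm", "primary", "core"], check=True)
    # Detach the replicated cache device from the standby cache instance
    subprocess.run(["casadm", "--standby", "--detach", "--cache-id", str(cache_id)], check=True)
    # Activate the standby cache instance directly on the cache RAID drive
    subprocess.run(
        ["casadm", "--standby", "--activate", "--cache-id", str(cache_id),
         "--cache-device", cache_raid_dev],
        check=True,
    )
    # Checksum the exported object for comparison with the checksum taken on the main DUT
    out = subprocess.run(["md5sum", exported_obj], capture_output=True, text=True, check=True)
    return out.stdout.split()[0]

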
@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
def test_failover_during_dirty_flush(pyocf_ctx, cache_mode, cls):
"""
title: Failover sequence after power failure during dirty data flush
description:
Verify proper failover behaviour and data integrity after power failure during
user-issued cleaning
pass_criteria:
- Failover procedure success
- Data integrity is maintained
parametrizations:
- cache mode: all cache modes with lazy writes - to make sure dirty data is produced so that
metadata synchronization between hosts occurs
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- On main DUT prefill primary storage device with random data
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start cache on top of cache DRBD device with parametrized cacheline size and cache mode
- Wait for DRBD synchronization
- Set cleaning policy to NOP
- Fill cache with random 50% read/write mix workload, block size 4K
- Verify cache is > 25% dirty
- Switch to WO cache mode without flush
- Calculate checksum of CAS exported object
- Switch back to the parametrized cache mode without flush
- Issue cache flush command
- Verify flush is progressing by inspecting dirty statistics (see the sketch after this test)
- Power off the main DUT
- On backup DUT:
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- calculate checksum of CAS exported object
- Verify that the two checksums are equal
- Power on the main DUT
"""
pass
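

# A minimal sketch of the "verify flush is progressing" check from the docstring above:
# sample the dirty-blocks statistic a few times and require it to keep decreasing. The
# get_dirty_blocks callable is a hypothetical placeholder for whatever statistics
# accessor the test framework exposes for the running cache.
def _sketch_verify_flush_progress(get_dirty_blocks, samples: int = 5, interval_s: float = 1.0):
    import time

    previous = get_dirty_blocks()
    for _ in range(samples):
        time.sleep(interval_s)
        current = get_dirty_blocks()
        # The flush is progressing only if the amount of dirty data keeps dropping
        assert current < previous, f"dirty blocks did not decrease ({previous} -> {current})"
        previous = current

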
@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize(
"cache_mode", [m for m in CacheMode if m != CacheMode.WO and m != CacheMode.PT]
)
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cleaning_policy", [c for c in CleaningPolicy if c != CleaningPolicy.alru])
def test_failover_during_io_with_eviction(pyocf_ctx, cache_mode, cls, cleaning_policy):
"""
title: Failover sequence after power failure during I/O with eviction
description:
Verify proper failover behaviour and data integrity after power failure during
I/O handling with eviction
pass_criteria:
- Failover procedure success
- Data integrity is maintained
parametrizations:
- cache mode: all cache modes except WO and PT - to trigger eviction via
reads
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
- cleaning policy: all except ALRU, as it doesn't do any cleaning in runtime
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- On main DUT prefill primary storage device with random data
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start WB cache on top of cache DRBD device with parametrized cacheline size
- Set cleaning policy to NOP
- Wait for DRBD synchronization
- Fill cache with random 50% read/write mix workload, block size = parametrized cache
line size
- Verify cache is > 25% dirty
- Verify cache occupancy is 100%
- Switch to WO cache mode without flush
- Calculate checksum of CAS exported object
- Switch to the parametrized cache mode without flush
- Set the parametrized cleaning policy
- Run multi-threaded I/O, 100% random read, block_size range [4K, parametrized cache line
size] with 4K increment, different random seed than the previous prefill I/O, entire
primary storage LBA address range, runtime 1h
- Verify cache miss statistic is being incremented (see the statistics sketch after this test)
- Verify pass-through I/O statistic is not being incremented
- Power off the main DUT
- On backup DUT:
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- calculate checksum of CAS exported object
- Verify that the two checksums are equal
- Power on the main DUT
"""
pass
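

# A minimal sketch of the statistics checks from the docstring above: while the random-read
# workload is running, the cache miss counter should keep growing (reads miss and insert
# into a 100% occupied cache, triggering eviction) while the pass-through request counter
# stays flat. Both getter arguments are hypothetical placeholders for the framework's
# statistics accessors.
def _sketch_verify_io_with_eviction(get_cache_misses, get_pass_through_reqs, window_s: int = 10):
    import time

    misses_before = get_cache_misses()
    pt_before = get_pass_through_reqs()
    time.sleep(window_s)  # sampling window while the I/O workload is running
    assert get_cache_misses() > misses_before, "cache miss counter is not incrementing"
    assert get_pass_through_reqs() == pt_before, "unexpected pass-through I/O detected"

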
@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cleaning_policy", [c for c in CleaningPolicy if c != CleaningPolicy.alru])
def test_failover_io_long(pyocf_ctx, cls, cleaning_policy):
"""
title: Failover WB I/O long
description:
4h I/O with data verification in failover setup
pass_criteria:
- Data integrity is maintained
- Failover procedure success
parametrizations:
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
- cleaning policy: all except ALRU, as it doesn't do any cleaning in runtime
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start WB cache on top of cache DRBD device with parametrized cacheline size
- Set the parametrized cleaning policy
- Create XFS file system on CAS exported object
- Wait for DRBD synchronization
- Mount file system
- Run 4h FIO with data verification: random R/W, 16 jobs, filesystem, entire primary
storage LBA address range, --bssplit=4k/10:8k/25:16k/25:32k/20:64k/10:128k/5:256k/5
(an invocation is sketched after this test)
- Verify no data errors
- Switch to WO cache mode without flush
- Calculate checksum of fio test file(s)
- Switch back to WB cache mode without flush
- Flush page cache
- Power off the main DUT
- On backup DUT:
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- mount file system located on CAS exported object
- Calculate checksum of fio test file(s)
- Verify checksums from the previous steps are equal
- Power on the main DUT
"""
pass
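

# A minimal sketch of the 4-hour verification workload from the docstring above, expressed
# as a plain fio command line. The mount point, per-job file size and verify method are
# assumptions; the real test would configure fio through the test framework rather than
# invoking it via subprocess.
def _sketch_long_io_fio(mount_point: str = "/mnt/cas", runtime_s: int = 4 * 3600):
    import subprocess

    cmd = [
        "fio",
        "--name=failover_long_io",
        f"--directory={mount_point}",
        "--rw=randrw",
        "--numjobs=16",
        "--size=64M",  # per-job file size (assumption)
        "--time_based",
        f"--runtime={runtime_s}",
        "--bssplit=4k/10:8k/25:16k/25:32k/20:64k/10:128k/5:256k/5",
        "--verify=crc32c",  # in-flight data verification
        "--ioengine=libaio",
        "--direct=1",
        "--group_reporting",
    ]
    subprocess.run(cmd, check=True)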