failover advanced test scenarios

Signed-off-by: Adam Rutkowski <adam.j.rutkowski@intel.com>
Adam Rutkowski 2022-07-06 19:30:07 +02:00
parent e82b07746e
commit a8be9d4bf3


@ -0,0 +1,215 @@
#
# Copyright(c) 2022 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause
#
import pytest
from api.cas.cache_config import CacheMode, CleaningPolicy, CacheModeTrait, CacheLineSize


@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cleaning_policy", CleaningPolicy)
def test_failover_during_background_cleaning(pyocf_ctx, cache_mode, cls, cleaning_policy):
"""
title: Failover sequence with background cleaning
description:
Verify proper failover behaviour and data integrity after power failure while background
cleaning is running.
pass_criteria:
- Failover procedure success
- Data integrity is maintained
parametrizations:
- cache mode: all cache modes with lazy writes - to make sure dirty data is produced so that
metadata synchronization between hosts occurs
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
- cleaning policy - as different policies have separate metadata handling implementations
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- On main DUT prefill primary storage device with random data
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start cache on top of cache DRBD device with parametrized cacheline size and cache mode
- Set cleaning policy to NOP
- Wait for DRBD synchronization
- Fill cache with random 50% read/write mix workload, block size 4K
- Verify cache is > 25% dirty
- Switch to WO cache mode without flush
- Calculate checksum of CAS exported object
- Switch back to the parametrized cache mode without flush
- Switch to parametrized cleaning policy
- Wait for the background cleaner to start working (no wait needed for ACP; for ALRU,
wait according to the policy parameters)
- Verify cleaner is progressing by inspecting dirty statistics
- Power off the main DUT
- On backup DUT (takeover sequence sketched after this test):
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- calculate checksum of CAS exported object
- Verify that the two checksums are equal
- Power on the main DUT
"""
pass
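

# A minimal sketch (not executed by the skipped test above) of the backup-DUT takeover
# sequence described in the docstring. The drbdadm/casadm invocations and the resource
# names ("cache", "core") are assumptions about the deployment; the real test would use
# the test framework's DUT and CLI wrappers instead of raw subprocess calls.
def _sketch_backup_dut_takeover(cache_id: int, cache_raid_dev: str, exported_obj: str) -> str:
    import subprocess

    # Stop the cache DRBD resource so the backup node has exclusive access to its cache RAID
    subprocess.run(["drbdadm", "down", "cache"], check=True)
    # Promote the core DRBD resource on the backup node
    subprocess.run(["drbdadm", "primary", "core"], check=True)
    # Detach the replicated cache device from the standby cache instance
    subprocess.run(["casadm", "--standby", "--detach", "--cache-id", str(cache_id)], check=True)
    # Activate the standby cache instance directly on the cache RAID drive
    subprocess.run(
        ["casadm", "--standby", "--activate", "--cache-id", str(cache_id),
         "--cache-device", cache_raid_dev],
        check=True,
    )
    # Checksum the exported object for comparison with the checksum taken on the main DUT
    out = subprocess.run(["md5sum", exported_obj], capture_output=True, text=True, check=True)
    return out.stdout.split()[0]

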
@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize("cache_mode", CacheMode.with_traits(CacheModeTrait.LazyWrites))
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
def test_failover_during_dirty_flush(pyocf_ctx, cache_mode, cls):
"""
title: Failover sequence after power failure during dirty data flush
description:
Verify proper failover behaviour and data integrity after power failure during
user-issued cleaning
pass_criteria:
- Failover procedure success
- Data integrity is maintained
parametrizations:
- cache mode: all cache modes with lazy writes - to make sure dirty data is produced so that
metadata synchronization between hosts occurs
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- On main DUT prefill primary storage device with random data
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start cache on top of cache DRBD device with parametrized cacheline size and cache mode
- Wait for DRBD synchronization
- Set cleaning policy to NOP
- Fill cache with random 50% read/write mix workload, block size 4K
- Verify cache is > 25% dirty
- Switch to WO cache mode without flush
- Calculate checksum of CAS exported object
- Switch back to the parametrized cache mode without flush
- Issue cache flush command
- Verify flush is progressing by inspecting dirty statistics (see the sketch after this test)
- Power off the main DUT
- On backup DUT:
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- calculate checksum of CAS exported object
- Verify that the two checksums are equal
- Power on the main DUT
"""
pass
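

# A minimal sketch of the "verify flush is progressing" check from the docstring above:
# sample the dirty-blocks statistic a few times and require it to keep decreasing. The
# get_dirty_blocks callable is a hypothetical placeholder for whatever statistics
# accessor the test framework exposes for the running cache.
def _sketch_verify_flush_progress(get_dirty_blocks, samples: int = 5, interval_s: float = 1.0):
    import time

    previous = get_dirty_blocks()
    for _ in range(samples):
        time.sleep(interval_s)
        current = get_dirty_blocks()
        # The flush is progressing only if the amount of dirty data keeps dropping
        assert current < previous, f"dirty blocks did not decrease ({previous} -> {current})"
        previous = current

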
@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize(
"cache_mode", [m for m in CacheMode if m != CacheMode.WO and m != CacheMode.PT]
)
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cleaning_policy", [c for c in CleaningPolicy if c != CleaningPolicy.alru])
def test_failover_during_io_with_eviction(pyocf_ctx, cache_mode, cls, cleaning_policy):
"""
title: Failover sequence after power failure during I/O with eviction
description:
Verify proper failover behaviour and data integrity after power failure during
I/O handling with eviction
pass_criteria:
- Failover procedure success
- Data integrity is maintained
parametrizations:
- cache mode: all cache modes except WO and PT - to trigger eviction via
reads
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
- cleaning policy: all except ALRU, as it doesn't do any cleaning in runtime
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- On main DUT prefill primary storage device with random data
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start WB cache on top of cache DRBD device with parametrized cacheline size
- Set cleaning policy to NOP
- Wait for DRBD synchronization
- Fill cache with random 50% read/write mix workload, block size = parametrized cache
line size
- Verify cache is > 25% dirty
- Verify cache occupancy is 100%
- Switch to WO cache mode without flush
- Calculate checksum of CAS exported object
- Switch to the parametrized cache mode without flush
- Set the parametrized cleaning policy
- Run multi-threaded I/O, 100% random read, block_size range [4K, parametrized cache line
size] with 4K increment, different random seed than the previous prefill I/O, entire
primary storage LBA address range, runtime 1h
- Verify cache miss statistic is being incremented (see the statistics sketch after this test)
- Verify pass-through I/O statistic is not being incremented
- Power off the main DUT
- On backup DUT:
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- calculate checksum of CAS exported object
- Verify that the two checksums are equal
- Power on the main DUT
"""
pass
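

# A minimal sketch of the statistics checks from the docstring above: while the random-read
# workload is running, the cache miss counter should keep growing (reads miss and insert
# into a 100% occupied cache, triggering eviction) while the pass-through request counter
# stays flat. Both getter arguments are hypothetical placeholders for the framework's
# statistics accessors.
def _sketch_verify_io_with_eviction(get_cache_misses, get_pass_through_reqs, window_s: int = 10):
    import time

    misses_before = get_cache_misses()
    pt_before = get_pass_through_reqs()
    time.sleep(window_s)  # sampling window while the I/O workload is running
    assert get_cache_misses() > misses_before, "cache miss counter is not incrementing"
    assert get_pass_through_reqs() == pt_before, "unexpected pass-through I/O detected"

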
@pytest.mark.skip(reason="not implemented")
@pytest.mark.multidut(2)
@pytest.mark.parametrize("cls", [CacheLineSize.LINE_4KiB, CacheLineSize.LINE_64KiB])
@pytest.mark.parametrize("cleaning_policy", [c for c in CleaningPolicy if c != CleaningPolicy.alru])
def test_failover_io_long(pyocf_ctx, cls, cleaning_policy):
"""
title: Failover WB I/O long
description:
4h I/O with data verification in failover setup
pass_criteria:
- Data integrity is maintained
- Failover procedure success
parametrizations:
- cacheline size: 4K, 64K - to test both sector I/O and full-cacheline I/O
- cleaning policy: all except ALRU, as it doesn't do any cleaning in runtime
steps:
- On 2 DUTs (main and backup) prepare RAID1 cache devices of 1GiB size, each consisting of
2 Optane drives.
- On 2 DUTs (main and backup) prepare primary storage device of size 1.5GiB
- Start a standby cache instance on the backup DUT with parametrized cacheline size
- Configure DRBD to replicate cache and core storage from main to backup node
- On main DUT:
- Start WB cache on top of cache DRBD device with parametrized cacheline size
- Set the parametrized cleaning policy
- Create XFS file system on CAS exported object
- Wait for DRBD synchronization
- Mount file system
- Run 4h FIO with data verification: random R/W, 16 jobs, filesystem, entire primary
storage LBA address range, --bssplit=4k/10:8k/25:16k/25:32k/20:64k/10:128k/5:256k/5
(an invocation is sketched after this test)
- Verify no data errors
- Switch to WO cache mode without flush
- Calculate checksum of fio test file(s)
- Switch back to WB cache mode without flush
- Flush page cache
- Power off the main DUT
- On backup DUT:
- stop cache DRBD
- set backup DUT as primary for core DRBD
- detach cache drive from standby cache instance
- activate standby cache instance directly on the cache RAID drive
- mount file system located on CAS exported object
- Calculate checksum of fio test file(s)
- Verify checksums from the previous steps are equal
- Power on the main DUT
"""
pass
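

# A minimal sketch of the 4-hour verification workload from the docstring above, expressed
# as a plain fio command line. The mount point, per-job file size and verify method are
# assumptions; the real test would configure fio through the test framework rather than
# invoking it via subprocess.
def _sketch_long_io_fio(mount_point: str = "/mnt/cas", runtime_s: int = 4 * 3600):
    import subprocess

    cmd = [
        "fio",
        "--name=failover_long_io",
        f"--directory={mount_point}",
        "--rw=randrw",
        "--numjobs=16",
        "--size=64M",  # per-job file size (assumption)
        "--time_based",
        f"--runtime={runtime_s}",
        "--bssplit=4k/10:8k/25:16k/25:32k/20:64k/10:128k/5:256k/5",
        "--verify=crc32c",  # in-flight data verification
        "--ioengine=libaio",
        "--direct=1",
        "--group_reporting",
    ]
    subprocess.run(cmd, check=True)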