From 2358ae1af486f9bbea6a4ae9de6a45c005075629 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Tue, 17 May 2022 11:54:01 +0200
Subject: [PATCH] tests: add test_functional_activate_twice_round_trip

Signed-off-by: Michal Mielewczyk
---
 .../test_failover_multihost.py               | 386 ++++++++++++++++++
 1 file changed, 386 insertions(+)
 create mode 100644 test/functional/tests/failover_standby/test_failover_multihost.py

diff --git a/test/functional/tests/failover_standby/test_failover_multihost.py b/test/functional/tests/failover_standby/test_failover_multihost.py
new file mode 100644
index 0000000..a700169
--- /dev/null
+++ b/test/functional/tests/failover_standby/test_failover_multihost.py
@@ -0,0 +1,386 @@
+#
+# Copyright(c) 2022 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+import pytest
+
+from api.cas import casadm
+from api.cas.cache_config import (
+    SeqCutOffPolicy,
+    CacheMode,
+    CleaningPolicy,
+    CacheLineSize,
+    CacheStatus,
+)
+from api.cas.casadm_parser import get_caches
+from api.cas.init_config import InitConfig
+from core.test_run import TestRun
+from storage_devices.disk import DiskType, DiskTypeSet
+from storage_devices.drbd import Drbd
+from storage_devices.raid import Raid, RaidConfiguration, MetadataVariant, Level
+from test_tools.disk_utils import Filesystem
+from test_tools.fio.fio import Fio
+from test_tools.fio.fio_param import ReadWrite
+from test_tools.fs_utils import readlink
+from test_utils.drbd import Resource, Node
+from test_utils.os_utils import sync, Udev
+from test_utils.size import Size, Unit
+from test_tools import fs_utils
+
+
+cache_id = 5
+raid_size = Size(1, Unit.GibiByte)
+core_size = Size(500, Unit.MebiByte)
+metadata_size = Size(100, Unit.MebiByte)
+cache_exp_obj_path = f"/dev/cas-cache-{cache_id}"
+cls = CacheLineSize.LINE_32KiB
+mountpoint = "/tmp/drbd_functional_test"
+test_file_path = f"{mountpoint}/test_file"
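+
+# Topology used by this test: on each DUT a RAID1 volume (two Optane
+# partitions) serves as the cache device and an HDD partition as the core
+# device. Two DRBD resources replicate state between the nodes: "caches"
+# for the cache device and "cores" for the core device (see
+# get_drbd_configs() below).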
+
+
+@pytest.mark.require_disk("metadata_dev", DiskTypeSet([DiskType.nand]))
+@pytest.mark.require_disk("core_dev", DiskTypeSet([DiskType.hdd]))
+@pytest.mark.require_disk("raid_dev1", DiskTypeSet([DiskType.optane]))
+@pytest.mark.require_disk("raid_dev2", DiskTypeSet([DiskType.optane]))
+@pytest.mark.multidut(2)
+@pytest.mark.require_plugin("power_control")
+@pytest.mark.parametrize("filesystem", [Filesystem.xfs, None])
+def test_functional_activate_twice_round_trip(filesystem):
+    """
+    title: Cache replication.
+    description:
+        Restore cache operations from a replicated cache and make sure that a second
+        failover is possible, returning to the original configuration.
+    pass_criteria:
+        - A cache exported object appears after starting a cache in passive state
+        - The cache exported object can be used for replicating a cache device
+        - The cache exported object disappears after the cache activation
+        - The core exported object reappears after the cache activation
+        - A data integrity check passes for the core exported object before and after
+          switching cache instances
+        - The CAS standby cache starts automatically on OS startup when configured
+          in the CAS config
+    """
+    with TestRun.step("Make sure DRBD is installed on both nodes"):
+        check_drbd_installed(TestRun.duts)
+
+    with TestRun.step("Prepare DUTs"):
+        prepare_devices(TestRun.duts)
+        primary_node, secondary_node = TestRun.duts
+        extra_init_config_flags = (
+            f"cache_line_size={cls.value.value // 1024},target_failover_state=standby"
+        )
+        primary_init_config = InitConfig()
+        primary_init_config.add_cache(
+            cache_id,
+            primary_node.raid,
+            CacheMode.WB,
+            extra_flags=extra_init_config_flags,
+        )
+        secondary_init_config = InitConfig()
+        secondary_init_config.add_cache(
+            cache_id,
+            secondary_node.raid,
+            CacheMode.WB,
+            extra_flags=extra_init_config_flags,
+        )
+
+    # THIS IS WHERE THE REAL TEST STARTS
+    TestRun.LOGGER.start_group(
+        f"Initial configuration with {primary_node.ip} as primary node "
+        f"and {secondary_node.ip} as secondary node"
+    )
+
+    with TestRun.use_dut(secondary_node), TestRun.step(
+        f"Prepare standby cache instance on {secondary_node.ip}"
+    ):
+        secondary_node.cache = casadm.standby_init(
+            cache_dev=secondary_node.raid,
+            cache_line_size=str(cls.value.value // 1024),
+            cache_id=cache_id,
+            force=True,
+        )
+
+    with TestRun.step("Prepare DRBD config files on both DUTs"):
+        caches_original_resource, caches_failover_resource, cores_resource = get_drbd_configs(
+            primary_node, secondary_node
+        )
+
+    for dut in TestRun.duts:
+        with TestRun.use_dut(dut), TestRun.step(f"Create DRBD instances on {dut.ip}"):
+            caches_original_resource.save()
+            dut.cache_drbd = Drbd(caches_original_resource)
+            dut.cache_drbd.create_metadata()
+            dut.cache_drbd_dev = dut.cache_drbd.up()
+
+            cores_resource.save()
+            dut.core_drbd = Drbd(cores_resource)
+            dut.core_drbd.create_metadata()
+            dut.core_drbd_dev = dut.core_drbd.up()
+
+    with TestRun.use_dut(primary_node), TestRun.step(
+        f"Set {primary_node.ip} as primary node for both DRBD instances"
+    ):
+        primary_node.cache_drbd.set_primary(force=True)
+        primary_node.core_drbd.set_primary(force=True)
+
+    with TestRun.use_dut(primary_node), TestRun.step("Make sure DRBD instances are in sync"):
+        primary_node.cache_drbd.wait_for_sync()
+        primary_node.core_drbd.wait_for_sync()
+
+    with TestRun.use_dut(primary_node), TestRun.step(f"Start cache on {primary_node.ip}"):
+        primary_node.cache = casadm.start_cache(
+            primary_node.cache_drbd_dev,
+            force=True,
+            cache_mode=CacheMode.WB,
+            cache_line_size=cls,
+            cache_id=cache_id,
+        )
+        core = primary_node.cache.add_core(primary_node.core_drbd_dev)
+        primary_node.cache.set_cleaning_policy(CleaningPolicy.nop)
+        primary_node.cache.set_seq_cutoff_policy(SeqCutOffPolicy.never)
+        if filesystem:
+            TestRun.executor.run(f"rm -rf {mountpoint}")
+            fs_utils.create_directory(path=mountpoint)
+            core.create_filesystem(filesystem)
+            core.mount(mountpoint)
+
+    with TestRun.use_dut(primary_node), TestRun.step(
+        f"Prepare standby init config on {primary_node.ip}"
+    ):
+        primary_init_config.save_config_file()
+        sync()
+
+    with TestRun.use_dut(primary_node), TestRun.step(
+        "Fill 90% of the core with data (50/50 random read/write mix)"
+    ):
+        fio = Fio().create_command().read_write(ReadWrite.randrw).size(core_size * 0.9)
+        if filesystem:
+            fio.file_name(test_file_path)
+        else:
+            fio.target(core.path).direct()
+        fio.run()
+        sync()
+
+    data_path = test_file_path if filesystem else core.path
+    original_core_md5, original_cache_stats = power_failure(primary_node, data_path)
+
+    TestRun.LOGGER.end_group()
+    TestRun.LOGGER.start_group(
+        f"First failover sequence. {secondary_node.ip} becomes"
+        f" primary node and {primary_node.ip} becomes secondary node"
+    )
+
+    failover_sequence(secondary_node, caches_failover_resource, filesystem, core)
+
+    with TestRun.use_dut(secondary_node), TestRun.step(
+        f"Prepare standby init config on {secondary_node.ip}"
+    ):
+        secondary_init_config.save_config_file()
+        sync()
+
+    postfailover_check(secondary_node, data_path, original_core_md5, original_cache_stats)
+
+    with TestRun.use_dut(secondary_node), TestRun.step(
+        "Fill the first half of the core with data (50/50 random read/write mix)"
+    ):
+        fio = Fio().create_command().read_write(ReadWrite.randrw).size(core_size * 0.5)
+        if filesystem:
+            fio.file_name(test_file_path)
+        else:
+            fio.target(core.path).direct()
+        fio.run()
+        sync()
+
+    with TestRun.use_dut(primary_node), TestRun.step(f"Restore core DRBD on {primary_node.ip}"):
+        TestRun.executor.wait_for_connection()
+        primary_node.core_drbd_dev = primary_node.core_drbd.up()
+
+    new_failover_instance(primary_node, caches_failover_resource, autoload=True)
+
+    with TestRun.use_dut(secondary_node), TestRun.step(
+        "Fill 40% of the core, starting at half of its size, with data"
+        " (50/50 random read/write mix)"
+    ):
+        fio = (
+            Fio()
+            .create_command()
+            .read_write(ReadWrite.randrw)
+            .size(core_size * 0.4)
+            .offset(core_size * 0.5)
+        )
+        if filesystem:
+            fio.file_name(test_file_path)
+        else:
+            fio.target(core.path).direct()
+        fio.run()
+        sync()
+
+    original_core_md5, original_cache_stats = power_failure(secondary_node, data_path)
+
+    TestRun.LOGGER.end_group()
+    TestRun.LOGGER.start_group(
+        f"Second failover sequence. {primary_node.ip} becomes"
+        f" primary node and {secondary_node.ip} becomes secondary node"
+    )
+
+    failover_sequence(primary_node, caches_original_resource, filesystem, core)
+
+    postfailover_check(primary_node, data_path, original_core_md5, original_cache_stats)
+
+    with TestRun.use_dut(secondary_node):
+        TestRun.executor.wait_for_connection()
+
+    TestRun.LOGGER.end_group()
+
+
+def check_drbd_installed(duts):
+    for dut in duts:
+        with TestRun.use_dut(dut):
+            if not Drbd.is_installed():
+                TestRun.fail(f"DRBD is not installed on DUT {dut.ip}")
+
+
+def prepare_devices(duts):
+    for dut in duts:
+        with TestRun.use_dut(dut):
+            dut.hostname = TestRun.executor.run_expect_success("uname -n").stdout
+
+            raid_members = [TestRun.disks["raid_dev1"], TestRun.disks["raid_dev2"]]
+            for d in raid_members:
+                d.create_partitions([raid_size * 1.1])  # extra space for RAID metadata
+
+            raid_config = RaidConfiguration(
+                level=Level.Raid1,
+                metadata=MetadataVariant.Legacy,
+                number_of_devices=2,
+                size=raid_size,
+            )
+            dut.raid = Raid.create(raid_config, [d.partitions[0] for d in raid_members])
+            dut.raid_path = readlink(dut.raid.path)
+
+            TestRun.disks["metadata_dev"].create_partitions([metadata_size] * 2)
+            dut.cache_md_dev = TestRun.disks["metadata_dev"].partitions[0]
+            dut.core_md_dev = TestRun.disks["metadata_dev"].partitions[1]
+
+            TestRun.disks["core_dev"].create_partitions([core_size])
+            dut.core_dev = TestRun.disks["core_dev"].partitions[0]
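+
+
+# Two configs are needed for the "caches" resource because the replication
+# direction flips with each failover: the active node backs DRBD with its raw
+# RAID volume, while the standby node backs it with the CAS cache exported
+# object, so after a role swap the backing devices are mirrored. The "cores"
+# resource is symmetrical and stays the same in both directions.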
+def get_drbd_configs(n1, n2):
+    cache_original_drbd_nodes = [
+        Node(n1.hostname, n1.raid_path, n1.cache_md_dev.path, n1.ip, "7790"),
+        Node(n2.hostname, cache_exp_obj_path, n2.cache_md_dev.path, n2.ip, "7790"),
+    ]
+    cache_failover_drbd_nodes = [
+        Node(n1.hostname, cache_exp_obj_path, n1.cache_md_dev.path, n1.ip, "7790"),
+        Node(n2.hostname, n2.raid_path, n2.cache_md_dev.path, n2.ip, "7790"),
+    ]
+    core_drbd_nodes = [
+        Node(dut.hostname, dut.core_dev.path, dut.core_md_dev.path, dut.ip, "7791")
+        for dut in [n1, n2]
+    ]
+
+    caches_original_resource = Resource(
+        name="caches", device="/dev/drbd0", nodes=cache_original_drbd_nodes
+    )
+    caches_failover_resource = Resource(
+        name="caches", device="/dev/drbd0", nodes=cache_failover_drbd_nodes
+    )
+    cores_resource = Resource(name="cores", device="/dev/drbd100", nodes=core_drbd_nodes)
+
+    return caches_original_resource, caches_failover_resource, cores_resource
+
+
+def power_failure(primary_node, data_path):
+    with TestRun.use_dut(primary_node), TestRun.step("Make sure DRBD instances are in sync"):
+        primary_node.cache_drbd.wait_for_sync()
+        primary_node.core_drbd.wait_for_sync()
+
+    with TestRun.use_dut(primary_node), TestRun.step(
+        "Switch cache to WO, get cache stats and the core's md5, then restore WB"
+    ):
+        # Write-only mode keeps reads from populating the cache, so computing
+        # the md5 does not alter the usage statistics collected here.
+        primary_node.cache.set_cache_mode(CacheMode.WO)
+        core_md5 = TestRun.executor.run(f"md5sum {data_path}").stdout.split()[0]
+        cache_stats = primary_node.cache.get_statistics().usage_stats
+        primary_node.cache.set_cache_mode(CacheMode.WB)
+
+    with TestRun.use_dut(primary_node), TestRun.step(
+        f"Simulate power failure on {primary_node.ip}"
+    ):
+        power_control = TestRun.plugin_manager.get_plugin("power_control")
+        power_control.power_cycle(wait_for_connection=False)
+
+    return core_md5, cache_stats
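+
+
+# Failover sequence: stop the local (replication target) cache DRBD, promote
+# the core DRBD to primary, detach the standby cache instance to free the
+# cache device, bring the cache DRBD back up as primary, and finally activate
+# the cache on top of the new DRBD device.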
+def failover_sequence(standby_node, drbd_resource, filesystem, core):
+    with TestRun.use_dut(standby_node), TestRun.step(f"Stop cache DRBD on {standby_node.ip}"):
+        standby_node.cache_drbd.down()
+
+    with TestRun.use_dut(standby_node), TestRun.step(
+        f"Set core DRBD as primary on {standby_node.ip}"
+    ):
+        standby_node.core_drbd.set_primary()
+
+    with TestRun.use_dut(standby_node), TestRun.step("Detach the standby cache instance"):
+        standby_node.cache.standby_detach()
+        TestRun.executor.run_expect_fail(f"ls -la /dev/ | grep cas-cache-{cache_id}")
+
+    with TestRun.use_dut(standby_node), TestRun.step(f"Start primary DRBD on {standby_node.ip}"):
+        drbd_resource.save()
+        standby_node.cache_drbd = Drbd(drbd_resource)
+        standby_node.cache_drbd_dev = standby_node.cache_drbd.up()
+        standby_node.cache_drbd.set_primary()
+
+    with TestRun.use_dut(standby_node), TestRun.step(f"Activate cache on {standby_node.ip}"):
+        Udev.disable()
+        standby_node.cache.standby_activate(standby_node.cache_drbd_dev)
+        TestRun.executor.run_expect_success(f"ls -la /dev/ | grep cas{cache_id}-1")
+
+    if filesystem:
+        with TestRun.use_dut(standby_node), TestRun.step("Mount core"):
+            TestRun.executor.run(f"rm -rf {mountpoint}")
+            fs_utils.create_directory(path=mountpoint)
+            core.mount(mountpoint)
+
+
+def postfailover_check(new_primary_node, data_path, core_md5, cache_stats):
+    with TestRun.use_dut(new_primary_node), TestRun.step("Make sure the usage stats are correct"):
+        failover_cache_stats = new_primary_node.cache.get_statistics().usage_stats
+        if cache_stats.dirty != failover_cache_stats.dirty:
+            TestRun.LOGGER.error(
+                "The number of dirty blocks after the failover sequence doesn't match\n"
+                f"Dirty before the failover: {cache_stats.dirty}\n"
+                f"Dirty after the failover: {failover_cache_stats.dirty}\n"
+            )
+
+    with TestRun.use_dut(new_primary_node), TestRun.step(
+        f"Switch cache to WO, make sure md5 of {data_path} is correct and restore WB"
+    ):
+        new_primary_node.cache.set_cache_mode(CacheMode.WO)
+        failover_core_md5 = TestRun.executor.run(f"md5sum {data_path}").stdout.split()[0]
+        new_primary_node.cache.set_cache_mode(CacheMode.WB)
+
+        if failover_core_md5 != core_md5:
+            TestRun.LOGGER.error("md5 after the failover sequence doesn't match")
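+
+
+# When autoload is set, the rebooted node is expected to have started a
+# standby cache instance automatically from the init config saved earlier;
+# otherwise fresh DRBD metadata is created before bringing the replication
+# target up again.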
+def new_failover_instance(new_secondary_node, drbd_resource, *, autoload):
+    if autoload:
+        with TestRun.use_dut(new_secondary_node), TestRun.step(
+            f"Verify whether the passive cache instance on {new_secondary_node.ip}"
+            f" started automatically"
+        ):
+            caches = get_caches()
+            if len(caches) < 1:
+                TestRun.LOGGER.error("Cache not present in system")
+            else:
+                cache_status = caches[0].get_status()
+                if cache_status != CacheStatus.standby:
+                    TestRun.LOGGER.error(
+                        f'Expected cache state: "{CacheStatus.standby.value}", '
+                        f'got "{cache_status.value}" instead.'
+                    )
+
+    with TestRun.use_dut(new_secondary_node), TestRun.step(
+        f"Start secondary DRBD on {new_secondary_node.ip}"
+        + ("" if autoload else " with newly created metadata")
+    ):
+        drbd_resource.save()
+        new_secondary_node.cache_drbd = Drbd(drbd_resource)
+        if not autoload:
+            new_secondary_node.cache_drbd.create_metadata()
+        new_secondary_node.cache_drbd_dev = new_secondary_node.cache_drbd.up()