Change startup procedure

Current startup procedure works on an assumption that we will
deal with asynchronously appearing devices in asynchronous way
(udev rules) and synchronous events in the system (systemd units)
won't interfere. If we would break anything (mounts) we would just
take those units and restart them. This tactic was working as long
as resetting systemd units took reasonable time.

As hackish as it sounds it worked in all systems that the software
has been validated on. Unfortunately it stopped working because
of *.mount units taking MUCH longer time to restart even on
mainstream OSes, so it's time to change.

This change implements open-cas systemd service which will wait
synchronously with systemd bootup process for all required Open CAS
devices to start. If they don't we fail the boot process just as
failing mounts would. We also make sure that this process takes place
before any mounts (aside from root FS and other critical FS's) are
even attempted. Now opencas-mount-utility can be discarded.

To override this behaviour on per-core basis you can specify
lazy_startup=true option in opencas.conf.

Signed-off-by: Jan Musial <jan.musial@intel.com>
This commit is contained in:
Jan Musial
2019-10-23 16:18:00 +02:00
parent db1cb96010
commit aaedfb35dd
8 changed files with 227 additions and 65 deletions

View File

@@ -8,6 +8,7 @@ import csv
import re
import os
import stat
import time
# Casadm functionality
@@ -308,24 +309,35 @@ class cas_config(object):
return ret
class core_config(object):
def __init__(self, cache_id, core_id, path):
def __init__(self, cache_id, core_id, path, **params):
self.cache_id = int(cache_id)
self.core_id = int(core_id)
self.device = path
self.params = params
@classmethod
def from_line(cls, line, allow_incomplete=False):
values = line.split()
if len(values) > 3:
raise ValueError('Invalid core configuration (too many columns)')
if len(values) > 4:
raise ValueError("Invalid core configuration (too many columns)")
elif len(values) < 3:
raise ValueError('Invalid core configuration (too few columns)')
raise ValueError("Invalid core configuration (too few columns)")
cache_id = int(values[0])
core_id = int(values[1])
device = values[2]
core_config = cls(cache_id, core_id, device)
params = dict()
if len(values) > 3:
for param in values[3].lower().split(","):
param_name, param_value = param.split("=")
if param_name in params:
raise ValueError(
"Invalid core configuration (repeated parameter)"
)
params[param_name] = param_value
core_config = cls(cache_id, core_id, device, **params)
core_config.validate_config(allow_incomplete)
@@ -335,9 +347,24 @@ class cas_config(object):
self.check_core_id_valid()
self.check_recursive()
cas_config.cache_config.check_cache_id_valid(self.cache_id)
for param_name, param_value in self.params.items():
self.validate_parameter(param_name, param_value)
if not allow_incomplete:
cas_config.check_block_device(self.device)
def validate_parameter(self, param_name, param_value):
if param_name == "lazy_startup":
if param_value.lower() not in ["true", "false"]:
raise ValueError(
"{} is invalid value for '{}' core param".format(
param_value, param_name
)
)
else:
raise ValueError("'{}' is invalid core param name".format(param_name))
def check_core_id_valid(self):
if not 0 <= int(self.core_id) <= 4095:
raise ValueError('{0} is invalid core id'.format(self.core_id))
@@ -353,7 +380,14 @@ class cas_config(object):
raise ValueError('Recursive configuration detected')
def to_line(self):
return '{0}\t{1}\t{2}\n'.format(self.cache_id, self.core_id, self.device)
ret = "{0}\t{1}\t{2}".format(self.cache_id, self.core_id, self.device)
for i, (param, value) in enumerate(self.params.items()):
ret += "," if i > 0 else "\t"
ret += "{0}={1}".format(param, value)
ret += "\n"
return ret
def __init__(self, caches=None, cores=None, version_tag=None):
self.caches = caches if caches else dict()
@@ -494,6 +528,13 @@ class cas_config(object):
except:
raise Exception('Couldn\'t write config file')
def get_startup_cores(self):
return [
core
for core in self.cores
if core.params.get("lazy_startup", "false") == "false"
]
# Config helper functions
@@ -689,3 +730,69 @@ def stop(flush):
error.raise_nonempty()
def get_devices_state():
device_list = get_caches_list()
devices = {"core_pool": [], "caches": {}, "cores": {}}
core_pool = False
prev_cache_id = -1
for device in device_list:
if device["type"] == "core pool":
core_pool = True
continue
if device["type"] == "cache":
core_pool = False
prev_cache_id = int(device["id"])
devices["caches"].update(
{
int(device["id"]): {
"device": device["disk"],
"status": device["status"],
}
}
)
elif device["type"] == "core":
core = {"device": device["disk"], "status": device["status"]}
if core_pool:
devices["core_pool"].append(core)
else:
core.update({"cache_id": prev_cache_id})
devices["cores"].update(
{(prev_cache_id, int(device["id"])): core}
)
return devices
def wait_for_startup(timeout=300, interval=5):
try:
config = cas_config.from_file(
cas_config.default_location, allow_incomplete=True
)
except Exception as e:
raise Exception("Unable to load opencas config. Reason: {0}".format(str(e)))
stop_time = time.time() + int(timeout)
not_initialized = None
target_core_state = config.get_startup_cores()
while stop_time > time.time():
not_initialized = []
runtime_core_state = get_devices_state()["cores"]
for core in target_core_state:
runtime_state = runtime_core_state.get((core.cache_id, core.core_id), None)
if not runtime_state or runtime_state["status"] != "Active":
not_initialized.append(core)
if not not_initialized:
break
time.sleep(interval)
return not_initialized