From 49c87c95270d4d0dc5e794bdb42f7f4942705645 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Wed, 11 Dec 2019 18:26:27 -0500
Subject: [PATCH 01/13] Fix legacy error messages.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/cas_cache/main.c b/modules/cas_cache/main.c
index 3432ec8..b0124a3 100644
--- a/modules/cas_cache/main.c
+++ b/modules/cas_cache/main.c
@@ -111,13 +111,13 @@ static int __init cas_init_module(void)
 	result = cas_casdisk_lookup_funtions();
 	if (result) {
 		printk(KERN_ERR OCF_PREFIX_SHORT
-			"Could not find inteldisk functions.\n");
+			"Could not find cas_disk functions.\n");
 		return result;
 	}
 
 	if (casdisk_functions.casdsk_get_version() != CASDSK_IFACE_VERSION) {
 		printk(KERN_ERR OCF_PREFIX_SHORT
-			"Incompatible inteldisk module\n");
+			"Incompatible cas_disk module\n");
 		return -EINVAL;
 	}
 

From 3eda5030959602e58c3b8eda7011759ea3c2087d Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Mon, 9 Dec 2019 06:40:48 -0500
Subject: [PATCH 02/13] Additional null check when starting cache instance

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index 7d0ea19..b7e3c2e 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -1118,6 +1118,8 @@ int cache_mngt_prepare_cache_cfg(struct ocf_mngt_cache_config *cfg,
 	uint16_t cache_id;
 	bool is_part;
 
+	BUG_ON(!cmd);
+
 	if (strnlen(cmd->cache_path_name, MAX_STR_LEN) >= MAX_STR_LEN)
 		return -OCF_ERR_INVAL;
 
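Note: the two checks above complement each other. BUG_ON(!cmd) documents that the ioctl layer never passes a NULL command block here, while the existing strnlen() test rejects user-supplied strings that have no NUL terminator within MAX_STR_LEN bytes. The same validation pattern in isolation (illustrative sketch; the names are invented and not part of the series):

	#include <linux/bug.h>
	#include <linux/errno.h>
	#include <linux/string.h>

	/* Returns 0 when 'path' is NUL-terminated within 'bound' bytes;
	 * -EINVAL when strnlen() hits the bound, which means the buffer
	 * is either too long or not terminated at all. */
	static int validate_user_path(const char *path, size_t bound)
	{
		BUG_ON(!path);	/* caller contract, as with cmd above */

		if (strnlen(path, bound) >= bound)
			return -EINVAL;

		return 0;
	}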
From d483951ebe6e040e53579640a32eeb4e6fe189cf Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Wed, 4 Dec 2019 08:51:33 -0500
Subject: [PATCH 03/13] Free thread memory after it is stopped.

After marking a thread as ready to stop, CAS was waiting for that
thread to exit out of the main execution loop (in
_cas_io_queue_thread()). In case of the management queue this led to a
deadlock, because both stopping the queue and the main execution loop
were performed in the same execution context.

Since freeing memory is the only operation left after the thread is
stopped, it can be moved to just after the main thread loop. After this
little reordering, synchronizing between _cas_stop_thread() and
_cas_io_queue_thread() is no longer needed, and no deadlock will occur.

This change is needed to be able to put the management queue from
completion context. Without it, there would be no possibility to stop
the cache from completion context and to perform a rollback.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/threads.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/modules/cas_cache/threads.c b/modules/cas_cache/threads.c
index d4d5f80..3ffcd42 100644
--- a/modules/cas_cache/threads.c
+++ b/modules/cas_cache/threads.c
@@ -50,9 +50,10 @@ static int _cas_io_queue_thread(void *data)
 	WARN(ocf_queue_pending_io(q), "Still pending IO requests\n");
 
 	/* If we get here, then thread was signalled to terminate.
-	 * So, let's complete and exit.
+	 * So, let's free memory and exit.
 	 */
-	complete_and_exit(&info->compl, 0);
+	printk(KERN_DEBUG "Thread %s stopped\n", info->name);
+	kfree(info);
 
 	return 0;
 }
@@ -201,13 +202,9 @@ static void _cas_start_thread(struct cas_thread_info *info)
 static void _cas_stop_thread(struct cas_thread_info *info)
 {
 	if (info->running && info->thread) {
-		init_completion(&info->compl);
 		atomic_set(&info->stop, 1);
 		wake_up(&info->wq);
-		wait_for_completion(&info->compl);
-		printk(KERN_DEBUG "Thread %s stopped\n", info->name);
 	}
-	kfree(info);
 }
 
 int cas_create_queue_thread(ocf_queue_t q, int cpu)
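Note: the deadlock removed by the patch above reduces to a thread blocking on a completion that only the blocked thread itself could signal. An illustrative reduction (hypothetical code, not part of the patch):

	/* Runs in the management queue's own execution context. */
	static void stop_self_deadlock(struct cas_thread_info *info)
	{
		atomic_set(&info->stop, 1);
		wake_up(&info->wq);

		/* Deadlock: before the patch, this completion was signalled
		 * by complete_and_exit() at the end of the queue thread's
		 * main loop. When the stop is requested from that same
		 * thread, the waiter below is the only thread that could
		 * ever signal it, so it blocks forever. */
		wait_for_completion(&info->compl);
	}

With the reordering above, the exiting thread frees its own cas_thread_info and nobody waits, so a stop request may be issued from any context, including a completion callback.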
From 2ac82143796242a599c236c7c80240e1cc27726f Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Wed, 18 Dec 2019 05:24:09 -0500
Subject: [PATCH 04/13] Don't allow probe interruption.

Metadata probe is usually not time-consuming. To avoid dealing with
synchronization problems, a non-interruptible wait is performed.

This commit is part of a patch that will handle interrupting waits for
OCF operations.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 27 +++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index b7e3c2e..6d3971c 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -19,6 +19,17 @@ struct _cache_mngt_sync_context {
 	int *result;
 };
 
+struct _cache_mngt_async_context {
+	struct completion compl;
+	atomic_t ref_count;
+	int result;
+};
+
+struct _cache_mngt_async_lock_context {
+	struct completion compl;
+	int *result;
+};
+
 static void _cache_mngt_lock_complete(ocf_cache_t cache, void *priv, int error)
 {
 	struct _cache_mngt_sync_context *context = priv;
@@ -473,7 +484,7 @@ int cache_mngt_core_pool_remove(struct kcas_core_pool_remove *cmd_info)
 }
 
 struct cache_mngt_metadata_probe_context {
-	struct completion compl;
+	struct completion cmpl;
 	struct kcas_cache_check_device *cmd_info;
 	int *result;
 };
@@ -495,7 +506,7 @@ static void cache_mngt_metadata_probe_end(void *priv, int error,
 		cmd_info->cache_dirty = status->cache_dirty;
 	}
 
-	complete(&context->compl);
+	complete(&context->cmpl);
 }
 
 int cache_mngt_cache_check_device(struct kcas_cache_check_device *cmd_info)
@@ -521,13 +532,13 @@ int cache_mngt_cache_check_device(struct kcas_cache_check_device *cmd_info)
 	cmd_info->format_atomic = (ocf_ctx_get_volume_type_id(cas_ctx,
 			ocf_volume_get_type(volume)) == ATOMIC_DEVICE_VOLUME);
 
-	init_completion(&context.compl);
+	init_completion(&context.cmpl);
 	context.cmd_info = cmd_info;
 	context.result = &result;
 
 	ocf_metadata_probe(cas_ctx, volume, cache_mngt_metadata_probe_end,
 			&context);
-	wait_for_completion_interruptible(&context.compl);
+	wait_for_completion(&context.cmpl);
 
 	cas_blk_close_volume(volume);
 out_bdev:
@@ -1379,7 +1390,7 @@ static void _cache_mngt_load_complete(ocf_cache_t cache, void *priv, int error)
 }
 
 struct cache_mngt_check_metadata_context {
-	struct completion compl;
+	struct completion cmpl;
 	char *cache_name;
 	int *result;
 };
@@ -1400,7 +1411,7 @@ static void cache_mngt_check_metadata_end(void *priv, int error,
 			status->cache_name);
 	}
 
-	complete(&context->compl);
+	complete(&context->cmpl);
 }
 
 static int _cache_mngt_check_metadata(struct ocf_mngt_cache_config *cfg,
@@ -1424,13 +1435,13 @@ static int _cache_mngt_check_metadata(struct ocf_mngt_cache_config *cfg,
 	if (result)
 		goto out_bdev;
 
-	init_completion(&context.compl);
+	init_completion(&context.cmpl);
 	context.cache_name = cfg->name;
 	context.result = &result;
 
 	ocf_metadata_probe(cas_ctx,
 			volume, cache_mngt_check_metadata_end, &context);
-	wait_for_completion_interruptible(&context.compl);
+	wait_for_completion(&context.cmpl);
 
 	cas_blk_close_volume(volume);
 out_bdev:

From f7d88c4b3f8049962e1966361af24afe02cc5454 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Wed, 18 Dec 2019 07:03:05 -0500
Subject: [PATCH 05/13] Don't allow core add, remove or detach to be
 interrupted

These are usually not time-consuming operations, so the risk of a hung
task is low. Thus it's easier to temporarily disallow interrupting the
wait than to properly handle async completion.

This commit is part of a patch that will handle interrupting waits for
OCF operations.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index 6d3971c..3f6163d 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -686,7 +686,7 @@ static int _cache_mngt_core_device_loaded_visitor(ocf_core_t core, void *cntx)
 }
 
 struct _cache_mngt_add_core_context {
-	struct completion compl;
+	struct completion cmpl;
 	ocf_core_t *core;
 	int *result;
 };
@@ -702,7 +702,7 @@ static void _cache_mngt_add_core_complete(ocf_cache_t cache,
 	*context->core = core;
 	*context->result = error;
 
-	complete(&context->compl);
+	complete(&context->cmpl);
 }
 
 static void _cache_mngt_remove_core_complete(void *priv, int error);
@@ -755,13 +755,13 @@ int cache_mngt_add_core_to_cache(const char *cache_name, size_t name_len,
 
 	cfg->seq_cutoff_threshold = seq_cut_off_mb * MiB;
 
-	init_completion(&add_context.compl);
+	init_completion(&add_context.cmpl);
 	add_context.core = &core;
 	add_context.result = &result;
 
 	ocf_mngt_cache_add_core(cache, cfg, _cache_mngt_add_core_complete,
 			&add_context);
-	wait_for_completion_interruptible(&add_context.compl);
+	wait_for_completion(&add_context.cmpl);
 	if (result)
 		goto error_affter_lock;
 
@@ -788,11 +788,11 @@ error_after_create_exported_object:
 	block_dev_destroy_exported_object(core);
 
 error_after_add_core:
-	init_completion(&remove_context.compl);
+	init_completion(&remove_context.cmpl);
 	remove_context.result = &remove_core_result;
 	ocf_mngt_cache_remove_core(core, _cache_mngt_remove_core_complete,
 			&remove_context);
-	wait_for_completion_interruptible(&remove_context.compl);
+	wait_for_completion(&remove_context.cmpl);
 
 error_affter_lock:
 	ocf_mngt_cache_unlock(cache);
@@ -856,7 +856,7 @@ static void _cache_mngt_remove_core_complete(void *priv, int error)
 	struct _cache_mngt_sync_context *context = priv;
 
 	*context->result = error;
-	complete(&context->compl);
+	complete(&context->cmpl);
 }
 
 int cache_mngt_remove_core_from_cache(struct kcas_remove_core *cmd)
@@ -914,7 +914,7 @@ int cache_mngt_remove_core_from_cache(struct kcas_remove_core *cmd)
 		goto unlock;
 	}
 
-	init_completion(&context.compl);
+	init_completion(&context.cmpl);
 	context.result = &result;
 
 	if (cmd->detach || flush_result) {
@@ -928,7 +928,7 @@ int cache_mngt_remove_core_from_cache(struct kcas_remove_core *cmd)
 	if (!cmd->force_no_flush && !flush_result)
 		BUG_ON(ocf_mngt_core_is_dirty(core));
 
-	wait_for_completion_interruptible(&context.compl);
+	wait_for_completion(&context.cmpl);
 
 	if (!result && !cmd->detach) {
 		cache_priv = ocf_cache_get_priv(cache);
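Note: both patches above trade interruptibility for simplicity. The kernel offers two flavours of waiting, and the difference is exactly the crash scenario the later patches in this series address (illustrative sketch, simplified):

	/* Safe with an on-stack context: cannot return until the
	 * completion callback has run, so the callback never touches a
	 * dead stack frame. Ignores signals. */
	static void wait_safe(struct completion *cmpl)
	{
		wait_for_completion(cmpl);
	}

	/* Returns -ERESTARTSYS if a signal arrives first. The callback
	 * may fire later and write through pointers into a stack frame
	 * that no longer exists -- which is why the following patches
	 * move the wait context to the heap and hand over its ownership. */
	static int wait_abortable(struct completion *cmpl)
	{
		return wait_for_completion_interruptible(cmpl);
	}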
From de823b15fc4417539655dbab1df512d58257d7c8 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Fri, 13 Dec 2019 03:08:28 -0500
Subject: [PATCH 06/13] New error code for interrupted waiting.

This commit is part of a patch that will allow interrupting waits for
OCF operations.

Signed-off-by: Michal Mielewczyk
---
 modules/include/cas_ioctl_codes.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/include/cas_ioctl_codes.h b/modules/include/cas_ioctl_codes.h
index 8f51ac4..d424d9a 100644
--- a/modules/include/cas_ioctl_codes.h
+++ b/modules/include/cas_ioctl_codes.h
@@ -578,6 +578,9 @@ enum kcas_error {
 
 	/** Condition token does not identify any known condition */
 	KCAS_ERR_CLS_RULE_UNKNOWN_CONDITION,
+
+	/** Waiting for async operation was interrupted */
+	KCAS_ERR_WAITING_INTERRUPTED,
 };
 
 #endif

From 7af5d296e1996da8b3cae501763366b10af4d171 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Wed, 18 Dec 2019 08:19:49 -0500
Subject: [PATCH 07/13] Allow cache r&w locks to be interrupted.

When the context was allocated on the stack and waiting for completion
was interrupted, the completion function would attempt to save the
lock/unlock result in memory which might already be in use by another
process. This would cause a system crash.

To prevent such a scenario, the context is now allocated dynamically
and extended with state that decides whether the caller or the
completion callback is responsible for freeing it. In case of an
interrupt, the completion function doesn't have to save the result in
the context; it can simply free its memory.

This commit is part of a patch that will allow interrupting waits for
OCF operations.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 144 +++++++++++++++++----
 1 file changed, 120 insertions(+), 24 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index 3f6163d..2251336 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -15,53 +15,149 @@ extern u32 seq_cut_off_mb;
 extern u32 use_io_scheduler;
 
 struct _cache_mngt_sync_context {
-	struct completion compl;
+	struct completion cmpl;
 	int *result;
 };
 
 struct _cache_mngt_async_context {
-	struct completion compl;
-	atomic_t ref_count;
+	struct completion cmpl;
+	spinlock_t lock;
 	int result;
 };
 
-struct _cache_mngt_async_lock_context {
-	struct completion compl;
-	int *result;
-};
+/*
+ * Value used to mark async call as completed. Used when OCF call already
+ * finished, but complete function still has to be performed.
+ */
+#define ASYNC_CALL_FINISHED 1
 
-static void _cache_mngt_lock_complete(ocf_cache_t cache, void *priv, int error)
+static int _cache_mngt_async_callee_set_result(
+		struct _cache_mngt_async_context *context,
+		int error)
 {
-	struct _cache_mngt_sync_context *context = priv;
+	bool interrupted;
+	int ret;
 
-	*context->result = error;
-	complete(&context->compl);
+	spin_lock(&context->lock);
+
+	interrupted = (context->result == -KCAS_ERR_WAITING_INTERRUPTED);
+	if (!interrupted)
+		context->result = error ?: ASYNC_CALL_FINISHED;
+	complete(&context->cmpl);
+
+	ret = context->result;
+	spin_unlock(&context->lock);
+
+	return ret == ASYNC_CALL_FINISHED ? 0 : ret;
 }
 
-static int _cache_mngt_lock_sync(ocf_cache_t cache)
+static int _cache_mngt_async_callee_peek_result(
+		struct _cache_mngt_async_context *context)
 {
-	struct _cache_mngt_sync_context context;
 	int result;
 
-	init_completion(&context.compl);
-	context.result = &result;
-
-	ocf_mngt_cache_lock(cache, _cache_mngt_lock_complete, &context);
-	wait_for_completion_interruptible(&context.compl);
+	spin_lock(&context->lock);
+	result = context->result;
+	spin_unlock(&context->lock);
 
 	return result;
 }
 
-static int _cache_mngt_read_lock_sync(ocf_cache_t cache)
+static int _cache_mngt_async_caller_set_result(
+		struct _cache_mngt_async_context *context,
+		int error)
 {
-	struct _cache_mngt_sync_context context;
+	unsigned long lock_flags = 0;
+	int result = error;
+
+	spin_lock_irqsave(&context->lock, lock_flags);
+	if (context->result)
+		result = (context->result != ASYNC_CALL_FINISHED) ?
+				context->result : 0;
+	else if (result < 0)
+		result = context->result = -KCAS_ERR_WAITING_INTERRUPTED;
+	spin_unlock_irqrestore(&context->lock, lock_flags);
+
+	return result;
+}
+
+static inline void _cache_mngt_async_context_init(
+		struct _cache_mngt_async_context *context)
+{
+	init_completion(&context->cmpl);
+	spin_lock_init(&context->lock);
+	context->result = 0;
+}
+
+static void _cache_mngt_lock_complete(ocf_cache_t cache, void *priv, int error)
+{
+	struct _cache_mngt_async_context *context = priv;
 	int result;
 
-	init_completion(&context.compl);
-	context.result = &result;
+	result = _cache_mngt_async_callee_set_result(context, error);
 
-	ocf_mngt_cache_read_lock(cache, _cache_mngt_lock_complete, &context);
-	wait_for_completion_interruptible(&context.compl);
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED && error == 0)
+		ocf_mngt_cache_unlock(cache);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
+}
+
+static int _cache_mngt_lock_sync(ocf_cache_t cache)
+{
+	struct _cache_mngt_async_context *context;
+	int result;
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	_cache_mngt_async_context_init(context);
+
+	ocf_mngt_cache_lock(cache, _cache_mngt_lock_complete, context);
+	result = wait_for_completion_interruptible(&context->cmpl);
+
+	result = _cache_mngt_async_caller_set_result(context, result);
+
+	if (result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
+
+	return result;
+}
+
+static void _cache_mngt_read_lock_complete(ocf_cache_t cache, void *priv,
+		int error)
+{
+	struct _cache_mngt_async_context *context = priv;
+	int result;
+
+	result = _cache_mngt_async_callee_set_result(context, error);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED && error == 0)
+		ocf_mngt_cache_read_unlock(cache);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
+}
+
+static int _cache_mngt_read_lock_sync(ocf_cache_t cache)
+{
+	struct _cache_mngt_async_context *context;
+	int result;
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	_cache_mngt_async_context_init(context);
+
+	ocf_mngt_cache_read_lock(cache, _cache_mngt_read_lock_complete, context);
+	result = wait_for_completion_interruptible(&context->cmpl);
+
+	result = _cache_mngt_async_caller_set_result(context, result);
+
+	if (result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 
 	return result;
 }
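Note: the helpers introduced above implement a small ownership protocol. context->result starts at 0; the side that finishes first (completing callee or interrupted caller) records its outcome under the spinlock, and whichever side observes that the other got there first takes over freeing the context. A condensed usage sketch, following _cache_mngt_lock_sync() from the patch (illustrative, error handling trimmed):

	static int example_async_call(ocf_cache_t cache)
	{
		struct _cache_mngt_async_context *context;
		int result;

		context = kmalloc(sizeof(*context), GFP_KERNEL);
		if (!context)
			return -ENOMEM;
		_cache_mngt_async_context_init(context);

		/* The callback reports through
		 * _cache_mngt_async_callee_set_result(). */
		ocf_mngt_cache_lock(cache, _cache_mngt_lock_complete, context);

		result = wait_for_completion_interruptible(&context->cmpl);

		/* Resolve the race: adopt the callee's result if it already
		 * finished, or mark the context as interrupted so the callee
		 * knows to undo the operation and free the context itself. */
		result = _cache_mngt_async_caller_set_result(context, result);

		if (result != -KCAS_ERR_WAITING_INTERRUPTED)
			kfree(context);	/* caller kept ownership */

		return result;
	}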
From b7f1dd69a99667cf0399efada8f69faae0545dd0 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Tue, 17 Dec 2019 02:54:41 -0500
Subject: [PATCH 08/13] Allow waiting for metadata flush to be interrupted.

When the context was allocated on the stack and waiting for completion
was interrupted, the completion function would attempt to save the
flush result in memory which might already be in use by another
process. This would cause a system crash.

To prevent such a scenario, the context is now allocated dynamically
and extended with state that decides whether the caller or the
completion callback is responsible for freeing it. In case of an
interrupt, the completion function doesn't have to save the result in
the context; it can simply free its memory.

This commit is part of a patch that will allow interrupting waits for
OCF operations.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 27 +++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index 2251336..dfda4ae 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -165,22 +165,33 @@ static int _cache_mngt_read_lock_sync(ocf_cache_t cache)
 static void _cache_mngt_save_sync_complete(ocf_cache_t cache, void *priv,
 		int error)
 {
-	struct _cache_mngt_sync_context *context = priv;
+	struct _cache_mngt_async_context *context = priv;
+	int result;
 
-	*context->result = error;
-	complete(&context->compl);
+	result = _cache_mngt_async_callee_set_result(context, error);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 }
 
 static int _cache_mngt_save_sync(ocf_cache_t cache)
 {
-	struct _cache_mngt_sync_context context;
+	struct _cache_mngt_async_context *context;
 	int result;
 
-	init_completion(&context.compl);
-	context.result = &result;
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
 
-	ocf_mngt_cache_save(cache, _cache_mngt_save_sync_complete, &context);
-	wait_for_completion_interruptible(&context.compl);
+	_cache_mngt_async_context_init(context);
+
+	ocf_mngt_cache_save(cache, _cache_mngt_save_sync_complete, context);
+	result = wait_for_completion_interruptible(&context->cmpl);
+
+	result = _cache_mngt_async_caller_set_result(context, result);
+
+	if (result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 
 	return result;
 }

From 0b5ed3f00bfddf646fe2640a34506a182e3c5fcc Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Tue, 17 Dec 2019 04:21:47 -0500
Subject: [PATCH 09/13] Allow flush operations to be interrupted.

When the context was allocated on the stack and waiting for completion
was interrupted, the completion function would attempt to save the
flush result in memory which might already be in use by another
process. This would cause a system crash.

To prevent such a scenario, the context is now allocated dynamically
and extended with state that decides whether the caller or the
completion callback is responsible for freeing it. In case of an
interrupt, the completion function doesn't have to save the result in
the context; it can simply free its memory.

This commit also makes it possible to properly interrupt a regular
flush by sending SIGINT to casadm.

This commit is part of a patch that will allow interrupting waits for
OCF operations.
Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 64 +++++++++++++++-------
 1 file changed, 44 insertions(+), 20 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index dfda4ae..929abdb 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -199,24 +199,36 @@ static int _cache_mngt_save_sync(ocf_cache_t cache)
 static void _cache_mngt_cache_flush_complete(ocf_cache_t cache, void *priv,
 		int error)
 {
-	struct _cache_mngt_sync_context *context = priv;
+	struct _cache_mngt_async_context *context = priv;
+	int result;
 
-	*context->result = error;
-	complete(&context->compl);
+	result = _cache_mngt_async_callee_set_result(context, error);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 }
 
 static int _cache_mngt_cache_flush_sync(ocf_cache_t cache, bool interruption)
 {
-	struct cache_priv *cache_priv = ocf_cache_get_priv(cache);
-	struct _cache_mngt_sync_context context;
 	int result;
+	struct _cache_mngt_async_context *context;
+	struct cache_priv *cache_priv = ocf_cache_get_priv(cache);
 
-	init_completion(&context.compl);
-	context.result = &result;
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	_cache_mngt_async_context_init(context);
+	atomic_set(&cache_priv->flush_interrupt_enabled, interruption);
+
+	ocf_mngt_cache_flush(cache, _cache_mngt_cache_flush_complete, context);
+	result = wait_for_completion_interruptible(&context->cmpl);
+
+	result = _cache_mngt_async_caller_set_result(context, result);
+
+	if (result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 
-	atomic_set(&cache_priv->flush_interrupt_enabled, 0);
-	ocf_mngt_cache_flush(cache, _cache_mngt_cache_flush_complete, &context);
-	wait_for_completion_interruptible(&context.compl);
 	atomic_set(&cache_priv->flush_interrupt_enabled, 1);
 
 	return result;
@@ -225,25 +237,37 @@ static int _cache_mngt_cache_flush_sync(ocf_cache_t cache, bool interruption)
 static void _cache_mngt_core_flush_complete(ocf_core_t core, void *priv,
 		int error)
 {
-	struct _cache_mngt_sync_context *context = priv;
+	struct _cache_mngt_async_context *context = priv;
+	int result;
 
-	*context->result = error;
-	complete(&context->compl);
+	result = _cache_mngt_async_callee_set_result(context, error);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 }
 
 static int _cache_mngt_core_flush_sync(ocf_core_t core, bool interruption)
 {
+	int result;
+	struct _cache_mngt_async_context *context;
 	ocf_cache_t cache = ocf_core_get_cache(core);
 	struct cache_priv *cache_priv = ocf_cache_get_priv(cache);
-	struct _cache_mngt_sync_context context;
-	int result;
 
-	init_completion(&context.compl);
-	context.result = &result;
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	_cache_mngt_async_context_init(context);
+	atomic_set(&cache_priv->flush_interrupt_enabled, interruption);
+
+	ocf_mngt_core_flush(core, _cache_mngt_core_flush_complete, context);
+	result = wait_for_completion_interruptible(&context->cmpl);
+
+	result = _cache_mngt_async_caller_set_result(context, result);
+
+	if (result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 
-	atomic_set(&cache_priv->flush_interrupt_enabled, 0);
-	ocf_mngt_core_flush(core, _cache_mngt_core_flush_complete, &context);
-	wait_for_completion_interruptible(&context.compl);
 	atomic_set(&cache_priv->flush_interrupt_enabled, 1);
 
 	return result;
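Note: the user-visible effect of the patch above is that a casadm flush can be stopped with Ctrl-C. The signal path, roughly (illustrative, not part of the patch):

	/*
	 *   casadm (user)       kernel ioctl thread                 OCF
	 *   -------------       -------------------                 ---
	 *   SIGINT ---------->  wait_for_completion_interruptible()
	 *                       returns -ERESTARTSYS
	 *                       _cache_mngt_async_caller_set_result()
	 *                         -> -KCAS_ERR_WAITING_INTERRUPTED
	 *                       (context left for the callee to free)
	 *                                                 flush keeps running;
	 *                                                 its completion frees
	 *                                                 the context
	 */

The flush_interrupt_enabled flag is now set from the 'interruption' argument for the duration of the wait, so flushes that must not be abandoned (e.g. a flush performed as part of another management operation) can still be waited for non-abortably.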
From 232f13a8a43460e42b8d2640a4cde0ce73a7d97c Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Fri, 13 Dec 2019 03:23:53 -0500
Subject: [PATCH 10/13] Allow interrupting cache init, load and stop.

When a device used as cache was large, it took a lot of time to
initialize. If the user interrupted the waiting, the async OCF init
procedure would continue, but after it finished there was nobody left
to perform the kernel part of the start procedure or the error
handling.

Now the error handling and the kernel part of the start procedure are
moved to the completion. If the user interrupts the waiting at any
point, the newly started cache instance will be stopped.

Since cache init and load differ only in the check for old metadata and
in initializing exported objects, they are now merged into one
function.

Async cache stop is part of this commit because it was needed for the
rollback path. Load, init and stop share a common context, because in
case of a non-interrupted attach CAS needs to wait for the rollback to
be completed. A common context makes passing `struct completion`
between load, init and stop easier.

This commit is part of a patch that will allow interrupting waits for
OCF operations.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 509 ++++++++++++---------
 1 file changed, 282 insertions(+), 227 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index 929abdb..5fabe33 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -273,25 +273,152 @@ static int _cache_mngt_core_flush_sync(ocf_core_t core, bool interruption)
 	return result;
 }
 
+static void _cache_mngt_cache_priv_deinit(ocf_cache_t cache)
+{
+	struct cache_priv *cache_priv = ocf_cache_get_priv(cache);
+
+	vfree(cache_priv);
+}
+
+static int exit_instance_finish(ocf_cache_t cache, int error)
+{
+	struct cache_priv *cache_priv;
+	ocf_queue_t mngt_queue;
+	bool flush_status;
+
+	flush_status = ocf_mngt_cache_is_dirty(cache);
+	cache_priv = ocf_cache_get_priv(cache);
+	mngt_queue = cache_priv->mngt_queue;
+
+	if (error && error != -OCF_ERR_WRITE_CACHE)
+		goto restore_exp_obj;
+
+	if (!error && flush_status)
+		error = -KCAS_ERR_STOPPED_DIRTY;
+
+	module_put(THIS_MODULE);
+
+	cas_cls_deinit(cache);
+
+	vfree(cache_priv);
+
+	ocf_mngt_cache_unlock(cache);
+	ocf_mngt_cache_put(cache);
+	ocf_queue_put(mngt_queue);
+
+	return error;
+
+restore_exp_obj:
+	if (block_dev_create_all_exported_objects(cache)) {
+		/* Print error msg but do not change return err code to inform user why
+		 * stop failed originally.
+		 */
+		printk(KERN_WARNING
+			"Failed to restore (create) all exported objects!\n");
+		goto unlock;
+	}
+	if (block_dev_activate_all_exported_objects(cache)) {
+		block_dev_destroy_all_exported_objects(cache);
+		printk(KERN_WARNING
+			"Failed to restore (activate) all exported objects!\n");
+	}
+unlock:
+	ocf_mngt_cache_unlock(cache);
+	ocf_mngt_cache_put(cache);
+	module_put(THIS_MODULE);
+	return error;
+}
+
+struct _cache_mngt_attach_context {
+	struct _cache_mngt_async_context async;
+	struct kcas_start_cache *cmd;
+	struct ocf_mngt_cache_device_config *device_cfg;
+	ocf_cache_t cache;
+	int ocf_start_error;
+	struct work_struct work;
+
+	struct {
+		bool priv_inited:1;
+		bool cls_inited:1;
+	};
+};
+
+
+static void cache_start_rollback(struct work_struct *work)
+{
+	struct cache_priv *cache_priv;
+	ocf_queue_t mngt_queue = NULL;
+	struct _cache_mngt_attach_context *ctx =
+		container_of(work, struct _cache_mngt_attach_context, work);
+	ocf_cache_t cache = ctx->cache;
+	int result;
+
+	if (ctx->cls_inited)
+		cas_cls_deinit(cache);
+
+	if (ctx->priv_inited) {
+		cache_priv = ocf_cache_get_priv(cache);
+		mngt_queue = cache_priv->mngt_queue;
+		_cache_mngt_cache_priv_deinit(cache);
+	}
+
+	ocf_mngt_cache_unlock(cache);
+
+	if (mngt_queue)
+		ocf_queue_put(mngt_queue);
+
+	module_put(THIS_MODULE);
+
+	result = _cache_mngt_async_callee_set_result(&ctx->async,
+			ctx->ocf_start_error);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(ctx);
+}
+
+static void _cache_mngt_cache_stop_rollback_complete(ocf_cache_t cache,
+		void *priv, int error)
+{
+	struct _cache_mngt_attach_context *ctx = priv;
+
+	if (error == -OCF_ERR_WRITE_CACHE)
+		printk(KERN_WARNING "Cannot save cache state\n");
+	else
+		BUG_ON(error);
+
+	INIT_WORK(&ctx->work, cache_start_rollback);
+	schedule_work(&ctx->work);
+}
+
 static void _cache_mngt_cache_stop_complete(ocf_cache_t cache, void *priv,
 		int error)
 {
-	struct _cache_mngt_sync_context *context = priv;
+	struct _cache_mngt_async_context *context = priv;
+	int result = exit_instance_finish(cache, error);
 
-	*context->result = error;
-	complete(&context->compl);
+	result = _cache_mngt_async_callee_set_result(context, result);
+
+	if (result == -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 }
 
 static int _cache_mngt_cache_stop_sync(ocf_cache_t cache)
 {
-	struct _cache_mngt_sync_context context;
-	int result;
+	struct _cache_mngt_async_context *context;
+	int result = 0;
 
-	init_completion(&context.compl);
-	context.result = &result;
+	context = env_malloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
 
-	ocf_mngt_cache_stop(cache, _cache_mngt_cache_stop_complete, &context);
-	wait_for_completion_interruptible(&context.compl);
+	_cache_mngt_async_context_init(context);
+
+	ocf_mngt_cache_stop(cache, _cache_mngt_cache_stop_complete, context);
+	result = wait_for_completion_interruptible(&context->cmpl);
+
+	result = _cache_mngt_async_caller_set_result(context, result);
+
+	if (context->result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
 
 	return result;
 }
@@ -1407,17 +1534,86 @@ err:
 	return result;
 }
 
-struct _cache_mngt_attach_context {
-	struct completion compl;
-	int *result;
-};
-
-static void _cache_mngt_attach_complete(ocf_cache_t cache, void *priv, int error)
+static void init_instance_complete(struct _cache_mngt_attach_context *ctx,
+		ocf_cache_t cache)
 {
-	struct _cache_mngt_attach_context *context = priv;
+	ocf_volume_t cache_obj;
+	struct bd_object *bd_cache_obj;
+	struct block_device *bdev;
+	const char *name;
 
-	*context->result = error;
-	complete(&context->compl);
+	cache_obj = ocf_cache_get_volume(cache);
+	BUG_ON(!cache_obj);
+
+	bd_cache_obj = bd_object(cache_obj);
+	bdev = bd_cache_obj->btm_bd;
+
+	/* If we deal with whole device, reread partitions */
+	if (bdev->bd_contains == bdev)
+		ioctl_by_bdev(bdev, BLKRRPART, (unsigned long)NULL);
+
+	/* Set other back information */
+	name = block_dev_get_elevator_name(
+			casdsk_disk_get_queue(bd_cache_obj->dsk));
+	if (name)
+		strlcpy(ctx->cmd->cache_elevator,
+				name, MAX_ELEVATOR_NAME);
+}
+
+static void _cache_mngt_start_complete(ocf_cache_t cache, void *priv,
+		int error);
+
+static void cache_start_finalize(struct work_struct *work)
+{
+	struct _cache_mngt_attach_context *ctx =
+		container_of(work, struct _cache_mngt_attach_context, work);
+	int result;
+	ocf_cache_t cache = ctx->cache;
+
+	result = cache_mngt_initialize_core_objects(cache);
+	if (result) {
+		ctx->ocf_start_error = result;
+		return _cache_mngt_start_complete(cache, ctx, result);
+	}
+
+	ocf_core_visit(cache, _cache_mngt_core_device_loaded_visitor,
+			NULL, false);
+
+	init_instance_complete(ctx, cache);
+
+	if (_cache_mngt_async_callee_set_result(&ctx->async, 0)) {
+		/* caller interrupted */
+		ctx->ocf_start_error = 0;
+		ocf_mngt_cache_stop(cache,
+				_cache_mngt_cache_stop_rollback_complete, ctx);
+		return;
+	}
+
+	ocf_mngt_cache_unlock(cache);
+}
+
+static void _cache_mngt_start_complete(ocf_cache_t cache, void *priv, int error)
+{
+	struct _cache_mngt_attach_context *ctx = priv;
+	int caller_status = _cache_mngt_async_callee_peek_result(&ctx->async);
+
+	if (caller_status || error) {
+		if (error == -OCF_ERR_NO_FREE_RAM) {
+			ocf_mngt_get_ram_needed(cache, ctx->device_cfg,
+					&ctx->cmd->min_free_ram);
+		} else if (caller_status == -KCAS_ERR_WAITING_INTERRUPTED) {
+			printk(KERN_WARNING "Cache added successfully, "
+					"but waiting interrupted. Rollback\n");
+		}
+		ctx->ocf_start_error = error;
+		ocf_mngt_cache_stop(cache, _cache_mngt_cache_stop_rollback_complete,
+				ctx);
+	} else {
+		_cache_mngt_log_cache_device_path(cache, ctx->device_cfg);
+
+		INIT_WORK(&ctx->work, cache_start_finalize);
+		schedule_work(&ctx->work);
+	}
 }
 
 static int _cache_mngt_cache_priv_init(ocf_cache_t cache)
@@ -1437,89 +1633,6 @@ static int _cache_mngt_cache_priv_init(ocf_cache_t cache)
 	return 0;
 }
 
-static void _cache_mngt_cache_priv_deinit(ocf_cache_t cache)
-{
-	struct cache_priv *cache_priv = ocf_cache_get_priv(cache);
-
-	vfree(cache_priv);
-}
-
-static int _cache_mngt_start(struct ocf_mngt_cache_config *cfg,
-	struct ocf_mngt_cache_device_config *device_cfg,
-	struct kcas_start_cache *cmd, ocf_cache_t *cache)
-{
-	struct _cache_mngt_attach_context context;
-	ocf_cache_t tmp_cache;
-	ocf_queue_t mngt_queue = NULL;
-	struct cache_priv *cache_priv;
-	int result;
-
-	result = ocf_mngt_cache_start(cas_ctx, &tmp_cache, cfg);
-	if (result)
-		return result;
-
-	result = _cache_mngt_cache_priv_init(tmp_cache);
-	BUG_ON(result);
-
-	/* Currently we can't recover without queues setup. OCF doesn't
-	 * support stopping cache when management queue isn't started.
-	 */
-
-	result = _cache_mngt_start_queues(tmp_cache);
-	BUG_ON(result);
-
-	/* Ditto */
-
-	cache_priv = ocf_cache_get_priv(tmp_cache);
-	mngt_queue = cache_priv->mngt_queue;
-
-	result = cas_cls_init(tmp_cache);
-	if (result)
-		goto err_classifier;
-
-	init_completion(&context.compl);
-	context.result = &result;
-
-	ocf_mngt_cache_attach(tmp_cache, device_cfg,
-			_cache_mngt_attach_complete, &context);
-
-	wait_for_completion_interruptible(&context.compl);
-	if (result)
-		goto err_attach;
-
-	_cache_mngt_log_cache_device_path(tmp_cache, device_cfg);
-
-	*cache = tmp_cache;
-
-	return 0;
-
-err_attach:
-	if (result == -OCF_ERR_NO_FREE_RAM && cmd) {
-		ocf_mngt_get_ram_needed(tmp_cache, device_cfg,
-				&cmd->min_free_ram);
-	}
-	cas_cls_deinit(tmp_cache);
-err_classifier:
-	_cache_mngt_cache_priv_deinit(tmp_cache);
-	_cache_mngt_cache_stop_sync(tmp_cache);
-	if (mngt_queue)
-		ocf_queue_put(mngt_queue);
-	ocf_mngt_cache_unlock(tmp_cache);
-	return result;
-}
-
-struct _cache_mngt_load_context {
-	struct completion compl;
-	int *result;
-};
-
-static void _cache_mngt_load_complete(ocf_cache_t cache, void *priv, int error)
-{
-	struct _cache_mngt_load_context *context = priv;
-
-	*context->result = error;
-	complete(&context->compl);
-}
-
 struct cache_mngt_check_metadata_context {
 	struct completion cmpl;
 	char *cache_name;
 	int *result;
 };
@@ -1580,77 +1693,86 @@ out_bdev:
 	return result;
 }
 
-static int _cache_mngt_load(struct ocf_mngt_cache_config *cfg,
+static int _cache_mngt_start(struct ocf_mngt_cache_config *cfg,
 	struct ocf_mngt_cache_device_config *device_cfg,
-	struct kcas_start_cache *cmd, ocf_cache_t *cache)
+	struct kcas_start_cache *cmd)
 {
-	struct _cache_mngt_load_context context;
-	ocf_cache_t tmp_cache;
+	struct _cache_mngt_attach_context *context;
+	ocf_cache_t cache;
 	ocf_queue_t mngt_queue = NULL;
 	struct cache_priv *cache_priv;
-	int result;
+	int result = 0, rollback_result = 0;
+	bool load = (cmd && cmd->init_cache == CACHE_INIT_LOAD);
 
-	result = _cache_mngt_check_metadata(cfg, cmd->cache_path_name);
-	if (result)
+	if (load)
+		result = _cache_mngt_check_metadata(cfg, cmd->cache_path_name);
+	if (result) {
+		module_put(THIS_MODULE);
 		return result;
+	}
 
-	result = ocf_mngt_cache_start(cas_ctx, &tmp_cache, cfg);
-	if (result)
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	context->device_cfg = device_cfg;
+	context->cmd = cmd;
+	_cache_mngt_async_context_init(&context->async);
+
+	/* Start cache. Returned cache instance will be locked as it was set
+	 * in configuration.
+	 */
+	result = ocf_mngt_cache_start(cas_ctx, &cache, cfg);
+	if (result) {
+		kfree(context);
+		module_put(THIS_MODULE);
 		return result;
+	}
+	context->cache = cache;
 
-	result = _cache_mngt_cache_priv_init(tmp_cache);
-	BUG_ON(result);
+	result = _cache_mngt_cache_priv_init(cache);
+	if (result)
+		goto err;
+	context->priv_inited = true;
 
-	/* Currently we can't recover without queues setup. OCF doesn't
-	 * support stopping cache when management queue isn't started.
-	 */
+	result = _cache_mngt_start_queues(cache);
+	if (result)
+		goto err;
 
-	result = _cache_mngt_start_queues(tmp_cache);
-	BUG_ON(result);
-
-	/* Ditto */
-
-	cache_priv = ocf_cache_get_priv(tmp_cache);
+	cache_priv = ocf_cache_get_priv(cache);
 	mngt_queue = cache_priv->mngt_queue;
 
-	init_completion(&context.compl);
-	context.result = &result;
-
-	ocf_mngt_cache_load(tmp_cache, device_cfg,
-			_cache_mngt_load_complete, &context);
-
-	wait_for_completion_interruptible(&context.compl);
+	result = cas_cls_init(cache);
 	if (result)
-		goto err_load;
+		goto err;
+	context->cls_inited = true;
 
-	_cache_mngt_log_cache_device_path(tmp_cache, device_cfg);
-
-	result = cas_cls_init(tmp_cache);
-	if (result)
-		goto err_load;
-
-	result = cache_mngt_initialize_core_objects(tmp_cache);
-	if (result)
-		goto err_core_obj;
-
-	ocf_core_visit(tmp_cache, _cache_mngt_core_device_loaded_visitor,
-			NULL, false);
-
-	*cache = tmp_cache;
-
-	return 0;
-
-err_core_obj:
-	cas_cls_deinit(tmp_cache);
-err_load:
-	if (result == -OCF_ERR_NO_FREE_RAM && cmd) {
-		ocf_mngt_get_ram_needed(tmp_cache, device_cfg,
-				&cmd->min_free_ram);
+	if (load) {
+		ocf_mngt_cache_load(cache, device_cfg,
+				_cache_mngt_start_complete, context);
+	} else {
+		ocf_mngt_cache_attach(cache, device_cfg,
+				_cache_mngt_start_complete, context);
 	}
-	_cache_mngt_cache_priv_deinit(tmp_cache);
-	_cache_mngt_cache_stop_sync(tmp_cache);
-	if (mngt_queue)
-		ocf_queue_put(mngt_queue);
-	ocf_mngt_cache_unlock(tmp_cache);
+	result = wait_for_completion_interruptible(&context->async.cmpl);
+
+	result = _cache_mngt_async_caller_set_result(&context->async, result);
+
+	if (result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
+
+	return result;
+err:
+	ocf_mngt_cache_stop(cache, _cache_mngt_cache_stop_rollback_complete,
+			context);
+	rollback_result = wait_for_completion_interruptible(&context->async.cmpl);
+
+	rollback_result = _cache_mngt_async_caller_set_result(&context->async,
+			rollback_result);
+
+	if (rollback_result != -KCAS_ERR_WAITING_INTERRUPTED)
+		kfree(context);
+
 	return result;
 }
 
@@ -1658,53 +1780,10 @@ int cache_mngt_init_instance(struct ocf_mngt_cache_config *cfg,
 	struct ocf_mngt_cache_device_config *device_cfg,
 	struct kcas_start_cache *cmd)
 {
-	ocf_cache_t cache = NULL;
-	const char *name;
-	bool load = (cmd && cmd->init_cache == CACHE_INIT_LOAD);
-	int result;
-
 	if (!try_module_get(THIS_MODULE))
 		return -KCAS_ERR_SYSTEM;
 
-	/* Start cache. Returned cache instance will be locked as it was set
-	 * in configuration.
-	 */
-	if (!load)
-		result = _cache_mngt_start(cfg, device_cfg, cmd, &cache);
-	else
-		result = _cache_mngt_load(cfg, device_cfg, cmd, &cache);
-
-	if (result) {
-		module_put(THIS_MODULE);
-		return result;
-	}
-
-	if (cmd) {
-		ocf_volume_t cache_obj;
-		struct bd_object *bd_cache_obj;
-		struct block_device *bdev;
-
-		cache_obj = ocf_cache_get_volume(cache);
-		BUG_ON(!cache_obj);
-
-		bd_cache_obj = bd_object(cache_obj);
-		bdev = bd_cache_obj->btm_bd;
-
-		/* If we deal with whole device, reread partitions */
-		if (bdev->bd_contains == bdev)
-			ioctl_by_bdev(bdev, BLKRRPART, (unsigned long)NULL);
-
-		/* Set other back information */
-		name = block_dev_get_elevator_name(
-				casdsk_disk_get_queue(bd_cache_obj->dsk));
-		if (name)
-			strlcpy(cmd->cache_elevator,
-					name, MAX_ELEVATOR_NAME);
-	}
-
-	ocf_mngt_cache_unlock(cache);
-
-	return 0;
+	return _cache_mngt_start(cfg, device_cfg, cmd);
 }
 
 /**
@@ -1884,6 +1963,7 @@
 int cache_mngt_exit_instance(const char *cache_name, size_t name_len, int flush)
 {
 	ocf_cache_t cache;
 	struct cache_priv *cache_priv;
+	ocf_queue_t mngt_queue;
 	int status, flush_status = 0;
 
 	/* Get cache */
@@ -1893,6 +1973,7 @@
 		return status;
 
 	cache_priv = ocf_cache_get_priv(cache);
+	mngt_queue = cache_priv->mngt_queue;
 
 	status = _cache_mngt_read_lock_sync(cache);
 	if (status)
@@ -1955,37 +2036,11 @@ int cache_mngt_exit_instance(const char *cache_name, size_t name_len, int flush)
 
 	/* Stop cache device */
 	status = _cache_mngt_cache_stop_sync(cache);
-	if (status && status != -OCF_ERR_WRITE_CACHE)
-		goto restore_exp_obj;
-
-	if (!status && flush_status)
-		status = -KCAS_ERR_STOPPED_DIRTY;
-
-	module_put(THIS_MODULE);
-
-	cas_cls_deinit(cache);
-
-	ocf_queue_put(cache_priv->mngt_queue);
-	vfree(cache_priv);
-
-	ocf_mngt_cache_unlock(cache);
-	ocf_mngt_cache_put(cache);
+	ocf_queue_put(mngt_queue);
 
 	return status;
 
-restore_exp_obj:
-	if (block_dev_create_all_exported_objects(cache)) {
-		/* Print error msg but do not change return err code to inform user why
-		 * stop failed originally. */
-		printk(KERN_WARNING
-			"Failed to restore (create) all exported objects!\n");
-		goto unlock;
-	}
-	if (block_dev_activate_all_exported_objects(cache)) {
-		block_dev_destroy_all_exported_objects(cache);
-		printk(KERN_WARNING
-			"Failed to restore (activate) all exported objects!\n");
-	}
 unlock:
 	ocf_mngt_cache_unlock(cache);
 put:
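Note: the rollback path above deliberately bounces through a workqueue. The stop-rollback completion runs in OCF's completion context and, as patch 03 established, tearing down queues and threads from that same context would self-deadlock; deferring to process context avoids that. The pattern, reduced (illustrative; do_teardown() is a hypothetical placeholder):

	static void rollback_work_fn(struct work_struct *work)
	{
		struct _cache_mngt_attach_context *ctx =
			container_of(work, struct _cache_mngt_attach_context,
					work);

		/* Process (workqueue) context: safe to unlock the cache,
		 * put the management queue and free resources that must
		 * not be torn down from the completion callback itself. */
		do_teardown(ctx);
	}

	static void on_stop_complete(ocf_cache_t cache, void *priv, int error)
	{
		struct _cache_mngt_attach_context *ctx = priv;

		/* Completion context: defer instead of tearing down inline. */
		INIT_WORK(&ctx->work, rollback_work_fn);
		schedule_work(&ctx->work);
	}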
From 36e34b5a690555902d8f452b025262dd1bb0a248 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Mon, 23 Dec 2019 11:09:43 -0500
Subject: [PATCH 11/13] Don't try to restore cache after stop error.

In the current OCF cache stop implementation no error should occur, so
there is no need to handle one in the adapter.

Signed-off-by: Michal Mielewczyk
---
 modules/cas_cache/layer_cache_management.c | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/modules/cas_cache/layer_cache_management.c b/modules/cas_cache/layer_cache_management.c
index 5fabe33..1642356 100644
--- a/modules/cas_cache/layer_cache_management.c
+++ b/modules/cas_cache/layer_cache_management.c
@@ -291,7 +291,7 @@ static int exit_instance_finish(ocf_cache_t cache, int error)
 	mngt_queue = cache_priv->mngt_queue;
 
 	if (error && error != -OCF_ERR_WRITE_CACHE)
-		goto restore_exp_obj;
+		BUG_ON(error);
 
 	if (!error && flush_status)
 		error = -KCAS_ERR_STOPPED_DIRTY;
@@ -307,25 +307,6 @@ static int exit_instance_finish(ocf_cache_t cache, int error)
 	ocf_queue_put(mngt_queue);
 
 	return error;
-
-restore_exp_obj:
-	if (block_dev_create_all_exported_objects(cache)) {
-		/* Print error msg but do not change return err code to inform user why
-		 * stop failed originally. */
-		printk(KERN_WARNING
-			"Failed to restore (create) all exported objects!\n");
-		goto unlock;
-	}
-	if (block_dev_activate_all_exported_objects(cache)) {
-		block_dev_destroy_all_exported_objects(cache);
-		printk(KERN_WARNING
-			"Failed to restore (activate) all exported objects!\n");
-	}
-unlock:
-	ocf_mngt_cache_unlock(cache);
-	ocf_mngt_cache_put(cache);
-	module_put(THIS_MODULE);
-	return error;
 }

From 2449ab0b1701898cc267f417fabe001dcc026112 Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Thu, 2 Jan 2020 05:09:44 -0500
Subject: [PATCH 12/13] Add extended error message to casadm.

If waiting for an operation to finish was interrupted, casadm should
print an informative error message.

Signed-off-by: Michal Mielewczyk
---
 casadm/extended_err_msg.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/casadm/extended_err_msg.c b/casadm/extended_err_msg.c
index 9d3b4fe..752eb43 100644
--- a/casadm/extended_err_msg.c
+++ b/casadm/extended_err_msg.c
@@ -257,6 +257,10 @@ struct {
 		KCAS_ERR_CLS_RULE_INVALID_SYNTAX,
 		"Invalid classification rule syntax"
 	},
+	{
+		KCAS_ERR_WAITING_INTERRUPTED,
+		"Waiting for operation interrupted"
+	},
 };

From 9a42ad730ba2295f5628fb443914ec7777ddb1fd Mon Sep 17 00:00:00 2001
From: Michal Mielewczyk
Date: Thu, 2 Jan 2020 09:25:19 -0500
Subject: [PATCH 13/13] Update OCF

Signed-off-by: Michal Mielewczyk
---
 ocf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocf b/ocf
index 3aa68bc..1ec0a5c 160000
--- a/ocf
+++ b/ocf
@@ -1 +1 @@
-Subproject commit 3aa68bcb15e513bb0a53934445fa9e4c75a77ba1
+Subproject commit 1ec0a5c053c2e5fb1d502b0aa052030838011278
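Note: on the casadm side, the new entry is resolved by scanning the error-message table that patch 12 extends. A lookup of roughly this shape (hypothetical sketch; the array and field names are invented, since the excerpt above shows only anonymous initializers):

	static const char *get_extended_err_msg(int err_code)
	{
		size_t i;
		size_t n = sizeof(ext_err_msg) / sizeof(ext_err_msg[0]);

		for (i = 0; i < n; i++) {
			if (ext_err_msg[i].err_code == err_code)
				return ext_err_msg[i].err_msg;
		}

		return NULL;	/* caller falls back to a generic message */
	}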