From: corwin <corwin@xxxxxxxxxx> This adds the source files that implement the dm-vdo target. Signed-off-by: corwin <corwin@xxxxxxxxxx> --- drivers/md/dm-vdo/action-manager.c | 410 ++ drivers/md/dm-vdo/action-manager.h | 117 + drivers/md/dm-vdo/admin-state.c | 512 +++ drivers/md/dm-vdo/admin-state.h | 180 + drivers/md/dm-vdo/block-map.c | 3388 +++++++++++++++++ drivers/md/dm-vdo/block-map.h | 391 ++ drivers/md/dm-vdo/completion.c | 141 + drivers/md/dm-vdo/completion.h | 155 + drivers/md/dm-vdo/constants.c | 15 + drivers/md/dm-vdo/constants.h | 102 + drivers/md/dm-vdo/data-vio.c | 2070 ++++++++++ drivers/md/dm-vdo/data-vio.h | 689 ++++ drivers/md/dm-vdo/dedupe.c | 3073 +++++++++++++++ drivers/md/dm-vdo/dedupe.h | 120 + drivers/md/dm-vdo/dump.c | 288 ++ drivers/md/dm-vdo/dump.h | 17 + drivers/md/dm-vdo/encodings.c | 1523 ++++++++ drivers/md/dm-vdo/encodings.h | 1307 +++++++ drivers/md/dm-vdo/flush.c | 563 +++ drivers/md/dm-vdo/flush.h | 44 + drivers/md/dm-vdo/int-map.c | 710 ++++ drivers/md/dm-vdo/int-map.h | 40 + drivers/md/dm-vdo/io-submitter.c | 483 +++ drivers/md/dm-vdo/io-submitter.h | 52 + drivers/md/dm-vdo/logical-zone.c | 378 ++ drivers/md/dm-vdo/logical-zone.h | 87 + drivers/md/dm-vdo/message-stats.c | 1222 ++++++ drivers/md/dm-vdo/message-stats.h | 13 + drivers/md/dm-vdo/packer.c | 794 ++++ drivers/md/dm-vdo/packer.h | 123 + drivers/md/dm-vdo/physical-zone.c | 650 ++++ drivers/md/dm-vdo/physical-zone.h | 115 + drivers/md/dm-vdo/pointer-map.c | 691 ++++ drivers/md/dm-vdo/pointer-map.h | 81 + drivers/md/dm-vdo/pool-sysfs-stats.c | 2063 ++++++++++ drivers/md/dm-vdo/pool-sysfs.c | 193 + drivers/md/dm-vdo/pool-sysfs.h | 19 + drivers/md/dm-vdo/priority-table.c | 226 ++ drivers/md/dm-vdo/priority-table.h | 48 + drivers/md/dm-vdo/recovery-journal.c | 1772 +++++++++ drivers/md/dm-vdo/recovery-journal.h | 313 ++ drivers/md/dm-vdo/release-versions.h | 20 + drivers/md/dm-vdo/repair.c | 1775 +++++++++ drivers/md/dm-vdo/repair.h | 14 + drivers/md/dm-vdo/slab-depot.c | 5212 ++++++++++++++++++++++++++ drivers/md/dm-vdo/slab-depot.h | 594 +++ drivers/md/dm-vdo/statistics.h | 279 ++ drivers/md/dm-vdo/status-codes.c | 127 + drivers/md/dm-vdo/status-codes.h | 112 + drivers/md/dm-vdo/sysfs.c | 84 + drivers/md/dm-vdo/types.h | 403 ++ drivers/md/dm-vdo/vdo.c | 1846 +++++++++ drivers/md/dm-vdo/vdo.h | 381 ++ drivers/md/dm-vdo/vio.c | 525 +++ drivers/md/dm-vdo/vio.h | 221 ++ drivers/md/dm-vdo/wait-queue.c | 223 ++ drivers/md/dm-vdo/wait-queue.h | 129 + drivers/md/dm-vdo/work-queue.c | 658 ++++ drivers/md/dm-vdo/work-queue.h | 53 + 59 files changed, 37834 insertions(+) create mode 100644 drivers/md/dm-vdo/action-manager.c create mode 100644 drivers/md/dm-vdo/action-manager.h create mode 100644 drivers/md/dm-vdo/admin-state.c create mode 100644 drivers/md/dm-vdo/admin-state.h create mode 100644 drivers/md/dm-vdo/block-map.c create mode 100644 drivers/md/dm-vdo/block-map.h create mode 100644 drivers/md/dm-vdo/completion.c create mode 100644 drivers/md/dm-vdo/completion.h create mode 100644 drivers/md/dm-vdo/constants.c create mode 100644 drivers/md/dm-vdo/constants.h create mode 100644 drivers/md/dm-vdo/data-vio.c create mode 100644 drivers/md/dm-vdo/data-vio.h create mode 100644 drivers/md/dm-vdo/dedupe.c create mode 100644 drivers/md/dm-vdo/dedupe.h create mode 100644 drivers/md/dm-vdo/dump.c create mode 100644 drivers/md/dm-vdo/dump.h create mode 100644 drivers/md/dm-vdo/encodings.c create mode 100644 drivers/md/dm-vdo/encodings.h create mode 100644 drivers/md/dm-vdo/flush.c create mode 100644 drivers/md/dm-vdo/flush.h create mode 100644 drivers/md/dm-vdo/int-map.c create mode 100644 drivers/md/dm-vdo/int-map.h create mode 100644 drivers/md/dm-vdo/io-submitter.c create mode 100644 drivers/md/dm-vdo/io-submitter.h create mode 100644 drivers/md/dm-vdo/logical-zone.c create mode 100644 drivers/md/dm-vdo/logical-zone.h create mode 100644 drivers/md/dm-vdo/message-stats.c create mode 100644 drivers/md/dm-vdo/message-stats.h create mode 100644 drivers/md/dm-vdo/packer.c create mode 100644 drivers/md/dm-vdo/packer.h create mode 100644 drivers/md/dm-vdo/physical-zone.c create mode 100644 drivers/md/dm-vdo/physical-zone.h create mode 100644 drivers/md/dm-vdo/pointer-map.c create mode 100644 drivers/md/dm-vdo/pointer-map.h create mode 100644 drivers/md/dm-vdo/pool-sysfs-stats.c create mode 100644 drivers/md/dm-vdo/pool-sysfs.c create mode 100644 drivers/md/dm-vdo/pool-sysfs.h create mode 100644 drivers/md/dm-vdo/priority-table.c create mode 100644 drivers/md/dm-vdo/priority-table.h create mode 100644 drivers/md/dm-vdo/recovery-journal.c create mode 100644 drivers/md/dm-vdo/recovery-journal.h create mode 100644 drivers/md/dm-vdo/release-versions.h create mode 100644 drivers/md/dm-vdo/repair.c create mode 100644 drivers/md/dm-vdo/repair.h create mode 100644 drivers/md/dm-vdo/slab-depot.c create mode 100644 drivers/md/dm-vdo/slab-depot.h create mode 100644 drivers/md/dm-vdo/statistics.h create mode 100644 drivers/md/dm-vdo/status-codes.c create mode 100644 drivers/md/dm-vdo/status-codes.h create mode 100644 drivers/md/dm-vdo/sysfs.c create mode 100644 drivers/md/dm-vdo/types.h create mode 100644 drivers/md/dm-vdo/vdo.c create mode 100644 drivers/md/dm-vdo/vdo.h create mode 100644 drivers/md/dm-vdo/vio.c create mode 100644 drivers/md/dm-vdo/vio.h create mode 100644 drivers/md/dm-vdo/wait-queue.c create mode 100644 drivers/md/dm-vdo/wait-queue.h create mode 100644 drivers/md/dm-vdo/work-queue.c create mode 100644 drivers/md/dm-vdo/work-queue.h diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c new file mode 100644 index 00000000000..76880677aaf --- /dev/null +++ b/drivers/md/dm-vdo/action-manager.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "action-manager.h" + +#include "memory-alloc.h" +#include "permassert.h" + +#include "admin-state.h" +#include "completion.h" +#include "status-codes.h" +#include "types.h" +#include "vdo.h" + +/** + * struct action - An action to be performed in each of a set of zones. + * @in_use: Whether this structure is in use. + * @operation: The admin operation associated with this action. + * @preamble: The method to run on the initiator thread before the action is applied to each zone. + * @zone_action: The action to be performed in each zone. + * @conclusion: The method to run on the initiator thread after the action is applied to each zone. + * @parent: The object to notify when the action is complete. + * @context: The action specific context. + * @next: The action to perform after this one. + */ +struct action { + bool in_use; + const struct admin_state_code *operation; + vdo_action_preamble *preamble; + vdo_zone_action *zone_action; + vdo_action_conclusion *conclusion; + struct vdo_completion *parent; + void *context; + struct action *next; +}; + +/** + * struct action_manager - Definition of an action manager. + * @completion: The completion for performing actions. + * @state: The state of this action manager. + * @actions: The two action slots. + * @current_action: The current action slot. + * @zones: The number of zones in which an action is to be applied. + * @Scheduler: A function to schedule a default next action. + * @get_zone_thread_id: A function to get the id of the thread on which to apply an action to a + * zone. + * @initiator_thread_id: The ID of the thread on which actions may be initiated. + * @context: Opaque data associated with this action manager. + * @acting_zone: The zone currently being acted upon. + */ +struct action_manager { + struct vdo_completion completion; + struct admin_state state; + struct action actions[2]; + struct action *current_action; + zone_count_t zones; + vdo_action_scheduler *scheduler; + vdo_zone_thread_getter *get_zone_thread_id; + thread_id_t initiator_thread_id; + void *context; + zone_count_t acting_zone; +}; + +static inline struct action_manager *as_action_manager(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_ACTION_COMPLETION); + return container_of(completion, struct action_manager, completion); +} + +/* Implements vdo_action_scheduler. */ +static bool no_default_action(void *context __always_unused) +{ + return false; +} + +/* Implements vdo_action_preamble. */ +static void no_preamble(void *context __always_unused, + struct vdo_completion *completion) +{ + vdo_finish_completion(completion); +} + +/* Implements vdo_action_conclusion. */ +static int no_conclusion(void *context __always_unused) +{ + return VDO_SUCCESS; +} + +/** + * vdo_make_action_manager() - Make an action manager. + * @zones: The number of zones to which actions will be applied. + * @get_zone_thread_id: A function to get the thread id associated with a zone. + * @initiator_thread_id: The thread on which actions may initiated. + * @context: The object which holds the per-zone context for the action. + * @scheduler: A function to schedule a next action after an action concludes if there is no + * pending action (may be NULL). + * @vdo: The vdo used to initialize completions. + * @manager_ptr: A pointer to hold the new action manager. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_action_manager(zone_count_t zones, + vdo_zone_thread_getter *get_zone_thread_id, + thread_id_t initiator_thread_id, + void *context, + vdo_action_scheduler *scheduler, + struct vdo *vdo, + struct action_manager **manager_ptr) +{ + struct action_manager *manager; + int result = UDS_ALLOCATE(1, struct action_manager, __func__, &manager); + + if (result != VDO_SUCCESS) + return result; + + *manager = (struct action_manager) { + .zones = zones, + .scheduler = + ((scheduler == NULL) ? no_default_action : scheduler), + .get_zone_thread_id = get_zone_thread_id, + .initiator_thread_id = initiator_thread_id, + .context = context, + }; + + manager->actions[0].next = &manager->actions[1]; + manager->current_action = manager->actions[1].next = + &manager->actions[0]; + vdo_set_admin_state_code(&manager->state, + VDO_ADMIN_STATE_NORMAL_OPERATION); + vdo_initialize_completion(&manager->completion, vdo, + VDO_ACTION_COMPLETION); + *manager_ptr = manager; + return VDO_SUCCESS; +} + +const struct admin_state_code *vdo_get_current_manager_operation(struct action_manager *manager) +{ + return vdo_get_admin_state_code(&manager->state); +} + +void *vdo_get_current_action_context(struct action_manager *manager) +{ + return manager->current_action->in_use ? manager->current_action->context : NULL; +} + +static void finish_action_callback(struct vdo_completion *completion); +static void apply_to_zone(struct vdo_completion *completion); + +static thread_id_t get_acting_zone_thread_id(struct action_manager *manager) +{ + return manager->get_zone_thread_id(manager->context, manager->acting_zone); +} + +static void preserve_error(struct vdo_completion *completion) +{ + if (completion->parent != NULL) + vdo_set_completion_result(completion->parent, completion->result); + + vdo_reset_completion(completion); + vdo_run_completion(completion); +} + +static void prepare_for_next_zone(struct action_manager *manager) +{ + vdo_prepare_completion_for_requeue(&manager->completion, + apply_to_zone, + preserve_error, + get_acting_zone_thread_id(manager), + manager->current_action->parent); +} + +static void prepare_for_conclusion(struct action_manager *manager) +{ + vdo_prepare_completion_for_requeue(&manager->completion, + finish_action_callback, + preserve_error, + manager->initiator_thread_id, + manager->current_action->parent); +} + +static void apply_to_zone(struct vdo_completion *completion) +{ + zone_count_t zone; + struct action_manager *manager = as_action_manager(completion); + + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == get_acting_zone_thread_id(manager)), + "%s() called on acting zones's thread", + __func__); + + zone = manager->acting_zone++; + if (manager->acting_zone == manager->zones) + /* + * We are about to apply to the last zone. Once that is finished, we're done, so go + * back to the initiator thread and finish up. + */ + prepare_for_conclusion(manager); + else + /* Prepare to come back on the next zone */ + prepare_for_next_zone(manager); + + manager->current_action->zone_action(manager->context, zone, completion); +} + +static void handle_preamble_error(struct vdo_completion *completion) +{ + /* Skip the zone actions since the preamble failed. */ + completion->callback = finish_action_callback; + preserve_error(completion); +} + +static void launch_current_action(struct action_manager *manager) +{ + struct action *action = manager->current_action; + int result = vdo_start_operation(&manager->state, action->operation); + + if (result != VDO_SUCCESS) { + if (action->parent != NULL) + vdo_set_completion_result(action->parent, result); + + /* We aren't going to run the preamble, so don't run the conclusion */ + action->conclusion = no_conclusion; + finish_action_callback(&manager->completion); + return; + } + + if (action->zone_action == NULL) { + prepare_for_conclusion(manager); + } else { + manager->acting_zone = 0; + vdo_prepare_completion_for_requeue(&manager->completion, + apply_to_zone, + handle_preamble_error, + get_acting_zone_thread_id(manager), + manager->current_action->parent); + } + + action->preamble(manager->context, &manager->completion); +} + +/** + * vdo_schedule_default_action() - Attempt to schedule the default action. + * @manager: The action manager. + * + * If the manager is not operating normally, the action will not be scheduled. + * + * Return: true if an action was scheduled. + */ +bool vdo_schedule_default_action(struct action_manager *manager) +{ + /* Don't schedule a default action if we are operating or not in normal operation. */ + const struct admin_state_code *code = vdo_get_current_manager_operation(manager); + + return ((code == VDO_ADMIN_STATE_NORMAL_OPERATION) && + manager->scheduler(manager->context)); +} + +static void finish_action_callback(struct vdo_completion *completion) +{ + bool has_next_action; + int result; + struct action_manager *manager = as_action_manager(completion); + struct action action = *(manager->current_action); + + manager->current_action->in_use = false; + manager->current_action = manager->current_action->next; + + /* + * We need to check this now to avoid use-after-free issues if running the conclusion or + * notifying the parent results in the manager being freed. + */ + has_next_action = + (manager->current_action->in_use || vdo_schedule_default_action(manager)); + result = action.conclusion(manager->context); + vdo_finish_operation(&manager->state, VDO_SUCCESS); + if (action.parent != NULL) + vdo_continue_completion(action.parent, result); + + if (has_next_action) + launch_current_action(manager); +} + +/** + * vdo_schedule_action() - Schedule an action to be applied to all zones. + * @manager: The action manager to schedule the action on. + * @preamble: A method to be invoked on the initiator thread once this action is started but before + * applying to each zone; may be NULL. + * @action: The action to apply to each zone; may be NULL. + * @conclusion: A method to be invoked back on the initiator thread once the action has been + * applied to all zones; may be NULL. + * @parent: The object to notify once the action is complete or if the action can not be scheduled; + * may be NULL. + * + * The action will be launched immediately if there is no current action, or as soon as the current + * action completes. If there is already a pending action, this action will not be scheduled, and, + * if it has a parent, that parent will be notified. At least one of the preamble, action, or + * conclusion must not be NULL. + * + * Return: true if the action was scheduled. + */ +bool vdo_schedule_action(struct action_manager *manager, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent) +{ + return vdo_schedule_operation(manager, + VDO_ADMIN_STATE_OPERATING, + preamble, + action, + conclusion, + parent); +} + +/** + * vdo_schedule_operation() - Schedule an operation to be applied to all zones. + * @manager: The action manager to schedule the action on. + * @operation: The operation this action will perform + * @preamble: A method to be invoked on the initiator thread once this action is started but before + * applying to each zone; may be NULL. + * @action: The action to apply to each zone; may be NULL. + * @conclusion: A method to be invoked back on the initiator thread once the action has been + * applied to all zones; may be NULL. + * @parent: The object to notify once the action is complete or if the action can not be scheduled; + * may be NULL. + * + * The operation's action will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this operation will not be + * scheduled, and, if it has a parent, that parent will be notified. At least one of the preamble, + * action, or conclusion must not be NULL. + * + * Return: true if the action was scheduled. + */ +bool vdo_schedule_operation(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent) +{ + return vdo_schedule_operation_with_context(manager, + operation, + preamble, + action, + conclusion, + NULL, + parent); +} + +/** + * vdo_schedule_operation_with_context() - Schedule an operation on all zones. + * @manager: The action manager to schedule the action on. + * @operation: The operation this action will perform. + * @preamble: A method to be invoked on the initiator thread once this action is started but before + * applying to each zone; may be NULL. + * @action: The action to apply to each zone; may be NULL. + * @conclusion: A method to be invoked back on the initiator thread once the action has been + * applied to all zones; may be NULL. + * @context: An action-specific context which may be retrieved via + * vdo_get_current_action_context(); may be NULL. + * @parent: The object to notify once the action is complete or if the action can not be scheduled; + * may be NULL. + * + * The operation's action will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this operation will not be + * scheduled, and, if it has a parent, that parent will be notified. At least one of the preamble, + * action, or conclusion must not be NULL. + * + * Return: true if the action was scheduled + */ +bool vdo_schedule_operation_with_context(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + void *context, + struct vdo_completion *parent) +{ + struct action *current_action; + + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == manager->initiator_thread_id), + "action initiated from correct thread"); + if (!manager->current_action->in_use) { + current_action = manager->current_action; + } else if (!manager->current_action->next->in_use) { + current_action = manager->current_action->next; + } else { + if (parent != NULL) + vdo_continue_completion(parent, VDO_COMPONENT_BUSY); + + return false; + } + + *current_action = (struct action) { + .in_use = true, + .operation = operation, + .preamble = (preamble == NULL) ? no_preamble : preamble, + .zone_action = action, + .conclusion = (conclusion == NULL) ? no_conclusion : conclusion, + .context = context, + .parent = parent, + .next = current_action->next, + }; + + if (current_action == manager->current_action) + launch_current_action(manager); + + return true; +} diff --git a/drivers/md/dm-vdo/action-manager.h b/drivers/md/dm-vdo/action-manager.h new file mode 100644 index 00000000000..a04c6c0d43b --- /dev/null +++ b/drivers/md/dm-vdo/action-manager.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_ACTION_MANAGER_H +#define VDO_ACTION_MANAGER_H + +#include "admin-state.h" +#include "types.h" + +/* + * An action_manager provides a generic mechanism for applying actions to multi-zone entities (such + * as the block map or slab depot). Each action manager is tied to a specific context for which it + * manages actions. The manager ensures that only one action is active on that context at a time, + * and supports at most one pending action. Calls to schedule an action when there is already a + * pending action will result in VDO_COMPONENT_BUSY errors. Actions may only be submitted to the + * action manager from a single thread (which thread is determined when the action manager is + * constructed). + * + * A scheduled action consists of four components: + * + * preamble + * an optional method to be run on the initator thread before applying the action to all zones + * zone_action + * an optional method to be applied to each of the zones + * conclusion + * an optional method to be run on the initiator thread once the per-zone method has been + * applied to all zones + * parent + * an optional completion to be finished once the conclusion is done + * + * At least one of the three methods must be provided. + */ + +/* + * A function which is to be applied asynchronously to a set of zones. + * @context: The object which holds the per-zone context for the action + * @zone_number: The number of zone to which the action is being applied + * @parent: The object to notify when the action is complete + */ +typedef void vdo_zone_action(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +/* + * A function which is to be applied asynchronously on an action manager's initiator thread as the + * preamble of an action. + * @context: The object which holds the per-zone context for the action + * @parent: The object to notify when the action is complete + */ +typedef void vdo_action_preamble(void *context, struct vdo_completion *parent); + +/* + * A function which will run on the action manager's initiator thread as the conclusion of an + * action. + * @context: The object which holds the per-zone context for the action + * + * Return: VDO_SUCCESS or an error + */ +typedef int vdo_action_conclusion(void *context); + +/* + * A function to schedule an action. + * @context: The object which holds the per-zone context for the action + * + * Return: true if an action was scheduled + */ +typedef bool vdo_action_scheduler(void *context); + +/* + * A function to get the id of the thread associated with a given zone. + * @context: The action context + * @zone_number: The number of the zone for which the thread ID is desired + */ +typedef thread_id_t vdo_zone_thread_getter(void *context, zone_count_t zone_number); + +struct action_manager; + +int __must_check +vdo_make_action_manager(zone_count_t zones, + vdo_zone_thread_getter *get_zone_thread_id, + thread_id_t initiator_thread_id, + void *context, + vdo_action_scheduler *scheduler, + struct vdo *vdo, + struct action_manager **manager_ptr); + +const struct admin_state_code *__must_check +vdo_get_current_manager_operation(struct action_manager *manager); + +void * __must_check vdo_get_current_action_context(struct action_manager *manager); + +bool vdo_schedule_default_action(struct action_manager *manager); + +bool vdo_schedule_action(struct action_manager *manager, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent); + +bool vdo_schedule_operation(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent); + +bool vdo_schedule_operation_with_context(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + void *context, + struct vdo_completion *parent); + +#endif /* VDO_ACTION_MANAGER_H */ diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c new file mode 100644 index 00000000000..87cb0e28369 --- /dev/null +++ b/drivers/md/dm-vdo/admin-state.c @@ -0,0 +1,512 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "admin-state.h" + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "completion.h" +#include "types.h" + +static const struct admin_state_code VDO_CODE_NORMAL_OPERATION = { + .name = "VDO_ADMIN_STATE_NORMAL_OPERATION", + .normal = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION = &VDO_CODE_NORMAL_OPERATION; +static const struct admin_state_code VDO_CODE_OPERATING = { + .name = "VDO_ADMIN_STATE_OPERATING", + .normal = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_OPERATING = &VDO_CODE_OPERATING; +static const struct admin_state_code VDO_CODE_FORMATTING = { + .name = "VDO_ADMIN_STATE_FORMATTING", + .operating = true, + .loading = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING = &VDO_CODE_FORMATTING; +static const struct admin_state_code VDO_CODE_PRE_LOADING = { + .name = "VDO_ADMIN_STATE_PRE_LOADING", + .operating = true, + .loading = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING = &VDO_CODE_PRE_LOADING; +static const struct admin_state_code VDO_CODE_PRE_LOADED = { + .name = "VDO_ADMIN_STATE_PRE_LOADED", +}; +const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED = &VDO_CODE_PRE_LOADED; +static const struct admin_state_code VDO_CODE_LOADING = { + .name = "VDO_ADMIN_STATE_LOADING", + .normal = true, + .operating = true, + .loading = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_LOADING = &VDO_CODE_LOADING; +static const struct admin_state_code VDO_CODE_LOADING_FOR_RECOVERY = { + .name = "VDO_ADMIN_STATE_LOADING_FOR_RECOVERY", + .operating = true, + .loading = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY = + &VDO_CODE_LOADING_FOR_RECOVERY; +static const struct admin_state_code VDO_CODE_LOADING_FOR_REBUILD = { + .name = "VDO_ADMIN_STATE_LOADING_FOR_REBUILD", + .operating = true, + .loading = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD = &VDO_CODE_LOADING_FOR_REBUILD; +static const struct admin_state_code VDO_CODE_WAITING_FOR_RECOVERY = { + .name = "VDO_ADMIN_STATE_WAITING_FOR_RECOVERY", + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY = + &VDO_CODE_WAITING_FOR_RECOVERY; +static const struct admin_state_code VDO_CODE_NEW = { + .name = "VDO_ADMIN_STATE_NEW", + .quiescent = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_NEW = &VDO_CODE_NEW; +static const struct admin_state_code VDO_CODE_INITIALIZED = { + .name = "VDO_ADMIN_STATE_INITIALIZED", +}; +const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED = &VDO_CODE_INITIALIZED; +static const struct admin_state_code VDO_CODE_RECOVERING = { + .name = "VDO_ADMIN_STATE_RECOVERING", + .draining = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING = &VDO_CODE_RECOVERING; +static const struct admin_state_code VDO_CODE_REBUILDING = { + .name = "VDO_ADMIN_STATE_REBUILDING", + .draining = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING = &VDO_CODE_REBUILDING; +static const struct admin_state_code VDO_CODE_SAVING = { + .name = "VDO_ADMIN_STATE_SAVING", + .draining = true, + .quiescing = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SAVING = &VDO_CODE_SAVING; +static const struct admin_state_code VDO_CODE_SAVED = { + .name = "VDO_ADMIN_STATE_SAVED", + .quiescent = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SAVED = &VDO_CODE_SAVED; +static const struct admin_state_code VDO_CODE_SCRUBBING = { + .name = "VDO_ADMIN_STATE_SCRUBBING", + .draining = true, + .loading = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING = &VDO_CODE_SCRUBBING; +static const struct admin_state_code VDO_CODE_SAVE_FOR_SCRUBBING = { + .name = "VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING", + .draining = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING = &VDO_CODE_SAVE_FOR_SCRUBBING; +static const struct admin_state_code VDO_CODE_STOPPING = { + .name = "VDO_ADMIN_STATE_STOPPING", + .draining = true, + .quiescing = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_STOPPING = &VDO_CODE_STOPPING; +static const struct admin_state_code VDO_CODE_STOPPED = { + .name = "VDO_ADMIN_STATE_STOPPED", + .quiescent = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_STOPPED = &VDO_CODE_STOPPED; +static const struct admin_state_code VDO_CODE_SUSPENDING = { + .name = "VDO_ADMIN_STATE_SUSPENDING", + .draining = true, + .quiescing = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING = &VDO_CODE_SUSPENDING; +static const struct admin_state_code VDO_CODE_SUSPENDED = { + .name = "VDO_ADMIN_STATE_SUSPENDED", + .quiescent = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED = &VDO_CODE_SUSPENDED; +static const struct admin_state_code VDO_CODE_SUSPENDED_OPERATION = { + .name = "VDO_ADMIN_STATE_SUSPENDED_OPERATION", + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION = &VDO_CODE_SUSPENDED_OPERATION; +static const struct admin_state_code VDO_CODE_RESUMING = { + .name = "VDO_ADMIN_STATE_RESUMING", + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_RESUMING = &VDO_CODE_RESUMING; + +/** + * get_next_state() - Determine the state which should be set after a given operation completes + * based on the operation and the current state. + * @operation The operation to be started. + * + * Return: The state to set when the operation completes or NULL if the operation can not be + * started in the current state. + */ +static const struct admin_state_code * +get_next_state(const struct admin_state *state, const struct admin_state_code *operation) +{ + const struct admin_state_code *code = vdo_get_admin_state_code(state); + + if (code->operating) + return NULL; + + if (operation == VDO_ADMIN_STATE_SAVING) + return (code == VDO_ADMIN_STATE_NORMAL_OPERATION ? VDO_ADMIN_STATE_SAVED : NULL); + + if (operation == VDO_ADMIN_STATE_SUSPENDING) + return (code == VDO_ADMIN_STATE_NORMAL_OPERATION + ? VDO_ADMIN_STATE_SUSPENDED + : NULL); + + if (operation == VDO_ADMIN_STATE_STOPPING) + return (code == VDO_ADMIN_STATE_NORMAL_OPERATION ? VDO_ADMIN_STATE_STOPPED : NULL); + + if (operation == VDO_ADMIN_STATE_PRE_LOADING) + return (code == VDO_ADMIN_STATE_INITIALIZED ? VDO_ADMIN_STATE_PRE_LOADED : NULL); + + if (operation == VDO_ADMIN_STATE_SUSPENDED_OPERATION) + return (((code == VDO_ADMIN_STATE_SUSPENDED) || + (code == VDO_ADMIN_STATE_SAVED)) ? code : NULL); + + return VDO_ADMIN_STATE_NORMAL_OPERATION; +} + +/** + * vdo_finish_operation() - Finish the current operation. + * + * Will notify the operation waiter if there is one. This method should be used for operations + * started with vdo_start_operation(). For operations which were started with vdo_start_draining(), + * use vdo_finish_draining() instead. + * + * Return: true if there was an operation to finish. + */ +bool vdo_finish_operation(struct admin_state *state, int result) +{ + if (!vdo_get_admin_state_code(state)->operating) + return false; + + state->complete = state->starting; + if (state->waiter != NULL) + vdo_set_completion_result(state->waiter, result); + + if (!state->starting) { + vdo_set_admin_state_code(state, state->next_state); + if (state->waiter != NULL) + vdo_launch_completion(UDS_FORGET(state->waiter)); + } + + return true; +} + +/** + * begin_operation() - Begin an operation if it may be started given the current state. + * @waiter A completion to notify when the operation is complete; may be NULL. + * @initiator The vdo_admin_initiator to call if the operation may begin; may be NULL. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check begin_operation(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator) +{ + int result; + const struct admin_state_code *next_state = get_next_state(state, operation); + + if (next_state == NULL) { + result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, + "Can't start %s from %s", + operation->name, + vdo_get_admin_state_code(state)->name); + } else if (state->waiter != NULL) { + result = uds_log_error_strerror(VDO_COMPONENT_BUSY, + "Can't start %s with extant waiter", + operation->name); + } else { + state->waiter = waiter; + state->next_state = next_state; + vdo_set_admin_state_code(state, operation); + if (initiator != NULL) { + state->starting = true; + initiator(state); + state->starting = false; + if (state->complete) + vdo_finish_operation(state, VDO_SUCCESS); + } + + return VDO_SUCCESS; + } + + if (waiter != NULL) + vdo_continue_completion(waiter, result); + + return result; +} + +/** + * start_operation() - Start an operation if it may be started given the current state. + * @waiter A completion to notify when the operation is complete. + * @initiator The vdo_admin_initiator to call if the operation may begin; may be NULL. + * + * Return: true if the operation was started. + */ +static inline bool __must_check start_operation(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator) +{ + return (begin_operation(state, operation, waiter, initiator) == VDO_SUCCESS); +} + +/** + * check_code() - Check the result of a state validation. + * @valid true if the code is of an appropriate type. + * @code The code which failed to be of the correct type. + * @what What the code failed to be, for logging. + * @waiter The completion to notify of the error; may be NULL. + * + * If the result failed, log an invalid state error and, if there is a waiter, notify it. + * + * Return: The result of the check. + */ +static bool check_code(bool valid, + const struct admin_state_code *code, + const char *what, + struct vdo_completion *waiter) +{ + int result; + + if (valid) + return true; + + result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, + "%s is not a %s", code->name, what); + if (waiter != NULL) + vdo_continue_completion(waiter, result); + + return false; +} + +/** + * vdo_drain_operation() - Check that an operation is a drain. + * @waiter The completion to finish with an error if the operation is not a drain. + * + * Return: true if the specified operation is a drain. + */ +static bool __must_check +assert_vdo_drain_operation(const struct admin_state_code *operation, struct vdo_completion *waiter) +{ + return check_code(operation->draining, operation, "drain operation", waiter); +} + +/** + * vdo_start_draining() - Initiate a drain operation if the current state permits it. + * @operation The type of drain to initiate. + * @waiter The completion to notify when the drain is complete. + * @initiator The vdo_admin_initiator to call if the operation may begin; may be NULL. + * + * Return: true if the drain was initiated, if not the waiter will be notified. + */ +bool vdo_start_draining(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator) +{ + const struct admin_state_code *code = vdo_get_admin_state_code(state); + + if (!assert_vdo_drain_operation(operation, waiter)) + return false; + + if (code->quiescent) { + vdo_launch_completion(waiter); + return false; + } + + if (!code->normal) { + uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, + "can't start %s from %s", + operation->name, + code->name); + vdo_continue_completion(waiter, VDO_INVALID_ADMIN_STATE); + return false; + } + + return start_operation(state, operation, waiter, initiator); +} + +/** + * vdo_finish_draining() - Finish a drain operation if one was in progress. + * + * Return: true if the state was draining; will notify the waiter if so. + */ +bool vdo_finish_draining(struct admin_state *state) +{ + return vdo_finish_draining_with_result(state, VDO_SUCCESS); +} + +/** + * vdo_finish_draining_with_result() - Finish a drain operation with a status code. + * + * Return: true if the state was draining; will notify the waiter if so. + */ +bool vdo_finish_draining_with_result(struct admin_state *state, int result) +{ + return (vdo_is_state_draining(state) && vdo_finish_operation(state, result)); +} + +/** + * vdo_assert_load_operation() - Check that an operation is a load. + * @waiter The completion to finish with an error if the operation is not a load. + * + * Return: true if the specified operation is a load. + */ +bool vdo_assert_load_operation(const struct admin_state_code *operation, + struct vdo_completion *waiter) +{ + return check_code(operation->loading, operation, "load operation", waiter); +} + +/** + * vdo_start_loading() - Initiate a load operation if the current state permits it. + * @operation The type of load to initiate. + * @waiter The completion to notify when the load is complete (may be NULL). + * @initiator The vdo_admin_initiator to call if the operation may begin; may be NULL. + * + * Return: true if the load was initiated, if not the waiter will be notified. + */ +bool vdo_start_loading(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator) +{ + return (vdo_assert_load_operation(operation, waiter) && + start_operation(state, operation, waiter, initiator)); +} + +/** + * vdo_finish_loading() - Finish a load operation if one was in progress. + * + * Return: true if the state was loading; will notify the waiter if so. + */ +bool vdo_finish_loading(struct admin_state *state) +{ + return vdo_finish_loading_with_result(state, VDO_SUCCESS); +} + +/** + * vdo_finish_loading_with_result() - Finish a load operation with a status code. + * @result The result of the load operation. + * + * Return: true if the state was loading; will notify the waiter if so. + */ +bool vdo_finish_loading_with_result(struct admin_state *state, int result) +{ + return (vdo_is_state_loading(state) && vdo_finish_operation(state, result)); +} + +/** + * assert_vdo_resume_operation() - Check whether an admin_state_code is a resume operation. + * @waiter The completion to notify if the operation is not a resume operation; may be NULL. + * + * Return: true if the code is a resume operation. + */ +static bool __must_check assert_vdo_resume_operation(const struct admin_state_code *operation, + struct vdo_completion *waiter) +{ + return check_code(operation == VDO_ADMIN_STATE_RESUMING, + operation, + "resume operation", + waiter); +} + +/** + * vdo_start_resuming() - Initiate a resume operation if the current state permits it. + * @operation The type of resume to start. + * @waiter The completion to notify when the resume is complete (may be NULL). + * @initiator The vdo_admin_initiator to call if the operation may begin; may be NULL. + * + * Return: true if the resume was initiated, if not the waiter will be notified. + */ +bool vdo_start_resuming(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator) +{ + return (assert_vdo_resume_operation(operation, waiter) && + start_operation(state, operation, waiter, initiator)); +} + +/** + * vdo_finish_resuming() - Finish a resume operation if one was in progress. + * + * Return: true if the state was resuming; will notify the waiter if so. + */ +bool vdo_finish_resuming(struct admin_state *state) +{ + return vdo_finish_resuming_with_result(state, VDO_SUCCESS); +} + +/** + * vdo_finish_resuming_with_result() - Finish a resume operation with a status code. + * @result The result of the resume operation. + * + * Return: true if the state was resuming; will notify the waiter if so. + */ +bool vdo_finish_resuming_with_result(struct admin_state *state, int result) +{ + return (vdo_is_state_resuming(state) && vdo_finish_operation(state, result)); +} + +/** + * vdo_resume_if_quiescent() - Change the state to normal operation if the current state is + * quiescent. + * + * Return: VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise. + */ +int vdo_resume_if_quiescent(struct admin_state *state) +{ + if (!vdo_is_state_quiescent(state)) + return VDO_INVALID_ADMIN_STATE; + + vdo_set_admin_state_code(state, VDO_ADMIN_STATE_NORMAL_OPERATION); + return VDO_SUCCESS; +} + +/** + * vdo_start_operation() - Attempt to start an operation. + * + * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not + */ +int vdo_start_operation(struct admin_state *state, const struct admin_state_code *operation) +{ + return vdo_start_operation_with_waiter(state, operation, NULL, NULL); +} + +/** + * vdo_start_operation_with_waiter() - Attempt to start an operation. + * @waiter the completion to notify when the operation completes or fails to start; may be NULL. + * @initiator The vdo_admin_initiator to call if the operation may begin; may be NULL. + * + * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not + */ +int vdo_start_operation_with_waiter(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator) +{ + return (check_code(operation->operating, operation, "operation", waiter) ? + begin_operation(state, operation, waiter, initiator) : + VDO_INVALID_ADMIN_STATE); +} diff --git a/drivers/md/dm-vdo/admin-state.h b/drivers/md/dm-vdo/admin-state.h new file mode 100644 index 00000000000..925211a8a76 --- /dev/null +++ b/drivers/md/dm-vdo/admin-state.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_ADMIN_STATE_H +#define VDO_ADMIN_STATE_H + +#include "completion.h" +#include "types.h" + +struct admin_state_code { + const char *name; + /* Normal operation, data_vios may be active */ + bool normal; + /* I/O is draining, new requests should not start */ + bool draining; + /* This is a startup time operation */ + bool loading; + /* The next state will be quiescent */ + bool quiescing; + /* The VDO is quiescent, there should be no I/O */ + bool quiescent; + /* Whether an operation is in progress and so no other operation may be started */ + bool operating; +}; + +extern const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION; +extern const struct admin_state_code *VDO_ADMIN_STATE_OPERATING; +extern const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING; +extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING; +extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED; +extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING; +extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY; +extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD; +extern const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY; +extern const struct admin_state_code *VDO_ADMIN_STATE_NEW; +extern const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED; +extern const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING; +extern const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SAVING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SAVED; +extern const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING; +extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPING; +extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPED; +extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED; +extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION; +extern const struct admin_state_code *VDO_ADMIN_STATE_RESUMING; + +struct admin_state { + const struct admin_state_code *current_state; + /* The next administrative state (when the current operation finishes) */ + const struct admin_state_code *next_state; + /* A completion waiting on a state change */ + struct vdo_completion *waiter; + /* Whether an operation is being initiated */ + bool starting; + /* Whether an operation has completed in the initiator */ + bool complete; +}; + +/** + * typedef vdo_admin_initiator - A method to be called once an admin operation may be initiated. + */ +typedef void vdo_admin_initiator(struct admin_state *state); + +static inline const struct admin_state_code * __must_check +vdo_get_admin_state_code(const struct admin_state *state) +{ + return READ_ONCE(state->current_state); +} + +/** + * vdo_set_admin_state_code() - Set the current admin state code. + * + * This function should be used primarily for initialization and by adminState internals. Most uses + * should go through the operation interfaces. + */ +static inline void +vdo_set_admin_state_code(struct admin_state *state, const struct admin_state_code *code) +{ + WRITE_ONCE(state->current_state, code); +} + +static inline bool __must_check vdo_is_state_normal(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->normal; +} + +static inline bool __must_check vdo_is_state_suspending(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SUSPENDING); +} + +static inline bool __must_check vdo_is_state_saving(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SAVING); +} + +static inline bool __must_check vdo_is_state_saved(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SAVED); +} + +static inline bool __must_check vdo_is_state_draining(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->draining; +} + +static inline bool __must_check vdo_is_state_loading(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->loading; +} + +static inline bool __must_check vdo_is_state_resuming(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_RESUMING); +} + +static inline bool __must_check vdo_is_state_clean_load(const struct admin_state *state) +{ + const struct admin_state_code *code = vdo_get_admin_state_code(state); + + return ((code == VDO_ADMIN_STATE_FORMATTING) || (code == VDO_ADMIN_STATE_LOADING)); +} + +static inline bool __must_check vdo_is_state_quiescing(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->quiescing; +} + +static inline bool __must_check vdo_is_state_quiescent(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->quiescent; +} + +bool vdo_start_draining(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator * initiator); + +bool vdo_finish_draining(struct admin_state *state); + +bool vdo_finish_draining_with_result(struct admin_state *state, int result); + +bool __must_check +vdo_assert_load_operation(const struct admin_state_code *operation, struct vdo_completion *waiter); + +bool vdo_start_loading(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator); + +bool vdo_finish_loading(struct admin_state *state); + +bool vdo_finish_loading_with_result(struct admin_state *state, int result); + +bool vdo_start_resuming(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator); + +bool vdo_finish_resuming(struct admin_state *state); + +bool vdo_finish_resuming_with_result(struct admin_state *state, int result); + +int vdo_resume_if_quiescent(struct admin_state *state); + +int vdo_start_operation(struct admin_state *state, const struct admin_state_code *operation); + +int vdo_start_operation_with_waiter(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator); + +bool vdo_finish_operation(struct admin_state *state, int result); + +#endif /* VDO_ADMIN_STATE_H */ diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c new file mode 100644 index 00000000000..4c9184cbc1a --- /dev/null +++ b/drivers/md/dm-vdo/block-map.c @@ -0,0 +1,3388 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "block-map.h" + +#include <linux/bio.h> +#include <linux/ratelimit.h> + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "action-manager.h" +#include "admin-state.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "encodings.h" +#include "io-submitter.h" +#include "physical-zone.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "status-codes.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" +#include "wait-queue.h" + +/** + * DOC: Block map eras + * + * The block map era, or maximum age, is used as follows: + * + * Each block map page, when dirty, records the earliest recovery journal block sequence number of + * the changes reflected in that dirty block. Sequence numbers are classified into eras: every + * @maximum_age sequence numbers, we switch to a new era. Block map pages are assigned to eras + * according to the sequence number they record. + * + * In the current (newest) era, block map pages are not written unless there is cache pressure. In + * the next oldest era, each time a new journal block is written 1/@maximum_age of the pages in + * this era are issued for write. In all older eras, pages are issued for write immediately. + */ + +struct page_descriptor { + root_count_t root_index; + height_t height; + page_number_t page_index; + slot_number_t slot; +} __packed; + +union page_key { + struct page_descriptor descriptor; + u64 key; +}; + +struct write_if_not_dirtied_context { + struct block_map_zone *zone; + u8 generation; +}; + +struct block_map_tree_segment { + struct tree_page *levels[VDO_BLOCK_MAP_TREE_HEIGHT]; +}; + +struct block_map_tree { + struct block_map_tree_segment *segments; +} block_map_tree; + +struct forest { + struct block_map *map; + size_t segments; + struct boundary *boundaries; + struct tree_page **pages; + struct block_map_tree trees[]; +}; + +struct cursor_level { + page_number_t page_index; + slot_number_t slot; +}; + +struct cursors; + +struct cursor { + struct waiter waiter; + struct block_map_tree *tree; + height_t height; + struct cursors *parent; + struct boundary boundary; + struct cursor_level levels[VDO_BLOCK_MAP_TREE_HEIGHT]; + struct pooled_vio *vio; +}; + +struct cursors { + struct block_map_zone *zone; + struct vio_pool *pool; + vdo_entry_callback *entry_callback; + struct vdo_completion *parent; + root_count_t active_roots; + struct cursor cursors[]; +}; + +/* Used to indicate that the page holding the location of a tree root has been "loaded". */ +const physical_block_number_t VDO_INVALID_PBN = 0xFFFFFFFFFFFFFFFF; + +enum { + LOG_INTERVAL = 4000, + DISPLAY_INTERVAL = 100000, +}; + +/* + * For adjusting VDO page cache statistic fields which are only mutated on the logical zone thread. + * Prevents any compiler shenanigans from affecting other threads reading those stats. + */ +#define ADD_ONCE(value, delta) WRITE_ONCE(value, (value) + (delta)) + +static inline bool is_dirty(const struct page_info *info) +{ + return info->state == PS_DIRTY; +} + +static inline bool is_present(const struct page_info *info) +{ + return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY); +} + +static inline bool is_in_flight(const struct page_info *info) +{ + return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING); +} + +static inline bool is_incoming(const struct page_info *info) +{ + return info->state == PS_INCOMING; +} + +static inline bool is_outgoing(const struct page_info *info) +{ + return info->state == PS_OUTGOING; +} + +static inline bool is_valid(const struct page_info *info) +{ + return is_present(info) || is_outgoing(info); +} + +static char *get_page_buffer(struct page_info *info) +{ + struct vdo_page_cache *cache = info->cache; + + return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE]; +} + +static inline struct vdo_page_completion *page_completion_from_waiter(struct waiter *waiter) +{ + struct vdo_page_completion *completion; + + if (waiter == NULL) + return NULL; + + completion = container_of(waiter, struct vdo_page_completion, waiter); + vdo_assert_completion_type(&completion->completion, VDO_PAGE_COMPLETION); + return completion; +} + +/** + * initialize_info() - Initialize all page info structures and put them on the free list. + * + * Return: VDO_SUCCESS or an error. + */ +static int initialize_info(struct vdo_page_cache *cache) +{ + struct page_info *info; + + INIT_LIST_HEAD(&cache->free_list); + for (info = cache->infos; info < cache->infos + cache->page_count; ++info) { + int result; + + info->cache = cache; + info->state = PS_FREE; + info->pbn = NO_PAGE; + + result = create_metadata_vio(cache->vdo, + VIO_TYPE_BLOCK_MAP, + VIO_PRIORITY_METADATA, info, + get_page_buffer(info), + &info->vio); + if (result != VDO_SUCCESS) + return result; + + /* The thread ID should never change. */ + info->vio->completion.callback_thread_id = cache->zone->thread_id; + + INIT_LIST_HEAD(&info->state_entry); + list_add_tail(&info->state_entry, &cache->free_list); + INIT_LIST_HEAD(&info->lru_entry); + } + + return VDO_SUCCESS; +} + +/** + * allocate_cache_components() - Allocate components of the cache which require their own + * allocation. + * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be + * written out. + * + * The caller is responsible for all clean up on errors. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check allocate_cache_components(struct vdo_page_cache *cache) +{ + u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE; + int result; + + result = UDS_ALLOCATE(cache->page_count, struct page_info, "page infos", &cache->infos); + if (result != UDS_SUCCESS) + return result; + + result = uds_allocate_memory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); + if (result != UDS_SUCCESS) + return result; + + result = vdo_make_int_map(cache->page_count, 0, &cache->page_map); + if (result != UDS_SUCCESS) + return result; + + return initialize_info(cache); +} + +/** + * assert_on_cache_thread() - Assert that a function has been called on the VDO page cache's + * thread. + */ +static inline void assert_on_cache_thread(struct vdo_page_cache *cache, const char *function_name) +{ + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id), + "%s() must only be called on cache thread %d, not thread %d", + function_name, + cache->zone->thread_id, + thread_id); +} + +/** assert_io_allowed() - Assert that a page cache may issue I/O. */ +static inline void assert_io_allowed(struct vdo_page_cache *cache) +{ + ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state), + "VDO page cache may issue I/O"); +} + +/** report_cache_pressure() - Log and, if enabled, report cache pressure. */ +static void report_cache_pressure(struct vdo_page_cache *cache) +{ + ADD_ONCE(cache->stats.cache_pressure, 1); + if (cache->waiter_count > cache->page_count) { + if ((cache->pressure_report % LOG_INTERVAL) == 0) + uds_log_info("page cache pressure %u", cache->stats.cache_pressure); + + if (++cache->pressure_report >= DISPLAY_INTERVAL) + cache->pressure_report = 0; + } +} + +/** + * get_page_state_name() - Return the name of a page state. + * + * If the page state is invalid a static string is returned and the invalid state is logged. + * + * Return: A pointer to a static page state name. + */ +static const char * __must_check get_page_state_name(enum vdo_page_buffer_state state) +{ + int result; + static const char * const state_names[] = { + "UDS_FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING" + }; + + STATIC_ASSERT(ARRAY_SIZE(state_names) == PAGE_STATE_COUNT); + + result = ASSERT(state < ARRAY_SIZE(state_names), "Unknown page_state value %d", state); + if (result != UDS_SUCCESS) + return "[UNKNOWN PAGE STATE]"; + + return state_names[state]; +} + +/** + * update_counter() - Update the counter associated with a given state. + * @info: The page info to count. + * @delta: The delta to apply to the counter. + */ +static void update_counter(struct page_info *info, s32 delta) +{ + struct block_map_statistics *stats = &info->cache->stats; + + switch (info->state) { + case PS_FREE: + ADD_ONCE(stats->free_pages, delta); + return; + + case PS_INCOMING: + ADD_ONCE(stats->incoming_pages, delta); + return; + + case PS_OUTGOING: + ADD_ONCE(stats->outgoing_pages, delta); + return; + + case PS_FAILED: + ADD_ONCE(stats->failed_pages, delta); + return; + + case PS_RESIDENT: + ADD_ONCE(stats->clean_pages, delta); + return; + + case PS_DIRTY: + ADD_ONCE(stats->dirty_pages, delta); + return; + + default: + return; + } +} + +/** update_lru() - Update the lru information for an active page. */ +static void update_lru(struct page_info *info) +{ + if (info->cache->lru_list.prev != &info->lru_entry) + list_move_tail(&info->lru_entry, &info->cache->lru_list); +} + +/** + * set_info_state() - Set the state of a page_info and put it on the right list, adjusting + * counters. + */ +static void +set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state) +{ + if (new_state == info->state) + return; + + update_counter(info, -1); + info->state = new_state; + update_counter(info, 1); + + switch (info->state) { + case PS_FREE: + case PS_FAILED: + list_move_tail(&info->state_entry, &info->cache->free_list); + return; + + case PS_OUTGOING: + list_move_tail(&info->state_entry, &info->cache->outgoing_list); + return; + + case PS_DIRTY: + return; + + default: + list_del_init(&info->state_entry); + } +} + +/** set_info_pbn() - Set the pbn for an info, updating the map as needed. */ +static int __must_check set_info_pbn(struct page_info *info, physical_block_number_t pbn) +{ + struct vdo_page_cache *cache = info->cache; + + /* Either the new or the old page number must be NO_PAGE. */ + int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), + "Must free a page before reusing it."); + if (result != VDO_SUCCESS) + return result; + + if (info->pbn != NO_PAGE) + vdo_int_map_remove(cache->page_map, info->pbn); + + info->pbn = pbn; + + if (pbn != NO_PAGE) { + result = vdo_int_map_put(cache->page_map, pbn, info, true, NULL); + if (result != UDS_SUCCESS) + return result; + } + return VDO_SUCCESS; +} + +/** reset_page_info() - Reset page info to represent an unallocated page. */ +static int reset_page_info(struct page_info *info) +{ + int result; + + result = ASSERT(info->busy == 0, "VDO Page must not be busy"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(!vdo_has_waiters(&info->waiting), "VDO Page must not have waiters"); + if (result != UDS_SUCCESS) + return result; + + result = set_info_pbn(info, NO_PAGE); + set_info_state(info, PS_FREE); + list_del_init(&info->lru_entry); + return result; +} + +/** + * find_free_page() - Find a free page. + * + * Return: A pointer to the page info structure (if found), NULL otherwise. + */ +static struct page_info * __must_check find_free_page(struct vdo_page_cache *cache) +{ + struct page_info *info; + + info = list_first_entry_or_null(&cache->free_list, struct page_info, state_entry); + if (info != NULL) + list_del_init(&info->state_entry); + return info; +} + +/** + * find_page() - Find the page info (if any) associated with a given pbn. + * @pbn: The absolute physical block number of the page. + * + * Return: The page info for the page if available, or NULL if not. + */ +static struct page_info * __must_check +find_page(struct vdo_page_cache *cache, physical_block_number_t pbn) +{ + if ((cache->last_found != NULL) && (cache->last_found->pbn == pbn)) + return cache->last_found; + cache->last_found = vdo_int_map_get(cache->page_map, pbn); + return cache->last_found; +} + +/** + * select_lru_page() - Determine which page is least recently used. + * + * Picks the least recently used from among the non-busy entries at the front of each of the lru + * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely + * that the entries at the front are busy unless the queue is very short, but not impossible. + * + * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be + * found. The page can be dirty or resident. + */ +static struct page_info * __must_check select_lru_page(struct vdo_page_cache *cache) +{ + struct page_info *info; + + list_for_each_entry(info, &cache->lru_list, lru_entry) + if ((info->busy == 0) && !is_in_flight(info)) + return info; + + return NULL; +} + +/* ASYNCHRONOUS INTERFACE BEYOND THIS POINT */ + +/** + * complete_with_page() - Helper to complete the VDO Page Completion request successfully. + * @info: The page info representing the result page. + * @vdo_page_comp: The VDO page completion to complete. + */ +static void complete_with_page(struct page_info *info, struct vdo_page_completion *vdo_page_comp) +{ + bool available = vdo_page_comp->writable ? is_present(info) : is_valid(info); + + if (!available) { + uds_log_error_strerror(VDO_BAD_PAGE, + "Requested cache page %llu in state %s is not %s", + (unsigned long long) info->pbn, + get_page_state_name(info->state), + vdo_page_comp->writable ? "present" : + "valid"); + vdo_fail_completion(&vdo_page_comp->completion, VDO_BAD_PAGE); + return; + } + + vdo_page_comp->info = info; + vdo_page_comp->ready = true; + vdo_finish_completion(&vdo_page_comp->completion); +} + +/** + * complete_waiter_with_error() - Complete a page completion with an error code. + * @waiter: The page completion, as a waiter. + * @result_ptr: A pointer to the error code. + * + * Implements waiter_callback. + */ +static void complete_waiter_with_error(struct waiter *waiter, void *result_ptr) +{ + int *result = result_ptr; + + vdo_fail_completion(&page_completion_from_waiter(waiter)->completion, *result); +} + +/** + * complete_waiter_with_page() - Complete a page completion with a page. + * @waiter: The page completion, as a waiter. + * @page_info: The page info to complete with. + * + * Implements waiter_callback. + */ +static void complete_waiter_with_page(struct waiter *waiter, void *page_info) +{ + complete_with_page((struct page_info *) page_info, page_completion_from_waiter(waiter)); +} + +/** + * distribute_page_over_queue() - Complete a queue of VDO page completions with a page result. + * + * Upon completion the queue will be empty. + * + * Return: The number of pages distributed. + */ +static unsigned int distribute_page_over_queue(struct page_info *info, struct wait_queue *queue) +{ + size_t pages; + + update_lru(info); + pages = vdo_count_waiters(queue); + + /* + * Increment the busy count once for each pending completion so that this page does not + * stop being busy until all completions have been processed (VDO-83). + */ + info->busy += pages; + + vdo_notify_all_waiters(queue, complete_waiter_with_page, info); + return pages; +} + +/** + * set_persistent_error() - Set a persistent error which all requests will receive in the future. + * @context: A string describing what triggered the error. + * + * Once triggered, all enqueued completions will get this error. Any future requests will result in + * this error as well. + */ +static void set_persistent_error(struct vdo_page_cache *cache, const char *context, int result) +{ + struct page_info *info; + /* If we're already read-only, there's no need to log. */ + struct vdo *vdo = cache->zone->block_map->vdo; + + if ((result != VDO_READ_ONLY) && !vdo_is_read_only(vdo)) { + uds_log_error_strerror(result, "VDO Page Cache persistent error: %s", context); + vdo_enter_read_only_mode(vdo, result); + } + + assert_on_cache_thread(cache, __func__); + + vdo_notify_all_waiters(&cache->free_waiters, complete_waiter_with_error, &result); + cache->waiter_count = 0; + + for (info = cache->infos; info < cache->infos + cache->page_count; ++info) + vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result); +} + +/** + * validate_completed_page() - Check that a page completion which is being freed to the cache + * referred to a valid page and is in a valid state. + * @writable: Whether a writable page is required. + * + * Return: VDO_SUCCESS if the page was valid, otherwise as error + */ +static int __must_check +validate_completed_page(struct vdo_page_completion *completion, bool writable) +{ + int result; + + result = ASSERT(completion->ready, "VDO Page completion not ready"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(completion->info != NULL, "VDO Page Completion must be complete"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(completion->info->pbn == completion->pbn, + "VDO Page Completion pbn must be consistent"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(is_valid(completion->info), "VDO Page Completion page must be valid"); + if (result != UDS_SUCCESS) + return result; + + if (writable) { + result = ASSERT(completion->writable, "VDO Page Completion is writable"); + if (result != UDS_SUCCESS) + return result; + } + + return VDO_SUCCESS; +} + +static void check_for_drain_complete(struct block_map_zone *zone) +{ + if (vdo_is_state_draining(&zone->state) && + (zone->active_lookups == 0) && + !vdo_has_waiters(&zone->flush_waiters) && + !is_vio_pool_busy(zone->vio_pool) && + (zone->page_cache.outstanding_reads == 0) && + (zone->page_cache.outstanding_writes == 0)) + vdo_finish_draining_with_result(&zone->state, + (vdo_is_read_only(zone->block_map->vdo) ? + VDO_READ_ONLY : VDO_SUCCESS)); +} + +static void enter_zone_read_only_mode(struct block_map_zone *zone, int result) +{ + vdo_enter_read_only_mode(zone->block_map->vdo, result); + + /* + * We are in read-only mode, so we won't ever write any page out. Just take all waiters off + * the queue so the zone can drain. + */ + while (vdo_has_waiters(&zone->flush_waiters)) + vdo_dequeue_next_waiter(&zone->flush_waiters); + + check_for_drain_complete(zone); +} + +static bool __must_check +validate_completed_page_or_enter_read_only_mode(struct vdo_page_completion *completion, + bool writable) +{ + int result = validate_completed_page(completion, writable); + + if (result == VDO_SUCCESS) + return true; + + enter_zone_read_only_mode(completion->info->cache->zone, result); + return false; +} + +/** + * handle_load_error() - Handle page load errors. + * @completion: The page read vio. + */ +static void handle_load_error(struct vdo_completion *completion) +{ + int result = completion->result; + struct page_info *info = completion->parent; + struct vdo_page_cache *cache = info->cache; + + assert_on_cache_thread(cache, __func__); + vio_record_metadata_io_error(as_vio(completion)); + vdo_enter_read_only_mode(cache->zone->block_map->vdo, result); + ADD_ONCE(cache->stats.failed_reads, 1); + set_info_state(info, PS_FAILED); + vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result); + reset_page_info(info); + + /* + * Don't decrement until right before calling check_for_drain_complete() to + * ensure that the above work can't cause the page cache to be freed out from under us. + */ + cache->outstanding_reads--; + check_for_drain_complete(cache->zone); +} + +/** + * page_is_loaded() - Callback used when a page has been loaded. + * @completion: The vio which has loaded the page. Its parent is the page_info. + */ +static void page_is_loaded(struct vdo_completion *completion) +{ + struct page_info *info = completion->parent; + struct vdo_page_cache *cache = info->cache; + nonce_t nonce = info->cache->zone->block_map->nonce; + struct block_map_page *page; + enum block_map_page_validity validity; + + assert_on_cache_thread(cache, __func__); + + page = (struct block_map_page *) get_page_buffer(info); + validity = vdo_validate_block_map_page(page, nonce, info->pbn); + if (validity == VDO_BLOCK_MAP_PAGE_BAD) { + int result = uds_log_error_strerror(VDO_BAD_PAGE, + "Expected page %llu but got page %llu instead", + (unsigned long long) info->pbn, + (unsigned long long) vdo_get_block_map_page_pbn(page)); + + vdo_continue_completion(completion, result); + return; + } + + if (validity == VDO_BLOCK_MAP_PAGE_INVALID) + vdo_format_block_map_page(page, nonce, info->pbn, false); + + info->recovery_lock = 0; + set_info_state(info, PS_RESIDENT); + distribute_page_over_queue(info, &info->waiting); + + /* + * Don't decrement until right before calling check_for_drain_complete() to + * ensure that the above work can't cause the page cache to be freed out from under us. + */ + cache->outstanding_reads--; + check_for_drain_complete(cache->zone); +} + +/** + * handle_rebuild_read_error() - Handle a read error during a read-only rebuild. + * @completion: The page load completion. + */ +static void handle_rebuild_read_error(struct vdo_completion *completion) +{ + struct page_info *info = completion->parent; + struct vdo_page_cache *cache = info->cache; + + assert_on_cache_thread(cache, __func__); + + /* + * We are doing a read-only rebuild, so treat this as a successful read of an uninitialized + * page. + */ + vio_record_metadata_io_error(as_vio(completion)); + ADD_ONCE(cache->stats.failed_reads, 1); + memset(get_page_buffer(info), 0, VDO_BLOCK_SIZE); + vdo_reset_completion(completion); + page_is_loaded(completion); +} + +static void load_cache_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct page_info *info = vio->completion.parent; + + continue_vio_after_io(vio, page_is_loaded, info->cache->zone->thread_id); +} + +/** + * launch_page_load() - Begin the process of loading a page. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check launch_page_load(struct page_info *info, physical_block_number_t pbn) +{ + int result; + vdo_action *callback; + struct vdo_page_cache *cache = info->cache; + + assert_io_allowed(cache); + + result = set_info_pbn(info, pbn); + if (result != VDO_SUCCESS) + return result; + + result = ASSERT((info->busy == 0), "Page is not busy before loading."); + if (result != VDO_SUCCESS) + return result; + + set_info_state(info, PS_INCOMING); + cache->outstanding_reads++; + ADD_ONCE(cache->stats.pages_loaded, 1); + callback = (cache->rebuilding ? handle_rebuild_read_error : handle_load_error); + submit_metadata_vio(info->vio, + pbn, + load_cache_page_endio, + callback, + REQ_OP_READ | REQ_PRIO); + return VDO_SUCCESS; +} + +static void write_pages(struct vdo_completion *completion); + +/** handle_flush_error() - Handle errors flushing the layer. */ +static void handle_flush_error(struct vdo_completion *completion) +{ + struct page_info *info = completion->parent; + + vio_record_metadata_io_error(as_vio(completion)); + set_persistent_error(info->cache, "flush failed", completion->result); + write_pages(completion); +} + +static void flush_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct page_info *info = vio->completion.parent; + + continue_vio_after_io(vio, write_pages, info->cache->zone->thread_id); +} + +/** save_pages() - Attempt to save the outgoing pages by first flushing the layer. */ +static void save_pages(struct vdo_page_cache *cache) +{ + struct page_info *info; + struct vio *vio; + + if ((cache->pages_in_flush > 0) || (cache->pages_to_flush == 0)) + return; + + assert_io_allowed(cache); + + info = list_first_entry(&cache->outgoing_list, struct page_info, state_entry); + + cache->pages_in_flush = cache->pages_to_flush; + cache->pages_to_flush = 0; + ADD_ONCE(cache->stats.flush_count, 1); + + vio = info->vio; + + /* + * We must make sure that the recovery journal entries that changed these pages were + * successfully persisted, and thus must issue a flush before each batch of pages is + * written to ensure this. + */ + submit_flush_vio(vio, flush_endio, handle_flush_error); +} + +/** + * schedule_page_save() - Add a page to the outgoing list of pages waiting to be saved. + * + * Once in the list, a page may not be used until it has been written out. + */ +static void schedule_page_save(struct page_info *info) +{ + if (info->busy > 0) { + info->write_status = WRITE_STATUS_DEFERRED; + return; + } + + info->cache->pages_to_flush++; + info->cache->outstanding_writes++; + set_info_state(info, PS_OUTGOING); +} + +/** + * launch_page_save() - Add a page to outgoing pages waiting to be saved, and then start saving + * pages if another save is not in progress. + */ +static void launch_page_save(struct page_info *info) +{ + schedule_page_save(info); + save_pages(info->cache); +} + +/** + * completion_needs_page() - Determine whether a given vdo_page_completion (as a waiter) is + * requesting a given page number. + * @context: A pointer to the pbn of the desired page. + * + * Implements waiter_match. + * + * Return: true if the page completion is for the desired page number. + */ +static bool completion_needs_page(struct waiter *waiter, void *context) +{ + physical_block_number_t *pbn = context; + + return (page_completion_from_waiter(waiter)->pbn == *pbn); +} + +/** + * allocate_free_page() - Allocate a free page to the first completion in the waiting queue, and + * any other completions that match it in page number. + */ +static void allocate_free_page(struct page_info *info) +{ + int result; + struct waiter *oldest_waiter; + physical_block_number_t pbn; + struct vdo_page_cache *cache = info->cache; + + assert_on_cache_thread(cache, __func__); + + if (!vdo_has_waiters(&cache->free_waiters)) { + if (cache->stats.cache_pressure > 0) { + uds_log_info("page cache pressure relieved"); + WRITE_ONCE(cache->stats.cache_pressure, 0); + } + return; + } + + result = reset_page_info(info); + if (result != VDO_SUCCESS) { + set_persistent_error(cache, "cannot reset page info", result); + return; + } + + oldest_waiter = vdo_get_first_waiter(&cache->free_waiters); + pbn = page_completion_from_waiter(oldest_waiter)->pbn; + + /* + * Remove all entries which match the page number in question and push them onto the page + * info's wait queue. + */ + vdo_dequeue_matching_waiters(&cache->free_waiters, + completion_needs_page, + &pbn, + &info->waiting); + cache->waiter_count -= vdo_count_waiters(&info->waiting); + + result = launch_page_load(info, pbn); + if (result != VDO_SUCCESS) + vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result); +} + +/** + * discard_a_page() - Begin the process of discarding a page. + * + * If no page is discardable, increments a count of deferred frees so that the next release of a + * page which is no longer busy will kick off another discard cycle. This is an indication that the + * cache is not big enough. + * + * If the selected page is not dirty, immediately allocates the page to the oldest completion + * waiting for a free page. + */ +static void discard_a_page(struct vdo_page_cache *cache) +{ + struct page_info *info = select_lru_page(cache); + + if (info == NULL) { + report_cache_pressure(cache); + return; + } + + if (!is_dirty(info)) { + allocate_free_page(info); + return; + } + + ASSERT_LOG_ONLY(!is_in_flight(info), "page selected for discard is not in flight"); + + ++cache->discard_count; + info->write_status = WRITE_STATUS_DISCARD; + launch_page_save(info); +} + +/** + * discard_page_for_completion() - Helper used to trigger a discard so that the completion can get + * a different page. + */ +static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp) +{ + struct vdo_page_cache *cache = vdo_page_comp->cache; + + ++cache->waiter_count; + vdo_enqueue_waiter(&cache->free_waiters, &vdo_page_comp->waiter); + discard_a_page(cache); +} + +/** + * discard_page_if_needed() - Helper used to trigger a discard if the cache needs another free + * page. + * @cache: The page cache. + */ +static void discard_page_if_needed(struct vdo_page_cache *cache) +{ + if (cache->waiter_count > cache->discard_count) + discard_a_page(cache); +} + +/** + * write_has_finished() - Inform the cache that a write has finished (possibly with an error). + * @info: The info structure for the page whose write just completed. + * + * Return: true if the page write was a discard. + */ +static bool write_has_finished(struct page_info *info) +{ + bool was_discard = (info->write_status == WRITE_STATUS_DISCARD); + + assert_on_cache_thread(info->cache, __func__); + info->cache->outstanding_writes--; + + info->write_status = WRITE_STATUS_NORMAL; + return was_discard; +} + +/** + * handle_page_write_error() - Handler for page write errors. + * @completion: The page write vio. + */ +static void handle_page_write_error(struct vdo_completion *completion) +{ + int result = completion->result; + struct page_info *info = completion->parent; + struct vdo_page_cache *cache = info->cache; + + vio_record_metadata_io_error(as_vio(completion)); + + /* If we're already read-only, write failures are to be expected. */ + if (result != VDO_READ_ONLY) { + static DEFINE_RATELIMIT_STATE(error_limiter, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (__ratelimit(&error_limiter)) + uds_log_error("failed to write block map page %llu", + (unsigned long long) info->pbn); + } + + set_info_state(info, PS_DIRTY); + ADD_ONCE(cache->stats.failed_writes, 1); + set_persistent_error(cache, "cannot write page", result); + + if (!write_has_finished(info)) + discard_page_if_needed(cache); + + check_for_drain_complete(cache->zone); +} + +static void page_is_written_out(struct vdo_completion *completion); + +static void write_cache_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct page_info *info = vio->completion.parent; + + continue_vio_after_io(vio, page_is_written_out, info->cache->zone->thread_id); +} + +/** + * page_is_written_out() - Callback used when a page has been written out. + * @completion: The vio which wrote the page. Its parent is a page_info. + */ +static void page_is_written_out(struct vdo_completion *completion) +{ + bool was_discard, reclaimed; + u32 reclamations; + struct page_info *info = completion->parent; + struct vdo_page_cache *cache = info->cache; + struct block_map_page *page = (struct block_map_page *) get_page_buffer(info); + + if (!page->header.initialized) { + page->header.initialized = true; + submit_metadata_vio(info->vio, + info->pbn, + write_cache_page_endio, + handle_page_write_error, + (REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH)); + return; + } + + /* Handle journal updates and torn write protection. */ + vdo_release_recovery_journal_block_reference(cache->zone->block_map->journal, + info->recovery_lock, + VDO_ZONE_TYPE_LOGICAL, + cache->zone->zone_number); + info->recovery_lock = 0; + was_discard = write_has_finished(info); + reclaimed = (!was_discard || (info->busy > 0) || vdo_has_waiters(&info->waiting)); + + set_info_state(info, PS_RESIDENT); + + reclamations = distribute_page_over_queue(info, &info->waiting); + ADD_ONCE(cache->stats.reclaimed, reclamations); + + if (was_discard) + cache->discard_count--; + + if (reclaimed) + discard_page_if_needed(cache); + else + allocate_free_page(info); + + check_for_drain_complete(cache->zone); +} + +/** + * write_pages() - Write the batch of pages which were covered by the layer flush which just + * completed. + * @flush_completion: The flush vio. + * + * This callback is registered in save_pages(). + */ +static void write_pages(struct vdo_completion *flush_completion) +{ + struct vdo_page_cache *cache = ((struct page_info *) flush_completion->parent)->cache; + + /* + * We need to cache these two values on the stack since in the error case below, it is + * possible for the last page info to cause the page cache to get freed. Hence once we + * launch the last page, it may be unsafe to dereference the cache [VDO-4724]. + */ + bool has_unflushed_pages = (cache->pages_to_flush > 0); + page_count_t pages_in_flush = cache->pages_in_flush; + + cache->pages_in_flush = 0; + while (pages_in_flush-- > 0) { + struct page_info *info = + list_first_entry(&cache->outgoing_list, struct page_info, state_entry); + + list_del_init(&info->state_entry); + if (vdo_is_read_only(info->cache->zone->block_map->vdo)) { + struct vdo_completion *completion = &info->vio->completion; + + vdo_reset_completion(completion); + completion->callback = page_is_written_out; + completion->error_handler = handle_page_write_error; + vdo_fail_completion(completion, VDO_READ_ONLY); + continue; + } + ADD_ONCE(info->cache->stats.pages_saved, 1); + submit_metadata_vio(info->vio, + info->pbn, + write_cache_page_endio, + handle_page_write_error, + REQ_OP_WRITE | REQ_PRIO); + } + + if (has_unflushed_pages) + /* + * If there are unflushed pages, the cache can't have been freed, so this call is + * safe. + */ + save_pages(cache); +} + +/** + * vdo_release_page_completion() - Release a VDO Page Completion. + * + * The page referenced by this completion (if any) will no longer be held busy by this completion. + * If a page becomes discardable and there are completions awaiting free pages then a new round of + * page discarding is started. + */ +void vdo_release_page_completion(struct vdo_completion *completion) +{ + struct page_info *discard_info = NULL; + struct vdo_page_completion *page_completion = as_vdo_page_completion(completion); + struct vdo_page_cache *cache; + + if (completion->result == VDO_SUCCESS) { + if (!validate_completed_page_or_enter_read_only_mode(page_completion, false)) + return; + + if (--page_completion->info->busy == 0) + discard_info = page_completion->info; + } + + ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), + "Page being released after leaving all queues"); + + page_completion->info = NULL; + cache = page_completion->cache; + assert_on_cache_thread(cache, __func__); + + if (discard_info != NULL) { + if (discard_info->write_status == WRITE_STATUS_DEFERRED) { + discard_info->write_status = WRITE_STATUS_NORMAL; + launch_page_save(discard_info); + } + /* + * if there are excess requests for pages (that have not already started discards) + * we need to discard some page (which may be this one) + */ + discard_page_if_needed(cache); + } +} + +/** + * load_page_for_completion() - Helper function to load a page as described by a VDO Page + * Completion. + */ +static void +load_page_for_completion(struct page_info *info, struct vdo_page_completion *vdo_page_comp) +{ + int result; + + vdo_enqueue_waiter(&info->waiting, &vdo_page_comp->waiter); + result = launch_page_load(info, vdo_page_comp->pbn); + if (result != VDO_SUCCESS) + vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result); +} + +/** + * vdo_get_page() - Initialize a page completion and get a block map page. + * @page_completion: The vdo_page_completion to initialize. + * @zone: The block map zone of the desired page. + * @pbn: The absolute physical block of the desired page. + * @writable: Whether the page can be modified. + * @parent: The object to notify when the fetch is complete. + * @callback: The notification callback. + * @error_handler: The handler for fetch errors. + * @requeue: Whether we must requeue when notifying the parent. + * + * May cause another page to be discarded (potentially writing a dirty page) and the one nominated + * by the completion to be loaded from disk. When the callback is invoked, the page will be + * resident in the cache and marked busy. All callers must call vdo_release_page_completion() + * when they are done with the page to clear the busy mark. + */ +void vdo_get_page(struct vdo_page_completion *page_completion, + struct block_map_zone *zone, + physical_block_number_t pbn, + bool writable, + void *parent, + vdo_action *callback, + vdo_action *error_handler, + bool requeue) +{ + struct vdo_page_cache *cache = &zone->page_cache; + struct vdo_completion *completion = &page_completion->completion; + struct page_info *info; + + assert_on_cache_thread(cache, __func__); + ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), + "New page completion was not already on a wait queue"); + + *page_completion = (struct vdo_page_completion) { + .pbn = pbn, + .writable = writable, + .cache = cache, + }; + + vdo_initialize_completion(completion, cache->vdo, VDO_PAGE_COMPLETION); + vdo_prepare_completion(completion, + callback, + error_handler, + cache->zone->thread_id, + parent); + completion->requeue = requeue; + + if (page_completion->writable && vdo_is_read_only(cache->zone->block_map->vdo)) { + vdo_fail_completion(completion, VDO_READ_ONLY); + return; + } + + if (page_completion->writable) + ADD_ONCE(cache->stats.write_count, 1); + else + ADD_ONCE(cache->stats.read_count, 1); + + info = find_page(cache, page_completion->pbn); + if (info != NULL) { + /* The page is in the cache already. */ + if ((info->write_status == WRITE_STATUS_DEFERRED) || + is_incoming(info) || + (is_outgoing(info) && page_completion->writable)) { + /* The page is unusable until it has finished I/O. */ + ADD_ONCE(cache->stats.wait_for_page, 1); + vdo_enqueue_waiter(&info->waiting, &page_completion->waiter); + return; + } + + if (is_valid(info)) { + /* The page is usable. */ + ADD_ONCE(cache->stats.found_in_cache, 1); + if (!is_present(info)) + ADD_ONCE(cache->stats.read_outgoing, 1); + update_lru(info); + ++info->busy; + complete_with_page(info, page_completion); + return; + } + /* Something horrible has gone wrong. */ + ASSERT_LOG_ONLY(false, "Info found in a usable state."); + } + + /* The page must be fetched. */ + info = find_free_page(cache); + if (info != NULL) { + ADD_ONCE(cache->stats.fetch_required, 1); + load_page_for_completion(info, page_completion); + return; + } + + /* The page must wait for a page to be discarded. */ + ADD_ONCE(cache->stats.discard_required, 1); + discard_page_for_completion(page_completion); +} + +/** + * vdo_request_page_write() - Request that a VDO page be written out as soon as it is not busy. + * @completion: The vdo_page_completion containing the page. + */ +void vdo_request_page_write(struct vdo_completion *completion) +{ + struct page_info *info; + struct vdo_page_completion *vdo_page_comp = as_vdo_page_completion(completion); + + if (!validate_completed_page_or_enter_read_only_mode(vdo_page_comp, true)) + return; + + info = vdo_page_comp->info; + set_info_state(info, PS_DIRTY); + launch_page_save(info); +} + +/** + * vdo_get_cached_page() - Get the block map page from a page completion. + * @completion: A vdo page completion whose callback has been called. + * @page_ptr: A pointer to hold the page + * + * Return: VDO_SUCCESS or an error + */ +int vdo_get_cached_page(struct vdo_completion *completion, struct block_map_page **page_ptr) +{ + int result; + struct vdo_page_completion *vpc; + + vpc = as_vdo_page_completion(completion); + result = validate_completed_page(vpc, true); + if (result == VDO_SUCCESS) + *page_ptr = (struct block_map_page *) get_page_buffer(vpc->info); + + return result; +} + +/** + * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache. + * + * There must not be any dirty pages in the cache. + * + * Return: A success or error code. + */ +int vdo_invalidate_page_cache(struct vdo_page_cache *cache) +{ + struct page_info *info; + + assert_on_cache_thread(cache, __func__); + + /* Make sure we don't throw away any dirty pages. */ + for (info = cache->infos; info < cache->infos + cache->page_count; info++) { + int result = ASSERT(!is_dirty(info), "cache must have no dirty pages"); + + if (result != VDO_SUCCESS) + return result; + } + + /* Reset the page map by re-allocating it. */ + vdo_free_int_map(UDS_FORGET(cache->page_map)); + return vdo_make_int_map(cache->page_count, 0, &cache->page_map); +} + +/** + * get_tree_page_by_index() - Get the tree page for a given height and page index. + * + * Return: The requested page. + */ +static struct tree_page * __must_check +get_tree_page_by_index(struct forest *forest, + root_count_t root_index, + height_t height, + page_number_t page_index) +{ + page_number_t offset = 0; + size_t segment; + + for (segment = 0; segment < forest->segments; segment++) { + page_number_t border = forest->boundaries[segment].levels[height - 1]; + + if (page_index < border) { + struct block_map_tree *tree = &forest->trees[root_index]; + + return &(tree->segments[segment].levels[height - 1][page_index - offset]); + } + offset = border; + } + + return NULL; +} + +/* Get the page referred to by the lock's tree slot at its current height. */ +static inline struct tree_page * +get_tree_page(const struct block_map_zone *zone, const struct tree_lock *lock) +{ + return get_tree_page_by_index(zone->block_map->forest, + lock->root_index, + lock->height, + lock->tree_slots[lock->height].page_index); +} + +/** vdo_copy_valid_page() - Validate and copy a buffer to a page. */ +bool vdo_copy_valid_page(char *buffer, + nonce_t nonce, + physical_block_number_t pbn, + struct block_map_page *page) +{ + struct block_map_page *loaded = (struct block_map_page *) buffer; + enum block_map_page_validity validity = vdo_validate_block_map_page(loaded, nonce, pbn); + + if (validity == VDO_BLOCK_MAP_PAGE_VALID) { + memcpy(page, loaded, VDO_BLOCK_SIZE); + return true; + } + + if (validity == VDO_BLOCK_MAP_PAGE_BAD) + uds_log_error_strerror(VDO_BAD_PAGE, + "Expected page %llu but got page %llu instead", + (unsigned long long) pbn, + (unsigned long long) vdo_get_block_map_page_pbn(loaded)); + + return false; +} + +/** + * in_cyclic_range() - Check whether the given value is between the lower and upper bounds, within + * a cyclic range of values from 0 to (modulus - 1). + * @lower: The lowest value to accept. + * @value: The value to check. + * @upper: The highest value to accept. + * @modulus: The size of the cyclic space, no more than 2^15. + * + * The value and both bounds must be smaller than the modulus. + * + * Return: true if the value is in range. + */ +static bool in_cyclic_range(u16 lower, u16 value, u16 upper, u16 modulus) +{ + if (value < lower) + value += modulus; + if (upper < lower) + upper += modulus; + return (value <= upper); +} + +/** + * is_not_older() - Check whether a generation is strictly older than some other generation in the + * context of a zone's current generation range. + * @zone: The zone in which to do the comparison. + * @a: The generation in question. + * @b: The generation to compare to. + * + * Return: true if generation @a is not strictly older than generation @b in the context of @zone + */ +static bool __must_check is_not_older(struct block_map_zone *zone, u8 a, u8 b) +{ + int result; + + result = ASSERT((in_cyclic_range(zone->oldest_generation, a, zone->generation, 1 << 8) && + in_cyclic_range(zone->oldest_generation, b, zone->generation, 1 << 8)), + "generation(s) %u, %u are out of range [%u, %u]", + a, b, zone->oldest_generation, zone->generation); + if (result != VDO_SUCCESS) { + enter_zone_read_only_mode(zone, result); + return true; + } + + return in_cyclic_range(b, a, zone->generation, 1 << 8); +} + +static void release_generation(struct block_map_zone *zone, u8 generation) +{ + int result; + + result = ASSERT((zone->dirty_page_counts[generation] > 0), + "dirty page count underflow for generation %u", + generation); + if (result != VDO_SUCCESS) { + enter_zone_read_only_mode(zone, result); + return; + } + + zone->dirty_page_counts[generation]--; + while ((zone->dirty_page_counts[zone->oldest_generation] == 0) && + (zone->oldest_generation != zone->generation)) + zone->oldest_generation++; +} + +static void +set_generation(struct block_map_zone *zone, struct tree_page *page, u8 new_generation) +{ + u32 new_count; + int result; + bool decrement_old = vdo_is_waiting(&page->waiter); + u8 old_generation = page->generation; + + if (decrement_old && (old_generation == new_generation)) + return; + + page->generation = new_generation; + new_count = ++zone->dirty_page_counts[new_generation]; + result = ASSERT((new_count != 0), + "dirty page count overflow for generation %u", + new_generation); + if (result != VDO_SUCCESS) { + enter_zone_read_only_mode(zone, result); + return; + } + + if (decrement_old) + release_generation(zone, old_generation); +} + +static void write_page(struct tree_page *tree_page, struct pooled_vio *vio); + +/* Implements waiter_callback */ +static void write_page_callback(struct waiter *waiter, void *context) +{ + write_page(container_of(waiter, struct tree_page, waiter), (struct pooled_vio *) context); +} + +static void acquire_vio(struct waiter *waiter, struct block_map_zone *zone) +{ + waiter->callback = write_page_callback; + acquire_vio_from_pool(zone->vio_pool, waiter); +} + +/* Return: true if all possible generations were not already active */ +static bool attempt_increment(struct block_map_zone *zone) +{ + u8 generation = zone->generation + 1; + + if (zone->oldest_generation == generation) + return false; + + zone->generation = generation; + return true; +} + +/* Launches a flush if one is not already in progress. */ +static void enqueue_page(struct tree_page *page, struct block_map_zone *zone) +{ + if ((zone->flusher == NULL) && attempt_increment(zone)) { + zone->flusher = page; + acquire_vio(&page->waiter, zone); + return; + } + + vdo_enqueue_waiter(&zone->flush_waiters, &page->waiter); +} + +static void write_page_if_not_dirtied(struct waiter *waiter, void *context) +{ + struct tree_page *page = container_of(waiter, struct tree_page, waiter); + struct write_if_not_dirtied_context *write_context = context; + + if (page->generation == write_context->generation) { + acquire_vio(waiter, write_context->zone); + return; + } + + enqueue_page(page, write_context->zone); +} + +static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) +{ + return_vio_to_pool(zone->vio_pool, vio); + check_for_drain_complete(zone); +} + +/* This callback is registered in write_initialized_page(). */ +static void finish_page_write(struct vdo_completion *completion) +{ + bool dirty; + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); + struct tree_page *page = completion->parent; + struct block_map_zone *zone = pooled->context; + + vdo_release_recovery_journal_block_reference(zone->block_map->journal, + page->writing_recovery_lock, + VDO_ZONE_TYPE_LOGICAL, + zone->zone_number); + + dirty = (page->writing_generation != page->generation); + release_generation(zone, page->writing_generation); + page->writing = false; + + if (zone->flusher == page) { + struct write_if_not_dirtied_context context = { + .zone = zone, + .generation = page->writing_generation, + }; + + vdo_notify_all_waiters(&zone->flush_waiters, write_page_if_not_dirtied, &context); + if (dirty && attempt_increment(zone)) { + write_page(page, pooled); + return; + } + + zone->flusher = NULL; + } + + if (dirty) { + enqueue_page(page, zone); + } else if ((zone->flusher == NULL) && + vdo_has_waiters(&zone->flush_waiters) && + attempt_increment(zone)) { + zone->flusher = container_of(vdo_dequeue_next_waiter(&zone->flush_waiters), + struct tree_page, + waiter); + write_page(zone->flusher, pooled); + return; + } + + return_to_pool(zone, pooled); +} + +static void handle_write_error(struct vdo_completion *completion) +{ + int result = completion->result; + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); + struct block_map_zone *zone = pooled->context; + + vio_record_metadata_io_error(vio); + enter_zone_read_only_mode(zone, result); + return_to_pool(zone, pooled); +} + +static void write_page_endio(struct bio *bio); + +static void write_initialized_page(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); + struct block_map_zone *zone = pooled->context; + struct tree_page *tree_page = completion->parent; + struct block_map_page *page = (struct block_map_page *) vio->data; + unsigned int operation = REQ_OP_WRITE | REQ_PRIO; + + /* + * Now that we know the page has been written at least once, mark the copy we are writing + * as initialized. + */ + page->header.initialized = true; + + if (zone->flusher == tree_page) + operation |= REQ_PREFLUSH; + + submit_metadata_vio(vio, + vdo_get_block_map_page_pbn(page), + write_page_endio, + handle_write_error, + operation); +} + +static void write_page_endio(struct bio *bio) +{ + struct pooled_vio *vio = bio->bi_private; + struct block_map_zone *zone = vio->context; + struct block_map_page *page = (struct block_map_page *) vio->vio.data; + + continue_vio_after_io(&vio->vio, + (page->header.initialized ? + finish_page_write : + write_initialized_page), + zone->thread_id); +} + +static void write_page(struct tree_page *tree_page, struct pooled_vio *vio) +{ + struct vdo_completion *completion = &vio->vio.completion; + struct block_map_zone *zone = vio->context; + struct block_map_page *page = vdo_as_block_map_page(tree_page); + + if ((zone->flusher != tree_page) && + is_not_older(zone, tree_page->generation, zone->generation)) { + /* + * This page was re-dirtied after the last flush was issued, hence we need to do + * another flush. + */ + enqueue_page(tree_page, zone); + return_to_pool(zone, vio); + return; + } + + completion->parent = tree_page; + memcpy(vio->vio.data, tree_page->page_buffer, VDO_BLOCK_SIZE); + completion->callback_thread_id = zone->thread_id; + + tree_page->writing = true; + tree_page->writing_generation = tree_page->generation; + tree_page->writing_recovery_lock = tree_page->recovery_lock; + + /* Clear this now so that we know this page is not on any dirty list. */ + tree_page->recovery_lock = 0; + + /* + * We've already copied the page into the vio which will write it, so if it was not yet + * initialized, the first write will indicate that (for torn write protection). It is now + * safe to mark it as initialized in memory since if the write fails, the in memory state + * will become irrelevant. + */ + if (page->header.initialized) { + write_initialized_page(completion); + return; + } + + page->header.initialized = true; + submit_metadata_vio(&vio->vio, + vdo_get_block_map_page_pbn(page), + write_page_endio, + handle_write_error, + REQ_OP_WRITE | REQ_PRIO); +} + +/* Release a lock on a page which was being loaded or allocated. */ +static void release_page_lock(struct data_vio *data_vio, char *what) +{ + struct block_map_zone *zone; + struct tree_lock *lock_holder; + struct tree_lock *lock = &data_vio->tree_lock; + + ASSERT_LOG_ONLY(lock->locked, + "release of unlocked block map page %s for key %llu in tree %u", + what, (unsigned long long) lock->key, + lock->root_index); + + zone = data_vio->logical.zone->block_map_zone; + lock_holder = vdo_int_map_remove(zone->loading_pages, lock->key); + ASSERT_LOG_ONLY((lock_holder == lock), + "block map page %s mismatch for key %llu in tree %u", + what, + (unsigned long long) lock->key, + lock->root_index); + lock->locked = false; +} + +static void finish_lookup(struct data_vio *data_vio, int result) +{ + data_vio->tree_lock.height = 0; + + --data_vio->logical.zone->block_map_zone->active_lookups; + + set_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot); + data_vio->vio.completion.error_handler = handle_data_vio_error; + continue_data_vio_with_error(data_vio, result); +} + +static void abort_lookup_for_waiter(struct waiter *waiter, void *context) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + int result = *((int *) context); + + if (!data_vio->write) { + if (result == VDO_NO_SPACE) + result = VDO_SUCCESS; + } else if (result != VDO_NO_SPACE) { + result = VDO_READ_ONLY; + } + + finish_lookup(data_vio, result); +} + +static void abort_lookup(struct data_vio *data_vio, int result, char *what) +{ + if (result != VDO_NO_SPACE) + enter_zone_read_only_mode(data_vio->logical.zone->block_map_zone, result); + + if (data_vio->tree_lock.locked) { + release_page_lock(data_vio, what); + vdo_notify_all_waiters(&data_vio->tree_lock.waiters, + abort_lookup_for_waiter, + &result); + } + + finish_lookup(data_vio, result); +} + +static void abort_load(struct data_vio *data_vio, int result) +{ + abort_lookup(data_vio, result, "load"); +} + +static bool __must_check +is_invalid_tree_entry(const struct vdo *vdo, const struct data_location *mapping, height_t height) +{ + if (!vdo_is_valid_location(mapping) || + vdo_is_state_compressed(mapping->state) || + (vdo_is_mapped_location(mapping) && (mapping->pbn == VDO_ZERO_BLOCK))) + return true; + + /* Roots aren't physical data blocks, so we can't check their PBNs. */ + if (height == VDO_BLOCK_MAP_TREE_HEIGHT) + return false; + + return !vdo_is_physical_data_block(vdo->depot, mapping->pbn); +} + +static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio); +static void allocate_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio); + +static void continue_with_loaded_page(struct data_vio *data_vio, struct block_map_page *page) +{ + struct tree_lock *lock = &data_vio->tree_lock; + struct block_map_tree_slot slot = lock->tree_slots[lock->height]; + struct data_location mapping = + vdo_unpack_block_map_entry(&page->entries[slot.block_map_slot.slot]); + + if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) { + uds_log_error_strerror(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with state %u for page index %u at height %u", + (unsigned long long) mapping.pbn, + mapping.state, + lock->tree_slots[lock->height - 1].page_index, + lock->height - 1); + abort_load(data_vio, VDO_BAD_MAPPING); + return; + } + + if (!vdo_is_mapped_location(&mapping)) { + /* The page we need is unallocated */ + allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio); + return; + } + + lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn; + if (lock->height == 1) { + finish_lookup(data_vio, VDO_SUCCESS); + return; + } + + /* We know what page we need to load next */ + load_block_map_page(data_vio->logical.zone->block_map_zone, data_vio); +} + +static void continue_load_for_waiter(struct waiter *waiter, void *context) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + + data_vio->tree_lock.height--; + continue_with_loaded_page(data_vio, (struct block_map_page *) context); +} + +static void finish_block_map_page_load(struct vdo_completion *completion) +{ + physical_block_number_t pbn; + struct tree_page *tree_page; + struct block_map_page *page; + nonce_t nonce; + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = vio_as_pooled_vio(vio); + struct data_vio *data_vio = completion->parent; + struct block_map_zone *zone = pooled->context; + struct tree_lock *tree_lock = &data_vio->tree_lock; + + tree_lock->height--; + pbn = tree_lock->tree_slots[tree_lock->height].block_map_slot.pbn; + tree_page = get_tree_page(zone, tree_lock); + page = (struct block_map_page *) tree_page->page_buffer; + nonce = zone->block_map->nonce; + + if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) + vdo_format_block_map_page(page, nonce, pbn, false); + return_vio_to_pool(zone->vio_pool, pooled); + + /* Release our claim to the load and wake any waiters */ + release_page_lock(data_vio, "load"); + vdo_notify_all_waiters(&tree_lock->waiters, continue_load_for_waiter, page); + continue_with_loaded_page(data_vio, page); +} + +static void handle_io_error(struct vdo_completion *completion) +{ + int result = completion->result; + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); + struct data_vio *data_vio = completion->parent; + struct block_map_zone *zone = pooled->context; + + vio_record_metadata_io_error(vio); + return_vio_to_pool(zone->vio_pool, pooled); + abort_load(data_vio, result); +} + +static void load_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct data_vio *data_vio = vio->completion.parent; + + continue_vio_after_io(vio, finish_block_map_page_load, data_vio->logical.zone->thread_id); +} + +static void load_page(struct waiter *waiter, void *context) +{ + struct pooled_vio *pooled = context; + struct data_vio *data_vio = waiter_as_data_vio(waiter); + struct tree_lock *lock = &data_vio->tree_lock; + physical_block_number_t pbn = lock->tree_slots[lock->height - 1].block_map_slot.pbn; + + pooled->vio.completion.parent = data_vio; + submit_metadata_vio(&pooled->vio, + pbn, + load_page_endio, + handle_io_error, + REQ_OP_READ | REQ_PRIO); +} + +/* + * If the page is already locked, queue up to wait for the lock to be released. If the lock is + * acquired, @data_vio->tree_lock.locked will be true. + */ +static int attempt_page_lock(struct block_map_zone *zone, struct data_vio *data_vio) +{ + int result; + struct tree_lock *lock_holder; + struct tree_lock *lock = &data_vio->tree_lock; + height_t height = lock->height; + struct block_map_tree_slot tree_slot = lock->tree_slots[height]; + union page_key key; + + key.descriptor = (struct page_descriptor) { + .root_index = lock->root_index, + .height = height, + .page_index = tree_slot.page_index, + .slot = tree_slot.block_map_slot.slot, + }; + lock->key = key.key; + + result = vdo_int_map_put(zone->loading_pages, + lock->key, + lock, + false, + (void **) &lock_holder); + if (result != VDO_SUCCESS) + return result; + + if (lock_holder == NULL) { + /* We got the lock */ + data_vio->tree_lock.locked = true; + return VDO_SUCCESS; + } + + /* Someone else is loading or allocating the page we need */ + vdo_enqueue_waiter(&lock_holder->waiters, &data_vio->waiter); + return VDO_SUCCESS; +} + +/* Load a block map tree page from disk, for the next level in the data vio tree lock. */ +static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio) +{ + int result; + + result = attempt_page_lock(zone, data_vio); + if (result != VDO_SUCCESS) { + abort_load(data_vio, result); + return; + } + + if (data_vio->tree_lock.locked) { + data_vio->waiter.callback = load_page; + acquire_vio_from_pool(zone->vio_pool, &data_vio->waiter); + } +} + +static void allocation_failure(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + if (vdo_requeue_completion_if_needed(completion, data_vio->logical.zone->thread_id)) + return; + + abort_lookup(data_vio, completion->result, "allocation"); +} + +static void continue_allocation_for_waiter(struct waiter *waiter, void *context) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + struct tree_lock *tree_lock = &data_vio->tree_lock; + physical_block_number_t pbn = *((physical_block_number_t *) context); + + tree_lock->height--; + data_vio->tree_lock.tree_slots[tree_lock->height].block_map_slot.pbn = pbn; + + if (tree_lock->height == 0) { + finish_lookup(data_vio, VDO_SUCCESS); + return; + } + + allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio); +} + +/** expire_oldest_list() - Expire the oldest list. */ +static void expire_oldest_list(struct dirty_lists *dirty_lists) +{ + block_count_t i = dirty_lists->offset++; + + dirty_lists->oldest_period++; + if (!list_empty(&dirty_lists->eras[i][VDO_TREE_PAGE])) + list_splice_tail_init(&dirty_lists->eras[i][VDO_TREE_PAGE], + &dirty_lists->expired[VDO_TREE_PAGE]); + if (!list_empty(&dirty_lists->eras[i][VDO_CACHE_PAGE])) + list_splice_tail_init(&dirty_lists->eras[i][VDO_CACHE_PAGE], + &dirty_lists->expired[VDO_CACHE_PAGE]); + + if (dirty_lists->offset == dirty_lists->maximum_age) + dirty_lists->offset = 0; +} + + +/** update_period() - Update the dirty_lists period if necessary. */ +static void update_period(struct dirty_lists *dirty, sequence_number_t period) +{ + while (dirty->next_period <= period) { + if ((dirty->next_period - dirty->oldest_period) == dirty->maximum_age) + expire_oldest_list(dirty); + dirty->next_period++; + } +} + +/** write_expired_elements() - Write out the expired list. */ +static void write_expired_elements(struct block_map_zone *zone) +{ + struct tree_page *page, *ttmp; + struct page_info *info, *ptmp; + struct list_head *expired; + u8 generation = zone->generation; + + expired = &zone->dirty_lists->expired[VDO_TREE_PAGE]; + list_for_each_entry_safe(page, ttmp, expired, entry) { + int result; + + list_del_init(&page->entry); + + result = ASSERT(!vdo_is_waiting(&page->waiter), + "Newly expired page not already waiting to write"); + if (result != VDO_SUCCESS) { + enter_zone_read_only_mode(zone, result); + continue; + } + + set_generation(zone, page, generation); + if (!page->writing) + enqueue_page(page, zone); + } + + expired = &zone->dirty_lists->expired[VDO_CACHE_PAGE]; + list_for_each_entry_safe(info, ptmp, expired, state_entry) { + list_del_init(&info->state_entry); + schedule_page_save(info); + } + + save_pages(&zone->page_cache); +} + +/** + * add_to_dirty_lists() - Add an element to the dirty lists. + * @zone: The zone in which we are operating. + * @entry: The list entry of the element to add. + * @type: The type of page. + * @old_period: The period in which the element was previously dirtied, or 0 if it was not dirty. + * @new_period: The period in which the element has now been dirtied, or 0 if it does not hold a + * lock. + */ +static void +add_to_dirty_lists(struct block_map_zone *zone, + struct list_head *entry, + enum block_map_page_type type, + sequence_number_t old_period, + sequence_number_t new_period) +{ + struct dirty_lists *dirty_lists = zone->dirty_lists; + + if ((old_period == new_period) || ((old_period != 0) && (old_period < new_period))) + return; + + if (new_period < dirty_lists->oldest_period) { + list_move_tail(entry, &dirty_lists->expired[type]); + } else { + update_period(dirty_lists, new_period); + list_move_tail(entry, + &dirty_lists->eras[new_period % dirty_lists->maximum_age][type]); + } + + write_expired_elements(zone); +} + +/* + * Record the allocation in the tree and wake any waiters now that the write lock has been + * released. + */ +static void finish_block_map_allocation(struct vdo_completion *completion) +{ + physical_block_number_t pbn; + struct tree_page *tree_page; + struct block_map_page *page; + sequence_number_t old_lock; + struct data_vio *data_vio = as_data_vio(completion); + struct block_map_zone *zone = data_vio->logical.zone->block_map_zone; + struct tree_lock *tree_lock = &data_vio->tree_lock; + height_t height = tree_lock->height; + + assert_data_vio_in_logical_zone(data_vio); + + tree_page = get_tree_page(zone, tree_lock); + pbn = tree_lock->tree_slots[height - 1].block_map_slot.pbn; + + /* Record the allocation. */ + page = (struct block_map_page *) tree_page->page_buffer; + old_lock = tree_page->recovery_lock; + vdo_update_block_map_page(page, + data_vio, + pbn, + VDO_MAPPING_STATE_UNCOMPRESSED, + &tree_page->recovery_lock); + + if (vdo_is_waiting(&tree_page->waiter)) { + /* This page is waiting to be written out. */ + if (zone->flusher != tree_page) + /* + * The outstanding flush won't cover the update we just made, so mark the + * page as needing another flush. + */ + set_generation(zone, tree_page, zone->generation); + } else { + /* Put the page on a dirty list */ + if (old_lock == 0) + INIT_LIST_HEAD(&tree_page->entry); + add_to_dirty_lists(zone, + &tree_page->entry, + VDO_TREE_PAGE, + old_lock, + tree_page->recovery_lock); + } + + tree_lock->height--; + if (height > 1) { + /* Format the interior node we just allocated (in memory). */ + tree_page = get_tree_page(zone, tree_lock); + vdo_format_block_map_page(tree_page->page_buffer, + zone->block_map->nonce, + pbn, + false); + } + + /* Release our claim to the allocation and wake any waiters */ + release_page_lock(data_vio, "allocation"); + vdo_notify_all_waiters(&tree_lock->waiters, continue_allocation_for_waiter, &pbn); + if (tree_lock->height == 0) { + finish_lookup(data_vio, VDO_SUCCESS); + return; + } + + allocate_block_map_page(zone, data_vio); +} + +static void release_block_map_write_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_allocated_zone(data_vio); + + release_data_vio_allocation_lock(data_vio, true); + launch_data_vio_logical_callback(data_vio, finish_block_map_allocation); +} + +/* + * Newly allocated block map pages are set to have to MAXIMUM_REFERENCES after they are journaled, + * to prevent deduplication against the block after we release the write lock on it, but before we + * write out the page. + */ +static void set_block_map_page_reference_count(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_allocated_zone(data_vio); + + completion->callback = release_block_map_write_lock; + vdo_modify_reference_count(completion, &data_vio->increment_updater); +} + +static void journal_block_map_allocation(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_journal_zone(data_vio); + + set_data_vio_allocated_zone_callback(data_vio, set_block_map_page_reference_count); + vdo_add_recovery_journal_entry(completion->vdo->recovery_journal, data_vio); +} + +static void allocate_block(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct tree_lock *lock = &data_vio->tree_lock; + physical_block_number_t pbn; + + assert_data_vio_in_allocated_zone(data_vio); + + if (!vdo_allocate_block_in_zone(data_vio)) + return; + + pbn = data_vio->allocation.pbn; + lock->tree_slots[lock->height - 1].block_map_slot.pbn = pbn; + data_vio->increment_updater = (struct reference_updater) { + .operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING, + .increment = true, + .zpbn = { + .pbn = pbn, + .state = VDO_MAPPING_STATE_UNCOMPRESSED, + }, + .lock = data_vio->allocation.lock, + }; + + launch_data_vio_journal_callback(data_vio, journal_block_map_allocation); +} + +static void allocate_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio) +{ + int result; + + if (!data_vio->write || data_vio->is_trim) { + /* This is a pure read or a trim, so there's nothing left to do here. */ + finish_lookup(data_vio, VDO_SUCCESS); + return; + } + + result = attempt_page_lock(zone, data_vio); + if (result != VDO_SUCCESS) { + abort_lookup(data_vio, result, "allocation"); + return; + } + + if (!data_vio->tree_lock.locked) + return; + + data_vio_allocate_data_block(data_vio, + VIO_BLOCK_MAP_WRITE_LOCK, + allocate_block, + allocation_failure); +} + +/* + * vdo_find_block_map_slot(): Find the block map slot in which the block map entry for a data_vio + * resides and cache that result in the data_vio. + * + * All ancestors in the tree will be allocated or loaded, as needed. + */ +void vdo_find_block_map_slot(struct data_vio *data_vio) +{ + page_number_t page_index; + struct block_map_tree_slot tree_slot; + struct data_location mapping; + struct block_map_page *page = NULL; + struct tree_lock *lock = &data_vio->tree_lock; + struct block_map_zone *zone = data_vio->logical.zone->block_map_zone; + + zone->active_lookups++; + if (vdo_is_state_draining(&zone->state)) { + finish_lookup(data_vio, VDO_SHUTTING_DOWN); + return; + } + + lock->tree_slots[0].block_map_slot.slot = + data_vio->logical.lbn % VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + page_index = (lock->tree_slots[0].page_index / zone->block_map->root_count); + tree_slot = (struct block_map_tree_slot) { + .page_index = page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE, + .block_map_slot = { + .pbn = 0, + .slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE, + }, + }; + + for (lock->height = 1; lock->height <= VDO_BLOCK_MAP_TREE_HEIGHT; lock->height++) { + physical_block_number_t pbn; + + lock->tree_slots[lock->height] = tree_slot; + page = (struct block_map_page *) (get_tree_page(zone, lock)->page_buffer); + pbn = vdo_get_block_map_page_pbn(page); + if (pbn != VDO_ZERO_BLOCK) { + lock->tree_slots[lock->height].block_map_slot.pbn = pbn; + break; + } + + /* Calculate the index and slot for the next level. */ + tree_slot.block_map_slot.slot = + tree_slot.page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + tree_slot.page_index = tree_slot.page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + } + + /* The page at this height has been allocated and loaded. */ + mapping = vdo_unpack_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]); + if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) { + uds_log_error_strerror(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with state %u for page index %u at height %u", + (unsigned long long) mapping.pbn, + mapping.state, + lock->tree_slots[lock->height - 1].page_index, + lock->height - 1); + abort_load(data_vio, VDO_BAD_MAPPING); + return; + } + + if (!vdo_is_mapped_location(&mapping)) { + /* The page we want one level down has not been allocated, so allocate it. */ + allocate_block_map_page(zone, data_vio); + return; + } + + lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn; + if (lock->height == 1) { + /* This is the ultimate block map page, so we're done */ + finish_lookup(data_vio, VDO_SUCCESS); + return; + } + + /* We know what page we need to load. */ + load_block_map_page(zone, data_vio); +} + +/* + * Find the PBN of a leaf block map page. This method may only be used after all allocated tree + * pages have been loaded, otherwise, it may give the wrong answer (0). + */ +physical_block_number_t +vdo_find_block_map_page_pbn(struct block_map *map, page_number_t page_number) +{ + struct data_location mapping; + struct tree_page *tree_page; + struct block_map_page *page; + root_count_t root_index = page_number % map->root_count; + page_number_t page_index = page_number / map->root_count; + slot_number_t slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + + page_index /= VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + + tree_page = get_tree_page_by_index(map->forest, root_index, 1, page_index); + page = (struct block_map_page *) tree_page->page_buffer; + if (!page->header.initialized) + return VDO_ZERO_BLOCK; + + mapping = vdo_unpack_block_map_entry(&page->entries[slot]); + if (!vdo_is_valid_location(&mapping) || vdo_is_state_compressed(mapping.state)) + return VDO_ZERO_BLOCK; + return mapping.pbn; +} + +/* + * Write a tree page or indicate that it has been re-dirtied if it is already being written. This + * method is used when correcting errors in the tree during read-only rebuild. + */ +void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone) +{ + bool waiting = vdo_is_waiting(&page->waiter); + + if (waiting && (zone->flusher == page)) + return; + + set_generation(zone, page, zone->generation); + if (waiting || page->writing) + return; + + enqueue_page(page, zone); +} + +static int make_segment(struct forest *old_forest, + block_count_t new_pages, + struct boundary *new_boundary, + struct forest *forest) +{ + size_t index = (old_forest == NULL) ? 0 : old_forest->segments; + struct tree_page *page_ptr; + page_count_t segment_sizes[VDO_BLOCK_MAP_TREE_HEIGHT]; + height_t height; + root_count_t root; + int result; + + forest->segments = index + 1; + + result = UDS_ALLOCATE(forest->segments, struct boundary, + "forest boundary array", &forest->boundaries); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(forest->segments, + struct tree_page *, + "forest page pointers", + &forest->pages); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(new_pages, + struct tree_page, + "new forest pages", + &forest->pages[index]); + if (result != VDO_SUCCESS) + return result; + + if (index > 0) { + memcpy(forest->boundaries, + old_forest->boundaries, + index * sizeof(struct boundary)); + memcpy(forest->pages, old_forest->pages, index * sizeof(struct tree_page *)); + } + + memcpy(&(forest->boundaries[index]), new_boundary, sizeof(struct boundary)); + + for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) { + segment_sizes[height] = new_boundary->levels[height]; + if (index > 0) + segment_sizes[height] -= old_forest->boundaries[index - 1].levels[height]; + } + + page_ptr = forest->pages[index]; + for (root = 0; root < forest->map->root_count; root++) { + struct block_map_tree_segment *segment; + struct block_map_tree *tree = &(forest->trees[root]); + height_t height; + + int result = UDS_ALLOCATE(forest->segments, + struct block_map_tree_segment, + "tree root segments", + &tree->segments); + if (result != VDO_SUCCESS) + return result; + + if (index > 0) + memcpy(tree->segments, + old_forest->trees[root].segments, + index * sizeof(struct block_map_tree_segment)); + + segment = &(tree->segments[index]); + for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) { + if (segment_sizes[height] == 0) + continue; + + segment->levels[height] = page_ptr; + if (height == (VDO_BLOCK_MAP_TREE_HEIGHT - 1)) { + /* Record the root. */ + struct block_map_page *page = + vdo_format_block_map_page(page_ptr->page_buffer, + forest->map->nonce, + VDO_INVALID_PBN, + true); + page->entries[0] = + vdo_pack_block_map_entry(forest->map->root_origin + root, + VDO_MAPPING_STATE_UNCOMPRESSED); + } + page_ptr += segment_sizes[height]; + } + } + + return VDO_SUCCESS; +} + +static void deforest(struct forest *forest, size_t first_page_segment) +{ + root_count_t root; + + if (forest->pages != NULL) { + size_t segment; + + for (segment = first_page_segment; segment < forest->segments; segment++) + UDS_FREE(forest->pages[segment]); + UDS_FREE(forest->pages); + } + + for (root = 0; root < forest->map->root_count; root++) + UDS_FREE(forest->trees[root].segments); + + UDS_FREE(forest->boundaries); + UDS_FREE(forest); +} + +/** + * make_forest() - Make a collection of trees for a block_map, expanding the existing forest if + * there is one. + * @entries: The number of entries the block map will hold. + * + * Return: VDO_SUCCESS or an error. + */ +static int make_forest(struct block_map *map, block_count_t entries) +{ + struct forest *forest, *old_forest = map->forest; + struct boundary new_boundary, *old_boundary = NULL; + block_count_t new_pages; + int result; + + if (old_forest != NULL) + old_boundary = &(old_forest->boundaries[old_forest->segments - 1]); + + new_pages = vdo_compute_new_forest_pages(map->root_count, old_boundary, + entries, &new_boundary); + if (new_pages == 0) { + map->next_entry_count = entries; + return VDO_SUCCESS; + } + + result = UDS_ALLOCATE_EXTENDED(struct forest, map->root_count, + struct block_map_tree, __func__, + &forest); + if (result != VDO_SUCCESS) + return result; + + forest->map = map; + result = make_segment(old_forest, new_pages, &new_boundary, forest); + if (result != VDO_SUCCESS) { + deforest(forest, forest->segments - 1); + return result; + } + + map->next_forest = forest; + map->next_entry_count = entries; + return VDO_SUCCESS; +} + +/** + * replace_forest() - Replace a block_map's forest with the already-prepared larger forest. + */ +static void replace_forest(struct block_map *map) +{ + if (map->next_forest != NULL) { + if (map->forest != NULL) + deforest(map->forest, map->forest->segments); + map->forest = UDS_FORGET(map->next_forest); + } + + map->entry_count = map->next_entry_count; + map->next_entry_count = 0; +} + +/** + * finish_cursor() - Finish the traversal of a single tree. If it was the last cursor, finish the + * traversal. + */ +static void finish_cursor(struct cursor *cursor) +{ + struct cursors *cursors = cursor->parent; + struct vdo_completion *parent = cursors->parent; + + return_vio_to_pool(cursors->pool, UDS_FORGET(cursor->vio)); + if (--cursors->active_roots > 0) + return; + + UDS_FREE(cursors); + + vdo_finish_completion(parent); +} + +static void traverse(struct cursor *cursor); + +/** + * continue_traversal() - Continue traversing a block map tree. + * @completion: The VIO doing a read or write. + */ +static void continue_traversal(struct vdo_completion *completion) +{ + vio_record_metadata_io_error(as_vio(completion)); + traverse(completion->parent); +} + +/** + * finish_traversal_load() - Continue traversing a block map tree now that a page has been loaded. + * @completion: The VIO doing the read. + */ +static void finish_traversal_load(struct vdo_completion *completion) +{ + struct cursor *cursor = completion->parent; + height_t height = cursor->height; + struct cursor_level *level = &cursor->levels[height]; + struct tree_page *tree_page = + &(cursor->tree->segments[0].levels[height][level->page_index]); + struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer; + + vdo_copy_valid_page(cursor->vio->vio.data, + cursor->parent->zone->block_map->nonce, + pbn_from_vio_bio(cursor->vio->vio.bio), + page); + traverse(cursor); +} + +static void traversal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct cursor *cursor = vio->completion.parent; + + continue_vio_after_io(vio, finish_traversal_load, cursor->parent->zone->thread_id); +} + +/** + * traverse() - Traverse a single block map tree. + * + * This is the recursive heart of the traversal process. + */ +static void traverse(struct cursor *cursor) +{ + for (; cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT; cursor->height++) { + height_t height = cursor->height; + struct cursor_level *level = &cursor->levels[height]; + struct tree_page *tree_page = + &(cursor->tree->segments[0].levels[height][level->page_index]); + struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer; + + if (!page->header.initialized) + continue; + + for (; level->slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) { + struct cursor_level *next_level; + page_number_t entry_index = + (VDO_BLOCK_MAP_ENTRIES_PER_PAGE * level->page_index) + level->slot; + struct data_location location = + vdo_unpack_block_map_entry(&page->entries[level->slot]); + + if (!vdo_is_valid_location(&location)) { + /* This entry is invalid, so remove it from the page. */ + page->entries[level->slot] = + vdo_pack_block_map_entry(VDO_ZERO_BLOCK, + VDO_MAPPING_STATE_UNMAPPED); + vdo_write_tree_page(tree_page, cursor->parent->zone); + continue; + } + + if (!vdo_is_mapped_location(&location)) + continue; + + /* Erase mapped entries past the end of the logical space. */ + if (entry_index >= cursor->boundary.levels[height]) { + page->entries[level->slot] = + vdo_pack_block_map_entry(VDO_ZERO_BLOCK, + VDO_MAPPING_STATE_UNMAPPED); + vdo_write_tree_page(tree_page, cursor->parent->zone); + continue; + } + + if (cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT - 1) { + int result = + cursor->parent->entry_callback(location.pbn, + cursor->parent->parent); + + if (result != VDO_SUCCESS) { + page->entries[level->slot] = + vdo_pack_block_map_entry(VDO_ZERO_BLOCK, + VDO_MAPPING_STATE_UNMAPPED); + vdo_write_tree_page(tree_page, cursor->parent->zone); + continue; + } + } + + if (cursor->height == 0) + continue; + + cursor->height--; + next_level = &cursor->levels[cursor->height]; + next_level->page_index = entry_index; + next_level->slot = 0; + level->slot++; + submit_metadata_vio(&cursor->vio->vio, + location.pbn, + traversal_endio, + continue_traversal, + REQ_OP_READ | REQ_PRIO); + return; + } + } + + finish_cursor(cursor); +} + +/** + * launch_cursor() - Start traversing a single block map tree now that the cursor has a VIO with + * which to load pages. + * @context: The pooled_vio just acquired. + * + * Implements waiter_callback. + */ +static void launch_cursor(struct waiter *waiter, void *context) +{ + struct cursor *cursor = container_of(waiter, struct cursor, waiter); + struct pooled_vio *pooled = context; + + cursor->vio = pooled; + pooled->vio.completion.parent = cursor; + pooled->vio.completion.callback_thread_id = cursor->parent->zone->thread_id; + traverse(cursor); +} + +/** + * compute_boundary() - Compute the number of pages used at each level of the given root's tree. + * + * Return: The list of page counts as a boundary structure. + */ +static struct boundary compute_boundary(struct block_map *map, root_count_t root_index) +{ + struct boundary boundary; + height_t height; + page_count_t leaf_pages = vdo_compute_block_map_page_count(map->entry_count); + /* + * Compute the leaf pages for this root. If the number of leaf pages does not distribute + * evenly, we must determine if this root gets an extra page. Extra pages are assigned to + * roots starting from tree 0. + */ + page_count_t last_tree_root = (leaf_pages - 1) % map->root_count; + page_count_t level_pages = leaf_pages / map->root_count; + + if (root_index <= last_tree_root) + level_pages++; + + for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT - 1; height++) { + boundary.levels[height] = level_pages; + level_pages = DIV_ROUND_UP(level_pages, VDO_BLOCK_MAP_ENTRIES_PER_PAGE); + } + + /* The root node always exists, even if the root is otherwise unused. */ + boundary.levels[VDO_BLOCK_MAP_TREE_HEIGHT - 1] = 1; + + return boundary; +} + +/** + * vdo_traverse_forest() - Walk the entire forest of a block map. + * @callback: A function to call with the pbn of each allocated node in the forest. + * @parent: The completion to notify on each traversed PBN, and when the traversal is complete. + */ +void vdo_traverse_forest(struct block_map *map, + vdo_entry_callback *callback, + struct vdo_completion *parent) +{ + root_count_t root; + struct cursors *cursors; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct cursors, + map->root_count, + struct cursor, + __func__, + &cursors); + if (result != VDO_SUCCESS) { + vdo_fail_completion(parent, result); + return; + } + + cursors->zone = &map->zones[0]; + cursors->pool = cursors->zone->vio_pool; + cursors->entry_callback = callback; + cursors->parent = parent; + cursors->active_roots = map->root_count; + for (root = 0; root < map->root_count; root++) { + struct cursor *cursor = &cursors->cursors[root]; + + *cursor = (struct cursor) { + .tree = &map->forest->trees[root], + .height = VDO_BLOCK_MAP_TREE_HEIGHT - 1, + .parent = cursors, + .boundary = compute_boundary(map, root), + }; + + cursor->waiter.callback = launch_cursor; + acquire_vio_from_pool(cursors->pool, &cursor->waiter); + }; +} + +/** + * initialize_block_map_zone() - Initialize the per-zone portions of the block map. + * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be + * written out. + */ +static int __must_check initialize_block_map_zone(struct block_map *map, + zone_count_t zone_number, + struct vdo *vdo, + page_count_t cache_size, + block_count_t maximum_age) +{ + int result; + block_count_t i; + struct block_map_zone *zone = &map->zones[zone_number]; + + STATIC_ASSERT_SIZEOF(struct page_descriptor, sizeof(u64)); + + zone->zone_number = zone_number; + zone->thread_id = vdo->thread_config.logical_threads[zone_number]; + zone->block_map = map; + + result = UDS_ALLOCATE_EXTENDED(struct dirty_lists, + maximum_age, + dirty_era_t, + __func__, + &zone->dirty_lists); + if (result != VDO_SUCCESS) + return result; + + zone->dirty_lists->maximum_age = maximum_age; + INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_TREE_PAGE]); + INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_CACHE_PAGE]); + + for (i = 0; i < maximum_age; i++) { + INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_TREE_PAGE]); + INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_CACHE_PAGE]); + } + + result = vdo_make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->loading_pages); + if (result != VDO_SUCCESS) + return result; + + result = make_vio_pool(vdo, + BLOCK_MAP_VIO_POOL_SIZE, + zone->thread_id, + VIO_TYPE_BLOCK_MAP_INTERIOR, + VIO_PRIORITY_METADATA, + zone, + &zone->vio_pool); + if (result != VDO_SUCCESS) + return result; + + vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + + zone->page_cache.zone = zone; + zone->page_cache.vdo = vdo; + zone->page_cache.page_count = cache_size / map->zone_count; + zone->page_cache.stats.free_pages = zone->page_cache.page_count; + + result = allocate_cache_components(&zone->page_cache); + if (result != VDO_SUCCESS) + return result; + + /* initialize empty circular queues */ + INIT_LIST_HEAD(&zone->page_cache.lru_list); + INIT_LIST_HEAD(&zone->page_cache.outgoing_list); + + return VDO_SUCCESS; +} + +/* Implements vdo_zone_thread_getter */ +static thread_id_t get_block_map_zone_thread_id(void *context, zone_count_t zone_number) +{ + struct block_map *map = context; + + return map->zones[zone_number].thread_id; +} + +/* Implements vdo_action_preamble */ +static void prepare_for_era_advance(void *context, struct vdo_completion *parent) +{ + struct block_map *map = context; + + map->current_era_point = map->pending_era_point; + vdo_finish_completion(parent); +} + +/* Implements vdo_zone_action */ +static void advance_block_map_zone_era(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct block_map *map = context; + struct block_map_zone *zone = &map->zones[zone_number]; + + update_period(zone->dirty_lists, map->current_era_point); + write_expired_elements(zone); + vdo_finish_completion(parent); +} + +/* + * Schedule an era advance if necessary. This method should not be called directly. Rather, call + * vdo_schedule_default_action() on the block map's action manager. + * + * Implements vdo_action_scheduler. + */ +static bool schedule_era_advance(void *context) +{ + struct block_map *map = context; + + if (map->current_era_point == map->pending_era_point) + return false; + + return vdo_schedule_action(map->action_manager, + prepare_for_era_advance, + advance_block_map_zone_era, + NULL, + NULL); +} + +static void uninitialize_block_map_zone(struct block_map_zone *zone) +{ + struct vdo_page_cache *cache = &zone->page_cache; + + UDS_FREE(UDS_FORGET(zone->dirty_lists)); + free_vio_pool(UDS_FORGET(zone->vio_pool)); + vdo_free_int_map(UDS_FORGET(zone->loading_pages)); + if (cache->infos != NULL) { + struct page_info *info; + + for (info = cache->infos; info < cache->infos + cache->page_count; ++info) + free_vio(UDS_FORGET(info->vio)); + } + + vdo_free_int_map(UDS_FORGET(cache->page_map)); + UDS_FREE(UDS_FORGET(cache->infos)); + UDS_FREE(UDS_FORGET(cache->pages)); +} + +void vdo_free_block_map(struct block_map *map) +{ + zone_count_t zone; + + if (map == NULL) + return; + + for (zone = 0; zone < map->zone_count; zone++) + uninitialize_block_map_zone(&map->zones[zone]); + + vdo_abandon_block_map_growth(map); + if (map->forest != NULL) + deforest(UDS_FORGET(map->forest), 0); + UDS_FREE(UDS_FORGET(map->action_manager)); + UDS_FREE(map); +} + +/* @journal may be NULL. */ +int vdo_decode_block_map(struct block_map_state_2_0 state, + block_count_t logical_blocks, + struct vdo *vdo, + struct recovery_journal *journal, + nonce_t nonce, + page_count_t cache_size, + block_count_t maximum_age, + struct block_map **map_ptr) +{ + struct block_map *map; + int result; + zone_count_t zone = 0; + + STATIC_ASSERT(VDO_BLOCK_MAP_ENTRIES_PER_PAGE == + ((VDO_BLOCK_SIZE - sizeof(struct block_map_page)) / + sizeof(struct block_map_entry))); + result = ASSERT(cache_size > 0, "block map cache size is specified"); + if (result != UDS_SUCCESS) + return result; + + result = UDS_ALLOCATE_EXTENDED(struct block_map, + vdo->thread_config.logical_zone_count, + struct block_map_zone, + __func__, + &map); + if (result != UDS_SUCCESS) + return result; + + map->vdo = vdo; + map->root_origin = state.root_origin; + map->root_count = state.root_count; + map->entry_count = logical_blocks; + map->journal = journal; + map->nonce = nonce; + + result = make_forest(map, map->entry_count); + if (result != VDO_SUCCESS) { + vdo_free_block_map(map); + return result; + } + + replace_forest(map); + + map->zone_count = vdo->thread_config.logical_zone_count; + for (zone = 0; zone < map->zone_count; zone++) { + result = initialize_block_map_zone(map, zone, vdo, cache_size, maximum_age); + if (result != VDO_SUCCESS) { + vdo_free_block_map(map); + return result; + } + } + + result = vdo_make_action_manager(map->zone_count, + get_block_map_zone_thread_id, + vdo_get_recovery_journal_thread_id(journal), + map, + schedule_era_advance, + vdo, + &map->action_manager); + if (result != VDO_SUCCESS) { + vdo_free_block_map(map); + return result; + } + + *map_ptr = map; + return VDO_SUCCESS; +} + +struct block_map_state_2_0 vdo_record_block_map(const struct block_map *map) +{ + return (struct block_map_state_2_0) { + .flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, + /* This is the flat page count, which has turned out to always be 0. */ + .flat_page_count = 0, + .root_origin = map->root_origin, + .root_count = map->root_count, + }; +} + +/* The block map needs to know the journals' sequence number to initialize the eras. */ +void vdo_initialize_block_map_from_journal(struct block_map *map, struct recovery_journal *journal) +{ + zone_count_t z = 0; + + map->current_era_point = vdo_get_recovery_journal_current_sequence_number(journal); + map->pending_era_point = map->current_era_point; + + for (z = 0; z < map->zone_count; z++) { + struct dirty_lists *dirty_lists = map->zones[z].dirty_lists; + + ASSERT_LOG_ONLY(dirty_lists->next_period == 0, "current period not set"); + dirty_lists->oldest_period = map->current_era_point; + dirty_lists->next_period = map->current_era_point + 1; + dirty_lists->offset = map->current_era_point % dirty_lists->maximum_age; + } +} + +/* Compute the logical zone for the LBN of a data vio. */ +zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio) +{ + struct block_map *map = vdo_from_data_vio(data_vio)->block_map; + struct tree_lock *tree_lock = &data_vio->tree_lock; + page_number_t page_number = data_vio->logical.lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + + tree_lock->tree_slots[0].page_index = page_number; + tree_lock->root_index = page_number % map->root_count; + return (tree_lock->root_index % map->zone_count); +} + +/** + * vdo_find_block_map_slot() - Compute the block map slot in which the block map entry for a + * data_vio resides and cache that in the data_vio. + * @thread_id: The thread on which to run the callback. + * Update the block map era information for a newly finished journal block. + * This method must be called from the journal zone thread. + */ +void vdo_advance_block_map_era(struct block_map *map, sequence_number_t recovery_block_number) +{ + if (map == NULL) + return; + + map->pending_era_point = recovery_block_number; + vdo_schedule_default_action(map->action_manager); +} + +/* Implements vdo_admin_initiator */ +static void initiate_drain(struct admin_state *state) +{ + struct block_map_zone *zone = container_of(state, struct block_map_zone, state); + + ASSERT_LOG_ONLY((zone->active_lookups == 0), + "%s() called with no active lookups", + __func__); + + if (!vdo_is_state_suspending(state)) { + while (zone->dirty_lists->oldest_period < zone->dirty_lists->next_period) + expire_oldest_list(zone->dirty_lists); + write_expired_elements(zone); + } + + check_for_drain_complete(zone); +} + +/* Implements vdo_zone_action. */ +static void drain_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct block_map *map = context; + struct block_map_zone *zone = &map->zones[zone_number]; + + vdo_start_draining(&zone->state, + vdo_get_current_manager_operation(map->action_manager), + parent, + initiate_drain); +} + +void vdo_drain_block_map(struct block_map *map, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + vdo_schedule_operation(map->action_manager, operation, NULL, drain_zone, NULL, parent); +} + +/* Implements vdo_zone_action. */ +static void +resume_block_map_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct block_map *map = context; + struct block_map_zone *zone = &map->zones[zone_number]; + + vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state)); +} + +void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent) +{ + vdo_schedule_operation(map->action_manager, + VDO_ADMIN_STATE_RESUMING, + NULL, + resume_block_map_zone, + NULL, + parent); +} + +/* Allocate an expanded collection of trees, for a future growth. */ +int vdo_prepare_to_grow_block_map(struct block_map *map, block_count_t new_logical_blocks) +{ + if (map->next_entry_count == new_logical_blocks) + return VDO_SUCCESS; + + if (map->next_entry_count > 0) + vdo_abandon_block_map_growth(map); + + if (new_logical_blocks < map->entry_count) { + map->next_entry_count = map->entry_count; + return VDO_SUCCESS; + } + + return make_forest(map, new_logical_blocks); +} + +/* Implements vdo_action_preamble */ +static void grow_forest(void *context, struct vdo_completion *completion) +{ + replace_forest(context); + vdo_finish_completion(completion); +} + +/* Requires vdo_prepare_to_grow_block_map() to have been previously called. */ +void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent) +{ + vdo_schedule_operation(map->action_manager, + VDO_ADMIN_STATE_SUSPENDED_OPERATION, + grow_forest, + NULL, + NULL, + parent); +} + +void vdo_abandon_block_map_growth(struct block_map *map) +{ + struct forest *forest = UDS_FORGET(map->next_forest); + + if (forest != NULL) + deforest(forest, forest->segments - 1); + + map->next_entry_count = 0; +} + +/* Release the page completion and then continue the requester. */ +static inline void finish_processing_page(struct vdo_completion *completion, int result) +{ + struct vdo_completion *parent = completion->parent; + + vdo_release_page_completion(completion); + vdo_continue_completion(parent, result); +} + +static void handle_page_error(struct vdo_completion *completion) +{ + finish_processing_page(completion, completion->result); +} + +/* Fetch the mapping page for a block map update, and call the provided handler when fetched. */ +static void fetch_mapping_page(struct data_vio *data_vio, bool modifiable, vdo_action *action) +{ + struct block_map_zone *zone = data_vio->logical.zone->block_map_zone; + + if (vdo_is_state_draining(&zone->state)) { + continue_data_vio_with_error(data_vio, VDO_SHUTTING_DOWN); + return; + } + + vdo_get_page(&data_vio->page_completion, + zone, + data_vio->tree_lock.tree_slots[0].block_map_slot.pbn, + modifiable, + &data_vio->vio.completion, + action, + handle_page_error, + false); +} + +/** + * clear_mapped_location() - Clear a data_vio's mapped block location, setting it to be unmapped. + * + * This indicates the block map entry for the logical block is either unmapped or corrupted. + */ +static void clear_mapped_location(struct data_vio *data_vio) +{ + data_vio->mapped = (struct zoned_pbn) { + .state = VDO_MAPPING_STATE_UNMAPPED, + }; +} + +/** + * set_mapped_location() - Decode and validate a block map entry, and set the mapped location of a + * data_vio. + * + * Return: VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid or an error code for any + * other failure + */ +static int __must_check +set_mapped_location(struct data_vio *data_vio, const struct block_map_entry *entry) +{ + /* Unpack the PBN for logging purposes even if the entry is invalid. */ + struct data_location mapped = vdo_unpack_block_map_entry(entry); + + if (vdo_is_valid_location(&mapped)) { + int result; + + result = vdo_get_physical_zone(vdo_from_data_vio(data_vio), + mapped.pbn, + &data_vio->mapped.zone); + if (result == VDO_SUCCESS) { + data_vio->mapped.pbn = mapped.pbn; + data_vio->mapped.state = mapped.state; + return VDO_SUCCESS; + } + + /* + * Return all errors not specifically known to be errors from validating the + * location. + */ + if ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING)) + return result; + } + + /* + * Log the corruption even if we wind up ignoring it for write VIOs, converting all cases + * to VDO_BAD_MAPPING. + */ + uds_log_error_strerror(VDO_BAD_MAPPING, + "PBN %llu with state %u read from the block map was invalid", + (unsigned long long) mapped.pbn, + mapped.state); + + /* + * A read VIO has no option but to report the bad mapping--reading zeros would be hiding + * known data loss. + */ + if (!data_vio->write) + return VDO_BAD_MAPPING; + + /* + * A write VIO only reads this mapping to decref the old block. Treat this as an unmapped + * entry rather than fail the write. + */ + clear_mapped_location(data_vio); + return VDO_SUCCESS; +} + +/* This callback is registered in vdo_get_mapped_block(). */ +static void get_mapping_from_fetched_page(struct vdo_completion *completion) +{ + int result; + struct vdo_page_completion *vpc = as_vdo_page_completion(completion); + const struct block_map_page *page; + const struct block_map_entry *entry; + struct data_vio *data_vio = as_data_vio(completion->parent); + struct block_map_tree_slot *tree_slot; + + if (completion->result != VDO_SUCCESS) { + finish_processing_page(completion, completion->result); + return; + } + + result = validate_completed_page(vpc, false); + if (result != VDO_SUCCESS) { + finish_processing_page(completion, result); + return; + } + + page = (const struct block_map_page *) get_page_buffer(vpc->info); + tree_slot = &data_vio->tree_lock.tree_slots[0]; + entry = &page->entries[tree_slot->block_map_slot.slot]; + + result = set_mapped_location(data_vio, entry); + finish_processing_page(completion, result); +} + +void vdo_update_block_map_page(struct block_map_page *page, + struct data_vio *data_vio, + physical_block_number_t pbn, + enum block_mapping_state mapping_state, + sequence_number_t *recovery_lock) +{ + struct block_map_zone *zone = data_vio->logical.zone->block_map_zone; + struct block_map *block_map = zone->block_map; + struct recovery_journal *journal = block_map->journal; + sequence_number_t old_locked, new_locked; + struct tree_lock *tree_lock = &data_vio->tree_lock; + + /* Encode the new mapping. */ + page->entries[tree_lock->tree_slots[tree_lock->height].block_map_slot.slot] = + vdo_pack_block_map_entry(pbn, mapping_state); + + /* Adjust references on the recovery journal blocks. */ + old_locked = *recovery_lock; + new_locked = data_vio->recovery_sequence_number; + + if ((old_locked == 0) || (old_locked > new_locked)) { + vdo_acquire_recovery_journal_block_reference(journal, + new_locked, + VDO_ZONE_TYPE_LOGICAL, + zone->zone_number); + + if (old_locked > 0) + vdo_release_recovery_journal_block_reference(journal, + old_locked, + VDO_ZONE_TYPE_LOGICAL, + zone->zone_number); + + *recovery_lock = new_locked; + } + + /* + * FIXME: explain this more + * Release the transferred lock from the data_vio. + */ + vdo_release_journal_entry_lock(journal, new_locked); + data_vio->recovery_sequence_number = 0; +} + +static void put_mapping_in_fetched_page(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion->parent); + sequence_number_t old_lock; + struct vdo_page_completion *vpc; + struct page_info *info; + int result; + + if (completion->result != VDO_SUCCESS) { + finish_processing_page(completion, completion->result); + return; + } + + vpc = as_vdo_page_completion(completion); + result = validate_completed_page(vpc, true); + if (result != VDO_SUCCESS) { + finish_processing_page(completion, result); + return; + } + + info = vpc->info; + old_lock = info->recovery_lock; + vdo_update_block_map_page((struct block_map_page *) get_page_buffer(info), + data_vio, + data_vio->new_mapped.pbn, + data_vio->new_mapped.state, + &info->recovery_lock); + set_info_state(info, PS_DIRTY); + add_to_dirty_lists(info->cache->zone, + &info->state_entry, + VDO_CACHE_PAGE, + old_lock, + info->recovery_lock); + finish_processing_page(completion, VDO_SUCCESS); +} + +/* Read a stored block mapping into a data_vio. */ +void vdo_get_mapped_block(struct data_vio *data_vio) +{ + if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) { + /* + * We know that the block map page for this LBN has not been allocated, so the + * block must be unmapped. + */ + clear_mapped_location(data_vio); + continue_data_vio(data_vio); + return; + } + + fetch_mapping_page(data_vio, false, get_mapping_from_fetched_page); +} + +/* Update a stored block mapping to reflect a data_vio's new mapping. */ +void vdo_put_mapped_block(struct data_vio *data_vio) +{ + fetch_mapping_page(data_vio, true, put_mapping_in_fetched_page); +} + +struct block_map_statistics vdo_get_block_map_statistics(struct block_map *map) +{ + zone_count_t zone = 0; + struct block_map_statistics totals; + + memset(&totals, 0, sizeof(struct block_map_statistics)); + for (zone = 0; zone < map->zone_count; zone++) { + const struct block_map_statistics *stats = &(map->zones[zone].page_cache.stats); + + totals.dirty_pages += READ_ONCE(stats->dirty_pages); + totals.clean_pages += READ_ONCE(stats->clean_pages); + totals.free_pages += READ_ONCE(stats->free_pages); + totals.failed_pages += READ_ONCE(stats->failed_pages); + totals.incoming_pages += READ_ONCE(stats->incoming_pages); + totals.outgoing_pages += READ_ONCE(stats->outgoing_pages); + totals.cache_pressure += READ_ONCE(stats->cache_pressure); + totals.read_count += READ_ONCE(stats->read_count); + totals.write_count += READ_ONCE(stats->write_count); + totals.failed_reads += READ_ONCE(stats->failed_reads); + totals.failed_writes += READ_ONCE(stats->failed_writes); + totals.reclaimed += READ_ONCE(stats->reclaimed); + totals.read_outgoing += READ_ONCE(stats->read_outgoing); + totals.found_in_cache += READ_ONCE(stats->found_in_cache); + totals.discard_required += READ_ONCE(stats->discard_required); + totals.wait_for_page += READ_ONCE(stats->wait_for_page); + totals.fetch_required += READ_ONCE(stats->fetch_required); + totals.pages_loaded += READ_ONCE(stats->pages_loaded); + totals.pages_saved += READ_ONCE(stats->pages_saved); + totals.flush_count += READ_ONCE(stats->flush_count); + } + + return totals; +} diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h new file mode 100644 index 00000000000..e7d243ae5bb --- /dev/null +++ b/drivers/md/dm-vdo/block-map.h @@ -0,0 +1,391 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_BLOCK_MAP_H +#define VDO_BLOCK_MAP_H + +#include <linux/list.h> + +#include "numeric.h" + +#include "admin-state.h" +#include "completion.h" +#include "encodings.h" +#include "int-map.h" +#include "statistics.h" +#include "types.h" +#include "vio.h" +#include "wait-queue.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +/* Used to indicate that the page holding the location of a tree root has been "loaded". */ +extern const physical_block_number_t VDO_INVALID_PBN; + +/* + * Generation counter for page references. + */ +typedef u32 vdo_page_generation; + +static const physical_block_number_t NO_PAGE = 0xFFFFFFFFFFFFFFFF; + +/* The VDO Page Cache abstraction. */ +struct vdo_page_cache { + /* the VDO which owns this cache */ + struct vdo *vdo; + /* number of pages in cache */ + page_count_t page_count; + /* number of pages to write in the current batch */ + page_count_t pages_in_batch; + /* Whether the VDO is doing a read-only rebuild */ + bool rebuilding; + + /* array of page information entries */ + struct page_info *infos; + /* raw memory for pages */ + char *pages; + /* cache last found page info */ + struct page_info *last_found; + /* map of page number to info */ + struct int_map *page_map; + /* main LRU list (all infos) */ + struct list_head lru_list; + /* free page list (oldest first) */ + struct list_head free_list; + /* outgoing page list */ + struct list_head outgoing_list; + /* number of read I/O operations pending */ + page_count_t outstanding_reads; + /* number of write I/O operations pending */ + page_count_t outstanding_writes; + /* number of pages covered by the current flush */ + page_count_t pages_in_flush; + /* number of pages waiting to be included in the next flush */ + page_count_t pages_to_flush; + /* number of discards in progress */ + unsigned int discard_count; + /* how many VPCs waiting for free page */ + unsigned int waiter_count; + /* queue of waiters who want a free page */ + struct wait_queue free_waiters; + /* + * Statistics are only updated on the logical zone thread, but are accessed from other + * threads. + */ + struct block_map_statistics stats; + /* counter for pressure reports */ + u32 pressure_report; + /* the block map zone to which this cache belongs */ + struct block_map_zone *zone; +}; + +/* + * The state of a page buffer. If the page buffer is free no particular page is bound to it, + * otherwise the page buffer is bound to particular page whose absolute pbn is in the pbn field. If + * the page is resident or dirty the page data is stable and may be accessed. Otherwise the page is + * in flight (incoming or outgoing) and its data should not be accessed. + * + * @note Update the static data in get_page_state_name() if you change this enumeration. + */ +enum vdo_page_buffer_state { + /* this page buffer is not being used */ + PS_FREE, + /* this page is being read from store */ + PS_INCOMING, + /* attempt to load this page failed */ + PS_FAILED, + /* this page is valid and un-modified */ + PS_RESIDENT, + /* this page is valid and modified */ + PS_DIRTY, + /* this page is being written and should not be used */ + PS_OUTGOING, + /* not a state */ + PAGE_STATE_COUNT, +} __packed; + +/* + * The write status of page + */ +enum vdo_page_write_status { + WRITE_STATUS_NORMAL, + WRITE_STATUS_DISCARD, + WRITE_STATUS_DEFERRED, +} __packed; + +/* Per-page-slot information. */ +struct page_info { + /* Preallocated page struct vio */ + struct vio *vio; + /* back-link for references */ + struct vdo_page_cache *cache; + /* the pbn of the page */ + physical_block_number_t pbn; + /* page is busy (temporarily locked) */ + u16 busy; + /* the write status the page */ + enum vdo_page_write_status write_status; + /* page state */ + enum vdo_page_buffer_state state; + /* queue of completions awaiting this item */ + struct wait_queue waiting; + /* state linked list entry */ + struct list_head state_entry; + /* LRU entry */ + struct list_head lru_entry; + /* + * The earliest recovery journal block containing uncommitted updates to the block map page + * associated with this page_info. A reference (lock) is held on that block to prevent it + * from being reaped. When this value changes, the reference on the old value must be + * released and a reference on the new value must be acquired. + */ + sequence_number_t recovery_lock; +}; + +/* + * A completion awaiting a specific page. Also a live reference into the page once completed, until + * freed. + */ +struct vdo_page_completion { + /* The generic completion */ + struct vdo_completion completion; + /* The cache involved */ + struct vdo_page_cache *cache; + /* The waiter for the pending list */ + struct waiter waiter; + /* The absolute physical block number of the page on disk */ + physical_block_number_t pbn; + /* Whether the page may be modified */ + bool writable; + /* Whether the page is available */ + bool ready; + /* The info structure for the page, only valid when ready */ + struct page_info *info; +}; + +struct forest; + +struct tree_page { + struct waiter waiter; + + /* Dirty list entry */ + struct list_head entry; + + /* If dirty, the tree zone flush generation in which it was last dirtied. */ + u8 generation; + + /* Whether this page is an interior tree page being written out. */ + bool writing; + + /* If writing, the tree zone flush generation of the copy being written. */ + u8 writing_generation; + + /* + * Sequence number of the earliest recovery journal block containing uncommitted updates to + * this page + */ + sequence_number_t recovery_lock; + + /* The value of recovery_lock when the this page last started writing */ + sequence_number_t writing_recovery_lock; + + char page_buffer[VDO_BLOCK_SIZE]; +}; + +enum block_map_page_type { + VDO_TREE_PAGE, + VDO_CACHE_PAGE, +}; + +typedef struct list_head dirty_era_t[2]; + +struct dirty_lists { + /** The number of periods after which an element will be expired */ + block_count_t maximum_age; + /** The oldest period which has unexpired elements */ + sequence_number_t oldest_period; + /** One more than the current period */ + sequence_number_t next_period; + /** The offset in the array of lists of the oldest period */ + block_count_t offset; + /** Expired pages */ + dirty_era_t expired; + /** The lists of dirty pages */ + dirty_era_t eras[]; +}; + +struct block_map_zone { + zone_count_t zone_number; + thread_id_t thread_id; + struct admin_state state; + struct block_map *block_map; + /* Dirty pages, by era*/ + struct dirty_lists *dirty_lists; + struct vdo_page_cache page_cache; + data_vio_count_t active_lookups; + struct int_map *loading_pages; + struct vio_pool *vio_pool; + /* The tree page which has issued or will be issuing a flush */ + struct tree_page *flusher; + struct wait_queue flush_waiters; + /* The generation after the most recent flush */ + u8 generation; + u8 oldest_generation; + /* The counts of dirty pages in each generation */ + u32 dirty_page_counts[256]; +}; + +struct block_map { + struct vdo *vdo; + struct action_manager *action_manager; + /* The absolute PBN of the first root of the tree part of the block map */ + physical_block_number_t root_origin; + block_count_t root_count; + + /* The era point we are currently distributing to the zones */ + sequence_number_t current_era_point; + /* The next era point */ + sequence_number_t pending_era_point; + + /* The number of entries in block map */ + block_count_t entry_count; + nonce_t nonce; + struct recovery_journal *journal; + + /* The trees for finding block map pages */ + struct forest *forest; + /* The expanded trees awaiting growth */ + struct forest *next_forest; + /* The number of entries after growth */ + block_count_t next_entry_count; + + zone_count_t zone_count; + struct block_map_zone zones[]; +}; + +/** + * typedef vdo_entry_callback - A function to be called for each allocated PBN when traversing the + * forest. + * @pbn: A PBN of a tree node. + * @completion: The parent completion of the traversal. + * + * Return: VDO_SUCCESS or an error. + */ +typedef int vdo_entry_callback(physical_block_number_t pbn, struct vdo_completion *completion); + +static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION); + return container_of(completion, struct vdo_page_completion, completion); +} + +void vdo_release_page_completion(struct vdo_completion *completion); + +void vdo_get_page(struct vdo_page_completion *page_completion, + struct block_map_zone *zone, + physical_block_number_t pbn, + bool writable, + void *parent, + vdo_action *callback, + vdo_action *error_handler, + bool requeue); + +void vdo_request_page_write(struct vdo_completion *completion); + +int __must_check vdo_get_cached_page(struct vdo_completion *completion, + struct block_map_page **page_ptr); + +int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache); + +static inline struct block_map_page * __must_check +vdo_as_block_map_page(struct tree_page *tree_page) +{ + return (struct block_map_page *) tree_page->page_buffer; +} + +bool vdo_copy_valid_page(char *buffer, nonce_t nonce, + physical_block_number_t pbn, + struct block_map_page *page); + +void vdo_find_block_map_slot(struct data_vio *data_vio); + +physical_block_number_t +vdo_find_block_map_page_pbn(struct block_map *map, page_number_t page_number); + +void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone); + +void vdo_traverse_forest(struct block_map *map, + vdo_entry_callback *callback, + struct vdo_completion *parent); + +int __must_check vdo_decode_block_map(struct block_map_state_2_0 state, + block_count_t logical_blocks, + struct vdo *vdo, + struct recovery_journal *journal, + nonce_t nonce, + page_count_t cache_size, + block_count_t maximum_age, + struct block_map **map_ptr); + +void vdo_drain_block_map(struct block_map *map, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent); + +int __must_check +vdo_prepare_to_grow_block_map(struct block_map *map, block_count_t new_logical_blocks); + +void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent); + +void vdo_abandon_block_map_growth(struct block_map *map); + +void vdo_free_block_map(struct block_map *map); + +struct block_map_state_2_0 __must_check +vdo_record_block_map(const struct block_map *map); + +void vdo_initialize_block_map_from_journal(struct block_map *map, + struct recovery_journal *journal); + +zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio); + +void vdo_advance_block_map_era(struct block_map *map, sequence_number_t recovery_block_number); + +void vdo_update_block_map_page(struct block_map_page *page, + struct data_vio *data_vio, + physical_block_number_t pbn, + enum block_mapping_state mapping_state, + sequence_number_t *recovery_lock); + +void vdo_get_mapped_block(struct data_vio *data_vio); + +void vdo_put_mapped_block(struct data_vio *data_vio); + +struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map); + +/** + * convert_maximum_age(): Convert the maximum age to reflect the new recovery journal format + * @age: The configured maximum age + * + * Return: The converted age + * + * In the old recovery journal format, each journal block held 311 entries, and every write bio + * made two entries. The old maximum age was half the usable journal length. In the new format, + * each block holds only 217 entries, but each bio only makes one entry. We convert the configured + * age so that the number of writes in a block map era is the same in the old and new formats. This + * keeps the bound on the amount of work required to recover the block map from the recovery + * journal the same across the format change. It also keeps the amortization of block map page + * writes to write bios the same. + */ +static inline block_count_t vdo_convert_maximum_age(block_count_t age) +{ + return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK, + 2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK); +} + +#endif /* VDO_BLOCK_MAP_H */ diff --git a/drivers/md/dm-vdo/completion.c b/drivers/md/dm-vdo/completion.c new file mode 100644 index 00000000000..8feb9c05c19 --- /dev/null +++ b/drivers/md/dm-vdo/completion.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "completion.h" + +#include <linux/kernel.h> + +#include "logger.h" +#include "permassert.h" + +#include "status-codes.h" +#include "types.h" +#include "vio.h" +#include "vdo.h" + +/** + * DOC: vdo completions. + * + * Most of vdo's data structures are lock free, each either belonging to a single "zone," or + * divided into a number of zones whose accesses to the structure do not overlap. During normal + * operation, at most one thread will be operating in any given zone. Each zone has a + * vdo_work_queue which holds vdo_completions that are to be run in that zone. A completion may + * only be enqueued on one queue or operating in a single zone at a time. + * + * At each step of a multi-threaded operation, the completion performing the operation is given a + * callback, error handler, and thread id for the next step. A completion is "run" when it is + * operating on the correct thread (as specified by its callback_thread_id). If the value of its + * "result" field is an error (i.e. not VDO_SUCCESS), the function in its "error_handler" will be + * invoked. If the error_handler is NULL, or there is no error, the function set as its "callback" + * will be invoked. Generally, a completion will not be run directly, but rather will be + * "launched." In this case, it will check whether it is operating on the correct thread. If it is, + * it will run immediately. Otherwise, it will be enqueue on the vdo_work_queue associated with the + * completion's "callback_thread_id". When it is dequeued, it will be on the correct thread, and + * will get run. In some cases, the completion should get queued instead of running immediately, + * even if it is being launched from the correct thread. This is usually in cases where there is a + * long chain of callbacks, all on the same thread, which could overflow the stack. In such cases, + * the completion's "requeue" field should be set to true. Doing so will skip the current thread + * check and simply enqueue the completion. + * + * A completion may be "finished," in which case its "complete" field will be set to true before it + * is next run. It is a bug to attempt to set the result or re-finish a finished completion. + * Because a completion's fields are not safe to examine from any thread other than the one on + * which the completion is currently operating, this field is used only to aid in detecting + * programming errors. It can not be used for cross-thread checking on the status of an operation. + * A completion must be "reset" before it can be reused after it has been finished. Resetting will + * also clear any error from the result field. + **/ + +void vdo_initialize_completion(struct vdo_completion *completion, + struct vdo *vdo, + enum vdo_completion_type type) +{ + memset(completion, 0, sizeof(*completion)); + completion->vdo = vdo; + completion->type = type; + vdo_reset_completion(completion); +} + +static inline void assert_incomplete(struct vdo_completion *completion) +{ + ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); +} + +/** + * vdo_set_completion_result() - Set the result of a completion. + * + * Older errors will not be masked. + */ +void vdo_set_completion_result(struct vdo_completion *completion, int result) +{ + assert_incomplete(completion); + if (completion->result == VDO_SUCCESS) + completion->result = result; +} + +/** + * vdo_launch_completion_with_priority() - Run or enqueue a completion. + * @priority: The priority at which to enqueue the completion. + * + * If called on the correct thread (i.e. the one specified in the completion's callback_thread_id + * field) and not marked for requeue, the completion will be run immediately. Otherwise, the + * completion will be enqueued on the specified thread. + */ +void vdo_launch_completion_with_priority(struct vdo_completion *completion, + enum vdo_completion_priority priority) +{ + thread_id_t callback_thread = completion->callback_thread_id; + + if (completion->requeue || (callback_thread != vdo_get_callback_thread_id())) { + vdo_enqueue_completion(completion, priority); + return; + } + + vdo_run_completion(completion); +} + +/** vdo_finish_completion() - Mark a completion as complete and then launch it. */ +void vdo_finish_completion(struct vdo_completion *completion) +{ + assert_incomplete(completion); + completion->complete = true; + if (completion->callback != NULL) + vdo_launch_completion(completion); +} + +void vdo_enqueue_completion(struct vdo_completion *completion, + enum vdo_completion_priority priority) +{ + struct vdo *vdo = completion->vdo; + thread_id_t thread_id = completion->callback_thread_id; + + if (ASSERT(thread_id < vdo->thread_config.thread_count, + "thread_id %u (completion type %d) is less than thread count %u", + thread_id, + completion->type, + vdo->thread_config.thread_count) != UDS_SUCCESS) + BUG(); + + completion->requeue = false; + completion->priority = priority; + completion->my_queue = NULL; + vdo_enqueue_work_queue(vdo->threads[thread_id].queue, completion); +} + +/** + * vdo_requeue_completion_if_needed() - Requeue a completion if not called on the specified thread. + * + * Return: True if the completion was requeued; callers may not access the completion in this case. + */ +bool vdo_requeue_completion_if_needed(struct vdo_completion *completion, + thread_id_t callback_thread_id) +{ + if (vdo_get_callback_thread_id() == callback_thread_id) + return false; + + completion->callback_thread_id = callback_thread_id; + vdo_enqueue_completion(completion, VDO_WORK_Q_DEFAULT_PRIORITY); + return true; +} diff --git a/drivers/md/dm-vdo/completion.h b/drivers/md/dm-vdo/completion.h new file mode 100644 index 00000000000..03b2daa6546 --- /dev/null +++ b/drivers/md/dm-vdo/completion.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_COMPLETION_H +#define VDO_COMPLETION_H + +#include "permassert.h" + +#include "status-codes.h" +#include "types.h" + +/** + * vdo_run_completion() - Run a completion's callback or error handler on the current thread. + * + * Context: This function must be called from the correct callback thread. + */ +static inline void vdo_run_completion(struct vdo_completion *completion) +{ + if ((completion->result != VDO_SUCCESS) && (completion->error_handler != NULL)) { + completion->error_handler(completion); + return; + } + + completion->callback(completion); +} + +void vdo_set_completion_result(struct vdo_completion *completion, int result); + +void vdo_initialize_completion(struct vdo_completion *completion, + struct vdo *vdo, + enum vdo_completion_type type); + +/** + * vdo_reset_completion() - Reset a completion to a clean state, while keeping the type, vdo and + * parent information. + */ +static inline void vdo_reset_completion(struct vdo_completion *completion) +{ + completion->result = VDO_SUCCESS; + completion->complete = false; +} + +void vdo_launch_completion_with_priority(struct vdo_completion *completion, + enum vdo_completion_priority priority); + +/** + * vdo_launch_completion() - Launch a completion with default priority. + */ +static inline void vdo_launch_completion(struct vdo_completion *completion) +{ + vdo_launch_completion_with_priority(completion, VDO_WORK_Q_DEFAULT_PRIORITY); +} + +/** + * vdo_continue_completion() - Continue processing a completion. + * @result: The current result (will not mask older errors). + * + * Continue processing a completion by setting the current result and calling + * vdo_launch_completion(). + */ +static inline void vdo_continue_completion(struct vdo_completion *completion, int result) +{ + vdo_set_completion_result(completion, result); + vdo_launch_completion(completion); +} + +void vdo_finish_completion(struct vdo_completion *completion); + +/** + * vdo_fail_completion() - Set the result of a completion if it does not already have an error, + * then finish it. + */ +static inline void vdo_fail_completion(struct vdo_completion *completion, int result) +{ + vdo_set_completion_result(completion, result); + vdo_finish_completion(completion); +} + +/** + * vdo_assert_completion_type() - Assert that a completion is of the correct type. + * + * Return: VDO_SUCCESS or an error + */ +static inline int +vdo_assert_completion_type(struct vdo_completion *completion, enum vdo_completion_type expected) +{ + return ASSERT(expected == completion->type, + "completion type should be %u, not %u", + expected, + completion->type); +} + +static inline void vdo_set_completion_callback(struct vdo_completion *completion, + vdo_action *callback, + thread_id_t callback_thread_id) +{ + completion->callback = callback; + completion->callback_thread_id = callback_thread_id; +} + +/** + * vdo_launch_completion_callback() - Set the callback for a completion and launch it immediately. + */ +static inline void vdo_launch_completion_callback(struct vdo_completion *completion, + vdo_action *callback, + thread_id_t callback_thread_id) +{ + vdo_set_completion_callback(completion, callback, callback_thread_id); + vdo_launch_completion(completion); +} + +/** + * vdo_prepare_completion() - Prepare a completion for launch. + * + * Resets the completion, and then sets its callback, error handler, callback thread, and parent. + */ +static inline void vdo_prepare_completion(struct vdo_completion *completion, + vdo_action *callback, + vdo_action *error_handler, + thread_id_t callback_thread_id, + void *parent) +{ + vdo_reset_completion(completion); + vdo_set_completion_callback(completion, callback, callback_thread_id); + completion->error_handler = error_handler; + completion->parent = parent; +} + +/** + * vdo_prepare_completion_for_requeue() - Prepare a completion for launch ensuring that it will + * always be requeued. + * + * Resets the completion, and then sets its callback, error handler, callback thread, and parent. + */ +static inline void +vdo_prepare_completion_for_requeue(struct vdo_completion *completion, + vdo_action *callback, + vdo_action *error_handler, + thread_id_t callback_thread_id, + void *parent) +{ + vdo_prepare_completion(completion, callback, error_handler, callback_thread_id, parent); + completion->requeue = true; +} + +void vdo_enqueue_completion(struct vdo_completion *completion, + enum vdo_completion_priority priority); + + +bool vdo_requeue_completion_if_needed(struct vdo_completion *completion, + thread_id_t callback_thread_id); + +#endif /* VDO_COMPLETION_H */ diff --git a/drivers/md/dm-vdo/constants.c b/drivers/md/dm-vdo/constants.c new file mode 100644 index 00000000000..8bdfb782b13 --- /dev/null +++ b/drivers/md/dm-vdo/constants.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "types.h" + +/* The maximum logical space is 4 petabytes, which is 1 terablock. */ +const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; + +/* The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; + +/* unit test minimum */ +const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2; diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h new file mode 100644 index 00000000000..7196e99efe9 --- /dev/null +++ b/drivers/md/dm-vdo/constants.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_CONSTANTS_H +#define VDO_CONSTANTS_H + +#include <linux/blkdev.h> + +#include "types.h" + +enum { + /* + * The maximum number of contiguous PBNs which will go to a single bio submission queue, + * assuming there is more than one queue. + */ + VDO_BIO_ROTATION_INTERVAL_LIMIT = 1024, + + /** The number of entries on a block map page */ + VDO_BLOCK_MAP_ENTRIES_PER_PAGE = 812, + + /** The origin of the flat portion of the block map */ + VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN = 1, + + /* + * The height of a block map tree. Assuming a root count of 60 and 812 entries per page, + * this is big enough to represent almost 95 PB of logical space. + */ + VDO_BLOCK_MAP_TREE_HEIGHT = 5, + + /** The default number of bio submission queues. */ + DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT = 4, + + /** The number of contiguous PBNs to be submitted to a single bio queue. */ + DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, + + /** The number of trees in the arboreal block map */ + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT = 60, + + /** The default size of the recovery journal, in blocks */ + DEFAULT_VDO_RECOVERY_JOURNAL_SIZE = 32 * 1024, + + /** The default size of each slab journal, in blocks */ + DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, + + /* + * The initial size of lbn_operations and pbn_operations, which is based upon the expected + * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely + * that the maps would need to be resized. + */ + VDO_LOCK_MAP_CAPACITY = 10000, + + /** The maximum number of logical zones */ + MAX_VDO_LOGICAL_ZONES = 60, + + /** The maximum number of physical zones */ + MAX_VDO_PHYSICAL_ZONES = 16, + + /** The base-2 logarithm of the maximum blocks in one slab */ + MAX_VDO_SLAB_BITS = 23, + + /** The maximum number of slabs the slab depot supports */ + MAX_VDO_SLABS = 8192, + + /* + * The maximum number of block map pages to load simultaneously during recovery or rebuild. + */ + MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS = 1024, + + /** The maximum number of entries in the slab summary */ + MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES = MAX_VDO_SLABS * MAX_VDO_PHYSICAL_ZONES, + + /** The maximum number of total threads in a VDO thread configuration. */ + MAXIMUM_VDO_THREADS = 100, + + /** The maximum number of VIOs in the system at once */ + MAXIMUM_VDO_USER_VIOS = 2048, + + /** The only physical block size supported by VDO */ + VDO_BLOCK_SIZE = 4096, + + /** The number of sectors per block */ + VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT), + + /** The size of a sector that will not be torn */ + VDO_SECTOR_SIZE = 512, + + /** The physical block number reserved for storing the zero block */ + VDO_ZERO_BLOCK = 0, +}; + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +extern const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +extern const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS; + +/** unit test minimum */ +extern const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS; + +#endif /* VDO_CONSTANTS_H */ diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c new file mode 100644 index 00000000000..b8737e5cbc5 --- /dev/null +++ b/drivers/md/dm-vdo/data-vio.c @@ -0,0 +1,2070 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "data-vio.h" + +#include <linux/atomic.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/device-mapper.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/lz4.h> +#include <linux/minmax.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/wait.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "murmurhash3.h" +#include "permassert.h" + +#include "block-map.h" +#include "dump.h" +#include "encodings.h" +#include "int-map.h" +#include "io-submitter.h" +#include "logical-zone.h" +#include "packer.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "status-codes.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" +#include "wait-queue.h" + +/** + * DOC: Bio flags. + * + * For certain flags set on user bios, if the user bio has not yet been acknowledged, setting those + * flags on our own bio(s) for that request may help underlying layers better fulfill the user + * bio's needs. This constant contains the aggregate of those flags; VDO strips all the other + * flags, as they convey incorrect information. + * + * These flags are always irrelevant if we have already finished the user bio as they are only + * hints on IO importance. If VDO has finished the user bio, any remaining IO done doesn't care how + * important finishing the finished bio was. + * + * Note that bio.c contains the complete list of flags we believe may be set; the following list + * explains the action taken with each of those flags VDO could receive: + * + * * REQ_SYNC: Passed down if the user bio is not yet completed, since it indicates the user bio + * completion is required for further work to be done by the issuer. + * * REQ_META: Passed down if the user bio is not yet completed, since it may mean the lower layer + * treats it as more urgent, similar to REQ_SYNC. + * * REQ_PRIO: Passed down if the user bio is not yet completed, since it indicates the user bio is + * important. + * * REQ_NOMERGE: Set only if the incoming bio was split; irrelevant to VDO IO. + * * REQ_IDLE: Set if the incoming bio had more IO quickly following; VDO's IO pattern doesn't + * match incoming IO, so this flag is incorrect for it. + * * REQ_FUA: Handled separately, and irrelevant to VDO IO otherwise. + * * REQ_RAHEAD: Passed down, as, for reads, it indicates trivial importance. + * * REQ_BACKGROUND: Not passed down, as VIOs are a limited resource and VDO needs them recycled + * ASAP to service heavy load, which is the only place where REQ_BACKGROUND might aid in load + * prioritization. + */ +static unsigned int PASSTHROUGH_FLAGS = (REQ_PRIO | REQ_META | REQ_SYNC | REQ_RAHEAD); + +/** + * DOC: + * + * The data_vio_pool maintains the pool of data_vios which a vdo uses to service incoming bios. For + * correctness, and in order to avoid potentially expensive or blocking memory allocations during + * normal operation, the number of concurrently active data_vios is capped. Furthermore, in order + * to avoid starvation of reads and writes, at most 75% of the data_vios may be used for + * discards. The data_vio_pool is responsible for enforcing these limits. Threads submitting bios + * for which a data_vio or discard permit are not available will block until the necessary + * resources are available. The pool is also responsible for distributing resources to blocked + * threads and waking them. Finally, the pool attempts to batch the work of recycling data_vios by + * performing the work of actually assigning resources to blocked threads or placing data_vios back + * into the pool on a single cpu at a time. + * + * The pool contains two "limiters", one for tracking data_vios and one for tracking discard + * permits. The limiters also provide safe cross-thread access to pool statistics without the need + * to take the pool's lock. When a thread submits a bio to a vdo device, it will first attempt to + * get a discard permit if it is a discard, and then to get a data_vio. If the necessary resources + * are available, the incoming bio will be assigned to the acquired data_vio, and it will be + * launched. However, if either of these are unavailable, the arrival time of the bio is recorded + * in the bio's bi_private field, the bio and its submitter are both queued on the appropriate + * limiter and the submitting thread will then put itself to sleep. (note that this mechanism will + * break if jiffies are only 32 bits.) + * + * Whenever a data_vio has completed processing for the bio it was servicing, release_data_vio() + * will be called on it. This function will add the data_vio to a funnel queue, and then check the + * state of the pool. If the pool is not currently processing released data_vios, the pool's + * completion will be enqueued on a cpu queue. This obviates the need for the releasing threads to + * hold the pool's lock, and also batches release work while avoiding starvation of the cpu + * threads. + * + * Whenever the pool's completion is run on a cpu thread, it calls process_release_callback() which + * processes a batch of returned data_vios (currently at most 32) from the pool's funnel queue. For + * each data_vio, it first checks whether that data_vio was processing a discard. If so, and there + * is a blocked bio waiting for a discard permit, that permit is notionally transferred to the + * eldest discard waiter, and that waiter is moved to the end of the list of discard bios waiting + * for a data_vio. If there are no discard waiters, the discard permit is returned to the pool. + * Next, the data_vio is assigned to the oldest blocked bio which either has a discard permit, or + * doesn't need one and relaunched. If neither of these exist, the data_vio is returned to the + * pool. Finally, if any waiting bios were launched, the threads which blocked trying to submit + * them are awakened. + */ + +enum { + DATA_VIO_RELEASE_BATCH_SIZE = 128, +}; + +static const unsigned int VDO_SECTORS_PER_BLOCK_MASK = VDO_SECTORS_PER_BLOCK - 1; +static const u32 COMPRESSION_STATUS_MASK = 0xff; +static const u32 MAY_NOT_COMPRESS_MASK = 0x80000000; + +struct limiter; +typedef void assigner(struct limiter *limiter); + +/* Bookkeeping structure for a single type of resource. */ +struct limiter { + /* The data_vio_pool to which this limiter belongs */ + struct data_vio_pool *pool; + /* The maximum number of data_vios available */ + data_vio_count_t limit; + /* The number of resources in use */ + data_vio_count_t busy; + /* The maximum number of resources ever simultaneously in use */ + data_vio_count_t max_busy; + /* The number of resources to release */ + data_vio_count_t release_count; + /* The number of waiters to wake */ + data_vio_count_t wake_count; + /* The list of waiting bios which are known to process_release_callback() */ + struct bio_list waiters; + /* The list of waiting bios which are not yet known to process_release_callback() */ + struct bio_list new_waiters; + /* The list of waiters which have their permits */ + struct bio_list *permitted_waiters; + /* The function for assigning a resource to a waiter */ + assigner *assigner; + /* The queue of blocked threads */ + wait_queue_head_t blocked_threads; + /* The arrival time of the eldest waiter */ + u64 arrival; +}; + +/* + * A data_vio_pool is a collection of preallocated data_vios which may be acquired from any thread, + * and are released in batches. + */ +struct data_vio_pool { + /* Completion for scheduling releases */ + struct vdo_completion completion; + /* The administrative state of the pool */ + struct admin_state state; + /* Lock protecting the pool */ + spinlock_t lock; + /* The main limiter controlling the total data_vios in the pool. */ + struct limiter limiter; + /* The limiter controlling data_vios for discard */ + struct limiter discard_limiter; + /* The list of bios which have discard permits but still need a data_vio */ + struct bio_list permitted_discards; + /* The list of available data_vios */ + struct list_head available; + /* The queue of data_vios waiting to be returned to the pool */ + struct funnel_queue *queue; + /* Whether the pool is processing, or scheduled to process releases */ + atomic_t processing; + /* The data vios in the pool */ + struct data_vio data_vios[]; +}; + +static const char * const ASYNC_OPERATION_NAMES[] = { + "launch", + "acknowledge_write", + "acquire_hash_lock", + "attempt_logical_block_lock", + "lock_duplicate_pbn", + "check_for_duplication", + "cleanup", + "compress_data_vio", + "find_block_map_slot", + "get_mapped_block_for_read", + "get_mapped_block_for_write", + "hash_data_vio", + "journal_remapping", + "vdo_attempt_packing", + "put_mapped_block", + "read_data_vio", + "update_dedupe_index", + "update_reference_counts", + "verify_duplication", + "write_data_vio", +}; + +/* The steps taken cleaning up a VIO, in the order they are performed. */ +enum data_vio_cleanup_stage { + VIO_CLEANUP_START, + VIO_RELEASE_HASH_LOCK = VIO_CLEANUP_START, + VIO_RELEASE_ALLOCATED, + VIO_RELEASE_RECOVERY_LOCKS, + VIO_RELEASE_LOGICAL, + VIO_CLEANUP_DONE +}; + +static inline struct data_vio_pool * __must_check +as_data_vio_pool(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_DATA_VIO_POOL_COMPLETION); + return container_of(completion, struct data_vio_pool, completion); +} + +static inline u64 get_arrival_time(struct bio *bio) +{ + return (u64) bio->bi_private; +} + +/** + * check_for_drain_complete_locked() - Check whether a data_vio_pool has no outstanding data_vios + * or waiters while holding the pool's lock. + */ +static bool check_for_drain_complete_locked(struct data_vio_pool *pool) +{ + if (pool->limiter.busy > 0) + return false; + + ASSERT_LOG_ONLY((pool->discard_limiter.busy == 0), "no outstanding discard permits"); + + return (bio_list_empty(&pool->limiter.new_waiters) && + bio_list_empty(&pool->discard_limiter.new_waiters)); +} + +static void initialize_lbn_lock(struct data_vio *data_vio, logical_block_number_t lbn) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + zone_count_t zone_number; + struct lbn_lock *lock = &data_vio->logical; + + lock->lbn = lbn; + lock->locked = false; + vdo_initialize_wait_queue(&lock->waiters); + zone_number = vdo_compute_logical_zone(data_vio); + lock->zone = &vdo->logical_zones->zones[zone_number]; +} + +static void launch_locked_request(struct data_vio *data_vio) +{ + data_vio->logical.locked = true; + if (data_vio->write) { + struct vdo *vdo = vdo_from_data_vio(data_vio); + + if (vdo_is_read_only(vdo)) { + continue_data_vio_with_error(data_vio, VDO_READ_ONLY); + return; + } + } + + data_vio->last_async_operation = VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT; + vdo_find_block_map_slot(data_vio); +} + +static void acknowledge_data_vio(struct data_vio *data_vio) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct bio *bio = data_vio->user_bio; + int error = vdo_map_to_system_error(data_vio->vio.completion.result); + + if (bio == NULL) + return; + + ASSERT_LOG_ONLY((data_vio->remaining_discard <= + (u32) (VDO_BLOCK_SIZE - data_vio->offset)), + "data_vio to acknowledge is not an incomplete discard"); + + data_vio->user_bio = NULL; + vdo_count_bios(&vdo->stats.bios_acknowledged, bio); + if (data_vio->is_partial) + vdo_count_bios(&vdo->stats.bios_acknowledged_partial, bio); + + bio->bi_status = errno_to_blk_status(error); + bio_endio(bio); +} + +static void copy_to_bio(struct bio *bio, char *data_ptr) +{ + struct bio_vec biovec; + struct bvec_iter iter; + + bio_for_each_segment(biovec, bio, iter) { + memcpy_to_bvec(&biovec, data_ptr); + data_ptr += biovec.bv_len; + } +} + +struct data_vio_compression_status +get_data_vio_compression_status(struct data_vio *data_vio) +{ + u32 packed = atomic_read(&data_vio->compression.status); + + /* pairs with cmpxchg in set_data_vio_compression_status */ + smp_rmb(); + return (struct data_vio_compression_status) { + .stage = packed & COMPRESSION_STATUS_MASK, + .may_not_compress = ((packed & MAY_NOT_COMPRESS_MASK) != 0), + }; +} + +/** + * pack_status() - Convert a data_vio_compression_status into a u32 which may be stored + * atomically. + * @status: The state to convert. + * + * Return: The compression state packed into a u32. + */ +static u32 __must_check pack_status(struct data_vio_compression_status status) +{ + return status.stage | (status.may_not_compress ? MAY_NOT_COMPRESS_MASK : 0); +} + +/** + * set_data_vio_compression_status() - Set the compression status of a data_vio. + * @state: The expected current status of the data_vio. + * @new_state: The status to set. + * + * Return: true if the new status was set, false if the data_vio's compression status did not + * match the expected state, and so was left unchanged. + */ +static bool __must_check +set_data_vio_compression_status(struct data_vio *data_vio, + struct data_vio_compression_status status, + struct data_vio_compression_status new_status) +{ + u32 actual; + u32 expected = pack_status(status); + u32 replacement = pack_status(new_status); + + /* + * Extra barriers because this was original developed using a CAS operation that implicitly + * had them. + */ + smp_mb__before_atomic(); + actual = atomic_cmpxchg(&data_vio->compression.status, expected, replacement); + /* same as before_atomic */ + smp_mb__after_atomic(); + return (expected == actual); +} + +struct data_vio_compression_status advance_data_vio_compression_stage(struct data_vio *data_vio) +{ + for (;;) { + struct data_vio_compression_status status = + get_data_vio_compression_status(data_vio); + struct data_vio_compression_status new_status = status; + + if (status.stage == DATA_VIO_POST_PACKER) + /* We're already in the last stage. */ + return status; + + if (status.may_not_compress) + /* + * Compression has been dis-allowed for this VIO, so skip the rest of the + * path and go to the end. + */ + new_status.stage = DATA_VIO_POST_PACKER; + else + /* Go to the next state. */ + new_status.stage++; + + if (set_data_vio_compression_status(data_vio, status, new_status)) + return new_status; + + /* Another thread changed the status out from under us so try again. */ + } +} + +/** + * cancel_data_vio_compression() - Prevent this data_vio from being compressed or packed. + * + * Return: true if the data_vio is in the packer and the caller was the first caller to cancel it. + */ +bool cancel_data_vio_compression(struct data_vio *data_vio) +{ + struct data_vio_compression_status status, new_status; + + for (;;) { + status = get_data_vio_compression_status(data_vio); + if (status.may_not_compress || (status.stage == DATA_VIO_POST_PACKER)) + /* This data_vio is already set up to not block in the packer. */ + break; + + new_status.stage = status.stage; + new_status.may_not_compress = true; + + if (set_data_vio_compression_status(data_vio, status, new_status)) + break; + } + + return ((status.stage == DATA_VIO_PACKING) && !status.may_not_compress); +} + +/** + * attempt_logical_block_lock() - Attempt to acquire the lock on a logical block. + * @completion: The data_vio for an external data request as a completion. + * + * This is the start of the path for all external requests. It is registered in launch_data_vio(). + */ +static void attempt_logical_block_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct lbn_lock *lock = &data_vio->logical; + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct data_vio *lock_holder; + int result; + + assert_data_vio_in_logical_zone(data_vio); + + if (data_vio->logical.lbn >= vdo->states.vdo.config.logical_blocks) { + continue_data_vio_with_error(data_vio, VDO_OUT_OF_RANGE); + return; + } + + result = vdo_int_map_put(lock->zone->lbn_operations, + lock->lbn, + data_vio, + false, + (void **) &lock_holder); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + if (lock_holder == NULL) { + /* We got the lock */ + launch_locked_request(data_vio); + return; + } + + result = ASSERT(lock_holder->logical.locked, "logical block lock held"); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + /* + * If the new request is a pure read request (not read-modify-write) and the lock_holder is + * writing and has received an allocation (VDO-2683), service the read request immediately + * by copying data from the lock_holder to avoid having to flush the write out of the + * packer just to prevent the read from waiting indefinitely. If the lock_holder does not + * yet have an allocation, prevent it from blocking in the packer and wait on it. + */ + if (!data_vio->write && READ_ONCE(lock_holder->allocation_succeeded)) { + copy_to_bio(data_vio->user_bio, (lock_holder->vio.data + data_vio->offset)); + acknowledge_data_vio(data_vio); + complete_data_vio(completion); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK; + vdo_enqueue_waiter(&lock_holder->logical.waiters, &data_vio->waiter); + + /* + * Prevent writes and read-modify-writes from blocking indefinitely on lock holders in the + * packer. + */ + if (lock_holder->write && cancel_data_vio_compression(lock_holder)) { + data_vio->compression.lock_holder = lock_holder; + launch_data_vio_packer_callback(data_vio, vdo_remove_lock_holder_from_packer); + } +} + +/** + * launch_data_vio() - (Re)initialize a data_vio to have a new logical block number, keeping the + * same parent and other state and send it on its way. + */ +static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lbn) +{ + struct vdo_completion *completion = &data_vio->vio.completion; + + /* + * Clearing the tree lock must happen before initializing the LBN lock, which also adds + * information to the tree lock. + */ + memset(&data_vio->tree_lock, 0, sizeof(data_vio->tree_lock)); + initialize_lbn_lock(data_vio, lbn); + INIT_LIST_HEAD(&data_vio->hash_lock_entry); + INIT_LIST_HEAD(&data_vio->write_entry); + + memset(&data_vio->allocation, 0, sizeof(data_vio->allocation)); + + data_vio->is_duplicate = false; + + memset(&data_vio->record_name, 0, sizeof(data_vio->record_name)); + memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate)); + vdo_reset_completion(completion); + completion->error_handler = handle_data_vio_error; + set_data_vio_logical_callback(data_vio, attempt_logical_block_lock); + vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY); +} + +static bool is_zero_block(char *block) +{ + int i; + + for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) + if (*((u64 *) &block[i])) + return false; + return true; +} + +static void copy_from_bio(struct bio *bio, char *data_ptr) +{ + struct bio_vec biovec; + struct bvec_iter iter; + + bio_for_each_segment(biovec, bio, iter) { + memcpy_from_bvec(data_ptr, &biovec); + data_ptr += biovec.bv_len; + } +} + +static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *bio) +{ + logical_block_number_t lbn; + /* + * Zero out the fields which don't need to be preserved (i.e. which are not pointers to + * separately allocated objects). + */ + memset(data_vio, 0, offsetof(struct data_vio, vio)); + memset(&data_vio->compression, 0, offsetof(struct compression_state, block)); + + data_vio->user_bio = bio; + data_vio->offset = to_bytes(bio->bi_iter.bi_sector & VDO_SECTORS_PER_BLOCK_MASK); + data_vio->is_partial = (bio->bi_iter.bi_size < VDO_BLOCK_SIZE) || (data_vio->offset != 0); + + /* + * Discards behave very differently than other requests when coming in from device-mapper. + * We have to be able to handle any size discards and various sector offsets within a + * block. + */ + if (bio_op(bio) == REQ_OP_DISCARD) { + data_vio->remaining_discard = bio->bi_iter.bi_size; + data_vio->write = true; + data_vio->is_trim = true; + if (data_vio->is_partial) { + vdo_count_bios(&vdo->stats.bios_in_partial, bio); + data_vio->read = true; + } + } else if (data_vio->is_partial) { + vdo_count_bios(&vdo->stats.bios_in_partial, bio); + data_vio->read = true; + if (bio_data_dir(bio) == WRITE) + data_vio->write = true; + } else if (bio_data_dir(bio) == READ) { + data_vio->read = true; + } else { + /* + * Copy the bio data to a char array so that we can continue to use the data after + * we acknowledge the bio. + */ + copy_from_bio(bio, data_vio->vio.data); + data_vio->is_zero = is_zero_block(data_vio->vio.data); + data_vio->write = true; + } + + if (data_vio->user_bio->bi_opf & REQ_FUA) + data_vio->fua = true; + + lbn = (bio->bi_iter.bi_sector - vdo->starting_sector_offset) / VDO_SECTORS_PER_BLOCK; + launch_data_vio(data_vio, lbn); +} + +static void assign_data_vio(struct limiter *limiter, struct data_vio *data_vio) +{ + struct bio *bio = bio_list_pop(limiter->permitted_waiters); + + launch_bio(limiter->pool->completion.vdo, data_vio, bio); + limiter->wake_count++; + + bio = bio_list_peek(limiter->permitted_waiters); + limiter->arrival = ((bio == NULL) ? U64_MAX : get_arrival_time(bio)); +} + +static void assign_discard_permit(struct limiter *limiter) +{ + struct bio *bio = bio_list_pop(&limiter->waiters); + + if (limiter->arrival == U64_MAX) + limiter->arrival = get_arrival_time(bio); + + bio_list_add(limiter->permitted_waiters, bio); +} + +static void get_waiters(struct limiter *limiter) +{ + bio_list_merge(&limiter->waiters, &limiter->new_waiters); + bio_list_init(&limiter->new_waiters); +} + +static inline +struct data_vio *get_available_data_vio(struct data_vio_pool *pool) +{ + struct data_vio *data_vio = + list_first_entry(&pool->available, struct data_vio, pool_entry); + + list_del_init(&data_vio->pool_entry); + return data_vio; +} + +static void assign_data_vio_to_waiter(struct limiter *limiter) +{ + assign_data_vio(limiter, get_available_data_vio(limiter->pool)); +} + +static void update_limiter(struct limiter *limiter) +{ + struct bio_list *waiters = &limiter->waiters; + data_vio_count_t available = limiter->limit - limiter->busy; + + ASSERT_LOG_ONLY((limiter->release_count <= limiter->busy), + "Release count %u is not more than busy count %u", + limiter->release_count, + limiter->busy); + + get_waiters(limiter); + for (; (limiter->release_count > 0) && !bio_list_empty(waiters); limiter->release_count--) + limiter->assigner(limiter); + + if (limiter->release_count > 0) { + WRITE_ONCE(limiter->busy, limiter->busy - limiter->release_count); + limiter->release_count = 0; + return; + } + + for (; (available > 0) && !bio_list_empty(waiters); available--) + limiter->assigner(limiter); + + WRITE_ONCE(limiter->busy, limiter->limit - available); + if (limiter->max_busy < limiter->busy) + WRITE_ONCE(limiter->max_busy, limiter->busy); +} + +/** + * schedule_releases() - Ensure that release processing is scheduled. + * + * If this call switches the state to processing, enqueue. Otherwise, some other thread has already + * done so. + */ +static void schedule_releases(struct data_vio_pool *pool) +{ + /* Pairs with the barrier in process_release_callback(). */ + smp_mb__before_atomic(); + if (atomic_cmpxchg(&pool->processing, false, true)) + return; + + pool->completion.requeue = true; + vdo_launch_completion_with_priority(&pool->completion, CPU_Q_COMPLETE_VIO_PRIORITY); +} + +static void reuse_or_release_resources(struct data_vio_pool *pool, + struct data_vio *data_vio, + struct list_head *returned) +{ + if (data_vio->remaining_discard > 0) { + if (bio_list_empty(&pool->discard_limiter.waiters)) + /* Return the data_vio's discard permit. */ + pool->discard_limiter.release_count++; + else + assign_discard_permit(&pool->discard_limiter); + } + + if (pool->limiter.arrival < pool->discard_limiter.arrival) { + assign_data_vio(&pool->limiter, data_vio); + } else if (pool->discard_limiter.arrival < U64_MAX) { + assign_data_vio(&pool->discard_limiter, data_vio); + } else { + list_add(&data_vio->pool_entry, returned); + pool->limiter.release_count++; + } +} + +/** + * process_release_callback() - Process a batch of data_vio releases. + * @completion: The pool with data_vios to release. + */ +static void process_release_callback(struct vdo_completion *completion) +{ + struct data_vio_pool *pool = as_data_vio_pool(completion); + bool reschedule; + bool drained; + data_vio_count_t processed; + data_vio_count_t to_wake; + data_vio_count_t discards_to_wake; + LIST_HEAD(returned); + + spin_lock(&pool->lock); + get_waiters(&pool->discard_limiter); + get_waiters(&pool->limiter); + spin_unlock(&pool->lock); + + if (pool->limiter.arrival == U64_MAX) { + struct bio *bio = bio_list_peek(&pool->limiter.waiters); + + if (bio != NULL) + pool->limiter.arrival = get_arrival_time(bio); + } + + for (processed = 0; processed < DATA_VIO_RELEASE_BATCH_SIZE; processed++) { + struct data_vio *data_vio; + struct funnel_queue_entry *entry = uds_funnel_queue_poll(pool->queue); + + if (entry == NULL) + break; + + data_vio = as_data_vio(container_of(entry, + struct vdo_completion, + work_queue_entry_link)); + acknowledge_data_vio(data_vio); + reuse_or_release_resources(pool, data_vio, &returned); + } + + spin_lock(&pool->lock); + /* + * There is a race where waiters could be added while we are in the unlocked section above. + * Those waiters could not see the resources we are now about to release, so we assign + * those resources now as we have no guarantee of being rescheduled. This is handled in + * update_limiter(). + */ + update_limiter(&pool->discard_limiter); + list_splice(&returned, &pool->available); + update_limiter(&pool->limiter); + to_wake = pool->limiter.wake_count; + pool->limiter.wake_count = 0; + discards_to_wake = pool->discard_limiter.wake_count; + pool->discard_limiter.wake_count = 0; + + atomic_set(&pool->processing, false); + /* Pairs with the barrier in schedule_releases(). */ + smp_mb(); + + reschedule = !uds_is_funnel_queue_empty(pool->queue); + drained = (!reschedule && + vdo_is_state_draining(&pool->state) && + check_for_drain_complete_locked(pool)); + spin_unlock(&pool->lock); + + if (to_wake > 0) + wake_up_nr(&pool->limiter.blocked_threads, to_wake); + + if (discards_to_wake > 0) + wake_up_nr(&pool->discard_limiter.blocked_threads, + discards_to_wake); + + if (reschedule) + schedule_releases(pool); + else if (drained) + vdo_finish_draining(&pool->state); +} + +static void initialize_limiter(struct limiter *limiter, + struct data_vio_pool *pool, + assigner *assigner, + data_vio_count_t limit) +{ + limiter->pool = pool; + limiter->assigner = assigner; + limiter->limit = limit; + limiter->arrival = U64_MAX; + init_waitqueue_head(&limiter->blocked_threads); +} + +/** + * initialize_data_vio() - Allocate the components of a data_vio. + * + * The caller is responsible for cleaning up the data_vio on error. + * + * Return: VDO_SUCCESS or an error. + */ +static int initialize_data_vio(struct data_vio *data_vio, struct vdo *vdo) +{ + struct bio *bio; + int result; + + STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); + result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "data_vio data", &data_vio->vio.data); + if (result != VDO_SUCCESS) + return uds_log_error_strerror(result, "data_vio data allocation failure"); + + result = uds_allocate_memory(VDO_BLOCK_SIZE, + 0, + "compressed block", + &data_vio->compression.block); + if (result != VDO_SUCCESS) + return uds_log_error_strerror(result, + "data_vio compressed block allocation failure"); + + result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", &data_vio->scratch_block); + if (result != VDO_SUCCESS) + return uds_log_error_strerror(result, "data_vio scratch allocation failure"); + + result = vdo_create_bio(&bio); + if (result != VDO_SUCCESS) + return uds_log_error_strerror(result, "data_vio data bio allocation failure"); + + vdo_initialize_completion(&data_vio->decrement_completion, vdo, VDO_DECREMENT_COMPLETION); + initialize_vio(&data_vio->vio, bio, 1, VIO_TYPE_DATA, VIO_PRIORITY_DATA, vdo); + + return VDO_SUCCESS; +} + +static void destroy_data_vio(struct data_vio *data_vio) +{ + if (data_vio == NULL) + return; + + vdo_free_bio(UDS_FORGET(data_vio->vio.bio)); + UDS_FREE(UDS_FORGET(data_vio->vio.data)); + UDS_FREE(UDS_FORGET(data_vio->compression.block)); + UDS_FREE(UDS_FORGET(data_vio->scratch_block)); +} + +/** + * make_data_vio_pool() - Initialize a data_vio pool. + * @vdo: The vdo to which the pool will belong. + * @pool_size: The number of data_vios in the pool. + * @discard_limit: The maximum number of data_vios which may be used for discards. + * @pool: A pointer to hold the newly allocated pool. + */ +int make_data_vio_pool(struct vdo *vdo, + data_vio_count_t pool_size, + data_vio_count_t discard_limit, + struct data_vio_pool **pool_ptr) +{ + int result; + struct data_vio_pool *pool; + data_vio_count_t i; + + result = UDS_ALLOCATE_EXTENDED(struct data_vio_pool, + pool_size, + struct data_vio, + __func__, + &pool); + if (result != UDS_SUCCESS) + return result; + + ASSERT_LOG_ONLY((discard_limit <= pool_size), "discard limit does not exceed pool size"); + initialize_limiter(&pool->discard_limiter, pool, assign_discard_permit, discard_limit); + pool->discard_limiter.permitted_waiters = &pool->permitted_discards; + initialize_limiter(&pool->limiter, pool, assign_data_vio_to_waiter, pool_size); + pool->limiter.permitted_waiters = &pool->limiter.waiters; + INIT_LIST_HEAD(&pool->available); + spin_lock_init(&pool->lock); + vdo_set_admin_state_code(&pool->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + vdo_initialize_completion(&pool->completion, vdo, VDO_DATA_VIO_POOL_COMPLETION); + vdo_prepare_completion(&pool->completion, + process_release_callback, + process_release_callback, + vdo->thread_config.cpu_thread, + NULL); + + result = uds_make_funnel_queue(&pool->queue); + if (result != UDS_SUCCESS) { + free_data_vio_pool(UDS_FORGET(pool)); + return result; + } + + for (i = 0; i < pool_size; i++) { + struct data_vio *data_vio = &pool->data_vios[i]; + + result = initialize_data_vio(data_vio, vdo); + if (result != VDO_SUCCESS) { + destroy_data_vio(data_vio); + free_data_vio_pool(pool); + return result; + } + + list_add(&data_vio->pool_entry, &pool->available); + } + + *pool_ptr = pool; + return VDO_SUCCESS; +} + +/** + * free_data_vio_pool() - Free a data_vio_pool and the data_vios in it. + * + * All data_vios must be returned to the pool before calling this function. + */ +void free_data_vio_pool(struct data_vio_pool *pool) +{ + struct data_vio *data_vio, *tmp; + + if (pool == NULL) + return; + + /* + * Pairs with the barrier in process_release_callback(). Possibly not needed since it + * caters to an enqueue vs. free race. + */ + smp_mb(); + BUG_ON(atomic_read(&pool->processing)); + + spin_lock(&pool->lock); + ASSERT_LOG_ONLY((pool->limiter.busy == 0), + "data_vio pool must not have %u busy entries when being freed", + pool->limiter.busy); + ASSERT_LOG_ONLY((bio_list_empty(&pool->limiter.waiters) && + bio_list_empty(&pool->limiter.new_waiters)), + "data_vio pool must not have threads waiting to read or write when being freed"); + ASSERT_LOG_ONLY((bio_list_empty(&pool->discard_limiter.waiters) && + bio_list_empty(&pool->discard_limiter.new_waiters)), + "data_vio pool must not have threads waiting to discard when being freed"); + spin_unlock(&pool->lock); + + list_for_each_entry_safe(data_vio, tmp, &pool->available, pool_entry) { + list_del_init(&data_vio->pool_entry); + destroy_data_vio(data_vio); + } + + uds_free_funnel_queue(UDS_FORGET(pool->queue)); + UDS_FREE(pool); +} + +static bool acquire_permit(struct limiter *limiter, struct bio *bio) +{ + if (limiter->busy >= limiter->limit) { + DEFINE_WAIT(wait); + + bio_list_add(&limiter->new_waiters, bio); + prepare_to_wait_exclusive(&limiter->blocked_threads, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->pool->lock); + io_schedule(); + finish_wait(&limiter->blocked_threads, &wait); + return false; + } + + WRITE_ONCE(limiter->busy, limiter->busy + 1); + if (limiter->max_busy < limiter->busy) + WRITE_ONCE(limiter->max_busy, limiter->busy); + + return true; +} + +/** + * vdo_launch_bio() - Acquire a data_vio from the pool, assign the bio to it, and launch it. + * + * This will block if data_vios or discard permits are not available. + */ +void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio) +{ + struct data_vio *data_vio; + + ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&pool->state), + "data_vio_pool not quiescent on acquire"); + + bio->bi_private = (void *) jiffies; + spin_lock(&pool->lock); + if ((bio_op(bio) == REQ_OP_DISCARD) && !acquire_permit(&pool->discard_limiter, bio)) + return; + + if (!acquire_permit(&pool->limiter, bio)) + return; + + data_vio = get_available_data_vio(pool); + spin_unlock(&pool->lock); + launch_bio(pool->completion.vdo, data_vio, bio); +} + +/* Implements vdo_admin_initiator. */ +static void initiate_drain(struct admin_state *state) +{ + bool drained; + struct data_vio_pool *pool = container_of(state, struct data_vio_pool, state); + + spin_lock(&pool->lock); + drained = check_for_drain_complete_locked(pool); + spin_unlock(&pool->lock); + + if (drained) + vdo_finish_draining(state); +} + +static void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.cpu_thread), + "%s called on cpu thread", + name); +} + +/** + * drain_data_vio_pool() - Wait asynchronously for all data_vios to be returned to the pool. + * @completion: The completion to notify when the pool has drained. + */ +void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion) +{ + assert_on_vdo_cpu_thread(completion->vdo, __func__); + vdo_start_draining(&pool->state, VDO_ADMIN_STATE_SUSPENDING, completion, initiate_drain); +} + +/** + * resume_data_vio_pool() - Resume a data_vio pool. + * @completion: The completion to notify when the pool has resumed. + */ +void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion) +{ + assert_on_vdo_cpu_thread(completion->vdo, __func__); + vdo_continue_completion(completion, vdo_resume_if_quiescent(&pool->state)); +} + +static void dump_limiter(const char *name, struct limiter *limiter) +{ + uds_log_info("%s: %u of %u busy (max %u), %s", + name, + limiter->busy, + limiter->limit, + limiter->max_busy, + ((bio_list_empty(&limiter->waiters) && + bio_list_empty(&limiter->new_waiters)) ? "no waiters" : "has waiters")); +} + +/** + * dump_data_vio_pool() - Dump a data_vio pool to the log. + * @dump_vios: Whether to dump the details of each busy data_vio as well. + */ +void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios) +{ + /* + * In order that syslog can empty its buffer, sleep after 35 elements for 4ms (till the + * second clock tick). These numbers were picked based on experiments with lab machines. + */ + enum { ELEMENTS_PER_BATCH = 35 }; + enum { SLEEP_FOR_SYSLOG = 4000 }; + + if (pool == NULL) + return; + + spin_lock(&pool->lock); + dump_limiter("data_vios", &pool->limiter); + dump_limiter("discard permits", &pool->discard_limiter); + if (dump_vios) { + int i; + int dumped = 0; + + for (i = 0; i < pool->limiter.limit; i++) { + struct data_vio *data_vio = &pool->data_vios[i]; + + if (!list_empty(&data_vio->pool_entry)) + continue; + + dump_data_vio(data_vio); + if (++dumped >= ELEMENTS_PER_BATCH) { + spin_unlock(&pool->lock); + dumped = 0; + fsleep(SLEEP_FOR_SYSLOG); + spin_lock(&pool->lock); + } + } + } + + spin_unlock(&pool->lock); +} + +data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->discard_limiter.busy); +} + +data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->discard_limiter.limit); +} + +data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->discard_limiter.max_busy); +} + +int set_data_vio_pool_discard_limit(struct data_vio_pool *pool, data_vio_count_t limit) +{ + if (get_data_vio_pool_request_limit(pool) < limit) + // The discard limit may not be higher than the data_vio limit. + return -EINVAL; + + spin_lock(&pool->lock); + pool->discard_limiter.limit = limit; + spin_unlock(&pool->lock); + + return VDO_SUCCESS; +} + +data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->limiter.busy); +} + +data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->limiter.limit); +} + +data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->limiter.max_busy); +} + +static void update_data_vio_error_stats(struct data_vio *data_vio) +{ + u8 index = 0; + static const char * const operations[] = { + [0] = "empty", + [1] = "read", + [2] = "write", + [3] = "read-modify-write", + [5] = "read+fua", + [6] = "write+fua", + [7] = "read-modify-write+fua", + }; + + if (data_vio->read) + index = 1; + + if (data_vio->write) + index += 2; + + if (data_vio->fua) + index += 4; + + update_vio_error_stats(&data_vio->vio, + "Completing %s vio for LBN %llu with error after %s", + operations[index], + (unsigned long long) data_vio->logical.lbn, + get_data_vio_operation_name(data_vio)); +} + +static void perform_cleanup_stage(struct data_vio *data_vio, enum data_vio_cleanup_stage stage); + +/** + * release_allocated_lock() - Release the PBN lock and/or the reference on the allocated block at + * the end of processing a data_vio. + */ +static void release_allocated_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_allocated_zone(data_vio); + release_data_vio_allocation_lock(data_vio, false); + perform_cleanup_stage(data_vio, VIO_RELEASE_RECOVERY_LOCKS); +} + +/** release_lock() - Release an uncontended LBN lock. */ +static void release_lock(struct data_vio *data_vio, struct lbn_lock *lock) +{ + struct int_map *lock_map = lock->zone->lbn_operations; + struct data_vio *lock_holder; + + if (!lock->locked) { + /* The lock is not locked, so it had better not be registered in the lock map. */ + struct data_vio *lock_holder = vdo_int_map_get(lock_map, lock->lbn); + + ASSERT_LOG_ONLY((data_vio != lock_holder), + "no logical block lock held for block %llu", + (unsigned long long) lock->lbn); + return; + } + + /* Release the lock by removing the lock from the map. */ + lock_holder = vdo_int_map_remove(lock_map, lock->lbn); + ASSERT_LOG_ONLY((data_vio == lock_holder), + "logical block lock mismatch for block %llu", + (unsigned long long) lock->lbn); + lock->locked = false; +} + +/** transfer_lock() - Transfer a contended LBN lock to the eldest waiter. */ +static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock) +{ + struct data_vio *lock_holder, *next_lock_holder; + int result; + + ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked"); + + /* Another data_vio is waiting for the lock, transfer it in a single lock map operation. */ + next_lock_holder = waiter_as_data_vio(vdo_dequeue_next_waiter(&lock->waiters)); + + /* Transfer the remaining lock waiters to the next lock holder. */ + vdo_transfer_all_waiters(&lock->waiters, &next_lock_holder->logical.waiters); + + result = vdo_int_map_put(lock->zone->lbn_operations, + lock->lbn, + next_lock_holder, + true, + (void **) &lock_holder); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(next_lock_holder, result); + return; + } + + ASSERT_LOG_ONLY((lock_holder == data_vio), + "logical block lock mismatch for block %llu", + (unsigned long long) lock->lbn); + lock->locked = false; + + /* + * If there are still waiters, other data_vios must be trying to get the lock we just + * transferred. We must ensure that the new lock holder doesn't block in the packer. + */ + if (vdo_has_waiters(&next_lock_holder->logical.waiters)) + cancel_data_vio_compression(next_lock_holder); + + /* + * Avoid stack overflow on lock transfer. + * FIXME: this is only an issue in the 1 thread config. + */ + next_lock_holder->vio.completion.requeue = true; + launch_locked_request(next_lock_holder); +} + +/** + * release_logical_lock() - Release the logical block lock and flush generation lock at the end of + * processing a data_vio. + */ +static void release_logical_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct lbn_lock *lock = &data_vio->logical; + + assert_data_vio_in_logical_zone(data_vio); + + if (vdo_has_waiters(&lock->waiters)) + transfer_lock(data_vio, lock); + else + release_lock(data_vio, lock); + + vdo_release_flush_generation_lock(data_vio); + perform_cleanup_stage(data_vio, VIO_CLEANUP_DONE); +} + +/** clean_hash_lock() - Release the hash lock at the end of processing a data_vio. */ +static void clean_hash_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_hash_zone(data_vio); + if (completion->result != VDO_SUCCESS) { + vdo_clean_failed_hash_lock(data_vio); + return; + } + + vdo_release_hash_lock(data_vio); + perform_cleanup_stage(data_vio, VIO_RELEASE_LOGICAL); +} + +/** + * finish_cleanup() - Make some assertions about a data_vio which has finished cleaning up. + * + * If it is part of a multi-block discard, starts on the next block, otherwise, returns it to the + * pool. + */ +static void finish_cleanup(struct data_vio *data_vio) +{ + struct vdo_completion *completion = &data_vio->vio.completion; + + ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL, + "complete data_vio has no allocation lock"); + ASSERT_LOG_ONLY(data_vio->hash_lock == NULL, "complete data_vio has no hash lock"); + if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) || + (completion->result != VDO_SUCCESS)) { + struct data_vio_pool *pool = completion->vdo->data_vio_pool; + + uds_funnel_queue_put(pool->queue, &completion->work_queue_entry_link); + schedule_releases(pool); + return; + } + + data_vio->remaining_discard -= min_t(u32, + data_vio->remaining_discard, + VDO_BLOCK_SIZE - data_vio->offset); + data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE); + data_vio->read = data_vio->is_partial; + data_vio->offset = 0; + completion->requeue = true; + launch_data_vio(data_vio, data_vio->logical.lbn + 1); +} + +/** perform_cleanup_stage() - Perform the next step in the process of cleaning up a data_vio. */ +static void perform_cleanup_stage(struct data_vio *data_vio, enum data_vio_cleanup_stage stage) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + + switch (stage) { + case VIO_RELEASE_HASH_LOCK: + if (data_vio->hash_lock != NULL) { + launch_data_vio_hash_zone_callback(data_vio, clean_hash_lock); + return; + } + fallthrough; + + case VIO_RELEASE_ALLOCATED: + if (data_vio_has_allocation(data_vio)) { + launch_data_vio_allocated_zone_callback(data_vio, release_allocated_lock); + return; + } + fallthrough; + + case VIO_RELEASE_RECOVERY_LOCKS: + if ((data_vio->recovery_sequence_number > 0) && + (READ_ONCE(vdo->read_only_notifier.read_only_error) == VDO_SUCCESS) && + (data_vio->vio.completion.result != VDO_READ_ONLY)) + uds_log_warning("VDO not read-only when cleaning data_vio with RJ lock"); + fallthrough; + + case VIO_RELEASE_LOGICAL: + launch_data_vio_logical_callback(data_vio, release_logical_lock); + return; + + default: + finish_cleanup(data_vio); + } +} + +void complete_data_vio(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + completion->error_handler = NULL; + data_vio->last_async_operation = VIO_ASYNC_OP_CLEANUP; + perform_cleanup_stage(data_vio, + (data_vio->write ? VIO_CLEANUP_START : VIO_RELEASE_LOGICAL)); +} + +static void enter_read_only_mode(struct vdo_completion *completion) +{ + if (vdo_is_read_only(completion->vdo)) + return; + + if (completion->result != VDO_READ_ONLY) { + struct data_vio *data_vio = as_data_vio(completion); + + uds_log_error_strerror(completion->result, + "Preparing to enter read-only mode: data_vio for LBN %llu (becoming mapped to %llu, previously mapped to %llu, allocated %llu) is completing with a fatal error after operation %s", + (unsigned long long) data_vio->logical.lbn, + (unsigned long long) data_vio->new_mapped.pbn, + (unsigned long long) data_vio->mapped.pbn, + (unsigned long long) data_vio->allocation.pbn, + get_data_vio_operation_name(data_vio)); + } + + vdo_enter_read_only_mode(completion->vdo, completion->result); +} + +void handle_data_vio_error(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + if ((completion->result == VDO_READ_ONLY) || (data_vio->user_bio == NULL)) + enter_read_only_mode(completion); + + update_data_vio_error_stats(data_vio); + complete_data_vio(completion); +} + +/** + * get_data_vio_operation_name() - Get the name of the last asynchronous operation performed on a + * data_vio. + */ +const char *get_data_vio_operation_name(struct data_vio *data_vio) +{ + STATIC_ASSERT((MAX_VIO_ASYNC_OPERATION_NUMBER - MIN_VIO_ASYNC_OPERATION_NUMBER) == + ARRAY_SIZE(ASYNC_OPERATION_NAMES)); + + return ((data_vio->last_async_operation < MAX_VIO_ASYNC_OPERATION_NUMBER) ? + ASYNC_OPERATION_NAMES[data_vio->last_async_operation] : + "unknown async operation"); +} + +/** + * data_vio_allocate_data_block() - Allocate a data block. + * + * @write_lock_type: The type of write lock to obtain on the block. + * @callback: The callback which will attempt an allocation in the current zone and continue if it + * succeeds. + * @error_handler: The handler for errors while allocating. + */ +void data_vio_allocate_data_block(struct data_vio *data_vio, + enum pbn_lock_type write_lock_type, + vdo_action *callback, + vdo_action *error_handler) +{ + struct allocation *allocation = &data_vio->allocation; + + ASSERT_LOG_ONLY((allocation->pbn == VDO_ZERO_BLOCK), + "data_vio does not have an allocation"); + allocation->write_lock_type = write_lock_type; + allocation->zone = vdo_get_next_allocation_zone(data_vio->logical.zone); + allocation->first_allocation_zone = allocation->zone->zone_number; + + data_vio->vio.completion.error_handler = error_handler; + launch_data_vio_allocated_zone_callback(data_vio, callback); +} + +void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset) +{ + struct allocation *allocation = &data_vio->allocation; + physical_block_number_t locked_pbn = allocation->pbn; + + assert_data_vio_in_allocated_zone(data_vio); + + if (reset || vdo_pbn_lock_has_provisional_reference(allocation->lock)) + allocation->pbn = VDO_ZERO_BLOCK; + + vdo_release_physical_zone_pbn_lock(allocation->zone, + locked_pbn, + UDS_FORGET(allocation->lock)); +} + +/** + * uncompress_data_vio() - Uncompress the data a data_vio has just read. + * @mapping_state: The mapping state indicating which fragment to decompress. + * @buffer: The buffer to receive the uncompressed data. + */ +int uncompress_data_vio(struct data_vio *data_vio, + enum block_mapping_state mapping_state, + char *buffer) +{ + int size; + u16 fragment_offset, fragment_size; + struct compressed_block *block = data_vio->compression.block; + int result = vdo_get_compressed_block_fragment(mapping_state, + block, + &fragment_offset, + &fragment_size); + + if (result != VDO_SUCCESS) { + uds_log_debug("%s: compressed fragment error %d", __func__, result); + return result; + } + + size = LZ4_decompress_safe((block->data + fragment_offset), + buffer, + fragment_size, + VDO_BLOCK_SIZE); + if (size != VDO_BLOCK_SIZE) { + uds_log_debug("%s: lz4 error", __func__); + return VDO_INVALID_FRAGMENT; + } + + return VDO_SUCCESS; +} + +/** + * modify_for_partial_write() - Do the modify-write part of a read-modify-write cycle. + * @completion: The data_vio which has just finished its read. + * + * This callback is registered in read_block(). + */ +static void modify_for_partial_write(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + char *data = data_vio->vio.data; + struct bio *bio = data_vio->user_bio; + + assert_data_vio_on_cpu_thread(data_vio); + + if (bio_op(bio) == REQ_OP_DISCARD) { + memset(data + data_vio->offset, '\0', min_t(u32, + data_vio->remaining_discard, + VDO_BLOCK_SIZE - data_vio->offset)); + } else { + copy_from_bio(bio, data + data_vio->offset); + } + + data_vio->is_zero = is_zero_block(data); + data_vio->read = false; + launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot); +} + +static void complete_read(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + char *data = data_vio->vio.data; + bool compressed = vdo_is_state_compressed(data_vio->mapped.state); + + assert_data_vio_on_cpu_thread(data_vio); + + if (compressed) { + int result = uncompress_data_vio(data_vio, data_vio->mapped.state, data); + + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + } + + if (data_vio->write) { + modify_for_partial_write(completion); + return; + } + + if (compressed || data_vio->is_partial) + copy_to_bio(data_vio->user_bio, data + data_vio->offset); + + acknowledge_data_vio(data_vio); + complete_data_vio(completion); +} + +static void read_endio(struct bio *bio) +{ + struct data_vio *data_vio = vio_as_data_vio(bio->bi_private); + int result = blk_status_to_errno(bio->bi_status); + + vdo_count_completed_bios(bio); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + launch_data_vio_cpu_callback(data_vio, complete_read, CPU_Q_COMPLETE_READ_PRIORITY); +} + +static void complete_zero_read(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_on_cpu_thread(data_vio); + + if (data_vio->is_partial) { + memset(data_vio->vio.data, 0, VDO_BLOCK_SIZE); + if (data_vio->write) { + modify_for_partial_write(completion); + return; + } + } else { + zero_fill_bio(data_vio->user_bio); + } + + complete_read(completion); +} + +/** + * read_block() - Read a block asynchronously. + * + * This is the callback registered in read_block_mapping(). + */ +static void read_block(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct vio *vio = as_vio(completion); + int result = VDO_SUCCESS; + + if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) { + launch_data_vio_cpu_callback(data_vio, + complete_zero_read, + CPU_Q_COMPLETE_VIO_PRIORITY); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_READ_DATA_VIO; + if (vdo_is_state_compressed(data_vio->mapped.state)) { + result = vio_reset_bio(vio, + (char *) data_vio->compression.block, + read_endio, + REQ_OP_READ, + data_vio->mapped.pbn); + } else { + int opf = ((data_vio->user_bio->bi_opf & PASSTHROUGH_FLAGS) | REQ_OP_READ); + + if (data_vio->is_partial) { + result = vio_reset_bio(vio, + vio->data, + read_endio, + opf, + data_vio->mapped.pbn); + } else { + /* A full 4k read. Use the incoming bio to avoid having to copy the data */ + bio_reset(vio->bio, vio->bio->bi_bdev, opf); + bio_init_clone(data_vio->user_bio->bi_bdev, + vio->bio, + data_vio->user_bio, + GFP_KERNEL); + + /* Copy over the original bio iovec and opflags. */ + vdo_set_bio_properties(vio->bio, + vio, + read_endio, + opf, + data_vio->mapped.pbn); + } + } + + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + submit_data_vio_io(data_vio); +} + +static inline struct data_vio * +reference_count_update_completion_as_data_vio(struct vdo_completion *completion) +{ + if (completion->type == VIO_COMPLETION) + return as_data_vio(completion); + + return container_of(completion, struct data_vio, decrement_completion); +} + +/** + * update_block_map() - Rendezvous of the data_vio and decrement completions after each has + * made its reference updates. Handle any error from either, or proceed + * to updating the block map. + * @completion: The completion of the write in progress. + */ +static void update_block_map(struct vdo_completion *completion) +{ + struct data_vio *data_vio = reference_count_update_completion_as_data_vio(completion); + + assert_data_vio_in_logical_zone(data_vio); + + if (!data_vio->first_reference_operation_complete) { + /* Rendezvous, we're first */ + data_vio->first_reference_operation_complete = true; + return; + } + + completion = &data_vio->vio.completion; + vdo_set_completion_result(completion, data_vio->decrement_completion.result); + if (completion->result != VDO_SUCCESS) { + handle_data_vio_error(completion); + return; + } + + completion->error_handler = handle_data_vio_error; + if (data_vio->hash_lock != NULL) + set_data_vio_hash_zone_callback(data_vio, vdo_continue_hash_lock); + else + completion->callback = complete_data_vio; + + data_vio->last_async_operation = VIO_ASYNC_OP_PUT_MAPPED_BLOCK; + vdo_put_mapped_block(data_vio); +} + +static void decrement_reference_count(struct vdo_completion *completion) +{ + struct data_vio *data_vio = container_of(completion, + struct data_vio, + decrement_completion); + + assert_data_vio_in_mapped_zone(data_vio); + + vdo_set_completion_callback(completion, + update_block_map, + data_vio->logical.zone->thread_id); + completion->error_handler = update_block_map; + vdo_modify_reference_count(completion, &data_vio->decrement_updater); +} + +static void increment_reference_count(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_new_mapped_zone(data_vio); + + if (data_vio->downgrade_allocation_lock) { + /* + * Now that the data has been written, it's safe to deduplicate against the + * block. Downgrade the allocation lock to a read lock so it can be used later by + * the hash lock. This is done here since it needs to happen sometime before we + * return to the hash zone, and we are currently on the correct thread. For + * compressed blocks, the downgrade will have already been done. + */ + vdo_downgrade_pbn_write_lock(data_vio->allocation.lock, false); + } + + set_data_vio_logical_callback(data_vio, update_block_map); + completion->error_handler = update_block_map; + vdo_modify_reference_count(completion, &data_vio->increment_updater); +} + +/** journal_remapping() - Add a recovery journal entry for a data remapping. */ +static void journal_remapping(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_journal_zone(data_vio); + + data_vio->decrement_updater.operation = VDO_JOURNAL_DATA_REMAPPING; + data_vio->decrement_updater.zpbn = data_vio->mapped; + if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) { + data_vio->first_reference_operation_complete = true; + if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) + set_data_vio_logical_callback(data_vio, update_block_map); + } else { + set_data_vio_new_mapped_zone_callback(data_vio, increment_reference_count); + } + + if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) + data_vio->first_reference_operation_complete = true; + else + vdo_set_completion_callback(&data_vio->decrement_completion, + decrement_reference_count, + data_vio->mapped.zone->thread_id); + + data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_REMAPPING; + vdo_add_recovery_journal_entry(completion->vdo->recovery_journal, data_vio); +} + +/** + * read_old_block_mapping() - Get the previous PBN/LBN mapping of an in-progress write. + * + * Gets the previous PBN mapped to this LBN from the block map, so as to make an appropriate + * journal entry referencing the removal of this LBN->PBN mapping. + */ +static void read_old_block_mapping(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_logical_zone(data_vio); + + data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE; + set_data_vio_journal_callback(data_vio, journal_remapping); + vdo_get_mapped_block(data_vio); +} + +void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lock *lock) +{ + data_vio->increment_updater = (struct reference_updater) { + .operation = VDO_JOURNAL_DATA_REMAPPING, + .increment = true, + .zpbn = data_vio->new_mapped, + .lock = lock, + }; + + launch_data_vio_logical_callback(data_vio, read_old_block_mapping); +} + +/** + * pack_compressed_data() - Attempt to pack the compressed data_vio into a block. + * + * This is the callback registered in launch_compress_data_vio(). + */ +static void pack_compressed_data(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_packer_zone(data_vio); + + if (!vdo_get_compressing(vdo_from_data_vio(data_vio)) || + get_data_vio_compression_status(data_vio).may_not_compress) { + write_data_vio(data_vio); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_PACKING; + vdo_attempt_packing(data_vio); +} + +/** + * compress_data_vio() - Do the actual work of compressing the data on a CPU queue. + * + * This callback is registered in launch_compress_data_vio(). + */ +static void compress_data_vio(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + int size; + + assert_data_vio_on_cpu_thread(data_vio); + + /* + * By putting the compressed data at the start of the compressed block data field, we won't + * need to copy it if this data_vio becomes a compressed write agent. + */ + size = LZ4_compress_default(data_vio->vio.data, + data_vio->compression.block->data, + VDO_BLOCK_SIZE, + VDO_MAX_COMPRESSED_FRAGMENT_SIZE, + (char *) vdo_get_work_queue_private_data()); + if ((size > 0) && (size < VDO_COMPRESSED_BLOCK_DATA_SIZE)) { + data_vio->compression.size = size; + launch_data_vio_packer_callback(data_vio, pack_compressed_data); + return; + } + + write_data_vio(data_vio); +} + +/** + * launch_compress_data_vio() - Continue a write by attempting to compress the data. + * + * This is a re-entry point to vio_write used by hash locks. + */ +void launch_compress_data_vio(struct data_vio *data_vio) +{ + ASSERT_LOG_ONLY(!data_vio->is_duplicate, "compressing a non-duplicate block"); + ASSERT_LOG_ONLY(data_vio->hash_lock != NULL, "data_vio to compress has a hash_lock"); + ASSERT_LOG_ONLY(data_vio_has_allocation(data_vio), + "data_vio to compress has an allocation"); + + /* + * There are 4 reasons why a data_vio which has reached this point will not be eligible for + * compression: + * + * 1) Since data_vios can block indefinitely in the packer, it would be bad to do so if the + * write request also requests FUA. + * + * 2) A data_vio should not be compressed when compression is disabled for the vdo. + * + * 3) A data_vio could be doing a partial write on behalf of a larger discard which has not + * yet been acknowledged and hence blocking in the packer would be bad. + * + * 4) Some other data_vio may be waiting on this data_vio in which case blocking in the + * packer would also be bad. + */ + if (data_vio->fua || + !vdo_get_compressing(vdo_from_data_vio(data_vio)) || + ((data_vio->user_bio != NULL) && (bio_op(data_vio->user_bio) == REQ_OP_DISCARD)) || + (advance_data_vio_compression_stage(data_vio).stage != DATA_VIO_COMPRESSING)) { + write_data_vio(data_vio); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_COMPRESS_DATA_VIO; + launch_data_vio_cpu_callback(data_vio, compress_data_vio, CPU_Q_COMPRESS_BLOCK_PRIORITY); +} + +/** + * hash_data_vio() - Hash the data in a data_vio and set the hash zone (which also flags the record + * name as set). + + * This callback is registered in prepare_for_dedupe(). + */ +static void hash_data_vio(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_on_cpu_thread(data_vio); + ASSERT_LOG_ONLY(!data_vio->is_zero, "zero blocks should not be hashed"); + + murmurhash3_128(data_vio->vio.data, + VDO_BLOCK_SIZE, + 0x62ea60be, + &data_vio->record_name); + + data_vio->hash_zone = vdo_select_hash_zone(vdo_from_data_vio(data_vio)->hash_zones, + &data_vio->record_name); + data_vio->last_async_operation = VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK; + launch_data_vio_hash_zone_callback(data_vio, vdo_acquire_hash_lock); +} + +/** prepare_for_dedupe() - Prepare for the dedupe path after attempting to get an allocation. */ +static void prepare_for_dedupe(struct data_vio *data_vio) +{ + /* We don't care what thread we are on. */ + ASSERT_LOG_ONLY(!data_vio->is_zero, "must not prepare to dedupe zero blocks"); + + /* + * Before we can dedupe, we need to know the record name, so the first + * step is to hash the block data. + */ + data_vio->last_async_operation = VIO_ASYNC_OP_HASH_DATA_VIO; + launch_data_vio_cpu_callback(data_vio, hash_data_vio, CPU_Q_HASH_BLOCK_PRIORITY); +} + +/** + * write_bio_finished() - This is the bio_end_io function registered in write_block() to be called + * when a data_vio's write to the underlying storage has completed. + */ +static void write_bio_finished(struct bio *bio) +{ + struct data_vio *data_vio = vio_as_data_vio((struct vio *) bio->bi_private); + + vdo_count_completed_bios(bio); + vdo_set_completion_result(&data_vio->vio.completion, blk_status_to_errno(bio->bi_status)); + data_vio->downgrade_allocation_lock = true; + update_metadata_for_data_vio_write(data_vio, data_vio->allocation.lock); +} + +/** write_data_vio() - Write a data block to storage without compression. */ +void write_data_vio(struct data_vio *data_vio) +{ + struct data_vio_compression_status status, new_status; + int result; + + if (!data_vio_has_allocation(data_vio)) { + /* + * There was no space to write this block and we failed to deduplicate or compress + * it. + */ + continue_data_vio_with_error(data_vio, VDO_NO_SPACE); + return; + } + + new_status = (struct data_vio_compression_status) { + .stage = DATA_VIO_POST_PACKER, + .may_not_compress = true, + }; + + do { + status = get_data_vio_compression_status(data_vio); + } while ((status.stage != DATA_VIO_POST_PACKER) && + !set_data_vio_compression_status(data_vio, status, new_status)); + + /* Write the data from the data block buffer. */ + result = vio_reset_bio(&data_vio->vio, + data_vio->vio.data, + write_bio_finished, + REQ_OP_WRITE, + data_vio->allocation.pbn); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_WRITE_DATA_VIO; + submit_data_vio_io(data_vio); +} + +/** + * acknowledge_write_callback() - Acknowledge a write to the requestor. + * + * This callback is registered in allocate_block() and continue_write_with_block_map_slot(). + */ +static void acknowledge_write_callback(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct vdo *vdo = completion->vdo; + + ASSERT_LOG_ONLY((!vdo_uses_bio_ack_queue(vdo) || + (vdo_get_callback_thread_id() == vdo->thread_config.bio_ack_thread)), + "%s() called on bio ack queue", + __func__); + ASSERT_LOG_ONLY(data_vio_has_flush_generation_lock(data_vio), + "write VIO to be acknowledged has a flush generation lock"); + acknowledge_data_vio(data_vio); + if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) { + /* This is a zero write or discard */ + update_metadata_for_data_vio_write(data_vio, NULL); + return; + } + + prepare_for_dedupe(data_vio); +} + +/** + * allocate_block() - Attempt to allocate a block in the current allocation zone. + * + * This callback is registered in continue_write_with_block_map_slot(). + */ +static void allocate_block(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_allocated_zone(data_vio); + + if (!vdo_allocate_block_in_zone(data_vio)) + return; + + completion->error_handler = handle_data_vio_error; + WRITE_ONCE(data_vio->allocation_succeeded, true); + data_vio->new_mapped = (struct zoned_pbn) { + .zone = data_vio->allocation.zone, + .pbn = data_vio->allocation.pbn, + .state = VDO_MAPPING_STATE_UNCOMPRESSED, + }; + + if (data_vio->fua) { + prepare_for_dedupe(data_vio); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE; + launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback); +} + +/** + * handle_allocation_error() - Handle an error attempting to allocate a block. + * + * This error handler is registered in continue_write_with_block_map_slot(). + */ +static void handle_allocation_error(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + if (completion->result == VDO_NO_SPACE) { + /* We failed to get an allocation, but we can try to dedupe. */ + vdo_reset_completion(completion); + completion->error_handler = handle_data_vio_error; + prepare_for_dedupe(data_vio); + return; + } + + /* We got a "real" error, not just a failure to allocate, so fail the request. */ + handle_data_vio_error(completion); +} + +static int assert_is_trim(struct data_vio *data_vio) +{ + int result = ASSERT(data_vio->is_trim, "data_vio with no block map page is a trim"); + + return ((result == VDO_SUCCESS) ? result : VDO_READ_ONLY); +} + +/** + * continue_data_vio_with_block_map_slot() - Read the data_vio's mapping from the block map. + * + * This callback is registered in launch_read_data_vio(). + */ +void continue_data_vio_with_block_map_slot(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_logical_zone(data_vio); + if (data_vio->read) { + set_data_vio_logical_callback(data_vio, read_block); + data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ; + vdo_get_mapped_block(data_vio); + return; + } + + vdo_acquire_flush_generation_lock(data_vio); + + if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) { + /* + * This is a trim for a block on a block map page which has not been allocated, so + * there's nothing more we need to do. + */ + completion->callback = complete_data_vio; + continue_data_vio_with_error(data_vio, assert_is_trim(data_vio)); + return; + } + + /* + * We need an allocation if this is neither a full-block trim nor a + * full-block zero write. + */ + if (!data_vio->is_zero && (!data_vio->is_trim || data_vio->is_partial)) { + data_vio_allocate_data_block(data_vio, + VIO_WRITE_LOCK, + allocate_block, + handle_allocation_error); + return; + } + + + /* + * We don't need to write any data, so skip allocation and just update the block map and + * reference counts (via the journal). + */ + data_vio->new_mapped.pbn = VDO_ZERO_BLOCK; + if (data_vio->is_zero) + data_vio->new_mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED; + + if (data_vio->remaining_discard > VDO_BLOCK_SIZE) { + /* This is not the final block of a discard so we can't acknowledge it yet. */ + update_metadata_for_data_vio_write(data_vio, NULL); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE; + launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback); +} diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h new file mode 100644 index 00000000000..0049d6ed0a8 --- /dev/null +++ b/drivers/md/dm-vdo/data-vio.h @@ -0,0 +1,689 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef DATA_VIO_H +#define DATA_VIO_H + +#include <linux/atomic.h> +#include <linux/bio.h> +#include <linux/list.h> + +#include "permassert.h" +#include "uds.h" + +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "dedupe.h" +#include "encodings.h" +#include "logical-zone.h" +#include "physical-zone.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" +#include "wait-queue.h" + +/* Codes for describing the last asynchronous operation performed on a vio. */ +enum async_operation_number { + MIN_VIO_ASYNC_OPERATION_NUMBER, + VIO_ASYNC_OP_LAUNCH = MIN_VIO_ASYNC_OPERATION_NUMBER, + VIO_ASYNC_OP_ACKNOWLEDGE_WRITE, + VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK, + VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK, + VIO_ASYNC_OP_LOCK_DUPLICATE_PBN, + VIO_ASYNC_OP_CHECK_FOR_DUPLICATION, + VIO_ASYNC_OP_CLEANUP, + VIO_ASYNC_OP_COMPRESS_DATA_VIO, + VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT, + VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ, + VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE, + VIO_ASYNC_OP_HASH_DATA_VIO, + VIO_ASYNC_OP_JOURNAL_REMAPPING, + VIO_ASYNC_OP_ATTEMPT_PACKING, + VIO_ASYNC_OP_PUT_MAPPED_BLOCK, + VIO_ASYNC_OP_READ_DATA_VIO, + VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX, + VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS, + VIO_ASYNC_OP_VERIFY_DUPLICATION, + VIO_ASYNC_OP_WRITE_DATA_VIO, + MAX_VIO_ASYNC_OPERATION_NUMBER, +} __packed; + +struct lbn_lock { + logical_block_number_t lbn; + bool locked; + struct wait_queue waiters; + struct logical_zone *zone; +}; + +/* A position in the arboreal block map at a specific level. */ +struct block_map_tree_slot { + page_number_t page_index; + struct block_map_slot block_map_slot; +}; + +/* Fields for using the arboreal block map. */ +struct tree_lock { + /* The current height at which this data_vio is operating */ + height_t height; + /* The block map tree for this LBN */ + root_count_t root_index; + /* Whether we hold a page lock */ + bool locked; + /* The key for the lock map */ + u64 key; + /* The queue of waiters for the page this vio is allocating or loading */ + struct wait_queue waiters; + /* The block map tree slots for this LBN */ + struct block_map_tree_slot tree_slots[VDO_BLOCK_MAP_TREE_HEIGHT + 1]; +}; + +struct zoned_pbn { + physical_block_number_t pbn; + enum block_mapping_state state; + struct physical_zone *zone; +}; + +/* + * Where a data_vio is on the compression path; advance_compression_stage() depends on the order of + * this enum. + */ +enum data_vio_compression_stage { + /* A data_vio which has not yet entered the compression path */ + DATA_VIO_PRE_COMPRESSOR, + /* A data_vio which is in the compressor */ + DATA_VIO_COMPRESSING, + /* A data_vio which is blocked in the packer */ + DATA_VIO_PACKING, + /* A data_vio which is no longer on the compression path (and never will be) */ + DATA_VIO_POST_PACKER, +}; + +struct data_vio_compression_status { + enum data_vio_compression_stage stage; + bool may_not_compress; +}; + +struct compression_state { + /* + * The current compression status of this data_vio. This field contains a value which + * consists of a data_vio_compression_stage and a flag indicating whether a request has + * been made to cancel (or prevent) compression for this data_vio. + * + * This field should be accessed through the get_data_vio_compression_status() and + * set_data_vio_compression_status() methods. It should not be accessed directly. + */ + atomic_t status; + + /* The compressed size of this block */ + u16 size; + + /* The packer input or output bin slot which holds the enclosing data_vio */ + slot_number_t slot; + + /* The packer bin to which the enclosing data_vio has been assigned */ + struct packer_bin *bin; + + /* A link in the chain of data_vios which have been packed together */ + struct data_vio *next_in_batch; + + /* A vio which is blocked in the packer while holding a lock this vio needs. */ + struct data_vio *lock_holder; + + /* + * The compressed block used to hold the compressed form of this block and that of any + * other blocks for which this data_vio is the compressed write agent. + */ + struct compressed_block *block; +}; + +/* Fields supporting allocation of data blocks. */ +struct allocation { + /* The physical zone in which to allocate a physical block */ + struct physical_zone *zone; + + /* The block allocated to this vio */ + physical_block_number_t pbn; + + /* + * If non-NULL, the pooled PBN lock held on the allocated block. Must be a write lock until + * the block has been written, after which it will become a read lock. + */ + struct pbn_lock *lock; + + /* The type of write lock to obtain on the allocated block */ + enum pbn_lock_type write_lock_type; + + /* The zone which was the start of the current allocation cycle */ + zone_count_t first_allocation_zone; + + /* Whether this vio should wait for a clean slab */ + bool wait_for_clean_slab; +}; + +struct reference_updater { + enum journal_operation operation; + bool increment; + struct zoned_pbn zpbn; + struct pbn_lock *lock; + struct waiter waiter; +}; + +/* A vio for processing user data requests. */ +struct data_vio { + /* The wait_queue entry structure */ + struct waiter waiter; + + /* The logical block of this request */ + struct lbn_lock logical; + + /* The state for traversing the block map tree */ + struct tree_lock tree_lock; + + /* The current partition address of this block */ + struct zoned_pbn mapped; + + /* The hash of this vio (if not zero) */ + struct uds_record_name record_name; + + /* Used for logging and debugging */ + enum async_operation_number last_async_operation; + + /* The operations to record in the recovery and slab journals */ + struct reference_updater increment_updater; + struct reference_updater decrement_updater; + + u16 read : 1; + u16 write : 1; + u16 fua : 1; + u16 is_zero : 1; + u16 is_trim : 1; + u16 is_partial : 1; + u16 is_duplicate : 1; + u16 first_reference_operation_complete : 1; + u16 downgrade_allocation_lock : 1; + + struct allocation allocation; + + /* + * Whether this vio has received an allocation. This field is examined from threads not in + * the allocation zone. + */ + bool allocation_succeeded; + + /* The new partition address of this block after the vio write completes */ + struct zoned_pbn new_mapped; + + /* The hash zone responsible for the name (NULL if is_zero_block) */ + struct hash_zone *hash_zone; + + /* The lock this vio holds or shares with other vios with the same data */ + struct hash_lock *hash_lock; + + /* All data_vios sharing a hash lock are kept in a list linking these list entries */ + struct list_head hash_lock_entry; + + /* The block number in the partition of the UDS deduplication advice */ + struct zoned_pbn duplicate; + + /* + * The sequence number of the recovery journal block containing the increment entry for + * this vio. + */ + sequence_number_t recovery_sequence_number; + + /* The point in the recovery journal where this write last made an entry */ + struct journal_point recovery_journal_point; + + /* The list of vios in user initiated write requests */ + struct list_head write_entry; + + /* The generation number of the VDO that this vio belongs to */ + sequence_number_t flush_generation; + + /* The completion to use for fetching block map pages for this vio */ + struct vdo_page_completion page_completion; + + /* The user bio that initiated this VIO */ + struct bio *user_bio; + + /* partial block support */ + block_size_t offset; + + /* + * The number of bytes to be discarded. For discards, this field will always be positive, + * whereas for non-discards it will always be 0. Hence it can be used to determine whether + * a data_vio is processing a discard, even after the user_bio has been acknowledged. + */ + u32 remaining_discard; + + struct dedupe_context *dedupe_context; + + /* Fields beyond this point will not be reset when a pooled data_vio is reused. */ + + struct vio vio; + + /* The completion for making reference count decrements */ + struct vdo_completion decrement_completion; + + /* All of the fields necessary for the compression path */ + struct compression_state compression; + + /* A block used as output during compression or uncompression */ + char *scratch_block; + + struct list_head pool_entry; +}; + +static inline struct data_vio *vio_as_data_vio(struct vio *vio) +{ + ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio"); + return container_of(vio, struct data_vio, vio); +} + +static inline struct data_vio *as_data_vio(struct vdo_completion *completion) +{ + return vio_as_data_vio(as_vio(completion)); +} + +static inline struct data_vio *waiter_as_data_vio(struct waiter *waiter) +{ + if (waiter == NULL) + return NULL; + + return container_of(waiter, struct data_vio, waiter); +} + +static inline struct data_vio * +data_vio_from_reference_updater(struct reference_updater *updater) +{ + if (updater->increment) + return container_of(updater, struct data_vio, increment_updater); + + return container_of(updater, struct data_vio, decrement_updater); +} + +static inline bool data_vio_has_flush_generation_lock(struct data_vio *data_vio) +{ + return !list_empty(&data_vio->write_entry); +} + +static inline struct vdo *vdo_from_data_vio(struct data_vio *data_vio) +{ + return data_vio->vio.completion.vdo; +} + +static inline bool data_vio_has_allocation(struct data_vio *data_vio) +{ + return (data_vio->allocation.pbn != VDO_ZERO_BLOCK); +} + +struct data_vio_compression_status __must_check +advance_data_vio_compression_stage(struct data_vio *data_vio); +struct data_vio_compression_status __must_check +get_data_vio_compression_status(struct data_vio *data_vio); +bool cancel_data_vio_compression(struct data_vio *data_vio); + +struct data_vio_pool; + +int make_data_vio_pool(struct vdo *vdo, + data_vio_count_t pool_size, + data_vio_count_t discard_limit, + struct data_vio_pool **pool_ptr); +void free_data_vio_pool(struct data_vio_pool *pool); +void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio); +void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion); +void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion); + +void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios); +data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool); +data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool); +data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool); +int __must_check +set_data_vio_pool_discard_limit(struct data_vio_pool *pool, data_vio_count_t limit); +data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool); +data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool); +data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool); + +void complete_data_vio(struct vdo_completion *completion); +void handle_data_vio_error(struct vdo_completion *completion); + +static inline void continue_data_vio(struct data_vio *data_vio) +{ + vdo_launch_completion(&data_vio->vio.completion); +} + +/** + * continue_data_vio_with_error() - Set an error code and then continue processing a data_vio. + * + * This will not mask older errors. This function can be called with a success code, but it is more + * efficient to call continue_data_vio() if the caller knows the result was a success. + */ +static inline void continue_data_vio_with_error(struct data_vio *data_vio, int result) +{ + vdo_continue_completion(&data_vio->vio.completion, result); +} + +const char * __must_check get_data_vio_operation_name(struct data_vio *data_vio); + +static inline void assert_data_vio_in_hash_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->hash_zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + /* + * It's odd to use the LBN, but converting the record name to hex is a bit clunky for an + * inline, and the LBN better than nothing as an identifier. + */ + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for logical block %llu on thread %u, should be on hash zone thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + expected); +} + +static inline void set_data_vio_hash_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + data_vio->hash_zone->thread_id); +} + +/** + * launch_data_vio_hash_zone_callback() - Set a callback as a hash zone operation and invoke it + * immediately. + */ +static inline void +launch_data_vio_hash_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_hash_zone_callback(data_vio, callback); + vdo_launch_completion(&data_vio->vio.completion); +} + +static inline void assert_data_vio_in_logical_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->logical.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for logical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + expected); +} + +static inline void set_data_vio_logical_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + data_vio->logical.zone->thread_id); +} + +/** + * launch_data_vio_logical_callback() - Set a callback as a logical block operation and invoke it + * immediately. + */ +static inline void +launch_data_vio_logical_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_logical_callback(data_vio, callback); + vdo_launch_completion(&data_vio->vio.completion); +} + +static inline void assert_data_vio_in_allocated_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->allocation.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "struct data_vio for allocated physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->allocation.pbn, + thread_id, + expected); +} + +static inline void +set_data_vio_allocated_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + data_vio->allocation.zone->thread_id); +} + +/** + * launch_data_vio_allocated_zone_callback() - Set a callback as a physical block operation in a + * data_vio's allocated zone and queue the data_vio and + * invoke it immediately. + */ +static inline void +launch_data_vio_allocated_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_allocated_zone_callback(data_vio, callback); + vdo_launch_completion(&data_vio->vio.completion); +} + +static inline void assert_data_vio_in_duplicate_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->duplicate.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for duplicate physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->duplicate.pbn, + thread_id, + expected); +} + +static inline void +set_data_vio_duplicate_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + data_vio->duplicate.zone->thread_id); +} + +/** + * launch_data_vio_duplicate_zone_callback() - Set a callback as a physical block operation in a + * data_vio's duplicate zone and queue the data_vio and + * invoke it immediately. + */ +static inline void +launch_data_vio_duplicate_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_duplicate_zone_callback(data_vio, callback); + vdo_launch_completion(&data_vio->vio.completion); +} + +static inline void assert_data_vio_in_mapped_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->mapped.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for mapped physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->mapped.pbn, + thread_id, + expected); +} + +static inline void +set_data_vio_mapped_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + data_vio->mapped.zone->thread_id); +} + +static inline void assert_data_vio_in_new_mapped_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->new_mapped.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for new_mapped physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->new_mapped.pbn, + thread_id, + expected); +} + +static inline void +set_data_vio_new_mapped_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + data_vio->new_mapped.zone->thread_id); +} + +static inline void assert_data_vio_in_journal_zone(struct data_vio *data_vio) +{ + thread_id_t journal_thread = vdo_from_data_vio(data_vio)->thread_config.journal_thread; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((journal_thread == thread_id), + "data_vio for logical block %llu on thread %u, should be on journal thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + journal_thread); +} + +static inline void set_data_vio_journal_callback(struct data_vio *data_vio, vdo_action *callback) +{ + thread_id_t journal_thread = vdo_from_data_vio(data_vio)->thread_config.journal_thread; + + vdo_set_completion_callback(&data_vio->vio.completion, callback, journal_thread); +} + +/** + * launch_data_vio_journal_callback() - Set a callback as a journal operation and invoke it + * immediately. + */ +static inline void +launch_data_vio_journal_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_journal_callback(data_vio, callback); + vdo_launch_completion(&data_vio->vio.completion); +} + +static inline void assert_data_vio_in_packer_zone(struct data_vio *data_vio) +{ + thread_id_t packer_thread = vdo_from_data_vio(data_vio)->thread_config.packer_thread; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((packer_thread == thread_id), + "data_vio for logical block %llu on thread %u, should be on packer thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + packer_thread); +} + +static inline void set_data_vio_packer_callback(struct data_vio *data_vio, vdo_action *callback) +{ + thread_id_t packer_thread = vdo_from_data_vio(data_vio)->thread_config.packer_thread; + + vdo_set_completion_callback(&data_vio->vio.completion, callback, packer_thread); +} + +/** + * launch_data_vio_packer_callback() - Set a callback as a packer operation and invoke it + * immediately. + */ +static inline void launch_data_vio_packer_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_packer_callback(data_vio, callback); + vdo_launch_completion(&data_vio->vio.completion); +} + +static inline void assert_data_vio_on_cpu_thread(struct data_vio *data_vio) +{ + thread_id_t cpu_thread = vdo_from_data_vio(data_vio)->thread_config.cpu_thread; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((cpu_thread == thread_id), + "data_vio for logical block %llu on thread %u, should be on cpu thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + cpu_thread); +} + +static inline void set_data_vio_cpu_callback(struct data_vio *data_vio, vdo_action *callback) +{ + thread_id_t cpu_thread = vdo_from_data_vio(data_vio)->thread_config.cpu_thread; + + vdo_set_completion_callback(&data_vio->vio.completion, callback, cpu_thread); +} + +/** + * launch_data_vio_cpu_callback() - Set a callback to run on the CPU queues and invoke it + * immediately. + */ +static inline void +launch_data_vio_cpu_callback(struct data_vio *data_vio, + vdo_action *callback, + enum vdo_completion_priority priority) +{ + set_data_vio_cpu_callback(data_vio, callback); + vdo_launch_completion_with_priority(&data_vio->vio.completion, priority); +} + +static inline void set_data_vio_bio_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + vdo_set_completion_callback(&data_vio->vio.completion, + callback, + get_vio_bio_zone_thread_id(&data_vio->vio)); +} + +/** + * launch_data_vio_bio_zone_callback() - Set a callback as a bio zone operation and invoke it + * immediately. + */ +static inline void +launch_data_vio_bio_zone_callback(struct data_vio *data_vio, vdo_action *callback) +{ + set_data_vio_bio_zone_callback(data_vio, callback); + vdo_launch_completion_with_priority(&data_vio->vio.completion, BIO_Q_DATA_PRIORITY); +} + +/** + * launch_data_vio_on_bio_ack_queue() - If the vdo uses a bio_ack queue, set a callback to run on + * it and invoke it immediately, otherwise, just run the + * callback on the current thread. + */ +static inline void +launch_data_vio_on_bio_ack_queue(struct data_vio *data_vio, vdo_action *callback) +{ + struct vdo_completion *completion = &data_vio->vio.completion; + struct vdo *vdo = completion->vdo; + + if (!vdo_uses_bio_ack_queue(vdo)) { + callback(completion); + return; + } + + vdo_set_completion_callback(completion, callback, vdo->thread_config.bio_ack_thread); + vdo_launch_completion_with_priority(completion, BIO_ACK_Q_ACK_PRIORITY); +} + +void data_vio_allocate_data_block(struct data_vio *data_vio, + enum pbn_lock_type write_lock_type, + vdo_action *callback, + vdo_action *error_handler); + +/** + * release_data_vio_allocation_lock() - Release the PBN lock on a data_vio's allocated block. + * @reset: If true, the allocation will be reset (i.e. any allocated pbn will be forgotten). + * + * If the reference to the locked block is still provisional, it will be released as well. + */ +void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset); + +int __must_check uncompress_data_vio(struct data_vio *data_vio, + enum block_mapping_state mapping_state, + char *buffer); + +void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lock *lock); +void write_data_vio(struct data_vio *data_vio); +void launch_compress_data_vio(struct data_vio *data_vio); +void continue_data_vio_with_block_map_slot(struct vdo_completion *completion); + +#endif /* DATA_VIO_H */ diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c new file mode 100644 index 00000000000..d71f93caf98 --- /dev/null +++ b/drivers/md/dm-vdo/dedupe.c @@ -0,0 +1,3073 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/** + * DOC: + * + * Hash Locks: + * + * A hash_lock controls and coordinates writing, index access, and dedupe among groups of data_vios + * concurrently writing identical blocks, allowing them to deduplicate not only against advice but + * also against each other. This saves on index queries and allows those data_vios to concurrently + * deduplicate against a single block instead of being serialized through a PBN read lock. Only one + * index query is needed for each hash_lock, instead of one for every data_vio. + * + * A hash_lock acts like a state machine perhaps more than as a lock. Other than the starting and + * ending states INITIALIZING and BYPASSING, every state represents and is held for the duration of + * an asynchronous operation. All state transitions are performed on the thread of the hash_zone + * containing the lock. An asynchronous operation is almost always performed upon entering a state, + * and the callback from that operation triggers exiting the state and entering a new state. + * + * In all states except DEDUPING, there is a single data_vio, called the lock agent, performing the + * asynchronous operations on behalf of the lock. The agent will change during the lifetime of the + * lock if the lock is shared by more than one data_vio. data_vios waiting to deduplicate are kept + * on a wait queue. Viewed a different way, the agent holds the lock exclusively until the lock + * enters the DEDUPING state, at which point it becomes a shared lock that all the waiters (and any + * new data_vios that arrive) use to share a PBN lock. In state DEDUPING, there is no agent. When + * the last data_vio in the lock calls back in DEDUPING, it becomes the agent and the lock becomes + * exclusive again. New data_vios that arrive in the lock will also go on the wait queue. + * + * The existence of lock waiters is a key factor controlling which state the lock transitions to + * next. When the lock is new or has waiters, it will always try to reach DEDUPING, and when it + * doesn't, it will try to clean up and exit. + * + * Deduping requires holding a PBN lock on a block that is known to contain data identical to the + * data_vios in the lock, so the lock will send the agent to the duplicate zone to acquire the PBN + * lock (LOCKING), to the kernel I/O threads to read and verify the data (VERIFYING), or to write a + * new copy of the data to a full data block or a slot in a compressed block (WRITING). + * + * Cleaning up consists of updating the index when the data location is different from the initial + * index query (UPDATING, triggered by stale advice, compression, and rollover), releasing the PBN + * lock on the duplicate block (UNLOCKING), and if the agent is the last data_vio referencing the + * lock, releasing the hash_lock itself back to the hash zone (BYPASSING). + * + * The shortest sequence of states is for non-concurrent writes of new data: + * INITIALIZING -> QUERYING -> WRITING -> BYPASSING + * This sequence is short because no PBN read lock or index update is needed. + * + * Non-concurrent, finding valid advice looks like this (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING -> + * Or with stale advice (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING -> + * + * When there are not enough available reference count increments available on a PBN for a data_vio + * to deduplicate, a new lock is forked and the excess waiters roll over to the new lock (which + * goes directly to WRITING). The new lock takes the place of the old lock in the lock map so new + * data_vios will be directed to it. The two locks will proceed independently, but only the new + * lock will have the right to update the index (unless it also forks). + * + * Since rollover happens in a lock instance, once a valid data location has been selected, it will + * not change. QUERYING and WRITING are only performed once per lock lifetime. All other + * non-endpoint states can be re-entered. + * + * The function names in this module follow a convention referencing the states and transitions in + * the state machine. For example, for the LOCKING state, there are start_locking() and + * finish_locking() functions. start_locking() is invoked by the finish function of the state (or + * states) that transition to LOCKING. It performs the actual lock state change and must be invoked + * on the hash zone thread. finish_locking() is called by (or continued via callback from) the + * code actually obtaining the lock. It does any bookkeeping or decision-making required and + * invokes the appropriate start function of the state being transitioned to after LOCKING. + * + * ---------------------------------------------------------------------- + * + * Index Queries: + * + * A query to the UDS index is handled asynchronously by the index's threads. When the query is + * complete, a callback supplied with the query will be called from one of the those threads. Under + * heavy system load, the index may be slower to respond then is desirable for reasonable I/O + * throughput. Since deduplication of writes is not necessary for correct operation of a VDO + * device, it is acceptable to timeout out slow index queries and proceed to fulfill a write + * request without deduplicating. However, because the uds_request struct itself is supplied by the + * caller, we can not simply reuse a uds_request object which we have chosen to timeout. Hence, + * each hash_zone maintains a pool of dedupe_contexts which each contain a uds_request along with a + * reference to the data_vio on behalf of which they are performing a query. + * + * When a hash_lock needs to query the index, it attempts to acquire an unused dedupe_context from + * its hash_zone's pool. If one is available, that context is prepared, associated with the + * hash_lock's agent, added to the list of pending contexts, and then sent to the index. The + * context's state will be transitioned from DEDUPE_CONTEXT_IDLE to DEDUPE_CONTEXT_PENDING. If all + * goes well, the dedupe callback will be called by the index which will change the context's state + * to DEDUPE_CONTEXT_COMPLETE, and the associated data_vio will be enqueued to run back in the hash + * zone where the query results will be processed and the context will be put back in the idle + * state and returned to the hash_zone's available list. + * + * The first time an index query is launched from a given hash_zone, a timer is started. When the + * timer fires, the hash_zone's completion is enqueued to run in the hash_zone where the zone's + * pending list will be searched for any contexts in the pending state which have been running for + * too long. Those contexts are transitioned to the DEDUPE_CONTEXT_TIMED_OUT state and moved to the + * zone's timed_out list where they won't be examined again if there is a subsequent time out). The + * data_vios associated with timed out contexts are sent to continue processing their write + * operation without deduplicating. The timer is also restarted. + * + * When the dedupe callback is run for a context which is in the timed out state, that context is + * moved to the DEDUPE_CONTEXT_TIMED_OUT_COMPLETE state. No other action need be taken as the + * associated data_vios have already been dispatched. + * + * If a hash_lock needs a dedupe context, and the available list is empty, the timed_out list will + * be searched for any contexts which are timed out and complete. One of these will be used + * immediately, and the rest will be returned to the available list and marked idle. + */ + +#include "dedupe.h" + +#include <linux/atomic.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/ratelimit.h> +#include <linux/spinlock.h> +#include <linux/timer.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" +#include "uds.h" + +#include "action-manager.h" +#include "admin-state.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "io-submitter.h" +#include "packer.h" +#include "physical-zone.h" +#include "pointer-map.h" +#include "slab-depot.h" +#include "statistics.h" +#include "types.h" +#include "vdo.h" +#include "wait-queue.h" + +struct uds_attribute { + struct attribute attr; + const char *(*show_string)(struct hash_zones *hash_zones); +}; + +enum timer_state { + DEDUPE_QUERY_TIMER_IDLE, + DEDUPE_QUERY_TIMER_RUNNING, + DEDUPE_QUERY_TIMER_FIRED, +}; + +enum dedupe_context_state { + DEDUPE_CONTEXT_IDLE, + DEDUPE_CONTEXT_PENDING, + DEDUPE_CONTEXT_TIMED_OUT, + DEDUPE_CONTEXT_COMPLETE, + DEDUPE_CONTEXT_TIMED_OUT_COMPLETE, +}; + +/* Possible index states: closed, opened, or transitioning between those two. */ +enum index_state { + IS_CLOSED, + IS_CHANGING, + IS_OPENED, +}; + +static const char *CLOSED = "closed"; +static const char *CLOSING = "closing"; +static const char *ERROR = "error"; +static const char *OFFLINE = "offline"; +static const char *ONLINE = "online"; +static const char *OPENING = "opening"; +static const char *SUSPENDED = "suspended"; +static const char *UNKNOWN = "unknown"; + +/* Version 2 uses the kernel space UDS index and is limited to 16 bytes */ +enum { + UDS_ADVICE_VERSION = 2, + /* version byte + state byte + 64-bit little-endian PBN */ + UDS_ADVICE_SIZE = 1 + 1 + sizeof(u64), +}; + +enum hash_lock_state { + /* State for locks that are not in use or are being initialized. */ + VDO_HASH_LOCK_INITIALIZING, + + /* This is the sequence of states typically used on the non-dedupe path. */ + VDO_HASH_LOCK_QUERYING, + VDO_HASH_LOCK_WRITING, + VDO_HASH_LOCK_UPDATING, + + /* The remaining states are typically used on the dedupe path in this order. */ + VDO_HASH_LOCK_LOCKING, + VDO_HASH_LOCK_VERIFYING, + VDO_HASH_LOCK_DEDUPING, + VDO_HASH_LOCK_UNLOCKING, + + /* + * Terminal state for locks returning to the pool. Must be last both because it's the final + * state, and also because it's used to count the states. + */ + VDO_HASH_LOCK_BYPASSING, +}; + +static const char * const LOCK_STATE_NAMES[] = { + [VDO_HASH_LOCK_BYPASSING] = "BYPASSING", + [VDO_HASH_LOCK_DEDUPING] = "DEDUPING", + [VDO_HASH_LOCK_INITIALIZING] = "INITIALIZING", + [VDO_HASH_LOCK_LOCKING] = "LOCKING", + [VDO_HASH_LOCK_QUERYING] = "QUERYING", + [VDO_HASH_LOCK_UNLOCKING] = "UNLOCKING", + [VDO_HASH_LOCK_UPDATING] = "UPDATING", + [VDO_HASH_LOCK_VERIFYING] = "VERIFYING", + [VDO_HASH_LOCK_WRITING] = "WRITING", +}; + +struct hash_lock { + /* The block hash covered by this lock */ + struct uds_record_name hash; + + /* When the lock is unused, this list entry allows the lock to be pooled */ + struct list_head pool_node; + + /* + * A list containing the data VIOs sharing this lock, all having the same record name and + * data block contents, linked by their hash_lock_node fields. + */ + struct list_head duplicate_ring; + + /* The number of data_vios sharing this lock instance */ + data_vio_count_t reference_count; + + /* The maximum value of reference_count in the lifetime of this lock */ + data_vio_count_t max_references; + + /* The current state of this lock */ + enum hash_lock_state state; + + /* True if the UDS index should be updated with new advice */ + bool update_advice; + + /* True if the advice has been verified to be a true duplicate */ + bool verified; + + /* True if the lock has already accounted for an initial verification */ + bool verify_counted; + + /* True if this lock is registered in the lock map (cleared on rollover) */ + bool registered; + + /* + * If verified is false, this is the location of a possible duplicate. If verified is true, + * it is the verified location of a true duplicate. + */ + struct zoned_pbn duplicate; + + /* The PBN lock on the block containing the duplicate data */ + struct pbn_lock *duplicate_lock; + + /* The data_vio designated to act on behalf of the lock */ + struct data_vio *agent; + + /* + * Other data_vios with data identical to the agent who are currently waiting for the agent + * to get the information they all need to deduplicate--either against each other, or + * against an existing duplicate on disk. + */ + struct wait_queue waiters; +}; + +enum { + LOCK_POOL_CAPACITY = MAXIMUM_VDO_USER_VIOS, +}; + +struct hash_zones { + struct action_manager *manager; + struct kobject dedupe_directory; + struct uds_parameters parameters; + struct uds_index_session *index_session; + struct ratelimit_state ratelimiter; + atomic64_t timeouts; + atomic64_t dedupe_context_busy; + + /* This spinlock protects the state fields and the starting of dedupe requests. */ + spinlock_t lock; + + /* The fields in the next block are all protected by the lock */ + struct vdo_completion completion; + enum index_state index_state; + enum index_state index_target; + struct admin_state state; + bool changing; + bool create_flag; + bool dedupe_flag; + bool error_flag; + u64 reported_timeouts; + + /* The number of zones */ + zone_count_t zone_count; + /* The hash zones themselves */ + struct hash_zone zones[]; +}; + +/* These are in milliseconds. */ +unsigned int vdo_dedupe_index_timeout_interval = 5000; +unsigned int vdo_dedupe_index_min_timer_interval = 100; +/* Same two variables, in jiffies for easier consumption. */ +static u64 vdo_dedupe_index_timeout_jiffies; +static u64 vdo_dedupe_index_min_timer_jiffies; + +static inline struct hash_zone *as_hash_zone(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_HASH_ZONE_COMPLETION); + return container_of(completion, struct hash_zone, completion); +} + +static inline struct hash_zones *as_hash_zones(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_HASH_ZONES_COMPLETION); + return container_of(completion, struct hash_zones, completion); +} + +static inline void assert_in_hash_zone(struct hash_zone *zone, const char *name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id), + "%s called on hash zone thread", + name); +} + +static inline bool change_context_state(struct dedupe_context *context, int old, int new) +{ + return (atomic_cmpxchg(&context->state, old, new) == old); +} + +static inline bool change_timer_state(struct hash_zone *zone, int old, int new) +{ + return (atomic_cmpxchg(&zone->timer_state, old, new) == old); +} + +/** + * return_hash_lock_to_pool() - (Re)initialize a hash lock and return it to its pool. + * @zone: The zone from which the lock was borrowed. + * @lock: The lock that is no longer in use. + */ +static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *lock) +{ + memset(lock, 0, sizeof(*lock)); + INIT_LIST_HEAD(&lock->pool_node); + INIT_LIST_HEAD(&lock->duplicate_ring); + vdo_initialize_wait_queue(&lock->waiters); + list_add_tail(&lock->pool_node, &zone->lock_pool); +} + +/** + * vdo_get_duplicate_lock() - Get the PBN lock on the duplicate data location for a data_vio from + * the hash_lock the data_vio holds (if there is one). + * @data_vio: The data_vio to query. + * + * Return: The PBN lock on the data_vio's duplicate location. + */ +struct pbn_lock *vdo_get_duplicate_lock(struct data_vio *data_vio) +{ + if (data_vio->hash_lock == NULL) + return NULL; + return data_vio->hash_lock->duplicate_lock; +} + +/** + * get_hash_lock_state_name() - Get the string representation of a hash lock state. + * @state: The hash lock state. + * + * Return: The short string representing the state + */ +static const char *get_hash_lock_state_name(enum hash_lock_state state) +{ + /* Catch if a state has been added without updating the name array. */ + STATIC_ASSERT((VDO_HASH_LOCK_BYPASSING + 1) == ARRAY_SIZE(LOCK_STATE_NAMES)); + return (state < ARRAY_SIZE(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : "INVALID"; +} + +/** + * assert_hash_lock_agent() - Assert that a data_vio is the agent of its hash lock, and that this + * is being called in the hash zone. + * @data_vio: The data_vio expected to be the lock agent. + * @where: A string describing the function making the assertion. + */ +static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where) +{ + /* Not safe to access the agent field except from the hash zone. */ + assert_data_vio_in_hash_zone(data_vio); + ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent, + "%s must be for the hash lock agent", where); +} + +/** + * set_duplicate_lock() - Set the duplicate lock held by a hash lock. May only be called in the + * physical zone of the PBN lock. + * @hash_lock: The hash lock to update. + * @pbn_lock: The PBN read lock to use as the duplicate lock. + */ +static void set_duplicate_lock(struct hash_lock *hash_lock, struct pbn_lock *pbn_lock) +{ + ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL), + "hash lock must not already hold a duplicate lock"); + + pbn_lock->holder_count += 1; + hash_lock->duplicate_lock = pbn_lock; +} + +/** + * dequeue_lock_waiter() - Remove the first data_vio from the lock's wait queue and return it. + * @lock: The lock containing the wait queue. + * + * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty. + */ +static inline struct data_vio *dequeue_lock_waiter(struct hash_lock *lock) +{ + return waiter_as_data_vio(vdo_dequeue_next_waiter(&lock->waiters)); +} + +/** + * set_hash_lock() - Set, change, or clear the hash lock a data_vio is using. + * @data_vio: The data_vio to update. + * @new_lock: The hash lock the data_vio is joining. + * + * Updates the hash lock (or locks) to reflect the change in membership. + */ +static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) +{ + struct hash_lock *old_lock = data_vio->hash_lock; + + if (old_lock != NULL) { + ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, + "must have a hash zone when holding a hash lock"); + ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), + "must be on a hash lock ring when holding a hash lock"); + ASSERT_LOG_ONLY(old_lock->reference_count > 0, + "hash lock reference must be counted"); + + if ((old_lock->state != VDO_HASH_LOCK_BYPASSING) && + (old_lock->state != VDO_HASH_LOCK_UNLOCKING)) + /* + * If the reference count goes to zero in a non-terminal state, we're most + * likely leaking this lock. + */ + ASSERT_LOG_ONLY(old_lock->reference_count > 1, + "hash locks should only become unreferenced in a terminal state, not state %s", + get_hash_lock_state_name(old_lock->state)); + + list_del_init(&data_vio->hash_lock_entry); + old_lock->reference_count -= 1; + + data_vio->hash_lock = NULL; + } + + if (new_lock != NULL) { + /* + * Keep all data_vios sharing the lock on a ring since they can complete in any + * order and we'll always need a pointer to one to compare data. + */ + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); + new_lock->reference_count += 1; + if (new_lock->max_references < new_lock->reference_count) + new_lock->max_references = new_lock->reference_count; + + data_vio->hash_lock = new_lock; + } +} + +/* There are loops in the state diagram, so some forward decl's are needed. */ +static void start_deduping(struct hash_lock *lock, struct data_vio *agent, bool agent_is_done); +static void start_locking(struct hash_lock *lock, struct data_vio *agent); +static void start_writing(struct hash_lock *lock, struct data_vio *agent); +static void unlock_duplicate_pbn(struct vdo_completion *completion); +static void transfer_allocation_lock(struct data_vio *data_vio); + +/** + * exit_hash_lock() - Bottleneck for data_vios that have written or deduplicated and that are no + * longer needed to be an agent for the hash lock. + * @data_vio: The data_vio to complete and send to be cleaned up. + */ +static void exit_hash_lock(struct data_vio *data_vio) +{ + /* Release the hash lock now, saving a thread transition in cleanup. */ + vdo_release_hash_lock(data_vio); + + /* Complete the data_vio and start the clean-up path to release any locks it still holds. */ + data_vio->vio.completion.callback = complete_data_vio; + + continue_data_vio(data_vio); +} + +/** + * set_duplicate_location() - Set the location of the duplicate block for data_vio, updating the + * is_duplicate and duplicate fields from a zoned_pbn. + * @data_vio: The data_vio to modify. + * @source: The location of the duplicate. + */ +static void set_duplicate_location(struct data_vio *data_vio, const struct zoned_pbn source) +{ + data_vio->is_duplicate = (source.pbn != VDO_ZERO_BLOCK); + data_vio->duplicate = source; +} + +/** + * retire_lock_agent() - Retire the active lock agent, replacing it with the first lock waiter, and + * make the retired agent exit the hash lock. + * @lock: The hash lock to update. + * + * Return: The new lock agent (which will be NULL if there was no waiter) + */ +static struct data_vio *retire_lock_agent(struct hash_lock *lock) +{ + struct data_vio *old_agent = lock->agent; + struct data_vio *new_agent = dequeue_lock_waiter(lock); + + lock->agent = new_agent; + exit_hash_lock(old_agent); + if (new_agent != NULL) + set_duplicate_location(new_agent, lock->duplicate); + return new_agent; +} + +/** + * wait_on_hash_lock() - Add a data_vio to the lock's queue of waiters. + * @lock: The hash lock on which to wait. + * @data_vio: The data_vio to add to the queue. + */ +static void wait_on_hash_lock(struct hash_lock *lock, struct data_vio *data_vio) +{ + vdo_enqueue_waiter(&lock->waiters, &data_vio->waiter); + + /* + * Make sure the agent doesn't block indefinitely in the packer since it now has at least + * one other data_vio waiting on it. + */ + if ((lock->state != VDO_HASH_LOCK_WRITING) || !cancel_data_vio_compression(lock->agent)) + return; + + /* + * Even though we're waiting, we also have to send ourselves as a one-way message to the + * packer to ensure the agent continues executing. This is safe because + * cancel_vio_compression() guarantees the agent won't continue executing until this + * message arrives in the packer, and because the wait queue link isn't used for sending + * the message. + */ + data_vio->compression.lock_holder = lock->agent; + launch_data_vio_packer_callback(data_vio, vdo_remove_lock_holder_from_packer); +} + +/** + * abort_waiter() - waiter_callback function that shunts waiters to write their blocks without + * optimization. + * @waiter: The data_vio's waiter link. + * @context: Not used. + */ +static void abort_waiter(struct waiter *waiter, void *context __always_unused) +{ + write_data_vio(waiter_as_data_vio(waiter)); +} + +/** + * start_bypassing() - Stop using the hash lock. + * @lock: The hash lock. + * @agent: The data_vio acting as the agent for the lock. + * + * Stops using the hash lock. This is the final transition for hash locks which did not get an + * error. + */ +static void start_bypassing(struct hash_lock *lock, struct data_vio *agent) +{ + lock->state = VDO_HASH_LOCK_BYPASSING; + exit_hash_lock(agent); +} + +void vdo_clean_failed_hash_lock(struct data_vio *data_vio) +{ + struct hash_lock *lock = data_vio->hash_lock; + + if (lock->state == VDO_HASH_LOCK_BYPASSING) { + exit_hash_lock(data_vio); + return; + } + + if (lock->agent == NULL) { + lock->agent = data_vio; + } else if (data_vio != lock->agent) { + exit_hash_lock(data_vio); + return; + } + + lock->state = VDO_HASH_LOCK_BYPASSING; + + /* Ensure we don't attempt to update advice when cleaning up. */ + lock->update_advice = false; + + vdo_notify_all_waiters(&lock->waiters, abort_waiter, NULL); + + if (lock->duplicate_lock != NULL) { + /* The agent must reference the duplicate zone to launch it. */ + data_vio->duplicate = lock->duplicate; + launch_data_vio_duplicate_zone_callback(data_vio, unlock_duplicate_pbn); + return; + } + + lock->agent = NULL; + data_vio->is_duplicate = false; + exit_hash_lock(data_vio); +} + +/** + * finish_unlocking() - Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate. + * @completion: The completion of the data_vio acting as the lock's agent. + * + * This continuation is registered in unlock_duplicate_pbn(). + */ +static void finish_unlocking(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct hash_lock *lock = agent->hash_lock; + + assert_hash_lock_agent(agent, __func__); + + ASSERT_LOG_ONLY(lock->duplicate_lock == NULL, + "must have released the duplicate lock for the hash lock"); + + if (!lock->verified) { + /* + * UNLOCKING -> WRITING transition: The lock we released was on an unverified + * block, so it must have been a lock on advice we were verifying, not on a + * location that was used for deduplication. Go write (or compress) the block to + * get a location to dedupe against. + */ + start_writing(lock, agent); + return; + } + + /* + * With the lock released, the verified duplicate block may already have changed and will + * need to be re-verified if a waiter arrived. + */ + lock->verified = false; + + if (vdo_has_waiters(&lock->waiters)) { + /* + * UNLOCKING -> LOCKING transition: A new data_vio entered the hash lock while the + * agent was releasing the PBN lock. The current agent exits and the waiter has to + * re-lock and re-verify the duplicate location. + * + * TODO: If we used the current agent to re-acquire the PBN lock we wouldn't need + * to re-verify. + */ + agent = retire_lock_agent(lock); + start_locking(lock, agent); + return; + } + + /* + * UNLOCKING -> BYPASSING transition: The agent is done with the lock and no other + * data_vios reference it, so remove it from the lock map and return it to the pool. + */ + start_bypassing(lock, agent); +} + +/** + * unlock_duplicate_pbn() - Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. + * @completion: The completion of the data_vio acting as the lock's agent. + * + * This continuation is launched by start_unlocking(), and calls back to finish_unlocking() on the + * hash zone thread. + */ +static void unlock_duplicate_pbn(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct hash_lock *lock = agent->hash_lock; + + assert_data_vio_in_duplicate_zone(agent); + ASSERT_LOG_ONLY(lock->duplicate_lock != NULL, "must have a duplicate lock to release"); + + vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, + agent->duplicate.pbn, + UDS_FORGET(lock->duplicate_lock)); + if (lock->state == VDO_HASH_LOCK_BYPASSING) { + complete_data_vio(completion); + return; + } + + launch_data_vio_hash_zone_callback(agent, finish_unlocking); +} + +/** + * start_unlocking() - Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. + * @lock: The hash lock. + * @agent: The data_vio currently acting as the agent for the lock. + */ +static void start_unlocking(struct hash_lock *lock, struct data_vio *agent) +{ + lock->state = VDO_HASH_LOCK_UNLOCKING; + launch_data_vio_duplicate_zone_callback(agent, unlock_duplicate_pbn); +} + +static void release_context(struct dedupe_context *context) +{ + struct hash_zone *zone = context->zone; + + WRITE_ONCE(zone->active, zone->active - 1); + list_move(&context->list_entry, &zone->available); +} + +static void process_update_result(struct data_vio *agent) +{ + struct dedupe_context *context = agent->dedupe_context; + + if ((context == NULL) || + !change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) + return; + + release_context(context); +} + +/** + * finish_updating() - Process the result of a UDS update performed by the agent for the lock. + * @completion: The completion of the data_vio that performed the update + * + * This continuation is registered in start_querying(). + */ +static void finish_updating(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct hash_lock *lock = agent->hash_lock; + + assert_hash_lock_agent(agent, __func__); + + process_update_result(agent); + + /* + * UDS was updated successfully, so don't update again unless the duplicate location + * changes due to rollover. + */ + lock->update_advice = false; + + if (vdo_has_waiters(&lock->waiters)) { + /* + * UPDATING -> DEDUPING transition: A new data_vio arrived during the UDS update. + * Send it on the verified dedupe path. The agent is done with the lock, but the + * lock may still need to use it to clean up after rollover. + */ + start_deduping(lock, agent, true); + return; + } + + if (lock->duplicate_lock != NULL) { + /* + * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we hold a + * duplicate PBN lock, so go release it. + */ + start_unlocking(lock, agent); + return; + } + + /* + * UPDATING -> BYPASSING transition: No one is waiting to dedupe and there's no lock to + * release. + */ + start_bypassing(lock, agent); +} + +static void query_index(struct data_vio *data_vio, enum uds_request_type operation); + +/** + * start_updating() - Continue deduplication with the last step, updating UDS with the location of + * the duplicate that should be returned as advice in the future. + * @lock: The hash lock. + * @agent: The data_vio currently acting as the agent for the lock. + */ +static void start_updating(struct hash_lock *lock, struct data_vio *agent) +{ + lock->state = VDO_HASH_LOCK_UPDATING; + + ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified"); + ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed"); + + agent->last_async_operation = VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX; + set_data_vio_hash_zone_callback(agent, finish_updating); + query_index(agent, UDS_UPDATE); +} + +/** + * finish_deduping() - Handle a data_vio that has finished deduplicating against the block locked + * by the hash lock. + * @lock: The hash lock. + * @data_vio: The lock holder that has finished deduplicating. + * + * If there are other data_vios still sharing the lock, this will just release the data_vio's share + * of the lock and finish processing the data_vio. If this is the last data_vio holding the lock, + * this makes the data_vio the lock agent and uses it to advance the state of the lock so it can + * eventually be released. + */ +static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio) +{ + struct data_vio *agent = data_vio; + + ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING"); + ASSERT_LOG_ONLY(!vdo_has_waiters(&lock->waiters), + "shouldn't have any lock waiters in DEDUPING"); + + /* Just release the lock reference if other data_vios are still deduping. */ + if (lock->reference_count > 1) { + exit_hash_lock(data_vio); + return; + } + + /* The hash lock must have an agent for all other lock states. */ + lock->agent = agent; + if (lock->update_advice) + /* + * DEDUPING -> UPDATING transition: The location of the duplicate block changed + * since the initial UDS query because of compression, rollover, or because the + * query agent didn't have an allocation. The UDS update was delayed in case there + * was another change in location, but with only this data_vio using the hash lock, + * it's time to update the advice. + */ + start_updating(lock, agent); + else + /* + * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the duplicate + * location so the hash lock itself can be released (contingent on no new data_vios + * arriving in the lock before the agent returns). + */ + start_unlocking(lock, agent); +} + +/** + * acquire_lock() - Get the lock for a record name. + * @zone: The zone responsible for the hash. + * @hash: The hash to lock. + * @replace_lock: If non-NULL, the lock already registered for the hash which should be replaced by + * the new lock. + * @lock_ptr: A pointer to receive the hash lock. + * + * Gets the lock for the hash (record name) of the data in a data_vio, or if one does not exist (or + * if we are explicitly rolling over), initialize a new lock for the hash and register it in the + * zone. This must only be called in the correct thread for the zone. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check acquire_lock(struct hash_zone *zone, + const struct uds_record_name *hash, + struct hash_lock *replace_lock, + struct hash_lock **lock_ptr) +{ + struct hash_lock *lock, *new_lock; + int result; + + /* + * Borrow and prepare a lock from the pool so we don't have to do two pointer_map accesses + * in the common case of no lock contention. + */ + result = ASSERT(!list_empty(&zone->lock_pool), "never need to wait for a free hash lock"); + if (result != VDO_SUCCESS) + return result; + + new_lock = list_entry(zone->lock_pool.prev, struct hash_lock, pool_node); + list_del_init(&new_lock->pool_node); + + /* + * Fill in the hash of the new lock so we can map it, since we have to use the hash as the + * map key. + */ + new_lock->hash = *hash; + + result = vdo_pointer_map_put(zone->hash_lock_map, + &new_lock->hash, + new_lock, + (replace_lock != NULL), + (void **) &lock); + if (result != VDO_SUCCESS) { + return_hash_lock_to_pool(zone, UDS_FORGET(new_lock)); + return result; + } + + if (replace_lock != NULL) { + /* On mismatch put the old lock back and return a severe error */ + ASSERT_LOG_ONLY(lock == replace_lock, "old lock must have been in the lock map"); + /* TODO: Check earlier and bail out? */ + ASSERT_LOG_ONLY(replace_lock->registered, + "old lock must have been marked registered"); + replace_lock->registered = false; + } + + if (lock == replace_lock) { + lock = new_lock; + lock->registered = true; + } else { + /* There's already a lock for the hash, so we don't need the borrowed lock. */ + return_hash_lock_to_pool(zone, UDS_FORGET(new_lock)); + } + + *lock_ptr = lock; + return VDO_SUCCESS; +} + +/** + * enter_forked_lock() - Bind the data_vio to a new hash lock. + * + * Implements waiter_callback. Binds the data_vio that was waiting to a new hash lock and waits on + * that lock. + */ +static void enter_forked_lock(struct waiter *waiter, void *context) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + struct hash_lock *new_lock = (struct hash_lock *) context; + + set_hash_lock(data_vio, new_lock); + wait_on_hash_lock(new_lock, data_vio); +} + +/** + * fork_hash_lock() - Fork a hash lock because it has run out of increments on the duplicate PBN. + * @old_lock: The hash lock to fork. + * @new_agent: The data_vio that will be the agent for the new lock. + * + * Transfers the new agent and any lock waiters to a new hash lock instance which takes the place + * of the old lock in the lock map. The old lock remains active, but will not update advice. + */ +static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agent) +{ + struct hash_lock *new_lock; + int result; + + result = acquire_lock(new_agent->hash_zone, &new_agent->record_name, old_lock, &new_lock); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(new_agent, result); + return; + } + + /* + * Only one of the two locks should update UDS. The old lock is out of references, so it + * would be poor dedupe advice in the short term. + */ + old_lock->update_advice = false; + new_lock->update_advice = true; + + set_hash_lock(new_agent, new_lock); + new_lock->agent = new_agent; + + vdo_notify_all_waiters(&old_lock->waiters, enter_forked_lock, new_lock); + + new_agent->is_duplicate = false; + start_writing(new_lock, new_agent); +} + +/** + * launch_dedupe() - Reserve a reference count increment for a data_vio and launch it on the dedupe + * path. + * @lock: The hash lock. + * @data_vio: The data_vio to deduplicate using the hash lock. + * @has_claim: true if the data_vio already has claimed an increment from the duplicate lock. + * + * If no increments are available, this will roll over to a new hash lock and launch the data_vio + * as the writing agent for that lock. + */ +static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio, bool has_claim) +{ + if (!has_claim && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) { + /* Out of increments, so must roll over to a new lock. */ + fork_hash_lock(lock, data_vio); + return; + } + + /* Deduplicate against the lock's verified location. */ + set_duplicate_location(data_vio, lock->duplicate); + data_vio->new_mapped = data_vio->duplicate; + update_metadata_for_data_vio_write(data_vio, lock->duplicate_lock); +} + +/** + * start_deduping() - Enter the hash lock state where data_vios deduplicate in parallel against a + * true copy of their data on disk. + * @lock: The hash lock. + * @agent: The data_vio acting as the agent for the lock. + * @agent_is_done: true only if the agent has already written or deduplicated against its data. + * + * If the agent itself needs to deduplicate, an increment for it must already have been claimed + * from the duplicate lock, ensuring the hash lock will still have a data_vio holding it. + */ +static void start_deduping(struct hash_lock *lock, struct data_vio *agent, bool agent_is_done) +{ + lock->state = VDO_HASH_LOCK_DEDUPING; + + /* + * We don't take the downgraded allocation lock from the agent unless we actually need to + * deduplicate against it. + */ + if (lock->duplicate_lock == NULL) { + ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state), + "compression must have shared a lock"); + ASSERT_LOG_ONLY(agent_is_done, "agent must have written the new duplicate"); + transfer_allocation_lock(agent); + } + + ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock), + "duplicate_lock must be a PBN read lock"); + + /* + * This state is not like any of the other states. There is no designated agent--the agent + * transitioning to this state and all the waiters will be launched to deduplicate in + * parallel. + */ + lock->agent = NULL; + + /* + * Launch the agent (if not already deduplicated) and as many lock waiters as we have + * available increments for on the dedupe path. If we run out of increments, rollover will + * be triggered and the remaining waiters will be transferred to the new lock. + */ + if (!agent_is_done) { + launch_dedupe(lock, agent, true); + agent = NULL; + } + while (vdo_has_waiters(&lock->waiters)) + launch_dedupe(lock, dequeue_lock_waiter(lock), false); + + if (agent_is_done) + /* + * In the degenerate case where all the waiters rolled over to a new lock, this + * will continue to use the old agent to clean up this lock, and otherwise it just + * lets the agent exit the lock. + */ + finish_deduping(lock, agent); +} + +/** + * increment_stat() - Increment a statistic counter in a non-atomic yet thread-safe manner. + * @stat: The statistic field to increment. + */ +static void increment_stat(u64 *stat) +{ + /* + * Must only be mutated on the hash zone thread. Prevents any compiler shenanigans from + * affecting other threads reading stats. + */ + WRITE_ONCE(*stat, *stat + 1); +} + +/** + * finish_verifying() - Handle the result of the agent for the lock comparing its data to the + * duplicate candidate. + * @completion: The completion of the data_vio used to verify dedupe + * + * This continuation is registered in start_verifying(). + */ +static void finish_verifying(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct hash_lock *lock = agent->hash_lock; + + assert_hash_lock_agent(agent, __func__); + + lock->verified = agent->is_duplicate; + + /* + * Only count the result of the initial verification of the advice as valid or stale, and + * not any re-verifications due to PBN lock releases. + */ + if (!lock->verify_counted) { + lock->verify_counted = true; + if (lock->verified) + increment_stat(&agent->hash_zone->statistics.dedupe_advice_valid); + else + increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale); + } + + /* + * Even if the block is a verified duplicate, we can't start to deduplicate unless we can + * claim a reference count increment for the agent. + */ + if (lock->verified && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) { + agent->is_duplicate = false; + lock->verified = false; + } + + if (lock->verified) { + /* + * VERIFYING -> DEDUPING transition: The advice is for a true duplicate, so start + * deduplicating against it, if references are available. + */ + start_deduping(lock, agent, false); + } else { + /* + * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try to + * dedupe and roll over immediately, which would fail because it would leave the + * lock without an agent to release the PBN lock. In both cases, the data will have + * to be written or compressed, but first the advice PBN must be unlocked by the + * VERIFYING agent. + */ + lock->update_advice = true; + start_unlocking(lock, agent); + } +} + +static bool blocks_equal(char *block1, char *block2) +{ + int i; + + + for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) + if (*((u64 *) &block1[i]) != *((u64 *) &block2[i])) + return false; + + return true; +} + +static void verify_callback(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + + agent->is_duplicate = blocks_equal(agent->vio.data, agent->scratch_block); + launch_data_vio_hash_zone_callback(agent, finish_verifying); +} + +static void uncompress_and_verify(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + int result; + + result = uncompress_data_vio(agent, agent->duplicate.state, agent->scratch_block); + if (result == VDO_SUCCESS) { + verify_callback(completion); + return; + } + + agent->is_duplicate = false; + launch_data_vio_hash_zone_callback(agent, finish_verifying); +} + +static void verify_endio(struct bio *bio) +{ + struct data_vio *agent = vio_as_data_vio(bio->bi_private); + int result = blk_status_to_errno(bio->bi_status); + + vdo_count_completed_bios(bio); + if (result != VDO_SUCCESS) { + agent->is_duplicate = false; + launch_data_vio_hash_zone_callback(agent, finish_verifying); + return; + } + + if (vdo_is_state_compressed(agent->duplicate.state)) { + launch_data_vio_cpu_callback(agent, + uncompress_and_verify, + CPU_Q_COMPRESS_BLOCK_PRIORITY); + return; + } + + launch_data_vio_cpu_callback(agent, verify_callback, CPU_Q_COMPLETE_READ_PRIORITY); +} + +/** + * start_verifying() - Begin the data verification phase. + * @lock: The hash lock (must be LOCKING). + * @agent: The data_vio to use to read and compare candidate data. + * + * Continue the deduplication path for a hash lock by using the agent to read (and possibly + * decompress) the data at the candidate duplicate location, comparing it to the data in the agent + * to verify that the candidate is identical to all the data_vios sharing the hash. If so, it can + * be deduplicated against, otherwise a data_vio allocation will have to be written to and used for + * dedupe. + */ +static void start_verifying(struct hash_lock *lock, struct data_vio *agent) +{ + int result; + struct vio *vio = &agent->vio; + char *buffer = (vdo_is_state_compressed(agent->duplicate.state) ? + (char *) agent->compression.block : + agent->scratch_block); + + lock->state = VDO_HASH_LOCK_VERIFYING; + ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once"); + + agent->last_async_operation = VIO_ASYNC_OP_VERIFY_DUPLICATION; + result = vio_reset_bio(vio, buffer, verify_endio, REQ_OP_READ, agent->duplicate.pbn); + if (result != VDO_SUCCESS) { + set_data_vio_hash_zone_callback(agent, finish_verifying); + continue_data_vio_with_error(agent, result); + return; + } + + set_data_vio_bio_zone_callback(agent, process_vio_io); + vdo_launch_completion_with_priority(&vio->completion, BIO_Q_VERIFY_PRIORITY); +} + +/** + * finish_locking() - Handle the result of the agent for the lock attempting to obtain a PBN read + * lock on the candidate duplicate block. + * @completion: The completion of the data_vio that attempted to get the read lock. + * + * This continuation is registered in lock_duplicate_pbn(). + */ +static void finish_locking(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct hash_lock *lock = agent->hash_lock; + + assert_hash_lock_agent(agent, __func__); + + if (!agent->is_duplicate) { + ASSERT_LOG_ONLY(lock->duplicate_lock == NULL, + "must not hold duplicate_lock if not flagged as a duplicate"); + /* + * LOCKING -> WRITING transition: The advice block is being modified or has no + * available references, so try to write or compress the data, remembering to + * update UDS later with the new advice. + */ + increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale); + lock->update_advice = true; + start_writing(lock, agent); + return; + } + + ASSERT_LOG_ONLY(lock->duplicate_lock != NULL, + "must hold duplicate_lock if flagged as a duplicate"); + + if (!lock->verified) { + /* + * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, reading + * the candidate duplicate and comparing it to the agent's data to decide whether + * it is a true duplicate or stale advice. + */ + start_verifying(lock, agent); + return; + } + + if (!vdo_claim_pbn_lock_increment(lock->duplicate_lock)) { + /* + * LOCKING -> UNLOCKING transition: The verified block was re-locked, but has no + * available increments left. Must first release the useless PBN read lock before + * rolling over to a new copy of the block. + */ + agent->is_duplicate = false; + lock->verified = false; + lock->update_advice = true; + start_unlocking(lock, agent); + return; + } + + /* + * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, deduplicating + * against a location that was previously verified or written to. + */ + start_deduping(lock, agent, false); +} + +static bool acquire_provisional_reference(struct data_vio *agent, + struct pbn_lock *lock, + struct slab_depot *depot) +{ + /* Ensure that the newly-locked block is referenced. */ + struct vdo_slab *slab = vdo_get_slab(depot, agent->duplicate.pbn); + int result = vdo_acquire_provisional_reference(slab, agent->duplicate.pbn, lock); + + if (result == VDO_SUCCESS) + return true; + + uds_log_warning_strerror(result, + "Error acquiring provisional reference for dedupe candidate; aborting dedupe"); + agent->is_duplicate = false; + vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn, lock); + continue_data_vio_with_error(agent, result); + return false; +} + +/** + * lock_duplicate_pbn() - Acquire a read lock on the PBN of the block containing candidate + * duplicate data (compressed or uncompressed). + * @completion: The completion of the data_vio attempting to acquire the physical block lock on + * behalf of its hash lock. + * + * If the PBN is already locked for writing, the lock attempt is abandoned and is_duplicate will be + * cleared before calling back. this continuation is launched from start_locking(), and calls back + * to finish_locking() on the hash zone thread. + */ +static void lock_duplicate_pbn(struct vdo_completion *completion) +{ + unsigned int increment_limit; + struct pbn_lock *lock; + int result; + + struct data_vio *agent = as_data_vio(completion); + struct slab_depot *depot = vdo_from_data_vio(agent)->depot; + struct physical_zone *zone = agent->duplicate.zone; + + assert_data_vio_in_duplicate_zone(agent); + + set_data_vio_hash_zone_callback(agent, finish_locking); + + /* + * While in the zone that owns it, find out how many additional references can be made to + * the block if it turns out to truly be a duplicate. + */ + increment_limit = vdo_get_increment_limit(depot, agent->duplicate.pbn); + if (increment_limit == 0) { + /* + * We could deduplicate against it later if a reference happened to be released + * during verification, but it's probably better to bail out now. + */ + agent->is_duplicate = false; + continue_data_vio(agent); + return; + } + + result = vdo_attempt_physical_zone_pbn_lock(zone, + agent->duplicate.pbn, + VIO_READ_LOCK, + &lock); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(agent, result); + return; + } + + if (!vdo_is_pbn_read_lock(lock)) { + /* + * There are three cases of write locks: uncompressed data block writes, compressed + * (packed) block writes, and block map page writes. In all three cases, we give up + * on trying to verify the advice and don't bother to try deduplicate against the + * data in the write lock holder. + * + * 1) We don't ever want to try to deduplicate against a block map page. + * + * 2a) It's very unlikely we'd deduplicate against an entire packed block, both + * because of the chance of matching it, and because we don't record advice for it, + * but for the uncompressed representation of all the fragments it contains. The + * only way we'd be getting lock contention is if we've written the same + * representation coincidentally before, had it become unreferenced, and it just + * happened to be packed together from compressed writes when we go to verify the + * lucky advice. Giving up is a minuscule loss of potential dedupe. + * + * 2b) If the advice is for a slot of a compressed block, it's about to get + * smashed, and the write smashing it cannot contain our data--it would have to be + * writing on behalf of our hash lock, but that's impossible since we're the lock + * agent. + * + * 3a) If the lock is held by a data_vio with different data, the advice is already + * stale or is about to become stale. + * + * 3b) If the lock is held by a data_vio that matches us, we may as well either + * write it ourselves (or reference the copy we already wrote) instead of + * potentially having many duplicates wait for the lock holder to write, journal, + * hash, and finally arrive in the hash lock. We lose a chance to avoid a UDS + * update in the very rare case of advice for a free block that just happened to be + * allocated to a data_vio with the same hash. There's also a chance to save on a + * block write, at the cost of a block verify. Saving on a full block compare in + * all stale advice cases almost certainly outweighs saving a UDS update and + * trading a write for a read in a lucky case where advice would have been saved + * from becoming stale. + */ + agent->is_duplicate = false; + continue_data_vio(agent); + return; + } + + if (lock->holder_count == 0) { + if (!acquire_provisional_reference(agent, lock, depot)) + return; + + /* + * The increment limit we grabbed earlier is still valid. The lock now holds the + * rights to acquire all those references. Those rights will be claimed by hash + * locks sharing this read lock. + */ + lock->increment_limit = increment_limit; + } + + /* + * We've successfully acquired a read lock on behalf of the hash lock, so mark it as such. + */ + set_duplicate_lock(agent->hash_lock, lock); + + /* + * TODO: Optimization: We could directly launch the block verify, then switch to a hash + * thread. + */ + continue_data_vio(agent); +} + +/** + * start_locking() - Continue deduplication for a hash lock that has obtained valid advice of a + * potential duplicate through its agent. + * @lock: The hash lock (currently must be QUERYING). + * @agent: The data_vio bearing the dedupe advice. + */ +static void start_locking(struct hash_lock *lock, struct data_vio *agent) +{ + ASSERT_LOG_ONLY(lock->duplicate_lock == NULL, + "must not acquire a duplicate lock when already holding it"); + + lock->state = VDO_HASH_LOCK_LOCKING; + + /* + * TODO: Optimization: If we arrange to continue on the duplicate zone thread when + * accepting the advice, and don't explicitly change lock states (or use an agent-local + * state, or an atomic), we can avoid a thread transition here. + */ + agent->last_async_operation = VIO_ASYNC_OP_LOCK_DUPLICATE_PBN; + launch_data_vio_duplicate_zone_callback(agent, lock_duplicate_pbn); +} + +/** + * finish_writing() - Re-entry point for the lock agent after it has finished writing or + * compressing its copy of the data block. + * @lock: The hash lock, which must be in state WRITING. + * @agent: The data_vio that wrote its data for the lock. + * + * The agent will never need to dedupe against anything, so it's done with the lock, but the lock + * may not be finished with it, as a UDS update might still be needed. + * + * If there are other lock holders, the agent will hand the job to one of them and exit, leaving + * the lock to deduplicate against the just-written block. If there are no other lock holders, the + * agent either exits (and later tears down the hash lock), or it remains the agent and updates + * UDS. + */ +static void finish_writing(struct hash_lock *lock, struct data_vio *agent) +{ + /* + * Dedupe against the data block or compressed block slot the agent wrote. Since we know + * the write succeeded, there's no need to verify it. + */ + lock->duplicate = agent->new_mapped; + lock->verified = true; + + if (vdo_is_state_compressed(lock->duplicate.state) && + lock->registered) + /* + * Compression means the location we gave in the UDS query is not the location + * we're using to deduplicate. + */ + lock->update_advice = true; + + /* If there are any waiters, we need to start deduping them. */ + if (vdo_has_waiters(&lock->waiters)) { + /* + * WRITING -> DEDUPING transition: an asynchronously-written block failed to + * compress, so the PBN lock on the written copy was already transferred. The agent + * is done with the lock, but the lock may still need to use it to clean up after + * rollover. + */ + start_deduping(lock, agent, true); + return; + } + + /* + * There are no waiters and the agent has successfully written, so take a step towards + * being able to release the hash lock (or just release it). + */ + if (lock->update_advice) { + /* + * WRITING -> UPDATING transition: There's no waiter and a UDS update is needed, so + * retain the WRITING agent and use it to launch the update. The happens on + * compression, rollover, or the QUERYING agent not having an allocation. + */ + start_updating(lock, agent); + } else if (lock->duplicate_lock != NULL) { + /* + * WRITING -> UNLOCKING transition: There's no waiter and no update needed, but the + * compressed write gave us a shared duplicate lock that we must release. + */ + set_duplicate_location(agent, lock->duplicate); + start_unlocking(lock, agent); + } else { + /* + * WRITING -> BYPASSING transition: There's no waiter, no update needed, and no + * duplicate lock held, so both the agent and lock have no more work to do. The + * agent will release its allocation lock in cleanup. + */ + start_bypassing(lock, agent); + } +} + +/** + * select_writing_agent() - Search through the lock waiters for a data_vio that has an allocation. + * @lock: The hash lock to modify. + * + * If an allocation is found, swap agents, put the old agent at the head of the wait queue, then + * return the new agent. Otherwise, just return the current agent. + */ +static struct data_vio *select_writing_agent(struct hash_lock *lock) +{ + struct wait_queue temp_queue; + struct data_vio *data_vio; + + vdo_initialize_wait_queue(&temp_queue); + + /* + * Move waiters to the temp queue one-by-one until we find an allocation. Not ideal to + * search, but it only happens when nearly out of space. + */ + while (((data_vio = dequeue_lock_waiter(lock)) != NULL) && + !data_vio_has_allocation(data_vio)) { + /* Use the lower-level enqueue since we're just moving waiters around. */ + vdo_enqueue_waiter(&temp_queue, &data_vio->waiter); + } + + if (data_vio != NULL) { + /* + * Move the rest of the waiters over to the temp queue, preserving the order they + * arrived at the lock. + */ + vdo_transfer_all_waiters(&lock->waiters, &temp_queue); + + /* + * The current agent is being replaced and will have to wait to dedupe; make it the + * first waiter since it was the first to reach the lock. + */ + vdo_enqueue_waiter(&lock->waiters, &lock->agent->waiter); + lock->agent = data_vio; + } else { + /* No one has an allocation, so keep the current agent. */ + data_vio = lock->agent; + } + + /* Swap all the waiters back onto the lock's queue. */ + vdo_transfer_all_waiters(&temp_queue, &lock->waiters); + return data_vio; +} + +/** + * start_writing() - Begin the non-duplicate write path. + * @lock: The hash lock (currently must be QUERYING). + * @agent: The data_vio currently acting as the agent for the lock. + * + * Begins the non-duplicate write path for a hash lock that had no advice, selecting a data_vio + * with an allocation as a new agent, if necessary, then resuming the agent on the data_vio write + * path. + */ +static void start_writing(struct hash_lock *lock, struct data_vio *agent) +{ + lock->state = VDO_HASH_LOCK_WRITING; + + /* + * The agent might not have received an allocation and so can't be used for writing, but + * it's entirely possible that one of the waiters did. + */ + if (!data_vio_has_allocation(agent)) { + agent = select_writing_agent(lock); + /* If none of the waiters had an allocation, the writes all have to fail. */ + if (!data_vio_has_allocation(agent)) { + /* + * TODO: Should we keep a variant of BYPASSING that causes new arrivals to + * fail immediately if they don't have an allocation? It might be possible + * that on some path there would be non-waiters still referencing the lock, + * so it would remain in the map as everything is currently spelled, even + * if the agent and all waiters release. + */ + continue_data_vio_with_error(agent, VDO_NO_SPACE); + return; + } + } + + /* + * If the agent compresses, it might wait indefinitely in the packer, which would be bad if + * there are any other data_vios waiting. + */ + if (vdo_has_waiters(&lock->waiters)) + cancel_data_vio_compression(agent); + + /* + * Send the agent to the compress/pack/write path in vioWrite. If it succeeds, it will + * return to the hash lock via vdo_continue_hash_lock() and call finish_writing(). + */ + launch_compress_data_vio(agent); +} + +/* + * Decode VDO duplicate advice from the old_metadata field of a UDS request. + * Returns true if valid advice was found and decoded + */ +static bool decode_uds_advice(struct dedupe_context *context) +{ + const struct uds_request *request = &context->request; + struct data_vio *data_vio = context->requestor; + size_t offset = 0; + const struct uds_record_data *encoding = &request->old_metadata; + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct zoned_pbn *advice = &data_vio->duplicate; + u8 version; + int result; + + if ((request->status != UDS_SUCCESS) || !request->found) + return false; + + version = encoding->data[offset++]; + if (version != UDS_ADVICE_VERSION) { + uds_log_error("invalid UDS advice version code %u", version); + return false; + } + + advice->state = encoding->data[offset++]; + advice->pbn = get_unaligned_le64(&encoding->data[offset]); + offset += sizeof(u64); + BUG_ON(offset != UDS_ADVICE_SIZE); + + /* Don't use advice that's clearly meaningless. */ + if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || (advice->pbn == VDO_ZERO_BLOCK)) { + uds_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. Giving up on deduplication of logical block %llu", + (unsigned long long) advice->pbn, + advice->state, + (unsigned long long) data_vio->logical.lbn); + atomic64_inc(&vdo->stats.invalid_advice_pbn_count); + return false; + } + + result = vdo_get_physical_zone(vdo, advice->pbn, &advice->zone); + if ((result != VDO_SUCCESS) || (advice->zone == NULL)) { + uds_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu", + (unsigned long long) advice->pbn, + (unsigned long long) data_vio->logical.lbn); + atomic64_inc(&vdo->stats.invalid_advice_pbn_count); + return false; + } + + return true; +} + +static void process_query_result(struct data_vio *agent) +{ + struct dedupe_context *context = agent->dedupe_context; + + if (context == NULL) + return; + + if (change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) { + agent->is_duplicate = decode_uds_advice(context); + release_context(context); + } +} + +/** + * finish_querying() - Process the result of a UDS query performed by the agent for the lock. + * @completion: The completion of the data_vio that performed the query. + * + * This continuation is registered in start_querying(). + */ +static void finish_querying(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct hash_lock *lock = agent->hash_lock; + + assert_hash_lock_agent(agent, __func__); + + process_query_result(agent); + + if (agent->is_duplicate) { + lock->duplicate = agent->duplicate; + /* + * QUERYING -> LOCKING transition: Valid advice was obtained from UDS. Use the + * QUERYING agent to start the hash lock on the unverified dedupe path, verifying + * that the advice can be used. + */ + start_locking(lock, agent); + } else { + /* + * The agent will be used as the duplicate if has an allocation; if it does, that + * location was posted to UDS, so no update will be needed. + */ + lock->update_advice = !data_vio_has_allocation(agent); + /* + * QUERYING -> WRITING transition: There was no advice or the advice wasn't valid, + * so try to write or compress the data. + */ + start_writing(lock, agent); + } +} + +/** + * start_querying() - Start deduplicatoin for a hash lock. + * @lock: The initialized hash lock. + * @data_vio: The data_vio that has just obtained the new lock. + * + * Starts deduplication for a hash lock that has finished initializing by making the data_vio that + * requested it the agent, entering the QUERYING state, and using the agent to perform the UDS + * query on behalf of the lock. + */ +static void start_querying(struct hash_lock *lock, struct data_vio *data_vio) +{ + lock->agent = data_vio; + lock->state = VDO_HASH_LOCK_QUERYING; + data_vio->last_async_operation = VIO_ASYNC_OP_CHECK_FOR_DUPLICATION; + set_data_vio_hash_zone_callback(data_vio, finish_querying); + query_index(data_vio, (data_vio_has_allocation(data_vio) ? UDS_POST : UDS_QUERY)); +} + +/** + * report_bogus_lock_state() - Complain that a data_vio has entered a hash_lock that is in an + * unimplemented or unusable state and continue the data_vio with an + * error. + * @lock: The hash lock. + * @data_vio: The data_vio attempting to enter the lock. + */ +static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *data_vio) +{ + ASSERT_LOG_ONLY(false, + "hash lock must not be in unimplemented state %s", + get_hash_lock_state_name(lock->state)); + continue_data_vio_with_error(data_vio, VDO_LOCK_ERROR); +} + +/** + * vdo_continue_hash_lock() - Continue the processing state after writing, compressing, or + * deduplicating. + * @data_vio: The data_vio to continue processing in its hash lock. + * + * Asynchronously continue processing a data_vio in its hash lock after it has finished writing, + * compressing, or deduplicating, so it can share the result with any data_vios waiting in the hash + * lock, or update the UDS index, or simply release its share of the lock. + * + * Context: This must only be called in the correct thread for the hash zone. + */ +void vdo_continue_hash_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct hash_lock *lock = data_vio->hash_lock; + + switch (lock->state) { + case VDO_HASH_LOCK_WRITING: + ASSERT_LOG_ONLY(data_vio == lock->agent, + "only the lock agent may continue the lock"); + finish_writing(lock, data_vio); + break; + + case VDO_HASH_LOCK_DEDUPING: + finish_deduping(lock, data_vio); + break; + + case VDO_HASH_LOCK_BYPASSING: + /* This data_vio has finished the write path and the lock doesn't need it. */ + exit_hash_lock(data_vio); + break; + + case VDO_HASH_LOCK_INITIALIZING: + case VDO_HASH_LOCK_QUERYING: + case VDO_HASH_LOCK_UPDATING: + case VDO_HASH_LOCK_LOCKING: + case VDO_HASH_LOCK_VERIFYING: + case VDO_HASH_LOCK_UNLOCKING: + /* A lock in this state should never be re-entered. */ + report_bogus_lock_state(lock, data_vio); + break; + + default: + report_bogus_lock_state(lock, data_vio); + } +} + +/** + * is_hash_collision() - Check to see if a hash collision has occurred. + * @lock: The lock to check. + * @candidate: The data_vio seeking to share the lock. + * + * Check whether the data in data_vios sharing a lock is different than in a data_vio seeking to + * share the lock, which should only be possible in the extremely unlikely case of a hash + * collision. + * + * Return: true if the given data_vio must not share the lock because it doesn't have the same data + * as the lock holders. + */ +static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate) +{ + struct data_vio *lock_holder; + struct hash_zone *zone; + bool collides; + + if (list_empty(&lock->duplicate_ring)) + return false; + + lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, hash_lock_entry); + zone = candidate->hash_zone; + collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); + if (collides) + increment_stat(&zone->statistics.concurrent_hash_collisions); + else + increment_stat(&zone->statistics.concurrent_data_matches); + + return collides; +} + +static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio) +{ + int result; + + /* FIXME: BUG_ON() and/or enter read-only mode? */ + result = ASSERT(data_vio->hash_lock == NULL, "must not already hold a hash lock"); + if (result != VDO_SUCCESS) + return result; + + result = ASSERT(list_empty(&data_vio->hash_lock_entry), + "must not already be a member of a hash lock ring"); + if (result != VDO_SUCCESS) + return result; + + return ASSERT(data_vio->recovery_sequence_number == 0, + "must not hold a recovery lock when getting a hash lock"); +} + +/** + * vdo_acquire_hash_lock() - Acquire or share a lock on a record name. + * @data_vio: The data_vio acquiring a lock on its record name. + * + * Acquire or share a lock on the hash (record name) of the data in a data_vio, updating the + * data_vio to reference the lock. This must only be called in the correct thread for the zone. In + * the unlikely case of a hash collision, this function will succeed, but the data_vio will not get + * a lock reference. + */ +void vdo_acquire_hash_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct hash_lock *lock; + int result; + + assert_data_vio_in_hash_zone(data_vio); + + result = assert_hash_lock_preconditions(data_vio); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + result = acquire_lock(data_vio->hash_zone, &data_vio->record_name, NULL, &lock); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + if (is_hash_collision(lock, data_vio)) { + /* + * Hash collisions are extremely unlikely, but the bogus dedupe would be a data + * corruption. Bypass optimization entirely. We can't compress a data_vio without + * a hash_lock as the compressed write depends on the hash_lock to manage the + * references for the compressed block. + */ + write_data_vio(data_vio); + return; + } + + set_hash_lock(data_vio, lock); + switch (lock->state) { + case VDO_HASH_LOCK_INITIALIZING: + start_querying(lock, data_vio); + return; + + case VDO_HASH_LOCK_QUERYING: + case VDO_HASH_LOCK_WRITING: + case VDO_HASH_LOCK_UPDATING: + case VDO_HASH_LOCK_LOCKING: + case VDO_HASH_LOCK_VERIFYING: + case VDO_HASH_LOCK_UNLOCKING: + /* The lock is busy, and can't be shared yet. */ + wait_on_hash_lock(lock, data_vio); + return; + + case VDO_HASH_LOCK_BYPASSING: + /* We can't use this lock, so bypass optimization entirely. */ + vdo_release_hash_lock(data_vio); + write_data_vio(data_vio); + return; + + case VDO_HASH_LOCK_DEDUPING: + launch_dedupe(lock, data_vio, false); + return; + + default: + /* A lock in this state should not be acquired by new VIOs. */ + report_bogus_lock_state(lock, data_vio); + } +} + +/** + * vdo_release_hash_lock() - Release a data_vio's share of a hash lock, if held, and null out the + * data_vio's reference to it. + * @data_vio: The data_vio releasing its hash lock. + * + * If the data_vio is the only one holding the lock, this also releases any resources or locks used + * by the hash lock (such as a PBN read lock on a block containing data with the same hash) and + * returns the lock to the hash zone's lock pool. + * + * Context: This must only be called in the correct thread for the hash zone. + */ +void vdo_release_hash_lock(struct data_vio *data_vio) +{ + struct hash_lock *lock = data_vio->hash_lock; + struct hash_zone *zone = data_vio->hash_zone; + + if (lock == NULL) + return; + + set_hash_lock(data_vio, NULL); + + if (lock->reference_count > 0) + /* The lock is still in use by other data_vios. */ + return; + + if (lock->registered) { + struct hash_lock *removed; + + removed = vdo_pointer_map_remove(zone->hash_lock_map, &lock->hash); + ASSERT_LOG_ONLY(lock == removed, "hash lock being released must have been mapped"); + } else { + ASSERT_LOG_ONLY(lock != vdo_pointer_map_get(zone->hash_lock_map, &lock->hash), + "unregistered hash lock must not be in the lock map"); + } + + ASSERT_LOG_ONLY(!vdo_has_waiters(&lock->waiters), + "hash lock returned to zone must have no waiters"); + ASSERT_LOG_ONLY((lock->duplicate_lock == NULL), + "hash lock returned to zone must not reference a PBN lock"); + ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_BYPASSING), + "returned hash lock must not be in use with state %s", + get_hash_lock_state_name(lock->state)); + ASSERT_LOG_ONLY(list_empty(&lock->pool_node), + "hash lock returned to zone must not be in a pool ring"); + ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not reference DataVIOs"); + + return_hash_lock_to_pool(zone, lock); +} + +/** + * transfer_allocation_lock() - Transfer a data_vio's downgraded allocation PBN lock to the + * data_vio's hash lock, converting it to a duplicate PBN lock. + * @data_vio: The data_vio holding the allocation lock to transfer. + */ +static void transfer_allocation_lock(struct data_vio *data_vio) +{ + struct allocation *allocation = &data_vio->allocation; + struct hash_lock *hash_lock = data_vio->hash_lock; + + ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn, + "transferred lock must be for the block written"); + + allocation->pbn = VDO_ZERO_BLOCK; + + ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock), + "must have downgraded the allocation lock before transfer"); + + hash_lock->duplicate = data_vio->new_mapped; + data_vio->duplicate = data_vio->new_mapped; + + /* + * Since the lock is being transferred, the holder count doesn't change (and isn't even + * safe to examine on this thread). + */ + hash_lock->duplicate_lock = UDS_FORGET(allocation->lock); +} + +/** + * vdo_share_compressed_write_lock() - Make a data_vio's hash lock a shared holder of the PBN lock + * on the compressed block to which its data was just written. + * @data_vio: The data_vio which was just compressed. + * @pbn_lock: The PBN lock on the compressed block. + * + * If the lock is still a write lock (as it will be for the first share), it will be converted to a + * read lock. This also reserves a reference count increment for the data_vio. + */ +void vdo_share_compressed_write_lock(struct data_vio *data_vio, struct pbn_lock *pbn_lock) +{ + bool claimed; + + ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL, + "a duplicate PBN lock should not exist when writing"); + ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state), + "lock transfer must be for a compressed write"); + assert_data_vio_in_new_mapped_zone(data_vio); + + /* First sharer downgrades the lock. */ + if (!vdo_is_pbn_read_lock(pbn_lock)) + vdo_downgrade_pbn_write_lock(pbn_lock, true); + + /* + * Get a share of the PBN lock, ensuring it cannot be released until after this data_vio + * has had a chance to journal a reference. + */ + data_vio->duplicate = data_vio->new_mapped; + data_vio->hash_lock->duplicate = data_vio->new_mapped; + set_duplicate_lock(data_vio->hash_lock, pbn_lock); + + /* + * Claim a reference for this data_vio. Necessary since another hash_lock might start + * deduplicating against it before our incRef. + */ + claimed = vdo_claim_pbn_lock_increment(pbn_lock); + ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); +} + +/** compare_keys() - Implements pointer_key_comparator. */ +static bool compare_keys(const void *this_key, const void *that_key) +{ + /* Null keys are not supported. */ + return (memcmp(this_key, that_key, sizeof(struct uds_record_name)) == 0); +} + +/** hash_key() - Implements pointer_key_comparator. */ +static u32 hash_key(const void *key) +{ + const struct uds_record_name *name = key; + + /* Use a fragment of the record name as a hash code. */ + return get_unaligned_le32(&name->name[4]); +} + +static void dedupe_kobj_release(struct kobject *directory) +{ + UDS_FREE(container_of(directory, struct hash_zones, dedupe_directory)); +} + +static ssize_t dedupe_status_show(struct kobject *directory, struct attribute *attr, char *buf) +{ + struct uds_attribute *ua = container_of(attr, struct uds_attribute, attr); + struct hash_zones *zones = container_of(directory, struct hash_zones, dedupe_directory); + + if (ua->show_string != NULL) + return sprintf(buf, "%s\n", ua->show_string(zones)); + else + return -EINVAL; +} + +static ssize_t dedupe_status_store(struct kobject *kobj __always_unused, + struct attribute *attr __always_unused, + const char *buf __always_unused, + size_t length __always_unused) +{ + return -EINVAL; +} + +/*----------------------------------------------------------------------*/ + +static const struct sysfs_ops dedupe_sysfs_ops = { + .show = dedupe_status_show, + .store = dedupe_status_store, +}; + +static struct uds_attribute dedupe_status_attribute = { + .attr = {.name = "status", .mode = 0444, }, + .show_string = vdo_get_dedupe_index_state_name, +}; + +static struct attribute *dedupe_attrs[] = { + &dedupe_status_attribute.attr, + NULL, +}; +ATTRIBUTE_GROUPS(dedupe); + +static struct kobj_type dedupe_directory_type = { + .release = dedupe_kobj_release, + .sysfs_ops = &dedupe_sysfs_ops, + .default_groups = dedupe_groups, +}; + +static void start_uds_queue(void *ptr) +{ + /* + * Allow the UDS dedupe worker thread to do memory allocations. It will only do allocations + * during the UDS calls that open or close an index, but those allocations can safely sleep + * while reserving a large amount of memory. We could use an allocations_allowed boolean + * (like the base threads do), but it would be an unnecessary embellishment. + */ + struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue()); + + uds_register_allocating_thread(&thread->allocating_thread, NULL); +} + +static void finish_uds_queue(void *ptr __always_unused) +{ + uds_unregister_allocating_thread(); +} + +static void close_index(struct hash_zones *zones) +{ + int result; + + /* + * Change the index state so that get_index_statistics() will not try to use the index + * session we are closing. + */ + zones->index_state = IS_CHANGING; + /* Close the index session, while not holding the lock. */ + spin_unlock(&zones->lock); + result = uds_close_index(zones->index_session); + + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "Error closing index"); + spin_lock(&zones->lock); + zones->index_state = IS_CLOSED; + zones->error_flag |= result != UDS_SUCCESS; + /* ASSERTION: We leave in IS_CLOSED state. */ +} + +static void open_index(struct hash_zones *zones) +{ + /* ASSERTION: We enter in IS_CLOSED state. */ + int result; + bool create_flag = zones->create_flag; + + zones->create_flag = false; + /* + * Change the index state so that the it will be reported to the outside world as + * "opening". + */ + zones->index_state = IS_CHANGING; + zones->error_flag = false; + + /* Open the index session, while not holding the lock */ + spin_unlock(&zones->lock); + result = uds_open_index(create_flag ? UDS_CREATE : UDS_LOAD, + &zones->parameters, + zones->index_session); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "Error opening index"); + + spin_lock(&zones->lock); + if (!create_flag) { + switch (result) { + case -ENOENT: + /* + * Either there is no index, or there is no way we can recover the index. + * We will be called again and try to create a new index. + */ + zones->index_state = IS_CLOSED; + zones->create_flag = true; + return; + default: + break; + } + } + if (result == UDS_SUCCESS) { + zones->index_state = IS_OPENED; + } else { + zones->index_state = IS_CLOSED; + zones->index_target = IS_CLOSED; + zones->error_flag = true; + spin_unlock(&zones->lock); + uds_log_info("Setting UDS index target state to error"); + spin_lock(&zones->lock); + } + /* + * ASSERTION: On success, we leave in IS_OPENED state. + * ASSERTION: On failure, we leave in IS_CLOSED state. + */ +} + +static void change_dedupe_state(struct vdo_completion *completion) +{ + struct hash_zones *zones = as_hash_zones(completion); + + spin_lock(&zones->lock); + + /* Loop until the index is in the target state and the create flag is clear. */ + while (vdo_is_state_normal(&zones->state) && + ((zones->index_state != zones->index_target) || zones->create_flag)) { + if (zones->index_state == IS_OPENED) + close_index(zones); + else + open_index(zones); + } + + zones->changing = false; + spin_unlock(&zones->lock); +} + +static void start_expiration_timer(struct dedupe_context *context) +{ + u64 start_time = context->submission_jiffies; + u64 end_time; + + if (!change_timer_state(context->zone, + DEDUPE_QUERY_TIMER_IDLE, + DEDUPE_QUERY_TIMER_RUNNING)) + return; + + end_time = max(start_time + vdo_dedupe_index_timeout_jiffies, + jiffies + vdo_dedupe_index_min_timer_jiffies); + mod_timer(&context->zone->timer, end_time); +} + +/** + * report_dedupe_timeouts() - Record and eventually report that some dedupe requests reached their + * expiration time without getting answers, so we timed them out. + * @zones: the hash zones. + * @timeouts: the number of newly timed out requests. + */ +static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeouts) +{ + atomic64_add(timeouts, &zones->timeouts); + spin_lock(&zones->lock); + if (__ratelimit(&zones->ratelimiter)) { + u64 unreported = atomic64_read(&zones->timeouts); + + unreported -= zones->reported_timeouts; + uds_log_debug("UDS index timeout on %llu requests", + (unsigned long long) unreported); + zones->reported_timeouts += unreported; + } + spin_unlock(&zones->lock); +} + +static int initialize_index(struct vdo *vdo, struct hash_zones *zones) +{ + int result; + off_t uds_offset; + struct volume_geometry geometry = vdo->geometry; + static const struct vdo_work_queue_type uds_queue_type = { + .start = start_uds_queue, + .finish = finish_uds_queue, + .max_priority = UDS_Q_MAX_PRIORITY, + .default_priority = UDS_Q_PRIORITY, + }; + + vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval); + vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval); + + /* + * Since we will save up the timeouts that would have been reported but were ratelimited, + * we don't need to report ratelimiting. + */ + ratelimit_default_init(&zones->ratelimiter); + ratelimit_set_flags(&zones->ratelimiter, RATELIMIT_MSG_ON_RELEASE); + uds_offset = ((vdo_get_index_region_start(geometry) - + geometry.bio_offset) * VDO_BLOCK_SIZE); + zones->parameters = (struct uds_parameters) { + .name = vdo->device_config->parent_device_name, + .offset = uds_offset, + .size = (vdo_get_index_region_size(geometry) * VDO_BLOCK_SIZE), + .memory_size = geometry.index_config.mem, + .sparse = geometry.index_config.sparse, + .nonce = (u64) geometry.nonce, + }; + + result = uds_create_index_session(&zones->index_session); + if (result != UDS_SUCCESS) + return result; + + result = vdo_make_thread(vdo, vdo->thread_config.dedupe_thread, &uds_queue_type, 1, NULL); + if (result != VDO_SUCCESS) { + uds_destroy_index_session(UDS_FORGET(zones->index_session)); + uds_log_error("UDS index queue initialization failed (%d)", result); + return result; + } + + vdo_initialize_completion(&zones->completion, vdo, VDO_HASH_ZONES_COMPLETION); + vdo_set_completion_callback(&zones->completion, + change_dedupe_state, + vdo->thread_config.dedupe_thread); + kobject_init(&zones->dedupe_directory, &dedupe_directory_type); + return VDO_SUCCESS; +} + +/** + * finish_index_operation(): This is the UDS callback for index queries. + * @request: The uds request which has just completed. + */ +static void finish_index_operation(struct uds_request *request) +{ + struct dedupe_context *context = container_of(request, struct dedupe_context, request); + + if (change_context_state(context, DEDUPE_CONTEXT_PENDING, DEDUPE_CONTEXT_COMPLETE)) { + /* + * This query has not timed out, so send its data_vio back to its hash zone to + * process the results. + */ + continue_data_vio(context->requestor); + return; + } + + /* + * This query has timed out, so try to mark it complete and hence eligible for reuse. Its + * data_vio has already moved on. + */ + if (!change_context_state(context, + DEDUPE_CONTEXT_TIMED_OUT, + DEDUPE_CONTEXT_TIMED_OUT_COMPLETE)) + ASSERT_LOG_ONLY(false, + "uds request was timed out (state %d)", + atomic_read(&context->state)); + + uds_funnel_queue_put(context->zone->timed_out_complete, &context->queue_entry); +} + +/** + * check_for_drain_complete() - Check whether this zone has drained. + * @zone: The zone to check. + */ +static void check_for_drain_complete(struct hash_zone *zone) +{ + data_vio_count_t recycled = 0; + + if (!vdo_is_state_draining(&zone->state)) + return; + + if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) || + change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_IDLE)) + del_timer_sync(&zone->timer); + else + /* + * There is an in flight time-out, which must get processed before we can continue. + */ + return; + + for (;;) { + struct dedupe_context *context; + struct funnel_queue_entry *entry; + + entry = uds_funnel_queue_poll(zone->timed_out_complete); + if (entry == NULL) + break; + + context = container_of(entry, struct dedupe_context, queue_entry); + atomic_set(&context->state, DEDUPE_CONTEXT_IDLE); + list_add(&context->list_entry, &zone->available); + recycled++; + } + + if (recycled > 0) + WRITE_ONCE(zone->active, zone->active - recycled); + ASSERT_LOG_ONLY(READ_ONCE(zone->active) == 0, "all contexts inactive"); + vdo_finish_draining(&zone->state); +} + +static void timeout_index_operations_callback(struct vdo_completion *completion) +{ + struct dedupe_context *context, *tmp; + struct hash_zone *zone = as_hash_zone(completion); + u64 timeout_jiffies = msecs_to_jiffies(vdo_dedupe_index_timeout_interval); + unsigned long cutoff = jiffies - timeout_jiffies; + unsigned int timed_out = 0; + + atomic_set(&zone->timer_state, DEDUPE_QUERY_TIMER_IDLE); + list_for_each_entry_safe(context, tmp, &zone->pending, list_entry) { + if (cutoff <= context->submission_jiffies) { + /* + * We have reached the oldest query which has not timed out yet, so restart + * the timer. + */ + start_expiration_timer(context); + break; + } + + if (!change_context_state(context, + DEDUPE_CONTEXT_PENDING, + DEDUPE_CONTEXT_TIMED_OUT)) + /* + * This context completed between the time the timeout fired, and now. We + * can treat it as a a successful query, its requestor is already enqueued + * to process it. + */ + continue; + + /* + * Remove this context from the pending list so we won't look at it again on a + * subsequent timeout. Once the index completes it, it will be reused. Meanwhile, + * send its requestor on its way. + */ + list_del_init(&context->list_entry); + continue_data_vio(context->requestor); + timed_out++; + } + + if (timed_out > 0) + report_dedupe_timeouts(completion->vdo->hash_zones, timed_out); + + check_for_drain_complete(zone); +} + +static void timeout_index_operations(struct timer_list *t) +{ + struct hash_zone *zone = from_timer(zone, t, timer); + + if (change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_FIRED)) + vdo_launch_completion(&zone->completion); +} + +static int __must_check +initialize_zone(struct vdo *vdo, struct hash_zones *zones, zone_count_t zone_number) +{ + int result; + data_vio_count_t i; + struct hash_zone *zone = &zones->zones[zone_number]; + + result = vdo_make_pointer_map(VDO_LOCK_MAP_CAPACITY, + 0, + compare_keys, + hash_key, + &zone->hash_lock_map); + if (result != VDO_SUCCESS) + return result; + + vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + zone->zone_number = zone_number; + zone->thread_id = vdo->thread_config.hash_zone_threads[zone_number]; + vdo_initialize_completion(&zone->completion, vdo, VDO_HASH_ZONE_COMPLETION); + vdo_set_completion_callback(&zone->completion, + timeout_index_operations_callback, + zone->thread_id); + INIT_LIST_HEAD(&zone->lock_pool); + result = UDS_ALLOCATE(LOCK_POOL_CAPACITY, + struct hash_lock, + "hash_lock array", + &zone->lock_array); + if (result != VDO_SUCCESS) + return result; + + for (i = 0; i < LOCK_POOL_CAPACITY; i++) + return_hash_lock_to_pool(zone, &zone->lock_array[i]); + + INIT_LIST_HEAD(&zone->available); + INIT_LIST_HEAD(&zone->pending); + result = uds_make_funnel_queue(&zone->timed_out_complete); + if (result != VDO_SUCCESS) + return result; + + timer_setup(&zone->timer, timeout_index_operations, 0); + + for (i = 0; i < MAXIMUM_VDO_USER_VIOS; i++) { + struct dedupe_context *context = &zone->contexts[i]; + + context->zone = zone; + context->request.callback = finish_index_operation; + context->request.session = zones->index_session; + list_add(&context->list_entry, &zone->available); + } + + return vdo_make_default_thread(vdo, zone->thread_id); +} + +/** get_thread_id_for_zone() - Implements vdo_zone_thread_getter. */ +static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number) +{ + struct hash_zones *zones = context; + + return zones->zones[zone_number].thread_id; +} + +/** + * vdo_make_hash_zones() - Create the hash zones. + * + * @vdo: The vdo to which the zone will belong. + * @zones_ptr: A pointer to hold the zones. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr) +{ + int result; + struct hash_zones *zones; + zone_count_t z; + zone_count_t zone_count = vdo->thread_config.hash_zone_count; + + if (zone_count == 0) + return VDO_SUCCESS; + + result = UDS_ALLOCATE_EXTENDED(struct hash_zones, + zone_count, + struct hash_zone, + __func__, + &zones); + if (result != VDO_SUCCESS) + return result; + + result = initialize_index(vdo, zones); + if (result != VDO_SUCCESS) { + UDS_FREE(zones); + return result; + } + + vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NEW); + + zones->zone_count = zone_count; + for (z = 0; z < zone_count; z++) { + result = initialize_zone(vdo, zones, z); + if (result != VDO_SUCCESS) { + vdo_free_hash_zones(zones); + return result; + } + } + + result = vdo_make_action_manager(zones->zone_count, + get_thread_id_for_zone, + vdo->thread_config.admin_thread, + zones, + NULL, + vdo, + &zones->manager); + if (result != VDO_SUCCESS) { + vdo_free_hash_zones(zones); + return result; + } + + *zones_ptr = zones; + return VDO_SUCCESS; +} + +void vdo_finish_dedupe_index(struct hash_zones *zones) +{ + if (zones == NULL) + return; + + uds_destroy_index_session(UDS_FORGET(zones->index_session)); +} + +/** + * vdo_free_hash_zones() - Free the hash zones. + * @zones: The zone to free. + */ +void vdo_free_hash_zones(struct hash_zones *zones) +{ + zone_count_t i; + + if (zones == NULL) + return; + + UDS_FREE(UDS_FORGET(zones->manager)); + + for (i = 0; i < zones->zone_count; i++) { + struct hash_zone *zone = &zones->zones[i]; + + uds_free_funnel_queue(UDS_FORGET(zone->timed_out_complete)); + vdo_free_pointer_map(UDS_FORGET(zone->hash_lock_map)); + UDS_FREE(UDS_FORGET(zone->lock_array)); + } + + if (zones->index_session != NULL) + vdo_finish_dedupe_index(zones); + + ratelimit_state_exit(&zones->ratelimiter); + if (vdo_get_admin_state_code(&zones->state) == VDO_ADMIN_STATE_NEW) + UDS_FREE(zones); + else + kobject_put(&zones->dedupe_directory); +} + +static void initiate_suspend_index(struct admin_state *state) +{ + struct hash_zones *zones = container_of(state, struct hash_zones, state); + enum index_state index_state; + + spin_lock(&zones->lock); + index_state = zones->index_state; + spin_unlock(&zones->lock); + + if (index_state != IS_CLOSED) { + bool save = vdo_is_state_saving(&zones->state); + int result; + + result = uds_suspend_index_session(zones->index_session, save); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "Error suspending dedupe index"); + } + + vdo_finish_draining(state); +} + +/** + * suspend_index() - Suspend the UDS index prior to draining hash zones. + * + * Implements vdo_action_preamble + */ +static void suspend_index(void *context, struct vdo_completion *completion) +{ + struct hash_zones *zones = context; + + vdo_start_draining(&zones->state, + vdo_get_current_manager_operation(zones->manager), + completion, + initiate_suspend_index); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + check_for_drain_complete(container_of(state, struct hash_zone, state)); +} + +/** + * drain_hash_zone() - Drain a hash zone. + * + * Implements vdo_zone_action. + */ +static void drain_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct hash_zones *zones = context; + + vdo_start_draining(&zones->zones[zone_number].state, + vdo_get_current_manager_operation(zones->manager), + parent, + initiate_drain); +} + +/** vdo_drain_hash_zones() - Drain all hash zones. */ +void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent) +{ + vdo_schedule_operation(zones->manager, + parent->vdo->suspend_type, + suspend_index, + drain_hash_zone, + NULL, + parent); +} + +static void launch_dedupe_state_change(struct hash_zones *zones) +{ + /* ASSERTION: We enter with the lock held. */ + if (zones->changing || !vdo_is_state_normal(&zones->state)) + /* Either a change is already in progress, or changes are not allowed. */ + return; + + if (zones->create_flag || (zones->index_state != zones->index_target)) { + zones->changing = true; + vdo_launch_completion(&zones->completion); + return; + } + + /* ASSERTION: We exit with the lock held. */ +} + +/** + * resume_index() - Resume the UDS index prior to resuming hash zones. + * + * Implements vdo_action_preamble + */ +static void resume_index(void *context, struct vdo_completion *parent) +{ + struct hash_zones *zones = context; + struct device_config *config = parent->vdo->device_config; + int result; + + zones->parameters.name = config->parent_device_name; + result = uds_resume_index_session(zones->index_session, zones->parameters.name); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "Error resuming dedupe index"); + + spin_lock(&zones->lock); + vdo_resume_if_quiescent(&zones->state); + + if (config->deduplication) { + zones->index_target = IS_OPENED; + WRITE_ONCE(zones->dedupe_flag, true); + } else { + zones->index_target = IS_CLOSED; + } + + launch_dedupe_state_change(zones); + spin_unlock(&zones->lock); + + vdo_finish_completion(parent); +} + +/** + * resume_hash_zone() - Resume a hash zone. + * + * Implements vdo_zone_action. + */ +static void +resume_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct hash_zone *zone = &(((struct hash_zones *) context)->zones[zone_number]); + + vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state)); +} + +/** + * vdo_resume_hash_zones() - Resume a set of hash zones. + * @zones: The hash zones to resume. + * @parent: The object to notify when the zones have resumed. + */ +void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent) +{ + if (vdo_is_read_only(parent->vdo)) { + vdo_launch_completion(parent); + return; + } + + vdo_schedule_operation(zones->manager, + VDO_ADMIN_STATE_RESUMING, + resume_index, + resume_hash_zone, + NULL, + parent); +} + +/** + * get_hash_zone_statistics() - Add the statistics for this hash zone to the tally for all zones. + * @zone: The hash zone to query. + * @tally: The tally + */ +static void +get_hash_zone_statistics(const struct hash_zone *zone, struct hash_lock_statistics *tally) +{ + const struct hash_lock_statistics *stats = &zone->statistics; + + tally->dedupe_advice_valid += READ_ONCE(stats->dedupe_advice_valid); + tally->dedupe_advice_stale += READ_ONCE(stats->dedupe_advice_stale); + tally->concurrent_data_matches += READ_ONCE(stats->concurrent_data_matches); + tally->concurrent_hash_collisions += READ_ONCE(stats->concurrent_hash_collisions); + tally->curr_dedupe_queries += READ_ONCE(zone->active); +} + +static void get_index_statistics(struct hash_zones *zones, struct index_statistics *stats) +{ + enum index_state state; + struct uds_index_stats index_stats; + int result; + + spin_lock(&zones->lock); + state = zones->index_state; + spin_unlock(&zones->lock); + + if (state != IS_OPENED) + return; + + result = uds_get_index_session_stats(zones->index_session, &index_stats); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "Error reading index stats"); + return; + } + + stats->entries_indexed = index_stats.entries_indexed; + stats->posts_found = index_stats.posts_found; + stats->posts_not_found = index_stats.posts_not_found; + stats->queries_found = index_stats.queries_found; + stats->queries_not_found = index_stats.queries_not_found; + stats->updates_found = index_stats.updates_found; + stats->updates_not_found = index_stats.updates_not_found; + stats->entries_discarded = index_stats.entries_discarded; +} + +/** + * vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index. + * @hash_zones: The hash zones to query + * + * Return: The sum of the hash lock statistics from all hash zones plus the statistics from the UDS + * index + */ +void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats) + +{ + zone_count_t zone; + + for (zone = 0; zone < zones->zone_count; zone++) + get_hash_zone_statistics(&zones->zones[zone], &stats->hash_lock); + + get_index_statistics(zones, &stats->index); + + /* + * zones->timeouts gives the number of timeouts, and dedupe_context_busy gives the number + * of queries not made because of earlier timeouts. + */ + stats->dedupe_advice_timeouts = + (atomic64_read(&zones->timeouts) + atomic64_read(&zones->dedupe_context_busy)); +} + +/** + * vdo_select_hash_zone() - Select the hash zone responsible for locking a given record name. + * @zones: The hash_zones from which to select. + * @name: The record name. + * + * Return: The hash zone responsible for the record name. + */ +struct hash_zone * +vdo_select_hash_zone(struct hash_zones *zones, const struct uds_record_name *name) +{ + /* + * Use a fragment of the record name as a hash code. Eight bits of hash should suffice + * since the number of hash zones is small. + * TODO: Verify that the first byte is independent enough. + */ + u32 hash = name->name[0]; + + /* + * Scale the 8-bit hash fragment to a zone index by treating it as a binary fraction and + * multiplying that by the zone count. If the hash is uniformly distributed over [0 .. + * 2^8-1], then (hash * count / 2^8) should be uniformly distributed over [0 .. count-1]. + * The multiply and shift is much faster than a divide (modulus) on X86 CPUs. + */ + hash = (hash * zones->zone_count) >> 8; + return &zones->zones[hash]; +} + +/** + * dump_hash_lock() - Dump a compact description of hash_lock to the log if the lock is not on the + * free list. + * @lock: The hash lock to dump. + */ +static void dump_hash_lock(const struct hash_lock *lock) +{ + const char *state; + + if (!list_empty(&lock->pool_node)) + /* This lock is on the free list. */ + return; + + /* + * Necessarily cryptic since we can log a lot of these. First three chars of state is + * unambiguous. 'U' indicates a lock not registered in the map. + */ + state = get_hash_lock_state_name(lock->state); + uds_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px", + (const void *) lock, state, (lock->registered ? 'D' : 'U'), + (unsigned long long) lock->duplicate.pbn, + lock->duplicate.state, lock->reference_count, + vdo_count_waiters(&lock->waiters), (void *) lock->agent); +} + +static const char *index_state_to_string(struct hash_zones *zones, enum index_state state) +{ + if (!vdo_is_state_normal(&zones->state)) + return SUSPENDED; + + switch (state) { + case IS_CLOSED: + return zones->error_flag ? ERROR : CLOSED; + case IS_CHANGING: + return zones->index_target == IS_OPENED ? OPENING : CLOSING; + case IS_OPENED: + return READ_ONCE(zones->dedupe_flag) ? ONLINE : OFFLINE; + default: + return UNKNOWN; + } +} + +/** + * vdo_dump_hash_zone() - Dump information about a hash zone to the log for debugging. + * @zone: The zone to dump. + */ +static void dump_hash_zone(const struct hash_zone *zone) +{ + data_vio_count_t i; + + if (zone->hash_lock_map == NULL) { + uds_log_info("struct hash_zone %u: NULL map", zone->zone_number); + return; + } + + uds_log_info("struct hash_zone %u: mapSize=%zu", + zone->zone_number, + vdo_pointer_map_size(zone->hash_lock_map)); + for (i = 0; i < LOCK_POOL_CAPACITY; i++) + dump_hash_lock(&zone->lock_array[i]); +} + +/** + * vdo_dump_hash_zones() - Dump information about the hash zones to the log for debugging. + * @zones: The zones to dump. + */ +void vdo_dump_hash_zones(struct hash_zones *zones) +{ + const char *state, *target; + zone_count_t zone; + + spin_lock(&zones->lock); + state = index_state_to_string(zones, zones->index_state); + target = (zones->changing ? index_state_to_string(zones, zones->index_target) : NULL); + spin_unlock(&zones->lock); + + uds_log_info("UDS index: state: %s", state); + if (target != NULL) + uds_log_info("UDS index: changing to state: %s", target); + + for (zone = 0; zone < zones->zone_count; zone++) + dump_hash_zone(&zones->zones[zone]); +} + +void vdo_set_dedupe_index_timeout_interval(unsigned int value) +{ + u64 alb_jiffies; + + /* Arbitrary maximum value is two minutes */ + if (value > 120000) + value = 120000; + /* Arbitrary minimum value is 2 jiffies */ + alb_jiffies = msecs_to_jiffies(value); + + if (alb_jiffies < 2) { + alb_jiffies = 2; + value = jiffies_to_msecs(alb_jiffies); + } + vdo_dedupe_index_timeout_interval = value; + vdo_dedupe_index_timeout_jiffies = alb_jiffies; +} + +void vdo_set_dedupe_index_min_timer_interval(unsigned int value) +{ + u64 min_jiffies; + + /* Arbitrary maximum value is one second */ + if (value > 1000) + value = 1000; + + /* Arbitrary minimum value is 2 jiffies */ + min_jiffies = msecs_to_jiffies(value); + + if (min_jiffies < 2) { + min_jiffies = 2; + value = jiffies_to_msecs(min_jiffies); + } + + vdo_dedupe_index_min_timer_interval = value; + vdo_dedupe_index_min_timer_jiffies = min_jiffies; +} + +/** + * acquire_context() - Acquire a dedupe context from a hash_zone if any are available. + * @zone: the hash zone + * + * Return: A dedupe_context or NULL if none are available + */ +static struct dedupe_context * __must_check acquire_context(struct hash_zone *zone) +{ + struct dedupe_context *context; + struct funnel_queue_entry *entry; + + assert_in_hash_zone(zone, __func__); + + if (!list_empty(&zone->available)) { + WRITE_ONCE(zone->active, zone->active + 1); + context = list_first_entry(&zone->available, struct dedupe_context, list_entry); + list_del_init(&context->list_entry); + return context; + } + + entry = uds_funnel_queue_poll(zone->timed_out_complete); + return ((entry == NULL) ? NULL : container_of(entry, struct dedupe_context, queue_entry)); +} + +static void prepare_uds_request(struct uds_request *request, + struct data_vio *data_vio, + enum uds_request_type operation) +{ + request->record_name = data_vio->record_name; + request->type = operation; + if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { + size_t offset = 0; + struct uds_record_data *encoding = &request->new_metadata; + + encoding->data[offset++] = UDS_ADVICE_VERSION; + encoding->data[offset++] = data_vio->new_mapped.state; + put_unaligned_le64(data_vio->new_mapped.pbn, &encoding->data[offset]); + offset += sizeof(u64); + BUG_ON(offset != UDS_ADVICE_SIZE); + } +} + +/* + * The index operation will inquire about data_vio.record_name, providing (if the operation is + * appropriate) advice from the data_vio's new_mapped fields. The advice found in the index (or + * NULL if none) will be returned via receive_data_vio_dedupe_advice(). dedupe_context.status is + * set to the return status code of any asynchronous index processing. + */ +static void query_index(struct data_vio *data_vio, enum uds_request_type operation) +{ + int result; + struct dedupe_context *context; + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct hash_zone *zone = data_vio->hash_zone; + + assert_data_vio_in_hash_zone(data_vio); + + if (!READ_ONCE(vdo->hash_zones->dedupe_flag)) { + continue_data_vio(data_vio); + return; + } + + context = acquire_context(zone); + if (context == NULL) { + atomic64_inc(&vdo->hash_zones->dedupe_context_busy); + continue_data_vio(data_vio); + return; + } + + data_vio->dedupe_context = context; + context->requestor = data_vio; + context->submission_jiffies = jiffies; + prepare_uds_request(&context->request, data_vio, operation); + atomic_set(&context->state, DEDUPE_CONTEXT_PENDING); + list_add_tail(&context->list_entry, &zone->pending); + start_expiration_timer(context); + result = uds_launch_request(&context->request); + if (result != UDS_SUCCESS) { + context->request.status = result; + finish_index_operation(&context->request); + } +} + +static void set_target_state(struct hash_zones *zones, + enum index_state target, + bool change_dedupe, + bool dedupe, + bool set_create) +{ + const char *old_state, *new_state; + + spin_lock(&zones->lock); + old_state = index_state_to_string(zones, zones->index_target); + if (change_dedupe) + WRITE_ONCE(zones->dedupe_flag, dedupe); + + if (set_create) + zones->create_flag = true; + + zones->index_target = target; + launch_dedupe_state_change(zones); + new_state = index_state_to_string(zones, zones->index_target); + spin_unlock(&zones->lock); + + if (old_state != new_state) + uds_log_info("Setting UDS index target state to %s", new_state); +} + +const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones) +{ + const char *state; + + spin_lock(&zones->lock); + state = index_state_to_string(zones, zones->index_state); + spin_unlock(&zones->lock); + + return state; +} + +/* Handle a dmsetup message relevant to the index. */ +int vdo_message_dedupe_index(struct hash_zones *zones, const char *name) +{ + if (strcasecmp(name, "index-close") == 0) { + set_target_state(zones, IS_CLOSED, false, false, false); + return 0; + } else if (strcasecmp(name, "index-create") == 0) { + set_target_state(zones, IS_OPENED, false, false, true); + return 0; + } else if (strcasecmp(name, "index-disable") == 0) { + set_target_state(zones, IS_OPENED, true, false, false); + return 0; + } else if (strcasecmp(name, "index-enable") == 0) { + set_target_state(zones, IS_OPENED, true, true, false); + return 0; + } + return -EINVAL; +} + +int vdo_add_dedupe_index_sysfs(struct hash_zones *zones) +{ + int result = kobject_add(&zones->dedupe_directory, + &zones->completion.vdo->vdo_directory, + "dedupe"); + + if (result == 0) + vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + + return result; +} + +/* If create_flag, create a new index without first attempting to load an existing index. */ +void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag) +{ + set_target_state(zones, IS_OPENED, true, true, create_flag); +} diff --git a/drivers/md/dm-vdo/dedupe.h b/drivers/md/dm-vdo/dedupe.h new file mode 100644 index 00000000000..d5730619694 --- /dev/null +++ b/drivers/md/dm-vdo/dedupe.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_DEDUPE_H +#define VDO_DEDUPE_H + +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/timer.h> + +#include "uds.h" + +#include "admin-state.h" +#include "constants.h" +#include "statistics.h" +#include "types.h" +#include "wait-queue.h" + +struct dedupe_context { + struct hash_zone *zone; + struct uds_request request; + struct list_head list_entry; + struct funnel_queue_entry queue_entry; + u64 submission_jiffies; + struct data_vio *requestor; + atomic_t state; +}; + +struct hash_lock; + +struct hash_zone { + /* Which hash zone this is */ + zone_count_t zone_number; + + /* The administrative state of the zone */ + struct admin_state state; + + /* The thread ID for this zone */ + thread_id_t thread_id; + + /* Mapping from record name fields to hash_locks */ + struct pointer_map *hash_lock_map; + + /* List containing all unused hash_locks */ + struct list_head lock_pool; + + /* + * Statistics shared by all hash locks in this zone. Only modified on the hash zone thread, + * but queried by other threads. + */ + struct hash_lock_statistics statistics; + + /* Array of all hash_locks */ + struct hash_lock *lock_array; + + /* These fields are used to manage the dedupe contexts */ + struct list_head available; + struct list_head pending; + struct funnel_queue *timed_out_complete; + struct timer_list timer; + struct vdo_completion completion; + unsigned int active; + atomic_t timer_state; + + /* The dedupe contexts for querying the index from this zone */ + struct dedupe_context contexts[MAXIMUM_VDO_USER_VIOS]; +}; + +struct hash_zones; + +struct pbn_lock * __must_check vdo_get_duplicate_lock(struct data_vio *data_vio); + +void vdo_acquire_hash_lock(struct vdo_completion *completion); +void vdo_continue_hash_lock(struct vdo_completion *completion); +void vdo_release_hash_lock(struct data_vio *data_vio); +void vdo_clean_failed_hash_lock(struct data_vio *data_vio); +void vdo_share_compressed_write_lock(struct data_vio *data_vio, struct pbn_lock *pbn_lock); + +int __must_check vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr); + +void vdo_free_hash_zones(struct hash_zones *zones); + +void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent); + +void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats); + +struct hash_zone * __must_check +vdo_select_hash_zone(struct hash_zones *zones, const struct uds_record_name *name); + +void vdo_dump_hash_zones(struct hash_zones *zones); + +const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones); + +u64 vdo_get_dedupe_index_timeout_count(struct hash_zones *zones); + +int vdo_message_dedupe_index(struct hash_zones *zones, const char *name); + +int vdo_add_dedupe_index_sysfs(struct hash_zones *zones); + +void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag); + +void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent); + +void vdo_finish_dedupe_index(struct hash_zones *zones); + +/* Interval (in milliseconds) from submission until switching to fast path and skipping UDS. */ +extern unsigned int vdo_dedupe_index_timeout_interval; + +/* + * Minimum time interval (in milliseconds) between timer invocations to check for requests waiting + * for UDS that should now time out. + */ +extern unsigned int vdo_dedupe_index_min_timer_interval; + +void vdo_set_dedupe_index_timeout_interval(unsigned int value); +void vdo_set_dedupe_index_min_timer_interval(unsigned int value); + +#endif /* VDO_DEDUPE_H */ diff --git a/drivers/md/dm-vdo/dump.c b/drivers/md/dm-vdo/dump.c new file mode 100644 index 00000000000..0943357f98b --- /dev/null +++ b/drivers/md/dm-vdo/dump.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "dump.h" + +#include <linux/module.h> + +#include "memory-alloc.h" +#include "string-utils.h" + +#include "constants.h" +#include "data-vio.h" +#include "dedupe.h" +#include "io-submitter.h" +#include "logger.h" +#include "types.h" +#include "vdo.h" +#include "work-queue.h" + +enum dump_options { + /* Work queues */ + SHOW_QUEUES, + /* Memory pools */ + SHOW_VIO_POOL, + /* Others */ + SHOW_VDO_STATUS, + /* This one means an option overrides the "default" choices, instead of altering them. */ + SKIP_DEFAULT +}; + +enum dump_option_flags { + /* Work queues */ + FLAG_SHOW_QUEUES = (1 << SHOW_QUEUES), + /* Memory pools */ + FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL), + /* Others */ + FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS), + /* Special */ + FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT) +}; + +enum { + FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL), + DEFAULT_DUMP_FLAGS = (FLAG_SHOW_QUEUES | FLAG_SHOW_VDO_STATUS) +}; + +static inline bool is_arg_string(const char *arg, const char *this_option) +{ + /* convention seems to be case-independent options */ + return strncasecmp(arg, this_option, strlen(this_option)) == 0; +} + +static void do_dump(struct vdo *vdo, unsigned int dump_options_requested, const char *why) +{ + u32 active, maximum; + s64 outstanding; + + uds_log_info("%s dump triggered via %s", UDS_LOGGING_MODULE_NAME, why); + active = get_data_vio_pool_active_requests(vdo->data_vio_pool); + maximum = get_data_vio_pool_maximum_requests(vdo->data_vio_pool); + outstanding = (atomic64_read(&vdo->stats.bios_submitted) - + atomic64_read(&vdo->stats.bios_completed)); + uds_log_info("%u device requests outstanding (max %u), %lld bio requests outstanding, device '%s'", + active, + maximum, + outstanding, + vdo_get_device_name(vdo->device_config->owning_target)); + if (((dump_options_requested & FLAG_SHOW_QUEUES) != 0) && (vdo->threads != NULL)) { + thread_id_t id; + + for (id = 0; id < vdo->thread_config.thread_count; id++) + vdo_dump_work_queue(vdo->threads[id].queue); + } + + vdo_dump_hash_zones(vdo->hash_zones); + dump_data_vio_pool(vdo->data_vio_pool, (dump_options_requested & FLAG_SHOW_VIO_POOL) != 0); + if ((dump_options_requested & FLAG_SHOW_VDO_STATUS) != 0) + vdo_dump_status(vdo); + + uds_report_memory_usage(); + uds_log_info("end of %s dump", UDS_LOGGING_MODULE_NAME); +} + +static int +parse_dump_options(unsigned int argc, char *const *argv, unsigned int *dump_options_requested_ptr) +{ + unsigned int dump_options_requested = 0; + + static const struct { + const char *name; + unsigned int flags; + } option_names[] = { + { "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL }, + { "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS }, + { "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS }, + { "queues", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES }, + { "threads", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES }, + { "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS }, + { "all", ~0 }, + }; + + bool options_okay = true; + unsigned int i; + + for (i = 1; i < argc; i++) { + unsigned int j; + + for (j = 0; j < ARRAY_SIZE(option_names); j++) { + if (is_arg_string(argv[i], option_names[j].name)) { + dump_options_requested |= option_names[j].flags; + break; + } + } + if (j == ARRAY_SIZE(option_names)) { + uds_log_warning("dump option name '%s' unknown", argv[i]); + options_okay = false; + } + } + if (!options_okay) + return -EINVAL; + if ((dump_options_requested & FLAG_SKIP_DEFAULT) == 0) + dump_options_requested |= DEFAULT_DUMP_FLAGS; + *dump_options_requested_ptr = dump_options_requested; + return 0; +} + +/* Dump as specified by zero or more string arguments. */ +int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why) +{ + unsigned int dump_options_requested = 0; + int result = parse_dump_options(argc, argv, &dump_options_requested); + + if (result != 0) + return result; + + do_dump(vdo, dump_options_requested, why); + return 0; +} + +/* Dump everything we know how to dump */ +void vdo_dump_all(struct vdo *vdo, const char *why) +{ + do_dump(vdo, ~0, why); +} + +/* + * Dump out the data_vio waiters on a wait queue. + * wait_on should be the label to print for queue (e.g. logical or physical) + */ +static void dump_vio_waiters(struct wait_queue *queue, char *wait_on) +{ + struct waiter *waiter, *first = vdo_get_first_waiter(queue); + struct data_vio *data_vio; + + if (first == NULL) + return; + + data_vio = waiter_as_data_vio(first); + + uds_log_info(" %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", + wait_on, + data_vio, + data_vio->allocation.pbn, + data_vio->logical.lbn, + data_vio->duplicate.pbn, + get_data_vio_operation_name(data_vio)); + + for (waiter = first->next_waiter; waiter != first; waiter = waiter->next_waiter) { + data_vio = waiter_as_data_vio(waiter); + uds_log_info(" ... and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", + data_vio, + data_vio->allocation.pbn, + data_vio->logical.lbn, + data_vio->duplicate.pbn, + get_data_vio_operation_name(data_vio)); + } +} + +/* + * Encode various attributes of a data_vio as a string of one-character flags. This encoding is for + * logging brevity: + * + * R => vio completion result not VDO_SUCCESS + * W => vio is on a wait queue + * D => vio is a duplicate + * p => vio is a partial block operation + * z => vio is a zero block + * d => vio is a discard + * + * The common case of no flags set will result in an empty, null-terminated buffer. If any flags + * are encoded, the first character in the string will be a space character. + */ +static void encode_vio_dump_flags(struct data_vio *data_vio, char buffer[8]) +{ + char *p_flag = buffer; + *p_flag++ = ' '; + if (data_vio->vio.completion.result != VDO_SUCCESS) + *p_flag++ = 'R'; + if (data_vio->waiter.next_waiter != NULL) + *p_flag++ = 'W'; + if (data_vio->is_duplicate) + *p_flag++ = 'D'; + if (data_vio->is_partial) + *p_flag++ = 'p'; + if (data_vio->is_zero) + *p_flag++ = 'z'; + if (data_vio->remaining_discard > 0) + *p_flag++ = 'd'; + if (p_flag == &buffer[1]) + /* No flags, so remove the blank space. */ + p_flag = buffer; + *p_flag = '\0'; +} + +/* Implements buffer_dump_function. */ +void dump_data_vio(void *data) +{ + struct data_vio *data_vio = (struct data_vio *) data; + + /* + * This just needs to be big enough to hold a queue (thread) name and a function name (plus + * a separator character and NUL). The latter is limited only by taste. + * + * In making this static, we're assuming only one "dump" will run at a time. If more than + * one does run, the log output will be garbled anyway. + */ + static char vio_completion_dump_buffer[100 + MAX_VDO_WORK_QUEUE_NAME_LEN]; + /* Another static buffer... log10(256) = 2.408+, round up: */ + enum { DIGITS_PER_U64 = 1 + sizeof(u64) * 2409 / 1000 }; + + static char vio_block_number_dump_buffer[sizeof("P L D") + 3 * DIGITS_PER_U64]; + static char vio_flush_generation_buffer[sizeof(" FG") + DIGITS_PER_U64]; + static char flags_dump_buffer[8]; + + /* + * We're likely to be logging a couple thousand of these lines, and in some circumstances + * syslogd may have trouble keeping up, so keep it BRIEF rather than user-friendly. + */ + vdo_dump_completion_to_buffer(&data_vio->vio.completion, + vio_completion_dump_buffer, + sizeof(vio_completion_dump_buffer)); + if (data_vio->is_duplicate) + snprintf(vio_block_number_dump_buffer, + sizeof(vio_block_number_dump_buffer), + "P%llu L%llu D%llu", + data_vio->allocation.pbn, + data_vio->logical.lbn, + data_vio->duplicate.pbn); + else if (data_vio_has_allocation(data_vio)) + snprintf(vio_block_number_dump_buffer, + sizeof(vio_block_number_dump_buffer), + "P%llu L%llu", + data_vio->allocation.pbn, + data_vio->logical.lbn); + else + snprintf(vio_block_number_dump_buffer, + sizeof(vio_block_number_dump_buffer), + "L%llu", + data_vio->logical.lbn); + + if (data_vio->flush_generation != 0) + snprintf(vio_flush_generation_buffer, + sizeof(vio_flush_generation_buffer), + " FG%llu", + data_vio->flush_generation); + else + vio_flush_generation_buffer[0] = 0; + + encode_vio_dump_flags(data_vio, flags_dump_buffer); + + uds_log_info(" vio %px %s%s %s %s%s", + data_vio, + vio_block_number_dump_buffer, + vio_flush_generation_buffer, + get_data_vio_operation_name(data_vio), + vio_completion_dump_buffer, + flags_dump_buffer); + /* + * might want info on: wantUDSAnswer / operation / status + * might want info on: bio / bios_merged + */ + + dump_vio_waiters(&data_vio->logical.waiters, "lbn"); + + /* might want to dump more info from vio here */ +} diff --git a/drivers/md/dm-vdo/dump.h b/drivers/md/dm-vdo/dump.h new file mode 100644 index 00000000000..74ade5eab07 --- /dev/null +++ b/drivers/md/dm-vdo/dump.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_DUMP_H +#define VDO_DUMP_H + +#include "types.h" + +int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why); + +void vdo_dump_all(struct vdo *vdo, const char *why); + +void dump_data_vio(void *data); + +#endif /* VDO_DUMP_H */ diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c new file mode 100644 index 00000000000..d4425686092 --- /dev/null +++ b/drivers/md/dm-vdo/encodings.c @@ -0,0 +1,1523 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "encodings.h" + +#include <linux/log2.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "constants.h" +#include "release-versions.h" +#include "status-codes.h" +#include "types.h" + +struct geometry_block { + char magic_number[VDO_GEOMETRY_MAGIC_NUMBER_SIZE]; + struct packed_header header; + u32 checksum; +} __packed; + +static const struct header GEOMETRY_BLOCK_HEADER_5_0 = { + .id = VDO_GEOMETRY_BLOCK, + .version = { + .major_version = 5, + .minor_version = 0, + }, + /* + * Note: this size isn't just the payload size following the header, like it is everywhere + * else in VDO. + */ + .size = sizeof(struct geometry_block) + sizeof(struct volume_geometry), +}; + +static const struct header GEOMETRY_BLOCK_HEADER_4_0 = { + .id = VDO_GEOMETRY_BLOCK, + .version = { + .major_version = 4, + .minor_version = 0, + }, + /* + * Note: this size isn't just the payload size following the header, like it is everywhere + * else in VDO. + */ + .size = sizeof(struct geometry_block) + sizeof(struct volume_geometry_4_0), +}; + +const u8 VDO_GEOMETRY_MAGIC_NUMBER[VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1] = "dmvdo001"; + +static const release_version_number_t COMPATIBLE_RELEASE_VERSIONS[] = { + VDO_MAGNESIUM_RELEASE_VERSION_NUMBER, + VDO_ALUMINUM_RELEASE_VERSION_NUMBER, +}; + +enum { + PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, +}; + +static const struct version_number BLOCK_MAP_4_1 = { + .major_version = 4, + .minor_version = 1, +}; + +const struct header VDO_BLOCK_MAP_HEADER_2_0 = { + .id = VDO_BLOCK_MAP, + .version = { + .major_version = 2, + .minor_version = 0, + }, + .size = sizeof(struct block_map_state_2_0), +}; + +const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0 = { + .id = VDO_RECOVERY_JOURNAL, + .version = { + .major_version = 7, + .minor_version = 0, + }, + .size = sizeof(struct recovery_journal_state_7_0), +}; + +const struct header VDO_SLAB_DEPOT_HEADER_2_0 = { + .id = VDO_SLAB_DEPOT, + .version = { + .major_version = 2, + .minor_version = 0, + }, + .size = sizeof(struct slab_depot_state_2_0), +}; + +const struct header VDO_LAYOUT_HEADER_3_0 = { + .id = VDO_LAYOUT, + .version = { + .major_version = 3, + .minor_version = 0, + }, + .size = sizeof(struct layout_3_0) + (sizeof(struct partition_3_0) * VDO_PARTITION_COUNT), +}; + +static const enum partition_id REQUIRED_PARTITIONS[] = { + VDO_BLOCK_MAP_PARTITION, + VDO_SLAB_DEPOT_PARTITION, + VDO_RECOVERY_JOURNAL_PARTITION, + VDO_SLAB_SUMMARY_PARTITION, +}; + +/* + * The current version for the data encoded in the super block. This must be changed any time there + * is a change to encoding of the component data of any VDO component. + */ +static const struct version_number VDO_COMPONENT_DATA_41_0 = { + .major_version = 41, + .minor_version = 0, +}; + +const struct version_number VDO_VOLUME_VERSION_67_0 = { + .major_version = 67, + .minor_version = 0, +}; + +static const struct header SUPER_BLOCK_HEADER_12_0 = { + .id = VDO_SUPER_BLOCK, + .version = { + .major_version = 12, + .minor_version = 0, + }, + + /* This is the minimum size, if the super block contains no components. */ + .size = VDO_SUPER_BLOCK_FIXED_SIZE - VDO_ENCODED_HEADER_SIZE, +}; + +/** + * validate_version() - Check whether a version matches an expected version. + * @expected_version: The expected version. + * @actual_version: The version being validated. + * @component_name: The name of the component or the calling function (for error logging). + * + * Logs an error describing a mismatch. + * + * Return: VDO_SUCCESS if the versions are the same, + * VDO_UNSUPPORTED_VERSION if the versions don't match. + */ +static int __must_check validate_version(struct version_number expected_version, + struct version_number actual_version, + const char *component_name) +{ + if (!vdo_are_same_version(expected_version, actual_version)) + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "%s version mismatch, expected %d.%d, got %d.%d", + component_name, + expected_version.major_version, + expected_version.minor_version, + actual_version.major_version, + actual_version.minor_version); + return VDO_SUCCESS; +} + +/** + * vdo_validate_header() - Check whether a header matches expectations. + * @expected_header: The expected header. + * @actual_header: The header being validated. + * @exact_size: If true, the size fields of the two headers must be the same, otherwise it is + * required that actual_header.size >= expected_header.size. + * @name: The name of the component or the calling function (for error logging). + * + * Logs an error describing the first mismatch found. + * + * Return: VDO_SUCCESS if the header meets expectations, + * VDO_INCORRECT_COMPONENT if the component ids don't match, + * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match. + */ +int vdo_validate_header(const struct header *expected_header, + const struct header *actual_header, + bool exact_size, + const char *name) +{ + int result; + + if (expected_header->id != actual_header->id) + return uds_log_error_strerror(VDO_INCORRECT_COMPONENT, + "%s ID mismatch, expected %d, got %d", + name, + expected_header->id, + actual_header->id); + + result = validate_version(expected_header->version, actual_header->version, name); + if (result != VDO_SUCCESS) + return result; + + if ((expected_header->size > actual_header->size) || + (exact_size && (expected_header->size < actual_header->size))) + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "%s size mismatch, expected %zu, got %zu", + name, + expected_header->size, + actual_header->size); + + return VDO_SUCCESS; +} + +static void encode_version_number(u8 *buffer, size_t *offset, struct version_number version) +{ + struct packed_version_number packed = vdo_pack_version_number(version); + + memcpy(buffer + *offset, &packed, sizeof(packed)); + *offset += sizeof(packed); +} + +void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header) +{ + struct packed_header packed = vdo_pack_header(header); + + memcpy(buffer + *offset, &packed, sizeof(packed)); + *offset += sizeof(packed); +} + +static void decode_version_number(u8 *buffer, size_t *offset, struct version_number *version) +{ + struct packed_version_number packed; + + memcpy(&packed, buffer + *offset, sizeof(packed)); + *offset += sizeof(packed); + *version = vdo_unpack_version_number(packed); +} + +void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header) +{ + struct packed_header packed; + + memcpy(&packed, buffer + *offset, sizeof(packed)); + *offset += sizeof(packed); + + *header = vdo_unpack_header(&packed); +} + +/** + * is_loadable_release_version() - Determine whether the supplied release version can be understood + * by the VDO code. + * @version: The release version number to check. + * + * Return: True if the given version can be loaded. + */ +static inline bool is_loadable_release_version(release_version_number_t version) +{ + unsigned int i; + + if (version == VDO_CURRENT_RELEASE_VERSION_NUMBER) + return true; + + for (i = 0; i < ARRAY_SIZE(COMPATIBLE_RELEASE_VERSIONS); i++) + if (version == COMPATIBLE_RELEASE_VERSIONS[i]) + return true; + + return false; +} + +/** + * decode_volume_geometry() - Decode the on-disk representation of a volume geometry from a buffer. + * @buffer: A buffer to decode from. + * @offset: The offset in the buffer at which to decode. + * @geometry: The structure to receive the decoded fields. + * @version: The geometry block version to decode. + */ +static void +decode_volume_geometry(u8 *buffer, size_t *offset, struct volume_geometry *geometry, u32 version) +{ + release_version_number_t release_version; + enum volume_region_id id; + nonce_t nonce; + block_count_t bio_offset = 0; + u32 mem; + bool sparse; + + decode_u32_le(buffer, offset, &release_version); + decode_u64_le(buffer, offset, &nonce); + geometry->release_version = release_version; + geometry->nonce = nonce; + + memcpy((unsigned char *) &geometry->uuid, buffer + *offset, sizeof(uuid_t)); + *offset += sizeof(uuid_t); + + if (version > 4) + decode_u64_le(buffer, offset, &bio_offset); + geometry->bio_offset = bio_offset; + + for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) { + physical_block_number_t start_block; + enum volume_region_id saved_id; + + decode_u32_le(buffer, offset, &saved_id); + decode_u64_le(buffer, offset, &start_block); + + geometry->regions[id] = (struct volume_region) { + .id = saved_id, + .start_block = start_block, + }; + } + + decode_u32_le(buffer, offset, &mem); + *offset += sizeof(u32); + sparse = buffer[(*offset)++]; + + geometry->index_config = (struct index_config) { + .mem = mem, + .sparse = sparse, + }; +} + +/** + * vdo_parse_geometry_block() - Decode and validate an encoded geometry block. + * @block: The encoded geometry block. + * @geometry: The structure to receive the decoded fields. + */ +int __must_check vdo_parse_geometry_block(u8 *block, struct volume_geometry *geometry) +{ + u32 checksum, saved_checksum; + struct header header; + size_t offset = 0; + int result; + + if (memcmp(block, VDO_GEOMETRY_MAGIC_NUMBER, VDO_GEOMETRY_MAGIC_NUMBER_SIZE) != 0) + return VDO_BAD_MAGIC; + offset += VDO_GEOMETRY_MAGIC_NUMBER_SIZE; + + vdo_decode_header(block, &offset, &header); + if (header.version.major_version <= 4) + result = vdo_validate_header(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__); + else + result = vdo_validate_header(&GEOMETRY_BLOCK_HEADER_5_0, &header, true, __func__); + if (result != VDO_SUCCESS) + return result; + + decode_volume_geometry(block, &offset, geometry, header.version.major_version); + + result = ASSERT(header.size == offset + sizeof(u32), + "should have decoded up to the geometry checksum"); + if (result != VDO_SUCCESS) + return result; + + /* Decode and verify the checksum. */ + checksum = vdo_crc32(block, offset); + decode_u32_le(block, &offset, &saved_checksum); + + if (!is_loadable_release_version(geometry->release_version)) + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "release version %d cannot be loaded", + geometry->release_version); + + return ((checksum == saved_checksum) ? VDO_SUCCESS : VDO_CHECKSUM_MISMATCH); +} + +struct block_map_page *vdo_format_block_map_page(void *buffer, + nonce_t nonce, + physical_block_number_t pbn, + bool initialized) +{ + struct block_map_page *page = (struct block_map_page *) buffer; + + memset(buffer, 0, VDO_BLOCK_SIZE); + page->version = vdo_pack_version_number(BLOCK_MAP_4_1); + page->header.nonce = __cpu_to_le64(nonce); + page->header.pbn = __cpu_to_le64(pbn); + page->header.initialized = initialized; + return page; +} + +enum block_map_page_validity +vdo_validate_block_map_page(struct block_map_page *page, + nonce_t nonce, + physical_block_number_t pbn) +{ + STATIC_ASSERT_SIZEOF(struct block_map_page_header, PAGE_HEADER_4_1_SIZE); + + if (!vdo_are_same_version(BLOCK_MAP_4_1, vdo_unpack_version_number(page->version)) || + !page->header.initialized || + (nonce != __le64_to_cpu(page->header.nonce))) + return VDO_BLOCK_MAP_PAGE_INVALID; + + if (pbn != vdo_get_block_map_page_pbn(page)) + return VDO_BLOCK_MAP_PAGE_BAD; + + return VDO_BLOCK_MAP_PAGE_VALID; +} + +static int +decode_block_map_state_2_0(u8 *buffer, size_t *offset, struct block_map_state_2_0 *state) +{ + size_t initial_offset; + block_count_t flat_page_count, root_count; + physical_block_number_t flat_page_origin, root_origin; + struct header header; + int result; + + vdo_decode_header(buffer, offset, &header); + result = vdo_validate_header(&VDO_BLOCK_MAP_HEADER_2_0, &header, true, __func__); + if (result != VDO_SUCCESS) + return result; + + initial_offset = *offset; + + decode_u64_le(buffer, offset, &flat_page_origin); + result = ASSERT(flat_page_origin == VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, + "Flat page origin must be %u (recorded as %llu)", + VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, + (unsigned long long) state->flat_page_origin); + if (result != UDS_SUCCESS) + return result; + + decode_u64_le(buffer, offset, &flat_page_count); + result = ASSERT(flat_page_count == 0, + "Flat page count must be 0 (recorded as %llu)", + (unsigned long long) state->flat_page_count); + if (result != UDS_SUCCESS) + return result; + + decode_u64_le(buffer, offset, &root_origin); + decode_u64_le(buffer, offset, &root_count); + + result = ASSERT(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset, + "decoded block map component size must match header size"); + if (result != VDO_SUCCESS) + return result; + + *state = (struct block_map_state_2_0) { + .flat_page_origin = flat_page_origin, + .flat_page_count = flat_page_count, + .root_origin = root_origin, + .root_count = root_count, + }; + + return VDO_SUCCESS; +} + +static void +encode_block_map_state_2_0(u8 *buffer, size_t *offset, struct block_map_state_2_0 state) +{ + size_t initial_offset; + + vdo_encode_header(buffer, offset, &VDO_BLOCK_MAP_HEADER_2_0); + + initial_offset = *offset; + encode_u64_le(buffer, offset, state.flat_page_origin); + encode_u64_le(buffer, offset, state.flat_page_count); + encode_u64_le(buffer, offset, state.root_origin); + encode_u64_le(buffer, offset, state.root_count); + + ASSERT_LOG_ONLY(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset, + "encoded block map component size must match header size"); +} + +/** + * vdo_compute_new_forest_pages() - Compute the number of pages which must be allocated at each + * level in order to grow the forest to a new number of entries. + * @entries: The new number of entries the block map must address. + * + * Return: The total number of non-leaf pages required. + */ +block_count_t vdo_compute_new_forest_pages(root_count_t root_count, + struct boundary *old_sizes, + block_count_t entries, + struct boundary *new_sizes) +{ + page_count_t leaf_pages = max(vdo_compute_block_map_page_count(entries), 1U); + page_count_t level_size = DIV_ROUND_UP(leaf_pages, root_count); + block_count_t total_pages = 0; + height_t height; + + for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) { + block_count_t new_pages; + + level_size = DIV_ROUND_UP(level_size, VDO_BLOCK_MAP_ENTRIES_PER_PAGE); + new_sizes->levels[height] = level_size; + new_pages = level_size; + if (old_sizes != NULL) + new_pages -= old_sizes->levels[height]; + total_pages += (new_pages * root_count); + } + + return total_pages; +} + +/** + * encode_recovery_journal_state_7_0() - Encode the state of a recovery journal. + * + * Return: VDO_SUCCESS or an error code. + */ +static void +encode_recovery_journal_state_7_0(u8 *buffer, + size_t *offset, + struct recovery_journal_state_7_0 state) +{ + size_t initial_offset; + + vdo_encode_header(buffer, offset, &VDO_RECOVERY_JOURNAL_HEADER_7_0); + + initial_offset = *offset; + encode_u64_le(buffer, offset, state.journal_start); + encode_u64_le(buffer, offset, state.logical_blocks_used); + encode_u64_le(buffer, offset, state.block_map_data_blocks); + + ASSERT_LOG_ONLY(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset, + "encoded recovery journal component size must match header size"); +} + +/** + * decode_recovery_journal_state_7_0() - Decode the state of a recovery journal saved in a buffer. + * @buffer: The buffer containing the saved state. + * @state: A pointer to a recovery journal state to hold the result of a successful decode. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check +decode_recovery_journal_state_7_0(u8 *buffer, + size_t *offset, + struct recovery_journal_state_7_0 *state) +{ + struct header header; + int result; + size_t initial_offset; + sequence_number_t journal_start; + block_count_t logical_blocks_used, block_map_data_blocks; + + vdo_decode_header(buffer, offset, &header); + result = vdo_validate_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, &header, true, __func__); + if (result != VDO_SUCCESS) + return result; + + initial_offset = *offset; + decode_u64_le(buffer, offset, &journal_start); + decode_u64_le(buffer, offset, &logical_blocks_used); + decode_u64_le(buffer, offset, &block_map_data_blocks); + + result = ASSERT(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset, + "decoded recovery journal component size must match header size"); + if (result != UDS_SUCCESS) + return result; + + *state = (struct recovery_journal_state_7_0) { + .journal_start = journal_start, + .logical_blocks_used = logical_blocks_used, + .block_map_data_blocks = block_map_data_blocks, + }; + + return VDO_SUCCESS; +} + +/** + * vdo_get_journal_operation_name() - Get the name of a journal operation. + * @operation: The operation to name. + * + * Return: The name of the operation. + */ +const char *vdo_get_journal_operation_name(enum journal_operation operation) +{ + switch (operation) { + case VDO_JOURNAL_DATA_REMAPPING: + return "data remapping"; + + case VDO_JOURNAL_BLOCK_MAP_REMAPPING: + return "block map remapping"; + + default: + return "unknown journal operation"; + } +} + +/** + * encode_slab_depot_state_2_0() - Encode the state of a slab depot into a buffer. + * + * Return: UDS_SUCCESS or an error. + */ +static void +encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 state) +{ + size_t initial_offset; + + vdo_encode_header(buffer, offset, &VDO_SLAB_DEPOT_HEADER_2_0); + + initial_offset = *offset; + encode_u64_le(buffer, offset, state.slab_config.slab_blocks); + encode_u64_le(buffer, offset, state.slab_config.data_blocks); + encode_u64_le(buffer, offset, state.slab_config.reference_count_blocks); + encode_u64_le(buffer, offset, state.slab_config.slab_journal_blocks); + encode_u64_le(buffer, offset, state.slab_config.slab_journal_flushing_threshold); + encode_u64_le(buffer, offset, state.slab_config.slab_journal_blocking_threshold); + encode_u64_le(buffer, offset, state.slab_config.slab_journal_scrubbing_threshold); + encode_u64_le(buffer, offset, state.first_block); + encode_u64_le(buffer, offset, state.last_block); + buffer[(*offset)++] = state.zone_count; + + ASSERT_LOG_ONLY(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset, + "encoded block map component size must match header size"); +} + +/** + * decode_slab_depot_state_2_0() - Decode slab depot component state version 2.0 from a buffer. + * + * Return: UDS_SUCCESS or an error code. + */ +static int +decode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 *state) +{ + struct header header; + int result; + size_t initial_offset; + struct slab_config slab_config; + block_count_t count; + physical_block_number_t first_block, last_block; + zone_count_t zone_count; + + vdo_decode_header(buffer, offset, &header); + result = vdo_validate_header(&VDO_SLAB_DEPOT_HEADER_2_0, &header, true, __func__); + if (result != VDO_SUCCESS) + return result; + + initial_offset = *offset; + decode_u64_le(buffer, offset, &count); + slab_config.slab_blocks = count; + + decode_u64_le(buffer, offset, &count); + slab_config.data_blocks = count; + + decode_u64_le(buffer, offset, &count); + slab_config.reference_count_blocks = count; + + decode_u64_le(buffer, offset, &count); + slab_config.slab_journal_blocks = count; + + decode_u64_le(buffer, offset, &count); + slab_config.slab_journal_flushing_threshold = count; + + decode_u64_le(buffer, offset, &count); + slab_config.slab_journal_blocking_threshold = count; + + decode_u64_le(buffer, offset, &count); + slab_config.slab_journal_scrubbing_threshold = count; + + decode_u64_le(buffer, offset, &first_block); + decode_u64_le(buffer, offset, &last_block); + zone_count = buffer[(*offset)++]; + + result = ASSERT(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset, + "decoded slab depot component size must match header size"); + if (result != UDS_SUCCESS) + return result; + + *state = (struct slab_depot_state_2_0) { + .slab_config = slab_config, + .first_block = first_block, + .last_block = last_block, + .zone_count = zone_count, + }; + + return VDO_SUCCESS; +} + +/** + * vdo_configure_slab_depot() - Configure the slab depot. + * @partition: The slab depot partition + * @slab_config: The configuration of a single slab. + * @zone_count: The number of zones the depot will use. + * @state: The state structure to be configured. + * + * Configures the slab_depot for the specified storage capacity, finding the number of data blocks + * that will fit and still leave room for the depot metadata, then return the saved state for that + * configuration. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_configure_slab_depot(const struct partition *partition, + struct slab_config slab_config, + zone_count_t zone_count, + struct slab_depot_state_2_0 *state) +{ + block_count_t total_slab_blocks, total_data_blocks; + size_t slab_count; + physical_block_number_t last_block; + block_count_t slab_size = slab_config.slab_blocks; + + uds_log_debug("slabDepot %s(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)", + __func__, + (unsigned long long) partition->count, + (unsigned long long) partition->offset, + (unsigned long long) slab_size, + zone_count); + + /* We do not allow runt slabs, so we waste up to a slab's worth. */ + slab_count = (partition->count / slab_size); + if (slab_count == 0) + return VDO_NO_SPACE; + + if (slab_count > MAX_VDO_SLABS) + return VDO_TOO_MANY_SLABS; + + total_slab_blocks = slab_count * slab_config.slab_blocks; + total_data_blocks = slab_count * slab_config.data_blocks; + last_block = partition->offset + total_slab_blocks; + + *state = (struct slab_depot_state_2_0) { + .slab_config = slab_config, + .first_block = partition->offset, + .last_block = last_block, + .zone_count = zone_count, + }; + + uds_log_debug("slab_depot last_block=%llu, total_data_blocks=%llu, slab_count=%zu, left_over=%llu", + (unsigned long long) last_block, + (unsigned long long) total_data_blocks, + slab_count, + (unsigned long long) (partition->count - (last_block - partition->offset))); + + return VDO_SUCCESS; +} + +/** + * vdo_configure_slab() - Measure and initialize the configuration to use for each slab. + * @slab_size: The number of blocks per slab. + * @slab_journal_blocks: The number of blocks for the slab journal. + * @slab_config: The slab configuration to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_configure_slab(block_count_t slab_size, + block_count_t slab_journal_blocks, + struct slab_config *slab_config) +{ + block_count_t ref_blocks, meta_blocks, data_blocks; + block_count_t flushing_threshold, remaining, blocking_threshold; + block_count_t minimal_extra_space, scrubbing_threshold; + + if (slab_journal_blocks >= slab_size) + return VDO_BAD_CONFIGURATION; + + /* + * This calculation should technically be a recurrence, but the total number of metadata + * blocks is currently less than a single block of ref_counts, so we'd gain at most one + * data block in each slab with more iteration. + */ + ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); + meta_blocks = (ref_blocks + slab_journal_blocks); + + /* Make sure test code hasn't configured slabs to be too small. */ + if (meta_blocks >= slab_size) + return VDO_BAD_CONFIGURATION; + + /* + * If the slab size is very small, assume this must be a unit test and override the number + * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their + * data_blocks fields to be the exact capacity of the configured volume, and that used to + * fall out since they use a power of two for the number of data blocks, the slab size was + * a power of two, and every block in a slab was a data block. + * + * TODO: Try to figure out some way of structuring testParameters and unit tests so this + * hack isn't needed without having to edit several unit tests every time the metadata size + * changes by one block. + */ + data_blocks = slab_size - meta_blocks; + if ((slab_size < 1024) && !is_power_of_2(data_blocks)) + data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); + + /* + * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in + * production, or 3/4ths, so we use this ratio for all sizes. + */ + flushing_threshold = ((slab_journal_blocks * 3) + 3) / 4; + /* + * The blocking threshold should be far enough from the flushing threshold to not produce + * delays, but far enough from the end of the journal to allow multiple successive recovery + * failures. + */ + remaining = slab_journal_blocks - flushing_threshold; + blocking_threshold = flushing_threshold + ((remaining * 5) / 7); + /* The scrubbing threshold should be at least 2048 entries before the end of the journal. */ + minimal_extra_space = 1 + (MAXIMUM_VDO_USER_VIOS / VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK); + scrubbing_threshold = blocking_threshold; + if (slab_journal_blocks > minimal_extra_space) + scrubbing_threshold = slab_journal_blocks - minimal_extra_space; + if (blocking_threshold > scrubbing_threshold) + blocking_threshold = scrubbing_threshold; + + *slab_config = (struct slab_config) { + .slab_blocks = slab_size, + .data_blocks = data_blocks, + .reference_count_blocks = ref_blocks, + .slab_journal_blocks = slab_journal_blocks, + .slab_journal_flushing_threshold = flushing_threshold, + .slab_journal_blocking_threshold = blocking_threshold, + .slab_journal_scrubbing_threshold = scrubbing_threshold}; + return VDO_SUCCESS; +} + +/** + * vdo_decode_slab_journal_entry() - Decode a slab journal entry. + * @block: The journal block holding the entry. + * @entry_count: The number of the entry. + * + * Return: The decoded entry. + */ +struct slab_journal_entry +vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block, + journal_entry_count_t entry_count) +{ + struct slab_journal_entry entry = + vdo_unpack_slab_journal_entry(&block->payload.entries[entry_count]); + + if (block->header.has_block_map_increments && + ((block->payload.full_entries.entry_types[entry_count / 8] & + ((u8)1 << (entry_count % 8))) != 0)) + entry.operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING; + + return entry; +} + +/** + * allocate_partition() - Allocate a partition and add it to a layout. + * @layout: The layout containing the partition. + * @id: The id of the partition. + * @offset: The offset into the layout at which the partition begins. + * @size: The size of the partition in blocks. + * + * Return: VDO_SUCCESS or an error. + */ +static int allocate_partition(struct layout *layout, + u8 id, + physical_block_number_t offset, + block_count_t size) +{ + struct partition *partition; + int result; + + result = UDS_ALLOCATE(1, struct partition, __func__, &partition); + if (result != UDS_SUCCESS) + return result; + + partition->id = id; + partition->offset = offset; + partition->count = size; + partition->next = layout->head; + layout->head = partition; + + return VDO_SUCCESS; +} + +/** + * make_partition() - Create a new partition from the beginning or end of the unused space in a + * layout. + * @layout: The layout. + * @id: The id of the partition to make. + * @size: The number of blocks to carve out; if 0, all remaining space will be used. + * @beginning: True if the partition should start at the beginning of the unused space. + * + * Return: A success or error code, particularly VDO_NO_SPACE if there are fewer than size blocks + * remaining. + */ +static int __must_check +make_partition(struct layout *layout, + enum partition_id id, + block_count_t size, + bool beginning) +{ + int result; + physical_block_number_t offset; + block_count_t free_blocks = layout->last_free - layout->first_free; + + if (size == 0) { + if (free_blocks == 0) + return VDO_NO_SPACE; + size = free_blocks; + } else if (size > free_blocks) { + return VDO_NO_SPACE; + } + + result = vdo_get_partition(layout, id, NULL); + if (result != VDO_UNKNOWN_PARTITION) + return VDO_PARTITION_EXISTS; + + offset = beginning ? layout->first_free : (layout->last_free - size); + + result = allocate_partition(layout, id, offset, size); + if (result != VDO_SUCCESS) + return result; + + layout->num_partitions++; + if (beginning) + layout->first_free += size; + else + layout->last_free = layout->last_free - size; + + return VDO_SUCCESS; +} + +/** + * vdo_initialize_layout() - Lay out the partitions of a vdo. + * @size: The entire size of the vdo. + * @origin: The start of the layout on the underlying storage in blocks. + * @block_map_blocks: The size of the block map partition. + * @journal_blocks: The size of the journal partition. + * @summary_blocks: The size of the slab summary partition. + * @layout: The layout to initialize. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_initialize_layout(block_count_t size, + physical_block_number_t offset, + block_count_t block_map_blocks, + block_count_t journal_blocks, + block_count_t summary_blocks, + struct layout *layout) +{ + int result; + block_count_t necessary_size = + (offset + block_map_blocks + journal_blocks + summary_blocks); + + if (necessary_size > size) + return uds_log_error_strerror(VDO_NO_SPACE, "Not enough space to make a VDO"); + + *layout = (struct layout) { + .start = offset, + .size = size, + .first_free = offset, + .last_free = size, + .num_partitions = 0, + .head = NULL, + }; + + result = make_partition(layout, VDO_BLOCK_MAP_PARTITION, block_map_blocks, true); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(layout); + return result; + } + + result = make_partition(layout, VDO_SLAB_SUMMARY_PARTITION, summary_blocks, false); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(layout); + return result; + } + + result = make_partition(layout, VDO_RECOVERY_JOURNAL_PARTITION, journal_blocks, false); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(layout); + return result; + } + + result = make_partition(layout, VDO_SLAB_DEPOT_PARTITION, 0, true); + if (result != VDO_SUCCESS) + vdo_uninitialize_layout(layout); + + return result; +} + +/** + * vdo_uninitialize_layout() - Clean up a layout. + * @layout: The layout to clean up. + * + * All partitions created by this layout become invalid pointers. + */ +void vdo_uninitialize_layout(struct layout *layout) +{ + while (layout->head != NULL) { + struct partition *part = layout->head; + + layout->head = part->next; + UDS_FREE(part); + } + + memset(layout, 0, sizeof(struct layout)); +} + +/** + * vdo_get_partition() - Get a partition by id. + * @layout: The layout from which to get a partition. + * @id: The id of the partition. + * @partition_ptr: A pointer to hold the partition. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_get_partition(struct layout *layout, + enum partition_id id, + struct partition **partition_ptr) +{ + struct partition *partition; + + for (partition = layout->head; partition != NULL; partition = partition->next) { + if (partition->id == id) { + if (partition_ptr != NULL) + *partition_ptr = partition; + return VDO_SUCCESS; + } + } + + return VDO_UNKNOWN_PARTITION; +} + +/** + * vdo_get_known_partition() - Get a partition by id from a validated layout. + * @layout: The layout from which to get a partition. + * @id: The id of the partition. + * + * Return: the partition + */ +struct partition *vdo_get_known_partition(struct layout *layout, enum partition_id id) +{ + struct partition *partition; + int result = vdo_get_partition(layout, id, &partition); + + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "layout has expected partition: %u", id); + + return partition; +} + +static void encode_layout(u8 *buffer, size_t *offset, const struct layout *layout) +{ + const struct partition *partition; + size_t initial_offset; + struct header header = VDO_LAYOUT_HEADER_3_0; + + STATIC_ASSERT_SIZEOF(enum partition_id, sizeof(u8)); + ASSERT_LOG_ONLY(layout->num_partitions <= U8_MAX, + "layout partition count must fit in a byte"); + + vdo_encode_header(buffer, offset, &header); + + initial_offset = *offset; + encode_u64_le(buffer, offset, layout->first_free); + encode_u64_le(buffer, offset, layout->last_free); + buffer[(*offset)++] = layout->num_partitions; + + ASSERT_LOG_ONLY(sizeof(struct layout_3_0) == *offset - initial_offset, + "encoded size of a layout header must match structure"); + + for (partition = layout->head; partition != NULL; partition = partition->next) { + buffer[(*offset)++] = partition->id; + encode_u64_le(buffer, offset, partition->offset); + /* This field only exists for backwards compatability */ + encode_u64_le(buffer, offset, 0); + encode_u64_le(buffer, offset, partition->count); + } + + ASSERT_LOG_ONLY(header.size == *offset - initial_offset, + "encoded size of a layout must match header size"); +} + +static int +decode_layout(u8 *buffer, + size_t *offset, + physical_block_number_t start, + block_count_t size, + struct layout *layout) +{ + struct header header; + struct layout_3_0 layout_header; + struct partition *partition; + size_t initial_offset; + physical_block_number_t first_free, last_free; + u8 partition_count; + u8 i; + int result; + + vdo_decode_header(buffer, offset, &header); + /* Layout is variable size, so only do a minimum size check here. */ + result = vdo_validate_header(&VDO_LAYOUT_HEADER_3_0, &header, false, __func__); + if (result != VDO_SUCCESS) + return result; + + initial_offset = *offset; + decode_u64_le(buffer, offset, &first_free); + decode_u64_le(buffer, offset, &last_free); + partition_count = buffer[(*offset)++]; + layout_header = (struct layout_3_0) { + .first_free = first_free, + .last_free = last_free, + .partition_count = partition_count, + }; + + result = ASSERT(sizeof(struct layout_3_0) == *offset - initial_offset, + "decoded size of a layout header must match structure"); + if (result != VDO_SUCCESS) + return result; + + layout->start = start; + layout->size = size; + layout->first_free = layout_header.first_free; + layout->last_free = layout_header.last_free; + layout->num_partitions = layout_header.partition_count; + + if (layout->num_partitions > VDO_PARTITION_COUNT) + return uds_log_error_strerror(VDO_UNKNOWN_PARTITION, + "layout has extra partitions"); + + for (i = 0; i < layout->num_partitions; i++) { + u8 id; + u64 partition_offset, count; + + id = buffer[(*offset)++]; + decode_u64_le(buffer, offset, &partition_offset); + *offset += sizeof(u64); + decode_u64_le(buffer, offset, &count); + + result = allocate_partition(layout, id, partition_offset, count); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(layout); + return result; + } + } + + /* Validate that the layout has all (and only) the required partitions */ + for (i = 0; i < VDO_PARTITION_COUNT; i++) { + result = vdo_get_partition(layout, REQUIRED_PARTITIONS[i], &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(layout); + return uds_log_error_strerror(result, + "layout is missing required partition %u", + REQUIRED_PARTITIONS[i]); + } + + start += partition->count; + } + + if (start != size) { + vdo_uninitialize_layout(layout); + return uds_log_error_strerror(UDS_BAD_STATE, "partitions do not cover the layout"); + } + + return VDO_SUCCESS; +} + +/** + * pack_vdo_config() - Convert a vdo_config to its packed on-disk representation. + * @config: The vdo config to convert. + * + * Return: The platform-independent representation of the config. + */ +static struct packed_vdo_config pack_vdo_config(struct vdo_config config) +{ + return (struct packed_vdo_config) { + .logical_blocks = __cpu_to_le64(config.logical_blocks), + .physical_blocks = __cpu_to_le64(config.physical_blocks), + .slab_size = __cpu_to_le64(config.slab_size), + .recovery_journal_size = __cpu_to_le64(config.recovery_journal_size), + .slab_journal_blocks = __cpu_to_le64(config.slab_journal_blocks), + }; +} + +/** + * pack_vdo_component() - Convert a vdo_component to its packed on-disk representation. + * @component: The VDO component data to convert. + * + * Return: The platform-independent representation of the component. + */ +static struct packed_vdo_component_41_0 pack_vdo_component(const struct vdo_component component) +{ + return (struct packed_vdo_component_41_0) { + .state = __cpu_to_le32(component.state), + .complete_recoveries = __cpu_to_le64(component.complete_recoveries), + .read_only_recoveries = __cpu_to_le64(component.read_only_recoveries), + .config = pack_vdo_config(component.config), + .nonce = __cpu_to_le64(component.nonce), + }; +} + +static void encode_vdo_component(u8 *buffer, size_t *offset, struct vdo_component component) +{ + struct packed_vdo_component_41_0 packed; + + encode_version_number(buffer, offset, VDO_COMPONENT_DATA_41_0); + packed = pack_vdo_component(component); + memcpy(buffer + *offset, &packed, sizeof(packed)); + *offset += sizeof(packed); +} + +/** + * unpack_vdo_config() - Convert a packed_vdo_config to its native in-memory representation. + * @config: The packed vdo config to convert. + * + * Return: The native in-memory representation of the vdo config. + */ +static struct vdo_config unpack_vdo_config(struct packed_vdo_config config) +{ + return (struct vdo_config) { + .logical_blocks = __le64_to_cpu(config.logical_blocks), + .physical_blocks = __le64_to_cpu(config.physical_blocks), + .slab_size = __le64_to_cpu(config.slab_size), + .recovery_journal_size = __le64_to_cpu(config.recovery_journal_size), + .slab_journal_blocks = __le64_to_cpu(config.slab_journal_blocks), + }; +} + +/** + * unpack_vdo_component_41_0() - Convert a packed_vdo_component_41_0 to its native in-memory + * representation. + * @component: The packed vdo component data to convert. + * + * Return: The native in-memory representation of the component. + */ +static struct vdo_component +unpack_vdo_component_41_0(struct packed_vdo_component_41_0 component) +{ + return (struct vdo_component) { + .state = __le32_to_cpu(component.state), + .complete_recoveries = __le64_to_cpu(component.complete_recoveries), + .read_only_recoveries = __le64_to_cpu(component.read_only_recoveries), + .config = unpack_vdo_config(component.config), + .nonce = __le64_to_cpu(component.nonce), + }; +} + +/** + * vdo_decode_component() - Decode the component data for the vdo itself out of the super block. + * + * Return: VDO_SUCCESS or an error. + */ +static int decode_vdo_component(u8 *buffer, size_t *offset, struct vdo_component *component) +{ + struct version_number version; + struct packed_vdo_component_41_0 packed; + int result; + + decode_version_number(buffer, offset, &version); + result = validate_version(version, VDO_COMPONENT_DATA_41_0, "VDO component data"); + if (result != VDO_SUCCESS) + return result; + + memcpy(&packed, buffer + *offset, sizeof(packed)); + *offset += sizeof(packed); + *component = unpack_vdo_component_41_0(packed); + return VDO_SUCCESS; +} + +/** + * vdo_validate_config() - Validate constraints on a VDO config. + * @config: The VDO config. + * @physical_block_count: The minimum block count of the underlying storage. + * @logical_block_count: The expected logical size of the VDO, or 0 if the logical size may be + * unspecified. + * + * Return: A success or error code. + */ +int vdo_validate_config(const struct vdo_config *config, + block_count_t physical_block_count, + block_count_t logical_block_count) +{ + struct slab_config slab_config; + int result; + + result = ASSERT(config->slab_size > 0, "slab size unspecified"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(is_power_of_2(config->slab_size), "slab size must be a power of two"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS), + "slab size must be less than or equal to 2^%d", + MAX_VDO_SLAB_BITS); + if (result != VDO_SUCCESS) + return result; + + result = ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, + "slab journal size meets minimum size"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(config->slab_journal_blocks <= config->slab_size, + "slab journal size is within expected bound"); + if (result != UDS_SUCCESS) + return result; + + result = vdo_configure_slab(config->slab_size, config->slab_journal_blocks, &slab_config); + if (result != VDO_SUCCESS) + return result; + + result = ASSERT((slab_config.data_blocks >= 1), + "slab must be able to hold at least one block"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(config->physical_blocks > 0, "physical blocks unspecified"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(config->physical_blocks <= MAXIMUM_VDO_PHYSICAL_BLOCKS, + "physical block count %llu exceeds maximum %llu", + (unsigned long long) config->physical_blocks, + (unsigned long long) MAXIMUM_VDO_PHYSICAL_BLOCKS); + if (result != UDS_SUCCESS) + return VDO_OUT_OF_RANGE; + + if (physical_block_count != config->physical_blocks) { + uds_log_error("A physical size of %llu blocks was specified, not the %llu blocks configured in the vdo super block", + (unsigned long long) physical_block_count, + (unsigned long long) config->physical_blocks); + return VDO_PARAMETER_MISMATCH; + } + + if (logical_block_count > 0) { + result = ASSERT((config->logical_blocks > 0), "logical blocks unspecified"); + if (result != UDS_SUCCESS) + return result; + + if (logical_block_count != config->logical_blocks) { + uds_log_error("A logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", + (unsigned long long) logical_block_count, + (unsigned long long) config->logical_blocks); + return VDO_PARAMETER_MISMATCH; + } + } + + result = ASSERT(config->logical_blocks <= MAXIMUM_VDO_LOGICAL_BLOCKS, + "logical blocks too large"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(config->recovery_journal_size > 0, "recovery journal size unspecified"); + if (result != UDS_SUCCESS) + return result; + + result = ASSERT(is_power_of_2(config->recovery_journal_size), + "recovery journal size must be a power of two"); + if (result != UDS_SUCCESS) + return result; + + return result; +} + +/** + * vdo_destroy_component_states() - Clean up any allocations in a vdo_component_states. + * @states: The component states to destroy. + */ +void vdo_destroy_component_states(struct vdo_component_states *states) +{ + if (states == NULL) + return; + + vdo_uninitialize_layout(&states->layout); +} + +/** + * decode_components() - Decode the components now that we know the component data is a version we + * understand. + * @buffer: The buffer being decoded. + * @offset: The offset to start decoding from. + * @geometry: The vdo geometry + * @states: An object to hold the successfully decoded state. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check decode_components(u8 *buffer, + size_t *offset, + struct volume_geometry *geometry, + struct vdo_component_states *states) +{ + int result; + + decode_vdo_component(buffer, offset, &states->vdo); + + result = decode_layout(buffer, + offset, + vdo_get_data_region_start(*geometry) + 1, + states->vdo.config.physical_blocks, + &states->layout); + if (result != VDO_SUCCESS) + return result; + + result = decode_recovery_journal_state_7_0(buffer, offset, &states->recovery_journal); + if (result != VDO_SUCCESS) + return result; + + result = decode_slab_depot_state_2_0(buffer, offset, &states->slab_depot); + if (result != VDO_SUCCESS) + return result; + + result = decode_block_map_state_2_0(buffer, offset, &states->block_map); + if (result != VDO_SUCCESS) + return result; + + ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE, + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/** + * vdo_decode_component_states() - Decode the payload of a super block. + * @buffer: The buffer containing the encoded super block contents. + * @geometry: The vdo geometry + * @states: A pointer to hold the decoded states. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_decode_component_states(u8 *buffer, + struct volume_geometry *geometry, + struct vdo_component_states *states) +{ + int result; + size_t offset = VDO_COMPONENT_DATA_OFFSET; + + /* Get and check the release version against the one from the geometry. */ + decode_u32_le(buffer, &offset, &states->release_version); + if (states->release_version != geometry->release_version) + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "Geometry release version %u does not match super block release version %u", + geometry->release_version, + states->release_version); + + /* Check the VDO volume version */ + decode_version_number(buffer, &offset, &states->volume_version); + result = validate_version(VDO_VOLUME_VERSION_67_0, states->volume_version, "volume"); + if (result != VDO_SUCCESS) + return result; + + result = decode_components(buffer, &offset, geometry, states); + if (result != VDO_SUCCESS) + vdo_uninitialize_layout(&states->layout); + + return result; +} + +/** + * vdo_validate_component_states() - Validate the decoded super block configuration. + * @states: The state decoded from the super block. + * @geometry_nonce: The nonce from the geometry block. + * @physical_size: The minimum block count of the underlying storage. + * @logical_size: The expected logical size of the VDO, or 0 if the logical size may be + * unspecified. + * + * Return: VDO_SUCCESS or an error if the configuration is invalid. + */ +int vdo_validate_component_states(struct vdo_component_states *states, + nonce_t geometry_nonce, + block_count_t physical_size, + block_count_t logical_size) +{ + if (geometry_nonce != states->vdo.nonce) + return uds_log_error_strerror(VDO_BAD_NONCE, + "Geometry nonce %llu does not match superblock nonce %llu", + (unsigned long long) geometry_nonce, + (unsigned long long) states->vdo.nonce); + + return vdo_validate_config(&states->vdo.config, physical_size, logical_size); +} + +/** + * vdo_encode_component_states() - Encode the state of all vdo components in the super block. + */ +static void +vdo_encode_component_states(u8 *buffer, size_t *offset, const struct vdo_component_states *states) +{ + encode_u32_le(buffer, offset, states->release_version); + encode_version_number(buffer, offset, states->volume_version); + encode_vdo_component(buffer, offset, states->vdo); + encode_layout(buffer, offset, &states->layout); + encode_recovery_journal_state_7_0(buffer, offset, states->recovery_journal); + encode_slab_depot_state_2_0(buffer, offset, states->slab_depot); + encode_block_map_state_2_0(buffer, offset, states->block_map); + + ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE, + "All super block component data was encoded"); +} + +/** + * vdo_encode_super_block() - Encode a super block into its on-disk representation. + */ +void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states) +{ + u32 checksum; + struct header header = SUPER_BLOCK_HEADER_12_0; + size_t offset = 0; + + header.size += VDO_COMPONENT_DATA_SIZE; + vdo_encode_header(buffer, &offset, &header); + vdo_encode_component_states(buffer, &offset, states); + + checksum = vdo_crc32(buffer, offset); + encode_u32_le(buffer, &offset, checksum); + + /* + * Even though the buffer is a full block, to avoid the potential corruption from a torn + * write, the entire encoding must fit in the first sector. + */ + ASSERT_LOG_ONLY(offset <= VDO_SECTOR_SIZE, "entire superblock must fit in one sector"); +} + +/** + * vdo_decode_super_block() - Decode a super block from its on-disk representation. + */ +int vdo_decode_super_block(u8 *buffer) +{ + struct header header; + int result; + u32 checksum, saved_checksum; + size_t offset = 0; + + /* Decode and validate the header. */ + vdo_decode_header(buffer, &offset, &header); + result = vdo_validate_header(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__); + if (result != VDO_SUCCESS) + return result; + + if (header.size > VDO_COMPONENT_DATA_SIZE + sizeof(u32)) + /* + * We can't check release version or checksum until we know the content size, so we + * have to assume a version mismatch on unexpected values. + */ + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "super block contents too large: %zu", + header.size); + + /* Skip past the component data for now, to verify the checksum. */ + offset += VDO_COMPONENT_DATA_SIZE; + + checksum = vdo_crc32(buffer, offset); + decode_u32_le(buffer, &offset, &saved_checksum); + + result = ASSERT(offset == VDO_SUPER_BLOCK_FIXED_SIZE + VDO_COMPONENT_DATA_SIZE, + "must have decoded entire superblock payload"); + if (result != VDO_SUCCESS) + return result; + + return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); +} diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h new file mode 100644 index 00000000000..8432112e928 --- /dev/null +++ b/drivers/md/dm-vdo/encodings.h @@ -0,0 +1,1307 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_ENCODINGS_H +#define VDO_ENCODINGS_H + +#include <linux/blk_types.h> +#include <linux/crc32.h> +#include <linux/limits.h> +#include <linux/uuid.h> + +#include "numeric.h" +#include "uds.h" + +#include "constants.h" +#include "types.h" + +/* + * An in-memory representation of a version number for versioned structures on disk. + * + * A version number consists of two portions, a major version and a minor version. Any format + * change which does not require an explicit upgrade step from the previous version should + * increment the minor version. Any format change which either requires an explicit upgrade step, + * or is wholly incompatible (i.e. can not be upgraded to), should increment the major version, and + * set the minor version to 0. + */ +struct version_number { + u32 major_version; + u32 minor_version; +}; + +/* + * A packed, machine-independent, on-disk representation of a version_number. Both fields are + * stored in little-endian byte order. + */ +struct packed_version_number { + __le32 major_version; + __le32 minor_version; +} __packed; + +/* The registry of component ids for use in headers */ +#define VDO_SUPER_BLOCK 0 +#define VDO_LAYOUT 1 +#define VDO_RECOVERY_JOURNAL 2 +#define VDO_SLAB_DEPOT 3 +#define VDO_BLOCK_MAP 4 +#define VDO_GEOMETRY_BLOCK 5 + +/* The header for versioned data stored on disk. */ +struct header { + u32 id; /* The component this is a header for */ + struct version_number version; /* The version of the data format */ + size_t size; /* The size of the data following this header */ +}; + +/* A packed, machine-independent, on-disk representation of a component header. */ +struct packed_header { + __le32 id; + struct packed_version_number version; + __le64 size; +} __packed; + +enum { + VDO_GEOMETRY_BLOCK_LOCATION = 0, + VDO_GEOMETRY_MAGIC_NUMBER_SIZE = 8, + VDO_DEFAULT_GEOMETRY_BLOCK_VERSION = 5, +}; + +struct index_config { + u32 mem; + u32 unused; + bool sparse; +} __packed; + +enum volume_region_id { + VDO_INDEX_REGION = 0, + VDO_DATA_REGION = 1, + VDO_VOLUME_REGION_COUNT, +}; + +struct volume_region { + /* The ID of the region */ + enum volume_region_id id; + /* + * The absolute starting offset on the device. The region continues until the next region + * begins. + */ + physical_block_number_t start_block; +} __packed; + +struct volume_geometry { + /* The release version number of this volume */ + release_version_number_t release_version; + /* The nonce of this volume */ + nonce_t nonce; + /* The uuid of this volume */ + uuid_t uuid; + /* The block offset to be applied to bios */ + block_count_t bio_offset; + /* The regions in ID order */ + struct volume_region regions[VDO_VOLUME_REGION_COUNT]; + /* The index config */ + struct index_config index_config; +} __packed; + +/* This volume geometry struct is used for sizing only */ +struct volume_geometry_4_0 { + /* The release version number of this volume */ + release_version_number_t release_version; + /* The nonce of this volume */ + nonce_t nonce; + /* The uuid of this volume */ + uuid_t uuid; + /* The regions in ID order */ + struct volume_region regions[VDO_VOLUME_REGION_COUNT]; + /* The index config */ + struct index_config index_config; +} __packed; + +extern const u8 VDO_GEOMETRY_MAGIC_NUMBER[VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1]; + +/** + * DOC: Block map entries + * + * The entry for each logical block in the block map is encoded into five bytes, which saves space + * in both the on-disk and in-memory layouts. It consists of the 36 low-order bits of a + * physical_block_number_t (addressing 256 terabytes with a 4KB block size) and a 4-bit encoding of + * a block_mapping_state. + * + * Of the 8 high bits of the 5-byte structure: + * + * Bits 7..4: The four highest bits of the 36-bit physical block number + * Bits 3..0: The 4-bit block_mapping_state + * + * The following 4 bytes are the low order bytes of the physical block number, in little-endian + * order. + * + * Conversion functions to and from a data location are provided. + */ +struct block_map_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned mapping_state : 4; + unsigned pbn_high_nibble : 4; +#else + unsigned pbn_high_nibble : 4; + unsigned mapping_state : 4; +#endif + + __le32 pbn_low_word; +} __packed; + +struct block_map_page_header { + __le64 nonce; + __le64 pbn; + + /** May be non-zero on disk */ + u8 unused_long_word[8]; + + /* Whether this page has been written twice to disk */ + bool initialized; + + /* Always zero on disk */ + u8 unused_byte1; + + /* May be non-zero on disk */ + u8 unused_byte2; + u8 unused_byte3; +} __packed; + +struct block_map_page { + struct packed_version_number version; + struct block_map_page_header header; + struct block_map_entry entries[]; +} __packed; + +enum block_map_page_validity { + VDO_BLOCK_MAP_PAGE_VALID, + VDO_BLOCK_MAP_PAGE_INVALID, + /* Valid page found in the wrong location on disk */ + VDO_BLOCK_MAP_PAGE_BAD, +}; + +struct block_map_state_2_0 { + physical_block_number_t flat_page_origin; + block_count_t flat_page_count; + physical_block_number_t root_origin; + block_count_t root_count; +} __packed; + +struct boundary { + page_number_t levels[VDO_BLOCK_MAP_TREE_HEIGHT]; +}; + +extern const struct header VDO_BLOCK_MAP_HEADER_2_0; + +/* The state of the recovery journal as encoded in the VDO super block. */ +struct recovery_journal_state_7_0 { + /** Sequence number to start the journal */ + sequence_number_t journal_start; + /** Number of logical blocks used by VDO */ + block_count_t logical_blocks_used; + /** Number of block map pages allocated */ + block_count_t block_map_data_blocks; +} __packed; + +extern const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0; + +typedef u16 journal_entry_count_t; + +/* + * A recovery journal entry stores three physical locations: a data location that is the value of a + * single mapping in the block map tree, and the two locations of the block map pages and slots + * that are acquiring and releasing a reference to the location. The journal entry also stores an + * operation code that says whether the mapping is for a logical block or for the block map tree + * itself. + */ +struct recovery_journal_entry { + struct block_map_slot slot; + struct data_location mapping; + struct data_location unmapping; + enum journal_operation operation; +}; + +/* The packed, on-disk representation of a recovery journal entry. */ +struct packed_recovery_journal_entry { + /* + * In little-endian bit order: + * Bits 15..12: The four highest bits of the 36-bit physical block number of the block map + * tree page + * Bits 11..2: The 10-bit block map page slot number + * Bit 1..0: The journal_operation of the entry (this actually only requires 1 bit, but + * it is convenient to keep the extra bit as part of this field. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned operation : 2; + unsigned slot_low : 6; + unsigned slot_high : 4; + unsigned pbn_high_nibble : 4; +#else + unsigned slot_low : 6; + unsigned operation : 2; + unsigned pbn_high_nibble : 4; + unsigned slot_high : 4; +#endif + + /* + * Bits 47..16: The 32 low-order bits of the block map page PBN, in little-endian byte + * order + */ + __le32 pbn_low_word; + + /* + * Bits 87..48: The five-byte block map entry encoding the location that will be stored in + * the block map page slot + */ + struct block_map_entry mapping; + + /* + * Bits 127..88: The five-byte block map entry encoding the location that was stored in the + * block map page slot + */ + struct block_map_entry unmapping; +} __packed; + +/* The packed, on-disk representation of an old format recovery journal entry. */ +struct packed_recovery_journal_entry_1 { + /* + * In little-endian bit order: + * Bits 15..12: The four highest bits of the 36-bit physical block number of the block map + * tree page + * Bits 11..2: The 10-bit block map page slot number + * Bits 1..0: The 2-bit journal_operation of the entry + * + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned operation : 2; + unsigned slot_low : 6; + unsigned slot_high : 4; + unsigned pbn_high_nibble : 4; +#else + unsigned slot_low : 6; + unsigned operation : 2; + unsigned pbn_high_nibble : 4; + unsigned slot_high : 4; +#endif + + /* + * Bits 47..16: The 32 low-order bits of the block map page PBN, in little-endian byte + * order + */ + __le32 pbn_low_word; + + /* + * Bits 87..48: The five-byte block map entry encoding the location that was or will be + * stored in the block map page slot + */ + struct block_map_entry block_map_entry; +} __packed; + +enum journal_operation_1 { + VDO_JOURNAL_DATA_DECREMENT = 0, + VDO_JOURNAL_DATA_INCREMENT = 1, + VDO_JOURNAL_BLOCK_MAP_DECREMENT = 2, + VDO_JOURNAL_BLOCK_MAP_INCREMENT = 3, +} __packed; + +struct recovery_block_header { + sequence_number_t block_map_head; /* Block map head sequence number */ + sequence_number_t slab_journal_head; /* Slab journal head seq. number */ + sequence_number_t sequence_number; /* Sequence number for this block */ + nonce_t nonce; /* A given VDO instance's nonce */ + block_count_t logical_blocks_used; /* Logical blocks in use */ + block_count_t block_map_data_blocks; /* Allocated block map pages */ + journal_entry_count_t entry_count; /* Number of entries written */ + u8 check_byte; /* The protection check byte */ + u8 recovery_count; /* Number of recoveries completed */ + enum vdo_metadata_type metadata_type; /* Metadata type */ +}; + +/* + * The packed, on-disk representation of a recovery journal block header. All fields are kept in + * little-endian byte order. + */ +struct packed_journal_header { + /* Block map head 64-bit sequence number */ + __le64 block_map_head; + + /* Slab journal head 64-bit sequence number */ + __le64 slab_journal_head; + + /* The 64-bit sequence number for this block */ + __le64 sequence_number; + + /* A given VDO instance's 64-bit nonce */ + __le64 nonce; + + /* 8-bit metadata type (should always be one for the recovery journal) */ + u8 metadata_type; + + /* 16-bit count of the entries encoded in the block */ + __le16 entry_count; + + /* 64-bit count of the logical blocks used when this block was opened */ + __le64 logical_blocks_used; + + /* 64-bit count of the block map blocks used when this block was opened */ + __le64 block_map_data_blocks; + + /* The protection check byte */ + u8 check_byte; + + /* The number of recoveries completed */ + u8 recovery_count; +} __packed; + +struct packed_journal_sector { + /* The protection check byte */ + u8 check_byte; + + /* The number of recoveries completed */ + u8 recovery_count; + + /* The number of entries in this sector */ + u8 entry_count; + + /* Journal entries for this sector */ + struct packed_recovery_journal_entry entries[]; +} __packed; + +enum { + /* The number of entries in each sector (except the last) when filled */ + RECOVERY_JOURNAL_ENTRIES_PER_SECTOR = + ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) / + sizeof(struct packed_recovery_journal_entry)), + RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR * 7, + /* The number of entries in a v1 recovery journal block. */ + RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK = 311, + /* The number of entries in each v1 sector (except the last) when filled */ + RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR = + ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) / + sizeof(struct packed_recovery_journal_entry_1)), + /* The number of entries in the last sector when a block is full */ + RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR = + (RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK % RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR), +}; + +/* A type representing a reference count of a block. */ +typedef u8 vdo_refcount_t; + +/* The absolute position of an entry in a recovery journal or slab journal. */ +struct journal_point { + sequence_number_t sequence_number; + journal_entry_count_t entry_count; +}; + +/* A packed, platform-independent encoding of a struct journal_point. */ +struct packed_journal_point { + /* + * The packed representation is the little-endian 64-bit representation of the low-order 48 + * bits of the sequence number, shifted up 16 bits, or'ed with the 16-bit entry count. + * + * Very long-term, the top 16 bits of the sequence number may not always be zero, as this + * encoding assumes--see BZ 1523240. + */ + __le64 encoded_point; +} __packed; + +/* Special vdo_refcount_t values. */ +#define EMPTY_REFERENCE_COUNT 0 +enum { + MAXIMUM_REFERENCE_COUNT = 254, + PROVISIONAL_REFERENCE_COUNT = 255, +}; + +enum { + COUNTS_PER_SECTOR = + ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_point)) / sizeof(vdo_refcount_t)), + COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * VDO_SECTORS_PER_BLOCK, +}; + +/* The format of each sector of a reference_block on disk. */ +struct packed_reference_sector { + struct packed_journal_point commit_point; + vdo_refcount_t counts[COUNTS_PER_SECTOR]; +} __packed; + +struct packed_reference_block { + struct packed_reference_sector sectors[VDO_SECTORS_PER_BLOCK]; +}; + +struct slab_depot_state_2_0 { + struct slab_config slab_config; + physical_block_number_t first_block; + physical_block_number_t last_block; + zone_count_t zone_count; +} __packed; + +extern const struct header VDO_SLAB_DEPOT_HEADER_2_0; + +/* + * vdo_slab journal blocks may have one of two formats, depending upon whether or not any of the + * entries in the block are block map increments. Since the steady state for a VDO is that all of + * the necessary block map pages will be allocated, most slab journal blocks will have only data + * entries. Such blocks can hold more entries, hence the two formats. + */ + +/* A single slab journal entry */ +struct slab_journal_entry { + slab_block_number sbn; + enum journal_operation operation; + bool increment; +}; + +/* A single slab journal entry in its on-disk form */ +typedef struct { + u8 offset_low8; + u8 offset_mid8; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned offset_high7 : 7; + unsigned increment : 1; +#else + unsigned increment : 1; + unsigned offset_high7 : 7; +#endif +} __packed packed_slab_journal_entry; + +/* The unpacked representation of the header of a slab journal block */ +struct slab_journal_block_header { + /* Sequence number for head of journal */ + sequence_number_t head; + /* Sequence number for this block */ + sequence_number_t sequence_number; + /* The nonce for a given VDO instance */ + nonce_t nonce; + /* Recovery journal point for last entry */ + struct journal_point recovery_point; + /* Metadata type */ + enum vdo_metadata_type metadata_type; + /* Whether this block contains block map increments */ + bool has_block_map_increments; + /* The number of entries in the block */ + journal_entry_count_t entry_count; +}; + +/* + * The packed, on-disk representation of a slab journal block header. All fields are kept in + * little-endian byte order. + */ +struct packed_slab_journal_block_header { + /* 64-bit sequence number for head of journal */ + __le64 head; + /* 64-bit sequence number for this block */ + __le64 sequence_number; + /* Recovery journal point for the last entry, packed into 64 bits */ + struct packed_journal_point recovery_point; + /* The 64-bit nonce for a given VDO instance */ + __le64 nonce; + /* 8-bit metadata type (should always be two, for the slab journal) */ + u8 metadata_type; + /* Whether this block contains block map increments */ + bool has_block_map_increments; + /* 16-bit count of the entries encoded in the block */ + __le16 entry_count; +} __packed; + +enum { + VDO_SLAB_JOURNAL_PAYLOAD_SIZE = + VDO_BLOCK_SIZE - sizeof(struct packed_slab_journal_block_header), + VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (VDO_SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25, + VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE = + ((VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) / 8) + 1, + VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK = + (VDO_SLAB_JOURNAL_PAYLOAD_SIZE / sizeof(packed_slab_journal_entry)), +}; + +/* The payload of a slab journal block which has block map increments */ +struct full_slab_journal_entries { + /* The entries themselves */ + packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK]; + /* The bit map indicating which entries are block map increments */ + u8 entry_types[VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE]; +} __packed; + +typedef union { + /* Entries which include block map increments */ + struct full_slab_journal_entries full_entries; + /* Entries which are only data updates */ + packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK]; + /* Ensure the payload fills to the end of the block */ + u8 space[VDO_SLAB_JOURNAL_PAYLOAD_SIZE]; +} __packed slab_journal_payload; + +struct packed_slab_journal_block { + struct packed_slab_journal_block_header header; + slab_journal_payload payload; +} __packed; + +/* The offset of a slab journal tail block. */ +typedef u8 tail_block_offset_t; + +struct slab_summary_entry { + /* Bits 7..0: The offset of the tail block within the slab journal */ + tail_block_offset_t tail_block_offset; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullness_hint : 6; + /* Bit 14: Whether the ref_counts must be loaded from the layer */ + unsigned int load_ref_counts : 1; + /* Bit 15: The believed cleanliness of this slab */ + unsigned int is_dirty : 1; +#else + /* Bit 15: The believed cleanliness of this slab */ + unsigned int is_dirty : 1; + /* Bit 14: Whether the ref_counts must be loaded from the layer */ + unsigned int load_ref_counts : 1; + /* Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullness_hint : 6; +#endif +} __packed; + +enum { + VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS = 6, + VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK = VDO_BLOCK_SIZE / sizeof(struct slab_summary_entry), + VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE = MAX_VDO_SLABS / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK, + VDO_SLAB_SUMMARY_BLOCKS = VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * MAX_VDO_PHYSICAL_ZONES, +}; + +struct layout { + physical_block_number_t start; + block_count_t size; + physical_block_number_t first_free; + physical_block_number_t last_free; + size_t num_partitions; + struct partition *head; +}; + +struct partition { + enum partition_id id; /* The id of this partition */ + physical_block_number_t offset; /* The offset into the layout of this partition */ + block_count_t count; /* The number of blocks in the partition */ + struct partition *next; /* A pointer to the next partition in the layout */ +}; + +struct layout_3_0 { + physical_block_number_t first_free; + physical_block_number_t last_free; + u8 partition_count; +} __packed; + +struct partition_3_0 { + enum partition_id id; + physical_block_number_t offset; + physical_block_number_t base; /* unused but retained for backwards compatability */ + block_count_t count; +} __packed; + +/* + * The configuration of the VDO service. + */ +struct vdo_config { + block_count_t logical_blocks; /* number of logical blocks */ + block_count_t physical_blocks; /* number of physical blocks */ + block_count_t slab_size; /* number of blocks in a slab */ + block_count_t recovery_journal_size; /* number of recovery journal blocks */ + block_count_t slab_journal_blocks; /* number of slab journal blocks */ +}; + +/* This is the structure that captures the vdo fields saved as a super block component. */ +struct vdo_component { + enum vdo_state state; + u64 complete_recoveries; + u64 read_only_recoveries; + struct vdo_config config; + nonce_t nonce; +}; + +/* + * A packed, machine-independent, on-disk representation of the vdo_config in the VDO component + * data in the super block. + */ +struct packed_vdo_config { + __le64 logical_blocks; + __le64 physical_blocks; + __le64 slab_size; + __le64 recovery_journal_size; + __le64 slab_journal_blocks; +} __packed; + +/* + * A packed, machine-independent, on-disk representation of version 41.0 of the VDO component data + * in the super block. + */ +struct packed_vdo_component_41_0 { + __le32 state; + __le64 complete_recoveries; + __le64 read_only_recoveries; + struct packed_vdo_config config; + __le64 nonce; +} __packed; + +/* + * The version of the on-disk format of a VDO volume. This should be incremented any time the + * on-disk representation of any VDO structure changes. Changes which require only online upgrade + * steps should increment the minor version. Changes which require an offline upgrade or which can + * not be upgraded to at all should increment the major version and set the minor version to 0. + */ +extern const struct version_number VDO_VOLUME_VERSION_67_0; + +enum { + VDO_ENCODED_HEADER_SIZE = sizeof(struct packed_header), + BLOCK_MAP_COMPONENT_ENCODED_SIZE = + VDO_ENCODED_HEADER_SIZE + sizeof(struct block_map_state_2_0), + RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE = + VDO_ENCODED_HEADER_SIZE + sizeof(struct recovery_journal_state_7_0), + SLAB_DEPOT_COMPONENT_ENCODED_SIZE = + VDO_ENCODED_HEADER_SIZE + sizeof(struct slab_depot_state_2_0), + VDO_PARTITION_COUNT = 4, + VDO_LAYOUT_ENCODED_SIZE = (VDO_ENCODED_HEADER_SIZE + + sizeof(struct layout_3_0) + + (sizeof(struct partition_3_0) * VDO_PARTITION_COUNT)), + VDO_SUPER_BLOCK_FIXED_SIZE = VDO_ENCODED_HEADER_SIZE + sizeof(u32), + VDO_MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - VDO_SUPER_BLOCK_FIXED_SIZE, + VDO_COMPONENT_ENCODED_SIZE = + (sizeof(struct packed_version_number) + sizeof(struct packed_vdo_component_41_0)), + VDO_COMPONENT_DATA_OFFSET = VDO_ENCODED_HEADER_SIZE, + VDO_COMPONENT_DATA_SIZE = (sizeof(release_version_number_t) + + sizeof(struct packed_version_number) + + VDO_COMPONENT_ENCODED_SIZE + + VDO_LAYOUT_ENCODED_SIZE + + RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE + + SLAB_DEPOT_COMPONENT_ENCODED_SIZE + + BLOCK_MAP_COMPONENT_ENCODED_SIZE), +}; + +/* The entirety of the component data encoded in the VDO super block. */ +struct vdo_component_states { + /* The release version */ + release_version_number_t release_version; + + /* The VDO volume version */ + struct version_number volume_version; + + /* Components */ + struct vdo_component vdo; + struct block_map_state_2_0 block_map; + struct recovery_journal_state_7_0 recovery_journal; + struct slab_depot_state_2_0 slab_depot; + + /* Our partitioning of the underlying storage */ + struct layout layout; +}; + +/** + * vdo_are_same_version() - Check whether two version numbers are the same. + * @version_a: The first version. + * @version_b: The second version. + * + * Return: true if the two versions are the same. + */ +static inline bool +vdo_are_same_version(struct version_number version_a, struct version_number version_b) +{ + return ((version_a.major_version == version_b.major_version) && + (version_a.minor_version == version_b.minor_version)); +} + +/** + * vdo_is_upgradable_version() - Check whether an actual version is upgradable to an expected + * version. + * @expected_version: The expected version. + * @actual_version: The version being validated. + * + * An actual version is upgradable if its major number is expected but its minor number differs, + * and the expected version's minor number is greater than the actual version's minor number. + * + * Return: true if the actual version is upgradable. + */ +static inline bool vdo_is_upgradable_version(struct version_number expected_version, + struct version_number actual_version) +{ + return ((expected_version.major_version == actual_version.major_version) && + (expected_version.minor_version > actual_version.minor_version)); +} + +int __must_check vdo_validate_header(const struct header *expected_header, + const struct header *actual_header, + bool exact_size, + const char *component_name); + +void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header); +void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header); + +/** + * vdo_pack_version_number() - Convert a version_number to its packed on-disk representation. + * @version: The version number to convert. + * + * Return: the platform-independent representation of the version + */ +static inline struct packed_version_number vdo_pack_version_number(struct version_number version) +{ + return (struct packed_version_number) { + .major_version = __cpu_to_le32(version.major_version), + .minor_version = __cpu_to_le32(version.minor_version), + }; +} + +/** + * vdo_unpack_version_number() - Convert a packed_version_number to its native in-memory + * representation. + * @version: The version number to convert. + * + * Return: The platform-independent representation of the version. + */ +static inline struct version_number vdo_unpack_version_number(struct packed_version_number version) +{ + return (struct version_number) { + .major_version = __le32_to_cpu(version.major_version), + .minor_version = __le32_to_cpu(version.minor_version), + }; +} + +/** + * vdo_pack_header() - Convert a component header to its packed on-disk representation. + * @header: The header to convert. + * + * Return: the platform-independent representation of the header + */ +static inline struct packed_header vdo_pack_header(const struct header *header) +{ + return (struct packed_header) { + .id = __cpu_to_le32(header->id), + .version = vdo_pack_version_number(header->version), + .size = __cpu_to_le64(header->size), + }; +} + +/** + * vdo_unpack_header() - Convert a packed_header to its native in-memory representation. + * @header: The header to convert. + * + * Return: The platform-independent representation of the version. + */ +static inline struct header vdo_unpack_header(const struct packed_header *header) +{ + return (struct header) { + .id = __le32_to_cpu(header->id), + .version = vdo_unpack_version_number(header->version), + .size = __le64_to_cpu(header->size), + }; +} + +/** + * vdo_get_index_region_start() - Get the start of the index region from a geometry. + * @geometry: The geometry. + * + * Return: The start of the index region. + */ +static inline physical_block_number_t __must_check +vdo_get_index_region_start(struct volume_geometry geometry) +{ + return geometry.regions[VDO_INDEX_REGION].start_block; +} + +/** + * vdo_get_data_region_start() - Get the start of the data region from a geometry. + * @geometry: The geometry. + * + * Return: The start of the data region. + */ +static inline physical_block_number_t __must_check +vdo_get_data_region_start(struct volume_geometry geometry) +{ + return geometry.regions[VDO_DATA_REGION].start_block; +} + +/** + * vdo_get_index_region_size() - Get the size of the index region from a geometry. + * @geometry: The geometry. + * + * Return: The size of the index region. + */ +static inline physical_block_number_t __must_check +vdo_get_index_region_size(struct volume_geometry geometry) +{ + return vdo_get_data_region_start(geometry) - + vdo_get_index_region_start(geometry); +} + +int __must_check vdo_parse_geometry_block(unsigned char *block, struct volume_geometry *geometry); + +static inline bool vdo_is_state_compressed(const enum block_mapping_state mapping_state) +{ + return (mapping_state > VDO_MAPPING_STATE_UNCOMPRESSED); +} + +static inline struct block_map_entry +vdo_pack_block_map_entry(physical_block_number_t pbn, enum block_mapping_state mapping_state) +{ + return (struct block_map_entry) { + .mapping_state = (mapping_state & 0x0F), + .pbn_high_nibble = ((pbn >> 32) & 0x0F), + .pbn_low_word = __cpu_to_le32(pbn & UINT_MAX), + }; +} + +static inline struct data_location vdo_unpack_block_map_entry(const struct block_map_entry *entry) +{ + physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word); + physical_block_number_t high4 = entry->pbn_high_nibble; + + return (struct data_location) { + .pbn = ((high4 << 32) | low32), + .state = entry->mapping_state, + }; +} + +static inline bool vdo_is_mapped_location(const struct data_location *location) +{ + return (location->state != VDO_MAPPING_STATE_UNMAPPED); +} + +static inline bool vdo_is_valid_location(const struct data_location *location) +{ + if (location->pbn == VDO_ZERO_BLOCK) + return !vdo_is_state_compressed(location->state); + else + return vdo_is_mapped_location(location); +} + +static inline physical_block_number_t __must_check +vdo_get_block_map_page_pbn(const struct block_map_page *page) +{ + return __le64_to_cpu(page->header.pbn); +} + +struct block_map_page *vdo_format_block_map_page(void *buffer, + nonce_t nonce, + physical_block_number_t pbn, + bool initialized); + +enum block_map_page_validity __must_check +vdo_validate_block_map_page(struct block_map_page *page, + nonce_t nonce, + physical_block_number_t pbn); + +static inline page_count_t vdo_compute_block_map_page_count(block_count_t entries) +{ + return DIV_ROUND_UP(entries, VDO_BLOCK_MAP_ENTRIES_PER_PAGE); +} + +block_count_t __must_check +vdo_compute_new_forest_pages(root_count_t root_count, + struct boundary *old_sizes, + block_count_t entries, + struct boundary *new_sizes); + +/** + * vdo_pack_recovery_journal_entry() - Return the packed, on-disk representation of a recovery + * journal entry. + * @entry: The journal entry to pack. + * + * Return: The packed representation of the journal entry. + */ +static inline struct packed_recovery_journal_entry +vdo_pack_recovery_journal_entry(const struct recovery_journal_entry *entry) +{ + return (struct packed_recovery_journal_entry) { + .operation = entry->operation, + .slot_low = entry->slot.slot & 0x3F, + .slot_high = (entry->slot.slot >> 6) & 0x0F, + .pbn_high_nibble = (entry->slot.pbn >> 32) & 0x0F, + .pbn_low_word = __cpu_to_le32(entry->slot.pbn & UINT_MAX), + .mapping = vdo_pack_block_map_entry(entry->mapping.pbn, entry->mapping.state), + .unmapping = vdo_pack_block_map_entry(entry->unmapping.pbn, + entry->unmapping.state), + }; +} + +/** + * vdo_unpack_recovery_journal_entry() - Unpack the on-disk representation of a recovery journal + * entry. + * @entry: The recovery journal entry to unpack. + * + * Return: The unpacked entry. + */ +static inline struct recovery_journal_entry +vdo_unpack_recovery_journal_entry(const struct packed_recovery_journal_entry *entry) +{ + physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word); + physical_block_number_t high4 = entry->pbn_high_nibble; + + return (struct recovery_journal_entry) { + .operation = entry->operation, + .slot = { + .pbn = ((high4 << 32) | low32), + .slot = (entry->slot_low | (entry->slot_high << 6)), + }, + .mapping = vdo_unpack_block_map_entry(&entry->mapping), + .unmapping = vdo_unpack_block_map_entry(&entry->unmapping), + }; +} + +const char * __must_check vdo_get_journal_operation_name(enum journal_operation operation); + +/** + * vdo_is_valid_recovery_journal_sector() - Determine whether the header of the given sector could + * describe a valid sector for the given journal block + * header. + * @header: The unpacked block header to compare against. + * @sector: The packed sector to check. + * @sector_number: The number of the sector being checked. + * + * Return: true if the sector matches the block header. + */ +static inline bool __must_check +vdo_is_valid_recovery_journal_sector(const struct recovery_block_header *header, + const struct packed_journal_sector *sector, + u8 sector_number) +{ + if ((header->check_byte != sector->check_byte) || + (header->recovery_count != sector->recovery_count)) + return false; + + if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2) + return sector->entry_count <= RECOVERY_JOURNAL_ENTRIES_PER_SECTOR; + + if (sector_number == 7) + return sector->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR; + + return sector->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR; +} + +/** + * vdo_compute_recovery_journal_block_number() - Compute the physical block number of the recovery + * journal block which would have a given sequence + * number. + * @journal_size: The size of the journal. + * @sequence_number: The sequence number. + * + * Return: The pbn of the journal block which would the specified sequence number. + */ +static inline physical_block_number_t __must_check +vdo_compute_recovery_journal_block_number(block_count_t journal_size, + sequence_number_t sequence_number) +{ + /* + * Since journal size is a power of two, the block number modulus can just be extracted + * from the low-order bits of the sequence. + */ + return (sequence_number & (journal_size - 1)); +} + +/** + * vdo_get_journal_block_sector() - Find the recovery journal sector from the block header and + * sector number. + * @header: The header of the recovery journal block. + * @sector_number: The index of the sector (1-based). + * + * Return: A packed recovery journal sector. + */ +static inline struct packed_journal_sector * __must_check +vdo_get_journal_block_sector(struct packed_journal_header *header, int sector_number) +{ + char *sector_data = ((char *) header) + (VDO_SECTOR_SIZE * sector_number); + + return (struct packed_journal_sector *) sector_data; +} + +/** + * vdo_pack_recovery_block_header() - Generate the packed representation of a recovery block + * header. + * @header: The header containing the values to encode. + * @packed: The header into which to pack the values. + */ +static inline void vdo_pack_recovery_block_header(const struct recovery_block_header *header, + struct packed_journal_header *packed) +{ + *packed = (struct packed_journal_header) { + .block_map_head = __cpu_to_le64(header->block_map_head), + .slab_journal_head = __cpu_to_le64(header->slab_journal_head), + .sequence_number = __cpu_to_le64(header->sequence_number), + .nonce = __cpu_to_le64(header->nonce), + .logical_blocks_used = __cpu_to_le64(header->logical_blocks_used), + .block_map_data_blocks = __cpu_to_le64(header->block_map_data_blocks), + .entry_count = __cpu_to_le16(header->entry_count), + .check_byte = header->check_byte, + .recovery_count = header->recovery_count, + .metadata_type = header->metadata_type, + }; +} + +/** + * vdo_unpack_recovery_block_header() - Decode the packed representation of a recovery block + * header. + * @packed: The packed header to decode. + * + * Return: The unpacked header. + */ +static inline struct recovery_block_header +vdo_unpack_recovery_block_header(const struct packed_journal_header *packed) +{ + return (struct recovery_block_header) { + .block_map_head = __le64_to_cpu(packed->block_map_head), + .slab_journal_head = __le64_to_cpu(packed->slab_journal_head), + .sequence_number = __le64_to_cpu(packed->sequence_number), + .nonce = __le64_to_cpu(packed->nonce), + .logical_blocks_used = __le64_to_cpu(packed->logical_blocks_used), + .block_map_data_blocks = __le64_to_cpu(packed->block_map_data_blocks), + .entry_count = __le16_to_cpu(packed->entry_count), + .check_byte = packed->check_byte, + .recovery_count = packed->recovery_count, + .metadata_type = packed->metadata_type, + }; +} + +/** + * vdo_compute_slab_count() - Compute the number of slabs a depot with given parameters would have. + * @first_block: PBN of the first data block. + * @last_block: PBN of the last data block. + * @slab_size_shift: Exponent for the number of blocks per slab. + * + * Return: The number of slabs. + */ +static inline slab_count_t +vdo_compute_slab_count(physical_block_number_t first_block, + physical_block_number_t last_block, + unsigned int slab_size_shift) +{ + return (slab_count_t) ((last_block - first_block) >> slab_size_shift); +} + +int __must_check vdo_configure_slab_depot(const struct partition *partition, + struct slab_config slab_config, + zone_count_t zone_count, + struct slab_depot_state_2_0 *state); + +int __must_check vdo_configure_slab(block_count_t slab_size, + block_count_t slab_journal_blocks, + struct slab_config *slab_config); + +/** + * vdo_get_saved_reference_count_size() - Get the number of blocks required to save a reference + * counts state covering the specified number of data + * blocks. + * @block_count: The number of physical data blocks that can be referenced. + * + * Return: The number of blocks required to save reference counts with the given block count. + */ +static inline block_count_t +vdo_get_saved_reference_count_size(block_count_t block_count) +{ + return DIV_ROUND_UP(block_count, COUNTS_PER_BLOCK); +} + +/** + * vdo_get_slab_journal_start_block() - Get the physical block number of the start of the slab + * journal relative to the start block allocator partition. + * @slab_config: The slab configuration of the VDO. + * @origin: The first block of the slab. + */ +static inline physical_block_number_t __must_check +vdo_get_slab_journal_start_block(const struct slab_config *slab_config, + physical_block_number_t origin) +{ + return origin + slab_config->data_blocks + slab_config->reference_count_blocks; +} + +/** + * vdo_advance_journal_point() - Move the given journal point forward by one entry. + * @point: The journal point to adjust. + * @entries_per_block: The number of entries in one full block. + */ +static inline void +vdo_advance_journal_point(struct journal_point *point, journal_entry_count_t entries_per_block) +{ + point->entry_count++; + if (point->entry_count == entries_per_block) { + point->sequence_number++; + point->entry_count = 0; + } +} + +/** + * vdo_before_journal_point() - Check whether the first point precedes the second point. + * @first: The first journal point. + * @second: The second journal point. + * + * Return: true if the first point precedes the second point. + */ +static inline bool +vdo_before_journal_point(const struct journal_point *first, const struct journal_point *second) +{ + return ((first->sequence_number < second->sequence_number) || + ((first->sequence_number == second->sequence_number) && + (first->entry_count < second->entry_count))); +} + +/** + * vdo_pack_journal_point() - Encode the journal location represented by a + * journal_point into a packed_journal_point. + * @unpacked: The unpacked input point. + * @packed: The packed output point. + */ +static inline void +vdo_pack_journal_point(const struct journal_point *unpacked, struct packed_journal_point *packed) +{ + packed->encoded_point = + __cpu_to_le64((unpacked->sequence_number << 16) | unpacked->entry_count); +} + +/** + * vdo_unpack_journal_point() - Decode the journal location represented by a packed_journal_point + * into a journal_point. + * @packed: The packed input point. + * @unpacked: The unpacked output point. + */ +static inline void +vdo_unpack_journal_point(const struct packed_journal_point *packed, struct journal_point *unpacked) +{ + u64 native = __le64_to_cpu(packed->encoded_point); + + unpacked->sequence_number = (native >> 16); + unpacked->entry_count = (native & 0xffff); +} + +/** + * vdo_pack_slab_journal_block_header() - Generate the packed representation of a slab block + * header. + * @header: The header containing the values to encode. + * @packed: The header into which to pack the values. + */ +static inline void +vdo_pack_slab_journal_block_header(const struct slab_journal_block_header *header, + struct packed_slab_journal_block_header *packed) +{ + packed->head = __cpu_to_le64(header->head); + packed->sequence_number = __cpu_to_le64(header->sequence_number); + packed->nonce = __cpu_to_le64(header->nonce); + packed->entry_count = __cpu_to_le16(header->entry_count); + packed->metadata_type = header->metadata_type; + packed->has_block_map_increments = header->has_block_map_increments; + + vdo_pack_journal_point(&header->recovery_point, &packed->recovery_point); +} + +/** + * vdo_unpack_slab_journal_block_header() - Decode the packed representation of a slab block + * header. + * @packed: The packed header to decode. + * @header: The header into which to unpack the values. + */ +static inline void +vdo_unpack_slab_journal_block_header(const struct packed_slab_journal_block_header *packed, + struct slab_journal_block_header *header) +{ + *header = (struct slab_journal_block_header) { + .head = __le64_to_cpu(packed->head), + .sequence_number = __le64_to_cpu(packed->sequence_number), + .nonce = __le64_to_cpu(packed->nonce), + .entry_count = __le16_to_cpu(packed->entry_count), + .metadata_type = packed->metadata_type, + .has_block_map_increments = packed->has_block_map_increments, + }; + vdo_unpack_journal_point(&packed->recovery_point, &header->recovery_point); +} + +/** + * vdo_pack_slab_journal_entry() - Generate the packed encoding of a slab journal entry. + * @packed: The entry into which to pack the values. + * @sbn: The slab block number of the entry to encode. + * @is_increment: The increment flag. + */ +static inline void vdo_pack_slab_journal_entry(packed_slab_journal_entry *packed, + slab_block_number sbn, + bool is_increment) +{ + packed->offset_low8 = (sbn & 0x0000FF); + packed->offset_mid8 = (sbn & 0x00FF00) >> 8; + packed->offset_high7 = (sbn & 0x7F0000) >> 16; + packed->increment = is_increment ? 1 : 0; +} + +/** + * vdo_unpack_slab_journal_entry() - Decode the packed representation of a slab journal entry. + * @packed: The packed entry to decode. + * + * Return: The decoded slab journal entry. + */ +static inline struct slab_journal_entry __must_check +vdo_unpack_slab_journal_entry(const packed_slab_journal_entry *packed) +{ + struct slab_journal_entry entry; + + entry.sbn = packed->offset_high7; + entry.sbn <<= 8; + entry.sbn |= packed->offset_mid8; + entry.sbn <<= 8; + entry.sbn |= packed->offset_low8; + entry.operation = VDO_JOURNAL_DATA_REMAPPING; + entry.increment = packed->increment; + return entry; +} + +struct slab_journal_entry __must_check +vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block, + journal_entry_count_t entry_count); + +/** + * vdo_get_slab_summary_hint_shift() - Compute the shift for slab summary hints. + * @slab_size_shift: Exponent for the number of blocks per slab. + * + * Return: The hint shift. + */ +static inline u8 __must_check vdo_get_slab_summary_hint_shift(unsigned int slab_size_shift) +{ + return ((slab_size_shift > VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) ? + (slab_size_shift - VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) : + 0); +} + +int __must_check vdo_initialize_layout(block_count_t size, + physical_block_number_t offset, + block_count_t block_map_blocks, + block_count_t journal_blocks, + block_count_t summary_blocks, + struct layout *layout); + +void vdo_uninitialize_layout(struct layout *layout); + +int __must_check vdo_get_partition(struct layout *layout, + enum partition_id id, + struct partition **partition_ptr); + +struct partition * __must_check +vdo_get_known_partition(struct layout *layout, enum partition_id id); + +int vdo_validate_config(const struct vdo_config *config, + block_count_t physical_block_count, + block_count_t logical_block_count); + +void vdo_destroy_component_states(struct vdo_component_states *states); + +int __must_check +vdo_decode_component_states(u8 *buffer, + struct volume_geometry *geometry, + struct vdo_component_states *states); + +int __must_check +vdo_validate_component_states(struct vdo_component_states *states, + nonce_t geometry_nonce, + block_count_t physical_size, + block_count_t logical_size); + +void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states); +int __must_check vdo_decode_super_block(u8 *buffer); + +/* We start with 0L and postcondition with ~0L to match our historical usage in userspace. */ +static inline u32 vdo_crc32(const void *buf, unsigned long len) +{ + return (crc32(0L, buf, len) ^ ~0L); +} + +#endif /* VDO_ENCODINGS_H */ diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c new file mode 100644 index 00000000000..e5088ac08a6 --- /dev/null +++ b/drivers/md/dm-vdo/flush.c @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "flush.h" + +#include <linux/mempool.h> +#include <linux/spinlock.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "admin-state.h" +#include "completion.h" +#include "io-submitter.h" +#include "logical-zone.h" +#include "slab-depot.h" +#include "types.h" +#include "vdo.h" + +struct flusher { + struct vdo_completion completion; + /** The vdo to which this flusher belongs */ + struct vdo *vdo; + /** The administrative state of the flusher */ + struct admin_state state; + /** The current flush generation of the vdo */ + sequence_number_t flush_generation; + /** The first unacknowledged flush generation */ + sequence_number_t first_unacknowledged_generation; + /** The queue of flush requests waiting to notify other threads */ + struct wait_queue notifiers; + /** The queue of flush requests waiting for VIOs to complete */ + struct wait_queue pending_flushes; + /** The flush generation for which notifications are being sent */ + sequence_number_t notify_generation; + /** The logical zone to notify next */ + struct logical_zone *logical_zone_to_notify; + /** The ID of the thread on which flush requests should be made */ + thread_id_t thread_id; + /** The pool of flush requests */ + mempool_t *flush_pool; + /** Bios waiting for a flush request to become available */ + struct bio_list waiting_flush_bios; + /** The lock to protect the previous fields */ + spinlock_t lock; + /** The rotor for selecting the bio queue for submitting flush bios */ + zone_count_t bio_queue_rotor; + /** The number of flushes submitted to the current bio queue */ + int flush_count; +}; + +/** + * assert_on_flusher_thread() - Check that we are on the flusher thread. + * @flusher: The flusher. + * @caller: The function which is asserting. + */ +static inline void assert_on_flusher_thread(struct flusher *flusher, const char *caller) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == flusher->thread_id), + "%s() called from flusher thread", + caller); +} + +/** + * as_flusher() - Convert a generic vdo_completion to a flusher. + * @completion: The completion to convert. + * + * Return: The completion as a flusher. + */ +static struct flusher *as_flusher(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_FLUSH_NOTIFICATION_COMPLETION); + return container_of(completion, struct flusher, completion); +} + +/** + * completion_as_vdo_flush() - Convert a generic vdo_completion to a vdo_flush. + * @completion: The completion to convert. + * + * Return: The completion as a vdo_flush. + */ +static inline struct vdo_flush *completion_as_vdo_flush(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_FLUSH_COMPLETION); + return container_of(completion, struct vdo_flush, completion); +} + +/** + * waiter_as_flush() - Convert a vdo_flush's generic wait queue entry back to the vdo_flush. + * @waiter: The wait queue entry to convert. + * + * Return: The wait queue entry as a vdo_flush. + */ +static struct vdo_flush *waiter_as_flush(struct waiter *waiter) +{ + return container_of(waiter, struct vdo_flush, waiter); +} + +static void *allocate_flush(gfp_t gfp_mask, void *pool_data) +{ + struct vdo_flush *flush; + + if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) { + flush = UDS_ALLOCATE_NOWAIT(struct vdo_flush, __func__); + } else { + int result = UDS_ALLOCATE(1, struct vdo_flush, __func__, &flush); + + if (result != VDO_SUCCESS) + uds_log_error_strerror(result, "failed to allocate spare flush"); + } + + if (flush != NULL) { + struct flusher *flusher = pool_data; + + vdo_initialize_completion(&flush->completion, flusher->vdo, VDO_FLUSH_COMPLETION); + } + + return flush; +} + +static void free_flush(void *element, void *pool_data __always_unused) +{ + UDS_FREE(element); +} + +/** + * vdo_make_flusher() - Make a flusher for a vdo. + * @vdo: The vdo which owns the flusher. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_flusher(struct vdo *vdo) +{ + int result = UDS_ALLOCATE(1, struct flusher, __func__, &vdo->flusher); + + if (result != VDO_SUCCESS) + return result; + + vdo->flusher->vdo = vdo; + vdo->flusher->thread_id = vdo->thread_config.packer_thread; + vdo_set_admin_state_code(&vdo->flusher->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + vdo_initialize_completion(&vdo->flusher->completion, vdo, + VDO_FLUSH_NOTIFICATION_COMPLETION); + + spin_lock_init(&vdo->flusher->lock); + bio_list_init(&vdo->flusher->waiting_flush_bios); + vdo->flusher->flush_pool = mempool_create(1, allocate_flush, free_flush, vdo->flusher); + return ((vdo->flusher->flush_pool == NULL) ? -ENOMEM : VDO_SUCCESS); +} + +/** + * vdo_free_flusher() - Free a flusher. + * @flusher: The flusher to free. + */ +void vdo_free_flusher(struct flusher *flusher) +{ + if (flusher == NULL) + return; + + if (flusher->flush_pool != NULL) + mempool_destroy(UDS_FORGET(flusher->flush_pool)); + UDS_FREE(flusher); +} + +/** + * vdo_get_flusher_thread_id() - Get the ID of the thread on which flusher functions should be + * called. + * @flusher: The flusher to query. + * + * Return: The ID of the thread which handles the flusher. + */ +thread_id_t vdo_get_flusher_thread_id(struct flusher *flusher) +{ + return flusher->thread_id; +} + +static void notify_flush(struct flusher *flusher); +static void vdo_complete_flush(struct vdo_flush *flush); + +/** + * finish_notification() - Finish the notification process. + * @completion: The flusher completion. + * + * Finishes the notification process by checking if any flushes have completed and then starting + * the notification of the next flush request if one came in while the current notification was in + * progress. This callback is registered in flush_packer_callback(). + */ +static void finish_notification(struct vdo_completion *completion) +{ + struct flusher *flusher = as_flusher(completion); + + assert_on_flusher_thread(flusher, __func__); + + vdo_enqueue_waiter(&flusher->pending_flushes, + vdo_dequeue_next_waiter(&flusher->notifiers)); + vdo_complete_flushes(flusher); + if (vdo_has_waiters(&flusher->notifiers)) + notify_flush(flusher); +} + +/** + * flush_packer_callback() - Flush the packer. + * @completion: The flusher completion. + * + * Flushes the packer now that all of the logical and physical zones have been notified of the new + * flush request. This callback is registered in increment_generation(). + */ +static void flush_packer_callback(struct vdo_completion *completion) +{ + struct flusher *flusher = as_flusher(completion); + + vdo_increment_packer_flush_generation(flusher->vdo->packer); + vdo_launch_completion_callback(completion, finish_notification, flusher->thread_id); +} + +/** + * increment_generation() - Increment the flush generation in a logical zone. + * @completion: The flusher as a completion. + * + * If there are more logical zones, go on to the next one, otherwise, prepare the physical zones. + * This callback is registered both in notify_flush() and in itself. + */ +static void increment_generation(struct vdo_completion *completion) +{ + struct flusher *flusher = as_flusher(completion); + struct logical_zone *zone = flusher->logical_zone_to_notify; + + vdo_increment_logical_zone_flush_generation(zone, flusher->notify_generation); + if (zone->next == NULL) { + vdo_launch_completion_callback(completion, + flush_packer_callback, + flusher->thread_id); + return; + } + + flusher->logical_zone_to_notify = zone->next; + vdo_launch_completion_callback(completion, + increment_generation, + flusher->logical_zone_to_notify->thread_id); +} + +/** + * notify_flush() - Launch a flush notification. + * @flusher: The flusher doing the notification. + */ +static void notify_flush(struct flusher *flusher) +{ + struct vdo_flush *flush = waiter_as_flush(vdo_get_first_waiter(&flusher->notifiers)); + + flusher->notify_generation = flush->flush_generation; + flusher->logical_zone_to_notify = &flusher->vdo->logical_zones->zones[0]; + flusher->completion.requeue = true; + vdo_launch_completion_callback(&flusher->completion, + increment_generation, + flusher->logical_zone_to_notify->thread_id); +} + +/** + * flush_vdo() - Start processing a flush request. + * @completion: A flush request (as a vdo_completion) + * + * This callback is registered in launch_flush(). + */ +static void flush_vdo(struct vdo_completion *completion) +{ + struct vdo_flush *flush = completion_as_vdo_flush(completion); + struct flusher *flusher = completion->vdo->flusher; + bool may_notify; + int result; + + assert_on_flusher_thread(flusher, __func__); + result = ASSERT(vdo_is_state_normal(&flusher->state), "flusher is in normal operation"); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(flusher->vdo, result); + vdo_complete_flush(flush); + return; + } + + flush->flush_generation = flusher->flush_generation++; + may_notify = !vdo_has_waiters(&flusher->notifiers); + vdo_enqueue_waiter(&flusher->notifiers, &flush->waiter); + if (may_notify) + notify_flush(flusher); +} + +/** + * check_for_drain_complete() - Check whether the flusher has drained. + * @flusher: The flusher. + */ +static void check_for_drain_complete(struct flusher *flusher) +{ + bool drained; + + if (!vdo_is_state_draining(&flusher->state) || vdo_has_waiters(&flusher->pending_flushes)) + return; + + spin_lock(&flusher->lock); + drained = bio_list_empty(&flusher->waiting_flush_bios); + spin_unlock(&flusher->lock); + + if (drained) + vdo_finish_draining(&flusher->state); +} + +/** + * vdo_complete_flushes() - Attempt to complete any flushes which might have finished. + * @flusher: The flusher. + */ +void vdo_complete_flushes(struct flusher *flusher) +{ + sequence_number_t oldest_active_generation = U64_MAX; + struct logical_zone *zone; + + assert_on_flusher_thread(flusher, __func__); + + for (zone = &flusher->vdo->logical_zones->zones[0]; zone != NULL; zone = zone->next) + oldest_active_generation = + min(oldest_active_generation, READ_ONCE(zone->oldest_active_generation)); + + while (vdo_has_waiters(&flusher->pending_flushes)) { + struct vdo_flush *flush = + waiter_as_flush(vdo_get_first_waiter(&flusher->pending_flushes)); + + if (flush->flush_generation >= oldest_active_generation) + return; + + ASSERT_LOG_ONLY((flush->flush_generation == + flusher->first_unacknowledged_generation), + "acknowledged next expected flush, %llu, was: %llu", + (unsigned long long) flusher->first_unacknowledged_generation, + (unsigned long long) flush->flush_generation); + vdo_dequeue_next_waiter(&flusher->pending_flushes); + vdo_complete_flush(flush); + flusher->first_unacknowledged_generation++; + } + + check_for_drain_complete(flusher); +} + +/** + * vdo_dump_flusher() - Dump the flusher, in a thread-unsafe fashion. + * @flusher: The flusher. + */ +void vdo_dump_flusher(const struct flusher *flusher) +{ + uds_log_info("struct flusher"); + uds_log_info(" flush_generation=%llu first_unacknowledged_generation=%llu", + (unsigned long long) flusher->flush_generation, + (unsigned long long) flusher->first_unacknowledged_generation); + uds_log_info(" notifiers queue is %s; pending_flushes queue is %s", + (vdo_has_waiters(&flusher->notifiers) ? "not empty" : "empty"), + (vdo_has_waiters(&flusher->pending_flushes) ? "not empty" : "empty")); +} + +/** + * initialize_flush() - Initialize a vdo_flush structure. + * @flush: The flush to initialize. + * @vdo: The vdo being flushed. + * + * Initializes a vdo_flush structure, transferring all the bios in the flusher's waiting_flush_bios + * list to it. The caller MUST already hold the lock. + */ +static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo) +{ + bio_list_init(&flush->bios); + bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios); + bio_list_init(&vdo->flusher->waiting_flush_bios); +} + +static void launch_flush(struct vdo_flush *flush) +{ + struct vdo_completion *completion = &flush->completion; + + vdo_prepare_completion(completion, + flush_vdo, + flush_vdo, + completion->vdo->thread_config.packer_thread, + NULL); + vdo_enqueue_completion(completion, VDO_DEFAULT_Q_FLUSH_PRIORITY); +} + +/** + * vdo_launch_flush() - Function called to start processing a flush request. + * @vdo: The vdo. + * @bio: The bio containing an empty flush request. + * + * This is called when we receive an empty flush bio from the block layer, and before acknowledging + * a non-empty bio with the FUA flag set. + */ +void vdo_launch_flush(struct vdo *vdo, struct bio *bio) +{ + /* + * Try to allocate a vdo_flush to represent the flush request. If the allocation fails, + * we'll deal with it later. + */ + struct vdo_flush *flush = mempool_alloc(vdo->flusher->flush_pool, GFP_NOWAIT); + struct flusher *flusher = vdo->flusher; + const struct admin_state_code *code = vdo_get_admin_state_code(&flusher->state); + + ASSERT_LOG_ONLY(!code->quiescent, "Flushing not allowed in state %s", code->name); + + spin_lock(&flusher->lock); + + /* We have a new bio to start. Add it to the list. */ + bio_list_add(&flusher->waiting_flush_bios, bio); + + if (flush == NULL) { + spin_unlock(&flusher->lock); + return; + } + + /* We have flushes to start. Capture them in the vdo_flush structure. */ + initialize_flush(flush, vdo); + spin_unlock(&flusher->lock); + + /* Finish launching the flushes. */ + launch_flush(flush); +} + +/** + * release_flush() - Release a vdo_flush structure that has completed its work. + * @flush: The completed flush structure to re-use or free. + * + * If there are any pending flush requests whose vdo_flush allocation failed, they will be launched + * by immediately re-using the released vdo_flush. If there is no spare vdo_flush, the released + * structure will become the spare. Otherwise, the vdo_flush will be freed. + */ +static void release_flush(struct vdo_flush *flush) +{ + bool relaunch_flush; + struct flusher *flusher = flush->completion.vdo->flusher; + + spin_lock(&flusher->lock); + if (bio_list_empty(&flusher->waiting_flush_bios)) { + relaunch_flush = false; + } else { + /* We have flushes to start. Capture them in a flush request. */ + initialize_flush(flush, flusher->vdo); + relaunch_flush = true; + } + spin_unlock(&flusher->lock); + + if (relaunch_flush) { + /* Finish launching the flushes. */ + launch_flush(flush); + return; + } + + mempool_free(flush, flusher->flush_pool); +} + +/** + * vdo_complete_flush_callback() - Function called to complete and free a flush request, registered + * in vdo_complete_flush(). + * @completion: The flush request. + */ +static void vdo_complete_flush_callback(struct vdo_completion *completion) +{ + struct vdo_flush *flush = completion_as_vdo_flush(completion); + struct vdo *vdo = completion->vdo; + struct bio *bio; + + while ((bio = bio_list_pop(&flush->bios)) != NULL) { + /* + * We're not acknowledging this bio now, but we'll never touch it again, so this is + * the last chance to account for it. + */ + vdo_count_bios(&vdo->stats.bios_acknowledged, bio); + + /* Update the device, and send it on down... */ + bio_set_dev(bio, vdo_get_backing_device(vdo)); + atomic64_inc(&vdo->stats.flush_out); + submit_bio_noacct(bio); + } + + + /* + * Release the flush structure, freeing it, re-using it as the spare, or using it to launch + * any flushes that had to wait when allocations failed. + */ + release_flush(flush); +} + +/** + * select_bio_queue() - Select the bio queue on which to finish a flush request. + * @flusher: The flusher finishing the request. + */ +static thread_id_t select_bio_queue(struct flusher *flusher) +{ + struct vdo *vdo = flusher->vdo; + zone_count_t bio_threads = flusher->vdo->thread_config.bio_thread_count; + int interval; + + if (bio_threads == 1) + return vdo->thread_config.bio_threads[0]; + + interval = vdo->device_config->thread_counts.bio_rotation_interval; + if (flusher->flush_count == interval) { + flusher->flush_count = 1; + flusher->bio_queue_rotor = ((flusher->bio_queue_rotor + 1) % bio_threads); + } else { + flusher->flush_count++; + } + + return vdo->thread_config.bio_threads[flusher->bio_queue_rotor]; +} + +/** + * vdo_complete_flush() - Complete and free a vdo flush request. + * @flush: The flush request. + */ +static void vdo_complete_flush(struct vdo_flush *flush) +{ + struct vdo_completion *completion = &flush->completion; + + vdo_prepare_completion(completion, + vdo_complete_flush_callback, + vdo_complete_flush_callback, + select_bio_queue(completion->vdo->flusher), + NULL); + vdo_enqueue_completion(completion, BIO_Q_FLUSH_PRIORITY); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + check_for_drain_complete(container_of(state, struct flusher, state)); +} + +/** + * vdo_drain_flusher() - Drain the flusher. + * @flusher: The flusher to drain. + * @completion: The completion to finish when the flusher has drained. + * + * Drains the flusher by preventing any more VIOs from entering the flusher and then flushing. The + * flusher will be left in the suspended state. + */ +void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion) +{ + assert_on_flusher_thread(flusher, __func__); + vdo_start_draining(&flusher->state, + VDO_ADMIN_STATE_SUSPENDING, + completion, + initiate_drain); +} + +/** + * vdo_resume_flusher() - Resume a flusher which has been suspended. + * @flusher: The flusher to resume. + * @parent: The completion to finish when the flusher has resumed. + */ +void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent) +{ + assert_on_flusher_thread(flusher, __func__); + vdo_continue_completion(parent, vdo_resume_if_quiescent(&flusher->state)); +} diff --git a/drivers/md/dm-vdo/flush.h b/drivers/md/dm-vdo/flush.h new file mode 100644 index 00000000000..2e66a477d5d --- /dev/null +++ b/drivers/md/dm-vdo/flush.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_FLUSH_H +#define VDO_FLUSH_H + +#include "types.h" +#include "vio.h" +#include "wait-queue.h" +#include "work-queue.h" + +/* A marker for tracking which journal entries are affected by a flush request. */ +struct vdo_flush { + /* The completion for enqueueing this flush request. */ + struct vdo_completion completion; + /* The flush bios covered by this request */ + struct bio_list bios; + /* The wait queue entry for this flush */ + struct waiter waiter; + /* Which flush this struct represents */ + sequence_number_t flush_generation; +}; + +struct flusher; + +int __must_check vdo_make_flusher(struct vdo *vdo); + +void vdo_free_flusher(struct flusher *flusher); + +thread_id_t __must_check vdo_get_flusher_thread_id(struct flusher *flusher); + +void vdo_complete_flushes(struct flusher *flusher); + +void vdo_dump_flusher(const struct flusher *flusher); + +void vdo_launch_flush(struct vdo *vdo, struct bio *bio); + +void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion); + +void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent); + +#endif /* VDO_FLUSH_H */ diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c new file mode 100644 index 00000000000..9beb9642ae1 --- /dev/null +++ b/drivers/md/dm-vdo/int-map.c @@ -0,0 +1,710 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/** + * DOC: + * + * Hash table implementation of a map from integers to pointers, implemented using the Hopscotch + * Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does not contain any of the + * locking/concurrency features of the algorithm, just the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear probing. All the entries + * are stored in a fixed array of buckets, with no dynamic allocation for collisions. Unlike linear + * probing, all the entries that hash to a given bucket are stored within a fixed neighborhood + * starting at that bucket. Chaining is effectively represented as a bit vector relative to each + * bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, subsequent neighborhoods are + * searched, and one or more entries will "hop" into those neighborhoods. When this process works, + * an empty bucket will move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the table must be resized + * and the all entries rehashed and added to the expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the worst case has a fixed + * upper bound (the size of the neighborhood). Those entries occupy a small number of memory cache + * lines, leading to improved use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load factors, so even + * with the increased memory burden for maintaining the hop vectors, less memory is needed to + * achieve that performance. Hopscotch is also immune to "contamination" from deleting entries + * since entries are genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper alludes to an offset + * scheme which is used by this implementation. Since the entries in the neighborhood are within N + * entries of the hash bucket at the start of the neighborhood, a pair of small offset fields each + * log2(N) bits wide is all that's needed to maintain the hops as a linked list. In order to encode + * "no next hop" (i.e. NULL) as the natural initial value of zero, the offsets are biased by one + * (i.e. 0 => NULL, 1 => offset=0, 2 => offset=1, etc.) We can represent neighborhoods of up to 255 + * entries with just 8+8=16 bits per entry. The hop list is sorted by hop offset so the first entry + * in the list is always the bucket closest to the start of the neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations are very, very + * expensive. If an upper bound on the latency of adding an entry to the table is needed, we either + * need to ensure the table is pre-sized to be large enough so no resize is ever needed, or we'll + * need to develop an approach to incrementally resize the table. + */ + +#include "int-map.h" + +#include <linux/minmax.h> + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */ + NEIGHBORHOOD = 255, /* the number of buckets in each neighborhood */ + MAX_PROBES = 1024, /* limit on the number of probes for a free bucket */ + NULL_HOP_OFFSET = 0, /* the hop offset value terminating the hop list */ + DEFAULT_LOAD = 75 /* a compromise between memory use and performance */ +}; + +/** + * struct bucket - hash bucket + * + * Buckets are packed together to reduce memory usage and improve cache efficiency. It would be + * tempting to encode the hop offsets separately and maintain alignment of key/value pairs, but + * it's crucial to keep the hop fields near the buckets that they use them so they'll tend to share + * cache lines. + */ +struct __packed bucket { + /** + * @first_hop: The biased offset of the first entry in the hop list of the neighborhood + * that hashes to this bucket. + */ + u8 first_hop; + /** @next_hop: The biased offset of the next bucket in the hop list. */ + u8 next_hop; + /** @key: The key stored in this bucket. */ + u64 key; + /** @value: The value stored in this bucket (NULL if empty). */ + void *value; +}; + +/** + * struct int_map - The concrete definition of the opaque int_map type. + * + * To avoid having to wrap the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array instead, which is why + * capacity and bucket_count are different. + */ +struct int_map { + /** @size: The number of entries stored in the map. */ + size_t size; + /** @capacity: The number of neighborhoods in the map. */ + size_t capacity; + /* @bucket_count: The number of buckets in the bucket array. */ + size_t bucket_count; + /** @buckets: The array of hash buckets. */ + struct bucket *buckets; +}; + +/** + * mix() - The Google CityHash 16-byte hash mixing function. + * @input1: The first input value. + * @input2: The second input value. + * + * Return: A hash of the two inputs. + */ +static u64 mix(u64 input1, u64 input2) +{ + static const u64 CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL; + u64 hash = (input1 ^ input2); + + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash ^= input2; + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash *= CITY_MULTIPLIER; + return hash; +} + +/** + * hash_key() - Calculate a 64-bit non-cryptographic hash value for the provided 64-bit integer + * key. + * @key: The mapping key. + * + * The implementation is based on Google's CityHash, only handling the specific case of an 8-byte + * input. + * + * Return: The hash of the mapping key. + */ +static u64 hash_key(u64 key) +{ + /* + * Aliasing restrictions forbid us from casting pointer types, so use a union to convert a + * single u64 to two u32 values. + */ + union { + u64 u64; + u32 u32[2]; + } pun = {.u64 = key}; + + return mix(sizeof(key) + (((u64) pun.u32[0]) << 3), pun.u32[1]); +} + +/** + * allocate_buckets() - Initialize an int_map. + * @map: The map to initialize. + * @capacity: The initial capacity of the map. + * + * Return: UDS_SUCCESS or an error code. + */ +static int allocate_buckets(struct int_map *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + /* + * Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a full neighborhood + * without have to wrap back around to element zero. + */ + map->bucket_count = capacity + (NEIGHBORHOOD - 1); + return UDS_ALLOCATE(map->bucket_count, struct bucket, + "struct int_map buckets", &map->buckets); +} + +/** + * vdo_make_int_map() - Allocate and initialize an int_map. + * @initial_capacity: The number of entries the map should initially be capable of holding (zero + * tells the map to use its own small default). + * @initial_load: The load factor of the map, expressed as an integer percentage (typically in the + * range 50 to 90, with zero telling the map to use its own default). + * @map_ptr: Output, a pointer to hold the new int_map. + * + * Return: UDS_SUCCESS or an error code. + */ +int vdo_make_int_map(size_t initial_capacity, unsigned int initial_load, struct int_map **map_ptr) +{ + struct int_map *map; + int result; + size_t capacity; + + /* Use the default initial load if the caller did not specify one. */ + if (initial_load == 0) + initial_load = DEFAULT_LOAD; + if (initial_load > 100) + return UDS_INVALID_ARGUMENT; + + result = UDS_ALLOCATE(1, struct int_map, "struct int_map", &map); + if (result != UDS_SUCCESS) + return result; + + /* Use the default capacity if the caller did not specify one. */ + capacity = (initial_capacity > 0) ? initial_capacity : DEFAULT_CAPACITY; + + /* + * Scale up the capacity by the specified initial load factor. (i.e to hold 1000 entries at + * 80% load we need a capacity of 1250) + */ + capacity = capacity * 100 / initial_load; + + result = allocate_buckets(map, capacity); + if (result != UDS_SUCCESS) { + vdo_free_int_map(UDS_FORGET(map)); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +/** + * vdo_free_int_map() - Free an int_map. + * @map: The int_map to free. + * + * NOTE: The map does not own the pointer values stored in the map and they are not freed by this + * call. + */ +void vdo_free_int_map(struct int_map *map) +{ + if (map == NULL) + return; + + UDS_FREE(UDS_FORGET(map->buckets)); + UDS_FREE(UDS_FORGET(map)); +} + +/** + * vdo_int_map_size() - Get the number of entries stored in an int_map. + * @map: The int_map to query. + * + * Return: The number of entries in the map. + */ +size_t vdo_int_map_size(const struct int_map *map) +{ + return map->size; +} + +/** + * dereference_hop() - Convert a biased hop offset within a neighborhood to a pointer to the bucket + * it references. + * @neighborhood: The first bucket in the neighborhood. + * @hop_offset: The biased hop offset to the desired bucket. + * + * Return: NULL if hop_offset is zero, otherwise a pointer to the bucket in the neighborhood at + * hop_offset - 1. + */ +static struct bucket *dereference_hop(struct bucket *neighborhood, unsigned int hop_offset) +{ + if (hop_offset == NULL_HOP_OFFSET) + return NULL; + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hop_offset - 1]; +} + +/** + * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood. + * @neighborhood: The first bucket in the neighborhood. + * @new_bucket: The bucket to add to the hop list. + * + * The bucket is inserted it into the list so the hop list remains sorted by hop offset. + */ +static void insert_in_hop_list(struct bucket *neighborhood, struct bucket *new_bucket) +{ + /* Zero indicates a NULL hop offset, so bias the hop offset by one. */ + int hop_offset = 1 + (new_bucket - neighborhood); + + /* Handle the special case of adding a bucket at the start of the list. */ + int next_hop = neighborhood->first_hop; + + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { + new_bucket->next_hop = next_hop; + neighborhood->first_hop = hop_offset; + return; + } + + /* Search the hop list for the insertion point that maintains the sort order. */ + for (;;) { + struct bucket *bucket = dereference_hop(neighborhood, next_hop); + + next_hop = bucket->next_hop; + + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { + new_bucket->next_hop = next_hop; + bucket->next_hop = hop_offset; + return; + } + } +} + +/** + * select_bucket() - Select and return the hash bucket for a given search key. + * @map: The map to search. + * @key: The mapping key. + */ +static struct bucket *select_bucket(const struct int_map *map, u64 key) +{ + /* + * Calculate a good hash value for the provided key. We want exactly 32 bits, so mask the + * result. + */ + u64 hash = hash_key(key) & 0xFFFFFFFF; + + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary fraction and + * multiplying that by the capacity. If the hash is uniformly distributed over [0 .. + * 2^32-1], then (hash * capacity / 2^32) should be uniformly distributed over [0 .. + * capacity-1]. The multiply and shift is much faster than a divide (modulus) on X86 CPUs. + */ + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * search_hop_list() - Search the hop list associated with given hash bucket for a given search + * key. + * @map: The map being searched. + * @bucket: The map bucket to search for the key. + * @key: The mapping key. + * @previous_ptr: Output. if not NULL, a pointer in which to store the bucket in the list preceding + * the one that had the matching key + * + * If the key is found, returns a pointer to the entry (bucket or collision), otherwise returns + * NULL. + * + * Return: An entry that matches the key, or NULL if not found. + */ +static struct bucket *search_hop_list(struct int_map *map __always_unused, + struct bucket *bucket, + u64 key, + struct bucket **previous_ptr) +{ + struct bucket *previous = NULL; + unsigned int next_hop = bucket->first_hop; + + while (next_hop != NULL_HOP_OFFSET) { + /* + * Check the neighboring bucket indexed by the offset for the + * desired key. + */ + struct bucket *entry = dereference_hop(bucket, next_hop); + + if ((key == entry->key) && (entry->value != NULL)) { + if (previous_ptr != NULL) + *previous_ptr = previous; + return entry; + } + next_hop = entry->next_hop; + previous = entry; + } + return NULL; +} + +/** + * vdo_int_map_get() - Retrieve the value associated with a given key from the int_map. + * @map: The int_map to query. + * @key: The key to look up. + * + * Return: The value associated with the given key, or NULL if the key is not mapped to any value. + */ +void *vdo_int_map_get(struct int_map *map, u64 key) +{ + struct bucket *match = search_hop_list(map, select_bucket(map, key), key, NULL); + + return ((match != NULL) ? match->value : NULL); +} + +/** + * resize_buckets() - Increase the number of hash buckets. + * @map: The map to resize. + * + * Resizes and rehashes all the existing entries, storing them in the new buckets. + * + * Return: UDS_SUCCESS or an error code. + */ +static int resize_buckets(struct int_map *map) +{ + int result; + size_t i; + + /* Copy the top-level map data to the stack. */ + struct int_map old_map = *map; + + /* Re-initialize the map to be empty and 50% larger. */ + size_t new_capacity = map->capacity / 2 * 3; + + uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, new_capacity, map->size); + result = allocate_buckets(map, new_capacity); + if (result != UDS_SUCCESS) { + *map = old_map; + return result; + } + + /* Populate the new hash table from the entries in the old bucket array. */ + for (i = 0; i < old_map.bucket_count; i++) { + struct bucket *entry = &old_map.buckets[i]; + + if (entry->value == NULL) + continue; + + result = vdo_int_map_put(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + /* Destroy the new partial map and restore the map from the stack. */ + UDS_FREE(UDS_FORGET(map->buckets)); + *map = old_map; + return result; + } + } + + /* Destroy the old bucket array. */ + UDS_FREE(UDS_FORGET(old_map.buckets)); + return UDS_SUCCESS; +} + +/** + * find_empty_bucket() - Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. + * @map: The map containing the buckets to search. + * @bucket: The bucket at which to start probing. + * @max_probes: The maximum number of buckets to search. + * + * NULL will be returned if the search reaches the end of the bucket array or if the number of + * linear probes exceeds a specified limit. + * + * Return: The next empty bucket, or NULL if the search failed. + */ +static struct bucket * +find_empty_bucket(struct int_map *map, struct bucket *bucket, unsigned int max_probes) +{ + /* + * Limit the search to either the nearer of the end of the bucket array or a fixed distance + * beyond the initial bucket. + */ + ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket; + struct bucket *sentinel = &bucket[min_t(ptrdiff_t, remaining, max_probes)]; + struct bucket *entry; + + for (entry = bucket; entry < sentinel; entry++) + if (entry->value == NULL) + return entry; + return NULL; +} + +/** + * move_empty_bucket() - Move an empty bucket closer to the start of the bucket array. + * @map: The map containing the bucket. + * @hole: The empty bucket to fill with an entry that precedes it in one of its enclosing + * neighborhoods. + * + * This searches the neighborhoods that contain the empty bucket for a non-empty bucket closer to + * the start of the array. If such a bucket is found, this swaps the two buckets by moving the + * entry to the empty bucket. + * + * Return: The bucket that was vacated by moving its entry to the provided hole, or NULL if no + * entry could be moved. + */ +static struct bucket *move_empty_bucket(struct int_map *map __always_unused, struct bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting with the one in + * which it is the last bucket. No boundary check is needed for the negative array + * arithmetic since this function is only called when hole is at least NEIGHBORHOOD cells + * deeper into the array than a valid bucket. + */ + struct bucket *bucket; + + for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + /* + * Find the entry that is nearest to the bucket, which means it will be nearest to + * the hash bucket whose neighborhood is full. + */ + struct bucket *new_hole = dereference_hop(bucket, bucket->first_hop); + + if (new_hole == NULL) + /* + * There are no buckets in this neighborhood that are in use by this one + * (they must all be owned by overlapping neighborhoods). + */ + continue; + + /* + * Skip this bucket if its first entry is actually further away than the hole that + * we're already trying to fill. + */ + if (hole < new_hole) + continue; + + /* + * We've found an entry in this neighborhood that we can "hop" further away, moving + * the hole closer to the hash bucket, if not all the way into its neighborhood. + */ + + /* + * The entry that will be the new hole is the first bucket in the list, so setting + * first_hop is all that's needed remove it from the list. + */ + bucket->first_hop = new_hole->next_hop; + new_hole->next_hop = NULL_HOP_OFFSET; + + /* Move the entry into the original hole. */ + hole->key = new_hole->key; + hole->value = new_hole->value; + new_hole->value = NULL; + + /* Insert the filled hole into the hop list for the neighborhood. */ + insert_in_hop_list(bucket, hole); + return new_hole; + } + + /* We couldn't find an entry to relocate to the hole. */ + return NULL; +} + +/** + * update_mapping() - Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. + * @map: The int_map to attempt to modify. + * @neighborhood: The first bucket in the neighborhood that would contain the search key + * @key: The key with which to associate the new value. + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: a pointer in which to store the old value (unmodified if no mapping was found) + * + * Return: true if the map contains a mapping for the key, false if it does not. + */ +static bool update_mapping(struct int_map *map, + struct bucket *neighborhood, + u64 key, + void *new_value, + bool update, + void **old_value_ptr) +{ + struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL); + + if (bucket == NULL) + /* There is no bucket containing the key in the neighborhood. */ + return false; + + /* + * Return the value of the current mapping (if desired) and update the mapping with the new + * value (if desired). + */ + if (old_value_ptr != NULL) + *old_value_ptr = bucket->value; + if (update) + bucket->value = new_value; + return true; +} + +/** + * find_or_make_vacancy() - Find an empty bucket. + * @map: The int_map to search or modify. + * @neighborhood: The first bucket in the neighborhood in which an empty bucket is needed for a new + * mapping. + * + * Find an empty bucket in a specified neighborhood for a new mapping or attempt to re-arrange + * mappings so there is such a bucket. This operation may fail (returning NULL) if an empty bucket + * is not available or could not be relocated to the neighborhood. + * + * Return: a pointer to an empty bucket in the desired neighborhood, or NULL if a vacancy could not + * be found or arranged. + */ +static struct bucket *find_or_make_vacancy(struct int_map *map, struct bucket *neighborhood) +{ + /* Probe within and beyond the neighborhood for the first empty bucket. */ + struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES); + + /* + * Keep trying until the empty bucket is in the bucket's neighborhood or we are unable to + * move it any closer by swapping it with a filled bucket. + */ + while (hole != NULL) { + int distance = hole - neighborhood; + + if (distance < NEIGHBORHOOD) + /* + * We've found or relocated an empty bucket close enough to the initial + * hash bucket to be referenced by its hop vector. + */ + return hole; + + /* + * The nearest empty bucket isn't within the neighborhood that must contain the new + * entry, so try to swap it with bucket that is closer. + */ + hole = move_empty_bucket(map, hole); + } + + return NULL; +} + +/** + * vdo_int_map_put() - Try to associate a value with an integer. + * @map: The int_map to attempt to modify. + * @key: The key with which to associate the new value. + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: A pointer in which to store either the old value (if the key was already mapped) + * or NULL if the map did not contain the key; NULL may be provided if the caller + * does not need to know the old value + * + * Try to associate a value (a pointer) with an integer in an int_map. If the map already contains + * a mapping for the provided key, the old value is only replaced with the specified value if + * update is true. In either case the old value is returned. If the map does not already contain a + * value for the specified key, the new value is added regardless of the value of update. + * + * Return: UDS_SUCCESS or an error code. + */ +int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, void **old_value_ptr) +{ + struct bucket *neighborhood, *bucket; + + if (new_value == NULL) + return UDS_INVALID_ARGUMENT; + + /* + * Select the bucket at the start of the neighborhood that must contain any entry for the + * provided key. + */ + neighborhood = select_bucket(map, key); + + /* + * Check whether the neighborhood already contains an entry for the key, in which case we + * optionally update it, returning the old value. + */ + if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr)) + return UDS_SUCCESS; + + /* + * Find an empty bucket in the desired neighborhood for the new entry or re-arrange entries + * in the map so there is such a bucket. This operation will usually succeed; the loop body + * will only be executed on the rare occasions that we have to resize the map. + */ + while ((bucket = find_or_make_vacancy(map, neighborhood)) == NULL) { + int result; + + /* + * There is no empty bucket in which to put the new entry in the current map, so + * we're forced to allocate a new bucket array with a larger capacity, re-hash all + * the entries into those buckets, and try again (a very expensive operation for + * large maps). + */ + result = resize_buckets(map); + if (result != UDS_SUCCESS) + return result; + + /* + * Resizing the map invalidates all pointers to buckets, so recalculate the + * neighborhood pointer. + */ + neighborhood = select_bucket(map, key); + } + + /* Put the new entry in the empty bucket, adding it to the neighborhood. */ + bucket->key = key; + bucket->value = new_value; + insert_in_hop_list(neighborhood, bucket); + map->size += 1; + + /* There was no existing entry, so there was no old value to be returned. */ + if (old_value_ptr != NULL) + *old_value_ptr = NULL; + return UDS_SUCCESS; +} + +/** + * vdo_int_map_remove() - Remove the mapping for a given key from the int_map. + * @map: The int_map from which to remove the mapping. + * @key: The key whose mapping is to be removed. + * + * Return: the value that was associated with the key, or NULL if it was not mapped. + */ +void *vdo_int_map_remove(struct int_map *map, u64 key) +{ + void *value; + + /* Select the bucket to search and search it for an existing entry. */ + struct bucket *bucket = select_bucket(map, key); + struct bucket *previous; + struct bucket *victim = search_hop_list(map, bucket, key, &previous); + + if (victim == NULL) + /* There is no matching entry to remove. */ + return NULL; + + /* + * We found an entry to remove. Save the mapped value to return later and empty the bucket. + */ + map->size -= 1; + value = victim->value; + victim->value = NULL; + victim->key = 0; + + /* The victim bucket is now empty, but it still needs to be spliced out of the hop list. */ + if (previous == NULL) + /* The victim is the head of the list, so swing first_hop. */ + bucket->first_hop = victim->next_hop; + else + previous->next_hop = victim->next_hop; + victim->next_hop = NULL_HOP_OFFSET; + + return value; +} diff --git a/drivers/md/dm-vdo/int-map.h b/drivers/md/dm-vdo/int-map.h new file mode 100644 index 00000000000..cced57c4016 --- /dev/null +++ b/drivers/md/dm-vdo/int-map.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_INT_MAP_H +#define VDO_INT_MAP_H + +#include <linux/compiler.h> +#include <linux/types.h> + +/** + * DOC: int_map + * + * An int_map associates pointers (void *) with integer keys (u64). NULL pointer values are + * not supported. + * + * The map is implemented as hash table, which should provide constant-time insert, query, and + * remove operations, although the insert may occasionally grow the table, which is linear in the + * number of entries in the map. The table will grow as needed to hold new entries, but will not + * shrink as entries are removed. + */ + +struct int_map; + +int __must_check +vdo_make_int_map(size_t initial_capacity, unsigned int initial_load, struct int_map **map_ptr); + +void vdo_free_int_map(struct int_map *map); + +size_t vdo_int_map_size(const struct int_map *map); + +void *vdo_int_map_get(struct int_map *map, u64 key); + +int __must_check +vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, void **old_value_ptr); + +void *vdo_int_map_remove(struct int_map *map, u64 key); + +#endif /* VDO_INT_MAP_H */ diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c new file mode 100644 index 00000000000..a77f6c4ec7a --- /dev/null +++ b/drivers/md/dm-vdo/io-submitter.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "io-submitter.h" + +#include <linux/bio.h> +#include <linux/kernel.h> +#include <linux/mutex.h> + +#include "memory-alloc.h" +#include "permassert.h" + +#include "data-vio.h" +#include "logger.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" + +/* + * Submission of bio operations to the underlying storage device will go through a separate work + * queue thread (or more than one) to prevent blocking in other threads if the storage device has a + * full queue. The plug structure allows that thread to do better batching of requests to make the + * I/O more efficient. + * + * When multiple worker threads are used, a thread is chosen for a I/O operation submission based + * on the PBN, so a given PBN will consistently wind up on the same thread. Flush operations are + * assigned round-robin. + * + * The map (protected by the mutex) collects pending I/O operations so that the worker thread can + * reorder them to try to encourage I/O request merging in the request queue underneath. + */ +struct bio_queue_data { + struct vdo_work_queue *queue; + struct blk_plug plug; + struct int_map *map; + struct mutex lock; + unsigned int queue_number; +}; + +struct io_submitter { + unsigned int num_bio_queues_used; + unsigned int bio_queue_rotation_interval; + struct bio_queue_data bio_queue_data[]; +}; + +static void start_bio_queue(void *ptr) +{ + struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr; + + blk_start_plug(&bio_queue_data->plug); +} + +static void finish_bio_queue(void *ptr) +{ + struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr; + + blk_finish_plug(&bio_queue_data->plug); +} + +static const struct vdo_work_queue_type bio_queue_type = { + .start = start_bio_queue, + .finish = finish_bio_queue, + .max_priority = BIO_Q_MAX_PRIORITY, + .default_priority = BIO_Q_DATA_PRIORITY, +}; + +/** + * count_all_bios() - Determine which bio counter to use. + * @vio: The vio associated with the bio. + * @bio: The bio to count. + */ +static void count_all_bios(struct vio *vio, struct bio *bio) +{ + struct atomic_statistics *stats = &vio->completion.vdo->stats; + + if (is_data_vio(vio)) { + vdo_count_bios(&stats->bios_out, bio); + return; + } + + vdo_count_bios(&stats->bios_meta, bio); + if (vio->type == VIO_TYPE_RECOVERY_JOURNAL) + vdo_count_bios(&stats->bios_journal, bio); + else if (vio->type == VIO_TYPE_BLOCK_MAP) + vdo_count_bios(&stats->bios_page_cache, bio); +} + +/** + * assert_in_bio_zone() - Assert that a vio is in the correct bio zone and not in interrupt + * context. + * @vio: The vio to check. + */ +static void assert_in_bio_zone(struct vio *vio) +{ + ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); + assert_vio_in_bio_zone(vio); +} + +/** + * send_bio_to_device() - Update stats and tracing info, then submit the supplied bio to the OS for + * processing. + * @vio: The vio associated with the bio. + * @bio: The bio to submit to the OS. + */ +static void send_bio_to_device(struct vio *vio, struct bio *bio) +{ + struct vdo *vdo = vio->completion.vdo; + + assert_in_bio_zone(vio); + atomic64_inc(&vdo->stats.bios_submitted); + count_all_bios(vio, bio); + bio_set_dev(bio, vdo_get_backing_device(vdo)); + submit_bio_noacct(bio); +} + +static sector_t get_bio_sector(struct bio *bio) +{ + return bio->bi_iter.bi_sector; +} + +/** + * process_vio_io() - Submits a vio's bio to the underlying block device. May block if the device + * is busy. This callback should be used by vios which did not attempt to merge. + */ +void process_vio_io(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + + send_bio_to_device(vio, vio->bio); +} + +/** + * get_bio_list() - Extract the list of bios to submit from a vio. + * @vio: The vio submitting I/O. + * + * The list will always contain at least one entry (the bio for the vio on which it is called), but + * other bios may have been merged with it as well. + * + * Return: bio The head of the bio list to submit. + */ +static struct bio *get_bio_list(struct vio *vio) +{ + struct bio *bio; + struct io_submitter *submitter = vio->completion.vdo->io_submitter; + struct bio_queue_data *bio_queue_data = &(submitter->bio_queue_data[vio->bio_zone]); + + assert_in_bio_zone(vio); + + mutex_lock(&bio_queue_data->lock); + vdo_int_map_remove(bio_queue_data->map, get_bio_sector(vio->bios_merged.head)); + vdo_int_map_remove(bio_queue_data->map, get_bio_sector(vio->bios_merged.tail)); + bio = vio->bios_merged.head; + bio_list_init(&vio->bios_merged); + mutex_unlock(&bio_queue_data->lock); + + return bio; +} + +/** + * process_data_vio_io() - Submit a data_vio's bio to the storage below along with any bios that + * have been merged with it. + * + * Context: This call may block and so should only be called from a bio thread. + */ +static void process_data_vio_io(struct vdo_completion *completion) +{ + struct bio *bio, *next; + struct vio *vio = as_vio(completion); + + assert_in_bio_zone(vio); + for (bio = get_bio_list(vio); bio != NULL; bio = next) { + next = bio->bi_next; + bio->bi_next = NULL; + send_bio_to_device((struct vio *) bio->bi_private, bio); + } +} + +/** + * get_mergeable_locked() - Attempt to find an already queued bio that the current bio can be + * merged with. + * @map: The bio map to use for merging. + * @vio: The vio we want to merge. + * @back_merge: Set to true for a back merge, false for a front merge. + * + * There are two types of merging possible, forward and backward, which are distinguished by a flag + * that uses kernel elevator terminology. + * + * Return: the vio to merge to, NULL if no merging is possible. + */ +static struct vio *get_mergeable_locked(struct int_map *map, struct vio *vio, bool back_merge) +{ + struct bio *bio = vio->bio; + sector_t merge_sector = get_bio_sector(bio); + struct vio *vio_merge; + + if (back_merge) + merge_sector -= VDO_SECTORS_PER_BLOCK; + else + merge_sector += VDO_SECTORS_PER_BLOCK; + + vio_merge = vdo_int_map_get(map, merge_sector); + + if (vio_merge == NULL) + return NULL; + + if (vio->completion.priority != vio_merge->completion.priority) + return NULL; + + if (bio_data_dir(bio) != bio_data_dir(vio_merge->bio)) + return NULL; + + if (bio_list_empty(&vio_merge->bios_merged)) + return NULL; + + if (back_merge) + return ((get_bio_sector(vio_merge->bios_merged.tail) == merge_sector) ? + vio_merge : + NULL); + + return ((get_bio_sector(vio_merge->bios_merged.head) == merge_sector) ? vio_merge : NULL); +} + +static int map_merged_vio(struct int_map *bio_map, struct vio *vio) +{ + int result; + + result = vdo_int_map_put(bio_map, get_bio_sector(vio->bios_merged.head), vio, true, NULL); + if (result != VDO_SUCCESS) + return result; + + return vdo_int_map_put(bio_map, get_bio_sector(vio->bios_merged.tail), vio, true, NULL); +} + +static int merge_to_prev_tail(struct int_map *bio_map, struct vio *vio, struct vio *prev_vio) +{ + vdo_int_map_remove(bio_map, get_bio_sector(prev_vio->bios_merged.tail)); + bio_list_merge(&prev_vio->bios_merged, &vio->bios_merged); + return map_merged_vio(bio_map, prev_vio); +} + +static int merge_to_next_head(struct int_map *bio_map, struct vio *vio, struct vio *next_vio) +{ + /* + * Handle "next merge" and "gap fill" cases the same way so as to reorder bios in a way + * that's compatible with using funnel queues in work queues. This avoids removing an + * existing completion. + */ + vdo_int_map_remove(bio_map, get_bio_sector(next_vio->bios_merged.head)); + bio_list_merge_head(&next_vio->bios_merged, &vio->bios_merged); + return map_merged_vio(bio_map, next_vio); +} + +/** + * try_bio_map_merge() - Attempt to merge a vio's bio with other pending I/Os. + * @vio: The vio to merge. + * + * Currently this is only used for data_vios, but is broken out for future use with metadata vios. + * + * Return: whether or not the vio was merged. + */ +static bool try_bio_map_merge(struct vio *vio) +{ + int result; + bool merged = true; + struct bio *bio = vio->bio; + struct vio *prev_vio, *next_vio; + struct vdo *vdo = vio->completion.vdo; + struct bio_queue_data *bio_queue_data = &vdo->io_submitter->bio_queue_data[vio->bio_zone]; + + bio->bi_next = NULL; + bio_list_init(&vio->bios_merged); + bio_list_add(&vio->bios_merged, bio); + + mutex_lock(&bio_queue_data->lock); + prev_vio = get_mergeable_locked(bio_queue_data->map, vio, true); + next_vio = get_mergeable_locked(bio_queue_data->map, vio, false); + if (prev_vio == next_vio) + next_vio = NULL; + + if ((prev_vio == NULL) && (next_vio == NULL)) { + /* no merge. just add to bio_queue */ + merged = false; + result = vdo_int_map_put(bio_queue_data->map, + get_bio_sector(bio), + vio, + true, + NULL); + } else if (next_vio == NULL) { + /* Only prev. merge to prev's tail */ + result = merge_to_prev_tail(bio_queue_data->map, vio, prev_vio); + } else { + /* Only next. merge to next's head */ + result = merge_to_next_head(bio_queue_data->map, vio, next_vio); + } + + mutex_unlock(&bio_queue_data->lock); + + /* We don't care about failure of int_map_put in this case. */ + ASSERT_LOG_ONLY(result == UDS_SUCCESS, "bio map insertion succeeds"); + return merged; +} + +/** + * submit_data_vio_io() - Submit I/O for a data_vio. + * @data_vio: the data_vio for which to issue I/O. + * + * If possible, this I/O will be merged other pending I/Os. Otherwise, the data_vio will be sent to + * the appropriate bio zone directly. + */ +void submit_data_vio_io(struct data_vio *data_vio) +{ + if (try_bio_map_merge(&data_vio->vio)) + return; + + launch_data_vio_bio_zone_callback(data_vio, process_data_vio_io); +} + +/** + * vdo_submit_metadata_io() - Submit I/O for a metadata vio. + * @vio: the vio for which to issue I/O + * @physical: the physical block number to read or write + * @callback: the bio endio function which will be called after the I/O completes + * @error_handler: the handler for submission or I/O errors (may be NULL) + * @operation: the type of I/O to perform + * @data: the buffer to read or write (may be NULL) + * + * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block + * other vdo threads. + * + * That the error handler will run on the correct thread is only true so long as the thread calling + * this function, and the thread set in the endio callback are the same, as well as the fact that + * no error can occur on the bio queue. Currently this is true for all callers, but additional care + * will be needed if this ever changes. + */ +void vdo_submit_metadata_io(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action *error_handler, + unsigned int operation, + char *data) +{ + struct vdo_completion *completion = &vio->completion; + int result; + const struct admin_state_code *code = vdo_get_admin_state(completion->vdo); + + + ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name); + ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio"); + + vdo_reset_completion(completion); + completion->error_handler = error_handler; + result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); + if (result != VDO_SUCCESS) { + continue_vio(vio, result); + return; + } + + vdo_set_completion_callback(completion, process_vio_io, get_vio_bio_zone_thread_id(vio)); + vdo_launch_completion_with_priority(completion, get_metadata_priority(vio)); +} + +/** + * vdo_make_io_submitter() - Create an io_submitter structure. + * @thread_count: Number of bio-submission threads to set up. + * @rotation_interval: Interval to use when rotating between bio-submission threads when enqueuing + * completions. + * @max_requests_active: Number of bios for merge tracking. + * @vdo: The vdo which will use this submitter. + * @io_submitter: pointer to the new data structure. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_io_submitter(unsigned int thread_count, + unsigned int rotation_interval, + unsigned int max_requests_active, + struct vdo *vdo, + struct io_submitter **io_submitter_ptr) +{ + unsigned int i; + struct io_submitter *io_submitter; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct io_submitter, + thread_count, + struct bio_queue_data, + "bio submission data", + &io_submitter); + if (result != UDS_SUCCESS) + return result; + + io_submitter->bio_queue_rotation_interval = rotation_interval; + + /* Setup for each bio-submission work queue */ + for (i = 0; i < thread_count; i++) { + struct bio_queue_data *bio_queue_data = &io_submitter->bio_queue_data[i]; + + mutex_init(&bio_queue_data->lock); + /* + * One I/O operation per request, but both first & last sector numbers. + * + * If requests are assigned to threads round-robin, they should be distributed + * quite evenly. But if they're assigned based on PBN, things can sometimes be very + * uneven. So for now, we'll assume that all requests *may* wind up on one thread, + * and thus all in the same map. + */ + result = vdo_make_int_map(max_requests_active * 2, 0, &bio_queue_data->map); + if (result != 0) { + /* + * Clean up the partially initialized bio-queue entirely and indicate that + * initialization failed. + */ + uds_log_error("bio map initialization failed %d", result); + vdo_cleanup_io_submitter(io_submitter); + vdo_free_io_submitter(io_submitter); + return result; + } + + bio_queue_data->queue_number = i; + result = vdo_make_thread(vdo, + vdo->thread_config.bio_threads[i], + &bio_queue_type, + 1, + (void **) &bio_queue_data); + if (result != VDO_SUCCESS) { + /* + * Clean up the partially initialized bio-queue entirely and indicate that + * initialization failed. + */ + vdo_free_int_map(UDS_FORGET(bio_queue_data->map)); + uds_log_error("bio queue initialization failed %d", result); + vdo_cleanup_io_submitter(io_submitter); + vdo_free_io_submitter(io_submitter); + return result; + } + + bio_queue_data->queue = vdo->threads[vdo->thread_config.bio_threads[i]].queue; + io_submitter->num_bio_queues_used++; + } + + *io_submitter_ptr = io_submitter; + + return VDO_SUCCESS; +} + +/** + * vdo_cleanup_io_submitter() - Tear down the io_submitter fields as needed for a physical layer. + * @io_submitter: The I/O submitter data to tear down (may be NULL). + */ +void vdo_cleanup_io_submitter(struct io_submitter *io_submitter) +{ + int i; + + if (io_submitter == NULL) + return; + + for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) + vdo_finish_work_queue(io_submitter->bio_queue_data[i].queue); +} + +/** + * vdo_free_io_submitter() - Free the io_submitter fields and structure as needed. + * @io_submitter: The I/O submitter data to destroy. + * + * This must be called after vdo_cleanup_io_submitter(). It is used to release resources late in + * the shutdown process to avoid or reduce the chance of race conditions. + */ +void vdo_free_io_submitter(struct io_submitter *io_submitter) +{ + int i; + + if (io_submitter == NULL) + return; + + for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) { + io_submitter->num_bio_queues_used--; + /* vdo_destroy() will free the work queue, so just give up our reference to it. */ + UDS_FORGET(io_submitter->bio_queue_data[i].queue); + vdo_free_int_map(UDS_FORGET(io_submitter->bio_queue_data[i].map)); + } + UDS_FREE(io_submitter); +} diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h new file mode 100644 index 00000000000..5bd05090f67 --- /dev/null +++ b/drivers/md/dm-vdo/io-submitter.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_IO_SUBMITTER_H +#define VDO_IO_SUBMITTER_H + +#include <linux/bio.h> + +#include "types.h" + +struct io_submitter; + +int vdo_make_io_submitter(unsigned int thread_count, + unsigned int rotation_interval, + unsigned int max_requests_active, + struct vdo *vdo, + struct io_submitter **io_submitter); + +void vdo_cleanup_io_submitter(struct io_submitter *io_submitter); + +void vdo_free_io_submitter(struct io_submitter *io_submitter); + +void process_vio_io(struct vdo_completion *completion); + +void submit_data_vio_io(struct data_vio *data_vio); + +void vdo_submit_metadata_io(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action *error_handler, + unsigned int operation, + char *data); + +static inline void submit_metadata_vio(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action *error_handler, + unsigned int operation) +{ + vdo_submit_metadata_io(vio, physical, callback, error_handler, operation, vio->data); +} + +static inline void +submit_flush_vio(struct vio *vio, bio_end_io_t callback, vdo_action *error_handler) +{ + /* FIXME: Can we just use REQ_OP_FLUSH? */ + vdo_submit_metadata_io(vio, 0, callback, error_handler, REQ_OP_WRITE | REQ_PREFLUSH, NULL); +} + +#endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c new file mode 100644 index 00000000000..6fffac8169a --- /dev/null +++ b/drivers/md/dm-vdo/logical-zone.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "logical-zone.h" + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "string-utils.h" + +#include "action-manager.h" +#include "admin-state.h" +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "flush.h" +#include "int-map.h" +#include "physical-zone.h" +#include "vdo.h" + +enum { + ALLOCATIONS_PER_ZONE = 128, +}; + +/** + * as_logical_zone() - Convert a generic vdo_completion to a logical_zone. + * @completion: The completion to convert. + * + * Return: The completion as a logical_zone. + */ +static struct logical_zone *as_logical_zone(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_GENERATION_FLUSHED_COMPLETION); + return container_of(completion, struct logical_zone, completion); +} + +/* get_thread_id_for_zone() - Implements vdo_zone_thread_getter. */ +static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number) +{ + struct logical_zones *zones = context; + + return zones->zones[zone_number].thread_id; +} + +/** + * initialize_zone() - Initialize a logical zone. + * @zones: The logical_zones to which this zone belongs. + * @zone_number: The logical_zone's index. + */ +static int initialize_zone(struct logical_zones *zones, zone_count_t zone_number) +{ + int result; + struct vdo *vdo = zones->vdo; + struct logical_zone *zone = &zones->zones[zone_number]; + zone_count_t allocation_zone_number; + + result = vdo_make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->lbn_operations); + if (result != VDO_SUCCESS) + return result; + + if (zone_number < vdo->thread_config.logical_zone_count - 1) + zone->next = &zones->zones[zone_number + 1]; + + vdo_initialize_completion(&zone->completion, vdo, VDO_GENERATION_FLUSHED_COMPLETION); + zone->zones = zones; + zone->zone_number = zone_number; + zone->thread_id = vdo->thread_config.logical_threads[zone_number]; + zone->block_map_zone = &vdo->block_map->zones[zone_number]; + INIT_LIST_HEAD(&zone->write_vios); + vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + + allocation_zone_number = zone->thread_id % vdo->thread_config.physical_zone_count; + zone->allocation_zone = &vdo->physical_zones->zones[allocation_zone_number]; + + return vdo_make_default_thread(vdo, zone->thread_id); +} + +/** + * vdo_make_logical_zones() - Create a set of logical zones. + * @vdo: The vdo to which the zones will belong. + * @zones_ptr: A pointer to hold the new zones. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) +{ + struct logical_zones *zones; + int result; + zone_count_t zone; + zone_count_t zone_count = vdo->thread_config.logical_zone_count; + + if (zone_count == 0) + return VDO_SUCCESS; + + result = UDS_ALLOCATE_EXTENDED(struct logical_zones, zone_count, + struct logical_zone, __func__, &zones); + if (result != VDO_SUCCESS) + return result; + + zones->vdo = vdo; + zones->zone_count = zone_count; + for (zone = 0; zone < zone_count; zone++) { + result = initialize_zone(zones, zone); + if (result != VDO_SUCCESS) { + vdo_free_logical_zones(zones); + return result; + } + } + + result = vdo_make_action_manager(zones->zone_count, + get_thread_id_for_zone, + vdo->thread_config.admin_thread, + zones, + NULL, + vdo, + &zones->manager); + if (result != VDO_SUCCESS) { + vdo_free_logical_zones(zones); + return result; + } + + *zones_ptr = zones; + return VDO_SUCCESS; +} + +/** + * vdo_free_logical_zones() - Free a set of logical zones. + * @zones: The set of zones to free. + */ +void vdo_free_logical_zones(struct logical_zones *zones) +{ + zone_count_t index; + + if (zones == NULL) + return; + + UDS_FREE(UDS_FORGET(zones->manager)); + + for (index = 0; index < zones->zone_count; index++) + vdo_free_int_map(UDS_FORGET(zones->zones[index].lbn_operations)); + + UDS_FREE(zones); +} + +static inline void assert_on_zone_thread(struct logical_zone *zone, const char *what) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id), + "%s() called on correct thread", what); +} + +/** + * check_for_drain_complete() - Check whether this zone has drained. + * @zone: The zone to check. + */ +static void check_for_drain_complete(struct logical_zone *zone) +{ + if (!vdo_is_state_draining(&zone->state) || zone->notifying || + !list_empty(&zone->write_vios)) + return; + + vdo_finish_draining(&zone->state); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + check_for_drain_complete(container_of(state, struct logical_zone, state)); +} + +/** + * drain_logical_zone() - Drain a logical zone. + * + * Implements vdo_zone_action. + */ +static void +drain_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct logical_zones *zones = context; + + vdo_start_draining(&zones->zones[zone_number].state, + vdo_get_current_manager_operation(zones->manager), + parent, + initiate_drain); +} + +void vdo_drain_logical_zones(struct logical_zones *zones, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + vdo_schedule_operation(zones->manager, operation, NULL, drain_logical_zone, NULL, parent); +} + +/** + * resume_logical_zone() - Resume a logical zone. + * + * Implements vdo_zone_action. + */ +static void +resume_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct logical_zone *zone = &(((struct logical_zones *) context)->zones[zone_number]); + + vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state)); +} + +/** + * vdo_resume_logical_zones() - Resume a set of logical zones. + * @zones: The logical zones to resume. + * @parent: The object to notify when the zones have resumed. + */ +void vdo_resume_logical_zones(struct logical_zones *zones, struct vdo_completion *parent) +{ + vdo_schedule_operation(zones->manager, VDO_ADMIN_STATE_RESUMING, NULL, + resume_logical_zone, NULL, parent); +} + +/** + * update_oldest_active_generation() - Update the oldest active generation. + * @zone: The zone. + * + * Return: true if the oldest active generation has changed. + */ +static bool update_oldest_active_generation(struct logical_zone *zone) +{ + struct data_vio *data_vio = + list_first_entry_or_null(&zone->write_vios, struct data_vio, write_entry); + sequence_number_t oldest = + (data_vio == NULL) ? zone->flush_generation : data_vio->flush_generation; + + if (oldest == zone->oldest_active_generation) + return false; + + WRITE_ONCE(zone->oldest_active_generation, oldest); + return true; +} + +/** + * vdo_increment_logical_zone_flush_generation() - Increment the flush generation in a logical + * zone. + * @zone: The logical zone. + * @expected_generation: The expected value of the flush generation before the increment. + */ +void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone, + sequence_number_t expected_generation) +{ + assert_on_zone_thread(zone, __func__); + ASSERT_LOG_ONLY((zone->flush_generation == expected_generation), + "logical zone %u flush generation %llu should be %llu before increment", + zone->zone_number, + (unsigned long long) zone->flush_generation, + (unsigned long long) expected_generation); + + zone->flush_generation++; + zone->ios_in_flush_generation = 0; + update_oldest_active_generation(zone); +} + +/** + * vdo_acquire_flush_generation_lock() - Acquire the shared lock on a flush generation by a write + * data_vio. + * @data_vio: The data_vio. + */ +void vdo_acquire_flush_generation_lock(struct data_vio *data_vio) +{ + struct logical_zone *zone = data_vio->logical.zone; + + assert_on_zone_thread(zone, __func__); + ASSERT_LOG_ONLY(vdo_is_state_normal(&zone->state), "vdo state is normal"); + + data_vio->flush_generation = zone->flush_generation; + list_add_tail(&data_vio->write_entry, &zone->write_vios); + zone->ios_in_flush_generation++; +} + +static void attempt_generation_complete_notification(struct vdo_completion *completion); + +/** + * notify_flusher() - Notify the flush that at least one generation no longer has active VIOs. + * @completion: The zone completion. + * + * This callback is registered in attempt_generation_complete_notification(). + */ +static void notify_flusher(struct vdo_completion *completion) +{ + struct logical_zone *zone = as_logical_zone(completion); + + vdo_complete_flushes(zone->zones->vdo->flusher); + vdo_launch_completion_callback(completion, + attempt_generation_complete_notification, + zone->thread_id); +} + +/** + * void attempt_generation_complete_notification() - Notify the flusher if some generation no + * longer has active VIOs. + * @completion: The zone completion. + */ +static void attempt_generation_complete_notification(struct vdo_completion *completion) +{ + struct logical_zone *zone = as_logical_zone(completion); + + assert_on_zone_thread(zone, __func__); + if (zone->oldest_active_generation <= zone->notification_generation) { + zone->notifying = false; + check_for_drain_complete(zone); + return; + } + + zone->notifying = true; + zone->notification_generation = zone->oldest_active_generation; + vdo_launch_completion_callback(&zone->completion, notify_flusher, + vdo_get_flusher_thread_id(zone->zones->vdo->flusher)); +} + +/** + * vdo_release_flush_generation_lock() - Release the shared lock on a flush generation held by a + * write data_vio. + * @data_vio: The data_vio whose lock is to be released. + * + * If there are pending flushes, and this data_vio completes the oldest generation active in this + * zone, an attempt will be made to finish any flushes which may now be complete. + */ +void vdo_release_flush_generation_lock(struct data_vio *data_vio) +{ + struct logical_zone *zone = data_vio->logical.zone; + + assert_on_zone_thread(zone, __func__); + + if (!data_vio_has_flush_generation_lock(data_vio)) + return; + + list_del_init(&data_vio->write_entry); + ASSERT_LOG_ONLY((zone->oldest_active_generation <= data_vio->flush_generation), + "data_vio releasing lock on generation %llu is not older than oldest active generation %llu", + (unsigned long long) data_vio->flush_generation, + (unsigned long long) zone->oldest_active_generation); + + if (!update_oldest_active_generation(zone) || zone->notifying) + return; + + attempt_generation_complete_notification(&zone->completion); +} + +struct physical_zone *vdo_get_next_allocation_zone(struct logical_zone *zone) +{ + if (zone->allocation_count == ALLOCATIONS_PER_ZONE) { + zone->allocation_count = 0; + zone->allocation_zone = zone->allocation_zone->next; + } + + zone->allocation_count++; + return zone->allocation_zone; +} + +/** + * vdo_dump_logical_zone() - Dump information about a logical zone to the log for debugging. + * @zone: The zone to dump + * + * Context: the information is dumped in a thread-unsafe fashion. + * + */ +void vdo_dump_logical_zone(const struct logical_zone *zone) +{ + uds_log_info("logical_zone %u", zone->zone_number); + uds_log_info(" flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu", + (unsigned long long) READ_ONCE(zone->flush_generation), + (unsigned long long) READ_ONCE(zone->oldest_active_generation), + (unsigned long long) READ_ONCE(zone->notification_generation), + uds_bool_to_string(READ_ONCE(zone->notifying)), + (unsigned long long) READ_ONCE(zone->ios_in_flush_generation)); +} diff --git a/drivers/md/dm-vdo/logical-zone.h b/drivers/md/dm-vdo/logical-zone.h new file mode 100644 index 00000000000..3da788b0c2f --- /dev/null +++ b/drivers/md/dm-vdo/logical-zone.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_LOGICAL_ZONE_H +#define VDO_LOGICAL_ZONE_H + +#include <linux/list.h> + +#include "admin-state.h" +#include "int-map.h" +#include "types.h" + +struct physical_zone; + +struct logical_zone { + /* The completion for flush notifications */ + struct vdo_completion completion; + /* The owner of this zone */ + struct logical_zones *zones; + /* Which logical zone this is */ + zone_count_t zone_number; + /* The thread id for this zone */ + thread_id_t thread_id; + /* In progress operations keyed by LBN */ + struct int_map *lbn_operations; + /* The logical to physical map */ + struct block_map_zone *block_map_zone; + /* The current flush generation */ + sequence_number_t flush_generation; + /* + * The oldest active generation in this zone. This is mutated only on the logical zone + * thread but is queried from the flusher thread. + */ + sequence_number_t oldest_active_generation; + /* The number of IOs in the current flush generation */ + block_count_t ios_in_flush_generation; + /* The youngest generation of the current notification */ + sequence_number_t notification_generation; + /* Whether a notification is in progress */ + bool notifying; + /* The queue of active data write VIOs */ + struct list_head write_vios; + /* The administrative state of the zone */ + struct admin_state state; + /* The physical zone from which to allocate */ + struct physical_zone *allocation_zone; + /* The number of allocations done from the current allocation_zone */ + block_count_t allocation_count; + /* The next zone */ + struct logical_zone *next; +}; + +struct logical_zones { + /* The vdo whose zones these are */ + struct vdo *vdo; + /* The manager for administrative actions */ + struct action_manager *manager; + /* The number of zones */ + zone_count_t zone_count; + /* The logical zones themselves */ + struct logical_zone zones[]; +}; + +int __must_check vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr); + +void vdo_free_logical_zones(struct logical_zones *zones); + +void vdo_drain_logical_zones(struct logical_zones *zones, + const struct admin_state_code *operation, + struct vdo_completion *completion); + +void vdo_resume_logical_zones(struct logical_zones *zones, struct vdo_completion *parent); + +void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone, + sequence_number_t expected_generation); + +void vdo_acquire_flush_generation_lock(struct data_vio *data_vio); + +void vdo_release_flush_generation_lock(struct data_vio *data_vio); + +struct physical_zone * __must_check vdo_get_next_allocation_zone(struct logical_zone *zone); + +void vdo_dump_logical_zone(const struct logical_zone *zone); + +#endif /* VDO_LOGICAL_ZONE_H */ diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c new file mode 100644 index 00000000000..43a37623b90 --- /dev/null +++ b/drivers/md/dm-vdo/message-stats.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "dedupe.h" +#include "logger.h" +#include "memory-alloc.h" +#include "message-stats.h" +#include "statistics.h" +#include "thread-device.h" +#include "vdo.h" + +static int write_u64(char *prefix, + u64 value, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int count = scnprintf(*buf, *maxlen, "%s%llu%s", + prefix == NULL ? "" : prefix, + value, + suffix == NULL ? "" : suffix); + *buf += count; + *maxlen -= count; + if (count >= *maxlen) + return VDO_UNEXPECTED_EOF; + return VDO_SUCCESS; +} + +static int write_u32(char *prefix, + u32 value, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int count = scnprintf(*buf, *maxlen, "%s%u%s", + prefix == NULL ? "" : prefix, + value, + suffix == NULL ? "" : suffix); + *buf += count; + *maxlen -= count; + if (count >= *maxlen) + return VDO_UNEXPECTED_EOF; + return VDO_SUCCESS; +} + +static int write_block_count_t(char *prefix, + block_count_t value, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int count = scnprintf(*buf, *maxlen, "%s%llu%s", + prefix == NULL ? "" : prefix, + value, + suffix == NULL ? "" : suffix); + *buf += count; + *maxlen -= count; + if (count >= *maxlen) + return VDO_UNEXPECTED_EOF; + return VDO_SUCCESS; +} + +static int write_string(char *prefix, + char *value, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int count = scnprintf(*buf, *maxlen, "%s%s%s", + prefix == NULL ? "" : prefix, + value, + suffix == NULL ? "" : suffix); + *buf += count; + *maxlen -= count; + if (count >= *maxlen) + return VDO_UNEXPECTED_EOF; + return VDO_SUCCESS; +} + +static int write_bool(char *prefix, + bool value, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int count = scnprintf(*buf, *maxlen, "%s%d%s", + prefix == NULL ? "" : prefix, + value, + suffix == NULL ? "" : suffix); + *buf += count; + *maxlen -= count; + if (count >= *maxlen) + return VDO_UNEXPECTED_EOF; + return VDO_SUCCESS; +} + +static int write_u8(char *prefix, + u8 value, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int count = scnprintf(*buf, *maxlen, "%s%u%s", + prefix == NULL ? "" : prefix, + value, + suffix == NULL ? "" : suffix); + *buf += count; + *maxlen -= count; + if (count >= *maxlen) + return VDO_UNEXPECTED_EOF; + return VDO_SUCCESS; +} + +static int write_block_allocator_statistics(char *prefix, + struct block_allocator_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* The total number of slabs from which blocks may be allocated */ + result = write_u64("slabCount : ", + stats->slab_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The total number of slabs from which blocks have ever been allocated */ + result = write_u64("slabsOpened : ", + stats->slabs_opened, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The number of times since loading that a slab has been re-opened */ + result = write_u64("slabsReopened : ", + stats->slabs_reopened, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_commit_statistics(char *prefix, + struct commit_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* The total number of items on which processing has started */ + result = write_u64("started : ", + stats->started, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The total number of items for which a write operation has been issued */ + result = write_u64("written : ", + stats->written, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The total number of items for which a write operation has completed */ + result = write_u64("committed : ", + stats->committed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_recovery_journal_statistics(char *prefix, + struct recovery_journal_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the on-disk journal was full */ + result = write_u64("diskFull : ", + stats->disk_full, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the recovery journal requested slab journal commits. */ + result = write_u64("slabJournalCommitsRequested : ", + stats->slab_journal_commits_requested, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Write/Commit totals for individual journal entries */ + result = write_commit_statistics("entries : ", + &stats->entries, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Write/Commit totals for journal blocks */ + result = write_commit_statistics("blocks : ", + &stats->blocks, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_packer_statistics(char *prefix, + struct packer_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of compressed data items written since startup */ + result = write_u64("compressedFragmentsWritten : ", + stats->compressed_fragments_written, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of blocks containing compressed items written since startup */ + result = write_u64("compressedBlocksWritten : ", + stats->compressed_blocks_written, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of VIOs that are pending in the packer */ + result = write_u64("compressedFragmentsInPacker : ", + stats->compressed_fragments_in_packer, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_slab_journal_statistics(char *prefix, + struct slab_journal_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the on-disk journal was full */ + result = write_u64("diskFullCount : ", + stats->disk_full_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times an entry was added over the flush threshold */ + result = write_u64("flushCount : ", + stats->flush_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times an entry was added over the block threshold */ + result = write_u64("blockedCount : ", + stats->blocked_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times a tail block was written */ + result = write_u64("blocksWritten : ", + stats->blocks_written, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times we had to wait for the tail to write */ + result = write_u64("tailBusyCount : ", + stats->tail_busy_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_slab_summary_statistics(char *prefix, + struct slab_summary_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of blocks written */ + result = write_u64("blocksWritten : ", + stats->blocks_written, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_ref_counts_statistics(char *prefix, + struct ref_counts_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of reference blocks written */ + result = write_u64("blocksWritten : ", + stats->blocks_written, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_block_map_statistics(char *prefix, + struct block_map_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of dirty (resident) pages */ + result = write_u32("dirtyPages : ", + stats->dirty_pages, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of clean (resident) pages */ + result = write_u32("cleanPages : ", + stats->clean_pages, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of free pages */ + result = write_u32("freePages : ", + stats->free_pages, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of pages in failed state */ + result = write_u32("failedPages : ", + stats->failed_pages, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of pages incoming */ + result = write_u32("incomingPages : ", + stats->incoming_pages, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of pages outgoing */ + result = write_u32("outgoingPages : ", + stats->outgoing_pages, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* how many times free page not avail */ + result = write_u32("cachePressure : ", + stats->cache_pressure, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of get_vdo_page() calls for read */ + result = write_u64("readCount : ", + stats->read_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of get_vdo_page() calls for write */ + result = write_u64("writeCount : ", + stats->write_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of times pages failed to read */ + result = write_u64("failedReads : ", + stats->failed_reads, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of times pages failed to write */ + result = write_u64("failedWrites : ", + stats->failed_writes, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of gets that are reclaimed */ + result = write_u64("reclaimed : ", + stats->reclaimed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of gets for outgoing pages */ + result = write_u64("readOutgoing : ", + stats->read_outgoing, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of gets that were already there */ + result = write_u64("foundInCache : ", + stats->found_in_cache, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of gets requiring discard */ + result = write_u64("discardRequired : ", + stats->discard_required, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of gets enqueued for their page */ + result = write_u64("waitForPage : ", + stats->wait_for_page, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of gets that have to fetch */ + result = write_u64("fetchRequired : ", + stats->fetch_required, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of page fetches */ + result = write_u64("pagesLoaded : ", + stats->pages_loaded, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of page saves */ + result = write_u64("pagesSaved : ", + stats->pages_saved, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* the number of flushes issued */ + result = write_u64("flushCount : ", + stats->flush_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_hash_lock_statistics(char *prefix, + struct hash_lock_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the UDS advice proved correct */ + result = write_u64("dedupeAdviceValid : ", + stats->dedupe_advice_valid, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the UDS advice proved incorrect */ + result = write_u64("dedupeAdviceStale : ", + stats->dedupe_advice_stale, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of writes with the same data as another in-flight write */ + result = write_u64("concurrentDataMatches : ", + stats->concurrent_data_matches, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of writes whose hash collided with an in-flight write */ + result = write_u64("concurrentHashCollisions : ", + stats->concurrent_hash_collisions, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Current number of dedupe queries that are in flight */ + result = write_u32("currDedupeQueries : ", + stats->curr_dedupe_queries, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_error_statistics(char *prefix, + struct error_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of times VDO got an invalid dedupe advice PBN from UDS */ + result = write_u64("invalidAdvicePBNCount : ", + stats->invalid_advice_pbn_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of times a VIO completed with a VDO_NO_SPACE error */ + result = write_u64("noSpaceErrorCount : ", + stats->no_space_error_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of times a VIO completed with a VDO_READ_ONLY error */ + result = write_u64("readOnlyErrorCount : ", + stats->read_only_error_count, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_bio_stats(char *prefix, + struct bio_stats *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of REQ_OP_READ bios */ + result = write_u64("read : ", + stats->read, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of REQ_OP_WRITE bios with data */ + result = write_u64("write : ", + stats->write, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of bios tagged with REQ_PREFLUSH and containing no data */ + result = write_u64("emptyFlush : ", + stats->empty_flush, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of REQ_OP_DISCARD bios */ + result = write_u64("discard : ", + stats->discard, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of bios tagged with REQ_PREFLUSH */ + result = write_u64("flush : ", + stats->flush, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of bios tagged with REQ_FUA */ + result = write_u64("fua : ", + stats->fua, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_memory_usage(char *prefix, + struct memory_usage *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Tracked bytes currently allocated. */ + result = write_u64("bytesUsed : ", + stats->bytes_used, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Maximum tracked bytes allocated. */ + result = write_u64("peakBytesUsed : ", + stats->peak_bytes_used, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_index_statistics(char *prefix, + struct index_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of records stored in the index */ + result = write_u64("entriesIndexed : ", + stats->entries_indexed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of post calls that found an existing entry */ + result = write_u64("postsFound : ", + stats->posts_found, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of post calls that added a new entry */ + result = write_u64("postsNotFound : ", + stats->posts_not_found, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of query calls that found an existing entry */ + result = write_u64("queriesFound : ", + stats->queries_found, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of query calls that added a new entry */ + result = write_u64("queriesNotFound : ", + stats->queries_not_found, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of update calls that found an existing entry */ + result = write_u64("updatesFound : ", + stats->updates_found, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of update calls that added a new entry */ + result = write_u64("updatesNotFound : ", + stats->updates_not_found, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of entries discarded */ + result = write_u64("entriesDiscarded : ", + stats->entries_discarded, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +static int write_vdo_statistics(char *prefix, + struct vdo_statistics *stats, + char *suffix, + char **buf, + unsigned int *maxlen) +{ + int result; + + result = write_string(prefix, "{ ", NULL, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_u32("version : ", + stats->version, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_u32("releaseVersion : ", + stats->release_version, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of blocks used for data */ + result = write_u64("dataBlocksUsed : ", + stats->data_blocks_used, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of blocks used for VDO metadata */ + result = write_u64("overheadBlocksUsed : ", + stats->overhead_blocks_used, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of logical blocks that are currently mapped to physical blocks */ + result = write_u64("logicalBlocksUsed : ", + stats->logical_blocks_used, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of physical blocks */ + result = write_block_count_t("physicalBlocks : ", + stats->physical_blocks, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* number of logical blocks */ + result = write_block_count_t("logicalBlocks : ", + stats->logical_blocks, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Size of the block map page cache, in bytes */ + result = write_u64("blockMapCacheSize : ", + stats->block_map_cache_size, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The physical block size */ + result = write_u64("blockSize : ", + stats->block_size, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the VDO has successfully recovered */ + result = write_u64("completeRecoveries : ", + stats->complete_recoveries, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the VDO has recovered from read-only mode */ + result = write_u64("readOnlyRecoveries : ", + stats->read_only_recoveries, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* String describing the operating mode of the VDO */ + result = write_string("mode : ", + stats->mode, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Whether the VDO is in recovery mode */ + result = write_bool("inRecoveryMode : ", + stats->in_recovery_mode, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* What percentage of recovery mode work has been completed */ + result = write_u8("recoveryPercentage : ", + stats->recovery_percentage, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The statistics for the compressed block packer */ + result = write_packer_statistics("packer : ", + &stats->packer, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Counters for events in the block allocator */ + result = write_block_allocator_statistics("allocator : ", + &stats->allocator, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Counters for events in the recovery journal */ + result = write_recovery_journal_statistics("journal : ", + &stats->journal, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The statistics for the slab journals */ + result = write_slab_journal_statistics("slabJournal : ", + &stats->slab_journal, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The statistics for the slab summary */ + result = write_slab_summary_statistics("slabSummary : ", + &stats->slab_summary, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The statistics for the reference counts */ + result = write_ref_counts_statistics("refCounts : ", + &stats->ref_counts, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The statistics for the block map */ + result = write_block_map_statistics("blockMap : ", + &stats->block_map, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The dedupe statistics from hash locks */ + result = write_hash_lock_statistics("hashLock : ", + &stats->hash_lock, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Counts of error conditions */ + result = write_error_statistics("errors : ", + &stats->errors, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The VDO instance */ + result = write_u32("instance : ", + stats->instance, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Current number of active VIOs */ + result = write_u32("currentVIOsInProgress : ", + stats->current_vios_in_progress, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Maximum number of active VIOs */ + result = write_u32("maxVIOs : ", + stats->max_vios, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of times the UDS index was too slow in responding */ + result = write_u64("dedupeAdviceTimeouts : ", + stats->dedupe_advice_timeouts, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Number of flush requests submitted to the storage device */ + result = write_u64("flushOut : ", + stats->flush_out, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Logical block size */ + result = write_u64("logicalBlockSize : ", + stats->logical_block_size, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Bios submitted into VDO from above */ + result = write_bio_stats("biosIn : ", + &stats->bios_in, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosInPartial : ", + &stats->bios_in_partial, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Bios submitted onward for user data */ + result = write_bio_stats("biosOut : ", + &stats->bios_out, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Bios submitted onward for metadata */ + result = write_bio_stats("biosMeta : ", + &stats->bios_meta, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosJournal : ", + &stats->bios_journal, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosPageCache : ", + &stats->bios_page_cache, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosOutCompleted : ", + &stats->bios_out_completed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosMetaCompleted : ", + &stats->bios_meta_completed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosJournalCompleted : ", + &stats->bios_journal_completed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosPageCacheCompleted : ", + &stats->bios_page_cache_completed, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosAcknowledged : ", + &stats->bios_acknowledged, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_bio_stats("biosAcknowledgedPartial : ", + &stats->bios_acknowledged_partial, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Current number of bios in progress */ + result = write_bio_stats("biosInProgress : ", + &stats->bios_in_progress, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* Memory usage stats. */ + result = write_memory_usage("memoryUsage : ", + &stats->memory_usage, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + /* The statistics for the UDS index */ + result = write_index_statistics("index : ", + &stats->index, + ", ", + buf, + maxlen); + if (result != VDO_SUCCESS) + return result; + result = write_string(NULL, "}", suffix, buf, maxlen); + if (result != VDO_SUCCESS) + return result; + return VDO_SUCCESS; +} + +int vdo_write_stats(struct vdo *vdo, + char *buf, + unsigned int maxlen) +{ + struct vdo_statistics *stats; + int result; + + result = UDS_ALLOCATE(1, struct vdo_statistics, __func__, &stats); + if (result != VDO_SUCCESS) + return result; + + vdo_fetch_statistics(vdo, stats); + result = write_vdo_statistics(NULL, stats, NULL, &buf, &maxlen); + UDS_FREE(stats); + return result; +} diff --git a/drivers/md/dm-vdo/message-stats.h b/drivers/md/dm-vdo/message-stats.h new file mode 100644 index 00000000000..fdcc819ce11 --- /dev/null +++ b/drivers/md/dm-vdo/message-stats.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_MESSAGE_STATS_H +#define VDO_MESSAGE_STATS_H + +#include "types.h" + +int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen); + +#endif /* VDO_MESSAGE_STATS_H */ diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c new file mode 100644 index 00000000000..f9b67777850 --- /dev/null +++ b/drivers/md/dm-vdo/packer.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "packer.h" + +#include <linux/atomic.h> +#include <linux/blkdev.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "string-utils.h" + +#include "admin-state.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "dedupe.h" +#include "encodings.h" +#include "io-submitter.h" +#include "physical-zone.h" +#include "status-codes.h" +#include "vdo.h" +#include "vio.h" + +static const struct version_number COMPRESSED_BLOCK_1_0 = { + .major_version = 1, + .minor_version = 0, +}; + +enum { + COMPRESSED_BLOCK_1_0_SIZE = 4 + 4 + (2 * VDO_MAX_COMPRESSION_SLOTS), +}; + +/** + * vdo_get_compressed_block_fragment() - Get a reference to a compressed fragment from a compressed + * block. + * @mapping_state [in] The mapping state for the look up. + * @compressed_block [in] The compressed block that was read from disk. + * @fragment_offset [out] The offset of the fragment within a compressed block. + * @fragment_size [out] The size of the fragment. + * + * Return: If a valid compressed fragment is found, VDO_SUCCESS; otherwise, VDO_INVALID_FRAGMENT if + * the fragment is invalid. + */ +int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state, + struct compressed_block *block, + u16 *fragment_offset, + u16 *fragment_size) +{ + u16 compressed_size; + u16 offset = 0; + unsigned int i; + u8 slot; + struct version_number version; + + if (!vdo_is_state_compressed(mapping_state)) + return VDO_INVALID_FRAGMENT; + + version = vdo_unpack_version_number(block->header.version); + if (!vdo_are_same_version(version, COMPRESSED_BLOCK_1_0)) + return VDO_INVALID_FRAGMENT; + + slot = mapping_state - VDO_MAPPING_STATE_COMPRESSED_BASE; + if (slot >= VDO_MAX_COMPRESSION_SLOTS) + return VDO_INVALID_FRAGMENT; + + compressed_size = __le16_to_cpu(block->header.sizes[slot]); + for (i = 0; i < slot; i++) { + offset += __le16_to_cpu(block->header.sizes[i]); + if (offset >= VDO_COMPRESSED_BLOCK_DATA_SIZE) + return VDO_INVALID_FRAGMENT; + } + + if ((offset + compressed_size) > VDO_COMPRESSED_BLOCK_DATA_SIZE) + return VDO_INVALID_FRAGMENT; + + *fragment_offset = offset; + *fragment_size = compressed_size; + return VDO_SUCCESS; +} + +/** + * assert_on_packer_thread() - Check that we are on the packer thread. + * @packer: The packer. + * @caller: The function which is asserting. + */ +static inline void assert_on_packer_thread(struct packer *packer, const char *caller) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == packer->thread_id), + "%s() called from packer thread", caller); +} + +/** + * insert_in_sorted_list() - Insert a bin to the list. + * @packer: The packer. + * @bin: The bin to move to its sorted position. + * + * The list is in ascending order of free space. Since all bins are already in the list, this + * actually moves the bin to the correct position in the list. + */ +static void insert_in_sorted_list(struct packer *packer, struct packer_bin *bin) +{ + struct packer_bin *active_bin; + + list_for_each_entry(active_bin, &packer->bins, list) + if (active_bin->free_space > bin->free_space) { + list_move_tail(&bin->list, &active_bin->list); + return; + } + + list_move_tail(&bin->list, &packer->bins); +} + +/** + * make_bin() - Allocate a bin and put it into the packer's list. + * @packer: The packer. + */ +static int __must_check make_bin(struct packer *packer) +{ + struct packer_bin *bin; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct packer_bin, + VDO_MAX_COMPRESSION_SLOTS, + struct vio *, + __func__, + &bin); + if (result != VDO_SUCCESS) + return result; + + bin->free_space = VDO_COMPRESSED_BLOCK_DATA_SIZE; + INIT_LIST_HEAD(&bin->list); + list_add_tail(&bin->list, &packer->bins); + return VDO_SUCCESS; +} + +/** + * vdo_make_packer() - Make a new block packer. + * + * @vdo: The vdo to which this packer belongs. + * @bin_count: The number of partial bins to keep in memory. + * @packer_ptr: A pointer to hold the new packer. + * + * Return: VDO_SUCCESS or an error + */ +int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **packer_ptr) +{ + struct packer *packer; + block_count_t i; + int result; + + result = UDS_ALLOCATE(1, struct packer, __func__, &packer); + if (result != VDO_SUCCESS) + return result; + + packer->thread_id = vdo->thread_config.packer_thread; + packer->size = bin_count; + INIT_LIST_HEAD(&packer->bins); + vdo_set_admin_state_code(&packer->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + + for (i = 0; i < bin_count; i++) { + result = make_bin(packer); + if (result != VDO_SUCCESS) { + vdo_free_packer(packer); + return result; + } + } + + /* + * The canceled bin can hold up to half the number of user vios. Every canceled vio in the + * bin must have a canceler for which it is waiting, and any canceler will only have + * canceled one lock holder at a time. + */ + result = UDS_ALLOCATE_EXTENDED(struct packer_bin, + MAXIMUM_VDO_USER_VIOS / 2, + struct vio *, __func__, + &packer->canceled_bin); + if (result != VDO_SUCCESS) { + vdo_free_packer(packer); + return result; + } + + result = vdo_make_default_thread(vdo, packer->thread_id); + if (result != VDO_SUCCESS) { + vdo_free_packer(packer); + return result; + } + + *packer_ptr = packer; + return VDO_SUCCESS; +} + +/** + * vdo_free_packer() - Free a block packer. + * @packer: The packer to free. + */ +void vdo_free_packer(struct packer *packer) +{ + struct packer_bin *bin, *tmp; + + if (packer == NULL) + return; + + list_for_each_entry_safe(bin, tmp, &packer->bins, list) { + list_del_init(&bin->list); + UDS_FREE(bin); + } + + UDS_FREE(UDS_FORGET(packer->canceled_bin)); + UDS_FREE(packer); +} + +/** + * get_packer_from_data_vio() - Get the packer from a data_vio. + * @data_vio: The data_vio. + * + * Return: The packer from the VDO to which the data_vio belongs. + */ +static inline struct packer *get_packer_from_data_vio(struct data_vio *data_vio) +{ + return vdo_from_data_vio(data_vio)->packer; +} + +/** + * vdo_get_packer_statistics() - Get the current statistics from the packer. + * @packer: The packer to query. + * + * Return: a copy of the current statistics for the packer. + */ +struct packer_statistics vdo_get_packer_statistics(const struct packer *packer) +{ + const struct packer_statistics *stats = &packer->statistics; + + return (struct packer_statistics) { + .compressed_fragments_written = READ_ONCE(stats->compressed_fragments_written), + .compressed_blocks_written = READ_ONCE(stats->compressed_blocks_written), + .compressed_fragments_in_packer = READ_ONCE(stats->compressed_fragments_in_packer), + }; +} + +/** + * abort_packing() - Abort packing a data_vio. + * @data_vio: The data_vio to abort. + */ +static void abort_packing(struct data_vio *data_vio) +{ + struct packer *packer = get_packer_from_data_vio(data_vio); + + WRITE_ONCE(packer->statistics.compressed_fragments_in_packer, + packer->statistics.compressed_fragments_in_packer - 1); + + write_data_vio(data_vio); +} + +/** + * release_compressed_write_waiter() - Update a data_vio for which a successful compressed write + * has completed and send it on its way. + + * @data_vio: The data_vio to release. + * @allocation: The allocation to which the compressed block was written. + */ +static void +release_compressed_write_waiter(struct data_vio *data_vio, struct allocation *allocation) +{ + data_vio->new_mapped = (struct zoned_pbn) { + .pbn = allocation->pbn, + .zone = allocation->zone, + .state = data_vio->compression.slot + VDO_MAPPING_STATE_COMPRESSED_BASE, + }; + + vdo_share_compressed_write_lock(data_vio, allocation->lock); + update_metadata_for_data_vio_write(data_vio, allocation->lock); +} + +/** + * finish_compressed_write() - Finish a compressed block write. + * @completion: The compressed write completion. + * + * This callback is registered in continue_after_allocation(). + */ +static void finish_compressed_write(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct data_vio *client, *next; + + assert_data_vio_in_allocated_zone(agent); + + /* + * Process all the non-agent waiters first to ensure that the pbn lock can not be released + * until all of them have had a chance to journal their increfs. + */ + for (client = agent->compression.next_in_batch; client != NULL; client = next) { + next = client->compression.next_in_batch; + release_compressed_write_waiter(client, &agent->allocation); + } + + completion->error_handler = handle_data_vio_error; + release_compressed_write_waiter(agent, &agent->allocation); +} + +static void handle_compressed_write_error(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + struct allocation *allocation = &agent->allocation; + struct data_vio *client, *next; + + if (vdo_requeue_completion_if_needed(completion, allocation->zone->thread_id)) + return; + + update_vio_error_stats(as_vio(completion), + "Completing compressed write vio for physical block %llu with error", + (unsigned long long) allocation->pbn); + + for (client = agent->compression.next_in_batch; client != NULL; client = next) { + next = client->compression.next_in_batch; + write_data_vio(client); + } + + /* Now that we've released the batch from the packer, forget the error and continue on. */ + vdo_reset_completion(completion); + completion->error_handler = handle_data_vio_error; + write_data_vio(agent); +} + +/** + * add_to_bin() - Put a data_vio in a specific packer_bin in which it will definitely fit. + * @bin: The bin in which to put the data_vio. + * @data_vio: The data_vio to add. + */ +static void add_to_bin(struct packer_bin *bin, struct data_vio *data_vio) +{ + data_vio->compression.bin = bin; + data_vio->compression.slot = bin->slots_used; + bin->incoming[bin->slots_used++] = data_vio; +} + +/** + * remove_from_bin() - Get the next data_vio whose compression has not been canceled from a bin. + * @packer: The packer. + * @bin: The bin from which to get a data_vio. + * + * Any canceled data_vios will be moved to the canceled bin. + * Return: An uncanceled data_vio from the bin or NULL if there are none. + */ +static struct data_vio *remove_from_bin(struct packer *packer, struct packer_bin *bin) +{ + while (bin->slots_used > 0) { + struct data_vio *data_vio = bin->incoming[--bin->slots_used]; + + if (!advance_data_vio_compression_stage(data_vio).may_not_compress) { + data_vio->compression.bin = NULL; + return data_vio; + } + + add_to_bin(packer->canceled_bin, data_vio); + } + + /* The bin is now empty. */ + bin->free_space = VDO_COMPRESSED_BLOCK_DATA_SIZE; + return NULL; +} + +/** + * initialize_compressed_block() - Initialize a compressed block. + * @block: The compressed block to initialize. + * @size: The size of the agent's fragment. + * + * This method initializes the compressed block in the compressed write agent. Because the + * compressor already put the agent's compressed fragment at the start of the compressed block's + * data field, it needn't be copied. So all we need do is initialize the header and set the size of + * the agent's fragment. + */ +static void initialize_compressed_block(struct compressed_block *block, u16 size) +{ + /* + * Make sure the block layout isn't accidentally changed by changing the length of the + * block header. + */ + STATIC_ASSERT_SIZEOF(struct compressed_block_header, COMPRESSED_BLOCK_1_0_SIZE); + + block->header.version = vdo_pack_version_number(COMPRESSED_BLOCK_1_0); + block->header.sizes[0] = __cpu_to_le16(size); +} + +/** + * pack_fragment() - Pack a data_vio's fragment into the compressed block in which it is already + * known to fit. + * @compression: The agent's compression_state to pack in to. + * @data_vio: The data_vio to pack. + * @offset: The offset into the compressed block at which to pack the fragment. + * @compressed_block: The compressed block which will be written out when batch is fully packed. + * + * Return: The new amount of space used. + */ +static block_size_t __must_check +pack_fragment(struct compression_state *compression, + struct data_vio *data_vio, + block_size_t offset, + slot_number_t slot, + struct compressed_block *block) +{ + struct compression_state *to_pack = &data_vio->compression; + char *fragment = to_pack->block->data; + + to_pack->next_in_batch = compression->next_in_batch; + compression->next_in_batch = data_vio; + to_pack->slot = slot; + block->header.sizes[slot] = __cpu_to_le16(to_pack->size); + memcpy(&block->data[offset], fragment, to_pack->size); + return (offset + to_pack->size); +} + +/** + * compressed_write_end_io() - The bio_end_io for a compressed block write. + * @bio: The bio for the compressed write. + */ +static void compressed_write_end_io(struct bio *bio) +{ + struct data_vio *data_vio = vio_as_data_vio(bio->bi_private); + + vdo_count_completed_bios(bio); + set_data_vio_allocated_zone_callback(data_vio, finish_compressed_write); + continue_data_vio_with_error(data_vio, blk_status_to_errno(bio->bi_status)); +} + +/** + * write_bin() - Write out a bin. + * @packer: The packer. + * @bin: The bin to write. + */ +static void write_bin(struct packer *packer, struct packer_bin *bin) +{ + int result; + block_size_t offset; + slot_number_t slot = 1; + struct compression_state *compression; + struct compressed_block *block; + struct data_vio *agent = remove_from_bin(packer, bin); + struct data_vio *client; + struct packer_statistics *stats; + + if (agent == NULL) + return; + + compression = &agent->compression; + compression->slot = 0; + block = compression->block; + initialize_compressed_block(block, compression->size); + offset = compression->size; + + while ((client = remove_from_bin(packer, bin)) != NULL) + offset = pack_fragment(compression, client, offset, slot++, block); + + /* + * If the batch contains only a single vio, then we save nothing by saving the compressed + * form. Continue processing the single vio in the batch. + */ + if (slot == 1) { + abort_packing(agent); + return; + } + + if (slot < VDO_MAX_COMPRESSION_SLOTS) + /* Clear out the sizes of the unused slots */ + memset(&block->header.sizes[slot], + 0, + (VDO_MAX_COMPRESSION_SLOTS - slot) * sizeof(__le16)); + + agent->vio.completion.error_handler = handle_compressed_write_error; + if (vdo_is_read_only(vdo_from_data_vio(agent))) { + continue_data_vio_with_error(agent, VDO_READ_ONLY); + return; + } + + result = vio_reset_bio(&agent->vio, + (char *) block, + compressed_write_end_io, + REQ_OP_WRITE, + agent->allocation.pbn); + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(agent, result); + return; + } + + /* + * Once the compressed write is submitted, the fragments are no longer in the packer, so + * update stats now. + */ + stats = &packer->statistics; + WRITE_ONCE(stats->compressed_fragments_in_packer, + (stats->compressed_fragments_in_packer - slot)); + WRITE_ONCE(stats->compressed_fragments_written, + (stats->compressed_fragments_written + slot)); + WRITE_ONCE(stats->compressed_blocks_written, stats->compressed_blocks_written + 1); + + submit_data_vio_io(agent); +} + +/** + * add_data_vio_to_packer_bin() - Add a data_vio to a bin's incoming queue + * @packer: The packer. + * @bin: The bin to which to add the data_vio. + * @data_vio: The data_vio to add to the bin's queue. + * + * Adds a data_vio to a bin's incoming queue, handles logical space change, and calls physical + * space processor. + */ +static void add_data_vio_to_packer_bin(struct packer *packer, + struct packer_bin *bin, + struct data_vio *data_vio) +{ + /* If the selected bin doesn't have room, start a new batch to make room. */ + if (bin->free_space < data_vio->compression.size) + write_bin(packer, bin); + + add_to_bin(bin, data_vio); + bin->free_space -= data_vio->compression.size; + + /* If we happen to exactly fill the bin, start a new batch. */ + if ((bin->slots_used == VDO_MAX_COMPRESSION_SLOTS) || + (bin->free_space == 0)) + write_bin(packer, bin); + + /* Now that we've finished changing the free space, restore the sort order. */ + insert_in_sorted_list(packer, bin); +} + +/** + * select_bin() - Select the bin that should be used to pack the compressed data in a data_vio with + * other data_vios. + * @packer: The packer. + * @data_vio: The data_vio. + */ +static struct packer_bin * __must_check +select_bin(struct packer *packer, struct data_vio *data_vio) +{ + /* + * First best fit: select the bin with the least free space that has enough room for the + * compressed data in the data_vio. + */ + struct packer_bin *bin, *fullest_bin; + + list_for_each_entry(bin, &packer->bins, list) + if (bin->free_space >= data_vio->compression.size) + return bin; + + /* + * None of the bins have enough space for the data_vio. We're not allowed to create new + * bins, so we have to overflow one of the existing bins. It's pretty intuitive to select + * the fullest bin, since that "wastes" the least amount of free space in the compressed + * block. But if the space currently used in the fullest bin is smaller than the compressed + * size of the incoming block, it seems wrong to force that bin to write when giving up on + * compressing the incoming data_vio would likewise "waste" the least amount of free space. + */ + fullest_bin = list_first_entry(&packer->bins, struct packer_bin, list); + if (data_vio->compression.size >= + (VDO_COMPRESSED_BLOCK_DATA_SIZE - fullest_bin->free_space)) + return NULL; + + /* + * The fullest bin doesn't have room, but writing it out and starting a new batch with the + * incoming data_vio will increase the packer's free space. + */ + return fullest_bin; +} + +/** + * vdo_attempt_packing() - Attempt to rewrite the data in this data_vio as part of a compressed + * block. + * @data_vio: The data_vio to pack. + */ +void vdo_attempt_packing(struct data_vio *data_vio) +{ + int result; + struct packer_bin *bin; + struct data_vio_compression_status status = get_data_vio_compression_status(data_vio); + struct packer *packer = get_packer_from_data_vio(data_vio); + + assert_on_packer_thread(packer, __func__); + + result = ASSERT((status.stage == DATA_VIO_COMPRESSING), + "attempt to pack data_vio not ready for packing, stage: %u", + status.stage); + if (result != VDO_SUCCESS) + return; + + /* + * Increment whether or not this data_vio will be packed or not since abort_packing() + * always decrements the counter. + */ + WRITE_ONCE(packer->statistics.compressed_fragments_in_packer, + packer->statistics.compressed_fragments_in_packer + 1); + + /* + * If packing of this data_vio is disallowed for administrative reasons, give up before + * making any state changes. + */ + if (!vdo_is_state_normal(&packer->state) || + (data_vio->flush_generation < packer->flush_generation)) { + abort_packing(data_vio); + return; + } + + /* + * The check of may_vio_block_in_packer() here will set the data_vio's compression state to + * VIO_PACKING if the data_vio is allowed to be compressed (if it has already been + * canceled, we'll fall out here). Once the data_vio is in the VIO_PACKING state, it must + * be guaranteed to be put in a bin before any more requests can be processed by the packer + * thread. Otherwise, a canceling data_vio could attempt to remove the canceled data_vio + * from the packer and fail to rendezvous with it (VDO-2809). We must also make sure that + * we will actually bin the data_vio and not give up on it as being larger than the space + * used in the fullest bin. Hence we must call select_bin() before calling + * may_vio_block_in_packer() (VDO-2826). + */ + bin = select_bin(packer, data_vio); + if ((bin == NULL) || + (advance_data_vio_compression_stage(data_vio).stage != DATA_VIO_PACKING)) { + abort_packing(data_vio); + return; + } + + add_data_vio_to_packer_bin(packer, bin, data_vio); +} + +/** + * check_for_drain_complete() - Check whether the packer has drained. + * @packer: The packer. + */ +static void check_for_drain_complete(struct packer *packer) +{ + if (vdo_is_state_draining(&packer->state) && (packer->canceled_bin->slots_used == 0)) + vdo_finish_draining(&packer->state); +} + +/** + * write_all_non_empty_bins() - Write out all non-empty bins on behalf of a flush or suspend. + * @packer: The packer being flushed. + */ +static void write_all_non_empty_bins(struct packer *packer) +{ + struct packer_bin *bin; + + list_for_each_entry(bin, &packer->bins, list) + write_bin(packer, bin); + /* + * We don't need to re-sort the bin here since this loop will make every bin have + * the same amount of free space, so every ordering is sorted. + */ + + check_for_drain_complete(packer); +} + +/** + * vdo_flush_packer() - Request that the packer flush asynchronously. + * @packer: The packer to flush. + * + * All bins with at least two compressed data blocks will be written out, and any solitary pending + * VIOs will be released from the packer. While flushing is in progress, any VIOs submitted to + * vdo_attempt_packing() will be continued immediately without attempting to pack them. + */ +void vdo_flush_packer(struct packer *packer) +{ + assert_on_packer_thread(packer, __func__); + if (vdo_is_state_normal(&packer->state)) + write_all_non_empty_bins(packer); +} + +/** + * vdo_remove_lock_holder_from_packer() - Remove a lock holder from the packer. + * @completion: The data_vio which needs a lock held by a data_vio in the packer. The data_vio's + * compression.lock_holder field will point to the data_vio to remove. + */ +void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct packer *packer = get_packer_from_data_vio(data_vio); + struct data_vio *lock_holder; + struct packer_bin *bin; + slot_number_t slot; + + assert_data_vio_in_packer_zone(data_vio); + + lock_holder = UDS_FORGET(data_vio->compression.lock_holder); + bin = lock_holder->compression.bin; + ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin"); + + slot = lock_holder->compression.slot; + bin->slots_used--; + if (slot < bin->slots_used) { + bin->incoming[slot] = bin->incoming[bin->slots_used]; + bin->incoming[slot]->compression.slot = slot; + } + + lock_holder->compression.bin = NULL; + lock_holder->compression.slot = 0; + + if (bin != packer->canceled_bin) { + bin->free_space += lock_holder->compression.size; + insert_in_sorted_list(packer, bin); + } + + abort_packing(lock_holder); + check_for_drain_complete(packer); +} + +/** + * vdo_increment_packer_flush_generation() - Increment the flush generation in the packer. + * @packer: The packer. + * + * This will also cause the packer to flush so that any VIOs from previous generations will exit + * the packer. + */ +void vdo_increment_packer_flush_generation(struct packer *packer) +{ + assert_on_packer_thread(packer, __func__); + packer->flush_generation++; + vdo_flush_packer(packer); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + struct packer *packer = container_of(state, struct packer, state); + + write_all_non_empty_bins(packer); +} + +/** + * vdo_drain_packer() - Drain the packer by preventing any more VIOs from entering the packer and + * then flushing. + * @packer: The packer to drain. + * @completion: The completion to finish when the packer has drained. + */ +void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion) +{ + assert_on_packer_thread(packer, __func__); + vdo_start_draining(&packer->state, VDO_ADMIN_STATE_SUSPENDING, completion, initiate_drain); +} + +/** + * vdo_resume_packer() - Resume a packer which has been suspended. + * @packer: The packer to resume. + * @parent: The completion to finish when the packer has resumed. + */ +void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent) +{ + assert_on_packer_thread(packer, __func__); + vdo_continue_completion(parent, vdo_resume_if_quiescent(&packer->state)); +} + +static void dump_packer_bin(const struct packer_bin *bin, bool canceled) +{ + if (bin->slots_used == 0) + /* Don't dump empty bins. */ + return; + + uds_log_info(" %sBin slots_used=%u free_space=%zu", + (canceled ? "Canceled" : ""), bin->slots_used, + bin->free_space); + + /* + * FIXME: dump vios in bin->incoming? The vios should have been dumped from the vio pool. + * Maybe just dump their addresses so it's clear they're here? + */ +} + +/** + * vdo_dump_packer() - Dump the packer. + * @packer: The packer. + * + * Context: dumps in a thread-unsafe fashion. + */ +void vdo_dump_packer(const struct packer *packer) +{ + struct packer_bin *bin; + + uds_log_info("packer"); + uds_log_info(" flushGeneration=%llu state %s packer_bin_count=%llu", + (unsigned long long) packer->flush_generation, + vdo_get_admin_state_code(&packer->state)->name, + (unsigned long long) packer->size); + + list_for_each_entry(bin, &packer->bins, list) + dump_packer_bin(bin, false); + + dump_packer_bin(packer->canceled_bin, true); +} diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h new file mode 100644 index 00000000000..8eed435cf9f --- /dev/null +++ b/drivers/md/dm-vdo/packer.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_PACKER_H +#define VDO_PACKER_H + +#include <linux/list.h> + +#include "admin-state.h" +#include "constants.h" +#include "encodings.h" +#include "statistics.h" +#include "types.h" +#include "wait-queue.h" + +enum { + DEFAULT_PACKER_BINS = 16, +}; + +/* The header of a compressed block. */ +struct compressed_block_header { + /* Unsigned 32-bit major and minor versions, little-endian */ + struct packed_version_number version; + + /* List of unsigned 16-bit compressed block sizes, little-endian */ + __le16 sizes[VDO_MAX_COMPRESSION_SLOTS]; +} __packed; + +enum { + VDO_COMPRESSED_BLOCK_DATA_SIZE = VDO_BLOCK_SIZE - sizeof(struct compressed_block_header), + + /* + * A compressed block is only written if we can pack at least two fragments into it, so a + * fragment which fills the entire data portion of a compressed block is too big. + */ + VDO_MAX_COMPRESSED_FRAGMENT_SIZE = VDO_COMPRESSED_BLOCK_DATA_SIZE - 1, +}; + +/* * The compressed block overlay. */ +struct compressed_block { + struct compressed_block_header header; + char data[VDO_COMPRESSED_BLOCK_DATA_SIZE]; +} __packed; + +/* + * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed + * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with + * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or + * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. + * Upon entering the packer, each data_vio already has its compressed data in the first slot of the + * data_vio's compressed_block (overlaid on the data_vio's scratch_block). So the agent's fragment + * is already in place. The fragments for the other uncanceled data_vios in the bin are packed into + * the agent's compressed block. The agent then writes out the compressed block. If the write is + * successful, the agent shares its pbn lock which each of the other data_vios in its compressed + * block and sends each on its way. Finally the agent itself continues on the write path as before. + * + * There is one special bin which is used to hold data_vios which have been canceled and removed + * from their bin by the packer. These data_vios need to wait for the canceller to rendezvous with + * them (VDO-2809) and so they sit in this special bin. + */ +struct packer_bin { + /* List links for packer.packer_bins */ + struct list_head list; + /* The number of items in the bin */ + slot_number_t slots_used; + /* The number of compressed block bytes remaining in the current batch */ + size_t free_space; + /* The current partial batch of data_vios, waiting for more */ + struct data_vio *incoming[]; +}; + +struct packer { + /* The ID of the packer's callback thread */ + thread_id_t thread_id; + /* The number of bins */ + block_count_t size; + /* A list of all packer_bins, kept sorted by free_space */ + struct list_head bins; + /* + * A bin to hold data_vios which were canceled out of the packer and are waiting to + * rendezvous with the canceling data_vio. + */ + struct packer_bin *canceled_bin; + + /* The current flush generation */ + sequence_number_t flush_generation; + + /* The administrative state of the packer */ + struct admin_state state; + + /* Statistics are only updated on the packer thread, but are accessed from other threads */ + struct packer_statistics statistics; +}; + +int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state, + struct compressed_block *block, + u16 *fragment_offset, + u16 *fragment_size); + +int __must_check +vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **packer_ptr); + +void vdo_free_packer(struct packer *packer); + +struct packer_statistics __must_check vdo_get_packer_statistics(const struct packer *packer); + +void vdo_attempt_packing(struct data_vio *data_vio); + +void vdo_flush_packer(struct packer *packer); + +void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion); + +void vdo_increment_packer_flush_generation(struct packer *packer); + +void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion); + +void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent); + +void vdo_dump_packer(const struct packer *packer); + +#endif /* VDO_PACKER_H */ diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c new file mode 100644 index 00000000000..f07718a662d --- /dev/null +++ b/drivers/md/dm-vdo/physical-zone.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "physical-zone.h" + +#include <linux/list.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "dedupe.h" +#include "encodings.h" +#include "flush.h" +#include "int-map.h" +#include "slab-depot.h" +#include "status-codes.h" +#include "vdo.h" + +enum { + /* Each user data_vio needs a PBN read lock and write lock. */ + LOCK_POOL_CAPACITY = 2 * MAXIMUM_VDO_USER_VIOS, +}; + +struct pbn_lock_implementation { + enum pbn_lock_type type; + const char *name; + const char *release_reason; +}; + +/* This array must have an entry for every pbn_lock_type value. */ +static const struct pbn_lock_implementation LOCK_IMPLEMENTATIONS[] = { + [VIO_READ_LOCK] = { + .type = VIO_READ_LOCK, + .name = "read", + .release_reason = "candidate duplicate", + }, + [VIO_WRITE_LOCK] = { + .type = VIO_WRITE_LOCK, + .name = "write", + .release_reason = "newly allocated", + }, + [VIO_BLOCK_MAP_WRITE_LOCK] = { + .type = VIO_BLOCK_MAP_WRITE_LOCK, + .name = "block map write", + .release_reason = "block map write", + }, +}; + +static inline bool has_lock_type(const struct pbn_lock *lock, enum pbn_lock_type type) +{ + return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); +} + +/** + * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock. + * @lock: The lock to check. + * + * Return: true if the lock is a read lock. + */ +bool vdo_is_pbn_read_lock(const struct pbn_lock *lock) +{ + return has_lock_type(lock, VIO_READ_LOCK); +} + +static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type type) +{ + lock->implementation = &LOCK_IMPLEMENTATIONS[type]; +} + +/** + * vdo_downgrade_pbn_write_lock() - Downgrade a PBN write lock to a PBN read lock. + * @lock: The PBN write lock to downgrade. + * + * The lock holder count is cleared and the caller is responsible for setting the new count. + */ +void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write) +{ + ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock), + "PBN lock must not already have been downgraded"); + ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK), + "must not downgrade block map write locks"); + ASSERT_LOG_ONLY(lock->holder_count == 1, + "PBN write lock should have one holder but has %u", + lock->holder_count); + /* + * data_vio write locks are downgraded in place--the writer retains the hold on the lock. + * If this was a compressed write, the holder has not yet journaled its own inc ref, + * otherwise, it has. + */ + lock->increment_limit = + (compressed_write ? MAXIMUM_REFERENCE_COUNT : MAXIMUM_REFERENCE_COUNT - 1); + set_pbn_lock_type(lock, VIO_READ_LOCK); +} + +/** + * vdo_claim_pbn_lock_increment() - Try to claim one of the available reference count increments on + * a read lock. + * @lock: The PBN read lock from which to claim an increment. + * + * Claims may be attempted from any thread. A claim is only valid until the PBN lock is released. + * + * Return: true if the claim succeeded, guaranteeing one increment can be made without overflowing + * the PBN's reference count. + */ +bool vdo_claim_pbn_lock_increment(struct pbn_lock *lock) +{ + /* + * Claim the next free reference atomically since hash locks from multiple hash zone + * threads might be concurrently deduplicating against a single PBN lock on compressed + * block. As long as hitting the increment limit will lead to the PBN lock being released + * in a sane time-frame, we won't overflow a 32-bit claim counter, allowing a simple add + * instead of a compare-and-swap. + */ + u32 claim_number = (u32) atomic_add_return(1, &lock->increments_claimed); + + return (claim_number <= lock->increment_limit); +} + +/** + * vdo_assign_pbn_lock_provisional_reference() - Inform a PBN lock that it is responsible for a + * provisional reference. + * @lock: The PBN lock. + */ +void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock) +{ + ASSERT_LOG_ONLY(!lock->has_provisional_reference, + "lock does not have a provisional reference"); + lock->has_provisional_reference = true; +} + +/** + * vdo_unassign_pbn_lock_provisional_reference() - Inform a PBN lock that it is no longer + * responsible for a provisional reference. + * @lock: The PBN lock. + */ +void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock) +{ + lock->has_provisional_reference = false; +} + +/** + * release_pbn_lock_provisional_reference() - If the lock is responsible for a provisional + * reference, release that reference. + * @lock: The lock. + * @locked_pbn: The PBN covered by the lock. + * @allocator: The block allocator from which to release the reference. + * + * This method is called when the lock is released. + */ +static void +release_pbn_lock_provisional_reference(struct pbn_lock *lock, + physical_block_number_t locked_pbn, + struct block_allocator *allocator) +{ + int result; + + if (!vdo_pbn_lock_has_provisional_reference(lock)) + return; + + result = vdo_release_block_reference(allocator, locked_pbn); + if (result != VDO_SUCCESS) + uds_log_error_strerror(result, + "Failed to release reference to %s physical block %llu", + lock->implementation->release_reason, + (unsigned long long) locked_pbn); + + vdo_unassign_pbn_lock_provisional_reference(lock); +} + +/** + * union idle_pbn_lock - PBN lock list entries. + * + * Unused (idle) PBN locks are kept in a list. Just like in a malloc implementation, the lock + * structure is unused memory, so we can save a bit of space (and not pollute the lock structure + * proper) by using a union to overlay the lock structure with the free list. + */ +typedef union { + /** @entry: Only used while locks are in the pool. */ + struct list_head entry; + /** @lock: Only used while locks are not in the pool. */ + struct pbn_lock lock; +} idle_pbn_lock; + +/** + * struct pbn_lock_pool - list of PBN locks. + * + * The lock pool is little more than the memory allocated for the locks. + */ +struct pbn_lock_pool { + /** @capacity: The number of locks allocated for the pool. */ + size_t capacity; + /** @borrowed: The number of locks currently borrowed from the pool. */ + size_t borrowed; + /** @idle_list: A list containing all idle PBN lock instances. */ + struct list_head idle_list; + /** @locks: The memory for all the locks allocated by this pool. */ + idle_pbn_lock locks[]; +}; + +/** + * return_pbn_lock_to_pool() - Return a pbn lock to its pool. + * @pool: The pool from which the lock was borrowed. + * @lock: The last reference to the lock being returned. + * + * It must be the last live reference, as if the memory were being freed (the lock memory will + * re-initialized or zeroed). + */ +static void return_pbn_lock_to_pool(struct pbn_lock_pool *pool, struct pbn_lock *lock) +{ + idle_pbn_lock *idle; + + /* A bit expensive, but will promptly catch some use-after-free errors. */ + memset(lock, 0, sizeof(*lock)); + + idle = container_of(lock, idle_pbn_lock, lock); + INIT_LIST_HEAD(&idle->entry); + list_add_tail(&idle->entry, &pool->idle_list); + + ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed"); + pool->borrowed -= 1; +} + +/** + * make_pbn_lock_pool() - Create a new PBN lock pool and all the lock instances it can loan out. + * + * @capacity: The number of PBN locks to allocate for the pool. + * @pool_ptr: A pointer to receive the new pool. + * + * Return: VDO_SUCCESS or an error code. + */ +static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) +{ + size_t i; + struct pbn_lock_pool *pool; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct pbn_lock_pool, + capacity, + idle_pbn_lock, + __func__, + &pool); + if (result != VDO_SUCCESS) + return result; + + pool->capacity = capacity; + pool->borrowed = capacity; + INIT_LIST_HEAD(&pool->idle_list); + + for (i = 0; i < capacity; i++) + return_pbn_lock_to_pool(pool, &pool->locks[i].lock); + + *pool_ptr = pool; + return VDO_SUCCESS; +} + +/** + * vdo_free_pbn_lock_pool() - Free a PBN lock pool. + * @pool: The lock pool to free. + * + * This also frees all the PBN locks it allocated, so the caller must ensure that all locks have + * been returned to the pool. + */ +static void free_pbn_lock_pool(struct pbn_lock_pool *pool) +{ + if (pool == NULL) + return; + + ASSERT_LOG_ONLY(pool->borrowed == 0, + "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan", + pool->borrowed); + UDS_FREE(pool); +} + +/** + * borrow_pbn_lock_from_pool() - Borrow a PBN lock from the pool and initialize it with the + * provided type. + * @pool: The pool from which to borrow. + * @type: The type with which to initialize the lock. + * @lock_ptr: A pointer to receive the borrowed lock. + * + * Pools do not grow on demand or allocate memory, so this will fail if the pool is empty. Borrowed + * locks are still associated with this pool and must be returned to only this pool. + * + * Return: VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty. + */ +static int __must_check +borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr) +{ + int result; + struct list_head *idle_entry; + idle_pbn_lock *idle; + + if (pool->borrowed >= pool->capacity) + return uds_log_error_strerror(VDO_LOCK_ERROR, "no free PBN locks left to borrow"); + pool->borrowed += 1; + + result = ASSERT(!list_empty(&pool->idle_list), + "idle list should not be empty if pool not at capacity"); + if (result != VDO_SUCCESS) + return result; + + idle_entry = pool->idle_list.prev; + list_del(idle_entry); + memset(idle_entry, 0, sizeof(*idle_entry)); + + idle = list_entry(idle_entry, idle_pbn_lock, entry); + idle->lock.holder_count = 0; + set_pbn_lock_type(&idle->lock, type); + + *lock_ptr = &idle->lock; + return VDO_SUCCESS; +} + +/** + * initialize_zone() - Initialize a physical zone. + * @vdo: The vdo to which the zone will belong. + * @zones: The physical_zones to which the zone being initialized belongs + * + * Return: VDO_SUCCESS or an error code. + */ +static int initialize_zone(struct vdo *vdo, struct physical_zones *zones) +{ + int result; + zone_count_t zone_number = zones->zone_count; + struct physical_zone *zone = &zones->zones[zone_number]; + + result = vdo_make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->pbn_operations); + if (result != VDO_SUCCESS) + return result; + + result = make_pbn_lock_pool(LOCK_POOL_CAPACITY, &zone->lock_pool); + if (result != VDO_SUCCESS) { + vdo_free_int_map(zone->pbn_operations); + return result; + } + + zone->zone_number = zone_number; + zone->thread_id = vdo->thread_config.physical_threads[zone_number]; + zone->allocator = &vdo->depot->allocators[zone_number]; + zone->next = &zones->zones[(zone_number + 1) % vdo->thread_config.physical_zone_count]; + result = vdo_make_default_thread(vdo, zone->thread_id); + if (result != VDO_SUCCESS) { + free_pbn_lock_pool(UDS_FORGET(zone->lock_pool)); + vdo_free_int_map(zone->pbn_operations); + return result; + } + return result; +} + +/** + * vdo_make_physical_zones() - Make the physical zones for a vdo. + * @vdo: The vdo being constructed + * @zones_ptr: A pointer to hold the zones + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr) +{ + struct physical_zones *zones; + int result; + zone_count_t zone_count = vdo->thread_config.physical_zone_count; + + if (zone_count == 0) + return VDO_SUCCESS; + + result = UDS_ALLOCATE_EXTENDED(struct physical_zones, + zone_count, + struct physical_zone, + __func__, + &zones); + if (result != VDO_SUCCESS) + return result; + + for (zones->zone_count = 0; zones->zone_count < zone_count; zones->zone_count++) { + result = initialize_zone(vdo, zones); + if (result != VDO_SUCCESS) { + vdo_free_physical_zones(zones); + return result; + } + } + + *zones_ptr = zones; + return VDO_SUCCESS; +} + +/** + * vdo_free_physical_zones() - Destroy the physical zones. + * @zones: The zones to free. + */ +void vdo_free_physical_zones(struct physical_zones *zones) +{ + zone_count_t index; + + if (zones == NULL) + return; + + for (index = 0; index < zones->zone_count; index++) { + struct physical_zone *zone = &zones->zones[index]; + + free_pbn_lock_pool(UDS_FORGET(zone->lock_pool)); + vdo_free_int_map(UDS_FORGET(zone->pbn_operations)); + } + + UDS_FREE(zones); +} + +/** + * vdo_get_physical_zone_pbn_lock() - Get the lock on a PBN if one exists. + * @zone: The physical zone responsible for the PBN. + * @pbn: The physical block number whose lock is desired. + * + * Return: The lock or NULL if the PBN is not locked. + */ +struct pbn_lock * +vdo_get_physical_zone_pbn_lock(struct physical_zone *zone, physical_block_number_t pbn) +{ + return ((zone == NULL) ? NULL : vdo_int_map_get(zone->pbn_operations, pbn)); +} + +/** + * vdo_attempt_physical_zone_pbn_lock() - Attempt to lock a physical block in the zone responsible + * for it. + * @zone: The physical zone responsible for the PBN. + * @pbn: The physical block number to lock. + * @type: The type with which to initialize a new lock. + * @lock_ptr: A pointer to receive the lock, existing or new. + * + * If the PBN is already locked, the existing lock will be returned. Otherwise, a new lock instance + * will be borrowed from the pool, initialized, and returned. The lock owner will be NULL for a new + * lock acquired by the caller, who is responsible for setting that field promptly. The lock owner + * will be non-NULL when there is already an existing lock on the PBN. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t pbn, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr) +{ + /* + * Borrow and prepare a lock from the pool so we don't have to do two int_map accesses in + * the common case of no lock contention. + */ + struct pbn_lock *lock, *new_lock = NULL; + int result; + + result = borrow_pbn_lock_from_pool(zone->lock_pool, type, &new_lock); + if (result != VDO_SUCCESS) { + ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock"); + return result; + } + + result = vdo_int_map_put(zone->pbn_operations, pbn, new_lock, false, (void **) &lock); + if (result != VDO_SUCCESS) { + return_pbn_lock_to_pool(zone->lock_pool, new_lock); + return result; + } + + if (lock != NULL) { + /* The lock is already held, so we don't need the borrowed one. */ + return_pbn_lock_to_pool(zone->lock_pool, UDS_FORGET(new_lock)); + result = ASSERT(lock->holder_count > 0, + "physical block %llu lock held", + (unsigned long long) pbn); + if (result != VDO_SUCCESS) + return result; + *lock_ptr = lock; + } else { + *lock_ptr = new_lock; + } + return VDO_SUCCESS; +} + +/** + * allocate_and_lock_block() - Attempt to allocate a block from this zone. + * @allocation: The struct allocation of the data_vio attempting to allocate. + * + * If a block is allocated, the recipient will also hold a lock on it. + * + * Return: VDO_SUCESSS if a block was allocated, or an error code. + */ +static int allocate_and_lock_block(struct allocation *allocation) +{ + int result; + struct pbn_lock *lock; + + ASSERT_LOG_ONLY(allocation->lock == NULL, + "must not allocate a block while already holding a lock on one"); + + result = vdo_allocate_block(allocation->zone->allocator, &allocation->pbn); + if (result != VDO_SUCCESS) + return result; + + result = vdo_attempt_physical_zone_pbn_lock(allocation->zone, + allocation->pbn, + allocation->write_lock_type, + &lock); + if (result != VDO_SUCCESS) + return result; + + if (lock->holder_count > 0) + /* This block is already locked, which should be impossible. */ + return uds_log_error_strerror(VDO_LOCK_ERROR, + "Newly allocated block %llu was spuriously locked (holder_count=%u)", + (unsigned long long) allocation->pbn, + lock->holder_count); + + /* We've successfully acquired a new lock, so mark it as ours. */ + lock->holder_count += 1; + allocation->lock = lock; + vdo_assign_pbn_lock_provisional_reference(lock); + return VDO_SUCCESS; +} + +/** + * retry_allocation() - Retry allocating a block now that we're done waiting for scrubbing. + * @waiter: The allocating_vio that was waiting to allocate. + * @context: The context (unused). + */ +static void retry_allocation(struct waiter *waiter, void *context __always_unused) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + + /* Now that some slab has scrubbed, restart the allocation process. */ + data_vio->allocation.wait_for_clean_slab = false; + data_vio->allocation.first_allocation_zone = data_vio->allocation.zone->zone_number; + continue_data_vio(data_vio); +} + +/** + * continue_allocating() - Continue searching for an allocation by enqueuing to wait for scrubbing + * or switching to the next zone. + * @data_vio: The data_vio attempting to get an allocation. + * + * This method should only be called from the error handler set in data_vio_allocate_data_block. + * + * Return: true if the allocation process has continued in another zone. + */ +static bool continue_allocating(struct data_vio *data_vio) +{ + struct allocation *allocation = &data_vio->allocation; + struct physical_zone *zone = allocation->zone; + struct vdo_completion *completion = &data_vio->vio.completion; + int result = VDO_SUCCESS; + bool was_waiting = allocation->wait_for_clean_slab; + bool tried_all = (allocation->first_allocation_zone == zone->next->zone_number); + + vdo_reset_completion(completion); + + if (tried_all && !was_waiting) { + /* + * We've already looked in all the zones, and found nothing. So go through the + * zones again, and wait for each to scrub before trying to allocate. + */ + allocation->wait_for_clean_slab = true; + allocation->first_allocation_zone = zone->zone_number; + } + + if (allocation->wait_for_clean_slab) { + data_vio->waiter.callback = retry_allocation; + result = vdo_enqueue_clean_slab_waiter(zone->allocator, &data_vio->waiter); + if (result == VDO_SUCCESS) + /* We've enqueued to wait for a slab to be scrubbed. */ + return true; + + if ((result != VDO_NO_SPACE) || (was_waiting && tried_all)) { + vdo_set_completion_result(completion, result); + return false; + } + } + + allocation->zone = zone->next; + completion->callback_thread_id = allocation->zone->thread_id; + vdo_launch_completion(completion); + return true; +} + +/** + * vdo_allocate_block_in_zone() - Attempt to allocate a block in the current physical zone, and if + * that fails try the next if possible. + * @data_vio: The data_vio needing an allocation. + * + * Return: true if a block was allocated, if not the data_vio will have been dispatched so the + * caller must not touch it. + */ +bool vdo_allocate_block_in_zone(struct data_vio *data_vio) +{ + int result = allocate_and_lock_block(&data_vio->allocation); + + if (result == VDO_SUCCESS) + return true; + + if ((result != VDO_NO_SPACE) || !continue_allocating(data_vio)) + continue_data_vio_with_error(data_vio, result); + + return false; +} + +/** + * vdo_release_physical_zone_pbn_lock() - Release a physical block lock if it is held and return it + * to the lock pool. + * @zone: The physical zone in which the lock was obtained. + * @locked_pbn: The physical block number to unlock. + * @lock: The lock being released. + * + * It must be the last live reference, as if the memory were being freed (the + * lock memory will re-initialized or zeroed). + */ +void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t locked_pbn, + struct pbn_lock *lock) +{ + struct pbn_lock *holder; + + if (lock == NULL) + return; + + ASSERT_LOG_ONLY(lock->holder_count > 0, "should not be releasing a lock that is not held"); + + lock->holder_count -= 1; + if (lock->holder_count > 0) + /* The lock was shared and is still referenced, so don't release it yet. */ + return; + + holder = vdo_int_map_remove(zone->pbn_operations, locked_pbn); + ASSERT_LOG_ONLY((lock == holder), + "physical block lock mismatch for block %llu", + (unsigned long long) locked_pbn); + + release_pbn_lock_provisional_reference(lock, locked_pbn, zone->allocator); + return_pbn_lock_to_pool(zone->lock_pool, lock); +} + +/** + * vdo_dump_physical_zone() - Dump information about a physical zone to the log for debugging. + * @zone: The zone to dump. + */ +void vdo_dump_physical_zone(const struct physical_zone *zone) +{ + vdo_dump_block_allocator(zone->allocator); +} diff --git a/drivers/md/dm-vdo/physical-zone.h b/drivers/md/dm-vdo/physical-zone.h new file mode 100644 index 00000000000..55b7341ff39 --- /dev/null +++ b/drivers/md/dm-vdo/physical-zone.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_PHYSICAL_ZONE_H +#define VDO_PHYSICAL_ZONE_H + +#include <linux/atomic.h> + +#include "types.h" + +/* + * The type of a PBN lock. + */ +enum pbn_lock_type { + VIO_READ_LOCK, + VIO_WRITE_LOCK, + VIO_BLOCK_MAP_WRITE_LOCK, +}; + +struct pbn_lock_implementation; + +/* + * A PBN lock. + */ +struct pbn_lock { + /* The implementation of the lock */ + const struct pbn_lock_implementation *implementation; + + /* The number of VIOs holding or sharing this lock */ + data_vio_count_t holder_count; + /* + * The number of compressed block writers holding a share of this lock while they are + * acquiring a reference to the PBN. + */ + u8 fragment_locks; + + /* Whether the locked PBN has been provisionally referenced on behalf of the lock holder. */ + bool has_provisional_reference; + + /* + * For read locks, the number of references that were known to be available on the locked + * block at the time the lock was acquired. + */ + u8 increment_limit; + + /* + * For read locks, the number of data_vios that have tried to claim one of the available + * increments during the lifetime of the lock. Each claim will first increment this + * counter, so it can exceed the increment limit. + */ + atomic_t increments_claimed; +}; + +struct physical_zone { + /* Which physical zone this is */ + zone_count_t zone_number; + /* The thread ID for this zone */ + thread_id_t thread_id; + /* In progress operations keyed by PBN */ + struct int_map *pbn_operations; + /* Pool of unused pbn_lock instances */ + struct pbn_lock_pool *lock_pool; + /* The block allocator for this zone */ + struct block_allocator *allocator; + /* The next zone from which to attempt an allocation */ + struct physical_zone *next; +}; + +struct physical_zones { + /* The number of zones */ + zone_count_t zone_count; + /* The physical zones themselves */ + struct physical_zone zones[]; +}; + +bool __must_check vdo_is_pbn_read_lock(const struct pbn_lock *lock); +void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write); +bool __must_check vdo_claim_pbn_lock_increment(struct pbn_lock *lock); + +/** + * vdo_pbn_lock_has_provisional_reference() - Check whether a PBN lock has a provisional reference. + * @lock: The PBN lock. + */ +static inline bool vdo_pbn_lock_has_provisional_reference(struct pbn_lock *lock) +{ + return ((lock != NULL) && lock->has_provisional_reference); +} + +void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock); +void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock); + +int __must_check vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr); + +void vdo_free_physical_zones(struct physical_zones *zones); + +struct pbn_lock * __must_check +vdo_get_physical_zone_pbn_lock(struct physical_zone *zone, physical_block_number_t pbn); + +int __must_check +vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t pbn, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr); + +bool __must_check vdo_allocate_block_in_zone(struct data_vio *data_vio); + +void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t locked_pbn, + struct pbn_lock *lock); + +void vdo_dump_physical_zone(const struct physical_zone *zone); + +#endif /* VDO_PHYSICAL_ZONE_H */ diff --git a/drivers/md/dm-vdo/pointer-map.c b/drivers/md/dm-vdo/pointer-map.c new file mode 100644 index 00000000000..8511fef70ee --- /dev/null +++ b/drivers/md/dm-vdo/pointer-map.c @@ -0,0 +1,691 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/** + * DOC: + * + * Hash table implementation of a map from integers to pointers, implemented using the Hopscotch + * Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does not contain any of the + * locking/concurrency features of the algorithm, just the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear probing. All the entries + * are stored in a fixed array of buckets, with no dynamic allocation for collisions. Unlike linear + * probing, all the entries that hash to a given bucket are stored within a fixed neighborhood + * starting at that bucket. Chaining is effectively represented as a bit vector relative to each + * bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, subsequent neighborhoods are + * searched, and one or more entries will "hop" into those neighborhoods. When this process works, + * an empty bucket will move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the table must be resized + * and the all entries rehashed and added to the expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the worst case has a fixed + * upper bound (the size of the neighborhood). Those entries occupy a small number of memory cache + * lines, leading to improved use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load factors, so even + * with the increased memory burden for maintaining the hop vectors, less memory is needed to + * achieve that performance. Hopscotch is also immune to "contamination" from deleting entries + * since entries are genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper alludes to an offset + * scheme which is used by this implementation. Since the entries in the neighborhood are within N + * entries of the hash bucket at the start of the neighborhood, a pair of small offset fields each + * log2(N) bits wide is all that's needed to maintain the hops as a linked list. In order to encode + * "no next hop" (i.e. NULL) as the natural initial value of zero, the offsets are biased by one + * (i.e. 0 => NULL, 1 => offset=0, 2 => offset=1, etc.) We can represent neighborhoods of up to 255 + * entries with just 8+8=16 bits per entry. The hop list is sorted by hop offset so the first entry + * in the list is always the bucket closest to the start of the neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations are very, very + * expensive. If an upper bound on the latency of adding an entry to the table is needed, we either + * need to ensure the table is pre-sized to be large enough so no resize is ever needed, or we'll + * need to develop an approach to incrementally resize the table. + */ + +#include "pointer-map.h" + +#include <linux/minmax.h> + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */ + NEIGHBORHOOD = 255, /* the number of buckets in each neighborhood */ + MAX_PROBES = 1024, /* limit on the number of probes for a free bucket */ + NULL_HOP_OFFSET = 0, /* the hop offset value terminating the hop list */ + DEFAULT_LOAD = 75 /* a compromise between memory use and performance */ +}; + +/** + * struct bucket - Hash buckets. + * + * Buckets are packed together to reduce memory usage and improve cache efficiency. It would be + * tempting to encode the hop offsets separately and maintain alignment of key/value pairs, but + * it's crucial to keep the hop fields near the buckets that they use them so they'll tend to share + * cache lines. + */ +struct __packed bucket { + /** + * @first_hop: The biased offset of the first entry in the hop list of the neighborhood + * that hashes to this bucket. + */ + u8 first_hop; + /** @next_hop: the biased offset of the next bucket in the hop list. */ + u8 next_hop; + /** @key: The key stored in this bucket. */ + const void *key; + /** @value: The value stored in this bucket (NULL if empty). */ + void *value; +}; + +/** + * struct pointer_map - The concrete definition of the opaque pointer_map type. + * + * To avoid having to wrap the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array instead, which is why + * capacity and bucket_count are different. + */ +struct pointer_map { + /** @size: The number of entries stored in the map. */ + size_t size; + /** @capacity: The number of neighborhoods in the map. */ + size_t capacity; + /** @bucket_count: The number of buckets in the bucket array. */ + size_t bucket_count; + /** @buckets: The array of hash buckets. */ + struct bucket *buckets; + /** @comparator: The function for comparing keys for equality. */ + pointer_key_comparator *comparator; + /** @hasher: The function for getting a hash code from a key. */ + pointer_key_hasher *hasher; +}; + +/** + * allocate_buckets() - Initialize a pointer_map. + * @map: The map to initialize. + * @capacity: The initial capacity of the map. + * + * Return: UDS_SUCCESS or an error code. + */ +static int allocate_buckets(struct pointer_map *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + /* + * Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a full neighborhood + * without have to wrap back around to element zero. + */ + map->bucket_count = capacity + (NEIGHBORHOOD - 1); + return UDS_ALLOCATE(map->bucket_count, + struct bucket, + "pointer_map buckets", + &map->buckets); +} + +/** + * vdo_make_pointer_map() - Allocate and initialize a pointer_map. + * @initial_capacity: The number of entries the map should initially be capable of holding (zero + * tells the map to use its own small default). + * @initial_load: The load factor of the map, expressed as an integer percentage (typically in the + * range 50 to 90, with zero telling the map to use its own default). + * @comparator: The function to use to compare the referents of two pointer keys for equality. + * @hasher: The function to use obtain the hash code associated with each pointer key + * @map_ptr: A pointer to hold the new pointer_map. + * + * Return: UDS_SUCCESS or an error code. + */ +int vdo_make_pointer_map(size_t initial_capacity, + unsigned int initial_load, + pointer_key_comparator comparator, + pointer_key_hasher hasher, + struct pointer_map **map_ptr) +{ + int result; + struct pointer_map *map; + size_t capacity; + + /* Use the default initial load if the caller did not specify one. */ + if (initial_load == 0) + initial_load = DEFAULT_LOAD; + if (initial_load > 100) + return UDS_INVALID_ARGUMENT; + + result = UDS_ALLOCATE(1, struct pointer_map, "pointer_map", &map); + if (result != UDS_SUCCESS) + return result; + + map->hasher = hasher; + map->comparator = comparator; + + /* Use the default capacity if the caller did not specify one. */ + capacity = (initial_capacity > 0) ? initial_capacity : DEFAULT_CAPACITY; + + /* + * Scale up the capacity by the specified initial load factor. (i.e to hold 1000 entries at + * 80% load we need a capacity of 1250) + */ + capacity = capacity * 100 / initial_load; + + result = allocate_buckets(map, capacity); + if (result != UDS_SUCCESS) { + vdo_free_pointer_map(UDS_FORGET(map)); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +/** + * vdo_free_pointer_map() - Free a pointer_map. + * @map: The pointer_map to free. + * + * The map does not own the pointer keys and values stored in the map and they are not freed by + * this call. + */ +void vdo_free_pointer_map(struct pointer_map *map) +{ + if (map == NULL) + return; + + UDS_FREE(UDS_FORGET(map->buckets)); + UDS_FREE(UDS_FORGET(map)); +} + +/** + * vdo_pointer_map_size() - Get the number of entries stored in a pointer_map. + * @map: The pointer_map to query. + * + * Return: The number of entries in the map. + */ +size_t vdo_pointer_map_size(const struct pointer_map *map) +{ + return map->size; +} + +/** + * dereference_hop() - Convert a biased hop offset within a neighborhood to a pointer to the bucket + * it references. + * @neighborhood: The first bucket in the neighborhood. + * @hop_offset: The biased hop offset to the desired bucket. + * + * Return: NULL if hop_offset is zero, otherwise a pointer to the bucket in the neighborhood at + * hop_offset - 1. + */ +static struct bucket *dereference_hop(struct bucket *neighborhood, unsigned int hop_offset) +{ + if (hop_offset == NULL_HOP_OFFSET) + return NULL; + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hop_offset - 1]; +} + +/** + * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood, inserting it into + * the list so the hop list remains sorted by hop offset. + * @neighborhood: The first bucket in the neighborhood. + * @new_bucket: The bucket to add to the hop list. + */ +static void insert_in_hop_list(struct bucket *neighborhood, struct bucket *new_bucket) +{ + /* Zero indicates a NULL hop offset, so bias the hop offset by one. */ + int hop_offset = 1 + (new_bucket - neighborhood); + + /* Handle the special case of adding a bucket at the start of the list. */ + int next_hop = neighborhood->first_hop; + + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { + new_bucket->next_hop = next_hop; + neighborhood->first_hop = hop_offset; + return; + } + + /* Search the hop list for the insertion point that maintains the sort order. */ + for (;;) { + struct bucket *bucket = dereference_hop(neighborhood, next_hop); + + next_hop = bucket->next_hop; + + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { + new_bucket->next_hop = next_hop; + bucket->next_hop = hop_offset; + return; + } + } +} + +/** + * select_bucket() - Select and return the hash bucket for a given search key. + * @map: The map to search. + * @key: The mapping key. + */ +static struct bucket *select_bucket(const struct pointer_map *map, const void *key) +{ + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary fraction and + * multiplying that by the capacity. If the hash is uniformly distributed over [0 .. + * 2^32-1], then (hash * capacity / 2^32) should be uniformly distributed over [0 .. + * capacity-1]. The multiply and shift is much faster than a divide (modulus) on X86 CPUs. + */ + u64 hash = map->hasher(key); + + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * search_hop_list() - Search the hop list. + * @map: The map being searched. + * @bucket: The map bucket to search for the key. + * @key: The mapping key. + * @previous_ptr: if not NULL, a pointer in which to store the bucket in the list preceding the one + * that had the matching key. + * + * Searches the hop list associated with given hash bucket for a given search key. If the key is + * found, returns a pointer to the entry (bucket or collision), otherwise returns NULL. + * + * Return: an entry that matches the key, or NULL if not found. + */ +static struct bucket *search_hop_list(struct pointer_map *map, + struct bucket *bucket, + const void *key, + struct bucket **previous_ptr) +{ + struct bucket *previous = NULL; + unsigned int next_hop = bucket->first_hop; + + while (next_hop != NULL_HOP_OFFSET) { + /* Check the neighboring bucket indexed by the offset for the desired key. */ + struct bucket *entry = dereference_hop(bucket, next_hop); + + if ((entry->value != NULL) && map->comparator(key, entry->key)) { + if (previous_ptr != NULL) + *previous_ptr = previous; + return entry; + } + next_hop = entry->next_hop; + previous = entry; + } + return NULL; +} + +/** + * vdo_pointer_map_get() - Retrieve the value associated with a given key from the pointer_map. + * @map: The pointer_map to query. + * @key: The key to look up (may be NULL if the comparator and hasher functions support it). + * + * Return: the value associated with the given key, or NULL if the key is not mapped to any value. + */ +void *vdo_pointer_map_get(struct pointer_map *map, const void *key) +{ + struct bucket *match = search_hop_list(map, select_bucket(map, key), key, NULL); + + return ((match != NULL) ? match->value : NULL); +} + +/** + * resize_buckets() - Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * @map: The map to resize. + */ +static int resize_buckets(struct pointer_map *map) +{ + int result; + size_t i; + + /* Copy the top-level map data to the stack. */ + struct pointer_map old_map = *map; + + /* Re-initialize the map to be empty and 50% larger. */ + size_t new_capacity = map->capacity / 2 * 3; + + uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, + map->capacity, + new_capacity, + map->size); + result = allocate_buckets(map, new_capacity); + if (result != UDS_SUCCESS) { + *map = old_map; + return result; + } + + /* Populate the new hash table from the entries in the old bucket array. */ + for (i = 0; i < old_map.bucket_count; i++) { + struct bucket *entry = &old_map.buckets[i]; + + if (entry->value == NULL) + continue; + + result = vdo_pointer_map_put(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + /* Destroy the new partial map and restore the map from the stack. */ + UDS_FREE(UDS_FORGET(map->buckets)); + *map = old_map; + return result; + } + } + + /* Destroy the old bucket array. */ + UDS_FREE(UDS_FORGET(old_map.buckets)); + return UDS_SUCCESS; +} + +/** + * find_empty_bucket() - Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. + * @map: The map containing the buckets to search. + * @bucket: The bucket at which to start probing. + * @max_probes: The maximum number of buckets to search. + * + * NULL will be returned if the search reaches the end of the bucket array or if the number of + * linear probes exceeds a specified limit. + * + * Return: The next empty bucket, or NULL if the search failed. + */ +static struct bucket * +find_empty_bucket(struct pointer_map *map, struct bucket *bucket, unsigned int max_probes) +{ + /* + * Limit the search to either the nearer of the end of the bucket array or a fixed distance + * beyond the initial bucket. + */ + ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket; + struct bucket *sentinel = &bucket[min_t(ptrdiff_t, remaining, max_probes)]; + struct bucket *entry; + + for (entry = bucket; entry < sentinel; entry++) + if (entry->value == NULL) + return entry; + return NULL; +} + +/** + * move_empty_bucket() - Move an empty bucket closer to the start of the bucket array. + * @map: The map containing the bucket. + + * @hole: The empty bucket to fill with an entry that precedes it in one of its enclosing + * neighborhoods. + * + * This searches the neighborhoods that contain the empty bucket for a non-empty bucket closer to + * the start of the array. If such a bucket is found, this swaps the two buckets by moving the + * entry to the empty bucket. + * + * Return: The bucket that was vacated by moving its entry to the provided hole, or NULL if no + * entry could be moved. + */ +static struct bucket * +move_empty_bucket(struct pointer_map *map __always_unused, struct bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting with the one in + * which it is the last bucket. No boundary check is needed for the negative array + * arithmetic since this function is only called when hole is at least NEIGHBORHOOD cells + * deeper into the array than a valid bucket. + */ + struct bucket *bucket; + + for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + /* + * Find the entry that is nearest to the bucket, which means it will be nearest to + * the hash bucket whose neighborhood is full. + */ + struct bucket *new_hole = dereference_hop(bucket, bucket->first_hop); + + if (new_hole == NULL) + /* + * There are no buckets in this neighborhood that are in use by this one + * (they must all be owned by overlapping neighborhoods). + */ + continue; + + /* + * Skip this bucket if its first entry is actually further away than the hole that + * we're already trying to fill. + */ + if (hole < new_hole) + continue; + + /* + * We've found an entry in this neighborhood that we can "hop" further away, moving + * the hole closer to the hash bucket, if not all the way into its neighborhood. + */ + + /* + * The entry that will be the new hole is the first bucket in the list, so setting + * first_hop is all that's needed remove it from the list. + */ + bucket->first_hop = new_hole->next_hop; + new_hole->next_hop = NULL_HOP_OFFSET; + + /* Move the entry into the original hole. */ + hole->key = new_hole->key; + hole->value = new_hole->value; + new_hole->value = NULL; + + /* Insert the filled hole into the hop list for the neighborhood. */ + insert_in_hop_list(bucket, hole); + return new_hole; + } + + /* We couldn't find an entry to relocate to the hole. */ + return NULL; +} + +/** + * update_mapping() - Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. + * @map: The pointer_map to attempt to modify. + * @neighborhood: The first bucket in the neighborhood that would contain the search key. + * @key: The key with which to associate the new value. + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: A pointer in which to store the old value (unmodified if no mapping was found). + * + * Return: true if the map contains a mapping for the key, false if it does not. + */ +static bool update_mapping(struct pointer_map *map, + struct bucket *neighborhood, + const void *key, + void *new_value, + bool update, + void **old_value_ptr) +{ + struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL); + + if (bucket == NULL) + /* There is no bucket containing the key in the neighborhood. */ + return false; + + /* + * Return the value of the current mapping (if desired) and update the mapping with the new + * value (if desired). + */ + if (old_value_ptr != NULL) + *old_value_ptr = bucket->value; + if (update) { + /* + * We're dropping the old key pointer on the floor here, assuming it's a property + * of the value or that it's otherwise safe to just forget. + */ + bucket->key = key; + bucket->value = new_value; + } + return true; +} + +/** + * find_or_make_vacancy() - Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. + * @map: The pointer_map to search or modify. + * @neighborhood: The first bucket in the neighborhood in which an empty bucket is needed for a new + * mapping. + * + * This operation may fail (returning NULL) if an empty bucket is not available or could not be + * relocated to the neighborhood. + * + * Return: A pointer to an empty bucket in the desired neighborhood, or NULL if a vacancy could not + * be found or arranged. + */ +static struct bucket *find_or_make_vacancy(struct pointer_map *map, struct bucket *neighborhood) +{ + /* Probe within and beyond the neighborhood for the first empty bucket. */ + struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES); + + /* + * Keep trying until the empty bucket is in the bucket's neighborhood or we are unable to + * move it any closer by swapping it with a filled bucket. + */ + while (hole != NULL) { + int distance = hole - neighborhood; + + if (distance < NEIGHBORHOOD) + /* + * We've found or relocated an empty bucket close enough to the initial + * hash bucket to be referenced by its hop vector. + */ + return hole; + + /* + * The nearest empty bucket isn't within the neighborhood that must contain the new + * entry, so try to swap it with bucket that is closer. + */ + hole = move_empty_bucket(map, hole); + } + + return NULL; +} + +/** + * vdo_pointer_map_put() - Try to associate a value (a pointer) with an integer in a pointer_map. + * @map: The pointer_map to attempt to modify. + * @key: The key with which to associate the new value (may be NULL if the comparator and hasher + * functions support it). + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: A pointer in which to store either the old value (if the key was already mapped) + * or NULL if the map did not contain the key; NULL may be provided if the caller + * does not need to know the old value. + * + * If the map already contains a mapping for the provided key, the old value is only replaced with + * the specified value if update is true. In either case the old value is returned. If the map does + * not already contain a value for the specified key, the new value is added regardless of the + * value of update. + * + * If the value stored in the map is updated, then the key stored in the map will also be updated + * with the key provided by this call. The old key will not be returned due to the memory + * management assumptions described in the interface header comment. + * + * Return: UDS_SUCCESS or an error code. + */ +int vdo_pointer_map_put(struct pointer_map *map, + const void *key, + void *new_value, + bool update, + void **old_value_ptr) +{ + struct bucket *neighborhood, *bucket; + + if (new_value == NULL) + return UDS_INVALID_ARGUMENT; + + /* + * Select the bucket at the start of the neighborhood that must contain any entry for the + * provided key. + */ + neighborhood = select_bucket(map, key); + + /* + * Check whether the neighborhood already contains an entry for the key, in which case we + * optionally update it, returning the old value. + */ + if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr)) + return UDS_SUCCESS; + + /* + * Find an empty bucket in the desired neighborhood for the new entry or re-arrange entries + * in the map so there is such a bucket. This operation will usually succeed; the loop body + * will only be executed on the rare occasions that we have to resize the map. + */ + while ((bucket = find_or_make_vacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current map, so + * we're forced to allocate a new bucket array with a larger capacity, re-hash all + * the entries into those buckets, and try again (a very expensive operation for + * large maps). + */ + int result = resize_buckets(map); + + if (result != UDS_SUCCESS) + return result; + + /* + * Resizing the map invalidates all pointers to buckets, so + * recalculate the neighborhood pointer. + */ + neighborhood = select_bucket(map, key); + } + + /* Put the new entry in the empty bucket, adding it to the neighborhood. */ + bucket->key = key; + bucket->value = new_value; + insert_in_hop_list(neighborhood, bucket); + map->size += 1; + + /* + * There was no existing entry, so there was no old value to be + * returned. + */ + if (old_value_ptr != NULL) + *old_value_ptr = NULL; + return UDS_SUCCESS; +} + +/** + * vdo_pointer_map_remove() - Remove the mapping for a given key from the pointer_map. + * @map: The pointer_map from which to remove the mapping. + * @key: The key whose mapping is to be removed (may be NULL if the comparator and hasher functions + * support it). + * + * Return: the value that was associated with the key, or NULL if it was not mapped. + */ +void *vdo_pointer_map_remove(struct pointer_map *map, const void *key) +{ + void *value; + + /* Select the bucket to search and search it for an existing entry. */ + struct bucket *bucket = select_bucket(map, key); + struct bucket *previous; + struct bucket *victim = search_hop_list(map, bucket, key, &previous); + + if (victim == NULL) + /* There is no matching entry to remove. */ + return NULL; + + /* + * We found an entry to remove. Save the mapped value to return later and empty the bucket. + */ + map->size -= 1; + value = victim->value; + victim->value = NULL; + victim->key = 0; + + /* The victim bucket is now empty, but it still needs to be spliced out of the hop list. */ + if (previous == NULL) + /* The victim is the head of the list, so swing first_hop. */ + bucket->first_hop = victim->next_hop; + else + previous->next_hop = victim->next_hop; + victim->next_hop = NULL_HOP_OFFSET; + + return value; +} diff --git a/drivers/md/dm-vdo/pointer-map.h b/drivers/md/dm-vdo/pointer-map.h new file mode 100644 index 00000000000..18de2979174 --- /dev/null +++ b/drivers/md/dm-vdo/pointer-map.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_POINTER_MAP_H +#define VDO_POINTER_MAP_H + +#include <linux/compiler.h> +#include <linux/types.h> + +/* + * A pointer_map associates pointer values (<code>void *</code>) with the data referenced by + * pointer keys (<code>void *</code>). <code>NULL</code> pointer values are not supported. A + * <code>NULL</code> key value is supported when the instance's key comparator and hasher functions + * support it. + * + * The map is implemented as hash table, which should provide constant-time insert, query, and + * remove operations, although the insert may occasionally grow the table, which is linear in the + * number of entries in the map. The table will grow as needed to hold new entries, but will not + * shrink as entries are removed. + * + * The key and value pointers passed to the map are retained and used by the map, but are not owned + * by the map. Freeing the map does not attempt to free the pointers. The client is entirely + * responsible for the memory managment of the keys and values. The current interface and + * implementation assume that keys will be properties of the values, or that keys will not be + * memory managed, or that keys will not need to be freed as a result of being replaced when a key + * is re-mapped. + */ + +struct pointer_map; + +/** + * typedef pointer_key_comparator - The prototype of functions that compare the referents of two + * pointer keys for equality. + * @this_key: The first element to compare. + * @that_key: The second element to compare. + * + * If two keys are equal, then both keys must have the same the hash code associated with them by + * the hasher function defined below. + * + * Return: true if and only if the referents of the two key pointers are to be treated as the same + * key by the map. + */ +typedef bool pointer_key_comparator(const void *this_key, const void *that_key); + +/** + * typedef pointer_key_hasher - The prototype of functions that get or calculate a hash code + * associated with the referent of pointer key. + * @key: The pointer key to hash. + * + * The hash code must be uniformly distributed over all u32 values. The hash code associated + * with a given key must not change while the key is in the map. If the comparator function says + * two keys are equal, then this function must return the same hash code for both keys. This + * function may be called many times for a key while an entry is stored for it in the map. + * + * Return: The hash code for the key. + */ +typedef u32 pointer_key_hasher(const void *key); + +int __must_check vdo_make_pointer_map(size_t initial_capacity, + unsigned int initial_load, + pointer_key_comparator comparator, + pointer_key_hasher hasher, + struct pointer_map **map_ptr); + +void vdo_free_pointer_map(struct pointer_map *map); + +size_t vdo_pointer_map_size(const struct pointer_map *map); + +void *vdo_pointer_map_get(struct pointer_map *map, const void *key); + +int __must_check vdo_pointer_map_put(struct pointer_map *map, + const void *key, + void *new_value, + bool update, + void **old_value_ptr); + +void *vdo_pointer_map_remove(struct pointer_map *map, const void *key); + +#endif /* VDO_POINTER_MAP_H */ diff --git a/drivers/md/dm-vdo/pool-sysfs-stats.c b/drivers/md/dm-vdo/pool-sysfs-stats.c new file mode 100644 index 00000000000..411ea5c143a --- /dev/null +++ b/drivers/md/dm-vdo/pool-sysfs-stats.c @@ -0,0 +1,2063 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include <linux/mutex.h> + +#include "logger.h" +#include "string-utils.h" + +#include "dedupe.h" +#include "pool-sysfs.h" +#include "statistics.h" +#include "vdo.h" + +struct pool_stats_attribute { + struct attribute attr; + ssize_t (*print)(struct vdo_statistics *stats, char *buf); +}; + +static ssize_t pool_stats_attr_show(struct kobject *directory, + struct attribute *attr, + char *buf) +{ + ssize_t size; + struct pool_stats_attribute *pool_stats_attr = + container_of(attr, struct pool_stats_attribute, attr); + struct vdo *vdo = container_of(directory, struct vdo, stats_directory); + + if (pool_stats_attr->print == NULL) + return -EINVAL; + + mutex_lock(&vdo->stats_mutex); + vdo_fetch_statistics(vdo, &vdo->stats_buffer); + size = pool_stats_attr->print(&vdo->stats_buffer, buf); + mutex_unlock(&vdo->stats_mutex); + + return size; +} + +const struct sysfs_ops vdo_pool_stats_sysfs_ops = { + .show = pool_stats_attr_show, + .store = NULL, +}; + +/* Number of blocks used for data */ +static ssize_t +pool_stats_print_data_blocks_used(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->data_blocks_used); +} + +static struct pool_stats_attribute pool_stats_attr_data_blocks_used = { + .attr = { .name = "data_blocks_used", .mode = 0444, }, + .print = pool_stats_print_data_blocks_used, +}; + +/* Number of blocks used for VDO metadata */ +static ssize_t +pool_stats_print_overhead_blocks_used(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->overhead_blocks_used); +} + +static struct pool_stats_attribute pool_stats_attr_overhead_blocks_used = { + .attr = { .name = "overhead_blocks_used", .mode = 0444, }, + .print = pool_stats_print_overhead_blocks_used, +}; + +/* Number of logical blocks that are currently mapped to physical blocks */ +static ssize_t +pool_stats_print_logical_blocks_used(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->logical_blocks_used); +} + +static struct pool_stats_attribute pool_stats_attr_logical_blocks_used = { + .attr = { .name = "logical_blocks_used", .mode = 0444, }, + .print = pool_stats_print_logical_blocks_used, +}; + +/* number of physical blocks */ +static ssize_t +pool_stats_print_physical_blocks(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->physical_blocks); +} + +static struct pool_stats_attribute pool_stats_attr_physical_blocks = { + .attr = { .name = "physical_blocks", .mode = 0444, }, + .print = pool_stats_print_physical_blocks, +}; + +/* number of logical blocks */ +static ssize_t +pool_stats_print_logical_blocks(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->logical_blocks); +} + +static struct pool_stats_attribute pool_stats_attr_logical_blocks = { + .attr = { .name = "logical_blocks", .mode = 0444, }, + .print = pool_stats_print_logical_blocks, +}; + +/* Size of the block map page cache, in bytes */ +static ssize_t +pool_stats_print_block_map_cache_size(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map_cache_size); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_cache_size = { + .attr = { .name = "block_map_cache_size", .mode = 0444, }, + .print = pool_stats_print_block_map_cache_size, +}; + +/* The physical block size */ +static ssize_t +pool_stats_print_block_size(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_size); +} + +static struct pool_stats_attribute pool_stats_attr_block_size = { + .attr = { .name = "block_size", .mode = 0444, }, + .print = pool_stats_print_block_size, +}; + +/* Number of times the VDO has successfully recovered */ +static ssize_t +pool_stats_print_complete_recoveries(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->complete_recoveries); +} + +static struct pool_stats_attribute pool_stats_attr_complete_recoveries = { + .attr = { .name = "complete_recoveries", .mode = 0444, }, + .print = pool_stats_print_complete_recoveries, +}; + +/* Number of times the VDO has recovered from read-only mode */ +static ssize_t +pool_stats_print_read_only_recoveries(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->read_only_recoveries); +} + +static struct pool_stats_attribute pool_stats_attr_read_only_recoveries = { + .attr = { .name = "read_only_recoveries", .mode = 0444, }, + .print = pool_stats_print_read_only_recoveries, +}; + +/* String describing the operating mode of the VDO */ +static ssize_t +pool_stats_print_mode(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%s\n", stats->mode); +} + +static struct pool_stats_attribute pool_stats_attr_mode = { + .attr = { .name = "mode", .mode = 0444, }, + .print = pool_stats_print_mode, +}; + +/* Whether the VDO is in recovery mode */ +static ssize_t +pool_stats_print_in_recovery_mode(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%d\n", stats->in_recovery_mode); +} + +static struct pool_stats_attribute pool_stats_attr_in_recovery_mode = { + .attr = { .name = "in_recovery_mode", .mode = 0444, }, + .print = pool_stats_print_in_recovery_mode, +}; + +/* What percentage of recovery mode work has been completed */ +static ssize_t +pool_stats_print_recovery_percentage(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->recovery_percentage); +} + +static struct pool_stats_attribute pool_stats_attr_recovery_percentage = { + .attr = { .name = "recovery_percentage", .mode = 0444, }, + .print = pool_stats_print_recovery_percentage, +}; + +/* Number of compressed data items written since startup */ +static ssize_t +pool_stats_print_packer_compressed_fragments_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->packer.compressed_fragments_written); +} + +static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_written = { + .attr = { .name = "packer_compressed_fragments_written", .mode = 0444, }, + .print = pool_stats_print_packer_compressed_fragments_written, +}; + +/* Number of blocks containing compressed items written since startup */ +static ssize_t +pool_stats_print_packer_compressed_blocks_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->packer.compressed_blocks_written); +} + +static struct pool_stats_attribute pool_stats_attr_packer_compressed_blocks_written = { + .attr = { .name = "packer_compressed_blocks_written", .mode = 0444, }, + .print = pool_stats_print_packer_compressed_blocks_written, +}; + +/* Number of VIOs that are pending in the packer */ +static ssize_t +pool_stats_print_packer_compressed_fragments_in_packer(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->packer.compressed_fragments_in_packer); +} + +static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_in_packer = { + .attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, }, + .print = pool_stats_print_packer_compressed_fragments_in_packer, +}; + +/* The total number of slabs from which blocks may be allocated */ +static ssize_t +pool_stats_print_allocator_slab_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->allocator.slab_count); +} + +static struct pool_stats_attribute pool_stats_attr_allocator_slab_count = { + .attr = { .name = "allocator_slab_count", .mode = 0444, }, + .print = pool_stats_print_allocator_slab_count, +}; + +/* The total number of slabs from which blocks have ever been allocated */ +static ssize_t +pool_stats_print_allocator_slabs_opened(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->allocator.slabs_opened); +} + +static struct pool_stats_attribute pool_stats_attr_allocator_slabs_opened = { + .attr = { .name = "allocator_slabs_opened", .mode = 0444, }, + .print = pool_stats_print_allocator_slabs_opened, +}; + +/* The number of times since loading that a slab has been re-opened */ +static ssize_t +pool_stats_print_allocator_slabs_reopened(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->allocator.slabs_reopened); +} + +static struct pool_stats_attribute pool_stats_attr_allocator_slabs_reopened = { + .attr = { .name = "allocator_slabs_reopened", .mode = 0444, }, + .print = pool_stats_print_allocator_slabs_reopened, +}; + +/* Number of times the on-disk journal was full */ +static ssize_t +pool_stats_print_journal_disk_full(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.disk_full); +} + +static struct pool_stats_attribute pool_stats_attr_journal_disk_full = { + .attr = { .name = "journal_disk_full", .mode = 0444, }, + .print = pool_stats_print_journal_disk_full, +}; + +/* Number of times the recovery journal requested slab journal commits. */ +static ssize_t +pool_stats_print_journal_slab_journal_commits_requested(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.slab_journal_commits_requested); +} + +static struct pool_stats_attribute pool_stats_attr_journal_slab_journal_commits_requested = { + .attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, }, + .print = pool_stats_print_journal_slab_journal_commits_requested, +}; + +/* The total number of items on which processing has started */ +static ssize_t +pool_stats_print_journal_entries_started(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.entries.started); +} + +static struct pool_stats_attribute pool_stats_attr_journal_entries_started = { + .attr = { .name = "journal_entries_started", .mode = 0444, }, + .print = pool_stats_print_journal_entries_started, +}; + +/* The total number of items for which a write operation has been issued */ +static ssize_t +pool_stats_print_journal_entries_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.entries.written); +} + +static struct pool_stats_attribute pool_stats_attr_journal_entries_written = { + .attr = { .name = "journal_entries_written", .mode = 0444, }, + .print = pool_stats_print_journal_entries_written, +}; + +/* The total number of items for which a write operation has completed */ +static ssize_t +pool_stats_print_journal_entries_committed(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.entries.committed); +} + +static struct pool_stats_attribute pool_stats_attr_journal_entries_committed = { + .attr = { .name = "journal_entries_committed", .mode = 0444, }, + .print = pool_stats_print_journal_entries_committed, +}; + +/* The total number of items on which processing has started */ +static ssize_t +pool_stats_print_journal_blocks_started(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.blocks.started); +} + +static struct pool_stats_attribute pool_stats_attr_journal_blocks_started = { + .attr = { .name = "journal_blocks_started", .mode = 0444, }, + .print = pool_stats_print_journal_blocks_started, +}; + +/* The total number of items for which a write operation has been issued */ +static ssize_t +pool_stats_print_journal_blocks_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.blocks.written); +} + +static struct pool_stats_attribute pool_stats_attr_journal_blocks_written = { + .attr = { .name = "journal_blocks_written", .mode = 0444, }, + .print = pool_stats_print_journal_blocks_written, +}; + +/* The total number of items for which a write operation has completed */ +static ssize_t +pool_stats_print_journal_blocks_committed(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->journal.blocks.committed); +} + +static struct pool_stats_attribute pool_stats_attr_journal_blocks_committed = { + .attr = { .name = "journal_blocks_committed", .mode = 0444, }, + .print = pool_stats_print_journal_blocks_committed, +}; + +/* Number of times the on-disk journal was full */ +static ssize_t +pool_stats_print_slab_journal_disk_full_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->slab_journal.disk_full_count); +} + +static struct pool_stats_attribute pool_stats_attr_slab_journal_disk_full_count = { + .attr = { .name = "slab_journal_disk_full_count", .mode = 0444, }, + .print = pool_stats_print_slab_journal_disk_full_count, +}; + +/* Number of times an entry was added over the flush threshold */ +static ssize_t +pool_stats_print_slab_journal_flush_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->slab_journal.flush_count); +} + +static struct pool_stats_attribute pool_stats_attr_slab_journal_flush_count = { + .attr = { .name = "slab_journal_flush_count", .mode = 0444, }, + .print = pool_stats_print_slab_journal_flush_count, +}; + +/* Number of times an entry was added over the block threshold */ +static ssize_t +pool_stats_print_slab_journal_blocked_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->slab_journal.blocked_count); +} + +static struct pool_stats_attribute pool_stats_attr_slab_journal_blocked_count = { + .attr = { .name = "slab_journal_blocked_count", .mode = 0444, }, + .print = pool_stats_print_slab_journal_blocked_count, +}; + +/* Number of times a tail block was written */ +static ssize_t +pool_stats_print_slab_journal_blocks_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->slab_journal.blocks_written); +} + +static struct pool_stats_attribute pool_stats_attr_slab_journal_blocks_written = { + .attr = { .name = "slab_journal_blocks_written", .mode = 0444, }, + .print = pool_stats_print_slab_journal_blocks_written, +}; + +/* Number of times we had to wait for the tail to write */ +static ssize_t +pool_stats_print_slab_journal_tail_busy_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->slab_journal.tail_busy_count); +} + +static struct pool_stats_attribute pool_stats_attr_slab_journal_tail_busy_count = { + .attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, }, + .print = pool_stats_print_slab_journal_tail_busy_count, +}; + +/* Number of blocks written */ +static ssize_t +pool_stats_print_slab_summary_blocks_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->slab_summary.blocks_written); +} + +static struct pool_stats_attribute pool_stats_attr_slab_summary_blocks_written = { + .attr = { .name = "slab_summary_blocks_written", .mode = 0444, }, + .print = pool_stats_print_slab_summary_blocks_written, +}; + +/* Number of reference blocks written */ +static ssize_t +pool_stats_print_ref_counts_blocks_written(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->ref_counts.blocks_written); +} + +static struct pool_stats_attribute pool_stats_attr_ref_counts_blocks_written = { + .attr = { .name = "ref_counts_blocks_written", .mode = 0444, }, + .print = pool_stats_print_ref_counts_blocks_written, +}; + +/* number of dirty (resident) pages */ +static ssize_t +pool_stats_print_block_map_dirty_pages(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.dirty_pages); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_dirty_pages = { + .attr = { .name = "block_map_dirty_pages", .mode = 0444, }, + .print = pool_stats_print_block_map_dirty_pages, +}; + +/* number of clean (resident) pages */ +static ssize_t +pool_stats_print_block_map_clean_pages(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.clean_pages); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_clean_pages = { + .attr = { .name = "block_map_clean_pages", .mode = 0444, }, + .print = pool_stats_print_block_map_clean_pages, +}; + +/* number of free pages */ +static ssize_t +pool_stats_print_block_map_free_pages(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.free_pages); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_free_pages = { + .attr = { .name = "block_map_free_pages", .mode = 0444, }, + .print = pool_stats_print_block_map_free_pages, +}; + +/* number of pages in failed state */ +static ssize_t +pool_stats_print_block_map_failed_pages(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.failed_pages); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_failed_pages = { + .attr = { .name = "block_map_failed_pages", .mode = 0444, }, + .print = pool_stats_print_block_map_failed_pages, +}; + +/* number of pages incoming */ +static ssize_t +pool_stats_print_block_map_incoming_pages(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.incoming_pages); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_incoming_pages = { + .attr = { .name = "block_map_incoming_pages", .mode = 0444, }, + .print = pool_stats_print_block_map_incoming_pages, +}; + +/* number of pages outgoing */ +static ssize_t +pool_stats_print_block_map_outgoing_pages(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.outgoing_pages); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_outgoing_pages = { + .attr = { .name = "block_map_outgoing_pages", .mode = 0444, }, + .print = pool_stats_print_block_map_outgoing_pages, +}; + +/* how many times free page not avail */ +static ssize_t +pool_stats_print_block_map_cache_pressure(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->block_map.cache_pressure); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_cache_pressure = { + .attr = { .name = "block_map_cache_pressure", .mode = 0444, }, + .print = pool_stats_print_block_map_cache_pressure, +}; + +/* number of get_vdo_page() calls for read */ +static ssize_t +pool_stats_print_block_map_read_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.read_count); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_read_count = { + .attr = { .name = "block_map_read_count", .mode = 0444, }, + .print = pool_stats_print_block_map_read_count, +}; + +/* number of get_vdo_page() calls for write */ +static ssize_t +pool_stats_print_block_map_write_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.write_count); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_write_count = { + .attr = { .name = "block_map_write_count", .mode = 0444, }, + .print = pool_stats_print_block_map_write_count, +}; + +/* number of times pages failed to read */ +static ssize_t +pool_stats_print_block_map_failed_reads(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.failed_reads); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_failed_reads = { + .attr = { .name = "block_map_failed_reads", .mode = 0444, }, + .print = pool_stats_print_block_map_failed_reads, +}; + +/* number of times pages failed to write */ +static ssize_t +pool_stats_print_block_map_failed_writes(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.failed_writes); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_failed_writes = { + .attr = { .name = "block_map_failed_writes", .mode = 0444, }, + .print = pool_stats_print_block_map_failed_writes, +}; + +/* number of gets that are reclaimed */ +static ssize_t +pool_stats_print_block_map_reclaimed(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.reclaimed); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_reclaimed = { + .attr = { .name = "block_map_reclaimed", .mode = 0444, }, + .print = pool_stats_print_block_map_reclaimed, +}; + +/* number of gets for outgoing pages */ +static ssize_t +pool_stats_print_block_map_read_outgoing(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.read_outgoing); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_read_outgoing = { + .attr = { .name = "block_map_read_outgoing", .mode = 0444, }, + .print = pool_stats_print_block_map_read_outgoing, +}; + +/* number of gets that were already there */ +static ssize_t +pool_stats_print_block_map_found_in_cache(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.found_in_cache); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_found_in_cache = { + .attr = { .name = "block_map_found_in_cache", .mode = 0444, }, + .print = pool_stats_print_block_map_found_in_cache, +}; + +/* number of gets requiring discard */ +static ssize_t +pool_stats_print_block_map_discard_required(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.discard_required); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_discard_required = { + .attr = { .name = "block_map_discard_required", .mode = 0444, }, + .print = pool_stats_print_block_map_discard_required, +}; + +/* number of gets enqueued for their page */ +static ssize_t +pool_stats_print_block_map_wait_for_page(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.wait_for_page); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_wait_for_page = { + .attr = { .name = "block_map_wait_for_page", .mode = 0444, }, + .print = pool_stats_print_block_map_wait_for_page, +}; + +/* number of gets that have to fetch */ +static ssize_t +pool_stats_print_block_map_fetch_required(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.fetch_required); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_fetch_required = { + .attr = { .name = "block_map_fetch_required", .mode = 0444, }, + .print = pool_stats_print_block_map_fetch_required, +}; + +/* number of page fetches */ +static ssize_t +pool_stats_print_block_map_pages_loaded(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.pages_loaded); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_pages_loaded = { + .attr = { .name = "block_map_pages_loaded", .mode = 0444, }, + .print = pool_stats_print_block_map_pages_loaded, +}; + +/* number of page saves */ +static ssize_t +pool_stats_print_block_map_pages_saved(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.pages_saved); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_pages_saved = { + .attr = { .name = "block_map_pages_saved", .mode = 0444, }, + .print = pool_stats_print_block_map_pages_saved, +}; + +/* the number of flushes issued */ +static ssize_t +pool_stats_print_block_map_flush_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->block_map.flush_count); +} + +static struct pool_stats_attribute pool_stats_attr_block_map_flush_count = { + .attr = { .name = "block_map_flush_count", .mode = 0444, }, + .print = pool_stats_print_block_map_flush_count, +}; + +/* Number of times the UDS advice proved correct */ +static ssize_t +pool_stats_print_hash_lock_dedupe_advice_valid(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->hash_lock.dedupe_advice_valid); +} + +static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_valid = { + .attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, }, + .print = pool_stats_print_hash_lock_dedupe_advice_valid, +}; + +/* Number of times the UDS advice proved incorrect */ +static ssize_t +pool_stats_print_hash_lock_dedupe_advice_stale(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->hash_lock.dedupe_advice_stale); +} + +static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_stale = { + .attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, }, + .print = pool_stats_print_hash_lock_dedupe_advice_stale, +}; + +/* Number of writes with the same data as another in-flight write */ +static ssize_t +pool_stats_print_hash_lock_concurrent_data_matches(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->hash_lock.concurrent_data_matches); +} + +static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_data_matches = { + .attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, }, + .print = pool_stats_print_hash_lock_concurrent_data_matches, +}; + +/* Number of writes whose hash collided with an in-flight write */ +static ssize_t +pool_stats_print_hash_lock_concurrent_hash_collisions(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->hash_lock.concurrent_hash_collisions); +} + +static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_hash_collisions = { + .attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, }, + .print = pool_stats_print_hash_lock_concurrent_hash_collisions, +}; + +/* Current number of dedupe queries that are in flight */ +static ssize_t +pool_stats_print_hash_lock_curr_dedupe_queries(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->hash_lock.curr_dedupe_queries); +} + +static struct pool_stats_attribute pool_stats_attr_hash_lock_curr_dedupe_queries = { + .attr = { .name = "hash_lock_curr_dedupe_queries", .mode = 0444, }, + .print = pool_stats_print_hash_lock_curr_dedupe_queries, +}; + +/* number of times VDO got an invalid dedupe advice PBN from UDS */ +static ssize_t +pool_stats_print_errors_invalid_advice_pbn_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->errors.invalid_advice_pbn_count); +} + +static struct pool_stats_attribute pool_stats_attr_errors_invalid_advice_pbn_count = { + .attr = { .name = "errors_invalid_advice_pbn_count", .mode = 0444, }, + .print = pool_stats_print_errors_invalid_advice_pbn_count, +}; + +/* number of times a VIO completed with a VDO_NO_SPACE error */ +static ssize_t +pool_stats_print_errors_no_space_error_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->errors.no_space_error_count); +} + +static struct pool_stats_attribute pool_stats_attr_errors_no_space_error_count = { + .attr = { .name = "errors_no_space_error_count", .mode = 0444, }, + .print = pool_stats_print_errors_no_space_error_count, +}; + +/* number of times a VIO completed with a VDO_READ_ONLY error */ +static ssize_t +pool_stats_print_errors_read_only_error_count(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->errors.read_only_error_count); +} + +static struct pool_stats_attribute pool_stats_attr_errors_read_only_error_count = { + .attr = { .name = "errors_read_only_error_count", .mode = 0444, }, + .print = pool_stats_print_errors_read_only_error_count, +}; + +/* The VDO instance */ +static ssize_t +pool_stats_print_instance(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->instance); +} + +static struct pool_stats_attribute pool_stats_attr_instance = { + .attr = { .name = "instance", .mode = 0444, }, + .print = pool_stats_print_instance, +}; + +/* Current number of active VIOs */ +static ssize_t +pool_stats_print_current_vios_in_progress(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->current_vios_in_progress); +} + +static struct pool_stats_attribute pool_stats_attr_current_vios_in_progress = { + .attr = { .name = "current_vios_in_progress", .mode = 0444, }, + .print = pool_stats_print_current_vios_in_progress, +}; + +/* Maximum number of active VIOs */ +static ssize_t +pool_stats_print_max_vios(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%u\n", stats->max_vios); +} + +static struct pool_stats_attribute pool_stats_attr_max_vios = { + .attr = { .name = "max_vios", .mode = 0444, }, + .print = pool_stats_print_max_vios, +}; + +/* Number of times the UDS index was too slow in responding */ +static ssize_t +pool_stats_print_dedupe_advice_timeouts(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->dedupe_advice_timeouts); +} + +static struct pool_stats_attribute pool_stats_attr_dedupe_advice_timeouts = { + .attr = { .name = "dedupe_advice_timeouts", .mode = 0444, }, + .print = pool_stats_print_dedupe_advice_timeouts, +}; + +/* Number of flush requests submitted to the storage device */ +static ssize_t +pool_stats_print_flush_out(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->flush_out); +} + +static struct pool_stats_attribute pool_stats_attr_flush_out = { + .attr = { .name = "flush_out", .mode = 0444, }, + .print = pool_stats_print_flush_out, +}; + +/* Logical block size */ +static ssize_t +pool_stats_print_logical_block_size(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->logical_block_size); +} + +static struct pool_stats_attribute pool_stats_attr_logical_block_size = { + .attr = { .name = "logical_block_size", .mode = 0444, }, + .print = pool_stats_print_logical_block_size, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_in_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_read = { + .attr = { .name = "bios_in_read", .mode = 0444, }, + .print = pool_stats_print_bios_in_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_in_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_write = { + .attr = { .name = "bios_in_write", .mode = 0444, }, + .print = pool_stats_print_bios_in_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_in_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_empty_flush = { + .attr = { .name = "bios_in_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_in_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_in_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_discard = { + .attr = { .name = "bios_in_discard", .mode = 0444, }, + .print = pool_stats_print_bios_in_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_in_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_flush = { + .attr = { .name = "bios_in_flush", .mode = 0444, }, + .print = pool_stats_print_bios_in_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_in_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_fua = { + .attr = { .name = "bios_in_fua", .mode = 0444, }, + .print = pool_stats_print_bios_in_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_in_partial_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_partial.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_partial_read = { + .attr = { .name = "bios_in_partial_read", .mode = 0444, }, + .print = pool_stats_print_bios_in_partial_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_in_partial_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_partial.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_partial_write = { + .attr = { .name = "bios_in_partial_write", .mode = 0444, }, + .print = pool_stats_print_bios_in_partial_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_in_partial_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_partial.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_partial_empty_flush = { + .attr = { .name = "bios_in_partial_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_in_partial_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_in_partial_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_partial.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_partial_discard = { + .attr = { .name = "bios_in_partial_discard", .mode = 0444, }, + .print = pool_stats_print_bios_in_partial_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_in_partial_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_partial.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_partial_flush = { + .attr = { .name = "bios_in_partial_flush", .mode = 0444, }, + .print = pool_stats_print_bios_in_partial_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_in_partial_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_partial.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_partial_fua = { + .attr = { .name = "bios_in_partial_fua", .mode = 0444, }, + .print = pool_stats_print_bios_in_partial_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_out_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_read = { + .attr = { .name = "bios_out_read", .mode = 0444, }, + .print = pool_stats_print_bios_out_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_out_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_write = { + .attr = { .name = "bios_out_write", .mode = 0444, }, + .print = pool_stats_print_bios_out_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_out_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_empty_flush = { + .attr = { .name = "bios_out_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_out_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_out_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_discard = { + .attr = { .name = "bios_out_discard", .mode = 0444, }, + .print = pool_stats_print_bios_out_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_out_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_flush = { + .attr = { .name = "bios_out_flush", .mode = 0444, }, + .print = pool_stats_print_bios_out_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_out_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_fua = { + .attr = { .name = "bios_out_fua", .mode = 0444, }, + .print = pool_stats_print_bios_out_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_meta_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_read = { + .attr = { .name = "bios_meta_read", .mode = 0444, }, + .print = pool_stats_print_bios_meta_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_meta_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_write = { + .attr = { .name = "bios_meta_write", .mode = 0444, }, + .print = pool_stats_print_bios_meta_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_meta_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_empty_flush = { + .attr = { .name = "bios_meta_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_meta_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_meta_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_discard = { + .attr = { .name = "bios_meta_discard", .mode = 0444, }, + .print = pool_stats_print_bios_meta_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_meta_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_flush = { + .attr = { .name = "bios_meta_flush", .mode = 0444, }, + .print = pool_stats_print_bios_meta_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_meta_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_fua = { + .attr = { .name = "bios_meta_fua", .mode = 0444, }, + .print = pool_stats_print_bios_meta_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_journal_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_read = { + .attr = { .name = "bios_journal_read", .mode = 0444, }, + .print = pool_stats_print_bios_journal_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_journal_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_write = { + .attr = { .name = "bios_journal_write", .mode = 0444, }, + .print = pool_stats_print_bios_journal_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_journal_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_empty_flush = { + .attr = { .name = "bios_journal_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_journal_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_journal_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_discard = { + .attr = { .name = "bios_journal_discard", .mode = 0444, }, + .print = pool_stats_print_bios_journal_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_journal_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_flush = { + .attr = { .name = "bios_journal_flush", .mode = 0444, }, + .print = pool_stats_print_bios_journal_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_journal_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_fua = { + .attr = { .name = "bios_journal_fua", .mode = 0444, }, + .print = pool_stats_print_bios_journal_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_page_cache_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_read = { + .attr = { .name = "bios_page_cache_read", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_page_cache_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_write = { + .attr = { .name = "bios_page_cache_write", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_page_cache_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_empty_flush = { + .attr = { .name = "bios_page_cache_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_page_cache_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_discard = { + .attr = { .name = "bios_page_cache_discard", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_page_cache_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_flush = { + .attr = { .name = "bios_page_cache_flush", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_page_cache_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_fua = { + .attr = { .name = "bios_page_cache_fua", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_out_completed_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out_completed.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_completed_read = { + .attr = { .name = "bios_out_completed_read", .mode = 0444, }, + .print = pool_stats_print_bios_out_completed_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_out_completed_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out_completed.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_completed_write = { + .attr = { .name = "bios_out_completed_write", .mode = 0444, }, + .print = pool_stats_print_bios_out_completed_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_out_completed_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out_completed.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_completed_empty_flush = { + .attr = { .name = "bios_out_completed_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_out_completed_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_out_completed_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out_completed.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_completed_discard = { + .attr = { .name = "bios_out_completed_discard", .mode = 0444, }, + .print = pool_stats_print_bios_out_completed_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_out_completed_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out_completed.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_completed_flush = { + .attr = { .name = "bios_out_completed_flush", .mode = 0444, }, + .print = pool_stats_print_bios_out_completed_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_out_completed_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_out_completed.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_out_completed_fua = { + .attr = { .name = "bios_out_completed_fua", .mode = 0444, }, + .print = pool_stats_print_bios_out_completed_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_meta_completed_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta_completed.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_read = { + .attr = { .name = "bios_meta_completed_read", .mode = 0444, }, + .print = pool_stats_print_bios_meta_completed_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_meta_completed_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta_completed.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_write = { + .attr = { .name = "bios_meta_completed_write", .mode = 0444, }, + .print = pool_stats_print_bios_meta_completed_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_meta_completed_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta_completed.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_empty_flush = { + .attr = { .name = "bios_meta_completed_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_meta_completed_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_meta_completed_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta_completed.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_discard = { + .attr = { .name = "bios_meta_completed_discard", .mode = 0444, }, + .print = pool_stats_print_bios_meta_completed_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_meta_completed_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta_completed.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_flush = { + .attr = { .name = "bios_meta_completed_flush", .mode = 0444, }, + .print = pool_stats_print_bios_meta_completed_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_meta_completed_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_meta_completed.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_fua = { + .attr = { .name = "bios_meta_completed_fua", .mode = 0444, }, + .print = pool_stats_print_bios_meta_completed_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_journal_completed_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal_completed.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_read = { + .attr = { .name = "bios_journal_completed_read", .mode = 0444, }, + .print = pool_stats_print_bios_journal_completed_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_journal_completed_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal_completed.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_write = { + .attr = { .name = "bios_journal_completed_write", .mode = 0444, }, + .print = pool_stats_print_bios_journal_completed_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_journal_completed_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal_completed.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_empty_flush = { + .attr = { .name = "bios_journal_completed_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_journal_completed_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_journal_completed_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal_completed.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_discard = { + .attr = { .name = "bios_journal_completed_discard", .mode = 0444, }, + .print = pool_stats_print_bios_journal_completed_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_journal_completed_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal_completed.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_flush = { + .attr = { .name = "bios_journal_completed_flush", .mode = 0444, }, + .print = pool_stats_print_bios_journal_completed_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_journal_completed_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_journal_completed.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_fua = { + .attr = { .name = "bios_journal_completed_fua", .mode = 0444, }, + .print = pool_stats_print_bios_journal_completed_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_page_cache_completed_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_read = { + .attr = { .name = "bios_page_cache_completed_read", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_completed_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_page_cache_completed_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_write = { + .attr = { .name = "bios_page_cache_completed_write", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_completed_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_page_cache_completed_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_empty_flush = { + .attr = { .name = "bios_page_cache_completed_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_completed_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_page_cache_completed_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_discard = { + .attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_completed_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_page_cache_completed_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_flush = { + .attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_completed_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_page_cache_completed_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_fua = { + .attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, }, + .print = pool_stats_print_bios_page_cache_completed_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_acknowledged_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_read = { + .attr = { .name = "bios_acknowledged_read", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_acknowledged_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_write = { + .attr = { .name = "bios_acknowledged_write", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_acknowledged_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_empty_flush = { + .attr = { .name = "bios_acknowledged_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_acknowledged_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_discard = { + .attr = { .name = "bios_acknowledged_discard", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_acknowledged_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_flush = { + .attr = { .name = "bios_acknowledged_flush", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_acknowledged_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_fua = { + .attr = { .name = "bios_acknowledged_fua", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_acknowledged_partial_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_read = { + .attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_partial_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_acknowledged_partial_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_write = { + .attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_partial_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_acknowledged_partial_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_empty_flush = { + .attr = { .name = "bios_acknowledged_partial_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_partial_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_acknowledged_partial_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_discard = { + .attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_partial_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_acknowledged_partial_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_flush = { + .attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_partial_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_acknowledged_partial_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_fua = { + .attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, }, + .print = pool_stats_print_bios_acknowledged_partial_fua, +}; + +/* Number of REQ_OP_READ bios */ +static ssize_t +pool_stats_print_bios_in_progress_read(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_progress.read); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_progress_read = { + .attr = { .name = "bios_in_progress_read", .mode = 0444, }, + .print = pool_stats_print_bios_in_progress_read, +}; + +/* Number of REQ_OP_WRITE bios with data */ +static ssize_t +pool_stats_print_bios_in_progress_write(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_progress.write); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_progress_write = { + .attr = { .name = "bios_in_progress_write", .mode = 0444, }, + .print = pool_stats_print_bios_in_progress_write, +}; + +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ +static ssize_t +pool_stats_print_bios_in_progress_empty_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_progress.empty_flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_progress_empty_flush = { + .attr = { .name = "bios_in_progress_empty_flush", .mode = 0444, }, + .print = pool_stats_print_bios_in_progress_empty_flush, +}; + +/* Number of REQ_OP_DISCARD bios */ +static ssize_t +pool_stats_print_bios_in_progress_discard(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_progress.discard); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_progress_discard = { + .attr = { .name = "bios_in_progress_discard", .mode = 0444, }, + .print = pool_stats_print_bios_in_progress_discard, +}; + +/* Number of bios tagged with REQ_PREFLUSH */ +static ssize_t +pool_stats_print_bios_in_progress_flush(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_progress.flush); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_progress_flush = { + .attr = { .name = "bios_in_progress_flush", .mode = 0444, }, + .print = pool_stats_print_bios_in_progress_flush, +}; + +/* Number of bios tagged with REQ_FUA */ +static ssize_t +pool_stats_print_bios_in_progress_fua(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->bios_in_progress.fua); +} + +static struct pool_stats_attribute pool_stats_attr_bios_in_progress_fua = { + .attr = { .name = "bios_in_progress_fua", .mode = 0444, }, + .print = pool_stats_print_bios_in_progress_fua, +}; + +/* Tracked bytes currently allocated. */ +static ssize_t +pool_stats_print_memory_usage_bytes_used(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->memory_usage.bytes_used); +} + +static struct pool_stats_attribute pool_stats_attr_memory_usage_bytes_used = { + .attr = { .name = "memory_usage_bytes_used", .mode = 0444, }, + .print = pool_stats_print_memory_usage_bytes_used, +}; + +/* Maximum tracked bytes allocated. */ +static ssize_t +pool_stats_print_memory_usage_peak_bytes_used(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->memory_usage.peak_bytes_used); +} + +static struct pool_stats_attribute pool_stats_attr_memory_usage_peak_bytes_used = { + .attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, }, + .print = pool_stats_print_memory_usage_peak_bytes_used, +}; + +/* Number of records stored in the index */ +static ssize_t +pool_stats_print_index_entries_indexed(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.entries_indexed); +} + +static struct pool_stats_attribute pool_stats_attr_index_entries_indexed = { + .attr = { .name = "index_entries_indexed", .mode = 0444, }, + .print = pool_stats_print_index_entries_indexed, +}; + +/* Number of post calls that found an existing entry */ +static ssize_t +pool_stats_print_index_posts_found(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.posts_found); +} + +static struct pool_stats_attribute pool_stats_attr_index_posts_found = { + .attr = { .name = "index_posts_found", .mode = 0444, }, + .print = pool_stats_print_index_posts_found, +}; + +/* Number of post calls that added a new entry */ +static ssize_t +pool_stats_print_index_posts_not_found(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.posts_not_found); +} + +static struct pool_stats_attribute pool_stats_attr_index_posts_not_found = { + .attr = { .name = "index_posts_not_found", .mode = 0444, }, + .print = pool_stats_print_index_posts_not_found, +}; + +/* Number of query calls that found an existing entry */ +static ssize_t +pool_stats_print_index_queries_found(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.queries_found); +} + +static struct pool_stats_attribute pool_stats_attr_index_queries_found = { + .attr = { .name = "index_queries_found", .mode = 0444, }, + .print = pool_stats_print_index_queries_found, +}; + +/* Number of query calls that added a new entry */ +static ssize_t +pool_stats_print_index_queries_not_found(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.queries_not_found); +} + +static struct pool_stats_attribute pool_stats_attr_index_queries_not_found = { + .attr = { .name = "index_queries_not_found", .mode = 0444, }, + .print = pool_stats_print_index_queries_not_found, +}; + +/* Number of update calls that found an existing entry */ +static ssize_t +pool_stats_print_index_updates_found(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.updates_found); +} + +static struct pool_stats_attribute pool_stats_attr_index_updates_found = { + .attr = { .name = "index_updates_found", .mode = 0444, }, + .print = pool_stats_print_index_updates_found, +}; + +/* Number of update calls that added a new entry */ +static ssize_t +pool_stats_print_index_updates_not_found(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.updates_not_found); +} + +static struct pool_stats_attribute pool_stats_attr_index_updates_not_found = { + .attr = { .name = "index_updates_not_found", .mode = 0444, }, + .print = pool_stats_print_index_updates_not_found, +}; + +/* Number of entries discarded */ +static ssize_t +pool_stats_print_index_entries_discarded(struct vdo_statistics *stats, char *buf) +{ + return sprintf(buf, "%llu\n", stats->index.entries_discarded); +} + +static struct pool_stats_attribute pool_stats_attr_index_entries_discarded = { + .attr = { .name = "index_entries_discarded", .mode = 0444, }, + .print = pool_stats_print_index_entries_discarded, +}; + +struct attribute *vdo_pool_stats_attrs[] = { + &pool_stats_attr_data_blocks_used.attr, + &pool_stats_attr_overhead_blocks_used.attr, + &pool_stats_attr_logical_blocks_used.attr, + &pool_stats_attr_physical_blocks.attr, + &pool_stats_attr_logical_blocks.attr, + &pool_stats_attr_block_map_cache_size.attr, + &pool_stats_attr_block_size.attr, + &pool_stats_attr_complete_recoveries.attr, + &pool_stats_attr_read_only_recoveries.attr, + &pool_stats_attr_mode.attr, + &pool_stats_attr_in_recovery_mode.attr, + &pool_stats_attr_recovery_percentage.attr, + &pool_stats_attr_packer_compressed_fragments_written.attr, + &pool_stats_attr_packer_compressed_blocks_written.attr, + &pool_stats_attr_packer_compressed_fragments_in_packer.attr, + &pool_stats_attr_allocator_slab_count.attr, + &pool_stats_attr_allocator_slabs_opened.attr, + &pool_stats_attr_allocator_slabs_reopened.attr, + &pool_stats_attr_journal_disk_full.attr, + &pool_stats_attr_journal_slab_journal_commits_requested.attr, + &pool_stats_attr_journal_entries_started.attr, + &pool_stats_attr_journal_entries_written.attr, + &pool_stats_attr_journal_entries_committed.attr, + &pool_stats_attr_journal_blocks_started.attr, + &pool_stats_attr_journal_blocks_written.attr, + &pool_stats_attr_journal_blocks_committed.attr, + &pool_stats_attr_slab_journal_disk_full_count.attr, + &pool_stats_attr_slab_journal_flush_count.attr, + &pool_stats_attr_slab_journal_blocked_count.attr, + &pool_stats_attr_slab_journal_blocks_written.attr, + &pool_stats_attr_slab_journal_tail_busy_count.attr, + &pool_stats_attr_slab_summary_blocks_written.attr, + &pool_stats_attr_ref_counts_blocks_written.attr, + &pool_stats_attr_block_map_dirty_pages.attr, + &pool_stats_attr_block_map_clean_pages.attr, + &pool_stats_attr_block_map_free_pages.attr, + &pool_stats_attr_block_map_failed_pages.attr, + &pool_stats_attr_block_map_incoming_pages.attr, + &pool_stats_attr_block_map_outgoing_pages.attr, + &pool_stats_attr_block_map_cache_pressure.attr, + &pool_stats_attr_block_map_read_count.attr, + &pool_stats_attr_block_map_write_count.attr, + &pool_stats_attr_block_map_failed_reads.attr, + &pool_stats_attr_block_map_failed_writes.attr, + &pool_stats_attr_block_map_reclaimed.attr, + &pool_stats_attr_block_map_read_outgoing.attr, + &pool_stats_attr_block_map_found_in_cache.attr, + &pool_stats_attr_block_map_discard_required.attr, + &pool_stats_attr_block_map_wait_for_page.attr, + &pool_stats_attr_block_map_fetch_required.attr, + &pool_stats_attr_block_map_pages_loaded.attr, + &pool_stats_attr_block_map_pages_saved.attr, + &pool_stats_attr_block_map_flush_count.attr, + &pool_stats_attr_hash_lock_dedupe_advice_valid.attr, + &pool_stats_attr_hash_lock_dedupe_advice_stale.attr, + &pool_stats_attr_hash_lock_concurrent_data_matches.attr, + &pool_stats_attr_hash_lock_concurrent_hash_collisions.attr, + &pool_stats_attr_hash_lock_curr_dedupe_queries.attr, + &pool_stats_attr_errors_invalid_advice_pbn_count.attr, + &pool_stats_attr_errors_no_space_error_count.attr, + &pool_stats_attr_errors_read_only_error_count.attr, + &pool_stats_attr_instance.attr, + &pool_stats_attr_current_vios_in_progress.attr, + &pool_stats_attr_max_vios.attr, + &pool_stats_attr_dedupe_advice_timeouts.attr, + &pool_stats_attr_flush_out.attr, + &pool_stats_attr_logical_block_size.attr, + &pool_stats_attr_bios_in_read.attr, + &pool_stats_attr_bios_in_write.attr, + &pool_stats_attr_bios_in_empty_flush.attr, + &pool_stats_attr_bios_in_discard.attr, + &pool_stats_attr_bios_in_flush.attr, + &pool_stats_attr_bios_in_fua.attr, + &pool_stats_attr_bios_in_partial_read.attr, + &pool_stats_attr_bios_in_partial_write.attr, + &pool_stats_attr_bios_in_partial_empty_flush.attr, + &pool_stats_attr_bios_in_partial_discard.attr, + &pool_stats_attr_bios_in_partial_flush.attr, + &pool_stats_attr_bios_in_partial_fua.attr, + &pool_stats_attr_bios_out_read.attr, + &pool_stats_attr_bios_out_write.attr, + &pool_stats_attr_bios_out_empty_flush.attr, + &pool_stats_attr_bios_out_discard.attr, + &pool_stats_attr_bios_out_flush.attr, + &pool_stats_attr_bios_out_fua.attr, + &pool_stats_attr_bios_meta_read.attr, + &pool_stats_attr_bios_meta_write.attr, + &pool_stats_attr_bios_meta_empty_flush.attr, + &pool_stats_attr_bios_meta_discard.attr, + &pool_stats_attr_bios_meta_flush.attr, + &pool_stats_attr_bios_meta_fua.attr, + &pool_stats_attr_bios_journal_read.attr, + &pool_stats_attr_bios_journal_write.attr, + &pool_stats_attr_bios_journal_empty_flush.attr, + &pool_stats_attr_bios_journal_discard.attr, + &pool_stats_attr_bios_journal_flush.attr, + &pool_stats_attr_bios_journal_fua.attr, + &pool_stats_attr_bios_page_cache_read.attr, + &pool_stats_attr_bios_page_cache_write.attr, + &pool_stats_attr_bios_page_cache_empty_flush.attr, + &pool_stats_attr_bios_page_cache_discard.attr, + &pool_stats_attr_bios_page_cache_flush.attr, + &pool_stats_attr_bios_page_cache_fua.attr, + &pool_stats_attr_bios_out_completed_read.attr, + &pool_stats_attr_bios_out_completed_write.attr, + &pool_stats_attr_bios_out_completed_empty_flush.attr, + &pool_stats_attr_bios_out_completed_discard.attr, + &pool_stats_attr_bios_out_completed_flush.attr, + &pool_stats_attr_bios_out_completed_fua.attr, + &pool_stats_attr_bios_meta_completed_read.attr, + &pool_stats_attr_bios_meta_completed_write.attr, + &pool_stats_attr_bios_meta_completed_empty_flush.attr, + &pool_stats_attr_bios_meta_completed_discard.attr, + &pool_stats_attr_bios_meta_completed_flush.attr, + &pool_stats_attr_bios_meta_completed_fua.attr, + &pool_stats_attr_bios_journal_completed_read.attr, + &pool_stats_attr_bios_journal_completed_write.attr, + &pool_stats_attr_bios_journal_completed_empty_flush.attr, + &pool_stats_attr_bios_journal_completed_discard.attr, + &pool_stats_attr_bios_journal_completed_flush.attr, + &pool_stats_attr_bios_journal_completed_fua.attr, + &pool_stats_attr_bios_page_cache_completed_read.attr, + &pool_stats_attr_bios_page_cache_completed_write.attr, + &pool_stats_attr_bios_page_cache_completed_empty_flush.attr, + &pool_stats_attr_bios_page_cache_completed_discard.attr, + &pool_stats_attr_bios_page_cache_completed_flush.attr, + &pool_stats_attr_bios_page_cache_completed_fua.attr, + &pool_stats_attr_bios_acknowledged_read.attr, + &pool_stats_attr_bios_acknowledged_write.attr, + &pool_stats_attr_bios_acknowledged_empty_flush.attr, + &pool_stats_attr_bios_acknowledged_discard.attr, + &pool_stats_attr_bios_acknowledged_flush.attr, + &pool_stats_attr_bios_acknowledged_fua.attr, + &pool_stats_attr_bios_acknowledged_partial_read.attr, + &pool_stats_attr_bios_acknowledged_partial_write.attr, + &pool_stats_attr_bios_acknowledged_partial_empty_flush.attr, + &pool_stats_attr_bios_acknowledged_partial_discard.attr, + &pool_stats_attr_bios_acknowledged_partial_flush.attr, + &pool_stats_attr_bios_acknowledged_partial_fua.attr, + &pool_stats_attr_bios_in_progress_read.attr, + &pool_stats_attr_bios_in_progress_write.attr, + &pool_stats_attr_bios_in_progress_empty_flush.attr, + &pool_stats_attr_bios_in_progress_discard.attr, + &pool_stats_attr_bios_in_progress_flush.attr, + &pool_stats_attr_bios_in_progress_fua.attr, + &pool_stats_attr_memory_usage_bytes_used.attr, + &pool_stats_attr_memory_usage_peak_bytes_used.attr, + &pool_stats_attr_index_entries_indexed.attr, + &pool_stats_attr_index_posts_found.attr, + &pool_stats_attr_index_posts_not_found.attr, + &pool_stats_attr_index_queries_found.attr, + &pool_stats_attr_index_queries_not_found.attr, + &pool_stats_attr_index_updates_found.attr, + &pool_stats_attr_index_updates_not_found.attr, + &pool_stats_attr_index_entries_discarded.attr, + NULL, +}; diff --git a/drivers/md/dm-vdo/pool-sysfs.c b/drivers/md/dm-vdo/pool-sysfs.c new file mode 100644 index 00000000000..73006e5859d --- /dev/null +++ b/drivers/md/dm-vdo/pool-sysfs.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "pool-sysfs.h" + +#include <linux/kstrtox.h> + +#include "memory-alloc.h" +#include "string-utils.h" + +#include "data-vio.h" +#include "dedupe.h" +#include "vdo.h" + +struct pool_attribute { + struct attribute attr; + ssize_t (*show)(struct vdo *vdo, char *buf); + ssize_t (*store)(struct vdo *vdo, const char *value, size_t count); +}; + +static ssize_t vdo_pool_attr_show(struct kobject *directory, struct attribute *attr, char *buf) +{ + struct pool_attribute *pool_attr = container_of(attr, struct pool_attribute, attr); + struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); + + if (pool_attr->show == NULL) + return -EINVAL; + return pool_attr->show(vdo, buf); +} + +static ssize_t vdo_pool_attr_store(struct kobject *directory, + struct attribute *attr, + const char *buf, + size_t length) +{ + struct pool_attribute *pool_attr = container_of(attr, struct pool_attribute, attr); + struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); + + if (pool_attr->store == NULL) + return -EINVAL; + return pool_attr->store(vdo, buf, length); +} + +static const struct sysfs_ops vdo_pool_sysfs_ops = { + .show = vdo_pool_attr_show, + .store = vdo_pool_attr_store, +}; + +static ssize_t pool_compressing_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%s\n", (vdo_get_compressing(vdo) ? "1" : "0")); +} + +static ssize_t pool_discards_active_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", get_data_vio_pool_active_discards(vdo->data_vio_pool)); +} + +static ssize_t pool_discards_limit_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", get_data_vio_pool_discard_limit(vdo->data_vio_pool)); +} + +static ssize_t pool_discards_limit_store(struct vdo *vdo, const char *buf, size_t length) +{ + unsigned int value; + int result; + + if ((length > 12) || (kstrtouint(buf, 10, &value) < 0) || (value < 1)) + return -EINVAL; + + result = set_data_vio_pool_discard_limit(vdo->data_vio_pool, value); + if (result != VDO_SUCCESS) + return -EINVAL; + + return length; +} + +static ssize_t pool_discards_maximum_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", get_data_vio_pool_maximum_discards(vdo->data_vio_pool)); +} + +static ssize_t pool_instance_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", vdo->instance); +} + +static ssize_t pool_requests_active_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", get_data_vio_pool_active_requests(vdo->data_vio_pool)); +} + +static ssize_t pool_requests_limit_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", get_data_vio_pool_request_limit(vdo->data_vio_pool)); +} + +static ssize_t pool_requests_maximum_show(struct vdo *vdo, char *buf) +{ + return sprintf(buf, "%u\n", get_data_vio_pool_maximum_requests(vdo->data_vio_pool)); +} + +static void vdo_pool_release(struct kobject *directory) +{ + UDS_FREE(container_of(directory, struct vdo, vdo_directory)); +} + +static struct pool_attribute vdo_pool_compressing_attr = { + .attr = { + .name = "compressing", + .mode = 0444, + }, + .show = pool_compressing_show, +}; + +static struct pool_attribute vdo_pool_discards_active_attr = { + .attr = { + .name = "discards_active", + .mode = 0444, + }, + .show = pool_discards_active_show, +}; + +static struct pool_attribute vdo_pool_discards_limit_attr = { + .attr = { + .name = "discards_limit", + .mode = 0644, + }, + .show = pool_discards_limit_show, + .store = pool_discards_limit_store, +}; + +static struct pool_attribute vdo_pool_discards_maximum_attr = { + .attr = { + .name = "discards_maximum", + .mode = 0444, + }, + .show = pool_discards_maximum_show, +}; + +static struct pool_attribute vdo_pool_instance_attr = { + .attr = { + .name = "instance", + .mode = 0444, + }, + .show = pool_instance_show, +}; + +static struct pool_attribute vdo_pool_requests_active_attr = { + .attr = { + .name = "requests_active", + .mode = 0444, + }, + .show = pool_requests_active_show, +}; + +static struct pool_attribute vdo_pool_requests_limit_attr = { + .attr = { + .name = "requests_limit", + .mode = 0444, + }, + .show = pool_requests_limit_show, +}; + +static struct pool_attribute vdo_pool_requests_maximum_attr = { + .attr = { + .name = "requests_maximum", + .mode = 0444, + }, + .show = pool_requests_maximum_show, +}; + +static struct attribute *pool_attrs[] = { + &vdo_pool_compressing_attr.attr, + &vdo_pool_discards_active_attr.attr, + &vdo_pool_discards_limit_attr.attr, + &vdo_pool_discards_maximum_attr.attr, + &vdo_pool_instance_attr.attr, + &vdo_pool_requests_active_attr.attr, + &vdo_pool_requests_limit_attr.attr, + &vdo_pool_requests_maximum_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(pool); + +struct kobj_type vdo_directory_type = { + .release = vdo_pool_release, + .sysfs_ops = &vdo_pool_sysfs_ops, + .default_groups = pool_groups, +}; diff --git a/drivers/md/dm-vdo/pool-sysfs.h b/drivers/md/dm-vdo/pool-sysfs.h new file mode 100644 index 00000000000..1e8a172c367 --- /dev/null +++ b/drivers/md/dm-vdo/pool-sysfs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_POOL_SYSFS_H +#define VDO_POOL_SYSFS_H + +#include <linux/kobject.h> + +/* The kobj_type used for setting up the kernel layer kobject. */ +extern struct kobj_type vdo_directory_type; + +/* The sysfs_ops used for the "statistics" subdirectory. */ +extern const struct sysfs_ops vdo_pool_stats_sysfs_ops; +/* The attribute used for the "statistics" subdirectory. */ +extern struct attribute *vdo_pool_stats_attrs[]; + +#endif /* VDO_POOL_SYSFS_H */ diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c new file mode 100644 index 00000000000..d0fb949af87 --- /dev/null +++ b/drivers/md/dm-vdo/priority-table.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "priority-table.h" + +#include <linux/log2.h> + +#include "errors.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "status-codes.h" + +/* We use a single 64-bit search vector, so the maximum priority is 63 */ +enum { + MAX_PRIORITY = 63 +}; + +/* + * All the entries with the same priority are queued in a circular list in a bucket for that + * priority. The table is essentially an array of buckets. + */ +struct bucket { + /* + * The head of a queue of table entries, all having the same priority + */ + struct list_head queue; + /* The priority of all the entries in this bucket */ + unsigned int priority; +}; + +/* + * A priority table is an array of buckets, indexed by priority. New entries are added to the end + * of the queue in the appropriate bucket. The dequeue operation finds the highest-priority + * non-empty bucket by searching a bit vector represented as a single 8-byte word, which is very + * fast with compiler and CPU support. + */ +struct priority_table { + /* The maximum priority of entries that may be stored in this table */ + unsigned int max_priority; + /* A bit vector flagging all buckets that are currently non-empty */ + u64 search_vector; + /* The array of all buckets, indexed by priority */ + struct bucket buckets[]; +}; + +/** + * vdo_make_priority_table() - Allocate and initialize a new priority_table. + * @max_priority: The maximum priority value for table entries. + * @table_ptr: A pointer to hold the new table. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_priority_table(unsigned int max_priority, struct priority_table **table_ptr) +{ + struct priority_table *table; + int result; + unsigned int priority; + + if (max_priority > MAX_PRIORITY) + return UDS_INVALID_ARGUMENT; + + result = UDS_ALLOCATE_EXTENDED(struct priority_table, max_priority + 1, + struct bucket, __func__, &table); + if (result != VDO_SUCCESS) + return result; + + for (priority = 0; priority <= max_priority; priority++) { + struct bucket *bucket = &table->buckets[priority]; + + bucket->priority = priority; + INIT_LIST_HEAD(&bucket->queue); + } + + table->max_priority = max_priority; + table->search_vector = 0; + + *table_ptr = table; + return VDO_SUCCESS; +} + +/** + * vdo_free_priority_table() - Free a priority_table. + * @table: The table to free. + * + * The table does not own the entries stored in it and they are not freed by this call. + */ +void vdo_free_priority_table(struct priority_table *table) +{ + if (table == NULL) + return; + + /* + * Unlink the buckets from any entries still in the table so the entries won't be left with + * dangling pointers to freed memory. + */ + vdo_reset_priority_table(table); + + UDS_FREE(table); +} + +/** + * vdo_reset_priority_table() - Reset a priority table, leaving it in the same empty state as when + * newly constructed. + * @table: The table to reset. + * + * The table does not own the entries stored in it and they are not freed (or even unlinked from + * each other) by this call. + */ +void vdo_reset_priority_table(struct priority_table *table) +{ + unsigned int priority; + + table->search_vector = 0; + for (priority = 0; priority <= table->max_priority; priority++) + list_del_init(&table->buckets[priority].queue); +} + +/** + * vdo_priority_table_enqueue() - Add a new entry to the priority table, appending it to the queue + * for entries with the specified priority. + * @table: The table in which to store the entry. + * @priority: The priority of the entry. + * @entry: The list_head embedded in the entry to store in the table (the caller must have + * initialized it). + */ +void vdo_priority_table_enqueue(struct priority_table *table, + unsigned int priority, + struct list_head *entry) +{ + ASSERT_LOG_ONLY((priority <= table->max_priority), + "entry priority must be valid for the table"); + + /* Append the entry to the queue in the specified bucket. */ + list_move_tail(entry, &table->buckets[priority].queue); + + /* Flag the bucket in the search vector since it must be non-empty. */ + table->search_vector |= (1ULL << priority); +} + +static inline void mark_bucket_empty(struct priority_table *table, struct bucket *bucket) +{ + table->search_vector &= ~(1ULL << bucket->priority); +} + +/** + * vdo_priority_table_dequeue() - Find the highest-priority entry in the table, remove it from the + * table, and return it. + * @table: The priority table from which to remove an entry. + * + * If there are multiple entries with the same priority, the one that has been in the table with + * that priority the longest will be returned. + * + * Return: The dequeued entry, or NULL if the table is currently empty. + */ +struct list_head *vdo_priority_table_dequeue(struct priority_table *table) +{ + struct bucket *bucket; + struct list_head *entry; + int top_priority; + + if (table->search_vector == 0) + /* All buckets are empty. */ + return NULL; + + /* + * Find the highest priority non-empty bucket by finding the highest-order non-zero bit in + * the search vector. + */ + top_priority = ilog2(table->search_vector); + + /* Dequeue the first entry in the bucket. */ + bucket = &table->buckets[top_priority]; + entry = bucket->queue.next; + list_del_init(entry); + + /* Clear the bit in the search vector if the bucket has been emptied. */ + if (list_empty(&bucket->queue)) + mark_bucket_empty(table, bucket); + + return entry; +} + +/** + * vdo_priority_table_remove() - Remove a specified entry from its priority table. + * @table: The table from which to remove the entry. + * @entry: The entry to remove from the table. + */ +void vdo_priority_table_remove(struct priority_table *table, struct list_head *entry) +{ + struct list_head *next_entry; + + /* + * We can't guard against calls where the entry is on a list for a different table, but + * it's easy to deal with an entry not in any table or list. + */ + if (list_empty(entry)) + return; + + /* + * Remove the entry from the bucket list, remembering a pointer to another entry in the + * ring. + */ + next_entry = entry->next; + list_del_init(entry); + + /* + * If the rest of the list is now empty, the next node must be the list head in the bucket + * and we can use it to update the search vector. + */ + if (list_empty(next_entry)) + mark_bucket_empty(table, list_entry(next_entry, struct bucket, queue)); +} + +/** + * vdo_is_priority_table_empty() - Return whether the priority table is empty. + * @table: The table to check. + * + * Return: true if the table is empty. + */ +bool vdo_is_priority_table_empty(struct priority_table *table) +{ + return (table->search_vector == 0); +} diff --git a/drivers/md/dm-vdo/priority-table.h b/drivers/md/dm-vdo/priority-table.h new file mode 100644 index 00000000000..7c5f689dc2a --- /dev/null +++ b/drivers/md/dm-vdo/priority-table.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_PRIORITY_TABLE_H +#define VDO_PRIORITY_TABLE_H + +#include <linux/list.h> + +/* + * A priority_table is a simple implementation of a priority queue for entries with priorities that + * are small non-negative integer values. It implements the obvious priority queue operations of + * enqueuing an entry and dequeuing an entry with the maximum priority. It also supports removing + * an arbitrary entry. The priority of an entry already in the table can be changed by removing it + * and re-enqueuing it with a different priority. All operations have O(1) complexity. + * + * The links for the table entries must be embedded in the entries themselves. Lists are used to + * link entries in the table and no wrapper type is declared, so an existing list entry in an + * object can also be used to queue it in a priority_table, assuming the field is not used for + * anything else while so queued. + * + * The table is implemented as an array of queues (circular lists) indexed by priority, along with + * a hint for which queues are non-empty. Steven Skiena calls a very similar structure a "bounded + * height priority queue", but given the resemblance to a hash table, "priority table" seems both + * shorter and more apt, if somewhat novel. + */ + +struct priority_table; + +int __must_check +vdo_make_priority_table(unsigned int max_priority, struct priority_table **table_ptr); + +void vdo_free_priority_table(struct priority_table *table); + +void vdo_priority_table_enqueue(struct priority_table *table, + unsigned int priority, + struct list_head *entry); + +void vdo_reset_priority_table(struct priority_table *table); + +struct list_head * __must_check vdo_priority_table_dequeue(struct priority_table *table); + +void vdo_priority_table_remove(struct priority_table *table, struct list_head *entry); + +bool __must_check vdo_is_priority_table_empty(struct priority_table *table); + +#endif /* VDO_PRIORITY_TABLE_H */ diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c new file mode 100644 index 00000000000..09db9a9dffa --- /dev/null +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -0,0 +1,1772 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "recovery-journal.h" + +#include <linux/atomic.h> +#include <linux/bio.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "encodings.h" +#include "io-submitter.h" +#include "slab-depot.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" +#include "wait-queue.h" + +static const u64 RECOVERY_COUNT_MASK = 0xff; + +enum { + /* + * The number of reserved blocks must be large enough to prevent a new recovery journal + * block write from overwriting a block which appears to still be a valid head block of the + * journal. Currently, that means reserving enough space for all 2048 data_vios. + */ + RECOVERY_JOURNAL_RESERVED_BLOCKS = + (MAXIMUM_VDO_USER_VIOS / RECOVERY_JOURNAL_ENTRIES_PER_BLOCK) + 2, + WRITE_FLAGS = REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH | REQ_SYNC | REQ_FUA, +}; + +/** + * DOC: Lock Counters. + * + * A lock_counter is intended to keep all of the locks for the blocks in the recovery journal. The + * per-zone counters are all kept in a single array which is arranged by zone (i.e. zone 0's lock 0 + * is at index 0, zone 0's lock 1 is at index 1, and zone 1's lock 0 is at index 'locks'. This + * arrangement is intended to minimize cache-line contention for counters from different zones. + * + * The locks are implemented as a single object instead of as a lock counter per lock both to + * afford this opportunity to reduce cache line contention and also to eliminate the need to have a + * completion per lock. + * + * Lock sets are laid out with the set for recovery journal first, followed by the logical zones, + * and then the physical zones. + */ + +enum lock_counter_state { + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED, +}; + +/** + * get_zone_count_ptr() - Get a pointer to the zone count for a given lock on a given zone. + * @journal: The recovery journal. + * @lock_number: The lock to get. + * @zone_type: The zone type whose count is desired. + * + * Return: A pointer to the zone count for the given lock and zone. + */ +static inline atomic_t *get_zone_count_ptr(struct recovery_journal *journal, + block_count_t lock_number, + enum vdo_zone_type zone_type) +{ + return ((zone_type == VDO_ZONE_TYPE_LOGICAL) + ? &journal->lock_counter.logical_zone_counts[lock_number] + : &journal->lock_counter.physical_zone_counts[lock_number]); +} + +/** + * get_counter() - Get the zone counter for a given lock on a given zone. + * @journal: The recovery journal. + * @lock_number: The lock to get. + * @zone_type: The zone type whose count is desired. + * @zone_id: The zone index whose count is desired. + * + * Return: The counter for the given lock and zone. + */ +static inline u16 *get_counter(struct recovery_journal *journal, + block_count_t lock_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id) +{ + struct lock_counter *counter = &journal->lock_counter; + block_count_t zone_counter = (counter->locks * zone_id) + lock_number; + + if (zone_type == VDO_ZONE_TYPE_JOURNAL) + return &counter->journal_counters[zone_counter]; + + if (zone_type == VDO_ZONE_TYPE_LOGICAL) + return &counter->logical_counters[zone_counter]; + + return &counter->physical_counters[zone_counter]; +} + +static atomic_t *get_decrement_counter(struct recovery_journal *journal, block_count_t lock_number) +{ + return &journal->lock_counter.journal_decrement_counts[lock_number]; +} + +/** + * is_journal_zone_locked() - Check whether the journal zone is locked for a given lock. + * @journal: The recovery journal. + * @lock_number: The lock to check. + * + * Return: true if the journal zone is locked. + */ +static bool is_journal_zone_locked(struct recovery_journal *journal, block_count_t lock_number) +{ + u16 journal_value = *(get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0)); + u32 decrements = atomic_read(get_decrement_counter(journal, lock_number)); + + /* Pairs with barrier in vdo_release_journal_entry_lock() */ + smp_rmb(); + ASSERT_LOG_ONLY((decrements <= journal_value), + "journal zone lock counter must not underflow"); + return (journal_value != decrements); +} + +/** + * vdo_release_recovery_journal_block_reference() - Release a reference to a recovery journal + * block. + * @journal: The recovery journal. + * @sequence_number: The journal sequence number of the referenced block. + * @zone_type: The type of the zone making the adjustment. + * @zone_id: The ID of the zone making the adjustment. + * + * If this is the last reference for a given zone type, an attempt will be made to reap the + * journal. + */ +void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal, + sequence_number_t sequence_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id) +{ + u16 *current_value; + block_count_t lock_number; + int prior_state; + + if (sequence_number == 0) + return; + + lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number); + current_value = get_counter(journal, lock_number, zone_type, zone_id); + + ASSERT_LOG_ONLY((*current_value >= 1), "decrement of lock counter must not underflow"); + *current_value -= 1; + + if (zone_type == VDO_ZONE_TYPE_JOURNAL) { + if (is_journal_zone_locked(journal, lock_number)) + return; + } else { + atomic_t *zone_count; + + if (*current_value != 0) + return; + + zone_count = get_zone_count_ptr(journal, lock_number, zone_type); + + if (atomic_add_return(-1, zone_count) > 0) + return; + } + + /* + * Extra barriers because this was original developed using a CAS operation that implicitly + * had them. + */ + smp_mb__before_atomic(); + prior_state = atomic_cmpxchg(&journal->lock_counter.state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_NOTIFYING); + /* same as before_atomic */ + smp_mb__after_atomic(); + + if (prior_state != LOCK_COUNTER_STATE_NOT_NOTIFYING) + return; + + vdo_launch_completion(&journal->lock_counter.completion); +} + +static inline struct recovery_journal_block * __must_check +get_journal_block(struct list_head *list) +{ + return list_first_entry_or_null(list, struct recovery_journal_block, list_node); +} + +/** + * pop_free_list() - Get a block from the end of the free list. + * @journal: The journal. + * + * Return: The block or NULL if the list is empty. + */ +static struct recovery_journal_block * __must_check +pop_free_list(struct recovery_journal *journal) +{ + struct recovery_journal_block *block; + + if (list_empty(&journal->free_tail_blocks)) + return NULL; + + block = list_last_entry(&journal->free_tail_blocks, + struct recovery_journal_block, + list_node); + list_del_init(&block->list_node); + return block; +} + +/** + * is_block_dirty() - Check whether a recovery block is dirty. + * @block: The block to check. + * + * Indicates it has any uncommitted entries, which includes both entries not written and entries + * written but not yet acknowledged. + * + * Return: true if the block has any uncommitted entries. + */ +static inline bool __must_check is_block_dirty(const struct recovery_journal_block *block) +{ + return (block->uncommitted_entry_count > 0); +} + +/** + * is_block_empty() - Check whether a journal block is empty. + * @block: The block to check. + * + * Return: true if the block has no entries. + */ +static inline bool __must_check is_block_empty(const struct recovery_journal_block *block) +{ + return (block->entry_count == 0); +} + +/** + * is_block_full() - Check whether a journal block is full. + * @block: The block to check. + * + * Return: true if the block is full. + */ +static inline bool __must_check is_block_full(const struct recovery_journal_block *block) +{ + return ((block == NULL) || (block->journal->entries_per_block == block->entry_count)); +} + +/** + * assert_on_journal_thread() - Assert that we are running on the journal thread. + * @journal: The journal. + * @function_name: The function doing the check (for logging). + */ +static void assert_on_journal_thread(struct recovery_journal *journal, const char *function_name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->thread_id), + "%s() called on journal thread", function_name); +} + +/** + * continue_waiter() - Release a data_vio from the journal. + * + * Invoked whenever a data_vio is to be released from the journal, either because its entry was + * committed to disk, or because there was an error. Implements waiter_callback. + */ +static void continue_waiter(struct waiter *waiter, void *context) +{ + continue_data_vio_with_error(waiter_as_data_vio(waiter), *((int *) context)); +} + +/** + * has_block_waiters() - Check whether the journal has any waiters on any blocks. + * @journal: The journal in question. + * + * Return: true if any block has a waiter. + */ +static inline bool has_block_waiters(struct recovery_journal *journal) +{ + struct recovery_journal_block *block = get_journal_block(&journal->active_tail_blocks); + + /* + * Either the first active tail block (if it exists) has waiters, or no active tail block + * has waiters. + */ + return ((block != NULL) && + (vdo_has_waiters(&block->entry_waiters) || + vdo_has_waiters(&block->commit_waiters))); +} + +static void recycle_journal_blocks(struct recovery_journal *journal); +static void recycle_journal_block(struct recovery_journal_block *block); +static void notify_commit_waiters(struct recovery_journal *journal); + +/** + * suspend_lock_counter() - Prevent the lock counter from notifying. + * @counter: The counter. + * + * Return: true if the lock counter was not notifying and hence the suspend was efficacious. + */ +static bool suspend_lock_counter(struct lock_counter *counter) +{ + int prior_state; + + /* + * Extra barriers because this was originally developed using a CAS operation that + * implicitly had them. + */ + smp_mb__before_atomic(); + prior_state = atomic_cmpxchg(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED); + /* same as before_atomic */ + smp_mb__after_atomic(); + + return ((prior_state == LOCK_COUNTER_STATE_SUSPENDED) || + (prior_state == LOCK_COUNTER_STATE_NOT_NOTIFYING)); +} + +static inline bool is_read_only(struct recovery_journal *journal) +{ + return vdo_is_read_only(journal->flush_vio->completion.vdo); +} + +/** + * check_for_drain_complete() - Check whether the journal has drained. + * @journal: The journal which may have just drained. + */ +static void check_for_drain_complete(struct recovery_journal *journal) +{ + int result = VDO_SUCCESS; + + if (is_read_only(journal)) { + result = VDO_READ_ONLY; + /* + * Clean up any full active blocks which were not written due to read-only mode. + * + * FIXME: This would probably be better as a short-circuit in write_block(). + */ + notify_commit_waiters(journal); + recycle_journal_blocks(journal); + + /* Release any data_vios waiting to be assigned entries. */ + vdo_notify_all_waiters(&journal->entry_waiters, continue_waiter, &result); + } + + if (!vdo_is_state_draining(&journal->state) || + journal->reaping || + has_block_waiters(journal) || + vdo_has_waiters(&journal->entry_waiters) || + !suspend_lock_counter(&journal->lock_counter)) + return; + + if (vdo_is_state_saving(&journal->state)) { + if (journal->active_block != NULL) { + ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) || + !is_block_dirty(journal->active_block)), + "journal being saved has clean active block"); + recycle_journal_block(journal->active_block); + } + + ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks), + "all blocks in a journal being saved must be inactive"); + } + + vdo_finish_draining_with_result(&journal->state, result); +} + +/** + * notify_recovery_journal_of_read_only_mode() - Notify a recovery journal that the VDO has gone + * read-only. + * @listener: The journal. + * @parent: The completion to notify in order to acknowledge the notification. + * + * Implements vdo_read_only_notification. + */ +static void +notify_recovery_journal_of_read_only_mode(void *listener, struct vdo_completion *parent) +{ + check_for_drain_complete(listener); + vdo_finish_completion(parent); +} + +/** + * enter_journal_read_only_mode() - Put the journal in read-only mode. + * @journal: The journal which has failed. + * @error_code: The error result triggering this call. + * + * All attempts to add entries after this function is called will fail. All VIOs waiting for + * commits will be awakened with an error. + */ +static void enter_journal_read_only_mode(struct recovery_journal *journal, int error_code) +{ + vdo_enter_read_only_mode(journal->flush_vio->completion.vdo, error_code); + check_for_drain_complete(journal); +} + +/** + * vdo_get_recovery_journal_current_sequence_number() - Obtain the recovery journal's current + * sequence number. + * @journal: The journal in question. + * + * Exposed only so the block map can be initialized therefrom. + * + * Return: The sequence number of the tail block. + */ +sequence_number_t +vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal) +{ + return journal->tail; +} + +/** + * get_recovery_journal_head() - Get the head of the recovery journal. + * @journal: The journal. + * + * The head is the lowest sequence number of the block map head and the slab journal head. + * + * Return: the head of the journal. + */ +static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal) +{ + return min(journal->block_map_head, journal->slab_journal_head); +} + +/** + * compute_recovery_count_byte() - Compute the recovery count byte for a given recovery count. + * @recovery_count: The recovery count. + * + * Return: The byte corresponding to the recovery count. + */ +static inline u8 __must_check compute_recovery_count_byte(u64 recovery_count) +{ + return (u8)(recovery_count & RECOVERY_COUNT_MASK); +} + +/** + * check_slab_journal_commit_threshold() - Check whether the journal is over the threshold, and if + * so, force the oldest slab journal tail block to commit. + * @journal: The journal. + */ +static void check_slab_journal_commit_threshold(struct recovery_journal *journal) +{ + block_count_t current_length = journal->tail - journal->slab_journal_head; + + if (current_length > journal->slab_journal_commit_threshold) { + journal->events.slab_journal_commits_requested++; + vdo_commit_oldest_slab_journal_tail_blocks(journal->depot, + journal->slab_journal_head); + } +} + +static void reap_recovery_journal(struct recovery_journal *journal); +static void assign_entries(struct recovery_journal *journal); + +/** + * finish_reaping() - Finish reaping the journal. + * @journal: The journal being reaped. + */ +static void finish_reaping(struct recovery_journal *journal) +{ + block_count_t blocks_reaped; + sequence_number_t old_head = get_recovery_journal_head(journal); + + journal->block_map_head = journal->block_map_reap_head; + journal->slab_journal_head = journal->slab_journal_reap_head; + blocks_reaped = get_recovery_journal_head(journal) - old_head; + journal->available_space += blocks_reaped * journal->entries_per_block; + journal->reaping = false; + check_slab_journal_commit_threshold(journal); + assign_entries(journal); + check_for_drain_complete(journal); +} + +/** + * complete_reaping() - Finish reaping the journal after flushing the lower layer. + * @completion: The journal's flush VIO. + * + * This is the callback registered in reap_recovery_journal(). + */ +static void complete_reaping(struct vdo_completion *completion) +{ + struct recovery_journal *journal = completion->parent; + + finish_reaping(journal); + + /* Try reaping again in case more locks were released while flush was out. */ + reap_recovery_journal(journal); +} + +/** + * handle_flush_error() - Handle an error when flushing the lower layer due to reaping. + * @completion: The journal's flush VIO. + */ +static void handle_flush_error(struct vdo_completion *completion) +{ + struct recovery_journal *journal = completion->parent; + + vio_record_metadata_io_error(as_vio(completion)); + journal->reaping = false; + enter_journal_read_only_mode(journal, completion->result); +} + +static void flush_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct recovery_journal *journal = vio->completion.parent; + + continue_vio_after_io(vio, complete_reaping, journal->thread_id); +} + +/** + * initialize_journal_state() - Set all journal fields appropriately to start journaling from the + * current active block. + * @journal: The journal to be reset based on its active block. + */ +static void initialize_journal_state(struct recovery_journal *journal) +{ + journal->append_point.sequence_number = journal->tail; + journal->last_write_acknowledged = journal->tail; + journal->block_map_head = journal->tail; + journal->slab_journal_head = journal->tail; + journal->block_map_reap_head = journal->tail; + journal->slab_journal_reap_head = journal->tail; + journal->block_map_head_block_number = + vdo_get_recovery_journal_block_number(journal, journal->block_map_head); + journal->slab_journal_head_block_number = + vdo_get_recovery_journal_block_number(journal, journal->slab_journal_head); + journal->available_space = + (journal->entries_per_block * vdo_get_recovery_journal_length(journal->size)); +} + +/** + * vdo_get_recovery_journal_length() - Get the number of usable recovery journal blocks. + * @journal_size: The size of the recovery journal in blocks. + * + * Return: the number of recovery journal blocks usable for entries. + */ +block_count_t vdo_get_recovery_journal_length(block_count_t journal_size) +{ + block_count_t reserved_blocks = journal_size / 4; + + if (reserved_blocks > RECOVERY_JOURNAL_RESERVED_BLOCKS) + reserved_blocks = RECOVERY_JOURNAL_RESERVED_BLOCKS; + return (journal_size - reserved_blocks); +} + +/** + * reap_recovery_journal_callback() - Attempt to reap the journal. + * @completion: The lock counter completion. + * + * Attempts to reap the journal now that all the locks on some journal block have been released. + * This is the callback registered with the lock counter. + */ +static void reap_recovery_journal_callback(struct vdo_completion *completion) +{ + struct recovery_journal *journal = (struct recovery_journal *) completion->parent; + /* + * The acknowledgement must be done before reaping so that there is no race between + * acknowledging the notification and unlocks wishing to notify. + */ + smp_wmb(); + atomic_set(&journal->lock_counter.state, LOCK_COUNTER_STATE_NOT_NOTIFYING); + + if (vdo_is_state_quiescing(&journal->state)) { + /* + * Don't start reaping when the journal is trying to quiesce. Do check if this + * notification is the last thing the is waiting on. + */ + check_for_drain_complete(journal); + return; + } + + reap_recovery_journal(journal); + check_slab_journal_commit_threshold(journal); +} + +/** + * initialize_lock_counter() - Initialize a lock counter. + * + * @journal: The recovery journal. + * @vdo: The vdo. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check initialize_lock_counter(struct recovery_journal *journal, struct vdo *vdo) +{ + int result; + struct thread_config *config = &vdo->thread_config; + struct lock_counter *counter = &journal->lock_counter; + + result = UDS_ALLOCATE(journal->size, u16, __func__, &counter->journal_counters); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(journal->size, + atomic_t, + __func__, + &counter->journal_decrement_counts); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(journal->size * config->logical_zone_count, + u16, + __func__, + &counter->logical_counters); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(journal->size, atomic_t, __func__, &counter->logical_zone_counts); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(journal->size * config->physical_zone_count, + u16, + __func__, + &counter->physical_counters); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(journal->size, atomic_t, __func__, &counter->physical_zone_counts); + if (result != VDO_SUCCESS) + return result; + + vdo_initialize_completion(&counter->completion, vdo, VDO_LOCK_COUNTER_COMPLETION); + vdo_prepare_completion(&counter->completion, + reap_recovery_journal_callback, + reap_recovery_journal_callback, + config->journal_thread, + journal); + counter->logical_zones = config->logical_zone_count; + counter->physical_zones = config->physical_zone_count; + counter->locks = journal->size; + return VDO_SUCCESS; +} + +/** + * set_journal_tail() - Set the journal's tail sequence number. + * @journal: The journal whose tail is to be set. + * @tail: The new tail value. + */ +static void set_journal_tail(struct recovery_journal *journal, sequence_number_t tail) +{ + /* VDO does not support sequence numbers above 1 << 48 in the slab journal. */ + if (tail >= (1ULL << 48)) + enter_journal_read_only_mode(journal, VDO_JOURNAL_OVERFLOW); + + journal->tail = tail; +} + +/** + * initialize_recovery_block() - Initialize a journal block. + * @vdo: The vdo from which to construct vios. + * @journal: The journal to which the block will belong. + * @block: The block to initialize. + * + * Return: VDO_SUCCESS or an error. + */ +static int initialize_recovery_block(struct vdo *vdo, + struct recovery_journal *journal, + struct recovery_journal_block *block) +{ + char *data; + int result; + + /* + * Ensure that a block is large enough to store RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. + */ + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + <= ((VDO_BLOCK_SIZE - sizeof(struct packed_journal_header)) / + sizeof(struct packed_recovery_journal_entry))); + + /* + * Allocate a full block for the journal block even though not all of the space is used + * since the VIO needs to write a full disk block. + */ + result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, __func__, &data); + if (result != VDO_SUCCESS) + return result; + + result = allocate_vio_components(vdo, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_PRIORITY_HIGH, + block, + 1, + data, + &block->vio); + if (result != VDO_SUCCESS) { + UDS_FREE(data); + return result; + } + + list_add_tail(&block->list_node, &journal->free_tail_blocks); + block->journal = journal; + return VDO_SUCCESS; +} + +/** + * vdo_decode_recovery_journal() - Make a recovery journal and initialize it with the state that + * was decoded from the super block. + * + * @state: The decoded state of the journal. + * @nonce: The nonce of the VDO. + * @vdo: The VDO. + * @partition: The partition for the journal. + * @recovery_count: The VDO's number of completed recoveries. + * @journal_size: The number of blocks in the journal on disk. + * @journal_ptr: The pointer to hold the new recovery journal. + * + * Return: A success or error code. + */ +int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, + nonce_t nonce, + struct vdo *vdo, + struct partition *partition, + u64 recovery_count, + block_count_t journal_size, + struct recovery_journal **journal_ptr) +{ + block_count_t i; + struct recovery_journal *journal; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct recovery_journal, + RECOVERY_JOURNAL_RESERVED_BLOCKS, + struct recovery_journal_block, + __func__, + &journal); + if (result != VDO_SUCCESS) + return result; + + INIT_LIST_HEAD(&journal->free_tail_blocks); + INIT_LIST_HEAD(&journal->active_tail_blocks); + vdo_initialize_wait_queue(&journal->pending_writes); + + journal->thread_id = vdo->thread_config.journal_thread; + journal->origin = partition->offset; + journal->nonce = nonce; + journal->recovery_count = compute_recovery_count_byte(recovery_count); + journal->size = journal_size; + journal->slab_journal_commit_threshold = (journal_size * 2) / 3; + journal->logical_blocks_used = state.logical_blocks_used; + journal->block_map_data_blocks = state.block_map_data_blocks; + journal->entries_per_block = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; + set_journal_tail(journal, state.journal_start); + initialize_journal_state(journal); + /* TODO: this will have to change if we make initial resume of a VDO a real resume */ + vdo_set_admin_state_code(&journal->state, VDO_ADMIN_STATE_SUSPENDED); + + for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) { + struct recovery_journal_block *block = &journal->blocks[i]; + + result = initialize_recovery_block(vdo, journal, block); + if (result != VDO_SUCCESS) { + vdo_free_recovery_journal(journal); + return result; + } + } + + result = initialize_lock_counter(journal, vdo); + if (result != VDO_SUCCESS) { + vdo_free_recovery_journal(journal); + return result; + } + + result = create_metadata_vio(vdo, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_PRIORITY_HIGH, + journal, + NULL, + &journal->flush_vio); + if (result != VDO_SUCCESS) { + vdo_free_recovery_journal(journal); + return result; + } + + result = vdo_register_read_only_listener(vdo, + journal, + notify_recovery_journal_of_read_only_mode, + journal->thread_id); + if (result != VDO_SUCCESS) { + vdo_free_recovery_journal(journal); + return result; + } + + result = vdo_make_default_thread(vdo, journal->thread_id); + if (result != VDO_SUCCESS) { + vdo_free_recovery_journal(journal); + return result; + } + + journal->flush_vio->completion.callback_thread_id = journal->thread_id; + *journal_ptr = journal; + return VDO_SUCCESS; +} + +/** + * vdo_free_recovery_journal() - Free a recovery journal. + * @journal: The recovery journal to free. + */ +void vdo_free_recovery_journal(struct recovery_journal *journal) +{ + block_count_t i; + + if (journal == NULL) + return; + + UDS_FREE(UDS_FORGET(journal->lock_counter.logical_zone_counts)); + UDS_FREE(UDS_FORGET(journal->lock_counter.physical_zone_counts)); + UDS_FREE(UDS_FORGET(journal->lock_counter.journal_counters)); + UDS_FREE(UDS_FORGET(journal->lock_counter.journal_decrement_counts)); + UDS_FREE(UDS_FORGET(journal->lock_counter.logical_counters)); + UDS_FREE(UDS_FORGET(journal->lock_counter.physical_counters)); + free_vio(UDS_FORGET(journal->flush_vio)); + + /* + * FIXME: eventually, the journal should be constructed in a quiescent state which + * requires opening before use. + */ + if (!vdo_is_state_quiescent(&journal->state)) + ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks), + "journal being freed has no active tail blocks"); + else if (!vdo_is_state_saved(&journal->state) && !list_empty(&journal->active_tail_blocks)) + uds_log_warning("journal being freed has uncommitted entries"); + + for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) { + struct recovery_journal_block *block = &journal->blocks[i]; + + UDS_FREE(UDS_FORGET(block->vio.data)); + free_vio_components(&block->vio); + } + + UDS_FREE(journal); +} + +/** + * vdo_initialize_recovery_journal_post_repair() - Initialize the journal after a repair. + * @journal: The journal in question. + * @recovery_count: The number of completed recoveries. + * @tail: The new tail block sequence number. + * @logical_blocks_used: The new number of logical blocks used. + * @block_map_data_blocks: The new number of block map data blocks. + */ +void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal, + u64 recovery_count, + sequence_number_t tail, + block_count_t logical_blocks_used, + block_count_t block_map_data_blocks) +{ + set_journal_tail(journal, tail + 1); + journal->recovery_count = compute_recovery_count_byte(recovery_count); + initialize_journal_state(journal); + journal->logical_blocks_used = logical_blocks_used; + journal->block_map_data_blocks = block_map_data_blocks; +} + +/** + * vdo_get_journal_block_map_data_blocks_used() - Get the number of block map pages, allocated from + * data blocks, currently in use. + * @journal: The journal in question. + * + * Return: The number of block map pages allocated from slabs. + */ +block_count_t vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal) +{ + return journal->block_map_data_blocks; +} + +/** + * vdo_get_recovery_journal_thread_id() - Get the ID of a recovery journal's thread. + * @journal: The journal to query. + * + * Return: The ID of the journal's thread. + */ +thread_id_t vdo_get_recovery_journal_thread_id(struct recovery_journal *journal) +{ + return journal->thread_id; +} + +/** + * vdo_open_recovery_journal() - Prepare the journal for new entries. + * @journal: The journal in question. + * @depot: The slab depot for this VDO. + * @block_map: The block map for this VDO. + */ +void vdo_open_recovery_journal(struct recovery_journal *journal, + struct slab_depot *depot, + struct block_map *block_map) +{ + journal->depot = depot; + journal->block_map = block_map; + WRITE_ONCE(journal->state.current_state, + VDO_ADMIN_STATE_NORMAL_OPERATION); +} + +/** + * vdo_record_recovery_journal() - Record the state of a recovery journal for encoding in the super + * block. + * @journal: the recovery journal. + * + * Return: the state of the journal. + */ +struct recovery_journal_state_7_0 +vdo_record_recovery_journal(const struct recovery_journal *journal) +{ + struct recovery_journal_state_7_0 state = { + .logical_blocks_used = journal->logical_blocks_used, + .block_map_data_blocks = journal->block_map_data_blocks, + }; + + if (vdo_is_state_saved(&journal->state)) + /* + * If the journal is saved, we should start one past the active block (since the + * active block is not guaranteed to be empty). + */ + state.journal_start = journal->tail; + else + /* + * When we're merely suspended or have gone read-only, we must record the first + * block that might have entries that need to be applied. + */ + state.journal_start = get_recovery_journal_head(journal); + + return state; +} + +/** + * get_block_header() - Get a pointer to the packed journal block header in the block buffer. + * @block: The recovery block. + * + * Return: The block's header. + */ +static inline struct packed_journal_header * +get_block_header(const struct recovery_journal_block *block) +{ + return (struct packed_journal_header *) block->vio.data; +} + +/** + * set_active_sector() - Set the current sector of the current block and initialize it. + * @block: The block to update. + * @sector: A pointer to the first byte of the new sector. + */ +static void set_active_sector(struct recovery_journal_block *block, void *sector) +{ + block->sector = (struct packed_journal_sector *) sector; + block->sector->check_byte = get_block_header(block)->check_byte; + block->sector->recovery_count = block->journal->recovery_count; + block->sector->entry_count = 0; +} + +/** + * advance_tail() - Advance the tail of the journal. + * @journal: The journal whose tail should be advanced. + * + * Return: true if the tail was advanced. + */ +static bool advance_tail(struct recovery_journal *journal) +{ + struct recovery_block_header unpacked; + struct packed_journal_header *header; + struct recovery_journal_block *block; + + block = journal->active_block = pop_free_list(journal); + if (block == NULL) + return false; + + list_move_tail(&block->list_node, &journal->active_tail_blocks); + + unpacked = (struct recovery_block_header) { + .metadata_type = VDO_METADATA_RECOVERY_JOURNAL_2, + .block_map_data_blocks = journal->block_map_data_blocks, + .logical_blocks_used = journal->logical_blocks_used, + .nonce = journal->nonce, + .recovery_count = journal->recovery_count, + .sequence_number = journal->tail, + .check_byte = vdo_compute_recovery_journal_check_byte(journal, journal->tail), + }; + + header = get_block_header(block); + memset(block->vio.data, 0x0, VDO_BLOCK_SIZE); + block->sequence_number = journal->tail; + block->entry_count = 0; + block->uncommitted_entry_count = 0; + block->block_number = vdo_get_recovery_journal_block_number(journal, journal->tail); + + vdo_pack_recovery_block_header(&unpacked, header); + set_active_sector(block, vdo_get_journal_block_sector(header, 1)); + set_journal_tail(journal, journal->tail + 1); + vdo_advance_block_map_era(journal->block_map, journal->tail); + return true; +} + +/** + * initialize_lock_count() - Initialize the value of the journal zone's counter for a given lock. + * @journal: The recovery journal. + * + * Context: This must be called from the journal zone. + */ +static void initialize_lock_count(struct recovery_journal *journal) +{ + u16 *journal_value; + block_count_t lock_number = journal->active_block->block_number; + atomic_t *decrement_counter = get_decrement_counter(journal, lock_number); + + journal_value = get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0); + ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_counter)), + "count to be initialized not in use"); + *journal_value = journal->entries_per_block + 1; + atomic_set(decrement_counter, 0); +} + +/** + * prepare_to_assign_entry() - Prepare the currently active block to receive an entry and check + * whether an entry of the given type may be assigned at this time. + * @journal: The journal receiving an entry. + * + * Return: true if there is space in the journal to store an entry of the specified type. + */ +static bool prepare_to_assign_entry(struct recovery_journal *journal) +{ + if (journal->available_space == 0) + return false; + + if (is_block_full(journal->active_block) && !advance_tail(journal)) + return false; + + if (!is_block_empty(journal->active_block)) + return true; + + if ((journal->tail - get_recovery_journal_head(journal)) > journal->size) { + /* Cannot use this block since the journal is full. */ + journal->events.disk_full++; + return false; + } + + /* + * Don't allow the new block to be reaped until all of its entries have been committed to + * the block map and until the journal block has been fully committed as well. Because the + * block map update is done only after any slab journal entries have been made, the + * per-entry lock for the block map entry serves to protect those as well. + */ + initialize_lock_count(journal); + return true; +} + +static void write_blocks(struct recovery_journal *journal); + +/** + * schedule_block_write() - Queue a block for writing. + * @journal: The journal in question. + * @block: The block which is now ready to write. + * + * The block is expected to be full. If the block is currently writing, this is a noop as the block + * will be queued for writing when the write finishes. The block must not currently be queued for + * writing. + */ +static void +schedule_block_write(struct recovery_journal *journal, struct recovery_journal_block *block) +{ + if (!block->committing) + vdo_enqueue_waiter(&journal->pending_writes, &block->write_waiter); + /* + * At the end of adding entries, or discovering this partial block is now full and ready to + * rewrite, we will call write_blocks() and write a whole batch. + */ +} + +/** + * release_journal_block_reference() - Release a reference to a journal block. + * @block: The journal block from which to release a reference. + */ +static void release_journal_block_reference(struct recovery_journal_block *block) +{ + vdo_release_recovery_journal_block_reference(block->journal, + block->sequence_number, + VDO_ZONE_TYPE_JOURNAL, + 0); +} + +static void update_usages(struct recovery_journal *journal, struct data_vio *data_vio) +{ + if (data_vio->increment_updater.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { + journal->block_map_data_blocks++; + return; + } + + if (data_vio->new_mapped.state != VDO_MAPPING_STATE_UNMAPPED) + journal->logical_blocks_used++; + + if (data_vio->mapped.state != VDO_MAPPING_STATE_UNMAPPED) + journal->logical_blocks_used--; +} + +/** + * assign_entry() - Assign an entry waiter to the active block. + * + * Implements waiter_callback. + */ +static void assign_entry(struct waiter *waiter, void *context) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + struct recovery_journal_block *block = (struct recovery_journal_block *) context; + struct recovery_journal *journal = block->journal; + + /* Record the point at which we will make the journal entry. */ + data_vio->recovery_journal_point = (struct journal_point) { + .sequence_number = block->sequence_number, + .entry_count = block->entry_count, + }; + + update_usages(journal, data_vio); + journal->available_space--; + + if (!vdo_has_waiters(&block->entry_waiters)) + journal->events.blocks.started++; + + vdo_enqueue_waiter(&block->entry_waiters, &data_vio->waiter); + block->entry_count++; + block->uncommitted_entry_count++; + journal->events.entries.started++; + + if (is_block_full(block)) + /* + * The block is full, so we can write it anytime henceforth. If it is already + * committing, we'll queue it for writing when it comes back. + */ + schedule_block_write(journal, block); + + /* Force out slab journal tail blocks when threshold is reached. */ + check_slab_journal_commit_threshold(journal); +} + +static void assign_entries(struct recovery_journal *journal) +{ + if (journal->adding_entries) + /* Protect against re-entrancy. */ + return; + + journal->adding_entries = true; + while (vdo_has_waiters(&journal->entry_waiters) && prepare_to_assign_entry(journal)) + vdo_notify_next_waiter(&journal->entry_waiters, + assign_entry, + journal->active_block); + + /* Now that we've finished with entries, see if we have a batch of blocks to write. */ + write_blocks(journal); + journal->adding_entries = false; +} + +/** + * recycle_journal_block() - Prepare an in-memory journal block to be reused now that it has been + * fully committed. + * @block: The block to be recycled. + */ +static void recycle_journal_block(struct recovery_journal_block *block) +{ + struct recovery_journal *journal = block->journal; + block_count_t i; + + list_move_tail(&block->list_node, &journal->free_tail_blocks); + + /* Release any unused entry locks. */ + for (i = block->entry_count; i < journal->entries_per_block; i++) + release_journal_block_reference(block); + + /* + * Release our own lock against reaping now that the block is completely committed, or + * we're giving up because we're in read-only mode. + */ + if (block->entry_count > 0) + release_journal_block_reference(block); + + if (block == journal->active_block) + journal->active_block = NULL; +} + +/** + * continue_committed_waiter() - invoked whenever a VIO is to be released from the journal because + * its entry was committed to disk. + * + * Implements waiter_callback. + */ +static void continue_committed_waiter(struct waiter *waiter, void *context) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + struct recovery_journal *journal = (struct recovery_journal *)context; + int result = (is_read_only(journal) ? VDO_READ_ONLY : VDO_SUCCESS); + bool has_decrement; + + ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point, + &data_vio->recovery_journal_point), + "DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)", + (unsigned long long) journal->commit_point.sequence_number, + journal->commit_point.entry_count, + (unsigned long long) data_vio->recovery_journal_point.sequence_number, + data_vio->recovery_journal_point.entry_count); + + journal->commit_point = data_vio->recovery_journal_point; + data_vio->last_async_operation = VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS; + if (result != VDO_SUCCESS) { + continue_data_vio_with_error(data_vio, result); + return; + } + + /* + * The increment must be launched first since it must come before the + * decrement if they are in the same slab. + */ + has_decrement = (data_vio->decrement_updater.zpbn.pbn != VDO_ZERO_BLOCK); + if ((data_vio->increment_updater.zpbn.pbn != VDO_ZERO_BLOCK) || !has_decrement) + continue_data_vio(data_vio); + + if (has_decrement) + vdo_launch_completion(&data_vio->decrement_completion); +} + +/** + * notify_commit_waiters() - Notify any VIOs whose entries have now committed. + * @journal: The recovery journal to update. + */ +static void notify_commit_waiters(struct recovery_journal *journal) +{ + struct recovery_journal_block *block; + + list_for_each_entry(block, &journal->active_tail_blocks, list_node) { + if (block->committing) + return; + + vdo_notify_all_waiters(&block->commit_waiters, continue_committed_waiter, journal); + if (is_read_only(journal)) + vdo_notify_all_waiters(&block->entry_waiters, + continue_committed_waiter, + journal); + else if (is_block_dirty(block) || !is_block_full(block)) + /* Stop at partially-committed or partially-filled blocks. */ + return; + } +} + +/** + * recycle_journal_blocks() - Recycle any journal blocks which have been fully committed. + * @journal: The recovery journal to update. + */ +static void recycle_journal_blocks(struct recovery_journal *journal) +{ + struct recovery_journal_block *block, *tmp; + + list_for_each_entry_safe(block, tmp, &journal->active_tail_blocks, list_node) { + if (block->committing) + /* Don't recycle committing blocks. */ + return; + + if (!is_read_only(journal) && (is_block_dirty(block) || !is_block_full(block))) + /* + * Don't recycle partially written or partially full blocks, except in + * read-only mode. + */ + return; + + recycle_journal_block(block); + } +} + +/** + * complete_write() - Handle post-commit processing. + * @completion: The completion of the VIO writing this block. + * + * This is the callback registered by write_block(). If more entries accumulated in the block being + * committed while the commit was in progress, another commit will be initiated. + */ +static void complete_write(struct vdo_completion *completion) +{ + struct recovery_journal_block *block = completion->parent; + struct recovery_journal *journal = block->journal; + struct recovery_journal_block *last_active_block; + + assert_on_journal_thread(journal, __func__); + + journal->pending_write_count -= 1; + journal->events.blocks.committed += 1; + journal->events.entries.committed += block->entries_in_commit; + block->uncommitted_entry_count -= block->entries_in_commit; + block->entries_in_commit = 0; + block->committing = false; + + /* If this block is the latest block to be acknowledged, record that fact. */ + if (block->sequence_number > journal->last_write_acknowledged) + journal->last_write_acknowledged = block->sequence_number; + + last_active_block = get_journal_block(&journal->active_tail_blocks); + ASSERT_LOG_ONLY((block->sequence_number >= last_active_block->sequence_number), + "completed journal write is still active"); + + notify_commit_waiters(journal); + + /* + * Is this block now full? Reaping, and adding entries, might have already sent it off for + * rewriting; else, queue it for rewrite. + */ + if (is_block_dirty(block) && is_block_full(block)) + schedule_block_write(journal, block); + + recycle_journal_blocks(journal); + write_blocks(journal); + + check_for_drain_complete(journal); +} + +static void handle_write_error(struct vdo_completion *completion) +{ + struct recovery_journal_block *block = completion->parent; + struct recovery_journal *journal = block->journal; + + vio_record_metadata_io_error(as_vio(completion)); + uds_log_error_strerror(completion->result, + "cannot write recovery journal block %llu", + (unsigned long long) block->sequence_number); + enter_journal_read_only_mode(journal, completion->result); + complete_write(completion); +} + +static void complete_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct recovery_journal_block *block = vio->completion.parent; + struct recovery_journal *journal = block->journal; + + continue_vio_after_io(vio, complete_write, journal->thread_id); +} + +/** + * add_queued_recovery_entries() - Actually add entries from the queue to the given block. + * @block: The journal block. + */ +static void add_queued_recovery_entries(struct recovery_journal_block *block) +{ + while (vdo_has_waiters(&block->entry_waiters)) { + struct data_vio *data_vio = + waiter_as_data_vio(vdo_dequeue_next_waiter(&block->entry_waiters)); + struct tree_lock *lock = &data_vio->tree_lock; + struct packed_recovery_journal_entry *packed_entry; + struct recovery_journal_entry new_entry; + + if (block->sector->entry_count == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) + set_active_sector(block, (char *) block->sector + VDO_SECTOR_SIZE); + + /* Compose and encode the entry. */ + packed_entry = &block->sector->entries[block->sector->entry_count++]; + new_entry = (struct recovery_journal_entry) { + .mapping = { + .pbn = data_vio->increment_updater.zpbn.pbn, + .state = data_vio->increment_updater.zpbn.state, + }, + .unmapping = { + .pbn = data_vio->decrement_updater.zpbn.pbn, + .state = data_vio->decrement_updater.zpbn.state, + }, + .operation = data_vio->increment_updater.operation, + .slot = lock->tree_slots[lock->height].block_map_slot, + }; + *packed_entry = vdo_pack_recovery_journal_entry(&new_entry); + data_vio->recovery_sequence_number = block->sequence_number; + + /* Enqueue the data_vio to wait for its entry to commit. */ + vdo_enqueue_waiter(&block->commit_waiters, &data_vio->waiter); + } +} + +/** + * write_block() - Issue a block for writing. + * + * Implements waiter_callback. + */ +static void write_block(struct waiter *waiter, void *context __always_unused) +{ + struct recovery_journal_block *block = + container_of(waiter, struct recovery_journal_block, write_waiter); + struct recovery_journal *journal = block->journal; + struct packed_journal_header *header = get_block_header(block); + + if (block->committing || !vdo_has_waiters(&block->entry_waiters) || is_read_only(journal)) + return; + + block->entries_in_commit = vdo_count_waiters(&block->entry_waiters); + add_queued_recovery_entries(block); + + journal->pending_write_count += 1; + journal->events.blocks.written += 1; + journal->events.entries.written += block->entries_in_commit; + + header->block_map_head = __cpu_to_le64(journal->block_map_head); + header->slab_journal_head = __cpu_to_le64(journal->slab_journal_head); + header->entry_count = __cpu_to_le16(block->entry_count); + + block->committing = true; + + /* + * We must issue a flush and a FUA for every commit. The flush is necessary to ensure that + * the data being referenced is stable. The FUA is necessary to ensure that the journal + * block itself is stable before allowing overwrites of the lbn's previous data. + */ + submit_metadata_vio(&block->vio, + journal->origin + block->block_number, + complete_write_endio, + handle_write_error, + WRITE_FLAGS); +} + + +/** + * write_blocks() - Attempt to commit blocks, according to write policy. + * @journal: The recovery journal. + */ +static void write_blocks(struct recovery_journal *journal) +{ + assert_on_journal_thread(journal, __func__); + /* + * We call this function after adding entries to the journal and after finishing a block + * write. Thus, when this function terminates we must either have no VIOs waiting in the + * journal or have some outstanding IO to provide a future wakeup. + * + * We want to only issue full blocks if there are no pending writes. However, if there are + * no outstanding writes and some unwritten entries, we must issue a block, even if it's + * the active block and it isn't full. + */ + if (journal->pending_write_count > 0) + return; + + /* Write all the full blocks. */ + vdo_notify_all_waiters(&journal->pending_writes, write_block, NULL); + + /* + * Do we need to write the active block? Only if we have no outstanding writes, even after + * issuing all of the full writes. + */ + if ((journal->pending_write_count == 0) && (journal->active_block != NULL)) + write_block(&journal->active_block->write_waiter, NULL); +} + +/** + * vdo_add_recovery_journal_entry() - Add an entry to a recovery journal. + * @journal: The journal in which to make an entry. + * @data_vio: The data_vio for which to add the entry. The entry will be taken + * from the logical and new_mapped fields of the data_vio. The + * data_vio's recovery_sequence_number field will be set to the + * sequence number of the journal block in which the entry was + * made. + * + * This method is asynchronous. The data_vio will not be called back until the entry is committed + * to the on-disk journal. + */ +void vdo_add_recovery_journal_entry(struct recovery_journal *journal, struct data_vio *data_vio) +{ + assert_on_journal_thread(journal, __func__); + if (!vdo_is_state_normal(&journal->state)) { + continue_data_vio_with_error(data_vio, VDO_INVALID_ADMIN_STATE); + return; + } + + if (is_read_only(journal)) { + continue_data_vio_with_error(data_vio, VDO_READ_ONLY); + return; + } + + ASSERT_LOG_ONLY(data_vio->recovery_sequence_number == 0, + "journal lock not held for new entry"); + + vdo_advance_journal_point(&journal->append_point, journal->entries_per_block); + vdo_enqueue_waiter(&journal->entry_waiters, &data_vio->waiter); + assign_entries(journal); +} + +/** + * is_lock_locked() - Check whether a lock is locked for a zone type. + * @journal: The recovery journal. + * @lock_number: The lock to check. + * @zone_type: The type of the zone. + * + * If the recovery journal has a lock on the lock number, both logical and physical zones are + * considered locked. + * + * Return: true if the specified lock has references (is locked). + */ +static bool +is_lock_locked(struct recovery_journal *journal, + block_count_t lock_number, + enum vdo_zone_type zone_type) +{ + atomic_t *zone_count; + bool locked; + + if (is_journal_zone_locked(journal, lock_number)) + return true; + + zone_count = get_zone_count_ptr(journal, lock_number, zone_type); + locked = (atomic_read(zone_count) != 0); + /* Pairs with implicit barrier in vdo_release_recovery_journal_block_reference() */ + smp_rmb(); + return locked; +} + +/** + * reap_recovery_journal() - Conduct a sweep on a recovery journal to reclaim unreferenced blocks. + * @journal: The recovery journal. + */ +static void reap_recovery_journal(struct recovery_journal *journal) +{ + if (journal->reaping) + /* + * We already have an outstanding reap in progress. We need to wait for it to + * finish. + */ + return; + + if (vdo_is_state_quiescent(&journal->state)) + /* We are supposed to not do IO. Don't botch it by reaping. */ + return; + + /* + * Start reclaiming blocks only when the journal head has no references. Then stop when a + * block is referenced. + */ + while ((journal->block_map_reap_head < journal->last_write_acknowledged) && + !is_lock_locked(journal, + journal->block_map_head_block_number, + VDO_ZONE_TYPE_LOGICAL)) { + journal->block_map_reap_head++; + if (++journal->block_map_head_block_number == journal->size) + journal->block_map_head_block_number = 0; + } + + while ((journal->slab_journal_reap_head < journal->last_write_acknowledged) && + !is_lock_locked(journal, + journal->slab_journal_head_block_number, + VDO_ZONE_TYPE_PHYSICAL)) { + journal->slab_journal_reap_head++; + if (++journal->slab_journal_head_block_number == journal->size) + journal->slab_journal_head_block_number = 0; + } + + if ((journal->block_map_reap_head == journal->block_map_head) && + (journal->slab_journal_reap_head == journal->slab_journal_head)) + /* Nothing happened. */ + return; + + /* + * If the block map head will advance, we must flush any block map page modified by the + * entries we are reaping. If the slab journal head will advance, we must flush the slab + * summary update covering the slab journal that just released some lock. + */ + journal->reaping = true; + submit_flush_vio(journal->flush_vio, flush_endio, handle_flush_error); +} + +/** + * vdo_acquire_recovery_journal_block_reference() - Acquire a reference to a recovery journal block + * from somewhere other than the journal itself. + * @journal: The recovery journal. + * @sequence_number: The journal sequence number of the referenced block. + * @zone_type: The type of the zone making the adjustment. + * @zone_id: The ID of the zone making the adjustment. + */ +void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal, + sequence_number_t sequence_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id) +{ + block_count_t lock_number; + u16 *current_value; + + if (sequence_number == 0) + return; + + ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL), + "invalid lock count increment from journal zone"); + + lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number); + current_value = get_counter(journal, lock_number, zone_type, zone_id); + ASSERT_LOG_ONLY(*current_value < U16_MAX, + "increment of lock counter must not overflow"); + + if (*current_value == 0) { + /* + * This zone is acquiring this lock for the first time. Extra barriers because this + * was original developed using an atomic add operation that implicitly had them. + */ + smp_mb__before_atomic(); + atomic_inc(get_zone_count_ptr(journal, lock_number, zone_type)); + /* same as before_atomic */ + smp_mb__after_atomic(); + } + *current_value += 1; +} + +/** + * vdo_release_journal_entry_lock() - Release a single per-entry reference count for a recovery + * journal block. + * @journal: The recovery journal. + * @sequence_number: The journal sequence number of the referenced block. + */ +void vdo_release_journal_entry_lock(struct recovery_journal *journal, + sequence_number_t sequence_number) +{ + block_count_t lock_number; + + if (sequence_number == 0) + return; + + lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number); + /* + * Extra barriers because this was originally developed using an atomic add operation that + * implicitly had them. + */ + smp_mb__before_atomic(); + atomic_inc(get_decrement_counter(journal, lock_number)); + /* same as before_atomic */ + smp_mb__after_atomic(); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + check_for_drain_complete(container_of(state, struct recovery_journal, state)); +} + +/** + * vdo_drain_recovery_journal() - Drain recovery journal I/O. + * @journal: The journal to drain. + * @operation: The drain operation (suspend or save). + * @parent: The completion to notify once the journal is drained. + * + * All uncommitted entries will be written out. + */ +void vdo_drain_recovery_journal(struct recovery_journal *journal, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + assert_on_journal_thread(journal, __func__); + vdo_start_draining(&journal->state, operation, parent, initiate_drain); +} + +/** + * resume_lock_counter() - Re-allow notifications from a suspended lock counter. + * @counter: The counter. + * + * Return: true if the lock counter was suspended. + */ +static bool resume_lock_counter(struct lock_counter *counter) +{ + int prior_state; + + /* + * Extra barriers because this was original developed using a CAS operation that implicitly + * had them. + */ + smp_mb__before_atomic(); + prior_state = atomic_cmpxchg(&counter->state, + LOCK_COUNTER_STATE_SUSPENDED, + LOCK_COUNTER_STATE_NOT_NOTIFYING); + /* same as before_atomic */ + smp_mb__after_atomic(); + + return (prior_state == LOCK_COUNTER_STATE_SUSPENDED); +} + +/** + * vdo_resume_recovery_journal() - Resume a recovery journal which has been drained. + * @journal: The journal to resume. + * @parent: The completion to finish once the journal is resumed. + */ +void vdo_resume_recovery_journal(struct recovery_journal *journal, struct vdo_completion *parent) +{ + bool saved; + + assert_on_journal_thread(journal, __func__); + saved = vdo_is_state_saved(&journal->state); + vdo_set_completion_result(parent, vdo_resume_if_quiescent(&journal->state)); + if (is_read_only(journal)) { + vdo_continue_completion(parent, VDO_READ_ONLY); + return; + } + + if (saved) + initialize_journal_state(journal); + + if (resume_lock_counter(&journal->lock_counter)) + /* We might have missed a notification. */ + reap_recovery_journal(journal); + + vdo_launch_completion(parent); +} + +/** + * vdo_get_recovery_journal_logical_blocks_used() - Get the number of logical blocks in use by the + * VDO. + * @journal: The journal. + * + * Return: The number of logical blocks in use by the VDO. + */ +block_count_t vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal) +{ + return journal->logical_blocks_used; +} + +/** + * vdo_get_recovery_journal_statistics() - Get the current statistics from the recovery journal. + * @journal: The recovery journal to query. + * + * Return: A copy of the current statistics for the journal. + */ +struct recovery_journal_statistics +vdo_get_recovery_journal_statistics(const struct recovery_journal *journal) +{ + return journal->events; +} + +/** + * dump_recovery_block() - Dump the contents of the recovery block to the log. + * @block: The block to dump. + */ +static void dump_recovery_block(const struct recovery_journal_block *block) +{ + uds_log_info(" sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters", + (unsigned long long) block->sequence_number, + block->entry_count, + (block->committing ? "committing" : "waiting"), + vdo_count_waiters(&block->entry_waiters), + vdo_count_waiters(&block->commit_waiters)); +} + +/** + * vdo_dump_recovery_journal_statistics() - Dump some current statistics and other debug info from + * the recovery journal. + * @journal: The recovery journal to dump. + */ +void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal) +{ + const struct recovery_journal_block *block; + struct recovery_journal_statistics stats = vdo_get_recovery_journal_statistics(journal); + + uds_log_info("Recovery Journal"); + uds_log_info(" block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu entry_waiters=%zu", + (unsigned long long) journal->block_map_head, + (unsigned long long) journal->slab_journal_head, + (unsigned long long) journal->last_write_acknowledged, + (unsigned long long) journal->tail, + (unsigned long long) journal->block_map_reap_head, + (unsigned long long) journal->slab_journal_reap_head, + (unsigned long long) stats.disk_full, + (unsigned long long) stats.slab_journal_commits_requested, + vdo_count_waiters(&journal->entry_waiters)); + uds_log_info(" entries: started=%llu written=%llu committed=%llu", + (unsigned long long) stats.entries.started, + (unsigned long long) stats.entries.written, + (unsigned long long) stats.entries.committed); + uds_log_info(" blocks: started=%llu written=%llu committed=%llu", + (unsigned long long) stats.blocks.started, + (unsigned long long) stats.blocks.written, + (unsigned long long) stats.blocks.committed); + + uds_log_info(" active blocks:"); + list_for_each_entry(block, &journal->active_tail_blocks, list_node) + dump_recovery_block(block); +} diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h new file mode 100644 index 00000000000..5845f1e4b4c --- /dev/null +++ b/drivers/md/dm-vdo/recovery-journal.h @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_RECOVERY_JOURNAL_H +#define VDO_RECOVERY_JOURNAL_H + +#include <linux/list.h> + +#include "numeric.h" + +#include "admin-state.h" +#include "constants.h" +#include "encodings.h" +#include "flush.h" +#include "statistics.h" +#include "types.h" +#include "wait-queue.h" + +/** + * DOC: recovery journal. + * + * The recovery_journal provides a log of all block mapping and reference count changes which have + * not yet been stably written to the block map or slab journals. This log helps to reduce the + * write amplification of writes by providing amortization of slab journal and block map page + * updates. + * + * The journal consists of a set of on-disk blocks arranged as a circular log with monotonically + * increasing sequence numbers. Three sequence numbers serve to define the active extent of the + * journal. The 'head' is the oldest active block in the journal. The 'tail' is the end of the + * half-open interval containing the active blocks. 'active' is the number of the block actively + * receiving entries. In an empty journal, head == active == tail. Once any entries are added, tail + * = active + 1, and head may be any value in the interval [tail - size, active]. + * + * The journal also contains a set of in-memory blocks which are used to buffer up entries until + * they can be committed. In general the number of in-memory blocks ('tail_buffer_count') will be + * less than the on-disk size. Each in-memory block is also a vdo_completion. Each in-memory block + * has a vio which is used to commit that block to disk. The vio's data is the on-disk + * representation of the journal block. In addition each in-memory block has a buffer which is used + * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are + * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active + * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is + * moved back to the 'free_tail_blocks' ring. + * + * When entries are added to the journal, they are added to the active in-memory block, as + * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be + * committed, the requesting VIO will be attached to the in-memory block to which the caller's + * entry was added. If the caller does wish to wait, or if the entry filled the active block, an + * attempt will be made to commit that block to disk. If there is already another commit in + * progress, the attempt will be ignored and then automatically retried when the in-progress commit + * completes. If there is no commit in progress, any data_vios waiting on the block are transferred + * to the block's vio which is then written, automatically waking all of the waiters when it + * completes. When the write completes, any entries which accumulated in the block are copied to + * the vio's data buffer. + * + * Finally, the journal maintains a set of counters, one for each on disk journal block. These + * counters are used as locks to prevent premature reaping of journal blocks. Each time a new + * sequence number is used, the counter for the corresponding block is incremented. The counter is + * subsequently decremented when that block is filled and then committed for the last time. This + * prevents blocks from being reaped while they are still being updated. The counter is also + * incremented once for each entry added to a block, and decremented once each time the block map + * is updated in memory for that request. This prevents blocks from being reaped while their VIOs + * are still active. Finally, each in-memory block map page tracks the oldest journal block that + * contains entries corresponding to uncommitted updates to that block map page. Each time an + * in-memory block map page is updated, it checks if the journal block for the VIO is earlier than + * the one it references, in which case it increments the count on the earlier journal block and + * decrements the count on the later journal block, maintaining a lock on the oldest journal block + * containing entries for that page. When a block map page has been flushed from the cache, the + * counter for the journal block it references is decremented. Whenever the counter for the head + * block goes to 0, the head is advanced until it comes to a block whose counter is not 0 or until + * it reaches the active block. This is the mechanism for reclaiming journal space on disk. + * + * If there is no in-memory space when a VIO attempts to add an entry, the VIO will be attached to + * the 'commit_completion' and will be woken the next time a full block has committed. If there is + * no on-disk space when a VIO attempts to add an entry, the VIO will be attached to the + * 'reap_completion', and will be woken the next time a journal block is reaped. + */ + +enum vdo_zone_type { + VDO_ZONE_TYPE_ADMIN, + VDO_ZONE_TYPE_JOURNAL, + VDO_ZONE_TYPE_LOGICAL, + VDO_ZONE_TYPE_PHYSICAL, +}; + +struct lock_counter { + /** The completion for notifying the owner of a lock release */ + struct vdo_completion completion; + /** The number of logical zones which may hold locks */ + zone_count_t logical_zones; + /** The number of physical zones which may hold locks */ + zone_count_t physical_zones; + /** The number of locks */ + block_count_t locks; + /** Whether the lock release notification is in flight */ + atomic_t state; + /** The number of logical zones which hold each lock */ + atomic_t *logical_zone_counts; + /** The number of physical zones which hold each lock */ + atomic_t *physical_zone_counts; + /** The per-lock counts for the journal zone */ + u16 *journal_counters; + /** The per-lock decrement counts for the journal zone */ + atomic_t *journal_decrement_counts; + /** The per-zone, per-lock reference counts for logical zones */ + u16 *logical_counters; + /** The per-zone, per-lock reference counts for physical zones */ + u16 *physical_counters; +}; + +struct recovery_journal_block { + /* The doubly linked pointers for the free or active lists */ + struct list_head list_node; + /* The waiter for the pending full block list */ + struct waiter write_waiter; + /* The journal to which this block belongs */ + struct recovery_journal *journal; + /* A pointer to the current sector in the packed block buffer */ + struct packed_journal_sector *sector; + /* The vio for writing this block */ + struct vio vio; + /* The sequence number for this block */ + sequence_number_t sequence_number; + /* The location of this block in the on-disk journal */ + physical_block_number_t block_number; + /* Whether this block is being committed */ + bool committing; + /* The total number of entries in this block */ + journal_entry_count_t entry_count; + /* The total number of uncommitted entries (queued or committing) */ + journal_entry_count_t uncommitted_entry_count; + /* The number of new entries in the current commit */ + journal_entry_count_t entries_in_commit; + /* The queue of vios which will make entries for the next commit */ + struct wait_queue entry_waiters; + /* The queue of vios waiting for the current commit */ + struct wait_queue commit_waiters; +}; + +struct recovery_journal { + /* The thread ID of the journal zone */ + thread_id_t thread_id; + /* The slab depot which can hold locks on this journal */ + struct slab_depot *depot; + /* The block map which can hold locks on this journal */ + struct block_map *block_map; + /* The queue of vios waiting to make entries */ + struct wait_queue entry_waiters; + /* The number of free entries in the journal */ + u64 available_space; + /* The number of decrement entries which need to be made */ + data_vio_count_t pending_decrement_count; + /* Whether the journal is adding entries from the increment or decrement waiters queues */ + bool adding_entries; + /* The administrative state of the journal */ + struct admin_state state; + /* Whether a reap is in progress */ + bool reaping; + /* The location of the first journal block */ + physical_block_number_t origin; + /* The oldest active block in the journal on disk for block map rebuild */ + sequence_number_t block_map_head; + /* The oldest active block in the journal on disk for slab journal replay */ + sequence_number_t slab_journal_head; + /* The newest block in the journal on disk to which a write has finished */ + sequence_number_t last_write_acknowledged; + /* The end of the half-open interval of the active journal */ + sequence_number_t tail; + /* The point at which the last entry will have been added */ + struct journal_point append_point; + /* The journal point of the vio most recently released from the journal */ + struct journal_point commit_point; + /* The nonce of the VDO */ + nonce_t nonce; + /* The number of recoveries completed by the VDO */ + u8 recovery_count; + /* The number of entries which fit in a single block */ + journal_entry_count_t entries_per_block; + /* Unused in-memory journal blocks */ + struct list_head free_tail_blocks; + /* In-memory journal blocks with records */ + struct list_head active_tail_blocks; + /* A pointer to the active block (the one we are adding entries to now) */ + struct recovery_journal_block *active_block; + /* Journal blocks that need writing */ + struct wait_queue pending_writes; + /* The new block map reap head after reaping */ + sequence_number_t block_map_reap_head; + /* The head block number for the block map rebuild range */ + block_count_t block_map_head_block_number; + /* The new slab journal reap head after reaping */ + sequence_number_t slab_journal_reap_head; + /* The head block number for the slab journal replay range */ + block_count_t slab_journal_head_block_number; + /* The data-less vio, usable only for flushing */ + struct vio *flush_vio; + /* The number of blocks in the on-disk journal */ + block_count_t size; + /* The number of logical blocks that are in-use */ + block_count_t logical_blocks_used; + /* The number of block map pages that are allocated */ + block_count_t block_map_data_blocks; + /* The number of journal blocks written but not yet acknowledged */ + block_count_t pending_write_count; + /* The threshold at which slab journal tail blocks will be written out */ + block_count_t slab_journal_commit_threshold; + /* Counters for events in the journal that are reported as statistics */ + struct recovery_journal_statistics events; + /* The locks for each on-disk block */ + struct lock_counter lock_counter; + /* The tail blocks */ + struct recovery_journal_block blocks[]; +}; + +/** + * vdo_get_recovery_journal_block_number() - Get the physical block number for a given sequence + * number. + * @journal: The journal. + * @sequence: The sequence number of the desired block. + * + * Return: The block number corresponding to the sequence number. + */ +static inline physical_block_number_t __must_check +vdo_get_recovery_journal_block_number(const struct recovery_journal *journal, + sequence_number_t sequence) +{ + /* + * Since journal size is a power of two, the block number modulus can just be extracted + * from the low-order bits of the sequence. + */ + return vdo_compute_recovery_journal_block_number(journal->size, sequence); +} + +/** + * vdo_compute_recovery_journal_check_byte() - Compute the check byte for a given sequence number. + * @journal: The journal. + * @sequence: The sequence number. + * + * Return: The check byte corresponding to the sequence number. + */ +static inline u8 __must_check +vdo_compute_recovery_journal_check_byte(const struct recovery_journal *journal, + sequence_number_t sequence) +{ + /* The check byte must change with each trip around the journal. */ + return (((sequence / journal->size) & 0x7F) | 0x80); +} + +int __must_check vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, + nonce_t nonce, + struct vdo *vdo, + struct partition *partition, + u64 recovery_count, + block_count_t journal_size, + struct recovery_journal **journal_ptr); + +void vdo_free_recovery_journal(struct recovery_journal *journal); + +void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal, + u64 recovery_count, + sequence_number_t tail, + block_count_t logical_blocks_used, + block_count_t block_map_data_blocks); + +block_count_t __must_check +vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal); + +thread_id_t __must_check vdo_get_recovery_journal_thread_id(struct recovery_journal *journal); + +void vdo_open_recovery_journal(struct recovery_journal *journal, + struct slab_depot *depot, + struct block_map *block_map); + +sequence_number_t +vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal); + +block_count_t __must_check vdo_get_recovery_journal_length(block_count_t journal_size); + +struct recovery_journal_state_7_0 __must_check +vdo_record_recovery_journal(const struct recovery_journal *journal); + +void vdo_add_recovery_journal_entry(struct recovery_journal *journal, struct data_vio *data_vio); + +void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal, + sequence_number_t sequence_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id); + +void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal, + sequence_number_t sequence_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id); + +void vdo_release_journal_entry_lock(struct recovery_journal *journal, + sequence_number_t sequence_number); + +void vdo_drain_recovery_journal(struct recovery_journal *journal, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_recovery_journal(struct recovery_journal *journal, + struct vdo_completion *parent); + +block_count_t __must_check +vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal); + +struct recovery_journal_statistics __must_check +vdo_get_recovery_journal_statistics(const struct recovery_journal *journal); + +void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal); + +#endif /* VDO_RECOVERY_JOURNAL_H */ diff --git a/drivers/md/dm-vdo/release-versions.h b/drivers/md/dm-vdo/release-versions.h new file mode 100644 index 00000000000..0abc13c04b8 --- /dev/null +++ b/drivers/md/dm-vdo/release-versions.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef RELEASE_VERSIONS_H +#define RELEASE_VERSIONS_H + +enum { + VDO_OXYGEN_RELEASE_VERSION_NUMBER = 109583, + VDO_FLUORINE_RELEASE_VERSION_NUMBER = 115838, + VDO_NEON_RELEASE_VERSION_NUMBER = 120965, + VDO_SODIUM_RELEASE_VERSION_NUMBER = 127441, + VDO_MAGNESIUM_RELEASE_VERSION_NUMBER = 131337, + VDO_ALUMINUM_RELEASE_VERSION_NUMBER = 133524, + VDO_HEAD_RELEASE_VERSION_NUMBER = 0, + VDO_CURRENT_RELEASE_VERSION_NUMBER = VDO_HEAD_RELEASE_VERSION_NUMBER, +}; + +#endif /* not RELEASE_VERSIONS_H */ diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c new file mode 100644 index 00000000000..455b9a89230 --- /dev/null +++ b/drivers/md/dm-vdo/repair.c @@ -0,0 +1,1775 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "repair.h" + +#include <linux/min_heap.h> +#include <linux/minmax.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "encodings.h" +#include "int-map.h" +#include "io-submitter.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "types.h" +#include "vdo.h" +#include "wait-queue.h" + +/* + * An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by logical + * block number during repair while still preserving the relative order of journal entries with + * the same logical block number. + */ +struct numbered_block_mapping { + struct block_map_slot block_map_slot; + struct block_map_entry block_map_entry; + /* A serial number to use during replay */ + u32 number; +} __packed; + +/* + * The absolute position of an entry in the recovery journal, including the sector number and the + * entry number within the sector. + */ +struct recovery_point { + /* Block sequence number */ + sequence_number_t sequence_number; + /* Sector number */ + u8 sector_count; + /* Entry number */ + journal_entry_count_t entry_count; + /* Whether or not the increment portion of the current entry has been applied */ + bool increment_applied; +}; + +struct repair_completion { + /* The completion header */ + struct vdo_completion completion; + + /* A buffer to hold the data read off disk */ + char *journal_data; + + /* For loading the journal */ + data_vio_count_t vio_count; + data_vio_count_t vios_complete; + struct vio *vios; + + /* The number of entries to be applied to the block map */ + size_t block_map_entry_count; + /* The sequence number of the first valid block for block map recovery */ + sequence_number_t block_map_head; + /* The sequence number of the first valid block for slab journal replay */ + sequence_number_t slab_journal_head; + /* The sequence number of the last valid block of the journal (if known) */ + sequence_number_t tail; + /* + * The highest sequence number of the journal. During recovery (vs read-only rebuild), not + * the same as the tail, since the tail ignores blocks after the first hole. + */ + sequence_number_t highest_tail; + + /* The number of logical blocks currently known to be in use */ + block_count_t logical_blocks_used; + /* The number of block map data blocks known to be allocated */ + block_count_t block_map_data_blocks; + + /* These fields are for playing the journal into the block map */ + /* The entry data for the block map recovery */ + struct numbered_block_mapping *entries; + /* The number of entries in the entry array */ + size_t entry_count; + /* number of pending (non-ready) requests*/ + page_count_t outstanding; + /* number of page completions */ + page_count_t page_count; + bool launching; + /* + * a heap wrapping journal_entries. It re-orders and sorts journal entries in ascending LBN + * order, then original journal order. This permits efficient iteration over the journal + * entries in order. + */ + struct min_heap replay_heap; + /* Fields tracking progress through the journal entries. */ + struct numbered_block_mapping *current_entry; + struct numbered_block_mapping *current_unfetched_entry; + /* Current requested page's PBN */ + physical_block_number_t pbn; + + /* These fields are only used during recovery. */ + /* A location just beyond the last valid entry of the journal */ + struct recovery_point tail_recovery_point; + /* The location of the next recovery journal entry to apply */ + struct recovery_point next_recovery_point; + /* The journal point to give to the next synthesized decref */ + struct journal_point next_journal_point; + /* The number of entries played into slab journals */ + size_t entries_added_to_slab_journals; + + /* These fields are only used during read-only rebuild */ + page_count_t page_to_fetch; + /* the number of leaf pages in the block map */ + page_count_t leaf_pages; + /* the last slot of the block map */ + struct block_map_slot last_slot; + + /* + * The page completions used for playing the journal into the block map, and, during + * read-only rebuild, for rebuilding the reference counts from the block map. + */ + struct vdo_page_completion page_completions[]; +}; + +/* + * This is a min_heap callback function that orders numbered_block_mappings using the + * 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key. + * Using the mapping number preserves the journal order of entries for the same slot, allowing us + * to sort by slot while still ensuring we replay all entries with the same slot in the exact order + * as they appeared in the journal. + */ +static bool mapping_is_less_than(const void *item1, const void *item2) +{ + const struct numbered_block_mapping *mapping1 = + (const struct numbered_block_mapping *) item1; + const struct numbered_block_mapping *mapping2 = + (const struct numbered_block_mapping *) item2; + + if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn) + return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn; + + if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot) + return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot; + + if (mapping1->number != mapping2->number) + return mapping1->number < mapping2->number; + + return 0; +} + +static void swap_mappings(void *item1, void *item2) +{ + struct numbered_block_mapping *mapping1 = item1; + struct numbered_block_mapping *mapping2 = item2; + + swap(*mapping1, *mapping2); +} + +static const struct min_heap_callbacks repair_min_heap = { + .elem_size = sizeof(struct numbered_block_mapping), + .less = mapping_is_less_than, + .swp = swap_mappings, +}; + +static struct numbered_block_mapping * +sort_next_heap_element(struct repair_completion *repair) +{ + struct min_heap *heap = &repair->replay_heap; + struct numbered_block_mapping *last; + + if (heap->nr == 0) + return NULL; + + /* + * Swap the next heap element with the last one on the heap, popping it off the heap, + * restore the heap invariant, and return a pointer to the popped element. + */ + last = &repair->entries[--heap->nr]; + swap_mappings(heap->data, last); + min_heapify(heap, 0, &repair_min_heap); + return last; +} + +/** + * as_repair_completion() - Convert a generic completion to a repair_completion. + * @completion: The completion to convert. + * + * Return: The repair_completion. + */ +static inline struct repair_completion * __must_check +as_repair_completion(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION); + return container_of(completion, struct repair_completion, completion); +} + +static void prepare_repair_completion(struct repair_completion *repair, + vdo_action *callback, + enum vdo_zone_type zone_type) +{ + struct vdo_completion *completion = &repair->completion; + const struct thread_config *thread_config = &completion->vdo->thread_config; + thread_id_t thread_id; + + /* All blockmap access is done on single thread, so use logical zone 0. */ + thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ? + thread_config->logical_threads[0] : + thread_config->admin_thread); + vdo_reset_completion(completion); + vdo_set_completion_callback(completion, callback, thread_id); +} + +static void launch_repair_completion(struct repair_completion *repair, + vdo_action *callback, + enum vdo_zone_type zone_type) +{ + prepare_repair_completion(repair, callback, zone_type); + vdo_launch_completion(&repair->completion); +} + +static void uninitialize_vios(struct repair_completion *repair) +{ + while (repair->vio_count > 0) + free_vio_components(&repair->vios[--repair->vio_count]); + + UDS_FREE(UDS_FORGET(repair->vios)); +} + +static void free_repair_completion(struct repair_completion *repair) +{ + if (repair == NULL) + return; + + /* + * We do this here because this function is the only common bottleneck for all clean up + * paths. + */ + repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false; + + uninitialize_vios(repair); + UDS_FREE(UDS_FORGET(repair->journal_data)); + UDS_FREE(UDS_FORGET(repair->entries)); + UDS_FREE(repair); +} + +static void finish_repair(struct vdo_completion *completion) +{ + struct vdo_completion *parent = completion->parent; + struct vdo *vdo = completion->vdo; + struct repair_completion *repair = as_repair_completion(completion); + + vdo_assert_on_admin_thread(vdo, __func__); + + if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE) + vdo->states.vdo.complete_recoveries++; + + vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal, + vdo->states.vdo.complete_recoveries, + repair->highest_tail, + repair->logical_blocks_used, + repair->block_map_data_blocks); + free_repair_completion(UDS_FORGET(repair)); + + if (vdo_state_requires_read_only_rebuild(vdo->load_state)) { + uds_log_info("Read-only rebuild complete"); + vdo_launch_completion(parent); + return; + } + + /* FIXME: shouldn't this say either "recovery" or "repair"? */ + uds_log_info("Rebuild complete"); + + /* + * Now that we've freed the repair completion and its vast array of journal entries, we + * can allocate refcounts. + */ + vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot)); +} + +/** + * abort_repair() - Handle a repair error. + * @completion: The repair completion. + */ +static void abort_repair(struct vdo_completion *completion) +{ + struct vdo_completion *parent = completion->parent; + int result = completion->result; + struct repair_completion *repair = as_repair_completion(completion); + + if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) + uds_log_info("Read-only rebuild aborted"); + else + uds_log_warning("Recovery aborted"); + + free_repair_completion(UDS_FORGET(repair)); + vdo_continue_completion(parent, result); +} + +/** + * abort_on_error() - Abort a repair if there is an error. + * @result: The result to check. + * @repair: The repair completion. + * + * Return: true if the result was an error. + */ +static bool __must_check abort_on_error(int result, struct repair_completion *repair) +{ + if (result == VDO_SUCCESS) + return false; + + vdo_fail_completion(&repair->completion, result); + return true; +} + +/** + * drain_slab_depot() - Flush out all dirty refcounts blocks now that they have been rebuilt or + * recovered. + */ +static void drain_slab_depot(struct vdo_completion *completion) +{ + struct vdo *vdo = completion->vdo; + struct repair_completion *repair = as_repair_completion(completion); + const struct admin_state_code *operation; + + vdo_assert_on_admin_thread(vdo, __func__); + + prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN); + if (vdo_state_requires_read_only_rebuild(vdo->load_state)) { + uds_log_info("Saving rebuilt state"); + operation = VDO_ADMIN_STATE_REBUILDING; + } else { + uds_log_info("Replayed %zu journal entries into slab journals", + repair->entries_added_to_slab_journals); + operation = VDO_ADMIN_STATE_RECOVERING; + } + + vdo_drain_slab_depot(vdo->depot, operation, completion); +} + +/** + * flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt. + * @completion: The repair completion. + * + * This callback is registered in finish_if_done(). + */ +static void flush_block_map_updates(struct vdo_completion *completion) +{ + vdo_assert_on_admin_thread(completion->vdo, __func__); + + uds_log_info("Flushing block map changes"); + prepare_repair_completion(as_repair_completion(completion), + drain_slab_depot, + VDO_ZONE_TYPE_ADMIN); + vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING, completion); +} + +static bool fetch_page(struct repair_completion *repair, struct vdo_completion *completion); + +/** + * handle_page_load_error() - Handle an error loading a page. + * @completion: The vdo_page_completion. + */ +static void handle_page_load_error(struct vdo_completion *completion) +{ + struct repair_completion *repair = completion->parent; + + repair->outstanding--; + vdo_set_completion_result(&repair->completion, completion->result); + vdo_release_page_completion(completion); + fetch_page(repair, completion); +} + +/** + * Unmap an invalid entry and indicate that its page must be written out. + * @page: The page containing the entries + * @completion: The page_completion for writing the page + * @slot: The slot to unmap + */ +static void +unmap_entry(struct block_map_page *page, struct vdo_completion *completion, slot_number_t slot) +{ + page->entries[slot] = vdo_pack_block_map_entry(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); + vdo_request_page_write(completion); +} + +/** + * remove_out_of_bounds_entries(): Unmap entries which outside the logical space. + * @page: The page containing the entries + * @completion: The page_completion for writing the page + * @start: The first slot to check + */ +static void remove_out_of_bounds_entries(struct block_map_page *page, + struct vdo_completion *completion, + slot_number_t start) +{ + slot_number_t slot; + + for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]); + + if (vdo_is_mapped_location(&mapping)) + unmap_entry(page, completion, slot); + } +} + +/** + * process_slot(): Update the reference counts for a single entry. + * @page: The page containing the entries + * @completion: The page_completion for writing the page + * @slot: The slot to check + * + * Return: true if the entry was a valid mapping + */ +static bool +process_slot(struct block_map_page *page, struct vdo_completion *completion, slot_number_t slot) +{ + struct slab_depot *depot = completion->vdo->depot; + int result; + struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]); + + if (!vdo_is_valid_location(&mapping)) { + /* This entry is invalid, so remove it from the page. */ + unmap_entry(page, completion, slot); + return false; + } + + if (!vdo_is_mapped_location(&mapping)) + return false; + + + if (mapping.pbn == VDO_ZERO_BLOCK) + return true; + + if (!vdo_is_physical_data_block(depot, mapping.pbn)) { + /* + * This is a nonsense mapping. Remove it from the map so we're at least consistent + * and mark the page dirty. + */ + unmap_entry(page, completion, slot); + return false; + } + + result = vdo_adjust_reference_count_for_rebuild(depot, + mapping.pbn, + VDO_JOURNAL_DATA_REMAPPING); + if (result == VDO_SUCCESS) + return true; + + uds_log_error_strerror(result, + "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu", + (unsigned long long) vdo_get_block_map_page_pbn(page), + slot, + (unsigned long long) mapping.pbn); + unmap_entry(page, completion, slot); + return false; +} + +/** + * rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page. + * @repair: The repair completion. + * @completion: The page completion holding the page. + */ +static void rebuild_reference_counts_from_page(struct repair_completion *repair, + struct vdo_completion *completion) +{ + slot_number_t slot, last_slot; + struct block_map_page *page; + int result; + + result = vdo_get_cached_page(completion, &page); + if (result != VDO_SUCCESS) { + vdo_set_completion_result(&repair->completion, result); + return; + } + + if (!page->header.initialized) + return; + + /* Remove any bogus entries which exist beyond the end of the logical space. */ + if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) { + last_slot = repair->last_slot.slot; + remove_out_of_bounds_entries(page, completion, last_slot); + } else { + last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + } + + /* Inform the slab depot of all entries on this page. */ + for (slot = 0; slot < last_slot; slot++) { + if (process_slot(page, completion, slot)) + repair->logical_blocks_used++; + } +} + +/** + * page_loaded() - Process a page which has just been loaded. + * @completion: The vdo_page_completion for the fetched page. + * + * This callback is registered by fetch_page(). + */ +static void page_loaded(struct vdo_completion *completion) +{ + struct repair_completion *repair = completion->parent; + + repair->outstanding--; + rebuild_reference_counts_from_page(repair, completion); + vdo_release_page_completion(completion); + + /* Advance progress to the next page, and fetch the next page we haven't yet requested. */ + fetch_page(repair, completion); +} + +static physical_block_number_t +get_pbn_to_fetch(struct repair_completion *repair, struct block_map *block_map) +{ + physical_block_number_t pbn = VDO_ZERO_BLOCK; + + if (repair->completion.result != VDO_SUCCESS) + return VDO_ZERO_BLOCK; + + while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages)) + pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++); + + if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn)) + return pbn; + + vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING); + return VDO_ZERO_BLOCK; +} + +/** + * fetch_page() - Fetch a page from the block map. + * @repair: The repair_completion. + * @completion: The page completion to use. + * + * Return true if the rebuild is complete + */ +static bool fetch_page(struct repair_completion *repair, struct vdo_completion *completion) +{ + struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion; + struct block_map *block_map = repair->completion.vdo->block_map; + physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map); + + if (pbn != VDO_ZERO_BLOCK) { + repair->outstanding++; + /* + * We must set the requeue flag here to ensure that we don't blow the stack if all + * the requested pages are already in the cache or get load errors. + */ + vdo_get_page(page_completion, + &block_map->zones[0], + pbn, + true, + repair, + page_loaded, + handle_page_load_error, + true); + } + + if (repair->outstanding > 0) + return false; + + launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN); + return true; +} + +/** + * rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages. + * @completion: The repair completion. + * + * Rebuilds reference counts from the leaf block map pages now that reference counts have been + * rebuilt from the interior tree pages (which have been loaded in the process). This callback is + * registered in rebuild_reference_counts(). + */ +static void rebuild_from_leaves(struct vdo_completion *completion) +{ + page_count_t i; + struct repair_completion *repair = as_repair_completion(completion); + struct block_map *map = completion->vdo->block_map; + + repair->logical_blocks_used = 0; + + /* + * The PBN calculation doesn't work until the tree pages have been loaded, so we can't set + * this value at the start of repair. + */ + repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count); + repair->last_slot = (struct block_map_slot) { + .slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE, + .pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1), + }; + if (repair->last_slot.slot == 0) + repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + + for (i = 0; i < repair->page_count; i++) { + if (fetch_page(repair, &repair->page_completions[i].completion)) + /* + * The rebuild has already moved on, so it isn't safe nor is there a need + * to launch any more fetches. + */ + return; + } +} + +/** + * process_entry() - Process a single entry from the block map tree. + * @pbn: A pbn which holds a block map tree page. + * @completion: The parent completion of the traversal. + * + * Implements vdo_entry_callback. + * + * Return: VDO_SUCCESS or an error. + */ +static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion); + struct slab_depot *depot = completion->vdo->depot; + int result; + + if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn)) + return uds_log_error_strerror(VDO_BAD_CONFIGURATION, + "PBN %llu out of range", + (unsigned long long) pbn); + + result = vdo_adjust_reference_count_for_rebuild(depot, + pbn, + VDO_JOURNAL_BLOCK_MAP_REMAPPING); + if (result != VDO_SUCCESS) + return uds_log_error_strerror(result, + "Could not adjust reference count for block map tree PBN %llu", + (unsigned long long) pbn); + + repair->block_map_data_blocks++; + return VDO_SUCCESS; +} + +static void rebuild_reference_counts(struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion); + struct vdo *vdo = completion->vdo; + struct vdo_page_cache *cache = &vdo->block_map->zones[0].page_cache; + + /* We must allocate ref_counts before we can rebuild them. */ + if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair)) + return; + + /* + * Completion chaining from page cache hits can lead to stack overflow during the rebuild, + * so clear out the cache before this rebuild phase. + */ + if (abort_on_error(vdo_invalidate_page_cache(cache), repair)) + return; + + prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL); + vdo_traverse_forest(vdo->block_map, process_entry, completion); +} + +/** + * increment_recovery_point() - Move the given recovery point forward by one entry. + */ +static void increment_recovery_point(struct recovery_point *point) +{ + if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) + return; + + point->entry_count = 0; + if (point->sector_count < (VDO_SECTORS_PER_BLOCK - 1)) { + point->sector_count++; + return; + } + + point->sequence_number++; + point->sector_count = 1; +} + +/** + * advance_points() - Advance the current recovery and journal points. + * @repair: The repair_completion whose points are to be advanced. + * @entries_per_block: The number of entries in a recovery journal block. + */ +static void +advance_points(struct repair_completion *repair, journal_entry_count_t entries_per_block) +{ + if (!repair->next_recovery_point.increment_applied) { + repair->next_recovery_point.increment_applied = true; + return; + } + + increment_recovery_point(&repair->next_recovery_point); + vdo_advance_journal_point(&repair->next_journal_point, entries_per_block); + repair->next_recovery_point.increment_applied = false; +} + +/** + * before_recovery_point() - Check whether the first point precedes the second point. + * @first: The first recovery point. + * @second: The second recovery point. + * + * Return: true if the first point precedes the second point. + */ +static bool __must_check +before_recovery_point(const struct recovery_point *first, const struct recovery_point *second) +{ + if (first->sequence_number < second->sequence_number) + return true; + + if (first->sequence_number > second->sequence_number) + return false; + + if (first->sector_count < second->sector_count) + return true; + + return ((first->sector_count == second->sector_count) && + (first->entry_count < second->entry_count)); +} + +static struct packed_journal_sector * __must_check +get_sector(struct recovery_journal *journal, + char *journal_data, + sequence_number_t sequence, + u8 sector_number) +{ + off_t offset; + + offset = ((vdo_get_recovery_journal_block_number(journal, sequence) * VDO_BLOCK_SIZE) + + (VDO_SECTOR_SIZE * sector_number)); + return (struct packed_journal_sector *) (journal_data + offset); +} + +/** + * get_entry() - Unpack the recovery journal entry associated with the given recovery point. + * @repair: The repair completion. + * @point: The recovery point. + * + * Return: The unpacked contents of the matching recovery journal entry. + */ +static struct recovery_journal_entry +get_entry(const struct repair_completion *repair, const struct recovery_point *point) +{ + struct packed_journal_sector *sector; + + sector = get_sector(repair->completion.vdo->recovery_journal, + repair->journal_data, + point->sequence_number, + point->sector_count); + return vdo_unpack_recovery_journal_entry(§or->entries[point->entry_count]); +} + +/** + * validate_recovery_journal_entry() - Validate a recovery journal entry. + * @vdo: The vdo. + * @entry: The entry to validate. + * + * Return: VDO_SUCCESS or an error. + */ +static int +validate_recovery_journal_entry(const struct vdo *vdo, const struct recovery_journal_entry *entry) +{ + if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) || + (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) || + !vdo_is_valid_location(&entry->mapping) || + !vdo_is_valid_location(&entry->unmapping) || + !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) || + !vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn)) + return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds", + vdo_get_journal_operation_name(entry->operation), + (unsigned long long) entry->slot.pbn, + entry->slot.slot, + (unsigned long long) entry->unmapping.pbn, + (unsigned long long) entry->mapping.pbn); + + if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) && + (vdo_is_state_compressed(entry->mapping.state) || + (entry->mapping.pbn == VDO_ZERO_BLOCK) || + (entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) || + (entry->unmapping.pbn != VDO_ZERO_BLOCK))) + return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping", + vdo_get_journal_operation_name(entry->operation), + (unsigned long long) entry->slot.pbn, + entry->slot.slot, + (unsigned long long) entry->unmapping.pbn, + (unsigned long long) entry->mapping.pbn); + + return VDO_SUCCESS; +} + +/** + * add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the + * allocator currently being recovered. + * @completion: The allocator completion. + * + * Waits for slab journal tailblock space when necessary. This method is its own callback. + */ +static void add_slab_journal_entries(struct vdo_completion *completion) +{ + struct recovery_point *recovery_point; + struct repair_completion *repair = completion->parent; + struct vdo *vdo = completion->vdo; + struct recovery_journal *journal = vdo->recovery_journal; + struct block_allocator *allocator = vdo_as_block_allocator(completion); + + /* Get ready in case we need to enqueue again. */ + vdo_prepare_completion(completion, + add_slab_journal_entries, + vdo_notify_slab_journals_are_recovered, + completion->callback_thread_id, + repair); + for (recovery_point = &repair->next_recovery_point; + before_recovery_point(recovery_point, &repair->tail_recovery_point); + advance_points(repair, journal->entries_per_block)) { + int result; + physical_block_number_t pbn; + struct vdo_slab *slab; + struct recovery_journal_entry entry = get_entry(repair, recovery_point); + bool increment = !repair->next_recovery_point.increment_applied; + + if (increment) { + result = validate_recovery_journal_entry(vdo, &entry); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(vdo, result); + vdo_fail_completion(completion, result); + return; + } + + pbn = entry.mapping.pbn; + } else { + pbn = entry.unmapping.pbn; + } + + if (pbn == VDO_ZERO_BLOCK) + continue; + + slab = vdo_get_slab(vdo->depot, pbn); + if (slab->allocator != allocator) + continue; + + if (!vdo_attempt_replay_into_slab(slab, + pbn, + entry.operation, + increment, + &repair->next_journal_point, + completion)) + return; + + repair->entries_added_to_slab_journals++; + } + + vdo_notify_slab_journals_are_recovered(completion); +} + +/** + * vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of slabs + * owned by a given block_allocator. + * @allocator: The allocator whose slab journals are to be recovered. + * @context: The slab depot load context supplied by a recovery when it loads the depot. + */ +void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context) +{ + struct vdo_completion *completion = &allocator->completion; + struct repair_completion *repair = context; + struct vdo *vdo = completion->vdo; + + vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__); + if (repair->entry_count == 0) { + /* there's nothing to replay */ + repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used; + repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks; + vdo_notify_slab_journals_are_recovered(completion); + return; + } + + repair->next_recovery_point = (struct recovery_point) { + .sequence_number = repair->slab_journal_head, + .sector_count = 1, + .entry_count = 0, + }; + + repair->next_journal_point = (struct journal_point) { + .sequence_number = repair->slab_journal_head, + .entry_count = 0, + }; + + uds_log_info("Replaying entries into slab journals for zone %u", allocator->zone_number); + completion->parent = repair; + add_slab_journal_entries(completion); +} + +static void load_slab_depot(struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion); + const struct admin_state_code *operation; + + vdo_assert_on_admin_thread(completion->vdo, __func__); + + if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) { + prepare_repair_completion(repair, rebuild_reference_counts, VDO_ZONE_TYPE_LOGICAL); + operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD; + } else { + prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN); + operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY; + } + + vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair); +} + +static void flush_block_map(struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion); + const struct admin_state_code *operation; + + vdo_assert_on_admin_thread(completion->vdo, __func__); + + uds_log_info("Flushing block map changes"); + prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN); + operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ? + VDO_ADMIN_STATE_REBUILDING : + VDO_ADMIN_STATE_RECOVERING); + vdo_drain_block_map(completion->vdo->block_map, operation, completion); +} + +static bool finish_if_done(struct repair_completion *repair) +{ + /* Pages are still being launched or there is still work to do */ + if (repair->launching || (repair->outstanding > 0)) + return false; + + if (repair->completion.result != VDO_SUCCESS) { + page_count_t i; + + for (i = 0; i < repair->page_count; i++) { + struct vdo_page_completion *page_completion = + &repair->page_completions[i]; + + if (page_completion->ready) + vdo_release_page_completion(&page_completion->completion); + } + + vdo_launch_completion(&repair->completion); + return true; + } + + if (repair->current_entry >= repair->entries) + return false; + + launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN); + return true; +} + +static void abort_block_map_recovery(struct repair_completion *repair, int result) +{ + vdo_set_completion_result(&repair->completion, result); + finish_if_done(repair); +} + +/** + * find_entry_starting_next_page() - Find the first journal entry after a given entry which is not + * on the same block map page. + * @current_entry: The entry to search from. + * @needs_sort: Whether sorting is needed to proceed. + * + * Return: Pointer to the first later journal entry on a different block map page, or a pointer to + * just before the journal entries if no subsequent entry is on a different block map page. + */ +static struct numbered_block_mapping * +find_entry_starting_next_page(struct repair_completion *repair, + struct numbered_block_mapping *current_entry, + bool needs_sort) +{ + size_t current_page; + + /* If current_entry is invalid, return immediately. */ + if (current_entry < repair->entries) + return current_entry; + + current_page = current_entry->block_map_slot.pbn; + + /* Decrement current_entry until it's out of bounds or on a different page. */ + while ((current_entry >= repair->entries) && + (current_entry->block_map_slot.pbn == current_page)) { + if (needs_sort) { + struct numbered_block_mapping *just_sorted_entry = + sort_next_heap_element(repair); + ASSERT_LOG_ONLY(just_sorted_entry < current_entry, + "heap is returning elements in an unexpected order"); + } + + current_entry--; + } + + return current_entry; +} + +/* + * Apply a range of journal entries [starting_entry, ending_entry) journal + * entries to a block map page. + */ +static void apply_journal_entries_to_page(struct block_map_page *page, + struct numbered_block_mapping *starting_entry, + struct numbered_block_mapping *ending_entry) +{ + struct numbered_block_mapping *current_entry = starting_entry; + + while (current_entry != ending_entry) { + page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry; + current_entry--; + } +} + +static void recover_ready_pages(struct repair_completion *repair, + struct vdo_completion *completion); + +static void block_map_page_loaded(struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion->parent); + + repair->outstanding--; + if (!repair->launching) + recover_ready_pages(repair, completion); +} + +static void handle_block_map_page_load_error(struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion->parent); + + repair->outstanding--; + abort_block_map_recovery(repair, completion->result); +} + +static void fetch_block_map_page(struct repair_completion *repair, + struct vdo_completion *completion) +{ + physical_block_number_t pbn; + + if (repair->current_unfetched_entry < repair->entries) + /* Nothing left to fetch. */ + return; + + /* Fetch the next page we haven't yet requested. */ + pbn = repair->current_unfetched_entry->block_map_slot.pbn; + repair->current_unfetched_entry = + find_entry_starting_next_page(repair, repair->current_unfetched_entry, true); + repair->outstanding++; + vdo_get_page(((struct vdo_page_completion *) completion), + &repair->completion.vdo->block_map->zones[0], + pbn, + true, + &repair->completion, + block_map_page_loaded, + handle_block_map_page_load_error, + false); +} + +static struct vdo_page_completion * +get_next_page_completion(struct repair_completion *repair, struct vdo_page_completion *completion) +{ + completion++; + if (completion == (&repair->page_completions[repair->page_count])) + completion = &repair->page_completions[0]; + return completion; +} + +static void recover_ready_pages(struct repair_completion *repair, + struct vdo_completion *completion) +{ + struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion; + + if (finish_if_done(repair)) + return; + + if (repair->pbn != page_completion->pbn) + return; + + while (page_completion->ready) { + struct numbered_block_mapping *start_of_next_page; + struct block_map_page *page; + int result; + + result = vdo_get_cached_page(completion, &page); + if (result != VDO_SUCCESS) { + abort_block_map_recovery(repair, result); + return; + } + + start_of_next_page = + find_entry_starting_next_page(repair, repair->current_entry, false); + apply_journal_entries_to_page(page, repair->current_entry, start_of_next_page); + repair->current_entry = start_of_next_page; + vdo_request_page_write(completion); + vdo_release_page_completion(completion); + + if (finish_if_done(repair)) + return; + + repair->pbn = repair->current_entry->block_map_slot.pbn; + fetch_block_map_page(repair, completion); + page_completion = get_next_page_completion(repair, page_completion); + completion = &page_completion->completion; + } +} + +static void recover_block_map(struct vdo_completion *completion) +{ + struct repair_completion *repair = as_repair_completion(completion); + struct vdo *vdo = completion->vdo; + struct numbered_block_mapping *first_sorted_entry; + page_count_t i; + + vdo_assert_on_logical_zone_thread(vdo, 0, __func__); + + /* Suppress block map errors. */ + vdo->block_map->zones[0].page_cache.rebuilding = + vdo_state_requires_read_only_rebuild(vdo->load_state); + + if (repair->block_map_entry_count == 0) { + uds_log_info("Replaying 0 recovery entries into block map"); + UDS_FREE(UDS_FORGET(repair->journal_data)); + launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN); + return; + } + + /* + * Organize the journal entries into a binary heap so we can iterate over them in sorted + * order incrementally, avoiding an expensive sort call. + */ + repair->replay_heap = (struct min_heap) { + .data = repair->entries, + .nr = repair->block_map_entry_count, + .size = repair->block_map_entry_count, + }; + min_heapify_all(&repair->replay_heap, &repair_min_heap); + + uds_log_info("Replaying %zu recovery entries into block map", + repair->block_map_entry_count); + + repair->current_entry = &repair->entries[repair->block_map_entry_count - 1]; + first_sorted_entry = sort_next_heap_element(repair); + ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry, + "heap is returning elements in an unexpected order"); + + /* Prevent any page from being processed until all pages have been launched. */ + repair->launching = true; + repair->pbn = repair->current_entry->block_map_slot.pbn; + repair->current_unfetched_entry = repair->current_entry; + for (i = 0; i < repair->page_count; i++) { + if (repair->current_unfetched_entry < repair->entries) + break; + + fetch_block_map_page(repair, &repair->page_completions[i].completion); + } + repair->launching = false; + + /* Process any ready pages. */ + recover_ready_pages(repair, &repair->page_completions[0].completion); +} + +/** + * get_recovery_journal_block_header() - Get the block header for a block at a position in the + * journal data and unpack it. + * @journal: The recovery journal. + * @data: The recovery journal data. + * @sequence: The sequence number. + * + * Return: The unpacked header. + */ +static struct recovery_block_header __must_check +get_recovery_journal_block_header(struct recovery_journal *journal, + char *data, + sequence_number_t sequence) +{ + physical_block_number_t pbn = vdo_get_recovery_journal_block_number(journal, sequence); + char *header = &data[pbn * VDO_BLOCK_SIZE]; + + return vdo_unpack_recovery_block_header((struct packed_journal_header *) header); +} + +/** + * is_valid_recovery_journal_block() - Determine whether the given header describes a valid block + * for the given journal. + * @journal: The journal to use. + * @header: The unpacked block header to check. + * @old_ok: Whether an old format header is valid. + * + * A block is not valid if it is unformatted, or if it is older than the last successful recovery + * or reformat. + * + * Return: True if the header is valid. + */ +static bool __must_check +is_valid_recovery_journal_block(const struct recovery_journal *journal, + const struct recovery_block_header *header, + bool old_ok) +{ + if ((header->nonce != journal->nonce) || + (header->recovery_count != journal->recovery_count)) + return false; + + if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2) + return (header->entry_count <= journal->entries_per_block); + + return (old_ok && + (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) && + (header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK)); +} + +/** + * is_exact_recovery_journal_block() - Determine whether the given header describes the exact block + * indicated. + * @journal: The journal to use. + * @header: The unpacked block header to check. + * @sequence: The expected sequence number. + * @type: The expected metadata type. + * + * Return: True if the block matches. + */ +static bool __must_check +is_exact_recovery_journal_block(const struct recovery_journal *journal, + const struct recovery_block_header *header, + sequence_number_t sequence, + enum vdo_metadata_type type) +{ + return ((header->metadata_type == type) && + (header->sequence_number == sequence) && + (is_valid_recovery_journal_block(journal, header, true))); +} + +/** + * find_recovery_journal_head_and_tail() - Find the tail and head of the journal. + * + * Return: True if there were valid journal blocks. + */ +static bool find_recovery_journal_head_and_tail(struct repair_completion *repair) +{ + struct recovery_journal *journal = repair->completion.vdo->recovery_journal; + bool found_entries = false; + physical_block_number_t i; + + /* + * Ensure that we don't replay old entries since we know the tail recorded in the super + * block must be a lower bound. Not doing so can result in extra data loss by setting the + * tail too early. + */ + repair->highest_tail = journal->tail; + for (i = 0; i < journal->size; i++) { + struct recovery_block_header header = + get_recovery_journal_block_header(journal, repair->journal_data, i); + + if (!is_valid_recovery_journal_block(journal, &header, true)) + /* This block is old or incorrectly formatted */ + continue; + + if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i) + /* This block is in the wrong location */ + continue; + + if (header.sequence_number >= repair->highest_tail) { + found_entries = true; + repair->highest_tail = header.sequence_number; + } + + if (!found_entries) + continue; + + if (header.block_map_head > repair->block_map_head) + repair->block_map_head = header.block_map_head; + + if (header.slab_journal_head > repair->slab_journal_head) + repair->slab_journal_head = header.slab_journal_head; + } + + return found_entries; +} + +/** + * unpack_entry(): Unpack a recovery journal entry in either format. + * @vdo: The vdo. + * @packed: The entry to unpack. + * @format: The expected format of the entry. + * @entry: The unpacked entry. + * + * Return: true if the entry should be applied.3 + */ +static bool unpack_entry(struct vdo *vdo, + char *packed, + enum vdo_metadata_type format, + struct recovery_journal_entry *entry) +{ + if (format == VDO_METADATA_RECOVERY_JOURNAL_2) { + struct packed_recovery_journal_entry *packed_entry = + (struct packed_recovery_journal_entry *) packed; + + *entry = vdo_unpack_recovery_journal_entry(packed_entry); + } else { + physical_block_number_t low32, high4; + + struct packed_recovery_journal_entry_1 *packed_entry = + (struct packed_recovery_journal_entry_1 *) packed; + + if (packed_entry->operation == VDO_JOURNAL_DATA_INCREMENT) + entry->operation = VDO_JOURNAL_DATA_REMAPPING; + else if (packed_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT) + entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING; + else + return false; + + low32 = __le32_to_cpu(packed_entry->pbn_low_word); + high4 = packed_entry->pbn_high_nibble; + entry->slot = (struct block_map_slot) { + .pbn = ((high4 << 32) | low32), + .slot = (packed_entry->slot_low | (packed_entry->slot_high << 6)), + }; + entry->mapping = vdo_unpack_block_map_entry(&packed_entry->block_map_entry); + entry->unmapping = (struct data_location) { + .pbn = VDO_ZERO_BLOCK, + .state = VDO_MAPPING_STATE_UNMAPPED, + }; + } + + return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS); +} + +/** + * append_sector_entries() - Append an array of recovery journal entries from a journal block + * sector to the array of numbered mappings in the repair completion, + * numbering each entry in the order they are appended. + * @repair: The repair completion. + * @entries: The entries in the sector. + * @format: The format of the sector. + * @entry_count: The number of entries to append. + */ +static void append_sector_entries(struct repair_completion *repair, + char *entries, + enum vdo_metadata_type format, + journal_entry_count_t entry_count) +{ + journal_entry_count_t i; + struct vdo *vdo = repair->completion.vdo; + off_t increment = ((format == VDO_METADATA_RECOVERY_JOURNAL_2) + ? sizeof(struct packed_recovery_journal_entry) + : sizeof(struct packed_recovery_journal_entry_1)); + + for (i = 0; i < entry_count; i++, entries += increment) { + struct recovery_journal_entry entry; + + if (!unpack_entry(vdo, entries, format, &entry)) + /* When recovering from read-only mode, ignore damaged entries. */ + continue; + + repair->entries[repair->block_map_entry_count] = + (struct numbered_block_mapping) { + .block_map_slot = entry.slot, + .block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn, + entry.mapping.state), + .number = repair->block_map_entry_count, + }; + repair->block_map_entry_count++; + } +} + +static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format, u8 sector_number) +{ + if (format == VDO_METADATA_RECOVERY_JOURNAL_2) + return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR; + + return ((sector_number == (VDO_SECTORS_PER_BLOCK - 1)) + ? RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR + : RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR); +} + +static void extract_entries_from_block(struct repair_completion *repair, + struct recovery_journal *journal, + sequence_number_t sequence, + enum vdo_metadata_type format, + journal_entry_count_t entries) +{ + sector_count_t i; + struct recovery_block_header header = + get_recovery_journal_block_header(journal, repair->journal_data, sequence); + + if (!is_exact_recovery_journal_block(journal, &header, sequence, format)) + /* This block is invalid, so skip it. */ + return; + + entries = min(entries, header.entry_count); + for (i = 1; i < VDO_SECTORS_PER_BLOCK; i++) { + struct packed_journal_sector *sector = + get_sector(journal, repair->journal_data, sequence, i); + journal_entry_count_t sector_entries = min(entries, entries_per_sector(format, i)); + + if (vdo_is_valid_recovery_journal_sector(&header, sector, i)) { + /* Only extract as many as the block header calls for. */ + append_sector_entries(repair, + (char *) sector->entries, + format, + min_t(journal_entry_count_t, + sector->entry_count, + sector_entries)); + } + + /* + * Even if the sector wasn't full, count it as full when counting up to the + * entry count the block header claims. + */ + entries -= sector_entries; + } +} + +static int parse_journal_for_rebuild(struct repair_completion *repair) +{ + int result; + sequence_number_t i; + block_count_t count; + enum vdo_metadata_type format; + struct vdo *vdo = repair->completion.vdo; + struct recovery_journal *journal = vdo->recovery_journal; + journal_entry_count_t entries_per_block = journal->entries_per_block; + + format = get_recovery_journal_block_header(journal, + repair->journal_data, + repair->highest_tail).metadata_type; + if (format == VDO_METADATA_RECOVERY_JOURNAL) + entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK; + + /* + * Allocate an array of numbered_block_mapping structures large enough to transcribe every + * packed_recovery_journal_entry from every valid journal block. + */ + count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block); + result = UDS_ALLOCATE(count, struct numbered_block_mapping, __func__, &repair->entries); + if (result != VDO_SUCCESS) + return result; + + for (i = repair->block_map_head; i <= repair->highest_tail; i++) + extract_entries_from_block(repair, journal, i, format, entries_per_block); + + return VDO_SUCCESS; +} + +static int validate_heads(struct repair_completion *repair) +{ + /* Both reap heads must be behind the tail. */ + if ((repair->block_map_head <= repair->tail) && + (repair->slab_journal_head <= repair->tail)) + return VDO_SUCCESS; + + + return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu", + (unsigned long long) repair->block_map_head, + (unsigned long long) repair->slab_journal_head, + (unsigned long long) repair->tail); +} + +/** + * extract_new_mappings() - Find all valid new mappings to be applied to the block map. + * + * The mappings are extracted from the journal and stored in a sortable array so that all of the + * mappings to be applied to a given block map page can be done in a single page fetch. + */ +static int extract_new_mappings(struct repair_completion *repair) +{ + int result; + struct vdo *vdo = repair->completion.vdo; + struct recovery_point recovery_point = { + .sequence_number = repair->block_map_head, + .sector_count = 1, + .entry_count = 0, + }; + + /* + * Allocate an array of numbered_block_mapping structs just large enough to transcribe + * every packed_recovery_journal_entry from every valid journal block. + */ + result = UDS_ALLOCATE(repair->entry_count, + struct numbered_block_mapping, + __func__, + &repair->entries); + if (result != VDO_SUCCESS) + return result; + + for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point); + increment_recovery_point(&recovery_point)) { + struct recovery_journal_entry entry = get_entry(repair, &recovery_point); + + result = validate_recovery_journal_entry(vdo, &entry); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(vdo, result); + return result; + } + + repair->entries[repair->block_map_entry_count] = + (struct numbered_block_mapping) { + .block_map_slot = entry.slot, + .block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn, + entry.mapping.state), + .number = repair->block_map_entry_count, + }; + repair->block_map_entry_count++; + } + + result = ASSERT((repair->block_map_entry_count <= repair->entry_count), + "approximate entry count is an upper bound"); + if (result != VDO_SUCCESS) + vdo_enter_read_only_mode(vdo, result); + + return result; +} + +/** + * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of + * the journal. + */ +static noinline int compute_usages(struct repair_completion *repair) +{ + /* + * VDO-5182: function is declared noinline to avoid what is likely a spurious valgrind + * error about this structure being uninitialized. + */ + struct recovery_point recovery_point = { + .sequence_number = repair->tail, + .sector_count = 1, + .entry_count = 0, + }; + + struct vdo *vdo = repair->completion.vdo; + struct recovery_journal *journal = vdo->recovery_journal; + struct recovery_block_header header = + get_recovery_journal_block_header(journal, repair->journal_data, repair->tail); + + repair->logical_blocks_used = header.logical_blocks_used; + repair->block_map_data_blocks = header.block_map_data_blocks; + + for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point); + increment_recovery_point(&recovery_point)) { + struct recovery_journal_entry entry = get_entry(repair, &recovery_point); + int result; + + result = validate_recovery_journal_entry(vdo, &entry); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(vdo, result); + return result; + } + + if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { + repair->block_map_data_blocks++; + continue; + } + + if (vdo_is_mapped_location(&entry.mapping)) + repair->logical_blocks_used++; + + if (vdo_is_mapped_location(&entry.unmapping)) + repair->logical_blocks_used--; + } + + return VDO_SUCCESS; +} + +static int parse_journal_for_recovery(struct repair_completion *repair) +{ + int result; + sequence_number_t i, head; + bool found_entries = false; + struct recovery_journal *journal = repair->completion.vdo->recovery_journal; + + head = min(repair->block_map_head, repair->slab_journal_head); + for (i = head; i <= repair->highest_tail; i++) { + struct recovery_block_header header; + journal_entry_count_t block_entries; + u8 j; + + repair->tail = i; + repair->tail_recovery_point = (struct recovery_point) { + .sequence_number = i, + .sector_count = 0, + .entry_count = 0, + }; + + header = get_recovery_journal_block_header(journal, repair->journal_data, i); + if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) { + /* This is an old format block, so we need to upgrade */ + uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "Recovery journal is in the old format, a read-only rebuild is required."); + vdo_enter_read_only_mode(repair->completion.vdo, VDO_UNSUPPORTED_VERSION); + return VDO_UNSUPPORTED_VERSION; + } + + if (!is_exact_recovery_journal_block(journal, + &header, + i, + VDO_METADATA_RECOVERY_JOURNAL_2)) + /* A bad block header was found so this must be the end of the journal. */ + break; + + block_entries = header.entry_count; + + /* Examine each sector in turn to determine the last valid sector. */ + for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) { + struct packed_journal_sector *sector = + get_sector(journal, repair->journal_data, i, j); + journal_entry_count_t sector_entries = + min_t(journal_entry_count_t, sector->entry_count, block_entries); + + /* A bad sector means that this block was torn. */ + if (!vdo_is_valid_recovery_journal_sector(&header, sector, j)) + break; + + if (sector_entries > 0) { + found_entries = true; + repair->tail_recovery_point.sector_count++; + repair->tail_recovery_point.entry_count = sector_entries; + block_entries -= sector_entries; + repair->entry_count += sector_entries; + } + + /* If this sector is short, the later sectors can't matter. */ + if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) || + (block_entries == 0)) + break; + } + + /* If this block was not filled, or if it tore, no later block can matter. */ + if ((header.entry_count != journal->entries_per_block) || (block_entries > 0)) + break; + } + + if (!found_entries) + return validate_heads(repair); + + /* Set the tail to the last valid tail block, if there is one. */ + if (repair->tail_recovery_point.sector_count == 0) + repair->tail--; + + result = validate_heads(repair); + if (result != VDO_SUCCESS) + return result; + + uds_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu", + (unsigned long long) repair->highest_tail, + (unsigned long long) repair->tail); + + result = extract_new_mappings(repair); + if (result != VDO_SUCCESS) + return result; + + return compute_usages(repair); +} + +static int parse_journal(struct repair_completion *repair) +{ + if (!find_recovery_journal_head_and_tail(repair)) + return VDO_SUCCESS; + + return (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state) ? + parse_journal_for_rebuild(repair) : + parse_journal_for_recovery(repair)); +} + +static void finish_journal_load(struct vdo_completion *completion) +{ + struct repair_completion *repair = completion->parent; + + if (++repair->vios_complete != repair->vio_count) + return; + + uds_log_info("Finished reading recovery journal"); + uninitialize_vios(repair); + prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL); + vdo_continue_completion(&repair->completion, parse_journal(repair)); +} + +static void handle_journal_load_error(struct vdo_completion *completion) +{ + struct repair_completion *repair = completion->parent; + + /* Preserve the error */ + vdo_set_completion_result(&repair->completion, completion->result); + vio_record_metadata_io_error(as_vio(completion)); + completion->callback(completion); +} + +static void read_journal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo *vdo = vio->completion.vdo; + + continue_vio_after_io(vio, finish_journal_load, vdo->thread_config.admin_thread); +} + +/** + * vdo_repair(): Load the recovery journal and then recover or rebuild a vdo. + * @parent: The completion to notify when the operation is complete + */ +void vdo_repair(struct vdo_completion *parent) +{ + int result; + char *ptr; + struct repair_completion *repair; + struct vdo *vdo = parent->vdo; + struct recovery_journal *journal = vdo->recovery_journal; + physical_block_number_t pbn = journal->origin; + block_count_t remaining = journal->size; + block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO); + page_count_t page_count = min_t(page_count_t, + vdo->device_config->cache_size >> 1, + MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS); + + vdo_assert_on_admin_thread(vdo, __func__); + + if (vdo->load_state == VDO_FORCE_REBUILD) { + uds_log_warning("Rebuilding reference counts to clear read-only mode"); + vdo->states.vdo.read_only_recoveries++; + } else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) { + uds_log_warning("Rebuilding reference counts for upgrade"); + } else { + uds_log_warning("Device was dirty, rebuilding reference counts"); + } + + result = UDS_ALLOCATE_EXTENDED(struct repair_completion, + page_count, + struct vdo_page_completion, + __func__, + &repair); + if (result != VDO_SUCCESS) { + vdo_fail_completion(parent, result); + return; + } + + vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION); + repair->completion.error_handler = abort_repair; + repair->completion.parent = parent; + prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN); + repair->page_count = page_count; + + result = UDS_ALLOCATE(remaining * VDO_BLOCK_SIZE, char, __func__, &repair->journal_data); + if (abort_on_error(result, repair)) + return; + + result = UDS_ALLOCATE(vio_count, struct vio, __func__, &repair->vios); + if (abort_on_error(result, repair)) + return; + + ptr = repair->journal_data; + for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) { + block_count_t blocks = min_t(block_count_t, remaining, MAX_BLOCKS_PER_VIO); + + result = allocate_vio_components(vdo, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_PRIORITY_METADATA, + repair, + blocks, + ptr, + &repair->vios[repair->vio_count]); + if (abort_on_error(result, repair)) + return; + + ptr += (blocks * VDO_BLOCK_SIZE); + remaining -= blocks; + } + + for (vio_count = 0; + vio_count < repair->vio_count; + vio_count++, pbn += MAX_BLOCKS_PER_VIO) + submit_metadata_vio(&repair->vios[vio_count], + pbn, + read_journal_endio, + handle_journal_load_error, + REQ_OP_READ); +} diff --git a/drivers/md/dm-vdo/repair.h b/drivers/md/dm-vdo/repair.h new file mode 100644 index 00000000000..a28637d0a05 --- /dev/null +++ b/drivers/md/dm-vdo/repair.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_REPAIR_H +#define VDO_REPAIR_H + +#include "types.h" + +void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context); +void vdo_repair(struct vdo_completion *parent); + +#endif /* VDO_REPAIR_H */ diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c new file mode 100644 index 00000000000..61bd375b234 --- /dev/null +++ b/drivers/md/dm-vdo/slab-depot.c @@ -0,0 +1,5212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "slab-depot.h" + +#include <linux/atomic.h> +#include <linux/bio.h> +#include <linux/log2.h> +#include <linux/min_heap.h> +#include <linux/minmax.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" + +#include "action-manager.h" +#include "admin-state.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "encodings.h" +#include "io-submitter.h" +#include "physical-zone.h" +#include "priority-table.h" +#include "recovery-journal.h" +#include "repair.h" +#include "status-codes.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" +#include "wait-queue.h" + +static const u64 BYTES_PER_WORD = sizeof(u64); +static const bool NORMAL_OPERATION = true; + +struct slab_journal_eraser { + struct vdo_completion *parent; + struct dm_kcopyd_client *client; + block_count_t blocks; + struct slab_iterator slabs; +}; + +/** + * get_lock() - Get the lock object for a slab journal block by sequence number. + * @journal: vdo_slab journal to retrieve from. + * @sequence_number: Sequence number of the block. + * + * Return: The lock object for the given sequence number. + */ +static inline struct journal_lock * __must_check +get_lock(struct slab_journal *journal, sequence_number_t sequence_number) +{ + return &journal->locks[sequence_number % journal->size]; +} + +static bool is_slab_open(struct vdo_slab *slab) +{ + return (!vdo_is_state_quiescing(&slab->state) && !vdo_is_state_quiescent(&slab->state)); +} + +/** + * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush. + * @journal: The journal to check. + * + * Return: true if there are no entry waiters, or if the slab is unrecovered. + */ +static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal) +{ + return ((journal->slab->status != VDO_SLAB_REBUILDING) && + vdo_has_waiters(&journal->entry_waiters)); +} + +/** + * is_reaping() - Check whether a reap is currently in progress. + * @journal: The journal which may be reaping. + * + * Return: true if the journal is reaping. + */ +static inline bool __must_check is_reaping(struct slab_journal *journal) +{ + return (journal->head != journal->unreapable); +} + +/** + * initialize_tail_block() - Initialize tail block as a new block. + * @journal: The journal whose tail block is being initialized. + */ +static void initialize_tail_block(struct slab_journal *journal) +{ + struct slab_journal_block_header *header = &journal->tail_header; + + header->sequence_number = journal->tail; + header->entry_count = 0; + header->has_block_map_increments = false; +} + +/** + * initialize_journal_state() - Set all journal fields appropriately to start journaling. + * @journal: The journal to be reset, based on its tail sequence number. + */ +static void initialize_journal_state(struct slab_journal *journal) +{ + journal->unreapable = journal->head; + journal->reap_lock = get_lock(journal, journal->unreapable); + journal->next_commit = journal->tail; + journal->summarized = journal->last_summarized = journal->tail; + initialize_tail_block(journal); +} + +/** + * block_is_full() - Check whether a journal block is full. + * @journal: The slab journal for the block. + * + * Return: true if the tail block is full. + */ +static bool __must_check block_is_full(struct slab_journal *journal) +{ + journal_entry_count_t count = journal->tail_header.entry_count; + + return (journal->tail_header.has_block_map_increments ? + (journal->full_entries_per_block == count) : + (journal->entries_per_block == count)); +} + +static void add_entries(struct slab_journal *journal); +static void update_tail_block_location(struct slab_journal *journal); +static void release_journal_locks(struct waiter *waiter, void *context); + +/** + * is_slab_journal_blank() - Check whether a slab's journal is blank. + * + * A slab journal is blank if it has never had any entries recorded in it. + * + * Return: true if the slab's journal has never been modified. + */ +static bool is_slab_journal_blank(const struct vdo_slab *slab) +{ + return ((slab->journal.tail == 1) && (slab->journal.tail_header.entry_count == 0)); +} + +/** + * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct + * order. + * @journal: The journal to be marked dirty. + * @lock: The recovery journal lock held by the slab journal. + */ +static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock) +{ + struct slab_journal *dirty_journal; + struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals; + + ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean"); + + journal->recovery_lock = lock; + list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) { + if (dirty_journal->recovery_lock <= journal->recovery_lock) + break; + } + + list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next); +} + +static void mark_slab_journal_clean(struct slab_journal *journal) +{ + journal->recovery_lock = 0; + list_del_init(&journal->dirty_entry); +} + +static void check_if_slab_drained(struct vdo_slab *slab) +{ + bool read_only; + struct slab_journal *journal = &slab->journal; + const struct admin_state_code *code; + + if (!vdo_is_state_draining(&slab->state) || + must_make_entries_to_flush(journal) || + is_reaping(journal) || + journal->waiting_to_commit || + !list_empty(&journal->uncommitted_blocks) || + journal->updating_slab_summary || + (slab->active_count > 0)) + return; + + /* When not suspending or recovering, the slab must be clean. */ + code = vdo_get_admin_state_code(&slab->state); + read_only = vdo_is_read_only(slab->allocator->depot->vdo); + if (!read_only && + vdo_has_waiters(&slab->dirty_blocks) && + (code != VDO_ADMIN_STATE_SUSPENDING) && + (code != VDO_ADMIN_STATE_RECOVERING)) + return; + + vdo_finish_draining_with_result(&slab->state, (read_only ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/* FULLNESS HINT COMPUTATION */ + +/** + * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be + * stored in a slab_summary_entry's 7 bits that are dedicated to its free + * count. + * @depot: The depot whose summary being updated. + * @free_blocks: The number of free blocks. + * + * Note: the number of free blocks must be strictly less than 2^23 blocks, even though + * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least + * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might + * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f + * is 0, which would make it impossible to distinguish completely full from completely empty. + * + * Return: A fullness hint, which can be stored in 7 bits. + */ +static u8 __must_check compute_fullness_hint(struct slab_depot *depot, block_count_t free_blocks) +{ + block_count_t hint; + + ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23"); + + if (free_blocks == 0) + return 0; + + hint = free_blocks >> depot->hint_shift; + return ((hint == 0) ? 1 : hint); +} + +/** + * check_summary_drain_complete() - Check whether an allocators summary has finished draining. + */ +static void check_summary_drain_complete(struct block_allocator *allocator) +{ + struct vdo *vdo = allocator->depot->vdo; + + if (!vdo_is_state_draining(&allocator->summary_state) || + (allocator->summary_write_count > 0)) + return; + + vdo_finish_operation(&allocator->summary_state, + (vdo_is_read_only(vdo) ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/** + * notify_summary_waiters() - Wake all the waiters in a given queue. + * @allocator: The block allocator summary which owns the queue. + * @queue: The queue to notify. + */ +static void notify_summary_waiters(struct block_allocator *allocator, struct wait_queue *queue) +{ + int result = (vdo_is_read_only(allocator->depot->vdo) ? VDO_READ_ONLY : VDO_SUCCESS); + + vdo_notify_all_waiters(queue, NULL, &result); +} + +static void launch_write(struct slab_summary_block *summary_block); + +/** + * finish_updating_slab_summary_block() - Finish processing a block which attempted to write, + * whether or not the attempt succeeded. + * @block: The block. + */ +static void finish_updating_slab_summary_block(struct slab_summary_block *block) +{ + notify_summary_waiters(block->allocator, &block->current_update_waiters); + block->writing = false; + block->allocator->summary_write_count--; + if (vdo_has_waiters(&block->next_update_waiters)) + launch_write(block); + else + check_summary_drain_complete(block->allocator); +} + +/** + * finish_update() - This is the callback for a successful summary block write. + * @completion: The write vio. + */ +static void finish_update(struct vdo_completion *completion) +{ + struct slab_summary_block *block = + container_of(as_vio(completion), struct slab_summary_block, vio); + + atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written); + finish_updating_slab_summary_block(block); +} + +/** + * handle_write_error() - Handle an error writing a slab summary block. + * @completion: The write VIO. + */ +static void handle_write_error(struct vdo_completion *completion) +{ + struct slab_summary_block *block = + container_of(as_vio(completion), struct slab_summary_block, vio); + + vio_record_metadata_io_error(as_vio(completion)); + vdo_enter_read_only_mode(completion->vdo, completion->result); + finish_updating_slab_summary_block(block); +} + +static void write_slab_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_summary_block *block = container_of(vio, struct slab_summary_block, vio); + + continue_vio_after_io(vio, finish_update, block->allocator->thread_id); +} + +/** + * launch_write() - Write a slab summary block unless it is currently out for writing. + * @block: The block that needs to be committed. + */ +static void launch_write(struct slab_summary_block *block) +{ + struct block_allocator *allocator = block->allocator; + struct slab_depot *depot = allocator->depot; + physical_block_number_t pbn; + + if (block->writing) + return; + + allocator->summary_write_count++; + vdo_transfer_all_waiters(&block->next_update_waiters, &block->current_update_waiters); + block->writing = true; + + if (vdo_is_read_only(depot->vdo)) { + finish_updating_slab_summary_block(block); + return; + } + + memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE); + + /* + * Flush before writing to ensure that the slab journal tail blocks and reference updates + * covered by this summary update are stable (VDO-2332). + */ + pbn = (depot->summary_origin + + (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) + + block->index); + submit_metadata_vio(&block->vio, + pbn, + write_slab_summary_endio, + handle_write_error, + REQ_OP_WRITE | REQ_PREFLUSH); +} + +/** + * update_slab_summary_entry() - Update the entry for a slab. + * @slab: The slab whose entry is to be updated + * @waiter: The waiter that is updating the summary. + * @tail_block_offset: The offset of the slab journal's tail block. + * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load. + * @is_clean: Whether the slab is clean. + * @free_blocks: The number of free blocks. + */ +static void +update_slab_summary_entry(struct vdo_slab *slab, + struct waiter *waiter, + tail_block_offset_t tail_block_offset, + bool load_ref_counts, + bool is_clean, + block_count_t free_blocks) +{ + u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK; + struct block_allocator *allocator = slab->allocator; + struct slab_summary_block *block = &allocator->summary_blocks[index]; + int result; + struct slab_summary_entry *entry; + + if (vdo_is_read_only(block->vio.completion.vdo)) { + result = VDO_READ_ONLY; + waiter->callback(waiter, &result); + return; + } + + if (vdo_is_state_draining(&allocator->summary_state) || + vdo_is_state_quiescent(&allocator->summary_state)) { + result = VDO_INVALID_ADMIN_STATE; + waiter->callback(waiter, &result); + return; + } + + entry = &allocator->summary_entries[slab->slab_number]; + *entry = (struct slab_summary_entry) { + .tail_block_offset = tail_block_offset, + .load_ref_counts = (entry->load_ref_counts || load_ref_counts), + .is_dirty = !is_clean, + .fullness_hint = compute_fullness_hint(allocator->depot, free_blocks), + }; + vdo_enqueue_waiter(&block->next_update_waiters, waiter); + launch_write(block); +} + +/** + * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are + * complete. + * @journal: The journal to be reaped. + */ +static void finish_reaping(struct slab_journal *journal) +{ + journal->head = journal->unreapable; + add_entries(journal); + check_if_slab_drained(journal->slab); +} + +static void reap_slab_journal(struct slab_journal *journal); + +/** + * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try + * reaping again in case we deferred reaping due to an outstanding vio. + * @completion: The flush vio. + */ +static void complete_reaping(struct vdo_completion *completion) +{ + struct slab_journal *journal = completion->parent; + + return_vio_to_pool(journal->slab->allocator->vio_pool, + vio_as_pooled_vio(as_vio(UDS_FORGET(completion)))); + finish_reaping(journal); + reap_slab_journal(journal); +} + +/** + * handle_flush_error() - Handle an error flushing the lower layer. + * @completion: The flush vio. + */ +static void handle_flush_error(struct vdo_completion *completion) +{ + vio_record_metadata_io_error(as_vio(completion)); + vdo_enter_read_only_mode(completion->vdo, completion->result); + complete_reaping(completion); +} + +static void flush_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_journal *journal = vio->completion.parent; + + continue_vio_after_io(vio, + complete_reaping, + journal->slab->allocator->thread_id); +} + +/** + * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer + * prior to reaping. + * @waiter: The journal as a flush waiter. + * @context: The newly acquired flush vio. + */ +static void flush_for_reaping(struct waiter *waiter, void *context) +{ + struct slab_journal *journal = container_of(waiter, struct slab_journal, flush_waiter); + struct pooled_vio *pooled = context; + struct vio *vio = &pooled->vio; + + vio->completion.parent = journal; + submit_flush_vio(vio, flush_endio, handle_flush_error); +} + +/** + * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks. + * @journal: The slab journal. + */ +static void reap_slab_journal(struct slab_journal *journal) +{ + bool reaped = false; + + if (is_reaping(journal)) + /* We already have a reap in progress so wait for it to finish. */ + return; + + if ((journal->slab->status != VDO_SLAB_REBUILT) || + !vdo_is_state_normal(&journal->slab->state) || + vdo_is_read_only(journal->slab->allocator->depot->vdo)) + /* + * We must not reap in the first two cases, and there's no point in read-only mode. + */ + return; + + /* + * Start reclaiming blocks only when the journal head has no references. Then stop when a + * block is referenced or reap reaches the most recently written block, referenced by the + * slab summary, which has the sequence number just before the tail. + */ + while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) { + reaped = true; + journal->unreapable++; + journal->reap_lock++; + if (journal->reap_lock == &journal->locks[journal->size]) + journal->reap_lock = &journal->locks[0]; + } + + if (!reaped) + return; + + /* + * It is never safe to reap a slab journal block without first issuing a flush, regardless + * of whether a user flush has been received or not. In the absence of the flush, the + * reference block write which released the locks allowing the slab journal to reap may not + * be persisted. Although slab summary writes will eventually issue flushes, multiple slab + * journal block writes can be issued while previous slab summary updates have not yet been + * made. Even though those slab journal block writes will be ignored if the slab summary + * update is not persisted, they may still overwrite the to-be-reaped slab journal block + * resulting in a loss of reference count updates (VDO-2912). + */ + journal->flush_waiter.callback = flush_for_reaping; + acquire_vio_from_pool(journal->slab->allocator->vio_pool, &journal->flush_waiter); +} + +/** + * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block. + * @journal: The slab journal. + * @sequence_number: The journal sequence number of the referenced block. + * @adjustment: Amount to adjust the reference counter. + * + * Note that when the adjustment is negative, the slab journal will be reaped. + */ +static void +adjust_slab_journal_block_reference(struct slab_journal *journal, + sequence_number_t sequence_number, + int adjustment) +{ + struct journal_lock *lock; + + if (sequence_number == 0) + return; + + if (journal->slab->status == VDO_SLAB_REPLAYING) + /* Locks should not be used during offline replay. */ + return; + + ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero"); + lock = get_lock(journal, sequence_number); + if (adjustment < 0) + ASSERT_LOG_ONLY((-adjustment <= lock->count), + "adjustment %d of lock count %u for slab journal block %llu must not underflow", + adjustment, + lock->count, + (unsigned long long) sequence_number); + + lock->count += adjustment; + if (lock->count == 0) + reap_slab_journal(journal); +} + +/** + * release_journal_locks() - Callback invoked after a slab summary update completes. + * @waiter: The slab summary waiter that has just been notified. + * @context: The result code of the update. + * + * Registered in the constructor on behalf of update_tail_block_location(). + * + * Implements waiter_callback. + */ +static void release_journal_locks(struct waiter *waiter, void *context) +{ + sequence_number_t first, i; + struct slab_journal *journal = + container_of(waiter, struct slab_journal, slab_summary_waiter); + int result = *((int *)context); + + if (result != VDO_SUCCESS) { + if (result != VDO_READ_ONLY) + /* + * Don't bother logging what might be lots of errors if we are already in + * read-only mode. + */ + uds_log_error_strerror(result, + "failed slab summary update %llu", + (unsigned long long) journal->summarized); + + journal->updating_slab_summary = false; + vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); + check_if_slab_drained(journal->slab); + return; + } + + if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) { + journal->partial_write_in_progress = false; + add_entries(journal); + } + + first = journal->last_summarized; + journal->last_summarized = journal->summarized; + for (i = journal->summarized - 1; i >= first; i--) { + /* + * Release the lock the summarized block held on the recovery journal. (During + * replay, recovery_start will always be 0.) + */ + if (journal->recovery_journal != NULL) { + zone_count_t zone_number = journal->slab->allocator->zone_number; + + vdo_release_recovery_journal_block_reference(journal->recovery_journal, + get_lock(journal, i)->recovery_start, + VDO_ZONE_TYPE_PHYSICAL, + zone_number); + } + + /* + * Release our own lock against reaping for blocks that are committed. (This + * function will not change locks during replay.) + */ + adjust_slab_journal_block_reference(journal, i, -1); + } + + journal->updating_slab_summary = false; + + reap_slab_journal(journal); + + /* Check if the slab summary needs to be updated again. */ + update_tail_block_location(journal); +} + +/** + * update_tail_block_location() - Update the tail block location in the slab summary, if necessary. + * @journal: The slab journal that is updating its tail block location. + */ +static void update_tail_block_location(struct slab_journal *journal) +{ + block_count_t free_block_count; + struct vdo_slab *slab = journal->slab; + + if (journal->updating_slab_summary || + vdo_is_read_only(journal->slab->allocator->depot->vdo) || + (journal->last_summarized >= journal->next_commit)) { + check_if_slab_drained(slab); + return; + } + + if (slab->status != VDO_SLAB_REBUILT) { + u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint; + + free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift; + } else { + free_block_count = slab->free_blocks; + } + + journal->summarized = journal->next_commit; + journal->updating_slab_summary = true; + + /* + * Update slab summary as dirty. + * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this + * slab have been written to the layer. Therefore, indicate that the ref counts must be + * loaded when the journal head has reaped past sequence number 1. + */ + update_slab_summary_entry(slab, + &journal->slab_summary_waiter, + journal->summarized % journal->size, + (journal->head > 1), + false, + free_block_count); +} + +/** + * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries. + */ +static void reopen_slab_journal(struct vdo_slab *slab) +{ + struct slab_journal *journal = &slab->journal; + sequence_number_t block; + + ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0, + "vdo_slab journal's active block empty before reopening"); + journal->head = journal->tail; + initialize_journal_state(journal); + + /* Ensure no locks are spuriously held on an empty journal. */ + for (block = 1; block <= journal->size; block++) + ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0), + "Scrubbed journal's block %llu is not locked", + (unsigned long long) block); + + add_entries(journal); +} + +static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio) +{ + const struct packed_slab_journal_block *block = + (const struct packed_slab_journal_block *) vio->vio.data; + + return __le64_to_cpu(block->header.sequence_number); +} + +/** + * complete_write() - Handle post-commit processing. + * @completion: The write vio as a completion. + * + * This is the callback registered by write_slab_journal_block(). + */ +static void complete_write(struct vdo_completion *completion) +{ + int result = completion->result; + struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion)); + struct slab_journal *journal = completion->parent; + sequence_number_t committed = get_committing_sequence_number(pooled); + + list_del_init(&pooled->list_entry); + return_vio_to_pool(journal->slab->allocator->vio_pool, UDS_FORGET(pooled)); + + if (result != VDO_SUCCESS) { + vio_record_metadata_io_error(as_vio(completion)); + uds_log_error_strerror(result, + "cannot write slab journal block %llu", + (unsigned long long) committed); + vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); + check_if_slab_drained(journal->slab); + return; + } + + WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1); + + if (list_empty(&journal->uncommitted_blocks)) { + /* If no blocks are outstanding, then the commit point is at the tail. */ + journal->next_commit = journal->tail; + } else { + /* The commit point is always the beginning of the oldest incomplete block. */ + pooled = container_of(journal->uncommitted_blocks.next, + struct pooled_vio, + list_entry); + journal->next_commit = get_committing_sequence_number(pooled); + } + + update_tail_block_location(journal); +} + +static void write_slab_journal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_journal *journal = vio->completion.parent; + + continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id); +} + +/** + * write_slab_journal_block() - Write a slab journal block. + * @waiter: The vio pool waiter which was just notified. + * @context: The vio pool entry for the write. + * + * Callback from acquire_vio_from_pool() registered in commit_tail(). + */ +static void write_slab_journal_block(struct waiter *waiter, void *context) +{ + struct pooled_vio *pooled = context; + struct vio *vio = &pooled->vio; + struct slab_journal *journal = container_of(waiter, struct slab_journal, resource_waiter); + struct slab_journal_block_header *header = &journal->tail_header; + int unused_entries = journal->entries_per_block - header->entry_count; + physical_block_number_t block_number; + const struct admin_state_code *operation; + + header->head = journal->head; + list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks); + vdo_pack_slab_journal_block_header(header, &journal->block->header); + + /* Copy the tail block into the vio. */ + memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE); + + ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull"); + if (unused_entries > 0) { + /* + * Release the per-entry locks for any unused entries in the block we are about to + * write. + */ + adjust_slab_journal_block_reference(journal, + header->sequence_number, + -unused_entries); + journal->partial_write_in_progress = !block_is_full(journal); + } + + block_number = journal->slab->journal_origin + (header->sequence_number % journal->size); + vio->completion.parent = journal; + + /* + * This block won't be read in recovery until the slab summary is updated to refer to it. + * The slab summary update does a flush which is sufficient to protect us from VDO-2331. + */ + submit_metadata_vio(UDS_FORGET(vio), + block_number, + write_slab_journal_endio, + complete_write, + REQ_OP_WRITE); + + /* Since the write is submitted, the tail block structure can be reused. */ + journal->tail++; + initialize_tail_block(journal); + journal->waiting_to_commit = false; + + operation = vdo_get_admin_state_code(&journal->slab->state); + if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) { + vdo_finish_operation(&journal->slab->state, + (vdo_is_read_only(journal->slab->allocator->depot->vdo) ? + VDO_READ_ONLY : + VDO_SUCCESS)); + return; + } + + add_entries(journal); +} + +/** + * commit_tail() - Commit the tail block of the slab journal. + * @journal: The journal whose tail block should be committed. + */ +static void commit_tail(struct slab_journal *journal) +{ + if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) + /* + * There are no entries at the moment, but there are some waiters, so defer + * initiating the flush until those entries are ready to write. + */ + return; + + if (vdo_is_read_only(journal->slab->allocator->depot->vdo) || + journal->waiting_to_commit || + (journal->tail_header.entry_count == 0)) + /* + * There is nothing to do since the tail block is empty, or writing, or the journal + * is in read-only mode. + */ + return; + + /* + * Since we are about to commit the tail block, this journal no longer needs to be on the + * ring of journals which the recovery journal might ask to commit. + */ + mark_slab_journal_clean(journal); + + journal->waiting_to_commit = true; + + journal->resource_waiter.callback = write_slab_journal_block; + acquire_vio_from_pool(journal->slab->allocator->vio_pool, &journal->resource_waiter); +} + +/** + * encode_slab_journal_entry() - Encode a slab journal entry. + * @tail_header: The unpacked header for the block. + * @payload: The journal block payload to hold the entry. + * @sbn: The slab block number of the entry to encode. + * @operation: The type of the entry. + * @increment: True if this is an increment. + * + * Exposed for unit tests. + */ +static void +encode_slab_journal_entry(struct slab_journal_block_header *tail_header, + slab_journal_payload *payload, + slab_block_number sbn, + enum journal_operation operation, + bool increment) +{ + journal_entry_count_t entry_number = tail_header->entry_count++; + + if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { + if (!tail_header->has_block_map_increments) { + memset(payload->full_entries.entry_types, + 0, + VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE); + tail_header->has_block_map_increments = true; + } + + payload->full_entries.entry_types[entry_number / 8] |= + ((u8)1 << (entry_number % 8)); + } + + vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment); +} + +/** + * expand_journal_point() - Convert a recovery journal journal_point which refers to both an + * increment and a decrement to a single point which refers to one or the + * other. + * @recovery_point: The journal point to convert. + * @increment: Whether the current entry is an increment. + * + * Return: The expanded journal point + * + * Because each data_vio has but a single recovery journal point, but may need to make both + * increment and decrement entries in the same slab journal. In order to distinguish the two + * entries, the entry count of the expanded journal point is twice the actual recovery journal + * entry count for increments, and one more than that for decrements. + */ +static struct journal_point +expand_journal_point(struct journal_point recovery_point, bool increment) +{ + recovery_point.entry_count *= 2; + if (!increment) + recovery_point.entry_count++; + + return recovery_point; +} + +/** + * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a + * block becomes full. + * @journal: The slab journal to append to. + * @pbn: The pbn being adjusted. + * @operation: The type of entry to make. + * @increment: True if this is an increment. + * @recovery_point: The expanded recovery point. + * + * This function is synchronous. + */ +static void add_entry(struct slab_journal *journal, + physical_block_number_t pbn, + enum journal_operation operation, + bool increment, + struct journal_point recovery_point) +{ + struct packed_slab_journal_block *block = journal->block; + int result; + + result = ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point, + &recovery_point), + "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u", + (unsigned long long) recovery_point.sequence_number, + recovery_point.entry_count, + (unsigned long long) journal->tail_header.recovery_point.sequence_number, + journal->tail_header.recovery_point.entry_count); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); + return; + } + + if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { + result = ASSERT((journal->tail_header.entry_count < + journal->full_entries_per_block), + "block has room for full entries"); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); + return; + } + } + + encode_slab_journal_entry(&journal->tail_header, + &block->payload, + pbn - journal->slab->start, + operation, + increment); + journal->tail_header.recovery_point = recovery_point; + if (block_is_full(journal)) + commit_tail(journal); +} + +static inline block_count_t journal_length(const struct slab_journal *journal) +{ + return journal->tail - journal->head; +} + +/** + * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal. + * @slab: The slab to play into. + * @pbn: The PBN for the entry. + * @operation: The type of entry to add. + * @increment: True if this entry is an increment. + * @recovery_point: The recovery journal point corresponding to this entry. + * @parent: The completion to notify when there is space to add the entry if the entry could not be + * added immediately. + * + * Return: true if the entry was added immediately. + */ +bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, + physical_block_number_t pbn, + enum journal_operation operation, + bool increment, + struct journal_point *recovery_point, + struct vdo_completion *parent) +{ + struct slab_journal *journal = &slab->journal; + struct slab_journal_block_header *header = &journal->tail_header; + struct journal_point expanded = expand_journal_point(*recovery_point, increment); + + /* Only accept entries after the current recovery point. */ + if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded)) + return true; + + if ((header->entry_count >= journal->full_entries_per_block) && + (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) + /* + * The tail block does not have room for the entry we are attempting to add so + * commit the tail block now. + */ + commit_tail(journal); + + if (journal->waiting_to_commit) { + vdo_start_operation_with_waiter(&journal->slab->state, + VDO_ADMIN_STATE_WAITING_FOR_RECOVERY, + parent, + NULL); + return false; + } + + if (journal_length(journal) >= journal->size) { + /* + * We must have reaped the current head before the crash, since the blocked + * threshold keeps us from having more entries than fit in a slab journal; hence we + * can just advance the head (and unreapable block), as needed. + */ + journal->head++; + journal->unreapable++; + } + + if (journal->slab->status == VDO_SLAB_REBUILT) + journal->slab->status = VDO_SLAB_REPLAYING; + + add_entry(journal, pbn, operation, increment, expanded); + return true; +} + +/** + * requires_reaping() - Check whether the journal must be reaped before adding new entries. + * @journal: The journal to check. + * + * Return: true if the journal must be reaped. + */ +static bool requires_reaping(const struct slab_journal *journal) +{ + return (journal_length(journal) >= journal->blocking_threshold); +} + +/** finish_summary_update() - A waiter callback that resets the writing state of a slab. */ +static void finish_summary_update(struct waiter *waiter, void *context) +{ + struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter); + int result = *((int *) context); + + slab->active_count--; + + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + uds_log_error_strerror(result, "failed to update slab summary"); + vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); + } + + check_if_slab_drained(slab); +} + +static void write_reference_block(struct waiter *waiter, void *context); + +/** + * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring + * a VIO for it from the pool. + * @waiter: The waiter of the block which is starting to write. + * @context: The parent slab of the block. + * + * This can be asynchronous since the writer will have to wait if all VIOs in the pool are + * currently in use. + */ +static void launch_reference_block_write(struct waiter *waiter, void *context) +{ + struct vdo_slab *slab = context; + + if (vdo_is_read_only(slab->allocator->depot->vdo)) + return; + + slab->active_count++; + container_of(waiter, struct reference_block, waiter)->is_writing = true; + waiter->callback = write_reference_block; + acquire_vio_from_pool(slab->allocator->vio_pool, waiter); +} + +static void save_dirty_reference_blocks(struct vdo_slab *slab) +{ + vdo_notify_all_waiters(&slab->dirty_blocks, launch_reference_block_write, slab); + check_if_slab_drained(slab); +} + +/** + * finish_reference_block_write() - After a reference block has written, clean it, release its + * locks, and return its VIO to the pool. + * @completion: The VIO that just finished writing. + */ +static void finish_reference_block_write(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = vio_as_pooled_vio(vio); + struct reference_block *block = completion->parent; + struct vdo_slab *slab = block->slab; + tail_block_offset_t offset; + + slab->active_count--; + + /* Release the slab journal lock. */ + adjust_slab_journal_block_reference(&slab->journal, + block->slab_journal_lock_to_release, + -1); + return_vio_to_pool(slab->allocator->vio_pool, pooled); + + /* + * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause + * us to be dirtied again, but we don't want to double enqueue. + */ + block->is_writing = false; + + if (vdo_is_read_only(completion->vdo)) { + check_if_slab_drained(slab); + return; + } + + /* Re-queue the block if it was re-dirtied while it was writing. */ + if (block->is_dirty) { + vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter); + if (vdo_is_state_draining(&slab->state)) + /* We must be saving, and this block will otherwise not be relaunched. */ + save_dirty_reference_blocks(slab); + + return; + } + + /* + * Mark the slab as clean in the slab summary if there are no dirty or writing blocks + * and no summary update in progress. + */ + if ((slab->active_count > 0) || vdo_has_waiters(&slab->dirty_blocks)) + return; + + offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset; + slab->active_count++; + slab->summary_waiter.callback = finish_summary_update; + update_slab_summary_entry(slab, + &slab->summary_waiter, + offset, + true, + true, + slab->free_blocks); +} + +/** + * get_reference_counters_for_block() - Find the reference counters for a given block. + * @block: The reference_block in question. + * + * Return: A pointer to the reference counters for this block. + */ +static vdo_refcount_t * __must_check +get_reference_counters_for_block(struct reference_block *block) +{ + size_t block_index = block - block->slab->reference_blocks; + + return &block->slab->counters[block_index * COUNTS_PER_BLOCK]; +} + +/** + * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out. + * @block: The block to copy. + * @buffer: The char buffer to fill with the packed block. + */ +static void pack_reference_block(struct reference_block *block, void *buffer) +{ + struct packed_reference_block *packed = buffer; + vdo_refcount_t *counters = get_reference_counters_for_block(block); + sector_count_t i; + struct packed_journal_point commit_point; + + vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point); + + for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { + packed->sectors[i].commit_point = commit_point; + memcpy(packed->sectors[i].counts, + counters + (i * COUNTS_PER_SECTOR), + (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR)); + } +} + +static void write_reference_block_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct reference_block *block = vio->completion.parent; + thread_id_t thread_id = block->slab->allocator->thread_id; + + continue_vio_after_io(vio, finish_reference_block_write, thread_id); +} + +/** + * handle_io_error() - Handle an I/O error reading or writing a reference count block. + * @completion: The VIO doing the I/O as a completion. + */ +static void handle_io_error(struct vdo_completion *completion) +{ + int result = completion->result; + struct vio *vio = as_vio(completion); + struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; + + vio_record_metadata_io_error(vio); + return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + slab->active_count--; + vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); + check_if_slab_drained(slab); +} + +/** + * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy + * its counters and associated data into the VIO, and launch the write. + * @waiter: The waiter of the dirty block. + * @context: The VIO returned by the pool. + */ +static void write_reference_block(struct waiter *waiter, void *context) +{ + size_t block_offset; + physical_block_number_t pbn; + struct pooled_vio *pooled = context; + struct vdo_completion *completion = &pooled->vio.completion; + struct reference_block *block = container_of(waiter, struct reference_block, waiter); + + pack_reference_block(block, pooled->vio.data); + block_offset = (block - block->slab->reference_blocks); + pbn = (block->slab->ref_counts_origin + block_offset); + block->slab_journal_lock_to_release = block->slab_journal_lock; + completion->parent = block; + + /* + * Mark the block as clean, since we won't be committing any updates that happen after this + * moment. As long as VIO order is preserved, two VIOs updating this block at once will not + * cause complications. + */ + block->is_dirty = false; + + /* + * Flush before writing to ensure that the recovery journal and slab journal entries which + * cover this reference update are stable (VDO-2331). + */ + WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written, + block->slab->allocator->ref_counts_statistics.blocks_written + 1); + + completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id; + submit_metadata_vio(&pooled->vio, + pbn, + write_reference_block_endio, + handle_io_error, + REQ_OP_WRITE | REQ_PREFLUSH); +} + +static void reclaim_journal_space(struct slab_journal *journal) +{ + block_count_t length = journal_length(journal); + struct vdo_slab *slab = journal->slab; + block_count_t write_count = vdo_count_waiters(&slab->dirty_blocks); + block_count_t written; + + if ((length < journal->flushing_threshold) || (write_count == 0)) + return; + + /* The slab journal is over the first threshold, schedule some reference block writes. */ + WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1); + if (length < journal->flushing_deadline) + /* Schedule more writes the closer to the deadline we get. */ + write_count = max_t(block_count_t, + write_count / (journal->flushing_deadline - length + 1), + 1); + + for (written = 0; written < write_count; written++) + vdo_notify_next_waiter(&slab->dirty_blocks, launch_reference_block_write, slab); +} + +/** + * reference_count_to_status() - Convert a reference count to a reference status. + * @count: The count to convert. + * + * Return: The appropriate reference status. + */ +static enum reference_status __must_check +reference_count_to_status(vdo_refcount_t count) +{ + if (count == EMPTY_REFERENCE_COUNT) + return RS_FREE; + else if (count == 1) + return RS_SINGLE; + else if (count == PROVISIONAL_REFERENCE_COUNT) + return RS_PROVISIONAL; + else + return RS_SHARED; +} + +/** + * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue + * if it wasn't already dirty. + * @block: The reference block to mark as dirty. + */ +static void dirty_block(struct reference_block *block) +{ + if (block->is_dirty) + return; + + block->is_dirty = true; + if (!block->is_writing) + vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter); +} + +/** + * get_reference_block() - Get the reference block that covers the given block index. + */ +static struct reference_block * __must_check +get_reference_block(struct vdo_slab *slab, slab_block_number index) +{ + return &slab->reference_blocks[index / COUNTS_PER_BLOCK]; +} + +/** + * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical + * block number. + * @slab: The slab. + * @physical_block_number: The physical block number. + * @slab_block_number_ptr: A pointer to the slab block number. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check +slab_block_number_from_pbn(struct vdo_slab *slab, + physical_block_number_t physical_block_number, + slab_block_number *slab_block_number_ptr) +{ + u64 slab_block_number; + + if (physical_block_number < slab->start) + return VDO_OUT_OF_RANGE; + + slab_block_number = physical_block_number - slab->start; + if (slab_block_number >= slab->allocator->depot->slab_config.data_blocks) + return VDO_OUT_OF_RANGE; + + *slab_block_number_ptr = slab_block_number; + return VDO_SUCCESS; +} + +/** + * get_reference_counter() - Get the reference counter that covers the given physical block number. + * @slab: The slab to query. + * @pbn: The physical block number. + * @counter_ptr: A pointer to the reference counter. + */ +static int __must_check +get_reference_counter(struct vdo_slab *slab, + physical_block_number_t pbn, + vdo_refcount_t **counter_ptr) +{ + slab_block_number index; + int result = slab_block_number_from_pbn(slab, pbn, &index); + + if (result != VDO_SUCCESS) + return result; + + *counter_ptr = &slab->counters[index]; + + return VDO_SUCCESS; +} + +static unsigned int calculate_slab_priority(struct vdo_slab *slab) +{ + block_count_t free_blocks = slab->free_blocks; + unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority; + unsigned int priority; + + /* + * Wholly full slabs must be the only ones with lowest priority, 0. + * + * Slabs that have never been opened (empty, newly initialized, and never been written to) + * have lower priority than previously opened slabs that have a significant number of free + * blocks. This ranking causes VDO to avoid writing physical blocks for the first time + * unless there are very few free blocks that have been previously written to. + * + * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO + * a better client of any underlying storage that is thinly-provisioned (though discarding + * would be better). + * + * For all other slabs, the priority is derived from the logarithm of the number of free + * blocks. Slabs with the same order of magnitude of free blocks have the same priority. + * With 2^23 blocks, the priority will range from 1 to 25. The reserved + * unopened_slab_priority divides the range and is skipped by the logarithmic mapping. + */ + + if (free_blocks == 0) + return 0; + + if (is_slab_journal_blank(slab)) + return unopened_slab_priority; + + priority = (1 + ilog2(free_blocks)); + return ((priority < unopened_slab_priority) ? priority : priority + 1); +} + +/* + * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab + * so slabs with lots of free blocks with be opened for allocation before slabs that have few free + * blocks. + */ +static void prioritize_slab(struct vdo_slab *slab) +{ + ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), + "a slab must not already be on a ring when prioritizing"); + slab->priority = calculate_slab_priority(slab); + vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, + slab->priority, + &slab->allocq_entry); +} + +/** + * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab. + * @increment: should be true if the free block count went up. + */ +static void adjust_free_block_count(struct vdo_slab *slab, bool increment) +{ + struct block_allocator *allocator = slab->allocator; + + WRITE_ONCE(allocator->allocated_blocks, + allocator->allocated_blocks + (increment ? -1 : 1)); + + /* The open slab doesn't need to be reprioritized until it is closed. */ + if (slab == allocator->open_slab) + return; + + /* Don't bother adjusting the priority table if unneeded. */ + if (slab->priority == calculate_slab_priority(slab)) + return; + + /* + * Reprioritize the slab to reflect the new free block count by removing it from the table + * and re-enqueuing it with the new priority. + */ + vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry); + prioritize_slab(slab); +} + +/** + * increment_for_data() - Increment the reference count for a data block. + * @slab: The slab which owns the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @old_status: The reference status of the data block before this increment. + * @lock: The pbn_lock associated with this increment (may be NULL). + * @counter_ptr: A pointer to the count for the data block (in, out). + * @adjust_block_count: Whether to update the allocator's free block count. + * + * Return: VDO_SUCCESS or an error. + */ +static int increment_for_data(struct vdo_slab *slab, + struct reference_block *block, + slab_block_number block_number, + enum reference_status old_status, + struct pbn_lock *lock, + vdo_refcount_t *counter_ptr, + bool adjust_block_count) +{ + switch (old_status) { + case RS_FREE: + *counter_ptr = 1; + block->allocated_count++; + slab->free_blocks--; + if (adjust_block_count) + adjust_free_block_count(slab, false); + + break; + + case RS_PROVISIONAL: + *counter_ptr = 1; + break; + + default: + /* Single or shared */ + if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) + return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + "Incrementing a block already having 254 references (slab %u, offset %u)", + slab->slab_number, + block_number); + (*counter_ptr)++; + } + + if (lock != NULL) + vdo_unassign_pbn_lock_provisional_reference(lock); + return VDO_SUCCESS; +} + +/** + * decrement_for_data() - Decrement the reference count for a data block. + * @slab: The slab which owns the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @old_status: The reference status of the data block before this decrement. + * @updater: The reference updater doing this operation in case we need to look up the pbn lock. + * @lock: The pbn_lock associated with the block being decremented (may be NULL). + * @counter_ptr: A pointer to the count for the data block (in, out). + * @adjust_block_count: Whether to update the allocator's free block count. + * + * Return: VDO_SUCCESS or an error. + */ +static int decrement_for_data(struct vdo_slab *slab, + struct reference_block *block, + slab_block_number block_number, + enum reference_status old_status, + struct reference_updater *updater, + vdo_refcount_t *counter_ptr, + bool adjust_block_count) +{ + switch (old_status) { + case RS_FREE: + return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + "Decrementing free block at offset %u in slab %u", + block_number, + slab->slab_number); + + case RS_PROVISIONAL: + case RS_SINGLE: + if (updater->zpbn.zone != NULL) { + struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone, + updater->zpbn.pbn); + + if (lock != NULL) { + /* + * There is a read lock on this block, so the block must not become + * unreferenced. + */ + *counter_ptr = PROVISIONAL_REFERENCE_COUNT; + vdo_assign_pbn_lock_provisional_reference(lock); + break; + } + } + + *counter_ptr = EMPTY_REFERENCE_COUNT; + block->allocated_count--; + slab->free_blocks++; + if (adjust_block_count) + adjust_free_block_count(slab, true); + + break; + + default: + /* Shared */ + (*counter_ptr)--; + } + + return VDO_SUCCESS; +} + +/** + * increment_for_block_map() - Increment the reference count for a block map page. + * @slab: The slab which owns the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @old_status: The reference status of the block before this increment. + * @lock: The pbn_lock associated with this increment (may be NULL). + * @normal_operation: Whether we are in normal operation vs. recovery or rebuild. + * @counter_ptr: A pointer to the count for the block (in, out). + * @adjust_block_count: Whether to update the allocator's free block count. + * + * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map + * blocks never dedupe they should never be adjusted from any other state. The adjustment always + * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map + * blocks. + * + * Return: VDO_SUCCESS or an error. + */ +static int increment_for_block_map(struct vdo_slab *slab, + struct reference_block *block, + slab_block_number block_number, + enum reference_status old_status, + struct pbn_lock *lock, + bool normal_operation, + vdo_refcount_t *counter_ptr, + bool adjust_block_count) +{ + switch (old_status) { + case RS_FREE: + if (normal_operation) + return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + "Incrementing unallocated block map block (slab %u, offset %u)", + slab->slab_number, + block_number); + + *counter_ptr = MAXIMUM_REFERENCE_COUNT; + block->allocated_count++; + slab->free_blocks--; + if (adjust_block_count) + adjust_free_block_count(slab, false); + + return VDO_SUCCESS; + + case RS_PROVISIONAL: + if (!normal_operation) + return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + "Block map block had provisional reference during replay (slab %u, offset %u)", + slab->slab_number, + block_number); + + *counter_ptr = MAXIMUM_REFERENCE_COUNT; + if (lock != NULL) + vdo_unassign_pbn_lock_provisional_reference(lock); + return VDO_SUCCESS; + + default: + return uds_log_error_strerror(VDO_REF_COUNT_INVALID, + "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)", + *counter_ptr, + slab->slab_number, + block_number); + } +} + +static bool __must_check is_valid_journal_point(const struct journal_point *point) +{ + return ((point != NULL) && (point->sequence_number > 0)); +} + +/** + * update_reference_count() - Update the reference count of a block. + * @slab: The slab which owns the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @slab_journal_point: The slab journal point at which this update is journaled. + * @updater: The reference updater. + * @normal_operation: Whether we are in normal operation vs. recovery or rebuild. + * @adjust_block_count: Whether to update the slab's free block count. + * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement + * of a provisional reference. + * + * Return: VDO_SUCCESS or an error. + */ +static int +update_reference_count(struct vdo_slab *slab, + struct reference_block *block, + slab_block_number block_number, + const struct journal_point *slab_journal_point, + struct reference_updater *updater, + bool normal_operation, + bool adjust_block_count, + bool *provisional_decrement_ptr) +{ + vdo_refcount_t *counter_ptr = &slab->counters[block_number]; + enum reference_status old_status = reference_count_to_status(*counter_ptr); + int result; + + if (!updater->increment) { + result = decrement_for_data(slab, + block, + block_number, + old_status, + updater, + counter_ptr, + adjust_block_count); + if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) { + if (provisional_decrement_ptr != NULL) + *provisional_decrement_ptr = true; + return VDO_SUCCESS; + } + } else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) { + result = increment_for_data(slab, + block, + block_number, + old_status, + updater->lock, + counter_ptr, + adjust_block_count); + } else { + result = increment_for_block_map(slab, + block, + block_number, + old_status, + updater->lock, + normal_operation, + counter_ptr, + adjust_block_count); + } + + if (result != VDO_SUCCESS) + return result; + + if (is_valid_journal_point(slab_journal_point)) + slab->slab_journal_point = *slab_journal_point; + + return VDO_SUCCESS; +} + +static int __must_check +adjust_reference_count(struct vdo_slab *slab, + struct reference_updater *updater, + const struct journal_point *slab_journal_point) +{ + slab_block_number block_number; + int result; + struct reference_block *block; + bool provisional_decrement = false; + + if (!is_slab_open(slab)) + return VDO_INVALID_ADMIN_STATE; + + result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number); + if (result != VDO_SUCCESS) + return result; + + block = get_reference_block(slab, block_number); + result = update_reference_count(slab, + block, + block_number, + slab_journal_point, + updater, + NORMAL_OPERATION, + true, + &provisional_decrement); + if ((result != VDO_SUCCESS) || provisional_decrement) + return result; + + if (block->is_dirty && (block->slab_journal_lock > 0)) { + sequence_number_t entry_lock = slab_journal_point->sequence_number; + /* + * This block is already dirty and a slab journal entry has been made for it since + * the last time it was clean. We must release the per-entry slab journal lock for + * the entry associated with the update we are now doing. + */ + result = ASSERT(is_valid_journal_point(slab_journal_point), + "Reference count adjustments need slab journal points."); + if (result != VDO_SUCCESS) + return result; + + adjust_slab_journal_block_reference(&slab->journal, entry_lock, -1); + return VDO_SUCCESS; + } + + /* + * This may be the first time we are applying an update for which there is a slab journal + * entry to this block since the block was cleaned. Therefore, we convert the per-entry + * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock. + */ + if (is_valid_journal_point(slab_journal_point)) + block->slab_journal_lock = slab_journal_point->sequence_number; + else + block->slab_journal_lock = 0; + + dirty_block(block); + return VDO_SUCCESS; +} + +/** + * add_entry_from_waiter() - Add an entry to the slab journal. + * @waiter: The vio which should make an entry now. + * @context: The slab journal to make an entry in. + * + * This callback is invoked by add_entries() once it has determined that we are ready to make + * another entry in the slab journal. Implements waiter_callback. + */ +static void add_entry_from_waiter(struct waiter *waiter, void *context) +{ + int result; + struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter); + struct data_vio *data_vio = data_vio_from_reference_updater(updater); + struct slab_journal *journal = context; + struct slab_journal_block_header *header = &journal->tail_header; + struct journal_point slab_journal_point = { + .sequence_number = header->sequence_number, + .entry_count = header->entry_count, + }; + sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number; + + if (header->entry_count == 0) { + /* + * This is the first entry in the current tail block, so get a lock on the recovery + * journal which we will hold until this tail block is committed. + */ + get_lock(journal, header->sequence_number)->recovery_start = recovery_block; + if (journal->recovery_journal != NULL) { + zone_count_t zone_number = journal->slab->allocator->zone_number; + + vdo_acquire_recovery_journal_block_reference(journal->recovery_journal, + recovery_block, + VDO_ZONE_TYPE_PHYSICAL, + zone_number); + } + + mark_slab_journal_dirty(journal, recovery_block); + reclaim_journal_space(journal); + } + + add_entry(journal, + updater->zpbn.pbn, + updater->operation, + updater->increment, + expand_journal_point(data_vio->recovery_journal_point, updater->increment)); + + if (journal->slab->status != VDO_SLAB_REBUILT) { + /* + * If the slab is unrecovered, scrubbing will take care of the count since the + * update is now recorded in the journal. + */ + adjust_slab_journal_block_reference(journal, + slab_journal_point.sequence_number, + -1); + result = VDO_SUCCESS; + } else { + /* Now that an entry has been made in the slab journal, update the counter. */ + result = adjust_reference_count(journal->slab, updater, &slab_journal_point); + } + + if (updater->increment) + continue_data_vio_with_error(data_vio, result); + else + vdo_continue_completion(&data_vio->decrement_completion, result); +} + +/** + * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map + * increment. + * @journal: The journal. + * + * Return: true if the first entry waiter's operation is a block map increment. + */ +static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal) +{ + struct waiter *waiter = vdo_get_first_waiter(&journal->entry_waiters); + struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter); + + return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING); +} + +/** + * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries. + * @journal: The journal to which entries may be added. + * + * By processing the queue in order, we ensure that slab journal entries are made in the same order + * as recovery journal entries for the same increment or decrement. + */ +static void add_entries(struct slab_journal *journal) +{ + if (journal->adding_entries) + /* Protect against re-entrancy. */ + return; + + journal->adding_entries = true; + while (vdo_has_waiters(&journal->entry_waiters)) { + struct slab_journal_block_header *header = &journal->tail_header; + + if (journal->partial_write_in_progress || + (journal->slab->status == VDO_SLAB_REBUILDING)) + /* + * Don't add entries while rebuilding or while a partial write is + * outstanding (VDO-2399). + */ + break; + + if (journal->waiting_to_commit) { + /* + * If we are waiting for resources to write the tail block, and the tail + * block is full, we can't make another entry. + */ + WRITE_ONCE(journal->events->tail_busy_count, + journal->events->tail_busy_count + 1); + break; + } else if (is_next_entry_a_block_map_increment(journal) && + (header->entry_count >= journal->full_entries_per_block)) { + /* + * The tail block does not have room for a block map increment, so commit + * it now. + */ + commit_tail(journal); + if (journal->waiting_to_commit) { + WRITE_ONCE(journal->events->tail_busy_count, + journal->events->tail_busy_count + 1); + break; + } + } + + /* If the slab is over the blocking threshold, make the vio wait. */ + if (requires_reaping(journal)) { + WRITE_ONCE(journal->events->blocked_count, + journal->events->blocked_count + 1); + save_dirty_reference_blocks(journal->slab); + break; + } + + if (header->entry_count == 0) { + struct journal_lock *lock = get_lock(journal, header->sequence_number); + + /* + * Check if the on disk slab journal is full. Because of the blocking and + * scrubbing thresholds, this should never happen. + */ + if (lock->count > 0) { + ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, + "New block has locks, but journal is not full"); + + /* + * The blocking threshold must let the journal fill up if the new + * block has locks; if the blocking threshold is smaller than the + * journal size, the new block cannot possibly have locks already. + */ + ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size), + "New block can have locks already iff blocking threshold is at the end of the journal"); + + WRITE_ONCE(journal->events->disk_full_count, + journal->events->disk_full_count + 1); + save_dirty_reference_blocks(journal->slab); + break; + } + + /* + * Don't allow the new block to be reaped until all of the reference count + * blocks are written and the journal block has been fully committed as + * well. + */ + lock->count = journal->entries_per_block + 1; + + if (header->sequence_number == 1) { + struct vdo_slab *slab = journal->slab; + block_count_t i; + + /* + * This is the first entry in this slab journal, ever. Dirty all of + * the reference count blocks. Each will acquire a lock on the tail + * block so that the journal won't be reaped until the reference + * counts are initialized. The lock acquisition must be done by the + * ref_counts since here we don't know how many reference blocks + * the ref_counts has. + */ + for (i = 0; i < slab->reference_block_count; i++) { + slab->reference_blocks[i].slab_journal_lock = 1; + dirty_block(&slab->reference_blocks[i]); + } + + adjust_slab_journal_block_reference(journal, + 1, + slab->reference_block_count); + } + } + + vdo_notify_next_waiter(&journal->entry_waiters, add_entry_from_waiter, journal); + } + + journal->adding_entries = false; + + /* If there are no waiters, and we are flushing or saving, commit the tail block. */ + if (vdo_is_state_draining(&journal->slab->state) && + !vdo_is_state_suspending(&journal->slab->state) && + !vdo_has_waiters(&journal->entry_waiters)) + commit_tail(journal); +} + +/** + * reset_search_cursor() - Reset the free block search back to the first reference counter in the + * first reference block of a slab. + */ +static void reset_search_cursor(struct vdo_slab *slab) +{ + struct search_cursor *cursor = &slab->search_cursor; + + cursor->block = cursor->first_block; + cursor->index = 0; + /* Unit tests have slabs with only one reference block (and it's a runt). */ + cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count); +} + +/** + * advance_search_cursor() - Advance the search cursor to the start of the next reference block in + * a slab, + * + * Wraps around to the first reference block if the current block is the last reference block. + * + * Return: true unless the cursor was at the last reference block. + */ +static bool advance_search_cursor(struct vdo_slab *slab) +{ + struct search_cursor *cursor = &slab->search_cursor; + + /* + * If we just finished searching the last reference block, then wrap back around to the + * start of the array. + */ + if (cursor->block == cursor->last_block) { + reset_search_cursor(slab); + return false; + } + + /* We're not already at the end, so advance to cursor to the next block. */ + cursor->block++; + cursor->index = cursor->end_index; + + if (cursor->block == cursor->last_block) + /* The last reference block will usually be a runt. */ + cursor->end_index = slab->block_count; + else + cursor->end_index += COUNTS_PER_BLOCK; + return true; +} + +/** + * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot, + physical_block_number_t pbn, + enum journal_operation operation) +{ + int result; + slab_block_number block_number; + struct reference_block *block; + struct vdo_slab *slab = vdo_get_slab(depot, pbn); + struct reference_updater updater = { + .operation = operation, + .increment = true, + }; + + result = slab_block_number_from_pbn(slab, pbn, &block_number); + if (result != VDO_SUCCESS) + return result; + + block = get_reference_block(slab, block_number); + result = update_reference_count(slab, + block, + block_number, + NULL, + &updater, + !NORMAL_OPERATION, + false, + NULL); + if (result != VDO_SUCCESS) + return result; + + dirty_block(block); + return VDO_SUCCESS; +} + +/** + * replay_reference_count_change() - Replay the reference count adjustment from a slab journal + * entry into the reference count for a block. + * @slab: The slab. + * @entry_point: The slab journal point for the entry. + * @entry: The slab journal entry being replayed. + * + * The adjustment will be ignored if it was already recorded in the reference count. + * + * Return: VDO_SUCCESS or an error code. + */ +static int +replay_reference_count_change(struct vdo_slab *slab, + const struct journal_point *entry_point, + struct slab_journal_entry entry) +{ + int result; + struct reference_block *block = get_reference_block(slab, entry.sbn); + sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; + struct reference_updater updater = { + .operation = entry.operation, + .increment = entry.increment, + }; + + if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) + /* This entry is already reflected in the existing counts, so do nothing. */ + return VDO_SUCCESS; + + /* This entry is not yet counted in the reference counts. */ + result = update_reference_count(slab, + block, + entry.sbn, + entry_point, + &updater, + !NORMAL_OPERATION, + false, + NULL); + if (result != VDO_SUCCESS) + return result; + + dirty_block(block); + return VDO_SUCCESS; +} + +/** + * find_zero_byte_in_word() - Find the array index of the first zero byte in word-sized range of + * reference counters. + * @word_ptr: A pointer to the eight counter bytes to check. + * @start_index: The array index corresponding to word_ptr[0]. + * @fail_index: The array index to return if no zero byte is found. + * + * The search does no bounds checking; the function relies on the array being sufficiently padded. + * + * Return: The array index of the first zero byte in the word, or the value passed as fail_index if + * no zero byte was found. + */ +static inline slab_block_number +find_zero_byte_in_word(const u8 *word_ptr, + slab_block_number start_index, + slab_block_number fail_index) +{ + u64 word = get_unaligned_le64(word_ptr); + + /* This looks like a loop, but GCC will unroll the eight iterations for us. */ + unsigned int offset; + + for (offset = 0; offset < BYTES_PER_WORD; offset++) { + /* Assumes little-endian byte order, which we have on X86. */ + if ((word & 0xFF) == 0) + return (start_index + offset); + word >>= 8; + } + + return fail_index; +} + +/** + * vdo_find_free_block() - Find the first block with a reference count of zero in the specified + * range of reference counter indexes. + * @slab: The slab counters to scan. + * @index_ptr: A pointer to hold the array index of the free block. + * + * Exposed for unit testing. + * + * Return: true if a free block was found in the specified range. + */ +static bool +find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr) +{ + slab_block_number zero_index; + slab_block_number next_index = slab->search_cursor.index; + slab_block_number end_index = slab->search_cursor.end_index; + u8 *next_counter = &slab->counters[next_index]; + u8 *end_counter = &slab->counters[end_index]; + + /* + * Search every byte of the first unaligned word. (Array is padded so reading past end is + * safe.) + */ + zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); + if (zero_index < end_index) { + *index_ptr = zero_index; + return true; + } + + /* + * On architectures where unaligned word access is expensive, this would be a good place to + * advance to an alignment boundary. + */ + next_index += BYTES_PER_WORD; + next_counter += BYTES_PER_WORD; + + /* + * Now we're word-aligned; check an word at a time until we find a word containing a zero. + * (Array is padded so reading past end is safe.) + */ + while (next_counter < end_counter) { + /* + * The following code is currently an exact copy of the code preceding the loop, + * but if you try to merge them by using a do loop, it runs slower because a jump + * instruction gets added at the start of the iteration. + */ + zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); + if (zero_index < end_index) { + *index_ptr = zero_index; + return true; + } + + next_index += BYTES_PER_WORD; + next_counter += BYTES_PER_WORD; + } + + return false; +} + +/** + * search_current_reference_block() - Search the reference block currently saved in the search + * cursor for a reference count of zero, starting at the saved + * counter index. + * @slab: The slab to search. + * @free_index_ptr: A pointer to receive the array index of the zero reference count. + * + * Return: true if an unreferenced counter was found. + */ +static bool search_current_reference_block(const struct vdo_slab *slab, + slab_block_number *free_index_ptr) +{ + /* Don't bother searching if the current block is known to be full. */ + return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) && + find_free_block(slab, free_index_ptr)); +} + +/** + * search_reference_blocks() - Search each reference block for a reference count of zero. + * @slab: The slab to search. + * @free_index_ptr: A pointer to receive the array index of the zero reference count. + * + * Searches each reference block for a reference count of zero, starting at the reference block and + * counter index saved in the search cursor and searching up to the end of the last reference + * block. The search does not wrap. + * + * Return: true if an unreferenced counter was found. + */ +static bool +search_reference_blocks(struct vdo_slab *slab, slab_block_number *free_index_ptr) +{ + /* Start searching at the saved search position in the current block. */ + if (search_current_reference_block(slab, free_index_ptr)) + return true; + + /* Search each reference block up to the end of the slab. */ + while (advance_search_cursor(slab)) + if (search_current_reference_block(slab, free_index_ptr)) + return true; + + return false; +} + +/** + * make_provisional_reference() - Do the bookkeeping for making a provisional reference. + */ +static void +make_provisional_reference(struct vdo_slab *slab, slab_block_number block_number) +{ + struct reference_block *block = get_reference_block(slab, block_number); + + /* + * Make the initial transition from an unreferenced block to a + * provisionally allocated block. + */ + slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT; + + /* Account for the allocation. */ + block->allocated_count++; + slab->free_blocks--; +} + +/** + * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty. + */ +static void dirty_all_reference_blocks(struct vdo_slab *slab) +{ + block_count_t i; + + for (i = 0; i < slab->reference_block_count; i++) + dirty_block(&slab->reference_blocks[i]); +} + +/** + * clear_provisional_references() - Clear the provisional reference counts from a reference block. + * @block: The block to clear. + */ +static void clear_provisional_references(struct reference_block *block) +{ + vdo_refcount_t *counters = get_reference_counters_for_block(block); + block_count_t j; + + for (j = 0; j < COUNTS_PER_BLOCK; j++) { + if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { + counters[j] = EMPTY_REFERENCE_COUNT; + block->allocated_count--; + } + } +} + +static inline bool journal_points_equal(struct journal_point first, struct journal_point second) +{ + return ((first.sequence_number == second.sequence_number) && + (first.entry_count == second.entry_count)); +} + +/** + * unpack_reference_block() - Unpack reference counts blocks into the internal memory structure. + * @packed: The written reference block to be unpacked. + * @block: The internal reference block to be loaded. + */ +static void +unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) +{ + block_count_t index; + sector_count_t i; + struct vdo_slab *slab = block->slab; + vdo_refcount_t *counters = get_reference_counters_for_block(block); + + for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { + struct packed_reference_sector *sector = &packed->sectors[i]; + + vdo_unpack_journal_point(§or->commit_point, &block->commit_points[i]); + memcpy(counters + (i * COUNTS_PER_SECTOR), + sector->counts, + (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR)); + /* The slab_journal_point must be the latest point found in any sector. */ + if (vdo_before_journal_point(&slab->slab_journal_point, &block->commit_points[i])) + slab->slab_journal_point = block->commit_points[i]; + + if ((i > 0) && + !journal_points_equal(block->commit_points[0], block->commit_points[i])) { + size_t block_index = block - block->slab->reference_blocks; + + uds_log_warning("Torn write detected in sector %u of reference block %zu of slab %u", + i, + block_index, + block->slab->slab_number); + } + } + + block->allocated_count = 0; + for (index = 0; index < COUNTS_PER_BLOCK; index++) + if (counters[index] != EMPTY_REFERENCE_COUNT) + block->allocated_count++; +} + +/** + * finish_reference_block_load() - After a reference block has been read, unpack it. + * @completion: The VIO that just finished reading. + */ +static void finish_reference_block_load(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + struct pooled_vio *pooled = vio_as_pooled_vio(vio); + struct reference_block *block = completion->parent; + struct vdo_slab *slab = block->slab; + + unpack_reference_block((struct packed_reference_block *) vio->data, block); + return_vio_to_pool(slab->allocator->vio_pool, pooled); + slab->active_count--; + clear_provisional_references(block); + + slab->free_blocks -= block->allocated_count; + check_if_slab_drained(slab); +} + +static void load_reference_block_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct reference_block *block = vio->completion.parent; + + continue_vio_after_io(vio, finish_reference_block_load, block->slab->allocator->thread_id); +} + +/** + * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the + * block. + * @waiter: The waiter of the block to load. + * @context: The VIO returned by the pool. + */ +static void load_reference_block(struct waiter *waiter, void *context) +{ + struct pooled_vio *pooled = context; + struct vio *vio = &pooled->vio; + struct reference_block *block = container_of(waiter, struct reference_block, waiter); + size_t block_offset = (block - block->slab->reference_blocks); + + vio->completion.parent = block; + submit_metadata_vio(vio, + block->slab->ref_counts_origin + block_offset, + load_reference_block_endio, + handle_io_error, + REQ_OP_READ); +} + +/** + * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a + * pre-allocated reference counter. + */ +static void load_reference_blocks(struct vdo_slab *slab) +{ + block_count_t i; + + slab->free_blocks = slab->block_count; + slab->active_count = slab->reference_block_count; + for (i = 0; i < slab->reference_block_count; i++) { + struct waiter *waiter = &slab->reference_blocks[i].waiter; + + waiter->callback = load_reference_block; + acquire_vio_from_pool(slab->allocator->vio_pool, waiter); + } +} + +/** + * drain_slab() - Drain all reference count I/O. + * + * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the + * reference blocks may be loaded from disk or dirty reference blocks may be written out. + */ +static void drain_slab(struct vdo_slab *slab) +{ + bool save; + bool load; + const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state); + + if (state == VDO_ADMIN_STATE_SUSPENDING) + return; + + if ((state != VDO_ADMIN_STATE_REBUILDING) && (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING)) + commit_tail(&slab->journal); + + if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL)) + return; + + save = false; + load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts; + if (state == VDO_ADMIN_STATE_SCRUBBING) { + if (load) { + load_reference_blocks(slab); + return; + } + } else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) { + if (!load) + /* These reference counts were never written, so mark them all dirty. */ + dirty_all_reference_blocks(slab); + + save = true; + } else if (state == VDO_ADMIN_STATE_REBUILDING) { + /* + * Write out the counters if the slab has written them before, or it has any + * non-zero reference counts, or there are any slab journal blocks. + */ + block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks; + + if (load || + (slab->free_blocks != data_blocks) || + !is_slab_journal_blank(slab)) { + dirty_all_reference_blocks(slab); + save = true; + } + } else if (state == VDO_ADMIN_STATE_SAVING) { + save = (slab->status == VDO_SLAB_REBUILT); + } else { + vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS); + return; + } + + if (save) + save_dirty_reference_blocks(slab); +} + +static int allocate_slab_counters(struct vdo_slab *slab) +{ + int result; + size_t index, bytes; + + result = ASSERT(slab->reference_blocks == NULL, + "vdo_slab %u doesn't allocate refcounts twice", + slab->slab_number); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(slab->reference_block_count, + struct reference_block, + __func__, + &slab->reference_blocks); + if (result != VDO_SUCCESS) + return result; + + /* + * Allocate such that the runt slab has a full-length memory array, plus a little padding + * so we can word-search even at the very end. + */ + bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); + result = UDS_ALLOCATE(bytes, vdo_refcount_t, "ref counts array", &slab->counters); + if (result != UDS_SUCCESS) { + UDS_FREE(UDS_FORGET(slab->reference_blocks)); + return result; + } + + slab->search_cursor.first_block = slab->reference_blocks; + slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1]; + reset_search_cursor(slab); + + for (index = 0; index < slab->reference_block_count; index++) { + slab->reference_blocks[index] = (struct reference_block) { + .slab = slab, + }; + } + + return VDO_SUCCESS; +} + +static int allocate_counters_if_clean(struct vdo_slab *slab) +{ + if (vdo_is_state_clean_load(&slab->state)) + return allocate_slab_counters(slab); + + return VDO_SUCCESS; +} + +static void finish_loading_journal(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + struct slab_journal *journal = completion->parent; + struct vdo_slab *slab = journal->slab; + struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data; + struct slab_journal_block_header header; + + vdo_unpack_slab_journal_block_header(&block->header, &header); + + /* FIXME: should it be an error if the following conditional fails? */ + if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) && + (header.nonce == slab->allocator->nonce)) { + journal->tail = header.sequence_number + 1; + + /* + * If the slab is clean, this implies the slab journal is empty, so advance the + * head appropriately. + */ + journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ? + header.head : + journal->tail); + journal->tail_header = header; + initialize_journal_state(journal); + } + + return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); +} + +static void read_slab_journal_tail_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_journal *journal = vio->completion.parent; + + continue_vio_after_io(vio, finish_loading_journal, journal->slab->allocator->thread_id); +} + +static void handle_load_error(struct vdo_completion *completion) +{ + int result = completion->result; + struct slab_journal *journal = completion->parent; + struct vio *vio = as_vio(completion); + + vio_record_metadata_io_error(vio); + return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + vdo_finish_loading_with_result(&journal->slab->state, result); +} + +/** + * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio + * pool. + * @waiter: The vio pool waiter which has just been notified. + * @context: The vio pool entry given to the waiter. + * + * This is the success callback from acquire_vio_from_pool() when loading a slab journal. + */ +static void read_slab_journal_tail(struct waiter *waiter, void *context) +{ + struct slab_journal *journal = container_of(waiter, struct slab_journal, resource_waiter); + struct vdo_slab *slab = journal->slab; + struct pooled_vio *pooled = context; + struct vio *vio = &pooled->vio; + tail_block_offset_t last_commit_point = + slab->allocator->summary_entries[slab->slab_number].tail_block_offset; + + /* + * Slab summary keeps the commit point offset, so the tail block is the block before that. + * Calculation supports small journals in unit tests. + */ + tail_block_offset_t tail_block = ((last_commit_point == 0) ? + (tail_block_offset_t)(journal->size - 1) : + (last_commit_point - 1)); + + vio->completion.parent = journal; + vio->completion.callback_thread_id = slab->allocator->thread_id; + submit_metadata_vio(vio, + slab->journal_origin + tail_block, + read_slab_journal_tail_endio, + handle_load_error, + REQ_OP_READ); +} + +/** + * load_slab_journal() - Load a slab's journal by reading the journal's tail. + */ +static void load_slab_journal(struct vdo_slab *slab) +{ + struct slab_journal *journal = &slab->journal; + tail_block_offset_t last_commit_point; + + last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset; + if ((last_commit_point == 0) && + !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) { + /* + * This slab claims that it has a tail block at (journal->size - 1), but a head of + * 1. This is impossible, due to the scrubbing threshold, on a real system, so + * don't bother reading the (bogus) data off disk. + */ + ASSERT_LOG_ONLY(((journal->size < 16) || + (journal->scrubbing_threshold < (journal->size - 1))), + "Scrubbing threshold protects against reads of unwritten slab journal blocks"); + vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); + return; + } + + journal->resource_waiter.callback = read_slab_journal_tail; + acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter); +} + +static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority) +{ + struct slab_scrubber *scrubber = &slab->allocator->scrubber; + + ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT), "slab to be scrubbed is unrecovered"); + + if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING) + return; + + list_del_init(&slab->allocq_entry); + if (!slab->was_queued_for_scrubbing) { + WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1); + slab->was_queued_for_scrubbing = true; + } + + if (high_priority) { + slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING; + list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs); + return; + } + + list_add_tail(&slab->allocq_entry, &scrubber->slabs); +} + +/* Queue a slab for allocation or scrubbing. */ +static void queue_slab(struct vdo_slab *slab) +{ + struct block_allocator *allocator = slab->allocator; + block_count_t free_blocks; + int result; + + ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), + "a requeued slab must not already be on a ring"); + + if (vdo_is_read_only(allocator->depot->vdo)) + return; + + free_blocks = slab->free_blocks; + result = ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks), + "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)", + slab->slab_number, + (unsigned long long) free_blocks, + (unsigned long long) allocator->depot->slab_config.data_blocks); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(allocator->depot->vdo, result); + return; + } + + if (slab->status != VDO_SLAB_REBUILT) { + register_slab_for_scrubbing(slab, false); + return; + } + + if (!vdo_is_state_resuming(&slab->state)) { + /* + * If the slab is resuming, we've already accounted for it here, so don't do it + * again. + * FIXME: under what situation would the slab be resuming here? + */ + WRITE_ONCE(allocator->allocated_blocks, allocator->allocated_blocks - free_blocks); + if (!is_slab_journal_blank(slab)) + WRITE_ONCE(allocator->statistics.slabs_opened, + allocator->statistics.slabs_opened + 1); + } + + if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING) + reopen_slab_journal(slab); + + prioritize_slab(slab); +} + +/** + * initiate_slab_action() - Initiate a slab action. + * + * Implements vdo_admin_initiator. + */ +static void initiate_slab_action(struct admin_state *state) +{ + struct vdo_slab *slab = container_of(state, struct vdo_slab, state); + + if (vdo_is_state_draining(state)) { + const struct admin_state_code *operation = vdo_get_admin_state_code(state); + + if (operation == VDO_ADMIN_STATE_SCRUBBING) + slab->status = VDO_SLAB_REBUILDING; + + drain_slab(slab); + check_if_slab_drained(slab); + return; + } + + if (vdo_is_state_loading(state)) { + load_slab_journal(slab); + return; + } + + if (vdo_is_state_resuming(state)) { + queue_slab(slab); + vdo_finish_resuming(state); + return; + } + + vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE); +} + +/** + * get_next_slab() - Get the next slab to scrub. + * @scrubber: The slab scrubber. + * + * Return: The next slab to scrub or NULL if there are none. + */ +static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber) +{ + struct vdo_slab *slab; + + slab = list_first_entry_or_null(&scrubber->high_priority_slabs, + struct vdo_slab, + allocq_entry); + if (slab != NULL) + return slab; + + return list_first_entry_or_null(&scrubber->slabs, struct vdo_slab, allocq_entry); +} + +/** + * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub. + * @scrubber: The scrubber to check. + * + * Return: true if the scrubber has slabs to scrub. + */ +static bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber) +{ + return (get_next_slab(scrubber) != NULL); +} + +/** + * uninitialize_scrubber_vio() - Clean up the slab_scrubber's vio. + * @scrubber: The scrubber. + */ +static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber) +{ + UDS_FREE(UDS_FORGET(scrubber->vio.data)); + free_vio_components(&scrubber->vio); +} + +/** + * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because + * there's been an error. + * @scrubber: The scrubber. + */ +static void finish_scrubbing(struct slab_scrubber *scrubber, int result) +{ + bool notify = vdo_has_waiters(&scrubber->waiters); + bool done = !has_slabs_to_scrub(scrubber); + struct block_allocator *allocator = + container_of(scrubber, struct block_allocator, scrubber); + + if (done) + uninitialize_scrubber_vio(scrubber); + + if (scrubber->high_priority_only) { + scrubber->high_priority_only = false; + vdo_fail_completion(UDS_FORGET(scrubber->vio.completion.parent), result); + } else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) { + /* All of our slabs were scrubbed, and we're the last allocator to finish. */ + enum vdo_state prior_state = + atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING, VDO_DIRTY); + + /* + * To be safe, even if the CAS failed, ensure anything that follows is ordered with + * respect to whatever state change did happen. + */ + smp_mb__after_atomic(); + + /* + * We must check the VDO state here and not the depot's read_only_notifier since + * the compare-swap-above could have failed due to a read-only entry which our own + * thread does not yet know about. + */ + if (prior_state == VDO_DIRTY) + uds_log_info("VDO commencing normal operation"); + else if (prior_state == VDO_RECOVERING) + uds_log_info("Exiting recovery mode"); + } + + /* + * Note that the scrubber has stopped, and inform anyone who might be waiting for that to + * happen. + */ + if (!vdo_finish_draining(&scrubber->admin_state)) + WRITE_ONCE(scrubber->admin_state.current_state, VDO_ADMIN_STATE_SUSPENDED); + + /* + * We can't notify waiters until after we've finished draining or they'll just requeue. + * Fortunately if there were waiters, we can't have been freed yet. + */ + if (notify) + vdo_notify_all_waiters(&scrubber->waiters, NULL, NULL); +} + +static void scrub_next_slab(struct slab_scrubber *scrubber); + +/** + * slab_scrubbed() - Notify the scrubber that a slab has been scrubbed. + * @completion: The slab rebuild completion. + * + * This callback is registered in apply_journal_entries(). + */ +static void slab_scrubbed(struct vdo_completion *completion) +{ + struct slab_scrubber *scrubber = + container_of(as_vio(completion), struct slab_scrubber, vio); + struct vdo_slab *slab = scrubber->slab; + + slab->status = VDO_SLAB_REBUILT; + queue_slab(slab); + reopen_slab_journal(slab); + WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1); + scrub_next_slab(scrubber); +} + +/** + * abort_scrubbing() - Abort scrubbing due to an error. + * @scrubber: The slab scrubber. + * @result: The error. + */ +static void abort_scrubbing(struct slab_scrubber *scrubber, int result) +{ + vdo_enter_read_only_mode(scrubber->vio.completion.vdo, result); + finish_scrubbing(scrubber, result); +} + +/** + * handle_scrubber_error() - Handle errors while rebuilding a slab. + * @completion: The slab rebuild completion. + */ +static void handle_scrubber_error(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + + vio_record_metadata_io_error(vio); + abort_scrubbing(container_of(vio, struct slab_scrubber, vio), completion->result); +} + +/** + * apply_block_entries() - Apply all the entries in a block to the reference counts. + * @block: A block with entries to apply. + * @entry_count: The number of entries to apply. + * @block_number: The sequence number of the block. + * @slab: The slab to apply the entries to. + * + * Return: VDO_SUCCESS or an error code. + */ +static int apply_block_entries(struct packed_slab_journal_block *block, + journal_entry_count_t entry_count, + sequence_number_t block_number, + struct vdo_slab *slab) +{ + struct journal_point entry_point = { + .sequence_number = block_number, + .entry_count = 0, + }; + int result; + slab_block_number max_sbn = slab->end - slab->start; + + while (entry_point.entry_count < entry_count) { + struct slab_journal_entry entry = + vdo_decode_slab_journal_entry(block, entry_point.entry_count); + + if (entry.sbn > max_sbn) + /* This entry is out of bounds. */ + return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)", + (unsigned long long) block_number, + entry_point.entry_count, + entry.sbn, + max_sbn); + + result = replay_reference_count_change(slab, &entry_point, entry); + if (result != VDO_SUCCESS) { + uds_log_error_strerror(result, + "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u", + (unsigned long long) block_number, + entry_point.entry_count, + vdo_get_journal_operation_name(entry.operation), + entry.sbn, + slab->slab_number); + return result; + } + entry_point.entry_count++; + } + + return VDO_SUCCESS; +} + +/** + * apply_journal_entries() - Find the relevant vio of the slab journal and apply all valid entries. + * @completion: The metadata read vio completion. + * + * This is a callback registered in start_scrubbing(). + */ +static void apply_journal_entries(struct vdo_completion *completion) +{ + int result; + struct slab_scrubber *scrubber + = container_of(as_vio(completion), struct slab_scrubber, vio); + struct vdo_slab *slab = scrubber->slab; + struct slab_journal *journal = &slab->journal; + + /* Find the boundaries of the useful part of the journal. */ + sequence_number_t tail = journal->tail; + tail_block_offset_t end_index = (tail - 1) % journal->size; + char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE); + struct packed_slab_journal_block *end_block = + (struct packed_slab_journal_block *) end_data; + + sequence_number_t head = __le64_to_cpu(end_block->header.head); + tail_block_offset_t head_index = head % journal->size; + block_count_t index = head_index; + + struct journal_point ref_counts_point = slab->slab_journal_point; + struct journal_point last_entry_applied = ref_counts_point; + sequence_number_t sequence; + + for (sequence = head; sequence < tail; sequence++) { + char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE); + struct packed_slab_journal_block *block = + (struct packed_slab_journal_block *) block_data; + struct slab_journal_block_header header; + + vdo_unpack_slab_journal_block_header(&block->header, &header); + + if ((header.nonce != slab->allocator->nonce) || + (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) || + (header.sequence_number != sequence) || + (header.entry_count > journal->entries_per_block) || + (header.has_block_map_increments && + (header.entry_count > journal->full_entries_per_block))) { + /* The block is not what we expect it to be. */ + uds_log_error("vdo_slab journal block for slab %u was invalid", + slab->slab_number); + abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL); + return; + } + + result = apply_block_entries(block, header.entry_count, sequence, slab); + if (result != VDO_SUCCESS) { + abort_scrubbing(scrubber, result); + return; + } + + last_entry_applied.sequence_number = sequence; + last_entry_applied.entry_count = header.entry_count - 1; + index++; + if (index == journal->size) + index = 0; + } + + /* + * At the end of rebuild, the reference counters should be accurate to the end of the + * journal we just applied. + */ + result = ASSERT(!vdo_before_journal_point(&last_entry_applied, &ref_counts_point), + "Refcounts are not more accurate than the slab journal"); + if (result != VDO_SUCCESS) { + abort_scrubbing(scrubber, result); + return; + } + + /* Save out the rebuilt reference blocks. */ + vdo_prepare_completion(completion, + slab_scrubbed, + handle_scrubber_error, + slab->allocator->thread_id, + completion->parent); + vdo_start_operation_with_waiter(&slab->state, + VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING, + completion, + initiate_slab_action); +} + +static void read_slab_journal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio); + + continue_vio_after_io(bio->bi_private, + apply_journal_entries, + scrubber->slab->allocator->thread_id); +} + +/** + * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed. + * @completion: The scrubber's vio completion. + * + * This callback is registered in scrub_next_slab(). + */ +static void start_scrubbing(struct vdo_completion *completion) +{ + struct slab_scrubber *scrubber = + container_of(as_vio(completion), struct slab_scrubber, vio); + struct vdo_slab *slab = scrubber->slab; + + if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) { + slab_scrubbed(completion); + return; + } + + submit_metadata_vio(&scrubber->vio, + slab->journal_origin, + read_slab_journal_endio, + handle_scrubber_error, + REQ_OP_READ); +} + +/** + * scrub_next_slab() - Scrub the next slab if there is one. + * @scrubber: The scrubber. + */ +static void scrub_next_slab(struct slab_scrubber *scrubber) +{ + struct vdo_completion *completion = &scrubber->vio.completion; + struct vdo_slab *slab; + + /* + * Note: this notify call is always safe only because scrubbing can only be started when + * the VDO is quiescent. + */ + vdo_notify_all_waiters(&scrubber->waiters, NULL, NULL); + + if (vdo_is_read_only(completion->vdo)) { + finish_scrubbing(scrubber, VDO_READ_ONLY); + return; + } + + slab = get_next_slab(scrubber); + if ((slab == NULL) || + (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) { + finish_scrubbing(scrubber, VDO_SUCCESS); + return; + } + + if (vdo_finish_draining(&scrubber->admin_state)) + return; + + list_del_init(&slab->allocq_entry); + scrubber->slab = slab; + vdo_prepare_completion(completion, + start_scrubbing, + handle_scrubber_error, + slab->allocator->thread_id, + completion->parent); + vdo_start_operation_with_waiter(&slab->state, + VDO_ADMIN_STATE_SCRUBBING, + completion, + initiate_slab_action); +} + +/** + * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing. + * @allocator: The block_allocator to scrub. + * @parent: The completion to notify when scrubbing is done, implies high_priority, may be NULL. + */ +static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent) +{ + struct slab_scrubber *scrubber = &allocator->scrubber; + + scrubber->vio.completion.parent = parent; + scrubber->high_priority_only = (parent != NULL); + if (!has_slabs_to_scrub(scrubber)) { + finish_scrubbing(scrubber, VDO_SUCCESS); + return; + } + + if (scrubber->high_priority_only && + vdo_is_priority_table_empty(allocator->prioritized_slabs) && + list_empty(&scrubber->high_priority_slabs)) + register_slab_for_scrubbing(get_next_slab(scrubber), true); + + vdo_resume_if_quiescent(&scrubber->admin_state); + scrub_next_slab(scrubber); +} + +static inline void assert_on_allocator_thread(thread_id_t thread_id, const char *function_name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id), + "%s called on correct thread", + function_name); +} + +static void register_slab_with_allocator(struct block_allocator *allocator, struct vdo_slab *slab) +{ + allocator->slab_count++; + allocator->last_slab = slab->slab_number; +} + +/** + * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot. + * @depot: The depot over which to iterate. + * @start: The number of the slab to start iterating from. + * @end: The number of the last slab which may be returned. + * @stride: The difference in slab number between successive slabs. + * + * Iteration always occurs from higher to lower numbered slabs. + * + * Return: An initialized iterator structure. + */ +static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot, + slab_count_t start, + slab_count_t end, + slab_count_t stride) +{ + struct vdo_slab **slabs = depot->slabs; + + return (struct slab_iterator) { + .slabs = slabs, + .next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]), + .end = end, + .stride = stride, + }; +} + +static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator) +{ + return get_depot_slab_iterator(allocator->depot, + allocator->last_slab, + allocator->zone_number, + allocator->depot->zone_count); +} + +/** + * next_slab() - Get the next slab from a slab_iterator and advance the iterator + * @iterator: The slab_iterator. + * + * Return: The next slab or NULL if the iterator is exhausted. + */ +static struct vdo_slab *next_slab(struct slab_iterator *iterator) +{ + struct vdo_slab *slab = iterator->next; + + if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride)) + iterator->next = NULL; + else + iterator->next = iterator->slabs[slab->slab_number - iterator->stride]; + + return slab; +} + +/** + * abort_waiter() - Abort vios waiting to make journal entries when read-only. + * + * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone + * into read-only mode. Implements waiter_callback. + */ +static void abort_waiter(struct waiter *waiter, void *context __always_unused) +{ + struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter); + struct data_vio *data_vio = data_vio_from_reference_updater(updater); + + if (updater->increment) { + continue_data_vio_with_error(data_vio, VDO_READ_ONLY); + return; + } + + vdo_continue_completion(&data_vio->decrement_completion, VDO_READ_ONLY); +} + +/* Implements vdo_read_only_notification. */ +static void notify_block_allocator_of_read_only_mode(void *listener, struct vdo_completion *parent) +{ + struct block_allocator *allocator = listener; + struct slab_iterator iterator; + + assert_on_allocator_thread(allocator->thread_id, __func__); + iterator = get_slab_iterator(allocator); + while (iterator.next != NULL) { + struct vdo_slab *slab = next_slab(&iterator); + + vdo_notify_all_waiters(&slab->journal.entry_waiters, abort_waiter, &slab->journal); + check_if_slab_drained(slab); + } + + vdo_finish_completion(parent); +} + +/** + * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if + * the block it locks is unreferenced. + * @slab: The slab which contains the block. + * @pbn: The physical block to reference. + * @lock: The lock. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_acquire_provisional_reference(struct vdo_slab *slab, + physical_block_number_t pbn, + struct pbn_lock *lock) +{ + slab_block_number block_number; + int result; + + if (vdo_pbn_lock_has_provisional_reference(lock)) + return VDO_SUCCESS; + + if (!is_slab_open(slab)) + return VDO_INVALID_ADMIN_STATE; + + result = slab_block_number_from_pbn(slab, pbn, &block_number); + if (result != VDO_SUCCESS) + return result; + + if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) { + make_provisional_reference(slab, block_number); + if (lock != NULL) + vdo_assign_pbn_lock_provisional_reference(lock); + } + + if (vdo_pbn_lock_has_provisional_reference(lock)) + adjust_free_block_count(slab, false); + + return VDO_SUCCESS; +} + +static int __must_check +allocate_slab_block(struct vdo_slab *slab, physical_block_number_t *block_number_ptr) +{ + slab_block_number free_index; + + if (!is_slab_open(slab)) + return VDO_INVALID_ADMIN_STATE; + + if (!search_reference_blocks(slab, &free_index)) + return VDO_NO_SPACE; + + ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT), + "free block must have ref count of zero"); + make_provisional_reference(slab, free_index); + adjust_free_block_count(slab, false); + + /* + * Update the search hint so the next search will start at the array index just past the + * free block we just found. + */ + slab->search_cursor.index = (free_index + 1); + + *block_number_ptr = slab->start + free_index; + return VDO_SUCCESS; +} + +/** + * open_slab() - Prepare a slab to be allocated from. + * @slab: The slab. + */ +static void open_slab(struct vdo_slab *slab) +{ + reset_search_cursor(slab); + if (is_slab_journal_blank(slab)) { + WRITE_ONCE(slab->allocator->statistics.slabs_opened, + slab->allocator->statistics.slabs_opened + 1); + dirty_all_reference_blocks(slab); + } else { + WRITE_ONCE(slab->allocator->statistics.slabs_reopened, + slab->allocator->statistics.slabs_reopened + 1); + } + + slab->allocator->open_slab = slab; +} + + +/* + * The block allocated will have a provisional reference and the reference must be either confirmed + * with a subsequent increment or vacated with a subsequent decrement via + * vdo_release_block_reference(). + */ +int vdo_allocate_block(struct block_allocator *allocator, + physical_block_number_t *block_number_ptr) +{ + int result; + + if (allocator->open_slab != NULL) { + /* Try to allocate the next block in the currently open slab. */ + result = allocate_slab_block(allocator->open_slab, block_number_ptr); + if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) + return result; + + /* Put the exhausted open slab back into the priority table. */ + prioritize_slab(allocator->open_slab); + } + + /* Remove the highest priority slab from the priority table and make it the open slab. */ + open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs), + struct vdo_slab, + allocq_entry)); + + /* + * Try allocating again. If we're out of space immediately after opening a slab, then every + * slab must be fully allocated. + */ + return allocate_slab_block(allocator->open_slab, block_number_ptr); +} + +/** + * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab. + * @allocator: The block_allocator on which to wait. + * @waiter: The waiter. + * + * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, and + * some other error otherwise. + */ +int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, struct waiter *waiter) +{ + if (vdo_is_read_only(allocator->depot->vdo)) + return VDO_READ_ONLY; + + if (vdo_is_state_quiescent(&allocator->scrubber.admin_state)) + return VDO_NO_SPACE; + + vdo_enqueue_waiter(&allocator->scrubber.waiters, waiter); + return VDO_SUCCESS; +} + +/** + * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab + * journal entry and then updating the reference counter. + * + * @data_vio: The data_vio for which to add the entry. + * @updater: Which of the data_vio's reference updaters is being submitted. + */ +void vdo_modify_reference_count(struct vdo_completion *completion, + struct reference_updater *updater) +{ + struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn); + + if (!is_slab_open(slab)) { + vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE); + return; + } + + if (vdo_is_read_only(completion->vdo)) { + vdo_continue_completion(completion, VDO_READ_ONLY); + return; + } + + vdo_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter); + if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal)) + register_slab_for_scrubbing(slab, true); + + add_entries(&slab->journal); +} + +/* Release an unused provisional reference. */ +int vdo_release_block_reference(struct block_allocator *allocator, physical_block_number_t pbn) +{ + struct reference_updater updater; + + if (pbn == VDO_ZERO_BLOCK) + return VDO_SUCCESS; + + updater = (struct reference_updater) { + .operation = VDO_JOURNAL_DATA_REMAPPING, + .increment = false, + .zpbn = { + .pbn = pbn, + }, + }; + + return adjust_reference_count(vdo_get_slab(allocator->depot, pbn), &updater, NULL); +} + +/** + * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as + * the primary key and the 'emptiness' field as the secondary key. + * + * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping + * should always get the most empty first, so pushing should be from most empty to least empty. + * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements + * before larger ones. + */ +static bool slab_status_is_less_than(const void *item1, const void *item2) +{ + const struct slab_status *info1 = (const struct slab_status *) item1; + const struct slab_status *info2 = (const struct slab_status *) item2; + + if (info1->is_clean != info2->is_clean) + return info1->is_clean; + if (info1->emptiness != info2->emptiness) + return info1->emptiness > info2->emptiness; + return info1->slab_number < info2->slab_number; +} + +static void swap_slab_statuses(void *item1, void *item2) +{ + struct slab_status *info1 = item1; + struct slab_status *info2 = item2; + + swap(*info1, *info2); +} + +static const struct min_heap_callbacks slab_status_min_heap = { + .elem_size = sizeof(struct slab_status), + .less = slab_status_is_less_than, + .swp = swap_slab_statuses, +}; + +/* Inform the slab actor that a action has finished on some slab; used by apply_to_slabs(). */ +static void slab_action_callback(struct vdo_completion *completion) +{ + struct block_allocator *allocator = vdo_as_block_allocator(completion); + struct slab_actor *actor = &allocator->slab_actor; + + if (--actor->slab_action_count == 0) { + actor->callback(completion); + return; + } + + vdo_reset_completion(completion); +} + +/* Preserve the error from part of an action and continue. */ +static void handle_operation_error(struct vdo_completion *completion) +{ + struct block_allocator *allocator = vdo_as_block_allocator(completion); + + if (allocator->state.waiter != NULL) + vdo_set_completion_result(allocator->state.waiter, completion->result); + completion->callback(completion); +} + +/* Perform an action on each of an allocator's slabs in parallel. */ +static void apply_to_slabs(struct block_allocator *allocator, vdo_action *callback) +{ + struct slab_iterator iterator; + + vdo_prepare_completion(&allocator->completion, + slab_action_callback, + handle_operation_error, + allocator->thread_id, + NULL); + allocator->completion.requeue = false; + + /* + * Since we are going to dequeue all of the slabs, the open slab will become invalid, so + * clear it. + */ + allocator->open_slab = NULL; + + /* Ensure that we don't finish before we're done starting. */ + allocator->slab_actor = (struct slab_actor) { + .slab_action_count = 1, + .callback = callback, + }; + + iterator = get_slab_iterator(allocator); + while (iterator.next != NULL) { + const struct admin_state_code *operation = + vdo_get_admin_state_code(&allocator->state); + struct vdo_slab *slab = next_slab(&iterator); + + list_del_init(&slab->allocq_entry); + allocator->slab_actor.slab_action_count++; + vdo_start_operation_with_waiter(&slab->state, + operation, + &allocator->completion, + initiate_slab_action); + } + + slab_action_callback(&allocator->completion); +} + +static void finish_loading_allocator(struct vdo_completion *completion) +{ + struct block_allocator *allocator = vdo_as_block_allocator(completion); + const struct admin_state_code *operation = vdo_get_admin_state_code(&allocator->state); + + if (allocator->eraser != NULL) + dm_kcopyd_client_destroy(UDS_FORGET(allocator->eraser)); + + if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) { + void *context = vdo_get_current_action_context(allocator->depot->action_manager); + + vdo_replay_into_slab_journals(allocator, context); + return; + } + + vdo_finish_loading(&allocator->state); +} + +static void erase_next_slab_journal(struct block_allocator *allocator); + +static void copy_callback(int read_err, unsigned long write_err, void *context) +{ + struct block_allocator *allocator = context; + int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO); + + if (result != VDO_SUCCESS) { + vdo_fail_completion(&allocator->completion, result); + return; + } + + erase_next_slab_journal(allocator); +} + +/* erase_next_slab_journal() - Erase the next slab journal. */ +static void erase_next_slab_journal(struct block_allocator *allocator) +{ + struct vdo_slab *slab; + physical_block_number_t pbn; + struct dm_io_region regions[1]; + struct slab_depot *depot = allocator->depot; + block_count_t blocks = depot->slab_config.slab_journal_blocks; + + if (allocator->slabs_to_erase.next == NULL) { + vdo_finish_completion(&allocator->completion); + return; + } + + slab = next_slab(&allocator->slabs_to_erase); + pbn = slab->journal_origin - depot->vdo->geometry.bio_offset; + regions[0] = (struct dm_io_region) { + .bdev = vdo_get_backing_device(depot->vdo), + .sector = pbn * VDO_SECTORS_PER_BLOCK, + .count = blocks * VDO_SECTORS_PER_BLOCK, + }; + dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, allocator); +} + +/* Implements vdo_admin_initiator. */ +static void initiate_load(struct admin_state *state) +{ + struct block_allocator *allocator = container_of(state, struct block_allocator, state); + const struct admin_state_code *operation = vdo_get_admin_state_code(state); + + if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) { + /* + * Must requeue because the kcopyd client cannot be freed in the same stack frame + * as the kcopyd callback, lest it deadlock. + */ + vdo_prepare_completion_for_requeue(&allocator->completion, + finish_loading_allocator, + handle_operation_error, + allocator->thread_id, + NULL); + allocator->eraser = dm_kcopyd_client_create(NULL); + if (allocator->eraser == NULL) { + vdo_fail_completion(&allocator->completion, -ENOMEM); + return; + } + allocator->slabs_to_erase = get_slab_iterator(allocator); + + erase_next_slab_journal(allocator); + return; + } + + apply_to_slabs(allocator, finish_loading_allocator); +} + +/* + * vdo_notify_slab_journals_are_recovered(): Inform a block allocator that its slab journals have + * been recovered from the recovery journal. + * @completion The allocator completion + */ +void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion) +{ + struct block_allocator *allocator = vdo_as_block_allocator(completion); + + vdo_finish_loading_with_result(&allocator->state, completion->result); +} + +static int +get_slab_statuses(struct block_allocator *allocator, struct slab_status **statuses_ptr) +{ + int result; + struct slab_status *statuses; + struct slab_iterator iterator = get_slab_iterator(allocator); + + result = UDS_ALLOCATE(allocator->slab_count, struct slab_status, __func__, &statuses); + if (result != VDO_SUCCESS) + return result; + + *statuses_ptr = statuses; + + while (iterator.next != NULL) { + slab_count_t slab_number = next_slab(&iterator)->slab_number; + + *statuses++ = (struct slab_status) { + .slab_number = slab_number, + .is_clean = !allocator->summary_entries[slab_number].is_dirty, + .emptiness = allocator->summary_entries[slab_number].fullness_hint, + }; + } + + return VDO_SUCCESS; +} + +/* Prepare slabs for allocation or scrubbing. */ +static int __must_check +vdo_prepare_slabs_for_allocation(struct block_allocator *allocator) +{ + struct slab_status current_slab_status; + struct min_heap heap; + int result; + struct slab_status *slab_statuses; + struct slab_depot *depot = allocator->depot; + + WRITE_ONCE(allocator->allocated_blocks, + allocator->slab_count * depot->slab_config.data_blocks); + result = get_slab_statuses(allocator, &slab_statuses); + if (result != VDO_SUCCESS) + return result; + + /* Sort the slabs by cleanliness, then by emptiness hint. */ + heap = (struct min_heap) { + .data = slab_statuses, + .nr = allocator->slab_count, + .size = allocator->slab_count, + }; + min_heapify_all(&heap, &slab_status_min_heap); + + while (heap.nr > 0) { + bool high_priority; + struct vdo_slab *slab; + struct slab_journal *journal; + + current_slab_status = slab_statuses[0]; + min_heap_pop(&heap, &slab_status_min_heap); + slab = depot->slabs[current_slab_status.slab_number]; + + if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) || + (!allocator->summary_entries[slab->slab_number].load_ref_counts && + current_slab_status.is_clean)) { + queue_slab(slab); + continue; + } + + slab->status = VDO_SLAB_REQUIRES_SCRUBBING; + journal = &slab->journal; + high_priority = ((current_slab_status.is_clean && + (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) || + (journal_length(journal) >= journal->scrubbing_threshold)); + register_slab_for_scrubbing(slab, high_priority); + } + + UDS_FREE(slab_statuses); + return VDO_SUCCESS; +} + +static const char *status_to_string(enum slab_rebuild_status status) +{ + switch (status) { + case VDO_SLAB_REBUILT: + return "REBUILT"; + case VDO_SLAB_REQUIRES_SCRUBBING: + return "SCRUBBING"; + case VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING: + return "PRIORITY_SCRUBBING"; + case VDO_SLAB_REBUILDING: + return "REBUILDING"; + case VDO_SLAB_REPLAYING: + return "REPLAYING"; + default: + return "UNKNOWN"; + } +} + +void vdo_dump_block_allocator(const struct block_allocator *allocator) +{ + unsigned int pause_counter = 0; + struct slab_iterator iterator = get_slab_iterator(allocator); + const struct slab_scrubber *scrubber = &allocator->scrubber; + + uds_log_info("block_allocator zone %u", allocator->zone_number); + while (iterator.next != NULL) { + struct vdo_slab *slab = next_slab(&iterator); + struct slab_journal *journal = &slab->journal; + + if (slab->reference_blocks != NULL) + /* Terse because there are a lot of slabs to dump and syslog is lossy. */ + uds_log_info("slab %u: P%u, %llu free", + slab->slab_number, + slab->priority, + (unsigned long long) slab->free_blocks); + else + uds_log_info("slab %u: status %s", + slab->slab_number, + status_to_string(slab->status)); + + uds_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", + vdo_count_waiters(&journal->entry_waiters), + uds_bool_to_string(journal->waiting_to_commit), + uds_bool_to_string(journal->updating_slab_summary), + (unsigned long long) journal->head, + (unsigned long long) journal->unreapable, + (unsigned long long) journal->tail, + (unsigned long long) journal->next_commit, + (unsigned long long) journal->summarized, + (unsigned long long) journal->last_summarized, + (unsigned long long) journal->recovery_lock, + uds_bool_to_string(journal->recovery_lock != 0)); + /* + * Given the frequency with which the locks are just a tiny bit off, it might be + * worth dumping all the locks, but that might be too much logging. + */ + + if (slab->counters != NULL) + /* Terse because there are a lot of slabs to dump and syslog is lossy. */ + uds_log_info(" slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)", + slab->free_blocks, + slab->block_count, + slab->reference_block_count, + vdo_count_waiters(&slab->dirty_blocks), + slab->active_count, + (unsigned long long) slab->slab_journal_point.sequence_number, + slab->slab_journal_point.entry_count); + else + uds_log_info(" no counters"); + + /* + * Wait for a while after each batch of 32 slabs dumped, an arbitrary number, + * allowing the kernel log a chance to be flushed instead of being overrun. + */ + if (pause_counter++ == 31) { + pause_counter = 0; + uds_pause_for_logger(); + } + } + + uds_log_info("slab_scrubber slab_count %u waiters %zu %s%s", + READ_ONCE(scrubber->slab_count), + vdo_count_waiters(&scrubber->waiters), + vdo_get_admin_state_code(&scrubber->admin_state)->name, + scrubber->high_priority_only ? ", high_priority_only " : ""); +} + +static void free_slab(struct vdo_slab *slab) +{ + if (slab == NULL) + return; + + list_del(&slab->allocq_entry); + UDS_FREE(UDS_FORGET(slab->journal.block)); + UDS_FREE(UDS_FORGET(slab->journal.locks)); + UDS_FREE(UDS_FORGET(slab->counters)); + UDS_FREE(UDS_FORGET(slab->reference_blocks)); + UDS_FREE(slab); +} + +static int initialize_slab_journal(struct vdo_slab *slab) +{ + struct slab_journal *journal = &slab->journal; + const struct slab_config *slab_config = &slab->allocator->depot->slab_config; + int result; + + result = UDS_ALLOCATE(slab_config->slab_journal_blocks, + struct journal_lock, + __func__, + &journal->locks); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(VDO_BLOCK_SIZE, + char, + "struct packed_slab_journal_block", + (char **) &journal->block); + if (result != VDO_SUCCESS) + return result; + + journal->slab = slab; + journal->size = slab_config->slab_journal_blocks; + journal->flushing_threshold = slab_config->slab_journal_flushing_threshold; + journal->blocking_threshold = slab_config->slab_journal_blocking_threshold; + journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold; + journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK; + journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; + journal->events = &slab->allocator->slab_journal_statistics; + journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal; + journal->tail = 1; + journal->head = 1; + + journal->flushing_deadline = journal->flushing_threshold; + /* + * Set there to be some time between the deadline and the blocking threshold, so that + * hopefully all are done before blocking. + */ + if ((journal->blocking_threshold - journal->flushing_threshold) > 5) + journal->flushing_deadline = journal->blocking_threshold - 5; + + journal->slab_summary_waiter.callback = release_journal_locks; + + INIT_LIST_HEAD(&journal->dirty_entry); + INIT_LIST_HEAD(&journal->uncommitted_blocks); + + journal->tail_header.nonce = slab->allocator->nonce; + journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL; + initialize_journal_state(journal); + return VDO_SUCCESS; +} + +/** + * make_slab() - Construct a new, empty slab. + * @slab_origin: The physical block number within the block allocator partition of the first block + * in the slab. + * @allocator: The block allocator to which the slab belongs. + * @slab_number: The slab number of the slab. + * @is_new: true if this slab is being allocated as part of a resize. + * @slab_ptr: A pointer to receive the new slab. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check +make_slab(physical_block_number_t slab_origin, + struct block_allocator *allocator, + slab_count_t slab_number, + bool is_new, + struct vdo_slab **slab_ptr) +{ + const struct slab_config *slab_config = &allocator->depot->slab_config; + struct vdo_slab *slab; + int result; + + result = UDS_ALLOCATE(1, struct vdo_slab, __func__, &slab); + if (result != VDO_SUCCESS) + return result; + + *slab = (struct vdo_slab) { + .allocator = allocator, + .start = slab_origin, + .end = slab_origin + slab_config->slab_blocks, + .slab_number = slab_number, + .ref_counts_origin = slab_origin + slab_config->data_blocks, + .journal_origin = vdo_get_slab_journal_start_block(slab_config, slab_origin), + .block_count = slab_config->data_blocks, + .free_blocks = slab_config->data_blocks, + .reference_block_count = + vdo_get_saved_reference_count_size(slab_config->data_blocks), + }; + INIT_LIST_HEAD(&slab->allocq_entry); + + result = initialize_slab_journal(slab); + if (result != VDO_SUCCESS) { + free_slab(slab); + return result; + } + + if (is_new) { + vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW); + result = allocate_slab_counters(slab); + if (result != VDO_SUCCESS) { + free_slab(slab); + return result; + } + } else { + vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + } + + *slab_ptr = slab; + return VDO_SUCCESS; +} + +/** + * allocate_slabs() - Allocate a new slab pointer array. + * @depot: The depot. + * @slab_count: The number of slabs the depot should have in the new array. + * + * Any existing slab pointers will be copied into the new array, and slabs will be allocated as + * needed. The newly allocated slabs will not be distributed for use by the block allocators. + * + * Return: VDO_SUCCESS or an error code. + */ +static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) +{ + block_count_t slab_size; + bool resizing = false; + physical_block_number_t slab_origin; + int result; + + result = UDS_ALLOCATE(slab_count, + struct vdo_slab *, + "slab pointer array", + &depot->new_slabs); + if (result != VDO_SUCCESS) + return result; + + if (depot->slabs != NULL) { + memcpy(depot->new_slabs, + depot->slabs, + depot->slab_count * sizeof(struct vdo_slab *)); + resizing = true; + } + + slab_size = depot->slab_config.slab_blocks; + slab_origin = depot->first_block + (depot->slab_count * slab_size); + + for (depot->new_slab_count = depot->slab_count; + depot->new_slab_count < slab_count; + depot->new_slab_count++, slab_origin += slab_size) { + struct block_allocator *allocator = + &depot->allocators[depot->new_slab_count % depot->zone_count]; + struct vdo_slab **slab_ptr = &depot->new_slabs[depot->new_slab_count]; + + result = make_slab(slab_origin, + allocator, + depot->new_slab_count, + resizing, + slab_ptr); + if (result != VDO_SUCCESS) + return result; + } + + return VDO_SUCCESS; +} + +/** + * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them as needed. + * @depot: The depot. + */ +void vdo_abandon_new_slabs(struct slab_depot *depot) +{ + slab_count_t i; + + if (depot->new_slabs == NULL) + return; + + for (i = depot->slab_count; i < depot->new_slab_count; i++) + free_slab(UDS_FORGET(depot->new_slabs[i])); + depot->new_slab_count = 0; + depot->new_size = 0; + UDS_FREE(UDS_FORGET(depot->new_slabs)); +} + +/** + * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates. + * + * Implements vdo_zone_thread_getter. + */ +static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number) +{ + return ((struct slab_depot *) context)->allocators[zone_number].thread_id; +} + +/** + * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock + * it may hold on a specified recovery journal block. + * @journal: The slab journal. + * @recovery_lock: The sequence number of the recovery journal block whose locks should be + * released. + * + * Return: true if the journal does hold a lock on the specified block (which it will release). + */ +static bool __must_check +release_recovery_journal_lock(struct slab_journal *journal, sequence_number_t recovery_lock) +{ + if (recovery_lock > journal->recovery_lock) { + ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock), + "slab journal recovery lock is not older than the recovery journal head"); + return false; + } + + if ((recovery_lock < journal->recovery_lock) || + vdo_is_read_only(journal->slab->allocator->depot->vdo)) + return false; + + /* All locks are held by the block which is in progress; write it. */ + commit_tail(journal); + return true; +} + +/* + * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot + * is seeking to release. + * + * Implements vdo_zone_action. + */ +static void release_tail_block_locks(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct slab_journal *journal, *tmp; + struct slab_depot *depot = context; + struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals; + + list_for_each_entry_safe(journal, tmp, list, dirty_entry) { + if (!release_recovery_journal_lock(journal, depot->active_release_request)) + break; + } + + vdo_finish_completion(parent); +} + +/** + * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks. + * + * Implements vdo_action_preamble. + */ +static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + depot->active_release_request = depot->new_release_request; + vdo_finish_completion(parent); +} + +/** + * schedule_tail_block_commit() - Schedule a tail block commit if necessary. + * + * This method should not be called directly. Rather, call vdo_schedule_default_action() on the + * depot's action manager. + * + * Implements vdo_action_scheduler. + */ +static bool schedule_tail_block_commit(void *context) +{ + struct slab_depot *depot = context; + + if (depot->new_release_request == depot->active_release_request) + return false; + + return vdo_schedule_action(depot->action_manager, + prepare_for_tail_block_commit, + release_tail_block_locks, + NULL, + NULL); +} + +/** + * initialize_slab_scrubber() - Initialize an allocator's slab scrubber. + * @allocator: The allocator being initialized + * + * Return: VDO_SUCCESS or an error. + */ +static int initialize_slab_scrubber(struct block_allocator *allocator) +{ + struct slab_scrubber *scrubber = &allocator->scrubber; + block_count_t slab_journal_size = allocator->depot->slab_config.slab_journal_blocks; + char *journal_data; + int result; + + result = UDS_ALLOCATE(VDO_BLOCK_SIZE * slab_journal_size, char, __func__, &journal_data); + if (result != VDO_SUCCESS) + return result; + + result = allocate_vio_components(allocator->completion.vdo, + VIO_TYPE_SLAB_JOURNAL, + VIO_PRIORITY_METADATA, + allocator, + slab_journal_size, + journal_data, + &scrubber->vio); + if (result != VDO_SUCCESS) { + UDS_FREE(journal_data); + return result; + } + + INIT_LIST_HEAD(&scrubber->high_priority_slabs); + INIT_LIST_HEAD(&scrubber->slabs); + vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED); + return VDO_SUCCESS; +} + +/** + * initialize_slab_summary_block() - Initialize a slab_summary_block. + * @allocator: The allocator which owns the block. + * @index: The index of this block in its zone's summary. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check +initialize_slab_summary_block(struct block_allocator *allocator, block_count_t index) +{ + struct slab_summary_block *block = &allocator->summary_blocks[index]; + int result; + + result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); + if (result != VDO_SUCCESS) + return result; + + result = allocate_vio_components(allocator->depot->vdo, + VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, + NULL, + 1, + block->outgoing_entries, + &block->vio); + if (result != VDO_SUCCESS) + return result; + + block->allocator = allocator; + block->entries = &allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index]; + block->index = index; + return VDO_SUCCESS; +} + +static int __must_check initialize_block_allocator(struct slab_depot *depot, zone_count_t zone) +{ + int result; + block_count_t i; + struct block_allocator *allocator = &depot->allocators[zone]; + struct vdo *vdo = depot->vdo; + block_count_t max_free_blocks = depot->slab_config.data_blocks; + unsigned int max_priority = (2 + ilog2(max_free_blocks)); + + *allocator = (struct block_allocator) { + .depot = depot, + .zone_number = zone, + .thread_id = vdo->thread_config.physical_threads[zone], + .nonce = vdo->states.vdo.nonce, + }; + + INIT_LIST_HEAD(&allocator->dirty_slab_journals); + vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION); + result = vdo_register_read_only_listener(vdo, + allocator, + notify_block_allocator_of_read_only_mode, + allocator->thread_id); + if (result != VDO_SUCCESS) + return result; + + vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); + result = make_vio_pool(vdo, + BLOCK_ALLOCATOR_VIO_POOL_SIZE, + allocator->thread_id, + VIO_TYPE_SLAB_JOURNAL, + VIO_PRIORITY_METADATA, + allocator, + &allocator->vio_pool); + if (result != VDO_SUCCESS) + return result; + + result = initialize_slab_scrubber(allocator); + if (result != VDO_SUCCESS) + return result; + + result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, + struct slab_summary_block, + __func__, + &allocator->summary_blocks); + if (result != VDO_SUCCESS) + return result; + + vdo_set_admin_state_code(&allocator->summary_state, VDO_ADMIN_STATE_NORMAL_OPERATION); + allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone); + + /* Initialize each summary block. */ + for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) { + result = initialize_slab_summary_block(allocator, i); + if (result != VDO_SUCCESS) + return result; + } + + /* + * Performing well atop thin provisioned storage requires either that VDO discards freed + * blocks, or that the block allocator try to use slabs that already have allocated blocks + * in preference to slabs that have never been opened. For reasons we have not been able to + * fully understand, some SSD machines have been have been very sensitive (50% reduction in + * test throughput) to very slight differences in the timing and locality of block + * allocation. Assigning a low priority to unopened slabs (max_priority/2, say) would be + * ideal for the story, but anything less than a very high threshold (max_priority - 1) + * hurts on these machines. + * + * This sets the free block threshold for preferring to open an unopened slab to the binary + * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate + * to about half the slab size. + */ + allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4)); + + return VDO_SUCCESS; +} + +static int allocate_components(struct slab_depot *depot, + struct partition *summary_partition) +{ + int result; + zone_count_t zone; + slab_count_t slab_count; + u8 hint; + u32 i; + const struct thread_config *thread_config = &depot->vdo->thread_config; + + result = vdo_make_action_manager(depot->zone_count, + get_allocator_thread_id, + thread_config->journal_thread, + depot, + schedule_tail_block_commit, + depot->vdo, + &depot->action_manager); + if (result != VDO_SUCCESS) + return result; + + depot->origin = depot->first_block; + + /* block size must be a multiple of entry size */ + STATIC_ASSERT((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) == 0); + + depot->summary_origin = summary_partition->offset; + depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift); + result = UDS_ALLOCATE(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, + struct slab_summary_entry, + __func__, + &depot->summary_entries); + if (result != VDO_SUCCESS) + return result; + + + /* Initialize all the entries. */ + hint = compute_fullness_hint(depot, depot->slab_config.data_blocks); + for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) { + /* + * This default tail block offset must be reflected in + * slabJournal.c::read_slab_journal_tail(). + */ + depot->summary_entries[i] = (struct slab_summary_entry) { + .tail_block_offset = 0, + .fullness_hint = hint, + .load_ref_counts = false, + .is_dirty = false, + }; + } + + if (result != VDO_SUCCESS) + return result; + + slab_count = vdo_compute_slab_count(depot->first_block, + depot->last_block, + depot->slab_size_shift); + if (thread_config->physical_zone_count > slab_count) + return uds_log_error_strerror(VDO_BAD_CONFIGURATION, + "%u physical zones exceeds slab count %u", + thread_config->physical_zone_count, + slab_count); + + /* Initialize the block allocators. */ + for (zone = 0; zone < depot->zone_count; zone++) { + result = initialize_block_allocator(depot, zone); + if (result != VDO_SUCCESS) + return result; + } + + /* Allocate slabs. */ + result = allocate_slabs(depot, slab_count); + if (result != VDO_SUCCESS) + return result; + + /* Use the new slabs. */ + for (i = depot->slab_count; i < depot->new_slab_count; i++) { + struct vdo_slab *slab = depot->new_slabs[i]; + + register_slab_with_allocator(slab->allocator, slab); + WRITE_ONCE(depot->slab_count, depot->slab_count + 1); + } + + depot->slabs = depot->new_slabs; + depot->new_slabs = NULL; + depot->new_slab_count = 0; + + return VDO_SUCCESS; +} + +/** + * vdo_decode_slab_depot() - Make a slab depot and configure it with the state read from the super + * block. + * @state: The slab depot state from the super block. + * @vdo: The VDO which will own the depot. + * @summary_partition: The partition which holds the slab summary. + * @depot_ptr: A pointer to hold the depot. + * + * Return: A success or error code. + */ +int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, + struct vdo *vdo, + struct partition *summary_partition, + struct slab_depot **depot_ptr) +{ + unsigned int slab_size_shift; + struct slab_depot *depot; + int result; + + /* + * Calculate the bit shift for efficiently mapping block numbers to slabs. Using a shift + * requires that the slab size be a power of two. + */ + block_count_t slab_size = state.slab_config.slab_blocks; + + if (!is_power_of_2(slab_size)) + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "slab size must be a power of two"); + slab_size_shift = ilog2(slab_size); + + result = UDS_ALLOCATE_EXTENDED(struct slab_depot, + vdo->thread_config.physical_zone_count, + struct block_allocator, + __func__, + &depot); + if (result != VDO_SUCCESS) + return result; + + depot->vdo = vdo; + depot->old_zone_count = state.zone_count; + depot->zone_count = vdo->thread_config.physical_zone_count; + depot->slab_config = state.slab_config; + depot->first_block = state.first_block; + depot->last_block = state.last_block; + depot->slab_size_shift = slab_size_shift; + + result = allocate_components(depot, summary_partition); + if (result != VDO_SUCCESS) { + vdo_free_slab_depot(depot); + return result; + } + + *depot_ptr = depot; + return VDO_SUCCESS; +} + +static void uninitialize_allocator_summary(struct block_allocator *allocator) +{ + block_count_t i; + + if (allocator->summary_blocks == NULL) + return; + + for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) { + free_vio_components(&allocator->summary_blocks[i].vio); + UDS_FREE(UDS_FORGET(allocator->summary_blocks[i].outgoing_entries)); + } + + UDS_FREE(UDS_FORGET(allocator->summary_blocks)); +} + +/** + * vdo_free_slab_depot() - Destroy a slab depot. + * @depot: The depot to destroy. + */ +void vdo_free_slab_depot(struct slab_depot *depot) +{ + zone_count_t zone = 0; + + if (depot == NULL) + return; + + vdo_abandon_new_slabs(depot); + + for (zone = 0; zone < depot->zone_count; zone++) { + struct block_allocator *allocator = &depot->allocators[zone]; + + if (allocator->eraser != NULL) + dm_kcopyd_client_destroy(UDS_FORGET(allocator->eraser)); + + uninitialize_allocator_summary(allocator); + uninitialize_scrubber_vio(&allocator->scrubber); + free_vio_pool(UDS_FORGET(allocator->vio_pool)); + vdo_free_priority_table(UDS_FORGET(allocator->prioritized_slabs)); + } + + if (depot->slabs != NULL) { + slab_count_t i; + + for (i = 0; i < depot->slab_count; i++) + free_slab(UDS_FORGET(depot->slabs[i])); + } + + UDS_FREE(UDS_FORGET(depot->slabs)); + UDS_FREE(UDS_FORGET(depot->action_manager)); + UDS_FREE(UDS_FORGET(depot->summary_entries)); + UDS_FREE(depot); +} + +/** + * vdo_record_slab_depot() - Record the state of a slab depot for encoding into the super block. + * @depot: The depot to encode. + * + * Return: The depot state. + */ +struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot) +{ + /* + * If this depot is currently using 0 zones, it must have been synchronously loaded by a + * tool and is now being saved. We did not load and combine the slab summary, so we still + * need to do that next time we load with the old zone count rather than 0. + */ + struct slab_depot_state_2_0 state; + zone_count_t zones_to_record = depot->zone_count; + + if (depot->zone_count == 0) + zones_to_record = depot->old_zone_count; + + state = (struct slab_depot_state_2_0) { + .slab_config = depot->slab_config, + .first_block = depot->first_block, + .last_block = depot->last_block, + .zone_count = zones_to_record, + }; + + return state; +} + +/** + * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot. + * + * Context: This method may be called only before entering normal operation from the load thread. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_allocate_reference_counters(struct slab_depot *depot) +{ + struct slab_iterator iterator = + get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1); + + while (iterator.next != NULL) { + int result = allocate_slab_counters(next_slab(&iterator)); + + if (result != VDO_SUCCESS) + return result; + } + + return VDO_SUCCESS; +} + +/** + * get_slab_number() - Get the number of the slab that contains a specified block. + * @depot: The slab depot. + * @pbn: The physical block number. + * @slab_number_ptr: A pointer to hold the slab number. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check get_slab_number(const struct slab_depot *depot, + physical_block_number_t pbn, + slab_count_t *slab_number_ptr) +{ + slab_count_t slab_number; + + if (pbn < depot->first_block) + return VDO_OUT_OF_RANGE; + + slab_number = (pbn - depot->first_block) >> depot->slab_size_shift; + if (slab_number >= depot->slab_count) + return VDO_OUT_OF_RANGE; + + *slab_number_ptr = slab_number; + return VDO_SUCCESS; +} + +/** + * vdo_get_slab() - Get the slab object for the slab that contains a specified block. + * @depot: The slab depot. + * @pbn: The physical block number. + * + * Will put the VDO in read-only mode if the PBN is not a valid data block nor the zero block. + * + * Return: The slab containing the block, or NULL if the block number is the zero block or + * otherwise out of range. + */ +struct vdo_slab *vdo_get_slab(const struct slab_depot *depot, physical_block_number_t pbn) +{ + slab_count_t slab_number; + int result; + + if (pbn == VDO_ZERO_BLOCK) + return NULL; + + result = get_slab_number(depot, pbn, &slab_number); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(depot->vdo, result); + return NULL; + } + + return depot->slabs[slab_number]; +} + +/** + * vdo_get_increment_limit() - Determine how many new references a block can acquire. + * @depot: The slab depot. + * @pbn: The physical block number that is being queried. + * + * Context: This method must be called from the physical zone thread of the PBN. + * + * Return: The number of available references. + */ +u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn) +{ + struct vdo_slab *slab = vdo_get_slab(depot, pbn); + vdo_refcount_t *counter_ptr = NULL; + int result; + + if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT)) + return 0; + + result = get_reference_counter(slab, pbn, &counter_ptr); + if (result != VDO_SUCCESS) + return 0; + + if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT) + return (MAXIMUM_REFERENCE_COUNT - 1); + + return (MAXIMUM_REFERENCE_COUNT - *counter_ptr); +} + +/** + * vdo_is_physical_data_block() - Determine whether the given PBN refers to a data block. + * @depot: The depot. + * @pbn: The physical block number to ask about. + * + * Return: True if the PBN corresponds to a data block. + */ +bool vdo_is_physical_data_block(const struct slab_depot *depot, physical_block_number_t pbn) +{ + slab_count_t slab_number; + slab_block_number sbn; + + return ((pbn == VDO_ZERO_BLOCK) || + ((get_slab_number(depot, pbn, &slab_number) == VDO_SUCCESS) && + (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) == + VDO_SUCCESS))); +} + +/** + * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks allocated across all + * the slabs in the depot. + * @depot: The slab depot. + * + * This is the total number of blocks with a non-zero reference count. + * + * Context: This may be called from any thread. + * + * Return: The total number of blocks with a non-zero reference count. + */ +block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot) +{ + block_count_t total = 0; + zone_count_t zone; + + for (zone = 0; zone < depot->zone_count; zone++) + /* The allocators are responsible for thread safety. */ + total += READ_ONCE(depot->allocators[zone].allocated_blocks); + return total; +} + +/** + * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in all the slabs in the + * depot. + * @depot: The slab depot. + * + * Context: This may be called from any thread. + * + * Return: The total number of data blocks in all slabs. + */ +block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot) +{ + return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks); +} + +/** + * finish_combining_zones() - Clean up after saving out the combined slab summary. + * @completion: The vio which was used to write the summary data. + */ +static void finish_combining_zones(struct vdo_completion *completion) +{ + int result = completion->result; + struct vdo_completion *parent = completion->parent; + + free_vio(as_vio(UDS_FORGET(completion))); + vdo_fail_completion(parent, result); +} + +static void handle_combining_error(struct vdo_completion *completion) +{ + vio_record_metadata_io_error(as_vio(completion)); + finish_combining_zones(completion); +} + +static void write_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo *vdo = vio->completion.vdo; + + continue_vio_after_io(vio, finish_combining_zones, vdo->thread_config.admin_thread); +} + +/** + * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones, + * update every zone to the correct values for every slab. + * @depot: The depot whose summary entries should be combined. + */ +static void combine_summaries(struct slab_depot *depot) +{ + /* + * Combine all the old summary data into the portion of the buffer corresponding to the + * first zone. + */ + zone_count_t zone = 0; + struct slab_summary_entry *entries = depot->summary_entries; + + if (depot->old_zone_count > 1) { + slab_count_t entry_number; + + for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) { + if (zone != 0) + memcpy(entries + entry_number, + entries + (zone * MAX_VDO_SLABS) + entry_number, + sizeof(struct slab_summary_entry)); + zone++; + if (zone == depot->old_zone_count) + zone = 0; + } + } + + /* Copy the combined data to each zones's region of the buffer. */ + for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) + memcpy(entries + (zone * MAX_VDO_SLABS), + entries, + MAX_VDO_SLABS * sizeof(struct slab_summary_entry)); +} + +/** + * finish_loading_summary() - Finish loading slab summary data. + * @completion: The vio which was used to read the summary data. + * + * Combines the slab summary data from all the previously written zones and copies the combined + * summary to each partition's data region. Then writes the combined summary back out to disk. This + * callback is registered in load_summary_endio(). + */ +static void finish_loading_summary(struct vdo_completion *completion) +{ + struct slab_depot *depot = completion->vdo->depot; + + /* Combine the summary from each zone so each zone is correct for all slabs. */ + combine_summaries(depot); + + /* Write the combined summary back out. */ + submit_metadata_vio(as_vio(completion), + depot->summary_origin, + write_summary_endio, + handle_combining_error, + REQ_OP_WRITE); +} + +static void load_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo *vdo = vio->completion.vdo; + + continue_vio_after_io(vio, finish_loading_summary, vdo->thread_config.admin_thread); +} + +/** + * load_slab_summary() - The preamble of a load operation. + * + * Implements vdo_action_preamble. + */ +static void load_slab_summary(void *context, struct vdo_completion *parent) +{ + int result; + struct vio *vio; + struct slab_depot *depot = context; + const struct admin_state_code *operation = + vdo_get_current_manager_operation(depot->action_manager); + + result = create_multi_block_metadata_vio(depot->vdo, + VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, + parent, + VDO_SLAB_SUMMARY_BLOCKS, + (char *) depot->summary_entries, + &vio); + if (result != VDO_SUCCESS) { + vdo_fail_completion(parent, result); + return; + } + + if ((operation == VDO_ADMIN_STATE_FORMATTING) || + (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) { + finish_loading_summary(&vio->completion); + return; + } + + submit_metadata_vio(vio, + depot->summary_origin, + load_summary_endio, + handle_combining_error, + REQ_OP_READ); +} + +/* Implements vdo_zone_action. */ +static void load_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + vdo_start_loading(&depot->allocators[zone_number].state, + vdo_get_current_manager_operation(depot->action_manager), + parent, + initiate_load); +} + +/** + * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't included in the + * super_block component. + * @depot: The depot to load. + * @operation: The type of load to perform. + * @parent: The completion to notify when the load is complete. + * @context: Additional context for the load operation; may be NULL. + * + * This method may be called only before entering normal operation from the load thread. + */ +void vdo_load_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent, + void *context) +{ + if (vdo_assert_load_operation(operation, parent)) + vdo_schedule_operation_with_context(depot->action_manager, + operation, + load_slab_summary, + load_allocator, + NULL, + context, + parent); +} + +/* Implements vdo_zone_action. */ +static void prepare_to_allocate(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + struct block_allocator *allocator = &depot->allocators[zone_number]; + int result; + + result = vdo_prepare_slabs_for_allocation(allocator); + if (result != VDO_SUCCESS) { + vdo_fail_completion(parent, result); + return; + } + + scrub_slabs(allocator, parent); +} + +/** + * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come online and start + * allocating blocks. + * @depot: The depot to prepare. + * @load_type: The load type. + * @parent: The completion to notify when the operation is complete. + * + * This method may be called only before entering normal operation from the load thread. It must be + * called before allocation may proceed. + */ +void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot, + enum slab_depot_load_type load_type, + struct vdo_completion *parent) +{ + depot->load_type = load_type; + atomic_set(&depot->zones_to_scrub, depot->zone_count); + vdo_schedule_action(depot->action_manager, NULL, prepare_to_allocate, NULL, parent); +} + +/** + * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory. + * @depot: The depot to update. + * + * This size is saved to disk as part of the super block. + */ +void vdo_update_slab_depot_size(struct slab_depot *depot) +{ + depot->last_block = depot->new_last_block; +} + +/** + * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to + * the given size. + * @depot: The depot to prepare to resize. + * @partition: The new depot partition + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, const struct partition *partition) +{ + struct slab_depot_state_2_0 new_state; + int result; + slab_count_t new_slab_count; + + if ((partition->count >> depot->slab_size_shift) <= depot->slab_count) + return VDO_INCREMENT_TOO_SMALL; + + /* Generate the depot configuration for the new block count. */ + ASSERT_LOG_ONLY(depot->first_block == partition->offset, + "New slab depot partition doesn't change origin"); + result = vdo_configure_slab_depot(partition, + depot->slab_config, + depot->zone_count, + &new_state); + if (result != VDO_SUCCESS) + return result; + + new_slab_count = vdo_compute_slab_count(depot->first_block, + new_state.last_block, + depot->slab_size_shift); + if (new_slab_count <= depot->slab_count) + return uds_log_error_strerror(VDO_INCREMENT_TOO_SMALL, "Depot can only grow"); + if (new_slab_count == depot->new_slab_count) + /* Check it out, we've already got all the new slabs allocated! */ + return VDO_SUCCESS; + + vdo_abandon_new_slabs(depot); + result = allocate_slabs(depot, new_slab_count); + if (result != VDO_SUCCESS) { + vdo_abandon_new_slabs(depot); + return result; + } + + depot->new_size = partition->count; + depot->old_last_block = depot->last_block; + depot->new_last_block = new_state.last_block; + + return VDO_SUCCESS; +} + +/** + * finish_registration() - Finish registering new slabs now that all of the allocators have + * received their new slabs. + * + * Implements vdo_action_conclusion. + */ +static int finish_registration(void *context) +{ + struct slab_depot *depot = context; + + WRITE_ONCE(depot->slab_count, depot->new_slab_count); + UDS_FREE(depot->slabs); + depot->slabs = depot->new_slabs; + depot->new_slabs = NULL; + depot->new_slab_count = 0; + return VDO_SUCCESS; +} + +/* Implements vdo_zone_action. */ +static void register_new_slabs(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + struct block_allocator *allocator = &depot->allocators[zone_number]; + slab_count_t i; + + for (i = depot->slab_count; i < depot->new_slab_count; i++) { + struct vdo_slab *slab = depot->new_slabs[i]; + + if (slab->allocator == allocator) + register_slab_with_allocator(allocator, slab); + } + + vdo_finish_completion(parent); +} + +/** + * vdo_use_new_slabs() - Use the new slabs allocated for resize. + * @depot: The depot. + * @parent: The object to notify when complete. + */ +void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent) +{ + ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use"); + vdo_schedule_operation(depot->action_manager, + VDO_ADMIN_STATE_SUSPENDED_OPERATION, + NULL, + register_new_slabs, + finish_registration, + parent); +} + +/** + * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is + * currently working on. + * @scrubber: The scrubber to stop. + * @parent: The completion to notify when scrubbing has stopped. + */ +static void stop_scrubbing(struct block_allocator *allocator) +{ + struct slab_scrubber *scrubber = &allocator->scrubber; + + if (vdo_is_state_quiescent(&scrubber->admin_state)) + vdo_finish_completion(&allocator->completion); + else + vdo_start_draining(&scrubber->admin_state, + VDO_ADMIN_STATE_SUSPENDING, + &allocator->completion, + NULL); +} + +/** + * Implements vdo_admin_initiator. + */ +static void initiate_summary_drain(struct admin_state *state) +{ + check_summary_drain_complete(container_of(state, struct block_allocator, summary_state)); +} + +static void do_drain_step(struct vdo_completion *completion) +{ + struct block_allocator *allocator = vdo_as_block_allocator(completion); + + vdo_prepare_completion_for_requeue(&allocator->completion, + do_drain_step, + handle_operation_error, + allocator->thread_id, + NULL); + switch (++allocator->drain_step) { + case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER: + stop_scrubbing(allocator); + return; + + case VDO_DRAIN_ALLOCATOR_STEP_SLABS: + apply_to_slabs(allocator, do_drain_step); + return; + + case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY: + vdo_start_draining(&allocator->summary_state, + vdo_get_admin_state_code(&allocator->state), + completion, + initiate_summary_drain); + return; + + case VDO_DRAIN_ALLOCATOR_STEP_FINISHED: + ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), "vio pool not busy"); + vdo_finish_draining_with_result(&allocator->state, completion->result); + return; + + default: + vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE); + } +} + +/* Implements vdo_admin_initiator. */ +static void initiate_drain(struct admin_state *state) +{ + struct block_allocator *allocator = container_of(state, struct block_allocator, state); + + allocator->drain_step = VDO_DRAIN_ALLOCATOR_START; + do_drain_step(&allocator->completion); +} + +/* + * Drain all allocator I/O. Depending upon the type of drain, some or all dirty metadata may be + * written to disk. The type of drain will be determined from the state of the allocator's depot. + * + * Implements vdo_zone_action. + */ +static void drain_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + vdo_start_draining(&depot->allocators[zone_number].state, + vdo_get_current_manager_operation(depot->action_manager), + parent, + initiate_drain); +} + +/** + * vdo_drain_slab_depot() - Drain all slab depot I/O. + * @depot: The depot to drain. + * @operation: The drain operation (flush, rebuild, suspend, or save). + * @parent: The completion to finish when the drain is complete. + * + * If saving, or flushing, all dirty depot metadata will be written out. If saving or suspending, + * the depot will be left in a suspended state. + */ +void vdo_drain_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + vdo_schedule_operation(depot->action_manager, + operation, + NULL, + drain_allocator, + NULL, + parent); +} + +/** + * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped. + * @alocator: The allocator being resumed. + */ +static void resume_scrubbing(struct block_allocator *allocator) +{ + int result; + struct slab_scrubber *scrubber = &allocator->scrubber; + + if (!has_slabs_to_scrub(scrubber)) { + vdo_finish_completion(&allocator->completion); + return; + } + + result = vdo_resume_if_quiescent(&scrubber->admin_state); + if (result != VDO_SUCCESS) { + vdo_fail_completion(&allocator->completion, result); + return; + } + + scrub_next_slab(scrubber); + vdo_finish_completion(&allocator->completion); +} + +static void do_resume_step(struct vdo_completion *completion) +{ + struct block_allocator *allocator = vdo_as_block_allocator(completion); + + vdo_prepare_completion_for_requeue(&allocator->completion, + do_resume_step, + handle_operation_error, + allocator->thread_id, + NULL); + switch (--allocator->drain_step) { + case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY: + vdo_fail_completion(completion, + vdo_resume_if_quiescent(&allocator->summary_state)); + return; + + case VDO_DRAIN_ALLOCATOR_STEP_SLABS: + apply_to_slabs(allocator, do_resume_step); + return; + + case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER: + resume_scrubbing(allocator); + return; + + case VDO_DRAIN_ALLOCATOR_START: + vdo_finish_resuming_with_result(&allocator->state, completion->result); + return; + + default: + vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE); + } +} + +/* Implements vdo_admin_initiator. */ +static void initiate_resume(struct admin_state *state) +{ + struct block_allocator *allocator = container_of(state, struct block_allocator, state); + + allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED; + do_resume_step(&allocator->completion); +} + +/* Implements vdo_zone_action. */ +static void resume_allocator(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + vdo_start_resuming(&depot->allocators[zone_number].state, + vdo_get_current_manager_operation(depot->action_manager), + parent, + initiate_resume); +} + +/** + * vdo_resume_slab_depot() - Resume a suspended slab depot. + * @depot: The depot to resume. + * @parent: The completion to finish when the depot has resumed. + */ +void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent) +{ + if (vdo_is_read_only(depot->vdo)) { + vdo_continue_completion(parent, VDO_READ_ONLY); + return; + } + + vdo_schedule_operation(depot->action_manager, + VDO_ADMIN_STATE_RESUMING, + NULL, + resume_allocator, + NULL, + parent); +} + +/** + * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a + * given recovery journal block. + * @depot: The depot. + * @recovery_block_number: The sequence number of the recovery journal block whose locks should be + * released. + * + * Context: This method must be called from the journal zone thread. + */ +void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, + sequence_number_t recovery_block_number) +{ + if (depot == NULL) + return; + + depot->new_release_request = recovery_block_number; + vdo_schedule_default_action(depot->action_manager); +} + +/* Implements vdo_zone_action. */ +static void scrub_all_unrecovered_slabs(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + scrub_slabs(&depot->allocators[zone_number], NULL); + vdo_launch_completion(parent); +} + +/** + * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs. + * @depot: The depot to scrub. + * @parent: The object to notify when scrubbing has been launched for all zones. + */ +void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, struct vdo_completion *parent) +{ + vdo_schedule_action(depot->action_manager, + NULL, + scrub_all_unrecovered_slabs, + NULL, + parent); +} + +/** + * get_block_allocator_statistics() - Get the total of the statistics from all the block + * allocators in the depot. + * @depot: The slab depot. + * + * Return: The statistics from all block allocators in the depot. + */ +static struct block_allocator_statistics __must_check +get_block_allocator_statistics(const struct slab_depot *depot) +{ + struct block_allocator_statistics totals; + zone_count_t zone; + + memset(&totals, 0, sizeof(totals)); + + for (zone = 0; zone < depot->zone_count; zone++) { + const struct block_allocator *allocator = &depot->allocators[zone]; + const struct block_allocator_statistics *stats = &allocator->statistics; + + totals.slab_count += allocator->slab_count; + totals.slabs_opened += READ_ONCE(stats->slabs_opened); + totals.slabs_reopened += READ_ONCE(stats->slabs_reopened); + } + + return totals; +} + +/** + * get_ref_counts_statistics() - Get the cumulative ref_counts statistics for the depot. + * @depot: The slab depot. + * + * Return: The cumulative statistics for all ref_counts in the depot. + */ +static struct ref_counts_statistics __must_check +get_ref_counts_statistics(const struct slab_depot *depot) +{ + struct ref_counts_statistics totals; + zone_count_t zone; + + memset(&totals, 0, sizeof(totals)); + + for (zone = 0; zone < depot->zone_count; zone++) { + totals.blocks_written += + READ_ONCE(depot->allocators[zone].ref_counts_statistics.blocks_written); + } + + return totals; +} + +/** + * get_depot_slab_journal_statistics() - Get the aggregated slab journal statistics for the depot. + * @depot: The slab depot. + * + * Return: The aggregated statistics for all slab journals in the depot. + */ +static struct slab_journal_statistics __must_check +get_slab_journal_statistics(const struct slab_depot *depot) +{ + struct slab_journal_statistics totals; + zone_count_t zone; + + memset(&totals, 0, sizeof(totals)); + + for (zone = 0; zone < depot->zone_count; zone++) { + const struct slab_journal_statistics *stats = + &depot->allocators[zone].slab_journal_statistics; + + totals.disk_full_count += READ_ONCE(stats->disk_full_count); + totals.flush_count += READ_ONCE(stats->flush_count); + totals.blocked_count += READ_ONCE(stats->blocked_count); + totals.blocks_written += READ_ONCE(stats->blocks_written); + totals.tail_busy_count += READ_ONCE(stats->tail_busy_count); + } + + return totals; +} + +/** + * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the + * slab depot. + * @depot: The slab depot. + * @stats: The vdo statistics structure to partially fill. + */ +void vdo_get_slab_depot_statistics(const struct slab_depot *depot, struct vdo_statistics *stats) +{ + slab_count_t slab_count = READ_ONCE(depot->slab_count); + slab_count_t unrecovered = 0; + zone_count_t zone; + + for (zone = 0; zone < depot->zone_count; zone++) { + /* The allocators are responsible for thread safety. */ + unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count); + } + + stats->recovery_percentage = (slab_count - unrecovered) * 100 / slab_count; + stats->allocator = get_block_allocator_statistics(depot); + stats->ref_counts = get_ref_counts_statistics(depot); + stats->slab_journal = get_slab_journal_statistics(depot); + stats->slab_summary = (struct slab_summary_statistics) { + .blocks_written = atomic64_read(&depot->summary_statistics.blocks_written), + }; +} + +/** + * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion. + * @depot: The slab depot. + */ +void vdo_dump_slab_depot(const struct slab_depot *depot) +{ + uds_log_info("vdo slab depot"); + uds_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu", + (unsigned int) depot->zone_count, + (unsigned int) depot->old_zone_count, + READ_ONCE(depot->slab_count), + (unsigned long long) depot->active_release_request, + (unsigned long long) depot->new_release_request); +} diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h new file mode 100644 index 00000000000..2f9b3cf4a0d --- /dev/null +++ b/drivers/md/dm-vdo/slab-depot.h @@ -0,0 +1,594 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_SLAB_DEPOT_H +#define VDO_SLAB_DEPOT_H + +#include <linux/atomic.h> +#include <linux/dm-kcopyd.h> +#include <linux/list.h> + +#include "numeric.h" + +#include "admin-state.h" +#include "completion.h" +#include "data-vio.h" +#include "encodings.h" +#include "physical-zone.h" +#include "priority-table.h" +#include "recovery-journal.h" +#include "statistics.h" +#include "types.h" +#include "vio.h" +#include "wait-queue.h" + +/* + * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has + * a single array of slabs in order to eliminate the need for additional math in order to compute + * which physical zone a PBN is in. It also has a block_allocator per zone. + * + * Load operations are required to be performed on a single thread. Normal operations are assumed + * to be performed in the appropriate zone. Allocations and reference count updates must be done + * from the thread of their physical zone. Requests to commit slab journal tail blocks from the + * recovery journal must be done on the journal zone thread. Save operations are required to be + * launched from the same thread as the original load operation. + */ + +enum { + /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ + BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, +}; + +/* + * Represents the possible status of a block. + */ +enum reference_status { + RS_FREE, /* this block is free */ + RS_SINGLE, /* this block is singly-referenced */ + RS_SHARED, /* this block is shared */ + RS_PROVISIONAL /* this block is provisionally allocated */ +}; + +struct vdo_slab; + +struct journal_lock { + u16 count; + sequence_number_t recovery_start; +}; + +struct slab_journal { + /* A waiter object for getting a VIO pool entry */ + struct waiter resource_waiter; + /* A waiter object for updating the slab summary */ + struct waiter slab_summary_waiter; + /* A waiter object for getting a vio with which to flush */ + struct waiter flush_waiter; + /* The queue of VIOs waiting to make an entry */ + struct wait_queue entry_waiters; + /* The parent slab reference of this journal */ + struct vdo_slab *slab; + + /* Whether a tail block commit is pending */ + bool waiting_to_commit; + /* Whether the journal is updating the slab summary */ + bool updating_slab_summary; + /* Whether the journal is adding entries from the entry_waiters queue */ + bool adding_entries; + /* Whether a partial write is in progress */ + bool partial_write_in_progress; + + /* The oldest block in the journal on disk */ + sequence_number_t head; + /* The oldest block in the journal which may not be reaped */ + sequence_number_t unreapable; + /* The end of the half-open interval of the active journal */ + sequence_number_t tail; + /* The next journal block to be committed */ + sequence_number_t next_commit; + /* The tail sequence number that is written in the slab summary */ + sequence_number_t summarized; + /* The tail sequence number that was last summarized in slab summary */ + sequence_number_t last_summarized; + + /* The sequence number of the recovery journal lock */ + sequence_number_t recovery_lock; + + /* + * The number of entries which fit in a single block. Can't use the constant because unit + * tests change this number. + */ + journal_entry_count_t entries_per_block; + /* + * The number of full entries which fit in a single block. Can't use the constant because + * unit tests change this number. + */ + journal_entry_count_t full_entries_per_block; + + /* The recovery journal of the VDO (slab journal holds locks on it) */ + struct recovery_journal *recovery_journal; + + /* The statistics shared by all slab journals in our physical zone */ + struct slab_journal_statistics *events; + /* A list of the VIO pool entries for outstanding journal block writes */ + struct list_head uncommitted_blocks; + + /* + * The current tail block header state. This will be packed into the block just before it + * is written. + */ + struct slab_journal_block_header tail_header; + /* A pointer to a block-sized buffer holding the packed block data */ + struct packed_slab_journal_block *block; + + /* The number of blocks in the on-disk journal */ + block_count_t size; + /* The number of blocks at which to start pushing reference blocks */ + block_count_t flushing_threshold; + /* The number of blocks at which all reference blocks should be writing */ + block_count_t flushing_deadline; + /* The number of blocks at which to wait for reference blocks to write */ + block_count_t blocking_threshold; + /* The number of blocks at which to scrub the slab before coming online */ + block_count_t scrubbing_threshold; + + /* This list entry is for block_allocator to keep a queue of dirty journals */ + struct list_head dirty_entry; + + /* The lock for the oldest unreaped block of the journal */ + struct journal_lock *reap_lock; + /* The locks for each on disk block */ + struct journal_lock *locks; +}; + +/* + * Reference_block structure + * + * Blocks are used as a proxy, permitting saves of partial refcounts. + */ +struct reference_block { + /* This block waits on the ref_counts to tell it to write */ + struct waiter waiter; + /* The slab to which this reference_block belongs */ + struct vdo_slab *slab; + /* The number of references in this block that represent allocations */ + block_size_t allocated_count; + /* The slab journal block on which this block must hold a lock */ + sequence_number_t slab_journal_lock; + /* The slab journal block which should be released when this block is committed */ + sequence_number_t slab_journal_lock_to_release; + /* The point up to which each sector is accurate on disk */ + struct journal_point commit_points[VDO_SECTORS_PER_BLOCK]; + /* Whether this block has been modified since it was written to disk */ + bool is_dirty; + /* Whether this block is currently writing */ + bool is_writing; +}; + +/* The search_cursor represents the saved position of a free block search. */ +struct search_cursor { + /* The reference block containing the current search index */ + struct reference_block *block; + /* The position at which to start searching for the next free counter */ + slab_block_number index; + /* The position just past the last valid counter in the current block */ + slab_block_number end_index; + + /* A pointer to the first reference block in the slab */ + struct reference_block *first_block; + /* A pointer to the last reference block in the slab */ + struct reference_block *last_block; +}; + +enum slab_rebuild_status { + VDO_SLAB_REBUILT, + VDO_SLAB_REPLAYING, + VDO_SLAB_REQUIRES_SCRUBBING, + VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, + VDO_SLAB_REBUILDING, +}; + +/* + * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of + * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for + * metadata storage for the reference counts and slab journal for the slab. + * + * A reference count is maintained for each physical block number. The vast majority of blocks have + * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS + * (254) the reference count is stored in counters[pbn]. + */ +struct vdo_slab { + /* A list entry to queue this slab in a block_allocator list */ + struct list_head allocq_entry; + + /* The struct block_allocator that owns this slab */ + struct block_allocator *allocator; + + /* The journal for this slab */ + struct slab_journal journal; + + /* The slab number of this slab */ + slab_count_t slab_number; + /* The offset in the allocator partition of the first block in this slab */ + physical_block_number_t start; + /* The offset of the first block past the end of this slab */ + physical_block_number_t end; + /* The starting translated PBN of the slab journal */ + physical_block_number_t journal_origin; + /* The starting translated PBN of the reference counts */ + physical_block_number_t ref_counts_origin; + + /* The administrative state of the slab */ + struct admin_state state; + /* The status of the slab */ + enum slab_rebuild_status status; + /* Whether the slab was ever queued for scrubbing */ + bool was_queued_for_scrubbing; + + /* The priority at which this slab has been queued for allocation */ + u8 priority; + + /* Fields beyond this point are the reference counts for the data blocks in this slab. */ + /* The size of the counters array */ + u32 block_count; + /* The number of free blocks */ + u32 free_blocks; + /* The array of reference counts */ + vdo_refcount_t *counters; /* use UDS_ALLOCATE to align data ptr */ + + /* The saved block pointer and array indexes for the free block search */ + struct search_cursor search_cursor; + + /* A list of the dirty blocks waiting to be written out */ + struct wait_queue dirty_blocks; + /* The number of blocks which are currently writing */ + size_t active_count; + + /* A waiter object for updating the slab summary */ + struct waiter summary_waiter; + + /* The latest slab journal for which there has been a reference count update */ + struct journal_point slab_journal_point; + + /* The number of reference count blocks */ + u32 reference_block_count; + /* reference count block array */ + struct reference_block *reference_blocks; +}; + +enum block_allocator_drain_step { + VDO_DRAIN_ALLOCATOR_START, + VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER, + VDO_DRAIN_ALLOCATOR_STEP_SLABS, + VDO_DRAIN_ALLOCATOR_STEP_SUMMARY, + VDO_DRAIN_ALLOCATOR_STEP_FINISHED, +}; + +struct slab_scrubber { + /* The queue of slabs to scrub first */ + struct list_head high_priority_slabs; + /* The queue of slabs to scrub once there are no high_priority_slabs */ + struct list_head slabs; + /* The queue of VIOs waiting for a slab to be scrubbed */ + struct wait_queue waiters; + + /* + * The number of slabs that are unrecovered or being scrubbed. This field is modified by + * the physical zone thread, but is queried by other threads. + */ + slab_count_t slab_count; + + /* The administrative state of the scrubber */ + struct admin_state admin_state; + /* Whether to only scrub high-priority slabs */ + bool high_priority_only; + /* The slab currently being scrubbed */ + struct vdo_slab *slab; + /* The vio for loading slab journal blocks */ + struct vio vio; +}; + +/* A sub-structure for applying actions in parallel to all an allocator's slabs. */ +struct slab_actor { + /* The number of slabs performing a slab action */ + slab_count_t slab_action_count; + /* The method to call when a slab action has been completed by all slabs */ + vdo_action *callback; +}; + +/* A slab_iterator is a structure for iterating over a set of slabs. */ +struct slab_iterator { + struct vdo_slab **slabs; + struct vdo_slab *next; + slab_count_t end; + slab_count_t stride; +}; + +/* + * The slab_summary provides hints during load and recovery about the state of the slabs in order + * to avoid the need to read the slab journals in their entirety before a VDO can come online. + * + * The information in the summary for each slab includes the rough number of free blocks (which is + * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free + * space will be used on restart), and the location of the tail block of the slab's journal. + * + * The slab_summary has its own partition at the end of the volume which is sized to allow for a + * complete copy of the summary for each of up to 16 physical zones. + * + * During resize, the slab_summary moves its backing partition and is saved once moved; the + * slab_summary is not permitted to overwrite the previous recovery journal space. + * + * The slab_summary does not have its own version information, but relies on the VDO volume version + * number. + */ + +/* + * A slab status is a very small structure for use in determining the ordering of slabs in the + * scrubbing process. + */ +struct slab_status { + slab_count_t slab_number; + bool is_clean; + u8 emptiness; +}; + +struct slab_summary_block { + /* The block_allocator to which this block belongs */ + struct block_allocator *allocator; + /* The index of this block in its zone's summary */ + block_count_t index; + /* Whether this block has a write outstanding */ + bool writing; + /* Ring of updates waiting on the outstanding write */ + struct wait_queue current_update_waiters; + /* Ring of updates waiting on the next write */ + struct wait_queue next_update_waiters; + /* The active slab_summary_entry array for this block */ + struct slab_summary_entry *entries; + /* The vio used to write this block */ + struct vio vio; + /* The packed entries, one block long, backing the vio */ + char *outgoing_entries; +}; + +/* + * The statistics for all the slab summary zones owned by this slab summary. These fields are all + * mutated only by their physical zone threads, but are read by other threads when gathering + * statistics for the entire depot. + */ +struct atomic_slab_summary_statistics { + /* Number of blocks written */ + atomic64_t blocks_written; +}; + +struct block_allocator { + struct vdo_completion completion; + /* The slab depot for this allocator */ + struct slab_depot *depot; + /* The nonce of the VDO */ + nonce_t nonce; + /* The physical zone number of this allocator */ + zone_count_t zone_number; + /* The thread ID for this allocator's physical zone */ + thread_id_t thread_id; + /* The number of slabs in this allocator */ + slab_count_t slab_count; + /* The number of the last slab owned by this allocator */ + slab_count_t last_slab; + /* The reduced priority level used to preserve unopened slabs */ + unsigned int unopened_slab_priority; + /* The state of this allocator */ + struct admin_state state; + /* The actor for applying an action to all slabs */ + struct slab_actor slab_actor; + + /* The slab from which blocks are currently being allocated */ + struct vdo_slab *open_slab; + /* A priority queue containing all slabs available for allocation */ + struct priority_table *prioritized_slabs; + /* The slab scrubber */ + struct slab_scrubber scrubber; + /* What phase of the close operation the allocator is to perform */ + enum block_allocator_drain_step drain_step; + + /* + * These statistics are all mutated only by the physical zone thread, but are read by other + * threads when gathering statistics for the entire depot. + */ + /* + * The count of allocated blocks in this zone. Not in block_allocator_statistics for + * historical reasons. + */ + u64 allocated_blocks; + /* Statistics for this block allocator */ + struct block_allocator_statistics statistics; + /* Cumulative statistics for the slab journals in this zone */ + struct slab_journal_statistics slab_journal_statistics; + /* Cumulative statistics for the reference counters in this zone */ + struct ref_counts_statistics ref_counts_statistics; + + /* + * This is the head of a queue of slab journals which have entries in their tail blocks + * which have not yet started to commit. When the recovery journal is under space pressure, + * slab journals which have uncommitted entries holding a lock on the recovery journal head + * are forced to commit their blocks early. This list is kept in order, with the tail + * containing the slab journal holding the most recent recovery journal lock. + */ + struct list_head dirty_slab_journals; + + /* The vio pool for reading and writing block allocator metadata */ + struct vio_pool *vio_pool; + /* The dm_kcopyd client for erasing slab journals */ + struct dm_kcopyd_client *eraser; + /* Iterator over the slabs to be erased */ + struct slab_iterator slabs_to_erase; + + /* The portion of the slab summary managed by this allocator */ + /* The state of the slab summary */ + struct admin_state summary_state; + /* The number of outstanding summary writes */ + block_count_t summary_write_count; + /* The array (owned by the blocks) of all entries */ + struct slab_summary_entry *summary_entries; + /* The array of slab_summary_blocks */ + struct slab_summary_block *summary_blocks; +}; + +enum slab_depot_load_type { + VDO_SLAB_DEPOT_NORMAL_LOAD, + VDO_SLAB_DEPOT_RECOVERY_LOAD, + VDO_SLAB_DEPOT_REBUILD_LOAD +}; + +struct slab_depot { + zone_count_t zone_count; + zone_count_t old_zone_count; + struct vdo *vdo; + struct slab_config slab_config; + struct action_manager *action_manager; + + physical_block_number_t first_block; + physical_block_number_t last_block; + physical_block_number_t origin; + + /* slab_size == (1 << slab_size_shift) */ + unsigned int slab_size_shift; + + /* Determines how slabs should be queued during load */ + enum slab_depot_load_type load_type; + + /* The state for notifying slab journals to release recovery journal */ + sequence_number_t active_release_request; + sequence_number_t new_release_request; + + /* State variables for scrubbing complete handling */ + atomic_t zones_to_scrub; + + /* Array of pointers to individually allocated slabs */ + struct vdo_slab **slabs; + /* The number of slabs currently allocated and stored in 'slabs' */ + slab_count_t slab_count; + + /* Array of pointers to a larger set of slabs (used during resize) */ + struct vdo_slab **new_slabs; + /* The number of slabs currently allocated and stored in 'new_slabs' */ + slab_count_t new_slab_count; + /* The size that 'new_slabs' was allocated for */ + block_count_t new_size; + + /* The last block before resize, for rollback */ + physical_block_number_t old_last_block; + /* The last block after resize, for resize */ + physical_block_number_t new_last_block; + + /* The statistics for the slab summary */ + struct atomic_slab_summary_statistics summary_statistics; + /* The start of the slab summary partition */ + physical_block_number_t summary_origin; + /* The number of bits to shift to get a 7-bit fullness hint */ + unsigned int hint_shift; + /* The slab summary entries for all of the zones the partition can hold */ + struct slab_summary_entry *summary_entries; + + /* The block allocators for this depot */ + struct block_allocator allocators[]; +}; + +struct reference_updater; + +bool __must_check +vdo_attempt_replay_into_slab(struct vdo_slab *slab, + physical_block_number_t pbn, + enum journal_operation operation, + bool increment, + struct journal_point *recovery_point, + struct vdo_completion *parent); + +int __must_check +vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot, + physical_block_number_t pbn, + enum journal_operation operation); + +static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION); + return container_of(completion, struct block_allocator, completion); +} + +int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab, + physical_block_number_t pbn, + struct pbn_lock *lock); + +int __must_check +vdo_allocate_block(struct block_allocator *allocator, physical_block_number_t *block_number_ptr); + +int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, struct waiter *waiter); + +void vdo_modify_reference_count(struct vdo_completion *completion, + struct reference_updater *updater); + +int __must_check vdo_release_block_reference(struct block_allocator *allocator, + physical_block_number_t pbn); + +void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion); + +void vdo_dump_block_allocator(const struct block_allocator *allocator); + +int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state, + struct vdo *vdo, + struct partition *summary_partition, + struct slab_depot **depot_ptr); + +void vdo_free_slab_depot(struct slab_depot *depot); + +struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot); + +int __must_check vdo_allocate_reference_counters(struct slab_depot *depot); + +struct vdo_slab * __must_check +vdo_get_slab(const struct slab_depot *depot, physical_block_number_t pbn); + +u8 __must_check vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn); + +bool __must_check +vdo_is_physical_data_block(const struct slab_depot *depot, physical_block_number_t pbn); + +block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot); + +block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot); + +void vdo_get_slab_depot_statistics(const struct slab_depot *depot, struct vdo_statistics *stats); + +void vdo_load_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent, + void *context); + +void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot, + enum slab_depot_load_type load_type, + struct vdo_completion *parent); + +void vdo_update_slab_depot_size(struct slab_depot *depot); + +int __must_check +vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, const struct partition *partition); + +void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent); + +void vdo_abandon_new_slabs(struct slab_depot *depot); + +void vdo_drain_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent); + +void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, + sequence_number_t recovery_block_number); + +void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, struct vdo_completion *parent); + +void vdo_dump_slab_depot(const struct slab_depot *depot); + +#endif /* VDO_SLAB_DEPOT_H */ diff --git a/drivers/md/dm-vdo/statistics.h b/drivers/md/dm-vdo/statistics.h new file mode 100644 index 00000000000..28f69ea0c8b --- /dev/null +++ b/drivers/md/dm-vdo/statistics.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef STATISTICS_H +#define STATISTICS_H + +#include "types.h" + +enum { + STATISTICS_VERSION = 36, +}; + +struct block_allocator_statistics { + /** The total number of slabs from which blocks may be allocated */ + u64 slab_count; + /** The total number of slabs from which blocks have ever been allocated */ + u64 slabs_opened; + /** The number of times since loading that a slab has been re-opened */ + u64 slabs_reopened; +}; + +/** + * Counters for tracking the number of items written (blocks, requests, etc.) + * that keep track of totals at steps in the write pipeline. Three counters + * allow the number of buffered, in-memory items and the number of in-flight, + * unacknowledged writes to be derived, while still tracking totals for + * reporting purposes + */ +struct commit_statistics { + /** The total number of items on which processing has started */ + u64 started; + /** The total number of items for which a write operation has been issued */ + u64 written; + /** The total number of items for which a write operation has completed */ + u64 committed; +}; + +/** Counters for events in the recovery journal */ +struct recovery_journal_statistics { + /** Number of times the on-disk journal was full */ + u64 disk_full; + /** Number of times the recovery journal requested slab journal commits. */ + u64 slab_journal_commits_requested; + /** Write/Commit totals for individual journal entries */ + struct commit_statistics entries; + /** Write/Commit totals for journal blocks */ + struct commit_statistics blocks; +}; + +/** The statistics for the compressed block packer. */ +struct packer_statistics { + /** Number of compressed data items written since startup */ + u64 compressed_fragments_written; + /** Number of blocks containing compressed items written since startup */ + u64 compressed_blocks_written; + /** Number of VIOs that are pending in the packer */ + u64 compressed_fragments_in_packer; +}; + +/** The statistics for the slab journals. */ +struct slab_journal_statistics { + /** Number of times the on-disk journal was full */ + u64 disk_full_count; + /** Number of times an entry was added over the flush threshold */ + u64 flush_count; + /** Number of times an entry was added over the block threshold */ + u64 blocked_count; + /** Number of times a tail block was written */ + u64 blocks_written; + /** Number of times we had to wait for the tail to write */ + u64 tail_busy_count; +}; + +/** The statistics for the slab summary. */ +struct slab_summary_statistics { + /** Number of blocks written */ + u64 blocks_written; +}; + +/** The statistics for the reference counts. */ +struct ref_counts_statistics { + /** Number of reference blocks written */ + u64 blocks_written; +}; + +/** The statistics for the block map. */ +struct block_map_statistics { + /** number of dirty (resident) pages */ + u32 dirty_pages; + /** number of clean (resident) pages */ + u32 clean_pages; + /** number of free pages */ + u32 free_pages; + /** number of pages in failed state */ + u32 failed_pages; + /** number of pages incoming */ + u32 incoming_pages; + /** number of pages outgoing */ + u32 outgoing_pages; + /** how many times free page not avail */ + u32 cache_pressure; + /** number of get_vdo_page() calls for read */ + u64 read_count; + /** number of get_vdo_page() calls for write */ + u64 write_count; + /** number of times pages failed to read */ + u64 failed_reads; + /** number of times pages failed to write */ + u64 failed_writes; + /** number of gets that are reclaimed */ + u64 reclaimed; + /** number of gets for outgoing pages */ + u64 read_outgoing; + /** number of gets that were already there */ + u64 found_in_cache; + /** number of gets requiring discard */ + u64 discard_required; + /** number of gets enqueued for their page */ + u64 wait_for_page; + /** number of gets that have to fetch */ + u64 fetch_required; + /** number of page fetches */ + u64 pages_loaded; + /** number of page saves */ + u64 pages_saved; + /** the number of flushes issued */ + u64 flush_count; +}; + +/** The dedupe statistics from hash locks */ +struct hash_lock_statistics { + /** Number of times the UDS advice proved correct */ + u64 dedupe_advice_valid; + /** Number of times the UDS advice proved incorrect */ + u64 dedupe_advice_stale; + /** Number of writes with the same data as another in-flight write */ + u64 concurrent_data_matches; + /** Number of writes whose hash collided with an in-flight write */ + u64 concurrent_hash_collisions; + /** Current number of dedupe queries that are in flight */ + u32 curr_dedupe_queries; +}; + +/** Counts of error conditions in VDO. */ +struct error_statistics { + /** number of times VDO got an invalid dedupe advice PBN from UDS */ + u64 invalid_advice_pbn_count; + /** number of times a VIO completed with a VDO_NO_SPACE error */ + u64 no_space_error_count; + /** number of times a VIO completed with a VDO_READ_ONLY error */ + u64 read_only_error_count; +}; + +struct bio_stats { + /** Number of REQ_OP_READ bios */ + u64 read; + /** Number of REQ_OP_WRITE bios with data */ + u64 write; + /** Number of bios tagged with REQ_PREFLUSH and containing no data */ + u64 empty_flush; + /** Number of REQ_OP_DISCARD bios */ + u64 discard; + /** Number of bios tagged with REQ_PREFLUSH */ + u64 flush; + /** Number of bios tagged with REQ_FUA */ + u64 fua; +}; + +struct memory_usage { + /** Tracked bytes currently allocated. */ + u64 bytes_used; + /** Maximum tracked bytes allocated. */ + u64 peak_bytes_used; +}; + +/** UDS index statistics */ +struct index_statistics { + /** Number of records stored in the index */ + u64 entries_indexed; + /** Number of post calls that found an existing entry */ + u64 posts_found; + /** Number of post calls that added a new entry */ + u64 posts_not_found; + /** Number of query calls that found an existing entry */ + u64 queries_found; + /** Number of query calls that added a new entry */ + u64 queries_not_found; + /** Number of update calls that found an existing entry */ + u64 updates_found; + /** Number of update calls that added a new entry */ + u64 updates_not_found; + /** Number of entries discarded */ + u64 entries_discarded; +}; + +/** The statistics of the vdo service. */ +struct vdo_statistics { + u32 version; + u32 release_version; + /** Number of blocks used for data */ + u64 data_blocks_used; + /** Number of blocks used for VDO metadata */ + u64 overhead_blocks_used; + /** Number of logical blocks that are currently mapped to physical blocks */ + u64 logical_blocks_used; + /** number of physical blocks */ + block_count_t physical_blocks; + /** number of logical blocks */ + block_count_t logical_blocks; + /** Size of the block map page cache, in bytes */ + u64 block_map_cache_size; + /** The physical block size */ + u64 block_size; + /** Number of times the VDO has successfully recovered */ + u64 complete_recoveries; + /** Number of times the VDO has recovered from read-only mode */ + u64 read_only_recoveries; + /** String describing the operating mode of the VDO */ + char mode[15]; + /** Whether the VDO is in recovery mode */ + bool in_recovery_mode; + /** What percentage of recovery mode work has been completed */ + u8 recovery_percentage; + /** The statistics for the compressed block packer */ + struct packer_statistics packer; + /** Counters for events in the block allocator */ + struct block_allocator_statistics allocator; + /** Counters for events in the recovery journal */ + struct recovery_journal_statistics journal; + /** The statistics for the slab journals */ + struct slab_journal_statistics slab_journal; + /** The statistics for the slab summary */ + struct slab_summary_statistics slab_summary; + /** The statistics for the reference counts */ + struct ref_counts_statistics ref_counts; + /** The statistics for the block map */ + struct block_map_statistics block_map; + /** The dedupe statistics from hash locks */ + struct hash_lock_statistics hash_lock; + /** Counts of error conditions */ + struct error_statistics errors; + /** The VDO instance */ + u32 instance; + /** Current number of active VIOs */ + u32 current_vios_in_progress; + /** Maximum number of active VIOs */ + u32 max_vios; + /** Number of times the UDS index was too slow in responding */ + u64 dedupe_advice_timeouts; + /** Number of flush requests submitted to the storage device */ + u64 flush_out; + /** Logical block size */ + u64 logical_block_size; + /** Bios submitted into VDO from above */ + struct bio_stats bios_in; + struct bio_stats bios_in_partial; + /** Bios submitted onward for user data */ + struct bio_stats bios_out; + /** Bios submitted onward for metadata */ + struct bio_stats bios_meta; + struct bio_stats bios_journal; + struct bio_stats bios_page_cache; + struct bio_stats bios_out_completed; + struct bio_stats bios_meta_completed; + struct bio_stats bios_journal_completed; + struct bio_stats bios_page_cache_completed; + struct bio_stats bios_acknowledged; + struct bio_stats bios_acknowledged_partial; + /** Current number of bios in progress */ + struct bio_stats bios_in_progress; + /** Memory usage stats. */ + struct memory_usage memory_usage; + /** The statistics for the UDS index */ + struct index_statistics index; +}; + +#endif /* not STATISTICS_H */ diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c new file mode 100644 index 00000000000..bbad5a918e1 --- /dev/null +++ b/drivers/md/dm-vdo/status-codes.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "status-codes.h" + + +#include "errors.h" +#include "logger.h" +#include "permassert.h" +#include "uds-threads.h" + +const struct error_info vdo_status_list[] = { + { "VDO_NOT_IMPLEMENTED", "Not implemented" }, + { "VDO_OUT_OF_RANGE", "Out of range" }, + { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" }, + { "VDO_NO_SPACE", "Out of space" }, + { "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" }, + { "VDO_BAD_CONFIGURATION", "Bad configuration option" }, + { "VDO_SOCKET_ERROR", "Socket error" }, + { "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" }, + { "VDO_COMPONENT_BUSY", "Prior operation still in progress" }, + { "VDO_BAD_PAGE", "Corrupt or incorrect page" }, + { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" }, + { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" }, + { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" }, + { "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" }, + { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" }, + { "VDO_PARTITION_EXISTS", "A partition already exists with a given id" }, + { "VDO_NOT_READ_ONLY", "The device is not in read-only mode" }, + { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" }, + { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" }, + { "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" }, + { "VDO_LOCK_ERROR", "A lock is held incorrectly" }, + { "VDO_READ_ONLY", "The device is in read-only mode" }, + { "VDO_SHUTTING_DOWN", "The device is shutting down" }, + { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" }, + { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" }, + { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" }, + { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" }, + { "VDO_UNKNOWN_COMMAND", "The extended command is not known" }, + { "VDO_COMMAND_ERROR", "Bad extended command parameters" }, + { "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" }, + { "VDO_BAD_MAPPING", "Invalid page mapping" }, + { "VDO_READ_CACHE_BUSY", "Read cache has no free slots" }, + { "VDO_BIO_CREATION_FAILED", "Bio creation failed" }, + { "VDO_BAD_MAGIC", "Bad magic number" }, + { "VDO_BAD_NONCE", "Bad nonce" }, + { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" }, + { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" }, + { "VDO_CANT_ADD_SYSFS_NODE", "Failed to add sysfs node" }, +}; + +static atomic_t vdo_status_codes_registered = ATOMIC_INIT(0); +static int status_code_registration_result; + +static void do_status_code_registration(void) +{ + int result; + + STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) == + ARRAY_SIZE(vdo_status_list)); + + result = uds_register_error_block("VDO Status", + VDO_STATUS_CODE_BASE, + VDO_STATUS_CODE_BLOCK_END, + vdo_status_list, + sizeof(vdo_status_list)); + /* + * The following test handles cases where libvdo is statically linked against both the test + * modules and the test driver (because multiple instances of this module call their own + * copy of this function once each, resulting in multiple calls to register_error_block + * which is shared in libuds). + */ + if (result == UDS_DUPLICATE_NAME) + result = UDS_SUCCESS; + + status_code_registration_result = (result == UDS_SUCCESS) ? VDO_SUCCESS : result; +} + +/** + * vdo_register_status_codes() - Register the VDO status codes if needed. + * Return: A success or error code. + */ +int vdo_register_status_codes(void) +{ + uds_perform_once(&vdo_status_codes_registered, do_status_code_registration); + return status_code_registration_result; +} + +/** + * vdo_map_to_system_error() - Given an error code, return a value we can return to the OS. + * @error: The error code to convert. + * + * The input error code may be a system-generated value (such as -EIO), an errno macro used in our + * code (such as EIO), or a UDS or VDO status code; the result must be something the rest of the OS + * can consume (negative errno values such as -EIO, in the case of the kernel). + * + * Return: A system error code value. + */ +int vdo_map_to_system_error(int error) +{ + char error_name[UDS_MAX_ERROR_NAME_SIZE]; + char error_message[UDS_MAX_ERROR_MESSAGE_SIZE]; + + /* 0 is success, negative a system error code */ + if (likely(error <= 0)) + return error; + if (error < 1024) + return -error; + + /* VDO or UDS error */ + switch (error) { + case VDO_NO_SPACE: + return -ENOSPC; + case VDO_READ_ONLY: + return -EIO; + default: + uds_log_info("%s: mapping internal status code %d (%s: %s) to EIO", + __func__, + error, + uds_string_error_name(error, error_name, sizeof(error_name)), + uds_string_error(error, error_message, sizeof(error_message))); + return -EIO; + } +} diff --git a/drivers/md/dm-vdo/status-codes.h b/drivers/md/dm-vdo/status-codes.h new file mode 100644 index 00000000000..34ad2445419 --- /dev/null +++ b/drivers/md/dm-vdo/status-codes.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_STATUS_CODES_H +#define VDO_STATUS_CODES_H + +#include "errors.h" + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +/* VDO-specific status codes. */ +enum vdo_status_codes { + /* successful result */ + VDO_SUCCESS, + /* base of all VDO errors */ + VDO_STATUS_CODE_BASE = VDO_BLOCK_START, + /* we haven't written this yet */ + VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, + /* input out of range */ + VDO_OUT_OF_RANGE, + /* an invalid reference count would result */ + VDO_REF_COUNT_INVALID, + /* a free block could not be allocated */ + VDO_NO_SPACE, + /* unexpected EOF on block read */ + VDO_UNEXPECTED_EOF, + /* improper or missing configuration option */ + VDO_BAD_CONFIGURATION, + /* socket opening or binding problem */ + VDO_SOCKET_ERROR, + /* read or write on non-aligned offset */ + VDO_BAD_ALIGNMENT, + /* prior operation still in progress */ + VDO_COMPONENT_BUSY, + /* page contents incorrect or corrupt data */ + VDO_BAD_PAGE, + /* unsupported version of some component */ + VDO_UNSUPPORTED_VERSION, + /* component id mismatch in decoder */ + VDO_INCORRECT_COMPONENT, + /* parameters have conflicting values */ + VDO_PARAMETER_MISMATCH, + /* the block size is too small */ + VDO_BLOCK_SIZE_TOO_SMALL, + /* no partition exists with a given id */ + VDO_UNKNOWN_PARTITION, + /* a partition already exists with a given id */ + VDO_PARTITION_EXISTS, + /* the VDO is not in read-only mode */ + VDO_NOT_READ_ONLY, + /* physical block growth of too few blocks */ + VDO_INCREMENT_TOO_SMALL, + /* incorrect checksum */ + VDO_CHECKSUM_MISMATCH, + /* the recovery journal is full */ + VDO_RECOVERY_JOURNAL_FULL, + /* a lock is held incorrectly */ + VDO_LOCK_ERROR, + /* the VDO is in read-only mode */ + VDO_READ_ONLY, + /* the VDO is shutting down */ + VDO_SHUTTING_DOWN, + /* the recovery journal has corrupt entries */ + VDO_CORRUPT_JOURNAL, + /* exceeds maximum number of slabs supported */ + VDO_TOO_MANY_SLABS, + /* a compressed block fragment is invalid */ + VDO_INVALID_FRAGMENT, + /* action is unsupported while rebuilding */ + VDO_RETRY_AFTER_REBUILD, + /* the extended command is not known */ + VDO_UNKNOWN_COMMAND, + /* bad extended command parameters */ + VDO_COMMAND_ERROR, + /* cannot determine sizes to fit */ + VDO_CANNOT_DETERMINE_SIZE, + /* a block map entry is invalid */ + VDO_BAD_MAPPING, + /* read cache has no free slots */ + VDO_READ_CACHE_BUSY, + /* bio_add_page failed */ + VDO_BIO_CREATION_FAILED, + /* bad magic number */ + VDO_BAD_MAGIC, + /* bad nonce */ + VDO_BAD_NONCE, + /* sequence number overflow */ + VDO_JOURNAL_OVERFLOW, + /* the VDO is not in a state to perform an admin operation */ + VDO_INVALID_ADMIN_STATE, + /* failure adding a sysfs node */ + VDO_CANT_ADD_SYSFS_NODE, + /* one more than last error code */ + VDO_STATUS_CODE_LAST, + VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END +}; + +extern const struct error_info vdo_status_list[]; + +int vdo_register_status_codes(void); + +int vdo_map_to_system_error(int error); + +#endif /* VDO_STATUS_CODES_H */ diff --git a/drivers/md/dm-vdo/sysfs.c b/drivers/md/dm-vdo/sysfs.c new file mode 100644 index 00000000000..a091933a0a5 --- /dev/null +++ b/drivers/md/dm-vdo/sysfs.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include <linux/module.h> + +#include "logger.h" + +#include "constants.h" +#include "dedupe.h" +#include "vdo.h" + +static int vdo_log_level_show(char *buf, const struct kernel_param *kp) +{ + return sprintf(buf, "%s\n", uds_log_priority_to_string(uds_get_log_level())); +} + +static int vdo_log_level_store(const char *buf, const struct kernel_param *kp) +{ + static char internal_buf[11]; + + int n = strlen(buf); + + if (n > 10) + return -EINVAL; + + memset(internal_buf, '\000', sizeof(internal_buf)); + memcpy(internal_buf, buf, n); + if (internal_buf[n - 1] == '\n') + internal_buf[n - 1] = '\000'; + uds_set_log_level(uds_log_string_to_priority(internal_buf)); + return 0; +} + + +static int vdo_dedupe_timeout_interval_store(const char *buf, const struct kernel_param *kp) +{ + int result = param_set_uint(buf, kp); + + if (result != 0) + return result; + vdo_set_dedupe_index_timeout_interval(*(uint *)kp->arg); + return 0; +} + +static int vdo_min_dedupe_timer_interval_store(const char *buf, const struct kernel_param *kp) +{ + int result = param_set_uint(buf, kp); + + if (result != 0) + return result; + vdo_set_dedupe_index_min_timer_interval(*(uint *)kp->arg); + return 0; +} + +static const struct kernel_param_ops log_level_ops = { + .set = vdo_log_level_store, + .get = vdo_log_level_show, +}; + + +static const struct kernel_param_ops dedupe_timeout_ops = { + .set = vdo_dedupe_timeout_interval_store, + .get = param_get_uint, +}; + +static const struct kernel_param_ops dedupe_timer_ops = { + .set = vdo_min_dedupe_timer_interval_store, + .get = param_get_uint, +}; + +module_param_cb(log_level, &log_level_ops, NULL, 0644); + + +module_param_cb(deduplication_timeout_interval, + &dedupe_timeout_ops, + &vdo_dedupe_index_timeout_interval, + 0644); + +module_param_cb(min_deduplication_timer_interval, + &dedupe_timer_ops, + &vdo_dedupe_index_min_timer_interval, + 0644); diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h new file mode 100644 index 00000000000..ad719427290 --- /dev/null +++ b/drivers/md/dm-vdo/types.h @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_TYPES_H +#define VDO_TYPES_H + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/device-mapper.h> +#include <linux/list.h> +#include <linux/compiler_attributes.h> +#include <linux/types.h> + +#include "funnel-queue.h" + +/* A size type in blocks. */ +typedef u64 block_count_t; + +/* The size of a block. */ +typedef u16 block_size_t; + +/* A counter for data_vios */ +typedef u16 data_vio_count_t; + +/* A height within a tree. */ +typedef u8 height_t; + +/* The logical block number as used by the consumer. */ +typedef u64 logical_block_number_t; + +/* The type of the nonce used to identify instances of VDO. */ +typedef u64 nonce_t; + +/* A size in pages. */ +typedef u32 page_count_t; + +/* A page number. */ +typedef u32 page_number_t; + +/* + * The physical (well, less logical) block number at which the block is found on the underlying + * device. + */ +typedef u64 physical_block_number_t; + +/* + * A release version number. These numbers are used to make the numbering space for component + * versions independent across release branches. + * + * Really an enum, but we have to specify the size for encoding; see release_versions.h for the + * enumeration values. + */ +typedef u32 release_version_number_t; + +/* A count of tree roots. */ +typedef u8 root_count_t; + +/* A number of sectors. */ +typedef u8 sector_count_t; + +/* A sequence number. */ +typedef u64 sequence_number_t; + +/* The offset of a block within a slab. */ +typedef u32 slab_block_number; + +/* A size type in slabs. */ +typedef u16 slab_count_t; + +/* A slot in a bin or block map page. */ +typedef u16 slot_number_t; + +/* typedef thread_count_t - A thread counter. */ +typedef u8 thread_count_t; + +/* typedef thread_id_t - A thread ID, vdo threads are numbered sequentially from 0. */ +typedef u8 thread_id_t; + +/* A zone counter */ +typedef u8 zone_count_t; + +/* The following enums are persisted on storage, so the values must be preserved. */ + +/* The current operating mode of the VDO. */ +enum vdo_state { + VDO_DIRTY = 0, + VDO_NEW = 1, + VDO_CLEAN = 2, + VDO_READ_ONLY_MODE = 3, + VDO_FORCE_REBUILD = 4, + VDO_RECOVERING = 5, + VDO_REPLAYING = 6, /* VDO_REPLAYING is never set anymore, but retained for upgrade */ + VDO_REBUILD_FOR_UPGRADE = 7, + + /* Keep VDO_STATE_COUNT at the bottom. */ + VDO_STATE_COUNT +}; + +/** + * vdo_state_requires_read_only_rebuild() - Check whether a vdo_state indicates + * that a read-only rebuild is required. + * @state: The vdo_state to check. + * + * Return: true if the state indicates a rebuild is required + */ +static inline bool __must_check vdo_state_requires_read_only_rebuild(enum vdo_state state) +{ + return ((state == VDO_FORCE_REBUILD) || (state == VDO_REBUILD_FOR_UPGRADE)); +} + +/** + * vdo_state_requires_recovery() - Check whether a vdo state indicates that recovery is needed. + * @state: The state to check. + * + * Return: true if the state indicates a recovery is required + */ +static inline bool __must_check vdo_state_requires_recovery(enum vdo_state state) +{ + return ((state == VDO_DIRTY) || (state == VDO_REPLAYING) || (state == VDO_RECOVERING)); +} + +/* + * The current operation on a physical block (from the point of view of the recovery journal, slab + * journals, and reference counts. + */ +enum journal_operation { + VDO_JOURNAL_DATA_REMAPPING = 0, + VDO_JOURNAL_BLOCK_MAP_REMAPPING = 1, +} __packed; + +/* Partition IDs encoded in the volume layout in the super block. */ +enum partition_id { + VDO_BLOCK_MAP_PARTITION = 0, + VDO_SLAB_DEPOT_PARTITION = 1, + VDO_RECOVERY_JOURNAL_PARTITION = 2, + VDO_SLAB_SUMMARY_PARTITION = 3, +} __packed; + +/* Metadata types for the vdo. */ +enum vdo_metadata_type { + VDO_METADATA_RECOVERY_JOURNAL = 1, + VDO_METADATA_SLAB_JOURNAL = 2, + VDO_METADATA_RECOVERY_JOURNAL_2 = 3, +} __packed; + +/* A position in the block map where a block map entry is stored. */ +struct block_map_slot { + physical_block_number_t pbn; + slot_number_t slot; +}; + +/* + * Four bits of each five-byte block map entry contain a mapping state value used to distinguish + * unmapped or trimmed logical blocks (which are treated as mapped to the zero block) from entries + * that have been mapped to a physical block, including the zero block. + * + * FIXME: these should maybe be defines. + */ +enum block_mapping_state { + VDO_MAPPING_STATE_UNMAPPED = 0, /* Must be zero to be the default value */ + VDO_MAPPING_STATE_UNCOMPRESSED = 1, /* A normal (uncompressed) block */ + VDO_MAPPING_STATE_COMPRESSED_BASE = 2, /* Compressed in slot 0 */ + VDO_MAPPING_STATE_COMPRESSED_MAX = 15, /* Compressed in slot 13 */ +}; + +enum { + VDO_MAX_COMPRESSION_SLOTS = + (VDO_MAPPING_STATE_COMPRESSED_MAX - VDO_MAPPING_STATE_COMPRESSED_BASE + 1), +}; + + +struct data_location { + physical_block_number_t pbn; + enum block_mapping_state state; +}; + +/* The configuration of a single slab derived from the configured block size and slab size. */ +struct slab_config { + /* total number of blocks in the slab */ + block_count_t slab_blocks; + /* number of blocks available for data */ + block_count_t data_blocks; + /* number of blocks for reference counts */ + block_count_t reference_count_blocks; + /* number of blocks for the slab journal */ + block_count_t slab_journal_blocks; + /* + * Number of blocks after which the slab journal starts pushing out a reference_block for + * each new entry it receives. + */ + block_count_t slab_journal_flushing_threshold; + /* + * Number of blocks after which the slab journal pushes out all reference_blocks and makes + * all vios wait. + */ + block_count_t slab_journal_blocking_threshold; + /* Number of blocks after which the slab must be scrubbed before coming online. */ + block_count_t slab_journal_scrubbing_threshold; +} __packed; + +/* + * This structure is memcmp'd for equality. Keep it packed and don't add any fields that are not + * properly set in both extant and parsed configs. + */ +struct thread_count_config { + unsigned int bio_ack_threads; + unsigned int bio_threads; + unsigned int bio_rotation_interval; + unsigned int cpu_threads; + unsigned int logical_zones; + unsigned int physical_zones; + unsigned int hash_zones; +} __packed; + +struct device_config { + struct dm_target *owning_target; + struct dm_dev *owned_device; + struct vdo *vdo; + /* All configs referencing a layer are kept on a list in the layer */ + struct list_head config_list; + char *original_string; + unsigned int version; + char *parent_device_name; + block_count_t physical_blocks; + /* + * This is the number of logical blocks from VDO's internal point of view. It is the number + * of 4K blocks regardles of the value of the logical_block_size parameter below. + */ + block_count_t logical_blocks; + unsigned int logical_block_size; + unsigned int cache_size; + unsigned int block_map_maximum_age; + bool deduplication; + bool compression; + struct thread_count_config thread_counts; + block_count_t max_discard_blocks; +}; + +enum vdo_completion_type { + /* Keep VDO_UNSET_COMPLETION_TYPE at the top. */ + VDO_UNSET_COMPLETION_TYPE, + VDO_ACTION_COMPLETION, + VDO_ADMIN_COMPLETION, + VDO_BLOCK_ALLOCATOR_COMPLETION, + VDO_DATA_VIO_POOL_COMPLETION, + VDO_DECREMENT_COMPLETION, + VDO_FLUSH_COMPLETION, + VDO_FLUSH_NOTIFICATION_COMPLETION, + VDO_GENERATION_FLUSHED_COMPLETION, + VDO_HASH_ZONE_COMPLETION, + VDO_HASH_ZONES_COMPLETION, + VDO_LOCK_COUNTER_COMPLETION, + VDO_PAGE_COMPLETION, + VDO_READ_ONLY_MODE_COMPLETION, + VDO_REPAIR_COMPLETION, + VDO_SYNC_COMPLETION, + VIO_COMPLETION, +} __packed; + +struct vdo_completion; + +/** + * typedef vdo_action - An asynchronous VDO operation. + * @completion: The completion of the operation. + */ +typedef void vdo_action(struct vdo_completion *completion); + +enum vdo_completion_priority { + BIO_ACK_Q_ACK_PRIORITY = 0, + BIO_ACK_Q_MAX_PRIORITY = 0, + BIO_Q_COMPRESSED_DATA_PRIORITY = 0, + BIO_Q_DATA_PRIORITY = 0, + BIO_Q_FLUSH_PRIORITY = 2, + BIO_Q_HIGH_PRIORITY = 2, + BIO_Q_METADATA_PRIORITY = 1, + BIO_Q_VERIFY_PRIORITY = 1, + BIO_Q_MAX_PRIORITY = 2, + CPU_Q_COMPLETE_VIO_PRIORITY = 0, + CPU_Q_COMPLETE_READ_PRIORITY = 0, + CPU_Q_COMPRESS_BLOCK_PRIORITY = 0, + CPU_Q_EVENT_REPORTER_PRIORITY = 0, + CPU_Q_HASH_BLOCK_PRIORITY = 0, + CPU_Q_MAX_PRIORITY = 0, + UDS_Q_PRIORITY = 0, + UDS_Q_MAX_PRIORITY = 0, + VDO_DEFAULT_Q_COMPLETION_PRIORITY = 1, + VDO_DEFAULT_Q_FLUSH_PRIORITY = 2, + VDO_DEFAULT_Q_MAP_BIO_PRIORITY = 0, + VDO_DEFAULT_Q_SYNC_PRIORITY = 2, + VDO_DEFAULT_Q_VIO_CALLBACK_PRIORITY = 1, + VDO_DEFAULT_Q_MAX_PRIORITY = 2, + /* The maximum allowable priority */ + VDO_WORK_Q_MAX_PRIORITY = 2, + /* A value which must be out of range for a valid priority */ + VDO_WORK_Q_DEFAULT_PRIORITY = VDO_WORK_Q_MAX_PRIORITY + 1, +}; + +struct vdo_completion { + /* The type of completion this is */ + enum vdo_completion_type type; + + /* + * <code>true</code> once the processing of the operation is complete. This flag should not + * be used by waiters external to the VDO base as it is used to gate calling the callback. + */ + bool complete; + + /* + * If true, queue this completion on the next callback invocation, even if it is already + * running on the correct thread. + */ + bool requeue; + + /* The ID of the thread which should run the next callback */ + thread_id_t callback_thread_id; + + /* The result of the operation */ + int result; + + /* The VDO on which this completion operates */ + struct vdo *vdo; + + /* The callback which will be called once the operation is complete */ + vdo_action *callback; + + /* Callback which, if set, will be called if an error result is set */ + vdo_action *error_handler; + + /* The parent object, if any, that spawned this completion */ + void *parent; + + /* Entry link for lock-free work queue */ + struct funnel_queue_entry work_queue_entry_link; + enum vdo_completion_priority priority; + struct vdo_work_queue *my_queue; + u64 enqueue_time; +}; + +struct block_allocator; +struct data_vio; +struct vdo; +struct vdo_config; + +/* vio types for statistics and instrumentation. */ +enum vio_type { + VIO_TYPE_UNINITIALIZED = 0, + VIO_TYPE_DATA, + VIO_TYPE_BLOCK_ALLOCATOR, + VIO_TYPE_BLOCK_MAP, + VIO_TYPE_BLOCK_MAP_INTERIOR, + VIO_TYPE_GEOMETRY, + VIO_TYPE_PARTITION_COPY, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_TYPE_SLAB_JOURNAL, + VIO_TYPE_SLAB_SUMMARY, + VIO_TYPE_SUPER_BLOCK, +} __packed; + +/* Priority levels for asynchronous I/O operations performed on a vio. */ +enum vio_priority { + VIO_PRIORITY_LOW = 0, + VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, + VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, + VIO_PRIORITY_METADATA, + VIO_PRIORITY_HIGH, +} __packed; + +/* + * A wrapper for a bio. All I/O to the storage below a vdo is conducted via vios. + */ +struct vio { + /* The completion for this vio */ + struct vdo_completion completion; + + /* The bio zone in which I/O should be processed */ + zone_count_t bio_zone; + + /* The queueing priority of the vio operation */ + enum vio_priority priority; + + /* The vio type is used for statistics and instrumentation. */ + enum vio_type type; + + /* The size of this vio in blocks */ + unsigned int block_count; + + /* The data being read or written. */ + char *data; + + /* The VDO-owned bio to use for all IO for this vio */ + struct bio *bio; + + /* + * A list of enqueued bios with consecutive block numbers, stored by vdo_submit_bio() under + * the first-enqueued vio. The other vios are found via their bio entries in this list, and + * are not added to the work queue as separate completions. + */ + struct bio_list bios_merged; +}; + +#endif /* VDO_TYPES_H */ diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c new file mode 100644 index 00000000000..c85d349a5fb --- /dev/null +++ b/drivers/md/dm-vdo/vdo.c @@ -0,0 +1,1846 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/* + * This file contains the main entry points for normal operations on a vdo as well as functions for + * constructing and destroying vdo instances (in memory). + */ + +/** + * DOC: + * + * A read_only_notifier has a single completion which is used to perform read-only notifications, + * however, vdo_enter_read_only_mode() may be called from any thread. A pair of fields, protected + * by a spinlock, are used to control the read-only mode entry process. The first field holds the + * read-only error. The second is the state field, which may hold any of the four special values + * enumerated here. + * + * When vdo_enter_read_only_mode() is called from some vdo thread, if the read_only_error field + * already contains an error (i.e. its value is not VDO_SUCCESS), then some other error has already + * initiated the read-only process, and nothing more is done. Otherwise, the new error is stored in + * the read_only_error field, and the state field is consulted. If the state is MAY_NOTIFY, it is + * set to NOTIFYING, and the notification process begins. If the state is MAY_NOT_NOTIFY, then + * notifications are currently disallowed, generally due to the vdo being suspended. In this case, + * the nothing more will be done until the vdo is resumed, at which point the notification will be + * performed. In any other case, the vdo is already read-only, and there is nothing more to do. + */ + +#include "vdo.h" + +#include <linux/completion.h> +#include <linux/device-mapper.h> +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/types.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "string-utils.h" + +#include "block-map.h" +#include "completion.h" +#include "data-vio.h" +#include "dedupe.h" +#include "encodings.h" +#include "io-submitter.h" +#include "logical-zone.h" +#include "packer.h" +#include "physical-zone.h" +#include "pool-sysfs.h" +#include "recovery-journal.h" +#include "release-versions.h" +#include "slab-depot.h" +#include "statistics.h" +#include "status-codes.h" +#include "vio.h" +#include "work-queue.h" + + +enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 }; + +struct sync_completion { + struct vdo_completion vdo_completion; + struct completion completion; +}; + +/* + * We don't expect this set to ever get really large, so a linked list is adequate. We can use a + * pointer_map if we need to later. + */ +struct device_registry { + struct list_head links; + /* TODO: Convert to rcu per kernel recommendation. */ + rwlock_t lock; +}; + +static struct device_registry registry; + +/** + * vdo_initialize_device_registry_once() - Initialize the necessary structures for the device + * registry. + */ +void vdo_initialize_device_registry_once(void) +{ + INIT_LIST_HEAD(®istry.links); + rwlock_init(®istry.lock); +} + +/** vdo_is_equal() - Implements vdo_filter_t. */ +static bool vdo_is_equal(struct vdo *vdo, const void *context) +{ + return ((void *) vdo == context); +} + +/** + * filter_vdos_locked() - Find a vdo in the registry if it exists there. + * @filter: The filter function to apply to devices. + * @context: A bit of context to provide the filter. + * + * Context: Must be called holding the lock. + * + * Return: the vdo object found, if any. + */ +static struct vdo * __must_check filter_vdos_locked(vdo_filter_t *filter, const void *context) +{ + struct vdo *vdo; + + list_for_each_entry(vdo, ®istry.links, registration) + if (filter(vdo, context)) + return vdo; + + return NULL; +} + +/** + * vdo_find_matching() - Find and return the first (if any) vdo matching a given filter function. + * @filter: The filter function to apply to vdos. + * @context: A bit of context to provide the filter. + */ +struct vdo *vdo_find_matching(vdo_filter_t *filter, const void *context) +{ + struct vdo *vdo; + + read_lock(®istry.lock); + vdo = filter_vdos_locked(filter, context); + read_unlock(®istry.lock); + return vdo; +} + +static void start_vdo_request_queue(void *ptr) +{ + struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue()); + + uds_register_allocating_thread(&thread->allocating_thread, + &thread->vdo->allocations_allowed); +} + +static void finish_vdo_request_queue(void *ptr) +{ + uds_unregister_allocating_thread(); +} + +#ifdef MODULE +#define MODULE_NAME THIS_MODULE->name +#else +#define MODULE_NAME "dm-vdo" +#endif /* MODULE */ + +static const struct vdo_work_queue_type default_queue_type = { + .start = start_vdo_request_queue, + .finish = finish_vdo_request_queue, + .max_priority = VDO_DEFAULT_Q_MAX_PRIORITY, + .default_priority = VDO_DEFAULT_Q_COMPLETION_PRIORITY, +}; + +static const struct vdo_work_queue_type bio_ack_q_type = { + .start = NULL, + .finish = NULL, + .max_priority = BIO_ACK_Q_MAX_PRIORITY, + .default_priority = BIO_ACK_Q_ACK_PRIORITY, +}; + +static const struct vdo_work_queue_type cpu_q_type = { + .start = NULL, + .finish = NULL, + .max_priority = CPU_Q_MAX_PRIORITY, + .default_priority = CPU_Q_MAX_PRIORITY, +}; + +static void uninitialize_thread_config(struct thread_config *config) +{ + UDS_FREE(UDS_FORGET(config->logical_threads)); + UDS_FREE(UDS_FORGET(config->physical_threads)); + UDS_FREE(UDS_FORGET(config->hash_zone_threads)); + UDS_FREE(UDS_FORGET(config->bio_threads)); + memset(config, 0, sizeof(struct thread_config)); +} + +static void +assign_thread_ids(struct thread_config *config, thread_id_t thread_ids[], zone_count_t count) +{ + zone_count_t zone; + + for (zone = 0; zone < count; zone++) + thread_ids[zone] = config->thread_count++; +} + +/** + * initialize_thread_config() - Initialize the thread mapping + * + * If the logical, physical, and hash zone counts are all 0, a single thread will be shared by all + * three plus the packer and recovery journal. Otherwise, there must be at least one of each type, + * and each will have its own thread, as will the packer and recovery journal. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check +initialize_thread_config(struct thread_count_config counts, struct thread_config *config) +{ + int result; + bool single = ((counts.logical_zones + counts.physical_zones + counts.hash_zones) == 0); + + config->bio_thread_count = counts.bio_threads; + if (single) { + config->logical_zone_count = 1; + config->physical_zone_count = 1; + config->hash_zone_count = 1; + } else { + config->logical_zone_count = counts.logical_zones; + config->physical_zone_count = counts.physical_zones; + config->hash_zone_count = counts.hash_zones; + } + + result = UDS_ALLOCATE(config->logical_zone_count, + thread_id_t, + "logical thread array", + &config->logical_threads); + if (result != VDO_SUCCESS) { + uninitialize_thread_config(config); + return result; + } + + result = UDS_ALLOCATE(config->physical_zone_count, + thread_id_t, + "physical thread array", + &config->physical_threads); + if (result != VDO_SUCCESS) { + uninitialize_thread_config(config); + return result; + } + + result = UDS_ALLOCATE(config->hash_zone_count, + thread_id_t, + "hash thread array", + &config->hash_zone_threads); + if (result != VDO_SUCCESS) { + uninitialize_thread_config(config); + return result; + } + + result = UDS_ALLOCATE(config->bio_thread_count, + thread_id_t, + "bio thread array", + &config->bio_threads); + if (result != VDO_SUCCESS) { + uninitialize_thread_config(config); + return result; + } + + if (single) { + config->logical_threads[0] = config->thread_count; + config->physical_threads[0] = config->thread_count; + config->hash_zone_threads[0] = config->thread_count++; + } else { + config->admin_thread = config->thread_count; + config->journal_thread = config->thread_count++; + config->packer_thread = config->thread_count++; + assign_thread_ids(config, config->logical_threads, counts.logical_zones); + assign_thread_ids(config, config->physical_threads, counts.physical_zones); + assign_thread_ids(config, config->hash_zone_threads, counts.hash_zones); + } + + config->dedupe_thread = config->thread_count++; + config->bio_ack_thread = + ((counts.bio_ack_threads > 0) ? config->thread_count++ : VDO_INVALID_THREAD_ID); + config->cpu_thread = config->thread_count++; + assign_thread_ids(config, config->bio_threads, counts.bio_threads); + return VDO_SUCCESS; +} + +/** + * vdo_read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block + * device. + * @vdo: The vdo whose geometry is to be read. + * + * Return: VDO_SUCCESS or an error code. + */ +static int __must_check read_geometry_block(struct vdo *vdo) +{ + struct vio *vio; + char *block; + int result; + + result = UDS_ALLOCATE(VDO_BLOCK_SIZE, u8, __func__, &block); + if (result != VDO_SUCCESS) + return result; + + result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL, block, &vio); + if (result != VDO_SUCCESS) { + UDS_FREE(block); + return result; + } + + /* + * This is only safe because, having not already loaded the geometry, the vdo's geometry's + * bio_offset field is 0, so the fact that vio_reset_bio() will substract that offset from + * the supplied pbn is not a problem. + */ + result = vio_reset_bio(vio, block, NULL, REQ_OP_READ, VDO_GEOMETRY_BLOCK_LOCATION); + if (result != VDO_SUCCESS) { + free_vio(UDS_FORGET(vio)); + UDS_FREE(block); + return result; + } + + bio_set_dev(vio->bio, vdo_get_backing_device(vdo)); + submit_bio_wait(vio->bio); + result = blk_status_to_errno(vio->bio->bi_status); + free_vio(UDS_FORGET(vio)); + if (result != 0) { + uds_log_error_strerror(result, "synchronous read failed"); + UDS_FREE(block); + return -EIO; + } + + result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry); + UDS_FREE(block); + return result; +} + +static bool get_zone_thread_name(const thread_id_t thread_ids[], + zone_count_t count, + thread_id_t id, + const char *prefix, + char *buffer, + size_t buffer_length) +{ + if (id >= thread_ids[0]) { + thread_id_t index = id - thread_ids[0]; + + if (index < count) { + snprintf(buffer, buffer_length, "%s%d", prefix, index); + return true; + } + } + return false; +} + +/** + * get_thread_name() - Format the name of the worker thread desired to support a given work queue. + * @thread_config: The thread configuration. + * @thread_id: The thread id. + * @buffer: Where to put the formatted name. + * @buffer_length: Size of the output buffer. + * + * The physical layer may add a prefix identifying the product; the output from this function + * should just identify the thread. + */ +static void +get_thread_name(const struct thread_config *thread_config, + thread_id_t thread_id, + char *buffer, + size_t buffer_length) +{ + if (thread_id == thread_config->journal_thread) { + if (thread_config->packer_thread == thread_id) { + /* + * This is the "single thread" config where one thread is used for the + * journal, packer, logical, physical, and hash zones. In that case, it is + * known as the "request queue." + */ + snprintf(buffer, buffer_length, "reqQ"); + return; + } + + snprintf(buffer, buffer_length, "journalQ"); + return; + } else if (thread_id == thread_config->admin_thread) { + /* Theoretically this could be different from the journal thread. */ + snprintf(buffer, buffer_length, "adminQ"); + return; + } else if (thread_id == thread_config->packer_thread) { + snprintf(buffer, buffer_length, "packerQ"); + return; + } else if (thread_id == thread_config->dedupe_thread) { + snprintf(buffer, buffer_length, "dedupeQ"); + return; + } else if (thread_id == thread_config->bio_ack_thread) { + snprintf(buffer, buffer_length, "ackQ"); + return; + } else if (thread_id == thread_config->cpu_thread) { + snprintf(buffer, buffer_length, "cpuQ"); + return; + } + + if (get_zone_thread_name(thread_config->logical_threads, + thread_config->logical_zone_count, + thread_id, + "logQ", + buffer, + buffer_length)) + return; + + if (get_zone_thread_name(thread_config->physical_threads, + thread_config->physical_zone_count, + thread_id, + "physQ", + buffer, + buffer_length)) + return; + + if (get_zone_thread_name(thread_config->hash_zone_threads, + thread_config->hash_zone_count, + thread_id, + "hashQ", + buffer, + buffer_length)) + return; + + if (get_zone_thread_name(thread_config->bio_threads, + thread_config->bio_thread_count, + thread_id, + "bioQ", + buffer, + buffer_length)) + return; + + /* Some sort of misconfiguration? */ + snprintf(buffer, buffer_length, "reqQ%d", thread_id); +} + +/** + * vdo_make_thread() - Construct a single vdo work_queue and its associated thread (or threads for + * round-robin queues). + * @vdo: The vdo which owns the thread. + * @thread_id: The id of the thread to create (as determined by the thread_config). + * @type: The description of the work queue for this thread. + * @queue_count: The number of actual threads/queues contained in the "thread". + * @contexts: An array of queue_count contexts, one for each individual queue; may be NULL. + * + * Each "thread" constructed by this method is represented by a unique thread id in the thread + * config, and completions can be enqueued to the queue and run on the threads comprising this + * entity. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_thread(struct vdo *vdo, + thread_id_t thread_id, + const struct vdo_work_queue_type *type, + unsigned int queue_count, + void *contexts[]) +{ + struct vdo_thread *thread = &vdo->threads[thread_id]; + char queue_name[MAX_VDO_WORK_QUEUE_NAME_LEN]; + + if (type == NULL) + type = &default_queue_type; + + if (thread->queue != NULL) + return ASSERT(vdo_work_queue_type_is(thread->queue, type), + "already constructed vdo thread %u is of the correct type", + thread_id); + + thread->vdo = vdo; + thread->thread_id = thread_id; + get_thread_name(&vdo->thread_config, thread_id, queue_name, sizeof(queue_name)); + return vdo_make_work_queue(vdo->thread_name_prefix, + queue_name, + thread, + type, + queue_count, + contexts, + &thread->queue); +} + +/** + * register_vdo() - Register a VDO; it must not already be registered. + * @vdo: The vdo to register. + * + * Return: VDO_SUCCESS or an error. + */ +static int register_vdo(struct vdo *vdo) +{ + int result; + + write_lock(®istry.lock); + result = ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL, + "VDO not already registered"); + if (result == VDO_SUCCESS) { + INIT_LIST_HEAD(&vdo->registration); + list_add_tail(&vdo->registration, ®istry.links); + } + write_unlock(®istry.lock); + + return result; +} + +/** + * initialize_vdo() - Do the portion of initializing a vdo which will clean up after itself on + * error. + * @vdo: The vdo being initialized + * @config: The configuration of the vdo + * @instance: The instance number of the vdo + * @reason: The buffer to hold the failure reason on error + */ +static int +initialize_vdo(struct vdo *vdo, struct device_config *config, unsigned int instance, char **reason) +{ + int result; + zone_count_t i; + + vdo->device_config = config; + vdo->starting_sector_offset = config->owning_target->begin; + vdo->instance = instance; + vdo->allocations_allowed = true; + vdo_set_admin_state_code(&vdo->admin.state, VDO_ADMIN_STATE_NEW); + INIT_LIST_HEAD(&vdo->device_config_list); + vdo_initialize_completion(&vdo->admin.completion, vdo, VDO_ADMIN_COMPLETION); + init_completion(&vdo->admin.callback_sync); + mutex_init(&vdo->stats_mutex); + result = read_geometry_block(vdo); + if (result != VDO_SUCCESS) { + *reason = "Could not load geometry block"; + return result; + } + + result = initialize_thread_config(config->thread_counts, &vdo->thread_config); + if (result != VDO_SUCCESS) { + *reason = "Cannot create thread configuration"; + return result; + } + + uds_log_info("zones: %d logical, %d physical, %d hash; total threads: %d", + config->thread_counts.logical_zones, + config->thread_counts.physical_zones, + config->thread_counts.hash_zones, + vdo->thread_config.thread_count); + + /* Compression context storage */ + result = UDS_ALLOCATE(config->thread_counts.cpu_threads, + char *, + "LZ4 context", + &vdo->compression_context); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + return result; + } + + for (i = 0; i < config->thread_counts.cpu_threads; i++) { + result = UDS_ALLOCATE(LZ4_MEM_COMPRESS, + char, + "LZ4 context", + &vdo->compression_context[i]); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + return result; + } + } + + result = register_vdo(vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot add VDO to device registry"; + return result; + } + + vdo_set_admin_state_code(&vdo->admin.state, VDO_ADMIN_STATE_INITIALIZED); + return result; +} + +/** + * vdo_make() - Allocate and initialize a vdo. + * @instance: Device instantiation counter. + * @config: The device configuration. + * @reason: The reason for any failure during this call. + * @vdo_ptr: A pointer to hold the created vdo. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make(unsigned int instance, + struct device_config *config, + char **reason, + struct vdo **vdo_ptr) +{ + int result; + struct vdo *vdo; + + /* VDO-3769 - Set a generic reason so we don't ever return garbage. */ + *reason = "Unspecified error"; + + result = UDS_ALLOCATE(1, struct vdo, __func__, &vdo); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate VDO"; + return result; + } + + result = initialize_vdo(vdo, config, instance, reason); + if (result != VDO_SUCCESS) { + vdo_destroy(vdo); + return result; + } + + /* From here on, the caller will clean up if there is an error. */ + *vdo_ptr = vdo; + + snprintf(vdo->thread_name_prefix, + sizeof(vdo->thread_name_prefix), + "%s%u", + MODULE_NAME, + instance); + BUG_ON(vdo->thread_name_prefix[0] == '\0'); + result = UDS_ALLOCATE(vdo->thread_config.thread_count, + struct vdo_thread, + __func__, + &vdo->threads); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate thread structures"; + return result; + } + + result = vdo_make_thread(vdo, + vdo->thread_config.admin_thread, + &default_queue_type, + 1, + NULL); + if (result != VDO_SUCCESS) { + *reason = "Cannot make admin thread"; + return result; + } + + result = vdo_make_flusher(vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot make flusher zones"; + return result; + } + + result = vdo_make_packer(vdo, DEFAULT_PACKER_BINS, &vdo->packer); + if (result != VDO_SUCCESS) { + *reason = "Cannot make packer zones"; + return result; + } + + BUG_ON(vdo->device_config->logical_block_size <= 0); + BUG_ON(vdo->device_config->owned_device == NULL); + result = make_data_vio_pool(vdo, + MAXIMUM_VDO_USER_VIOS, + MAXIMUM_VDO_USER_VIOS * 3 / 4, + &vdo->data_vio_pool); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate data_vio pool"; + return result; + } + + result = vdo_make_io_submitter(config->thread_counts.bio_threads, + config->thread_counts.bio_rotation_interval, + get_data_vio_pool_request_limit(vdo->data_vio_pool), + vdo, + &vdo->io_submitter); + if (result != VDO_SUCCESS) { + *reason = "bio submission initialization failed"; + return result; + } + + if (vdo_uses_bio_ack_queue(vdo)) { + result = vdo_make_thread(vdo, + vdo->thread_config.bio_ack_thread, + &bio_ack_q_type, + config->thread_counts.bio_ack_threads, + NULL); + if (result != VDO_SUCCESS) { + *reason = "bio ack queue initialization failed"; + return result; + } + } + + result = vdo_make_thread(vdo, + vdo->thread_config.cpu_thread, + &cpu_q_type, + config->thread_counts.cpu_threads, + (void **) vdo->compression_context); + if (result != VDO_SUCCESS) { + *reason = "CPU queue initialization failed"; + return result; + } + + return VDO_SUCCESS; +} + +static void finish_vdo(struct vdo *vdo) +{ + int i; + + if (vdo->threads == NULL) + return; + + vdo_cleanup_io_submitter(vdo->io_submitter); + vdo_finish_dedupe_index(vdo->hash_zones); + + for (i = 0; i < vdo->thread_config.thread_count; i++) + vdo_finish_work_queue(vdo->threads[i].queue); +} + +/** + * free_listeners() - Free the list of read-only listeners associated with a thread. + * @thread_data: The thread holding the list to free. + */ +static void free_listeners(struct vdo_thread *thread) +{ + struct read_only_listener *listener, *next; + + for (listener = UDS_FORGET(thread->listeners); listener != NULL; listener = next) { + next = UDS_FORGET(listener->next); + UDS_FREE(listener); + } +} + +static void uninitialize_super_block(struct vdo_super_block *super_block) +{ + free_vio_components(&super_block->vio); + UDS_FREE(super_block->buffer); +} + +/** + * unregister_vdo() - Remove a vdo from the device registry. + * @vdo: The vdo to remove. + */ +static void unregister_vdo(struct vdo *vdo) +{ + write_lock(®istry.lock); + if (filter_vdos_locked(vdo_is_equal, vdo) == vdo) + list_del_init(&vdo->registration); + + write_unlock(®istry.lock); +} + +/** + * vdo_destroy() - Destroy a vdo instance. + * @vdo: The vdo to destroy (may be NULL). + */ +void vdo_destroy(struct vdo *vdo) +{ + unsigned int i; + + if (vdo == NULL) + return; + + /* A running VDO should never be destroyed without suspending first. */ + BUG_ON(vdo_get_admin_state(vdo)->normal); + + vdo->allocations_allowed = true; + + /* Stop services that need to gather VDO statistics from the worker threads. */ + if (vdo->sysfs_added) { + init_completion(&vdo->stats_shutdown); + kobject_put(&vdo->stats_directory); + wait_for_completion(&vdo->stats_shutdown); + } + + finish_vdo(vdo); + unregister_vdo(vdo); + free_data_vio_pool(vdo->data_vio_pool); + vdo_free_io_submitter(UDS_FORGET(vdo->io_submitter)); + vdo_free_flusher(UDS_FORGET(vdo->flusher)); + vdo_free_packer(UDS_FORGET(vdo->packer)); + vdo_free_recovery_journal(UDS_FORGET(vdo->recovery_journal)); + vdo_free_slab_depot(UDS_FORGET(vdo->depot)); + vdo_uninitialize_layout(&vdo->layout); + vdo_uninitialize_layout(&vdo->next_layout); + if (vdo->partition_copier) + dm_kcopyd_client_destroy(UDS_FORGET(vdo->partition_copier)); + uninitialize_super_block(&vdo->super_block); + vdo_free_block_map(UDS_FORGET(vdo->block_map)); + vdo_free_hash_zones(UDS_FORGET(vdo->hash_zones)); + vdo_free_physical_zones(UDS_FORGET(vdo->physical_zones)); + vdo_free_logical_zones(UDS_FORGET(vdo->logical_zones)); + + if (vdo->threads != NULL) { + for (i = 0; i < vdo->thread_config.thread_count; i++) { + free_listeners(&vdo->threads[i]); + vdo_free_work_queue(UDS_FORGET(vdo->threads[i].queue)); + } + UDS_FREE(UDS_FORGET(vdo->threads)); + } + + uninitialize_thread_config(&vdo->thread_config); + + if (vdo->compression_context != NULL) { + for (i = 0; i < vdo->device_config->thread_counts.cpu_threads; i++) + UDS_FREE(UDS_FORGET(vdo->compression_context[i])); + + UDS_FREE(UDS_FORGET(vdo->compression_context)); + } + + /* + * The call to kobject_put on the kobj sysfs node will decrement its reference count; when + * the count goes to zero the VDO object will be freed as a side effect. + */ + if (!vdo->sysfs_added) + UDS_FREE(vdo); + else + kobject_put(&vdo->vdo_directory); +} + +static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) +{ + int result; + + result = UDS_ALLOCATE(VDO_BLOCK_SIZE, + char, + "encoded super block", + (char **) &vdo->super_block.buffer); + if (result != VDO_SUCCESS) + return result; + + return allocate_vio_components(vdo, + VIO_TYPE_SUPER_BLOCK, + VIO_PRIORITY_METADATA, + NULL, + 1, + (char *) super_block->buffer, + &vdo->super_block.vio); +} + +/** + * finish_reading_super_block() - Continue after loading the super block. + * @completion: The super block vio. + * + * This callback is registered in vdo_load_super_block(). + */ +static void finish_reading_super_block(struct vdo_completion *completion) +{ + struct vdo_super_block *super_block = + container_of(as_vio(completion), struct vdo_super_block, vio); + + vdo_continue_completion(UDS_FORGET(completion->parent), + vdo_decode_super_block(super_block->buffer)); +} + +/** + * handle_super_block_read_error() - Handle an error reading the super block. + * @completion: The super block vio. + * + * This error handler is registered in vdo_load_super_block(). + */ +static void handle_super_block_read_error(struct vdo_completion *completion) +{ + vio_record_metadata_io_error(as_vio(completion)); + finish_reading_super_block(completion); +} + +static void read_super_block_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_completion *parent = vio->completion.parent; + + continue_vio_after_io(vio, finish_reading_super_block, parent->callback_thread_id); +} + +/** + * vdo_load_super_block() - Allocate a super block and read its contents from storage. + * @vdo: The vdo containing the super block on disk. + * @parent: The completion to notify after loading the super block. + */ +void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent) +{ + int result; + + result = initialize_super_block(vdo, &vdo->super_block); + if (result != VDO_SUCCESS) { + vdo_continue_completion(parent, result); + return; + } + + vdo->super_block.vio.completion.parent = parent; + submit_metadata_vio(&vdo->super_block.vio, + vdo_get_data_region_start(vdo->geometry), + read_super_block_endio, + handle_super_block_read_error, + REQ_OP_READ); +} + +/** + * pool_stats_release() - Signal that sysfs stats have been shut down. + * @directory: The vdo stats directory. + */ +static void pool_stats_release(struct kobject *directory) +{ + struct vdo *vdo = container_of(directory, struct vdo, stats_directory); + + complete(&vdo->stats_shutdown); +} + +ATTRIBUTE_GROUPS(vdo_pool_stats); +static struct kobj_type stats_directory_type = { + .release = pool_stats_release, + .sysfs_ops = &vdo_pool_stats_sysfs_ops, + .default_groups = vdo_pool_stats_groups, +}; + +/** + * vdo_add_sysfs_stats_dir() - Add the stats directory to the vdo sysfs directory. + * @vdo: The vdo. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_add_sysfs_stats_dir(struct vdo *vdo) +{ + int result; + + kobject_init(&vdo->stats_directory, &stats_directory_type); + result = kobject_add(&vdo->stats_directory, &vdo->vdo_directory, "statistics"); + if (result != 0) + return VDO_CANT_ADD_SYSFS_NODE; + + return VDO_SUCCESS; +} + +/** + * vdo_get_backing_device() - Get the block device object underlying a vdo. + * @vdo: The vdo. + * + * Return: The vdo's current block device. + */ +struct block_device *vdo_get_backing_device(const struct vdo *vdo) +{ + return vdo->device_config->owned_device->bdev; +} + +/** + * vdo_get_device_name() - Get the device name associated with the vdo target. + * @target: The target device interface. + * + * Return: The block device name. + */ +const char *vdo_get_device_name(const struct dm_target *target) +{ + return dm_device_name(dm_table_get_md(target->table)); +} + +/** + * vdo_synchronous_flush() - Issue a flush request and wait for it to complete. + * @vdo: The vdo. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_synchronous_flush(struct vdo *vdo) +{ + int result; + struct bio bio; + + bio_init(&bio, vdo_get_backing_device(vdo), 0, 0, + REQ_OP_WRITE | REQ_PREFLUSH); + submit_bio_wait(&bio); + result = blk_status_to_errno(bio.bi_status); + + atomic64_inc(&vdo->stats.flush_out); + if (result != 0) { + uds_log_error_strerror(result, "synchronous flush failed"); + result = -EIO; + } + + bio_uninit(&bio); + return result; +} + +/** + * vdo_get_state() - Get the current state of the vdo. + * @vdo: The vdo. + + * Context: This method may be called from any thread. + * + * Return: The current state of the vdo. + */ +enum vdo_state vdo_get_state(const struct vdo *vdo) +{ + enum vdo_state state = atomic_read(&vdo->state); + + /* pairs with barriers where state field is changed */ + smp_rmb(); + return state; +} + +/** + * vdo_set_state() - Set the current state of the vdo. + * @vdo: The vdo whose state is to be set. + * @state: The new state of the vdo. + * + * Context: This method may be called from any thread. + */ +void vdo_set_state(struct vdo *vdo, enum vdo_state state) +{ + /* pairs with barrier in vdo_get_state */ + smp_wmb(); + atomic_set(&vdo->state, state); +} + +/** + * vdo_get_admin_state() - Get the admin state of the vdo. + * @vdo: The vdo. + * + * Return: The code for the vdo's current admin state. + */ +const struct admin_state_code *vdo_get_admin_state(const struct vdo *vdo) +{ + return vdo_get_admin_state_code(&vdo->admin.state); +} + +/** + * record_vdo() - Record the state of the VDO for encoding in the super block. + */ +static void record_vdo(struct vdo *vdo) +{ + vdo->states.release_version = vdo->geometry.release_version; + vdo->states.vdo.state = vdo_get_state(vdo); + vdo->states.block_map = vdo_record_block_map(vdo->block_map); + vdo->states.recovery_journal = vdo_record_recovery_journal(vdo->recovery_journal); + vdo->states.slab_depot = vdo_record_slab_depot(vdo->depot); + vdo->states.layout = vdo->layout; +} + +/** + * continue_super_block_parent() - Continue the parent of a super block save operation. + * @completion: The super block vio. + * + * This callback is registered in vdo_save_components(). + */ +static void continue_super_block_parent(struct vdo_completion *completion) +{ + vdo_continue_completion(UDS_FORGET(completion->parent), completion->result); +} + +/** + * handle_save_error() - Log a super block save error. + * @completion: The super block vio. + * + * This error handler is registered in vdo_save_components(). + */ +static void handle_save_error(struct vdo_completion *completion) +{ + struct vdo_super_block *super_block = + container_of(as_vio(completion), struct vdo_super_block, vio); + + vio_record_metadata_io_error(&super_block->vio); + uds_log_error_strerror(completion->result, "super block save failed"); + /* + * Mark the super block as unwritable so that we won't attempt to write it again. This + * avoids the case where a growth attempt fails writing the super block with the new size, + * but the subsequent attempt to write out the read-only state succeeds. In this case, + * writes which happened just before the suspend would not be visible if the VDO is + * restarted without rebuilding, but, after a read-only rebuild, the effects of those + * writes would reappear. + */ + super_block->unwriteable = true; + completion->callback(completion); +} + +static void super_block_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_completion *parent = vio->completion.parent; + + continue_vio_after_io(vio, continue_super_block_parent, parent->callback_thread_id); +} + +/** + * vdo_save_components() - Encode the vdo and save the super block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. + */ +void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent) +{ + struct vdo_super_block *super_block = &vdo->super_block; + + if (super_block->unwriteable) { + vdo_continue_completion(parent, VDO_READ_ONLY); + return; + } + + if (super_block->vio.completion.parent != NULL) { + vdo_continue_completion(parent, VDO_COMPONENT_BUSY); + return; + } + + record_vdo(vdo); + + vdo_encode_super_block(super_block->buffer, &vdo->states); + super_block->vio.completion.parent = parent; + super_block->vio.completion.callback_thread_id = parent->callback_thread_id; + submit_metadata_vio(&super_block->vio, + vdo_get_data_region_start(vdo->geometry), + super_block_write_endio, + handle_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); +} + +/** + * vdo_register_read_only_listener() - Register a listener to be notified when the VDO goes + * read-only. + * @vdo: The vdo to register with. + * @listener: The object to notify. + * @notification: The function to call to send the notification. + * @thread_id: The id of the thread on which to send the notification. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_register_read_only_listener(struct vdo *vdo, + void *listener, + vdo_read_only_notification *notification, + thread_id_t thread_id) +{ + struct vdo_thread *thread = &vdo->threads[thread_id]; + struct read_only_listener *read_only_listener; + int result; + + result = ASSERT(thread_id != vdo->thread_config.dedupe_thread, + "read only listener not registered on dedupe thread"); + if (result != VDO_SUCCESS) + return result; + + result = UDS_ALLOCATE(1, struct read_only_listener, __func__, &read_only_listener); + if (result != VDO_SUCCESS) + return result; + + *read_only_listener = (struct read_only_listener) { + .listener = listener, + .notify = notification, + .next = thread->listeners, + }; + + thread->listeners = read_only_listener; + return VDO_SUCCESS; +} + +/** + * notify_vdo_of_read_only_mode() - Notify a vdo that it is going read-only. + * @listener: The vdo. + * @parent: The completion to notify in order to acknowledge the notification. + * + * This will save the read-only state to the super block. + * + * Implements vdo_read_only_notification. + */ +static void notify_vdo_of_read_only_mode(void *listener, struct vdo_completion *parent) +{ + struct vdo *vdo = listener; + + if (vdo_in_read_only_mode(vdo)) + vdo_finish_completion(parent); + + vdo_set_state(vdo, VDO_READ_ONLY_MODE); + vdo_save_components(vdo, parent); +} + +/** + * vdo_enable_read_only_entry() - Enable a vdo to enter read-only mode on errors. + * @vdo: The vdo to enable. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_enable_read_only_entry(struct vdo *vdo) +{ + thread_id_t id; + bool is_read_only = vdo_in_read_only_mode(vdo); + struct read_only_notifier *notifier = &vdo->read_only_notifier; + + if (is_read_only) { + notifier->read_only_error = VDO_READ_ONLY; + notifier->state = NOTIFIED; + } else { + notifier->state = MAY_NOT_NOTIFY; + } + + spin_lock_init(¬ifier->lock); + vdo_initialize_completion(¬ifier->completion, vdo, VDO_READ_ONLY_MODE_COMPLETION); + + for (id = 0; id < vdo->thread_config.thread_count; id++) + vdo->threads[id].is_read_only = is_read_only; + + return vdo_register_read_only_listener(vdo, + vdo, + notify_vdo_of_read_only_mode, + vdo->thread_config.admin_thread); +} + +/** + * vdo_wait_until_not_entering_read_only_mode() - Wait until no read-only notifications are in + * progress and prevent any subsequent + * notifications. + * @parent: The completion to notify when no threads are entering read-only mode. + * + * Notifications may be re-enabled by calling vdo_allow_read_only_mode_entry(). + */ +void vdo_wait_until_not_entering_read_only_mode(struct vdo_completion *parent) +{ + struct vdo *vdo = parent->vdo; + struct read_only_notifier *notifier = &vdo->read_only_notifier; + + vdo_assert_on_admin_thread(vdo, __func__); + + if (notifier->waiter != NULL) { + vdo_continue_completion(parent, VDO_COMPONENT_BUSY); + return; + } + + spin_lock(¬ifier->lock); + if (notifier->state == NOTIFYING) + notifier->waiter = parent; + else if (notifier->state == MAY_NOTIFY) + notifier->state = MAY_NOT_NOTIFY; + spin_unlock(¬ifier->lock); + + if (notifier->waiter == NULL) { + /* + * A notification was not in progress, and now they are + * disallowed. + */ + vdo_launch_completion(parent); + return; + } +} + +/** + * as_notifier() - Convert a generic vdo_completion to a read_only_notifier. + * @completion: The completion to convert. + * + * Return: The completion as a read_only_notifier. + */ +static inline struct read_only_notifier *as_notifier(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_READ_ONLY_MODE_COMPLETION); + return container_of(completion, struct read_only_notifier, completion); +} + +/** + * finish_entering_read_only_mode() - Complete the process of entering read only mode. + * @completion: The read-only mode completion. + */ +static void finish_entering_read_only_mode(struct vdo_completion *completion) +{ + struct read_only_notifier *notifier = as_notifier(completion); + + vdo_assert_on_admin_thread(completion->vdo, __func__); + + spin_lock(¬ifier->lock); + notifier->state = NOTIFIED; + spin_unlock(¬ifier->lock); + + if (notifier->waiter != NULL) + vdo_continue_completion(UDS_FORGET(notifier->waiter), completion->result); +} + +/** + * make_thread_read_only() - Inform each thread that the VDO is in read-only mode. + * @completion: The read-only mode completion. + */ +static void make_thread_read_only(struct vdo_completion *completion) +{ + struct vdo *vdo = completion->vdo; + thread_id_t thread_id = completion->callback_thread_id; + struct read_only_notifier *notifier = as_notifier(completion); + struct read_only_listener *listener = completion->parent; + + if (listener == NULL) { + /* This is the first call on this thread */ + struct vdo_thread *thread = &vdo->threads[thread_id]; + + thread->is_read_only = true; + listener = thread->listeners; + if (thread_id == 0) + uds_log_error_strerror(READ_ONCE(notifier->read_only_error), + "Unrecoverable error, entering read-only mode"); + } else { + /* We've just finished notifying a listener */ + listener = listener->next; + } + + if (listener != NULL) { + /* We have a listener to notify */ + vdo_prepare_completion(completion, + make_thread_read_only, + make_thread_read_only, + thread_id, + listener); + listener->notify(listener->listener, completion); + return; + } + + /* We're done with this thread */ + if (++thread_id == vdo->thread_config.dedupe_thread) + /* + * We don't want to notify the dedupe thread since it may be + * blocked rebuilding the index. + */ + ++thread_id; + + if (thread_id >= vdo->thread_config.thread_count) + /* There are no more threads */ + vdo_prepare_completion(completion, + finish_entering_read_only_mode, + finish_entering_read_only_mode, + vdo->thread_config.admin_thread, + NULL); + else + vdo_prepare_completion(completion, + make_thread_read_only, + make_thread_read_only, + thread_id, + NULL); + + vdo_launch_completion(completion); +} + +/** + * vdo_allow_read_only_mode_entry() - Allow the notifier to put the VDO into read-only mode, + * reversing the effects of + * vdo_wait_until_not_entering_read_only_mode(). + * @parent: The object to notify once the operation is complete. + * + * If some thread tried to put the vdo into read-only mode while notifications were disallowed, it + * will be done when this method is called. If that happens, the parent will not be notified until + * the vdo has actually entered read-only mode and attempted to save the super block. + * + * Context: This method may only be called from the admin thread. + */ +void vdo_allow_read_only_mode_entry(struct vdo_completion *parent) +{ + struct vdo *vdo = parent->vdo; + struct read_only_notifier *notifier = &vdo->read_only_notifier; + + vdo_assert_on_admin_thread(vdo, __func__); + + if (notifier->waiter != NULL) { + vdo_continue_completion(parent, VDO_COMPONENT_BUSY); + return; + } + + spin_lock(¬ifier->lock); + if (notifier->state == MAY_NOT_NOTIFY) { + if (notifier->read_only_error == VDO_SUCCESS) { + notifier->state = MAY_NOTIFY; + } else { + notifier->state = NOTIFYING; + notifier->waiter = parent; + } + } + spin_unlock(¬ifier->lock); + + if (notifier->waiter == NULL) { + /* We're done */ + vdo_launch_completion(parent); + return; + } + + /* Do the pending notification. */ + make_thread_read_only(¬ifier->completion); +} + +/** + * vdo_enter_read_only_mode() - Put a VDO into read-only mode and save the read-only state in the + * super block. + * @vdo: The vdo. + * @error_code: The error which caused the VDO to enter read-only mode. + * + * This method is a no-op if the VDO is already read-only. + */ +void vdo_enter_read_only_mode(struct vdo *vdo, int error_code) +{ + bool notify = false; + thread_id_t thread_id = vdo_get_callback_thread_id(); + struct read_only_notifier *notifier = &vdo->read_only_notifier; + struct vdo_thread *thread; + + if (thread_id != VDO_INVALID_THREAD_ID) { + thread = &vdo->threads[thread_id]; + if (thread->is_read_only) + /* This thread has already gone read-only. */ + return; + + /* Record for this thread that the VDO is read-only. */ + thread->is_read_only = true; + } + + spin_lock(¬ifier->lock); + if (notifier->read_only_error == VDO_SUCCESS) { + WRITE_ONCE(notifier->read_only_error, error_code); + if (notifier->state == MAY_NOTIFY) { + notifier->state = NOTIFYING; + notify = true; + } + } + spin_unlock(¬ifier->lock); + + if (!notify) + /* The notifier is already aware of a read-only error */ + return; + + /* Initiate a notification starting on the lowest numbered thread. */ + vdo_launch_completion_callback(¬ifier->completion, make_thread_read_only, 0); +} + +/** + * vdo_is_read_only() - Check whether the VDO is read-only. + * @vdo: The vdo. + * + * Return: true if the vdo is read-only. + * + * This method may be called from any thread, as opposed to examining the VDO's state field which + * is only safe to check from the admin thread. + */ +bool vdo_is_read_only(struct vdo *vdo) +{ + return vdo->threads[vdo_get_callback_thread_id()].is_read_only; +} + +/** + * vdo_in_read_only_mode() - Check whether a vdo is in read-only mode. + * @vdo: The vdo to query. + * + * Return: true if the vdo is in read-only mode. + */ +bool vdo_in_read_only_mode(const struct vdo *vdo) +{ + return (vdo_get_state(vdo) == VDO_READ_ONLY_MODE); +} + +/** + * vdo_in_recovery_mode() - Check whether the vdo is in recovery mode. + * @vdo: The vdo to query. + * + * Return: true if the vdo is in recovery mode. + */ +bool vdo_in_recovery_mode(const struct vdo *vdo) +{ + return (vdo_get_state(vdo) == VDO_RECOVERING); +} + +/** + * vdo_enter_recovery_mode() - Put the vdo into recovery mode. + * @vdo: The vdo. + */ +void vdo_enter_recovery_mode(struct vdo *vdo) +{ + vdo_assert_on_admin_thread(vdo, __func__); + + if (vdo_in_read_only_mode(vdo)) + return; + + uds_log_info("Entering recovery mode"); + vdo_set_state(vdo, VDO_RECOVERING); +} + +/** + * complete_synchronous_action() - Signal the waiting thread that a synchronous action is complete. + * @completion: The sync completion. + */ +static void complete_synchronous_action(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VDO_SYNC_COMPLETION); + complete(&(container_of(completion, struct sync_completion, vdo_completion)->completion)); +} + +/** + * perform_synchronous_action() - Launch an action on a VDO thread and wait for it to complete. + * @vdo: The vdo. + * @action: The callback to launch. + * @thread_id: The thread on which to run the action. + * @parent: The parent of the sync completion (may be NULL). + */ +static int perform_synchronous_action(struct vdo *vdo, + vdo_action *action, + thread_id_t thread_id, + void *parent) +{ + struct sync_completion sync; + + vdo_initialize_completion(&sync.vdo_completion, vdo, VDO_SYNC_COMPLETION); + init_completion(&sync.completion); + sync.vdo_completion.parent = parent; + vdo_launch_completion_callback(&sync.vdo_completion, action, thread_id); + wait_for_completion(&sync.completion); + return sync.vdo_completion.result; +} + +/** + * set_compression_callback() - Callback to turn compression on or off. + * @completion: The completion. + */ +static void set_compression_callback(struct vdo_completion *completion) +{ + struct vdo *vdo = completion->vdo; + bool *enable = completion->parent; + bool was_enabled = vdo_get_compressing(vdo); + + if (*enable != was_enabled) { + WRITE_ONCE(vdo->compressing, *enable); + if (was_enabled) + /* Signal the packer to flush since compression has been disabled. */ + vdo_flush_packer(vdo->packer); + } + + uds_log_info("compression is %s", (*enable ? "enabled" : "disabled")); + *enable = was_enabled; + complete_synchronous_action(completion); +} + +/** + * vdo_set_compressing() - Turn compression on or off. + * @vdo: The vdo. + * @enable: Whether to enable or disable compression. + * + * Return: Whether compression was previously on or off. + */ +bool vdo_set_compressing(struct vdo *vdo, bool enable) +{ + perform_synchronous_action(vdo, + set_compression_callback, + vdo->thread_config.packer_thread, + &enable); + return enable; +} + +/** + * vdo_get_compressing() - Get whether compression is enabled in a vdo. + * @vdo: The vdo. + * + * Return: State of compression. + */ +bool vdo_get_compressing(struct vdo *vdo) +{ + return READ_ONCE(vdo->compressing); +} + +static size_t get_block_map_cache_size(const struct vdo *vdo) +{ + return ((size_t) vdo->device_config->cache_size) * VDO_BLOCK_SIZE; +} + +static struct error_statistics __must_check get_vdo_error_statistics(const struct vdo *vdo) +{ + /* + * The error counts can be incremented from arbitrary threads and so must be incremented + * atomically, but they are just statistics with no semantics that could rely on memory + * order, so unfenced reads are sufficient. + */ + const struct atomic_statistics *atoms = &vdo->stats; + + return (struct error_statistics) { + .invalid_advice_pbn_count = atomic64_read(&atoms->invalid_advice_pbn_count), + .no_space_error_count = atomic64_read(&atoms->no_space_error_count), + .read_only_error_count = atomic64_read(&atoms->read_only_error_count), + }; +} + +static void copy_bio_stat(struct bio_stats *b, const struct atomic_bio_stats *a) +{ + b->read = atomic64_read(&a->read); + b->write = atomic64_read(&a->write); + b->discard = atomic64_read(&a->discard); + b->flush = atomic64_read(&a->flush); + b->empty_flush = atomic64_read(&a->empty_flush); + b->fua = atomic64_read(&a->fua); +} + +static struct bio_stats subtract_bio_stats(struct bio_stats minuend, struct bio_stats subtrahend) +{ + return (struct bio_stats) { + .read = minuend.read - subtrahend.read, + .write = minuend.write - subtrahend.write, + .discard = minuend.discard - subtrahend.discard, + .flush = minuend.flush - subtrahend.flush, + .empty_flush = minuend.empty_flush - subtrahend.empty_flush, + .fua = minuend.fua - subtrahend.fua, + }; +} + +/** + * vdo_get_physical_blocks_allocated() - Get the number of physical blocks in use by user data. + * @vdo: The vdo. + * + * Return: The number of blocks allocated for user data. + */ +static block_count_t __must_check vdo_get_physical_blocks_allocated(const struct vdo *vdo) +{ + return (vdo_get_slab_depot_allocated_blocks(vdo->depot) - + vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal)); +} + +/** + * vdo_get_physical_blocks_overhead() - Get the number of physical blocks used by vdo metadata. + * @vdo: The vdo. + * + * Return: The number of overhead blocks. + */ +static block_count_t __must_check vdo_get_physical_blocks_overhead(const struct vdo *vdo) +{ + /* + * config.physical_blocks is mutated during resize and is in a packed structure, + * but resize runs on admin thread. + * TODO: Verify that this is always safe. + */ + return (vdo->states.vdo.config.physical_blocks - + vdo_get_slab_depot_data_blocks(vdo->depot) + + vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal)); +} + +static const char *vdo_describe_state(enum vdo_state state) +{ + /* These strings should all fit in the 15 chars of VDOStatistics.mode. */ + switch (state) { + case VDO_RECOVERING: + return "recovering"; + + case VDO_READ_ONLY_MODE: + return "read-only"; + + default: + return "normal"; + } +} + +/** + * get_vdo_statistics() - Populate a vdo_statistics structure on the admin thread. + * @vdo: The vdo. + * @stats: The statistics structure to populate. + */ +static void get_vdo_statistics(const struct vdo *vdo, struct vdo_statistics *stats) +{ + struct recovery_journal *journal = vdo->recovery_journal; + enum vdo_state state = vdo_get_state(vdo); + + vdo_assert_on_admin_thread(vdo, __func__); + + /* start with a clean slate */ + memset(stats, 0, sizeof(struct vdo_statistics)); + + /* + * These are immutable properties of the vdo object, so it is safe to query them from any + * thread. + */ + stats->version = STATISTICS_VERSION; + stats->release_version = VDO_CURRENT_RELEASE_VERSION_NUMBER; + stats->logical_blocks = vdo->states.vdo.config.logical_blocks; + /* + * config.physical_blocks is mutated during resize and is in a packed structure, but resize + * runs on the admin thread. + * TODO: verify that this is always safe + */ + stats->physical_blocks = vdo->states.vdo.config.physical_blocks; + stats->block_size = VDO_BLOCK_SIZE; + stats->complete_recoveries = vdo->states.vdo.complete_recoveries; + stats->read_only_recoveries = vdo->states.vdo.read_only_recoveries; + stats->block_map_cache_size = get_block_map_cache_size(vdo); + + /* The callees are responsible for thread-safety. */ + stats->data_blocks_used = vdo_get_physical_blocks_allocated(vdo); + stats->overhead_blocks_used = vdo_get_physical_blocks_overhead(vdo); + stats->logical_blocks_used = vdo_get_recovery_journal_logical_blocks_used(journal); + vdo_get_slab_depot_statistics(vdo->depot, stats); + stats->journal = vdo_get_recovery_journal_statistics(journal); + stats->packer = vdo_get_packer_statistics(vdo->packer); + stats->block_map = vdo_get_block_map_statistics(vdo->block_map); + vdo_get_dedupe_statistics(vdo->hash_zones, stats); + stats->errors = get_vdo_error_statistics(vdo); + stats->in_recovery_mode = (state == VDO_RECOVERING); + snprintf(stats->mode, sizeof(stats->mode), "%s", vdo_describe_state(state)); + + stats->instance = vdo->instance; + stats->current_vios_in_progress = get_data_vio_pool_active_requests(vdo->data_vio_pool); + stats->max_vios = get_data_vio_pool_maximum_requests(vdo->data_vio_pool); + + stats->flush_out = atomic64_read(&vdo->stats.flush_out); + stats->logical_block_size = vdo->device_config->logical_block_size; + copy_bio_stat(&stats->bios_in, &vdo->stats.bios_in); + copy_bio_stat(&stats->bios_in_partial, &vdo->stats.bios_in_partial); + copy_bio_stat(&stats->bios_out, &vdo->stats.bios_out); + copy_bio_stat(&stats->bios_meta, &vdo->stats.bios_meta); + copy_bio_stat(&stats->bios_journal, &vdo->stats.bios_journal); + copy_bio_stat(&stats->bios_page_cache, &vdo->stats.bios_page_cache); + copy_bio_stat(&stats->bios_out_completed, &vdo->stats.bios_out_completed); + copy_bio_stat(&stats->bios_meta_completed, &vdo->stats.bios_meta_completed); + copy_bio_stat(&stats->bios_journal_completed, &vdo->stats.bios_journal_completed); + copy_bio_stat(&stats->bios_page_cache_completed, &vdo->stats.bios_page_cache_completed); + copy_bio_stat(&stats->bios_acknowledged, &vdo->stats.bios_acknowledged); + copy_bio_stat(&stats->bios_acknowledged_partial, &vdo->stats.bios_acknowledged_partial); + stats->bios_in_progress = subtract_bio_stats(stats->bios_in, stats->bios_acknowledged); + uds_get_memory_stats(&stats->memory_usage.bytes_used, + &stats->memory_usage.peak_bytes_used); +} + +/** + * vdo_fetch_statistics_callback() - Action to populate a vdo_statistics + * structure on the admin thread. + * @completion: The completion. + * + * This callback is registered in vdo_fetch_statistics(). + */ +static void vdo_fetch_statistics_callback(struct vdo_completion *completion) +{ + get_vdo_statistics(completion->vdo, completion->parent); + complete_synchronous_action(completion); +} + +/** + * vdo_fetch_statistics() - Fetch statistics on the correct thread. + * @vdo: The vdo. + * @stats: The vdo statistics are returned here. + */ +void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats) +{ + perform_synchronous_action(vdo, + vdo_fetch_statistics_callback, + vdo->thread_config.admin_thread, + stats); +} + +/** + * vdo_get_callback_thread_id() - Get the id of the callback thread on which a completion is + * currently running. + * + * Return: The current thread ID, or -1 if no such thread. + */ +thread_id_t vdo_get_callback_thread_id(void) +{ + struct vdo_work_queue *queue = vdo_get_current_work_queue(); + struct vdo_thread *thread; + thread_id_t thread_id; + + if (queue == NULL) + return VDO_INVALID_THREAD_ID; + + thread = vdo_get_work_queue_owner(queue); + thread_id = thread->thread_id; + + if (PARANOID_THREAD_CONSISTENCY_CHECKS) { + BUG_ON(thread_id >= thread->vdo->thread_config.thread_count); + BUG_ON(thread != &thread->vdo->threads[thread_id]); + } + + return thread_id; +} + +/** + * vdo_dump_status() - Dump status information about a vdo to the log for debugging. + * @vdo: The vdo to dump. + */ +void vdo_dump_status(const struct vdo *vdo) +{ + zone_count_t zone; + + vdo_dump_flusher(vdo->flusher); + vdo_dump_recovery_journal_statistics(vdo->recovery_journal); + vdo_dump_packer(vdo->packer); + vdo_dump_slab_depot(vdo->depot); + + for (zone = 0; zone < vdo->thread_config.logical_zone_count; zone++) + vdo_dump_logical_zone(&vdo->logical_zones->zones[zone]); + + for (zone = 0; zone < vdo->thread_config.physical_zone_count; zone++) + vdo_dump_physical_zone(&vdo->physical_zones->zones[zone]); + + vdo_dump_hash_zones(vdo->hash_zones); +} + +/** + * vdo_assert_on_admin_thread() - Assert that we are running on the admin thread. + * @vdo: The vdo. + * @name: The name of the function which should be running on the admin thread (for logging). + */ +void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.admin_thread), + "%s called on admin thread", + name); +} + +/** + * vdo_assert_on_logical_zone_thread() - Assert that this function was called on the specified + * logical zone thread. + * @vdo: The vdo. + * @logical_zone: The number of the logical zone. + * @name: The name of the calling function. + */ +void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, + zone_count_t logical_zone, + const char *name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + vdo->thread_config.logical_threads[logical_zone]), + "%s called on logical thread", + name); +} + +/** + * vdo_assert_on_physical_zone_thread() - Assert that this function was called on the specified + * physical zone thread. + * @vdo: The vdo. + * @physical_zone: The number of the physical zone. + * @name: The name of the calling function. + */ +void vdo_assert_on_physical_zone_thread(const struct vdo *vdo, + zone_count_t physical_zone, + const char *name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + vdo->thread_config.physical_threads[physical_zone]), + "%s called on physical thread", + name); +} + +/** + * vdo_get_physical_zone() - Get the physical zone responsible for a given physical block number. + * @vdo: The vdo containing the physical zones. + * @pbn: The PBN of the data block. + * @zone_ptr: A pointer to return the physical zone. + * + * Gets the physical zone responsible for a given physical block number of a data block in this vdo + * instance, or of the zero block (for which a NULL zone is returned). For any other block number + * that is not in the range of valid data block numbers in any slab, an error will be returned. + * This function is safe to call on invalid block numbers; it will not put the vdo into read-only + * mode. + * + * Return: VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid or an error code for any + * other failure. + */ +int vdo_get_physical_zone(const struct vdo *vdo, + physical_block_number_t pbn, + struct physical_zone **zone_ptr) +{ + struct vdo_slab *slab; + int result; + + if (pbn == VDO_ZERO_BLOCK) { + *zone_ptr = NULL; + return VDO_SUCCESS; + } + + /* + * Used because it does a more restrictive bounds check than vdo_get_slab(), and done first + * because it won't trigger read-only mode on an invalid PBN. + */ + if (!vdo_is_physical_data_block(vdo->depot, pbn)) + return VDO_OUT_OF_RANGE; + + /* With the PBN already checked, we should always succeed in finding a slab. */ + slab = vdo_get_slab(vdo->depot, pbn); + result = ASSERT(slab != NULL, "vdo_get_slab must succeed on all valid PBNs"); + if (result != VDO_SUCCESS) + return result; + + *zone_ptr = &vdo->physical_zones->zones[slab->allocator->zone_number]; + return VDO_SUCCESS; +} diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h new file mode 100644 index 00000000000..40e6d32fafa --- /dev/null +++ b/drivers/md/dm-vdo/vdo.h @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_H +#define VDO_H + +#include <linux/atomic.h> +#include <linux/blk_types.h> +#include <linux/completion.h> +#include <linux/dm-kcopyd.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +#include "admin-state.h" +#include "encodings.h" +#include "packer.h" +#include "physical-zone.h" +#include "statistics.h" +#include "thread-registry.h" +#include "types.h" +#include "uds.h" +#include "work-queue.h" + +enum notifier_state { + /** Notifications are allowed but not in progress */ + MAY_NOTIFY, + /** A notification is in progress */ + NOTIFYING, + /** Notifications are not allowed */ + MAY_NOT_NOTIFY, + /** A notification has completed */ + NOTIFIED, +}; + +/** + * typedef vdo_read_only_notification - A function to notify a listener that the VDO has gone + * read-only. + * @listener: The object to notify. + * @parent: The completion to notify in order to acknowledge the notification. + */ +typedef void vdo_read_only_notification(void *listener, struct vdo_completion *parent); + +/* + * An object to be notified when the VDO enters read-only mode + */ +struct read_only_listener { + /* The listener */ + void *listener; + /* The method to call to notify the listener */ + vdo_read_only_notification *notify; + /* A pointer to the next listener */ + struct read_only_listener *next; +}; + +struct vdo_thread { + struct vdo *vdo; + thread_id_t thread_id; + struct vdo_work_queue *queue; + /* + * Each thread maintains its own notion of whether the VDO is read-only so that the + * read-only state can be checked from any base thread without worrying about + * synchronization or thread safety. This does mean that knowledge of the VDO going + * read-only does not occur simultaneously across the VDO's threads, but that does not seem + * to cause any problems. + */ + bool is_read_only; + /* + * A list of objects waiting to be notified on this thread that the VDO has entered + * read-only mode. + */ + struct read_only_listener *listeners; + struct registered_thread allocating_thread; +}; + +/* Keep struct bio statistics atomically */ +struct atomic_bio_stats { + atomic64_t read; /* Number of not REQ_WRITE bios */ + atomic64_t write; /* Number of REQ_WRITE bios */ + atomic64_t discard; /* Number of REQ_DISCARD bios */ + atomic64_t flush; /* Number of REQ_FLUSH bios */ + atomic64_t empty_flush; /* Number of REQ_PREFLUSH bios without data */ + atomic64_t fua; /* Number of REQ_FUA bios */ +}; + +/* Counters are atomic since updates can arrive concurrently from arbitrary threads. */ +struct atomic_statistics { + atomic64_t bios_submitted; + atomic64_t bios_completed; + atomic64_t flush_out; + atomic64_t invalid_advice_pbn_count; + atomic64_t no_space_error_count; + atomic64_t read_only_error_count; + struct atomic_bio_stats bios_in; + struct atomic_bio_stats bios_in_partial; + struct atomic_bio_stats bios_out; + struct atomic_bio_stats bios_out_completed; + struct atomic_bio_stats bios_acknowledged; + struct atomic_bio_stats bios_acknowledged_partial; + struct atomic_bio_stats bios_meta; + struct atomic_bio_stats bios_meta_completed; + struct atomic_bio_stats bios_journal; + struct atomic_bio_stats bios_journal_completed; + struct atomic_bio_stats bios_page_cache; + struct atomic_bio_stats bios_page_cache_completed; +}; + +struct read_only_notifier { + /* The completion for entering read-only mode */ + struct vdo_completion completion; + /* A completion waiting for notifications to be drained or enabled */ + struct vdo_completion *waiter; + /* Lock to protect the next two fields */ + spinlock_t lock; + /* The code of the error which put the VDO into read-only mode */ + int read_only_error; + /* The current state of the notifier (values described above) */ + enum notifier_state state; +}; + +/* + * The thread ID returned when the current thread is not a vdo thread, or can not be determined + * (usually due to being at interrupt context). + */ +#define VDO_INVALID_THREAD_ID ((thread_id_t) -1) + +struct thread_config { + zone_count_t logical_zone_count; + zone_count_t physical_zone_count; + zone_count_t hash_zone_count; + thread_count_t bio_thread_count; + thread_count_t thread_count; + thread_id_t admin_thread; + thread_id_t journal_thread; + thread_id_t packer_thread; + thread_id_t dedupe_thread; + thread_id_t bio_ack_thread; + thread_id_t cpu_thread; + thread_id_t *logical_threads; + thread_id_t *physical_threads; + thread_id_t *hash_zone_threads; + thread_id_t *bio_threads; +}; + +struct thread_count_config; + +struct vdo_super_block { + /* The vio for reading and writing the super block to disk */ + struct vio vio; + /* A buffer to hold the super block */ + u8 *buffer; + /* Whether this super block may not be written */ + bool unwriteable; +}; + +struct data_vio_pool; + +struct vdo_administrator { + struct vdo_completion completion; + struct admin_state state; + atomic_t busy; + u32 phase; + struct completion callback_sync; +}; + +struct vdo { + char thread_name_prefix[MAX_VDO_WORK_QUEUE_NAME_LEN]; + struct vdo_thread *threads; + vdo_action *action; + struct vdo_completion *completion; + struct vio_tracer *vio_tracer; + + /* The atomic version of the state of this vdo */ + atomic_t state; + /* The full state of all components */ + struct vdo_component_states states; + /* + * A counter value to attach to thread names and log messages to identify the individual + * device. + */ + unsigned int instance; + /* The read-only notifier */ + struct read_only_notifier read_only_notifier; + /* The load-time configuration of this vdo */ + struct device_config *device_config; + /* The thread mapping */ + struct thread_config thread_config; + + /* The super block */ + struct vdo_super_block super_block; + + /* The partitioning of the underlying storage */ + struct layout layout; + struct layout next_layout; + struct dm_kcopyd_client *partition_copier; + + /* The block map */ + struct block_map *block_map; + + /* The journal for block map recovery */ + struct recovery_journal *recovery_journal; + + /* The slab depot */ + struct slab_depot *depot; + + /* The compressed-block packer */ + struct packer *packer; + /* Whether incoming data should be compressed */ + bool compressing; + + /* The handler for flush requests */ + struct flusher *flusher; + + /* The state the vdo was in when loaded (primarily for unit tests) */ + enum vdo_state load_state; + + /* The logical zones of this vdo */ + struct logical_zones *logical_zones; + + /* The physical zones of this vdo */ + struct physical_zones *physical_zones; + + /* The hash lock zones of this vdo */ + struct hash_zones *hash_zones; + + /* Bio submission manager used for sending bios to the storage device. */ + struct io_submitter *io_submitter; + + /* The pool of data_vios for servicing incoming bios */ + struct data_vio_pool *data_vio_pool; + + /* The manager for administrative operations */ + struct vdo_administrator admin; + + /* Flags controlling administrative operations */ + const struct admin_state_code *suspend_type; + bool allocations_allowed; + bool dump_on_shutdown; + atomic_t processing_message; + + /* + * Statistics + * Atomic stats counters + */ + struct atomic_statistics stats; + /* Used to gather statistics without allocating memory */ + struct vdo_statistics stats_buffer; + /* Protects the stats_buffer */ + struct mutex stats_mutex; + /* true if sysfs directory is set up */ + bool sysfs_added; + /* Used when shutting down the sysfs statistics */ + struct completion stats_shutdown; + + + /* A list of all device_configs referencing this vdo */ + struct list_head device_config_list; + + /* This VDO's list entry for the device registry */ + struct list_head registration; + + /* Underlying block device info. */ + u64 starting_sector_offset; + struct volume_geometry geometry; + + /* For sysfs */ + struct kobject vdo_directory; + struct kobject stats_directory; + + /* N blobs of context data for LZ4 code, one per CPU thread. */ + char **compression_context; +}; + + +/** + * vdo_uses_bio_ack_queue() - Indicate whether the vdo is configured to use a separate work queue + * for acknowledging received and processed bios. + * @vdo: The vdo. + * + * Note that this directly controls the handling of write operations, but the compile-time flag + * VDO_USE_BIO_ACK_QUEUE_FOR_READ is also checked for read operations. + * + * Return: Whether a bio-acknowledgement work queue is in use. + */ +static inline bool vdo_uses_bio_ack_queue(struct vdo *vdo) +{ + return vdo->device_config->thread_counts.bio_ack_threads > 0; +} + +/** + * typedef vdo_filter_t - Method type for vdo matching methods. + * + * A filter function returns false if the vdo doesn't match. + */ +typedef bool vdo_filter_t(struct vdo *vdo, const void *context); + +void vdo_initialize_device_registry_once(void); +struct vdo * __must_check vdo_find_matching(vdo_filter_t *filter, const void *context); + +int __must_check vdo_make_thread(struct vdo *vdo, + thread_id_t thread_id, + const struct vdo_work_queue_type *type, + unsigned int queue_count, + void *contexts[]); + +static inline int __must_check vdo_make_default_thread(struct vdo *vdo, thread_id_t thread_id) +{ + return vdo_make_thread(vdo, thread_id, NULL, 1, NULL); +} + +int __must_check +vdo_make(unsigned int instance, struct device_config *config, char **reason, struct vdo **vdo_ptr); + +void vdo_destroy(struct vdo *vdo); + +void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent); + +int __must_check vdo_add_sysfs_stats_dir(struct vdo *vdo); + +struct block_device * __must_check vdo_get_backing_device(const struct vdo *vdo); + +const char * __must_check vdo_get_device_name(const struct dm_target *target); + +int __must_check vdo_synchronous_flush(struct vdo *vdo); + +const struct admin_state_code * __must_check vdo_get_admin_state(const struct vdo *vdo); + +bool vdo_set_compressing(struct vdo *vdo, bool enable); + +bool vdo_get_compressing(struct vdo *vdo); + +void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats); + +thread_id_t vdo_get_callback_thread_id(void); + +enum vdo_state __must_check vdo_get_state(const struct vdo *vdo); + +void vdo_set_state(struct vdo *vdo, enum vdo_state state); + +void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent); + +int vdo_register_read_only_listener(struct vdo *vdo, + void *listener, + vdo_read_only_notification *notification, + thread_id_t thread_id); + +int vdo_enable_read_only_entry(struct vdo *vdo); + +void vdo_wait_until_not_entering_read_only_mode(struct vdo_completion *parent); + +void vdo_allow_read_only_mode_entry(struct vdo_completion *parent); + +void vdo_enter_read_only_mode(struct vdo *vdo, int error_code); + +bool __must_check vdo_is_read_only(struct vdo *vdo); + +bool __must_check vdo_in_read_only_mode(const struct vdo *vdo); + +bool __must_check vdo_in_recovery_mode(const struct vdo *vdo); + +void vdo_enter_recovery_mode(struct vdo *vdo); + +void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name); + +void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, + zone_count_t logical_zone, + const char *name); + +void vdo_assert_on_physical_zone_thread(const struct vdo *vdo, + zone_count_t physical_zone, + const char *name); + +int __must_check vdo_get_physical_zone(const struct vdo *vdo, + physical_block_number_t pbn, + struct physical_zone **zone_ptr); + +void vdo_dump_status(const struct vdo *vdo); + +#endif /* VDO_H */ diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c new file mode 100644 index 00000000000..e42b13bf1cd --- /dev/null +++ b/drivers/md/dm-vdo/vio.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vio.h" + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/kernel.h> +#include <linux/ratelimit.h> + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "constants.h" +#include "io-submitter.h" +#include "vdo.h" + +/* A vio_pool is a collection of preallocated vios. */ +struct vio_pool { + /** The number of objects managed by the pool */ + size_t size; + /** The list of objects which are available */ + struct list_head available; + /** The queue of requestors waiting for objects from the pool */ + struct wait_queue waiting; + /** The number of objects currently in use */ + size_t busy_count; + /** The list of objects which are in use */ + struct list_head busy; + /** The ID of the thread on which this pool may be used */ + thread_id_t thread_id; + /** The buffer backing the pool's vios */ + char *buffer; + /** The pool entries */ + struct pooled_vio vios[]; +}; + +physical_block_number_t pbn_from_vio_bio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo *vdo = vio->completion.vdo; + physical_block_number_t pbn = bio->bi_iter.bi_sector / VDO_SECTORS_PER_BLOCK; + + return ((pbn == VDO_GEOMETRY_BLOCK_LOCATION) ? pbn : pbn + vdo->geometry.bio_offset); +} + +static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr) +{ + struct bio *bio = NULL; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct bio, size + 1, struct bio_vec, "bio", &bio); + if (result != VDO_SUCCESS) + return result; + + *bio_ptr = bio; + return VDO_SUCCESS; +} + +int vdo_create_bio(struct bio **bio_ptr) +{ + return create_multi_block_bio(1, bio_ptr); +} + +void vdo_free_bio(struct bio *bio) +{ + if (bio == NULL) + return; + + bio_uninit(bio); + UDS_FREE(UDS_FORGET(bio)); +} + +int allocate_vio_components(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + unsigned int block_count, + char *data, + struct vio *vio) +{ + struct bio *bio; + int result; + + result = ASSERT(block_count <= MAX_BLOCKS_PER_VIO, + "block count %u does not exceed maximum %u", + block_count, + MAX_BLOCKS_PER_VIO); + if (result != VDO_SUCCESS) + return result; + + result = ASSERT(((vio_type != VIO_TYPE_UNINITIALIZED) && (vio_type != VIO_TYPE_DATA)), + "%d is a metadata type", + vio_type); + if (result != VDO_SUCCESS) + return result; + + result = create_multi_block_bio(block_count, &bio); + if (result != VDO_SUCCESS) + return result; + + initialize_vio(vio, bio, block_count, vio_type, priority, vdo); + vio->completion.parent = parent; + vio->data = data; + return VDO_SUCCESS; +} + +/** + * create_multi_block_metadata_vio() - Create a vio. + * @vdo: The vdo on which the vio will operate. + * @vio_type: The type of vio to create. + * @priority: The relative priority to assign to the vio. + * @parent: The parent of the vio. + * @block_count: The size of the vio in blocks. + * @data: The buffer. + * @vio_ptr: A pointer to hold the new vio. + * + * Return: VDO_SUCCESS or an error. + */ +int create_multi_block_metadata_vio(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + unsigned int block_count, + char *data, + struct vio **vio_ptr) +{ + struct vio *vio; + int result; + + /* If struct vio grows past 256 bytes, we'll lose benefits of VDOSTORY-176. */ + STATIC_ASSERT(sizeof(struct vio) <= 256); + + /* + * Metadata vios should use direct allocation and not use the buffer pool, which is + * reserved for submissions from the linux block layer. + */ + result = UDS_ALLOCATE(1, struct vio, __func__, &vio); + if (result != VDO_SUCCESS) { + uds_log_error("metadata vio allocation failure %d", result); + return result; + } + + result = allocate_vio_components(vdo, vio_type, priority, parent, block_count, data, vio); + if (result != VDO_SUCCESS) { + UDS_FREE(vio); + return result; + } + + *vio_ptr = vio; + return VDO_SUCCESS; +} + +/** + * free_vio_components(): Free the components of a vio embedded in a larger structure. + * @vio: The vio to destroy + */ +void free_vio_components(struct vio *vio) +{ + if (vio == NULL) + return; + + BUG_ON(is_data_vio(vio)); + vdo_free_bio(UDS_FORGET(vio->bio)); +} + +/** + * free_vio() - Destroy a vio. + * @vio: The vio to destroy. + */ +void free_vio(struct vio *vio) +{ + free_vio_components(vio); + UDS_FREE(vio); +} + +/* Set bio properties for a VDO read or write. */ +void vdo_set_bio_properties(struct bio *bio, + struct vio *vio, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn) +{ + struct vdo *vdo = vio->completion.vdo; + struct device_config *config = vdo->device_config; + + pbn -= vdo->geometry.bio_offset; + vio->bio_zone = ((pbn / config->thread_counts.bio_rotation_interval) % + config->thread_counts.bio_threads); + + bio->bi_private = vio; + bio->bi_end_io = callback; + bio->bi_opf = bi_opf; + bio->bi_iter.bi_sector = pbn * VDO_SECTORS_PER_BLOCK; +} + +/* + * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated + * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a + * vio associated with the bio. + */ +int vio_reset_bio(struct vio *vio, + char *data, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn) +{ + int bvec_count, offset, len, i; + struct bio *bio = vio->bio; + + bio_reset(bio, bio->bi_bdev, bi_opf); + vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); + if (data == NULL) + return VDO_SUCCESS; + + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_max_vecs = vio->block_count + 1; + len = VDO_BLOCK_SIZE * vio->block_count; + offset = offset_in_page(data); + bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + /* + * If we knew that data was always on one page, or contiguous pages, we wouldn't need the + * loop. But if we're using vmalloc, it's not impossible that the data is in different + * pages that can't be merged in bio_add_page... + */ + for (i = 0; (i < bvec_count) && (len > 0); i++) { + struct page *page; + int bytes_added; + int bytes = PAGE_SIZE - offset; + + if (bytes > len) + bytes = len; + + page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); + bytes_added = bio_add_page(bio, page, bytes, offset); + + if (bytes_added != bytes) + return uds_log_error_strerror(VDO_BIO_CREATION_FAILED, + "Could only add %i bytes to bio", + bytes_added); + + data += bytes; + len -= bytes; + offset = 0; + } + + return VDO_SUCCESS; +} + +/** + * update_vio_error_stats() - Update per-vio error stats and log the error. + * @vio: The vio which got an error. + * @format: The format of the message to log (a printf style format). + */ +void update_vio_error_stats(struct vio *vio, const char *format, ...) +{ + static DEFINE_RATELIMIT_STATE(error_limiter, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + va_list args; + int priority; + struct vdo *vdo = vio->completion.vdo; + + switch (vio->completion.result) { + case VDO_READ_ONLY: + atomic64_inc(&vdo->stats.read_only_error_count); + return; + + case VDO_NO_SPACE: + atomic64_inc(&vdo->stats.no_space_error_count); + priority = UDS_LOG_DEBUG; + break; + + default: + priority = UDS_LOG_ERR; + } + + if (!__ratelimit(&error_limiter)) + return; + + va_start(args, format); + uds_vlog_strerror(priority, vio->completion.result, UDS_LOGGING_MODULE_NAME, format, args); + va_end(args); +} + +void vio_record_metadata_io_error(struct vio *vio) +{ + const char *description; + physical_block_number_t pbn = pbn_from_vio_bio(vio->bio); + + if (bio_op(vio->bio) == REQ_OP_READ) + description = "read"; + else if ((vio->bio->bi_opf & REQ_PREFLUSH) == REQ_PREFLUSH) + description = (((vio->bio->bi_opf & REQ_FUA) == REQ_FUA) ? + "write+preflush+fua" : + "write+preflush"); + else if ((vio->bio->bi_opf & REQ_FUA) == REQ_FUA) + description = "write+fua"; + else + description = "write"; + + update_vio_error_stats(vio, + "Completing %s vio of type %u for physical block %llu with error", + description, + vio->type, + (unsigned long long) pbn); +} + +/** + * make_vio_pool() - Create a new vio pool. + * @vdo: The vdo. + * @pool_size: The number of vios in the pool. + * @thread_id: The ID of the thread using this pool. + * @vio_type: The type of vios in the pool. + * @priority: The priority with which vios from the pool should be enqueued. + * @context: The context that each entry will have. + * @pool_ptr: The resulting pool. + * + * Return: A success or error code. + */ +int make_vio_pool(struct vdo *vdo, + size_t pool_size, + thread_id_t thread_id, + enum vio_type vio_type, + enum vio_priority priority, + void *context, + struct vio_pool **pool_ptr) +{ + struct vio_pool *pool; + char *ptr; + int result; + + result = UDS_ALLOCATE_EXTENDED(struct vio_pool, + pool_size, + struct pooled_vio, + __func__, + &pool); + if (result != VDO_SUCCESS) + return result; + + pool->thread_id = thread_id; + INIT_LIST_HEAD(&pool->available); + INIT_LIST_HEAD(&pool->busy); + + result = UDS_ALLOCATE(pool_size * VDO_BLOCK_SIZE, char, "VIO pool buffer", &pool->buffer); + if (result != VDO_SUCCESS) { + free_vio_pool(pool); + return result; + } + + ptr = pool->buffer; + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { + struct pooled_vio *pooled = &pool->vios[pool->size]; + + result = allocate_vio_components(vdo, + vio_type, + priority, + NULL, + 1, + ptr, + &pooled->vio); + if (result != VDO_SUCCESS) { + free_vio_pool(pool); + return result; + } + + pooled->context = context; + list_add_tail(&pooled->pool_entry, &pool->available); + } + + *pool_ptr = pool; + return VDO_SUCCESS; +} + +/** + * free_vio_pool() - Destroy a vio pool. + * @pool: The pool to free. + */ +void free_vio_pool(struct vio_pool *pool) +{ + struct pooled_vio *pooled, *tmp; + + if (pool == NULL) + return; + + /* Remove all available vios from the object pool. */ + ASSERT_LOG_ONLY(!vdo_has_waiters(&pool->waiting), + "VIO pool must not have any waiters when being freed"); + ASSERT_LOG_ONLY((pool->busy_count == 0), + "VIO pool must not have %zu busy entries when being freed", + pool->busy_count); + ASSERT_LOG_ONLY(list_empty(&pool->busy), + "VIO pool must not have busy entries when being freed"); + + list_for_each_entry_safe(pooled, tmp, &pool->available, pool_entry) { + list_del(&pooled->pool_entry); + free_vio_components(&pooled->vio); + pool->size--; + } + + ASSERT_LOG_ONLY(pool->size == 0, + "VIO pool must not have missing entries when being freed"); + + UDS_FREE(UDS_FORGET(pool->buffer)); + UDS_FREE(pool); +} + +/** + * is_vio_pool_busy() - Check whether an vio pool has outstanding entries. + * + * Return: true if the pool is busy. + */ +bool is_vio_pool_busy(struct vio_pool *pool) +{ + return (pool->busy_count != 0); +} + +/** + * acquire_vio_from_pool() - Acquire a vio and buffer from the pool (asynchronous). + * @pool: The vio pool. + * @waiter: Object that is requesting a vio. + */ +void acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter) +{ + struct pooled_vio *pooled; + + ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), + "acquire from active vio_pool called from correct thread"); + + if (list_empty(&pool->available)) { + vdo_enqueue_waiter(&pool->waiting, waiter); + return; + } + + pooled = list_first_entry(&pool->available, struct pooled_vio, pool_entry); + pool->busy_count++; + list_move_tail(&pooled->pool_entry, &pool->busy); + (*waiter->callback)(waiter, pooled); +} + +/** + * return_vio_to_pool() - Return a vio to the pool + * @pool: The vio pool. + * @vio: The pooled vio to return. + */ +void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) +{ + ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), + "vio pool entry returned on same thread as it was acquired"); + + vio->vio.completion.error_handler = NULL; + vio->vio.completion.parent = NULL; + if (vdo_has_waiters(&pool->waiting)) { + vdo_notify_next_waiter(&pool->waiting, NULL, vio); + return; + } + + list_move_tail(&vio->pool_entry, &pool->available); + --pool->busy_count; +} + +/* + * Various counting functions for statistics. + * These are used for bios coming into VDO, as well as bios generated by VDO. + */ +void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio) +{ + if (((bio->bi_opf & REQ_PREFLUSH) != 0) && (bio->bi_iter.bi_size == 0)) { + atomic64_inc(&bio_stats->empty_flush); + atomic64_inc(&bio_stats->flush); + return; + } + + switch (bio_op(bio)) { + case REQ_OP_WRITE: + atomic64_inc(&bio_stats->write); + break; + case REQ_OP_READ: + atomic64_inc(&bio_stats->read); + break; + case REQ_OP_DISCARD: + atomic64_inc(&bio_stats->discard); + break; + /* + * All other operations are filtered out in dmvdo.c, or not created by VDO, so + * shouldn't exist. + */ + default: + ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard, or empty flush", + bio_op(bio)); + } + + if ((bio->bi_opf & REQ_PREFLUSH) != 0) + atomic64_inc(&bio_stats->flush); + if (bio->bi_opf & REQ_FUA) + atomic64_inc(&bio_stats->fua); +} + +static void count_all_bios_completed(struct vio *vio, struct bio *bio) +{ + struct atomic_statistics *stats = &vio->completion.vdo->stats; + + if (is_data_vio(vio)) { + vdo_count_bios(&stats->bios_out_completed, bio); + return; + } + + vdo_count_bios(&stats->bios_meta_completed, bio); + if (vio->type == VIO_TYPE_RECOVERY_JOURNAL) + vdo_count_bios(&stats->bios_journal_completed, bio); + else if (vio->type == VIO_TYPE_BLOCK_MAP) + vdo_count_bios(&stats->bios_page_cache_completed, bio); +} + +void vdo_count_completed_bios(struct bio *bio) +{ + struct vio *vio = (struct vio *) bio->bi_private; + + atomic64_inc(&vio->completion.vdo->stats.bios_completed); + count_all_bios_completed(vio, bio); +} diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h new file mode 100644 index 00000000000..f39f568834e --- /dev/null +++ b/drivers/md/dm-vdo/vio.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VIO_H +#define VIO_H + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/list.h> + +#include "completion.h" +#include "constants.h" +#include "types.h" +#include "vdo.h" + +enum { + MAX_BLOCKS_PER_VIO = (BIO_MAX_VECS << PAGE_SHIFT) / VDO_BLOCK_SIZE, +}; + +struct pooled_vio { + /* The underlying vio */ + struct vio vio; + /* The list entry for chaining pooled vios together */ + struct list_head list_entry; + /* The context set by the pool */ + void *context; + /* The list entry used by the pool */ + struct list_head pool_entry; +}; + +/** + * as_vio() - Convert a generic vdo_completion to a vio. + * @completion: The completion to convert. + * + * Return: The completion as a vio. + */ +static inline struct vio *as_vio(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion, VIO_COMPLETION); + return container_of(completion, struct vio, completion); +} + +/** + * get_vio_bio_zone_thread_id() - Get the thread id of the bio zone in which a vio should submit + * its I/O. + * @vio: The vio. + * + * Return: The id of the bio zone thread the vio should use. + */ +static inline thread_id_t __must_check get_vio_bio_zone_thread_id(struct vio *vio) +{ + return vio->completion.vdo->thread_config.bio_threads[vio->bio_zone]; +} + +physical_block_number_t __must_check pbn_from_vio_bio(struct bio *bio); + +/** + * assert_vio_in_bio_zone() - Check that a vio is running on the correct thread for its bio zone. + * @vio: The vio to check. + */ +static inline void assert_vio_in_bio_zone(struct vio *vio) +{ + thread_id_t expected = get_vio_bio_zone_thread_id(vio); + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "vio I/O for physical block %llu on thread %u, should be on bio zone thread %u", + (unsigned long long) pbn_from_vio_bio(vio->bio), + thread_id, + expected); +} + +int vdo_create_bio(struct bio **bio_ptr); +void vdo_free_bio(struct bio *bio); +int allocate_vio_components(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + unsigned int block_count, + char *data, + struct vio *vio); +int __must_check create_multi_block_metadata_vio(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + unsigned int block_count, + char *data, + struct vio **vio_ptr); + +static inline int __must_check +create_metadata_vio(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + char *data, + struct vio **vio_ptr) +{ + return create_multi_block_metadata_vio(vdo, vio_type, priority, parent, 1, data, vio_ptr); +} + +void free_vio_components(struct vio *vio); +void free_vio(struct vio *vio); + +/** + * initialize_vio() - Initialize a vio. + * @vio: The vio to initialize. + * @bio: The bio this vio should use for its I/O. + * @block_count: The size of this vio in vdo blocks. + * @vio_type: The vio type. + * @priority: The relative priority of the vio. + * @vdo: The vdo for this vio. + */ +static inline void initialize_vio(struct vio *vio, + struct bio *bio, + unsigned int block_count, + enum vio_type vio_type, + enum vio_priority priority, + struct vdo *vdo) +{ + /* data_vio's may not span multiple blocks */ + BUG_ON((vio_type == VIO_TYPE_DATA) && (block_count != 1)); + + vio->bio = bio; + vio->block_count = block_count; + vio->type = vio_type; + vio->priority = priority; + vdo_initialize_completion(&vio->completion, vdo, VIO_COMPLETION); +} + +void vdo_set_bio_properties(struct bio *bio, + struct vio *vio, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn); + +int vio_reset_bio(struct vio *vio, + char *data, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn); + +void update_vio_error_stats(struct vio *vio, const char *format, ...) + __printf(2, 3); + +/** + * is_data_vio() - Check whether a vio is servicing an external data request. + * @vio: The vio to check. + */ +static inline bool is_data_vio(struct vio *vio) +{ + return (vio->type == VIO_TYPE_DATA); +} + +/** + * get_metadata_priority() - Convert a vio's priority to a work item priority. + * @vio: The vio. + * + * Return: The priority with which to submit the vio's bio. + */ +static inline enum vdo_completion_priority get_metadata_priority(struct vio *vio) +{ + return ((vio->priority == VIO_PRIORITY_HIGH) ? + BIO_Q_HIGH_PRIORITY : + BIO_Q_METADATA_PRIORITY); +} + +/** + * continue_vio() - Enqueue a vio to run its next callback. + * @vio: The vio to continue. + * + * Return: The result of the current operation. + */ +static inline void continue_vio(struct vio *vio, int result) +{ + if (unlikely(result != VDO_SUCCESS)) + vdo_set_completion_result(&vio->completion, result); + + vdo_enqueue_completion(&vio->completion, VDO_WORK_Q_DEFAULT_PRIORITY); +} + +void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio); +void vdo_count_completed_bios(struct bio *bio); + +/** + * continue_vio_after_io() - Continue a vio now that its I/O has returned. + */ +static inline void continue_vio_after_io(struct vio *vio, vdo_action *callback, thread_id_t thread) +{ + vdo_count_completed_bios(vio->bio); + vdo_set_completion_callback(&vio->completion, callback, thread); + continue_vio(vio, blk_status_to_errno(vio->bio->bi_status)); +} + +void vio_record_metadata_io_error(struct vio *vio); + +/* A vio_pool is a collection of preallocated vios used to write arbitrary metadata blocks. */ + +static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio) +{ + return container_of(vio, struct pooled_vio, vio); +} + +struct vio_pool; + +int __must_check make_vio_pool(struct vdo *vdo, + size_t pool_size, + thread_id_t thread_id, + enum vio_type vio_type, + enum vio_priority priority, + void *context, + struct vio_pool **pool_ptr); +void free_vio_pool(struct vio_pool *pool); +bool __must_check is_vio_pool_busy(struct vio_pool *pool); +void acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter); +void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); + +#endif /* VIO_H */ diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c new file mode 100644 index 00000000000..4048c11c3e5 --- /dev/null +++ b/drivers/md/dm-vdo/wait-queue.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "wait-queue.h" + +#include <linux/device-mapper.h> + +#include "permassert.h" + +#include "status-codes.h" + +/** + * vdo_enqueue_waiter() - Add a waiter to the tail end of a wait queue. + * @queue: The queue to which to add the waiter. + * @waiter: The waiter to add to the queue. + * + * The waiter must not already be waiting in a queue. + * + * Return: VDO_SUCCESS or an error code. + */ +void vdo_enqueue_waiter(struct wait_queue *queue, struct waiter *waiter) +{ + BUG_ON(waiter->next_waiter != NULL); + + if (queue->last_waiter == NULL) { + /* + * The queue is empty, so form the initial circular list by self-linking the + * initial waiter. + */ + waiter->next_waiter = waiter; + } else { + /* Splice the new waiter in at the end of the queue. */ + waiter->next_waiter = queue->last_waiter->next_waiter; + queue->last_waiter->next_waiter = waiter; + } + + /* In both cases, the waiter we added to the ring becomes the last waiter. */ + queue->last_waiter = waiter; + queue->queue_length += 1; +} + +/** + * vdo_transfer_all_waiters() - Transfer all waiters from one wait queue to a second queue, + * emptying the first queue. + * @from_queue: The queue containing the waiters to move. + * @to_queue: The queue that will receive the waiters from the first queue. + */ +void vdo_transfer_all_waiters(struct wait_queue *from_queue, struct wait_queue *to_queue) +{ + /* If the source queue is empty, there's nothing to do. */ + if (!vdo_has_waiters(from_queue)) + return; + + if (vdo_has_waiters(to_queue)) { + /* + * Both queues are non-empty. Splice the two circular lists together by swapping + * the next (head) pointers in the list tails. + */ + struct waiter *from_head = from_queue->last_waiter->next_waiter; + struct waiter *to_head = to_queue->last_waiter->next_waiter; + + to_queue->last_waiter->next_waiter = from_head; + from_queue->last_waiter->next_waiter = to_head; + } + + to_queue->last_waiter = from_queue->last_waiter; + to_queue->queue_length += from_queue->queue_length; + vdo_initialize_wait_queue(from_queue); +} + +/** + * vdo_notify_all_waiters() - Notify all the entries waiting in a queue. + * @queue: The wait queue containing the waiters to notify. + * @callback: The function to call to notify each waiter, or NULL to invoke the callback field + * registered in each waiter. + * @context: The context to pass to the callback function. + * + * Notifies all the entries waiting in a queue to continue execution by invoking a callback + * function on each of them in turn. The queue is copied and emptied before invoking any callbacks, + * and only the waiters that were in the queue at the start of the call will be notified. + */ +void vdo_notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, void *context) +{ + /* + * Copy and empty the queue first, avoiding the possibility of an infinite loop if entries + * are returned to the queue by the callback function. + */ + struct wait_queue waiters; + + vdo_initialize_wait_queue(&waiters); + vdo_transfer_all_waiters(queue, &waiters); + + /* Drain the copied queue, invoking the callback on every entry. */ + while (vdo_notify_next_waiter(&waiters, callback, context)) + /* All the work is done by the loop condition. */ + ; +} + +/** + * vdo_get_first_waiter() - Return the waiter that is at the head end of a wait queue. + * @queue: The queue from which to get the first waiter. + * + * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty. + */ +struct waiter *vdo_get_first_waiter(const struct wait_queue *queue) +{ + struct waiter *last_waiter = queue->last_waiter; + + if (last_waiter == NULL) + /* There are no waiters, so we're done. */ + return NULL; + + /* The queue is circular, so the last entry links to the head of the queue. */ + return last_waiter->next_waiter; +} + +/** + * vdo_dequeue_matching_waiters() - Remove all waiters that match based on the specified matching + * method and append them to a wait_queue. + * @queue: The wait queue to process. + * @match_method: The method to determine matching. + * @match_context: Contextual info for the match method. + * @matched_queue: A wait_queue to store matches. + */ +void vdo_dequeue_matching_waiters(struct wait_queue *queue, + waiter_match *match_method, + void *match_context, + struct wait_queue *matched_queue) +{ + struct wait_queue matched_waiters, iteration_queue; + + vdo_initialize_wait_queue(&matched_waiters); + + vdo_initialize_wait_queue(&iteration_queue); + vdo_transfer_all_waiters(queue, &iteration_queue); + while (vdo_has_waiters(&iteration_queue)) { + struct waiter *waiter = vdo_dequeue_next_waiter(&iteration_queue); + + vdo_enqueue_waiter((match_method(waiter, match_context) ? + &matched_waiters : + queue), + waiter); + } + + vdo_transfer_all_waiters(&matched_waiters, matched_queue); +} + +/** + * vdo_dequeue_next_waiter() - Remove the first waiter from the head end of a wait queue. + * @queue: The wait queue from which to remove the first entry. + * + * The caller will be responsible for waking the waiter by invoking the correct callback function + * to resume its execution. + * + * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty. + */ +struct waiter *vdo_dequeue_next_waiter(struct wait_queue *queue) +{ + struct waiter *first_waiter = vdo_get_first_waiter(queue); + struct waiter *last_waiter = queue->last_waiter; + + if (first_waiter == NULL) + return NULL; + + if (first_waiter == last_waiter) + /* The queue has a single entry, so just empty it out by nulling the tail. */ + queue->last_waiter = NULL; + else + /* + * The queue has more than one entry, so splice the first waiter out of the + * circular queue. + */ + last_waiter->next_waiter = first_waiter->next_waiter; + + /* The waiter is no longer in a wait queue. */ + first_waiter->next_waiter = NULL; + queue->queue_length -= 1; + return first_waiter; +} + +/** + * vdo_notify_next_waiter() - Notify the next entry waiting in a queue. + * @queue: The wait queue containing the waiter to notify. + * @callback: The function to call to notify the waiter, or NULL to invoke the callback field + * registered in the waiter. + * @context: The context to pass to the callback function. + * + * Notifies the next entry waiting in a queue to continue execution by invoking a callback function + * on it after removing it from the queue. + * + * Return: true if there was a waiter in the queue. + */ +bool vdo_notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, void *context) +{ + struct waiter *waiter = vdo_dequeue_next_waiter(queue); + + if (waiter == NULL) + return false; + + if (callback == NULL) + callback = waiter->callback; + (*callback)(waiter, context); + return true; +} + +/** + * vdo_get_next_waiter() - Get the waiter after this one, for debug iteration. + * @queue: The wait queue. + * @waiter: A waiter. + * + * Return: The next waiter, or NULL. + */ +const struct waiter * +vdo_get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter) +{ + struct waiter *first_waiter = vdo_get_first_waiter(queue); + + if (waiter == NULL) + return first_waiter; + return ((waiter->next_waiter != first_waiter) ? waiter->next_waiter : NULL); +} diff --git a/drivers/md/dm-vdo/wait-queue.h b/drivers/md/dm-vdo/wait-queue.h new file mode 100644 index 00000000000..f5f8bb3a7a4 --- /dev/null +++ b/drivers/md/dm-vdo/wait-queue.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_WAIT_QUEUE_H +#define VDO_WAIT_QUEUE_H + +#include <linux/compiler.h> +#include <linux/types.h> + +/** + * DOC: Wait queues. + * + * A wait queue is a circular list of entries waiting to be notified of a change in a condition. + * Keeping a circular list allows the queue structure to simply be a pointer to the tail (newest) + * entry in the queue, supporting constant-time enqueue and dequeue operations. A null pointer is + * an empty queue. + * + * An empty queue: + * queue0.last_waiter -> NULL + * + * A singleton queue: + * queue1.last_waiter -> entry1 -> entry1 -> [...] + * + * A three-element queue: + * queue2.last_waiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] + */ + +struct waiter; + +struct wait_queue { + /* The tail of the queue, the last (most recently added) entry */ + struct waiter *last_waiter; + /* The number of waiters currently in the queue */ + size_t queue_length; +}; + +/** + * typedef waiter_callback - Callback type for functions which will be called to resume processing + * of a waiter after it has been removed from its wait queue. + */ +typedef void waiter_callback(struct waiter *waiter, void *context); + +/** + * typedef waiter_match - Method type for waiter matching methods. + * + * A waiter_match method returns false if the waiter does not match. + */ +typedef bool waiter_match(struct waiter *waiter, void *context); + +/* The queue entry structure for entries in a wait_queue. */ +struct waiter { + /* + * The next waiter in the queue. If this entry is the last waiter, then this is actually a + * pointer back to the head of the queue. + */ + struct waiter *next_waiter; + + /* Optional waiter-specific callback to invoke when waking this waiter. */ + waiter_callback *callback; +}; + +/** + * is_waiting() -Check whether a waiter is waiting. + * @waiter: The waiter to check. + * + * Return: true if the waiter is on some wait_queue. + */ +static inline bool vdo_is_waiting(struct waiter *waiter) +{ + return (waiter->next_waiter != NULL); +} + +/** + * initialize_wait_queue() - Initialize a wait queue. + * @queue: The queue to initialize. + */ +static inline void vdo_initialize_wait_queue(struct wait_queue *queue) +{ + *queue = (struct wait_queue) { + .last_waiter = NULL, + .queue_length = 0, + }; +} + +/** + * has_waiters() - Check whether a wait queue has any entries waiting in it. + * @queue: The queue to query. + * + * Return: true if there are any waiters in the queue. + */ +static inline bool __must_check vdo_has_waiters(const struct wait_queue *queue) +{ + return (queue->last_waiter != NULL); +} + +void vdo_enqueue_waiter(struct wait_queue *queue, struct waiter *waiter); + +void vdo_notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, void *context); + +bool vdo_notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, void *context); + +void vdo_transfer_all_waiters(struct wait_queue *from_queue, struct wait_queue *to_queue); + +struct waiter *vdo_get_first_waiter(const struct wait_queue *queue); + +void vdo_dequeue_matching_waiters(struct wait_queue *queue, + waiter_match *match_method, + void *match_context, + struct wait_queue *matched_queue); + +struct waiter *vdo_dequeue_next_waiter(struct wait_queue *queue); + +/** + * count_waiters() - Count the number of waiters in a wait queue. + * @queue: The wait queue to query. + * + * Return: The number of waiters in the queue. + */ +static inline size_t __must_check vdo_count_waiters(const struct wait_queue *queue) +{ + return queue->queue_length; +} + +const struct waiter * __must_check +vdo_get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter); + +#endif /* VDO_WAIT_QUEUE_H */ diff --git a/drivers/md/dm-vdo/work-queue.c b/drivers/md/dm-vdo/work-queue.c new file mode 100644 index 00000000000..40cf1bfb947 --- /dev/null +++ b/drivers/md/dm-vdo/work-queue.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "work-queue.h" + +#include <linux/atomic.h> +#include <linux/cache.h> +#include <linux/completion.h> +#include <linux/err.h> +#include <linux/kthread.h> +#include <linux/percpu.h> + +#include "funnel-queue.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" + +#include "completion.h" +#include "status-codes.h" + +static DEFINE_PER_CPU(unsigned int, service_queue_rotor); + +/** + * DOC: Work queue definition. + * + * There are two types of work queues: simple, with one worker thread, and round-robin, which uses + * a group of the former to do the work, and assigns work to them in round-robin fashion (roughly). + * Externally, both are represented via the same common sub-structure, though there's actually not + * a great deal of overlap between the two types internally. + */ +struct vdo_work_queue { + /* Name of just the work queue (e.g., "cpuQ12") */ + char *name; + bool round_robin_mode; + struct vdo_thread *owner; + /* Life cycle functions, etc */ + const struct vdo_work_queue_type *type; +}; + +struct simple_work_queue { + struct vdo_work_queue common; + struct funnel_queue *priority_lists[VDO_WORK_Q_MAX_PRIORITY + 1]; + void *private; + + /* + * The fields above are unchanged after setup but often read, and are good candidates for + * caching -- and if the max priority is 2, just fit in one x86-64 cache line if aligned. + * The fields below are often modified as we sleep and wake, so we want a separate cache + * line for performance. + */ + + /* Any (0 or 1) worker threads waiting for new work to do */ + wait_queue_head_t waiting_worker_threads ____cacheline_aligned; + /* Hack to reduce wakeup calls if the worker thread is running */ + atomic_t idle; + + /* These are infrequently used so in terms of performance we don't care where they land. */ + struct task_struct *thread; + /* Notify creator once worker has initialized */ + struct completion *started; +}; + +struct round_robin_work_queue { + struct vdo_work_queue common; + struct simple_work_queue **service_queues; + unsigned int num_service_queues; +}; + +static inline struct simple_work_queue *as_simple_work_queue(struct vdo_work_queue *queue) +{ + return ((queue == NULL) ? NULL : container_of(queue, struct simple_work_queue, common)); +} + +static inline struct round_robin_work_queue * +as_round_robin_work_queue(struct vdo_work_queue *queue) +{ + return ((queue == NULL) ? + NULL : + container_of(queue, struct round_robin_work_queue, common)); +} + +/* Processing normal completions. */ + +/* + * Dequeue and return the next waiting completion, if any. + * + * We scan the funnel queues from highest priority to lowest, once; there is therefore a race + * condition where a high-priority completion can be enqueued followed by a lower-priority one, and + * we'll grab the latter (but we'll catch the high-priority item on the next call). If strict + * enforcement of priorities becomes necessary, this function will need fixing. + */ +static struct vdo_completion *poll_for_completion(struct simple_work_queue *queue) +{ + int i; + + for (i = queue->common.type->max_priority; i >= 0; i--) { + struct funnel_queue_entry *link = uds_funnel_queue_poll(queue->priority_lists[i]); + + if (link != NULL) + return container_of(link, struct vdo_completion, work_queue_entry_link); + } + + return NULL; +} + +static void +enqueue_work_queue_completion(struct simple_work_queue *queue, struct vdo_completion *completion) +{ + ASSERT_LOG_ONLY(completion->my_queue == NULL, + "completion %px (fn %px) to enqueue (%px) is not already queued (%px)", + completion, + completion->callback, + queue, + completion->my_queue); + if (completion->priority == VDO_WORK_Q_DEFAULT_PRIORITY) + completion->priority = queue->common.type->default_priority; + + if (ASSERT(completion->priority <= queue->common.type->max_priority, + "priority is in range for queue") != VDO_SUCCESS) + completion->priority = 0; + + completion->my_queue = &queue->common; + + /* Funnel queue handles the synchronization for the put. */ + uds_funnel_queue_put(queue->priority_lists[completion->priority], + &completion->work_queue_entry_link); + + /* + * Due to how funnel queue synchronization is handled (just atomic operations), the + * simplest safe implementation here would be to wake-up any waiting threads after + * enqueueing each item. Even if the funnel queue is not empty at the time of adding an + * item to the queue, the consumer thread may not see this since it is not guaranteed to + * have the same view of the queue as a producer thread. + * + * However, the above is wasteful so instead we attempt to minimize the number of thread + * wakeups. Using an idle flag, and careful ordering using memory barriers, we should be + * able to determine when the worker thread might be asleep or going to sleep. We use + * cmpxchg to try to take ownership (vs other producer threads) of the responsibility for + * waking the worker thread, so multiple wakeups aren't tried at once. + * + * This was tuned for some x86 boxes that were handy; it's untested whether doing the read + * first is any better or worse for other platforms, even other x86 configurations. + */ + smp_mb(); + if ((atomic_read(&queue->idle) != 1) || (atomic_cmpxchg(&queue->idle, 1, 0) != 1)) + return; + + /* There's a maximum of one thread in this list. */ + wake_up(&queue->waiting_worker_threads); +} + +static void run_start_hook(struct simple_work_queue *queue) +{ + if (queue->common.type->start != NULL) + queue->common.type->start(queue->private); +} + +static void run_finish_hook(struct simple_work_queue *queue) +{ + if (queue->common.type->finish != NULL) + queue->common.type->finish(queue->private); +} + +/* + * Wait for the next completion to process, or until kthread_should_stop indicates that it's time + * for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending completions return a + * completion. + * + * Also update statistics relating to scheduler interactions. + */ +static struct vdo_completion *wait_for_next_completion(struct simple_work_queue *queue) +{ + struct vdo_completion *completion; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(&queue->waiting_worker_threads, &wait, TASK_INTERRUPTIBLE); + /* + * Don't set the idle flag until a wakeup will not be lost. + * + * Force synchronization between setting the idle flag and checking the funnel + * queue; the producer side will do them in the reverse order. (There's still a + * race condition we've chosen to allow, because we've got a timeout below that + * unwedges us if we hit it, but this may narrow the window a little.) + */ + atomic_set(&queue->idle, 1); + smp_mb(); /* store-load barrier between "idle" and funnel queue */ + + completion = poll_for_completion(queue); + if (completion != NULL) + break; + + /* + * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state up + * above. Otherwise, schedule() will put the thread to sleep and might miss a + * wakeup from kthread_stop() call in vdo_finish_work_queue(). + */ + if (kthread_should_stop()) + break; + + schedule(); + + /* + * Most of the time when we wake, it should be because there's work to do. If it + * was a spurious wakeup, continue looping. + */ + completion = poll_for_completion(queue); + if (completion != NULL) + break; + } + + finish_wait(&queue->waiting_worker_threads, &wait); + atomic_set(&queue->idle, 0); + + return completion; +} + +static void process_completion(struct simple_work_queue *queue, struct vdo_completion *completion) +{ + if (ASSERT(completion->my_queue == &queue->common, + "completion %px from queue %px marked as being in this queue (%px)", + completion, + queue, + completion->my_queue) == UDS_SUCCESS) + completion->my_queue = NULL; + + vdo_run_completion(completion); +} + +static void service_work_queue(struct simple_work_queue *queue) +{ + run_start_hook(queue); + + while (true) { + struct vdo_completion *completion = poll_for_completion(queue); + + if (completion == NULL) + completion = wait_for_next_completion(queue); + + if (completion == NULL) + /* No completions but kthread_should_stop() was triggered. */ + break; + + process_completion(queue, completion); + + /* + * Be friendly to a CPU that has other work to do, if the kernel has told us to. + * This speeds up some performance tests; that "other work" might include other VDO + * threads. + */ + if (need_resched()) + cond_resched(); + } + + run_finish_hook(queue); +} + +static int work_queue_runner(void *ptr) +{ + struct simple_work_queue *queue = ptr; + + complete(queue->started); + service_work_queue(queue); + return 0; +} + +/* Creation & teardown */ + +static void free_simple_work_queue(struct simple_work_queue *queue) +{ + unsigned int i; + + for (i = 0; i <= VDO_WORK_Q_MAX_PRIORITY; i++) + uds_free_funnel_queue(queue->priority_lists[i]); + UDS_FREE(queue->common.name); + UDS_FREE(queue); +} + +static void free_round_robin_work_queue(struct round_robin_work_queue *queue) +{ + struct simple_work_queue **queue_table = queue->service_queues; + unsigned int count = queue->num_service_queues; + unsigned int i; + + queue->service_queues = NULL; + + for (i = 0; i < count; i++) + free_simple_work_queue(queue_table[i]); + UDS_FREE(queue_table); + UDS_FREE(queue->common.name); + UDS_FREE(queue); +} + +void vdo_free_work_queue(struct vdo_work_queue *queue) +{ + if (queue == NULL) + return; + + vdo_finish_work_queue(queue); + + if (queue->round_robin_mode) + free_round_robin_work_queue(as_round_robin_work_queue(queue)); + else + free_simple_work_queue(as_simple_work_queue(queue)); +} + +static int make_simple_work_queue(const char *thread_name_prefix, + const char *name, + struct vdo_thread *owner, + void *private, + const struct vdo_work_queue_type *type, + struct simple_work_queue **queue_ptr) +{ + DECLARE_COMPLETION_ONSTACK(started); + struct simple_work_queue *queue; + int i; + struct task_struct *thread = NULL; + int result; + + ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY), + "queue priority count %u within limit %u", + type->max_priority, + VDO_WORK_Q_MAX_PRIORITY); + + result = UDS_ALLOCATE(1, struct simple_work_queue, "simple work queue", &queue); + if (result != UDS_SUCCESS) + return result; + + queue->private = private; + queue->started = &started; + queue->common.type = type; + queue->common.owner = owner; + init_waitqueue_head(&queue->waiting_worker_threads); + + result = uds_duplicate_string(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + UDS_FREE(queue); + return -ENOMEM; + } + + for (i = 0; i <= type->max_priority; i++) { + result = uds_make_funnel_queue(&queue->priority_lists[i]); + if (result != UDS_SUCCESS) { + free_simple_work_queue(queue); + return result; + } + } + + thread = kthread_run(work_queue_runner, + queue, + "%s:%s", + thread_name_prefix, + queue->common.name); + if (IS_ERR(thread)) { + free_simple_work_queue(queue); + return (int) PTR_ERR(thread); + } + + queue->thread = thread; + + /* + * If we don't wait to ensure the thread is running VDO code, a quick kthread_stop (due to + * errors elsewhere) could cause it to never get as far as running VDO, skipping the + * cleanup code. + * + * Eventually we should just make that path safe too, and then we won't need this + * synchronization. + */ + wait_for_completion(&started); + + *queue_ptr = queue; + return UDS_SUCCESS; +} + +/** + * Create a work queue; if multiple threads are requested, completions will be distributed to them + * in round-robin fashion. + * + * Each queue is associated with a struct vdo_thread which has a single vdo thread id. Regardless + * of the actual number of queues and threads allocated here, code outside of the queue + * implementation will treat this as a single zone. + */ +int vdo_make_work_queue(const char *thread_name_prefix, + const char *name, + struct vdo_thread *owner, + const struct vdo_work_queue_type *type, + unsigned int thread_count, + void *thread_privates[], + struct vdo_work_queue **queue_ptr) +{ + struct round_robin_work_queue *queue; + int result; + char thread_name[TASK_COMM_LEN]; + unsigned int i; + + if (thread_count == 1) { + struct simple_work_queue *simple_queue; + void *context = ((thread_privates != NULL) ? thread_privates[0] : NULL); + + result = make_simple_work_queue(thread_name_prefix, + name, + owner, + context, + type, + &simple_queue); + if (result == VDO_SUCCESS) + *queue_ptr = &simple_queue->common; + return result; + } + + result = UDS_ALLOCATE(1, struct round_robin_work_queue, "round-robin work queue", &queue); + if (result != UDS_SUCCESS) + return result; + + result = UDS_ALLOCATE(thread_count, + struct simple_work_queue *, + "subordinate work queues", + &queue->service_queues); + if (result != UDS_SUCCESS) { + UDS_FREE(queue); + return result; + } + + queue->num_service_queues = thread_count; + queue->common.round_robin_mode = true; + queue->common.owner = owner; + + result = uds_duplicate_string(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + UDS_FREE(queue->service_queues); + UDS_FREE(queue); + return -ENOMEM; + } + + *queue_ptr = &queue->common; + + for (i = 0; i < thread_count; i++) { + void *context = ((thread_privates != NULL) ? thread_privates[i] : NULL); + + snprintf(thread_name, sizeof(thread_name), "%s%u", name, i); + result = make_simple_work_queue(thread_name_prefix, + thread_name, + owner, + context, + type, + &queue->service_queues[i]); + if (result != VDO_SUCCESS) { + queue->num_service_queues = i; + /* Destroy previously created subordinates. */ + vdo_free_work_queue(UDS_FORGET(*queue_ptr)); + return result; + } + } + + return VDO_SUCCESS; +} + +static void finish_simple_work_queue(struct simple_work_queue *queue) +{ + if (queue->thread == NULL) + return; + + /* Tells the worker thread to shut down and waits for it to exit. */ + kthread_stop(queue->thread); + queue->thread = NULL; +} + +static void finish_round_robin_work_queue(struct round_robin_work_queue *queue) +{ + struct simple_work_queue **queue_table = queue->service_queues; + unsigned int count = queue->num_service_queues; + unsigned int i; + + for (i = 0; i < count; i++) + finish_simple_work_queue(queue_table[i]); +} + +/* No enqueueing of completions should be done once this function is called. */ +void vdo_finish_work_queue(struct vdo_work_queue *queue) +{ + if (queue == NULL) + return; + + if (queue->round_robin_mode) + finish_round_robin_work_queue(as_round_robin_work_queue(queue)); + else + finish_simple_work_queue(as_simple_work_queue(queue)); +} + +/* Debugging dumps */ + +static void dump_simple_work_queue(struct simple_work_queue *queue) +{ + const char *thread_status = "no threads"; + char task_state_report = '-'; + + if (queue->thread != NULL) { + task_state_report = task_state_to_char(queue->thread); + thread_status = atomic_read(&queue->idle) ? "idle" : "running"; + } + + uds_log_info("workQ %px (%s) %s (%c)", + &queue->common, + queue->common.name, + thread_status, + task_state_report); + + /* ->waiting_worker_threads wait queue status? anyone waiting? */ +} + +/** + * Write to the buffer some info about the completion, for logging. Since the common use case is + * dumping info about a lot of completions to syslog all at once, the format favors brevity over + * readability. + */ +void vdo_dump_work_queue(struct vdo_work_queue *queue) +{ + if (queue->round_robin_mode) { + struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue); + unsigned int i; + + for (i = 0; i < round_robin->num_service_queues; i++) + dump_simple_work_queue(round_robin->service_queues[i]); + } else { + dump_simple_work_queue(as_simple_work_queue(queue)); + } +} + +static void get_function_name(void *pointer, char *buffer, size_t buffer_length) +{ + if (pointer == NULL) { + /* + * Format "%ps" logs a null pointer as "(null)" with a bunch of leading spaces. We + * sometimes use this when logging lots of data; don't be so verbose. + */ + strncpy(buffer, "-", buffer_length); + } else { + /* + * Use a pragma to defeat gcc's format checking, which doesn't understand that + * "%ps" actually does support a precision spec in Linux kernel code. + */ + char *space; + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat" + snprintf(buffer, buffer_length, "%.*ps", buffer_length - 1, pointer); +#pragma GCC diagnostic pop + + space = strchr(buffer, ' '); + if (space != NULL) + *space = '\0'; + } +} + +void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer, size_t length) +{ + size_t current_length = + scnprintf(buffer, + length, + "%.*s/", + TASK_COMM_LEN, + (completion->my_queue == NULL ? "-" : completion->my_queue->name)); + + if (current_length < length) + get_function_name((void *) completion->callback, + buffer + current_length, + length - current_length); +} + +/* Completion submission */ +/* + * If the completion has a timeout that has already passed, the timeout handler function may be + * invoked by this function. + */ +void vdo_enqueue_work_queue(struct vdo_work_queue *queue, struct vdo_completion *completion) +{ + /* + * Convert the provided generic vdo_work_queue to the simple_work_queue to actually queue + * on. + */ + struct simple_work_queue *simple_queue = NULL; + + if (!queue->round_robin_mode) { + simple_queue = as_simple_work_queue(queue); + } else { + struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue); + + /* + * It shouldn't be a big deal if the same rotor gets used for multiple work queues. + * Any patterns that might develop are likely to be disrupted by random ordering of + * multiple completions and migration between cores, unless the load is so light as + * to be regular in ordering of tasks and the threads are confined to individual + * cores; with a load that light we won't care. + */ + unsigned int rotor = this_cpu_inc_return(service_queue_rotor); + unsigned int index = rotor % round_robin->num_service_queues; + + simple_queue = round_robin->service_queues[index]; + } + + enqueue_work_queue_completion(simple_queue, completion); +} + +/* Misc */ + +/* + * Return the work queue pointer recorded at initialization time in the work-queue stack handle + * initialized on the stack of the current thread, if any. + */ +static struct simple_work_queue *get_current_thread_work_queue(void) +{ + /* + * In interrupt context, if a vdo thread is what got interrupted, the calls below will find + * the queue for the thread which was interrupted. However, the interrupted thread may have + * been processing a completion, in which case starting to process another would violate + * our concurrency assumptions. + */ + if (in_interrupt()) + return NULL; + if (kthread_func(current) != work_queue_runner) + /* Not a VDO work queue thread. */ + return NULL; + return kthread_data(current); +} + +struct vdo_work_queue *vdo_get_current_work_queue(void) +{ + struct simple_work_queue *queue = get_current_thread_work_queue(); + + return (queue == NULL) ? NULL : &queue->common; +} + +struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue) +{ + return queue->owner; +} + +/** + * Returns the private data for the current thread's work queue, or NULL if none or if the current + * thread is not a work queue thread. + */ +void *vdo_get_work_queue_private_data(void) +{ + struct simple_work_queue *queue = get_current_thread_work_queue(); + + return (queue != NULL) ? queue->private : NULL; +} + +bool vdo_work_queue_type_is(struct vdo_work_queue *queue, const struct vdo_work_queue_type *type) +{ + return (queue->type == type); +} diff --git a/drivers/md/dm-vdo/work-queue.h b/drivers/md/dm-vdo/work-queue.h new file mode 100644 index 00000000000..d1e05f8901d --- /dev/null +++ b/drivers/md/dm-vdo/work-queue.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_WORK_QUEUE_H +#define VDO_WORK_QUEUE_H + +#include <linux/sched.h> /* for TASK_COMM_LEN */ + +#include "types.h" + +enum { + MAX_VDO_WORK_QUEUE_NAME_LEN = TASK_COMM_LEN, +}; + +struct vdo_work_queue_type { + void (*start)(void *context); + void (*finish)(void *context); + enum vdo_completion_priority max_priority; + enum vdo_completion_priority default_priority; +}; + +struct vdo_completion; +struct vdo_thread; +struct vdo_work_queue; + +int vdo_make_work_queue(const char *thread_name_prefix, + const char *name, + struct vdo_thread *owner, + const struct vdo_work_queue_type *type, + unsigned int thread_count, + void *thread_privates[], + struct vdo_work_queue **queue_ptr); + +void vdo_enqueue_work_queue(struct vdo_work_queue *queue, struct vdo_completion *completion); + +void vdo_finish_work_queue(struct vdo_work_queue *queue); + +void vdo_free_work_queue(struct vdo_work_queue *queue); + +void vdo_dump_work_queue(struct vdo_work_queue *queue); + +void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer, size_t length); + +void *vdo_get_work_queue_private_data(void); +struct vdo_work_queue *vdo_get_current_work_queue(void); +struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue); + +bool __must_check +vdo_work_queue_type_is(struct vdo_work_queue *queue, const struct vdo_work_queue_type *type); + +#endif /* VDO_WORK_QUEUE_H */ -- 2.40.0 -- dm-devel mailing list dm-devel@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/dm-devel