André Almeida <andrealmeid@xxxxxxxxxxxxx> writes: > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index e0fce93ac127..8b745f229789 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -10,74 +10,153 @@ struct blk_mq_tags; > struct blk_flush_queue; > > /** > - * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device > + * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware > + * block device > */ > struct blk_mq_hw_ctx { > struct { > + /** @lock: Lock for accessing dispatch queue */ > spinlock_t lock; > + /** > + * @dispatch: Queue of dispatched requests, waiting for > + * workers to send them to the hardware. > + */ It's been a few years since I looked at the block layer, but isn't this used to hold requests that were taken from the blk_mq_ctx, but couldn't be dispatched because the queue was full? > struct list_head dispatch; > - unsigned long state; /* BLK_MQ_S_* flags */ > + /** > + * @state: BLK_MQ_S_* flags. Define the state of the hw > + * queue (active, scheduled to restart, stopped). > + */ > + unsigned long state; > } ____cacheline_aligned_in_smp; > > + /** > + * @run_work: Worker for dispatch scheduled requests to hardware. > + * BLK_MQ_CPU_WORK_BATCH workers will be assigned to a CPU, then the > + * following ones will be assigned to the next_cpu. > + */ > struct delayed_work run_work; > + /** @cpumask: Map of available CPUs. */ Available cpus where this hctx can run. > cpumask_var_t cpumask; > + /** > + * @next_cpu: Index of CPU/workqueue to be used in the next batch > + * of workers. > + */ > int next_cpu; > + /** > + * @next_cpu_batch: Counter of how many works left in the batch before > + * changing to the next CPU. A batch has the size > + * of BLK_MQ_CPU_WORK_BATCH. > + */ > int next_cpu_batch; > > - unsigned long flags; /* BLK_MQ_F_* flags */ > + /** @flags: BLK_MQ_F_* flags. Define the behaviour of the queue. 
*/ > + unsigned long flags; > > + /** @sched_data: Data to support schedulers. */ > void *sched_data; > + /** @queue: Queue of request to be dispatched. */ > struct request_queue *queue; Maybe "the request queue this hctx belongs to" > + /** @fq: Queue of requests to be flushed from memory to storage. */ > struct blk_flush_queue *fq; Hmm, I think this needs clarification. > > + /** > + * @driver_data: Pointer to data owned by the block driver that created > + * this hctx > + */ > void *driver_data; > > + /** > + * @ctx_map: Bitmap for each software queue. If bit is on, there is a > + * pending request. > + */ > struct sbitmap ctx_map; > > + /** > + * @dispatch_from: Queue to be used when there is no scheduler > + * was selected. > + */ "when there is no scheduler selected" or "when no scheduler was selected" > struct blk_mq_ctx *dispatch_from; > + /** > + * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to > + * decide if the hw_queue is busy using Exponential Weighted Moving > + * Average algorithm. > + */ > unsigned int dispatch_busy; > > + /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ > unsigned short type; > + /** @nr_ctx: Number of software queues. */ > unsigned short nr_ctx; > + /** @ctxs: Array of software queues. */ > struct blk_mq_ctx **ctxs; > > + /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ > spinlock_t dispatch_wait_lock; > + /** > + * @dispatch_wait: Waitqueue for dispatched requests. Request here will > + * be processed when percpu_ref_is_zero(&q->q_usage_counter) evaluates > + * true for q as a request_queue. > + */ That's not really useful, since it is stating what the code does. It would be interesting to say why this is needed, i.e. to re-run the queue and dispatch requests if the queue was full in the previous attempt. > wait_queue_entry_t dispatch_wait; > + /** @wait_index: Index of next wait queue to be used. */ > atomic_t wait_index; > > + /** @tags: Request tags. 
*/ > struct blk_mq_tags *tags; > + /** @sched_tags: Request tags for schedulers. */ > struct blk_mq_tags *sched_tags; > > + /** @queued: Number of queued requests. */ > unsigned long queued; > + /** @run: Number of dispatched requests. */ > unsigned long run; > #define BLK_MQ_MAX_DISPATCH_ORDER 7 > + /** @dispatched: Number of dispatch requests by queue. */ > unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; > > + /** @numa_node: NUMA node the storage adapter has been connected to. */ > unsigned int numa_node; > + /** @queue_num: Index of this hardware queue. */ > unsigned int queue_num; > > + /** @nr_active: Number of active tags. */ > atomic_t nr_active; > > + /** @cpuhp_dead: List to store request if some CPU die. */ > struct hlist_node cpuhp_dead; > + /** @kobj: Kernel object for sysfs. */ > struct kobject kobj; > > + /** @poll_considered: Count times blk_poll() was called. */ > unsigned long poll_considered; > + /** @poll_invoked: Count how many requests blk_poll() polled. */ > unsigned long poll_invoked; > + /** @poll_success: Count how many polled requests were completed. */ > unsigned long poll_success; > > #ifdef CONFIG_BLK_DEBUG_FS > + /** > + * @debugfs_dir: debugfs directory for this hardware queue. Named > + * as cpu<cpu_number>. > + */ > struct dentry *debugfs_dir; > + /** @sched_debugfs_dir: debugfs directory for the scheduler. */ > struct dentry *sched_debugfs_dir; > #endif > > + /** @hctx_list: List of all hardware queues. */ > struct list_head hctx_list; > > - /* Must be the last member - see also blk_mq_hw_ctx_size(). */ > + /** > + * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is > + * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also > + * blk_mq_hw_ctx_size(). > + */ > struct srcu_struct srcu[0]; > }; > > /** > - * struct blk_mq_queue_map - ctx -> hctx mapping > + * struct blk_mq_queue_map - Map software queues to hardware queues > * @mq_map: CPU ID to hardware queue index map. 
This is an array > * with nr_cpu_ids elements. Each element has a value in the range > * [@queue_offset, @queue_offset + @nr_queues). > @@ -92,10 +171,17 @@ struct blk_mq_queue_map { > unsigned int queue_offset; > }; > > +/** > + * enum hctx_type - Type of hardware queue > + * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. > + * @HCTX_TYPE_READ: Just for READ I/O. > + * @HCTX_TYPE_POLL: Polled I/O of any kind. > + * @HCTX_MAX_TYPES: Number of types of hctx. > + */ > enum hctx_type { > - HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */ > - HCTX_TYPE_READ, /* just for READ I/O */ > - HCTX_TYPE_POLL, /* polled I/O of any kind */ > + HCTX_TYPE_DEFAULT, > + HCTX_TYPE_READ, > + HCTX_TYPE_POLL, > > HCTX_MAX_TYPES, > }; > @@ -147,6 +233,12 @@ struct blk_mq_tag_set { > struct list_head tag_list; > }; > > +/** > + * struct blk_mq_queue_data - Data about a request inserted in a queue > + * > + * @rq: Data about the block IO request. > + * @last: If it is the last request in the queue. > + */ > struct blk_mq_queue_data { > struct request *rq; > bool last; > @@ -174,81 +266,101 @@ typedef bool (busy_fn)(struct request_queue *); > typedef void (complete_fn)(struct request *); > typedef void (cleanup_rq_fn)(struct request *); > > - > +/** > + * struct blk_mq_ops - list of callback functions that implements block driver > + * behaviour. > + */ > struct blk_mq_ops { > - /* > - * Queue request > + /** > + * @queue_rq: Queue a new request from block IO. > */ > queue_rq_fn *queue_rq; > > - /* > - * If a driver uses bd->last to judge when to submit requests to > - * hardware, it must define this function. In case of errors that > - * make us stop issuing further requests, this hook serves the > + /** > + * @commit_rqs: If a driver uses bd->last to judge when to submit > + * requests to hardware, it must define this function. 
In case of errors > + * that make us stop issuing further requests, this hook serves the > * purpose of kicking the hardware (which the last request otherwise > * would have done). > */ > commit_rqs_fn *commit_rqs; > > - /* > - * Reserve budget before queue request, once .queue_rq is > + /** > + * @get_budget: Reserve budget before queue request, once .queue_rq is > * run, it is driver's responsibility to release the > * reserved budget. Also we have to handle failure case > * of .get_budget for avoiding I/O deadlock. > */ > get_budget_fn *get_budget; > + /** > + * @put_budget: Release the reserved budget. > + */ > put_budget_fn *put_budget; > > - /* > - * Called on request timeout > + /** > + * @timeout: Called on request timeout. > */ > timeout_fn *timeout; > > - /* > - * Called to poll for completion of a specific tag. > + /** > + * @poll: Called to poll for completion of a specific tag. > */ > poll_fn *poll; > > + /** > + * @complete: Mark the request as complete. > + */ > complete_fn *complete; > > - /* > - * Called when the block layer side of a hardware queue has been > - * set up, allowing the driver to allocate/init matching structures. > - * Ditto for exit/teardown. > + /** > + * @init_hctx: Called when the block layer side of a hardware queue has > + * been set up, allowing the driver to allocate/init matching > + * structures. > */ > init_hctx_fn *init_hctx; > + /** > + * @exit_hctx: Ditto for exit/teardown. > + */ > exit_hctx_fn *exit_hctx; > > - /* > - * Called for every command allocated by the block layer to allow > - * the driver to set up driver specific data. > + /** > + * @init_request: Called for every command allocated by the block layer > + * to allow the driver to set up driver specific data. > * > * Tag greater than or equal to queue_depth is for setting up > * flush request. > - * > - * Ditto for exit/teardown. > */ > init_request_fn *init_request; > + /** > + * @exit_request: Ditto for exit/teardown. 
> + */ > exit_request_fn *exit_request; > - /* Called from inside blk_get_request() */ > + > + /** > + * @initialize_rq_fn: Called from inside blk_get_request(). > + */ > void (*initialize_rq_fn)(struct request *rq); > > - /* > - * Called before freeing one request which isn't completed yet, > - * and usually for freeing the driver private data > + /** > + * @cleanup_rq: Called before freeing one request which isn't completed > + * yet, and usually for freeing the driver private data. > */ > cleanup_rq_fn *cleanup_rq; > > - /* > - * If set, returns whether or not this queue currently is busy > + /** > + * @busy: If set, returns whether or not this queue currently is busy. > */ > busy_fn *busy; > > + /** > + * @map_queues: This allows drivers specify their own queue mapping by > + * overriding the setup-time function that builds the mq_map. > + */ > map_queues_fn *map_queues; > > #ifdef CONFIG_BLK_DEBUG_FS > - /* > - * Used by the debugfs implementation to show driver-specific > + /** > + * @show_rq: Used by the debugfs implementation to show driver-specific > * information about a request. > */ > void (*show_rq)(struct seq_file *m, struct request *rq); > @@ -391,14 +503,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q); > > unsigned int blk_mq_rq_cpu(struct request *rq); > > -/* > +/** > + * blk_mq_rq_from_pdu - cast a PDU to a request > + * @pdu: the PDU (Protocol Data Unit) to be casted > + * > + * Return: request > + * > * Driver command data is immediately after the request. So subtract request > - * size to get back to the original request, add request size to get the PDU. > + * size to get back to the original request. > */ > static inline struct request *blk_mq_rq_from_pdu(void *pdu) > { > return pdu - sizeof(struct request); > } > + > +/** > + * blk_mq_rq_to_pdu - cast a request to a PDU > + * @rq: the request to be casted > + * > + * Return: pointer to the PDU > + * > + * Driver command data is immediately after the request. 
So add request to get > + * the PDU. > + */ > static inline void *blk_mq_rq_to_pdu(struct request *rq) > { > return rq + 1; -- Gabriel Krisman Bertazi